From 2cdb3f1d834aab27a927be7555fbf4f9e43e9261 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Jun 2009 19:01:19 -0700 Subject: x86/PCI: fix boundary checking when using root CRS Don't touch info->res_num if we are out of space. Acked-by: Gary Hade Tested-by: Gary Hade Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index b26626dc517c..8bf152910eb0 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -68,6 +68,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data) unsigned long flags; struct resource *root; int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; + u64 start, end; + + if (bus_has_transparent_bridge(info->bus)) + max_root_bus_resources -= 3; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -84,25 +88,24 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } else return AE_OK; - res = &info->res[info->res_num]; - res->name = info->name; - res->flags = flags; - res->start = addr.minimum + addr.translation_offset; - res->end = res->start + addr.address_length - 1; - res->child = NULL; - - if (bus_has_transparent_bridge(info->bus)) - max_root_bus_resources -= 3; + start = addr.minimum + addr.translation_offset; + end = start + addr.address_length - 1; if (info->res_num >= max_root_bus_resources) { printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " "from %s for %s due to _CRS returning more than " - "%d resource descriptors\n", (unsigned long) res->start, - (unsigned long) res->end, root->name, info->name, + "%d resource descriptors\n", (unsigned long) start, + (unsigned long) end, root->name, info->name, max_root_bus_resources); - info->res_num++; return AE_OK; } + res = &info->res[info->res_num]; + res->name = info->name; + res->flags = flags; + res->start = start; + res->end = end; + res->child = NULL; + if (insert_resource(root, res)) { printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " "from %s for %s\n", (unsigned long) res->start, -- cgit v1.2.3 From 626fdfec1588ac1341a37805809d03a719d977e0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Jun 2009 20:00:12 -0700 Subject: x86/PCI: get root CRS before scanning children This allows us to remove adjust_transparent_bridge_resources and give x86_pci_root_bus_res_quirks a chance when _CRS is not used or not there. 
Acked-by: Gary Hade Tested-by: Gary Hade Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 32 +++++++++----------------------- arch/x86/pci/amd_bus.c | 8 ++++++-- 2 files changed, 15 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 8bf152910eb0..1014eb4bfc37 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -117,23 +117,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } -static void -adjust_transparent_bridge_resources(struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - int i; - u16 class = dev->class >> 8; - - if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) { - for(i = 3; i < PCI_BUS_NUM_RESOURCES; i++) - dev->subordinate->resource[i] = - dev->bus->resource[i - 3]; - } - } -} - static void get_current_resources(struct acpi_device *device, int busnum, int domain, struct pci_bus *bus) @@ -161,8 +144,6 @@ get_current_resources(struct acpi_device *device, int busnum, info.res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, &info); - if (info.res_num) - adjust_transparent_bridge_resources(bus); return; @@ -225,8 +206,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do */ memcpy(bus->sysdata, sd, sizeof(*sd)); kfree(sd); - } else - bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); + } else { + bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); + if (bus) { + if (pci_probe & PCI_USE__CRS) + get_current_resources(device, busnum, domain, + bus); + bus->subordinate = pci_scan_child_bus(bus); + } + } if (!bus) kfree(sd); @@ -241,8 +229,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do #endif } - if (bus && (pci_probe & PCI_USE__CRS)) - get_current_resources(device, busnum, domain, bus); return bus; } diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index f893d6a6e803..3ffa10df20b9 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -100,8 +100,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) int j; struct pci_root_info *info; - /* don't go for it if _CRS is used */ - if (pci_probe & PCI_USE__CRS) + /* don't go for it if _CRS is used already */ + if (b->resource[0] != &ioport_resource || + b->resource[1] != &iomem_resource) return; /* if only one root bus, don't need to anything */ @@ -116,6 +117,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) if (i == pci_root_num) return; + printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", + b->number); + info = &pci_root_info[i]; for (j = 0; j < info->res_num; j++) { struct resource *res; -- cgit v1.2.3 From 9e314996e3dc5189b9b36dce67088e882e989897 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 30 Jun 2009 15:00:20 +0200 Subject: x86: Fix symbol annotation for arch/x86/lib/clear_page_64.S::clear_page_c Noticed the zero-sized function symbol while looking at 'perf' profiles, it causes the profiler to display those addresses in hexa. Turns out that this was wrong/bogus for an eternity. 
Signed-off-by: Mike Galbraith Acked-by: Alexander van Heukelum Acked-by: Cyrill Gorcunov LKML-Reference: <1246366820.6538.1.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- arch/x86/lib/clear_page_64.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 9a10a78bb4a4..ebeafcce04a9 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -5,15 +5,14 @@ * Zero a page. * rdi page */ - ALIGN -clear_page_c: +ENTRY(clear_page_c) CFI_STARTPROC movl $4096/8,%ecx xorl %eax,%eax rep stosq ret CFI_ENDPROC -ENDPROC(clear_page) +ENDPROC(clear_page_c) ENTRY(clear_page) CFI_STARTPROC -- cgit v1.2.3 From 789d03f584484af85dbdc64935270c8e45f36ef7 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 30 Jun 2009 11:52:23 +0100 Subject: x86: Fix fixmap ordering The merge of the 32- and 64-bit fixmap headers made a latent bug on x86-64 a real one: with the right config settings it is possible for FIX_OHCI1394_BASE to overlap the FIX_BTMAP_* range. Signed-off-by: Jan Beulich Cc: # for 2.6.30.x LKML-Reference: <4A4A0A8702000078000082E8@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 2d81af3974a0..3eb0f79a5320 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -114,9 +114,6 @@ enum fixed_addresses { FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ FIX_TEXT_POKE1, __end_of_permanent_fixed_addresses, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif /* * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. @@ -129,6 +126,9 @@ enum fixed_addresses { FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - (__end_of_permanent_fixed_addresses & 255), FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif -- cgit v1.2.3 From 44973998a111dfda09b952aa0f27cad326a97793 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 1 Jul 2009 17:49:38 +0530 Subject: x86: Remove double declaration of MSR_P6_EVNTSEL0 and MSR_P6_EVNTSEL1 MSR_P6_EVNTSEL0 and MSR_P6_EVNTSEL1 is already declared in msr-index.h. Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1246450778.6940.8.camel@hpdv5.satnam> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1692fb5050e3..6be7fc254b59 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -246,10 +246,6 @@ #define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) -/* Intel Model 6 */ -#define MSR_P6_EVNTSEL0 0x00000186 -#define MSR_P6_EVNTSEL1 0x00000187 - /* P4/Xeon+ specific */ #define MSR_IA32_MCG_EAX 0x00000180 #define MSR_IA32_MCG_EBX 0x00000181 -- cgit v1.2.3 From b25ae679f613ed04aaf6ccbfdb9122fce668e4bb Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 1 Jul 2009 19:53:14 +0530 Subject: x86: Mark device_nb as static and fix NULL noise This sparse warning: arch/x86/kernel/amd_iommu.c:1195:23: warning: symbol 'device_nb' was not declared. Should it be static? 
triggers because device_nb is global but is only used in a single .c file. Change device_nb to static to fix that - this also addresses the sparse warning. This sparse warning: arch/x86/kernel/amd_iommu.c:1766:10: warning: Using plain integer as NULL pointer triggers because plain integer 0 is used in place of a NULL pointer. Change 0 to NULL to fix that - this also addresses the sparse warning. Signed-off-by: Jaswinder Singh Rajput Cc: Joerg Roedel LKML-Reference: <1246458194.6940.20.camel@hpdv5.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 9372f0406ad4..6c99f5037801 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1192,7 +1192,7 @@ out: return 0; } -struct notifier_block device_nb = { +static struct notifier_block device_nb = { .notifier_call = device_change_notifier, }; @@ -1763,7 +1763,7 @@ static void *alloc_coherent(struct device *dev, size_t size, flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) - return 0; + return NULL; paddr = virt_to_phys(virt_addr); -- cgit v1.2.3 From 76c06927f2a78143763dcff9b4c362d15eb29cc2 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 1 Jul 2009 19:54:23 +0530 Subject: x86: Declare check_efer() before it gets used This sparse warning: arch/x86/mm/init.c:83:16: warning: symbol 'check_efer' was not declared. Should it be static? triggers because check_efer() is not declared before using it. asm/proto.h includes the declaration of check_efer(), so include asm/proto.h to fix that - this also addresses the sparse warning. Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton LKML-Reference: <1246458263.6940.22.camel@hpdv5.satnam> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 47ce9a2ce5e7..0607119cef94 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -12,6 +12,7 @@ #include #include #include +#include DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -- cgit v1.2.3 From 0406ca6d8e849d9dd027c8cb6791448e81411aef Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 1 Jul 2009 21:02:09 +0200 Subject: perf_counter: Ignore the nmi call frames in the x86-64 backtraces Almost every callchain recorded with perf record is filled up including the internal perfcounter nmi frame: perf_callchain perf_counter_overflow intel_pmu_handle_irq perf_counter_nmi_handler notifier_call_chain atomic_notifier_call_chain notify_die do_nmi nmi We want to ignore this frame as it's not interesting for instrumentation. To solve this, we simply ignore every frame from nmi context. New example of "perf report -s sym -c" after this patch: 9.59% [k] search_by_key 4.88% search_by_key reiserfs_read_locked_inode reiserfs_iget reiserfs_lookup do_lookup __link_path_walk path_walk do_path_lookup user_path_at vfs_fstatat vfs_lstat sys_newlstat system_call_fastpath __lxstat 0x406fb1 3.19% search_by_key search_by_entry_key reiserfs_find_entry reiserfs_lookup do_lookup __link_path_walk path_walk do_path_lookup user_path_at vfs_fstatat vfs_lstat sys_newlstat system_call_fastpath __lxstat 0x406fb1 [...] For now this patch only solves the problem in x86-64.
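A distilled, standalone sketch of the approach (plain userspace C, not the kernel code; the structure and callback names below are made up for illustration): the walker's "new stack" callback remembers whether it just stepped onto the stack named "NMI", and the address callback drops every frame recorded while that flag is set.

#include <stdio.h>
#include <string.h>

struct callchain {
	unsigned long ip[32];
	int nr;
	int in_nmi_frame;
};

/* Called whenever the unwinder switches to a differently-named stack. */
static int on_new_stack(struct callchain *c, const char *name)
{
	c->in_nmi_frame = (name && strcmp(name, "NMI") == 0);
	return 0;			/* keep walking all stacks */
}

/* Called for every return address the unwinder finds. */
static void on_address(struct callchain *c, unsigned long addr)
{
	if (c->in_nmi_frame)
		return;			/* skip nmi-handler frames */
	if (c->nr < 32)
		c->ip[c->nr++] = addr;
}

int main(void)
{
	struct callchain c = { .nr = 0 };

	on_new_stack(&c, "NMI");
	on_address(&c, 0x1000);		/* dropped */
	on_new_stack(&c, "");		/* back on the interrupted stack */
	on_address(&c, 0x2000);		/* kept */

	printf("recorded %d frame(s), first 0x%lx\n", c.nr, c.ip[0]);
	return 0;
}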
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Anton Blanchard Cc: Arnaldo Carvalho de Melo LKML-Reference: <1246474930-6088-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 2 ++ arch/x86/kernel/cpu/perf_counter.c | 8 +++++++- arch/x86/kernel/dumpstack_32.c | 6 ++++++ arch/x86/kernel/dumpstack_64.c | 22 +++++++++++++++------- 4 files changed, 30 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f517944b2b17..cf86a5e73815 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -3,6 +3,8 @@ extern int kstack_depth_to_print; +int x86_is_stack_id(int id, char *name); + /* Generic stack tracer with callbacks */ struct stacktrace_ops { diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d4cf4ce19aac..36c3dc7b8991 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1561,6 +1561,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(int, in_nmi_frame); static void @@ -1576,7 +1577,9 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - /* Process all stacks: */ + per_cpu(in_nmi_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name); + return 0; } @@ -1584,6 +1587,9 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; + if (per_cpu(in_nmi_frame, smp_processor_id())) + return; + if (reliable) callchain_store(entry, addr); } diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index d593cd1f58dc..bca5fba91c9e 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -19,6 +19,12 @@ #include "dumpstack.h" +/* Just a stub for now */ +int x86_is_stack_id(int id, char *name) +{ + return 0; +} + void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index d35db5993fd6..54b0a3276766 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -19,10 +19,8 @@ #include "dumpstack.h" -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) -{ - static char ids[][8] = { + +static char x86_stack_ids[][8] = { [DEBUG_STACK - 1] = "#DB", [NMI_STACK - 1] = "NMI", [DOUBLEFAULT_STACK - 1] = "#DF", @@ -33,6 +31,15 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" #endif }; + +int x86_is_stack_id(int id, char *name) +{ + return x86_stack_ids[id - 1] == name; +} + +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, + unsigned *usedp, char **idp) +{ unsigned k; /* @@ -61,7 +68,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, if (*usedp & (1U << k)) break; *usedp |= 1U << k; - *idp = ids[k]; + *idp = x86_stack_ids[k]; return (unsigned long *)end; } /* @@ -81,12 +88,13 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, do { ++j; end -= EXCEPTION_STKSZ; - 
ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); + x86_stack_ids[j][4] = '1' + + (j - N_EXCEPTION_STACKS); } while (stack < end - EXCEPTION_STKSZ); if (*usedp & (1U << j)) break; *usedp |= 1U << j; - *idp = ids[j]; + *idp = x86_stack_ids[j]; return (unsigned long *)end; } #endif -- cgit v1.2.3 From 251e1e44b97852aa5e53e71c4b47e55b2dfd054e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Jul 2009 08:54:01 +0200 Subject: x86: Fix printk call in print_local_apic() Instead of this: [ 75.690022] <7>printing local APIC contents on CPU#0/0: [ 75.704406] ... APIC ID: 00000000 (0) [ 75.707905] ... APIC VERSION: 00060015 [ 75.722551] ... APIC TASKPRI: 00000000 (00) [ 75.725473] ... APIC PROCPRI: 00000000 [ 75.728592] ... APIC LDR: 00000001 [ 75.742137] ... APIC SPIV: 000001ff [ 75.744101] ... APIC ISR field: [ 75.746648] 0123456789abcdef0123456789abcdef [ 75.746649] <7>00000000000000000000000000000000 Improve the code to be saner and simpler and just print out the bitfield in a single line using hexa values - not as a (rather pointless) binary bitfield. Partially reused Linus's initial fix for this. Reported-and-Tested-by: Yinghai Lu Signed-off-by: Yinghai Lu Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <4A4C43BC.90506@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4d0216fcb36c..8fd1efb5a0bd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1716,25 +1716,19 @@ __apicdebuginit(void) print_IO_APIC(void) return; } -__apicdebuginit(void) print_APIC_bitfield(int base) +__apicdebuginit(void) print_APIC_field(int base) { - unsigned int v; - int i, j; + int i; if (apic_verbosity == APIC_QUIET) return; - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ -- cgit v1.2.3 From 7a6a3a086fe60bb3b739f66e47dc5bbc8f530d32 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 2 Jul 2009 12:23:23 +0200 Subject: amd-iommu: handle alias entries correctly in init code An alias entry in the ACPI table means that the device can send requests to the IOMMU with both device ids, its own and the alias. This is not handled properly in the ACPI init code. This patch fixes the issue. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 10b2accd12ea..ec72c7779d6b 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -691,6 +691,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, devid = e->devid; devid_to = e->ext >> 8; + set_dev_entry_from_acpi(iommu, devid , e->flags, 0); set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; @@ -749,11 +750,13 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { - if (alias) + if (alias) { amd_iommu_alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi(iommu, - amd_iommu_alias_table[dev_i], - flags, ext_flags); + set_dev_entry_from_acpi(iommu, + devid_to, flags, ext_flags); + } + set_dev_entry_from_acpi(iommu, dev_i, + flags, ext_flags); } break; default: -- cgit v1.2.3 From 1bc6f83813441d15a74dfa97966fb68fa1bdec76 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 2 Jul 2009 18:32:05 +0200 Subject: amd-iommu: set evt_buf_size correctly The setting of this variable got lost during the suspend/resume implementation. But keeping this variable zero causes a divide-by-zero error in the interrupt handler. This patch fixes this. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index ec72c7779d6b..c1b17e97252e 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -472,6 +472,8 @@ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) if (iommu->evt_buf == NULL) return NULL; + iommu->evt_buf_size = EVT_BUFFER_SIZE; + return iommu->evt_buf; } -- cgit v1.2.3 From bbf2a330d92c5afccfd17592ba9ccd50f41cf748 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jul 2009 00:08:26 +0200 Subject: x86: atomic64: The atomic64_t data type should be 8 bytes aligned on 32-bit too Locked instructions on two cache lines at once are painful. If atomic64_t uses two cache lines, my test program is 10x slower. The chance for that is significant: 4/32 or 12.5%. Make sure an atomic64_t is 8 bytes aligned. 
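A minimal userspace sketch of the difference (illustrative only, the struct names are made up; build 32-bit, e.g. gcc -m32): the i386 ABI only guarantees 4-byte alignment for an unsigned long long structure member, so without an explicit attribute the 8-byte counter can start at offset 4 and straddle a 64-byte cache line.

#include <stdio.h>
#include <stddef.h>

struct ctr_default {
	int flag;
	unsigned long long counter;	/* may land on a 4-byte boundary */
};

struct ctr_aligned {
	int flag;
	unsigned long long counter __attribute__((aligned(8)));
};

int main(void)
{
	printf("default offset: %zu\n", offsetof(struct ctr_default, counter));
	printf("aligned offset: %zu\n", offsetof(struct ctr_aligned, counter));
	return 0;
}

On a 32-bit build the first offset typically comes out as 4; a counter that happens to sit 4 bytes before a 64-byte boundary spans two cache lines, and a locked cmpxchg8b on it has to lock both.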
Signed-off-by: Eric Dumazet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: [ changed it to __aligned(8) as per Andrew's suggestion ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index 2503d4e64c2a..ae0fbb5b0578 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -250,7 +250,7 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) /* An 64bit atomic type */ typedef struct { - unsigned long long counter; + unsigned long long __aligned(8) counter; } atomic64_t; #define ATOMIC64_INIT(val) { (val) } -- cgit v1.2.3 From b7882b7c65abb00194bdb3d4a22d27d70fcc59ba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 13:26:39 +0200 Subject: x86: atomic64: Move the 32-bit atomic64_t implementation to a .c file Linus noted that the atomic64_t primitives are all inlines currently which is crazy because these functions have a large register footprint anyway. Move them to a separate file: arch/x86/lib/atomic64_32.c Also, while at it, rename all uses of 'unsigned long long' to the much shorter u64. This makes the appearance of the prototypes a lot nicer - and it also uncovered a few bugs where (yet unused) API variants had 'long' as their return type instead of u64. [ More intrusive changes are not yet done in this patch. ] Reported-by: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 137 ++++--------------------- arch/x86/lib/Makefile | 1 + arch/x86/lib/atomic64_32.c | 216 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 237 insertions(+), 117 deletions(-) create mode 100644 arch/x86/lib/atomic64_32.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index ae0fbb5b0578..311a43e47c0b 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -250,7 +250,7 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) /* An 64bit atomic type */ typedef struct { - unsigned long long __aligned(8) counter; + u64 __aligned(8) counter; } atomic64_t; #define ATOMIC64_INIT(val) { (val) } @@ -264,31 +264,7 @@ typedef struct { */ #define __atomic64_read(ptr) ((ptr)->counter) -static inline unsigned long long -cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new) -{ - asm volatile( - - LOCK_PREFIX "cmpxchg8b (%[ptr])\n" - - : "=A" (old) - - : [ptr] "D" (ptr), - "A" (old), - "b" (ll_low(new)), - "c" (ll_high(new)) - - : "memory"); - - return old; -} - -static inline unsigned long long -atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, - unsigned long long new_val) -{ - return cmpxchg8b(&ptr->counter, old_val, new_val); -} +extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); /** * atomic64_xchg - xchg atomic64 variable @@ -298,18 +274,7 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, * Atomically xchgs the value of @ptr to @new_val and returns * the old value. 
*/ - -static inline unsigned long long -atomic64_xchg(atomic64_t *ptr, unsigned long long new_val) -{ - unsigned long long old_val; - - do { - old_val = atomic_read(ptr); - } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); - - return old_val; -} +extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val); /** * atomic64_set - set atomic64 variable @@ -318,10 +283,7 @@ atomic64_xchg(atomic64_t *ptr, unsigned long long new_val) * * Atomically sets the value of @ptr to @new_val. */ -static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) -{ - atomic64_xchg(ptr, new_val); -} +extern void atomic64_set(atomic64_t *ptr, u64 new_val); /** * atomic64_read - read atomic64 variable @@ -329,16 +291,7 @@ static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) * * Atomically reads the value of @ptr and returns it. */ -static inline unsigned long long atomic64_read(atomic64_t *ptr) -{ - unsigned long long curr_val; - - do { - curr_val = __atomic64_read(ptr); - } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); - - return curr_val; -} +extern u64 atomic64_read(atomic64_t *ptr); /** * atomic64_add_return - add and return @@ -347,34 +300,14 @@ static inline unsigned long long atomic64_read(atomic64_t *ptr) * * Atomically adds @delta to @ptr and returns @delta + *@ptr */ -static inline unsigned long long -atomic64_add_return(unsigned long long delta, atomic64_t *ptr) -{ - unsigned long long old_val, new_val; - - do { - old_val = atomic_read(ptr); - new_val = old_val + delta; - - } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); - - return new_val; -} - -static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr) -{ - return atomic64_add_return(-delta, ptr); -} +extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr); -static inline long atomic64_inc_return(atomic64_t *ptr) -{ - return atomic64_add_return(1, ptr); -} - -static inline long atomic64_dec_return(atomic64_t *ptr) -{ - return atomic64_sub_return(1, ptr); -} +/* + * Other variants with different arithmetic operators: + */ +extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr); +extern u64 atomic64_inc_return(atomic64_t *ptr); +extern u64 atomic64_dec_return(atomic64_t *ptr); /** * atomic64_add - add integer to atomic64 variable @@ -383,10 +316,7 @@ static inline long atomic64_dec_return(atomic64_t *ptr) * * Atomically adds @delta to @ptr. */ -static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) -{ - atomic64_add_return(delta, ptr); -} +extern void atomic64_add(u64 delta, atomic64_t *ptr); /** * atomic64_sub - subtract the atomic64 variable @@ -395,10 +325,7 @@ static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) * * Atomically subtracts @delta from @ptr. */ -static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) -{ - atomic64_add(-delta, ptr); -} +extern void atomic64_sub(u64 delta, atomic64_t *ptr); /** * atomic64_sub_and_test - subtract value from variable and test result @@ -409,13 +336,7 @@ static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) * true if the result is zero, or false for all * other cases. 
*/ -static inline int -atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) -{ - unsigned long long old_val = atomic64_sub_return(delta, ptr); - - return old_val == 0; -} +extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr); /** * atomic64_inc - increment atomic64 variable @@ -423,10 +344,7 @@ atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) * * Atomically increments @ptr by 1. */ -static inline void atomic64_inc(atomic64_t *ptr) -{ - atomic64_add(1, ptr); -} +extern void atomic64_inc(atomic64_t *ptr); /** * atomic64_dec - decrement atomic64 variable @@ -434,10 +352,7 @@ static inline void atomic64_inc(atomic64_t *ptr) * * Atomically decrements @ptr by 1. */ -static inline void atomic64_dec(atomic64_t *ptr) -{ - atomic64_sub(1, ptr); -} +extern void atomic64_dec(atomic64_t *ptr); /** * atomic64_dec_and_test - decrement and test @@ -447,10 +362,7 @@ static inline void atomic64_dec(atomic64_t *ptr) * returns true if the result is 0, or false for all other * cases. */ -static inline int atomic64_dec_and_test(atomic64_t *ptr) -{ - return atomic64_sub_and_test(1, ptr); -} +extern int atomic64_dec_and_test(atomic64_t *ptr); /** * atomic64_inc_and_test - increment and test @@ -460,10 +372,7 @@ static inline int atomic64_dec_and_test(atomic64_t *ptr) * and returns true if the result is zero, or false for all * other cases. */ -static inline int atomic64_inc_and_test(atomic64_t *ptr) -{ - return atomic64_sub_and_test(-1, ptr); -} +extern int atomic64_inc_and_test(atomic64_t *ptr); /** * atomic64_add_negative - add and test if negative @@ -474,13 +383,7 @@ static inline int atomic64_inc_and_test(atomic64_t *ptr) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int -atomic64_add_negative(unsigned long long delta, atomic64_t *ptr) -{ - long long old_val = atomic64_add_return(delta, ptr); - - return old_val < 0; -} +extern int atomic64_add_negative(u64 delta, atomic64_t *ptr); #include #endif /* _ASM_X86_ATOMIC_32_H */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f9d35632666b..c3c657c8bb83 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -10,6 +10,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o ifeq ($(CONFIG_X86_32),y) + lib-y += atomic64_32.o lib-y += checksum_32.o lib-y += strstr_32.o lib-y += semaphore_32.o string_32.o diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c new file mode 100644 index 000000000000..d21e725d3d84 --- /dev/null +++ b/arch/x86/lib/atomic64_32.c @@ -0,0 +1,216 @@ +#include +#include +#include +#include +#include + +static inline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) +{ + asm volatile( + + LOCK_PREFIX "cmpxchg8b (%[ptr])\n" + + : "=A" (old) + + : [ptr] "D" (ptr), + "A" (old), + "b" (ll_low(new)), + "c" (ll_high(new)) + + : "memory"); + + return old; +} + +u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val) +{ + return cmpxchg8b(&ptr->counter, old_val, new_val); +} + +/** + * atomic64_xchg - xchg atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically xchgs the value of @ptr to @new_val and returns + * the old value. 
+ */ + +u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) +{ + u64 old_val; + + do { + old_val = atomic_read(ptr); + } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + + return old_val; +} + +/** + * atomic64_set - set atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically sets the value of @ptr to @new_val. + */ +void atomic64_set(atomic64_t *ptr, u64 new_val) +{ + atomic64_xchg(ptr, new_val); +} + +/** + * atomic64_read - read atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically reads the value of @ptr and returns it. + */ +u64 atomic64_read(atomic64_t *ptr) +{ + u64 curr_val; + + do { + curr_val = __atomic64_read(ptr); + } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); + + return curr_val; +} + +/** + * atomic64_add_return - add and return + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns @delta + *@ptr + */ +u64 atomic64_add_return(u64 delta, atomic64_t *ptr) +{ + u64 old_val, new_val; + + do { + old_val = atomic_read(ptr); + new_val = old_val + delta; + + } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + + return new_val; +} + +u64 atomic64_sub_return(u64 delta, atomic64_t *ptr) +{ + return atomic64_add_return(-delta, ptr); +} + +u64 atomic64_inc_return(atomic64_t *ptr) +{ + return atomic64_add_return(1, ptr); +} + +u64 atomic64_dec_return(atomic64_t *ptr) +{ + return atomic64_sub_return(1, ptr); +} + +/** + * atomic64_add - add integer to atomic64 variable + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr. + */ +void atomic64_add(u64 delta, atomic64_t *ptr) +{ + atomic64_add_return(delta, ptr); +} + +/** + * atomic64_sub - subtract the atomic64 variable + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr. + */ +void atomic64_sub(u64 delta, atomic64_t *ptr) +{ + atomic64_add(-delta, ptr); +} + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr and returns + * true if the result is zero, or false for all + * other cases. + */ +int atomic64_sub_and_test(u64 delta, atomic64_t *ptr) +{ + u64 old_val = atomic64_sub_return(delta, ptr); + + return old_val == 0; +} + +/** + * atomic64_inc - increment atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1. + */ +void atomic64_inc(atomic64_t *ptr) +{ + atomic64_add(1, ptr); +} + +/** + * atomic64_dec - decrement atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1. + */ +void atomic64_dec(atomic64_t *ptr) +{ + atomic64_sub(1, ptr); +} + +/** + * atomic64_dec_and_test - decrement and test + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +int atomic64_dec_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(1, ptr); +} + +/** + * atomic64_inc_and_test - increment and test + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1 + * and returns true if the result is zero, or false for all + * other cases. 
+ */ +int atomic64_inc_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(-1, ptr); +} + +/** + * atomic64_add_negative - add and test if negative + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +int atomic64_add_negative(u64 delta, atomic64_t *ptr) +{ + long long old_val = atomic64_add_return(delta, ptr); + + return old_val < 0; +} -- cgit v1.2.3 From aacf682fd8c66b57383c407eecd9d4a28264ee91 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jul 2009 12:14:27 +0200 Subject: x86: atomic64: Improve atomic64_read() Linus noticed that the 32-bit version of atomic64_read() was being overly complex with re-reading the value and doing a retry loop over that. Instead we can just rely on cmpxchg8b returning either the new value or returning the current value. We can use any 'old' value, which will be faster as it can be loaded via immediates. Using some value that is not equal to the real value in memory the instruction gets faster. This also has the advantage that the CPU could avoid dirtying the cacheline. Reported-by: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index d21e725d3d84..afa5d444918b 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -67,13 +67,9 @@ void atomic64_set(atomic64_t *ptr, u64 new_val) */ u64 atomic64_read(atomic64_t *ptr) { - u64 curr_val; + u64 old = 1LL << 32; - do { - curr_val = __atomic64_read(ptr); - } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); - - return curr_val; + return cmpxchg8b(&ptr->counter, old, old); } /** -- cgit v1.2.3 From 69237f94e65d3d7f539f1adb98ef68685c595004 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jul 2009 13:26:41 +0200 Subject: x86: atomic64: Improve cmpxchg8b() Rewrite cmpxchg8b() to not use %edi register but a generic "+m" constraint, to increase compiler freedom in code generation and possibly better code. 
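A standalone userspace sketch of both points (32-bit x86 only, e.g. gcc -m32; simplified, not the kernel code): the "+A"/"+m" constraints let the compiler pick the addressing mode instead of pinning the pointer in %edi, and a failed cmpxchg8b hands back the current memory value, which is what allows atomic64_read() to pass an arbitrary 'old' guess.

#include <stdio.h>

static unsigned long long
cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
{
	unsigned int low = new, high = new >> 32;

	asm volatile("lock; cmpxchg8b %1"
		     : "+A" (old), "+m" (*ptr)
		     : "b" (low), "c" (high));

	return old;	/* on failure: the current value of *ptr */
}

int main(void)
{
	unsigned long long v = 0x1122334455667788ULL;
	unsigned long long guess = 1ULL << 32;	/* deliberately wrong */

	/* The compare fails, memory is left untouched, and the real
	 * value comes back - an atomic 64-bit read in one instruction. */
	printf("read back: 0x%llx\n", cmpxchg8b(&v, guess, guess));
	printf("unchanged: 0x%llx\n", v);
	return 0;
}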
Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index afa5d444918b..5fc1e2caa544 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -6,19 +6,14 @@ static inline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) { - asm volatile( - - LOCK_PREFIX "cmpxchg8b (%[ptr])\n" - - : "=A" (old) - - : [ptr] "D" (ptr), - "A" (old), - "b" (ll_low(new)), - "c" (ll_high(new)) - - : "memory"); + u32 low = new; + u32 high = new >> 32; + asm volatile( + LOCK_PREFIX "cmpxchg8b %1\n" + : "+A" (old), "+m" (*ptr) + : "b" (low), "c" (high) + ); return old; } -- cgit v1.2.3 From 824975ef190e7dcb77718d1cc2cb53769b16d918 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 12:39:07 +0200 Subject: x86: atomic64: Improve atomic64_add_return() Linus noted (based on Eric Dumazet's numbers) that we would probably be better off not trying an atomic_read() in atomic64_add_return() but intead intentionally let the first cmpxchg8b fail - to get a cache-friendly 'give me ownership of this cacheline' transaction. That can then be followed by the real cmpxchg8b which sets the value local to the CPU. Reported-by: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 5fc1e2caa544..61959627e1e1 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -76,13 +76,22 @@ u64 atomic64_read(atomic64_t *ptr) */ u64 atomic64_add_return(u64 delta, atomic64_t *ptr) { - u64 old_val, new_val; + /* + * Try first with a (probably incorrect) assumption about + * what we have there. We'll do two loops most likely, + * but we'll get an ownership MESI transaction straight away + * instead of a read transaction followed by a + * flush-for-ownership transaction: + */ + u64 old_val, new_val, real_val = 1ULL << 32; do { - old_val = atomic_read(ptr); + old_val = real_val; new_val = old_val + delta; - } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + real_val = atomic64_cmpxchg(ptr, old_val, new_val); + + } while (real_val != old_val); return new_val; } -- cgit v1.2.3 From 3ac805d2afd3fa4a07cb5bcf352fd7fa83f28935 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 12:51:19 +0200 Subject: x86: atomic64: Reduce size of functions cmpxchg8b is a huge instruction in terms of register footprint, we almost never want to inline it, not even within the same code module. 
GCC 4.3 still messes up for two functions, under-judging the true cost of this instruction - so annotate two key functions to reduce the bloat: arch/x86/lib/atomic64_32.o: text data bss dec hex filename 1763 0 0 1763 6e3 atomic64_32.o.before 435 0 0 435 1b3 atomic64_32.o.after Cc: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 61959627e1e1..a910238a7760 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -4,7 +4,7 @@ #include #include -static inline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) +static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) { u32 low = new; u32 high = new >> 32; @@ -74,7 +74,7 @@ u64 atomic64_read(atomic64_t *ptr) * * Atomically adds @delta to @ptr and returns @delta + *@ptr */ -u64 atomic64_add_return(u64 delta, atomic64_t *ptr) +noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr) { /* * Try first with a (probably incorrect) assumption about -- cgit v1.2.3 From 3217120873598533234b6dedda9c371ce30001d0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 13:06:01 +0200 Subject: x86: atomic64: Make atomic_read() type-safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linus noticed that atomic64_xchg() uses atomic_read(), which happens to work because atomic_read() is a macro so the .counter value gets u64-read on 32-bit too - but this is really bogus and serious bugs are waiting to happen. Change atomic_read() to be a type-safe inline, and this exposes the atomic64 bogosity as well: arch/x86/lib/atomic64_32.c: In function ‘atomic64_xchg’: arch/x86/lib/atomic64_32.c:39: warning: passing argument 1 of ‘atomic_read’ from incompatible pointer type Reported-by: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index 311a43e47c0b..b551bb1ae3cf 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -19,7 +19,10 @@ * * Atomically reads the value of @v. */ -#define atomic_read(v) ((v)->counter) +static inline int atomic_read(const atomic_t *v) +{ + return v->counter; +} /** * atomic_set - set atomic variable -- cgit v1.2.3 From 199e23780a7e75c63a9e3d1108804e3af450ea3e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 13:02:39 +0200 Subject: x86: atomic64: Fix unclean type use in atomic64_xchg() Linus noticed that atomic64_xchg() uses atomic_read(), which happens to work because atomic_read() is a macro so the .counter value gets u64-read on 32-bit too - but this is really bogus and serious bugs are waiting to happen. Fix atomic64_xchg() to use __atomic64_read() instead. 
No code changed: arch/x86/lib/atomic64_32.o: text data bss dec hex filename 435 0 0 435 1b3 atomic64_32.o.before 435 0 0 435 1b3 atomic64_32.o.after md5: bd8ab95e69c93518578bfaf0ea3be4d9 atomic64_32.o.before.asm bd8ab95e69c93518578bfaf0ea3be4d9 atomic64_32.o.after.asm Reported-by: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index a910238a7760..fd28fd3fb742 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -36,7 +36,7 @@ u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) u64 old_val; do { - old_val = atomic_read(ptr); + old_val = __atomic64_read(ptr); } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); return old_val; -- cgit v1.2.3 From 12b9d7ccb841805e347fec8f733f368f43ddba40 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 1 Jul 2009 17:37:22 -0400 Subject: x86: Fix fixmap page order for FIX_TEXT_POKE0,1 Masami reported: > Since the fixmap pages are assigned higher address to lower, > text_poke() has to use it with inverted order (FIX_TEXT_POKE1 > to FIX_TEXT_POKE0). I prefer to just invert the order of the fixmap declaration. It's simpler and more straightforward. Backward fixmaps seems to be used by both x86 32 and 64. It's really rare but a nasty bug, because it only hurts when instructions to patch are crossing a page boundary. If this happens, the fixmap write accesses will spill on the following fixmap, which may very well crash the system. And this does not crash the system, it could leave illegal instructions in place. Thanks Masami for finding this. It seems to have crept into the 2.6.30-rc series, so this calls for a -stable inclusion. Signed-off-by: Mathieu Desnoyers Acked-by: Masami Hiramatsu Cc: LKML-Reference: <20090701213722.GH19926@Krystal> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 3eb0f79a5320..7b2d71df39a6 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -111,8 +111,8 @@ enum fixed_addresses { #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #endif - FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ - FIX_TEXT_POKE1, + FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ + FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ __end_of_permanent_fixed_addresses, /* * 256 temporary boot-time mappings, used by early_ioremap(), -- cgit v1.2.3 From 3fd382cedfb4fb742a8cc17ba15288ec03131db1 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 2 Jul 2009 07:27:20 +0200 Subject: x86: Add missing annotation to arch/x86/lib/copy_user_64.S::copy_to_user While examining symbol generation in perf_counter tools, I noticed that copy_to_user() had no size in vmlinux's symtab. 
Signed-off-by: Mike Galbraith Acked-by: Alexander van Heukelum Acked-by: Cyrill Gorcunov LKML-Reference: <1246512440.13293.3.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- arch/x86/lib/copy_user_64.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index f118c110af32..6ba0f7bb85ea 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -75,6 +75,7 @@ ENTRY(copy_to_user) jae bad_to_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC +ENDPROC(copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(copy_from_user) -- cgit v1.2.3 From d3ac88157c1e9153f37cd2b9313117ae11735063 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Thu, 2 Jul 2009 11:40:36 +0600 Subject: x86, kvm: Fix section mismatches in kvm.c The function paravirt_ops_setup() has been refering the variable no_timer_check, which is a __initdata. Thus generates the following warning. paravirt_ops_setup() function is called from kvm_guest_init() which is a __init function. So to fix this we mark paravirt_ops_setup as __init. The sections-check output that warned us about this was: LD arch/x86/built-in.o WARNING: arch/x86/built-in.o(.text+0x166ce): Section mismatch in reference from the function paravirt_ops_setup() to the variable .init.data:no_timer_check The function paravirt_ops_setup() references the variable __initdata no_timer_check. This is often because paravirt_ops_setup lacks a __initdata annotation or the annotation of no_timer_check is wrong. Signed-off-by: Rakib Mullick Acked-by: Avi Kivity Cc: Andrew Morton LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a78ecad0c900..c664d515f613 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -200,7 +200,7 @@ static void kvm_leave_lazy_mmu(void) state->mode = paravirt_get_lazy_mode(); } -static void paravirt_ops_setup(void) +static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; pv_info.paravirt_enabled = 1; -- cgit v1.2.3 From 23d0cd8e718723f1ddda37637bc6b7c34caec64a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 2 Jul 2009 22:33:59 +0530 Subject: x86: Remove unused variable disable_x2apic setup_nox2apic() is writing 1 to disable_x2apic but no one is reading it. 
Signed-off-by: Jaswinder Singh Rajput Cc: Yinghai Lu Cc: Cyrill Gorcunov LKML-Reference: <1246554239.2242.27.camel@jaswinder.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8c7c042ecad1..0a1c2830ec66 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -140,7 +140,6 @@ int x2apic_mode; #ifdef CONFIG_X86_X2APIC /* x2apic enabled before OS handover */ static int x2apic_preenabled; -static int disable_x2apic; static __init int setup_nox2apic(char *str) { if (x2apic_enabled()) { @@ -149,7 +148,6 @@ static __init int setup_nox2apic(char *str) return 0; } - disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; } -- cgit v1.2.3 From c7210e1ff8a95a0a6dba0d04a95b3ddd9d165fab Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 2 Jul 2009 22:35:35 +0530 Subject: x86: Remove unused function lapic_watchdog_ok() lapic_watchdog_ok() is a global function but no one is using it. Signed-off-by: Jaswinder Singh Rajput Cc: Andi Kleen Cc: Yinghai Lu LKML-Reference: <1246554335.2242.29.camel@jaswinder.satnam> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 1 - arch/x86/kernel/cpu/perfctr-watchdog.c | 5 ----- 2 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c97264409934..c86e5ed4af51 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -72,7 +72,6 @@ void lapic_watchdog_stop(void); int lapic_watchdog_init(unsigned nmi_hz); int lapic_wd_event(unsigned nmi_hz); unsigned lapic_adjust_nmi_hz(unsigned hz); -int lapic_watchdog_ok(void); void disable_lapic_nmi_watchdog(void); void enable_lapic_nmi_watchdog(void); void stop_nmi(void); diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 5c481f6205bf..e60ed740d2b3 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -803,8 +803,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz) wd_ops->rearm(wd, nmi_hz); return 1; } - -int lapic_watchdog_ok(void) -{ - return wd_ops != NULL; -} -- cgit v1.2.3 From 8e049ef054f1cc765f05f13e1396bb9a17c19e66 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 2 Jul 2009 08:57:12 +1000 Subject: x86: atomic64: Code atomic(64)_read and atomic(64)_set in C not CPP Occasionally we get bugs where atomic_read or atomic_set are used on atomic64_t variables or vice versa. These bugs don't generate warnings on x86 because atomic_read and atomic_set are coded as macros rather than C functions, so we don't get any type-checking on their arguments; similarly for atomic64_read and atomic64_set in 64-bit kernels. This converts them to C functions so that the arguments are type-checked and bugs like this will get caught more easily. It also converts atomic_cmpxchg and atomic_xchg, and atomic64_cmpxchg and atomic64_xchg on 64-bit, so we get type-checking on their arguments too. Compiling a typical 64-bit x86 config, this generates no new warnings, and the vmlinux text is 86 bytes smaller. 
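A small standalone illustration (simplified stand-in types, not the kernel headers): the macro form accepts anything with a .counter member, while the inline function makes the compiler check the pointer type.

#include <stdio.h>

typedef struct { int counter; } atomic_t;
typedef struct { long long counter; } atomic64_t;

/* Old style: a macro, no type checking at all. */
#define atomic_read_macro(v)	((v)->counter)

/* New style: an inline function, the argument type is checked. */
static inline int atomic_read_inline(const atomic_t *v)
{
	return v->counter;
}

int main(void)
{
	atomic_t v32 = { 1 };
	atomic64_t v64 = { 0x100000002LL };

	/* Mixing up the types compiles silently with the macro, and the
	 * 64-bit value is quietly truncated when assigned to an int: */
	int bogus = atomic_read_macro(&v64);

	/* The same mixup through the inline function would warn:
	 * "passing argument 1 ... from incompatible pointer type". */
	int good = atomic_read_inline(&v32);

	printf("bogus=%d good=%d\n", bogus, good);
	return 0;
}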
Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 16 ++++++++++++--- arch/x86/include/asm/atomic_64.h | 42 ++++++++++++++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index b551bb1ae3cf..aa045deb2e75 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -31,7 +31,10 @@ static inline int atomic_read(const atomic_t *v) * * Atomically sets the value of @v to @i. */ -#define atomic_set(v, i) (((v)->counter) = (i)) +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} /** * atomic_add - add integer to atomic variable @@ -203,8 +206,15 @@ static inline int atomic_sub_return(int i, atomic_t *v) return atomic_add_return(-i, v); } -#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) +static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline int atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} /** * atomic_add_unless - add unless the number is already a given value diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h index 0d6360220007..d605dc268e79 100644 --- a/arch/x86/include/asm/atomic_64.h +++ b/arch/x86/include/asm/atomic_64.h @@ -18,7 +18,10 @@ * * Atomically reads the value of @v. */ -#define atomic_read(v) ((v)->counter) +static inline int atomic_read(const atomic_t *v) +{ + return v->counter; +} /** * atomic_set - set atomic variable @@ -27,7 +30,10 @@ * * Atomically sets the value of @v to @i. */ -#define atomic_set(v, i) (((v)->counter) = (i)) +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} /** * atomic_add - add integer to atomic variable @@ -192,7 +198,10 @@ static inline int atomic_sub_return(int i, atomic_t *v) * Atomically reads the value of @v. * Doesn't imply a read memory barrier. */ -#define atomic64_read(v) ((v)->counter) +static inline long atomic64_read(const atomic64_t *v) +{ + return v->counter; +} /** * atomic64_set - set atomic64 variable @@ -201,7 +210,10 @@ static inline int atomic_sub_return(int i, atomic_t *v) * * Atomically sets the value of @v to @i. 
*/ -#define atomic64_set(v, i) (((v)->counter) = (i)) +static inline void atomic64_set(atomic64_t *v, long i) +{ + v->counter = i; +} /** * atomic64_add - add integer to atomic64 variable @@ -355,11 +367,25 @@ static inline long atomic64_sub_return(long i, atomic64_t *v) #define atomic64_inc_return(v) (atomic64_add_return(1, (v))) #define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) -#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) +static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline long atomic64_xchg(atomic64_t *v, long new) +{ + return xchg(&v->counter, new); +} -#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) +static inline long atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline long atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} /** * atomic_add_unless - add unless the number is a given value -- cgit v1.2.3 From 67d7178f8fc64b7f68d7dd8a1b21dfa0d42c220c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jul 2009 13:23:02 +0200 Subject: x86: atomic64: Improve atomic64_read() Optimize atomic64_read() as a special open-coded cmpxchg8b variant. This generates nicer code: arch/x86/lib/atomic64_32.o: text data bss dec hex filename 435 0 0 435 1b3 atomic64_32.o.before 431 0 0 431 1af atomic64_32.o.after md5: bd8ab95e69c93518578bfaf0ea3be4d9 atomic64_32.o.before.asm 2bdfd4bd1f6b7b61b7fc127aef90ce3b atomic64_32.o.after.asm Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index fd28fd3fb742..cd11803f9448 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -62,9 +62,17 @@ void atomic64_set(atomic64_t *ptr, u64 new_val) */ u64 atomic64_read(atomic64_t *ptr) { - u64 old = 1LL << 32; + u64 res; - return cmpxchg8b(&ptr->counter, old, old); + asm volatile( + "mov %%ebx, %%eax\n\t" + "mov %%ecx, %%edx\n\t" + LOCK_PREFIX "cmpxchg8b %1\n" + : "+A" (res) + : "m" (*ptr) + ); + + return res; } /** -- cgit v1.2.3 From 1fde902d52ee13ab9fab155bbae757fdf7daf0c1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 17:28:57 +0200 Subject: x86: atomic64: Export APIs to modules atomic64_t primitives are used by a handful of drivers, so export the APIs consistently. These were inlined before. Also mark atomic64_32.o a core object, so that the symbols are available even if not linked to core kernel pieces. 
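A hypothetical out-of-tree module (illustrative boilerplate only, not part of this series) showing why the exports matter: now that the 32-bit atomic64 operations are out-of-line functions in atomic64_32.o, a module like this only links if those symbols are EXPORT_SYMBOL()ed.

#include <linux/module.h>
#include <linux/kernel.h>
#include <asm/atomic.h>

static atomic64_t demo_ctr = ATOMIC64_INIT(0);

static int __init atomic64_demo_init(void)
{
	atomic64_add(42, &demo_ctr);		/* resolved via the export */
	printk(KERN_INFO "atomic64 demo: %lld\n",
	       (long long)atomic64_read(&demo_ctr));
	return 0;
}

static void __exit atomic64_demo_exit(void)
{
	atomic64_dec(&demo_ctr);
}

module_init(atomic64_demo_init);
module_exit(atomic64_demo_exit);
MODULE_LICENSE("GPL");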
Cc: Eric Dumazet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/Makefile | 2 +- arch/x86/lib/atomic64_32.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index c3c657c8bb83..07c31899c9c2 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -10,7 +10,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o ifeq ($(CONFIG_X86_32),y) - lib-y += atomic64_32.o + obj-y += atomic64_32.o lib-y += checksum_32.o lib-y += strstr_32.o lib-y += semaphore_32.o string_32.o diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index cd11803f9448..6722a092e407 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -1,5 +1,7 @@ #include +#include #include + #include #include #include @@ -21,6 +23,7 @@ u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val) { return cmpxchg8b(&ptr->counter, old_val, new_val); } +EXPORT_SYMBOL(atomic64_cmpxchg); /** * atomic64_xchg - xchg atomic64 variable @@ -41,6 +44,7 @@ u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) return old_val; } +EXPORT_SYMBOL(atomic64_xchg); /** * atomic64_set - set atomic64 variable @@ -53,6 +57,7 @@ void atomic64_set(atomic64_t *ptr, u64 new_val) { atomic64_xchg(ptr, new_val); } +EXPORT_SYMBOL(atomic64_read); /** * atomic64_read - read atomic64 variable @@ -74,6 +79,7 @@ u64 atomic64_read(atomic64_t *ptr) return res; } +EXPORT_SYMBOL(atomic64_read); /** * atomic64_add_return - add and return @@ -103,21 +109,25 @@ noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr) return new_val; } +EXPORT_SYMBOL(atomic64_add_return); u64 atomic64_sub_return(u64 delta, atomic64_t *ptr) { return atomic64_add_return(-delta, ptr); } +EXPORT_SYMBOL(atomic64_sub_return); u64 atomic64_inc_return(atomic64_t *ptr) { return atomic64_add_return(1, ptr); } +EXPORT_SYMBOL(atomic64_inc_return); u64 atomic64_dec_return(atomic64_t *ptr) { return atomic64_sub_return(1, ptr); } +EXPORT_SYMBOL(atomic64_dec_return); /** * atomic64_add - add integer to atomic64 variable @@ -130,6 +140,7 @@ void atomic64_add(u64 delta, atomic64_t *ptr) { atomic64_add_return(delta, ptr); } +EXPORT_SYMBOL(atomic64_add); /** * atomic64_sub - subtract the atomic64 variable @@ -142,6 +153,7 @@ void atomic64_sub(u64 delta, atomic64_t *ptr) { atomic64_add(-delta, ptr); } +EXPORT_SYMBOL(atomic64_sub); /** * atomic64_sub_and_test - subtract value from variable and test result @@ -158,6 +170,7 @@ int atomic64_sub_and_test(u64 delta, atomic64_t *ptr) return old_val == 0; } +EXPORT_SYMBOL(atomic64_sub_and_test); /** * atomic64_inc - increment atomic64 variable @@ -169,6 +182,7 @@ void atomic64_inc(atomic64_t *ptr) { atomic64_add(1, ptr); } +EXPORT_SYMBOL(atomic64_inc); /** * atomic64_dec - decrement atomic64 variable @@ -180,6 +194,7 @@ void atomic64_dec(atomic64_t *ptr) { atomic64_sub(1, ptr); } +EXPORT_SYMBOL(atomic64_dec); /** * atomic64_dec_and_test - decrement and test @@ -193,6 +208,7 @@ int atomic64_dec_and_test(atomic64_t *ptr) { return atomic64_sub_and_test(1, ptr); } +EXPORT_SYMBOL(atomic64_dec_and_test); /** * atomic64_inc_and_test - increment and test @@ -206,6 +222,7 @@ int atomic64_inc_and_test(atomic64_t *ptr) { return atomic64_sub_and_test(-1, ptr); } +EXPORT_SYMBOL(atomic64_inc_and_test); /** * 
atomic64_add_negative - add and test if negative @@ -222,3 +239,4 @@ int atomic64_add_negative(u64 delta, atomic64_t *ptr) return old_val < 0; } +EXPORT_SYMBOL(atomic64_add_negative); -- cgit v1.2.3 From 3a8d1788b37435baf6c296f4ea8beb4fa4955f44 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 19:56:36 +0200 Subject: x86: atomic64: Improve atomic64_xchg() Remove the read-first logic from atomic64_xchg() and simplify the loop. This function was the last user of __atomic64_read() - remove it. Also, change the 'real_val' assumption from the somewhat quirky 1ULL << 32 value to the (just as arbitrary, but simpler) value of 0. Reported-by: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 9 --------- arch/x86/lib/atomic64_32.c | 21 +++++++++++++++------ 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index aa045deb2e75..d7c8849b8c67 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -268,15 +268,6 @@ typedef struct { #define ATOMIC64_INIT(val) { (val) } -/** - * atomic64_read - read atomic64 variable - * @ptr: pointer of type atomic64_t - * - * Atomically reads the value of @v. - * Doesn't imply a read memory barrier. - */ -#define __atomic64_read(ptr) ((ptr)->counter) - extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); /** diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 6722a092e407..a804f96e90e2 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -33,14 +33,23 @@ EXPORT_SYMBOL(atomic64_cmpxchg); * Atomically xchgs the value of @ptr to @new_val and returns * the old value. */ - u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) { - u64 old_val; + /* + * Try first with a (possibly incorrect) assumption about + * what we have there. We'll do two loops most likely, + * but we'll get an ownership MESI transaction straight away + * instead of a read transaction followed by a + * flush-for-ownership transaction: + */ + u64 old_val, real_val = 0; do { - old_val = __atomic64_read(ptr); - } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + old_val = real_val; + + real_val = atomic64_cmpxchg(ptr, old_val, new_val); + + } while (real_val != old_val); return old_val; } @@ -91,13 +100,13 @@ EXPORT_SYMBOL(atomic64_read); noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr) { /* - * Try first with a (probably incorrect) assumption about + * Try first with a (possibly incorrect) assumption about * what we have there. We'll do two loops most likely, * but we'll get an ownership MESI transaction straight away * instead of a read transaction followed by a * flush-for-ownership transaction: */ - u64 old_val, new_val, real_val = 1ULL << 32; + u64 old_val, new_val, real_val = 0; do { old_val = real_val; -- cgit v1.2.3 From ddf9a003d32f720805ac30bcc15755e9289073de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 20:11:30 +0200 Subject: x86: atomic64: Clean up atomic64_sub_and_test() and atomic64_add_negative() Linus noticed that the variable name 'old_val' is confusingly named in these functions - the correct naming is 'new_val'. 
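The xchg rework above relies on cmpxchg reporting the value it actually found when the compare fails, so the loop can start from an arbitrary guess (0) instead of doing a plain read first and pulling the cache line in shared state before taking it exclusive. A rough userspace analogue of that pattern, using a GCC builtin rather than the kernel primitive:

#include <stdint.h>

/* Atomically exchange *ptr for new_val and return the previous value. */
static uint64_t xchg64(uint64_t *ptr, uint64_t new_val)
{
	uint64_t old_val, real_val = 0;		/* arbitrary first guess */

	do {
		old_val = real_val;
		/* A failed compare-and-swap hands back the value currently
		 * in *ptr, so no separate read pass is needed. */
		real_val = __sync_val_compare_and_swap(ptr, old_val, new_val);
	} while (real_val != old_val);

	return old_val;
}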
Reported-by: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index a804f96e90e2..1d98c9eb6eac 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -175,9 +175,9 @@ EXPORT_SYMBOL(atomic64_sub); */ int atomic64_sub_and_test(u64 delta, atomic64_t *ptr) { - u64 old_val = atomic64_sub_return(delta, ptr); + u64 new_val = atomic64_sub_return(delta, ptr); - return old_val == 0; + return new_val == 0; } EXPORT_SYMBOL(atomic64_sub_and_test); @@ -244,8 +244,8 @@ EXPORT_SYMBOL(atomic64_inc_and_test); */ int atomic64_add_negative(u64 delta, atomic64_t *ptr) { - long long old_val = atomic64_add_return(delta, ptr); + s64 new_val = atomic64_add_return(delta, ptr); - return old_val < 0; + return new_val < 0; } EXPORT_SYMBOL(atomic64_add_negative); -- cgit v1.2.3 From a79f0da80a508448434476b77f9d3d1a469eab67 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jul 2009 16:50:10 +0200 Subject: x86: atomic64: Inline atomic64_read() again Now atomic64_read() is light weight (no register pressure and small icache), we can inline it again. Also use "=&A" constraint instead of "+A" to avoid warning about unitialized 'res' variable. (gcc had to force 0 in eax/edx) $ size vmlinux.prev vmlinux.after text data bss dec hex filename 4908667 451676 1684868 7045211 6b805b vmlinux.prev 4908651 451676 1684868 7045195 6b804b vmlinux.after Signed-off-by: Eric Dumazet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: <4A4E1AA2.30002@gmail.com> [ Also fix typo in atomic64_set() export ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 22 ++++++++++++++++++++++ arch/x86/lib/atomic64_32.c | 23 +---------------------- 2 files changed, 23 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index d7c8849b8c67..dc5a667ff791 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -295,6 +295,28 @@ extern void atomic64_set(atomic64_t *ptr, u64 new_val); * * Atomically reads the value of @ptr and returns it. */ +static inline u64 atomic64_read(atomic64_t *ptr) +{ + u64 res; + + /* + * Note, we inline this atomic64_t primitive because + * it only clobbers EAX/EDX and leaves the others + * untouched. 
We also (somewhat subtly) rely on the + * fact that cmpxchg8b returns the current 64-bit value + * of the memory location we are touching: + */ + asm volatile( + "mov %%ebx, %%eax\n\t" + "mov %%ecx, %%edx\n\t" + LOCK_PREFIX "cmpxchg8b %1\n" + : "=&A" (res) + : "m" (*ptr) + ); + + return res; +} + extern u64 atomic64_read(atomic64_t *ptr); /** diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 1d98c9eb6eac..824fa0be55a3 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -66,31 +66,10 @@ void atomic64_set(atomic64_t *ptr, u64 new_val) { atomic64_xchg(ptr, new_val); } -EXPORT_SYMBOL(atomic64_read); +EXPORT_SYMBOL(atomic64_set); /** - * atomic64_read - read atomic64 variable - * @ptr: pointer to type atomic64_t - * - * Atomically reads the value of @ptr and returns it. - */ -u64 atomic64_read(atomic64_t *ptr) -{ - u64 res; - - asm volatile( - "mov %%ebx, %%eax\n\t" - "mov %%ecx, %%edx\n\t" - LOCK_PREFIX "cmpxchg8b %1\n" - : "+A" (res) - : "m" (*ptr) - ); - - return res; -} EXPORT_SYMBOL(atomic64_read); - -/** * atomic64_add_return - add and return * @delta: integer value to add * @ptr: pointer to type atomic64_t -- cgit v1.2.3 From 62edf5dc4a524e4cb525e6020b955a1ad593d9ba Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 4 Jul 2009 10:59:46 +0100 Subject: intel-iommu: Restore DMAR_BROKEN_GFX_WA option for broken graphics drivers We need to give people a little more time to fix the broken drivers. Re-introduce this, but tied in properly with the 'iommu=pt' support this time. Change the config option name and make it default to 'no' too. Signed-off-by: David Woodhouse --- arch/x86/Kconfig | 12 ++++++++++++ drivers/pci/intel-iommu.c | 29 +++++++++++++++++++---------- 2 files changed, 31 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c07f72205909..738bdc6b0f8b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1913,6 +1913,18 @@ config DMAR_DEFAULT_ON recommended you say N here while the DMAR code remains experimental. +config DMAR_BROKEN_GFX_WA + def_bool n + prompt "Workaround broken graphics drivers (going away soon)" + depends on DMAR + ---help--- + Current Graphics drivers tend to use physical address + for DMA and avoid using DMA APIs. Setting this config + option permits the IOMMU driver to set a unity map for + all the OS-visible memory. Hence the driver can continue + to use physical addresses for DMA, at least until this + option is removed in the 2.6.32 kernel. 
+ config DMAR_FLOPPY_WA def_bool y depends on DMAR diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index ae5ccdf8b19f..5ee8305257ea 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2127,16 +2127,18 @@ static int iommu_prepare_static_identity_mapping(void) return -EFAULT; for_each_pci_dev(pdev) { - printk(KERN_INFO "IOMMU: identity mapping for device %s\n", - pci_name(pdev)); + if (iommu_identity_mapping == 1 || IS_GFX_DEVICE(pdev)) { + printk(KERN_INFO "IOMMU: identity mapping for device %s\n", + pci_name(pdev)); - ret = domain_context_mapping(si_domain, pdev, - CONTEXT_TT_MULTI_LEVEL); - if (ret) - return ret; - ret = domain_add_dev_info(si_domain, pdev); - if (ret) - return ret; + ret = domain_context_mapping(si_domain, pdev, + CONTEXT_TT_MULTI_LEVEL); + if (ret) + return ret; + ret = domain_add_dev_info(si_domain, pdev); + if (ret) + return ret; + } } return 0; @@ -2291,6 +2293,10 @@ int __init init_dmars(void) * identity mapping if iommu_identity_mapping is set. */ if (!iommu_pass_through) { +#ifdef CONFIG_DMAR_BROKEN_GFX_WA + if (!iommu_identity_mapping) + iommu_identity_mapping = 2; +#endif if (iommu_identity_mapping) iommu_prepare_static_identity_mapping(); /* @@ -2444,7 +2450,10 @@ static int iommu_dummy(struct pci_dev *pdev) static int iommu_should_identity_map(struct pci_dev *pdev) { - return pdev->dma_mask > DMA_BIT_MASK(32); + if (iommu_identity_mapping == 2) + return IS_GFX_DEVICE(pdev); + else + return pdev->dma_mask > DMA_BIT_MASK(32); } /* Check if the pdev needs to go through non-identity map and unmap process.*/ -- cgit v1.2.3 From febe04de3be4bf66f9339d8847db2806d99fd164 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 1 Jul 2009 11:13:07 +0900 Subject: x86: fix usage of bios intcall() Some intcall() misuses the input biosregs as output in cf06de7b9cdd3efee7a59dced1977b3c21d43732 This fixes the problem vga=ask boot option doesn't show enough modes. Signed-off-by: Akinobu Mita LKML-Reference: <20090701021307.GA3127@localhost.localdomain> Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/video-bios.c | 3 +-- arch/x86/boot/video-vesa.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index d660be492363..49e0c18833e0 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -37,14 +37,13 @@ static int set_bios_mode(u8 mode) ireg.al = mode; /* AH=0x00 Set Video Mode */ intcall(0x10, &ireg, NULL); - ireg.ah = 0x0f; /* Get Current Video Mode */ intcall(0x10, &ireg, &oreg); do_restore = 1; /* Assume video contents were lost */ /* Not all BIOSes are clean with the top bit */ - new_mode = ireg.al & 0x7f; + new_mode = oreg.al & 0x7f; if (new_mode == mode) return 0; /* Mode change OK */ diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index c700147d6ffb..275dd177f198 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -45,7 +45,7 @@ static int vesa_probe(void) ireg.di = (size_t)&vginfo; intcall(0x10, &ireg, &oreg); - if (ireg.ax != 0x004f || + if (oreg.ax != 0x004f || vginfo.signature != VESA_MAGIC || vginfo.version < 0x0102) return 0; /* Not present */ @@ -70,7 +70,7 @@ static int vesa_probe(void) ireg.di = (size_t)&vminfo; intcall(0x10, &ireg, &oreg); - if (ireg.ax != 0x004f) + if (oreg.ax != 0x004f) continue; if ((vminfo.mode_attr & 0x15) == 0x05) { -- cgit v1.2.3 From f386c61fe1a1f36f0e434f1b577e6b112698caf7 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Sun, 5 Jul 2009 12:08:06 -0700 Subject: gcov: exclude code operating in userspace from profiling Fix for this issue on x86_64: rostedt@goodmis.org wrote: > On bootup of the latest kernel my init segfaults. Debugging it, > I found that vread_tsc (a vsyscall) increments some strange > kernel memory: > > 0000000000000000 : > 0: 55 push %rbp > 1: 48 ff 05 00 00 00 00 incq 0(%rip) > # 8 > 4: R_X86_64_PC32 .bss+0x3c > 8: 48 89 e5 mov %rsp,%rbp > b: 66 66 90 xchg %ax,%ax > e: 48 ff 05 00 00 00 00 incq 0(%rip) > # 15 > 11: R_X86_64_PC32 .bss+0x44 > 15: 66 66 90 xchg %ax,%ax > 18: 48 ff 05 00 00 00 00 incq 0(%rip) > # 1f > 1b: R_X86_64_PC32 .bss+0x4c > 1f: 0f 31 rdtsc > > > Those "incq" is very bad to happen in vsyscall memory, since > userspace can not modify it. You need to make something prevent > profiling of vsyscall memory (like I do with ftrace). Signed-off-by: Peter Oberparleiter Cc: Ingo Molnar Reported-by: Steven Rostedt Tested-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 6c327b852e23..430d5b24af7b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,6 +26,8 @@ CFLAGS_tsc.o := $(nostackp) CFLAGS_paravirt.o := $(nostackp) GCOV_PROFILE_vsyscall_64.o := n GCOV_PROFILE_hpet.o := n +GCOV_PROFILE_tsc.o := n +GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -- cgit v1.2.3 From a2e1b4c31257c07f148a89eb7eea7ca959fd0642 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Sun, 26 Jul 2009 10:55:25 -0500 Subject: [CPUFREQ] Powernow-k8: support family 0xf with 2 low p-states Provide support for family 0xf processors with 2 P-states below the elevator voltage. Remove the checks that prevent this configuration from being supported and increase the transition voltage to prevent errors during the transition. 
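In other words, where the driver used to reject low-to-low FID transitions outright, it now allows them and doubles the ramp-voltage-offset (RVO) step budget for that case. A stripped-down sketch of the decision (identifier names borrowed from the driver, logic simplified; not the driver code itself):

/* How many extra voltage steps to budget for a FID transition. */
static unsigned int rvo_step_budget(unsigned int rvo,
				    unsigned int currfid, unsigned int reqfid,
				    unsigned int lo_fid_table_top)
{
	unsigned int rvomult = 1;

	/* Both endpoints in the low FID table: ramp the voltage twice as far. */
	if (currfid < lo_fid_table_top && reqfid < lo_fid_table_top)
		rvomult = 2;

	return rvo * rvomult;
}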
Signed-off-by: Mark Langsdorf Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 30 +++++++++++------------------- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 3 ++- 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 81cbe64ed6b4..ae068f59603f 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -299,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate) static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid) { - if (core_voltage_pre_transition(data, reqvid)) + if (core_voltage_pre_transition(data, reqvid, reqfid)) return 1; if (core_frequency_transition(data, reqfid)) @@ -327,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data, /* Phase 1 - core voltage transition ... setup voltage */ static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid) + u32 reqvid, u32 reqfid) { u32 rvosteps = data->rvo; u32 savefid = data->currfid; - u32 maxvid, lo; + u32 maxvid, lo, rvomult = 1; dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " "reqvid 0x%x, rvo 0x%x\n", smp_processor_id(), data->currfid, data->currvid, reqvid, data->rvo); + if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) + rvomult = 2; + rvosteps *= rvomult; rdmsr(MSR_FIDVID_STATUS, lo, maxvid); maxvid = 0x1f & (maxvid >> 16); dprintk("ph1 maxvid=0x%x\n", maxvid); @@ -351,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, return 1; } - while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { + while ((rvosteps > 0) && + ((rvomult * data->rvo + data->currvid) > reqvid)) { if (data->currvid == maxvid) { rvosteps = 0; } else { @@ -384,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) u32 vcoreqfid, vcocurrfid, vcofiddiff; u32 fid_interval, savevid = data->currvid; - if ((reqfid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX "ph2: illegal lo-lo transition " - "0x%x 0x%x\n", reqfid, data->currfid); - return 1; - } - if (data->currfid == reqfid) { printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid); @@ -407,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid : vcoreqfid - vcocurrfid; + if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) + vcofiddiff = 0; + while (vcofiddiff > 2) { (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); @@ -1081,14 +1081,6 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, return 0; } - if ((fid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX - "ignoring illegal change in lo freq table-%x to 0x%x\n", - data->currfid, fid); - return 1; - } - dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", smp_processor_id(), fid, vid); freqs.old = find_khz_freq_from_fid(data->currfid); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index c9c1190b5e1f..02ce824073cb 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -215,7 +215,8 @@ struct pst_s { #define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) -static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); +static int core_voltage_pre_transition(struct powernow_k8_data *data, + u32 reqvid, u32 regfid); static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); -- cgit v1.2.3 From 00024be9689963c55cf2ab10ce3fb4502016c09a Mon Sep 17 00:00:00 2001 From: Peter Chubb Date: Wed, 8 Jul 2009 13:20:13 +0200 Subject: x86: Fix resume from suspend when CONFIG_CC_STACKPROTECTOR Patch 08687aec71bc9134fe336e561f6did877bacf74fc0a (x86: unify power/cpu_(32|64).c) renamed cpu_32.c to cpu.c, but did not update the special compilation flags for the file for the new name. This patch fixes the compilation flags, and therefore fixes resume from suspend on my Acer Aspire One. [rjw: The regression from 2.6.30 fixed by this patch is tracked as http://bugzilla.kernel.org/show_bug.cgi?id=13661] Signed-off-by: Peter Chubb Signed-off-by: Rafael J. Wysocki --- arch/x86/power/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index de2abbd07544..a6a198c33623 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,7 +1,7 @@ # __restore_processor_state() restores %gs after S3 resume and so should not # itself be stack-protected nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_cpu_$(BITS).o := $(nostackp) +CFLAGS_cpu.o := $(nostackp) obj-$(CONFIG_PM_SLEEP) += cpu.o obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o -- cgit v1.2.3 From ad361c9884e809340f6daca80d56a9e9c871690a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 6 Jul 2009 13:05:40 -0700 Subject: Remove multiple KERN_ prefixes from printk formats Commit 5fd29d6ccbc98884569d6f3105aeca70858b3e0f ("printk: clean up handling of log-levels and newlines") changed printk semantics. printk lines with multiple KERN_<level> prefixes are no longer emitted as before the patch. <level> is now included in the output on each additional use. Remove all uses of multiple KERN_<level>s in formats.
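The conversion pattern is mechanical: keep a single KERN_<level> at the start of each printk() format, fold follow-on lines into the same format string, and use KERN_CONT when one output line is assembled across several calls. A small sketch of the idiom (the "frobnicator" messages are invented for illustration, not taken from the patch):

#include <linux/kernel.h>

static void frob_report(const int *mode, int n)
{
	int i;

	/* One level marker covers the whole multi-line message; a second
	 * KERN_<level> in the middle would now show up in the output text. */
	printk(KERN_WARNING "frobnicator: reset failed\n"
			    "frobnicator: falling back to polling\n");

	/* Build a single line across several calls with KERN_CONT. */
	printk(KERN_INFO "frobnicator: modes:");
	for (i = 0; i < n; i++)
		printk(KERN_CONT " %d", mode[i]);
	printk(KERN_CONT "\n");
}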
Signed-off-by: Joe Perches Signed-off-by: Linus Torvalds --- arch/avr32/kernel/traps.c | 13 ++++---- arch/blackfin/kernel/setup.c | 41 ++++++++++++------------- arch/blackfin/kernel/traps.c | 34 ++++++++++----------- arch/m68knommu/kernel/process.c | 21 +++++++------ arch/m68knommu/kernel/traps.c | 6 ++-- arch/mn10300/kernel/traps.c | 21 +++++-------- arch/parisc/kernel/process.c | 2 +- arch/parisc/kernel/traps.c | 20 ++++++------- arch/um/kernel/sysrq.c | 4 +-- arch/x86/kernel/apic/io_apic.c | 1 - arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++---- arch/x86/kernel/e820.c | 7 ++--- arch/x86/kernel/pci-gart_64.c | 2 +- arch/x86/mm/fault.c | 9 +++--- arch/xtensa/kernel/traps.c | 6 ++-- drivers/block/amiflop.c | 2 +- drivers/block/xsysace.c | 7 +++-- drivers/char/hw_random/intel-rng.c | 9 +++--- drivers/char/isicom.c | 16 +++++----- drivers/i2c/busses/i2c-ibm_iic.c | 9 +++--- drivers/md/md.c | 18 ++++++----- drivers/misc/sgi-xp/xpnet.c | 4 +-- drivers/net/a2065.c | 12 ++------ drivers/net/arcnet/arcnet.c | 26 +++++++--------- drivers/net/bmac.c | 7 +++-- drivers/net/bnx2x_main.c | 7 +++-- drivers/net/dl2k.c | 17 ++++++----- drivers/net/epic100.c | 5 ++-- drivers/net/fealnx.c | 15 ++++++---- drivers/net/hamachi.c | 23 +++++++------- drivers/net/hamradio/baycom_epp.c | 2 +- drivers/net/hamradio/baycom_par.c | 2 +- drivers/net/hamradio/baycom_ser_fdx.c | 2 +- drivers/net/hamradio/baycom_ser_hdx.c | 2 +- drivers/net/natsemi.c | 4 +-- drivers/net/ne.c | 2 +- drivers/net/pci-skeleton.c | 2 +- drivers/net/pcmcia/ibmtr_cs.c | 13 ++++---- drivers/net/pcmcia/nmclan_cs.c | 15 ++++------ drivers/net/pcnet32.c | 50 +++++++++++++++---------------- drivers/net/starfire.c | 2 +- drivers/net/sundance.c | 6 ++-- drivers/net/tsi108_eth.c | 8 +++-- drivers/net/tulip/de2104x.c | 7 +++-- drivers/net/tulip/tulip_core.c | 14 +++++---- drivers/net/tulip/winbond-840.c | 6 ++-- drivers/net/wan/hd64570.c | 3 +- drivers/net/wan/hd64572.c | 3 +- drivers/net/wan/sbni.c | 8 ++--- drivers/net/wireless/ray_cs.c | 9 +++--- drivers/net/wireless/wavelan_cs.c | 13 ++------ drivers/net/yellowfin.c | 23 +++++++------- drivers/parisc/eisa_enumerator.c | 14 ++++----- drivers/pcmcia/tcic.c | 3 +- drivers/scsi/atari_NCR5380.c | 3 +- drivers/scsi/mac53c94.c | 5 ++-- drivers/scsi/sg.c | 2 +- drivers/scsi/sun3_NCR5380.c | 3 +- drivers/serial/8250_pci.c | 11 +++---- drivers/ssb/pcmcia.c | 4 +-- drivers/usb/core/hcd.c | 2 +- drivers/video/amba-clcd.c | 7 +++-- drivers/video/matrox/matroxfb_DAC1064.c | 4 +-- drivers/video/matrox/matroxfb_Ti3026.c | 4 +-- drivers/video/stifb.c | 7 ++--- fs/jffs2/erase.c | 10 ++++--- kernel/module.c | 6 ++-- sound/pci/emu10k1/p16v.c | 2 +- sound/usb/usx2y/usbusx2yaudio.c | 7 +++-- 70 files changed, 330 insertions(+), 338 deletions(-) (limited to 'arch/x86') diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c index 6e3d491184ea..b91b2044af9c 100644 --- a/arch/avr32/kernel/traps.c +++ b/arch/avr32/kernel/traps.c @@ -32,22 +32,25 @@ void NORET_TYPE die(const char *str, struct pt_regs *regs, long err) spin_lock_irq(&die_lock); bust_spinlocks(1); - printk(KERN_ALERT "Oops: %s, sig: %ld [#%d]\n" KERN_EMERG, + printk(KERN_ALERT "Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); + + printk(KERN_EMERG); + #ifdef CONFIG_PREEMPT - printk("PREEMPT "); + printk(KERN_CONT "PREEMPT "); #endif #ifdef CONFIG_FRAME_POINTER - printk("FRAME_POINTER "); + printk(KERN_CONT "FRAME_POINTER "); #endif if (current_cpu_data.features & AVR32_FEATURE_OCD) { unsigned long did 
= ocd_read(DID); - printk("chip: 0x%03lx:0x%04lx rev %lu\n", + printk(KERN_CONT "chip: 0x%03lx:0x%04lx rev %lu\n", (did >> 1) & 0x7ff, (did >> 12) & 0x7fff, (did >> 28) & 0xf); } else { - printk("cpu: arch %u r%u / core %u r%u\n", + printk(KERN_CONT "cpu: arch %u r%u / core %u r%u\n", current_cpu_data.arch_type, current_cpu_data.arch_revision, current_cpu_data.cpu_type, diff --git a/arch/blackfin/kernel/setup.c b/arch/blackfin/kernel/setup.c index 298f023bcc09..6136c33e919f 100644 --- a/arch/blackfin/kernel/setup.c +++ b/arch/blackfin/kernel/setup.c @@ -408,13 +408,14 @@ static void __init print_memory_map(char *who) bfin_memmap.map[i].addr + bfin_memmap.map[i].size); switch (bfin_memmap.map[i].type) { case BFIN_MEMMAP_RAM: - printk("(usable)\n"); - break; + printk(KERN_CONT "(usable)\n"); + break; case BFIN_MEMMAP_RESERVED: - printk("(reserved)\n"); - break; - default: printk("type %lu\n", bfin_memmap.map[i].type); - break; + printk(KERN_CONT "(reserved)\n"); + break; + default: + printk(KERN_CONT "type %lu\n", bfin_memmap.map[i].type); + break; } } } @@ -614,19 +615,19 @@ static __init void memory_setup(void) printk(KERN_INFO "Kernel Managed Memory: %ldMB\n", _ramend >> 20); printk(KERN_INFO "Memory map:\n" - KERN_INFO " fixedcode = 0x%p-0x%p\n" - KERN_INFO " text = 0x%p-0x%p\n" - KERN_INFO " rodata = 0x%p-0x%p\n" - KERN_INFO " bss = 0x%p-0x%p\n" - KERN_INFO " data = 0x%p-0x%p\n" - KERN_INFO " stack = 0x%p-0x%p\n" - KERN_INFO " init = 0x%p-0x%p\n" - KERN_INFO " available = 0x%p-0x%p\n" + " fixedcode = 0x%p-0x%p\n" + " text = 0x%p-0x%p\n" + " rodata = 0x%p-0x%p\n" + " bss = 0x%p-0x%p\n" + " data = 0x%p-0x%p\n" + " stack = 0x%p-0x%p\n" + " init = 0x%p-0x%p\n" + " available = 0x%p-0x%p\n" #ifdef CONFIG_MTD_UCLINUX - KERN_INFO " rootfs = 0x%p-0x%p\n" + " rootfs = 0x%p-0x%p\n" #endif #if DMA_UNCACHED_REGION > 0 - KERN_INFO " DMA Zone = 0x%p-0x%p\n" + " DMA Zone = 0x%p-0x%p\n" #endif , (void *)FIXED_CODE_START, (void *)FIXED_CODE_END, _stext, _etext, @@ -859,13 +860,13 @@ void __init setup_arch(char **cmdline_p) #endif printk(KERN_INFO "Hardware Trace "); if (bfin_read_TBUFCTL() & 0x1) - printk("Active "); + printk(KERN_CONT "Active "); else - printk("Off "); + printk(KERN_CONT "Off "); if (bfin_read_TBUFCTL() & 0x2) - printk("and Enabled\n"); + printk(KERN_CONT "and Enabled\n"); else - printk("and Disabled\n"); + printk(KERN_CONT "and Disabled\n"); #if defined(CONFIG_CHR_DEV_FLASH) || defined(CONFIG_BLK_DEV_FLASH) /* we need to initialize the Flashrom device here since we might diff --git a/arch/blackfin/kernel/traps.c b/arch/blackfin/kernel/traps.c index 8eeb457ce5d5..8a1caf2bb5b9 100644 --- a/arch/blackfin/kernel/traps.c +++ b/arch/blackfin/kernel/traps.c @@ -212,7 +212,7 @@ asmlinkage void double_fault_c(struct pt_regs *fp) console_verbose(); oops_in_progress = 1; #ifdef CONFIG_DEBUG_VERBOSE - printk(KERN_EMERG "\n" KERN_EMERG "Double Fault\n"); + printk(KERN_EMERG "Double Fault\n"); #ifdef CONFIG_DEBUG_DOUBLEFAULT_PRINT if (((long)fp->seqstat & SEQSTAT_EXCAUSE) == VEC_UNCOV) { unsigned int cpu = smp_processor_id(); @@ -583,15 +583,14 @@ asmlinkage void trap_c(struct pt_regs *fp) #ifndef CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE if (trapnr == VEC_CPLB_I_M || trapnr == VEC_CPLB_M) verbose_printk(KERN_NOTICE "No trace since you do not have " - "CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE enabled\n" - KERN_NOTICE "\n"); + "CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE enabled\n\n"); else #endif dump_bfin_trace_buffer(); if (oops_in_progress) { /* Dump the current kernel stack */ - verbose_printk(KERN_NOTICE "\n" 
KERN_NOTICE "Kernel Stack\n"); + verbose_printk(KERN_NOTICE "Kernel Stack\n"); show_stack(current, NULL); print_modules(); #ifndef CONFIG_ACCESS_CHECK @@ -906,7 +905,7 @@ void show_stack(struct task_struct *task, unsigned long *stack) ret_addr = 0; if (!j && i % 8 == 0) - printk("\n" KERN_NOTICE "%p:",addr); + printk(KERN_NOTICE "%p:",addr); /* if it is an odd address, or zero, just skip it */ if (*addr & 0x1 || !*addr) @@ -996,9 +995,9 @@ void dump_bfin_process(struct pt_regs *fp) printk(KERN_NOTICE "CPU = %d\n", current_thread_info()->cpu); if (!((unsigned long)current->mm & 0x3) && (unsigned long)current->mm >= FIXED_CODE_START) - verbose_printk(KERN_NOTICE "TEXT = 0x%p-0x%p DATA = 0x%p-0x%p\n" - KERN_NOTICE " BSS = 0x%p-0x%p USER-STACK = 0x%p\n" - KERN_NOTICE "\n", + verbose_printk(KERN_NOTICE + "TEXT = 0x%p-0x%p DATA = 0x%p-0x%p\n" + " BSS = 0x%p-0x%p USER-STACK = 0x%p\n\n", (void *)current->mm->start_code, (void *)current->mm->end_code, (void *)current->mm->start_data, @@ -1009,8 +1008,8 @@ void dump_bfin_process(struct pt_regs *fp) else verbose_printk(KERN_NOTICE "invalid mm\n"); } else - verbose_printk(KERN_NOTICE "\n" KERN_NOTICE - "No Valid process in current context\n"); + verbose_printk(KERN_NOTICE + "No Valid process in current context\n"); #endif } @@ -1028,7 +1027,7 @@ void dump_bfin_mem(struct pt_regs *fp) addr < (unsigned short *)((unsigned long)erraddr & ~0xF) + 0x10; addr++) { if (!((unsigned long)addr & 0xF)) - verbose_printk("\n" KERN_NOTICE "0x%p: ", addr); + verbose_printk(KERN_NOTICE "0x%p: ", addr); if (!get_instruction(&val, addr)) { val = 0; @@ -1056,9 +1055,9 @@ void dump_bfin_mem(struct pt_regs *fp) oops_in_progress)){ verbose_printk(KERN_NOTICE "Looks like this was a deferred error - sorry\n"); #ifndef CONFIG_DEBUG_HWERR - verbose_printk(KERN_NOTICE "The remaining message may be meaningless\n" - KERN_NOTICE "You should enable CONFIG_DEBUG_HWERR to get a" - " better idea where it came from\n"); + verbose_printk(KERN_NOTICE +"The remaining message may be meaningless\n" +"You should enable CONFIG_DEBUG_HWERR to get a better idea where it came from\n"); #else /* If we are handling only one peripheral interrupt * and current mm and pid are valid, and the last error @@ -1114,9 +1113,10 @@ void show_regs(struct pt_regs *fp) verbose_printk(KERN_NOTICE "%s", linux_banner); - verbose_printk(KERN_NOTICE "\n" KERN_NOTICE "SEQUENCER STATUS:\t\t%s\n", print_tainted()); + verbose_printk(KERN_NOTICE "\nSEQUENCER STATUS:\t\t%s\n", + print_tainted()); verbose_printk(KERN_NOTICE " SEQSTAT: %08lx IPEND: %04lx SYSCFG: %04lx\n", - (long)fp->seqstat, fp->ipend, fp->syscfg); + (long)fp->seqstat, fp->ipend, fp->syscfg); if ((fp->seqstat & SEQSTAT_EXCAUSE) == VEC_HWERR) { verbose_printk(KERN_NOTICE " HWERRCAUSE: 0x%lx\n", (fp->seqstat & SEQSTAT_HWERRCAUSE) >> 14); @@ -1184,7 +1184,7 @@ unlock: verbose_printk(KERN_NOTICE "ICPLB_FAULT_ADDR: %s\n", buf); } - verbose_printk(KERN_NOTICE "\n" KERN_NOTICE "PROCESSOR STATE:\n"); + verbose_printk(KERN_NOTICE "PROCESSOR STATE:\n"); verbose_printk(KERN_NOTICE " R0 : %08lx R1 : %08lx R2 : %08lx R3 : %08lx\n", fp->r0, fp->r1, fp->r2, fp->r3); verbose_printk(KERN_NOTICE " R4 : %08lx R5 : %08lx R6 : %08lx R7 : %08lx\n", diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index 1e96c6eb6312..8f8f4abab2ff 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -290,7 +290,7 @@ void dump(struct pt_regs *fp) unsigned char *tp; int i; - printk(KERN_EMERG "\n" KERN_EMERG "CURRENT PROCESS:\n" 
KERN_EMERG "\n"); + printk(KERN_EMERG "\nCURRENT PROCESS:\n\n"); printk(KERN_EMERG "COMM=%s PID=%d\n", current->comm, current->pid); if (current->mm) { @@ -301,8 +301,7 @@ void dump(struct pt_regs *fp) (int) current->mm->end_data, (int) current->mm->end_data, (int) current->mm->brk); - printk(KERN_EMERG "USER-STACK=%08x KERNEL-STACK=%08x\n" - KERN_EMERG "\n", + printk(KERN_EMERG "USER-STACK=%08x KERNEL-STACK=%08x\n\n", (int) current->mm->start_stack, (int)(((unsigned long) current) + THREAD_SIZE)); } @@ -313,35 +312,35 @@ void dump(struct pt_regs *fp) fp->d0, fp->d1, fp->d2, fp->d3); printk(KERN_EMERG "d4: %08lx d5: %08lx a0: %08lx a1: %08lx\n", fp->d4, fp->d5, fp->a0, fp->a1); - printk(KERN_EMERG "\n" KERN_EMERG "USP: %08x TRAPFRAME: %08x\n", + printk(KERN_EMERG "\nUSP: %08x TRAPFRAME: %08x\n", (unsigned int) rdusp(), (unsigned int) fp); - printk(KERN_EMERG "\n" KERN_EMERG "CODE:"); + printk(KERN_EMERG "\nCODE:"); tp = ((unsigned char *) fp->pc) - 0x20; for (sp = (unsigned long *) tp, i = 0; (i < 0x40); i += 4) { if ((i % 0x10) == 0) - printk("\n" KERN_EMERG "%08x: ", (int) (tp + i)); + printk(KERN_EMERG "%08x: ", (int) (tp + i)); printk("%08x ", (int) *sp++); } - printk("\n" KERN_EMERG "\n"); + printk(KERN_EMERG "\n"); printk(KERN_EMERG "KERNEL STACK:"); tp = ((unsigned char *) fp) - 0x40; for (sp = (unsigned long *) tp, i = 0; (i < 0xc0); i += 4) { if ((i % 0x10) == 0) - printk("\n" KERN_EMERG "%08x: ", (int) (tp + i)); + printk(KERN_EMERG "%08x: ", (int) (tp + i)); printk("%08x ", (int) *sp++); } - printk("\n" KERN_EMERG "\n"); + printk(KERN_EMERG "\n"); printk(KERN_EMERG "USER STACK:"); tp = (unsigned char *) (rdusp() - 0x10); for (sp = (unsigned long *) tp, i = 0; (i < 0x80); i += 4) { if ((i % 0x10) == 0) - printk("\n" KERN_EMERG "%08x: ", (int) (tp + i)); + printk(KERN_EMERG "%08x: ", (int) (tp + i)); printk("%08x ", (int) *sp++); } - printk("\n" KERN_EMERG "\n"); + printk(KERN_EMERG "\n"); } /* diff --git a/arch/m68knommu/kernel/traps.c b/arch/m68knommu/kernel/traps.c index 51d325343ab5..3739c8f657d7 100644 --- a/arch/m68knommu/kernel/traps.c +++ b/arch/m68knommu/kernel/traps.c @@ -111,7 +111,7 @@ static void print_this_address(unsigned long addr, int i) if (i % 5) printk(KERN_CONT " [%08lx] ", addr); else - printk(KERN_CONT "\n" KERN_EMERG " [%08lx] ", addr); + printk(KERN_EMERG " [%08lx] ", addr); i++; #endif } @@ -137,8 +137,8 @@ static void __show_stack(struct task_struct *task, unsigned long *stack) if (stack + 1 + i > endstack) break; if (i % 8 == 0) - printk("\n" KERN_EMERG " "); - printk(" %08lx", *(stack + i)); + printk(KERN_EMERG " "); + printk(KERN_CONT " %08lx", *(stack + i)); } printk("\n"); i = 0; diff --git a/arch/mn10300/kernel/traps.c b/arch/mn10300/kernel/traps.c index 681ad8c9e4fb..0dfdc5001124 100644 --- a/arch/mn10300/kernel/traps.c +++ b/arch/mn10300/kernel/traps.c @@ -136,8 +136,7 @@ void show_trace(unsigned long *sp) unsigned long *stack, addr, module_start, module_end; int i; - printk(KERN_EMERG "\n" - KERN_EMERG "Call Trace:"); + printk(KERN_EMERG "\nCall Trace:"); stack = sp; i = 0; @@ -153,7 +152,7 @@ void show_trace(unsigned long *sp) printk("\n"); #else if ((i % 6) == 0) - printk("\n" KERN_EMERG " "); + printk(KERN_EMERG " "); printk("[<%08lx>] ", addr); i++; #endif @@ -180,7 +179,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (((long) stack & (THREAD_SIZE - 1)) == 0) break; if ((i % 8) == 0) - printk("\n" KERN_EMERG " "); + printk(KERN_EMERG " "); printk("%08lx ", *stack++); } @@ -264,8 +263,7 @@ void show_registers(struct 
pt_regs *regs) show_stack(current, (unsigned long *) sp); #if 0 - printk(KERN_EMERG "\n" - KERN_EMERG "Code: "); + printk(KERN_EMERG "\nCode: "); if (regs->pc < PAGE_OFFSET) goto bad; @@ -311,16 +309,14 @@ void die(const char *str, struct pt_regs *regs, enum exception_code code) { console_verbose(); spin_lock_irq(&die_lock); - printk(KERN_EMERG "\n" - KERN_EMERG "%s: %04x\n", + printk(KERN_EMERG "\n%s: %04x\n", str, code & 0xffff); show_registers(regs); if (regs->pc >= 0x02000000 && regs->pc < 0x04000000 && (regs->epsw & (EPSW_IM | EPSW_IE)) != (EPSW_IM | EPSW_IE)) { printk(KERN_EMERG "Exception in usermode interrupt handler\n"); - printk(KERN_EMERG "\n" - KERN_EMERG " Please connect to kernel debugger !!\n"); + printk(KERN_EMERG "\nPlease connect to kernel debugger !!\n"); asm volatile ("0: bra 0b"); } @@ -429,9 +425,8 @@ asmlinkage void io_bus_error(u32 bcberr, u32 bcbear, struct pt_regs *regs) { console_verbose(); - printk(KERN_EMERG "\n" - KERN_EMERG "Asynchronous I/O Bus Error\n" - KERN_EMERG "==========================\n"); + printk(KERN_EMERG "Asynchronous I/O Bus Error\n"); + printk(KERN_EMERG "==========================\n"); if (bcberr & BCBERR_BEME) printk(KERN_EMERG "- Multiple recorded errors\n"); diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 61c07078c072..1f3aa8db0203 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -156,7 +156,7 @@ void machine_power_off(void) * software. The user has to press the button himself. */ printk(KERN_EMERG "System shut down completed.\n" - KERN_EMERG "Please power this system off now."); + "Please power this system off now."); } void (*pm_power_off)(void) = machine_power_off; diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index c32f5d6d778e..528f0ff9b273 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -250,15 +250,14 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err) oops_enter(); /* Amuse the user in a SPARC fashion */ - if (err) printk( -KERN_CRIT " _______________________________ \n" -KERN_CRIT " < Your System ate a SPARC! Gah! >\n" -KERN_CRIT " ------------------------------- \n" -KERN_CRIT " \\ ^__^\n" -KERN_CRIT " \\ (xx)\\_______\n" -KERN_CRIT " (__)\\ )\\/\\\n" -KERN_CRIT " U ||----w |\n" -KERN_CRIT " || ||\n"); + if (err) printk(KERN_CRIT + " _______________________________ \n" + " < Your System ate a SPARC! Gah! 
>\n" + " ------------------------------- \n" + " \\ ^__^\n" + " (__)\\ )\\/\\\n" + " U ||----w |\n" + " || ||\n"); /* unlock the pdc lock if necessary */ pdc_emergency_unlock(); @@ -797,7 +796,8 @@ void notrace handle_interruption(int code, struct pt_regs *regs) else printk(KERN_DEBUG "User Fault (long pointer) (fault %d) ", code); - printk("pid=%d command='%s'\n", task_pid_nr(current), current->comm); + printk(KERN_CONT "pid=%d command='%s'\n", + task_pid_nr(current), current->comm); show_regs(regs); #endif si.si_signo = SIGSEGV; diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 56d43d0a3960..0960de54495a 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -70,8 +70,8 @@ void show_stack(struct task_struct *task, unsigned long *esp) if (kstack_end(stack)) break; if (i && ((i % 8) == 0)) - printk("\n" KERN_INFO " "); - printk("%08lx ", *stack++); + printk(KERN_INFO " "); + printk(KERN_CONT "%08lx ", *stack++); } show_trace(task, esp); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8fd1efb5a0bd..90b5e6efa938 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1739,7 +1739,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy) if (apic_verbosity == APIC_QUIET) return; - printk(KERN_DEBUG "\n"); printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index ae068f59603f..2a50ef891000 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1259,7 +1259,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { static const char ACPI_PSS_BIOS_BUG_MSG[] = KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; + FW_BUG PFX "Try again with latest BIOS.\n"; struct powernow_k8_data *data; struct init_on_cpu init_on_cpu; int rc; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index af425b83202b..484c1e5f658e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -194,14 +194,14 @@ static void print_mce(struct mce *m) m->cs, m->ip); if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); - printk("\n"); + printk(KERN_CONT "\n"); } printk(KERN_EMERG "TSC %llx ", m->tsc); if (m->addr) - printk("ADDR %llx ", m->addr); + printk(KERN_CONT "ADDR %llx ", m->addr); if (m->misc) - printk("MISC %llx ", m->misc); - printk("\n"); + printk(KERN_CONT "MISC %llx ", m->misc); + printk(KERN_CONT "\n"); printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); @@ -209,13 +209,13 @@ static void print_mce(struct mce *m) static void print_mce_head(void) { - printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); + printk(KERN_EMERG "\nHARDWARE ERROR\n"); } static void print_mce_tail(void) { printk(KERN_EMERG "This is not a software problem!\n" - KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); + "Run through mcelog --ascii to decode and contact your hardware vendor\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c4ca89d9aaf4..5cb5725b2bae 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -627,10 +627,9 @@ __init void e820_setup_gap(void) #ifdef CONFIG_X86_64 if (!found) 
{ gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " - "address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource " - "registers may break!\n"); + printk(KERN_ERR + "PCI: Warning: Cannot find a gap in the 32bit address range\n" + "PCI: Unassigned devices with 32bit resource registers may break!\n"); } #endif diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index cfd9f9063896..d2e56b8f48e7 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -675,7 +675,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" - KERN_WARNING "falling back to iommu=soft.\n"); + "falling back to iommu=soft.\n"); return -1; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 78a5fff857be..85307cc6e45f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -426,10 +426,11 @@ static noinline int vmalloc_fault(unsigned long address) } static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; +KERN_ERR +"******* Your BIOS seems to not contain a fix for K8 errata #93\n" +"******* Working around it, but it may cause SEGVs or burn power.\n" +"******* Please consider a BIOS update.\n" +"******* Disabling USB legacy in the BIOS may also help.\n"; /* * No vm86 mode in 64-bit mode: diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index ba9ab9349782..e64efac3b9db 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -354,10 +354,10 @@ void show_regs(struct pt_regs * regs) for (i = 0; i < 16; i++) { if ((i % 8) == 0) - printk ("\n" KERN_INFO "a%02d: ", i); - printk("%08lx ", regs->areg[i]); + printk(KERN_INFO "a%02d:", i); + printk(KERN_CONT " %08lx", regs->areg[i]); } - printk("\n"); + printk(KERN_CONT "\n"); printk("pc: %08lx, ps: %08lx, depc: %08lx, excvaddr: %08lx\n", regs->pc, regs->ps, regs->depc, regs->excvaddr); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 9c6e5b0fe894..2f07b7c99a95 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1645,7 +1645,7 @@ static int __init fd_probe_drives(void) { int drive,drives,nomem; - printk(KERN_INFO "FD: probing units\n" KERN_INFO "found "); + printk(KERN_INFO "FD: probing units\nfound "); drives=0; nomem=0; for(drive=0;drivedev, " ctrl: %.8x seccnt/cmd: %.4x ver:%.4x\n" - KERN_INFO " status:%.8x mpu_lba:%.8x busmode:%4x\n" - KERN_INFO " error: %.8x cfg_lba:%.8x fatstat:%.4x\n", + dev_info(ace->dev, + " ctrl: %.8x seccnt/cmd: %.4x ver:%.4x\n" + " status:%.8x mpu_lba:%.8x busmode:%4x\n" + " error: %.8x cfg_lba:%.8x fatstat:%.4x\n", ace_in32(ace, ACE_CTRL), ace_in(ace, ACE_SECCNTCMD), ace_in(ace, ACE_VERSION), diff --git a/drivers/char/hw_random/intel-rng.c b/drivers/char/hw_random/intel-rng.c index 5dcbe603eca2..91b53eb1c053 100644 --- a/drivers/char/hw_random/intel-rng.c +++ b/drivers/char/hw_random/intel-rng.c @@ -305,10 +305,11 @@ static int __init intel_init_hw_struct(struct intel_rng_hw *intel_rng_hw, (BIOS_CNTL_LOCK_ENABLE_MASK|BIOS_CNTL_WRITE_ENABLE_MASK)) == BIOS_CNTL_LOCK_ENABLE_MASK) { static __initdata /*const*/ char warning[] = - KERN_WARNING PFX "Firmware space is locked 
read-only. If you can't or\n" - KERN_WARNING PFX "don't want to disable this in firmware setup, and if\n" - KERN_WARNING PFX "you are certain that your system has a functional\n" - KERN_WARNING PFX "RNG, try using the 'no_fwh_detect' option.\n"; + KERN_WARNING +PFX "Firmware space is locked read-only. If you can't or\n" +PFX "don't want to disable this in firmware setup, and if\n" +PFX "you are certain that your system has a functional\n" +PFX "RNG, try using the 'no_fwh_detect' option.\n"; if (no_fwh_detect) return -ENODEV; diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c index 4159292e35cf..621d1184673c 100644 --- a/drivers/char/isicom.c +++ b/drivers/char/isicom.c @@ -1478,10 +1478,10 @@ static int __devinit load_firmware(struct pci_dev *pdev, status = inw(base + 0x4); if (status != 0) { dev_warn(&pdev->dev, "Card%d rejected load header:\n" - KERN_WARNING "Address:0x%x\n" - KERN_WARNING "Count:0x%x\n" - KERN_WARNING "Status:0x%x\n", - index + 1, frame->addr, frame->count, status); + "Address:0x%x\n" + "Count:0x%x\n" + "Status:0x%x\n", + index + 1, frame->addr, frame->count, status); goto errrelfw; } outsw(base, frame->data, word_count); @@ -1526,10 +1526,10 @@ static int __devinit load_firmware(struct pci_dev *pdev, status = inw(base + 0x4); if (status != 0) { dev_warn(&pdev->dev, "Card%d rejected verify header:\n" - KERN_WARNING "Address:0x%x\n" - KERN_WARNING "Count:0x%x\n" - KERN_WARNING "Status: 0x%x\n", - index + 1, frame->addr, frame->count, status); + "Address:0x%x\n" + "Count:0x%x\n" + "Status: 0x%x\n", + index + 1, frame->addr, frame->count, status); goto errrelfw; } diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c index e4476743f203..b1bc6e277d2a 100644 --- a/drivers/i2c/busses/i2c-ibm_iic.c +++ b/drivers/i2c/busses/i2c-ibm_iic.c @@ -85,10 +85,11 @@ static void dump_iic_regs(const char* header, struct ibm_iic_private* dev) { volatile struct iic_regs __iomem *iic = dev->vaddr; printk(KERN_DEBUG "ibm-iic%d: %s\n", dev->idx, header); - printk(KERN_DEBUG " cntl = 0x%02x, mdcntl = 0x%02x\n" - KERN_DEBUG " sts = 0x%02x, extsts = 0x%02x\n" - KERN_DEBUG " clkdiv = 0x%02x, xfrcnt = 0x%02x\n" - KERN_DEBUG " xtcntlss = 0x%02x, directcntl = 0x%02x\n", + printk(KERN_DEBUG + " cntl = 0x%02x, mdcntl = 0x%02x\n" + " sts = 0x%02x, extsts = 0x%02x\n" + " clkdiv = 0x%02x, xfrcnt = 0x%02x\n" + " xtcntlss = 0x%02x, directcntl = 0x%02x\n", in_8(&iic->cntl), in_8(&iic->mdcntl), in_8(&iic->sts), in_8(&iic->extsts), in_8(&iic->clkdiv), in_8(&iic->xfrcnt), in_8(&iic->xtcntlss), in_8(&iic->directcntl)); diff --git a/drivers/md/md.c b/drivers/md/md.c index 0f4a70c43ffc..d4351ff0849f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1756,9 +1756,10 @@ static void print_sb_1(struct mdp_superblock_1 *sb) __u8 *uuid; uuid = sb->set_uuid; - printk(KERN_INFO "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" - ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" - KERN_INFO "md: Name: \"%s\" CT:%llu\n", + printk(KERN_INFO + "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" + ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" + "md: Name: \"%s\" CT:%llu\n", le32_to_cpu(sb->major_version), le32_to_cpu(sb->feature_map), uuid[0], uuid[1], uuid[2], uuid[3], @@ -1770,12 +1771,13 @@ static void print_sb_1(struct mdp_superblock_1 *sb) & MD_SUPERBLOCK_1_TIME_SEC_MASK); uuid = sb->device_uuid; - printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" + printk(KERN_INFO + "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" " 
RO:%llu\n" - KERN_INFO "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" - ":%02x%02x%02x%02x%02x%02x\n" - KERN_INFO "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" - KERN_INFO "md: (MaxDev:%u) \n", + "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" + ":%02x%02x%02x%02x%02x%02x\n" + "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" + "md: (MaxDev:%u) \n", le32_to_cpu(sb->level), (unsigned long long)le64_to_cpu(sb->size), le32_to_cpu(sb->raid_disks), diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c index 8d1c60a3f0df..5d778ec8cdb2 100644 --- a/drivers/misc/sgi-xp/xpnet.c +++ b/drivers/misc/sgi-xp/xpnet.c @@ -235,7 +235,7 @@ xpnet_receive(short partid, int channel, struct xpnet_message *msg) skb->ip_summed = CHECKSUM_UNNECESSARY; dev_dbg(xpnet, "passing skb to network layer\n" - KERN_DEBUG "\tskb->head=0x%p skb->data=0x%p skb->tail=0x%p " + "\tskb->head=0x%p skb->data=0x%p skb->tail=0x%p " "skb->end=0x%p skb->len=%d\n", (void *)skb->head, (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb), skb->len); @@ -399,7 +399,7 @@ xpnet_send(struct sk_buff *skb, struct xpnet_pending_msg *queued_msg, msg->buf_pa = xp_pa((void *)start_addr); dev_dbg(xpnet, "sending XPC message to %d:%d\n" - KERN_DEBUG "msg->buf_pa=0x%lx, msg->size=%u, " + "msg->buf_pa=0x%lx, msg->size=%u, " "msg->leadin_ignore=%u, msg->tailout_ignore=%u\n", dest_partid, XPC_NET_CHANNEL, msg->buf_pa, msg->size, msg->leadin_ignore, msg->tailout_ignore); diff --git a/drivers/net/a2065.c b/drivers/net/a2065.c index 85a18175730b..08787f5a22a3 100644 --- a/drivers/net/a2065.c +++ b/drivers/net/a2065.c @@ -569,16 +569,8 @@ static int lance_start_xmit (struct sk_buff *skb, struct net_device *dev) #ifdef DEBUG_DRIVER /* dump the packet */ - { - int i; - - for (i = 0; i < 64; i++) { - if ((i % 16) == 0) - printk("\n" KERN_DEBUG); - printk ("%2.2x ", skb->data [i]); - } - printk("\n"); - } + print_hex_dump(KERN_DEBUG, "skb->data: ", DUMP_PREFIX_NONE, + 16, 1, skb->data, 64, true); #endif entry = lp->tx_new & lp->tx_ring_mod_mask; ib->btx_ring [entry].length = (-skblen) | 0xf000; diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index d6d4ab3b430c..7d227cdab9f8 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -158,15 +158,12 @@ module_exit(arcnet_exit); void arcnet_dump_skb(struct net_device *dev, struct sk_buff *skb, char *desc) { - int i; + char hdr[32]; - printk(KERN_DEBUG "%6s: skb dump (%s) follows:", dev->name, desc); - for (i = 0; i < skb->len; i++) { - if (i % 16 == 0) - printk("\n" KERN_DEBUG "[%04X] ", i); - printk("%02X ", ((u_char *) skb->data)[i]); - } - printk("\n"); + /* dump the packet */ + snprintf(hdr, sizeof(hdr), "%6s:%s skb->data:", dev->name, desc); + print_hex_dump(KERN_DEBUG, hdr, DUMP_PREFIX_OFFSET, + 16, 1, skb->data, skb->len, true); } EXPORT_SYMBOL(arcnet_dump_skb); @@ -184,6 +181,7 @@ static void arcnet_dump_packet(struct net_device *dev, int bufnum, int i, length; unsigned long flags = 0; static uint8_t buf[512]; + char hdr[32]; /* hw.copy_from_card expects IRQ context so take the IRQ lock to keep it single threaded */ @@ -197,14 +195,10 @@ static void arcnet_dump_packet(struct net_device *dev, int bufnum, /* if the offset[0] byte is nonzero, this is a 256-byte packet */ length = (buf[2] ? 
256 : 512); - printk(KERN_DEBUG "%6s: packet dump (%s) follows:", dev->name, desc); - for (i = 0; i < length; i++) { - if (i % 16 == 0) - printk("\n" KERN_DEBUG "[%04X] ", i); - printk("%02X ", buf[i]); - } - printk("\n"); - + /* dump the packet */ + snprintf(hdr, sizeof(hdr), "%6s:%s packet dump:", dev->name, desc); + print_hex_dump(KERN_DEBUG, hdr, DUMP_PREFIX_OFFSET, + 16, 1, buf, length, true); } #else diff --git a/drivers/net/bmac.c b/drivers/net/bmac.c index 9578a3dfac01..a76315dc7767 100644 --- a/drivers/net/bmac.c +++ b/drivers/net/bmac.c @@ -428,10 +428,11 @@ bmac_init_phy(struct net_device *dev) printk(KERN_DEBUG "phy registers:"); for (addr = 0; addr < 32; ++addr) { if ((addr & 7) == 0) - printk("\n" KERN_DEBUG); - printk(" %.4x", bmac_mif_read(dev, addr)); + printk(KERN_DEBUG); + printk(KERN_CONT " %.4x", bmac_mif_read(dev, addr)); } - printk("\n"); + print(KERN_CONT "\n"); + if (bp->is_bmac_plus) { unsigned int capable, ctrl; diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c index 6c67be679764..c36a5f33739f 100644 --- a/drivers/net/bnx2x_main.c +++ b/drivers/net/bnx2x_main.c @@ -484,8 +484,9 @@ static void bnx2x_fw_dump(struct bnx2x *bp) mark = REG_RD(bp, MCP_REG_MCPR_SCRATCH + 0xf104); mark = ((mark + 0x3) & ~0x3); - printk(KERN_ERR PFX "begin fw dump (mark 0x%x)\n" KERN_ERR, mark); + printk(KERN_ERR PFX "begin fw dump (mark 0x%x)\n", mark); + printk(KERN_ERR PFX); for (offset = mark - 0x08000000; offset <= 0xF900; offset += 0x8*4) { for (word = 0; word < 8; word++) data[word] = htonl(REG_RD(bp, MCP_REG_MCPR_SCRATCH + @@ -500,7 +501,7 @@ static void bnx2x_fw_dump(struct bnx2x *bp) data[8] = 0x0; printk(KERN_CONT "%s", (char *)data); } - printk("\n" KERN_ERR PFX "end of fw dump\n"); + printk(KERN_ERR PFX "end of fw dump\n"); } static void bnx2x_panic_dump(struct bnx2x *bp) @@ -7354,7 +7355,7 @@ static void bnx2x_reset_task(struct work_struct *work) #ifdef BNX2X_STOP_ON_ERROR BNX2X_ERR("reset task called but STOP_ON_ERROR defined" " so reset not done to allow debug dump,\n" - KERN_ERR " you will need to reboot when done\n"); + " you will need to reboot when done\n"); return; #endif diff --git a/drivers/net/dl2k.c b/drivers/net/dl2k.c index 895d72143ee0..4b6a219fecea 100644 --- a/drivers/net/dl2k.c +++ b/drivers/net/dl2k.c @@ -268,8 +268,9 @@ rio_probe1 (struct pci_dev *pdev, const struct pci_device_id *ent) printk(KERN_INFO "tx_coalesce:\t%d packets\n", tx_coalesce); if (np->coalesce) - printk(KERN_INFO "rx_coalesce:\t%d packets\n" - KERN_INFO "rx_timeout: \t%d ns\n", + printk(KERN_INFO + "rx_coalesce:\t%d packets\n" + "rx_timeout: \t%d ns\n", np->rx_coalesce, np->rx_timeout*640); if (np->vlan) printk(KERN_INFO "vlan(id):\t%d\n", np->vlan); @@ -1522,9 +1523,9 @@ mii_get_media (struct net_device *dev) printk (KERN_INFO "Operating at 10 Mbps, "); } if (bmcr & MII_BMCR_DUPLEX_MODE) { - printk ("Full duplex\n"); + printk (KERN_CONT "Full duplex\n"); } else { - printk ("Half duplex\n"); + printk (KERN_CONT "Half duplex\n"); } } if (np->tx_flow) @@ -1614,9 +1615,9 @@ mii_set_media (struct net_device *dev) } if (np->full_duplex) { bmcr |= MII_BMCR_DUPLEX_MODE; - printk ("Full duplex\n"); + printk (KERN_CONT "Full duplex\n"); } else { - printk ("Half duplex\n"); + printk (KERN_CONT "Half duplex\n"); } #if 0 /* Set 1000BaseT Master/Slave setting */ @@ -1669,9 +1670,9 @@ mii_get_media_pcs (struct net_device *dev) __u16 bmcr = mii_read (dev, phy_addr, PCS_BMCR); printk (KERN_INFO "Operating at 1000 Mbps, "); if (bmcr & MII_BMCR_DUPLEX_MODE) { - printk ("Full duplex\n"); + 
printk (KERN_CONT "Full duplex\n"); } else { - printk ("Half duplex\n"); + printk (KERN_CONT "Half duplex\n"); } } if (np->tx_flow) diff --git a/drivers/net/epic100.c b/drivers/net/epic100.c index b60e27dfcfa7..88d7ebf31220 100644 --- a/drivers/net/epic100.c +++ b/drivers/net/epic100.c @@ -338,8 +338,7 @@ static int __devinit epic_init_one (struct pci_dev *pdev, #ifndef MODULE static int printed_version; if (!printed_version++) - printk (KERN_INFO "%s" KERN_INFO "%s", - version, version2); + printk(KERN_INFO "%s%s", version, version2); #endif card_idx++; @@ -1600,7 +1599,7 @@ static int __init epic_init (void) { /* when a module, this is printed whether or not devices are found in probe */ #ifdef MODULE - printk (KERN_INFO "%s" KERN_INFO "%s", + printk (KERN_INFO "%s%s", version, version2); #endif diff --git a/drivers/net/fealnx.c b/drivers/net/fealnx.c index 891be28a7d4f..053fb49820b9 100644 --- a/drivers/net/fealnx.c +++ b/drivers/net/fealnx.c @@ -1209,17 +1209,20 @@ static void fealnx_tx_timeout(struct net_device *dev) unsigned long flags; int i; - printk(KERN_WARNING "%s: Transmit timed out, status %8.8x," - " resetting...\n", dev->name, ioread32(ioaddr + ISR)); + printk(KERN_WARNING + "%s: Transmit timed out, status %8.8x, resetting...\n", + dev->name, ioread32(ioaddr + ISR)); { printk(KERN_DEBUG " Rx ring %p: ", np->rx_ring); for (i = 0; i < RX_RING_SIZE; i++) - printk(" %8.8x", (unsigned int) np->rx_ring[i].status); - printk("\n" KERN_DEBUG " Tx ring %p: ", np->tx_ring); + printk(PR_CONT " %8.8x", + (unsigned int) np->rx_ring[i].status); + printk(KERN_CONT "\n"); + printk(KERN_DEBUG " Tx ring %p: ", np->tx_ring); for (i = 0; i < TX_RING_SIZE; i++) - printk(" %4.4x", np->tx_ring[i].status); - printk("\n"); + printk(PR_CONT " %4.4x", np->tx_ring[i].status); + printk(PR_CONT "\n"); } spin_lock_irqsave(&np->lock, flags); diff --git a/drivers/net/hamachi.c b/drivers/net/hamachi.c index 9d5b62cb30f7..d62378cbc149 100644 --- a/drivers/net/hamachi.c +++ b/drivers/net/hamachi.c @@ -173,8 +173,8 @@ static int tx_params[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1}; static const char version[] __devinitconst = KERN_INFO DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " Written by Donald Becker\n" -KERN_INFO " Some modifications by Eric kasten \n" -KERN_INFO " Further modifications by Keith Underwood \n"; +" Some modifications by Eric kasten \n" +" Further modifications by Keith Underwood \n"; /* IP_MF appears to be only defined in , however, @@ -1080,11 +1080,14 @@ static void hamachi_tx_timeout(struct net_device *dev) { printk(KERN_DEBUG " Rx ring %p: ", hmp->rx_ring); for (i = 0; i < RX_RING_SIZE; i++) - printk(" %8.8x", le32_to_cpu(hmp->rx_ring[i].status_n_length)); - printk("\n"KERN_DEBUG" Tx ring %p: ", hmp->tx_ring); + printk(KERN_CONT " %8.8x", + le32_to_cpu(hmp->rx_ring[i].status_n_length)); + printk(KERN_CONT "\n"); + printk(KERN_DEBUG" Tx ring %p: ", hmp->tx_ring); for (i = 0; i < TX_RING_SIZE; i++) - printk(" %4.4x", le32_to_cpu(hmp->tx_ring[i].status_n_length)); - printk("\n"); + printk(KERN_CONT " %4.4x", + le32_to_cpu(hmp->tx_ring[i].status_n_length)); + printk(KERN_CONT "\n"); } /* Reinit the hardware and make sure the Rx and Tx processes @@ -1753,13 +1756,13 @@ static int hamachi_close(struct net_device *dev) #ifdef __i386__ if (hamachi_debug > 2) { - printk("\n"KERN_DEBUG" Tx ring at %8.8x:\n", + printk(KERN_DEBUG " Tx ring at %8.8x:\n", (int)hmp->tx_ring_dma); for (i = 0; i < TX_RING_SIZE; i++) - printk(" %c #%d desc. %8.8x %8.8x.\n", + printk(KERN_DEBUG " %c #%d desc. 
%8.8x %8.8x.\n", readl(ioaddr + TxCurPtr) == (long)&hmp->tx_ring[i] ? '>' : ' ', i, hmp->tx_ring[i].status_n_length, hmp->tx_ring[i].addr); - printk("\n"KERN_DEBUG " Rx ring %8.8x:\n", + printk(KERN_DEBUG " Rx ring %8.8x:\n", (int)hmp->rx_ring_dma); for (i = 0; i < RX_RING_SIZE; i++) { printk(KERN_DEBUG " %c #%d desc. %4.4x %8.8x\n", @@ -1770,7 +1773,7 @@ static int hamachi_close(struct net_device *dev) u16 *addr = (u16 *) hmp->rx_skbuff[i]->data; int j; - + printk(KERN_DEBUG "Addr: "); for (j = 0; j < 0x50; j++) printk(" %4.4x", addr[j]); printk("\n"); diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 5e4b7afd0683..352703255bba 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -68,7 +68,7 @@ static const char paranoia_str[] = KERN_ERR static const char bc_drvname[] = "baycom_epp"; static const char bc_drvinfo[] = KERN_INFO "baycom_epp: (C) 1998-2000 Thomas Sailer, HB9JNX/AE4WA\n" -KERN_INFO "baycom_epp: version 0.7 compiled " __TIME__ " " __DATE__ "\n"; +"baycom_epp: version 0.7 compiled " __TIME__ " " __DATE__ "\n"; /* --------------------------------------------------------------------- */ diff --git a/drivers/net/hamradio/baycom_par.c b/drivers/net/hamradio/baycom_par.c index 2e6fc4dc74b1..5f5af9a606f8 100644 --- a/drivers/net/hamradio/baycom_par.c +++ b/drivers/net/hamradio/baycom_par.c @@ -102,7 +102,7 @@ static const char bc_drvname[] = "baycom_par"; static const char bc_drvinfo[] = KERN_INFO "baycom_par: (C) 1996-2000 Thomas Sailer, HB9JNX/AE4WA\n" -KERN_INFO "baycom_par: version 0.9 compiled " __TIME__ " " __DATE__ "\n"; +"baycom_par: version 0.9 compiled " __TIME__ " " __DATE__ "\n"; /* --------------------------------------------------------------------- */ diff --git a/drivers/net/hamradio/baycom_ser_fdx.c b/drivers/net/hamradio/baycom_ser_fdx.c index b6a816e60c0f..aa4488e871b2 100644 --- a/drivers/net/hamradio/baycom_ser_fdx.c +++ b/drivers/net/hamradio/baycom_ser_fdx.c @@ -91,7 +91,7 @@ static const char bc_drvname[] = "baycom_ser_fdx"; static const char bc_drvinfo[] = KERN_INFO "baycom_ser_fdx: (C) 1996-2000 Thomas Sailer, HB9JNX/AE4WA\n" -KERN_INFO "baycom_ser_fdx: version 0.10 compiled " __TIME__ " " __DATE__ "\n"; +"baycom_ser_fdx: version 0.10 compiled " __TIME__ " " __DATE__ "\n"; /* --------------------------------------------------------------------- */ diff --git a/drivers/net/hamradio/baycom_ser_hdx.c b/drivers/net/hamradio/baycom_ser_hdx.c index 3bcc57acbe6d..88c593596020 100644 --- a/drivers/net/hamradio/baycom_ser_hdx.c +++ b/drivers/net/hamradio/baycom_ser_hdx.c @@ -79,7 +79,7 @@ static const char bc_drvname[] = "baycom_ser_hdx"; static const char bc_drvinfo[] = KERN_INFO "baycom_ser_hdx: (C) 1996-2000 Thomas Sailer, HB9JNX/AE4WA\n" -KERN_INFO "baycom_ser_hdx: version 0.10 compiled " __TIME__ " " __DATE__ "\n"; +"baycom_ser_hdx: version 0.10 compiled " __TIME__ " " __DATE__ "\n"; /* --------------------------------------------------------------------- */ diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c index c9bfe4eea189..78c088331f57 100644 --- a/drivers/net/natsemi.c +++ b/drivers/net/natsemi.c @@ -130,8 +130,8 @@ static int full_duplex[MAX_UNITS]; static const char version[] __devinitconst = KERN_INFO DRV_NAME " dp8381x driver, version " DRV_VERSION ", " DRV_RELDATE "\n" - KERN_INFO " originally by Donald Becker \n" - KERN_INFO " 2.4.x kernel port by Jeff Garzik, Tjeerd Mulder\n"; + " originally by Donald Becker \n" + " 2.4.x kernel port by Jeff Garzik, Tjeerd Mulder\n"; 
MODULE_AUTHOR("Donald Becker "); MODULE_DESCRIPTION("National Semiconductor DP8381x series PCI Ethernet driver"); diff --git a/drivers/net/ne.c b/drivers/net/ne.c index 5c3e242428f1..992dbfffdb05 100644 --- a/drivers/net/ne.c +++ b/drivers/net/ne.c @@ -321,7 +321,7 @@ static int __init ne_probe1(struct net_device *dev, unsigned long ioaddr) } if (ei_debug && version_printed++ == 0) - printk(KERN_INFO "%s" KERN_INFO "%s", version1, version2); + printk(KERN_INFO "%s%s", version1, version2); printk(KERN_INFO "NE*000 ethercard probe at %#3lx:", ioaddr); diff --git a/drivers/net/pci-skeleton.c b/drivers/net/pci-skeleton.c index 8c1f6988f398..89f7b2ad5231 100644 --- a/drivers/net/pci-skeleton.c +++ b/drivers/net/pci-skeleton.c @@ -105,7 +105,7 @@ IVc. Errata static char version[] __devinitdata = KERN_INFO NETDRV_DRIVER_LOAD_MSG "\n" -KERN_INFO " Support available from http://foo.com/bar/baz.html\n"; +" Support available from http://foo.com/bar/baz.html\n"; /* define to 1 to enable PIO instead of MMIO */ #undef USE_IO_OPS diff --git a/drivers/net/pcmcia/ibmtr_cs.c b/drivers/net/pcmcia/ibmtr_cs.c index f51944b28cfa..06618af1a468 100644 --- a/drivers/net/pcmcia/ibmtr_cs.c +++ b/drivers/net/pcmcia/ibmtr_cs.c @@ -298,14 +298,11 @@ static int __devinit ibmtr_config(struct pcmcia_device *link) strcpy(info->node.dev_name, dev->name); - printk(KERN_INFO "%s: port %#3lx, irq %d,", - dev->name, dev->base_addr, dev->irq); - printk (" mmio %#5lx,", (u_long)ti->mmio); - printk (" sram %#5lx,", (u_long)ti->sram_base << 12); - printk ("\n" KERN_INFO " hwaddr="); - for (i = 0; i < TR_ALEN; i++) - printk("%02X", dev->dev_addr[i]); - printk("\n"); + printk(KERN_INFO + "%s: port %#3lx, irq %d, mmio %#5lx, sram %#5lx, hwaddr=%pM\n", + dev->name, dev->base_addr, dev->irq, + (u_long)ti->mmio, (u_long)(ti->sram_base << 12), + dev->dev_addr); return 0; cs_failed: diff --git a/drivers/net/pcmcia/nmclan_cs.c b/drivers/net/pcmcia/nmclan_cs.c index 02ef63ed1f99..36de91baf238 100644 --- a/drivers/net/pcmcia/nmclan_cs.c +++ b/drivers/net/pcmcia/nmclan_cs.c @@ -1425,15 +1425,12 @@ static void BuildLAF(int *ladrf, int *adr) ladrf[byte] |= (1 << (hashcode & 7)); #ifdef PCMCIA_DEBUG - if (pc_debug > 2) { - printk(KERN_DEBUG " adr ="); - for (i = 0; i < 6; i++) - printk(" %02X", adr[i]); - printk("\n" KERN_DEBUG " hashcode = %d(decimal), ladrf[0:63]" - " =", hashcode); - for (i = 0; i < 8; i++) - printk(" %02X", ladrf[i]); - printk("\n"); + if (pc_debug > 2) + printk(KERN_DEBUG " adr =%pM\n", adr); + printk(KERN_DEBUG " hashcode = %d(decimal), ladrf[0:63] =", hashcode); + for (i = 0; i < 8; i++) + printk(KERN_CONT " %02X", ladrf[i]); + printk(KERN_CONT "\n"); } #endif } /* BuildLAF */ diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c index 1c35e1d637a0..28368157dac4 100644 --- a/drivers/net/pcnet32.c +++ b/drivers/net/pcnet32.c @@ -485,7 +485,7 @@ static void pcnet32_realloc_tx_ring(struct net_device *dev, &new_ring_dma_addr); if (new_tx_ring == NULL) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR + printk(KERN_ERR "%s: Consistent memory allocation failed.\n", dev->name); return; @@ -496,7 +496,7 @@ static void pcnet32_realloc_tx_ring(struct net_device *dev, GFP_ATOMIC); if (!new_dma_addr_list) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR + printk(KERN_ERR "%s: Memory allocation failed.\n", dev->name); goto free_new_tx_ring; } @@ -505,7 +505,7 @@ static void pcnet32_realloc_tx_ring(struct net_device *dev, GFP_ATOMIC); if (!new_skb_list) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR + printk(KERN_ERR "%s: Memory 
allocation failed.\n", dev->name); goto free_new_lists; } @@ -563,7 +563,7 @@ static void pcnet32_realloc_rx_ring(struct net_device *dev, &new_ring_dma_addr); if (new_rx_ring == NULL) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR + printk(KERN_ERR "%s: Consistent memory allocation failed.\n", dev->name); return; @@ -574,7 +574,7 @@ static void pcnet32_realloc_rx_ring(struct net_device *dev, GFP_ATOMIC); if (!new_dma_addr_list) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR + printk(KERN_ERR "%s: Memory allocation failed.\n", dev->name); goto free_new_rx_ring; } @@ -583,7 +583,7 @@ static void pcnet32_realloc_rx_ring(struct net_device *dev, GFP_ATOMIC); if (!new_skb_list) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR + printk(KERN_ERR "%s: Memory allocation failed.\n", dev->name); goto free_new_lists; } @@ -1766,38 +1766,38 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) /* Version 0x2623 and 0x2624 */ if (((chip_version + 1) & 0xfffe) == 0x2624) { i = a->read_csr(ioaddr, 80) & 0x0C00; /* Check tx_start_pt */ - printk("\n" KERN_INFO " tx_start_pt(0x%04x):", i); + printk(KERN_INFO " tx_start_pt(0x%04x):", i); switch (i >> 10) { case 0: - printk(" 20 bytes,"); + printk(KERN_CONT " 20 bytes,"); break; case 1: - printk(" 64 bytes,"); + printk(KERN_CONT " 64 bytes,"); break; case 2: - printk(" 128 bytes,"); + printk(KERN_CONT " 128 bytes,"); break; case 3: - printk("~220 bytes,"); + printk(KERN_CONT "~220 bytes,"); break; } i = a->read_bcr(ioaddr, 18); /* Check Burst/Bus control */ - printk(" BCR18(%x):", i & 0xffff); + printk(KERN_CONT " BCR18(%x):", i & 0xffff); if (i & (1 << 5)) - printk("BurstWrEn "); + printk(KERN_CONT "BurstWrEn "); if (i & (1 << 6)) - printk("BurstRdEn "); + printk(KERN_CONT "BurstRdEn "); if (i & (1 << 7)) - printk("DWordIO "); + printk(KERN_CONT "DWordIO "); if (i & (1 << 11)) - printk("NoUFlow "); + printk(KERN_CONT "NoUFlow "); i = a->read_bcr(ioaddr, 25); - printk("\n" KERN_INFO " SRAMSIZE=0x%04x,", i << 8); + printk(KERN_INFO " SRAMSIZE=0x%04x,", i << 8); i = a->read_bcr(ioaddr, 26); - printk(" SRAM_BND=0x%04x,", i << 8); + printk(KERN_CONT " SRAM_BND=0x%04x,", i << 8); i = a->read_bcr(ioaddr, 27); if (i & (1 << 14)) - printk("LowLatRx"); + printk(KERN_CONT "LowLatRx"); } } @@ -1996,7 +1996,7 @@ static int pcnet32_alloc_ring(struct net_device *dev, const char *name) &lp->tx_ring_dma_addr); if (lp->tx_ring == NULL) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR PFX + printk(KERN_ERR PFX "%s: Consistent memory allocation failed.\n", name); return -ENOMEM; @@ -2008,7 +2008,7 @@ static int pcnet32_alloc_ring(struct net_device *dev, const char *name) &lp->rx_ring_dma_addr); if (lp->rx_ring == NULL) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR PFX + printk(KERN_ERR PFX "%s: Consistent memory allocation failed.\n", name); return -ENOMEM; @@ -2018,7 +2018,7 @@ static int pcnet32_alloc_ring(struct net_device *dev, const char *name) GFP_ATOMIC); if (!lp->tx_dma_addr) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR PFX + printk(KERN_ERR PFX "%s: Memory allocation failed.\n", name); return -ENOMEM; } @@ -2027,7 +2027,7 @@ static int pcnet32_alloc_ring(struct net_device *dev, const char *name) GFP_ATOMIC); if (!lp->rx_dma_addr) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR PFX + printk(KERN_ERR PFX "%s: Memory allocation failed.\n", name); return -ENOMEM; } @@ -2036,7 +2036,7 @@ static int pcnet32_alloc_ring(struct net_device *dev, const char *name) GFP_ATOMIC); if (!lp->tx_skbuff) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR PFX + 
printk(KERN_ERR PFX "%s: Memory allocation failed.\n", name); return -ENOMEM; } @@ -2045,7 +2045,7 @@ static int pcnet32_alloc_ring(struct net_device *dev, const char *name) GFP_ATOMIC); if (!lp->rx_skbuff) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR PFX + printk(KERN_ERR PFX "%s: Memory allocation failed.\n", name); return -ENOMEM; } diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c index 838cce8b8fff..669253c7bd41 100644 --- a/drivers/net/starfire.c +++ b/drivers/net/starfire.c @@ -180,7 +180,7 @@ static int full_duplex[MAX_UNITS] = {0, }; /* These identify the driver base version and may not be removed. */ static const char version[] __devinitconst = KERN_INFO "starfire.c:v1.03 7/26/2000 Written by Donald Becker \n" -KERN_INFO " (unofficial 2.2/2.4 kernel port, version " DRV_VERSION ", " DRV_RELDATE ")\n"; +" (unofficial 2.2/2.4 kernel port, version " DRV_VERSION ", " DRV_RELDATE ")\n"; MODULE_AUTHOR("Donald Becker "); MODULE_DESCRIPTION("Adaptec Starfire Ethernet driver"); diff --git a/drivers/net/sundance.c b/drivers/net/sundance.c index 545f81b34ad7..d1521c3875b2 100644 --- a/drivers/net/sundance.c +++ b/drivers/net/sundance.c @@ -1698,13 +1698,13 @@ static int netdev_close(struct net_device *dev) #ifdef __i386__ if (netif_msg_hw(np)) { - printk("\n"KERN_DEBUG" Tx ring at %8.8x:\n", + printk(KERN_DEBUG " Tx ring at %8.8x:\n", (int)(np->tx_ring_dma)); for (i = 0; i < TX_RING_SIZE; i++) - printk(" #%d desc. %4.4x %8.8x %8.8x.\n", + printk(KERN_DEBUG " #%d desc. %4.4x %8.8x %8.8x.\n", i, np->tx_ring[i].status, np->tx_ring[i].frag[0].addr, np->tx_ring[i].frag[0].length); - printk("\n"KERN_DEBUG " Rx ring %8.8x:\n", + printk(KERN_DEBUG " Rx ring %8.8x:\n", (int)(np->rx_ring_dma)); for (i = 0; i < /*RX_RING_SIZE*/4 ; i++) { printk(KERN_DEBUG " #%d desc. %4.4x %4.4x %8.8x\n", diff --git a/drivers/net/tsi108_eth.c b/drivers/net/tsi108_eth.c index 0f78f99f9b20..7030bd5e9848 100644 --- a/drivers/net/tsi108_eth.c +++ b/drivers/net/tsi108_eth.c @@ -1132,7 +1132,9 @@ static int tsi108_get_mac(struct net_device *dev) } if (!is_valid_ether_addr(dev->dev_addr)) { - printk("KERN_ERR: word1: %08x, word2: %08x\n", word1, word2); + printk(KERN_ERR + "%s: Invalid MAC address. 
word1: %08x, word2: %08x\n", + dev->name, word1, word2); return -EINVAL; } @@ -1201,8 +1203,8 @@ static void tsi108_set_rx_mode(struct net_device *dev) __set_bit(hash, &data->mc_hash[0]); } else { printk(KERN_ERR - "%s: got multicast address of length %d " - "instead of 6.\n", dev->name, + "%s: got multicast address of length %d instead of 6.\n", + dev->name, mc->dmi_addrlen); } diff --git a/drivers/net/tulip/de2104x.c b/drivers/net/tulip/de2104x.c index 81f054dbb88d..ef49744a5085 100644 --- a/drivers/net/tulip/de2104x.c +++ b/drivers/net/tulip/de2104x.c @@ -944,9 +944,10 @@ static void de_set_media (struct de_private *de) macmode &= ~FullDuplex; if (netif_msg_link(de)) { - printk(KERN_INFO "%s: set link %s\n" - KERN_INFO "%s: mode 0x%x, sia 0x%x,0x%x,0x%x,0x%x\n" - KERN_INFO "%s: set mode 0x%x, set sia 0x%x,0x%x,0x%x\n", + printk(KERN_INFO + "%s: set link %s\n" + "%s: mode 0x%x, sia 0x%x,0x%x,0x%x,0x%x\n" + "%s: set mode 0x%x, set sia 0x%x,0x%x,0x%x\n", de->dev->name, media_name[media], de->dev->name, dr32(MacMode), dr32(SIAStatus), dr32(CSR13), dr32(CSR14), dr32(CSR15), diff --git a/drivers/net/tulip/tulip_core.c b/drivers/net/tulip/tulip_core.c index 2abb5d3becc6..99a63649f4fc 100644 --- a/drivers/net/tulip/tulip_core.c +++ b/drivers/net/tulip/tulip_core.c @@ -570,16 +570,18 @@ static void tulip_tx_timeout(struct net_device *dev) (unsigned int)tp->rx_ring[i].buffer2, buf[0], buf[1], buf[2]); for (j = 0; buf[j] != 0xee && j < 1600; j++) - if (j < 100) printk(" %2.2x", buf[j]); - printk(" j=%d.\n", j); + if (j < 100) + printk(KERN_CONT " %2.2x", buf[j]); + printk(KERN_CONT " j=%d.\n", j); } printk(KERN_DEBUG " Rx ring %8.8x: ", (int)tp->rx_ring); for (i = 0; i < RX_RING_SIZE; i++) - printk(" %8.8x", (unsigned int)tp->rx_ring[i].status); - printk("\n" KERN_DEBUG " Tx ring %8.8x: ", (int)tp->tx_ring); + printk(KERN_CONT " %8.8x", + (unsigned int)tp->rx_ring[i].status); + printk(KERN_DEBUG " Tx ring %8.8x: ", (int)tp->tx_ring); for (i = 0; i < TX_RING_SIZE; i++) - printk(" %8.8x", (unsigned int)tp->tx_ring[i].status); - printk("\n"); + printk(KERN_CONT " %8.8x", (unsigned int)tp->tx_ring[i].status); + printk(KERN_CONT "\n"); } #endif diff --git a/drivers/net/tulip/winbond-840.c b/drivers/net/tulip/winbond-840.c index 842b1a2c40d4..0f15773dae52 100644 --- a/drivers/net/tulip/winbond-840.c +++ b/drivers/net/tulip/winbond-840.c @@ -142,7 +142,7 @@ static int full_duplex[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1}; static const char version[] __initconst = KERN_INFO DRV_NAME ".c:v" DRV_VERSION " (2.4 port) " DRV_RELDATE " Donald Becker \n" - KERN_INFO " http://www.scyld.com/network/drivers.html\n"; + " http://www.scyld.com/network/drivers.html\n"; MODULE_AUTHOR("Donald Becker "); MODULE_DESCRIPTION("Winbond W89c840 Ethernet driver"); @@ -939,7 +939,7 @@ static void tx_timeout(struct net_device *dev) printk(KERN_DEBUG " Rx ring %p: ", np->rx_ring); for (i = 0; i < RX_RING_SIZE; i++) printk(" %8.8x", (unsigned int)np->rx_ring[i].status); - printk("\n"KERN_DEBUG" Tx ring %p: ", np->tx_ring); + printk(KERN_DEBUG" Tx ring %p: ", np->tx_ring); for (i = 0; i < TX_RING_SIZE; i++) printk(" %8.8x", np->tx_ring[i].status); printk("\n"); @@ -1520,7 +1520,7 @@ static int netdev_close(struct net_device *dev) printk(KERN_DEBUG " #%d desc. %4.4x %4.4x %8.8x.\n", i, np->tx_ring[i].length, np->tx_ring[i].status, np->tx_ring[i].buffer1); - printk("\n"KERN_DEBUG " Rx ring %8.8x:\n", + printk(KERN_DEBUG " Rx ring %8.8x:\n", (int)np->rx_ring); for (i = 0; i < RX_RING_SIZE; i++) { printk(KERN_DEBUG " #%d desc. 
%4.4x %4.4x %8.8x\n", diff --git a/drivers/net/wan/hd64570.c b/drivers/net/wan/hd64570.c index 223238de475c..1ea1ef6c3b96 100644 --- a/drivers/net/wan/hd64570.c +++ b/drivers/net/wan/hd64570.c @@ -584,8 +584,9 @@ static void sca_dump_rings(struct net_device *dev) sca_in(DSR_RX(phy_node(port)), card) & DSR_DE ? "" : "in"); for (cnt = 0; cnt < port_to_card(port)->rx_ring_buffers; cnt++) printk(" %02X", readb(&(desc_address(port, cnt, 0)->stat))); + printk(KERN_CONT "\n"); - printk("\n" KERN_DEBUG "TX ring: CDA=%u EDA=%u DSR=%02X in=%u " + printk(KERN_DEBUG "TX ring: CDA=%u EDA=%u DSR=%02X in=%u " "last=%u %sactive", sca_inw(get_dmac_tx(port) + CDAL, card), sca_inw(get_dmac_tx(port) + EDAL, card), diff --git a/drivers/net/wan/hd64572.c b/drivers/net/wan/hd64572.c index 497b003d7239..f099c34a3ae2 100644 --- a/drivers/net/wan/hd64572.c +++ b/drivers/net/wan/hd64572.c @@ -529,8 +529,9 @@ static void sca_dump_rings(struct net_device *dev) sca_in(DSR_RX(port->chan), card) & DSR_DE ? "" : "in"); for (cnt = 0; cnt < port->card->rx_ring_buffers; cnt++) printk(" %02X", readb(&(desc_address(port, cnt, 0)->stat))); + printk(KERN_CONT "\n"); - printk("\n" KERN_DEBUG "TX ring: CDA=%u EDA=%u DSR=%02X in=%u " + printk(KERN_DEBUG "TX ring: CDA=%u EDA=%u DSR=%02X in=%u " "last=%u %sactive", sca_inl(get_dmac_tx(port) + CDAL, card), sca_inl(get_dmac_tx(port) + EDAL, card), diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c index 3fb9dbc88a1a..d14e95a08d66 100644 --- a/drivers/net/wan/sbni.c +++ b/drivers/net/wan/sbni.c @@ -326,11 +326,9 @@ sbni_pci_probe( struct net_device *dev ) } if (pci_irq_line <= 0 || pci_irq_line >= nr_irqs) - printk( KERN_WARNING " WARNING: The PCI BIOS assigned " - "this PCI card to IRQ %d, which is unlikely " - "to work!.\n" - KERN_WARNING " You should use the PCI BIOS " - "setup to assign a valid IRQ line.\n", + printk( KERN_WARNING + " WARNING: The PCI BIOS assigned this PCI card to IRQ %d, which is unlikely to work!.\n" + " You should use the PCI BIOS setup to assign a valid IRQ line.\n", pci_irq_line ); /* avoiding re-enable dual adapters */ diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index b10b0383dfa5..698b11b1cadb 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -2427,11 +2427,10 @@ static void untranslate(ray_dev_t *local, struct sk_buff *skb, int len) #ifdef PCMCIA_DEBUG if (pc_debug > 3) { - int i; - printk(KERN_DEBUG "skb->data before untranslate"); - for (i = 0; i < 64; i++) - printk("%02x ", skb->data[i]); - printk("\n" KERN_DEBUG + print_hex_dump(KERN_DEBUG, "skb->data before untranslate: ", + DUMP_PREFIX_NONE, 16, 1, + skb->data, 64, true); + printk(KERN_DEBUG "type = %08x, xsap = %02x%02x%02x, org = %02x02x02x\n", ntohs(type), psnap->dsap, psnap->ssap, psnap->ctrl, psnap->org[0], psnap->org[1], psnap->org[2]); diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c index 6af706408ac0..c6d300666ad8 100644 --- a/drivers/net/wireless/wavelan_cs.c +++ b/drivers/net/wireless/wavelan_cs.c @@ -3556,17 +3556,8 @@ wv_82593_config(struct net_device * dev) cfblk.rcvstop = TRUE; /* Enable Receive Stop Register */ #ifdef DEBUG_I82593_SHOW - { - u_char *c = (u_char *) &cfblk; - int i; - printk(KERN_DEBUG "wavelan_cs: config block:"); - for(i = 0; i < sizeof(struct i82593_conf_block); i++,c++) - { - if((i % 16) == 0) printk("\n" KERN_DEBUG); - printk("%02x ", *c); - } - printk("\n"); - } + print_hex_dump(KERN_DEBUG, "wavelan_cs: config block: ", DUMP_PREFIX_NONE, + 16, 1, &cfblk, 
sizeof(struct i82593_conf_block), false); #endif /* Copy the config block to the i82593 */ diff --git a/drivers/net/yellowfin.c b/drivers/net/yellowfin.c index 3c7a5053f1da..a07580138e81 100644 --- a/drivers/net/yellowfin.c +++ b/drivers/net/yellowfin.c @@ -109,7 +109,7 @@ static int gx_fix; /* These identify the driver base version and may not be removed. */ static const char version[] __devinitconst = KERN_INFO DRV_NAME ".c:v1.05 1/09/2001 Written by Donald Becker \n" - KERN_INFO " (unofficial 2.4.x port, " DRV_VERSION ", " DRV_RELDATE ")\n"; + " (unofficial 2.4.x port, " DRV_VERSION ", " DRV_RELDATE ")\n"; MODULE_AUTHOR("Donald Becker "); MODULE_DESCRIPTION("Packet Engines Yellowfin G-NIC Gigabit Ethernet driver"); @@ -700,12 +700,15 @@ static void yellowfin_tx_timeout(struct net_device *dev) int i; printk(KERN_WARNING " Rx ring %p: ", yp->rx_ring); for (i = 0; i < RX_RING_SIZE; i++) - printk(" %8.8x", yp->rx_ring[i].result_status); - printk("\n"KERN_WARNING" Tx ring %p: ", yp->tx_ring); + printk(KERN_CONT " %8.8x", + yp->rx_ring[i].result_status); + printk(KERN_CONT "\n"); + printk(KERN_WARNING" Tx ring %p: ", yp->tx_ring); for (i = 0; i < TX_RING_SIZE; i++) - printk(" %4.4x /%8.8x", yp->tx_status[i].tx_errs, - yp->tx_ring[i].result_status); - printk("\n"); + printk(KERN_CONT " %4.4x /%8.8x", + yp->tx_status[i].tx_errs, + yp->tx_ring[i].result_status); + printk(KERN_CONT "\n"); } /* If the hardware is found to hang regularly, we will update the code @@ -1216,20 +1219,20 @@ static int yellowfin_close(struct net_device *dev) #if defined(__i386__) if (yellowfin_debug > 2) { - printk("\n"KERN_DEBUG" Tx ring at %8.8llx:\n", + printk(KERN_DEBUG" Tx ring at %8.8llx:\n", (unsigned long long)yp->tx_ring_dma); for (i = 0; i < TX_RING_SIZE*2; i++) - printk(" %c #%d desc. %8.8x %8.8x %8.8x %8.8x.\n", + printk(KERN_DEBUG " %c #%d desc. %8.8x %8.8x %8.8x %8.8x.\n", ioread32(ioaddr + TxPtr) == (long)&yp->tx_ring[i] ? '>' : ' ', i, yp->tx_ring[i].dbdma_cmd, yp->tx_ring[i].addr, yp->tx_ring[i].branch_addr, yp->tx_ring[i].result_status); printk(KERN_DEBUG " Tx status %p:\n", yp->tx_status); for (i = 0; i < TX_RING_SIZE; i++) - printk(" #%d status %4.4x %4.4x %4.4x %4.4x.\n", + printk(KERN_DEBUG " #%d status %4.4x %4.4x %4.4x %4.4x.\n", i, yp->tx_status[i].tx_cnt, yp->tx_status[i].tx_errs, yp->tx_status[i].total_tx_cnt, yp->tx_status[i].paused); - printk("\n"KERN_DEBUG " Rx ring %8.8llx:\n", + printk(KERN_DEBUG " Rx ring %8.8llx:\n", (unsigned long long)yp->rx_ring_dma); for (i = 0; i < RX_RING_SIZE; i++) { printk(KERN_DEBUG " %c #%d desc. 
%8.8x %8.8x %8.8x\n", diff --git a/drivers/parisc/eisa_enumerator.c b/drivers/parisc/eisa_enumerator.c index c709ecc2b7f7..0be1d50645ab 100644 --- a/drivers/parisc/eisa_enumerator.c +++ b/drivers/parisc/eisa_enumerator.c @@ -101,7 +101,7 @@ static int configure_memory(const unsigned char *buf, printk("memory %lx-%lx ", (unsigned long)res->start, (unsigned long)res->end); result = request_resource(mem_parent, res); if (result < 0) { - printk("\n" KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n"); + printk(KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n"); return result; } } @@ -191,7 +191,7 @@ static int configure_port(const unsigned char *buf, struct resource *io_parent, printk("ioports %lx-%lx ", (unsigned long)res->start, (unsigned long)res->end); result = request_resource(io_parent, res); if (result < 0) { - printk("\n" KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n"); + printk(KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n"); return result; } } @@ -224,7 +224,7 @@ static int configure_port_init(const unsigned char *buf) case HPEE_PORT_INIT_WIDTH_BYTE: s=1; if (c & HPEE_PORT_INIT_MASK) { - printk("\n" KERN_WARNING "port_init: unverified mask attribute\n"); + printk(KERN_WARNING "port_init: unverified mask attribute\n"); outb((inb(get_16(buf+len+1) & get_8(buf+len+3)) | get_8(buf+len+4)), get_16(buf+len+1)); @@ -249,7 +249,7 @@ static int configure_port_init(const unsigned char *buf) case HPEE_PORT_INIT_WIDTH_DWORD: s=4; if (c & HPEE_PORT_INIT_MASK) { - printk("\n" KERN_WARNING "port_init: unverified mask attribute\n"); + printk(KERN_WARNING "port_init: unverified mask attribute\n"); outl((inl(get_16(buf+len+1) & get_32(buf+len+3)) | get_32(buf+len+7)), get_16(buf+len+1)); @@ -259,7 +259,7 @@ static int configure_port_init(const unsigned char *buf) break; default: - printk("\n" KERN_ERR "Invalid port init word %02x\n", c); + printk(KERN_ERR "Invalid port init word %02x\n", c); return 0; } @@ -297,7 +297,7 @@ static int configure_type_string(const unsigned char *buf) /* just skip past the type field */ len = get_8(buf); if (len > 80) { - printk("\n" KERN_ERR "eisa_enumerator: type info field too long (%d, max is 80)\n", len); + printk(KERN_ERR "eisa_enumerator: type info field too long (%d, max is 80)\n", len); } return 1+len; @@ -398,7 +398,7 @@ static int parse_slot_config(int slot, } if (p0 + function_len < pos) { - printk("\n" KERN_ERR "eisa_enumerator: function %d length mis-match " + printk(KERN_ERR "eisa_enumerator: function %d length mis-match " "got %d, expected %d\n", num_func, pos-p0, function_len); res=-1; diff --git a/drivers/pcmcia/tcic.c b/drivers/pcmcia/tcic.c index 9ad97ea836e8..8eb04230fec7 100644 --- a/drivers/pcmcia/tcic.c +++ b/drivers/pcmcia/tcic.c @@ -472,7 +472,8 @@ static int __init init_tcic(void) init_timer(&poll_timer); /* Build interrupt mask */ - printk(", %d sockets\n" KERN_INFO " irq list (", sockets); + printk(KERN_CONT ", %d sockets\n", sockets); + printk(KERN_INFO " irq list ("); if (irq_list_count == 0) mask = irq_mask; else diff --git a/drivers/scsi/atari_NCR5380.c b/drivers/scsi/atari_NCR5380.c index 0471f8800483..4240b05aef6d 100644 --- a/drivers/scsi/atari_NCR5380.c +++ b/drivers/scsi/atari_NCR5380.c @@ -2826,8 +2826,7 @@ int NCR5380_abort(Scsi_Cmnd *cmd) */ local_irq_restore(flags); - printk(KERN_INFO "scsi%d: warning : SCSI command probably completed successfully\n" - KERN_INFO " before abortion\n", HOSTNO); + printk(KERN_INFO "scsi%d: warning : SCSI command 
probably completed successfully before abortion\n", HOSTNO); /* Maybe it is sufficient just to release the ST-DMA lock... (if * possible at all) At least, we should check if the lock could be diff --git a/drivers/scsi/mac53c94.c b/drivers/scsi/mac53c94.c index b12ad7c7c673..18735b39b3d3 100644 --- a/drivers/scsi/mac53c94.c +++ b/drivers/scsi/mac53c94.c @@ -75,8 +75,9 @@ static int mac53c94_queue(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd * int i; printk(KERN_DEBUG "mac53c94_queue %p: command is", cmd); for (i = 0; i < cmd->cmd_len; ++i) - printk(" %.2x", cmd->cmnd[i]); - printk("\n" KERN_DEBUG "use_sg=%d request_bufflen=%d request_buffer=%p\n", + printk(KERN_CONT " %.2x", cmd->cmnd[i]); + printk(KERN_CONT "\n"); + printk(KERN_DEBUG "use_sg=%d request_bufflen=%d request_buffer=%p\n", scsi_sg_count(cmd), scsi_bufflen(cmd), scsi_sglist(cmd)); } #endif diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ef142fd47a83..4d6f2fe1cfe9 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -619,7 +619,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) if (strcmp(current->comm, cmd) && printk_ratelimit()) { printk(KERN_WARNING "sg_write: data in/out %d/%d bytes for SCSI command 0x%x--" - "guessing data in;\n" KERN_WARNING " " + "guessing data in;\n " "program %s not setting count and/or reply_len properly\n", old_hdr.reply_len - (int)SZ_SG_HEADER, input_size, (unsigned int) cmnd[0], diff --git a/drivers/scsi/sun3_NCR5380.c b/drivers/scsi/sun3_NCR5380.c index bcaba86060ab..75da6e58ce55 100644 --- a/drivers/scsi/sun3_NCR5380.c +++ b/drivers/scsi/sun3_NCR5380.c @@ -2860,8 +2860,7 @@ static int NCR5380_abort(struct scsi_cmnd *cmd) */ local_irq_restore(flags); - printk(KERN_INFO "scsi%d: warning : SCSI command probably completed successfully\n" - KERN_INFO " before abortion\n", HOSTNO); + printk(KERN_INFO "scsi%d: warning : SCSI command probably completed successfully before abortion\n", HOSTNO); return SCSI_ABORT_NOT_RUNNING; } diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c index 6160e03f410c..e7108e75653d 100644 --- a/drivers/serial/8250_pci.c +++ b/drivers/serial/8250_pci.c @@ -60,11 +60,12 @@ struct serial_private { static void moan_device(const char *str, struct pci_dev *dev) { - printk(KERN_WARNING "%s: %s\n" - KERN_WARNING "Please send the output of lspci -vv, this\n" - KERN_WARNING "message (0x%04x,0x%04x,0x%04x,0x%04x), the\n" - KERN_WARNING "manufacturer and name of serial board or\n" - KERN_WARNING "modem board to rmk+serial@arm.linux.org.uk.\n", + printk(KERN_WARNING + "%s: %s\n" + "Please send the output of lspci -vv, this\n" + "message (0x%04x,0x%04x,0x%04x,0x%04x), the\n" + "manufacturer and name of serial board or\n" + "modem board to rmk+serial@arm.linux.org.uk.\n", pci_name(dev), str, dev->vendor, dev->device, dev->subsystem_vendor, dev->subsystem_device); } diff --git a/drivers/ssb/pcmcia.c b/drivers/ssb/pcmcia.c index fbfadbac67e8..131030f693c7 100644 --- a/drivers/ssb/pcmcia.c +++ b/drivers/ssb/pcmcia.c @@ -583,7 +583,7 @@ static int ssb_pcmcia_sprom_write_all(struct ssb_bus *bus, const u16 *sprom) ssb_printk("."); err = ssb_pcmcia_sprom_write(bus, i, sprom[i]); if (err) { - ssb_printk("\n" KERN_NOTICE PFX + ssb_printk(KERN_NOTICE PFX "Failed to write to SPROM.\n"); failed = 1; break; @@ -591,7 +591,7 @@ static int ssb_pcmcia_sprom_write_all(struct ssb_bus *bus, const u16 *sprom) } err = ssb_pcmcia_sprom_command(bus, SSB_PCMCIA_SPROMCTL_WRITEDIS); if (err) { - ssb_printk("\n" KERN_NOTICE PFX + 
ssb_printk(KERN_NOTICE PFX "Could not disable SPROM write access.\n"); failed = 1; } diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index ce3f453f02ef..95ccfa0b9fc5 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -648,7 +648,7 @@ void usb_hcd_poll_rh_status(struct usb_hcd *hcd) struct urb *urb; int length; unsigned long flags; - char buffer[4]; /* Any root hubs with > 31 ports? */ + char buffer[6]; /* Any root hubs with > 31 ports? */ if (unlikely(!hcd->rh_registered)) return; diff --git a/drivers/video/amba-clcd.c b/drivers/video/amba-clcd.c index fb8163d181ab..a21efcd10b78 100644 --- a/drivers/video/amba-clcd.c +++ b/drivers/video/amba-clcd.c @@ -226,9 +226,10 @@ static int clcdfb_set_par(struct fb_info *info) clcdfb_enable(fb, regs.cntl); #ifdef DEBUG - printk(KERN_INFO "CLCD: Registers set to\n" - KERN_INFO " %08x %08x %08x %08x\n" - KERN_INFO " %08x %08x %08x %08x\n", + printk(KERN_INFO + "CLCD: Registers set to\n" + " %08x %08x %08x %08x\n" + " %08x %08x %08x %08x\n", readl(fb->regs + CLCD_TIM0), readl(fb->regs + CLCD_TIM1), readl(fb->regs + CLCD_TIM2), readl(fb->regs + CLCD_TIM3), readl(fb->regs + CLCD_UBAS), readl(fb->regs + CLCD_LBAS), diff --git a/drivers/video/matrox/matroxfb_DAC1064.c b/drivers/video/matrox/matroxfb_DAC1064.c index 0ce3b0a89798..a74e5da17aa0 100644 --- a/drivers/video/matrox/matroxfb_DAC1064.c +++ b/drivers/video/matrox/matroxfb_DAC1064.c @@ -454,9 +454,9 @@ static void DAC1064_restore_2(WPMINFO2) { dprintk(KERN_DEBUG "DAC1064regs "); for (i = 0; i < sizeof(MGA1064_DAC_regs); i++) { dprintk("R%02X=%02X ", MGA1064_DAC_regs[i], ACCESS_FBINFO(hw).DACreg[i]); - if ((i & 0x7) == 0x7) dprintk("\n" KERN_DEBUG "continuing... "); + if ((i & 0x7) == 0x7) dprintk(KERN_DEBUG "continuing... "); } - dprintk("\n" KERN_DEBUG "DAC1064clk "); + dprintk(KERN_DEBUG "DAC1064clk "); for (i = 0; i < 6; i++) dprintk("C%02X=%02X ", i, ACCESS_FBINFO(hw).DACclk[i]); dprintk("\n"); diff --git a/drivers/video/matrox/matroxfb_Ti3026.c b/drivers/video/matrox/matroxfb_Ti3026.c index 13524821e242..4e825112a601 100644 --- a/drivers/video/matrox/matroxfb_Ti3026.c +++ b/drivers/video/matrox/matroxfb_Ti3026.c @@ -651,9 +651,9 @@ static void Ti3026_restore(WPMINFO2) { dprintk(KERN_DEBUG "3026DACregs "); for (i = 0; i < 21; i++) { dprintk("R%02X=%02X ", DACseq[i], hw->DACreg[i]); - if ((i & 0x7) == 0x7) dprintk("\n" KERN_DEBUG "continuing... "); + if ((i & 0x7) == 0x7) dprintk(KERN_DEBUG "continuing... "); } - dprintk("\n" KERN_DEBUG "DACclk "); + dprintk(KERN_DEBUG "DACclk "); for (i = 0; i < 6; i++) dprintk("C%02X=%02X ", i, hw->DACclk[i]); dprintk("\n"); diff --git a/drivers/video/stifb.c b/drivers/video/stifb.c index eec9dcb7f599..6120f0c526fe 100644 --- a/drivers/video/stifb.c +++ b/drivers/video/stifb.c @@ -1115,10 +1115,9 @@ static int __init stifb_init_fb(struct sti_struct *sti, int bpp_pref) if the device name contains the string "DX" and tell the user how to reconfigure the card. 
*/ if (strstr(sti->outptr.dev_name, "DX")) { - printk(KERN_WARNING "WARNING: stifb framebuffer driver does not " - "support '%s' in double-buffer mode.\n" - KERN_WARNING "WARNING: Please disable the double-buffer mode " - "in IPL menu (the PARISC-BIOS).\n", + printk(KERN_WARNING +"WARNING: stifb framebuffer driver does not support '%s' in double-buffer mode.\n" +"WARNING: Please disable the double-buffer mode in IPL menu (the PARISC-BIOS).\n", sti->outptr.dev_name); goto out_err0; } diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index a0244740b75a..b47679be118a 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -270,19 +270,21 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, D2({ int i=0; struct jffs2_raw_node_ref *this; - printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n" KERN_DEBUG); + printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n"); this = ic->nodes; + printk(KERN_DEBUG); while(this) { - printk( "0x%08x(%d)->", ref_offset(this), ref_flags(this)); + printk(KERN_CONT "0x%08x(%d)->", + ref_offset(this), ref_flags(this)); if (++i == 5) { - printk("\n" KERN_DEBUG); + printk(KERN_DEBUG); i=0; } this = this->next_in_ino; } - printk("\n"); + printk(KERN_CONT "\n"); }); switch (ic->class) { diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2b..0a049837008e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2451,9 +2451,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, return ret; } if (ret > 0) { - printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " - "it should follow 0/-E convention\n" - KERN_WARNING "%s: loading module anyway...\n", + printk(KERN_WARNING +"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" +"%s: loading module anyway...\n", __func__, mod->name, ret, __func__); dump_stack(); diff --git a/sound/pci/emu10k1/p16v.c b/sound/pci/emu10k1/p16v.c index e617acaf10e3..61b8ab39800f 100644 --- a/sound/pci/emu10k1/p16v.c +++ b/sound/pci/emu10k1/p16v.c @@ -644,7 +644,7 @@ int __devinit snd_p16v_pcm(struct snd_emu10k1 *emu, int device, struct snd_pcm * int err; int capture=1; - /* snd_printk("KERN_DEBUG snd_p16v_pcm called. device=%d\n", device); */ + /* snd_printk(KERN_DEBUG "snd_p16v_pcm called. device=%d\n", device); */ emu->p16v_device_offset = device; if (rpcm) *rpcm = NULL; diff --git a/sound/usb/usx2y/usbusx2yaudio.c b/sound/usb/usx2y/usbusx2yaudio.c index dd1ab6177840..9efd27f6b52f 100644 --- a/sound/usb/usx2y/usbusx2yaudio.c +++ b/sound/usb/usx2y/usbusx2yaudio.c @@ -296,9 +296,10 @@ static void usX2Y_error_urb_status(struct usX2Ydev *usX2Y, static void usX2Y_error_sequence(struct usX2Ydev *usX2Y, struct snd_usX2Y_substream *subs, struct urb *urb) { - snd_printk(KERN_ERR "Sequence Error!(hcd_frame=%i ep=%i%s;wait=%i,frame=%i).\n" - KERN_ERR "Most propably some urb of usb-frame %i is still missing.\n" - KERN_ERR "Cause could be too long delays in usb-hcd interrupt handling.\n", + snd_printk(KERN_ERR +"Sequence Error!(hcd_frame=%i ep=%i%s;wait=%i,frame=%i).\n" +"Most propably some urb of usb-frame %i is still missing.\n" +"Cause could be too long delays in usb-hcd interrupt handling.\n", usb_get_current_frame_number(usX2Y->chip.dev), subs->endpoint, usb_pipein(urb->pipe) ? 
"in" : "out", usX2Y->wait_iso_frame, urb->start_frame, usX2Y->wait_iso_frame); -- cgit v1.2.3 From 44b572809581d5a10dbe35aa6bf689f32b9c5ad6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 8 Jul 2009 09:50:19 -0700 Subject: x86: don't clear nodes_states[N_NORMAL_MEMORY] when numa is not compiled in Alex found that specjbb2005 still can not run with hugepages on an x86-64 machine. This only happens when numa is not compiled in. The root cause: node_set_state will not set it back for us in that case, so don't clear that when numa is not select in config [ v2: use node_clear_state instead ] Reported-and-Tested-by: Alex Shi Signed-off-by: Yinghai Lu Reviewed-by: Christoph Lameter Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b177652251a4..6176fe8f29e0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -598,8 +598,15 @@ void __init paging_init(void) sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); - /* clear the default setting with node 0 */ - nodes_clear(node_states[N_NORMAL_MEMORY]); + + /* + * clear the default setting with node 0 + * note: don't use nodes_clear here, that is really clearing when + * numa support is not compiled in, and later node_set_state + * will not set it back. + */ + node_clear_state(0, N_NORMAL_MEMORY); + free_area_init_nodes(max_zone_pfns); } -- cgit v1.2.3 From ad46276952f1af34cd91d46d49ba13d347d56367 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 8 Jul 2009 12:10:31 +0000 Subject: memory barrier: adding smp_mb__after_lock Adding smp_mb__after_lock define to be used as a smp_mb call after a lock. Making it nop for x86, since {read|write|spin}_lock() on x86 are full memory barriers. Signed-off-by: Jiri Olsa Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/x86/include/asm/spinlock.h | 4 ++++ include/linux/spinlock.h | 5 +++++ include/net/sock.h | 5 ++++- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index b7e5db876399..4e77853321db 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -302,4 +302,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() +/* The {read|write|spin}_lock() on x86 are full memory barriers. */ +static inline void smp_mb__after_lock(void) { } +#define ARCH_HAS_SMP_MB_AFTER_LOCK + #endif /* _ASM_X86_SPINLOCK_H */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 252b245cfcf4..4be57ab03478 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -132,6 +132,11 @@ do { \ #endif /*__raw_spin_is_contended*/ #endif +/* The lock does not imply full memory barrier. */ +#ifndef ARCH_HAS_SMP_MB_AFTER_LOCK +static inline void smp_mb__after_lock(void) { smp_mb(); } +#endif + /** * spin_unlock_wait - wait until the spinlock gets unlocked * @lock: the spinlock in question. diff --git a/include/net/sock.h b/include/net/sock.h index 4eb8409249f6..2c0da9239b95 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1271,6 +1271,9 @@ static inline int sk_has_allocations(const struct sock *sk) * in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 * could then endup calling schedule and sleep forever if there are no more * data on the socket. 
+ * + * The sk_has_sleeper is always called right after a call to read_lock, so we + * can use smp_mb__after_lock barrier. */ static inline int sk_has_sleeper(struct sock *sk) { @@ -1280,7 +1283,7 @@ static inline int sk_has_sleeper(struct sock *sk) * * This memory barrier is paired in the sock_poll_wait. */ - smp_mb(); + smp_mb__after_lock(); return sk->sk_sleep && waitqueue_active(sk->sk_sleep); } -- cgit v1.2.3 From 8d7ff4f2a0b22b7d6d7bc3982257d1dadea22824 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 23 Jun 2009 11:48:14 +0200 Subject: x86/oprofile: rename kernel parameter for architectural perfmon to arch_perfmon The short name of the achitecture is 'arch_perfmon'. This patch changes the kernel parameter to use this name. Cc: Andi Kleen Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 4 ++-- arch/x86/oprofile/nmi_int.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 92e1ab8178a8..c59e965a748d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1728,8 +1728,8 @@ and is between 256 and 4096 characters. It is defined in the file oprofile.cpu_type= Force an oprofile cpu type This might be useful if you have an older oprofile userland or if you want common events. - Format: { archperfmon } - archperfmon: [X86] Force use of architectural + Format: { arch_perfmon } + arch_perfmon: [X86] Force use of architectural perfmon on Intel CPUs instead of the CPU specific event set. diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b07dd8d0b321..89b9a5cd63da 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -390,7 +390,7 @@ static int __init p4_init(char **cpu_type) static int force_arch_perfmon; static int force_cpu_type(const char *str, struct kernel_param *kp) { - if (!strcmp(str, "archperfmon")) { + if (!strcmp(str, "arch_perfmon")) { force_arch_perfmon = 1; printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); } -- cgit v1.2.3 From 11d1578f9454159c43499d1d8fe8a7d728c176a3 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 8 Jul 2009 17:46:14 -0400 Subject: perf_counter: Add P6 PMU support Add basic P6 PMU support. The P6 uses the EVNTSEL0 EN bit to enable/disable both its counters. We use this for the global enable/disable, and clear all config bits (except EN) to disable individual counters. Actual ia32 hardware doesn't support lfence, so use a locked op without side-effect to implement a full barrier. perf stat and perf record seem to function correctly. 
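A minimal sketch of the barrier idiom that last point refers to (it mirrors the __i386__ hunk added to tools/perf/perf.h further down; pre-SSE2 ia32 has no lfence, so a locked, otherwise side-effect-free add to the top of the stack acts as the full barrier):

	#define rmb()	asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
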
[a.p.zijlstra@chello.nl: cleanups and complete the enable/disable code] Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 227 ++++++++++++++++++++++++++++++++++--- tools/perf/perf.h | 8 +- 2 files changed, 220 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 36c3dc7b8991..1910f39ff19a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -65,6 +65,44 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0000, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0000, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int event) +{ + return p6_perfmon_event_map[event]; +} + +static u64 p6_pmu_raw_event(u64 event) +{ +#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL +#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL +#define P6_EVNTSEL_INV_MASK 0x00800000ULL +#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL + +#define P6_EVNTSEL_MASK \ + (P6_EVNTSEL_EVENT_MASK | \ + P6_EVNTSEL_UNIT_MASK | \ + P6_EVNTSEL_EDGE_MASK | \ + P6_EVNTSEL_INV_MASK | \ + P6_EVNTSEL_COUNTER_MASK) + + return event & P6_EVNTSEL_MASK; +} + + /* * Intel PerfMon v3. Used on Core2 and later. */ @@ -726,6 +764,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } +static void p6_pmu_disable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long val; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + static void intel_pmu_disable_all(void) { wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); @@ -767,6 +822,23 @@ void hw_perf_disable(void) return x86_pmu.disable_all(); } +static void p6_pmu_enable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long val; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + static void intel_pmu_enable_all(void) { wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); @@ -819,16 +891,13 @@ static inline void intel_pmu_ack_status(u64 ack) static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - int err; - err = checking_wrmsrl(hwc->config_base + idx, + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - int err; - err = checking_wrmsrl(hwc->config_base + idx, - hwc->config); + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); } static inline void @@ -836,13 +905,24 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; - int err; mask = 0xfULL << (idx * 4); rdmsrl(hwc->config_base, ctrl_val); ctrl_val &= ~mask; - err = checking_wrmsrl(hwc->config_base, 
ctrl_val); + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void +p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long val = ARCH_PERFMON_EVENTSEL0_ENABLE; + + if (!cpuc->enabled) + val = 0; + + (void)checking_wrmsrl(hwc->config_base + idx, val); } static inline void @@ -943,6 +1023,17 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) err = checking_wrmsrl(hwc->config_base, ctrl_val); } +static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + x86_pmu_enable_counter(hwc, idx); + else + x86_pmu_disable_counter(hwc, idx); +} + + static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { @@ -1176,6 +1267,49 @@ static void intel_pmu_reset(void) local_irq_restore(flags); } +static int p6_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_counters *cpuc; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + int idx, handled = 0; + u64 val; + + data.regs = regs; + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_counters); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + counter = cpuc->counters[idx]; + hwc = &counter->hw; + + val = x86_perf_counter_update(counter, hwc, idx); + if (val & (1ULL << (x86_pmu.counter_bits - 1))) + continue; + + /* + * counter overflow + */ + handled = 1; + data.period = counter->hw.last_period; + + if (!x86_perf_counter_set_period(counter, hwc, idx)) + continue; + + if (perf_counter_overflow(counter, 1, &data)) + p6_pmu_disable_counter(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} /* * This handler is triggered by the local APIC, so the APIC IRQ handling @@ -1185,14 +1319,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_counters *cpuc; - int bit, cpu, loops; + int bit, loops; u64 ack, status; data.regs = regs; data.addr = 0; - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + cpuc = &__get_cpu_var(cpu_hw_counters); perf_disable(); status = intel_pmu_get_status(); @@ -1249,14 +1382,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; - int cpu, idx, handled = 0; + int idx, handled = 0; u64 val; data.regs = regs; data.addr = 0; - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + cpuc = &__get_cpu_var(cpu_hw_counters); for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) @@ -1353,6 +1485,32 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; +static struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = p6_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_counter, + .disable = p6_pmu_disable_counter, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .raw_event = p6_pmu_raw_event, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_counters = 2, + /* + * Counters have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. 
As such the + * effective width of a counter for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .counter_bits = 32, + .counter_mask = (1ULL << 32) - 1, +}; + static struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -1392,6 +1550,41 @@ static struct x86_pmu amd_pmu = { .max_period = (1ULL << 47) - 1, }; +static int p6_pmu_init(void) +{ + int high, low; + + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + break; + case 9: + case 13: + /* for Pentium M, we need to check if PMU exist */ + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (low & MSR_IA32_MISC_ENABLE_EMON) + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + if (!cpu_has_apic) { + pr_info("no Local APIC, try rebooting with lapic"); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + return 0; +} + static int intel_pmu_init(void) { union cpuid10_edx edx; @@ -1400,8 +1593,14 @@ static int intel_pmu_init(void) unsigned int ebx; int version; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { return -ENODEV; + } + } /* * Check whether the Architectural PerfMon supports diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 27887c916439..53bb9550def9 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -1,7 +1,13 @@ #ifndef _PERF_PERF_H #define _PERF_PERF_H -#if defined(__x86_64__) || defined(__i386__) +#if defined(__i386__) +#include "../../arch/x86/include/asm/unistd.h" +#define rmb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); +#endif + +#if defined(__x86_64__) #include "../../arch/x86/include/asm/unistd.h" #define rmb() asm volatile("lfence" ::: "memory") #define cpu_relax() asm volatile("rep; nop" ::: "memory"); -- cgit v1.2.3 From 9c74fb50867e8fb5f3be3be06716492c0f79309e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Jul 2009 10:21:41 +0200 Subject: perf_counter: Fix up P6 PMU details The P6 doesn't seem to support cache ref/hit/miss counts, so we extend the generic hardware event codes to have 0 and -1 mean the same thing as for the generic cache events. Furthermore, it turns out the 0 event does not count (that is, its reported that on PPro it actually does count something), therefore use a event configuration that's specified not to count to disable the counters. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1910f39ff19a..c7cc6eac14ec 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -84,6 +84,14 @@ static u64 p6_pmu_event_map(int event) return p6_perfmon_event_map[event]; } +/* + * Counter setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. 
+ */ +#define P6_NOP_COUNTER 0x0000002EULL + static u64 p6_pmu_raw_event(u64 event) { #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL @@ -704,6 +712,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_attr *attr = &counter->attr; struct hw_perf_counter *hwc = &counter->hw; + u64 config; int err; if (!x86_pmu_initialized()) @@ -756,10 +765,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (attr->config >= x86_pmu.max_events) return -EINVAL; + /* * The generic map: */ - hwc->config |= x86_pmu.event_map(attr->config); + config = x86_pmu.event_map(attr->config); + + if (config == 0) + return -ENOENT; + + if (config == -1LL) + return -EINVAL; + + hwc->config |= config; return 0; } @@ -767,7 +785,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) static void p6_pmu_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - unsigned long val; + u64 val; if (!cpuc->enabled) return; @@ -917,10 +935,10 @@ static inline void p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - unsigned long val = ARCH_PERFMON_EVENTSEL0_ENABLE; + u64 val = P6_NOP_COUNTER; - if (!cpuc->enabled) - val = 0; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; (void)checking_wrmsrl(hwc->config_base + idx, val); } -- cgit v1.2.3 From 984b838ce69c063a91b87550598ab7f3439dd94a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 09:59:56 +0200 Subject: perf_counter: Clean up global vs counter enable Ingo noticed that both AMD and P6 call x86_pmu_disable_counter() on *_pmu_enable_counter(). This is because we rely on the side effect of that call to program the event config but not touch the EN bit. We change that for AMD by having enable_all() simply write the full config in, and for P6 by explicitly coding it. 
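Expressed as a sketch (a fragment assuming the helpers and field names used elsewhere in perf_counter.c, see the hunks below — not the applied patch itself), the enable paths now just write the saved per-counter config with the EN bit set:

	/* sketch: program config and EN in one write, no disable-call side effect */
	val = counter->hw.config;
	val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
	wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
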
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c7cc6eac14ec..bed1c4c2f251 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -874,13 +874,13 @@ static void amd_pmu_enable_all(void) barrier(); for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_counter *counter = cpuc->counters[idx]; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) - continue; + + val = counter->hw.config; val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } @@ -1044,11 +1044,13 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + u64 val; + val = hwc->config; if (cpuc->enabled) - x86_pmu_enable_counter(hwc, idx); - else - x86_pmu_disable_counter(hwc, idx); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); } @@ -1068,8 +1070,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) if (cpuc->enabled) x86_pmu_enable_counter(hwc, idx); - else - x86_pmu_disable_counter(hwc, idx); } static int -- cgit v1.2.3 From 8aa7e847d834ed937a9ad37a0f2ad5b8584c1ab0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Jul 2009 14:52:32 +0200 Subject: Fix congestion_wait() sync/async vs read/write confusion Commit 1faa16d22877f4839bd433547d770c676d1d964c accidentally broke the bdi congestion wait queue logic, causing us to wait on congestion for WRITE (== 1) when we really wanted BLK_RW_ASYNC (== 0) instead. 
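A small userspace sketch of the mix-up (the enum values match the ones quoted above; everything else is a stand-in for the kernel's congestion_wqh[] wait queues):

#include <stdio.h>

enum { READ = 0, WRITE = 1 };			/* old read/write convention */
enum { BLK_RW_ASYNC = 0, BLK_RW_SYNC = 1 };	/* new sync/async convention */

/* index 0 holds the async waiters, index 1 the sync waiters */
static const char *congestion_wqh[2] = { "async waitqueue", "sync waitqueue" };

static void congestion_wait(int sync)
{
	/* the argument is used directly as an array index */
	printf("sleeping on the %s\n", congestion_wqh[sync]);
}

int main(void)
{
	congestion_wait(WRITE);		/* old callers ended up on the sync queue */
	congestion_wait(BLK_RW_ASYNC);	/* what the writeback paths actually meant */
	return 0;
}

Both constants are plain 0/1 integers, so the compiler had no way to flag the old calls; only the meaning of the index changed.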
Signed-off-by: Jens Axboe --- arch/x86/lib/usercopy_32.c | 2 +- drivers/block/pktcdvd.c | 10 ++++++---- drivers/md/dm-crypt.c | 2 +- fs/fat/file.c | 2 +- fs/fuse/dev.c | 8 ++++---- fs/nfs/write.c | 8 +++++--- fs/reiserfs/journal.c | 2 +- fs/xfs/linux-2.6/kmem.c | 4 ++-- fs/xfs/linux-2.6/xfs_buf.c | 2 +- include/linux/backing-dev.h | 6 +++--- include/linux/blkdev.h | 8 ++++---- mm/backing-dev.c | 7 +++---- mm/memcontrol.c | 2 +- mm/page-writeback.c | 8 ++++---- mm/page_alloc.c | 4 ++-- mm/vmscan.c | 8 ++++---- 16 files changed, 43 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 7c8ca91bb9ec..1f118d462acc 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -751,7 +751,7 @@ survive: if (retval == -ENOMEM && is_global_init(current)) { up_read(¤t->mm->mmap_sem); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto survive; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 83650e00632d..99a506f619b7 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1372,8 +1372,10 @@ try_next_bio: wakeup = (pd->write_congestion_on > 0 && pd->bio_queue_size <= pd->write_congestion_off); spin_unlock(&pd->lock); - if (wakeup) - clear_bdi_congested(&pd->disk->queue->backing_dev_info, WRITE); + if (wakeup) { + clear_bdi_congested(&pd->disk->queue->backing_dev_info, + BLK_RW_ASYNC); + } pkt->sleep_time = max(PACKET_WAIT_TIME, 1); pkt_set_state(pkt, PACKET_WAITING_STATE); @@ -2592,10 +2594,10 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio) spin_lock(&pd->lock); if (pd->write_congestion_on > 0 && pd->bio_queue_size >= pd->write_congestion_on) { - set_bdi_congested(&q->backing_dev_info, WRITE); + set_bdi_congested(&q->backing_dev_info, BLK_RW_ASYNC); do { spin_unlock(&pd->lock); - congestion_wait(WRITE, HZ); + congestion_wait(BLK_RW_ASYNC, HZ); spin_lock(&pd->lock); } while(pd->bio_queue_size > pd->write_congestion_off); } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9933eb861c71..529e2ba505c3 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -776,7 +776,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) * But don't wait if split was due to the io size restriction */ if (unlikely(out_of_pages)) - congestion_wait(WRITE, HZ/100); + congestion_wait(BLK_RW_ASYNC, HZ/100); /* * With async crypto it is unsafe to share the crypto context diff --git a/fs/fat/file.c b/fs/fat/file.c index b28ea646ff60..f042b965c95c 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -134,7 +134,7 @@ static int fat_file_release(struct inode *inode, struct file *filp) if ((filp->f_mode & FMODE_WRITE) && MSDOS_SB(inode->i_sb)->options.flush) { fat_flush_inodes(inode->i_sb, inode, NULL); - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } return 0; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f58ecbc416c8..6484eb75acd6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -286,8 +286,8 @@ __releases(&fc->lock) } if (fc->num_background == FUSE_CONGESTION_THRESHOLD && fc->connected && fc->bdi_initialized) { - clear_bdi_congested(&fc->bdi, READ); - clear_bdi_congested(&fc->bdi, WRITE); + clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); + clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); } fc->num_background--; fc->active_background--; @@ -414,8 +414,8 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc, fc->blocked = 1; if (fc->num_background == FUSE_CONGESTION_THRESHOLD && 
fc->bdi_initialized) { - set_bdi_congested(&fc->bdi, READ); - set_bdi_congested(&fc->bdi, WRITE); + set_bdi_congested(&fc->bdi, BLK_RW_SYNC); + set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ce728829f79a..0a0a2ff767c3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -202,8 +202,10 @@ static int nfs_set_page_writeback(struct page *page) struct nfs_server *nfss = NFS_SERVER(inode); if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) - set_bdi_congested(&nfss->backing_dev_info, WRITE); + NFS_CONGESTION_ON_THRESH) { + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); + } } return ret; } @@ -215,7 +217,7 @@ static void nfs_end_page_writeback(struct page *page) end_page_writeback(page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, WRITE); + clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } /* diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 77f5bb746bf0..90622200b39c 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -997,7 +997,7 @@ static int reiserfs_async_progress_wait(struct super_block *s) DEFINE_WAIT(wait); struct reiserfs_journal *j = SB_JOURNAL(s); if (atomic_read(&j->j_async_throttle)) - congestion_wait(WRITE, HZ / 10); + congestion_wait(BLK_RW_ASYNC, HZ / 10); return 0; } diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index 1cd3b55ee3d2..2d3f90afe5f1 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -53,7 +53,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags) printk(KERN_ERR "XFS: possible memory allocation " "deadlock in %s (mode:0x%x)\n", __func__, lflags); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } @@ -130,7 +130,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) printk(KERN_ERR "XFS: possible memory allocation " "deadlock in %s (mode:0x%x)\n", __func__, lflags); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 1418b916fc27..0c93c7ef3d18 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -412,7 +412,7 @@ _xfs_buf_lookup_pages( XFS_STATS_INC(xb_page_retries); xfsbufd_wakeup(0, gfp_mask); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0ec2c594868e..3a52a63c1351 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -229,9 +229,9 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi) (1 << BDI_async_congested)); } -void clear_bdi_congested(struct backing_dev_info *bdi, int rw); -void set_bdi_congested(struct backing_dev_info *bdi, int rw); -long congestion_wait(int rw, long timeout); +void clear_bdi_congested(struct backing_dev_info *bdi, int sync); +void set_bdi_congested(struct backing_dev_info *bdi, int sync); +long congestion_wait(int sync, long timeout); static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 49ae07951d55..bb3d39978701 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -779,18 +779,18 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, * congested queues, and wake up anyone who was 
waiting for requests to be * put back. */ -static inline void blk_clear_queue_congested(struct request_queue *q, int rw) +static inline void blk_clear_queue_congested(struct request_queue *q, int sync) { - clear_bdi_congested(&q->backing_dev_info, rw); + clear_bdi_congested(&q->backing_dev_info, sync); } /* * A queue has just entered congestion. Flag that in the queue's VM-visible * state flags and increment the global gounter of congested queues. */ -static inline void blk_set_queue_congested(struct request_queue *q, int rw) +static inline void blk_set_queue_congested(struct request_queue *q, int sync) { - set_bdi_congested(&q->backing_dev_info, rw); + set_bdi_congested(&q->backing_dev_info, sync); } extern void blk_start_queue(struct request_queue *q); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 493b468a5035..c86edd244294 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -283,7 +283,6 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; - void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { enum bdi_state bit; @@ -308,18 +307,18 @@ EXPORT_SYMBOL(set_bdi_congested); /** * congestion_wait - wait for a backing_dev to become uncongested - * @rw: READ or WRITE + * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit * write congestion. If no backing_devs are congested then just wait for the * next write to be completed. */ -long congestion_wait(int rw, long timeout) +long congestion_wait(int sync, long timeout) { long ret; DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[rw]; + wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e2fa20dadf40..e717964cb5a0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1973,7 +1973,7 @@ try_to_free: if (!progress) { nr_retries--; /* maybe some writeback is necessary */ - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7687879253b9..81627ebcd313 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -575,7 +575,7 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && @@ -669,7 +669,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) if (global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK) <= dirty_thresh) break; - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The caller might hold locks which can prevent IO completion @@ -715,7 +715,7 @@ static void background_writeout(unsigned long _min_pages) if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { /* Wrote less than expected */ if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); else break; } @@ -787,7 +787,7 @@ static void wb_kupdate(unsigned long arg) writeback_inodes(&wbc); if (wbc.nr_to_write > 0) { if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); else break; /* All the old data is written */ } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ad7cd1c56b07..a35eeab2724c 100644 --- a/mm/page_alloc.c +++ 
b/mm/page_alloc.c @@ -1666,7 +1666,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -1831,7 +1831,7 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto rebalance; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 54155268dfca..dea7abd31098 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1104,7 +1104,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, */ if (nr_freed < nr_taken && !current_is_kswapd() && lumpy_reclaim) { - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The attempt at page out may have made some @@ -1721,7 +1721,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, /* Take a nap, wait for some writeback to complete */ if (sc->nr_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? don't OOM, then */ if (!sc->all_unreclaimable && scanning_global_lru(sc)) @@ -1960,7 +1960,7 @@ loop_again: * another pass across the zones. */ if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * We do this so kswapd doesn't build up large priorities for @@ -2233,7 +2233,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) goto out; if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ / 10); + congestion_wait(BLK_RW_ASYNC, HZ / 10); } } -- cgit v1.2.3 From 857fdc53a0a90c3ba7fcf5b1fb4c7a62ae03cf82 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 10 Jul 2009 09:36:20 -0700 Subject: x86/pci: insert ioapic resource before assigning unassigned resources Stephen reported that his DL585 G2 needed noapic after 2.6.22 (?) Dann bisected it down to: commit 30a18d6c3f1e774de656ebd8ff219d53e2ba4029 Date: Tue Feb 19 03:21:20 2008 -0800 x86: multi pci root bus with different io resource range, on 64-bit It turns out that: 1. that AMD-based systems have two HT chains. 2. BIOS doesn't allocate resources for BAR 6 of devices under 8132 etc 3. that multi-peer-root patch will try to split root resources to peer root resources according to PCI conf of NB 4. PCI core assigns unassigned resources, but they overlap with BARs that are used by ioapic addr of io4 and 8132. The reason: at that point ioapic address are not inserted yet. Solution is to insert ioapic resources into the tree a bit earlier. 
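A toy model of why the insertion order matters (plain userspace C, not the kernel resource API; 0xfec00000 is just the typical IO-APIC base used as an example):

#include <stdio.h>

struct range { unsigned long start, end; };

/* regions already claimed in the "resource tree" */
static struct range claimed[8];
static int nclaimed;

static int overlaps(unsigned long s, unsigned long e, const struct range *r)
{
	return s <= r->end && e >= r->start;
}

static void claim(unsigned long start, unsigned long end)
{
	claimed[nclaimed].start = start;
	claimed[nclaimed].end = end;
	nclaimed++;
}

/* naive "assign this unassigned BAR": first non-conflicting range from base */
static unsigned long assign_bar(unsigned long base, unsigned long size)
{
	unsigned long start = base;

	for (int i = 0; i < nclaimed; i++) {
		if (overlaps(start, start + size - 1, &claimed[i])) {
			start = claimed[i].end + 1;
			i = -1;			/* conflict: restart the scan */
		}
	}
	claim(start, start + size - 1);
	return start;
}

int main(void)
{
	/* fixed ordering: reserve the IO-APIC window before assigning BARs */
	claim(0xfec00000, 0xfec00fff);
	printf("BAR assigned at %#lx\n", assign_bar(0xfec00000, 0x1000));
	/* without the claim() above, the BAR would land on 0xfec00000 itself */
	return 0;
}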
Reported-by: Stephen Frost Reported-and-Tested-by: dann frazier Signed-off-by: Yinghai Lu Cc: stable@kernel.org Signed-off-by: Jesse Barnes --- arch/x86/include/asm/io_apic.h | 2 ++ arch/x86/kernel/apic/io_apic.c | 14 +++----------- arch/x86/pci/i386.c | 7 +++++++ 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index daf866ed0612..330ee807f89e 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -161,6 +161,7 @@ extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); +extern void ioapic_insert_resources(void); extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); @@ -180,6 +181,7 @@ extern void ioapic_write_entry(int apic, int pin, #define io_apic_assign_pci_irqs 0 static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } +static inline void ioapic_insert_resources(void) { } static inline void probe_nr_irqs_gsi(void) { } #endif diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 90b5e6efa938..2284a4812b68 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4181,28 +4181,20 @@ fake_ioapic_page: } } -static int __init ioapic_insert_resources(void) +void __init ioapic_insert_resources(void) { int i; struct resource *r = ioapic_resources; if (!r) { - if (nr_ioapics > 0) { + if (nr_ioapics > 0) printk(KERN_ERR "IO APIC resources couldn't be allocated.\n"); - return -1; - } - return 0; + return; } for (i = 0; i < nr_ioapics; i++) { insert_resource(&iomem_resource, r); r++; } - - return 0; } - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 0fb56db16d18..52e62e57fedd 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -35,6 +35,7 @@ #include #include #include +#include static int @@ -227,6 +228,12 @@ void __init pcibios_resource_survey(void) pcibios_allocate_resources(1); e820_reserve_resources_late(); + /* + * Insert the IO APIC resources after PCI initialization has + * occured to handle IO APICS that are mapped in on a BAR in + * PCI space, but before trying to assign unassigned pci res. + */ + ioapic_insert_resources(); } /** -- cgit v1.2.3 From c99e6efe1ba04561e7d93a81f0be07e37427e835 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 14:57:56 +0200 Subject: sched: INIT_PREEMPT_COUNT Pull the initial preempt_count value into a single definition site. Maintainers for: alpha, ia64 and m68k, please have a look, your arch code is funny. The header magic is a bit odd, but similar to the KERNEL_DS one, CPP waits with expanding these macros until the INIT_THREAD_INFO macro itself is expanded, which is in arch/*/kernel/init_task.c where we've already included sched.h so we're good. 
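A standalone illustration of that header trick (file names in the comments are hypothetical, not the real kernel headers): a macro body may reference INIT_PREEMPT_COUNT before it is defined, because the preprocessor only expands the body at the point of use.

#include <stdio.h>

/* "thread_info.h": references INIT_PREEMPT_COUNT, which is not defined yet */
#define INIT_THREAD_INFO(tsk) { .preempt_count = INIT_PREEMPT_COUNT }

/* "sched.h": the single definition site */
#define INIT_PREEMPT_COUNT (1)

struct thread_info { int preempt_count; };

/* "init_task.c": both definitions are visible here, so expansion succeeds */
static struct thread_info init_thread = INIT_THREAD_INFO(init_task);

int main(void)
{
	printf("initial preempt_count = %d\n", init_thread.preempt_count);
	return 0;
}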
Cc: tony.luck@intel.com Cc: rth@twiddle.net Cc: geert@linux-m68k.org Signed-off-by: Peter Zijlstra Acked-by: Matt Mackall Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/thread_info.h | 1 + arch/arm/include/asm/thread_info.h | 2 +- arch/avr32/include/asm/thread_info.h | 2 +- arch/blackfin/include/asm/thread_info.h | 2 +- arch/cris/include/asm/thread_info.h | 4 +--- arch/frv/include/asm/thread_info.h | 4 +--- arch/h8300/include/asm/thread_info.h | 2 +- arch/ia64/include/asm/thread_info.h | 2 +- arch/m32r/include/asm/thread_info.h | 4 +--- arch/m68k/include/asm/thread_info_mm.h | 1 + arch/m68k/include/asm/thread_info_no.h | 1 + arch/microblaze/include/asm/thread_info.h | 4 +--- arch/mips/include/asm/thread_info.h | 4 +--- arch/mn10300/include/asm/thread_info.h | 4 +--- arch/parisc/include/asm/thread_info.h | 2 +- arch/powerpc/include/asm/thread_info.h | 4 +--- arch/s390/include/asm/thread_info.h | 2 +- arch/sh/include/asm/thread_info.h | 2 +- arch/sparc/include/asm/thread_info_32.h | 4 +--- arch/sparc/include/asm/thread_info_64.h | 4 +--- arch/um/include/asm/thread_info.h | 2 +- arch/x86/include/asm/thread_info.h | 2 +- arch/xtensa/include/asm/thread_info.h | 4 +--- include/linux/sched.h | 6 ++++++ 24 files changed, 29 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index d069526bd767..60c83abfde70 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -37,6 +37,7 @@ struct thread_info { .task = &tsk, \ .exec_domain = &default_exec_domain, \ .addr_limit = KERNEL_DS, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 4f8848260ee2..73394e50cbca 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -73,7 +73,7 @@ struct thread_info { .task = &tsk, \ .exec_domain = &default_exec_domain, \ .flags = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .cpu_domain = domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \ domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \ diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h index 4442f8d2d423..fc42de5ca209 100644 --- a/arch/avr32/include/asm/thread_info.h +++ b/arch/avr32/include/asm/thread_info.h @@ -40,7 +40,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall \ } \ diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h index 2920087516f2..2bbfdd950afc 100644 --- a/arch/blackfin/include/asm/thread_info.h +++ b/arch/blackfin/include/asm/thread_info.h @@ -77,7 +77,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h index bc5b2935ca53..c3aade36c330 100644 --- a/arch/cris/include/asm/thread_info.h +++ b/arch/cris/include/asm/thread_info.h @@ -50,8 +50,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. 
*/ #ifndef __ASSEMBLY__ #define INIT_THREAD_INFO(tsk) \ @@ -60,7 +58,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h index e8a5ed7be021..e608e056bb53 100644 --- a/arch/frv/include/asm/thread_info.h +++ b/arch/frv/include/asm/thread_info.h @@ -56,8 +56,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #ifndef __ASSEMBLY__ @@ -67,7 +65,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h index 700014d2155f..8bbc8b0ee45d 100644 --- a/arch/h8300/include/asm/thread_info.h +++ b/arch/h8300/include/asm/thread_info.h @@ -36,7 +36,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index ae6922626bf4..8ce2e388e37c 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -48,7 +48,7 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .addr_limit = KERNEL_DS, \ - .preempt_count = 0, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h index 8589d462df27..07bb5bd00e2a 100644 --- a/arch/m32r/include/asm/thread_info.h +++ b/arch/m32r/include/asm/thread_info.h @@ -57,8 +57,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. 
*/ #ifndef __ASSEMBLY__ @@ -68,7 +66,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/m68k/include/asm/thread_info_mm.h b/arch/m68k/include/asm/thread_info_mm.h index af0fda46e94b..6ea5c33b3c56 100644 --- a/arch/m68k/include/asm/thread_info_mm.h +++ b/arch/m68k/include/asm/thread_info_mm.h @@ -19,6 +19,7 @@ struct thread_info { { \ .task = &tsk, \ .exec_domain = &default_exec_domain, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/m68k/include/asm/thread_info_no.h b/arch/m68k/include/asm/thread_info_no.h index 82529f424ea3..c2bde5e24b0b 100644 --- a/arch/m68k/include/asm/thread_info_no.h +++ b/arch/m68k/include/asm/thread_info_no.h @@ -49,6 +49,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index 7fac44498445..6e92885d381a 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -75,8 +75,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #define INIT_THREAD_INFO(tsk) \ { \ @@ -84,7 +82,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index 143a48136a4b..f9df720d2e40 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -39,8 +39,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #define INIT_THREAD_INFO(tsk) \ { \ @@ -48,7 +46,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = _TIF_FIXADE, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index 78a3881f3c12..58d64f8b2cc3 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -65,8 +65,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. 
*/ #ifndef __ASSEMBLY__ @@ -76,7 +74,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index 0407959da489..4ce0edfbe969 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -23,7 +23,7 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .addr_limit = KERNEL_DS, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall \ } \ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 9aba5a38a7c4..c8b329255678 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -46,15 +46,13 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ .exec_domain = &default_exec_domain, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 925bcc649035..ba1cab9fc1f9 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -61,7 +61,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index f09ac4806294..d570ac2e5cb9 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -51,7 +51,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 0f7b0e5fb1c7..844d73a0340c 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -54,8 +54,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #define INIT_THREAD_INFO(tsk) \ { \ @@ -64,7 +62,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 65865726b283..1b45a7bbe407 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -125,8 +125,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. 
*/ #ifndef __ASSEMBLY__ @@ -135,7 +133,7 @@ struct thread_info { .task = &tsk, \ .flags = ((unsigned long)ASI_P) << TI_FLAG_CURRENT_DS_SHIFT, \ .exec_domain = &default_exec_domain, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 62274ab9471f..fd911f855367 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -32,7 +32,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index b0783520988b..fad7d40b75f8 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -49,7 +49,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index 0f4fe1faf9ba..13165641cc51 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -80,8 +80,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #ifndef __ASSEMBLY__ @@ -92,7 +90,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 0085d758d645..2a99f1c15cf8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -498,6 +498,12 @@ struct task_cputime { .sum_exec_runtime = 0, \ } +/* + * Disable preemption until the scheduler is running. + * Reset by start_kernel()->sched_init()->init_idle(). + */ +#define INIT_PREEMPT_COUNT (1) + /** * struct thread_group_cputimer - thread group interval timer counts * @cputime: thread group interval timers. -- cgit v1.2.3 From a1a08d1cb0ab148fd74216e4c0b4d4db18fe62c6 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 11 Jul 2009 00:10:04 -0700 Subject: x86: Remove spurious printk level from segfault message Since commit 5fd29d6c ("printk: clean up handling of log-levels and newlines"), the kernel logs segfaults like: <6>gnome-power-man[24509]: segfault at 20 ip 00007f9d4950465a sp 00007fffbb50fc70 error 4 in libgobject-2.0.so.0.2103.0[7f9d494f7000+45000] with the extra "<6>" being KERN_INFO. This happens because the printk in show_signal_msg() started with KERN_CONT and then used "%s" to pass in the real level; and KERN_CONT is no longer an empty string, and printk only pays attention to the level at the very beginning of the format string. Therefore, remove the KERN_CONT from this printk, since it is now actively causing problems (and never really made any sense). 
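A userspace sketch of the behaviour being relied on (the parser below is a simplification, not the kernel's printk, and the "<c>" value for KERN_CONT is illustrative): a level marker is honoured only when it is the first thing in the formatted string, so a leading KERN_CONT pushes the "%s"-supplied level into the middle of the text, where it prints literally.

#include <stdio.h>

#define KERN_INFO "<6>"
#define KERN_CONT "<c>"		/* no longer an empty string */

static void emit(const char *msg)
{
	int level = 4;		/* default loglevel */

	/* only parse a level at the very start of the message */
	if (msg[0] == '<' && msg[1] >= '0' && msg[1] <= '7' && msg[2] == '>') {
		level = msg[1] - '0';
		msg += 3;
	}
	printf("level %d: %s\n", level, msg);
}

int main(void)
{
	char buf[128];

	/* old code: KERN_CONT first, so the "<6>" shows up in the output */
	snprintf(buf, sizeof(buf), KERN_CONT "%sfoo[42]: segfault at 20", KERN_INFO);
	emit(buf);

	/* fixed code: the %s-supplied level now leads the string and is parsed */
	snprintf(buf, sizeof(buf), "%sfoo[42]: segfault at 20", KERN_INFO);
	emit(buf);
	return 0;
}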
Signed-off-by: Roland Dreier Cc: Linus Torvalds LKML-Reference: <874otjitkj.fsf@shaolin.home.digitalvampire.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 85307cc6e45f..bfae139182ff 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -697,7 +697,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, if (!printk_ratelimit()) return; - printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); -- cgit v1.2.3 From f1c6a58121f9846ac665b0fbd3cbab90ce8bcbac Mon Sep 17 00:00:00 2001 From: Daniel Qarras Date: Sun, 12 Jul 2009 04:32:40 -0700 Subject: perf_counter, x86: Extend perf_counter Pentium M support I've attached a patch to remove the Pentium M special casing of EMON and as noticed at least with my Pentium M the hardware PMU now works: Performance counter stats for '/bin/ls /var/tmp': 1.809988 task-clock-msecs # 0.125 CPUs 1 context-switches # 0.001 M/sec 0 CPU-migrations # 0.000 M/sec 224 page-faults # 0.124 M/sec 1425648 cycles # 787.656 M/sec 912755 instructions # 0.640 IPC Vince suggested that this code was trying to address erratum Y17 in Pentium-M's: http://download.intel.com/support/processors/mobile/pm/sb/25266532.pdf But that erratum (related to IA32_MISC_ENABLES.7) does not affect perfcounters as we dont use this toggle to disable RDPMC and WRMSR/RDMSR access to performance counters. We keep cr4's bit 8 (X86_CR4_PCE) clear so unprivileged RDPMC access is not allowed anyway. Cc: Vince Weaver Cc: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index bed1c4c2f251..7e346d4bc0fb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1583,10 +1583,8 @@ static int p6_pmu_init(void) break; case 9: case 13: - /* for Pentium M, we need to check if PMU exist */ - rdmsr(MSR_IA32_MISC_ENABLE, low, high); - if (low & MSR_IA32_MISC_ENABLE_EMON) - break; + /* Pentium M */ + break; default: pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model); -- cgit v1.2.3 From 151586d0f70405d99324d89aea13706cf6d7f993 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Sun, 12 Jul 2009 17:04:12 +0600 Subject: x86: Fix false positive section mismatch in es7000_32.c The variable apic_es7000_cluster references the function __cpuinit wakeup_secondary_cpu_via_mip() from a noninit section. So we've been warned by the following warning. To avoid possible collision between init/noninit, its best to mark the variable as __refdata. 
We were warned by the following warning: LD arch/x86/kernel/apic/built-in.o WARNING: arch/x86/kernel/apic/built-in.o(.data+0x198c): Section mismatch in reference from the variable apic_es7000_cluster to the function .cpuinit.text:wakeup_secondary_cpu_via_mip() Signed-off-by: Rakib Mullick LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 69328ac8de9c..8952a5890281 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -652,7 +652,8 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, return ret && es7000_apic_is_cluster(); } -struct apic apic_es7000_cluster = { +/* We've been warned by a false positive warning.Use __refdata to keep calm. */ +struct apic __refdata apic_es7000_cluster = { .name = "es7000", .probe = probe_es7000, -- cgit v1.2.3 From 7473727be884293c8171775a148e1d174d1606e6 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Sun, 12 Jul 2009 17:07:19 +0600 Subject: x86, apic: Fix false positive section mismatch in numaq_32.c The variable apic_numaq placed in noninit section references the function wakeup_secondary_cpu_via_nmi(), which is in __cpuinit section. Thus causes a section mismatch warning. To avoid such mismatch we mark apic_numaq as __refdata. We were warned by the following warning: WARNING: arch/x86/kernel/built-in.o(.data+0x932c): Section mismatch in reference from the variable apic_numaq to the function .cpuinit.text:wakeup_secondary_cpu_via_nmi() Signed-off-by: Rakib Mullick LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/numaq_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 533e59c6fc82..ca96e68f0d23 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -493,7 +493,8 @@ static void numaq_setup_portio_remap(void) (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); } -struct apic apic_numaq = { +/* Use __refdata to keep false positive warning calm. */ +struct apic __refdata apic_numaq = { .name = "NUMAQ", .probe = probe_numaq, -- cgit v1.2.3 From 2ad76643ff58bb8841f391ea8327c14abe273ea3 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 13 Jul 2009 16:14:37 -0400 Subject: x86: Fix warning in pvclock.c when building 32-bit, I see this .. 
arch/x86/kernel/pvclock.c:63:7: warning: "__x86_64__" is not defined Signed-off-by: Dave Jones LKML-Reference: <20090713201437.GA12165@redhat.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/pvclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 4f9c55f3a7c0..03801f2f761f 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -60,7 +60,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) "adc %5,%%edx ; " : "=A" (product), "=r" (tmp1), "=r" (tmp2) : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif __x86_64__ +#elif defined(__x86_64__) __asm__ ( "mul %%rdx ; shrd $32,%%rdx,%%rax" : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -- cgit v1.2.3 From 5780888bcac316508eb5f4dd23bbea8b5057647c Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Thu, 18 Jun 2009 11:44:06 -0300 Subject: lguest: fix journey fix: "make Guest" was complaining about duplicated G:032 Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest_hcall.h | 2 +- arch/x86/lguest/boot.c | 2 +- include/linux/lguest.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index d31c4a684078..33600a66755f 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -30,7 +30,7 @@ #include #include -/*G:031 But first, how does our Guest contact the Host to ask for privileged +/*G:030 But first, how does our Guest contact the Host to ask for privileged * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 7bc65f0f62c4..0188fd37b6c0 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1079,7 +1079,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -/*G:030 Once we get to lguest_init(), we know we're a Guest. The various +/*G:029 Once we get to lguest_init(), we know we're a Guest. The various * pv_ops structures in the kernel provide points for (almost) every routine we * have to override to avoid privileged instructions. */ __init void lguest_init(void) diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 7bc1440fc473..dbf2479e808e 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -11,7 +11,7 @@ #define LG_CLOCK_MIN_DELTA 100UL #define LG_CLOCK_MAX_DELTA ULONG_MAX -/*G:032 The second method of communicating with the Host is to via "struct +/*G:031 The second method of communicating with the Host is to via "struct * lguest_data". Once the Guest's initialization hypercall tells the Host where * this is, the Guest and Host both publish information in it. :*/ struct lguest_data -- cgit v1.2.3 From 7a5049205f7265620c48781814155f2763e70abb Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 17 Jul 2009 21:47:44 -0600 Subject: lguest: restrict CPUID to avoid perf counter wrmsr Avoid the following: [ 0.012093] WARNING: at arch/x86/kernel/apic/apic.c:249 native_apic_write_dummy+0x2f/0x40() Rather than chase each new cpuid-detected feature, just lie about the highest valid CPUID so this code is never run. 
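The idea, as a small userspace sketch (only the clamp mirrors the patch; the surrounding code and values are made up for illustration): guests that walk the standard CPUID leaves up to the maximum reported by leaf 0 now stop before reaching leaves such as 0xA, the architectural-perfmon leaf, whose features the host does not emulate.

#include <stdio.h>

/* pretend the real CPU reports standard leaves up to 0xb */
static unsigned int host_max_std_leaf = 0xb;

static void guest_cpuid(unsigned int leaf, unsigned int *ax)
{
	if (leaf == 0) {
		*ax = host_max_std_leaf;
		if (*ax > 5)		/* the clamp added by the patch */
			*ax = 5;
	} else {
		*ax = 0;		/* other leaves elided in this sketch */
	}
}

int main(void)
{
	unsigned int max;

	guest_cpuid(0, &max);
	printf("guest sees max standard leaf %#x\n", max);

	/* a feature walk like this never probes leaf 0xa anymore */
	for (unsigned int leaf = 1; leaf <= max; leaf++)
		printf("probing leaf %#x\n", leaf);
	return 0;
}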
Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 0188fd37b6c0..f2bf1f73d468 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -379,6 +379,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, native_cpuid(ax, bx, cx, dx); switch (function) { + case 0: /* ID and highest CPUID. Futureproof a little by sticking to + * older ones. */ + if (*ax > 5) + *ax = 5; + break; case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ *cx &= 0x00002201; -- cgit v1.2.3 From 6aa542a694dc9ea4344a8a590d2628c33d1b9431 Mon Sep 17 00:00:00 2001 From: Alexey Fisher Date: Wed, 15 Jul 2009 14:16:09 +0200 Subject: x86: Add quirk for Intel DG45ID board to avoid low memory corruption AMI BIOS with low memory corruption was found on Intel DG45ID board (Bug 13710). Add this board to the blacklist - in the (somewhat optimistic) hope of future boards/BIOSes from Intel not having this bug. Also see: http://bugzilla.kernel.org/show_bug.cgi?id=13736 Signed-off-by: Alexey Fisher Cc: ykzhao Cc: alan@lxorguk.ukuu.org.uk Cc: LKML-Reference: <1247660169-4503-1-git-send-email-bug-track@fisher-privat.net> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index de2cab132844..63f32d220ef2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -672,6 +672,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), }, }, + { + /* + * AMI BIOS with low memory corruption was found on Intel DG45ID board. + * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * match only DMI_BOARD_NAME and see if there is more bad products + * with this vendor. + */ + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), + }, + }, #endif {} }; -- cgit v1.2.3 From 8bcdbe427924a1e4b4e4cf68020e92e9f93fe011 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 14 Jul 2009 10:52:55 +0100 Subject: x86: Include all of .data.* sections in _edata on 64-bit The .data.read_mostly and .data.cacheline_aligned sections aren't covered by the _sdata .. _edata range on x86-64. This affects kmemleak reporting leading to possible false positives by not scanning the whole data section. 
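Roughly what the difference means for a conservative scanner (userspace model only; kmemleak itself is more involved): a reference stored past _edata is invisible to a scan of [_sdata, _edata), so the object it points to looks unreferenced.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int scan_finds(void *obj, uintptr_t *region, size_t nwords)
{
	for (size_t i = 0; i < nwords; i++)
		if (region[i] == (uintptr_t)obj)
			return 1;
	return 0;
}

int main(void)
{
	/* model of the data section: ordinary data in words 0-3, a
	 * .data.read_mostly-like tail in words 4-7 */
	uintptr_t data[8] = { 0 };
	void *obj = malloc(32);

	data[6] = (uintptr_t)obj;	/* the only reference lives in the tail */

	/* _edata stops after the first part: reference missed, "leak" reported */
	printf("short scan finds the object: %d\n", scan_finds(obj, data, 4));

	/* _edata covers the whole section: reference found, no false positive */
	printf("full scan finds the object:  %d\n", scan_finds(obj, data, 8));

	free(obj);
	return 0;
}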
Signed-off-by: Catalin Marinas Tested-by: Alexey Fisher Acked-by: Sam Ravnborg Cc: Pekka Enberg LKML-Reference: <1247565175.28240.37.camel@pc1117.cambridge.arm.com> Signed-off-by: Ingo Molnar Cc: Sam Ravnborg --- arch/x86/kernel/vmlinux.lds.S | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 367e87882041..59f31d2dd435 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -112,11 +112,6 @@ SECTIONS _sdata = .; DATA_DATA CONSTRUCTORS - -#ifdef CONFIG_X86_64 - /* End of data section */ - _edata = .; -#endif } :data #ifdef CONFIG_X86_32 @@ -156,10 +151,8 @@ SECTIONS .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) -#ifdef CONFIG_X86_32 /* End of data section */ _edata = .; -#endif } #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 77f32dfdd97857280ae1ebac64382ff082cd7ea0 Mon Sep 17 00:00:00 2001 From: Denis Turischev Date: Mon, 20 Jul 2009 18:48:17 +0300 Subject: x86: Add reboot fixup for SBC-fitPC2 The CompuLab SBC-fitPC2 board needs to reboot via BIOS. Signed-off-by: Denis Turischev Signed-off-by: Mike Rapoport Signed-off-by: H. Peter Anvin --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d2d1ce8170f0..508e982dd072 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -249,6 +249,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), }, }, + { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */ + .callback = set_bios_reboot, + .ident = "CompuLab SBC-FITPC2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"), + DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), + }, + }, { } }; -- cgit v1.2.3 From 155b73529583c38f30fd394d692b15a893960782 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 19 Jul 2009 18:06:35 +0200 Subject: x86: Fix movq immediate operand constraints in uaccess_64.h arch/x86/include/asm/uaccess_64.h uses wrong asm operand constraint ("ir") for movq insn. Since movq sign-extends its immediate operand, "er" constraint should be used instead. Attached patch changes all uses of __put_user_asm in uaccess_64.h to use "er" when "q" insn suffix is involved. Patch was compile tested on x86_64 with defconfig. Signed-off-by: Uros Bizjak Signed-off-by: H. 
Peter Anvin Cc: stable@kernel.org --- arch/x86/include/asm/uaccess_64.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 8cc687326eb8..db24b215fc50 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -88,11 +88,11 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) ret, "l", "k", "ir", 4); return ret; case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "ir", 8); + ret, "q", "", "er", 8); return ret; case 10: __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "ir", 10); + ret, "q", "", "er", 10); if (unlikely(ret)) return ret; asm("":::"memory"); @@ -101,12 +101,12 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) return ret; case 16: __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "ir", 16); + ret, "q", "", "er", 16); if (unlikely(ret)) return ret; asm("":::"memory"); __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "ir", 8); + ret, "q", "", "er", 8); return ret; default: return copy_user_generic((__force void *)dst, src, size); @@ -157,7 +157,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) ret, "q", "", "=r", 8); if (likely(!ret)) __put_user_asm(tmp, (u64 __user *)dst, - ret, "q", "", "ir", 8); + ret, "q", "", "er", 8); return ret; } default: -- cgit v1.2.3 From ebe119cd0929df4878f758ebf880cb435e4dcaaf Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 20 Jul 2009 23:27:39 -0700 Subject: x86: Fix movq immediate operand constraints in uaccess.h The movq instruction, generated by __put_user_asm() when used for 64-bit data, takes a sign-extended immediate ("e") not a zero-extended immediate ("Z"). Signed-off-by: H. Peter Anvin Cc: Uros Bizjak Cc: stable@kernel.org --- arch/x86/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 20e6a795e160..d2c6c930b491 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -212,9 +212,9 @@ extern int __get_user_bad(void); : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") #else #define __put_user_asm_u64(x, ptr, retval, errret) \ - __put_user_asm(x, ptr, retval, "q", "", "Zr", errret) + __put_user_asm(x, ptr, retval, "q", "", "er", errret) #define __put_user_asm_ex_u64(x, addr) \ - __put_user_asm_ex(x, addr, "q", "", "Zr") + __put_user_asm_ex(x, addr, "q", "", "er") #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) #endif -- cgit v1.2.3 From e9084ec98bb9aa3abc6cf73181177780ce7546f8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 16 Jul 2009 09:45:11 +0100 Subject: x86, mce: Fix set_trigger() accessor Fix the condition checking the result of strchr() (which previously could result in an oops), and make the function return the number of bytes actively used. [ Impact: fix oops ] Signed-off-by: Jan Beulich Cc: Andi Kleen LKML-Reference: <4A5F04B7020000780000AB59@vpn.id2.novell.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 484c1e5f658e..1cfb623ce11c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1692,17 +1692,15 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, const char *buf, size_t siz) { char *p; - int len; strncpy(mce_helper, buf, sizeof(mce_helper)); mce_helper[sizeof(mce_helper)-1] = 0; - len = strlen(mce_helper); p = strchr(mce_helper, '\n'); - if (*p) + if (p) *p = 0; - return len; + return strlen(mce_helper) + !!p; } static ssize_t set_ignore_ce(struct sys_device *s, -- cgit v1.2.3 From 429b2b319af3987e808c18f6b81313104caf782c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 18 Jul 2009 08:56:57 +0200 Subject: x86-64: Fix bad_srat() to clear all state Need to clear both nodes and nodes_add state for start/end. Signed-off-by: Andi Kleen LKML-Reference: <20090718065657.GA2898@basil.fritz.box> Signed-off-by: H. Peter Anvin Cc: stable@kernel.org --- arch/x86/mm/srat_64.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 2dfcbf9df2ae..dbb5381f7b3b 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -79,8 +79,10 @@ static __init void bad_srat(void) acpi_numa = -1; for (i = 0; i < MAX_LOCAL_APIC; i++) apicid_to_node[i] = NUMA_NO_NODE; - for (i = 0; i < MAX_NUMNODES; i++) - nodes_add[i].start = nodes[i].end = 0; + for (i = 0; i < MAX_NUMNODES; i++) { + nodes[i].start = nodes[i].end = 0; + nodes_add[i].start = nodes_add[i].end = 0; + } remove_all_active_ranges(); } -- cgit v1.2.3 From 6effa8f6fc786f00e3a23eae605e0f2e8e748faa Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Wed, 22 Jul 2009 11:56:20 +0900 Subject: x86, mce: Rename incorrect macro name "CONFIG_X86_THRESHOLD" CONFIG_X86_THRESHOLD used in arch/x86/kernel/irqinit.c is always undefined. Rename it to the correct name "CONFIG_X86_MCE_THRESHOLD". Signed-off-by: Hidehiro Kawai Reviewed-by: Hidetoshi Seto Cc: Andi Kleen LKML-Reference: <4A667FD4.3010509@hitachi.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/irqinit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 696f0e475c2d..92b7703d3d58 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -187,7 +187,7 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_THERMAL_VECTOR alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif -#ifdef CONFIG_X86_THRESHOLD +#ifdef CONFIG_X86_MCE_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif #if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) -- cgit v1.2.3 From 9b7019ae6a542a801a80df6694c66e240e2c3e39 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 16:31:36 +0200 Subject: perf_counter: Remove unused variables Fix a gcc unused variables warning. 
Signed-off-by: Peter Zijlstra --- arch/x86/kernel/cpu/perf_counter.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7e346d4bc0fb..a7aa8f900954 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1570,8 +1570,6 @@ static struct x86_pmu amd_pmu = { static int p6_pmu_init(void) { - int high, low; - switch (boot_cpu_data.x86_model) { case 1: case 3: /* Pentium Pro */ -- cgit v1.2.3 From 2cb078603abb612e3bcd428fb8122c3d39e08832 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 22 Jul 2009 09:59:35 -0700 Subject: x86, amd: Don't probe for extended APIC ID if APICs are disabled If we've logically disabled apics, don't probe the PCI space for the AMD extended APIC ID. [ Impact: prevent boot crash under Xen. ] Signed-off-by: Jeremy Fitzhardinge Reported-by: Bastian Blank Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 28e5f5956042..e2485b03f1cf 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -356,7 +356,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) /* check CPU config space for extended APIC ID */ - if (c->x86 >= 0xf) { + if (cpu_has_apic && c->x86 >= 0xf) { unsigned int val; val = read_pci_config(0, 24, 0, 0x68); if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) -- cgit v1.2.3 From d6c585a4342a2ff627a29f9aea77c5ed4cd76023 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2009 08:34:59 +0200 Subject: x86: geode: Mark mfgpt irq IRQF_TIMER to prevent resume failure Timer interrupts are excluded from being disabled during suspend. The clock events code manages the disabling of clock events on its own because the timer interrupt needs to be functional before the resume code reenables the device interrupts. The mfgpt timer request its interrupt without setting the IRQF_TIMER flag so suspend_device_irqs() disables it as well which results in a fatal resume failure. Adding IRQF_TIMER to the interupt flags when requesting the mrgpt timer interrupt solves the problem. Signed-off-by: Thomas Gleixner LKML-Reference: Cc: Andres Salomon Cc: stable@kernel.org --- arch/x86/kernel/mfgpt_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 846510b78a09..2a62d843f015 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id) static struct irqaction mfgptirq = { .handler = mfgpt_tick, - .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, .name = "mfgpt-timer" }; -- cgit v1.2.3 From 9e1b32caa525cb236e80e9c671e179bcecccc657 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 22 Jul 2009 15:44:28 +1000 Subject: mm: Pass virtual address to [__]p{te,ud,md}_free_tlb() mm: Pass virtual address to [__]p{te,ud,md}_free_tlb() Upcoming paches to support the new 64-bit "BookE" powerpc architecture will need to have the virtual address corresponding to PTE page when freeing it, due to the way the HW table walker works. 
Basically, the TLB can be loaded with "large" pages that cover the whole virtual space (well, sort-of, half of it actually) represented by a PTE page, and which contain an "indirect" bit indicating that this TLB entry RPN points to an array of PTEs from which the TLB can then create direct entries. Thus, in order to invalidate those when PTE pages are deleted, we need the virtual address to pass to tlbilx or tlbivax instructions. The old trick of sticking it somewhere in the PTE page struct page sucks too much, the address is almost readily available in all call sites and almost everybody implemets these as macros, so we may as well add the argument everywhere. I added it to the pmd and pud variants for consistency. Signed-off-by: Benjamin Herrenschmidt Acked-by: David Howells [MN10300 & FRV] Acked-by: Nick Piggin Acked-by: Martin Schwidefsky [s390] Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/tlb.h | 4 ++-- arch/arm/include/asm/tlb.h | 4 ++-- arch/avr32/include/asm/pgalloc.h | 2 +- arch/cris/include/asm/pgalloc.h | 2 +- arch/frv/include/asm/pgalloc.h | 4 ++-- arch/frv/include/asm/pgtable.h | 2 +- arch/ia64/include/asm/pgalloc.h | 6 +++--- arch/ia64/include/asm/tlb.h | 12 ++++++------ arch/m32r/include/asm/pgalloc.h | 4 ++-- arch/m68k/include/asm/motorola_pgalloc.h | 6 ++++-- arch/m68k/include/asm/sun3_pgalloc.h | 4 ++-- arch/microblaze/include/asm/pgalloc.h | 4 ++-- arch/mips/include/asm/pgalloc.h | 6 +++--- arch/mn10300/include/asm/pgalloc.h | 2 +- arch/parisc/include/asm/tlb.h | 4 ++-- arch/powerpc/include/asm/pgalloc-32.h | 2 +- arch/powerpc/include/asm/pgalloc-64.h | 4 ++-- arch/powerpc/include/asm/pgalloc.h | 6 +++--- arch/powerpc/mm/hugetlbpage.c | 4 ++-- arch/s390/include/asm/tlb.h | 9 ++++++--- arch/sh/include/asm/pgalloc.h | 4 ++-- arch/sh/include/asm/tlb.h | 6 +++--- arch/sparc/include/asm/pgalloc_32.h | 8 ++++---- arch/sparc/include/asm/tlb_64.h | 6 +++--- arch/um/include/asm/pgalloc.h | 4 ++-- arch/um/include/asm/tlb.h | 6 +++--- arch/x86/include/asm/pgalloc.h | 25 ++++++++++++++++++++++--- arch/x86/mm/pgtable.c | 6 +++--- arch/xtensa/include/asm/tlb.h | 2 +- include/asm-generic/4level-fixup.h | 4 ++-- include/asm-generic/pgtable-nopmd.h | 2 +- include/asm-generic/pgtable-nopud.h | 2 +- include/asm-generic/tlb.h | 12 ++++++------ mm/memory.c | 11 ++++++----- 34 files changed, 107 insertions(+), 82 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/tlb.h b/arch/alpha/include/asm/tlb.h index c13636575fba..42866759f3fa 100644 --- a/arch/alpha/include/asm/tlb.h +++ b/arch/alpha/include/asm/tlb.h @@ -9,7 +9,7 @@ #include -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) -#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd) +#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) +#define __pmd_free_tlb(tlb, pmd, address) pmd_free((tlb)->mm, pmd) #endif diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 321c83e43a1e..f41a6f57cd12 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -102,8 +102,8 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) } #define tlb_remove_page(tlb,page) free_page_and_swap_cache(page) -#define pte_free_tlb(tlb, ptep) pte_free((tlb)->mm, ptep) -#define pmd_free_tlb(tlb, pmdp) pmd_free((tlb)->mm, pmdp) +#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) +#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) #define tlb_migrate_finish(mm) do { } while (0) diff --git a/arch/avr32/include/asm/pgalloc.h 
b/arch/avr32/include/asm/pgalloc.h index 640821323943..92ecd8446ef8 100644 --- a/arch/avr32/include/asm/pgalloc.h +++ b/arch/avr32/include/asm/pgalloc.h @@ -83,7 +83,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) quicklist_free_page(QUICK_PT, NULL, pte); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte,addr) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ diff --git a/arch/cris/include/asm/pgalloc.h b/arch/cris/include/asm/pgalloc.h index a1ba761d0573..6da975db112f 100644 --- a/arch/cris/include/asm/pgalloc.h +++ b/arch/cris/include/asm/pgalloc.h @@ -47,7 +47,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) __free_page(pte); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte,address) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ diff --git a/arch/frv/include/asm/pgalloc.h b/arch/frv/include/asm/pgalloc.h index 971e6addb009..416d19a632f2 100644 --- a/arch/frv/include/asm/pgalloc.h +++ b/arch/frv/include/asm/pgalloc.h @@ -49,7 +49,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) __free_page(pte); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte,address) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb),(pte)); \ @@ -62,7 +62,7 @@ do { \ */ #define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *) 2); }) #define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb,x) do { } while (0) +#define __pmd_free_tlb(tlb,x,a) do { } while (0) #endif /* CONFIG_MMU */ diff --git a/arch/frv/include/asm/pgtable.h b/arch/frv/include/asm/pgtable.h index 33233011b1c1..22c60692b551 100644 --- a/arch/frv/include/asm/pgtable.h +++ b/arch/frv/include/asm/pgtable.h @@ -225,7 +225,7 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) */ #define pud_alloc_one(mm, address) NULL #define pud_free(mm, x) do { } while (0) -#define __pud_free_tlb(tlb, x) do { } while (0) +#define __pud_free_tlb(tlb, x, address) do { } while (0) /* * The "pud_xxx()" functions here are trivial for a folded two-level diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index b9ac1a6fc216..96a8d927db28 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -48,7 +48,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) { quicklist_free(0, NULL, pud); } -#define __pud_free_tlb(tlb, pud) pud_free((tlb)->mm, pud) +#define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud) #endif /* CONFIG_PGTABLE_4 */ static inline void @@ -67,7 +67,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) quicklist_free(0, NULL, pmd); } -#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd) +#define __pmd_free_tlb(tlb, pmd, address) pmd_free((tlb)->mm, pmd) static inline void pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, pgtable_t pte) @@ -117,6 +117,6 @@ static inline void check_pgt_cache(void) quicklist_trim(0, NULL, 25, 16); } -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) +#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) #endif /* _ASM_IA64_PGALLOC_H */ diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 20d8a39680c2..85d965cb19a0 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -236,22 +236,22 @@ do { \ __tlb_remove_tlb_entry(tlb, ptep, addr); \ } while (0) -#define pte_free_tlb(tlb, ptep) \ +#define pte_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ - __pte_free_tlb(tlb, 
ptep); \ + __pte_free_tlb(tlb, ptep, address); \ } while (0) -#define pmd_free_tlb(tlb, ptep) \ +#define pmd_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ - __pmd_free_tlb(tlb, ptep); \ + __pmd_free_tlb(tlb, ptep, address); \ } while (0) -#define pud_free_tlb(tlb, pudp) \ +#define pud_free_tlb(tlb, pudp, address) \ do { \ tlb->need_flush = 1; \ - __pud_free_tlb(tlb, pudp); \ + __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif /* _ASM_IA64_TLB_H */ diff --git a/arch/m32r/include/asm/pgalloc.h b/arch/m32r/include/asm/pgalloc.h index f11a2b909cdb..0fc736198979 100644 --- a/arch/m32r/include/asm/pgalloc.h +++ b/arch/m32r/include/asm/pgalloc.h @@ -58,7 +58,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) __free_page(pte); } -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, (pte)) +#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, (pte)) /* * allocating and freeing a pmd is trivial: the 1-entry pmd is @@ -68,7 +68,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) #define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb, x) do { } while (0) +#define __pmd_free_tlb(tlb, x, addr) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() #define check_pgt_cache() do { } while (0) diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index d08bf6261df8..15ee4c74a9f0 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -54,7 +54,8 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t page) __free_page(page); } -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page) +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page, + unsigned long address) { pgtable_page_dtor(page); cache_page(kmap(page)); @@ -73,7 +74,8 @@ static inline int pmd_free(struct mm_struct *mm, pmd_t *pmd) return free_pointer_table(pmd); } -static inline int __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +static inline int __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long address) { return free_pointer_table(pmd); } diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index d4c83f143816..48d80d5a666f 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -32,7 +32,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t page) __free_page(page); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte,addr) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ @@ -80,7 +80,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t page * inside the pgd, so has no extra memory associated with it. 
*/ #define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb, x) do { } while (0) +#define __pmd_free_tlb(tlb, x, addr) do { } while (0) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index 59a757e46ba5..b0131da1387b 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -180,7 +180,7 @@ extern inline void pte_free(struct mm_struct *mm, struct page *ptepage) __free_page(ptepage); } -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, (pte)) +#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, (pte)) #define pmd_populate(mm, pmd, pte) (pmd_val(*(pmd)) = page_address(pte)) @@ -193,7 +193,7 @@ extern inline void pte_free(struct mm_struct *mm, struct page *ptepage) */ #define pmd_alloc_one(mm, address) ({ BUG(); ((pmd_t *)2); }) /*#define pmd_free(mm, x) do { } while (0)*/ -#define __pmd_free_tlb(tlb, x) do { } while (0) +#define __pmd_free_tlb(tlb, x, addr) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() extern int do_check_pgt_cache(int, int); diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 1275831dda29..f705735feefc 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -98,7 +98,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) __free_pages(pte, PTE_ORDER); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte,address) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ @@ -111,7 +111,7 @@ do { \ * inside the pgd, so has no extra memory associated with it. */ #define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb, x) do { } while (0) +#define __pmd_free_tlb(tlb, x, addr) do { } while (0) #endif @@ -132,7 +132,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) free_pages((unsigned long)pmd, PMD_ORDER); } -#define __pmd_free_tlb(tlb, x) pmd_free((tlb)->mm, x) +#define __pmd_free_tlb(tlb, x, addr) pmd_free((tlb)->mm, x) #endif diff --git a/arch/mn10300/include/asm/pgalloc.h b/arch/mn10300/include/asm/pgalloc.h index ec057e1bd4cf..a19f11327cd8 100644 --- a/arch/mn10300/include/asm/pgalloc.h +++ b/arch/mn10300/include/asm/pgalloc.h @@ -51,6 +51,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte) } -#define __pte_free_tlb(tlb, pte) tlb_remove_page((tlb), (pte)) +#define __pte_free_tlb(tlb, pte, addr) tlb_remove_page((tlb), (pte)) #endif /* _ASM_PGALLOC_H */ diff --git a/arch/parisc/include/asm/tlb.h b/arch/parisc/include/asm/tlb.h index 383b1db310ee..07924903989e 100644 --- a/arch/parisc/include/asm/tlb.h +++ b/arch/parisc/include/asm/tlb.h @@ -21,7 +21,7 @@ do { if (!(tlb)->fullmm) \ #include -#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd) -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) +#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) +#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) #endif diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/pgalloc-32.h index 0815eb40acae..c9500d666a1d 100644 --- a/arch/powerpc/include/asm/pgalloc-32.h +++ b/arch/powerpc/include/asm/pgalloc-32.h @@ -16,7 +16,7 @@ extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); */ /* #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) */ #define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb,x) do { } while (0) +#define __pmd_free_tlb(tlb,x,a) do { } while (0) /* #define 
pgd_populate(mm, pmd, pte) BUG() */ #ifndef CONFIG_BOOKE diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index afda2bdd860f..e6f069c4f713 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -118,11 +118,11 @@ static inline void pgtable_free(pgtable_free_t pgf) kmem_cache_free(pgtable_cache[cachenum], p); } -#define __pmd_free_tlb(tlb, pmd) \ +#define __pmd_free_tlb(tlb, pmd,addr) \ pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \ PMD_CACHE_NUM, PMD_TABLE_SIZE-1)) #ifndef CONFIG_PPC_64K_PAGES -#define __pud_free_tlb(tlb, pud) \ +#define __pud_free_tlb(tlb, pud, addr) \ pgtable_free_tlb(tlb, pgtable_free_cache(pud, \ PUD_CACHE_NUM, PUD_TABLE_SIZE-1)) #endif /* CONFIG_PPC_64K_PAGES */ diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 5d8480265a77..1730e5e298d6 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -38,14 +38,14 @@ static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum, extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf); #ifdef CONFIG_SMP -#define __pte_free_tlb(tlb,ptepage) \ +#define __pte_free_tlb(tlb,ptepage,address) \ do { \ pgtable_page_dtor(ptepage); \ pgtable_free_tlb(tlb, pgtable_free_cache(page_address(ptepage), \ - PTE_NONCACHE_NUM, PTE_TABLE_SIZE-1)); \ + PTE_NONCACHE_NUM, PTE_TABLE_SIZE-1)); \ } while (0) #else -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, (pte)) +#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, (pte)) #endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9920d6a7cf29..c46ef2ffa3d9 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -305,7 +305,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); - pmd_free_tlb(tlb, pmd); + pmd_free_tlb(tlb, pmd, start); } static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -348,7 +348,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud); + pud_free_tlb(tlb, pud, start); } /* diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 3d8a96d39d9d..81150b053689 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -96,7 +96,8 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) * pte_free_tlb frees a pte table and clears the CRSTE for the * page table from the tlb. */ -static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte) +static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, + unsigned long address) { if (!tlb->fullmm) { tlb->array[tlb->nr_ptes++] = pte; @@ -113,7 +114,8 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte) * as the pgd. pmd_free_tlb checks the asce_limit against 2GB * to avoid the double free of the pmd in this case. */ -static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long address) { #ifdef __s390x__ if (tlb->mm->context.asce_limit <= (1UL << 31)) @@ -134,7 +136,8 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) * as the pgd. pud_free_tlb checks the asce_limit against 4TB * to avoid the double free of the pud in this case. 
*/ -static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) { #ifdef __s390x__ if (tlb->mm->context.asce_limit <= (1UL << 42)) diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 84dd2db7104c..89a482750a5b 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -73,7 +73,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) quicklist_free_page(QUICK_PT, NULL, pte); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte,addr) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ @@ -85,7 +85,7 @@ do { \ */ #define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb,x) do { } while (0) +#define __pmd_free_tlb(tlb,x,addr) do { } while (0) static inline void check_pgt_cache(void) { diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 9c16f737074a..da8fe7ab8728 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -91,9 +91,9 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) } #define tlb_remove_page(tlb,page) free_page_and_swap_cache(page) -#define pte_free_tlb(tlb, ptep) pte_free((tlb)->mm, ptep) -#define pmd_free_tlb(tlb, pmdp) pmd_free((tlb)->mm, pmdp) -#define pud_free_tlb(tlb, pudp) pud_free((tlb)->mm, pudp) +#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) +#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) +#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) #define tlb_migrate_finish(mm) do { } while (0) diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h index 681582d26969..ca2b34456c4b 100644 --- a/arch/sparc/include/asm/pgalloc_32.h +++ b/arch/sparc/include/asm/pgalloc_32.h @@ -44,8 +44,8 @@ BTFIXUPDEF_CALL(pmd_t *, pmd_alloc_one, struct mm_struct *, unsigned long) BTFIXUPDEF_CALL(void, free_pmd_fast, pmd_t *) #define free_pmd_fast(pmd) BTFIXUP_CALL(free_pmd_fast)(pmd) -#define pmd_free(mm, pmd) free_pmd_fast(pmd) -#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd) +#define pmd_free(mm, pmd) free_pmd_fast(pmd) +#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) BTFIXUPDEF_CALL(void, pmd_populate, pmd_t *, struct page *) #define pmd_populate(MM, PMD, PTE) BTFIXUP_CALL(pmd_populate)(PMD, PTE) @@ -62,7 +62,7 @@ BTFIXUPDEF_CALL(void, free_pte_fast, pte_t *) #define pte_free_kernel(mm, pte) BTFIXUP_CALL(free_pte_fast)(pte) BTFIXUPDEF_CALL(void, pte_free, pgtable_t ) -#define pte_free(mm, pte) BTFIXUP_CALL(pte_free)(pte) -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) +#define pte_free(mm, pte) BTFIXUP_CALL(pte_free)(pte) +#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) #endif /* _SPARC_PGALLOC_H */ diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h index ee38e731bfa6..dca406b9b6fc 100644 --- a/arch/sparc/include/asm/tlb_64.h +++ b/arch/sparc/include/asm/tlb_64.h @@ -100,9 +100,9 @@ static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page) } #define tlb_remove_tlb_entry(mp,ptep,addr) do { } while (0) -#define pte_free_tlb(mp, ptepage) pte_free((mp)->mm, ptepage) -#define pmd_free_tlb(mp, pmdp) pmd_free((mp)->mm, pmdp) -#define pud_free_tlb(tlb,pudp) __pud_free_tlb(tlb,pudp) +#define pte_free_tlb(mp, ptepage, addr) pte_free((mp)->mm, ptepage) +#define pmd_free_tlb(mp, pmdp, addr) pmd_free((mp)->mm, pmdp) +#define pud_free_tlb(tlb,pudp, addr) 
__pud_free_tlb(tlb,pudp,addr) #define tlb_migrate_finish(mm) do { } while (0) #define tlb_start_vma(tlb, vma) do { } while (0) diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 718984359f8c..32c8ce4e1515 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -40,7 +40,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) __free_page(pte); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte, address) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb),(pte)); \ @@ -53,7 +53,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) +#define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x)) #endif #define check_pgt_cache() do { } while (0) diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 5240fa1c5e08..660caedac9eb 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -116,11 +116,11 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) -#define pte_free_tlb(tlb, ptep) __pte_free_tlb(tlb, ptep) +#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) -#define pud_free_tlb(tlb, pudp) __pud_free_tlb(tlb, pudp) +#define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) -#define pmd_free_tlb(tlb, pmdp) __pmd_free_tlb(tlb, pmdp) +#define pmd_free_tlb(tlb, pmdp, addr) __pmd_free_tlb(tlb, pmdp, addr) #define tlb_migrate_finish(mm) do {} while (0) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index dd14c54ac718..0e8c2a0fd922 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -46,7 +46,13 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte) __free_page(pte); } -extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); +extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); + +static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, + unsigned long address) +{ + ___pte_free_tlb(tlb, pte); +} static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) @@ -78,7 +84,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); +extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); + +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long adddress) +{ + ___pmd_free_tlb(tlb, pmd); +} #ifdef CONFIG_X86_PAE extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); @@ -108,7 +120,14 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) free_page((unsigned long)pud); } -extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); +extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); + +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + ___pud_free_tlb(tlb, pud); +} + #endif /* PAGETABLE_LEVELS > 3 */ #endif /* PAGETABLE_LEVELS > 2 */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8e43bdd45456..af8f9650058c 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -25,7 +25,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) return pte; } -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +void 
___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); @@ -33,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) } #if PAGETABLE_LEVELS > 2 -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pmd)); } #if PAGETABLE_LEVELS > 3 -void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pud)); diff --git a/arch/xtensa/include/asm/tlb.h b/arch/xtensa/include/asm/tlb.h index 31c220faca02..0d766f9c1083 100644 --- a/arch/xtensa/include/asm/tlb.h +++ b/arch/xtensa/include/asm/tlb.h @@ -42,6 +42,6 @@ #include -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) +#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) #endif /* _XTENSA_TLB_H */ diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h index 9d40e879f99e..77ff547730af 100644 --- a/include/asm-generic/4level-fixup.h +++ b/include/asm-generic/4level-fixup.h @@ -27,9 +27,9 @@ #define pud_page_vaddr(pud) pgd_page_vaddr(pud) #undef pud_free_tlb -#define pud_free_tlb(tlb, x) do { } while (0) +#define pud_free_tlb(tlb, x, addr) do { } while (0) #define pud_free(mm, x) do { } while (0) -#define __pud_free_tlb(tlb, x) do { } while (0) +#define __pud_free_tlb(tlb, x, addr) do { } while (0) #undef pud_addr_end #define pud_addr_end(addr, end) (end) diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h index a7cdc48e8b78..725612b793ce 100644 --- a/include/asm-generic/pgtable-nopmd.h +++ b/include/asm-generic/pgtable-nopmd.h @@ -59,7 +59,7 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address) static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { } -#define __pmd_free_tlb(tlb, x) do { } while (0) +#define __pmd_free_tlb(tlb, x, a) do { } while (0) #undef pmd_addr_end #define pmd_addr_end(addr, end) (end) diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h index 87cf449a6df3..810431d8351b 100644 --- a/include/asm-generic/pgtable-nopud.h +++ b/include/asm-generic/pgtable-nopud.h @@ -52,7 +52,7 @@ static inline pud_t * pud_offset(pgd_t * pgd, unsigned long address) */ #define pud_alloc_one(mm, address) NULL #define pud_free(mm, x) do { } while (0) -#define __pud_free_tlb(tlb, x) do { } while (0) +#define __pud_free_tlb(tlb, x, a) do { } while (0) #undef pud_addr_end #define pud_addr_end(addr, end) (end) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index f490e43a90b9..e43f9766259f 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -123,24 +123,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) -#define pte_free_tlb(tlb, ptep) \ +#define pte_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ - __pte_free_tlb(tlb, ptep); \ + __pte_free_tlb(tlb, ptep, address); \ } while (0) #ifndef __ARCH_HAS_4LEVEL_HACK -#define pud_free_tlb(tlb, pudp) \ +#define pud_free_tlb(tlb, pudp, address) \ do { \ tlb->need_flush = 1; \ - __pud_free_tlb(tlb, pudp); \ + __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif -#define pmd_free_tlb(tlb, pmdp) \ +#define pmd_free_tlb(tlb, pmdp, address) \ do { \ tlb->need_flush = 
1; \ - __pmd_free_tlb(tlb, pmdp); \ + __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #define tlb_migrate_finish(mm) do {} while (0) diff --git a/mm/memory.c b/mm/memory.c index 65216194eb8d..aede2ce3aba4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -135,11 +135,12 @@ void pmd_clear_bad(pmd_t *pmd) * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); - pte_free_tlb(tlb, token); + pte_free_tlb(tlb, token, addr); tlb->mm->nr_ptes--; } @@ -157,7 +158,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - free_pte_range(tlb, pmd); + free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; @@ -173,7 +174,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); - pmd_free_tlb(tlb, pmd); + pmd_free_tlb(tlb, pmd, start); } static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -206,7 +207,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud); + pud_free_tlb(tlb, pud, start); } /* -- cgit v1.2.3 From 0e83815be719d3391bf5ea24b7fe696c07dbd417 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 27 Jul 2009 19:43:52 +0200 Subject: x86: fix section mismatch for i386 init code Startup code for i386 in arch/x86/kernel/head_32.S is using the reference variable initial_code that is located in the .cpuinit.data section. If CONFIG_HOTPLUG_CPU is enabled, startup code is not in an init section and can be called later too. In this case the reference initial_code must be kept too. This patch fixes this. See below for the section mismatch warning. WARNING: vmlinux.o(.cpuinit.data+0x0): Section mismatch in reference from the variable initial_code to the function .init.text:i386_start_kernel() The variable __cpuinitdata initial_code references a function __init i386_start_kernel(). If i386_start_kernel is only used by initial_code then annotate i386_start_kernel with a matching annotation. Signed-off-by: Robert Richter LKML-Reference: <1248716632-26844-1-git-send-email-robert.richter@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 8663afb56535..0d98a01cbdb2 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -602,7 +602,11 @@ ignore_int: #endif iret -.section .cpuinit.data,"wa" +#ifndef CONFIG_HOTPLUG_CPU + __CPUINITDATA +#else + __REFDATA +#endif .align 4 ENTRY(initial_code) .long i386_start_kernel -- cgit v1.2.3 From 73ba651fc246fcc3e446da4155e0425b4219d2c4 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Fri, 24 Jul 2009 09:57:33 +0200 Subject: x86: Export kmap_atomic_prot() needed for TTM. This functionality is needed to kmap_atomic() highmem pages that may potentially have or are about to set up other mappings with non-standard caching attributes. 
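[ Editor's note: with kmap_atomic_prot() exported, a module such as TTM can create a temporary highmem mapping with non-default caching attributes. The sketch below shows how a caller of this kernel vintage might use it; the function, page, buffer and write-combining protection are illustrative assumptions rather than the actual TTM code, and the KM_USER0 slot plus two-argument kunmap_atomic() reflect the pre-2.6.37 kmap_atomic API. ]

#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/types.h>
#include <asm/pgtable.h>

/* Copy a buffer into 'page' through a temporary write-combined
 * mapping instead of the default cacheable one. kmap_atomic_prot()
 * disables pagefaults itself, so no extra locking is shown here. */
static void copy_to_wc_page(struct page *page, const void *src, size_t len)
{
	pgprot_t prot = pgprot_writecombine(PAGE_KERNEL);
	void *vaddr = kmap_atomic_prot(page, KM_USER0, prot);

	memcpy(vaddr, src, len);
	kunmap_atomic(vaddr, KM_USER0);
}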
Signed-off-by: Thomas Hellstrom Signed-off-by: Dave Airlie --- arch/x86/mm/highmem_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 58f621e81919..2112ed55e7ea 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -103,6 +103,7 @@ EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kmap_atomic_prot); void __init set_highmem_pages_init(void) { -- cgit v1.2.3 From 2e04ef76916d1e29a077ea9d0f2003c8fd86724d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:45 -0600 Subject: lguest: fix comment style I don't really notice it (except to begrudge the extra vertical space), but Ingo does. And he pointed out that one excuse of lguest is as a teaching tool, it should set a good example. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- Documentation/lguest/lguest.c | 540 ++++++++++++++++++++++------------ arch/x86/include/asm/lguest.h | 3 +- arch/x86/include/asm/lguest_hcall.h | 10 +- arch/x86/lguest/boot.c | 428 +++++++++++++++++---------- arch/x86/lguest/i386_head.S | 110 ++++--- drivers/lguest/core.c | 114 ++++--- drivers/lguest/hypercalls.c | 141 ++++++--- drivers/lguest/interrupts_and_traps.c | 288 ++++++++++++------ drivers/lguest/lg.h | 23 +- drivers/lguest/lguest_device.c | 150 ++++++---- drivers/lguest/lguest_user.c | 137 ++++++--- drivers/lguest/page_tables.c | 427 ++++++++++++++++++--------- drivers/lguest/segments.c | 106 ++++--- drivers/lguest/x86/core.c | 372 +++++++++++++++-------- drivers/lguest/x86/switcher_32.S | 18 +- include/linux/lguest.h | 36 ++- include/linux/lguest_launcher.h | 18 +- 17 files changed, 1906 insertions(+), 1015 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 45d7d6dcae7a..aa66a52b73e9 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -1,7 +1,9 @@ -/*P:100 This is the Launcher code, a simple program which lays out the - * "physical" memory for the new Guest by mapping the kernel image and - * the virtual devices, then opens /dev/lguest to tell the kernel - * about the Guest and control it. :*/ +/*P:100 + * This is the Launcher code, a simple program which lays out the "physical" + * memory for the new Guest by mapping the kernel image and the virtual + * devices, then opens /dev/lguest to tell the kernel about the Guest and + * control it. +:*/ #define _LARGEFILE64_SOURCE #define _GNU_SOURCE #include @@ -46,13 +48,15 @@ #include "linux/virtio_rng.h" #include "linux/virtio_ring.h" #include "asm/bootparam.h" -/*L:110 We can ignore the 39 include files we need for this program, but I do - * want to draw attention to the use of kernel-style types. +/*L:110 + * We can ignore the 39 include files we need for this program, but I do want + * to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I * like these abbreviations, so we define them here. Note that u64 is always * unsigned long long, which works on all Linux systems: this means that we can - * use %llu in printf for any u64. */ + * use %llu in printf for any u64. + */ typedef unsigned long long u64; typedef uint32_t u32; typedef uint16_t u16; @@ -69,8 +73,10 @@ typedef uint8_t u8; /* This will occupy 3 pages: it must be a power of 2. */ #define VIRTQUEUE_NUM 256 -/*L:120 verbose is both a global flag and a macro. 
The C preprocessor allows - * this, and although I wouldn't recommend it, it works quite nicely here. */ +/*L:120 + * verbose is both a global flag and a macro. The C preprocessor allows + * this, and although I wouldn't recommend it, it works quite nicely here. + */ static bool verbose; #define verbose(args...) \ do { if (verbose) printf(args); } while(0) @@ -100,8 +106,7 @@ struct device_list /* A single linked list of devices. */ struct device *dev; - /* And a pointer to the last device for easy append and also for - * configuration appending. */ + /* And a pointer to the last device for easy append. */ struct device *lastdev; }; @@ -168,20 +173,24 @@ static char **main_args; /* The original tty settings to restore on exit. */ static struct termios orig_term; -/* We have to be careful with barriers: our devices are all run in separate +/* + * We have to be careful with barriers: our devices are all run in separate * threads and so we need to make sure that changes visible to the Guest happen - * in precise order. */ + * in precise order. + */ #define wmb() __asm__ __volatile__("" : : : "memory") #define mb() __asm__ __volatile__("" : : : "memory") -/* Convert an iovec element to the given type. +/* + * Convert an iovec element to the given type. * * This is a fairly ugly trick: we need to know the size of the type and * alignment requirement to check the pointer is kosher. It's also nice to * have the name of the type in case we report failure. * * Typing those three things all the time is cumbersome and error prone, so we - * have a macro which sets them all up and passes to the real function. */ + * have a macro which sets them all up and passes to the real function. + */ #define convert(iov, type) \ ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) @@ -198,8 +207,10 @@ static void *_convert(struct iovec *iov, size_t size, size_t align, /* Wrapper for the last available index. Makes it easier to change. */ #define lg_last_avail(vq) ((vq)->last_avail_idx) -/* The virtio configuration space is defined to be little-endian. x86 is - * little-endian too, but it's nice to be explicit so we have these helpers. */ +/* + * The virtio configuration space is defined to be little-endian. x86 is + * little-endian too, but it's nice to be explicit so we have these helpers. + */ #define cpu_to_le16(v16) (v16) #define cpu_to_le32(v32) (v32) #define cpu_to_le64(v64) (v64) @@ -241,11 +252,12 @@ static u8 *get_feature_bits(struct device *dev) + dev->num_vq * sizeof(struct lguest_vqconfig); } -/*L:100 The Launcher code itself takes us out into userspace, that scary place - * where pointers run wild and free! Unfortunately, like most userspace - * programs, it's quite boring (which is why everyone likes to hack on the - * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it - * will get you through this section. Or, maybe not. +/*L:100 + * The Launcher code itself takes us out into userspace, that scary place where + * pointers run wild and free! Unfortunately, like most userspace programs, + * it's quite boring (which is why everyone likes to hack on the kernel!). + * Perhaps if you make up an Lguest Drinking Game at this point, it will get + * you through this section. Or, maybe not. * * The Launcher sets up a big chunk of memory to be the Guest's "physical" * memory and stores it in "guest_base". 
In other words, Guest physical == @@ -253,7 +265,8 @@ static u8 *get_feature_bits(struct device *dev) * * This can be tough to get your head around, but usually it just means that we * use these trivial conversion functions when the Guest gives us it's - * "physical" addresses: */ + * "physical" addresses: + */ static void *from_guest_phys(unsigned long addr) { return guest_base + addr; @@ -268,7 +281,8 @@ static unsigned long to_guest_phys(const void *addr) * Loading the Kernel. * * We start with couple of simple helper routines. open_or_die() avoids - * error-checking code cluttering the callers: */ + * error-checking code cluttering the callers: + */ static int open_or_die(const char *name, int flags) { int fd = open(name, flags); @@ -283,8 +297,10 @@ static void *map_zeroed_pages(unsigned int num) int fd = open_or_die("/dev/zero", O_RDONLY); void *addr; - /* We use a private mapping (ie. if we write to the page, it will be - * copied). */ + /* + * We use a private mapping (ie. if we write to the page, it will be + * copied). + */ addr = mmap(NULL, getpagesize() * num, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); if (addr == MAP_FAILED) @@ -305,20 +321,24 @@ static void *get_pages(unsigned int num) return addr; } -/* This routine is used to load the kernel or initrd. It tries mmap, but if +/* + * This routine is used to load the kernel or initrd. It tries mmap, but if * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), - * it falls back to reading the memory in. */ + * it falls back to reading the memory in. + */ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) { ssize_t r; - /* We map writable even though for some segments are marked read-only. + /* + * We map writable even though for some segments are marked read-only. * The kernel really wants to be writable: it patches its own * instructions. * * MAP_PRIVATE means that the page won't be copied until a write is * done to it. This allows us to share untouched memory between - * Guests. */ + * Guests. + */ if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) return; @@ -329,7 +349,8 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); } -/* This routine takes an open vmlinux image, which is in ELF, and maps it into +/* + * This routine takes an open vmlinux image, which is in ELF, and maps it into * the Guest memory. ELF = Embedded Linking Format, which is the format used * by all modern binaries on Linux including the kernel. * @@ -337,23 +358,28 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) * address. We use the physical address; the Guest will map itself to the * virtual address. * - * We return the starting address. */ + * We return the starting address. + */ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) { Elf32_Phdr phdr[ehdr->e_phnum]; unsigned int i; - /* Sanity checks on the main ELF header: an x86 executable with a - * reasonable number of correctly-sized program headers. */ + /* + * Sanity checks on the main ELF header: an x86 executable with a + * reasonable number of correctly-sized program headers. 
+ */ if (ehdr->e_type != ET_EXEC || ehdr->e_machine != EM_386 || ehdr->e_phentsize != sizeof(Elf32_Phdr) || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) errx(1, "Malformed elf header"); - /* An ELF executable contains an ELF header and a number of "program" + /* + * An ELF executable contains an ELF header and a number of "program" * headers which indicate which parts ("segments") of the program to - * load where. */ + * load where. + */ /* We read in all the program headers at once: */ if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) @@ -361,8 +387,10 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) err(1, "Reading program headers"); - /* Try all the headers: there are usually only three. A read-only one, - * a read-write one, and a "note" section which we don't load. */ + /* + * Try all the headers: there are usually only three. A read-only one, + * a read-write one, and a "note" section which we don't load. + */ for (i = 0; i < ehdr->e_phnum; i++) { /* If this isn't a loadable segment, we ignore it */ if (phdr[i].p_type != PT_LOAD) @@ -380,13 +408,15 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) return ehdr->e_entry; } -/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're - * supposed to jump into it and it will unpack itself. We used to have to - * perform some hairy magic because the unpacking code scared me. +/*L:150 + * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed + * to jump into it and it will unpack itself. We used to have to perform some + * hairy magic because the unpacking code scared me. * * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote * a small patch to jump over the tricky bits in the Guest, so now we just read - * the funky header so we know where in the file to load, and away we go! */ + * the funky header so we know where in the file to load, and away we go! + */ static unsigned long load_bzimage(int fd) { struct boot_params boot; @@ -394,8 +424,10 @@ static unsigned long load_bzimage(int fd) /* Modern bzImages get loaded at 1M. */ void *p = from_guest_phys(0x100000); - /* Go back to the start of the file and read the header. It should be - * a Linux boot header (see Documentation/x86/i386/boot.txt) */ + /* + * Go back to the start of the file and read the header. It should be + * a Linux boot header (see Documentation/x86/i386/boot.txt) + */ lseek(fd, 0, SEEK_SET); read(fd, &boot, sizeof(boot)); @@ -414,9 +446,11 @@ static unsigned long load_bzimage(int fd) return boot.hdr.code32_start; } -/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels +/*L:140 + * Loading the kernel is easy when it's a "vmlinux", but most kernels * come wrapped up in the self-decompressing "bzImage" format. With a little - * work, we can load those, too. */ + * work, we can load those, too. + */ static unsigned long load_kernel(int fd) { Elf32_Ehdr hdr; @@ -433,24 +467,28 @@ static unsigned long load_kernel(int fd) return load_bzimage(fd); } -/* This is a trivial little helper to align pages. Andi Kleen hated it because +/* + * This is a trivial little helper to align pages. Andi Kleen hated it because * it calls getpagesize() twice: "it's dumb code." * * Kernel guys get really het up about optimization, even when it's not - * necessary. I leave this code as a reaction against that. */ + * necessary. I leave this code as a reaction against that. 
+ */ static inline unsigned long page_align(unsigned long addr) { /* Add upwards and truncate downwards. */ return ((addr + getpagesize()-1) & ~(getpagesize()-1)); } -/*L:180 An "initial ram disk" is a disk image loaded into memory along with - * the kernel which the kernel can use to boot from without needing any - * drivers. Most distributions now use this as standard: the initrd contains - * the code to load the appropriate driver modules for the current machine. +/*L:180 + * An "initial ram disk" is a disk image loaded into memory along with the + * kernel which the kernel can use to boot from without needing any drivers. + * Most distributions now use this as standard: the initrd contains the code to + * load the appropriate driver modules for the current machine. * * Importantly, James Morris works for RedHat, and Fedora uses initrds for its - * kernels. He sent me this (and tells me when I break it). */ + * kernels. He sent me this (and tells me when I break it). + */ static unsigned long load_initrd(const char *name, unsigned long mem) { int ifd; @@ -462,12 +500,16 @@ static unsigned long load_initrd(const char *name, unsigned long mem) if (fstat(ifd, &st) < 0) err(1, "fstat() on initrd '%s'", name); - /* We map the initrd at the top of memory, but mmap wants it to be - * page-aligned, so we round the size up for that. */ + /* + * We map the initrd at the top of memory, but mmap wants it to be + * page-aligned, so we round the size up for that. + */ len = page_align(st.st_size); map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); - /* Once a file is mapped, you can close the file descriptor. It's a - * little odd, but quite useful. */ + /* + * Once a file is mapped, you can close the file descriptor. It's a + * little odd, but quite useful. + */ close(ifd); verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); @@ -476,8 +518,10 @@ static unsigned long load_initrd(const char *name, unsigned long mem) } /*:*/ -/* Simple routine to roll all the commandline arguments together with spaces - * between them. */ +/* + * Simple routine to roll all the commandline arguments together with spaces + * between them. + */ static void concat(char *dst, char *args[]) { unsigned int i, len = 0; @@ -494,10 +538,12 @@ static void concat(char *dst, char *args[]) dst[len] = '\0'; } -/*L:185 This is where we actually tell the kernel to initialize the Guest. We +/*L:185 + * This is where we actually tell the kernel to initialize the Guest. We * saw the arguments it expects when we looked at initialize() in lguest_user.c: * the base of Guest "physical" memory, the top physical page to allow and the - * entry point for the Guest. */ + * entry point for the Guest. + */ static void tell_kernel(unsigned long start) { unsigned long args[] = { LHREQ_INITIALIZE, @@ -522,20 +568,26 @@ static void tell_kernel(unsigned long start) static void *_check_pointer(unsigned long addr, unsigned int size, unsigned int line) { - /* We have to separately check addr and addr+size, because size could - * be huge and addr + size might wrap around. */ + /* + * We have to separately check addr and addr+size, because size could + * be huge and addr + size might wrap around. + */ if (addr >= guest_limit || addr + size >= guest_limit) errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); - /* We return a pointer for the caller's convenience, now we know it's - * safe to use. */ + /* + * We return a pointer for the caller's convenience, now we know it's + * safe to use. 
+ */ return from_guest_phys(addr); } /* A macro which transparently hands the line number to the real function. */ #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) -/* Each buffer in the virtqueues is actually a chain of descriptors. This +/* + * Each buffer in the virtqueues is actually a chain of descriptors. This * function returns the next descriptor in the chain, or vq->vring.num if we're - * at the end. */ + * at the end. + */ static unsigned next_desc(struct vring_desc *desc, unsigned int i, unsigned int max) { @@ -576,12 +628,14 @@ static void trigger_irq(struct virtqueue *vq) err(1, "Triggering irq %i", vq->config.irq); } -/* This looks in the virtqueue and for the first available buffer, and converts +/* + * This looks in the virtqueue and for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * - * This function returns the descriptor number found. */ + * This function returns the descriptor number found. + */ static unsigned wait_for_vq_desc(struct virtqueue *vq, struct iovec iov[], unsigned int *out_num, unsigned int *in_num) @@ -599,8 +653,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, /* OK, now we need to know about added descriptors. */ vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; - /* They could have slipped one in as we were doing that: make - * sure it's written, then check again. */ + /* + * They could have slipped one in as we were doing that: make + * sure it's written, then check again. + */ mb(); if (last_avail != vq->vring.avail->idx) { vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; @@ -620,8 +676,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, errx(1, "Guest moved used index from %u to %u", last_avail, vq->vring.avail->idx); - /* Grab the next descriptor number they're advertising, and increment - * the index we've seen. */ + /* + * Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ head = vq->vring.avail->ring[last_avail % vq->vring.num]; lg_last_avail(vq)++; @@ -636,8 +694,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, desc = vq->vring.desc; i = head; - /* If this is an indirect entry, then this buffer contains a descriptor - * table which we handle as if it's any normal descriptor chain. */ + /* + * If this is an indirect entry, then this buffer contains a descriptor + * table which we handle as if it's any normal descriptor chain. + */ if (desc[i].flags & VRING_DESC_F_INDIRECT) { if (desc[i].len % sizeof(struct vring_desc)) errx(1, "Invalid size for indirect buffer table"); @@ -656,8 +716,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, if (desc[i].flags & VRING_DESC_F_WRITE) (*in_num)++; else { - /* If it's an output descriptor, they're all supposed - * to come before any input descriptors. */ + /* + * If it's an output descriptor, they're all supposed + * to come before any input descriptors. + */ if (*in_num) errx(1, "Descriptor has out after in"); (*out_num)++; @@ -671,14 +733,18 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, return head; } -/* After we've used one of their buffers, we tell them about it. We'll then - * want to send them an interrupt, using trigger_irq(). */ +/* + * After we've used one of their buffers, we tell them about it. 
We'll then + * want to send them an interrupt, using trigger_irq(). + */ static void add_used(struct virtqueue *vq, unsigned int head, int len) { struct vring_used_elem *used; - /* The virtqueue contains a ring of used buffers. Get a pointer to the - * next entry in that used ring. */ + /* + * The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. + */ used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; used->id = head; used->len = len; @@ -698,7 +764,8 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) /* * The Console * - * We associate some data with the console for our exit hack. */ + * We associate some data with the console for our exit hack. + */ struct console_abort { /* How many times have they hit ^C? */ @@ -725,20 +792,24 @@ static void console_input(struct virtqueue *vq) if (len <= 0) { /* Ran out of input? */ warnx("Failed to get console input, ignoring console."); - /* For simplicity, dying threads kill the whole Launcher. So - * just nap here. */ + /* + * For simplicity, dying threads kill the whole Launcher. So + * just nap here. + */ for (;;) pause(); } add_used_and_trigger(vq, head, len); - /* Three ^C within one second? Exit. + /* + * Three ^C within one second? Exit. * * This is such a hack, but works surprisingly well. Each ^C has to * be in a buffer by itself, so they can't be too fast. But we check * that we get three within about a second, so they can't be too - * slow. */ + * slow. + */ if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { abort->count = 0; return; @@ -809,8 +880,7 @@ static bool will_block(int fd) return select(fd+1, &fdset, NULL, NULL, &zero) != 1; } -/* This is where we handle packets coming in from the tun device to our - * Guest. */ +/* This handles packets coming in from the tun device to our Guest. */ static void net_input(struct virtqueue *vq) { int len; @@ -842,8 +912,10 @@ static int do_thread(void *_vq) return 0; } -/* When a child dies, we kill our entire process group with SIGTERM. This - * also has the side effect that the shell restores the console for us! */ +/* + * When a child dies, we kill our entire process group with SIGTERM. This + * also has the side effect that the shell restores the console for us! + */ static void kill_launcher(int signal) { kill(0, SIGTERM); @@ -880,9 +952,10 @@ static void reset_device(struct device *dev) static void create_thread(struct virtqueue *vq) { - /* Create stack for thread and run it. Since stack grows - * upwards, we point the stack pointer to the end of this - * region. */ + /* + * Create stack for thread and run it. Since the stack grows upwards, + * we point the stack pointer to the end of this region. + */ char *stack = malloc(32768); unsigned long args[] = { LHREQ_EVENTFD, vq->config.pfn*getpagesize(), 0 }; @@ -981,8 +1054,11 @@ static void handle_output(unsigned long addr) } } - /* Early console write is done using notify on a nul-terminated string - * in Guest memory. */ + /* + * Early console write is done using notify on a nul-terminated string + * in Guest memory. It's also great for hacking debugging messages + * into a Guest. + */ if (addr >= guest_limit) errx(1, "Bad NOTIFY %#lx", addr); @@ -998,10 +1074,12 @@ static void handle_output(unsigned long addr) * routines to allocate and manage them. 
*/ -/* The layout of the device page is a "struct lguest_device_desc" followed by a +/* + * The layout of the device page is a "struct lguest_device_desc" followed by a * number of virtqueue descriptors, then two sets of feature bits, then an * array of configuration bytes. This routine returns the configuration - * pointer. */ + * pointer. + */ static u8 *device_config(const struct device *dev) { return (void *)(dev->desc + 1) @@ -1009,9 +1087,11 @@ static u8 *device_config(const struct device *dev) + dev->feature_len * 2; } -/* This routine allocates a new "struct lguest_device_desc" from descriptor +/* + * This routine allocates a new "struct lguest_device_desc" from descriptor * table page just above the Guest's normal memory. It returns a pointer to - * that descriptor. */ + * that descriptor. + */ static struct lguest_device_desc *new_dev_desc(u16 type) { struct lguest_device_desc d = { .type = type }; @@ -1032,8 +1112,10 @@ static struct lguest_device_desc *new_dev_desc(u16 type) return memcpy(p, &d, sizeof(d)); } -/* Each device descriptor is followed by the description of its virtqueues. We - * specify how many descriptors the virtqueue is to have. */ +/* + * Each device descriptor is followed by the description of its virtqueues. We + * specify how many descriptors the virtqueue is to have. + */ static void add_virtqueue(struct device *dev, unsigned int num_descs, void (*service)(struct virtqueue *)) { @@ -1061,10 +1143,12 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, /* Initialize the vring. */ vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); - /* Append virtqueue to this device's descriptor. We use + /* + * Append virtqueue to this device's descriptor. We use * device_config() to get the end of the device's current virtqueues; * we check that we haven't added any config or feature information - * yet, otherwise we'd be overwriting them. */ + * yet, otherwise we'd be overwriting them. + */ assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); memcpy(device_config(dev), &vq->config, sizeof(vq->config)); dev->num_vq++; @@ -1072,14 +1156,18 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, verbose("Virtqueue page %#lx\n", to_guest_phys(p)); - /* Add to tail of list, so dev->vq is first vq, dev->vq->next is - * second. */ + /* + * Add to tail of list, so dev->vq is first vq, dev->vq->next is + * second. + */ for (i = &dev->vq; *i; i = &(*i)->next); *i = vq; } -/* The first half of the feature bitmask is for us to advertise features. The - * second half is for the Guest to accept features. */ +/* + * The first half of the feature bitmask is for us to advertise features. The + * second half is for the Guest to accept features. + */ static void add_feature(struct device *dev, unsigned bit) { u8 *features = get_feature_bits(dev); @@ -1093,9 +1181,11 @@ static void add_feature(struct device *dev, unsigned bit) features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); } -/* This routine sets the configuration fields for an existing device's +/* + * This routine sets the configuration fields for an existing device's * descriptor. It only works for the last device, but that's OK because that's - * how we use it. */ + * how we use it. + */ static void set_config(struct device *dev, unsigned len, const void *conf) { /* Check we haven't overflowed our single page. 
*/ @@ -1110,10 +1200,12 @@ static void set_config(struct device *dev, unsigned len, const void *conf) assert(dev->desc->config_len == len); } -/* This routine does all the creation and setup of a new device, including +/* + * This routine does all the creation and setup of a new device, including * calling new_dev_desc() to allocate the descriptor and device memory. * - * See what I mean about userspace being boring? */ + * See what I mean about userspace being boring? + */ static struct device *new_device(const char *name, u16 type) { struct device *dev = malloc(sizeof(*dev)); @@ -1126,10 +1218,12 @@ static struct device *new_device(const char *name, u16 type) dev->num_vq = 0; dev->running = false; - /* Append to device list. Prepending to a single-linked list is + /* + * Append to device list. Prepending to a single-linked list is * easier, but the user expects the devices to be arranged on the bus * in command-line order. The first network device on the command line - * is eth0, the first block device /dev/vda, etc. */ + * is eth0, the first block device /dev/vda, etc. + */ if (devices.lastdev) devices.lastdev->next = dev; else @@ -1139,8 +1233,10 @@ static struct device *new_device(const char *name, u16 type) return dev; } -/* Our first setup routine is the console. It's a fairly simple device, but - * UNIX tty handling makes it uglier than it could be. */ +/* + * Our first setup routine is the console. It's a fairly simple device, but + * UNIX tty handling makes it uglier than it could be. + */ static void setup_console(void) { struct device *dev; @@ -1148,8 +1244,10 @@ static void setup_console(void) /* If we can save the initial standard input settings... */ if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { struct termios term = orig_term; - /* Then we turn off echo, line buffering and ^C etc. We want a - * raw input stream to the Guest. */ + /* + * Then we turn off echo, line buffering and ^C etc: We want a + * raw input stream to the Guest. + */ term.c_lflag &= ~(ISIG|ICANON|ECHO); tcsetattr(STDIN_FILENO, TCSANOW, &term); } @@ -1160,10 +1258,12 @@ static void setup_console(void) dev->priv = malloc(sizeof(struct console_abort)); ((struct console_abort *)dev->priv)->count = 0; - /* The console needs two virtqueues: the input then the output. When + /* + * The console needs two virtqueues: the input then the output. When * they put something the input queue, we make sure we're listening to * stdin. When they put something in the output queue, we write it to - * stdout. */ + * stdout. + */ add_virtqueue(dev, VIRTQUEUE_NUM, console_input); add_virtqueue(dev, VIRTQUEUE_NUM, console_output); @@ -1171,7 +1271,8 @@ static void setup_console(void) } /*:*/ -/*M:010 Inter-guest networking is an interesting area. Simplest is to have a +/*M:010 + * Inter-guest networking is an interesting area. Simplest is to have a * --sharenet= option which opens or creates a named pipe. This can be * used to send packets to another guest in a 1:1 manner. * @@ -1185,7 +1286,8 @@ static void setup_console(void) * multiple inter-guest channels behind one interface, although it would * require some manner of hotplugging new virtio channels. * - * Finally, we could implement a virtio network switch in the kernel. :*/ + * Finally, we could implement a virtio network switch in the kernel. 
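
Since setup_console() above switches stdin into raw mode, the Launcher must also put the terminal back when it exits, or the user's shell is left eating its own ^Cs. That restore isn't visible in these hunks; a minimal sketch of what it amounts to, assuming it gets registered with atexit() or called from the exit path:

static void restore_term(void)
{
        /* Undo the raw-mode tweak so the shell gets a sane tty back. */
        tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
}
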
+:*/ static u32 str2ip(const char *ipaddr) { @@ -1210,11 +1312,13 @@ static void str2mac(const char *macaddr, unsigned char mac[6]) mac[5] = m[5]; } -/* This code is "adapted" from libbridge: it attaches the Host end of the +/* + * This code is "adapted" from libbridge: it attaches the Host end of the * network device to the bridge device specified by the command line. * * This is yet another James Morris contribution (I'm an IP-level guy, so I - * dislike bridging), and I just try not to break it. */ + * dislike bridging), and I just try not to break it. + */ static void add_to_bridge(int fd, const char *if_name, const char *br_name) { int ifidx; @@ -1234,9 +1338,11 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name) err(1, "can't add %s to bridge %s", if_name, br_name); } -/* This sets up the Host end of the network device with an IP address, brings +/* + * This sets up the Host end of the network device with an IP address, brings * it up so packets will flow, the copies the MAC address into the hwaddr - * pointer. */ + * pointer. + */ static void configure_device(int fd, const char *tapif, u32 ipaddr) { struct ifreq ifr; @@ -1263,10 +1369,12 @@ static int get_tun_device(char tapif[IFNAMSIZ]) /* Start with this zeroed. Messy but sure. */ memset(&ifr, 0, sizeof(ifr)); - /* We open the /dev/net/tun device and tell it we want a tap device. A + /* + * We open the /dev/net/tun device and tell it we want a tap device. A * tap device is like a tun device, only somehow different. To tell * the truth, I completely blundered my way through this code, but it - * works now! */ + * works now! + */ netfd = open_or_die("/dev/net/tun", O_RDWR); ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; strcpy(ifr.ifr_name, "tap%d"); @@ -1277,18 +1385,22 @@ static int get_tun_device(char tapif[IFNAMSIZ]) TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) err(1, "Could not set features for tun device"); - /* We don't need checksums calculated for packets coming in this - * device: trust us! */ + /* + * We don't need checksums calculated for packets coming in this + * device: trust us! + */ ioctl(netfd, TUNSETNOCSUM, 1); memcpy(tapif, ifr.ifr_name, IFNAMSIZ); return netfd; } -/*L:195 Our network is a Host<->Guest network. This can either use bridging or +/*L:195 + * Our network is a Host<->Guest network. This can either use bridging or * routing, but the principle is the same: it uses the "tun" device to inject * packets into the Host as if they came in from a normal network card. We - * just shunt packets between the Guest and the tun device. */ + * just shunt packets between the Guest and the tun device. + */ static void setup_tun_net(char *arg) { struct device *dev; @@ -1305,13 +1417,14 @@ static void setup_tun_net(char *arg) dev = new_device("net", VIRTIO_ID_NET); dev->priv = net_info; - /* Network devices need a receive and a send queue, just like - * console. */ + /* Network devices need a recv and a send queue, just like console. */ add_virtqueue(dev, VIRTQUEUE_NUM, net_input); add_virtqueue(dev, VIRTQUEUE_NUM, net_output); - /* We need a socket to perform the magic network ioctls to bring up the - * tap interface, connect to the bridge etc. Any socket will do! */ + /* + * We need a socket to perform the magic network ioctls to bring up the + * tap interface, connect to the bridge etc. Any socket will do! 
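
configure_device()'s body is mostly elided above, but what it has to do is ordinary ioctl work on that "any socket will do" fd. A hedged sketch using the standard SIOCSIFADDR and SIOCSIFFLAGS ioctls (the patch's own version may differ in detail):

static void configure_device(int fd, const char *tapif, u32 ipaddr)
{
        struct ifreq ifr;
        struct sockaddr_in sin;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, tapif, IFNAMSIZ);

        /* Give the Host end of the tap device its IPv4 address... */
        sin.sin_family = AF_INET;
        sin.sin_addr.s_addr = htonl(ipaddr);
        memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
        if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
                err(1, "Setting %s interface address", tapif);

        /* ...and bring the interface up so packets will actually flow. */
        ifr.ifr_flags = IFF_UP;
        if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
                err(1, "Bringing interface %s up", tapif);
}
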
+ */ ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); if (ipfd < 0) err(1, "opening IP socket"); @@ -1366,7 +1479,8 @@ static void setup_tun_net(char *arg) devices.device_num, tapif, arg); } -/* Our block (disk) device should be really simple: the Guest asks for a block +/* + * Our block (disk) device should be really simple: the Guest asks for a block * number and we read or write that position in the file. Unfortunately, that * was amazingly slow: the Guest waits until the read is finished before * running anything else, even if it could have been doing useful work. @@ -1374,7 +1488,9 @@ static void setup_tun_net(char *arg) * We could use async I/O, except it's reputed to suck so hard that characters * actually go missing from your code when you try to use it. * - * So we farm the I/O out to thread, and communicate with it via a pipe. */ + * So this was one reason why lguest now does all virtqueue servicing in + * separate threads: it's more efficient and more like a real device. + */ /* This hangs off device->priv. */ struct vblk_info @@ -1412,9 +1528,11 @@ static void blk_request(struct virtqueue *vq) /* Get the next request. */ head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - /* Every block request should contain at least one output buffer + /* + * Every block request should contain at least one output buffer * (detailing the location on disk and the type of request) and one - * input buffer (to hold the result). */ + * input buffer (to hold the result). + */ if (out_num == 0 || in_num == 0) errx(1, "Bad virtblk cmd %u out=%u in=%u", head, out_num, in_num); @@ -1423,33 +1541,41 @@ static void blk_request(struct virtqueue *vq) in = convert(&iov[out_num+in_num-1], u8); off = out->sector * 512; - /* The block device implements "barriers", where the Guest indicates + /* + * The block device implements "barriers", where the Guest indicates * that it wants all previous writes to occur before this write. We * don't have a way of asking our kernel to do a barrier, so we just - * synchronize all the data in the file. Pretty poor, no? */ + * synchronize all the data in the file. Pretty poor, no? + */ if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); - /* In general the virtio block driver is allowed to try SCSI commands. - * It'd be nice if we supported eject, for example, but we don't. */ + /* + * In general the virtio block driver is allowed to try SCSI commands. + * It'd be nice if we supported eject, for example, but we don't. + */ if (out->type & VIRTIO_BLK_T_SCSI_CMD) { fprintf(stderr, "Scsi commands unsupported\n"); *in = VIRTIO_BLK_S_UNSUPP; wlen = sizeof(*in); } else if (out->type & VIRTIO_BLK_T_OUT) { - /* Write */ - - /* Move to the right location in the block file. This can fail - * if they try to write past end. */ + /* + * Write + * + * Move to the right location in the block file. This can fail + * if they try to write past end. + */ if (lseek64(vblk->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %llu", out->sector); ret = writev(vblk->fd, iov+1, out_num-1); verbose("WRITE to sector %llu: %i\n", out->sector, ret); - /* Grr... Now we know how long the descriptor they sent was, we + /* + * Grr... Now we know how long the descriptor they sent was, we * make sure they didn't try to write over the end of the block - * file (possibly extending it). */ + * file (possibly extending it). 
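
For orientation, the request header that blk_request() pulls out of iov[0] is the standard one from <linux/virtio_blk.h>; its shape is worth keeping in mind while reading the checks above and below:

struct virtio_blk_outhdr {
        __u32 type;     /* VIRTIO_BLK_T_IN/OUT, plus the BARRIER/SCSI_CMD bits */
        __u32 ioprio;   /* A priority hint, which we simply ignore. */
        __u64 sector;   /* 512-byte sectors, hence "off = out->sector * 512". */
};
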
+ */ if (ret > 0 && off + ret > vblk->len) { /* Trim it back to the correct length */ ftruncate64(vblk->fd, vblk->len); @@ -1459,10 +1585,12 @@ static void blk_request(struct virtqueue *vq) wlen = sizeof(*in); *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); } else { - /* Read */ - - /* Move to the right location in the block file. This can fail - * if they try to read past end. */ + /* + * Read + * + * Move to the right location in the block file. This can fail + * if they try to read past end. + */ if (lseek64(vblk->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %llu", out->sector); @@ -1477,10 +1605,12 @@ static void blk_request(struct virtqueue *vq) } } - /* OK, so we noted that it was pretty poor to use an fdatasync as a + /* + * OK, so we noted that it was pretty poor to use an fdatasync as a * barrier. But Christoph Hellwig points out that we need a sync * *afterwards* as well: "Barriers specify no reordering to the front - * or the back." And Jens Axboe confirmed it, so here we are: */ + * or the back." And Jens Axboe confirmed it, so here we are: + */ if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); @@ -1494,7 +1624,7 @@ static void setup_block_file(const char *filename) struct vblk_info *vblk; struct virtio_blk_config conf; - /* The device responds to return from I/O thread. */ + /* Creat the device. */ dev = new_device("block", VIRTIO_ID_BLOCK); /* The device has one virtqueue, where the Guest places requests. */ @@ -1513,8 +1643,10 @@ static void setup_block_file(const char *filename) /* Tell Guest how many sectors this device has. */ conf.capacity = cpu_to_le64(vblk->len / 512); - /* Tell Guest not to put in too many descriptors at once: two are used - * for the in and out elements. */ + /* + * Tell Guest not to put in too many descriptors at once: two are used + * for the in and out elements. + */ add_feature(dev, VIRTIO_BLK_F_SEG_MAX); conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); @@ -1525,16 +1657,18 @@ static void setup_block_file(const char *filename) ++devices.device_num, le64_to_cpu(conf.capacity)); } -struct rng_info { - int rfd; -}; - -/* Our random number generator device reads from /dev/random into the Guest's +/*L:211 + * Our random number generator device reads from /dev/random into the Guest's * input buffers. The usual case is that the Guest doesn't want random numbers * and so has no buffers although /dev/random is still readable, whereas * console is the reverse. * - * The same logic applies, however. */ + * The same logic applies, however. + */ +struct rng_info { + int rfd; +}; + static void rng_input(struct virtqueue *vq) { int len; @@ -1547,9 +1681,11 @@ static void rng_input(struct virtqueue *vq) if (out_num) errx(1, "Output buffers in rng?"); - /* This is why we convert to iovecs: the readv() call uses them, and so + /* + * This is why we convert to iovecs: the readv() call uses them, and so * it reads straight into the Guest's buffer. We loop to make sure we - * fill it. */ + * fill it. + */ while (!iov_empty(iov, in_num)) { len = readv(rng_info->rfd, iov, in_num); if (len <= 0) @@ -1562,15 +1698,18 @@ static void rng_input(struct virtqueue *vq) add_used(vq, head, totlen); } -/* And this creates a "hardware" random number device for the Guest. */ +/*L:199 + * This creates a "hardware" random number device for the Guest. + */ static void setup_rng(void) { struct device *dev; struct rng_info *rng_info = malloc(sizeof(*rng_info)); + /* Our device's privat info simply contains the /dev/random fd. 
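
The rng_input() loop above keeps calling readv() until every input buffer is full, so something has to advance the iovec array past the bytes already written before the next attempt. That helper isn't visible in these hunks; here is a hedged sketch of the obvious implementation (skip_iov is a made-up name for illustration):

static void skip_iov(struct iovec iov[], unsigned num, size_t done)
{
        unsigned int i;

        for (i = 0; i < num && done; i++) {
                size_t used = done < iov[i].iov_len ? done : iov[i].iov_len;

                iov[i].iov_base = (char *)iov[i].iov_base + used;
                iov[i].iov_len -= used;
                done -= used;
        }
}
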
*/ rng_info->rfd = open_or_die("/dev/random", O_RDONLY); - /* The device responds to return from I/O thread. */ + /* Create the new device. */ dev = new_device("rng", VIRTIO_ID_RNG); dev->priv = rng_info; @@ -1586,8 +1725,10 @@ static void __attribute__((noreturn)) restart_guest(void) { unsigned int i; - /* Since we don't track all open fds, we simply close everything beyond - * stderr. */ + /* + * Since we don't track all open fds, we simply close everything beyond + * stderr. + */ for (i = 3; i < FD_SETSIZE; i++) close(i); @@ -1598,8 +1739,10 @@ static void __attribute__((noreturn)) restart_guest(void) err(1, "Could not exec %s", main_args[0]); } -/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves - * its input and output, and finally, lays it to rest. */ +/*L:220 + * Finally we reach the core of the Launcher which runs the Guest, serves + * its input and output, and finally, lays it to rest. + */ static void __attribute__((noreturn)) run_guest(void) { for (;;) { @@ -1634,7 +1777,7 @@ static void __attribute__((noreturn)) run_guest(void) * * Are you ready? Take a deep breath and join me in the core of the Host, in * "make Host". - :*/ +:*/ static struct option opts[] = { { "verbose", 0, NULL, 'v' }, @@ -1655,8 +1798,7 @@ static void usage(void) /*L:105 The main routine is where the real work begins: */ int main(int argc, char *argv[]) { - /* Memory, top-level pagetable, code startpoint and size of the - * (optional) initrd. */ + /* Memory, code startpoint and size of the (optional) initrd. */ unsigned long mem = 0, start, initrd_size = 0; /* Two temporaries. */ int i, c; @@ -1668,24 +1810,30 @@ int main(int argc, char *argv[]) /* Save the args: we "reboot" by execing ourselves again. */ main_args = argv; - /* First we initialize the device list. We keep a pointer to the last + /* + * First we initialize the device list. We keep a pointer to the last * device, and the next interrupt number to use for devices (1: - * remember that 0 is used by the timer). */ + * remember that 0 is used by the timer). + */ devices.lastdev = NULL; devices.next_irq = 1; cpu_id = 0; - /* We need to know how much memory so we can set up the device + /* + * We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command * line. So we quickly look through the arguments to find the amount - * of memory now. */ + * of memory now. + */ for (i = 1; i < argc; i++) { if (argv[i][0] != '-') { mem = atoi(argv[i]) * 1024 * 1024; - /* We start by mapping anonymous pages over all of + /* + * We start by mapping anonymous pages over all of * guest-physical memory range. This fills it with 0, * and ensures that the Guest won't be killed when it - * tries to access it. */ + * tries to access it. + */ guest_base = map_zeroed_pages(mem / getpagesize() + DEVICE_PAGES); guest_limit = mem; @@ -1718,8 +1866,10 @@ int main(int argc, char *argv[]) usage(); } } - /* After the other arguments we expect memory and kernel image name, - * followed by command line arguments for the kernel. */ + /* + * After the other arguments we expect memory and kernel image name, + * followed by command line arguments for the kernel. 
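
map_zeroed_pages(), called above to back all of Guest memory plus the device pages, isn't shown in these hunks. What it must return is a block of zero-filled, writable pages; a hedged sketch of one plausible implementation with an anonymous private mmap() (the Launcher's real version may differ, for instance by mapping /dev/zero):

static void *map_zeroed_pages(unsigned int num)
{
        void *addr = mmap(NULL, (size_t)num * getpagesize(),
                          PROT_READ|PROT_WRITE,
                          MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);

        if (addr == MAP_FAILED)
                err(1, "Mmapping %u zeroed pages", num);
        return addr;
}
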
+ */ if (optind + 2 > argc) usage(); @@ -1737,20 +1887,26 @@ int main(int argc, char *argv[]) /* Map the initrd image if requested (at top of physical memory) */ if (initrd_name) { initrd_size = load_initrd(initrd_name, mem); - /* These are the location in the Linux boot header where the - * start and size of the initrd are expected to be found. */ + /* + * These are the location in the Linux boot header where the + * start and size of the initrd are expected to be found. + */ boot->hdr.ramdisk_image = mem - initrd_size; boot->hdr.ramdisk_size = initrd_size; /* The bootloader type 0xFF means "unknown"; that's OK. */ boot->hdr.type_of_loader = 0xFF; } - /* The Linux boot header contains an "E820" memory map: ours is a - * simple, single region. */ + /* + * The Linux boot header contains an "E820" memory map: ours is a + * simple, single region. + */ boot->e820_entries = 1; boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); - /* The boot header contains a command line pointer: we put the command - * line after the boot header. */ + /* + * The boot header contains a command line pointer: we put the command + * line after the boot header. + */ boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); /* We use a simple helper to copy the arguments separated by spaces. */ concat((char *)(boot + 1), argv+optind+2); @@ -1764,8 +1920,10 @@ int main(int argc, char *argv[]) /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; - /* We tell the kernel to initialize the Guest: this returns the open - * /dev/lguest file descriptor. */ + /* + * We tell the kernel to initialize the Guest: this returns the open + * /dev/lguest file descriptor. + */ tell_kernel(start); /* Ensure that we terminate if a child dies. */ diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 313389cd50d2..5136dad57cbb 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -17,8 +17,7 @@ /* Pages for switcher itself, then two pages per cpu */ #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) -/* We map at -4M (-2M when PAE is activated) for ease of mapping - * into the guest (one PTE page). */ +/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ #ifdef CONFIG_X86_PAE #define SWITCHER_ADDR 0xFFE00000 #else diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 33600a66755f..cceb73e12e50 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -30,7 +30,8 @@ #include #include -/*G:030 But first, how does our Guest contact the Host to ask for privileged +/*G:030 + * But first, how does our Guest contact the Host to ask for privileged * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * @@ -41,16 +42,15 @@ * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". */ -/*:*/ + * definition of a gentleman: "someone who is only rude intentionally". +:*/ /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? 
NR_IRQS: 32) #define LHCALL_RING_SIZE 64 struct hcall_args { - /* These map directly onto eax, ebx, ecx, edx and esi - * in struct lguest_regs */ + /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ unsigned long arg0, arg1, arg2, arg3, arg4; }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f2bf1f73d468..025c04d18f2b 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -22,7 +22,8 @@ * * So how does the kernel know it's a Guest? We'll see that later, but let's * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. :*/ + * "paravirt" structures with our Guest versions, then boot like normal. +:*/ /* * Copyright (C) 2006, Rusty Russell IBM Corporation. @@ -74,7 +75,8 @@ * * The Guest in our tale is a simple creature: identical to the Host but * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). :*/ + * same kernel as the Host (or at least, built from the same source code). +:*/ struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, @@ -85,7 +87,8 @@ struct lguest_data lguest_data = { .syscall_vec = SYSCALL_VECTOR, }; -/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a +/*G:037 + * async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall * arguments, and a "hcall_status" word which is 0 if the call is ready to go, @@ -94,7 +97,8 @@ struct lguest_data lguest_data = { * If we come around to a slot which hasn't been finished, then the table is * full and we just make the hypercall directly. This has the nice side * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! */ + * which empties it for next time! + */ static void async_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4) @@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1, static unsigned int next_call; unsigned long flags; - /* Disable interrupts if not already disabled: we don't want an + /* + * Disable interrupts if not already disabled: we don't want an * interrupt handler making a hypercall while we're already doing - * one! */ + * one! + */ local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ @@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_restore(flags); } -/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first - * real optimization trick! +/*G:035 + * Notice the lazy_hcall() above, rather than hcall(). This is our first real + * optimization trick! * * When lazy_mode is set, it means we're allowed to defer all hypercalls and do * them as a batch when lazy_mode is eventually turned off. Because hypercalls @@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1, * lguest_leave_lazy_mode(). 
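
The lazy_hcall*() wrappers this comment is building up to simply pick between an immediate hypercall and the async_hcall() ring shown above. Their bodies are elided from these hunks; a hedged sketch of the single-argument one, assuming the usual paravirt_get_lazy_mode() test:

static void lazy_hcall1(unsigned long call, unsigned long arg1)
{
        if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
                kvm_hypercall1(call, arg1);
        else
                async_hcall(call, arg1, 0, 0, 0);
}
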
* * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: */ + * future processing: + */ static void lazy_hcall1(unsigned long call, unsigned long arg1) { @@ -208,9 +216,11 @@ static void lguest_end_context_switch(struct task_struct *next) * check there before it tries to deliver an interrupt. */ -/* save_flags() is expected to return the processor state (ie. "flags"). The +/* + * save_flags() is expected to return the processor state (ie. "flags"). The * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. */ + * about the interrupt flag. Our "save_flags()" just returns that. + */ static unsigned long save_fl(void) { return lguest_data.irq_enabled; @@ -222,13 +232,15 @@ static void irq_disable(void) lguest_data.irq_enabled = 0; } -/* Let's pause a moment. Remember how I said these are called so often? +/* + * Let's pause a moment. Remember how I said these are called so often? * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to * break some rules. In particular, these functions are assumed to save their * own registers if they need to: normal C functions assume they can trash the * eax register. To use normal C functions, we use * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. */ + * C function, then restores it. + */ PV_CALLEE_SAVE_REGS_THUNK(save_fl); PV_CALLEE_SAVE_REGS_THUNK(irq_disable); /*:*/ @@ -237,18 +249,20 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable); extern void lg_irq_enable(void); extern void lg_restore_fl(unsigned long flags); -/*M:003 Note that we don't check for outstanding interrupts when we re-enable - * them (or when we unmask an interrupt). This seems to work for the moment, - * since interrupts are rare and we'll just get the interrupt on the next timer - * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way - * would be to put the "irq_enabled" field in a page by itself, and have the - * Host write-protect it when an interrupt comes in when irqs are disabled. - * There will then be a page fault as soon as interrupts are re-enabled. +/*M:003 + * Note that we don't check for outstanding interrupts when we re-enable them + * (or when we unmask an interrupt). This seems to work for the moment, since + * interrupts are rare and we'll just get the interrupt on the next timer tick, + * but now we can run with CONFIG_NO_HZ, we should revisit this. One way would + * be to put the "irq_enabled" field in a page by itself, and have the Host + * write-protect it when an interrupt comes in when irqs are disabled. There + * will then be a page fault as soon as interrupts are re-enabled. * * A better method is to implement soft interrupt disable generally for x86: * instead of disabling interrupts, we set a flag. If an interrupt does come * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. :*/ + * a hypercall for interrupt control and not worry about efficiency. +:*/ /*G:034 * The Interrupt Descriptor Table (IDT). @@ -261,10 +275,12 @@ extern void lg_restore_fl(unsigned long flags); static void lguest_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) { - /* The gate_desc structure is 8 bytes long: we hand it to the Host in + /* + * The gate_desc structure is 8 bytes long: we hand it to the Host in * two 32-bit chunks. 
The whole 32-bit kernel used to hand descriptors * around like this; typesafety wasn't a big concern in Linux's early - * years. */ + * years. + */ u32 *desc = (u32 *)g; /* Keep the local copy up to date. */ native_write_idt_entry(dt, entrynum, g); @@ -272,9 +288,11 @@ static void lguest_write_idt_entry(gate_desc *dt, kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); } -/* Changing to a different IDT is very rare: we keep the IDT up-to-date every +/* + * Changing to a different IDT is very rare: we keep the IDT up-to-date every * time it is written, so we can simply loop through all entries and tell the - * Host about them. */ + * Host about them. + */ static void lguest_load_idt(const struct desc_ptr *desc) { unsigned int i; @@ -305,9 +323,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc) kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); } -/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, +/* + * For a single GDT entry which changes, we do the lazy thing: alter our GDT, * then tell the Host to reload the entire thing. This operation is so rare - * that this naive implementation is reasonable. */ + * that this naive implementation is reasonable. + */ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, const void *desc, int type) { @@ -317,29 +337,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, dt[entrynum].a, dt[entrynum].b); } -/* OK, I lied. There are three "thread local storage" GDT entries which change +/* + * OK, I lied. There are three "thread local storage" GDT entries which change * on every context switch (these three entries are how glibc implements - * __thread variables). So we have a hypercall specifically for this case. */ + * __thread variables). So we have a hypercall specifically for this case. + */ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) { - /* There's one problem which normal hardware doesn't have: the Host + /* + * There's one problem which normal hardware doesn't have: the Host * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. */ + * the GS register here: if it's needed it'll be reloaded anyway. + */ lazy_load_gs(0); lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); } -/*G:038 That's enough excitement for now, back to ploughing through each of - * the different pv_ops structures (we're about 1/3 of the way through). +/*G:038 + * That's enough excitement for now, back to ploughing through each of the + * different pv_ops structures (we're about 1/3 of the way through). * * This is the Local Descriptor Table, another weird Intel thingy. Linux only * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. */ + * here, so they'll get an informative and friendly Segmentation Fault. + */ static void lguest_set_ldt(const void *addr, unsigned entries) { } -/* This loads a GDT entry into the "Task Register": that entry points to a +/* + * This loads a GDT entry into the "Task Register": that entry points to a * structure called the Task State Segment. Some comments scattered though the * kernel code indicate that this used for task switching in ages past, along * with blood sacrifice and astrology. 
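
lguest_load_idt() above only shows its opening in this hunk; the body is the promised loop over every gate. A hedged sketch, assuming the usual desc_ptr pair of a byte size and a base address, and 8-byte IDT entries:

static void lguest_load_idt(const struct desc_ptr *desc)
{
        unsigned int i;
        struct desc_struct *idt = (void *)desc->address;

        /* "size" is bytes minus one, and each gate is 8 bytes. */
        for (i = 0; i < (desc->size + 1) / 8; i++)
                kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
}
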
@@ -347,19 +374,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries) * Now there's nothing interesting in here that we don't get told elsewhere. * But the native version uses the "ltr" instruction, which makes the Host * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. */ + * override the native version with a do-nothing version. + */ static void lguest_load_tr_desc(void) { } -/* The "cpuid" instruction is a way of querying both the CPU identity +/* + * The "cpuid" instruction is a way of querying both the CPU identity * (manufacturer, model, etc) and its features. It was introduced before the * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. * As you might imagine, after a decade and a half this treatment, it is now a * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 4 languages. I am not making this up! + * has been translated into 5 languages. I am not making this up! * * We could get funky here and identify ourselves as "GenuineLguest", but * instead we just use the real "cpuid" instruction. Then I pretty much turned @@ -371,7 +400,8 @@ static void lguest_load_tr_desc(void) * Replacing the cpuid so we can turn features off is great for the kernel, but * anyone (including userspace) can just use the raw "cpuid" instruction and * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. */ + * too worked up about it. + */ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { @@ -379,43 +409,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, native_cpuid(ax, bx, cx, dx); switch (function) { - case 0: /* ID and highest CPUID. Futureproof a little by sticking to - * older ones. */ + /* + * CPUID 0 gives the highest legal CPUID number (and the ID string). + * We futureproof our code a little by sticking to known CPUID values. + */ + case 0: if (*ax > 5) *ax = 5; break; - case 1: /* Basic feature request. */ - /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ + + /* + * CPUID 1 is a basic feature request. + * + * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 + * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. + */ + case 1: *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ *dx &= 0x07808151; - /* The Host can do a nice optimization if it knows that the + /* + * The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. */ + * the Page Global Enable (PGE) feature bit is set. + */ *dx |= 0x00002000; - /* We also lie, and say we're family id 5. 6 or greater + /* + * We also lie, and say we're family id 5. 6 or greater * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. */ + * Family ID is returned as bits 8-12 in ax. + */ *ax &= 0xFFFFF0FF; *ax |= 0x00000500; break; + /* + * 0x80000000 returns the highest Extended Function, so we futureproof + * like we do above by limiting it to known fields. 
+ */ case 0x80000000: - /* Futureproof this a little: if they ask how much extended - * processor information there is, limit it to known fields. */ if (*ax > 0x80000008) *ax = 0x80000008; break; + + /* + * PAE systems can mark pages as non-executable. Linux calls this the + * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced + * Virus Protection). We just switch turn if off here, since we don't + * support it. + */ case 0x80000001: - /* Here we should fix nx cap depending on host. */ - /* For this version of PAE, we just clear NX bit. */ *dx &= ~(1 << 20); break; } } -/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. +/* + * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother * it. The Host needs to know when the Guest wants to change them, so we have * a whole series of functions like read_cr0() and write_cr0(). @@ -430,7 +480,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, * name like "FPUTRAP bit" be a little less cryptic? * * We store cr0 locally because the Host never changes it. The Guest sometimes - * wants to read it and we'd prefer not to bother the Host unnecessarily. */ + * wants to read it and we'd prefer not to bother the Host unnecessarily. + */ static unsigned long current_cr0; static void lguest_write_cr0(unsigned long val) { @@ -443,18 +494,22 @@ static unsigned long lguest_read_cr0(void) return current_cr0; } -/* Intel provided a special instruction to clear the TS bit for people too cool +/* + * Intel provided a special instruction to clear the TS bit for people too cool * to use write_cr0() to do it. This "clts" instruction is faster, because all - * the vowels have been optimized out. */ + * the vowels have been optimized out. + */ static void lguest_clts(void) { lazy_hcall1(LHCALL_TS, 0); current_cr0 &= ~X86_CR0_TS; } -/* cr2 is the virtual address of the last page fault, which the Guest only ever +/* + * cr2 is the virtual address of the last page fault, which the Guest only ever * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. */ + * just read it out of there. + */ static unsigned long lguest_read_cr2(void) { return lguest_data.cr2; @@ -463,10 +518,12 @@ static unsigned long lguest_read_cr2(void) /* See lguest_set_pte() below. */ static bool cr3_changed = false; -/* cr3 is the current toplevel pagetable page: the principle is the same as +/* + * cr3 is the current toplevel pagetable page: the principle is the same as * cr0. Keep a local copy, and tell the Host when it changes. The only * difference is that our local copy is in lguest_data because the Host needs - * to set it upon our initial hypercall. */ + * to set it upon our initial hypercall. + */ static void lguest_write_cr3(unsigned long cr3) { lguest_data.pgdir = cr3; @@ -538,10 +595,12 @@ static void lguest_write_cr4(unsigned long val) * the real page tables based on the Guests'. */ -/* The Guest calls this to set a second-level entry (pte), ie. to map a page +/* + * The Guest calls this to set a second-level entry (pte), ie. to map a page * into a process' address space. We set the entry then tell the Host the * toplevel and address this corresponds to. The Guest uses one pagetable per - * process, so we need to tell the Host which one we're changing (mm->pgd). */ + * process, so we need to tell the Host which one we're changing (mm->pgd). 
+ */ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -560,10 +619,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, lguest_pte_update(mm, addr, ptep); } -/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd +/* + * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd * to set a middle-level entry when PAE is activated. + * * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. */ + * and the index of the entry we changed. + */ #ifdef CONFIG_X86_PAE static void lguest_set_pud(pud_t *pudp, pud_t pudval) { @@ -582,8 +644,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) } #else -/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not - * activated. */ +/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { native_set_pmd(pmdp, pmdval); @@ -592,7 +653,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) } #endif -/* There are a couple of legacy places where the kernel sets a PTE, but we +/* + * There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't * know which pagetable is changing or what address, so we just tell the Host * to forget all of them. Fortunately, this is very rare. @@ -600,7 +662,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * ... except in early boot when the kernel sets up the initial pagetables, * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell * the Host anything changed until we've done the first page table switch, - * which brings boot back to 0.25 seconds. */ + * which brings boot back to 0.25 seconds. + */ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { native_set_pte(ptep, pteval); @@ -628,7 +691,8 @@ void lguest_pmd_clear(pmd_t *pmdp) } #endif -/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on +/* + * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). @@ -637,24 +701,29 @@ void lguest_pmd_clear(pmd_t *pmdp) * called when a valid entry is written, not when it's removed (ie. marked not * present). Instead, this is where we come when the Guest wants to remove a * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). */ + * bit is zero). + */ static void lguest_flush_tlb_single(unsigned long addr) { /* Simply set it to zero: if it was not, it will fault back in. */ lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); } -/* This is what happens after the Guest has removed a large number of entries. +/* + * This is what happens after the Guest has removed a large number of entries. * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. */ + * have changed, ie. virtual addresses below PAGE_OFFSET. + */ static void lguest_flush_tlb_user(void) { lazy_hcall1(LHCALL_FLUSH_TLB, 0); } -/* This is called when the kernel page tables have changed. That's not very +/* + * This is called when the kernel page tables have changed. 
That's not very * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. */ + * slow), so it's worth separating this from the user flushing above. + */ static void lguest_flush_tlb_kernel(void) { lazy_hcall1(LHCALL_FLUSH_TLB, 1); @@ -691,23 +760,27 @@ static struct irq_chip lguest_irq_controller = { .unmask = enable_lguest_irq, }; -/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware +/* + * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware * interrupt (except 128, which is used for system calls), and then tells the * Linux infrastructure that each interrupt is controlled by our level-based - * lguest interrupt controller. */ + * lguest interrupt controller. + */ static void __init lguest_init_IRQ(void) { unsigned int i; for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* Some systems map "vectors" to interrupts weirdly. Lguest has - * a straightforward 1 to 1 mapping, so force that here. */ + /* Some systems map "vectors" to interrupts weirdly. Not us! */ __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; if (i != SYSCALL_VECTOR) set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } - /* This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. */ + + /* + * This call is required to set up for 4k stacks, where we have + * separate stacks for hard and soft interrupts. + */ irq_ctx_init(smp_processor_id()); } @@ -729,31 +802,39 @@ static unsigned long lguest_get_wallclock(void) return lguest_data.time.tv_sec; } -/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us +/* + * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us * what speed it runs at, or 0 if it's unusable as a reliable clock source. * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. */ + * TSC clock will give up and not register itself. + */ static unsigned long lguest_tsc_khz(void) { return lguest_data.tsc_khz; } -/* If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. */ +/* + * If we can't use the TSC, the kernel falls back to our lower-priority + * "lguest_clock", where we read the time value given to us by the Host. + */ static cycle_t lguest_clock_read(struct clocksource *cs) { unsigned long sec, nsec; - /* Since the time is in two parts (seconds and nanoseconds), we risk + /* + * Since the time is in two parts (seconds and nanoseconds), we risk * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: */ + * of time travel, we must be careful: + */ do { /* First we read the seconds part. */ sec = lguest_data.time.tv_sec; - /* This read memory barrier tells the compiler and the CPU that + /* + * This read memory barrier tells the compiler and the CPU that * this can't be reordered: we have to complete the above - * before going on. */ + * before going on. + */ rmb(); /* Now we read the nanoseconds part. 
*/ nsec = lguest_data.time.tv_nsec; @@ -777,9 +858,11 @@ static struct clocksource lguest_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -/* We also need a "struct clock_event_device": Linux asks us to set it to go +/* + * We also need a "struct clock_event_device": Linux asks us to set it to go * off some time in the future. Actually, James Morris figured all this out, I - * just applied the patch. */ + * just applied the patch. + */ static int lguest_clockevent_set_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -829,8 +912,10 @@ static struct clock_event_device lguest_clockevent = { .max_delta_ns = LG_CLOCK_MAX_DELTA, }; -/* This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. */ +/* + * This is the Guest timer interrupt handler (hardware interrupt 0). We just + * call the clockevent infrastructure and it does whatever needs doing. + */ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) { unsigned long flags; @@ -841,10 +926,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) local_irq_restore(flags); } -/* At some point in the boot process, we get asked to set up our timing +/* + * At some point in the boot process, we get asked to set up our timing * infrastructure. The kernel doesn't expect timer interrupts before this, but * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. */ + * lguest_data" so that timer interrupts were blocked until now. + */ static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ @@ -868,14 +955,16 @@ static void lguest_time_init(void) * to work. They're pretty simple. */ -/* The Guest needs to tell the Host what stack it expects traps to use. For +/* + * The Guest needs to tell the Host what stack it expects traps to use. For * native hardware, this is part of the Task State Segment mentioned above in * lguest_load_tr_desc(), but to help hypervisors there's this special call. * * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data * segment), the privilege level (we're privilege level 1, the Host is 0 and * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. */ + * of pages in the stack. + */ static void lguest_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -889,7 +978,8 @@ static void lguest_set_debugreg(int regno, unsigned long value) /* FIXME: Implement */ } -/* There are times when the kernel wants to make sure that no memory writes are +/* + * There are times when the kernel wants to make sure that no memory writes are * caught in the cache (that they've all reached real hardware devices). This * doesn't matter for the Guest which has virtual hardware. * @@ -903,11 +993,13 @@ static void lguest_wbinvd(void) { } -/* If the Guest expects to have an Advanced Programmable Interrupt Controller, +/* + * If the Guest expects to have an Advanced Programmable Interrupt Controller, * we play dumb by ignoring writes and returning 0 for reads. So it's no * longer Programmable nor Controlling anything, and I don't think 8 lines of * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. */ + * does, however, allow us to get through the Linux boot code. 
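
A few hunks up, the comment before lguest_load_sp0() spells out exactly what it must pass along even though the body is elided: the kernel data segment with our privilege level, the new stack pointer, and the stack size in pages. A hedged sketch (LHCALL_SET_STACK is an assumed name here; the real hypercall number lives in lguest_hcall.h):

static void lguest_load_sp0(struct tss_struct *tss,
                            struct thread_struct *thread)
{
        /* __KERNEL_DS at privilege level 1, the new stack pointer, and
         * how many pages of stack that covers. */
        lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
                    THREAD_SIZE / PAGE_SIZE);
}
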
+ */ #ifdef CONFIG_X86_LOCAL_APIC static void lguest_apic_write(u32 reg, u32 v) { @@ -956,11 +1048,13 @@ static void lguest_safe_halt(void) kvm_hypercall0(LHCALL_HALT); } -/* The SHUTDOWN hypercall takes a string to describe what's happening, and +/* + * The SHUTDOWN hypercall takes a string to describe what's happening, and * an argument which says whether this to restart (reboot) the Guest or not. * * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. */ + * rather than virtual addresses, so we use __pa() here. + */ static void lguest_power_off(void) { kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), @@ -991,8 +1085,10 @@ static __init char *lguest_memory_setup(void) * nice to move it back to lguest_init. Patch welcome... */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); - /* The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. */ + /* + *The Linux bootloader header contains an "e820" memory map: the + * Launcher populated the first entry with our memory limit. + */ e820_add_region(boot_params.e820_map[0].addr, boot_params.e820_map[0].size, boot_params.e820_map[0].type); @@ -1001,16 +1097,17 @@ static __init char *lguest_memory_setup(void) return "LGUEST"; } -/* We will eventually use the virtio console device to produce console output, +/* + * We will eventually use the virtio console device to produce console output, * but before that is set up we use LHCALL_NOTIFY on normal memory to produce - * console output. */ + * console output. + */ static __init int early_put_chars(u32 vtermno, const char *buf, int count) { char scratch[17]; unsigned int len = count; - /* We use a nul-terminated string, so we have to make a copy. Icky, - * huh? */ + /* We use a nul-terminated string, so we make a copy. Icky, huh? */ if (len > sizeof(scratch) - 1) len = sizeof(scratch) - 1; scratch[len] = '\0'; @@ -1021,8 +1118,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) return len; } -/* Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. */ +/* + * Rebooting also tells the Host we're finished, but the RESTART flag tells the + * Launcher to reboot us. + */ static void lguest_restart(char *reason) { kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); @@ -1049,7 +1148,8 @@ static void lguest_restart(char *reason) * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, - * and these are in i386_head.S. */ + * and these are in i386_head.S. + */ /*G:060 We construct a table from the assembler templates: */ static const struct lguest_insns @@ -1060,9 +1160,11 @@ static const struct lguest_insns [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; -/* Now our patch routine is fairly simple (based on the native one in +/* + * Now our patch routine is fairly simple (based on the native one in * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. */ + * the available space we used. + */ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, unsigned long addr, unsigned len) { @@ -1074,8 +1176,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, insn_len = lguest_insns[type].end - lguest_insns[type].start; - /* Similarly if we can't fit replacement (shouldn't happen, but let's - * be thorough). 
*/ + /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ if (len < insn_len) return paravirt_patch_default(type, clobber, ibuf, addr, len); @@ -1084,22 +1185,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -/*G:029 Once we get to lguest_init(), we know we're a Guest. The various +/*G:029 + * Once we get to lguest_init(), we know we're a Guest. The various * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. */ + * have to override to avoid privileged instructions. + */ __init void lguest_init(void) { - /* We're under lguest, paravirt is enabled, and we're running at - * privilege level 1, not 0 as normal. */ + /* We're under lguest. */ pv_info.name = "lguest"; + /* Paravirt is enabled. */ pv_info.paravirt_enabled = 1; + /* We're running at privilege level 1, not 0 as normal. */ pv_info.kernel_rpl = 1; + /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; - /* We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. */ + /* + * We set up all the lguest overrides for sensitive operations. These + * are detailed with the operations themselves. + */ - /* interrupt-related operations */ + /* Interrupt-related operations */ pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); @@ -1107,11 +1214,11 @@ __init void lguest_init(void) pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); pv_irq_ops.safe_halt = lguest_safe_halt; - /* init-time operations */ + /* Setup operations */ pv_init_ops.memory_setup = lguest_memory_setup; pv_init_ops.patch = lguest_patch; - /* Intercepts of various cpu instructions */ + /* Intercepts of various CPU instructions */ pv_cpu_ops.load_gdt = lguest_load_gdt; pv_cpu_ops.cpuid = lguest_cpuid; pv_cpu_ops.load_idt = lguest_load_idt; @@ -1132,7 +1239,7 @@ __init void lguest_init(void) pv_cpu_ops.start_context_switch = paravirt_start_context_switch; pv_cpu_ops.end_context_switch = lguest_end_context_switch; - /* pagetable management */ + /* Pagetable management */ pv_mmu_ops.write_cr3 = lguest_write_cr3; pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; @@ -1154,54 +1261,71 @@ __init void lguest_init(void) pv_mmu_ops.pte_update_defer = lguest_pte_update; #ifdef CONFIG_X86_LOCAL_APIC - /* apic read/write intercepts */ + /* APIC read/write intercepts */ set_lguest_basic_apic_ops(); #endif - /* time operations */ + /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; - /* Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). */ + /* + * Now is a good time to look at the implementations of these functions + * before returning to the rest of lguest_init(). + */ - /*G:070 Now we've seen all the paravirt_ops, we return to + /*G:070 + * Now we've seen all the paravirt_ops, we return to * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. */ + * occurs. + */ - /* The stack protector is a weird thing where gcc places a canary + /* + * The stack protector is a weird thing where gcc places a canary * value on the stack and then checks it on return. 
This file is * compiled with -fno-stack-protector it, so we got this far without * problems. The value of the canary is kept at offset 20 from the * %gs register, so we need to set that up before calling C functions - * in other files. */ + * in other files. + */ setup_stack_canary_segment(0); - /* We could just call load_stack_canary_segment(), but we might as - * call switch_to_new_gdt() which loads the whole table and sets up - * the per-cpu segment descriptor register %fs as well. */ + + /* + * We could just call load_stack_canary_segment(), but we might as well + * call switch_to_new_gdt() which loads the whole table and sets up the + * per-cpu segment descriptor register %fs as well. + */ switch_to_new_gdt(0); /* As described in head_32.S, we map the first 128M of memory. */ max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; - /* The Host<->Guest Switcher lives at the top of our address space, and + /* + * The Host<->Guest Switcher lives at the top of our address space, and * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem */ + * it put the answer in lguest_data.reserve_mem + */ reserve_top_address(lguest_data.reserve_mem); - /* If we don't initialize the lock dependency checker now, it crashes - * paravirt_disable_iospace. */ + /* + * If we don't initialize the lock dependency checker now, it crashes + * paravirt_disable_iospace. + */ lockdep_init(); - /* The IDE code spends about 3 seconds probing for disks: if we reserve + /* + * The IDE code spends about 3 seconds probing for disks: if we reserve * all the I/O ports up front it can't get them and so doesn't probe. * Other device drivers are similar (but less severe). This cuts the - * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ + * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. + */ paravirt_disable_iospace(); - /* This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: */ + /* + * This is messy CPU setup stuff which the native boot code does before + * start_kernel, so we have to do, too: + */ cpu_detect(&new_cpu_data); /* head.S usually sets up the first capability word, so do it here. */ new_cpu_data.x86_capability[0] = cpuid_edx(1); @@ -1218,22 +1342,28 @@ __init void lguest_init(void) acpi_ht = 0; #endif - /* We set the preferred console to "hvc". This is the "hypervisor + /* + * We set the preferred console to "hvc". This is the "hypervisor * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. */ + * adapted for lguest's use. + */ add_preferred_console("hvc", 0, NULL); /* Register our very early console. */ virtio_cons_early_init(early_put_chars); - /* Last of all, we set the power management poweroff hook to point to + /* + * Last of all, we set the power management poweroff hook to point to * the Guest routine to power off, and the reboot hook to our restart - * routine. */ + * routine. + */ pm_power_off = lguest_power_off; machine_ops.restart = lguest_restart; - /* Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. */ + /* + * Now we're set up, call i386_start_kernel() in head32.c and we proceed + * to boot as normal. It never returns. 
+ */ i386_start_kernel(); } /* diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index a9c8cfe61cd4..db6aa95eb054 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -5,7 +5,8 @@ #include #include -/*G:020 Our story starts with the kernel booting into startup_32 in +/*G:020 + * Our story starts with the kernel booting into startup_32 in * arch/x86/kernel/head_32.S. It expects a boot header, which is created by * the bootloader (the Launcher in our case). * @@ -21,11 +22,14 @@ * data without remembering to subtract __PAGE_OFFSET! * * The .section line puts this code in .init.text so it will be discarded after - * boot. */ + * boot. + */ .section .init.text, "ax", @progbits ENTRY(lguest_entry) - /* We make the "initialization" hypercall now to tell the Host about - * us, and also find out where it put our page tables. */ + /* + * We make the "initialization" hypercall now to tell the Host about + * us, and also find out where it put our page tables. + */ movl $LHCALL_LGUEST_INIT, %eax movl $lguest_data - __PAGE_OFFSET, %ebx .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ @@ -33,13 +37,14 @@ ENTRY(lguest_entry) /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp - /* Jumps are relative, and we're running __PAGE_OFFSET too low at the - * moment. */ + /* Jumps are relative: we're running __PAGE_OFFSET too low. */ jmp lguest_init+__PAGE_OFFSET -/*G:055 We create a macro which puts the assembler code between lgstart_ and - * lgend_ markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. */ +/*G:055 + * We create a macro which puts the assembler code between lgstart_ and lgend_ + * markers. These templates are put in the .text section: they can't be + * discarded after boot as we may need to patch modules, too. + */ .text #define LGUEST_PATCH(name, insns...) \ lgstart_##name: insns; lgend_##name:; \ @@ -48,58 +53,74 @@ ENTRY(lguest_entry) LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) -/*G:033 But using those wrappers is inefficient (we'll see why that doesn't - * matter for save_fl and irq_disable later). If we write our routines - * carefully in assembler, we can avoid clobbering any registers and avoid - * jumping through the wrapper functions. +/*G:033 + * But using those wrappers is inefficient (we'll see why that doesn't matter + * for save_fl and irq_disable later). If we write our routines carefully in + * assembler, we can avoid clobbering any registers and avoid jumping through + * the wrapper functions. * * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine - * to enable interrupts: */ + * in a bit more detail so I'll describe in easy stages. First, the routine to + * enable interrupts: + */ ENTRY(lg_irq_enable) - /* The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ + /* + * The reverse of irq_disable, this sets lguest_data.irq_enabled to + * X86_EFLAGS_IF (ie. "Interrupts enabled"). 
+ */ movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* But now we need to check if the Host wants to know: there might have + /* + * But now we need to check if the Host wants to know: there might have * been interrupts waiting to be delivered, in which case it will have * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. */ + * jump to send_interrupts, otherwise we're done. + */ testl $0, lguest_data+LGUEST_DATA_irq_pending jnz send_interrupts - /* One cool thing about x86 is that you can do many things without using + /* + * One cool thing about x86 is that you can do many things without using * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! */ + * restore any registers at all! + */ ret send_interrupts: - /* OK, now we need a register: eax is used for the hypercall number, + /* + * OK, now we need a register: eax is used for the hypercall number, * which is LHCALL_SEND_INTERRUPTS. * * We used not to bother with this pending detection at all, which was * much simpler. Sooner or later the Host would realize it had to * send us an interrupt. But that turns out to make performance 7 * times worse on a simple tcp benchmark. So now we do this the hard - * way. */ + * way. + */ pushl %eax movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is a vmcall instruction (same thing that KVM uses). Older + /* + * This is a vmcall instruction (same thing that KVM uses). Older * assembler versions might not know the "vmcall" instruction, so we - * create one manually here. */ + * create one manually here. + */ .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ popl %eax ret -/* Finally, the "popf" or "restore flags" routine. The %eax register holds the +/* + * Finally, the "popf" or "restore flags" routine. The %eax register holds the * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. */ + * enabling interrupts again, if it's 0 we're leaving them off. + */ ENTRY(lg_restore_fl) /* This is just "lguest_data.irq_enabled = flags;" */ movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* Now, if the %eax value has enabled interrupts and + /* + * Now, if the %eax value has enabled interrupts and * lguest_data.irq_pending is set, we want to tell the Host so it can * deliver any outstanding interrupts. Fortunately, both values will * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. */ + * jump to send_interrupts. + */ testl lguest_data+LGUEST_DATA_irq_pending, %eax jnz send_interrupts /* Again, the normal path has used no extra registers. Clever, huh? */ @@ -109,22 +130,24 @@ ENTRY(lg_restore_fl) .global lguest_noirq_start .global lguest_noirq_end -/*M:004 When the Host reflects a trap or injects an interrupt into the Guest, - * it sets the eflags interrupt bit on the stack based on - * lguest_data.irq_enabled, so the Guest iret logic does the right thing when - * restoring it. However, when the Host sets the Guest up for direct traps, - * such as system calls, the processor is the one to push eflags onto the - * stack, and the interrupt bit will be 1 (in reality, interrupts are always - * enabled in the Guest). 
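The lazy interrupt-flag scheme that lg_irq_enable and lg_restore_fl implement in assembler above can be modelled in a few lines of user-space C: re-enabling interrupts stores X86_EFLAGS_IF (512, as noted above) into the shared irq_enabled word and only notifies the Host when irq_pending is also set. The hypercall is replaced by a flag here; this is a sketch of the control flow, not of the real shared page:

#include <stdio.h>

#define X86_EFLAGS_IF 0x200        /* 512, the eflags "interrupts enabled" bit */

static unsigned int irq_enabled;   /* stand-in for lguest_data.irq_enabled */
static unsigned int irq_pending;   /* stand-in for lguest_data.irq_pending */
static int host_notified;

static void send_interrupts_hypercall(void)
{
    host_notified = 1;             /* stands in for LHCALL_SEND_INTERRUPTS */
}

/* Same shape as lg_restore_fl: store the flags, then AND with irq_pending. */
static void restore_fl(unsigned int flags)
{
    irq_enabled = flags;
    if (flags & irq_pending)
        send_interrupts_hypercall();
}

int main(void)
{
    irq_pending = X86_EFLAGS_IF;   /* the "Host" queued an interrupt */
    restore_fl(X86_EFLAGS_IF);     /* turning interrupts back on must notify */
    printf("notified=%d enabled=%#x\n", host_notified, irq_enabled);
    return 0;
}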
+/*M:004 + * When the Host reflects a trap or injects an interrupt into the Guest, it + * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, + * so the Guest iret logic does the right thing when restoring it. However, + * when the Host sets the Guest up for direct traps, such as system calls, the + * processor is the one to push eflags onto the stack, and the interrupt bit + * will be 1 (in reality, interrupts are always enabled in the Guest). * * This turns out to be harmless: the only trap which should happen under Linux * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc * regions), which has to be reflected through the Host anyway. If another * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! :*/ + * we'll never get to this iret! +:*/ -/*G:045 There is one final paravirt_op that the Guest implements, and glancing - * at it you can see why I left it to last. It's *cool*! It's in *assembler*! +/*G:045 + * There is one final paravirt_op that the Guest implements, and glancing at it + * you can see why I left it to last. It's *cool*! It's in *assembler*! * * The "iret" instruction is used to return from an interrupt or trap. The * stack looks like this: @@ -148,15 +171,18 @@ ENTRY(lg_restore_fl) * return to userspace or wherever. Our solution to this is to surround the * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. */ + * enabled. + */ ENTRY(lguest_iret) pushl %eax movl 12(%esp), %eax lguest_noirq_start: - /* Note the %ss: segment prefix here. Normal data accesses use the + /* + * Note the %ss: segment prefix here. Normal data accesses use the * "ds" segment, but that will have already been restored for whatever * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. */ + * prefix makes sure we use the stack segment, which is still valid. + */ movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled popl %eax iret diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index a6974e9b8ebf..cd058bc903ff 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -1,6 +1,8 @@ -/*P:400 This contains run_guest() which actually calls into the Host<->Guest +/*P:400 + * This contains run_guest() which actually calls into the Host<->Guest * Switcher and analyzes the return, such as determining if the Guest wants the - * Host to do something. This file also contains useful helper routines. :*/ + * Host to do something. This file also contains useful helper routines. +:*/ #include #include #include @@ -24,7 +26,8 @@ static struct page **switcher_page; /* This One Big lock protects all inter-guest data structures. */ DEFINE_MUTEX(lguest_lock); -/*H:010 We need to set up the Switcher at a high virtual address. Remember the +/*H:010 + * We need to set up the Switcher at a high virtual address. Remember the * Switcher is a few hundred bytes of assembler code which actually changes the * CPU to run the Guest, and then changes back to the Host when a trap or * interrupt happens. @@ -33,7 +36,8 @@ DEFINE_MUTEX(lguest_lock); * Host since it will be running as the switchover occurs. * * Trying to map memory at a particular address is an unusual thing to do, so - * it's not a simple one-liner. */ + * it's not a simple one-liner. 
+ */ static __init int map_switcher(void) { int i, err; @@ -47,8 +51,10 @@ static __init int map_switcher(void) * easy. */ - /* We allocate an array of struct page pointers. map_vm_area() wants - * this, rather than just an array of pages. */ + /* + * We allocate an array of struct page pointers. map_vm_area() wants + * this, rather than just an array of pages. + */ switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, GFP_KERNEL); if (!switcher_page) { @@ -56,8 +62,10 @@ static __init int map_switcher(void) goto out; } - /* Now we actually allocate the pages. The Guest will see these pages, - * so we make sure they're zeroed. */ + /* + * Now we actually allocate the pages. The Guest will see these pages, + * so we make sure they're zeroed. + */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { unsigned long addr = get_zeroed_page(GFP_KERNEL); if (!addr) { @@ -67,19 +75,23 @@ static __init int map_switcher(void) switcher_page[i] = virt_to_page(addr); } - /* First we check that the Switcher won't overlap the fixmap area at + /* + * First we check that the Switcher won't overlap the fixmap area at * the top of memory. It's currently nowhere near, but it could have - * very strange effects if it ever happened. */ + * very strange effects if it ever happened. + */ if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ err = -ENOMEM; printk("lguest: mapping switcher would thwack fixmap\n"); goto free_pages; } - /* Now we reserve the "virtual memory area" we want: 0xFFC00000 + /* + * Now we reserve the "virtual memory area" we want: 0xFFC00000 * (SWITCHER_ADDR). We might not get it in theory, but in practice * it's worked so far. The end address needs +1 because __get_vm_area - * allocates an extra guard page, so we need space for that. */ + * allocates an extra guard page, so we need space for that. + */ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); @@ -89,11 +101,13 @@ static __init int map_switcher(void) goto free_pages; } - /* This code actually sets up the pages we've allocated to appear at + /* + * This code actually sets up the pages we've allocated to appear at * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our * array of struct pages. It increments that pointer, but we don't - * care. */ + * care. + */ pagep = switcher_page; err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); if (err) { @@ -101,8 +115,10 @@ static __init int map_switcher(void) goto free_vma; } - /* Now the Switcher is mapped at the right address, we can't fail! - * Copy in the compiled-in Switcher code (from _switcher.S). */ + /* + * Now the Switcher is mapped at the right address, we can't fail! + * Copy in the compiled-in Switcher code (from _switcher.S). + */ memcpy(switcher_vma->addr, start_switcher_text, end_switcher_text - start_switcher_text); @@ -124,8 +140,7 @@ out: } /*:*/ -/* Cleaning up the mapping when the module is unloaded is almost... - * too easy. */ +/* Cleaning up the mapping when the module is unloaded is almost... too easy. */ static void unmap_switcher(void) { unsigned int i; @@ -151,16 +166,19 @@ static void unmap_switcher(void) * But we can't trust the Guest: it might be trying to access the Launcher * code. We have to check that the range is below the pfn_limit the Launcher * gave us. We have to make sure that addr + len doesn't give us a false - * positive by overflowing, too. 
*/ + * positive by overflowing, too. + */ bool lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len) { return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); } -/* This routine copies memory from the Guest. Here we can see how useful the +/* + * This routine copies memory from the Guest. Here we can see how useful the * kill_lguest() routine we met in the Launcher can be: we return a random - * value (all zeroes) instead of needing to return an error. */ + * value (all zeroes) instead of needing to return an error. + */ void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) { if (!lguest_address_ok(cpu->lg, addr, bytes) @@ -181,9 +199,11 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, } /*:*/ -/*H:030 Let's jump straight to the the main loop which runs the Guest. +/*H:030 + * Let's jump straight to the the main loop which runs the Guest. * Remember, this is called by the Launcher reading /dev/lguest, and we keep - * going around and around until something interesting happens. */ + * going around and around until something interesting happens. + */ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { /* We stop running once the Guest is dead. */ @@ -195,8 +215,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->hcall) do_hypercalls(cpu); - /* It's possible the Guest did a NOTIFY hypercall to the - * Launcher, in which case we return from the read() now. */ + /* + * It's possible the Guest did a NOTIFY hypercall to the + * Launcher, in which case we return from the read() now. + */ if (cpu->pending_notify) { if (!send_notify_to_eventfd(cpu)) { if (put_user(cpu->pending_notify, user)) @@ -209,29 +231,39 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (signal_pending(current)) return -ERESTARTSYS; - /* Check if there are any interrupts which can be delivered now: + /* + * Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next - * run the Guest. */ + * run the Guest. + */ irq = interrupt_pending(cpu, &more); if (irq < LGUEST_IRQS) try_deliver_interrupt(cpu, irq, more); - /* All long-lived kernel loops need to check with this horrible + /* + * All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, - * it stops us. */ + * it stops us. + */ try_to_freeze(); - /* Just make absolutely sure the Guest is still alive. One of - * those hypercalls could have been fatal, for example. */ + /* + * Just make absolutely sure the Guest is still alive. One of + * those hypercalls could have been fatal, for example. + */ if (cpu->lg->dead) break; - /* If the Guest asked to be stopped, we sleep. The Guest's - * clock timer will wake us. */ + /* + * If the Guest asked to be stopped, we sleep. The Guest's + * clock timer will wake us. + */ if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); - /* Just before we sleep, make sure no interrupt snuck in - * which we should be doing. */ + /* + * Just before we sleep, make sure no interrupt snuck in + * which we should be doing. + */ if (interrupt_pending(cpu, &more) < LGUEST_IRQS) set_current_state(TASK_RUNNING); else @@ -239,8 +271,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) continue; } - /* OK, now we're ready to jump into the Guest. First we put up - * the "Do Not Disturb" sign: */ + /* + * OK, now we're ready to jump into the Guest. 
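The bounds check in lguest_address_ok() above is worth a second look: the second clause guards against addr + len wrapping past the end of the address space and sneaking under pfn_limit. A minimal stand-alone version with a couple of test values (the pfn limit chosen here is arbitrary):

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE 4096UL

/* Same predicate as lguest_address_ok(): inside the pfn limit, and no wrap. */
static bool address_ok(unsigned long pfn_limit, unsigned long addr,
                       unsigned long len)
{
    return (addr + len) / PAGE_SIZE < pfn_limit && (addr + len >= addr);
}

int main(void)
{
    unsigned long limit = 0x8000;                     /* 128 MB, in pages */

    printf("%d\n", address_ok(limit, 0x1000, 64));    /* 1: in range */
    printf("%d\n", address_ok(limit, 0x8000000, 64)); /* 0: past the limit */
    printf("%d\n", address_ok(limit, -16UL, 64));     /* 0: addr + len wraps */
    return 0;
}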
First we put up + * the "Do Not Disturb" sign: + */ local_irq_disable(); /* Actually run the Guest until something happens. */ @@ -327,8 +361,10 @@ static void __exit fini(void) } /*:*/ -/* The Host side of lguest can be a module. This is a nice way for people to - * play with it. */ +/* + * The Host side of lguest can be a module. This is a nice way for people to + * play with it. + */ module_init(init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index c29ffa19cb74..787ab4bc09f0 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -1,8 +1,10 @@ -/*P:500 Just as userspace programs request kernel operations through a system +/*P:500 + * Just as userspace programs request kernel operations through a system * call, the Guest requests Host operations through a "hypercall". You might * notice this nomenclature doesn't really follow any logic, but the name has * been around for long enough that we're stuck with it. As you'd expect, this - * code is basically a one big switch statement. :*/ + * code is basically a one big switch statement. +:*/ /* Copyright (C) 2006 Rusty Russell IBM Corporation @@ -28,30 +30,41 @@ #include #include "lg.h" -/*H:120 This is the core hypercall routine: where the Guest gets what it wants. - * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ +/*H:120 + * This is the core hypercall routine: where the Guest gets what it wants. + * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. + */ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) { switch (args->arg0) { case LHCALL_FLUSH_ASYNC: - /* This call does nothing, except by breaking out of the Guest - * it makes us process all the asynchronous hypercalls. */ + /* + * This call does nothing, except by breaking out of the Guest + * it makes us process all the asynchronous hypercalls. + */ break; case LHCALL_SEND_INTERRUPTS: - /* This call does nothing too, but by breaking out of the Guest - * it makes us process any pending interrupts. */ + /* + * This call does nothing too, but by breaking out of the Guest + * it makes us process any pending interrupts. + */ break; case LHCALL_LGUEST_INIT: - /* You can't get here unless you're already initialized. Don't - * do that. */ + /* + * You can't get here unless you're already initialized. Don't + * do that. + */ kill_guest(cpu, "already have lguest_data"); break; case LHCALL_SHUTDOWN: { - /* Shutdown is such a trivial hypercall that we do it in four - * lines right here. */ char msg[128]; - /* If the lgread fails, it will call kill_guest() itself; the - * kill_guest() with the message will be ignored. */ + /* + * Shutdown is such a trivial hypercall that we do it in four + * lines right here. + * + * If the lgread fails, it will call kill_guest() itself; the + * kill_guest() with the message will be ignored. + */ __lgread(cpu, msg, args->arg1, sizeof(msg)); msg[sizeof(msg)-1] = '\0'; kill_guest(cpu, "CRASH: %s", msg); @@ -60,16 +73,17 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) break; } case LHCALL_FLUSH_TLB: - /* FLUSH_TLB comes in two flavors, depending on the - * argument: */ + /* FLUSH_TLB comes in two flavors, depending on the argument: */ if (args->arg1) guest_pagetable_clear_all(cpu); else guest_pagetable_flush_user(cpu); break; - /* All these calls simply pass the arguments through to the right - * routines. */ + /* + * All these calls simply pass the arguments through to the right + * routines. 
+ */ case LHCALL_NEW_PGTABLE: guest_new_pagetable(cpu, args->arg1); break; @@ -112,15 +126,16 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) kill_guest(cpu, "Bad hypercall %li\n", args->arg0); } } -/*:*/ -/*H:124 Asynchronous hypercalls are easy: we just look in the array in the +/*H:124 + * Asynchronous hypercalls are easy: we just look in the array in the * Guest's "struct lguest_data" to see if any new ones are marked "ready". * * We are careful to do these in order: obviously we respect the order the * Guest put them in the ring, but we also promise the Guest that they will * happen before any normal hypercall (which is why we check this before - * checking for a normal hcall). */ + * checking for a normal hcall). + */ static void do_async_hcalls(struct lg_cpu *cpu) { unsigned int i; @@ -133,22 +148,28 @@ static void do_async_hcalls(struct lg_cpu *cpu) /* We process "struct lguest_data"s hcalls[] ring once. */ for (i = 0; i < ARRAY_SIZE(st); i++) { struct hcall_args args; - /* We remember where we were up to from last time. This makes + /* + * We remember where we were up to from last time. This makes * sure that the hypercalls are done in the order the Guest - * places them in the ring. */ + * places them in the ring. + */ unsigned int n = cpu->next_hcall; /* 0xFF means there's no call here (yet). */ if (st[n] == 0xFF) break; - /* OK, we have hypercall. Increment the "next_hcall" cursor, - * and wrap back to 0 if we reach the end. */ + /* + * OK, we have hypercall. Increment the "next_hcall" cursor, + * and wrap back to 0 if we reach the end. + */ if (++cpu->next_hcall == LHCALL_RING_SIZE) cpu->next_hcall = 0; - /* Copy the hypercall arguments into a local copy of - * the hcall_args struct. */ + /* + * Copy the hypercall arguments into a local copy of the + * hcall_args struct. + */ if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], sizeof(struct hcall_args))) { kill_guest(cpu, "Fetching async hypercalls"); @@ -164,19 +185,25 @@ static void do_async_hcalls(struct lg_cpu *cpu) break; } - /* Stop doing hypercalls if they want to notify the Launcher: - * it needs to service this first. */ + /* + * Stop doing hypercalls if they want to notify the Launcher: + * it needs to service this first. + */ if (cpu->pending_notify) break; } } -/* Last of all, we look at what happens first of all. The very first time the - * Guest makes a hypercall, we end up here to set things up: */ +/* + * Last of all, we look at what happens first of all. The very first time the + * Guest makes a hypercall, we end up here to set things up: + */ static void initialize(struct lg_cpu *cpu) { - /* You can't do anything until you're initialized. The Guest knows the - * rules, so we're unforgiving here. */ + /* + * You can't do anything until you're initialized. The Guest knows the + * rules, so we're unforgiving here. + */ if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); return; @@ -185,32 +212,40 @@ static void initialize(struct lg_cpu *cpu) if (lguest_arch_init_hypercalls(cpu)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* The Guest tells us where we're not to deliver interrupts by putting - * the range of addresses into "struct lguest_data". */ + /* + * The Guest tells us where we're not to deliver interrupts by putting + * the range of addresses into "struct lguest_data". 
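The async hypercall ring that do_async_hcalls() walks above is small enough to model in user space: a fixed-size array of status bytes where 0xFF means "empty", a persistent cursor that wraps at the ring size, and a stop as soon as an empty slot is found. Marking a slot free again is folded into the loop here, which is a simplification of what the Host and Guest actually share:

#include <stdio.h>

#define RING_SIZE  64      /* stands in for LHCALL_RING_SIZE */
#define SLOT_EMPTY 0xFF    /* the sentinel the Guest leaves in unused slots */

struct hcall_args { unsigned long arg0; };

static unsigned char status[RING_SIZE];
static struct hcall_args ring[RING_SIZE];
static unsigned int next_hcall;    /* persistent cursor, as in struct lg_cpu */

static void do_hcall(const struct hcall_args *args)
{
    printf("hypercall %lu\n", args->arg0);
}

static void run_async_hcalls(void)
{
    unsigned int i;

    for (i = 0; i < RING_SIZE; i++) {
        unsigned int n = next_hcall;

        if (status[n] == SLOT_EMPTY)
            break;                      /* nothing more queued (yet) */
        if (++next_hcall == RING_SIZE)  /* advance the cursor, wrap at the end */
            next_hcall = 0;
        do_hcall(&ring[n]);
        status[n] = SLOT_EMPTY;         /* hand the slot back to the Guest */
    }
}

int main(void)
{
    unsigned int i;

    for (i = 0; i < RING_SIZE; i++)
        status[i] = SLOT_EMPTY;
    ring[0].arg0 = 1; status[0] = 0;    /* the Guest queued two calls */
    ring[1].arg0 = 2; status[1] = 0;
    run_async_hcalls();
    return 0;
}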
+ */ if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* We write the current time into the Guest's data page once so it can - * set its clock. */ + /* + * We write the current time into the Guest's data page once so it can + * set its clock. + */ write_timestamp(cpu); /* page_tables.c will also do some setup. */ page_table_guest_data_init(cpu); - /* This is the one case where the above accesses might have been the + /* + * This is the one case where the above accesses might have been the * first write to a Guest page. This may have caused a copy-on-write * fault, but the old page might be (read-only) in the Guest - * pagetable. */ + * pagetable. + */ guest_pagetable_clear_all(cpu); } /*:*/ -/*M:013 If a Guest reads from a page (so creates a mapping) that it has never +/*M:013 + * If a Guest reads from a page (so creates a mapping) that it has never * written to, and then the Launcher writes to it (ie. the output of a virtual * device), the Guest will still see the old page. In practice, this never * happens: why would the Guest read a page which it has never written to? But - * a similar scenario might one day bite us, so it's worth mentioning. :*/ + * a similar scenario might one day bite us, so it's worth mentioning. +:*/ /*H:100 * Hypercalls @@ -229,17 +264,22 @@ void do_hypercalls(struct lg_cpu *cpu) return; } - /* The Guest has initialized. + /* + * The Guest has initialized. * - * Look in the hypercall ring for the async hypercalls: */ + * Look in the hypercall ring for the async hypercalls: + */ do_async_hcalls(cpu); - /* If we stopped reading the hypercall ring because the Guest did a + /* + * If we stopped reading the hypercall ring because the Guest did a * NOTIFY to the Launcher, we want to return now. Otherwise we do - * the hypercall. */ + * the hypercall. + */ if (!cpu->pending_notify) { do_hcall(cpu, cpu->hcall); - /* Tricky point: we reset the hcall pointer to mark the + /* + * Tricky point: we reset the hcall pointer to mark the * hypercall as "done". We use the hcall pointer rather than * the trap number to indicate a hypercall is pending. * Normally it doesn't matter: the Guest will run again and @@ -248,13 +288,16 @@ void do_hypercalls(struct lg_cpu *cpu) * However, if we are signalled or the Guest sends I/O to the * Launcher, the run_guest() loop will exit without running the * Guest. When it comes back it would try to re-run the - * hypercall. Finding that bug sucked. */ + * hypercall. Finding that bug sucked. + */ cpu->hcall = NULL; } } -/* This routine supplies the Guest with time: it's used for wallclock time at - * initial boot and as a rough time source if the TSC isn't available. */ +/* + * This routine supplies the Guest with time: it's used for wallclock time at + * initial boot and as a rough time source if the TSC isn't available. + */ void write_timestamp(struct lg_cpu *cpu) { struct timespec now; diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 0e9067b0d507..18648180db02 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -1,4 +1,5 @@ -/*P:800 Interrupts (traps) are complicated enough to earn their own file. +/*P:800 + * Interrupts (traps) are complicated enough to earn their own file. 
* There are three classes of interrupts: * * 1) Real hardware interrupts which occur while we're running the Guest, @@ -10,7 +11,8 @@ * just like real hardware would deliver them. Traps from the Guest can be set * up to go directly back into the Guest, but sometimes the Host wants to see * them first, so we also have a way of "reflecting" them into the Guest as if - * they had been delivered to it directly. :*/ + * they had been delivered to it directly. +:*/ #include #include #include @@ -26,8 +28,10 @@ static unsigned long idt_address(u32 lo, u32 hi) return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); } -/* The "type" of the interrupt handler is a 4 bit field: we only support a - * couple of types. */ +/* + * The "type" of the interrupt handler is a 4 bit field: we only support a + * couple of types. + */ static int idt_type(u32 lo, u32 hi) { return (hi >> 8) & 0xF; @@ -39,8 +43,10 @@ static bool idt_present(u32 lo, u32 hi) return (hi & 0x8000); } -/* We need a helper to "push" a value onto the Guest's stack, since that's a - * big part of what delivering an interrupt does. */ +/* + * We need a helper to "push" a value onto the Guest's stack, since that's a + * big part of what delivering an interrupt does. + */ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) { /* Stack grows upwards: move stack then write value. */ @@ -48,7 +54,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) lgwrite(cpu, *gstack, u32, val); } -/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or +/*H:210 + * The set_guest_interrupt() routine actually delivers the interrupt or * trap. The mechanics of delivering traps and interrupts to the Guest are the * same, except some traps have an "error code" which gets pushed onto the * stack as well: the caller tells us if this is one. @@ -59,7 +66,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) * * We set up the stack just like the CPU does for a real interrupt, so it's * identical for the Guest (and the standard "iret" instruction will undo - * it). */ + * it). + */ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, bool has_err) { @@ -67,20 +75,26 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, u32 eflags, ss, irq_enable; unsigned long virtstack; - /* There are two cases for interrupts: one where the Guest is already + /* + * There are two cases for interrupts: one where the Guest is already * in the kernel, and a more complex one where the Guest is in - * userspace. We check the privilege level to find out. */ + * userspace. We check the privilege level to find out. + */ if ((cpu->regs->ss&0x3) != GUEST_PL) { - /* The Guest told us their kernel stack with the SET_STACK - * hypercall: both the virtual address and the segment */ + /* + * The Guest told us their kernel stack with the SET_STACK + * hypercall: both the virtual address and the segment. + */ virtstack = cpu->esp1; ss = cpu->ss1; origstack = gstack = guest_pa(cpu, virtstack); - /* We push the old stack segment and pointer onto the new + /* + * We push the old stack segment and pointer onto the new * stack: when the Guest does an "iret" back from the interrupt * handler the CPU will notice they're dropping privilege - * levels and expect these here. */ + * levels and expect these here. 
+ */ push_guest_stack(cpu, &gstack, cpu->regs->ss); push_guest_stack(cpu, &gstack, cpu->regs->esp); } else { @@ -91,18 +105,22 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, origstack = gstack = guest_pa(cpu, virtstack); } - /* Remember that we never let the Guest actually disable interrupts, so + /* + * Remember that we never let the Guest actually disable interrupts, so * the "Interrupt Flag" bit is always set. We copy that bit from the * Guest's "irq_enabled" field into the eflags word: we saw the Guest - * copy it back in "lguest_iret". */ + * copy it back in "lguest_iret". + */ eflags = cpu->regs->eflags; if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 && !(irq_enable & X86_EFLAGS_IF)) eflags &= ~X86_EFLAGS_IF; - /* An interrupt is expected to push three things on the stack: the old + /* + * An interrupt is expected to push three things on the stack: the old * "eflags" word, the old code segment, and the old instruction - * pointer. */ + * pointer. + */ push_guest_stack(cpu, &gstack, eflags); push_guest_stack(cpu, &gstack, cpu->regs->cs); push_guest_stack(cpu, &gstack, cpu->regs->eip); @@ -111,15 +129,19 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, if (has_err) push_guest_stack(cpu, &gstack, cpu->regs->errcode); - /* Now we've pushed all the old state, we change the stack, the code - * segment and the address to execute. */ + /* + * Now we've pushed all the old state, we change the stack, the code + * segment and the address to execute. + */ cpu->regs->ss = ss; cpu->regs->esp = virtstack + (gstack - origstack); cpu->regs->cs = (__KERNEL_CS|GUEST_PL); cpu->regs->eip = idt_address(lo, hi); - /* There are two kinds of interrupt handlers: 0xE is an "interrupt - * gate" which expects interrupts to be disabled on entry. */ + /* + * There are two kinds of interrupt handlers: 0xE is an "interrupt + * gate" which expects interrupts to be disabled on entry. + */ if (idt_type(lo, hi) == 0xE) if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) kill_guest(cpu, "Disabling interrupts"); @@ -130,7 +152,8 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, * * interrupt_pending() returns the first pending interrupt which isn't blocked * by the Guest. It is called before every entry to the Guest, and just before - * we go to sleep when the Guest has halted itself. */ + * we go to sleep when the Guest has halted itself. + */ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) { unsigned int irq; @@ -140,8 +163,10 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) if (!cpu->lg->lguest_data) return LGUEST_IRQS; - /* Take our "irqs_pending" array and remove any interrupts the Guest - * wants blocked: the result ends up in "blk". */ + /* + * Take our "irqs_pending" array and remove any interrupts the Guest + * wants blocked: the result ends up in "blk". + */ if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, sizeof(blk))) return LGUEST_IRQS; @@ -154,16 +179,20 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) return irq; } -/* This actually diverts the Guest to running an interrupt handler, once an - * interrupt has been identified by interrupt_pending(). */ +/* + * This actually diverts the Guest to running an interrupt handler, once an + * interrupt has been identified by interrupt_pending(). 
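interrupt_pending() above boils down to a bitmask operation: take irqs_pending, clear everything the Guest has blocked, and deliver the lowest bit that survives, with LGUEST_IRQS meaning "nothing to do". The real routine works on multi-word bitmaps; the single-word sketch below keeps only the selection logic:

#include <stdio.h>

#define LGUEST_IRQS 32    /* one word is enough for this sketch */

/* Lowest pending, unblocked interrupt, or LGUEST_IRQS if there is none. */
static unsigned int first_deliverable(unsigned long pending, unsigned long blocked)
{
    unsigned long deliverable = pending & ~blocked;
    unsigned int irq;

    for (irq = 0; irq < LGUEST_IRQS; irq++)
        if (deliverable & (1UL << irq))
            return irq;
    return LGUEST_IRQS;
}

int main(void)
{
    /* IRQs 1 and 3 pending, IRQ 1 blocked: IRQ 3 should be delivered. */
    printf("%u\n", first_deliverable((1UL << 1) | (1UL << 3), 1UL << 1));
    return 0;
}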
+ */ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) { struct desc_struct *idt; BUG_ON(irq >= LGUEST_IRQS); - /* They may be in the middle of an iret, where they asked us never to - * deliver interrupts. */ + /* + * They may be in the middle of an iret, where they asked us never to + * deliver interrupts. + */ if (cpu->regs->eip >= cpu->lg->noirq_start && (cpu->regs->eip < cpu->lg->noirq_end)) return; @@ -187,29 +216,37 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) } } - /* Look at the IDT entry the Guest gave us for this interrupt. The + /* + * Look at the IDT entry the Guest gave us for this interrupt. The * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip - * over them. */ + * over them. + */ idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; /* If they don't have a handler (yet?), we just ignore it */ if (idt_present(idt->a, idt->b)) { /* OK, mark it no longer pending and deliver it. */ clear_bit(irq, cpu->irqs_pending); - /* set_guest_interrupt() takes the interrupt descriptor and a + /* + * set_guest_interrupt() takes the interrupt descriptor and a * flag to say whether this interrupt pushes an error code onto - * the stack as well: virtual interrupts never do. */ + * the stack as well: virtual interrupts never do. + */ set_guest_interrupt(cpu, idt->a, idt->b, false); } - /* Every time we deliver an interrupt, we update the timestamp in the + /* + * Every time we deliver an interrupt, we update the timestamp in the * Guest's lguest_data struct. It would be better for the Guest if we * did this more often, but it can actually be quite slow: doing it * here is a compromise which means at least it gets updated every - * timer interrupt. */ + * timer interrupt. + */ write_timestamp(cpu); - /* If there are no other interrupts we want to deliver, clear - * the pending flag. */ + /* + * If there are no other interrupts we want to deliver, clear + * the pending flag. + */ if (!more) put_user(0, &cpu->lg->lguest_data->irq_pending); } @@ -217,24 +254,29 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) /* And this is the routine when we want to set an interrupt for the Guest. */ void set_interrupt(struct lg_cpu *cpu, unsigned int irq) { - /* Next time the Guest runs, the core code will see if it can deliver - * this interrupt. */ + /* + * Next time the Guest runs, the core code will see if it can deliver + * this interrupt. + */ set_bit(irq, cpu->irqs_pending); - /* Make sure it sees it; it might be asleep (eg. halted), or - * running the Guest right now, in which case kick_process() - * will knock it out. */ + /* + * Make sure it sees it; it might be asleep (eg. halted), or running + * the Guest right now, in which case kick_process() will knock it out. + */ if (!wake_up_process(cpu->tsk)) kick_process(cpu->tsk); } /*:*/ -/* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent +/* + * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent * me a patch, so we support that too. It'd be a big step for lguest if half * the Plan 9 user base were to start using it. * * Actually now I think of it, it's possible that Ron *is* half the Plan 9 - * userbase. Oh well. */ + * userbase. Oh well. + */ static bool could_be_syscall(unsigned int num) { /* Normal Linux SYSCALL_VECTOR or reserved vector? 
*/ @@ -274,9 +316,11 @@ void free_interrupts(void) clear_bit(syscall_vector, used_vectors); } -/*H:220 Now we've got the routines to deliver interrupts, delivering traps like +/*H:220 + * Now we've got the routines to deliver interrupts, delivering traps like * page fault is easy. The only trick is that Intel decided that some traps - * should have error codes: */ + * should have error codes: + */ static bool has_err(unsigned int trap) { return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); @@ -285,13 +329,17 @@ static bool has_err(unsigned int trap) /* deliver_trap() returns true if it could deliver the trap. */ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) { - /* Trap numbers are always 8 bit, but we set an impossible trap number - * for traps inside the Switcher, so check that here. */ + /* + * Trap numbers are always 8 bit, but we set an impossible trap number + * for traps inside the Switcher, so check that here. + */ if (num >= ARRAY_SIZE(cpu->arch.idt)) return false; - /* Early on the Guest hasn't set the IDT entries (or maybe it put a - * bogus one in): if we fail here, the Guest will be killed. */ + /* + * Early on the Guest hasn't set the IDT entries (or maybe it put a + * bogus one in): if we fail here, the Guest will be killed. + */ if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) return false; set_guest_interrupt(cpu, cpu->arch.idt[num].a, @@ -299,7 +347,8 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) return true; } -/*H:250 Here's the hard part: returning to the Host every time a trap happens +/*H:250 + * Here's the hard part: returning to the Host every time a trap happens * and then calling deliver_trap() and re-entering the Guest is slow. * Particularly because Guest userspace system calls are traps (usually trap * 128). @@ -311,69 +360,87 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) * the other hypervisors would beat it up at lunchtime. * * This routine indicates if a particular trap number could be delivered - * directly. */ + * directly. + */ static bool direct_trap(unsigned int num) { - /* Hardware interrupts don't go to the Guest at all (except system - * call). */ + /* + * Hardware interrupts don't go to the Guest at all (except system + * call). + */ if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) return false; - /* The Host needs to see page faults (for shadow paging and to save the + /* + * The Host needs to see page faults (for shadow paging and to save the * fault address), general protection faults (in/out emulation) and * device not available (TS handling), invalid opcode fault (kvm hcall), - * and of course, the hypercall trap. */ + * and of course, the hypercall trap. + */ return num != 14 && num != 13 && num != 7 && num != 6 && num != LGUEST_TRAP_ENTRY; } /*:*/ -/*M:005 The Guest has the ability to turn its interrupt gates into trap gates, +/*M:005 + * The Guest has the ability to turn its interrupt gates into trap gates, * if it is careful. The Host will let trap gates can go directly to the * Guest, but the Guest needs the interrupts atomically disabled for an * interrupt gate. It can do this by pointing the trap gate at instructions - * within noirq_start and noirq_end, where it can safely disable interrupts. */ + * within noirq_start and noirq_end, where it can safely disable interrupts. 
+ */ -/*M:006 The Guests do not use the sysenter (fast system call) instruction, +/*M:006 + * The Guests do not use the sysenter (fast system call) instruction, * because it's hardcoded to enter privilege level 0 and so can't go direct. * It's about twice as fast as the older "int 0x80" system call, so it might * still be worthwhile to handle it in the Switcher and lcall down to the * Guest. The sysenter semantics are hairy tho: search for that keyword in - * entry.S :*/ + * entry.S +:*/ -/*H:260 When we make traps go directly into the Guest, we need to make sure +/*H:260 + * When we make traps go directly into the Guest, we need to make sure * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the * CPU trying to deliver the trap will fault while trying to push the interrupt * words on the stack: this is called a double fault, and it forces us to kill * the Guest. * - * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ + * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. + */ void pin_stack_pages(struct lg_cpu *cpu) { unsigned int i; - /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or - * two pages of stack space. */ + /* + * Depending on the CONFIG_4KSTACKS option, the Guest can have one or + * two pages of stack space. + */ for (i = 0; i < cpu->lg->stack_pages; i++) - /* The stack grows *upwards*, so the address we're given is the + /* + * The stack grows *upwards*, so the address we're given is the * start of the page after the kernel stack. Subtract one to * get back onto the first stack page, and keep subtracting to - * get to the rest of the stack pages. */ + * get to the rest of the stack pages. + */ pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); } -/* Direct traps also mean that we need to know whenever the Guest wants to use +/* + * Direct traps also mean that we need to know whenever the Guest wants to use * a different kernel stack, so we can change the IDT entries to use that * stack. The IDT entries expect a virtual address, so unlike most addresses * the Guest gives us, the "esp" (stack pointer) value here is virtual, not * physical. * * In Linux each process has its own kernel stack, so this happens a lot: we - * change stacks on each context switch. */ + * change stacks on each context switch. + */ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) { - /* You are not allowed have a stack segment with privilege level 0: bad - * Guest! */ + /* + * You're not allowed a stack segment with privilege level 0: bad Guest! + */ if ((seg & 0x3) != GUEST_PL) kill_guest(cpu, "bad stack segment %i", seg); /* We only expect one or two stack pages. */ @@ -387,11 +454,15 @@ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) pin_stack_pages(cpu); } -/* All this reference to mapping stacks leads us neatly into the other complex - * part of the Host: page table handling. */ +/* + * All this reference to mapping stacks leads us neatly into the other complex + * part of the Host: page table handling. 
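The two predicates that decide how traps are handled, has_err() and direct_trap() above, are easy to exercise on their own. could_be_syscall() is reduced here to the default Linux vector 0x80 (the real routine also accepts the Plan 9 style alternate vector mentioned above), and the LGUEST_TRAP_ENTRY value is an illustrative assumption:

#include <stdio.h>
#include <stdbool.h>

#define FIRST_EXTERNAL_VECTOR 0x20
#define SYSCALL_VECTOR        0x80   /* the usual Linux "int 0x80" vector */
#define LGUEST_TRAP_ENTRY     0x1f   /* assumed value for this sketch */

/* Same predicate as has_err(): which traps push an error code. */
static bool has_err(unsigned int trap)
{
    return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
}

/* Simplified direct_trap(): external vectors other than the system call stay
 * with the Host, as do page fault (14), GP fault (13), device-not-available
 * (7), invalid opcode (6) and the hypercall trap itself. */
static bool direct_trap(unsigned int num)
{
    if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR)
        return false;
    return num != 14 && num != 13 && num != 7 &&
           num != 6 && num != LGUEST_TRAP_ENTRY;
}

int main(void)
{
    unsigned int t;

    for (t = 0; t < 32; t++)
        printf("trap %2u: err=%d direct=%d\n", t, has_err(t), direct_trap(t));
    return 0;
}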
+ */ -/*H:235 This is the routine which actually checks the Guest's IDT entry and - * transfers it into the entry in "struct lguest": */ +/*H:235 + * This is the routine which actually checks the Guest's IDT entry and + * transfers it into the entry in "struct lguest": + */ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, unsigned int num, u32 lo, u32 hi) { @@ -407,30 +478,38 @@ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, if (type != 0xE && type != 0xF) kill_guest(cpu, "bad IDT type %i", type); - /* We only copy the handler address, present bit, privilege level and + /* + * We only copy the handler address, present bit, privilege level and * type. The privilege level controls where the trap can be triggered * manually with an "int" instruction. This is usually GUEST_PL, - * except for system calls which userspace can use. */ + * except for system calls which userspace can use. + */ trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); trap->b = (hi&0xFFFFEF00); } -/*H:230 While we're here, dealing with delivering traps and interrupts to the +/*H:230 + * While we're here, dealing with delivering traps and interrupts to the * Guest, we might as well complete the picture: how the Guest tells us where * it wants them to go. This would be simple, except making traps fast * requires some tricks. * * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the - * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ + * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. + */ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) { - /* Guest never handles: NMI, doublefault, spurious interrupt or - * hypercall. We ignore when it tries to set them. */ + /* + * Guest never handles: NMI, doublefault, spurious interrupt or + * hypercall. We ignore when it tries to set them. + */ if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) return; - /* Mark the IDT as changed: next time the Guest runs we'll know we have - * to copy this again. */ + /* + * Mark the IDT as changed: next time the Guest runs we'll know we have + * to copy this again. + */ cpu->changed |= CHANGED_IDT; /* Check that the Guest doesn't try to step outside the bounds. */ @@ -440,9 +519,11 @@ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); } -/* The default entry for each interrupt points into the Switcher routines which +/* + * The default entry for each interrupt points into the Switcher routines which * simply return to the Host. The run_guest() loop will then call - * deliver_trap() to bounce it back into the Guest. */ + * deliver_trap() to bounce it back into the Guest. + */ static void default_idt_entry(struct desc_struct *idt, int trap, const unsigned long handler, @@ -451,13 +532,17 @@ static void default_idt_entry(struct desc_struct *idt, /* A present interrupt gate. */ u32 flags = 0x8e00; - /* Set the privilege level on the entry for the hypercall: this allows - * the Guest to use the "int" instruction to trigger it. */ + /* + * Set the privilege level on the entry for the hypercall: this allows + * the Guest to use the "int" instruction to trigger it. + */ if (trap == LGUEST_TRAP_ENTRY) flags |= (GUEST_PL << 13); else if (base) - /* Copy priv. level from what Guest asked for. This allows - * debug (int 3) traps from Guest userspace, for example. */ + /* + * Copy privilege level from what Guest asked for. 
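The bit layout that set_trap() and the idt_*() helpers juggle is the standard 32-bit IDT gate: the handler address is split across the low 16 bits of the first word and the high 16 bits of the second, with the type nibble and present bit in between. A stand-alone decoder using the same extraction expressions (the sample lo/hi values are made up):

#include <stdio.h>

/* Field extraction for a 32-bit IDT entry stored as two words, "lo" and "hi",
 * mirroring idt_address(), idt_type() and idt_present() above. */
static unsigned long idt_address(unsigned int lo, unsigned int hi)
{
    return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
}

static unsigned int idt_type(unsigned int lo, unsigned int hi)
{
    return (hi >> 8) & 0xF;
}

static int idt_present(unsigned int lo, unsigned int hi)
{
    return !!(hi & 0x8000);
}

int main(void)
{
    /* A made-up present trap gate (type 0xF) pointing at 0x12345678,
     * selector 0x10. */
    unsigned int lo = (0x10 << 16) | 0x5678;
    unsigned int hi = 0x12340000 | 0x8F00;

    printf("addr=%#lx type=%#x present=%d\n",
           idt_address(lo, hi), idt_type(lo, hi), idt_present(lo, hi));
    return 0;
}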
This allows + * debug (int 3) traps from Guest userspace, for example. + */ flags |= (base->b & 0x6000); /* Now pack it into the IDT entry in its weird format. */ @@ -475,16 +560,20 @@ void setup_default_idt_entries(struct lguest_ro_state *state, default_idt_entry(&state->guest_idt[i], i, def[i], NULL); } -/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead +/*H:240 + * We don't use the IDT entries in the "struct lguest" directly, instead * we copy them into the IDT which we've set up for Guests on this CPU, just - * before we run the Guest. This routine does that copy. */ + * before we run the Guest. This routine does that copy. + */ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def) { unsigned int i; - /* We can simply copy the direct traps, otherwise we use the default - * ones in the Switcher: they will return to the Host. */ + /* + * We can simply copy the direct traps, otherwise we use the default + * ones in the Switcher: they will return to the Host. + */ for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { const struct desc_struct *gidt = &cpu->arch.idt[i]; @@ -492,14 +581,16 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, if (!direct_trap(i)) continue; - /* Only trap gates (type 15) can go direct to the Guest. + /* + * Only trap gates (type 15) can go direct to the Guest. * Interrupt gates (type 14) disable interrupts as they are * entered, which we never let the Guest do. Not present * entries (type 0x0) also can't go direct, of course. * * If it can't go direct, we still need to copy the priv. level: * they might want to give userspace access to a software - * interrupt. */ + * interrupt. + */ if (idt_type(gidt->a, gidt->b) == 0xF) idt[i] = *gidt; else @@ -518,7 +609,8 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, * the next timer interrupt (in nanoseconds). We use the high-resolution timer * infrastructure to set a callback at that time. * - * 0 means "turn off the clock". */ + * 0 means "turn off the clock". + */ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) { ktime_t expires; @@ -529,9 +621,11 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) return; } - /* We use wallclock time here, so the Guest might not be running for + /* + * We use wallclock time here, so the Guest might not be running for * all the time between now and the timer interrupt it asked for. This - * is almost always the right thing to do. */ + * is almost always the right thing to do. + */ expires = ktime_add_ns(ktime_get_real(), delta); hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); } diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 01c591923793..74c0db691b53 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -54,13 +54,13 @@ struct lg_cpu { unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ - /* At end of a page shared mapped over lguest_pages in guest. */ + /* At end of a page shared mapped over lguest_pages in guest. */ unsigned long regs_page; struct lguest_regs *regs; struct lguest_pages *last_pages; - int cpu_pgd; /* which pgd this cpu is currently using */ + int cpu_pgd; /* Which pgd this cpu is currently using */ /* If a hypercall was asked for, this points to the arguments. */ struct hcall_args *hcall; @@ -96,8 +96,11 @@ struct lguest unsigned int nr_cpus; u32 pfn_limit; - /* This provides the offset to the base of guest-physical - * memory in the Launcher. 
*/ + + /* + * This provides the offset to the base of guest-physical memory in the + * Launcher. + */ void __user *mem_base; unsigned long kernel_address; @@ -122,11 +125,13 @@ bool lguest_address_ok(const struct lguest *lg, void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); -/*H:035 Using memory-copy operations like that is usually inconvient, so we +/*H:035 + * Using memory-copy operations like that is usually inconvient, so we * have the following helper macros which read and write a specific type (often * an unsigned long). * - * This reads into a variable of the given type then returns that. */ + * This reads into a variable of the given type then returns that. + */ #define lgread(cpu, addr, type) \ ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) @@ -140,9 +145,11 @@ void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); int run_guest(struct lg_cpu *cpu, unsigned long __user *user); -/* Helper macros to obtain the first 12 or the last 20 bits, this is only the +/* + * Helper macros to obtain the first 12 or the last 20 bits, this is only the * first step in the migration to the kernel types. pte_pfn is already defined - * in the kernel. */ + * in the kernel. + */ #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index e082cdac88b4..cc000e79c3d1 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -1,10 +1,12 @@ -/*P:050 Lguest guests use a very simple method to describe devices. It's a +/*P:050 + * Lguest guests use a very simple method to describe devices. It's a * series of device descriptors contained just above the top of normal Guest * memory. * * We use the standard "virtio" device infrastructure, which provides us with a * console, a network and a block driver. Each one expects some configuration - * information and a "virtqueue" or two to send and receive data. :*/ + * information and a "virtqueue" or two to send and receive data. +:*/ #include #include #include @@ -20,8 +22,10 @@ /* The pointer to our (page) of device descriptions. */ static void *lguest_devices; -/* For Guests, device memory can be used as normal memory, so we cast away the - * __iomem to quieten sparse. */ +/* + * For Guests, device memory can be used as normal memory, so we cast away the + * __iomem to quieten sparse. + */ static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) { return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages); @@ -32,8 +36,10 @@ static inline void lguest_unmap(void *addr) iounmap((__force void __iomem *)addr); } -/*D:100 Each lguest device is just a virtio device plus a pointer to its entry - * in the lguest_devices page. */ +/*D:100 + * Each lguest device is just a virtio device plus a pointer to its entry + * in the lguest_devices page. + */ struct lguest_device { struct virtio_device vdev; @@ -41,9 +47,11 @@ struct lguest_device { struct lguest_device_desc *desc; }; -/* Since the virtio infrastructure hands us a pointer to the virtio_device all +/* + * Since the virtio infrastructure hands us a pointer to the virtio_device all * the time, it helps to have a curt macro to get a pointer to the struct - * lguest_device it's enclosed in. */ + * lguest_device it's enclosed in. 
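The lgread()/lgwrite() helpers in lg.h above lean on a GCC statement expression to give a typed result from a byte-copy routine. Outside the kernel the same trick looks like the sketch below; __lgread() is reduced to a memcpy from a local "guest memory" buffer and the cpu argument is dropped:

#include <stdio.h>
#include <string.h>

static unsigned char guest_mem[64] = { 0x39, 0x05, 0x00, 0x00 };  /* 1337, little-endian */

/* Stand-in for __lgread(): copy "bytes" bytes of guest memory into "b". */
static void __lgread(void *b, unsigned long addr, unsigned int bytes)
{
    memcpy(b, guest_mem + addr, bytes);
}

/* Declare a variable of the requested type, fill it, and yield it as the
 * macro's value (GCC/Clang statement expression). */
#define lgread(addr, type) \
    ({ type _v; __lgread(&_v, (addr), sizeof(_v)); _v; })

int main(void)
{
    unsigned int v = lgread(0, unsigned int);

    printf("read %u\n", v);    /* 1337 on a little-endian machine */
    return 0;
}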
+ */ #define to_lgdev(vd) container_of(vd, struct lguest_device, vdev) /*D:130 @@ -55,7 +63,8 @@ struct lguest_device { * the driver will look at them during setup. * * A convenient routine to return the device's virtqueue config array: - * immediately after the descriptor. */ + * immediately after the descriptor. + */ static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) { return (void *)(desc + 1); @@ -98,10 +107,12 @@ static u32 lg_get_features(struct virtio_device *vdev) return features; } -/* The virtio core takes the features the Host offers, and copies the - * ones supported by the driver into the vdev->features array. Once - * that's all sorted out, this routine is called so we can tell the - * Host which features we understand and accept. */ +/* + * The virtio core takes the features the Host offers, and copies the ones + * supported by the driver into the vdev->features array. Once that's all + * sorted out, this routine is called so we can tell the Host which features we + * understand and accept. + */ static void lg_finalize_features(struct virtio_device *vdev) { unsigned int i, bits; @@ -112,10 +123,11 @@ static void lg_finalize_features(struct virtio_device *vdev) /* Give virtio_ring a chance to accept features. */ vring_transport_features(vdev); - /* The vdev->feature array is a Linux bitmask: this isn't the - * same as a the simple array of bits used by lguest devices - * for features. So we do this slow, manual conversion which is - * completely general. */ + /* + * The vdev->feature array is a Linux bitmask: this isn't the same as a + * the simple array of bits used by lguest devices for features. So we + * do this slow, manual conversion which is completely general. + */ memset(out_features, 0, desc->feature_len); bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; for (i = 0; i < bits; i++) { @@ -146,15 +158,19 @@ static void lg_set(struct virtio_device *vdev, unsigned int offset, memcpy(lg_config(desc) + offset, buf, len); } -/* The operations to get and set the status word just access the status field - * of the device descriptor. */ +/* + * The operations to get and set the status word just access the status field + * of the device descriptor. + */ static u8 lg_get_status(struct virtio_device *vdev) { return to_lgdev(vdev)->desc->status; } -/* To notify on status updates, we (ab)use the NOTIFY hypercall, with the - * descriptor address of the device. A zero status means "reset". */ +/* + * To notify on status updates, we (ab)use the NOTIFY hypercall, with the + * descriptor address of the device. A zero status means "reset". + */ static void set_status(struct virtio_device *vdev, u8 status) { unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; @@ -200,13 +216,17 @@ struct lguest_vq_info void *pages; }; -/* When the virtio_ring code wants to prod the Host, it calls us here and we +/* + * When the virtio_ring code wants to prod the Host, it calls us here and we * make a hypercall. We hand the physical address of the virtqueue so the Host - * knows which virtqueue we're talking about. */ + * knows which virtqueue we're talking about. + */ static void lg_notify(struct virtqueue *vq) { - /* We store our virtqueue information in the "priv" pointer of the - * virtqueue structure. */ + /* + * We store our virtqueue information in the "priv" pointer of the + * virtqueue structure. 
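lg_finalize_features() above converts the virtio core's Linux-style feature bitmask into the simple array of bits the device descriptor expects; the loop body is not visible in the hunk, so the conversion below is one plausible shape of it rather than a copy of the kernel code:

#include <stdio.h>

/* Set bit i of a byte array for every feature bit set in the mask. */
static void features_to_bytes(unsigned long features, unsigned char *out,
                              unsigned int nbits)
{
    unsigned int i;

    for (i = 0; i < nbits; i++)
        if (features & (1UL << i))
            out[i / 8] |= 1 << (i % 8);
}

int main(void)
{
    unsigned char out[4] = { 0 };

    features_to_bytes((1UL << 0) | (1UL << 9), out, 32);
    printf("%02x %02x %02x %02x\n", out[0], out[1], out[2], out[3]);
    return 0;
}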
+ */ struct lguest_vq_info *lvq = vq->priv; kvm_hypercall1(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT); @@ -215,7 +235,8 @@ static void lg_notify(struct virtqueue *vq) /* An extern declaration inside a C file is bad form. Don't do it. */ extern void lguest_setup_irq(unsigned int irq); -/* This routine finds the first virtqueue described in the configuration of +/* + * This routine finds the first virtqueue described in the configuration of * this device and sets it up. * * This is kind of an ugly duckling. It'd be nicer to have a standard @@ -225,7 +246,8 @@ extern void lguest_setup_irq(unsigned int irq); * simpler for the Host to simply tell us where the pages are. * * So we provide drivers with a "find the Nth virtqueue and set it up" - * function. */ + * function. + */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), @@ -244,9 +266,11 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, if (!lvq) return ERR_PTR(-ENOMEM); - /* Make a copy of the "struct lguest_vqconfig" entry, which sits after + /* + * Make a copy of the "struct lguest_vqconfig" entry, which sits after * the descriptor. We need a copy because the config space might not - * be aligned correctly. */ + * be aligned correctly. + */ memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); printk("Mapping virtqueue %i addr %lx\n", index, @@ -261,8 +285,10 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, goto free_lvq; } - /* OK, tell virtio_ring.c to set up a virtqueue now we know its size - * and we've got a pointer to its pages. */ + /* + * OK, tell virtio_ring.c to set up a virtqueue now we know its size + * and we've got a pointer to its pages. + */ vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, vdev, lvq->pages, lg_notify, callback, name); if (!vq) { @@ -273,18 +299,23 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, /* Make sure the interrupt is allocated. */ lguest_setup_irq(lvq->config.irq); - /* Tell the interrupt for this virtqueue to go to the virtio_ring - * interrupt handler. */ - /* FIXME: We used to have a flag for the Host to tell us we could use + /* + * Tell the interrupt for this virtqueue to go to the virtio_ring + * interrupt handler. + * + * FIXME: We used to have a flag for the Host to tell us we could use * the interrupt as a source of randomness: it'd be nice to have that - * back.. */ + * back. + */ err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vq); if (err) goto destroy_vring; - /* Last of all we hook up our 'struct lguest_vq_info" to the - * virtqueue's priv pointer. */ + /* + * Last of all we hook up our 'struct lguest_vq_info" to the + * virtqueue's priv pointer. + */ vq->priv = lvq; return vq; @@ -358,11 +389,14 @@ static struct virtio_config_ops lguest_config_ops = { .del_vqs = lg_del_vqs, }; -/* The root device for the lguest virtio devices. This makes them appear as - * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */ +/* + * The root device for the lguest virtio devices. This makes them appear as + * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. + */ static struct device *lguest_root; -/*D:120 This is the core of the lguest bus: actually adding a new device. +/*D:120 + * This is the core of the lguest bus: actually adding a new device. * It's a separate function because it's neater that way, and because an * earlier version of the code supported hotplug and unplug. 
They were removed * early on because they were never used. @@ -371,14 +405,14 @@ static struct device *lguest_root; * * It's worth reading this carefully: we start with a pointer to the new device * descriptor in the "lguest_devices" page, and the offset into the device - * descriptor page so we can uniquely identify it if things go badly wrong. */ + * descriptor page so we can uniquely identify it if things go badly wrong. + */ static void add_lguest_device(struct lguest_device_desc *d, unsigned int offset) { struct lguest_device *ldev; - /* Start with zeroed memory; Linux's device layer seems to count on - * it. */ + /* Start with zeroed memory; Linux's device layer counts on it. */ ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); if (!ldev) { printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n", @@ -390,15 +424,19 @@ static void add_lguest_device(struct lguest_device_desc *d, ldev->vdev.dev.parent = lguest_root; /* We have a unique device index thanks to the dev_index counter. */ ldev->vdev.id.device = d->type; - /* We have a simple set of routines for querying the device's - * configuration information and setting its status. */ + /* + * We have a simple set of routines for querying the device's + * configuration information and setting its status. + */ ldev->vdev.config = &lguest_config_ops; /* And we remember the device's descriptor for lguest_config_ops. */ ldev->desc = d; - /* register_virtio_device() sets up the generic fields for the struct + /* + * register_virtio_device() sets up the generic fields for the struct * virtio_device and calls device_register(). This makes the bus - * infrastructure look for a matching driver. */ + * infrastructure look for a matching driver. + */ if (register_virtio_device(&ldev->vdev) != 0) { printk(KERN_ERR "Failed to register lguest dev %u type %u\n", offset, d->type); @@ -406,8 +444,10 @@ static void add_lguest_device(struct lguest_device_desc *d, } } -/*D:110 scan_devices() simply iterates through the device page. The type 0 is - * reserved to mean "end of devices". */ +/*D:110 + * scan_devices() simply iterates through the device page. The type 0 is + * reserved to mean "end of devices". + */ static void scan_devices(void) { unsigned int i; @@ -426,7 +466,8 @@ static void scan_devices(void) } } -/*D:105 Fairly early in boot, lguest_devices_init() is called to set up the +/*D:105 + * Fairly early in boot, lguest_devices_init() is called to set up the * lguest device infrastructure. We check that we are a Guest by checking * pv_info.name: there are other ways of checking, but this seems most * obvious to me. @@ -437,7 +478,8 @@ static void scan_devices(void) * correct sysfs incantation). * * Finally we call scan_devices() which adds all the devices found in the - * lguest_devices page. */ + * lguest_devices page. + */ static int __init lguest_devices_init(void) { if (strcmp(pv_info.name, "lguest") != 0) @@ -456,11 +498,13 @@ static int __init lguest_devices_init(void) /* We do this after core stuff, but before the drivers. */ postcore_initcall(lguest_devices_init); -/*D:150 At this point in the journey we used to now wade through the lguest +/*D:150 + * At this point in the journey we used to now wade through the lguest * devices themselves: net, block and console. Since they're all now virtio * devices rather than lguest-specific, I've decided to ignore them. Mostly, * they're kind of boring. But this does mean you'll never experience the * thrill of reading the forbidden love scene buried deep in the block driver. 
* * "make Launcher" beckons, where we answer questions like "Where do Guests - * come from?", and "What do you do when someone asks for optimization?". */ + * come from?", and "What do you do when someone asks for optimization?". + */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 407722a8e0c4..7e92017103dc 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,8 +1,10 @@ -/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher +/*P:200 + * This contains all the /dev/lguest code, whereby the userspace launcher * controls and communicates with the Guest. For example, the first write will * tell us the Guest's memory layout, pagetable, entry point and kernel address * offset. A read will run the Guest until something happens, such as a signal - * or the Guest doing a NOTIFY out to the Launcher. :*/ + * or the Guest doing a NOTIFY out to the Launcher. +:*/ #include #include #include @@ -37,8 +39,10 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) if (!addr) return -EINVAL; - /* Replace the old array with the new one, carefully: others can - * be accessing it at the same time */ + /* + * Replace the old array with the new one, carefully: others can + * be accessing it at the same time. + */ new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), GFP_KERNEL); if (!new) @@ -61,8 +65,10 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) /* Now put new one in place. */ rcu_assign_pointer(lg->eventfds, new); - /* We're not in a big hurry. Wait until noone's looking at old - * version, then delete it. */ + /* + * We're not in a big hurry. Wait until noone's looking at old + * version, then delete it. + */ synchronize_rcu(); kfree(old); @@ -87,8 +93,10 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) return err; } -/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt - * number to /dev/lguest. */ +/*L:050 + * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt + * number to /dev/lguest. + */ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) { unsigned long irq; @@ -102,8 +110,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) return 0; } -/*L:040 Once our Guest is initialized, the Launcher makes it run by reading - * from /dev/lguest. */ +/*L:040 + * Once our Guest is initialized, the Launcher makes it run by reading + * from /dev/lguest. + */ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) { struct lguest *lg = file->private_data; @@ -139,8 +149,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return len; } - /* If we returned from read() last time because the Guest sent I/O, - * clear the flag. */ + /* + * If we returned from read() last time because the Guest sent I/O, + * clear the flag. + */ if (cpu->pending_notify) cpu->pending_notify = 0; @@ -148,8 +160,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return run_guest(cpu, (unsigned long __user *)user); } -/*L:025 This actually initializes a CPU. For the moment, a Guest is only - * uniprocessor, so "id" is always 0. */ +/*L:025 + * This actually initializes a CPU. For the moment, a Guest is only + * uniprocessor, so "id" is always 0. 
+ */ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) { /* We have a limited number the number of CPUs in the lguest struct. */ @@ -164,8 +178,10 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) /* Each CPU has a timer it can set. */ init_clockdev(cpu); - /* We need a complete page for the Guest registers: they are accessible - * to the Guest and we can only grant it access to whole pages. */ + /* + * We need a complete page for the Guest registers: they are accessible + * to the Guest and we can only grant it access to whole pages. + */ cpu->regs_page = get_zeroed_page(GFP_KERNEL); if (!cpu->regs_page) return -ENOMEM; @@ -173,29 +189,38 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) /* We actually put the registers at the bottom of the page. */ cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); - /* Now we initialize the Guest's registers, handing it the start - * address. */ + /* + * Now we initialize the Guest's registers, handing it the start + * address. + */ lguest_arch_setup_regs(cpu, start_ip); - /* We keep a pointer to the Launcher task (ie. current task) for when - * other Guests want to wake this one (eg. console input). */ + /* + * We keep a pointer to the Launcher task (ie. current task) for when + * other Guests want to wake this one (eg. console input). + */ cpu->tsk = current; - /* We need to keep a pointer to the Launcher's memory map, because if + /* + * We need to keep a pointer to the Launcher's memory map, because if * the Launcher dies we need to clean it up. If we don't keep a - * reference, it is destroyed before close() is called. */ + * reference, it is destroyed before close() is called. + */ cpu->mm = get_task_mm(cpu->tsk); - /* We remember which CPU's pages this Guest used last, for optimization - * when the same Guest runs on the same CPU twice. */ + /* + * We remember which CPU's pages this Guest used last, for optimization + * when the same Guest runs on the same CPU twice. + */ cpu->last_pages = NULL; /* No error == success. */ return 0; } -/*L:020 The initialization write supplies 3 pointer sized (32 or 64 bit) - * values (in addition to the LHREQ_INITIALIZE value). These are: +/*L:020 + * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in + * addition to the LHREQ_INITIALIZE value). These are: * * base: The start of the Guest-physical memory inside the Launcher memory. * @@ -207,14 +232,15 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) */ static int initialize(struct file *file, const unsigned long __user *input) { - /* "struct lguest" contains everything we (the Host) know about a - * Guest. */ + /* "struct lguest" contains all we (the Host) know about a Guest. */ struct lguest *lg; int err; unsigned long args[3]; - /* We grab the Big Lguest lock, which protects against multiple - * simultaneous initializations. */ + /* + * We grab the Big Lguest lock, which protects against multiple + * simultaneous initializations. + */ mutex_lock(&lguest_lock); /* You can't initialize twice! Close the device and start again... */ if (file->private_data) { @@ -249,8 +275,10 @@ static int initialize(struct file *file, const unsigned long __user *input) if (err) goto free_eventfds; - /* Initialize the Guest's shadow page tables, using the toplevel - * address the Launcher gave us. This allocates memory, so can fail. 
*/ + /* + * Initialize the Guest's shadow page tables, using the toplevel + * address the Launcher gave us. This allocates memory, so can fail. + */ err = init_guest_pagetable(lg); if (err) goto free_regs; @@ -275,7 +303,8 @@ unlock: return err; } -/*L:010 The first operation the Launcher does must be a write. All writes +/*L:010 + * The first operation the Launcher does must be a write. All writes * start with an unsigned long number: for the first write this must be * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use * writes of other values to send interrupts. @@ -283,12 +312,15 @@ unlock: * Note that we overload the "offset" in the /dev/lguest file to indicate what * CPU number we're dealing with. Currently this is always 0, since we only * support uniprocessor Guests, but you can see the beginnings of SMP support - * here. */ + * here. + */ static ssize_t write(struct file *file, const char __user *in, size_t size, loff_t *off) { - /* Once the Guest is initialized, we hold the "struct lguest" in the - * file private data. */ + /* + * Once the Guest is initialized, we hold the "struct lguest" in the + * file private data. + */ struct lguest *lg = file->private_data; const unsigned long __user *input = (const unsigned long __user *)in; unsigned long req; @@ -323,13 +355,15 @@ static ssize_t write(struct file *file, const char __user *in, } } -/*L:060 The final piece of interface code is the close() routine. It reverses +/*L:060 + * The final piece of interface code is the close() routine. It reverses * everything done in initialize(). This is usually called because the * Launcher exited. * * Note that the close routine returns 0 or a negative error number: it can't * really fail, but it can whine. I blame Sun for this wart, and K&R C for - * letting them do it. :*/ + * letting them do it. +:*/ static int close(struct inode *inode, struct file *file) { struct lguest *lg = file->private_data; @@ -339,8 +373,10 @@ static int close(struct inode *inode, struct file *file) if (!lg) return 0; - /* We need the big lock, to protect from inter-guest I/O and other - * Launchers initializing guests. */ + /* + * We need the big lock, to protect from inter-guest I/O and other + * Launchers initializing guests. + */ mutex_lock(&lguest_lock); /* Free up the shadow page tables for the Guest. */ @@ -351,8 +387,10 @@ static int close(struct inode *inode, struct file *file) hrtimer_cancel(&lg->cpus[i].hrt); /* We can free up the register page we allocated. */ free_page(lg->cpus[i].regs_page); - /* Now all the memory cleanups are done, it's safe to release - * the Launcher's memory management structure. */ + /* + * Now all the memory cleanups are done, it's safe to release + * the Launcher's memory management structure. + */ mmput(lg->cpus[i].mm); } @@ -361,8 +399,10 @@ static int close(struct inode *inode, struct file *file) eventfd_ctx_put(lg->eventfds->map[i].event); kfree(lg->eventfds); - /* If lg->dead doesn't contain an error code it will be NULL or a - * kmalloc()ed string, either of which is ok to hand to kfree(). */ + /* + * If lg->dead doesn't contain an error code it will be NULL or a + * kmalloc()ed string, either of which is ok to hand to kfree(). + */ if (!IS_ERR(lg->dead)) kfree(lg->dead); /* Free the memory allocated to the lguest_struct */ @@ -386,7 +426,8 @@ static int close(struct inode *inode, struct file *file) * * We begin our understanding with the Host kernel interface which the Launcher * uses: reading and writing a character device called /dev/lguest. 
All the - * work happens in the read(), write() and close() routines: */ + * work happens in the read(), write() and close() routines: + */ static struct file_operations lguest_fops = { .owner = THIS_MODULE, .release = close, @@ -394,8 +435,10 @@ static struct file_operations lguest_fops = { .read = read, }; -/* This is a textbook example of a "misc" character device. Populate a "struct - * miscdevice" and register it with misc_register(). */ +/* + * This is a textbook example of a "misc" character device. Populate a "struct + * miscdevice" and register it with misc_register(). + */ static struct miscdevice lguest_dev = { .minor = MISC_DYNAMIC_MINOR, .name = "lguest", diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a6fe1abda240..3da902e4b4cb 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -1,9 +1,11 @@ -/*P:700 The pagetable code, on the other hand, still shows the scars of +/*P:700 + * The pagetable code, on the other hand, still shows the scars of * previous encounters. It's functional, and as neat as it can be in the * circumstances, but be wary, for these things are subtle and break easily. * The Guest provides a virtual to physical mapping, but we can neither trust * it nor use it: we verify and convert it here then point the CPU to the - * converted Guest pages when running the Guest. :*/ + * converted Guest pages when running the Guest. +:*/ /* Copyright (C) Rusty Russell IBM Corporation 2006. * GPL v2 and any later version */ @@ -17,10 +19,12 @@ #include #include "lg.h" -/*M:008 We hold reference to pages, which prevents them from being swapped. +/*M:008 + * We hold reference to pages, which prevents them from being swapped. * It'd be nice to have a callback in the "struct mm_struct" when Linux wants * to swap out. If we had this, and a shrinker callback to trim PTE pages, we - * could probably consider launching Guests as non-root. :*/ + * could probably consider launching Guests as non-root. +:*/ /*H:300 * The Page Table Code @@ -45,16 +49,19 @@ * (v) Flushing (throwing away) page tables, * (vi) Mapping the Switcher when the Guest is about to run, * (vii) Setting up the page tables initially. - :*/ +:*/ - -/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is +/* + * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is * conveniently placed at the top 4MB, so it uses a separate, complete PTE - * page. */ + * page. + */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) -/* For PAE we need the PMD index as well. We use the last 2MB, so we - * will need the last pmd entry of the last pmd page. */ +/* + * For PAE we need the PMD index as well. We use the last 2MB, so we + * will need the last pmd entry of the last pmd page. + */ #ifdef CONFIG_X86_PAE #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) #define RESERVE_MEM 2U @@ -64,13 +71,16 @@ #define CHECK_GPGD_MASK _PAGE_TABLE #endif -/* We actually need a separate PTE page for each CPU. Remember that after the +/* + * We actually need a separate PTE page for each CPU. Remember that after the * Switcher code itself comes two pages for each CPU, and we don't want this - * CPU's guest to see the pages of any other CPU. */ + * CPU's guest to see the pages of any other CPU. 
+ */ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) -/*H:320 The page table code is curly enough to need helper functions to keep it +/*H:320 + * The page table code is curly enough to need helper functions to keep it * clear and clean. * * There are two functions which return pointers to the shadow (aka "real") @@ -79,7 +89,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); * spgd_addr() takes the virtual address and returns a pointer to the top-level * page directory entry (PGD) for that address. Since we keep track of several * page tables, the "i" argument tells us which one we're interested in (it's - * usually the current one). */ + * usually the current one). + */ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); @@ -96,9 +107,11 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) } #ifdef CONFIG_X86_PAE -/* This routine then takes the PGD entry given above, which contains the +/* + * This routine then takes the PGD entry given above, which contains the * address of the PMD page. It then returns a pointer to the PMD entry for the - * given address. */ + * given address. + */ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { unsigned int index = pmd_index(vaddr); @@ -119,9 +132,11 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } #endif -/* This routine then takes the page directory entry returned above, which +/* + * This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a - * pointer to the PTE entry for the given address. */ + * pointer to the PTE entry for the given address. + */ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { #ifdef CONFIG_X86_PAE @@ -139,8 +154,10 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) return &page[pte_index(vaddr)]; } -/* These two functions just like the above two, except they access the Guest - * page tables. Hence they return a Guest address. */ +/* + * These two functions just like the above two, except they access the Guest + * page tables. Hence they return a Guest address. + */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) { unsigned int index = vaddr >> (PGDIR_SHIFT); @@ -175,17 +192,21 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, #endif /*:*/ -/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as - * an optimization (ie. pre-faulting). :*/ +/*M:014 + * get_pfn is slow: we could probably try to grab batches of pages here as + * an optimization (ie. pre-faulting). +:*/ -/*H:350 This routine takes a page number given by the Guest and converts it to +/*H:350 + * This routine takes a page number given by the Guest and converts it to * an actual, physical page number. It can fail for several reasons: the * virtual address might not be mapped by the Launcher, the write flag is set * and the page is read-only, or the write flag was set and the page was * shared so had to be copied, but we ran out of memory. * * This holds a reference to the page, so release_pte() is careful to put that - * back. */ + * back. 
+ */ static unsigned long get_pfn(unsigned long virtpfn, int write) { struct page *page; @@ -198,33 +219,41 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) return -1UL; } -/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table +/*H:340 + * Converting a Guest page table entry to a shadow (ie. real) page table * entry can be a little tricky. The flags are (almost) the same, but the * Guest PTE contains a virtual page number: the CPU needs the real page - * number. */ + * number. + */ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) { unsigned long pfn, base, flags; - /* The Guest sets the global flag, because it thinks that it is using + /* + * The Guest sets the global flag, because it thinks that it is using * PGE. We only told it to use PGE so it would tell us whether it was * flushing a kernel mapping or a userspace mapping. We don't actually - * use the global bit, so throw it away. */ + * use the global bit, so throw it away. + */ flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); /* The Guest's pages are offset inside the Launcher. */ base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; - /* We need a temporary "unsigned long" variable to hold the answer from + /* + * We need a temporary "unsigned long" variable to hold the answer from * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't * fit in spte.pfn. get_pfn() finds the real physical number of the - * page, given the virtual number. */ + * page, given the virtual number. + */ pfn = get_pfn(base + pte_pfn(gpte), write); if (pfn == -1UL) { kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); - /* When we destroy the Guest, we'll go through the shadow page + /* + * When we destroy the Guest, we'll go through the shadow page * tables and release_pte() them. Make sure we don't think - * this one is valid! */ + * this one is valid! + */ flags = 0; } /* Now we assemble our shadow PTE from the page number and flags. */ @@ -234,8 +263,10 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) /*H:460 And to complete the chain, release_pte() looks like this: */ static void release_pte(pte_t pte) { - /* Remember that get_user_pages_fast() took a reference to the page, in - * get_pfn()? We have to put it back now. */ + /* + * Remember that get_user_pages_fast() took a reference to the page, in + * get_pfn()? We have to put it back now. + */ if (pte_flags(pte) & _PAGE_PRESENT) put_page(pte_page(pte)); } @@ -273,7 +304,8 @@ static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) * and return to the Guest without it knowing. * * If we fixed up the fault (ie. we mapped the address), this routine returns - * true. Otherwise, it was a real fault and we need to tell the Guest. */ + * true. Otherwise, it was a real fault and we need to tell the Guest. + */ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) { pgd_t gpgd; @@ -298,22 +330,26 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; } /* We check that the Guest pgd is OK. 
*/ check_gpgd(cpu, gpgd); - /* And we copy the flags to the shadow PGD entry. The page - * number in the shadow PGD is the page we just allocated. */ + /* + * And we copy the flags to the shadow PGD entry. The page + * number in the shadow PGD is the page we just allocated. + */ set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); } #ifdef CONFIG_X86_PAE gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* middle level not present? We can't map it in. */ + /* Middle level not present? We can't map it in. */ if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) return false; @@ -324,8 +360,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; @@ -334,17 +372,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* We check that the Guest pmd is OK. */ check_gpmd(cpu, gpmd); - /* And we copy the flags to the shadow PMD entry. The page - * number in the shadow PMD is the page we just allocated. */ + /* + * And we copy the flags to the shadow PMD entry. The page + * number in the shadow PMD is the page we just allocated. + */ native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); } - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ + /* + * OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. + */ gpte_ptr = gpte_addr(cpu, gpmd, vaddr); #else - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ + /* + * OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. + */ gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif gpte = lgread(cpu, gpte_ptr, pte_t); @@ -353,8 +397,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if (!(pte_flags(gpte) & _PAGE_PRESENT)) return false; - /* Check they're not trying to write to a page the Guest wants - * read-only (bit 2 of errcode == write). */ + /* + * Check they're not trying to write to a page the Guest wants + * read-only (bit 2 of errcode == write). + */ if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) return false; @@ -362,8 +408,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) return false; - /* Check that the Guest PTE flags are OK, and the page number is below - * the pfn_limit (ie. not mapping the Launcher binary). */ + /* + * Check that the Guest PTE flags are OK, and the page number is below + * the pfn_limit (ie. not mapping the Launcher binary). + */ check_gpte(cpu, gpte); /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ @@ -373,29 +421,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* Get the pointer to the shadow PTE entry we're going to set. */ spte = spte_addr(cpu, *spgd, vaddr); - /* If there was a valid shadow PTE entry here before, we release it. - * This can happen with a write to a previously read-only entry. */ + + /* + * If there was a valid shadow PTE entry here before, we release it. 
+ * This can happen with a write to a previously read-only entry. + */ release_pte(*spte); - /* If this is a write, we insist that the Guest page is writable (the - * final arg to gpte_to_spte()). */ + /* + * If this is a write, we insist that the Guest page is writable (the + * final arg to gpte_to_spte()). + */ if (pte_dirty(gpte)) *spte = gpte_to_spte(cpu, gpte, 1); else - /* If this is a read, don't set the "writable" bit in the page + /* + * If this is a read, don't set the "writable" bit in the page * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so - * we can update the Guest's _PAGE_DIRTY flag. */ + * we can update the Guest's _PAGE_DIRTY flag. + */ native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); - /* Finally, we write the Guest PTE entry back: we've set the - * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ + /* + * Finally, we write the Guest PTE entry back: we've set the + * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. + */ lgwrite(cpu, gpte_ptr, pte_t, gpte); - /* The fault is fixed, the page table is populated, the mapping + /* + * The fault is fixed, the page table is populated, the mapping * manipulated, the result returned and the code complete. A small * delay and a trace of alliteration are the only indications the Guest - * has that a page fault occurred at all. */ + * has that a page fault occurred at all. + */ return true; } @@ -408,7 +467,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * mapped, so it's overkill. * * This is a quick version which answers the question: is this virtual address - * mapped by the shadow page tables, and is it writable? */ + * mapped by the shadow page tables, and is it writable? + */ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t *spgd; @@ -428,16 +488,20 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) return false; #endif - /* Check the flags on the pte entry itself: it must be present and - * writable. */ + /* + * Check the flags on the pte entry itself: it must be present and + * writable. + */ flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } -/* So, when pin_stack_pages() asks us to pin a page, we check if it's already +/* + * So, when pin_stack_pages() asks us to pin a page, we check if it's already * in the page tables, and if not, we call demand_page() with error code 2 - * (meaning "write"). */ + * (meaning "write"). + */ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) { if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) @@ -485,9 +549,11 @@ static void release_pgd(pgd_t *spgd) /* If the entry's not present, there's nothing to release. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { unsigned int i; - /* Converting the pfn to find the actual PTE page is easy: turn + /* + * Converting the pfn to find the actual PTE page is easy: turn * the page number into a physical address, then convert to a - * virtual address (easy for kernel pages like this one). */ + * virtual address (easy for kernel pages like this one). + */ pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); /* For each entry in the page, we might need to release it. 
*/ for (i = 0; i < PTRS_PER_PTE; i++) @@ -499,9 +565,12 @@ static void release_pgd(pgd_t *spgd) } } #endif -/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() + +/*H:445 + * We saw flush_user_mappings() twice: once from the flush_user_mappings() * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. - * It simply releases every PTE page from 0 up to the Guest's kernel address. */ + * It simply releases every PTE page from 0 up to the Guest's kernel address. + */ static void flush_user_mappings(struct lguest *lg, int idx) { unsigned int i; @@ -510,10 +579,12 @@ static void flush_user_mappings(struct lguest *lg, int idx) release_pgd(lg->pgdirs[idx].pgdir + i); } -/*H:440 (v) Flushing (throwing away) page tables, +/*H:440 + * (v) Flushing (throwing away) page tables, * * The Guest has a hypercall to throw away the page tables: it's used when a - * large number of mappings have been changed. */ + * large number of mappings have been changed. + */ void guest_pagetable_flush_user(struct lg_cpu *cpu) { /* Drop the userspace part of the current page table. */ @@ -551,9 +622,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); } -/* We keep several page tables. This is a simple routine to find the page +/* + * We keep several page tables. This is a simple routine to find the page * table (if any) corresponding to this top-level address the Guest has given - * us. */ + * us. + */ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) { unsigned int i; @@ -563,9 +636,11 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) return i; } -/*H:435 And this is us, creating the new page directory. If we really do +/*H:435 + * And this is us, creating the new page directory. If we really do * allocate a new one (and so the kernel parts are not there), we set - * blank_pgdir. */ + * blank_pgdir. + */ static unsigned int new_pgdir(struct lg_cpu *cpu, unsigned long gpgdir, int *blank_pgdir) @@ -575,8 +650,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, pmd_t *pmd_table; #endif - /* We pick one entry at random to throw out. Choosing the Least - * Recently Used might be better, but this is easy. */ + /* + * We pick one entry at random to throw out. Choosing the Least + * Recently Used might be better, but this is easy. + */ next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); /* If it's never been allocated at all before, try now. */ if (!cpu->lg->pgdirs[next].pgdir) { @@ -587,8 +664,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, next = cpu->cpu_pgd; else { #ifdef CONFIG_X86_PAE - /* In PAE mode, allocate a pmd page and populate the - * last pgd entry. */ + /* + * In PAE mode, allocate a pmd page and populate the + * last pgd entry. + */ pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd_table) { free_page((long)cpu->lg->pgdirs[next].pgdir); @@ -598,8 +677,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, set_pgd(cpu->lg->pgdirs[next].pgdir + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - /* This is a blank page, so there are no kernel - * mappings: caller must map the stack! */ + /* + * This is a blank page, so there are no kernel + * mappings: caller must map the stack! 
+ */ *blank_pgdir = 1; } #else @@ -615,19 +696,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, return next; } -/*H:430 (iv) Switching page tables +/*H:430 + * (iv) Switching page tables * * Now we've seen all the page table setting and manipulation, let's see * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir). This occurs on almost every context switch. */ + * pgdir). This occurs on almost every context switch. + */ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) { int newpgdir, repin = 0; /* Look to see if we have this one already. */ newpgdir = find_pgdir(cpu->lg, pgtable); - /* If not, we allocate or mug an existing one: if it's a fresh one, - * repin gets set to 1. */ + /* + * If not, we allocate or mug an existing one: if it's a fresh one, + * repin gets set to 1. + */ if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) newpgdir = new_pgdir(cpu, pgtable, &repin); /* Change the current pgd index to the new one. */ @@ -637,9 +722,11 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) pin_stack_pages(cpu); } -/*H:470 Finally, a routine which throws away everything: all PGD entries in all +/*H:470 + * Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used - * when we destroy the Guest. */ + * when we destroy the Guest. + */ static void release_all_pagetables(struct lguest *lg) { unsigned int i, j; @@ -656,8 +743,10 @@ static void release_all_pagetables(struct lguest *lg) spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - /* And release the pmd entries of that pmd page, - * except for the switcher pmd. */ + /* + * And release the pmd entries of that pmd page, + * except for the switcher pmd. + */ for (k = 0; k < SWITCHER_PMD_INDEX; k++) release_pmd(&pmdpage[k]); #endif @@ -667,10 +756,12 @@ static void release_all_pagetables(struct lguest *lg) } } -/* We also throw away everything when a Guest tells us it's changed a kernel +/* + * We also throw away everything when a Guest tells us it's changed a kernel * mapping. Since kernel mappings are in every page table, it's easiest to * throw them all away. This traps the Guest in amber for a while as - * everything faults back in, but it's rare. */ + * everything faults back in, but it's rare. + */ void guest_pagetable_clear_all(struct lg_cpu *cpu) { release_all_pagetables(cpu->lg); @@ -678,15 +769,19 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) pin_stack_pages(cpu); } /*:*/ -/*M:009 Since we throw away all mappings when a kernel mapping changes, our + +/*M:009 + * Since we throw away all mappings when a kernel mapping changes, our * performance sucks for guests using highmem. In fact, a guest with * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is * usually slower than a Guest with less memory. * * This, of course, cannot be fixed. It would take some kind of... well, I - * don't know, but the term "puissant code-fu" comes to mind. :*/ + * don't know, but the term "puissant code-fu" comes to mind. +:*/ -/*H:420 This is the routine which actually sets the page table entry for then +/*H:420 + * This is the routine which actually sets the page table entry for then * "idx"'th shadow page table. 
* * Normally, we can just throw out the old entry and replace it with 0: if they @@ -715,31 +810,36 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, spmd = spmd_addr(cpu, *spgd, vaddr); if (pmd_flags(*spmd) & _PAGE_PRESENT) { #endif - /* Otherwise, we start by releasing - * the existing entry. */ + /* Otherwise, start by releasing the existing entry. */ pte_t *spte = spte_addr(cpu, *spgd, vaddr); release_pte(*spte); - /* If they're setting this entry as dirty or accessed, - * we might as well put that entry they've given us - * in now. This shaves 10% off a - * copy-on-write micro-benchmark. */ + /* + * If they're setting this entry as dirty or accessed, + * we might as well put that entry they've given us in + * now. This shaves 10% off a copy-on-write + * micro-benchmark. + */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { check_gpte(cpu, gpte); native_set_pte(spte, gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY)); - } else - /* Otherwise kill it and we can demand_page() - * it in later. */ + } else { + /* + * Otherwise kill it and we can demand_page() + * it in later. + */ native_set_pte(spte, __pte(0)); + } #ifdef CONFIG_X86_PAE } #endif } } -/*H:410 Updating a PTE entry is a little trickier. +/*H:410 + * Updating a PTE entry is a little trickier. * * We keep track of several different page tables (the Guest uses one for each * process, so it makes sense to cache at least a few). Each of these have @@ -748,12 +848,15 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * all the page tables, not just the current one. This is rare. * * The benefit is that when we have to track a new page table, we can keep all - * the kernel mappings. This speeds up context switch immensely. */ + * the kernel mappings. This speeds up context switch immensely. + */ void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { - /* Kernel mappings must be changed on all top levels. Slow, but doesn't - * happen often. */ + /* + * Kernel mappings must be changed on all top levels. Slow, but doesn't + * happen often. + */ if (vaddr >= cpu->lg->kernel_address) { unsigned int i; for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) @@ -802,12 +905,14 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) } #endif -/* Once we know how much memory we have we can construct simple identity - * (which set virtual == physical) and linear mappings - * which will get the Guest far enough into the boot to create its own. +/* + * Once we know how much memory we have we can construct simple identity (which + * set virtual == physical) and linear mappings which will get the Guest far + * enough into the boot to create its own. * * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). */ + * know its size here). + */ static unsigned long setup_pagetables(struct lguest *lg, unsigned long mem, unsigned long initrd_size) @@ -825,8 +930,10 @@ static unsigned long setup_pagetables(struct lguest *lg, unsigned int phys_linear; #endif - /* We have mapped_pages frames to map, so we need - * linear_pages page tables to map them. */ + /* + * We have mapped_pages frames to map, so we need linear_pages page + * tables to map them. 
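+	 *
+	 * A quick worked example, assuming 4kB pages and the non-PAE
+	 * PTRS_PER_PTE of 1024: a 64MB Guest gives mapped_pages = 16384,
+	 * and the round-up below yields linear_pages = 16 page-table pages.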
+ */ mapped_pages = mem / PAGE_SIZE; linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; @@ -839,8 +946,10 @@ static unsigned long setup_pagetables(struct lguest *lg, #ifdef CONFIG_X86_PAE pmds = (void *)linear - PAGE_SIZE; #endif - /* Linear mapping is easy: put every page's address into the - * mapping in order. */ + /* + * Linear mapping is easy: put every page's address into the + * mapping in order. + */ for (i = 0; i < mapped_pages; i++) { pte_t pte; pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); @@ -848,8 +957,10 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } - /* The top level points to the linear page table pages above. - * We setup the identity and linear mappings here. */ + /* + * The top level points to the linear page table pages above. + * We setup the identity and linear mappings here. + */ #ifdef CONFIG_X86_PAE for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; i += PTRS_PER_PTE, j++) { @@ -880,15 +991,19 @@ static unsigned long setup_pagetables(struct lguest *lg, } #endif - /* We return the top level (guest-physical) address: remember where - * this is. */ + /* + * We return the top level (guest-physical) address: remember where + * this is. + */ return (unsigned long)pgdir - mem_base; } -/*H:500 (vii) Setting up the page tables initially. +/*H:500 + * (vii) Setting up the page tables initially. * * When a Guest is first created, the Launcher tells us where the toplevel of - * its first page table is. We set some things up here: */ + * its first page table is. We set some things up here: + */ int init_guest_pagetable(struct lguest *lg) { u64 mem; @@ -898,14 +1013,18 @@ int init_guest_pagetable(struct lguest *lg) pgd_t *pgd; pmd_t *pmd_table; #endif - /* Get the Guest memory size and the ramdisk size from the boot header - * located at lg->mem_base (Guest address 0). */ + /* + * Get the Guest memory size and the ramdisk size from the boot header + * located at lg->mem_base (Guest address 0). + */ if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) || get_user(initrd_size, &boot->hdr.ramdisk_size)) return -EFAULT; - /* We start on the first shadow page table, and give it a blank PGD - * page. */ + /* + * We start on the first shadow page table, and give it a blank PGD + * page. + */ lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) return lg->pgdirs[0].gpgdir; @@ -931,17 +1050,21 @@ void page_table_guest_data_init(struct lg_cpu *cpu) /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, &cpu->lg->lguest_data->kernel_address) - /* We tell the Guest that it can't use the top 2 or 4 MB - * of virtual addresses used by the Switcher. */ + /* + * We tell the Guest that it can't use the top 2 or 4 MB + * of virtual addresses used by the Switcher. + */ || put_user(RESERVE_MEM * 1024 * 1024, &cpu->lg->lguest_data->reserve_mem) || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* In flush_user_mappings() we loop from 0 to + /* + * In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the - * Switcher mappings, so check that now. */ + * Switcher mappings, so check that now. 
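+	 *
+	 * (For example, with the default PAGE_OFFSET of 0xC0000000 the
+	 * non-PAE pgd_index() is 768, comfortably below SWITCHER_PGD_INDEX
+	 * at 1023.)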
+ */ #ifdef CONFIG_X86_PAE if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) @@ -964,12 +1087,14 @@ void free_guest_pagetable(struct lguest *lg) free_page((long)lg->pgdirs[i].pgdir); } -/*H:480 (vi) Mapping the Switcher when the Guest is about to run. +/*H:480 + * (vi) Mapping the Switcher when the Guest is about to run. * * The Switcher and the two pages for this CPU need to be visible in the * Guest (and not the pages for other CPUs). We have the appropriate PTE pages * for each CPU already set up, we just need to hook them in now we know which - * Guest is about to run on this CPU. */ + * Guest is about to run on this CPU. + */ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); @@ -990,20 +1115,24 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) #else pgd_t switcher_pgd; - /* Make the last PGD entry for this Guest point to the Switcher's PTE - * page for this CPU (with appropriate flags). */ + /* + * Make the last PGD entry for this Guest point to the Switcher's PTE + * page for this CPU (with appropriate flags). + */ switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; #endif - /* We also change the Switcher PTE page. When we're running the Guest, + /* + * We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher * page for this CPU is. This is an optimization: when the Switcher * saves the Guest registers, it saves them into the first page of this * CPU's "struct lguest_pages": if we make sure the Guest's register * page is already mapped there, we don't have to copy them out - * again. */ + * again. + */ pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL)); native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], @@ -1019,10 +1148,12 @@ static void free_switcher_pte_pages(void) free_page((long)switcher_pte_page(i)); } -/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given +/*H:520 + * Setting up the Switcher PTE page for given CPU is fairly easy, given * the CPU number and the "struct page"s for the Switcher code itself. * - * Currently the Switcher is less than a page long, so "pages" is always 1. */ + * Currently the Switcher is less than a page long, so "pages" is always 1. + */ static __init void populate_switcher_pte_page(unsigned int cpu, struct page *switcher_page[], unsigned int pages) @@ -1043,13 +1174,16 @@ static __init void populate_switcher_pte_page(unsigned int cpu, native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); - /* The second page contains the "struct lguest_ro_state", and is - * read-only. */ + /* + * The second page contains the "struct lguest_ro_state", and is + * read-only. + */ native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } -/* We've made it through the page table code. Perhaps our tired brains are +/* + * We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over. 
* * If nothing else, note that all this complexity in juggling shadow page tables @@ -1058,10 +1192,13 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD * have implemented shadow page table support directly into hardware. * - * There is just one file remaining in the Host. */ + * There is just one file remaining in the Host. + */ -/*H:510 At boot or module load time, init_pagetables() allocates and populates - * the Switcher PTE page for each CPU. */ +/*H:510 + * At boot or module load time, init_pagetables() allocates and populates + * the Switcher PTE page for each CPU. + */ __init int init_pagetables(struct page **switcher_page, unsigned int pages) { unsigned int i; diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 482ed5a18750..951c57b0a7e0 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c @@ -1,4 +1,5 @@ -/*P:600 The x86 architecture has segments, which involve a table of descriptors +/*P:600 + * The x86 architecture has segments, which involve a table of descriptors * which can be used to do funky things with virtual address interpretation. * We originally used to use segments so the Guest couldn't alter the * Guest<->Host Switcher, and then we had to trim Guest segments, and restore @@ -8,7 +9,8 @@ * * In these modern times, the segment handling code consists of simple sanity * checks, and the worst you'll experience reading this code is butterfly-rash - * from frolicking through its parklike serenity. :*/ + * from frolicking through its parklike serenity. +:*/ #include "lg.h" /*H:600 @@ -41,10 +43,12 @@ * begin. */ -/* There are several entries we don't let the Guest set. The TSS entry is the +/* + * There are several entries we don't let the Guest set. The TSS entry is the * "Task State Segment" which controls all kinds of delicate things. The * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the - * the Guest can't be trusted to deal with double faults. */ + * the Guest can't be trusted to deal with double faults. + */ static bool ignored_gdt(unsigned int num) { return (num == GDT_ENTRY_TSS @@ -53,42 +57,52 @@ static bool ignored_gdt(unsigned int num) || num == GDT_ENTRY_DOUBLEFAULT_TSS); } -/*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We +/*H:630 + * Once the Guest gave us new GDT entries, we fix them up a little. We * don't care if they're invalid: the worst that can happen is a General * Protection Fault in the Switcher when it restores a Guest segment register * which tries to use that entry. Then we kill the Guest for causing such a - * mess: the message will be "unhandled trap 256". */ + * mess: the message will be "unhandled trap 256". + */ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) { unsigned int i; for (i = start; i < end; i++) { - /* We never copy these ones to real GDT, so we don't care what - * they say */ + /* + * We never copy these ones to real GDT, so we don't care what + * they say + */ if (ignored_gdt(i)) continue; - /* Segment descriptors contain a privilege level: the Guest is + /* + * Segment descriptors contain a privilege level: the Guest is * sometimes careless and leaves this as 0, even though it's - * running at privilege level 1. If so, we fix it here. */ + * running at privilege level 1. If so, we fix it here. 
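+		 *
+		 * (Assuming the usual x86 descriptor layout, the DPL sits in
+		 * bits 13-14 of the descriptor's high word: that is what the
+		 * 0x00006000 mask below tests, and with GUEST_PL == 1 the
+		 * fixup ORs in 0x2000.)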
+ */ if ((cpu->arch.gdt[i].b & 0x00006000) == 0) cpu->arch.gdt[i].b |= (GUEST_PL << 13); - /* Each descriptor has an "accessed" bit. If we don't set it + /* + * Each descriptor has an "accessed" bit. If we don't set it * now, the CPU will try to set it when the Guest first loads * that entry into a segment register. But the GDT isn't - * writable by the Guest, so bad things can happen. */ + * writable by the Guest, so bad things can happen. + */ cpu->arch.gdt[i].b |= 0x00000100; } } -/*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep +/*H:610 + * Like the IDT, we never simply use the GDT the Guest gives us. We keep * a GDT for each CPU, and copy across the Guest's entries each time we want to * run the Guest on that CPU. * * This routine is called at boot or modprobe time for each CPU to set up the * constant GDT entries: the ones which are the same no matter what Guest we're - * running. */ + * running. + */ void setup_default_gdt_entries(struct lguest_ro_state *state) { struct desc_struct *gdt = state->guest_gdt; @@ -98,30 +112,37 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; - /* The TSS segment refers to the TSS entry for this particular CPU. + /* + * The TSS segment refers to the TSS entry for this particular CPU. * Forgive the magic flags: the 0x8900 means the entry is Present, it's * privilege level 0 Available 386 TSS system segment, and the 0x67 - * means Saturn is eclipsed by Mercury in the twelfth house. */ + * means Saturn is eclipsed by Mercury in the twelfth house. + */ gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) | ((tss >> 16) & 0x000000FF); } -/* This routine sets up the initial Guest GDT for booting. All entries start - * as 0 (unusable). */ +/* + * This routine sets up the initial Guest GDT for booting. All entries start + * as 0 (unusable). + */ void setup_guest_gdt(struct lg_cpu *cpu) { - /* Start with full 0-4G segments... */ + /* + * Start with full 0-4G segments...except the Guest is allowed to use + * them, so set the privilege level appropriately in the flags. + */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; - /* ...except the Guest is allowed to use them, so set the privilege - * level appropriately in the flags. */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); } -/*H:650 An optimization of copy_gdt(), for just the three "thead-local storage" - * entries. */ +/*H:650 + * An optimization of copy_gdt(), for just the three "thread-local storage" + * entries. + */ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; @@ -130,26 +151,34 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) gdt[i] = cpu->arch.gdt[i]; } -/*H:640 When the Guest is run on a different CPU, or the GDT entries have - * changed, copy_gdt() is called to copy the Guest's GDT entries across to this - * CPU's GDT. */ +/*H:640 + * When the Guest is run on a different CPU, or the GDT entries have changed, + * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's + * GDT. + */ void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; - /* The default entries from setup_default_gdt_entries() are not - * replaced. See ignored_gdt() above.
*/ + /* + * The default entries from setup_default_gdt_entries() are not + * replaced. See ignored_gdt() above. + */ for (i = 0; i < GDT_ENTRIES; i++) if (!ignored_gdt(i)) gdt[i] = cpu->arch.gdt[i]; } -/*H:620 This is where the Guest asks us to load a new GDT entry - * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. */ +/*H:620 + * This is where the Guest asks us to load a new GDT entry + * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. + */ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) { - /* We assume the Guest has the same number of GDT entries as the - * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ + /* + * We assume the Guest has the same number of GDT entries as the + * Host, otherwise we'd have to dynamically allocate the Guest GDT. + */ if (num >= ARRAY_SIZE(cpu->arch.gdt)) kill_guest(cpu, "too many gdt entries %i", num); @@ -157,15 +186,19 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) cpu->arch.gdt[num].a = lo; cpu->arch.gdt[num].b = hi; fixup_gdt_table(cpu, num, num+1); - /* Mark that the GDT changed so the core knows it has to copy it again, - * even if the Guest is run on the same CPU. */ + /* + * Mark that the GDT changed so the core knows it has to copy it again, + * even if the Guest is run on the same CPU. + */ cpu->changed |= CHANGED_GDT; } -/* This is the fast-track version for just changing the three TLS entries. +/* + * This is the fast-track version for just changing the three TLS entries. * Remember that this happens on every context switch, so it's worth * optimizing. But wouldn't it be neater to have a single hypercall to cover - * both cases? */ + * both cases? + */ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) { struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; @@ -175,7 +208,6 @@ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) /* Note that just the TLS entries have changed. */ cpu->changed |= CHANGED_GDT_TLS; } -/*:*/ /*H:660 * With this, we have finished the Host. diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index eaf722fe309a..96f7d88ec7f8 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -17,13 +17,15 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/*P:450 This file contains the x86-specific lguest code. It used to be all +/*P:450 + * This file contains the x86-specific lguest code. It used to be all * mixed in with drivers/lguest/core.c but several foolhardy code slashers * wrestled most of the dependencies out to here in preparation for porting * lguest to other architectures (see what I mean by foolhardy?). * * This also contains a couple of non-obvious setup and teardown pieces which - * were implemented after days of debugging pain. :*/ + * were implemented after days of debugging pain. +:*/ #include #include #include @@ -82,25 +84,33 @@ static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); */ static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) { - /* Copying all this data can be quite expensive. We usually run the + /* + * Copying all this data can be quite expensive. We usually run the * same Guest we ran last time (and that Guest hasn't run anywhere else * meanwhile). If that's not the case, we pretend everything in the - * Guest has changed. */ + * Guest has changed. 
+ */ if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { __get_cpu_var(last_cpu) = cpu; cpu->last_pages = pages; cpu->changed = CHANGED_ALL; } - /* These copies are pretty cheap, so we do them unconditionally: */ - /* Save the current Host top-level page directory. */ + /* + * These copies are pretty cheap, so we do them unconditionally: */ + /* Save the current Host top-level page directory. + */ pages->state.host_cr3 = __pa(current->mm->pgd); - /* Set up the Guest's page tables to see this CPU's pages (and no - * other CPU's pages). */ + /* + * Set up the Guest's page tables to see this CPU's pages (and no + * other CPU's pages). + */ map_switcher_in_guest(cpu, pages); - /* Set up the two "TSS" members which tell the CPU what stack to use + /* + * Set up the two "TSS" members which tell the CPU what stack to use * for traps which do directly into the Guest (ie. traps at privilege - * level 1). */ + * level 1). + */ pages->state.guest_tss.sp1 = cpu->esp1; pages->state.guest_tss.ss1 = cpu->ss1; @@ -125,40 +135,53 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) /* This is a dummy value we need for GCC's sake. */ unsigned int clobber; - /* Copy the guest-specific information into this CPU's "struct - * lguest_pages". */ + /* + * Copy the guest-specific information into this CPU's "struct + * lguest_pages". + */ copy_in_guest_info(cpu, pages); - /* Set the trap number to 256 (impossible value). If we fault while + /* + * Set the trap number to 256 (impossible value). If we fault while * switching to the Guest (bad segment registers or bug), this will - * cause us to abort the Guest. */ + * cause us to abort the Guest. + */ cpu->regs->trapnum = 256; - /* Now: we push the "eflags" register on the stack, then do an "lcall". + /* + * Now: we push the "eflags" register on the stack, then do an "lcall". * This is how we change from using the kernel code segment to using * the dedicated lguest code segment, as well as jumping into the * Switcher. * * The lcall also pushes the old code segment (KERNEL_CS) onto the * stack, then the address of this call. This stack layout happens to - * exactly match the stack layout created by an interrupt... */ + * exactly match the stack layout created by an interrupt... + */ asm volatile("pushf; lcall *lguest_entry" - /* This is how we tell GCC that %eax ("a") and %ebx ("b") - * are changed by this routine. The "=" means output. */ + /* + * This is how we tell GCC that %eax ("a") and %ebx ("b") + * are changed by this routine. The "=" means output. + */ : "=a"(clobber), "=b"(clobber) - /* %eax contains the pages pointer. ("0" refers to the + /* + * %eax contains the pages pointer. ("0" refers to the * 0-th argument above, ie "a"). %ebx contains the * physical address of the Guest's top-level page - * directory. */ + * directory. + */ : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) - /* We tell gcc that all these registers could change, + /* + * We tell gcc that all these registers could change, * which means we don't have to save and restore them in - * the Switcher. */ + * the Switcher. + */ : "memory", "%edx", "%ecx", "%edi", "%esi"); } /*:*/ -/*M:002 There are hooks in the scheduler which we can register to tell when we +/*M:002 + * There are hooks in the scheduler which we can register to tell when we * get kicked off the CPU (preempt_notifier_register()). This would allow us * to lazily disable SYSENTER which would regain some performance, and should * also simplify copy_in_guest_info(). 
Note that we'd still need to restore @@ -166,56 +189,72 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * * We could also try using this hooks for PGE, but that might be too expensive. * - * The hooks were designed for KVM, but we can also put them to good use. :*/ + * The hooks were designed for KVM, but we can also put them to good use. +:*/ -/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts - * are disabled: we own the CPU. */ +/*H:040 + * This is the i386-specific code to setup and run the Guest. Interrupts + * are disabled: we own the CPU. + */ void lguest_arch_run_guest(struct lg_cpu *cpu) { - /* Remember the awfully-named TS bit? If the Guest has asked to set it + /* + * Remember the awfully-named TS bit? If the Guest has asked to set it * we set it now, so we can trap and pass that trap to the Guest if it - * uses the FPU. */ + * uses the FPU. + */ if (cpu->ts) unlazy_fpu(current); - /* SYSENTER is an optimized way of doing system calls. We can't allow + /* + * SYSENTER is an optimized way of doing system calls. We can't allow * it because it always jumps to privilege level 0. A normal Guest * won't try it because we don't advertise it in CPUID, but a malicious * Guest (or malicious Guest userspace program) could, so we tell the - * CPU to disable it before running the Guest. */ + * CPU to disable it before running the Guest. + */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); - /* Now we actually run the Guest. It will return when something + /* + * Now we actually run the Guest. It will return when something * interesting happens, and we can examine its registers to see what it - * was doing. */ + * was doing. + */ run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); - /* Note that the "regs" structure contains two extra entries which are + /* + * Note that the "regs" structure contains two extra entries which are * not really registers: a trap number which says what interrupt or * trap made the switcher code come back, and an error code which some - * traps set. */ + * traps set. + */ /* Restore SYSENTER if it's supposed to be on. */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - /* If the Guest page faulted, then the cr2 register will tell us the + /* + * If the Guest page faulted, then the cr2 register will tell us the * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite - * cr2, or we could even move off to a different CPU. */ + * cr2, or we could even move off to a different CPU. + */ if (cpu->regs->trapnum == 14) cpu->arch.last_pagefault = read_cr2(); - /* Similarly, if we took a trap because the Guest used the FPU, + /* + * Similarly, if we took a trap because the Guest used the FPU, * we have to restore the FPU it expects to see. * math_state_restore() may sleep and we may even move off to * a different CPU. So all the critical stuff should be done - * before this. */ + * before this. + */ else if (cpu->regs->trapnum == 7) math_state_restore(); } -/*H:130 Now we've examined the hypercall code; our Guest can make requests. +/*H:130 + * Now we've examined the hypercall code; our Guest can make requests. * Our Guest is usually so well behaved; it never tries to do things it isn't * allowed to, and uses hypercalls instead. 
Unfortunately, Linux's paravirtual * infrastructure isn't quite complete, because it doesn't contain replacements @@ -225,26 +264,33 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * * When the Guest uses one of these instructions, we get a trap (General * Protection Fault) and come here. We see if it's one of those troublesome - * instructions and skip over it. We return true if we did. */ + * instructions and skip over it. We return true if we did. + */ static int emulate_insn(struct lg_cpu *cpu) { u8 insn; unsigned int insnlen = 0, in = 0, shift = 0; - /* The eip contains the *virtual* address of the Guest's instruction: - * guest_pa just subtracts the Guest's page_offset. */ + /* + * The eip contains the *virtual* address of the Guest's instruction: + * guest_pa just subtracts the Guest's page_offset. + */ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); - /* This must be the Guest kernel trying to do something, not userspace! + /* + * This must be the Guest kernel trying to do something, not userspace! * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ insn = lgread(cpu, physaddr, u8); - /* 0x66 is an "operand prefix". It means it's using the upper 16 bits - of the eax register. */ + /* + * 0x66 is an "operand prefix". It means it's using the upper 16 bits + * of the eax register. + */ if (insn == 0x66) { shift = 16; /* The instruction is 1 byte so far, read the next byte. */ @@ -252,8 +298,10 @@ static int emulate_insn(struct lg_cpu *cpu) insn = lgread(cpu, physaddr + insnlen, u8); } - /* We can ignore the lower bit for the moment and decode the 4 opcodes - * we need to emulate. */ + /* + * We can ignore the lower bit for the moment and decode the 4 opcodes + * we need to emulate. + */ switch (insn & 0xFE) { case 0xE4: /* in ,%al */ insnlen += 2; @@ -274,9 +322,11 @@ static int emulate_insn(struct lg_cpu *cpu) return 0; } - /* If it was an "IN" instruction, they expect the result to be read + /* + * If it was an "IN" instruction, they expect the result to be read * into %eax, so we change %eax. We always return all-ones, which - * traditionally means "there's nothing there". */ + * traditionally means "there's nothing there". + */ if (in) { /* Lower bit tells is whether it's a 16 or 32 bit access */ if (insn & 0x1) @@ -290,7 +340,8 @@ static int emulate_insn(struct lg_cpu *cpu) return 1; } -/* Our hypercalls mechanism used to be based on direct software interrupts. +/* + * Our hypercalls mechanism used to be based on direct software interrupts. * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to * change over to using kvm hypercalls. * @@ -318,16 +369,20 @@ static int emulate_insn(struct lg_cpu *cpu) */ static void rewrite_hypercall(struct lg_cpu *cpu) { - /* This are the opcodes we use to patch the Guest. The opcode for "int + /* + * This are the opcodes we use to patch the Guest. The opcode for "int * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we - * complete the sequence with a NOP (0x90). */ + * complete the sequence with a NOP (0x90). + */ u8 insn[3] = {0xcd, 0x1f, 0x90}; __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); - /* The above write might have caused a copy of that page to be made + /* + * The above write might have caused a copy of that page to be made * (if it was read-only). We need to make sure the Guest has * up-to-date pagetables. 
As this doesn't happen often, we can just - * drop them all. */ + * drop them all. + */ guest_pagetable_clear_all(cpu); } @@ -335,9 +390,11 @@ static bool is_hypercall(struct lg_cpu *cpu) { u8 insn[3]; - /* This must be the Guest kernel trying to do something. + /* + * This must be the Guest kernel trying to do something. * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return false; @@ -351,86 +408,105 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) { switch (cpu->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ - /* Check if this was one of those annoying IN or OUT + /* + * Check if this was one of those annoying IN or OUT * instructions which we need to emulate. If so, we just go - * back into the Guest after we've done it. */ + * back into the Guest after we've done it. + */ if (cpu->regs->errcode == 0) { if (emulate_insn(cpu)) return; } - /* If KVM is active, the vmcall instruction triggers a - * General Protection Fault. Normally it triggers an - * invalid opcode fault (6): */ + /* + * If KVM is active, the vmcall instruction triggers a General + * Protection Fault. Normally it triggers an invalid opcode + * fault (6): + */ case 6: - /* We need to check if ring == GUEST_PL and - * faulting instruction == vmcall. */ + /* + * We need to check if ring == GUEST_PL and faulting + * instruction == vmcall. + */ if (is_hypercall(cpu)) { rewrite_hypercall(cpu); return; } break; case 14: /* We've intercepted a Page Fault. */ - /* The Guest accessed a virtual address that wasn't mapped. + /* + * The Guest accessed a virtual address that wasn't mapped. * This happens a lot: we don't actually set up most of the page * tables for the Guest at all when we start: as it runs it asks * for more and more, and we set them up as required. In this * case, we don't even tell the Guest that the fault happened. * * The errcode tells whether this was a read or a write, and - * whether kernel or userspace code. */ + * whether kernel or userspace code. + */ if (demand_page(cpu, cpu->arch.last_pagefault, cpu->regs->errcode)) return; - /* OK, it's really not there (or not OK): the Guest needs to + /* + * OK, it's really not there (or not OK): the Guest needs to * know. We write out the cr2 value so it knows where the * fault occurred. * * Note that if the Guest were really messed up, this could * happen before it's done the LHCALL_LGUEST_INIT hypercall, so - * lg->lguest_data could be NULL */ + * lg->lguest_data could be NULL + */ if (cpu->lg->lguest_data && put_user(cpu->arch.last_pagefault, &cpu->lg->lguest_data->cr2)) kill_guest(cpu, "Writing cr2"); break; case 7: /* We've intercepted a Device Not Available fault. */ - /* If the Guest doesn't want to know, we already restored the - * Floating Point Unit, so we just continue without telling - * it. */ + /* + * If the Guest doesn't want to know, we already restored the + * Floating Point Unit, so we just continue without telling it. + */ if (!cpu->ts) return; break; case 32 ... 255: - /* These values mean a real interrupt occurred, in which case + /* + * These values mean a real interrupt occurred, in which case * the Host handler has already been run. 
We just do a * friendly check if another process should now be run, then - * return to run the Guest again */ + * return to run the Guest again + */ cond_resched(); return; case LGUEST_TRAP_ENTRY: - /* Our 'struct hcall_args' maps directly over our regs: we set - * up the pointer now to indicate a hypercall is pending. */ + /* + * Our 'struct hcall_args' maps directly over our regs: we set + * up the pointer now to indicate a hypercall is pending. + */ cpu->hcall = (struct hcall_args *)cpu->regs; return; } /* We didn't handle the trap, so it needs to go to the Guest. */ if (!deliver_trap(cpu, cpu->regs->trapnum)) - /* If the Guest doesn't have a handler (either it hasn't + /* + * If the Guest doesn't have a handler (either it hasn't * registered any yet, or it's one of the faults we don't let - * it handle), it dies with this cryptic error message. */ + * it handle), it dies with this cryptic error message. + */ kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", cpu->regs->trapnum, cpu->regs->eip, cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault : cpu->regs->errcode); } -/* Now we can look at each of the routines this calls, in increasing order of +/* + * Now we can look at each of the routines this calls, in increasing order of * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), * deliver_trap() and demand_page(). After all those, we'll be ready to * examine the Switcher, and our philosophical understanding of the Host/Guest - * duality will be complete. :*/ + * duality will be complete. +:*/ static void adjust_pge(void *on) { if (on) @@ -439,13 +515,16 @@ static void adjust_pge(void *on) write_cr4(read_cr4() & ~X86_CR4_PGE); } -/*H:020 Now the Switcher is mapped and every thing else is ready, we need to do - * some more i386-specific initialization. */ +/*H:020 + * Now the Switcher is mapped and every thing else is ready, we need to do + * some more i386-specific initialization. + */ void __init lguest_arch_host_init(void) { int i; - /* Most of the i386/switcher.S doesn't care that it's been moved; on + /* + * Most of the i386/switcher.S doesn't care that it's been moved; on * Intel, jumps are relative, and it doesn't access any references to * external code or data. * @@ -453,7 +532,8 @@ void __init lguest_arch_host_init(void) * addresses are placed in a table (default_idt_entries), so we need to * update the table with the new addresses. switcher_offset() is a * convenience function which returns the distance between the - * compiled-in switcher code and the high-mapped copy we just made. */ + * compiled-in switcher code and the high-mapped copy we just made. + */ for (i = 0; i < IDT_ENTRIES; i++) default_idt_entries[i] += switcher_offset(); @@ -468,63 +548,81 @@ void __init lguest_arch_host_init(void) for_each_possible_cpu(i) { /* lguest_pages() returns this CPU's two pages. */ struct lguest_pages *pages = lguest_pages(i); - /* This is a convenience pointer to make the code fit one - * statement to a line. */ + /* This is a convenience pointer to make the code neater. */ struct lguest_ro_state *state = &pages->state; - /* The Global Descriptor Table: the Host has a different one + /* + * The Global Descriptor Table: the Host has a different one * for each CPU. We keep a descriptor for the GDT which says * where it is and how big it is (the size is actually the last - * byte, not the size, hence the "-1"). */ + * byte, not the size, hence the "-1"). 
+ */ state->host_gdt_desc.size = GDT_SIZE-1; state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); - /* All CPUs on the Host use the same Interrupt Descriptor + /* + * All CPUs on the Host use the same Interrupt Descriptor * Table, so we just use store_idt(), which gets this CPU's IDT - * descriptor. */ + * descriptor. + */ store_idt(&state->host_idt_desc); - /* The descriptors for the Guest's GDT and IDT can be filled + /* + * The descriptors for the Guest's GDT and IDT can be filled * out now, too. We copy the GDT & IDT into ->guest_gdt and - * ->guest_idt before actually running the Guest. */ + * ->guest_idt before actually running the Guest. + */ state->guest_idt_desc.size = sizeof(state->guest_idt)-1; state->guest_idt_desc.address = (long)&state->guest_idt; state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; state->guest_gdt_desc.address = (long)&state->guest_gdt; - /* We know where we want the stack to be when the Guest enters + /* + * We know where we want the stack to be when the Guest enters * the Switcher: in pages->regs. The stack grows upwards, so - * we start it at the end of that structure. */ + * we start it at the end of that structure. + */ state->guest_tss.sp0 = (long)(&pages->regs + 1); - /* And this is the GDT entry to use for the stack: we keep a - * couple of special LGUEST entries. */ + /* + * And this is the GDT entry to use for the stack: we keep a + * couple of special LGUEST entries. + */ state->guest_tss.ss0 = LGUEST_DS; - /* x86 can have a finegrained bitmap which indicates what I/O + /* + * x86 can have a finegrained bitmap which indicates what I/O * ports the process can use. We set it to the end of our - * structure, meaning "none". */ + * structure, meaning "none". + */ state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); - /* Some GDT entries are the same across all Guests, so we can - * set them up now. */ + /* + * Some GDT entries are the same across all Guests, so we can + * set them up now. + */ setup_default_gdt_entries(state); /* Most IDT entries are the same for all Guests, too.*/ setup_default_idt_entries(state, default_idt_entries); - /* The Host needs to be able to use the LGUEST segments on this - * CPU, too, so put them in the Host GDT. */ + /* + * The Host needs to be able to use the LGUEST segments on this + * CPU, too, so put them in the Host GDT. + */ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; } - /* In the Switcher, we want the %cs segment register to use the + /* + * In the Switcher, we want the %cs segment register to use the * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so * it will be undisturbed when we switch. To change %cs and jump we - * need this structure to feed to Intel's "lcall" instruction. */ + * need this structure to feed to Intel's "lcall" instruction. + */ lguest_entry.offset = (long)switch_to_guest + switcher_offset(); lguest_entry.segment = LGUEST_CS; - /* Finally, we need to turn off "Page Global Enable". PGE is an + /* + * Finally, we need to turn off "Page Global Enable". PGE is an * optimization where page table entries are specially marked to show * they never change. The Host kernel marks all the kernel pages this * way because it's always present, even when userspace is running. @@ -534,16 +632,21 @@ void __init lguest_arch_host_init(void) * you'll get really weird bugs that you'll chase for two days. 
* * I used to turn PGE off every time we switched to the Guest and back - * on when we return, but that slowed the Switcher down noticibly. */ + * on when we return, but that slowed the Switcher down noticibly. + */ - /* We don't need the complexity of CPUs coming and going while we're - * doing this. */ + /* + * We don't need the complexity of CPUs coming and going while we're + * doing this. + */ get_online_cpus(); if (cpu_has_pge) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). */ cpu_had_pge = 1; - /* adjust_pge is a helper function which sets or unsets the PGE - * bit on its CPU, depending on the argument (0 == unset). */ + /* + * adjust_pge is a helper function which sets or unsets the PGE + * bit on its CPU, depending on the argument (0 == unset). + */ on_each_cpu(adjust_pge, (void *)0, 1); /* Turn off the feature in the global feature set. */ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); @@ -590,26 +693,32 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) { u32 tsc_speed; - /* The pointer to the Guest's "struct lguest_data" is the only argument. - * We check that address now. */ + /* + * The pointer to the Guest's "struct lguest_data" is the only argument. + * We check that address now. + */ if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, sizeof(*cpu->lg->lguest_data))) return -EFAULT; - /* Having checked it, we simply set lg->lguest_data to point straight + /* + * Having checked it, we simply set lg->lguest_data to point straight * into the Launcher's memory at the right place and then use * copy_to_user/from_user from now on, instead of lgread/write. I put * this in to show that I'm not immune to writing stupid - * optimizations. */ + * optimizations. + */ cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; - /* We insist that the Time Stamp Counter exist and doesn't change with + /* + * We insist that the Time Stamp Counter exist and doesn't change with * cpu frequency. Some devious chip manufacturers decided that TSC * changes could be handled in software. I decided that time going * backwards might be good for benchmarks, but it's bad for users. * * We also insist that the TSC be stable: the kernel detects unreliable - * TSCs for its own purposes, and we use that here. */ + * TSCs for its own purposes, and we use that here. + */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) tsc_speed = tsc_khz; else @@ -625,38 +734,47 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) } /*:*/ -/*L:030 lguest_arch_setup_regs() +/*L:030 + * lguest_arch_setup_regs() * * Most of the Guest's registers are left alone: we used get_zeroed_page() to - * allocate the structure, so they will be 0. */ + * allocate the structure, so they will be 0. + */ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) { struct lguest_regs *regs = cpu->regs; - /* There are four "segment" registers which the Guest needs to boot: + /* + * There are four "segment" registers which the Guest needs to boot: * The "code segment" register (cs) refers to the kernel code segment * __KERNEL_CS, and the "data", "extra" and "stack" segment registers * refer to the kernel data segment __KERNEL_DS. * * The privilege level is packed into the lower bits. The Guest runs - * at privilege level 1 (GUEST_PL).*/ + * at privilege level 1 (GUEST_PL). + */ regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; regs->cs = __KERNEL_CS|GUEST_PL; - /* The "eflags" register contains miscellaneous flags. 
Bit 1 (0x002) + /* + * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) * is supposed to always be "1". Bit 9 (0x200) controls whether * interrupts are enabled. We always leave interrupts enabled while - * running the Guest. */ + * running the Guest. + */ regs->eflags = X86_EFLAGS_IF | 0x2; - /* The "Extended Instruction Pointer" register says where the Guest is - * running. */ + /* + * The "Extended Instruction Pointer" register says where the Guest is + * running. + */ regs->eip = start; - /* %esi points to our boot information, at physical address 0, so don't - * touch it. */ + /* + * %esi points to our boot information, at physical address 0, so don't + * touch it. + */ - /* There are a couple of GDT entries the Guest expects when first - * booting. */ + /* There are a couple of GDT entries the Guest expects at boot. */ setup_guest_gdt(cpu); } diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 3fc15318a80f..6dec09793836 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S @@ -1,12 +1,15 @@ -/*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the +/*P:900 + * This is the Switcher: code which sits at 0xFFC00000 astride both the * Host and Guest to do the low-level Guest<->Host switch. It is as simple as * it can be made, but it's naturally very specific to x86. * * You have now completed Preparation. If this has whet your appetite; if you * are feeling invigorated and refreshed then the next, more challenging stage - * can be found in "make Guest". :*/ + * can be found in "make Guest". + :*/ -/*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must +/*M:012 + * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must * gain at least 1% more performance. Since neither LOC nor performance can be * measured beforehand, it generally means implementing a feature then deciding * if it's worth it. And once it's implemented, who can say no? @@ -31,11 +34,14 @@ * Host (which is actually really easy). * * Two questions remain. Would the performance gain outweigh the complexity? - * And who would write the verse documenting it? :*/ + * And who would write the verse documenting it? +:*/ -/*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their +/*M:011 + * Lguest64 handles NMI. This gave me NMI envy (until I looked at their * code). It's worth doing though, since it would let us use oprofile in the - * Host when a Guest is running. :*/ + * Host when a Guest is running. +:*/ /*S:100 * Welcome to the Switcher itself! diff --git a/include/linux/lguest.h b/include/linux/lguest.h index dbf2479e808e..0a3a11afd64b 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -1,5 +1,7 @@ -/* Things the lguest guest needs to know. Note: like all lguest interfaces, - * this is subject to wild and random change between versions. */ +/* + * Things the lguest guest needs to know. Note: like all lguest interfaces, + * this is subject to wild and random change between versions. + */ #ifndef _LINUX_LGUEST_H #define _LINUX_LGUEST_H @@ -11,32 +13,42 @@ #define LG_CLOCK_MIN_DELTA 100UL #define LG_CLOCK_MAX_DELTA ULONG_MAX -/*G:031 The second method of communicating with the Host is to via "struct +/*G:031 + * The second method of communicating with the Host is to via "struct * lguest_data". Once the Guest's initialization hypercall tells the Host where - * this is, the Guest and Host both publish information in it. 
:*/ + * this is, the Guest and Host both publish information in it. +:*/ struct lguest_data { - /* 512 == enabled (same as eflags in normal hardware). The Guest - * changes interrupts so often that a hypercall is too slow. */ + /* + * 512 == enabled (same as eflags in normal hardware). The Guest + * changes interrupts so often that a hypercall is too slow. + */ unsigned int irq_enabled; /* Fine-grained interrupt disabling by the Guest */ DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); - /* The Host writes the virtual address of the last page fault here, + /* + * The Host writes the virtual address of the last page fault here, * which saves the Guest a hypercall. CR2 is the native register where - * this address would normally be found. */ + * this address would normally be found. + */ unsigned long cr2; /* Wallclock time set by the Host. */ struct timespec time; - /* Interrupt pending set by the Host. The Guest should do a hypercall - * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */ + /* + * Interrupt pending set by the Host. The Guest should do a hypercall + * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). + */ int irq_pending; - /* Async hypercall ring. Instead of directly making hypercalls, we can + /* + * Async hypercall ring. Instead of directly making hypercalls, we can * place them in here for processing the next time the Host wants. - * This batching can be quite efficient. */ + * This batching can be quite efficient. + */ /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ u8 hcall_status[LHCALL_RING_SIZE]; diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index bfefbdf7498a..495203ff221c 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -29,8 +29,10 @@ struct lguest_device_desc { __u8 type; /* The number of virtqueues (first in config array) */ __u8 num_vq; - /* The number of bytes of feature bits. Multiply by 2: one for host - * features and one for Guest acknowledgements. */ + /* + * The number of bytes of feature bits. Multiply by 2: one for host + * features and one for Guest acknowledgements. + */ __u8 feature_len; /* The number of bytes of the config array after virtqueues. */ __u8 config_len; @@ -39,8 +41,10 @@ struct lguest_device_desc { __u8 config[0]; }; -/*D:135 This is how we expect the device configuration field for a virtqueue - * to be laid out in config space. */ +/*D:135 + * This is how we expect the device configuration field for a virtqueue + * to be laid out in config space. + */ struct lguest_vqconfig { /* The number of entries in the virtio_ring */ __u16 num; @@ -61,7 +65,9 @@ enum lguest_req LHREQ_EVENTFD, /* + address, fd. */ }; -/* The alignment to use between consumer and producer parts of vring. - * x86 pagesize for historical reasons. */ +/* + * The alignment to use between consumer and producer parts of vring. + * x86 pagesize for historical reasons. + */ #define LGUEST_VRING_ALIGN 4096 #endif /* _LINUX_LGUEST_LAUNCHER */ -- cgit v1.2.3 From a91d74a3c4de8115295ee87350c13a329164aaaf Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:45 -0600 Subject: lguest: update commentry Every so often, after code shuffles, I need to go through and unbitrot the Lguest Journey (see drivers/lguest/README). Since we now use RCU in a simple form in one place I took the opportunity to expand that explanation. 
Signed-off-by: Rusty Russell Cc: Ingo Molnar Cc: Paul McKenney --- Documentation/lguest/lguest.c | 184 +++++++++++++++++++++++++++--------- arch/x86/include/asm/lguest_hcall.h | 8 +- arch/x86/lguest/boot.c | 99 ++++++++++++++----- arch/x86/lguest/i386_head.S | 2 + drivers/lguest/core.c | 7 +- drivers/lguest/hypercalls.c | 6 +- drivers/lguest/lguest_device.c | 11 ++- drivers/lguest/lguest_user.c | 100 ++++++++++++++++++-- drivers/lguest/page_tables.c | 84 ++++++++++++---- drivers/lguest/x86/core.c | 2 +- drivers/lguest/x86/switcher_32.S | 6 +- 11 files changed, 398 insertions(+), 111 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index aa66a52b73e9..45163651b519 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -49,7 +49,7 @@ #include "linux/virtio_ring.h" #include "asm/bootparam.h" /*L:110 - * We can ignore the 39 include files we need for this program, but I do want + * We can ignore the 42 include files we need for this program, but I do want * to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I @@ -305,6 +305,11 @@ static void *map_zeroed_pages(unsigned int num) PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); if (addr == MAP_FAILED) err(1, "Mmaping %u pages of /dev/zero", num); + + /* + * One neat mmap feature is that you can close the fd, and it + * stays mapped. + */ close(fd); return addr; @@ -557,7 +562,7 @@ static void tell_kernel(unsigned long start) } /*:*/ -/* +/*L:200 * Device Handling. * * When the Guest gives us a buffer, it sends an array of addresses and sizes. @@ -608,7 +613,10 @@ static unsigned next_desc(struct vring_desc *desc, return next; } -/* This actually sends the interrupt for this virtqueue */ +/* + * This actually sends the interrupt for this virtqueue, if we've used a + * buffer. + */ static void trigger_irq(struct virtqueue *vq) { unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; @@ -629,12 +637,12 @@ static void trigger_irq(struct virtqueue *vq) } /* - * This looks in the virtqueue and for the first available buffer, and converts + * This looks in the virtqueue for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * - * This function returns the descriptor number found. + * This function waits if necessary, and returns the descriptor number found. */ static unsigned wait_for_vq_desc(struct virtqueue *vq, struct iovec iov[], @@ -644,10 +652,14 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, struct vring_desc *desc; u16 last_avail = lg_last_avail(vq); + /* There's nothing available? */ while (last_avail == vq->vring.avail->idx) { u64 event; - /* OK, tell Guest about progress up to now. */ + /* + * Since we're about to sleep, now is a good time to tell the + * Guest about what we've used up to now. + */ trigger_irq(vq); /* OK, now we need to know about added descriptors. */ @@ -734,8 +746,9 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, } /* - * After we've used one of their buffers, we tell them about it. We'll then - * want to send them an interrupt, using trigger_irq(). + * After we've used one of their buffers, we tell the Guest about it. 
Sometime + * later we'll want to send them an interrupt using trigger_irq(); note that + * wait_for_vq_desc() does that for us if it has to wait. */ static void add_used(struct virtqueue *vq, unsigned int head, int len) { @@ -782,12 +795,12 @@ static void console_input(struct virtqueue *vq) struct console_abort *abort = vq->dev->priv; struct iovec iov[vq->vring.num]; - /* Make sure there's a descriptor waiting. */ + /* Make sure there's a descriptor available. */ head = wait_for_vq_desc(vq, iov, &out_num, &in_num); if (out_num) errx(1, "Output buffers in console in queue?"); - /* Read it in. */ + /* Read into it. This is where we usually wait. */ len = readv(STDIN_FILENO, iov, in_num); if (len <= 0) { /* Ran out of input? */ @@ -800,6 +813,7 @@ static void console_input(struct virtqueue *vq) pause(); } + /* Tell the Guest we used a buffer. */ add_used_and_trigger(vq, head, len); /* @@ -834,15 +848,23 @@ static void console_output(struct virtqueue *vq) unsigned int head, out, in; struct iovec iov[vq->vring.num]; + /* We usually wait in here, for the Guest to give us something. */ head = wait_for_vq_desc(vq, iov, &out, &in); if (in) errx(1, "Input buffers in console output queue?"); + + /* writev can return a partial write, so we loop here. */ while (!iov_empty(iov, out)) { int len = writev(STDOUT_FILENO, iov, out); if (len <= 0) err(1, "Write to stdout gave %i", len); iov_consume(iov, out, len); } + + /* + * We're finished with that buffer: if we're going to sleep, + * wait_for_vq_desc() will prod the Guest with an interrupt. + */ add_used(vq, head, 0); } @@ -862,15 +884,30 @@ static void net_output(struct virtqueue *vq) unsigned int head, out, in; struct iovec iov[vq->vring.num]; + /* We usually wait in here for the Guest to give us a packet. */ head = wait_for_vq_desc(vq, iov, &out, &in); if (in) errx(1, "Input buffers in net output queue?"); + /* + * Send the whole thing through to /dev/net/tun. It expects the exact + * same format: what a coincidence! + */ if (writev(net_info->tunfd, iov, out) < 0) errx(1, "Write to tun failed?"); + + /* + * Done with that one; wait_for_vq_desc() will send the interrupt if + * all packets are processed. + */ add_used(vq, head, 0); } -/* Will reading from this file descriptor block? */ +/* + * Handling network input is a bit trickier, because I've tried to optimize it. + * + * First we have a helper routine which tells is if from this file descriptor + * (ie. the /dev/net/tun device) will block: + */ static bool will_block(int fd) { fd_set fdset; @@ -880,7 +917,11 @@ static bool will_block(int fd) return select(fd+1, &fdset, NULL, NULL, &zero) != 1; } -/* This handles packets coming in from the tun device to our Guest. */ +/* + * This handles packets coming in from the tun device to our Guest. Like all + * service routines, it gets called again as soon as it returns, so you don't + * see a while(1) loop here. + */ static void net_input(struct virtqueue *vq) { int len; @@ -888,21 +929,38 @@ static void net_input(struct virtqueue *vq) struct iovec iov[vq->vring.num]; struct net_info *net_info = vq->dev->priv; + /* + * Get a descriptor to write an incoming packet into. This will also + * send an interrupt if they're out of descriptors. + */ head = wait_for_vq_desc(vq, iov, &out, &in); if (out) errx(1, "Output buffers in net input queue?"); - /* Deliver interrupt now, since we're about to sleep. */ + /* + * If it looks like we'll block reading from the tun device, send them + * an interrupt. 
+ */ if (vq->pending_used && will_block(net_info->tunfd)) trigger_irq(vq); + /* + * Read in the packet. This is where we normally wait (when there's no + * incoming network traffic). + */ len = readv(net_info->tunfd, iov, in); if (len <= 0) err(1, "Failed to read from tun."); + + /* + * Mark that packet buffer as used, but don't interrupt here. We want + * to wait until we've done as much work as we can. + */ add_used(vq, head, len); } +/*:*/ -/* This is the helper to create threads. */ +/* This is the helper to create threads: run the service routine in a loop. */ static int do_thread(void *_vq) { struct virtqueue *vq = _vq; @@ -950,11 +1008,14 @@ static void reset_device(struct device *dev) signal(SIGCHLD, (void *)kill_launcher); } +/*L:216 + * This actually creates the thread which services the virtqueue for a device. + */ static void create_thread(struct virtqueue *vq) { /* - * Create stack for thread and run it. Since the stack grows upwards, - * we point the stack pointer to the end of this region. + * Create stack for thread. Since the stack grows upwards, we point + * the stack pointer to the end of this region. */ char *stack = malloc(32768); unsigned long args[] = { LHREQ_EVENTFD, @@ -966,17 +1027,22 @@ static void create_thread(struct virtqueue *vq) err(1, "Creating eventfd"); args[2] = vq->eventfd; - /* Attach an eventfd to this virtqueue: it will go off - * when the Guest does an LHCALL_NOTIFY for this vq. */ + /* + * Attach an eventfd to this virtqueue: it will go off when the Guest + * does an LHCALL_NOTIFY for this vq. + */ if (write(lguest_fd, &args, sizeof(args)) != 0) err(1, "Attaching eventfd"); - /* CLONE_VM: because it has to access the Guest memory, and - * SIGCHLD so we get a signal if it dies. */ + /* + * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so + * we get a signal if it dies. + */ vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); if (vq->thread == (pid_t)-1) err(1, "Creating clone"); - /* We close our local copy, now the child has it. */ + + /* We close our local copy now the child has it. */ close(vq->eventfd); } @@ -1028,7 +1094,10 @@ static void update_device_status(struct device *dev) } } -/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ +/*L:215 + * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In + * particular, it's used to notify us of device status changes during boot. + */ static void handle_output(unsigned long addr) { struct device *i; @@ -1037,18 +1106,32 @@ static void handle_output(unsigned long addr) for (i = devices.dev; i; i = i->next) { struct virtqueue *vq; - /* Notifications to device descriptors update device status. */ + /* + * Notifications to device descriptors mean they updated the + * device status. + */ if (from_guest_phys(addr) == i->desc) { update_device_status(i); return; } - /* Devices *can* be used before status is set to DRIVER_OK. */ + /* + * Devices *can* be used before status is set to DRIVER_OK. + * The original plan was that they would never do this: they + * would always finish setting up their status bits before + * actually touching the virtqueues. In practice, we allowed + * them to, and they do (eg. the disk probes for partition + * tables as part of initialization). + * + * If we see this, we start the device: once it's running, we + * expect the device to catch all the notifications. 
+ */ for (vq = i->vq; vq; vq = vq->next) { if (addr != vq->config.pfn*getpagesize()) continue; if (i->running) errx(1, "Notification on running %s", i->name); + /* This just calls create_thread() for each virtqueue */ start_device(i); return; } @@ -1132,6 +1215,11 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, vq->next = NULL; vq->last_avail_idx = 0; vq->dev = dev; + + /* + * This is the routine the service thread will run, and its Process ID + * once it's running. + */ vq->service = service; vq->thread = (pid_t)-1; @@ -1202,7 +1290,8 @@ static void set_config(struct device *dev, unsigned len, const void *conf) /* * This routine does all the creation and setup of a new device, including - * calling new_dev_desc() to allocate the descriptor and device memory. + * calling new_dev_desc() to allocate the descriptor and device memory. We + * don't actually start the service threads until later. * * See what I mean about userspace being boring? */ @@ -1478,19 +1567,7 @@ static void setup_tun_net(char *arg) verbose("device %u: tun %s: %s\n", devices.device_num, tapif, arg); } - -/* - * Our block (disk) device should be really simple: the Guest asks for a block - * number and we read or write that position in the file. Unfortunately, that - * was amazingly slow: the Guest waits until the read is finished before - * running anything else, even if it could have been doing useful work. - * - * We could use async I/O, except it's reputed to suck so hard that characters - * actually go missing from your code when you try to use it. - * - * So this was one reason why lguest now does all virtqueue servicing in - * separate threads: it's more efficient and more like a real device. - */ +/*:*/ /* This hangs off device->priv. */ struct vblk_info @@ -1512,8 +1589,16 @@ struct vblk_info /*L:210 * The Disk * - * Remember that the block device is handled by a separate I/O thread. We head - * straight into the core of that thread here: + * The disk only has one virtqueue, so it only has one thread. It is really + * simple: the Guest asks for a block number and we read or write that position + * in the file. + * + * Before we serviced each virtqueue in a separate thread, that was unacceptably + * slow: the Guest waits until the read is finished before running anything + * else, even if it could have been doing useful work. + * + * We could have used async I/O, except it's reputed to suck so hard that + * characters actually go missing from your code when you try to use it. */ static void blk_request(struct virtqueue *vq) { @@ -1525,7 +1610,10 @@ static void blk_request(struct virtqueue *vq) struct iovec iov[vq->vring.num]; off64_t off; - /* Get the next request. */ + /* + * Get the next request, where we normally wait. It triggers the + * interrupt to acknowledge previously serviced requests (if any). + */ head = wait_for_vq_desc(vq, iov, &out_num, &in_num); /* @@ -1539,6 +1627,10 @@ static void blk_request(struct virtqueue *vq) out = convert(&iov[0], struct virtio_blk_outhdr); in = convert(&iov[out_num+in_num-1], u8); + /* + * For historical reasons, block operations are expressed in 512 byte + * "sectors". + */ off = out->sector * 512; /* @@ -1614,6 +1706,7 @@ static void blk_request(struct virtqueue *vq) if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); + /* Finished that request. 
*/ add_used(vq, head, wlen); } @@ -1682,9 +1775,8 @@ static void rng_input(struct virtqueue *vq) errx(1, "Output buffers in rng?"); /* - * This is why we convert to iovecs: the readv() call uses them, and so - * it reads straight into the Guest's buffer. We loop to make sure we - * fill it. + * Just like the console write, we loop to cover the whole iovec. + * In this case, short reads actually happen quite a bit. */ while (!iov_empty(iov, in_num)) { len = readv(rng_info->rfd, iov, in_num); @@ -1818,7 +1910,9 @@ int main(int argc, char *argv[]) devices.lastdev = NULL; devices.next_irq = 1; + /* We're CPU 0. In fact, that's the only CPU possible right now. */ cpu_id = 0; + /* * We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command @@ -1926,7 +2020,7 @@ int main(int argc, char *argv[]) */ tell_kernel(start); - /* Ensure that we terminate if a child dies. */ + /* Ensure that we terminate if a device-servicing child dies. */ signal(SIGCHLD, kill_launcher); /* If we exit via err(), this kills all the threads, restores tty. */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index cceb73e12e50..ba0eed8aa1a6 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -35,10 +35,10 @@ * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism. Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. + * We use the KVM hypercall mechanism, though completely different hypercall + * numbers. Seventeen hypercalls are available: the hypercall number is put in + * the %eax register, and the arguments (when required) are placed in %ebx, + * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 025c04d18f2b..d677fa9ca650 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -154,6 +154,7 @@ static void lazy_hcall1(unsigned long call, async_hcall(call, arg1, 0, 0, 0); } +/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ static void lazy_hcall2(unsigned long call, unsigned long arg1, unsigned long arg2) @@ -189,8 +190,10 @@ static void lazy_hcall4(unsigned long call, } #endif -/* When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue the do-nothing hypercall to flush any stored calls. */ +/*G:036 + * When lazy mode is turned off reset the per-cpu lazy mode variable and then + * issue the do-nothing hypercall to flush any stored calls. +:*/ static void lguest_leave_lazy_mmu_mode(void) { kvm_hypercall0(LHCALL_FLUSH_ASYNC); @@ -250,13 +253,11 @@ extern void lg_irq_enable(void); extern void lg_restore_fl(unsigned long flags); /*M:003 - * Note that we don't check for outstanding interrupts when we re-enable them - * (or when we unmask an interrupt). This seems to work for the moment, since - * interrupts are rare and we'll just get the interrupt on the next timer tick, - * but now we can run with CONFIG_NO_HZ, we should revisit this. 
One way would - * be to put the "irq_enabled" field in a page by itself, and have the Host - * write-protect it when an interrupt comes in when irqs are disabled. There - * will then be a page fault as soon as interrupts are re-enabled. + * We could be more efficient in our checking of outstanding interrupts, rather + * than using a branch. One way would be to put the "irq_enabled" field in a + * page by itself, and have the Host write-protect it when an interrupt comes + * in when irqs are disabled. There will then be a page fault as soon as + * interrupts are re-enabled. * * A better method is to implement soft interrupt disable generally for x86: * instead of disabling interrupts, we set a flag. If an interrupt does come @@ -568,7 +569,7 @@ static void lguest_write_cr4(unsigned long val) * cr3 ---> +---------+ * | --------->+---------+ * | | | PADDR1 | - * Top-level | | PADDR2 | + * Mid-level | | PADDR2 | * (PMD) page | | | * | | Lower-level | * | | (PTE) page | @@ -588,23 +589,62 @@ static void lguest_write_cr4(unsigned long val) * Index into top Index into second Offset within page * page directory page pagetable page * - * The kernel spends a lot of time changing both the top-level page directory - * and lower-level pagetable pages. The Guest doesn't know physical addresses, - * so while it maintains these page tables exactly like normal, it also needs - * to keep the Host informed whenever it makes a change: the Host will create - * the real page tables based on the Guests'. + * Now, unfortunately, this isn't the whole story: Intel added Physical Address + * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). + * These are held in 64-bit page table entries, so we can now only fit 512 + * entries in a page, and the neat three-level tree breaks down. + * + * The result is a four level page table: + * + * cr3 --> [ 4 Upper ] + * [ Level ] + * [ Entries ] + * [(PUD Page)]---> +---------+ + * | --------->+---------+ + * | | | PADDR1 | + * Mid-level | | PADDR2 | + * (PMD) page | | | + * | | Lower-level | + * | | (PTE) page | + * | | | | + * .... .... + * + * + * And the virtual address is decoded as: + * + * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| + * Index into Index into mid Index into lower Offset within page + * top entries directory page pagetable page + * + * It's too hard to switch between these two formats at runtime, so Linux only + * supports one or the other depending on whether CONFIG_X86_PAE is set. Many + * distributions turn it on, and not just for people with silly amounts of + * memory: the larger PTE entries allow room for the NX bit, which lets the + * kernel disable execution of pages and increase security. + * + * This was a problem for lguest, which couldn't run on these distributions; + * then Matias Zabaljauregui figured it all out and implemented it, and only a + * handful of puppies were crushed in the process! + * + * Back to our point: the kernel spends a lot of time changing both the + * top-level page directory and lower-level pagetable pages. The Guest doesn't + * know physical addresses, so while it maintains these page tables exactly + * like normal, it also needs to keep the Host informed whenever it makes a + * change: the Host will create the real page tables based on the Guests'. */ /* - * The Guest calls this to set a second-level entry (pte), ie. to map a page - * into a process' address space. 
We set the entry then tell the Host the - * toplevel and address this corresponds to. The Guest uses one pagetable per - * process, so we need to tell the Host which one we're changing (mm->pgd). + * The Guest calls this after it has set a second-level entry (pte), ie. to map + * a page into a process' address space. Wetell the Host the toplevel and + * address this corresponds to. The Guest uses one pagetable per process, so + * we need to tell the Host which one we're changing (mm->pgd). */ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_X86_PAE + /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low, ptep->pte_high); #else @@ -612,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, #endif } +/* This is the "set and update" combo-meal-deal version. */ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { @@ -672,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) } #ifdef CONFIG_X86_PAE +/* + * With 64-bit PTE values, we need to be careful setting them: if we set 32 + * bits at a time, the hardware could see a weird half-set entry. These + * versions ensure we update all 64 bits at once. + */ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) { native_set_pte_atomic(ptep, pte); @@ -679,13 +725,14 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } -void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { native_pte_clear(mm, addr, ptep); lguest_pte_update(mm, addr, ptep); } -void lguest_pmd_clear(pmd_t *pmdp) +static void lguest_pmd_clear(pmd_t *pmdp) { lguest_set_pmd(pmdp, __pmd(0)); } @@ -784,6 +831,14 @@ static void __init lguest_init_IRQ(void) irq_ctx_init(smp_processor_id()); } +/* + * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so + * rather than set them in lguest_init_IRQ we are called here every time an + * lguest device needs an interrupt. + * + * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should + * pass that up! + */ void lguest_setup_irq(unsigned int irq) { irq_to_desc_alloc_node(irq, 0); @@ -1298,7 +1353,7 @@ __init void lguest_init(void) */ switch_to_new_gdt(0); - /* As described in head_32.S, we map the first 128M of memory. */ + /* We actually boot with all memory mapped, but let's say 128MB. */ max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; /* diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index db6aa95eb054..27eac0faee48 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -102,6 +102,7 @@ send_interrupts: * create one manually here. */ .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + /* Put eax back the way we found it. */ popl %eax ret @@ -125,6 +126,7 @@ ENTRY(lg_restore_fl) jnz send_interrupts /* Again, the normal path has used no extra registers. Clever, huh? */ ret +/*:*/ /* These demark the EIP range where host should never deliver interrupts. 
*/ .global lguest_noirq_start diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index cd058bc903ff..1e2cb846b3c9 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -217,10 +217,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* * It's possible the Guest did a NOTIFY hypercall to the - * Launcher, in which case we return from the read() now. + * Launcher. */ if (cpu->pending_notify) { + /* + * Does it just needs to write to a registered + * eventfd (ie. the appropriate virtqueue thread)? + */ if (!send_notify_to_eventfd(cpu)) { + /* OK, we tell the main Laucher. */ if (put_user(cpu->pending_notify, user)) return -EFAULT; return sizeof(cpu->pending_notify); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 787ab4bc09f0..83511eb0923d 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -59,7 +59,7 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) case LHCALL_SHUTDOWN: { char msg[128]; /* - * Shutdown is such a trivial hypercall that we do it in four + * Shutdown is such a trivial hypercall that we do it in five * lines right here. * * If the lgread fails, it will call kill_guest() itself; the @@ -245,6 +245,10 @@ static void initialize(struct lg_cpu *cpu) * device), the Guest will still see the old page. In practice, this never * happens: why would the Guest read a page which it has never written to? But * a similar scenario might one day bite us, so it's worth mentioning. + * + * Note that if we used a shared anonymous mapping in the Launcher instead of + * mapping /dev/zero private, we wouldn't worry about cop-on-write. And we + * need that to switch the Launcher to processes (away from threads) anyway. :*/ /*H:100 diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index cc000e79c3d1..1401c1ace1ec 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -236,7 +236,7 @@ static void lg_notify(struct virtqueue *vq) extern void lguest_setup_irq(unsigned int irq); /* - * This routine finds the first virtqueue described in the configuration of + * This routine finds the Nth virtqueue described in the configuration of * this device and sets it up. * * This is kind of an ugly duckling. It'd be nicer to have a standard @@ -244,9 +244,6 @@ extern void lguest_setup_irq(unsigned int irq); * everyone wants to do it differently. The KVM coders want the Guest to * allocate its own pages and tell the Host where they are, but for lguest it's * simpler for the Host to simply tell us where the pages are. - * - * So we provide drivers with a "find the Nth virtqueue and set it up" - * function. */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, @@ -422,7 +419,11 @@ static void add_lguest_device(struct lguest_device_desc *d, /* This devices' parent is the lguest/ dir. */ ldev->vdev.dev.parent = lguest_root; - /* We have a unique device index thanks to the dev_index counter. */ + /* + * The device type comes straight from the descriptor. There's also a + * device vendor field in the virtio_device struct, which we leave as + * 0. 
+ */ ldev->vdev.id.device = d->type; /* * We have a simple set of routines for querying the device's diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 7e92017103dc..b4d3f7ca554f 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,9 +1,8 @@ -/*P:200 - * This contains all the /dev/lguest code, whereby the userspace launcher +/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher * controls and communicates with the Guest. For example, the first write will - * tell us the Guest's memory layout, pagetable, entry point and kernel address - * offset. A read will run the Guest until something happens, such as a signal - * or the Guest doing a NOTIFY out to the Launcher. + * tell us the Guest's memory layout and entry point. A read will run the + * Guest until something happens, such as a signal or the Guest doing a NOTIFY + * out to the Launcher. :*/ #include #include @@ -13,14 +12,41 @@ #include #include "lg.h" +/*L:056 + * Before we move on, let's jump ahead and look at what the kernel does when + * it needs to look up the eventfds. That will complete our picture of how we + * use RCU. + * + * The notification value is in cpu->pending_notify: we return true if it went + * to an eventfd. + */ bool send_notify_to_eventfd(struct lg_cpu *cpu) { unsigned int i; struct lg_eventfd_map *map; - /* lg->eventfds is RCU-protected */ + /* + * This "rcu_read_lock()" helps track when someone is still looking at + * the (RCU-using) eventfds array. It's not actually a lock at all; + * indeed it's a noop in many configurations. (You didn't expect me to + * explain all the RCU secrets here, did you?) + */ rcu_read_lock(); + /* + * rcu_dereference is the counter-side of rcu_assign_pointer(); it + * makes sure we don't access the memory pointed to by + * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, + * but Alpha allows this! Paul McKenney points out that a really + * aggressive compiler could have the same effect: + * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html + * + * So play safe, use rcu_dereference to get the rcu-protected pointer: + */ map = rcu_dereference(cpu->lg->eventfds); + /* + * Simple array search: even if they add an eventfd while we do this, + * we'll continue to use the old array and just won't see the new one. + */ for (i = 0; i < map->num; i++) { if (map->map[i].addr == cpu->pending_notify) { eventfd_signal(map->map[i].event, 1); @@ -28,14 +54,43 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) break; } } + /* We're done with the rcu-protected variable cpu->lg->eventfds. */ rcu_read_unlock(); + + /* If we cleared the notification, it's because we found a match. */ return cpu->pending_notify == 0; } +/*L:055 + * One of the more tricksy tricks in the Linux Kernel is a technique called + * Read Copy Update. Since one point of lguest is to teach lguest journeyers + * about kernel coding, I use it here. (In case you're curious, other purposes + * include learning about virtualization and instilling a deep appreciation for + * simplicity and puppies). + * + * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we + * add new eventfds without ever blocking readers from accessing the array. + * The current Launcher only does this during boot, so that never happens. But + * Read Copy Update is cool, and adding a lock risks damaging even more puppies + * than this code does. 
+ * + * We allocate a brand new one-larger array, copy the old one and add our new + * element. Then we make the lg eventfd pointer point to the new array. + * That's the easy part: now we need to free the old one, but we need to make + * sure no slow CPU somewhere is still looking at it. That's what + * synchronize_rcu does for us: waits until every CPU has indicated that it has + * moved on to know it's no longer using the old one. + * + * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. + */ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) { struct lg_eventfd_map *new, *old = lg->eventfds; + /* + * We don't allow notifications on value 0 anyway (pending_notify of + * 0 means "nothing pending"). + */ if (!addr) return -EINVAL; @@ -62,12 +117,20 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) } new->num++; - /* Now put new one in place. */ + /* + * Now put new one in place: rcu_assign_pointer() is a fancy way of + * doing "lg->eventfds = new", but it uses memory barriers to make + * absolutely sure that the contents of "new" written above is nailed + * down before we actually do the assignment. + * + * We have to think about these kinds of things when we're operating on + * live data without locks. + */ rcu_assign_pointer(lg->eventfds, new); /* * We're not in a big hurry. Wait until noone's looking at old - * version, then delete it. + * version, then free it. */ synchronize_rcu(); kfree(old); @@ -75,6 +138,14 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) return 0; } +/*L:052 + * Receiving notifications from the Guest is usually done by attaching a + * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will + * become readable when the Guest does an LHCALL_NOTIFY with that value. + * + * This is really convenient for processing each virtqueue in a separate + * thread. + */ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) { unsigned long addr, fd; @@ -86,6 +157,11 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) if (get_user(fd, input) != 0) return -EFAULT; + /* + * Just make sure two callers don't add eventfds at once. We really + * only need to lock against callers adding to the same Guest, so using + * the Big Lguest Lock is overkill. But this is setup, not a fast path. + */ mutex_lock(&lguest_lock); err = add_eventfd(lg, addr, fd); mutex_unlock(&lguest_lock); @@ -106,6 +182,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) if (irq >= LGUEST_IRQS) return -EINVAL; + /* + * Next time the Guest runs, the core code will see if it can deliver + * this interrupt. + */ set_interrupt(cpu, irq); return 0; } @@ -307,10 +387,10 @@ unlock: * The first operation the Launcher does must be a write. All writes * start with an unsigned long number: for the first write this must be * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use - * writes of other values to send interrupts. + * writes of other values to send interrupts or set up receipt of notifications. * * Note that we overload the "offset" in the /dev/lguest file to indicate what - * CPU number we're dealing with. Currently this is always 0, since we only + * CPU number we're dealing with. Currently this is always 0 since we only * support uniprocessor Guests, but you can see the beginnings of SMP support * here. 
*/ diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 3da902e4b4cb..a8d0aee3bc0e 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -29,10 +29,10 @@ /*H:300 * The Page Table Code * - * We use two-level page tables for the Guest. If you're not entirely - * comfortable with virtual addresses, physical addresses and page tables then - * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with - * diagrams!). + * We use two-level page tables for the Guest, or three-level with PAE. If + * you're not entirely comfortable with virtual addresses, physical addresses + * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page + * Table Handling" (with diagrams!). * * The Guest keeps page tables, but we maintain the actual ones here: these are * called "shadow" page tables. Which is a very Guest-centric name: these are @@ -52,9 +52,8 @@ :*/ /* - * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is - * conveniently placed at the top 4MB, so it uses a separate, complete PTE - * page. + * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) + * or 512 PTE entries with PAE (2MB). */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) @@ -81,7 +80,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); /*H:320 * The page table code is curly enough to need helper functions to keep it - * clear and clean. + * clear and clean. The kernel itself provides many of them; one advantage + * of insisting that the Guest and Host use the same CONFIG_PAE setting. * * There are two functions which return pointers to the shadow (aka "real") * page tables. @@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } /* - * These two functions just like the above two, except they access the Guest + * These functions are just like the above two, except they access the Guest * page tables. Hence they return a Guest address. */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) @@ -165,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) } #ifdef CONFIG_X86_PAE +/* Follow the PGD to the PMD. */ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; @@ -172,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) return gpage + pmd_index(vaddr) * sizeof(pmd_t); } +/* Follow the PMD to the PTE. */ static unsigned long gpte_addr(struct lg_cpu *cpu, pmd_t gpmd, unsigned long vaddr) { @@ -181,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, return gpage + pte_index(vaddr) * sizeof(pte_t); } #else +/* Follow the PGD to the PTE (no mid-level for !PAE). */ static unsigned long gpte_addr(struct lg_cpu *cpu, pgd_t gpgd, unsigned long vaddr) { @@ -314,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pte_t gpte; pte_t *spte; + /* Mid level for PAE. */ #ifdef CONFIG_X86_PAE pmd_t *spmd; pmd_t gpmd; @@ -391,6 +395,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) */ gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif + + /* Read the actual PTE value. */ gpte = lgread(cpu, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. 
*/ @@ -507,6 +513,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) kill_guest(cpu, "bad stack page %#lx", vaddr); } +/*:*/ #ifdef CONFIG_X86_PAE static void release_pmd(pmd_t *spmd) @@ -543,7 +550,11 @@ static void release_pgd(pgd_t *spgd) } #else /* !CONFIG_X86_PAE */ -/*H:450 If we chase down the release_pgd() code, it looks like this: */ +/*H:450 + * If we chase down the release_pgd() code, the non-PAE version looks like + * this. The PAE version is almost identical, but instead of calling + * release_pte it calls release_pmd(), which looks much like this. + */ static void release_pgd(pgd_t *spgd) { /* If the entry's not present, there's nothing to release. */ @@ -898,17 +909,21 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); } + #ifdef CONFIG_X86_PAE +/* For setting a mid-level, we just throw everything away. It's easy. */ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) { guest_pagetable_clear_all(&lg->cpus[0]); } #endif -/* - * Once we know how much memory we have we can construct simple identity (which +/*H:505 + * To get through boot, we construct simple identity page mappings (which * set virtual == physical) and linear mappings which will get the Guest far - * enough into the boot to create its own. + * enough into the boot to create its own. The linear mapping means we + * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, + * as you'll see. * * We lay them out of the way, just below the initrd (which is why we need to * know its size here). @@ -944,6 +959,10 @@ static unsigned long setup_pagetables(struct lguest *lg, linear = (void *)pgdir - linear_pages * PAGE_SIZE; #ifdef CONFIG_X86_PAE + /* + * And the single mid page goes below that. We only use one, but + * that's enough to map 1G, which definitely gets us through boot. + */ pmds = (void *)linear - PAGE_SIZE; #endif /* @@ -957,13 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } +#ifdef CONFIG_X86_PAE /* - * The top level points to the linear page table pages above. - * We setup the identity and linear mappings here. + * Make the Guest PMD entries point to the corresponding place in the + * linear mapping (up to one page worth of PMD). */ -#ifdef CONFIG_X86_PAE for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; i += PTRS_PER_PTE, j++) { + /* FIXME: native_set_pmd is overkill here. */ native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); @@ -971,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } + /* One PGD entry, pointing to that PMD page. */ set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); + /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) return -EFAULT; + /* + * And the third PGD entry (ie. addresses 3G-4G). + * + * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. + */ if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) return -EFAULT; #else + /* + * The top level points to the linear page table pages above. + * We setup the identity and linear mappings here. + */ phys_linear = (unsigned long)linear - mem_base; for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { pgd_t pgd; + /* + * Create a PGD entry which points to the right part of the + * linear PTE pages. 
+ */ pgd = __pgd((phys_linear + i * sizeof(pte_t)) | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); + /* + * Copy it into the PGD page at 0 and PAGE_OFFSET. + */ if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) + i / PTRS_PER_PTE], @@ -992,8 +1030,8 @@ static unsigned long setup_pagetables(struct lguest *lg, #endif /* - * We return the top level (guest-physical) address: remember where - * this is. + * We return the top level (guest-physical) address: we remember where + * this is to write it into lguest_data when the Guest initializes. */ return (unsigned long)pgdir - mem_base; } @@ -1031,7 +1069,9 @@ int init_guest_pagetable(struct lguest *lg) lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[0].pgdir) return -ENOMEM; + #ifdef CONFIG_X86_PAE + /* For PAE, we also create the initial mid-level. */ pgd = lg->pgdirs[0].pgdir; pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); if (!pmd_table) @@ -1040,11 +1080,13 @@ int init_guest_pagetable(struct lguest *lg) set_pgd(pgd + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); #endif + + /* This is the current page table. */ lg->cpus[0].cpu_pgd = 0; return 0; } -/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ +/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ void page_table_guest_data_init(struct lg_cpu *cpu) { /* We get the kernel address: above this is all kernel memory. */ @@ -1105,12 +1147,16 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) pmd_t switcher_pmd; pmd_t *pmd_table; + /* FIXME: native_set_pmd is overkill here. */ native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + /* Figure out where the pmd page is, by reading the PGD, and converting + * it to a virtual address. */ pmd_table = __va(pgd_pfn(cpu->lg-> pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) << PAGE_SHIFT); + /* Now write it into the shadow page table. */ native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); #else pgd_t switcher_pgd; diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 96f7d88ec7f8..6ae388849a3b 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -187,7 +187,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * also simplify copy_in_guest_info(). Note that we'd still need to restore * things when we exit to Launcher userspace, but that's fairly easy. * - * We could also try using this hooks for PGE, but that might be too expensive. + * We could also try using these hooks for PGE, but that might be too expensive. * * The hooks were designed for KVM, but we can also put them to good use. :*/ diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 6dec09793836..40634b0db9f7 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S @@ -1,7 +1,7 @@ /*P:900 - * This is the Switcher: code which sits at 0xFFC00000 astride both the - * Host and Guest to do the low-level Guest<->Host switch. It is as simple as - * it can be made, but it's naturally very specific to x86. + * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride + * both the Host and Guest to do the low-level Guest<->Host switch. It is as + * simple as it can be made, but it's naturally very specific to x86. * * You have now completed Preparation. 
If this has whet your appetite; if you * are feeling invigorated and refreshed then the next, more challenging stage -- cgit v1.2.3 From bdc6340f4eb68295b1e7c0ade2356b56dca93d93 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 30 Jul 2009 14:43:19 -0700 Subject: x86, pat: Fix set_memory_wc related corruption Changeset 3869c4aa18835c8c61b44bd0f3ace36e9d3b5bd0 that went in after 2.6.30-rc1 was a seemingly small change to _set_memory_wc() to make it complaint with SDM requirements. But, introduced a nasty bug, which can result in crash and/or strange corruptions when set_memory_wc is used. One such crash reported here http://lkml.org/lkml/2009/7/30/94 Actually, that changeset introduced two bugs. * change_page_attr_set() takes &addr as first argument and can the addr value might have changed on return, even for single page change_page_attr_set() call. That will make the second change_page_attr_set() in this routine operate on unrelated addr, that can eventually cause strange corruptions and bad page state crash. * The second change_page_attr_set() call, before setting _PAGE_CACHE_WC, should clear the earlier _PAGE_CACHE_UC_MINUS, as otherwise cache attribute will not be WC (will be UC instead). The patch below fixes both these problems. Sending a single patch to fix both the problems, as the change is to the same line of code. The change to have a addr_copy is not very clean. But, it is simpler than making more changes through various routines in pageattr.c. A huge thanks to Jerome for reporting this problem and providing a simple test case that helped us root cause the problem. Reported-by: Jerome Glisse Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha LKML-Reference: <20090730214319.GA1889@linux-os.sc.intel.com> Acked-by: Dave Airlie Signed-off-by: H. Peter Anvin --- arch/x86/mm/pageattr.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1b734d7a8966..895d90e1a81b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -997,12 +997,15 @@ EXPORT_SYMBOL(set_memory_array_uc); int _set_memory_wc(unsigned long addr, int numpages) { int ret; + unsigned long addr_copy = addr; + ret = change_page_attr_set(&addr, numpages, __pgprot(_PAGE_CACHE_UC_MINUS), 0); - if (!ret) { - ret = change_page_attr_set(&addr, numpages, - __pgprot(_PAGE_CACHE_WC), 0); + ret = change_page_attr_set_clr(&addr_copy, numpages, + __pgprot(_PAGE_CACHE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, 0, NULL); } return ret; } -- cgit v1.2.3 From 8523acfe40efc1a8d3da8f473ca67cb195b06f0c Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Mon, 3 Aug 2009 09:25:45 +0200 Subject: x86: Fix CPA memtype reserving in the set_pages_array*() cases The code was incorrectly reserving memtypes using the page virtual address instead of the physical address. Furthermore, the code was not ignoring highmem pages as it ought to. ( upstream does not pass in highmem pages yet - but upcoming graphics code will do it and there's no reason to not handle this properly in the CPA APIs.) 
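The mix-up the patch corrects is easy to state in one helper. Below is an illustrative sketch, not part of the patch: memtype reservations describe physical ranges, so the start address must come from the pfn rather than from page_address(), which yields a kernel virtual address and does not exist at all for highmem pages.

/*
 * Illustrative sketch only: compute the physical range that
 * reserve_memtype()/free_memtype() expect for a single page.
 */
static int page_memtype_range(struct page *page, u64 *start, u64 *end)
{
	if (PageHighMem(page))
		return -EINVAL;		/* no fixed mapping; callers skip these */

	*start = (u64)page_to_pfn(page) << PAGE_SHIFT;
	*end   = *start + PAGE_SIZE;
	return 0;
}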
Fixes: http://bugzilla.kernel.org/show_bug.cgi?id=13884 Signed-off-by: Thomas Hellstrom Acked-by: Suresh Siddha Cc: Cc: dri-devel@lists.sourceforge.net Cc: venkatesh.pallipadi@intel.com LKML-Reference: <1249284345-7654-1-git-send-email-thellstrom@vmware.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 895d90e1a81b..7e600c1962db 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -591,9 +591,12 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) unsigned int level; pte_t *kpte, old_pte; - if (cpa->flags & CPA_PAGES_ARRAY) - address = (unsigned long)page_address(cpa->pages[cpa->curpage]); - else if (cpa->flags & CPA_ARRAY) + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + address = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) address = cpa->vaddr[cpa->curpage]; else address = *cpa->vaddr; @@ -697,9 +700,12 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the direct * mapping already: */ - if (cpa->flags & CPA_PAGES_ARRAY) - vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); - else if (cpa->flags & CPA_ARRAY) + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + vaddr = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) vaddr = cpa->vaddr[cpa->curpage]; else vaddr = *cpa->vaddr; @@ -1122,7 +1128,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray) int free_idx; for (i = 0; i < addrinarray; i++) { - start = (unsigned long)page_address(pages[i]); + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) goto err_out; @@ -1135,7 +1143,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray) err_out: free_idx = i; for (i = 0; i < free_idx; i++) { - start = (unsigned long)page_address(pages[i]); + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; free_memtype(start, end); } @@ -1164,7 +1174,9 @@ int set_pages_array_wb(struct page **pages, int addrinarray) return retval; for (i = 0; i < addrinarray; i++) { - start = (unsigned long)page_address(pages[i]); + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; free_memtype(start, end); } -- cgit v1.2.3 From 6c6c51e4cc11a5456fb1172008f7c69d955af9f6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 3 Aug 2009 22:47:32 +1000 Subject: x86: Add quirk to make Apple MacBook5,2 use reboot=pci The latest Apple MacBook (MacBook5,2) doesn't reboot successfully under Linux; neither the EFI reboot method nor the default method using the keyboard controller works (the system just hangs and doesn't reset). However, the method using the "PCI reset register" at 0xcf9 does work. This adds a quirk to detect this machine via DMI and force the reboot_type to BOOT_CF9. With this it reboots successfully without requiring a command-line option. Note that the EFI code forces reboot_type to BOOT_EFI when the machine is booted via EFI, but this overrides that since the core_initcall runs after the EFI initialization code. 
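The DMI table added below is easy to extend if more models turn out to need the same treatment; a hypothetical extra entry (the product name is invented purely for illustration and is not a machine known to need this workaround) would look like:

	{ /* Hypothetical example entry -- not part of this patch */
		.callback = set_pci_reboot,
		.ident = "Apple MacBook Pro",
		.matches = {
			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
			DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5,5"),
		},
	},

(The reboot=pci in the title is the command-line spelling of the same BOOT_CF9 method, which can be used to force it by hand on machines without a quirk entry.)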
Signed-off-by: Paul Mackerras LKML-Reference: <19062.56420.501516.316181@cargo.ozlabs.ibm.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/reboot.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 508e982dd072..834c9da8bf9d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -17,7 +18,6 @@ #include #ifdef CONFIG_X86_32 -# include # include # include #else @@ -404,6 +404,38 @@ EXPORT_SYMBOL(machine_real_restart); #endif /* CONFIG_X86_32 */ +/* + * Apple MacBook5,2 (2009 MacBook) needs reboot=p + */ +static int __init set_pci_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_CF9) { + reboot_type = BOOT_CF9; + printk(KERN_INFO "%s series board detected. " + "Selecting PCI-method for reboots.\n", d->ident); + } + return 0; +} + +static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { + { /* Handle problems with rebooting on Apple MacBook5,2 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"), + }, + }, + { } +}; + +static int __init pci_reboot_init(void) +{ + dmi_check_system(pci_reboot_dmi_table); + return 0; +} +core_initcall(pci_reboot_init); + static inline void kb_wait(void) { int i; -- cgit v1.2.3 From 6a7bbd57ed50bb62c9a81ae5f2e202ca689e5964 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 3 Aug 2009 22:38:10 +1000 Subject: x86: Make 64-bit efi_ioremap use ioremap on MMIO regions Booting current 64-bit x86 kernels on the latest Apple MacBook (MacBook5,2) via EFI gives the following warning: [ 0.182209] ------------[ cut here ]------------ [ 0.182222] WARNING: at arch/x86/mm/pageattr.c:581 __cpa_process_fault+0x44/0xa0() [ 0.182227] Hardware name: MacBook5,2 [ 0.182231] CPA: called for zero pte. vaddr = ffff8800ffe00000 cpa->vaddr = ffff8800ffe00000 [ 0.182236] Modules linked in: [ 0.182242] Pid: 0, comm: swapper Not tainted 2.6.31-rc4 #6 [ 0.182246] Call Trace: [ 0.182254] [] ? __cpa_process_fault+0x44/0xa0 [ 0.182261] [] warn_slowpath_common+0x78/0xd0 [ 0.182266] [] warn_slowpath_fmt+0x64/0x70 [ 0.182272] [] ? update_page_count+0x3c/0x50 [ 0.182280] [] ? phys_pmd_init+0x140/0x22e [ 0.182286] [] __cpa_process_fault+0x44/0xa0 [ 0.182292] [] __change_page_attr_set_clr+0x5f0/0xb40 [ 0.182301] [] ? vm_unmap_aliases+0x175/0x190 [ 0.182307] [] change_page_attr_set_clr+0xfe/0x3d0 [ 0.182314] [] _set_memory_uc+0x2a/0x30 [ 0.182319] [] set_memory_uc+0x7b/0xb0 [ 0.182327] [] efi_enter_virtual_mode+0x2ad/0x2c9 [ 0.182334] [] start_kernel+0x2db/0x3f4 [ 0.182340] [] x86_64_start_reservations+0x99/0xb9 [ 0.182345] [] x86_64_start_kernel+0xe0/0xf2 [ 0.182357] ---[ end trace 4eaa2a86a8e2da22 ]--- [ 0.182982] init_memory_mapping: 00000000ffffc000-0000000100000000 [ 0.182993] 00ffffc000 - 0100000000 page 4k This happens because the 64-bit version of efi_ioremap calls init_memory_mapping for all addresses, regardless of whether they are RAM or MMIO. 
The EFI tables on this machine ask for runtime access to some MMIO regions: [ 0.000000] EFI: mem195: type=11, attr=0x8000000000000000, range=[0x0000000093400000-0x0000000093401000) (0MB) [ 0.000000] EFI: mem196: type=11, attr=0x8000000000000000, range=[0x00000000ffc00000-0x00000000ffc40000) (0MB) [ 0.000000] EFI: mem197: type=11, attr=0x8000000000000000, range=[0x00000000ffc40000-0x00000000ffc80000) (0MB) [ 0.000000] EFI: mem198: type=11, attr=0x8000000000000000, range=[0x00000000ffc80000-0x00000000ffca4000) (0MB) [ 0.000000] EFI: mem199: type=11, attr=0x8000000000000000, range=[0x00000000ffca4000-0x00000000ffcb4000) (0MB) [ 0.000000] EFI: mem200: type=11, attr=0x8000000000000000, range=[0x00000000ffcb4000-0x00000000ffffc000) (3MB) [ 0.000000] EFI: mem201: type=11, attr=0x8000000000000000, range=[0x00000000ffffc000-0x0000000100000000) (0MB) This arranges to pass the EFI memory type through to efi_ioremap, and makes efi_ioremap use ioremap rather than init_memory_mapping if the type is EFI_MEMORY_MAPPED_IO. With this, the above warning goes away. Signed-off-by: Paul Mackerras LKML-Reference: <19062.55858.533494.471153@cargo.ozlabs.ibm.com> Cc: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/efi.h | 5 +++-- arch/x86/kernel/efi.c | 2 +- arch/x86/kernel/efi_64.c | 6 +++++- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index edc90f23e708..8406ed7f9926 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ efi_call_virt(f, a1, a2, a3, a4, a5, a6) -#define efi_ioremap(addr, size) ioremap_cache(addr, size) +#define efi_ioremap(addr, size, type) ioremap_cache(addr, size) #else /* !CONFIG_X86_32 */ @@ -84,7 +84,8 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) -extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size); +extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, + u32 type); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 96f7ac0bbf01..19ccf6d0dccf 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -512,7 +512,7 @@ void __init efi_enter_virtual_mode(void) && end_pfn <= max_pfn_mapped)) va = __va(md->phys_addr); else - va = efi_ioremap(md->phys_addr, size); + va = efi_ioremap(md->phys_addr, size, md->type); md->virt_addr = (u64) (unsigned long) va; diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 22c3b7828c50..ac0621a7ac3d 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void) early_runtime_code_mapping_set_exec(0); } -void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) +void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, + u32 type) { unsigned long last_map_pfn; + if (type == EFI_MEMORY_MAPPED_IO) + return ioremap(phys_addr, size); + last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) return NULL; -- cgit v1.2.3 From d2ba8b211bb8abc29aa627dbd4dce08cfbc8082b Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Mon, 3 Aug 2009 14:44:54 -0700 Subject: x86: Fix assert syntax in vmlinux.lds.S Older versions of binutils did not accept the naked "ASSERT" syntax; it is considered an expression whose value needs to be assigned to something. Reported-tested-and-fixed-by: Jean Delvare Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 59f31d2dd435..78d185d797de 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -393,8 +393,8 @@ SECTIONS #ifdef CONFIG_X86_32 -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") +. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #else /* * Per-cpu symbols which need to be offset from __per_cpu_load @@ -407,12 +407,12 @@ INIT_PER_CPU(irq_stack_union); /* * Build-time check on the image size: */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") +. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +. = ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ @@ -420,7 +420,7 @@ ASSERT((per_cpu__irq_stack_union == 0), #ifdef CONFIG_KEXEC #include -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") +. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big"); #endif -- cgit v1.2.3 From bab9a3da93bfe09c609407dedae5708b07a7ac56 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 30 Jul 2009 11:10:01 +0200 Subject: x86, msr: execute on the correct CPU subset Make rdmsr_on_cpus/wrmsr_on_cpus execute on the current CPU only if it is in the supplied bitmask. Signed-off-by: Borislav Petkov Signed-off-by: H. Peter Anvin --- arch/x86/lib/msr.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 1440b9c0547e..caa24aca8115 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -89,16 +89,13 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) rv.msrs = msrs; rv.msr_no = msr_no; - preempt_disable(); - /* - * FIXME: handle the CPU we're executing on separately for now until - * smp_call_function_many has been fixed to not skip it. - */ - this_cpu = raw_smp_processor_id(); - smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1); + this_cpu = get_cpu(); + + if (cpumask_test_cpu(this_cpu, mask)) + __rdmsr_on_cpu(&rv); smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); - preempt_enable(); + put_cpu(); } EXPORT_SYMBOL(rdmsr_on_cpus); @@ -121,16 +118,13 @@ void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) rv.msrs = msrs; rv.msr_no = msr_no; - preempt_disable(); - /* - * FIXME: handle the CPU we're executing on separately for now until - * smp_call_function_many has been fixed to not skip it. 
- */ - this_cpu = raw_smp_processor_id(); - smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1); + this_cpu = get_cpu(); + + if (cpumask_test_cpu(this_cpu, mask)) + __wrmsr_on_cpu(&rv); smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); - preempt_enable(); + put_cpu(); } EXPORT_SYMBOL(wrmsr_on_cpus); -- cgit v1.2.3 From f1f029c7bfbf4ee1918b90a431ab823bed812504 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 3 Aug 2009 16:33:40 -0700 Subject: x86: fix assembly constraints in native_save_fl() From Gabe Black in bugzilla 13888: native_save_fl is implemented as follows: 11static inline unsigned long native_save_fl(void) 12{ 13 unsigned long flags; 14 15 asm volatile("# __raw_save_flags\n\t" 16 "pushf ; pop %0" 17 : "=g" (flags) 18 : /* no input */ 19 : "memory"); 20 21 return flags; 22} If gcc chooses to put flags on the stack, for instance because this is inlined into a larger function with more register pressure, the offset of the flags variable from the stack pointer will change when the pushf is performed. gcc doesn't attempt to understand that fact, and address used for pop will still be the same. It will write to somewhere near flags on the stack but not actually into it and overwrite some other value. I saw this happen in the ide_device_add_all function when running in a simulator I work on. I'm assuming that some quirk of how the simulated hardware is set up caused the code path this is on to be executed when it normally wouldn't. A simple fix might be to change "=g" to "=r". Reported-by: Gabe Black Signed-off-by: H. Peter Anvin Cc: Stable Team --- arch/x86/include/asm/irqflags.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 2bdab21f0898..c6ccbe7e81ad 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -12,9 +12,15 @@ static inline unsigned long native_save_fl(void) { unsigned long flags; + /* + * Note: this needs to be "=r" not "=rm", because we have the + * stack offset from what gcc expects at the time the "pop" is + * executed, and so a memory reference with respect to the stack + * would end up using the wrong address. + */ asm volatile("# __raw_save_flags\n\t" "pushf ; pop %0" - : "=g" (flags) + : "=r" (flags) : /* no input */ : "memory"); -- cgit v1.2.3 From 6c7184b77464261b7d55583a48accbd1350923a3 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 27 Jul 2009 09:35:07 -0500 Subject: x86, UV: Handle missing blade-local memory correctly UV blades may not have any blade-local memory. Add a field (nid) to the UV blade structure to indicates whether the node has local memory. This is needed by the GRU driver (pushed separately). 
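A minimal usage sketch for the new accessor (illustrative only; blade_data is an invented array, not code from the GRU driver): per-blade state can be placed on blade-local memory when the blade has any, falling back to the current node when memory_nid is -1.

	for (bid = 0; bid < uv_num_possible_blades(); bid++) {
		int nid = uv_blade_to_memory_nid(bid);

		/* -1 means this blade has no local memory */
		if (nid < 0)
			nid = numa_node_id();

		blade_data[bid] = kzalloc_node(sizeof(*blade_data[bid]),
					       GFP_KERNEL, nid);
	}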
Signed-off-by: Jack Steiner Cc: linux-mm@kvack.org LKML-Reference: <20090727143507.GA7006@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 7 +++++++ arch/x86/kernel/apic/x2apic_uv_x.c | 5 +++++ 2 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 341070f7ad5c..a6dde059e02e 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -327,6 +327,7 @@ struct uv_blade_info { unsigned short nr_possible_cpus; unsigned short nr_online_cpus; unsigned short pnode; + short memory_nid; }; extern struct uv_blade_info *uv_blade_info; extern short *uv_node_to_blade; @@ -363,6 +364,12 @@ static inline int uv_blade_to_pnode(int bid) return uv_blade_info[bid].pnode; } +/* Nid of memory node on blade. -1 if no blade-local memory */ +static inline int uv_blade_to_memory_nid(int bid) +{ + return uv_blade_info[bid].memory_nid; +} + /* Determine the number of possible cpus on a blade */ static inline int uv_blade_nr_possible_cpus(int bid) { diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 096d19aea2f7..ad3f0a984cab 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -591,6 +591,8 @@ void __init uv_system_init(void) bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kmalloc(bytes, GFP_KERNEL); BUG_ON(!uv_blade_info); + for (blade = 0; blade < uv_num_possible_blades(); blade++) + uv_blade_info[blade].memory_nid = -1; get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); @@ -629,6 +631,9 @@ void __init uv_system_init(void) lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; + /* Any node on the blade, else will contain -1. */ + uv_blade_info[blade].memory_nid = nid; + uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; -- cgit v1.2.3 From cc5e4fa1bd4d2f56da07f9092281afdcd2374ab9 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 27 Jul 2009 09:36:56 -0500 Subject: x86, UV: Delete mapping of MMR rangs mapped by BIOS The UV BIOS has added additional MMR ranges that are mapped via EFI virtual mode mappings. These ranges should be deleted from ranges mapped by uv_system_init(). 
Signed-off-by: Jack Steiner Cc: linux-mm@kvack.org LKML-Reference: <20090727143656.GA7698@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index ad3f0a984cab..445c210daf82 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) BUG(); } -static __init void map_low_mmrs(void) -{ - init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); - init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); -} - enum map_type {map_wb, map_uc}; static __init void map_high(char *id, unsigned long base, int shift, @@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode) map_high("GRU", gru.s.base, shift, max_pnode, map_wb); } -static __init void map_config_high(int max_pnode) -{ - union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; - int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; - - cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); - if (cfg.s.enable) - map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); -} - -static __init void map_mmr_high(int max_pnode) -{ - union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; - int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; - - mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); - if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); -} - static __init void map_mmioh_high(int max_pnode) { union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; @@ -566,8 +540,6 @@ void __init uv_system_init(void) unsigned long mmr_base, present, paddr; unsigned short pnode_mask; - map_low_mmrs(); - m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; @@ -667,11 +639,10 @@ void __init uv_system_init(void) pnode = (paddr >> m_val) & pnode_mask; blade = boot_pnode_to_blade(pnode); uv_node_to_blade[nid] = blade; + max_pnode = max(pnode, max_pnode); } map_gru_high(max_pnode); - map_mmr_high(max_pnode); - map_config_high(max_pnode); map_mmioh_high(max_pnode); uv_cpu_init(); -- cgit v1.2.3 From 67e83f309ed0baaf01a2c956b5174905bcdc1242 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 27 Jul 2009 09:38:08 -0500 Subject: x86, UV: Fix macros for accessing large node numbers The UV chipset automatically supplies the upper bits on nodes being referenced by MMR accesses. These bit can be deleted from the hub addressing macros. 
Signed-off-by: Jack Steiner LKML-Reference: <20090727143808.GA8076@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index a6dde059e02e..77a68505419a 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -175,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) #define UV_GLOBAL_MMR64_PNODE_BITS(p) \ - ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) + (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) #define UV_APIC_PNODE_SHIFT 6 -- cgit v1.2.3 From c5997fa8d7aca3c9876a6ff71bacf27c41095ce9 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 27 Jul 2009 09:38:56 -0500 Subject: x86, UV: Fix UV apic mode Change SGI UV default apicid mode to "physical". This is required to match settings in the UV hub chip. Signed-off-by: Jack Steiner LKML-Reference: <20090727143856.GA8905@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 445c210daf82..832e908adcb5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = { .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .irq_dest_mode = 0, /* physical */ .target_cpus = uv_target_cpus, .disable_esr = 0, -- cgit v1.2.3 From d8c7eb34c2db6268909ae8c3958be63bde254292 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 25 Jul 2009 03:23:09 -0700 Subject: x86: Don't use current_cpu_data in x2apic phys_pkg_id One system has socket 1 come up as BSP. kexeced kernel reports BSP as: [ 1.524550] Initializing cgroup subsys cpuacct [ 1.536064] initial_apicid:20 [ 1.537135] ht_mask_width:1 [ 1.538128] core_select_mask:f [ 1.539126] core_plus_mask_width:5 [ 1.558479] CPU: Physical Processor ID: 0 [ 1.559501] CPU: Processor Core ID: 0 [ 1.560539] CPU: L1 I cache: 32K, L1 D cache: 32K [ 1.579098] CPU: L2 cache: 256K [ 1.580085] CPU: L3 cache: 24576K [ 1.581108] CPU 0/0x20 -> Node 0 [ 1.596193] CPU 0 microcode level: 0xffff0008 It doesn't have correct physical processor id and will get an error: [ 38.840859] CPU0 attaching sched-domain: [ 38.848287] domain 0: span 0,8,72 level SIBLING [ 38.851151] groups: 0 8 72 [ 38.858137] domain 1: span 0,8-15,72-79 level MC [ 38.868944] groups: 0,8,72 9,73 10,74 11,75 12,76 13,77 14,78 15,79 [ 38.881383] ERROR: parent span is not a superset of domain->span [ 38.890724] domain 2: span 0-7,64-71 level CPU [ 38.899237] ERROR: domain->groups does not contain CPU0 [ 38.909229] groups: 8-15,72-79 [ 38.912547] ERROR: groups don't span domain->span [ 38.919665] domain 3: span 0-127 level NODE [ 38.930739] groups: 0-7,64-71 8-15,72-79 16-23,80-87 24-31,88-95 32-39,96-103 40-47,104-111 48-55,112-119 56-63,120-127 it turns out: we can not use current_cpu_data in phys_pgd_id for x2apic. identify_boot_cpu() is called by check_bugs() before smp_prepare_cpus() and till smp_prepare_cpus() current_cpu_data for bsp is assigned with boot_cpu_data. Just make phys_pkg_id for x2apic is aligned to xapic. 
Signed-off-by: Yinghai Lu Acked-by: Suresh Siddha Cc: Andrew Morton LKML-Reference: <4A6ADD0D.10002@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 8e4cbb255c38..2ed4e2bb3b32 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -170,7 +170,7 @@ static unsigned long set_apic_id(unsigned int id) static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) { - return current_cpu_data.initial_apicid >> index_msb; + return initial_apicid >> index_msb; } static void x2apic_send_IPI_self(int vector) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a284359627e7..0b631c6a2e00 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -162,7 +162,7 @@ static unsigned long set_apic_id(unsigned int id) static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) { - return current_cpu_data.initial_apicid >> index_msb; + return initial_apicid >> index_msb; } static void x2apic_send_IPI_self(int vector) -- cgit v1.2.3 From 6abf65510944d33b47575d151c6b318993c8d2b5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 30 Jul 2009 16:49:30 +0100 Subject: x86, 32-bit: Fix double accounting in reserve_top_address() With VMALLOC_END included in the calculation of MAXMEM (as of 2.6.28) it is no longer correct to also bump __VMALLOC_RESERVE in reserve_top_address(). Doing so results in needlessly small lowmem. Signed-off-by: Jan Beulich LKML-Reference: <4A71DD2A020000780000D482@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index af8f9650058c..ed34f5e35999 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -329,7 +329,6 @@ void __init reserve_top_address(unsigned long reserve) printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", (int)-reserve); __FIXADDR_TOP = -reserve - PAGE_SIZE; - __VMALLOC_RESERVE += reserve; #endif } -- cgit v1.2.3 From 2a5ef41661b56cf4eee042a6967c4e14b63e8eac Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 20 Jul 2009 09:28:41 -0500 Subject: x86, UV: Complete IRQ interrupt migration in arch_enable_uv_irq() In uv_setup_irq(), the call to create_irq() initially assigns IRQ vectors to cpu 0. The subsequent call to assign_irq_vector() in arch_enable_uv_irq() migrates the IRQ to another cpu and frees the cpu 0 vector - at least it will be freed as soon as the "IRQ move" completes. arch_enable_uv_irq() needs to send a cleanup IPI to complete the IRQ move. Otherwise, assignment of GRU interrupts on large systems (>200 cpus) will exhaust the cpu 0 interrupt vectors and initialization of the GRU driver will fail. 
Signed-off-by: Jack Steiner LKML-Reference: <20090720142840.GA8885@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2284a4812b68..d2ed6c5ddc80 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3793,6 +3793,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + return irq; } -- cgit v1.2.3 From dc731fbbadf5d65c98fcd6c86472aa286c16458a Mon Sep 17 00:00:00 2001 From: Subrata Modak Date: Tue, 21 Jul 2009 08:02:27 +0530 Subject: x86: Work around compilation warning in arch/x86/kernel/apm_32.c The following fix was initially inspired by David Howells fix few days back: http://lkml.org/lkml/2009/7/9/109 However, Ingo disapproves such fixes as it's dangerous (it can hide future, relevant warnings) - in something as performance-uncritical. So, initialize 'err' to '0' to work around a GCC false positive warning: http://lkml.org/lkml/2009/7/18/89 Signed-off-by: Subrata Modak Cc: Sachin P Sant Cc: David Howells Cc: Balbir Singh Cc: Stephen Rothwell LKML-Reference: <20090721023226.31855.67236.sendpatchset@subratamodak.linux.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apm_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 79302e9a33a4..442b5508893f 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -811,7 +811,7 @@ static int apm_do_idle(void) u8 ret = 0; int idled = 0; int polling; - int err; + int err = 0; polling = !!(current_thread_info()->status & TS_POLLING); if (polling) { -- cgit v1.2.3 From 7d5b005652bc5ae3e1e0efc53fd0e25a643ec506 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Tue, 4 Aug 2009 15:34:22 -0700 Subject: x86: Fix VMI && stack protector With CONFIG_STACK_PROTECTOR turned on, VMI doesn't boot with more than one processor. The problem is with the gs value not being initialized correctly when registering the secondary processor for VMI's case. The patch below initializes the gs value for the AP to __KERNEL_STACK_CANARY. Without this the secondary processor keeps on taking a GP on every gs access. Signed-off-by: Alok N Kataria Cc: # for v2.6.30.x LKML-Reference: <1249425262.18955.40.camel@ank32.eng.vmware.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmi_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b263423fbe2a..95a7289e4b0c 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, ap.ds = __USER_DS; ap.es = __USER_DS; ap.fs = __KERNEL_PERCPU; - ap.gs = 0; + ap.gs = __KERNEL_STACK_CANARY; ap.eflags = 0; -- cgit v1.2.3 From e125e7b6944898831b56739a5448e705578bf7e2 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 2 Jul 2009 21:45:47 +0200 Subject: KVM: Fix KVM_GET_MSR_INDEX_LIST So far, KVM copied the emulated_msrs (only MSR_IA32_MISC_ENABLE) to a wrong address in user space due to broken pointer arithmetic. This caused subtle corruption up there (missing MSR_IA32_MISC_ENABLE had probably no practical relevance). 
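The first bug is plain C pointer arithmetic: indices is a u32 pointer, so offsets added to it are already scaled by the element size, and multiplying by sizeof(u32) again overshoots fourfold. A small standalone sketch (illustrative, using the kernel's u32 typedef; not the actual ioctl code):

	u32 buf[64];
	u32 *indices = buf;
	unsigned int num_msrs_to_save = 4;

	/* Old code: 4 * num_msrs_to_save elements past the start (&buf[16]),
	 * well beyond the MSRs that were just copied. */
	u32 *wrong = indices + num_msrs_to_save * sizeof(u32);

	/* Fixed code: exactly num_msrs_to_save elements past the start (&buf[4]). */
	u32 *right = indices + num_msrs_to_save;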
Moreover, the size check for the user-provided kvm_msr_list forgot about emulated MSRs. Cc: stable@kernel.org Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe5474aec41a..7bc311464fae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1079,14 +1079,13 @@ long kvm_arch_dev_ioctl(struct file *filp, if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) goto out; r = -E2BIG; - if (n < num_msrs_to_save) + if (n < msr_list.nmsrs) goto out; r = -EFAULT; if (copy_to_user(user_msr_list->indices, &msrs_to_save, num_msrs_to_save * sizeof(u32))) goto out; - if (copy_to_user(user_msr_list->indices - + num_msrs_to_save * sizeof(u32), + if (copy_to_user(user_msr_list->indices + num_msrs_to_save, &emulated_msrs, ARRAY_SIZE(emulated_msrs) * sizeof(u32))) goto out; -- cgit v1.2.3 From 0ff77873b1318fc2d77a85e70690d3cd6cafbd41 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 2 Jul 2009 20:02:15 -0300 Subject: KVM: PIT: fix kpit_elapsed division by zero Fix division by zero triggered by latch count command on uninitialized counter. Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4d6f0d293ee2..21f68e00524f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -104,6 +104,9 @@ static s64 __kpit_elapsed(struct kvm *kvm) ktime_t remaining; struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + if (!ps->pit_timer.period) + return 0; + /* * The Counter does not stop when it reaches zero. In * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to -- cgit v1.2.3 From d6289b9365c3f622a8cfe62c4fb054bb70b5061a Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 22 Jun 2009 15:27:56 -0300 Subject: KVM: x86: verify MTRR/PAT validity Do not allow invalid memory types in MTRR/PAT (generating a #GP otherwise). 
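The two masks are compact, so a worked example of the checks added below may help (illustrative comment, not part of the patch): 0x73 keeps MTRR types {0, 1, 4, 5, 6} and 0xf3 keeps PAT types {0, 1, 4, 5, 6, 7}.

	/*
	 * valid_mtrr_type(6): (1 << 6) & 0x73 = 0x40 -> valid   (WB)
	 * valid_mtrr_type(2): (1 << 2) & 0x73 = 0    -> invalid (reserved)
	 * valid_pat_type(7):  (1 << 7) & 0xf3 = 0x80 -> valid   (UC-)
	 * valid_pat_type(2):  (1 << 2) & 0xf3 = 0    -> invalid (reserved)
	 */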
Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7bc311464fae..3d4529011828 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -704,11 +704,48 @@ static bool msr_mtrr_valid(unsigned msr) return false; } +static bool valid_pat_type(unsigned t) +{ + return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ +} + +static bool valid_mtrr_type(unsigned t) +{ + return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ +} + +static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int i; + + if (!msr_mtrr_valid(msr)) + return false; + + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return false; + return valid_mtrr_type(data & 0xff); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8 ; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } + + /* variable MTRRs */ + return valid_mtrr_type(data & 0xff); +} + static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - if (!msr_mtrr_valid(msr)) + if (!mtrr_valid(vcpu, msr, data)) return 1; if (msr == MSR_MTRRdefType) { -- cgit v1.2.3 From 4b656b1202498184a0ecef86b3b89ff613b9c6ab Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 21 Jul 2009 12:47:45 -0300 Subject: KVM: SVM: force new asid on vcpu migration If a migrated vcpu matches the asid_generation value of the target pcpu, there will be no TLB flush via TLB_CONTROL_FLUSH_ALL_ASID. The check for vcpu.cpu in pre_svm_run is meaningless since svm_vcpu_load already updated it on schedule in. Such vcpu will VMRUN with stale TLB entries. 
Based on original patch from Joerg Roedel (http://patchwork.kernel.org/patch/10021/) Signed-off-by: Marcelo Tosatti Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 71510e07e69e..b1f658ad2f06 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -711,6 +711,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; kvm_migrate_timers(vcpu); + svm->asid_generation = 0; } for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) @@ -1031,7 +1032,6 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; } - svm->vcpu.cpu = svm_data->cpu; svm->asid_generation = svm_data->asid_generation; svm->vmcb->control.asid = svm_data->next_asid++; } @@ -2300,8 +2300,8 @@ static void pre_svm_run(struct vcpu_svm *svm) struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; - if (svm->vcpu.cpu != cpu || - svm->asid_generation != svm_data->asid_generation) + /* FIXME: handle wraparound of asid_generation */ + if (svm->asid_generation != svm_data->asid_generation) new_asid(svm, svm_data); } -- cgit v1.2.3 From 025dbbf36a7680bffe54d9dcbf0a8bc01a7cbd10 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 22 Jul 2009 13:05:49 -0300 Subject: KVM: MMU: handle n_free_mmu_pages > n_alloc_mmu_pages in kvm_mmu_change_mmu_pages kvm_mmu_change_mmu_pages mishandles the case where n_alloc_mmu_pages is smaller then n_free_mmu_pages, by not checking if the result of the subtraction is negative. Its a valid condition which can happen if a large number of pages has been recently freed. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7030b5f911bf..49a10d008300 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1407,24 +1407,25 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) { + int used_pages; + + used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; + used_pages = max(0, used_pages); + /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we * change the value */ - if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > - kvm_nr_mmu_pages) { - int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; - - while (n_used_mmu_pages > kvm_nr_mmu_pages) { + if (used_pages > kvm_nr_mmu_pages) { + while (used_pages > kvm_nr_mmu_pages) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); kvm_mmu_zap_page(kvm, page); - n_used_mmu_pages--; + used_pages--; } kvm->arch.n_free_mmu_pages = 0; } -- cgit v1.2.3 From 34f0c1ad27a74bd5eb0f99ea43ab6a4658d6419d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 22 Jul 2009 23:53:26 +0200 Subject: KVM: VMX: Fix locking order in handle_invalid_guest_state Release and re-acquire preemption and IRQ lock in the same order as vcpu_enter_guest does. 
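The two hunks below only restore LIFO pairing of the two primitives so they match the nesting that vcpu_enter_guest establishes. Read as a kernel-context skeleton rather than a standalone program (the function names are invented for the sketch), the intended shape is:

    #include <linux/preempt.h>
    #include <linux/irqflags.h>

    static void enter_guest_critical(void)      /* illustrative name */
    {
        preempt_disable();      /* outer */
        local_irq_disable();    /* inner */
    }

    static void leave_guest_critical(void)      /* illustrative name */
    {
        local_irq_enable();     /* drop the inner primitive first */
        preempt_enable();       /* then the outer one */
    }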
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 356a0ce85c68..6bf58c083f0a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3157,8 +3157,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; - preempt_enable(); local_irq_enable(); + preempt_enable(); while (!guest_state_valid(vcpu)) { err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); @@ -3177,8 +3177,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, schedule(); } - local_irq_disable(); preempt_disable(); + local_irq_disable(); vmx->invalid_state_emulation_result = err; } -- cgit v1.2.3 From 263799a3616242201e20fd2025fe84047b1379b1 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 21 Jul 2009 10:43:07 +0200 Subject: KVM: VMX: Fix locking imbalance on emulation failure We have to disable preemption and IRQs on every exit from handle_invalid_guest_state, otherwise we generate at least a preempt_disable imbalance. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6bf58c083f0a..29f912927a58 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3168,7 +3168,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, if (err != EMULATE_DONE) { kvm_report_emulation_failure(vcpu, "emulation failure"); - return; + break; } if (signal_pending(current)) -- cgit v1.2.3 From 53a27b39ff4d2492f84b1fdc2f0047175f0b0b93 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 5 Aug 2009 15:43:58 -0300 Subject: KVM: MMU: limit rmap chain length Otherwise the host can spend too long traversing an rmap chain, which happens under a spinlock. Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 49a10d008300..0ef5bb2b4043 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -489,16 +489,20 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) * * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc * containing more mappings. + * + * Returns the number of rmap entries before the spte was added or zero if + * the spte was not added. 
+ * */ -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) +static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) { struct kvm_mmu_page *sp; struct kvm_rmap_desc *desc; unsigned long *rmapp; - int i; + int i, count = 0; if (!is_rmap_pte(*spte)) - return; + return count; gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); sp->gfns[spte - sp->spt] = gfn; @@ -515,8 +519,10 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) + while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { desc = desc->more; + count += RMAP_EXT; + } if (desc->shadow_ptes[RMAP_EXT-1]) { desc->more = mmu_alloc_rmap_desc(vcpu); desc = desc->more; @@ -525,6 +531,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) ; desc->shadow_ptes[i] = spte; } + return count; } static void rmap_desc_remove_entry(unsigned long *rmapp, @@ -754,6 +761,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) return young; } +#define RMAP_RECYCLE_THRESHOLD 1000 + +static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) +{ + unsigned long *rmapp; + + gfn = unalias_gfn(vcpu->kvm, gfn); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); + + kvm_unmap_rmapp(vcpu->kvm, rmapp); + kvm_flush_remote_tlbs(vcpu->kvm); +} + int kvm_age_hva(struct kvm *kvm, unsigned long hva) { return kvm_handle_hva(kvm, hva, kvm_age_rmapp); @@ -1741,6 +1761,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, { int was_rmapped = 0; int was_writeble = is_writeble_pte(*shadow_pte); + int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" " user_fault %d gfn %lx\n", @@ -1782,9 +1803,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, page_header_update_slot(vcpu->kvm, shadow_pte, gfn); if (!was_rmapped) { - rmap_add(vcpu, shadow_pte, gfn, largepage); + rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); if (!is_rmap_pte(*shadow_pte)) kvm_release_pfn_clean(pfn); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) + rmap_recycle(vcpu, gfn, largepage); } else { if (was_writeble) kvm_release_pfn_dirty(pfn); -- cgit v1.2.3 From 087d7e56deffb611a098e7e257388a41edbeef1f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 4 Aug 2009 08:59:59 -0700 Subject: x86: Fix MSI-X initialization by using online_mask for x2apic target_cpus found a system where x2apic reports an MSI-X irq initialization failure: [ 302.859446] igbvf 0000:81:10.4: enabling device (0000 -> 0002) [ 302.874369] igbvf 0000:81:10.4: using 64bit DMA mask [ 302.879023] igbvf 0000:81:10.4: using 64bit consistent DMA mask [ 302.894386] igbvf 0000:81:10.4: enabling bus mastering [ 302.898171] igbvf 0000:81:10.4: setting latency timer to 64 [ 302.914050] reserve_memtype added 0xefb08000-0xefb0c000, track uncached-minus, req uncached-minus, ret uncached-minus [ 302.933839] reserve_memtype added 0xefb28000-0xefb29000, track uncached-minus, req uncached-minus, ret uncached-minus [ 302.940367] alloc irq_desc for 265 on node 4 [ 302.956874] alloc kstat_irqs on node 4 [ 302.959452] alloc irq_2_iommu on node 0 [ 302.974328] igbvf 0000:81:10.4: irq 265 for MSI/MSI-X [ 302.977778] alloc irq_desc for 266 on node 4 [ 302.980347] alloc kstat_irqs on node 4 [ 302.995312] free_memtype request 0xefb28000-0xefb29000 [ 302.998816] igbvf 0000:81:10.4: Failed to initialize MSI-X interrupts. 
... it turns out that when trying to enable MSI-X, __assign_irq_vector(new, cfg_new, apic->target_cpus()) can not get vector because for x2apic target-cpus returns cpumask_of(0) Update that to online_mask like xapic. Signed-off-by: Yinghai Lu Acked-by: Suresh Siddha LKML-Reference: <4A785AFF.3050902@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 8 +++++--- arch/x86/kernel/apic/x2apic_phys.c | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 2ed4e2bb3b32..a5371ec36776 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled(); } -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - +/* + * need to use more than cpu 0, because we need more vectors when + * MSI-X are used. + */ static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } /* diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 0b631c6a2e00..a8989aadc99a 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - +/* + * need to use more than cpu 0, because we need more vectors when + * MSI-X are used. + */ static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -- cgit v1.2.3 From 498cdbfbcf98e0d2c90a26e6a02a82f043876e48 Mon Sep 17 00:00:00 2001 From: Ozan Çağlayan Date: Tue, 4 Aug 2009 19:39:31 +0300 Subject: x86: Add quirk to make Apple MacBookPro5,1 use reboot=pci MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MacBookPro5,1 is not able to reboot unless reboot=pci is set. This patch forces it through a DMI quirk specific to this device. 
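The hunk below only adds the table entry itself; the call site that walks the table is outside this diff. As a hedged sketch of how such a quirk table is typically consumed at boot (the initcall name and its placement here are assumptions, not copied from reboot.c):

    #include <linux/dmi.h>
    #include <linux/init.h>

    /* Walk the DMI quirk table once at boot; a matching entry invokes its
     * callback (set_pci_reboot above), switching the reboot method.
     */
    static int __init reboot_dmi_quirks_init(void)
    {
        dmi_check_system(pci_reboot_dmi_table);
        return 0;
    }
    core_initcall(reboot_dmi_quirks_init);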
Signed-off-by: Ozan Çağlayan LKML-Reference: <1249403971-6543-1-git-send-email-ozan@pardus.org.tr> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 834c9da8bf9d..9eb897603705 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -405,7 +405,7 @@ EXPORT_SYMBOL(machine_real_restart); #endif /* CONFIG_X86_32 */ /* - * Apple MacBook5,2 (2009 MacBook) needs reboot=p + * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot */ static int __init set_pci_reboot(const struct dmi_system_id *d) { @@ -426,6 +426,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"), }, }, + { /* Handle problems with rebooting on Apple MacBookPro5,1 */ + .callback = set_pci_reboot, + .ident = "Apple MacBookPro5,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5,1"), + }, + }, { } }; -- cgit v1.2.3 From fdb8a42742ac95606668f73481dfb2f760658fdd Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 6 Aug 2009 15:58:13 -0700 Subject: x86: fix buffer overflow in efi_init() If the vendor name (from c16) can be longer than 100 bytes (or missing a terminating null), then the null is written past the end of vendor[]. Found with Parfait, http://research.sun.com/projects/parfait/ Signed-off-by: Roel Kluin Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin Cc: Huang Ying --- arch/x86/kernel/efi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 19ccf6d0dccf..fe26ba3e3451 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -354,7 +354,7 @@ void __init efi_init(void) */ c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); if (c16) { - for (i = 0; i < sizeof(vendor) && *c16; ++i) + for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) vendor[i] = *c16++; vendor[i] = '\0'; } else -- cgit v1.2.3 From b6e61eef4f9f94714ac3ee4a5c96862d9bcd1836 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 31 Jul 2009 12:45:41 -0700 Subject: x86: Fix serialization in pit_expect_msb() Wei Chong Tan reported a fast-PIT-calibration corner-case: | pit_expect_msb() is vulnerable to SMI disturbance corner case | in some platforms which causes /proc/cpuinfo to show wrong | CPU MHz value when quick_pit_calibrate() jumps to success | section. I think that the real issue isn't even an SMI - but the fact that in the very last iteration of the loop, there's no serializing instruction _after_ the last 'rdtsc'. So even in the absense of SMI's, we do have a situation where the cycle counter was read without proper serialization. The last check should be done outside the outer loop, since _inside_ the outer loop, we'll be testing that the PIT has the right MSB value has the right value in the next iteration. So only the _last_ iteration is special, because that's the one that will not check the PIT MSB value any more, and because the final 'get_cycles()' isn't serialized. In other words: - I'd like to move the PIT MSB check to after the last iteration, rather than in every iteration - I think we should comment on the fact that it's also a serializing instruction and so 'fences in' the TSC read. Here's a suggested replacement. 
Signed-off-by: Linus Torvalds Reported-by: "Tan, Wei Chong" Tested-by: "Tan, Wei Chong" LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6e1a368d21d4..71f4368b357e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -275,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequencty. */ +static inline int pit_verify_msb(unsigned char val) +{ + /* Ignore LSB */ + inb(0x42); + return inb(0x42) == val; +} + static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; u64 tsc = 0; for (count = 0; count < 50000; count++) { - /* Ignore LSB */ - inb(0x42); - if (inb(0x42) != val) + if (!pit_verify_msb(val)) break; tsc = get_cycles(); } @@ -336,8 +341,7 @@ static unsigned long quick_pit_calibrate(void) * to do that is to just read back the 16-bit counter * once from the PIT. */ - inb(0x42); - inb(0x42); + pit_verify_msb(0); if (pit_expect_msb(0xff, &tsc, &d1)) { for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { @@ -348,8 +352,19 @@ static unsigned long quick_pit_calibrate(void) * Iterate until the error is less than 500 ppm */ delta -= tsc; - if (d1+d2 < delta >> 11) - goto success; + if (d1+d2 >= delta >> 11) + continue; + + /* + * Check the PIT one more time to verify that + * all TSC reads were stable wrt the PIT. + * + * This also guarantees serialization of the + * last cycle read ('d2') in pit_expect_msb. + */ + if (!pit_verify_msb(0xfe - i)) + break; + goto success; } } printk("Fast TSC calibration failed\n"); -- cgit v1.2.3 From 3e03bbeac541856aaaf1ce1ab0250b6a490e4099 Mon Sep 17 00:00:00 2001 From: Shunichi Fuji Date: Tue, 11 Aug 2009 03:34:40 +0900 Subject: x86: Add reboot quirk for every 5 series MacBook/Pro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reboot does not work on my MacBook Pro 13 inch (MacBookPro5,5) too. It seems all unibody MacBook and MacBookPro require PCI reboot handling, i guess. 
Following model/machine ID list shows unibody MacBook/Pro have the 5 series of model number: http://www.everymac.com/systems/by_capability/macs-by-machine-model-machine-id.html Signed-off-by: Shunichi Fuji Cc: Ozan Çağlayan LKML-Reference: <30046e3b0908101134p6487ddbftd8776e4ddef204be@mail.gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9eb897603705..a06e8d101844 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -418,20 +418,20 @@ static int __init set_pci_reboot(const struct dmi_system_id *d) } static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { - { /* Handle problems with rebooting on Apple MacBook5,2 */ + { /* Handle problems with rebooting on Apple MacBook5 */ .callback = set_pci_reboot, - .ident = "Apple MacBook", + .ident = "Apple MacBook5", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), }, }, - { /* Handle problems with rebooting on Apple MacBookPro5,1 */ + { /* Handle problems with rebooting on Apple MacBookPro5 */ .callback = set_pci_reboot, - .ident = "Apple MacBookPro5,1", + .ident = "Apple MacBookPro5", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5,1"), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), }, }, { } -- cgit v1.2.3 From 0d01f31439c1e4d602bf9fdc924ab66f407f5e38 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 9 Aug 2009 21:44:49 -0700 Subject: x86, mce: therm_throt - change when we print messages My Latitude d630 seems to be handling thermal events in SMI by lowering the max frequency of the CPU till it cools down but still leaks the "everything is normal" events. This spams the console and with high priority printks. Adjust therm_throt driver to only print messages about the fact that temperatire returned back to normal when leaving the throttling state. Also lower the severity of "back to normal" message from KERN_CRIT to KERN_INFO. Signed-off-by: Dmitry Torokhov Acked-by: H. 
Peter Anvin LKML-Reference: <20090810051513.0558F526EC9@mailhub.coreip.homeip.net> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd191dd5..8bc64cfbe936 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -36,6 +36,7 @@ static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +static DEFINE_PER_CPU(bool, thermal_throttle_active); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -96,24 +97,27 @@ static int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); + bool was_throttled = __get_cpu_var(thermal_throttle_active); + bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; - if (curr) + if (is_throttled) __get_cpu_var(thermal_throttle_count)++; - if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) + if (!(was_throttled ^ is_throttled) && + time_before64(tmp_jiffs, __get_cpu_var(next_check))) return 0; __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; /* if we just entered the thermal event */ - if (curr) { + if (is_throttled) { printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", cpu, - __get_cpu_var(thermal_throttle_count)); + "cpu clock throttled (total events = %lu)\n", + cpu, __get_cpu_var(thermal_throttle_count)); add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + } else if (was_throttled) { + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); } return 1; -- cgit v1.2.3 From 3c581a7f94542341bf0da496a226b44ac63521a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:47:36 +0200 Subject: perf_counter, x86: Fix lapic printk message Instead of this garbled bootup on UP Pentium-M systems: [ 0.015048] Performance Counters: [ 0.016004] no Local APIC, try rebooting with lapicno PMU driver, software counters only. Print: [ 0.015050] Performance Counters: [ 0.016004] no APIC, boot with the "lapic" boot parameter to force-enable it. [ 0.017003] no PMU driver, software counters only. Cf: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a7aa8f900954..40e233a24d9f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1590,7 +1590,7 @@ static int p6_pmu_init(void) } if (!cpu_has_apic) { - pr_info("no Local APIC, try rebooting with lapic"); + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); return -ENODEV; } -- cgit v1.2.3 From f64ccccb8afa43abdd63fcbd230f818d6ea0883f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:26:33 +0200 Subject: perf_counter, x86: Fix generic cache events on P6-mobile CPUs Johannes Stezenbach reported that 'perf stat' does not count cache-miss and cache-references events on his Pentium-M based laptop. This is because we left them blank in p6_perfmon_event_map[], fill them in. 
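In these event-map entries the low byte is the event select and bits 15:8 are the unit mask, so 0x0f2e and 0x012e both program event 0x2E with different unit masks. A small standalone sketch of that packing (the helper name and program are made up for illustration):

    #include <stdio.h>

    /* Illustrative packing of a P6 PERFEVTSEL-style value:
     * bits 7:0 = event select, bits 15:8 = unit mask.
     */
    static unsigned int p6_event(unsigned int umask, unsigned int event)
    {
        return (umask << 8) | event;
    }

    int main(void)
    {
        printf("cache references = 0x%04x\n", p6_event(0x0f, 0x2e)); /* 0x0f2e */
        printf("cache misses     = 0x%04x\n", p6_event(0x01, 0x2e)); /* 0x012e */
        return 0;
    }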
Reported-by: Johannes Stezenbach Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 40e233a24d9f..fffc126dbdf0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -72,8 +72,8 @@ static const u64 p6_perfmon_event_map[] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0000, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0000, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -- cgit v1.2.3 From fbd8b1819e80ac5a176d085fdddc3a34d1499318 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Mon, 10 Aug 2009 19:56:45 -0300 Subject: x86: Clear incorrectly forced X86_FEATURE_LAHF_LM flag Due to an erratum with certain AMD Athlon 64 processors, the BIOS may need to force enable the LAHF_LM capability. Unfortunately, in at least one case, the BIOS does this even for processors that do not support the functionality. Add a specific check that will clear the feature bit for processors known not to support the LAHF/SAHF instructions. Signed-off-by: Kevin Winchester Acked-by: Borislav Petkov LKML-Reference: <4A80A5AD.2000209@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e2485b03f1cf..63fddcd082cd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -400,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) level = cpuid_eax(1); if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* + * Some BIOSes incorrectly force this feature, but only K8 + * revision D (model = 0x14) and later actually support it. + */ + if (c->x86_model < 0x14) + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); -- cgit v1.2.3 From e8055139d996e85722984968472868d6dccb1490 Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Tue, 11 Aug 2009 20:00:11 +0200 Subject: x86: Fix oops in identify_cpu() on CPUs without CPUID Kernel is broken for x86 CPUs without CPUID since 2.6.28. It crashes with NULL pointer dereference in identify_cpu(): 766 generic_identify(c); 767 768--> if (this_cpu->c_identify) 769 this_cpu->c_identify(c); this_cpu is NULL. This is because it's only initialized in get_cpu_vendor() function, which is not called if the CPU has no CPUID instruction. 
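The fix that follows is the usual "never-NULL ops pointer" pattern: the pointer is statically initialised to a default implementation, so code that runs before vendor detection, or when detection never happens as on CPUID-less CPUs, still has something valid to dereference. A standalone illustration of the pattern, with all names invented for the sketch:

    #include <stdio.h>

    struct cpu_ops {
        const char *vendor;
        void (*identify)(void);
    };

    static void default_identify(void) { puts("generic identify"); }

    static const struct cpu_ops default_ops = {
        .vendor   = "Unknown",
        .identify = default_identify,
    };

    /* Initialised to a safe default instead of NULL, so callers may
     * dereference it even if vendor detection never runs.
     */
    static const struct cpu_ops *this_ops = &default_ops;

    int main(void)
    {
        if (this_ops->identify)     /* safe: never NULL */
            this_ops->identify();
        printf("vendor: %s\n", this_ops->vendor);
        return 0;
    }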
Signed-off-by: Ondrej Zary LKML-Reference: <200908112000.15993.linux@rainbow-software.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 48 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c07af9a..5ce60a88027b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } -static const struct cpu_dev *this_cpu __cpuinitdata; +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + display_cacheinfo(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev __cpuinitconst default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu) static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_64 - display_cacheinfo(c); -#else - /* Not much we can do here... */ - /* Check if at least it has cpuid */ - if (c->cpuid_level == -1) { - /* No cpuid. It must be an ancient CPU */ - if (c->x86 == 4) - strcpy(c->x86_model_id, "486"); - else if (c->x86 == 3) - strcpy(c->x86_model_id, "386"); - } -#endif -} - -static const struct cpu_dev __cpuinitconst default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", - .c_x86_vendor = X86_VENDOR_UNKNOWN, -}; - static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; -- cgit v1.2.3 From 04da8a43da804723a550f00dd158fd5b5e25ae35 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:40:08 +0200 Subject: perf_counter, x86: Fix/improve apic fallback Johannes Stezenbach reported that his Pentium-M based laptop does not have the local APIC enabled by default, and hence perfcounters do not get initialized. Add a fallback for this case: allow non-sampled counters and return with an error on sampled counters. This allows 'perf stat' to work out of box - and allows 'perf top' and 'perf record' to fall back on a hrtimer based sampling method. ( Passing 'lapic' on the boot line will allow hardware sampling to occur - but if the APIC is disabled permanently by the hardware then this fallback still allows more systems to use perfcounters. ) Also decouple perfcounter support from X86_LOCAL_APIC. -v2: fix typo breaking counters on all other systems ... 
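The fallback described above amounts to a capability flag in the PMU descriptor: initialisation clears it when no usable local APIC is found, and counter setup then rejects only the sampling case while plain counting keeps working. A standalone sketch of that gating (the structure, names and exact error code are illustrative):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct pmu_caps { bool apic; };

    /* Refuse only sampled counters when no APIC/NMI path exists;
     * plain counting still works (illustrative, not the kernel code).
     */
    static int counter_init(const struct pmu_caps *pmu, bool sampling)
    {
        if (sampling && !pmu->apic)
            return -EOPNOTSUPP;
        return 0;
    }

    int main(void)
    {
        struct pmu_caps pmu = { .apic = false };

        printf("counting: %d\n", counter_init(&pmu, false)); /* 0 */
        printf("sampling: %d\n", counter_init(&pmu, true));  /* -EOPNOTSUPP */
        return 0;
    }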
Reported-by: Johannes Stezenbach Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8b..13ffa5df37d7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,6 +24,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE + select HAVE_PERF_COUNTERS if (!M386 && !M486) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB @@ -742,7 +743,6 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC - select HAVE_PERF_COUNTERS if (!M386 && !M486) config X86_IO_APIC def_bool y diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fffc126dbdf0..900332b800f8 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -55,6 +55,7 @@ struct x86_pmu { int num_counters_fixed; int counter_bits; u64 counter_mask; + int apic; u64 max_period; u64 intel_ctrl; }; @@ -613,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; if (nmi_watchdog == NMI_LOCAL_APIC) @@ -627,9 +629,11 @@ static bool reserve_pmc_hardware(void) if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } +#endif return true; +#ifdef CONFIG_X86_LOCAL_APIC eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); @@ -644,10 +648,12 @@ perfctr_fail: enable_lapic_nmi_watchdog(); return false; +#endif } static void release_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; for (i = 0; i < x86_pmu.num_counters; i++) { @@ -657,6 +663,7 @@ static void release_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); +#endif } static void hw_perf_counter_destroy(struct perf_counter *counter) @@ -748,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); + } else { + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * counters (user-space has to fall back and + * sample via a hrtimer based software counter): + */ + if (!x86_pmu.apic) + return -EOPNOTSUPP; } counter->destroy = hw_perf_counter_destroy; @@ -1449,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) void set_perf_counter_pending(void) { +#ifdef CONFIG_X86_LOCAL_APIC apic->send_IPI_self(LOCAL_PENDING_VECTOR); +#endif } void perf_counters_lapic_init(void) { - if (!x86_pmu_initialized()) +#ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif } static int __kprobes @@ -1484,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; +#ifdef CONFIG_X86_LOCAL_APIC apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif /* * Can't rely on the handled return value to say it was our NMI, two * counters could trigger 'simultaneously' raising two back-to-back NMIs. 
@@ -1515,6 +1537,7 @@ static struct x86_pmu p6_pmu = { .event_map = p6_pmu_event_map, .raw_event = p6_pmu_raw_event, .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, .max_period = (1ULL << 31) - 1, .version = 0, .num_counters = 2, @@ -1541,6 +1564,7 @@ static struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -1564,6 +1588,7 @@ static struct x86_pmu amd_pmu = { .num_counters = 4, .counter_bits = 48, .counter_mask = (1ULL << 48) - 1, + .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, }; @@ -1589,13 +1614,14 @@ static int p6_pmu_init(void) return -ENODEV; } + x86_pmu = p6_pmu; + if (!cpu_has_apic) { pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - return -ENODEV; + pr_info("no hardware sampling interrupt available.\n"); + x86_pmu.apic = 0; } - x86_pmu = p6_pmu; - return 0; } -- cgit v1.2.3 From 74d46d6b2d23d44d72c37df4c6a5d2e782f7b088 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Jul 2009 17:11:50 +0900 Subject: percpu, sparc64: fix sparse possible cpu map handling percpu code has been assuming num_possible_cpus() == nr_cpu_ids which is incorrect if cpu_possible_map contains holes. This causes percpu code to access beyond allocated memories and vmalloc areas. On a sparc64 machine with cpus 0 and 2 (u60), this triggers the following warning or fails boot. WARNING: at /devel/tj/os/work/mm/vmalloc.c:106 vmap_page_range_noflush+0x1f0/0x240() Modules linked in: Call Trace: [00000000004b17d0] vmap_page_range_noflush+0x1f0/0x240 [00000000004b1840] map_vm_area+0x20/0x60 [00000000004b1950] __vmalloc_area_node+0xd0/0x160 [0000000000593434] deflate_init+0x14/0xe0 [0000000000583b94] __crypto_alloc_tfm+0xd4/0x1e0 [00000000005844f0] crypto_alloc_base+0x50/0xa0 [000000000058b898] alg_test_comp+0x18/0x80 [000000000058dad4] alg_test+0x54/0x180 [000000000058af00] cryptomgr_test+0x40/0x60 [0000000000473098] kthread+0x58/0x80 [000000000042b590] kernel_thread+0x30/0x60 [0000000000472fd0] kthreadd+0xf0/0x160 ---[ end trace 429b268a213317ba ]--- This patch fixes generic percpu functions and sparc64 setup_per_cpu_areas() so that they handle sparse cpu_possible_map properly. Please note that on x86, cpu_possible_map() doesn't contain holes and thus num_possible_cpus() == nr_cpu_ids and this patch doesn't cause any behavior difference. Signed-off-by: Tejun Heo Acked-by: David S. 
Miller Cc: Ingo Molnar --- arch/sparc/kernel/smp_64.c | 4 ++-- arch/x86/kernel/setup_percpu.c | 14 +++++++------- mm/percpu.c | 33 ++++++++++++++++++--------------- 3 files changed, 27 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index fa44eaf8d897..3691907a43b4 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1499,7 +1499,7 @@ void __init setup_per_cpu_areas(void) dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE; - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); + ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpur_ptrs[0])); pcpur_ptrs = alloc_bootmem(ptrs_size); for_each_possible_cpu(cpu) { @@ -1514,7 +1514,7 @@ void __init setup_per_cpu_areas(void) /* allocate address and map */ vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * PCPU_CHUNK_SIZE; + vm.size = nr_cpu_ids * PCPU_CHUNK_SIZE; vm_area_register_early(&vm, PCPU_CHUNK_SIZE); for_each_possible_cpu(cpu) { diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 29a3eef7cf4a..07d81916f212 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -165,7 +165,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = num_possible_cpus() * PMD_SIZE; + size_t tot_size = nr_cpu_ids * PMD_SIZE; /* on non-NUMA, embedding is better */ if (!pcpu_need_numa()) @@ -199,7 +199,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0])); pcpul_map = alloc_bootmem(map_size); for_each_possible_cpu(cpu) { @@ -228,7 +228,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) /* allocate address and map */ pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * PMD_SIZE; + pcpul_vm.size = nr_cpu_ids * PMD_SIZE; vm_area_register_early(&pcpul_vm, PMD_SIZE); for_each_possible_cpu(cpu) { @@ -250,8 +250,8 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) PMD_SIZE, pcpul_vm.addr, NULL); /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) + for (i = 0; i < nr_cpu_ids - 1; i++) + for (j = i + 1; j < nr_cpu_ids; j++) if (pcpul_map[i].ptr > pcpul_map[j].ptr) { struct pcpul_ent tmp = pcpul_map[i]; pcpul_map[i] = pcpul_map[j]; @@ -288,7 +288,7 @@ void *pcpu_lpage_remapped(void *kaddr) { void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = num_possible_cpus() - 1; + int left = 0, right = nr_cpu_ids - 1; int pos; /* pcpul in use at all? 
*/ @@ -377,7 +377,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pcpu4k_nr_static_pages = PFN_UP(static_size); /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids * sizeof(pcpu4k_pages[0])); pcpu4k_pages = alloc_bootmem(pages_size); diff --git a/mm/percpu.c b/mm/percpu.c index b70f2acd8853..e0be1146f617 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -8,12 +8,12 @@ * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area + * chunk is consisted of nr_cpu_ids units and the first chunk is used + * for static percpu variables in the kernel image (special boot time + * alloc/init handling necessary as these areas need to be brought up + * before allocation services are running). Unit grows as necessary + * and all units grow or shrink in unison. When a chunk is filled up, + * another chunk is allocated. ie. in vmalloc area * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -558,7 +558,7 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, bool flush_tlb) { - unsigned int last = num_possible_cpus() - 1; + unsigned int last = nr_cpu_ids - 1; unsigned int cpu; /* unmap must not be done on immutable chunk */ @@ -643,7 +643,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, */ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; + unsigned int last = nr_cpu_ids - 1; unsigned int cpu; int err; @@ -1067,9 +1067,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, PFN_UP(size_sum)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *); if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; @@ -1248,7 +1248,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } else pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - chunk_size = pcpue_unit_size * num_possible_cpus(); + chunk_size = pcpue_unit_size * nr_cpu_ids; pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); @@ -1259,12 +1259,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } /* return the leftover and copy */ - for_each_possible_cpu(cpu) { + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *ptr = pcpue_ptr + cpu * pcpue_unit_size; - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); + if (cpu_possible(cpu)) { + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, 
static_size); + } else + free_bootmem(__pa(ptr), pcpue_unit_size); } /* we're ready, commit */ -- cgit v1.2.3
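The distinction the patch above leans on is that nr_cpu_ids is one past the highest possible CPU number while num_possible_cpus() is the population count of the possible mask, and the two diverge once the mask has holes. A standalone illustration using a hard-coded mask matching the u60 case from the changelog (cpus 0 and 2 possible, cpu 1 absent); the variable names only mirror the kernel ones:

    #include <stdio.h>

    int main(void)
    {
        /* possible mask with a hole: cpus 0 and 2 exist, cpu 1 does not */
        const unsigned int possible_mask = 0x5;                 /* bits 0 and 2 */
        unsigned int num_possible = __builtin_popcount(possible_mask);   /* 2 */
        unsigned int nr_cpu_ids   = 32 - __builtin_clz(possible_mask);   /* 3 */

        printf("num_possible_cpus() = %u\n", num_possible);
        printf("nr_cpu_ids          = %u\n", nr_cpu_ids);
        /* A per-cpu pointer array must therefore be sized nr_cpu_ids, not
         * num_possible_cpus(), or indexing it by cpu id 2 overflows it.
         */
        return 0;
    }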