diff options
author | Linus Torvalds <torvalds@g5.osdl.org> | 2006-06-26 10:51:09 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-06-26 10:51:09 -0700 |
commit | 81a07d7588d376c530d006e24d7981304ce96e16 (patch) | |
tree | 1608e094c88b9702c86cf2e6f65339aab9ea3f3f /arch | |
parent | 8871e73fdbde07d0a41393f7ee30787b65387b36 (diff) | |
parent | 8501a2fbe762b21d2504ed3aca3b52be61b5e6e4 (diff) | |
download | lwn-81a07d7588d376c530d006e24d7981304ce96e16.tar.gz lwn-81a07d7588d376c530d006e24d7981304ce96e16.zip |
Merge branch 'x86-64'
* x86-64: (83 commits)
[PATCH] x86_64: x86_64 stack usage debugging
[PATCH] x86_64: (resend) x86_64 stack overflow debugging
[PATCH] x86_64: msi_apic.c build fix
[PATCH] x86_64: i386/x86-64 Add nmi watchdog support for new Intel CPUs
[PATCH] x86_64: Avoid broadcasting NMI IPIs
[PATCH] x86_64: fix apic error on bootup
[PATCH] x86_64: enlarge window for stack growth
[PATCH] x86_64: Minor string functions optimizations
[PATCH] x86_64: Move export symbols to their C functions
[PATCH] x86_64: Standardize i386/x86_64 handling of NMI_VECTOR
[PATCH] x86_64: Fix modular pc speaker
[PATCH] x86_64: remove sys32_ni_syscall()
[PATCH] x86_64: Do not use -ffunction-sections for modules
[PATCH] x86_64: Add cpu_relax to apic_wait_icr_idle
[PATCH] x86_64: adjust kstack_depth_to_print default
[PATCH] i386/x86-64: adjust /proc/interrupts column headings
[PATCH] x86_64: Fix race in cpu_local_* on preemptible kernels
[PATCH] x86_64: Fix fast check in safe_smp_processor_id
[PATCH] x86_64: x86_64 setup.c - printing cmp related boottime information
[PATCH] i386/x86-64/ia64: Move polling flag into thread_info_status
...
Manual resolve of trivial conflict in arch/i386/kernel/Makefile
Diffstat (limited to 'arch')
79 files changed, 3326 insertions, 1030 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 374fb50608a0..f3eaf22f273d 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -328,6 +328,15 @@ config X86_MCE_P4THERMAL Enabling this feature will cause a message to be printed when the P4 enters thermal throttling. +config VM86 + default y + bool "Enable VM86 support" if EMBEDDED + help + This option is required by programs like DOSEMU to run 16-bit legacy + code on X86 processors. It also may be needed by software like + XFree86 to initialize some video cards via BIOS. Disabling this + option saves about 6k. + config TOSHIBA tristate "Toshiba Laptop support" ---help--- @@ -1068,6 +1077,10 @@ config SCx200HR_TIMER processor goes idle (as is done by the scheduler). The other workaround is idle=poll boot option. +config K8_NB + def_bool y + depends on AGP_AMD64 + source "drivers/pcmcia/Kconfig" source "drivers/pci/hotplug/Kconfig" diff --git a/arch/i386/boot/Makefile b/arch/i386/boot/Makefile index 33e55476381b..e97946626064 100644 --- a/arch/i386/boot/Makefile +++ b/arch/i386/boot/Makefile @@ -109,8 +109,13 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf isoimage: $(BOOTIMAGE) -rm -rf $(obj)/isoimage mkdir $(obj)/isoimage - cp `echo /usr/lib*/syslinux/isolinux.bin | awk '{ print $1; }'` \ - $(obj)/isoimage + for i in lib lib64 share end ; do \ + if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \ + cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \ + break ; \ + fi ; \ + if [ $$i = end ] ; then exit 1 ; fi ; \ + done cp $(BOOTIMAGE) $(obj)/isoimage/linux echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg if [ -f '$(FDINITRD)' ] ; then \ diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index f19f3a7492a5..b2ccd543410d 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -24,14 +24,6 @@ #undef memset #undef memcpy - -/* - * Why do we do this? Don't ask me.. - * - * Incomprehensible are the ways of bootloaders. - */ -static void* memset(void *, int, size_t); -static void* memcpy(void *, __const void *, size_t); #define memzero(s, n) memset ((s), 0, (n)) typedef unsigned char uch; @@ -93,7 +85,7 @@ static unsigned char *real_mode; /* Pointer to real-mode data */ #endif #define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0)) -extern char input_data[]; +extern unsigned char input_data[]; extern int input_len; static long bytes_out = 0; @@ -103,6 +95,9 @@ static unsigned long output_ptr = 0; static void *malloc(int size); static void free(void *where); +static void *memset(void *s, int c, unsigned n); +static void *memcpy(void *dest, const void *src, unsigned n); + static void putstr(const char *); extern int end; @@ -205,7 +200,7 @@ static void putstr(const char *s) outb_p(0xff & (pos >> 1), vidport+1); } -static void* memset(void* s, int c, size_t n) +static void* memset(void* s, int c, unsigned n) { int i; char *ss = (char*)s; @@ -214,14 +209,13 @@ static void* memset(void* s, int c, size_t n) return s; } -static void* memcpy(void* __dest, __const void* __src, - size_t __n) +static void* memcpy(void* dest, const void* src, unsigned n) { int i; - char *d = (char *)__dest, *s = (char *)__src; + char *d = (char *)dest, *s = (char *)src; - for (i=0;i<__n;i++) d[i] = s[i]; - return __dest; + for (i=0;i<n;i++) d[i] = s[i]; + return dest; } /* =========================================================================== @@ -309,7 +303,7 @@ static void setup_normal_output_buffer(void) #else if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif - output_data = (char *)__PHYSICAL_START; /* Normally Points to 1M */ + output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */ free_mem_end_ptr = (long)real_mode; } @@ -324,11 +318,9 @@ static void setup_output_buffer_if_we_run_high(struct moveparams *mv) #ifdef STANDARD_MEMORY_BIOS_CALL if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); #else - if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < - (3*1024)) - error("Less than 4MB of memory"); + if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); #endif - mv->low_buffer_start = output_data = (char *)LOW_BUFFER_START; + mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START; low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff; low_buffer_size = low_buffer_end - LOW_BUFFER_START; diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 0fac85df64f1..5e70c2fb273a 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_DOUBLEFAULT) += doublefault.o obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o +obj-$(CONFIG_K8_NB) += k8.o EXTRA_AFLAGS := -traditional @@ -76,3 +77,6 @@ SYSCFLAGS_vsyscall-syms.o = -r $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \ $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE $(call if_changed,syscall) + +k8-y += ../../x86_64/kernel/k8.o + diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 5cbd6f99fb2a..50eb0e03777e 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -4,27 +4,41 @@ #include <asm/alternative.h> #include <asm/sections.h> -#define DEBUG 0 -#if DEBUG -# define DPRINTK(fmt, args...) printk(fmt, args) -#else -# define DPRINTK(fmt, args...) -#endif +static int no_replacement = 0; +static int smp_alt_once = 0; +static int debug_alternative = 0; + +static int __init noreplacement_setup(char *s) +{ + no_replacement = 1; + return 1; +} +static int __init bootonly(char *str) +{ + smp_alt_once = 1; + return 1; +} +static int __init debug_alt(char *str) +{ + debug_alternative = 1; + return 1; +} +__setup("noreplacement", noreplacement_setup); +__setup("smp-alt-boot", bootonly); +__setup("debug-alternative", debug_alt); + +#define DPRINTK(fmt, args...) if (debug_alternative) \ + printk(KERN_DEBUG fmt, args) + +#ifdef GENERIC_NOP1 /* Use inline assembly to define this because the nops are defined as inline assembly strings in the include files and we cannot get them easily into strings. */ asm("\t.data\nintelnops: " GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 GENERIC_NOP7 GENERIC_NOP8); -asm("\t.data\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8); -asm("\t.data\nk7nops: " - K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 - K7_NOP7 K7_NOP8); - -extern unsigned char intelnops[], k8nops[], k7nops[]; +extern unsigned char intelnops[]; static unsigned char *intel_nops[ASM_NOP_MAX+1] = { NULL, intelnops, @@ -36,6 +50,13 @@ static unsigned char *intel_nops[ASM_NOP_MAX+1] = { intelnops + 1 + 2 + 3 + 4 + 5 + 6, intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, }; +#endif + +#ifdef K8_NOP1 +asm("\t.data\nk8nops: " + K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 + K8_NOP7 K8_NOP8); +extern unsigned char k8nops[]; static unsigned char *k8_nops[ASM_NOP_MAX+1] = { NULL, k8nops, @@ -47,6 +68,13 @@ static unsigned char *k8_nops[ASM_NOP_MAX+1] = { k8nops + 1 + 2 + 3 + 4 + 5 + 6, k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, }; +#endif + +#ifdef K7_NOP1 +asm("\t.data\nk7nops: " + K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 + K7_NOP7 K7_NOP8); +extern unsigned char k7nops[]; static unsigned char *k7_nops[ASM_NOP_MAX+1] = { NULL, k7nops, @@ -58,6 +86,18 @@ static unsigned char *k7_nops[ASM_NOP_MAX+1] = { k7nops + 1 + 2 + 3 + 4 + 5 + 6, k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, }; +#endif + +#ifdef CONFIG_X86_64 + +extern char __vsyscall_0; +static inline unsigned char** find_nop_table(void) +{ + return k8_nops; +} + +#else /* CONFIG_X86_64 */ + static struct nop { int cpuid; unsigned char **noptable; @@ -67,14 +107,6 @@ static struct nop { { -1, NULL } }; - -extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; -extern u8 *__smp_locks[], *__smp_locks_end[]; - -extern u8 __smp_alt_begin[], __smp_alt_end[]; - - static unsigned char** find_nop_table(void) { unsigned char **noptable = intel_nops; @@ -89,6 +121,14 @@ static unsigned char** find_nop_table(void) return noptable; } +#endif /* CONFIG_X86_64 */ + +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; +extern u8 *__smp_locks[], *__smp_locks_end[]; + +extern u8 __smp_alt_begin[], __smp_alt_end[]; + /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with self modifying code. This implies that assymetric systems where @@ -99,6 +139,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { unsigned char **noptable = find_nop_table(); struct alt_instr *a; + u8 *instr; int diff, i, k; DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); @@ -106,7 +147,16 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end) BUG_ON(a->replacementlen > a->instrlen); if (!boot_cpu_has(a->cpuid)) continue; - memcpy(a->instr, a->replacement, a->replacementlen); + instr = a->instr; +#ifdef CONFIG_X86_64 + /* vsyscall code is not mapped yet. resolve it manually. */ + if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { + instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); + DPRINTK("%s: vsyscall fixup: %p => %p\n", + __FUNCTION__, a->instr, instr); + } +#endif + memcpy(instr, a->replacement, a->replacementlen); diff = a->instrlen - a->replacementlen; /* Pad the rest with nops */ for (i = a->replacementlen; diff > 0; diff -= k, i += k) { @@ -186,14 +236,6 @@ struct smp_alt_module { static LIST_HEAD(smp_alt_modules); static DEFINE_SPINLOCK(smp_alt); -static int smp_alt_once = 0; -static int __init bootonly(char *str) -{ - smp_alt_once = 1; - return 1; -} -__setup("smp-alt-boot", bootonly); - void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) @@ -201,6 +243,9 @@ void alternatives_smp_module_add(struct module *mod, char *name, struct smp_alt_module *smp; unsigned long flags; + if (no_replacement) + return; + if (smp_alt_once) { if (boot_cpu_has(X86_FEATURE_UP)) alternatives_smp_unlock(locks, locks_end, @@ -235,7 +280,7 @@ void alternatives_smp_module_del(struct module *mod) struct smp_alt_module *item; unsigned long flags; - if (smp_alt_once) + if (no_replacement || smp_alt_once) return; spin_lock_irqsave(&smp_alt, flags); @@ -256,7 +301,7 @@ void alternatives_smp_switch(int smp) struct smp_alt_module *mod; unsigned long flags; - if (smp_alt_once) + if (no_replacement || smp_alt_once) return; BUG_ON(!smp && (num_online_cpus() > 1)); @@ -285,6 +330,13 @@ void alternatives_smp_switch(int smp) void __init alternative_instructions(void) { + if (no_replacement) { + printk(KERN_INFO "(SMP-)alternatives turned off\n"); + free_init_pages("SMP alternatives", + (unsigned long)__smp_alt_begin, + (unsigned long)__smp_alt_end); + return; + } apply_alternatives(__alt_instructions, __alt_instructions_end); /* switch to patch-once-at-boottime-only mode and free the diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 5ab59c12335b..7ce09492fc0c 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -36,6 +36,7 @@ #include <asm/arch_hooks.h> #include <asm/hpet.h> #include <asm/i8253.h> +#include <asm/nmi.h> #include <mach_apic.h> #include <mach_apicdef.h> @@ -156,7 +157,7 @@ void clear_local_APIC(void) maxlvt = get_maxlvt(); /* - * Masking an LVT entry on a P6 can trigger a local APIC error + * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. */ if (maxlvt >= 3) { @@ -1117,7 +1118,18 @@ void disable_APIC_timer(void) unsigned long v; v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + /* + * When an illegal vector value (0-15) is written to an LVT + * entry and delivery mode is Fixed, the APIC may signal an + * illegal vector error, with out regard to whether the mask + * bit is set or whether an interrupt is actually seen on input. + * + * Boot sequence might call this function when the LVTT has + * '0' vector value. So make sure vector field is set to + * valid value. + */ + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write_around(APIC_LVTT, v); } } diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 9e819eb68229..7c5729d1fd06 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -764,9 +764,9 @@ static int apm_do_idle(void) int idled = 0; int polling; - polling = test_thread_flag(TIF_POLLING_NRFLAG); + polling = !!(current_thread_info()->status & TS_POLLING); if (polling) { - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); } if (!need_resched()) { @@ -774,7 +774,7 @@ static int apm_do_idle(void) ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); } if (polling) - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; if (!idled) return 0; diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 786d1a57048b..fd0457c9c827 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -224,15 +224,17 @@ static void __init init_amd(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_HT /* - * On a AMD dual core setup the lower bits of the APIC id - * distingush the cores. Assumes number of cores is a power - * of two. + * On a AMD multi core setup the lower bits of the APIC id + * distingush the cores. */ if (c->x86_max_cores > 1) { int cpu = smp_processor_id(); - unsigned bits = 0; - while ((1 << bits) < c->x86_max_cores) - bits++; + unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf; + + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } cpu_core_id[cpu] = phys_proc_id[cpu] & ((1<<bits)-1); phys_proc_id[cpu] >>= bits; printk(KERN_INFO "CPU %d(%d) -> Core %d\n", @@ -240,6 +242,8 @@ static void __init init_amd(struct cpuinfo_x86 *c) } #endif + if (cpuid_eax(0x80000000) >= 0x80000006) + num_cache_leaves = 3; } static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 5386b29bb5a5..10afc645c540 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -122,6 +122,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) select_idle_routine(c); l2 = init_intel_cacheinfo(c); + if (c->cpuid_level > 9 ) { + unsigned eax = cpuid_eax(10); + /* Check for version and the number of counters */ + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) + set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability); + } /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index c8547a6fa7e6..6c37b4fd8ce2 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -4,6 +4,7 @@ * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. + * Andi Kleen : CPUID4 emulation on AMD. */ #include <linux/init.h> @@ -130,25 +131,111 @@ struct _cpuid4_info { cpumask_t shared_cpu_map; }; -static unsigned short num_cache_leaves; +unsigned short num_cache_leaves; + +/* AMD doesn't have CPUID4. Emulate it here to report the same + information to the user. This makes some assumptions about the machine: + No L3, L2 not shared, no SMT etc. that is currently true on AMD CPUs. + + In theory the TLBs could be reported as fake type (they are in "dummy"). + Maybe later */ +union l1_cache { + struct { + unsigned line_size : 8; + unsigned lines_per_tag : 8; + unsigned assoc : 8; + unsigned size_in_kb : 8; + }; + unsigned val; +}; + +union l2_cache { + struct { + unsigned line_size : 8; + unsigned lines_per_tag : 4; + unsigned assoc : 4; + unsigned size_in_kb : 16; + }; + unsigned val; +}; + +static unsigned short assocs[] = { + [1] = 1, [2] = 2, [4] = 4, [6] = 8, + [8] = 16, + [0xf] = 0xffff // ?? + }; +static unsigned char levels[] = { 1, 1, 2 }; +static unsigned char types[] = { 1, 2, 3 }; + +static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, + union _cpuid4_leaf_ecx *ecx) +{ + unsigned dummy; + unsigned line_size, lines_per_tag, assoc, size_in_kb; + union l1_cache l1i, l1d; + union l2_cache l2; + + eax->full = 0; + ebx->full = 0; + ecx->full = 0; + + cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); + cpuid(0x80000006, &dummy, &dummy, &l2.val, &dummy); + + if (leaf > 2 || !l1d.val || !l1i.val || !l2.val) + return; + + eax->split.is_self_initializing = 1; + eax->split.type = types[leaf]; + eax->split.level = levels[leaf]; + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; + + if (leaf <= 1) { + union l1_cache *l1 = leaf == 0 ? &l1d : &l1i; + assoc = l1->assoc; + line_size = l1->line_size; + lines_per_tag = l1->lines_per_tag; + size_in_kb = l1->size_in_kb; + } else { + assoc = l2.assoc; + line_size = l2.line_size; + lines_per_tag = l2.lines_per_tag; + /* cpu_data has errata corrections for K7 applied */ + size_in_kb = current_cpu_data.x86_cache_size; + } + + if (assoc == 0xf) + eax->split.is_fully_associative = 1; + ebx->split.coherency_line_size = line_size - 1; + ebx->split.ways_of_associativity = assocs[assoc] - 1; + ebx->split.physical_line_partition = lines_per_tag - 1; + ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / + (ebx->split.ways_of_associativity + 1) - 1; +} static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) { - unsigned int eax, ebx, ecx, edx; - union _cpuid4_leaf_eax cache_eax; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + unsigned edx; - cpuid_count(4, index, &eax, &ebx, &ecx, &edx); - cache_eax.full = eax; - if (cache_eax.split.type == CACHE_TYPE_NULL) + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + amd_cpuid4(index, &eax, &ebx, &ecx); + else + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + if (eax.split.type == CACHE_TYPE_NULL) return -EIO; /* better error ? */ - this_leaf->eax.full = eax; - this_leaf->ebx.full = ebx; - this_leaf->ecx.full = ecx; - this_leaf->size = (this_leaf->ecx.split.number_of_sets + 1) * - (this_leaf->ebx.split.coherency_line_size + 1) * - (this_leaf->ebx.split.physical_line_partition + 1) * - (this_leaf->ebx.split.ways_of_associativity + 1); + this_leaf->eax = eax; + this_leaf->ebx = ebx; + this_leaf->ecx = ecx; + this_leaf->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); return 0; } diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c index 21dc1bbb8067..0c88d3ec8c18 100644 --- a/arch/i386/kernel/crash.c +++ b/arch/i386/kernel/crash.c @@ -120,14 +120,9 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu) return 1; } -/* - * By using the NMI code instead of a vector we just sneak thru the - * word generator coming out with just what we want. AND it does - * not matter if clustered_apic_mode is set or not. - */ static void smp_send_nmi_allbutself(void) { - send_IPI_allbutself(APIC_DM_NMI); + send_IPI_allbutself(NMI_VECTOR); } static void nmi_shootdown_cpus(void) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index cfc683f153b9..e6e4506e749a 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -48,6 +48,7 @@ #include <asm/smp.h> #include <asm/page.h> #include <asm/desc.h> +#include <asm/dwarf2.h> #include "irq_vectors.h" #define nr_syscalls ((syscall_table_size)/4) @@ -85,31 +86,67 @@ VM_MASK = 0x00020000 #define SAVE_ALL \ cld; \ pushl %es; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET es, 0;*/\ pushl %ds; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET ds, 0;*/\ pushl %eax; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET eax, 0;\ pushl %ebp; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ebp, 0;\ pushl %edi; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET edi, 0;\ pushl %esi; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET esi, 0;\ pushl %edx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET edx, 0;\ pushl %ecx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ecx, 0;\ pushl %ebx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ebx, 0;\ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ movl %edx, %es; #define RESTORE_INT_REGS \ popl %ebx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ebx;\ popl %ecx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ecx;\ popl %edx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE edx;\ popl %esi; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE esi;\ popl %edi; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE edi;\ popl %ebp; \ - popl %eax + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ebp;\ + popl %eax; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE eax #define RESTORE_REGS \ RESTORE_INT_REGS; \ 1: popl %ds; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE ds;*/\ 2: popl %es; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE es;*/\ .section .fixup,"ax"; \ 3: movl $0,(%esp); \ jmp 1b; \ @@ -122,13 +159,43 @@ VM_MASK = 0x00020000 .long 2b,4b; \ .previous +#define RING0_INT_FRAME \ + CFI_STARTPROC simple;\ + CFI_DEF_CFA esp, 3*4;\ + /*CFI_OFFSET cs, -2*4;*/\ + CFI_OFFSET eip, -3*4 + +#define RING0_EC_FRAME \ + CFI_STARTPROC simple;\ + CFI_DEF_CFA esp, 4*4;\ + /*CFI_OFFSET cs, -2*4;*/\ + CFI_OFFSET eip, -3*4 + +#define RING0_PTREGS_FRAME \ + CFI_STARTPROC simple;\ + CFI_DEF_CFA esp, OLDESP-EBX;\ + /*CFI_OFFSET cs, CS-OLDESP;*/\ + CFI_OFFSET eip, EIP-OLDESP;\ + /*CFI_OFFSET es, ES-OLDESP;*/\ + /*CFI_OFFSET ds, DS-OLDESP;*/\ + CFI_OFFSET eax, EAX-OLDESP;\ + CFI_OFFSET ebp, EBP-OLDESP;\ + CFI_OFFSET edi, EDI-OLDESP;\ + CFI_OFFSET esi, ESI-OLDESP;\ + CFI_OFFSET edx, EDX-OLDESP;\ + CFI_OFFSET ecx, ECX-OLDESP;\ + CFI_OFFSET ebx, EBX-OLDESP ENTRY(ret_from_fork) + CFI_STARTPROC pushl %eax + CFI_ADJUST_CFA_OFFSET -4 call schedule_tail GET_THREAD_INFO(%ebp) popl %eax + CFI_ADJUST_CFA_OFFSET -4 jmp syscall_exit + CFI_ENDPROC /* * Return to user mode is not as complex as all this looks, @@ -139,6 +206,7 @@ ENTRY(ret_from_fork) # userspace resumption stub bypassing syscall exit tracing ALIGN + RING0_PTREGS_FRAME ret_from_exception: preempt_stop ret_from_intr: @@ -171,20 +239,33 @@ need_resched: call preempt_schedule_irq jmp need_resched #endif + CFI_ENDPROC /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ # sysenter call handler stub ENTRY(sysenter_entry) + CFI_STARTPROC simple + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp movl TSS_sysenter_esp0(%esp),%esp sysenter_past_esp: sti pushl $(__USER_DS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ss, 0*/ pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esp, 0 pushfl + CFI_ADJUST_CFA_OFFSET 4 pushl $(__USER_CS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET cs, 0*/ pushl $SYSENTER_RETURN + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 /* * Load the potential sixth argument from user stack. @@ -199,6 +280,7 @@ sysenter_past_esp: .previous pushl %eax + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) @@ -219,11 +301,14 @@ sysenter_past_esp: xorl %ebp,%ebp sti sysexit + CFI_ENDPROC # system call handler stub ENTRY(system_call) + RING0_INT_FRAME # can't unwind into user space anyway pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) testl $TF_MASK,EFLAGS(%esp) @@ -256,10 +341,12 @@ restore_all: movb CS(%esp), %al andl $(VM_MASK | (4 << 8) | 3), %eax cmpl $((4 << 8) | 3), %eax + CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS restore_nocheck: RESTORE_REGS addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 1: iret .section .fixup,"ax" iret_exc: @@ -273,6 +360,7 @@ iret_exc: .long 1b,iret_exc .previous + CFI_RESTORE_STATE ldt_ss: larl OLDSS(%esp), %eax jnz restore_nocheck @@ -285,11 +373,13 @@ ldt_ss: * CPUs, which we can try to work around to make * dosemu and wine happy. */ subl $8, %esp # reserve space for switch16 pointer + CFI_ADJUST_CFA_OFFSET 8 cli movl %esp, %eax /* Set up the 16bit stack frame with switch32 pointer on top, * and a switch16 pointer on top of the current frame. */ call setup_x86_bogus_stack + CFI_ADJUST_CFA_OFFSET -8 # frame has moved RESTORE_REGS lss 20+4(%esp), %esp # switch to 16bit stack 1: iret @@ -297,9 +387,11 @@ ldt_ss: .align 4 .long 1b,iret_exc .previous + CFI_ENDPROC # perform work that needs to be done immediately before resumption ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig @@ -329,8 +421,10 @@ work_notifysig: # deal with pending signals and work_notifysig_v86: #ifdef CONFIG_VM86 pushl %ecx # save ti_flags for do_notify_resume + CFI_ADJUST_CFA_OFFSET 4 call save_v86_state # %eax contains pt_regs pointer popl %ecx + CFI_ADJUST_CFA_OFFSET -4 movl %eax, %esp xorl %edx, %edx call do_notify_resume @@ -363,19 +457,21 @@ syscall_exit_work: movl $1, %edx call do_syscall_trace jmp resume_userspace + CFI_ENDPROC - ALIGN + RING0_INT_FRAME # can't unwind into user space anyway syscall_fault: pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) movl $-EFAULT,EAX(%esp) jmp resume_userspace - ALIGN syscall_badsys: movl $-ENOSYS,EAX(%esp) jmp resume_userspace + CFI_ENDPROC #define FIXUP_ESPFIX_STACK \ movl %esp, %eax; \ @@ -387,16 +483,21 @@ syscall_badsys: movl %eax, %esp; #define UNWIND_ESPFIX_STACK \ pushl %eax; \ + CFI_ADJUST_CFA_OFFSET 4; \ movl %ss, %eax; \ /* see if on 16bit stack */ \ cmpw $__ESPFIX_SS, %ax; \ - jne 28f; \ - movl $__KERNEL_DS, %edx; \ - movl %edx, %ds; \ - movl %edx, %es; \ + je 28f; \ +27: popl %eax; \ + CFI_ADJUST_CFA_OFFSET -4; \ +.section .fixup,"ax"; \ +28: movl $__KERNEL_DS, %eax; \ + movl %eax, %ds; \ + movl %eax, %es; \ /* switch to 32bit stack */ \ - FIXUP_ESPFIX_STACK \ -28: popl %eax; + FIXUP_ESPFIX_STACK; \ + jmp 27b; \ +.previous /* * Build the entry stubs and pointer table with @@ -408,9 +509,14 @@ ENTRY(interrupt) vector=0 ENTRY(irq_entries_start) + RING0_INT_FRAME .rept NR_IRQS ALIGN + .if vector + CFI_ADJUST_CFA_OFFSET -4 + .endif 1: pushl $vector-256 + CFI_ADJUST_CFA_OFFSET 4 jmp common_interrupt .data .long 1b @@ -424,60 +530,99 @@ common_interrupt: movl %esp,%eax call do_IRQ jmp ret_from_intr + CFI_ENDPROC #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ + RING0_INT_FRAME; \ pushl $nr-256; \ - SAVE_ALL \ + CFI_ADJUST_CFA_OFFSET 4; \ + SAVE_ALL; \ movl %esp,%eax; \ call smp_/**/name; \ - jmp ret_from_intr; + jmp ret_from_intr; \ + CFI_ENDPROC /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" ENTRY(divide_error) + RING0_INT_FRAME pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 pushl $do_divide_error + CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 xorl %eax, %eax pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 decl %eax # eax = -1 pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 cld pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ UNWIND_ESPFIX_STACK popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_REGISTER es, ecx*/ movl ES(%esp), %edi # get the function address movl ORIG_EAX(%esp), %edx # get the error code movl %eax, ORIG_EAX(%esp) movl %ecx, ES(%esp) + /*CFI_REL_OFFSET es, ES*/ movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es movl %esp,%eax # pt_regs pointer call *%edi jmp ret_from_exception + CFI_ENDPROC ENTRY(coprocessor_error) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(simd_coprocessor_error) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_simd_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(device_not_available) + RING0_INT_FRAME pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL movl %cr0, %eax testl $0x4, %eax # EM (math emulation bit) @@ -487,9 +632,12 @@ ENTRY(device_not_available) jmp ret_from_exception device_not_available_emulate: pushl $0 # temporary storage for ORIG_EIP + CFI_ADJUST_CFA_OFFSET 4 call math_emulate addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 jmp ret_from_exception + CFI_ENDPROC /* * Debug traps and NMI can happen at the one SYSENTER instruction @@ -514,16 +662,19 @@ label: \ pushl $sysenter_past_esp KPROBE_ENTRY(debug) + RING0_INT_FRAME cmpl $sysenter_entry,(%esp) jne debug_stack_correct FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) debug_stack_correct: pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL xorl %edx,%edx # error code 0 movl %esp,%eax # pt_regs pointer call do_debug jmp ret_from_exception + CFI_ENDPROC .previous .text /* * NMI is doubly nasty. It can happen _while_ we're handling @@ -534,14 +685,18 @@ debug_stack_correct: * fault happened on the sysenter path. */ ENTRY(nmi) + RING0_INT_FRAME pushl %eax + CFI_ADJUST_CFA_OFFSET 4 movl %ss, %eax cmpw $__ESPFIX_SS, %ax popl %eax + CFI_ADJUST_CFA_OFFSET -4 je nmi_16bit_stack cmpl $sysenter_entry,(%esp) je nmi_stack_fixup pushl %eax + CFI_ADJUST_CFA_OFFSET 4 movl %esp,%eax /* Do not access memory above the end of our stack page, * it might not exist. @@ -549,16 +704,19 @@ ENTRY(nmi) andl $(THREAD_SIZE-1),%eax cmpl $(THREAD_SIZE-20),%eax popl %eax + CFI_ADJUST_CFA_OFFSET -4 jae nmi_stack_correct cmpl $sysenter_entry,12(%esp) je nmi_debug_stack_check nmi_stack_correct: pushl %eax + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi jmp restore_all + CFI_ENDPROC nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) @@ -574,94 +732,177 @@ nmi_debug_stack_check: jmp nmi_stack_correct nmi_16bit_stack: + RING0_INT_FRAME /* create the pointer to lss back */ pushl %ss + CFI_ADJUST_CFA_OFFSET 4 pushl %esp + CFI_ADJUST_CFA_OFFSET 4 movzwl %sp, %esp addw $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 .endr pushl %eax + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp + CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved xorl %edx,%edx # zero error code call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to 16bit stack 1: iret + CFI_ENDPROC .section __ex_table,"a" .align 4 .long 1b,iret_exc .previous KPROBE_ENTRY(int3) + RING0_INT_FRAME pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 jmp ret_from_exception + CFI_ENDPROC .previous .text ENTRY(overflow) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_overflow + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(bounds) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_bounds + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(invalid_op) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_invalid_op + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(coprocessor_segment_overrun) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_coprocessor_segment_overrun + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(invalid_TSS) + RING0_EC_FRAME pushl $do_invalid_TSS + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(segment_not_present) + RING0_EC_FRAME pushl $do_segment_not_present + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(stack_segment) + RING0_EC_FRAME pushl $do_stack_segment + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC KPROBE_ENTRY(general_protection) + RING0_EC_FRAME pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC .previous .text ENTRY(alignment_check) + RING0_EC_FRAME pushl $do_alignment_check + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC KPROBE_ENTRY(page_fault) + RING0_EC_FRAME pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC .previous .text #ifdef CONFIG_X86_MCE ENTRY(machine_check) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl machine_check_vector + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC #endif ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_spurious_interrupt_bug + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movl 4(%esp), %edx + movl (%esp), %ecx + leal 4(%esp), %eax + movl %ebx, EBX(%edx) + xorl %ebx, %ebx + movl %ebx, ECX(%edx) + movl %ebx, EDX(%edx) + movl %esi, ESI(%edx) + movl %edi, EDI(%edx) + movl %ebp, EBP(%edx) + movl %ebx, EAX(%edx) + movl $__USER_DS, DS(%edx) + movl $__USER_DS, ES(%edx) + movl %ebx, ORIG_EAX(%edx) + movl %ecx, EIP(%edx) + movl 12(%esp), %ecx + movl $__KERNEL_CS, CS(%edx) + movl %ebx, EFLAGS(%edx) + movl %eax, OLDESP(%edx) + movl 8(%esp), %eax + movl %ecx, 8(%esp) + movl EBX(%edx), %ebx + movl $__KERNEL_DS, OLDSS(%edx) + jmpl *%eax + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif .section .rodata,"a" #include "syscall_table.S" diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index a62df3e764c5..72ae414e4d49 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -38,6 +38,7 @@ #include <asm/desc.h> #include <asm/timer.h> #include <asm/i8259.h> +#include <asm/nmi.h> #include <mach_apic.h> @@ -50,6 +51,7 @@ atomic_t irq_mis_count; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); int timer_over_8254 __initdata = 1; @@ -1161,10 +1163,17 @@ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; int assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; + unsigned long flags; + int vector; + + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); - BUG_ON(irq >= NR_IRQ_VECTORS); - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) + spin_lock_irqsave(&vector_lock, flags); + + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { + spin_unlock_irqrestore(&vector_lock, flags); return IO_APIC_VECTOR(irq); + } next: current_vector += 8; if (current_vector == SYSCALL_VECTOR) @@ -1172,16 +1181,21 @@ next: if (current_vector >= FIRST_SYSTEM_VECTOR) { offset++; - if (!(offset%8)) + if (!(offset%8)) { + spin_unlock_irqrestore(&vector_lock, flags); return -ENOSPC; + } current_vector = FIRST_DEVICE_VECTOR + offset; } - vector_irq[current_vector] = irq; + vector = current_vector; + vector_irq[vector] = irq; if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = current_vector; + IO_APIC_VECTOR(irq) = vector; - return current_vector; + spin_unlock_irqrestore(&vector_lock, flags); + + return vector; } static struct hw_interrupt_type ioapic_level_type; @@ -1193,21 +1207,14 @@ static struct hw_interrupt_type ioapic_edge_type; static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) { - if (use_pci_vector() && !platform_legacy_irq(irq)) { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[vector].handler = &ioapic_level_type; - else - irq_desc[vector].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[vector]); - } else { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[irq].handler = &ioapic_level_type; - else - irq_desc[irq].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[irq]); - } + unsigned idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[idx].handler = &ioapic_level_type; + else + irq_desc[idx].handler = &ioapic_edge_type; + set_intr_gate(vector, interrupt[idx]); } static void __init setup_IO_APIC_irqs(void) diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 49ce4c31b713..061533e0cb5e 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -227,7 +227,7 @@ int show_interrupts(struct seq_file *p, void *v) if (i == 0) { seq_printf(p, " "); for_each_online_cpu(j) - seq_printf(p, "CPU%d ",j); + seq_printf(p, "CPU%-8d",j); seq_putc(p, '\n'); } diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index d43b498ec745..a76e93146585 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -14,21 +14,17 @@ */ #include <linux/config.h> -#include <linux/mm.h> #include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/smp_lock.h> #include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/kernel_stat.h> #include <linux/module.h> #include <linux/nmi.h> #include <linux/sysdev.h> #include <linux/sysctl.h> +#include <linux/percpu.h> #include <asm/smp.h> -#include <asm/div64.h> #include <asm/nmi.h> +#include <asm/intel_arch_perfmon.h> #include "mach_traps.h" @@ -100,6 +96,9 @@ int nmi_active; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + #ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all @@ -212,6 +211,8 @@ static int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); +static void disable_intel_arch_watchdog(void); + static void disable_lapic_nmi_watchdog(void) { if (nmi_active <= 0) @@ -221,6 +222,10 @@ static void disable_lapic_nmi_watchdog(void) wrmsr(MSR_K7_EVNTSEL0, 0, 0); break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + disable_intel_arch_watchdog(); + break; + } switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_model > 0xd) @@ -449,6 +454,53 @@ static int setup_p4_watchdog(void) return 1; } +static void disable_intel_arch_watchdog(void) +{ + unsigned ebx; + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebp indicates event present. + */ + ebx = cpuid_ebx(10); + if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0); +} + +static int setup_intel_arch_watchdog(void) +{ + unsigned int evntsel; + unsigned ebx; + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebp indicates event present. + */ + ebx = cpuid_ebx(10); + if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return 0; + + nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + + clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2); + clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); + write_watchdog_counter("INTEL_ARCH_PERFCTR0"); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); + return 1; +} + void setup_apic_nmi_watchdog (void) { switch (boot_cpu_data.x86_vendor) { @@ -458,6 +510,11 @@ void setup_apic_nmi_watchdog (void) setup_k7_watchdog(); break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + if (!setup_intel_arch_watchdog()) + return; + break; + } switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_model > 0xd) @@ -561,7 +618,8 @@ void nmi_watchdog_tick (struct pt_regs * regs) wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); apic_write(APIC_LVTPC, APIC_DM_NMI); } - else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) { + else if (nmi_perfctr_msr == MSR_P6_PERFCTR0 || + nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { /* Only P6 based Pentium M need to re-unmask * the apic vector but it doesn't hurt * other P6 variant */ diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 6259afea46d1..6946b06e2784 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -102,7 +102,7 @@ void default_idle(void) local_irq_enable(); if (!hlt_counter && boot_cpu_data.hlt_works_ok) { - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); while (!need_resched()) { local_irq_disable(); @@ -111,7 +111,7 @@ void default_idle(void) else local_irq_enable(); } - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; } else { while (!need_resched()) cpu_relax(); @@ -174,7 +174,7 @@ void cpu_idle(void) { int cpu = smp_processor_id(); - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { @@ -312,7 +312,7 @@ void show_regs(struct pt_regs * regs) cr3 = read_cr3(); cr4 = read_cr4_safe(); printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); - show_trace(NULL, ®s->esp); + show_trace(NULL, regs, ®s->esp); } /* diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index d134e9643a58..c10789d7a9d3 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -114,7 +114,17 @@ DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_m static inline int __prepare_ICR (unsigned int shortcut, int vector) { - return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL; + unsigned int icr = shortcut | APIC_DEST_LOGICAL; + + switch (vector) { + default: + icr |= APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr |= APIC_DM_NMI; + break; + } + return icr; } static inline int __prepare_ICR2 (unsigned int mask) diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index bd0ca5c9f053..bce5470ecb42 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -52,6 +52,7 @@ #include <asm/tlbflush.h> #include <asm/desc.h> #include <asm/arch_hooks.h> +#include <asm/nmi.h> #include <mach_apic.h> #include <mach_wakecpu.h> diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index dcc14477af1f..78464097470a 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -28,6 +28,7 @@ #include <linux/utsname.h> #include <linux/kprobes.h> #include <linux/kexec.h> +#include <linux/unwind.h> #ifdef CONFIG_EISA #include <linux/ioport.h> @@ -47,7 +48,7 @@ #include <asm/desc.h> #include <asm/i387.h> #include <asm/nmi.h> - +#include <asm/unwind.h> #include <asm/smp.h> #include <asm/arch_hooks.h> #include <asm/kdebug.h> @@ -92,6 +93,7 @@ asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); static int kstack_depth_to_print = 24; +static int call_trace = 1; ATOMIC_NOTIFIER_HEAD(i386die_chain); int register_die_notifier(struct notifier_block *nb) @@ -170,7 +172,23 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo, return ebp; } -static void show_trace_log_lvl(struct task_struct *task, +static asmlinkage int show_trace_unwind(struct unwind_frame_info *info, void *log_lvl) +{ + int n = 0; + int printed = 0; /* nr of entries already printed on current line */ + + while (unwind(info) == 0 && UNW_PC(info)) { + ++n; + printed = print_addr_and_symbol(UNW_PC(info), log_lvl, printed); + if (arch_unw_user_mode(info)) + break; + } + if (printed) + printk("\n"); + return n; +} + +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl) { unsigned long ebp; @@ -178,6 +196,26 @@ static void show_trace_log_lvl(struct task_struct *task, if (!task) task = current; + if (call_trace >= 0) { + int unw_ret = 0; + struct unwind_frame_info info; + + if (regs) { + if (unwind_init_frame_info(&info, task, regs) == 0) + unw_ret = show_trace_unwind(&info, log_lvl); + } else if (task == current) + unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl); + else { + if (unwind_init_blocked(&info, task) == 0) + unw_ret = show_trace_unwind(&info, log_lvl); + } + if (unw_ret > 0) { + if (call_trace > 0) + return; + printk("%sLegacy call trace:\n", log_lvl); + } + } + if (task == current) { /* Grab ebp right from our regs */ asm ("movl %%ebp, %0" : "=r" (ebp) : ); @@ -198,13 +236,13 @@ static void show_trace_log_lvl(struct task_struct *task, } } -void show_trace(struct task_struct *task, unsigned long * stack) +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack) { - show_trace_log_lvl(task, stack, ""); + show_trace_log_lvl(task, regs, stack, ""); } -static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp, - char *log_lvl) +static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *esp, char *log_lvl) { unsigned long *stack; int i; @@ -225,13 +263,13 @@ static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp, printk("%08lx ", *stack++); } printk("\n%sCall Trace:\n", log_lvl); - show_trace_log_lvl(task, esp, log_lvl); + show_trace_log_lvl(task, regs, esp, log_lvl); } void show_stack(struct task_struct *task, unsigned long *esp) { printk(" "); - show_stack_log_lvl(task, esp, ""); + show_stack_log_lvl(task, NULL, esp, ""); } /* @@ -241,7 +279,7 @@ void dump_stack(void) { unsigned long stack; - show_trace(current, &stack); + show_trace(current, NULL, &stack); } EXPORT_SYMBOL(dump_stack); @@ -285,7 +323,7 @@ void show_registers(struct pt_regs *regs) u8 __user *eip; printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, (unsigned long *)esp, KERN_EMERG); + show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG); printk(KERN_EMERG "Code: "); @@ -1215,3 +1253,15 @@ static int __init kstack_setup(char *s) return 1; } __setup("kstack=", kstack_setup); + +static int __init call_trace_setup(char *s) +{ + if (strcmp(s, "old") == 0) + call_trace = -1; + else if (strcmp(s, "both") == 0) + call_trace = 0; + else if (strcmp(s, "new") == 0) + call_trace = 1; + return 1; +} +__setup("call_trace=", call_trace_setup); diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 7512f39c9f25..2d4f1386e2b1 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -71,6 +71,15 @@ SECTIONS .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } _edata = .; /* End of data section */ +#ifdef CONFIG_STACK_UNWIND + . = ALIGN(4); + .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { + __start_unwind = .; + *(.eh_frame) + __end_unwind = .; + } +#endif + . = ALIGN(THREAD_SIZE); /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) diff --git a/arch/i386/oprofile/op_model_athlon.c b/arch/i386/oprofile/op_model_athlon.c index 3ad9a72a5036..693bdea4a52b 100644 --- a/arch/i386/oprofile/op_model_athlon.c +++ b/arch/i386/oprofile/op_model_athlon.c @@ -13,6 +13,7 @@ #include <linux/oprofile.h> #include <asm/ptrace.h> #include <asm/msr.h> +#include <asm/nmi.h> #include "op_x86_model.h" #include "op_counter.h" diff --git a/arch/i386/oprofile/op_model_p4.c b/arch/i386/oprofile/op_model_p4.c index ac8a066035c2..7c61d357b82b 100644 --- a/arch/i386/oprofile/op_model_p4.c +++ b/arch/i386/oprofile/op_model_p4.c @@ -14,6 +14,7 @@ #include <asm/ptrace.h> #include <asm/fixmap.h> #include <asm/apic.h> +#include <asm/nmi.h> #include "op_x86_model.h" #include "op_counter.h" diff --git a/arch/i386/oprofile/op_model_ppro.c b/arch/i386/oprofile/op_model_ppro.c index d719015fc044..5c3ab4b027ad 100644 --- a/arch/i386/oprofile/op_model_ppro.c +++ b/arch/i386/oprofile/op_model_ppro.c @@ -14,6 +14,7 @@ #include <asm/ptrace.h> #include <asm/msr.h> #include <asm/apic.h> +#include <asm/nmi.h> #include "op_x86_model.h" #include "op_counter.h" diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 355d57970ba3..b045c279136c 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -272,9 +272,9 @@ cpu_idle (void) /* endless idle loop with no priority at all */ while (1) { if (can_do_pal_halt) - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; else - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; if (!need_resched()) { void (*idle)(void); diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index af44130f0d65..ccc4a7fb97a3 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -386,24 +386,45 @@ config HPET_EMULATE_RTC bool "Provide RTC interrupt" depends on HPET_TIMER && RTC=y -config GART_IOMMU - bool "K8 GART IOMMU support" +# Mark as embedded because too many people got it wrong. +# The code disables itself when not needed. +config IOMMU + bool "IOMMU support" if EMBEDDED default y select SWIOTLB select AGP depends on PCI help - Support for hardware IOMMU in AMD's Opteron/Athlon64 Processors - and for the bounce buffering software IOMMU. - Needed to run systems with more than 3GB of memory properly with - 32-bit PCI devices that do not support DAC (Double Address Cycle). - The IOMMU can be turned off at runtime with the iommu=off parameter. - Normally the kernel will take the right choice by itself. - This option includes a driver for the AMD Opteron/Athlon64 IOMMU - northbridge and a software emulation used on other systems without - hardware IOMMU. If unsure, say Y. - -# need this always selected by GART_IOMMU for the VIA workaround + Support for full DMA access of devices with 32bit memory access only + on systems with more than 3GB. This is usually needed for USB, + sound, many IDE/SATA chipsets and some other devices. + Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART + based IOMMU and a software bounce buffer based IOMMU used on Intel + systems and as fallback. + The code is only active when needed (enough memory and limited + device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified + too. + +config CALGARY_IOMMU + bool "IBM Calgary IOMMU support" + default y + select SWIOTLB + depends on PCI && EXPERIMENTAL + help + Support for hardware IOMMUs in IBM's xSeries x366 and x460 + systems. Needed to run systems with more than 3GB of memory + properly with 32-bit PCI devices that do not support DAC + (Double Address Cycle). Calgary also supports bus level + isolation, where all DMAs pass through the IOMMU. This + prevents them from going anywhere except their intended + destination. This catches hard-to-find kernel bugs and + mis-behaving drivers and devices that do not use the DMA-API + properly to set up their DMA buffers. The IOMMU can be + turned off at boot time with the iommu=off parameter. + Normally the kernel will make the right choice by itself. + If unsure, say Y. + +# need this always selected by IOMMU for the VIA workaround config SWIOTLB bool @@ -501,6 +522,10 @@ config REORDER optimal TLB usage. If you have pretty much any version of binutils, this can increase your kernel build time by roughly one minute. +config K8_NB + def_bool y + depends on AGP_AMD64 || IOMMU || (PCI && NUMA) + endmenu # diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug index ea31b4c62105..1d92ab56c0f9 100644 --- a/arch/x86_64/Kconfig.debug +++ b/arch/x86_64/Kconfig.debug @@ -13,7 +13,7 @@ config DEBUG_RODATA If in doubt, say "N". config IOMMU_DEBUG - depends on GART_IOMMU && DEBUG_KERNEL + depends on IOMMU && DEBUG_KERNEL bool "Enable IOMMU debugging" help Force the IOMMU to on even when you have less than 4GB of @@ -35,6 +35,22 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. +config DEBUG_STACKOVERFLOW + bool "Check for stack overflows" + depends on DEBUG_KERNEL + help + This option will cause messages to be printed if free stack space + drops below a certain limit. + +config DEBUG_STACK_USAGE + bool "Stack utilization instrumentation" + depends on DEBUG_KERNEL + help + Enables the display of the minimum amount of free stack which each + task has ever had available in the sysrq-T and sysrq-P debug output. + + This option will slow down process creation somewhat. + #config X86_REMOTE_DEBUG # bool "kgdb debugging stub" diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index e573e2ab5510..431bb4bc36cd 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -27,6 +27,7 @@ LDFLAGS_vmlinux := CHECKFLAGS += -D__x86_64__ -m64 cflags-y := +cflags-kernel-y := cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) @@ -35,7 +36,7 @@ cflags-y += -m64 cflags-y += -mno-red-zone cflags-y += -mcmodel=kernel cflags-y += -pipe -cflags-$(CONFIG_REORDER) += -ffunction-sections +cflags-kernel-$(CONFIG_REORDER) += -ffunction-sections # this makes reading assembly source easier, but produces worse code # actually it makes the kernel smaller too. cflags-y += -fno-reorder-blocks @@ -55,6 +56,7 @@ cflags-y += $(call cc-option,-funit-at-a-time) cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) CFLAGS += $(cflags-y) +CFLAGS_KERNEL += $(cflags-kernel-y) AFLAGS += -m64 head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile index 43ee6c50c277..deb063e7762d 100644 --- a/arch/x86_64/boot/Makefile +++ b/arch/x86_64/boot/Makefile @@ -107,8 +107,13 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf isoimage: $(BOOTIMAGE) -rm -rf $(obj)/isoimage mkdir $(obj)/isoimage - cp `echo /usr/lib*/syslinux/isolinux.bin | awk '{ print $1; }'` \ - $(obj)/isoimage + for i in lib lib64 share end ; do \ + if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \ + cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \ + break ; \ + fi ; \ + if [ $$i = end ] ; then exit 1 ; fi ; \ + done cp $(BOOTIMAGE) $(obj)/isoimage/linux echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg if [ -f '$(FDINITRD)' ] ; then \ diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index cf4b88c416dc..3755b2e394d0 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -77,11 +77,11 @@ static void gzip_release(void **); */ static unsigned char *real_mode; /* Pointer to real-mode data */ -#define EXT_MEM_K (*(unsigned short *)(real_mode + 0x2)) +#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2)) #ifndef STANDARD_MEMORY_BIOS_CALL -#define ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0)) +#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0)) #endif -#define SCREEN_INFO (*(struct screen_info *)(real_mode+0)) +#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0)) extern unsigned char input_data[]; extern int input_len; @@ -92,9 +92,9 @@ static unsigned long output_ptr = 0; static void *malloc(int size); static void free(void *where); - -void* memset(void* s, int c, unsigned n); -void* memcpy(void* dest, const void* src, unsigned n); + +static void *memset(void *s, int c, unsigned n); +static void *memcpy(void *dest, const void *src, unsigned n); static void putstr(const char *); @@ -162,8 +162,8 @@ static void putstr(const char *s) int x,y,pos; char c; - x = SCREEN_INFO.orig_x; - y = SCREEN_INFO.orig_y; + x = RM_SCREEN_INFO.orig_x; + y = RM_SCREEN_INFO.orig_y; while ( ( c = *s++ ) != '\0' ) { if ( c == '\n' ) { @@ -184,8 +184,8 @@ static void putstr(const char *s) } } - SCREEN_INFO.orig_x = x; - SCREEN_INFO.orig_y = y; + RM_SCREEN_INFO.orig_x = x; + RM_SCREEN_INFO.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb_p(14, vidport); @@ -194,7 +194,7 @@ static void putstr(const char *s) outb_p(0xff & (pos >> 1), vidport+1); } -void* memset(void* s, int c, unsigned n) +static void* memset(void* s, int c, unsigned n) { int i; char *ss = (char*)s; @@ -203,7 +203,7 @@ void* memset(void* s, int c, unsigned n) return s; } -void* memcpy(void* dest, const void* src, unsigned n) +static void* memcpy(void* dest, const void* src, unsigned n) { int i; char *d = (char *)dest, *s = (char *)src; @@ -278,15 +278,15 @@ static void error(char *x) putstr(x); putstr("\n\n -- System halted"); - while(1); + while(1); /* Halt */ } -void setup_normal_output_buffer(void) +static void setup_normal_output_buffer(void) { #ifdef STANDARD_MEMORY_BIOS_CALL - if (EXT_MEM_K < 1024) error("Less than 2MB of memory"); + if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory"); #else - if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory"); + if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */ free_mem_end_ptr = (long)real_mode; @@ -297,13 +297,13 @@ struct moveparams { uch *high_buffer_start; int hcount; }; -void setup_output_buffer_if_we_run_high(struct moveparams *mv) +static void setup_output_buffer_if_we_run_high(struct moveparams *mv) { high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE); #ifdef STANDARD_MEMORY_BIOS_CALL - if (EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); + if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); #else - if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); + if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); #endif mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START; low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX @@ -319,7 +319,7 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv) mv->high_buffer_start = high_buffer_start; } -void close_output_buffer_if_we_run_high(struct moveparams *mv) +static void close_output_buffer_if_we_run_high(struct moveparams *mv) { if (bytes_out > low_buffer_size) { mv->lcount = low_buffer_size; @@ -335,7 +335,7 @@ int decompress_kernel(struct moveparams *mv, void *rmode) { real_mode = rmode; - if (SCREEN_INFO.orig_video_mode == 7) { + if (RM_SCREEN_INFO.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -343,8 +343,8 @@ int decompress_kernel(struct moveparams *mv, void *rmode) vidport = 0x3d4; } - lines = SCREEN_INFO.orig_video_lines; - cols = SCREEN_INFO.orig_video_cols; + lines = RM_SCREEN_INFO.orig_video_lines; + cols = RM_SCREEN_INFO.orig_video_cols; if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); else setup_output_buffer_if_we_run_high(mv); diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c index c44f5e2ec100..eae86691709a 100644 --- a/arch/x86_64/boot/tools/build.c +++ b/arch/x86_64/boot/tools/build.c @@ -149,10 +149,8 @@ int main(int argc, char ** argv) sz = sb.st_size; fprintf (stderr, "System is %d kB\n", sz/1024); sys_size = (sz + 15) / 16; - /* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */ - if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE)) - die("System is too big. Try using %smodules.", - is_big_kernel ? "" : "bzImage or "); + if (!is_big_kernel && sys_size > DEF_SYSSIZE) + die("System is too big. Try using bzImage or modules."); while (sz > 0) { int l, n; diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 69db0c0721d1..e69d403949c8 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.17-rc1-git11 -# Sun Apr 16 07:22:36 2006 +# Linux kernel version: 2.6.17-git6 +# Sat Jun 24 00:52:28 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -42,7 +42,6 @@ CONFIG_IKCONFIG_PROC=y # CONFIG_RELAY is not set CONFIG_INITRAMFS_SOURCE="" CONFIG_UID16=y -CONFIG_VM86=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y @@ -57,7 +56,6 @@ CONFIG_FUTEX=y CONFIG_EPOLL=y CONFIG_SHMEM=y CONFIG_SLAB=y -CONFIG_DOUBLEFAULT=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 # CONFIG_SLOB is not set @@ -144,7 +142,8 @@ CONFIG_NR_CPUS=32 CONFIG_HOTPLUG_CPU=y CONFIG_HPET_TIMER=y CONFIG_HPET_EMULATE_RTC=y -CONFIG_GART_IOMMU=y +CONFIG_IOMMU=y +# CONFIG_CALGARY_IOMMU is not set CONFIG_SWIOTLB=y CONFIG_X86_MCE=y CONFIG_X86_MCE_INTEL=y @@ -158,6 +157,7 @@ CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set CONFIG_HZ=250 # CONFIG_REORDER is not set +CONFIG_K8_NB=y CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_ISA_DMA_API=y @@ -293,6 +293,8 @@ CONFIG_IP_PNP_DHCP=y # CONFIG_INET_IPCOMP is not set # CONFIG_INET_XFRM_TUNNEL is not set # CONFIG_INET_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set @@ -305,7 +307,10 @@ CONFIG_IPV6=y # CONFIG_INET6_IPCOMP is not set # CONFIG_INET6_XFRM_TUNNEL is not set # CONFIG_INET6_TUNNEL is not set +# CONFIG_INET6_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET6_XFRM_MODE_TUNNEL is not set # CONFIG_IPV6_TUNNEL is not set +# CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set # @@ -344,6 +349,7 @@ CONFIG_IPV6=y # Network testing # # CONFIG_NET_PKTGEN is not set +# CONFIG_NET_TCPPROBE is not set # CONFIG_HAMRADIO is not set # CONFIG_IRDA is not set # CONFIG_BT is not set @@ -360,6 +366,7 @@ CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set +# CONFIG_SYS_HYPERVISOR is not set # # Connector - unified userspace <-> kernelspace linker @@ -526,6 +533,7 @@ CONFIG_SCSI_ATA_PIIX=y # CONFIG_SCSI_SATA_MV is not set CONFIG_SCSI_SATA_NV=y # CONFIG_SCSI_PDC_ADMA is not set +# CONFIG_SCSI_HPTIOP is not set # CONFIG_SCSI_SATA_QSTOR is not set # CONFIG_SCSI_SATA_PROMISE is not set # CONFIG_SCSI_SATA_SX4 is not set @@ -591,10 +599,7 @@ CONFIG_IEEE1394=y # # Device Drivers # - -# -# Texas Instruments PCILynx requires I2C -# +# CONFIG_IEEE1394_PCILYNX is not set CONFIG_IEEE1394_OHCI1394=y # @@ -645,7 +650,16 @@ CONFIG_VORTEX=y # # Tulip family network device support # -# CONFIG_NET_TULIP is not set +CONFIG_NET_TULIP=y +# CONFIG_DE2104X is not set +CONFIG_TULIP=y +# CONFIG_TULIP_MWI is not set +# CONFIG_TULIP_MMIO is not set +# CONFIG_TULIP_NAPI is not set +# CONFIG_DE4X5 is not set +# CONFIG_WINBOND_840 is not set +# CONFIG_DM9102 is not set +# CONFIG_ULI526X is not set # CONFIG_HP100 is not set CONFIG_NET_PCI=y # CONFIG_PCNET32 is not set @@ -697,6 +711,7 @@ CONFIG_TIGON3=y # CONFIG_IXGB is not set CONFIG_S2IO=m # CONFIG_S2IO_NAPI is not set +# CONFIG_MYRI10GE is not set # # Token Ring devices @@ -887,7 +902,56 @@ CONFIG_HPET_MMAP=y # # I2C support # -# CONFIG_I2C is not set +CONFIG_I2C=m +CONFIG_I2C_CHARDEV=m + +# +# I2C Algorithms +# +# CONFIG_I2C_ALGOBIT is not set +# CONFIG_I2C_ALGOPCF is not set +# CONFIG_I2C_ALGOPCA is not set + +# +# I2C Hardware Bus support +# +# CONFIG_I2C_ALI1535 is not set +# CONFIG_I2C_ALI1563 is not set +# CONFIG_I2C_ALI15X3 is not set +# CONFIG_I2C_AMD756 is not set +# CONFIG_I2C_AMD8111 is not set +# CONFIG_I2C_I801 is not set +# CONFIG_I2C_I810 is not set +# CONFIG_I2C_PIIX4 is not set +CONFIG_I2C_ISA=m +# CONFIG_I2C_NFORCE2 is not set +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_PROSAVAGE is not set +# CONFIG_I2C_SAVAGE4 is not set +# CONFIG_I2C_SIS5595 is not set +# CONFIG_I2C_SIS630 is not set +# CONFIG_I2C_SIS96X is not set +# CONFIG_I2C_STUB is not set +# CONFIG_I2C_VIA is not set +# CONFIG_I2C_VIAPRO is not set +# CONFIG_I2C_VOODOO3 is not set +# CONFIG_I2C_PCA_ISA is not set + +# +# Miscellaneous I2C Chip support +# +# CONFIG_SENSORS_DS1337 is not set +# CONFIG_SENSORS_DS1374 is not set +# CONFIG_SENSORS_EEPROM is not set +# CONFIG_SENSORS_PCF8574 is not set +# CONFIG_SENSORS_PCA9539 is not set +# CONFIG_SENSORS_PCF8591 is not set +# CONFIG_SENSORS_MAX6875 is not set +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# CONFIG_I2C_DEBUG_CHIP is not set # # SPI support @@ -898,14 +962,51 @@ CONFIG_HPET_MMAP=y # # Dallas's 1-wire bus # -# CONFIG_W1 is not set # # Hardware Monitoring support # CONFIG_HWMON=y # CONFIG_HWMON_VID is not set +# CONFIG_SENSORS_ABITUGURU is not set +# CONFIG_SENSORS_ADM1021 is not set +# CONFIG_SENSORS_ADM1025 is not set +# CONFIG_SENSORS_ADM1026 is not set +# CONFIG_SENSORS_ADM1031 is not set +# CONFIG_SENSORS_ADM9240 is not set +# CONFIG_SENSORS_ASB100 is not set +# CONFIG_SENSORS_ATXP1 is not set +# CONFIG_SENSORS_DS1621 is not set # CONFIG_SENSORS_F71805F is not set +# CONFIG_SENSORS_FSCHER is not set +# CONFIG_SENSORS_FSCPOS is not set +# CONFIG_SENSORS_GL518SM is not set +# CONFIG_SENSORS_GL520SM is not set +# CONFIG_SENSORS_IT87 is not set +# CONFIG_SENSORS_LM63 is not set +# CONFIG_SENSORS_LM75 is not set +# CONFIG_SENSORS_LM77 is not set +# CONFIG_SENSORS_LM78 is not set +# CONFIG_SENSORS_LM80 is not set +# CONFIG_SENSORS_LM83 is not set +# CONFIG_SENSORS_LM85 is not set +# CONFIG_SENSORS_LM87 is not set +# CONFIG_SENSORS_LM90 is not set +# CONFIG_SENSORS_LM92 is not set +# CONFIG_SENSORS_MAX1619 is not set +# CONFIG_SENSORS_PC87360 is not set +# CONFIG_SENSORS_SIS5595 is not set +# CONFIG_SENSORS_SMSC47M1 is not set +# CONFIG_SENSORS_SMSC47M192 is not set +CONFIG_SENSORS_SMSC47B397=m +# CONFIG_SENSORS_VIA686A is not set +# CONFIG_SENSORS_VT8231 is not set +# CONFIG_SENSORS_W83781D is not set +# CONFIG_SENSORS_W83791D is not set +# CONFIG_SENSORS_W83792D is not set +# CONFIG_SENSORS_W83L785TS is not set +# CONFIG_SENSORS_W83627HF is not set +# CONFIG_SENSORS_W83627EHF is not set # CONFIG_SENSORS_HDAPS is not set # CONFIG_HWMON_DEBUG_CHIP is not set @@ -918,6 +1019,7 @@ CONFIG_HWMON=y # Multimedia devices # # CONFIG_VIDEO_DEV is not set +CONFIG_VIDEO_V4L2=y # # Digital Video Broadcasting Devices @@ -953,28 +1055,17 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y -CONFIG_OBSOLETE_OSS_DRIVER=y # CONFIG_SOUND_BT878 is not set -# CONFIG_SOUND_CMPCI is not set # CONFIG_SOUND_EMU10K1 is not set # CONFIG_SOUND_FUSION is not set -# CONFIG_SOUND_CS4281 is not set -# CONFIG_SOUND_ES1370 is not set # CONFIG_SOUND_ES1371 is not set -# CONFIG_SOUND_ESSSOLO1 is not set -# CONFIG_SOUND_MAESTRO is not set -# CONFIG_SOUND_MAESTRO3 is not set CONFIG_SOUND_ICH=y -# CONFIG_SOUND_SONICVIBES is not set # CONFIG_SOUND_TRIDENT is not set # CONFIG_SOUND_MSNDCLAS is not set # CONFIG_SOUND_MSNDPIN is not set # CONFIG_SOUND_VIA82CXXX is not set # CONFIG_SOUND_OSS is not set -# CONFIG_SOUND_ALI5455 is not set -# CONFIG_SOUND_FORTE is not set -# CONFIG_SOUND_RME96XX is not set -# CONFIG_SOUND_AD1980 is not set +# CONFIG_SOUND_TVMIXER is not set # # USB support @@ -1000,6 +1091,7 @@ CONFIG_USB_DEVICEFS=y CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set +# CONFIG_USB_EHCI_TT_NEWSCHED is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y # CONFIG_USB_OHCI_BIG_ENDIAN is not set @@ -1089,10 +1181,12 @@ CONFIG_USB_MON=y # CONFIG_USB_LEGOTOWER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_LED is not set +# CONFIG_USB_CY7C63 is not set # CONFIG_USB_CYTHERM is not set # CONFIG_USB_PHIDGETKIT is not set # CONFIG_USB_PHIDGETSERVO is not set # CONFIG_USB_IDMOUSE is not set +# CONFIG_USB_APPLEDISPLAY is not set # CONFIG_USB_SISUSBVGA is not set # CONFIG_USB_LD is not set # CONFIG_USB_TEST is not set @@ -1141,6 +1235,19 @@ CONFIG_USB_MON=y # CONFIG_RTC_CLASS is not set # +# DMA Engine support +# +# CONFIG_DMA_ENGINE is not set + +# +# DMA Clients +# + +# +# DMA Devices +# + +# # Firmware Drivers # # CONFIG_EDD is not set @@ -1175,6 +1282,7 @@ CONFIG_FS_POSIX_ACL=y # CONFIG_MINIX_FS is not set # CONFIG_ROMFS_FS is not set CONFIG_INOTIFY=y +CONFIG_INOTIFY_USER=y # CONFIG_QUOTA is not set CONFIG_DNOTIFY=y CONFIG_AUTOFS_FS=y @@ -1331,7 +1439,8 @@ CONFIG_DETECT_SOFTLOCKUP=y CONFIG_DEBUG_FS=y # CONFIG_DEBUG_VM is not set # CONFIG_FRAME_POINTER is not set -# CONFIG_UNWIND_INFO is not set +CONFIG_UNWIND_INFO=y +CONFIG_STACK_UNWIND=y # CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_DEBUG_RODATA is not set diff --git a/arch/x86_64/ia32/fpu32.c b/arch/x86_64/ia32/fpu32.c index 1c23095f1813..2c8209a3605a 100644 --- a/arch/x86_64/ia32/fpu32.c +++ b/arch/x86_64/ia32/fpu32.c @@ -2,7 +2,6 @@ * Copyright 2002 Andi Kleen, SuSE Labs. * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes. * This is used for ptrace, signals and coredumps in 32bit emulation. - * $Id: fpu32.c,v 1.1 2002/03/21 14:16:32 ak Exp $ */ #include <linux/sched.h> diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index e0a92439f634..25e5ca22204c 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -6,8 +6,6 @@ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen - * - * $Id: ia32_signal.c,v 1.22 2002/07/29 10:34:03 ak Exp $ */ #include <linux/sched.h> diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 4ec594ab1a98..c536fa98ea37 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -155,6 +155,7 @@ sysenter_tracesys: .previous jmp sysenter_do_call CFI_ENDPROC +ENDPROC(ia32_sysenter_target) /* * 32bit SYSCALL instruction entry. @@ -178,7 +179,7 @@ sysenter_tracesys: */ ENTRY(ia32_cstar_target) CFI_STARTPROC32 simple - CFI_DEF_CFA rsp,0 + CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ swapgs @@ -249,6 +250,7 @@ cstar_tracesys: .quad 1b,ia32_badarg .previous jmp cstar_do_call +END(ia32_cstar_target) ia32_badarg: movq $-EFAULT,%rax @@ -314,16 +316,13 @@ ia32_tracesys: LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST jmp ia32_do_syscall +END(ia32_syscall) ia32_badsys: movq $0,ORIG_RAX-ARGOFFSET(%rsp) movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp int_ret_from_sys_call -ni_syscall: - movq %rax,%rdi - jmp sys32_ni_syscall - quiet_ni_syscall: movq $-ENOSYS,%rax ret @@ -370,10 +369,10 @@ ENTRY(ia32_ptregs_common) RESTORE_REST jmp ia32_sysret /* misbalances the return cache */ CFI_ENDPROC +END(ia32_ptregs_common) .section .rodata,"a" .align 8 - .globl ia32_sys_call_table ia32_sys_call_table: .quad sys_restart_syscall .quad sys_exit diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c index 23a4515a73b4..a590b7a0d92d 100644 --- a/arch/x86_64/ia32/ptrace32.c +++ b/arch/x86_64/ia32/ptrace32.c @@ -7,8 +7,6 @@ * * This allows to access 64bit processes too; but there is no way to see the extended * register contents. - * - * $Id: ptrace32.c,v 1.16 2003/03/14 16:06:35 ak Exp $ */ #include <linux/kernel.h> @@ -27,6 +25,7 @@ #include <asm/debugreg.h> #include <asm/i387.h> #include <asm/fpu32.h> +#include <asm/ia32.h> /* * Determines which flags the user has access to [1 = access, 0 = no access]. @@ -199,6 +198,24 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val) #undef R32 +static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data) +{ + int ret; + compat_siginfo_t *si32 = (compat_siginfo_t *)compat_ptr(data); + siginfo_t *si = compat_alloc_user_space(sizeof(siginfo_t)); + if (request == PTRACE_SETSIGINFO) { + ret = copy_siginfo_from_user32(si, si32); + if (ret) + return ret; + } + ret = sys_ptrace(request, pid, addr, (unsigned long)si); + if (ret) + return ret; + if (request == PTRACE_GETSIGINFO) + ret = copy_siginfo_to_user32(si32, si); + return ret; +} + asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) { struct task_struct *child; @@ -208,9 +225,19 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) __u32 val; switch (request) { - default: + case PTRACE_TRACEME: + case PTRACE_ATTACH: + case PTRACE_KILL: + case PTRACE_CONT: + case PTRACE_SINGLESTEP: + case PTRACE_DETACH: + case PTRACE_SYSCALL: + case PTRACE_SETOPTIONS: return sys_ptrace(request, pid, addr, data); + default: + return -EINVAL; + case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: case PTRACE_POKEDATA: @@ -225,10 +252,11 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) case PTRACE_GETFPXREGS: case PTRACE_GETEVENTMSG: break; - } - if (request == PTRACE_TRACEME) - return ptrace_traceme(); + case PTRACE_SETSIGINFO: + case PTRACE_GETSIGINFO: + return ptrace32_siginfo(request, pid, addr, data); + } child = ptrace_get_task_struct(pid); if (IS_ERR(child)) @@ -349,8 +377,7 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) break; default: - ret = -EINVAL; - break; + BUG(); } out: diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index f182b20858e2..dc88154c412b 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -508,19 +508,6 @@ sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options) return compat_sys_wait4(pid, stat_addr, options, NULL); } -int sys32_ni_syscall(int call) -{ - struct task_struct *me = current; - static char lastcomm[sizeof(me->comm)]; - - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - printk(KERN_INFO "IA32 syscall %d from %s not implemented\n", - call, me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } - return -ENOSYS; -} - /* 32-bit timeval and related flotsam. */ asmlinkage long @@ -916,7 +903,7 @@ long sys32_vm86_warning(void) struct task_struct *me = current; static char lastcomm[sizeof(me->comm)]; if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", + compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", me->comm); strncpy(lastcomm, me->comm, sizeof(lastcomm)); } @@ -929,13 +916,3 @@ long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len); } -static int __init ia32_init (void) -{ - printk("IA32 emulation $Id: sys_ia32.c,v 1.32 2002/03/24 13:02:28 ak Exp $\n"); - return 0; -} - -__initcall(ia32_init); - -extern unsigned long ia32_sys_call_table[]; -EXPORT_SYMBOL(ia32_sys_call_table); diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 059c88313f4e..aeb9c560be88 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o + pci-dma.o pci-nommu.o alternative.o obj-$(CONFIG_X86_MCE) += mce.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o @@ -28,11 +28,13 @@ obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o +obj-$(CONFIG_IOMMU) += pci-gart.o aperture.o +obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary.o tce.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o obj-$(CONFIG_X86_VSMP) += vsmp.o +obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_MODULES) += module.o @@ -49,3 +51,5 @@ intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o +alternative-y += ../../i386/kernel/alternative.o + diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 70b9d21ed675..a195ef06ec55 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -8,7 +8,6 @@ * because only the bootmem allocator can allocate 32+MB. * * Copyright 2002 Andi Kleen, SuSE Labs. - * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $ */ #include <linux/config.h> #include <linux/kernel.h> @@ -24,6 +23,7 @@ #include <asm/proto.h> #include <asm/pci-direct.h> #include <asm/dma.h> +#include <asm/k8.h> int iommu_aperture; int iommu_aperture_disabled __initdata = 0; @@ -37,8 +37,6 @@ int fix_aperture __initdata = 1; /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ -#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16)) - static u32 __init allocate_aperture(void) { pg_data_t *nd0 = NODE_DATA(0); @@ -68,20 +66,20 @@ static u32 __init allocate_aperture(void) return (u32)__pa(p); } -static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) +static int __init aperture_valid(u64 aper_base, u32 aper_size) { if (!aper_base) return 0; if (aper_size < 64*1024*1024) { - printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20); + printk("Aperture too small (%d MB)\n", aper_size>>20); return 0; } if (aper_base + aper_size >= 0xffffffff) { - printk("Aperture from %s beyond 4GB. Ignoring.\n",name); + printk("Aperture beyond 4GB. Ignoring.\n"); return 0; } if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { - printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); + printk("Aperture pointing to e820 RAM. Ignoring.\n"); return 0; } return 1; @@ -140,7 +138,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", aper, 32 << *order, apsizereg); - if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order)) + if (!aperture_valid(aper, (32*1024*1024) << *order)) return 0; return (u32)aper; } @@ -208,10 +206,10 @@ void __init iommu_hole_init(void) fix = 0; for (num = 24; num < 32; num++) { - char name[30]; - if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) - continue; + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; + iommu_detected = 1; iommu_aperture = 1; aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; @@ -222,9 +220,7 @@ void __init iommu_hole_init(void) printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, aper_base, aper_size>>20); - sprintf(name, "northbridge cpu %d", num-24); - - if (!aperture_valid(name, aper_base, aper_size)) { + if (!aperture_valid(aper_base, aper_size)) { fix = 1; break; } @@ -273,7 +269,7 @@ void __init iommu_hole_init(void) /* Fix up the north bridges */ for (num = 24; num < 32; num++) { - if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) continue; /* Don't enable translation yet. That is done later. diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 29ef99001e05..b2ead91df218 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -100,7 +100,7 @@ void clear_local_APIC(void) maxlvt = get_maxlvt(); /* - * Masking an LVT entry on a P6 can trigger a local APIC error + * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. */ if (maxlvt >= 3) { @@ -851,7 +851,18 @@ void disable_APIC_timer(void) unsigned long v; v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v | APIC_LVT_MASKED); + /* + * When an illegal vector value (0-15) is written to an LVT + * entry and delivery mode is Fixed, the APIC may signal an + * illegal vector error, with out regard to whether the mask + * bit is set or whether an interrupt is actually seen on input. + * + * Boot sequence might call this function when the LVTT has + * '0' vector value. So make sure vector field is set to + * valid value. + */ + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); } } @@ -909,15 +920,13 @@ int setup_profiling_timer(unsigned int multiplier) return -EINVAL; } -#ifdef CONFIG_X86_MCE_AMD -void setup_threshold_lvt(unsigned long lvt_off) +void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, + unsigned char msg_type, unsigned char mask) { - unsigned int v = 0; - unsigned long reg = (lvt_off << 4) + 0x500; - v |= THRESHOLD_APIC_VECTOR; + unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; apic_write(reg, v); } -#endif /* CONFIG_X86_MCE_AMD */ #undef APIC_DIVISOR @@ -983,7 +992,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) } /* - * oem_force_hpet_timer -- force HPET mode for some boxes. + * apic_is_clustered_box() -- Check if we can expect good TSC * * Thus far, the major user of this is IBM's Summit2 series: * @@ -991,7 +1000,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * multi-chassis. Use available data to take a good guess. * If in doubt, go HPET. */ -__cpuinit int oem_force_hpet_timer(void) +__cpuinit int apic_is_clustered_box(void) { int i, clusters, zeros; unsigned id; @@ -1022,8 +1031,7 @@ __cpuinit int oem_force_hpet_timer(void) } /* - * If clusters > 2, then should be multi-chassis. Return 1 for HPET. - * Else return 0 to use TSC. + * If clusters > 2, then should be multi-chassis. * May have to revisit this when multi-core + hyperthreaded CPUs come * out, but AFAIK this will work even for them. */ diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 4e6c3b729e39..8ca04912b1cc 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -111,14 +111,14 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu) atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ for(;;) - asm("hlt"); + halt(); return 1; } static void smp_send_nmi_allbutself(void) { - send_IPI_allbutself(APIC_DM_NMI); + send_IPI_allbutself(NMI_VECTOR); } /* diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 1ef6028f721e..9e94d834624b 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -1,7 +1,6 @@ /* * Handle the memory map. * The functions here do the job until bootmem takes over. - * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $ * * Getting sanitize_e820_map() in sync with i386 version by applying change: * - Provisions for empty E820 memory regions (reported by certain BIOSes). @@ -621,6 +620,7 @@ void __init parse_memmapopt(char *p, char **from) } unsigned long pci_mem_start = 0xaeedbabe; +EXPORT_SYMBOL(pci_mem_start); /* * Search for the biggest gap in the low 32 bits of the e820 diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 586b34c00c48..7290e72b9a34 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -154,6 +154,7 @@ rff_trace: GET_THREAD_INFO(%rcx) jmp rff_action CFI_ENDPROC +END(ret_from_fork) /* * System call entry. Upto 6 arguments in registers are supported. @@ -188,7 +189,7 @@ rff_trace: ENTRY(system_call) CFI_STARTPROC simple - CFI_DEF_CFA rsp,0 + CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ swapgs @@ -285,6 +286,7 @@ tracesys: /* Use IRET because user could have changed frame */ jmp int_ret_from_sys_call CFI_ENDPROC +END(system_call) /* * Syscall return path ending with IRET. @@ -364,6 +366,7 @@ int_restore_rest: cli jmp int_with_check CFI_ENDPROC +END(int_ret_from_sys_call) /* * Certain special system calls that need to save a complete full stack frame. @@ -375,6 +378,7 @@ int_restore_rest: leaq \func(%rip),%rax leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ jmp ptregscall_common +END(\label) .endm CFI_STARTPROC @@ -404,6 +408,7 @@ ENTRY(ptregscall_common) CFI_REL_OFFSET rip, 0 ret CFI_ENDPROC +END(ptregscall_common) ENTRY(stub_execve) CFI_STARTPROC @@ -418,6 +423,7 @@ ENTRY(stub_execve) RESTORE_REST jmp int_ret_from_sys_call CFI_ENDPROC +END(stub_execve) /* * sigreturn is special because it needs to restore all registers on return. @@ -435,6 +441,7 @@ ENTRY(stub_rt_sigreturn) RESTORE_REST jmp int_ret_from_sys_call CFI_ENDPROC +END(stub_rt_sigreturn) /* * initial frame state for interrupts and exceptions @@ -466,29 +473,18 @@ ENTRY(stub_rt_sigreturn) /* 0(%rsp): interrupt number */ .macro interrupt func cld -#ifdef CONFIG_DEBUG_INFO - SAVE_ALL - movq %rsp,%rdi - /* - * Setup a stack frame pointer. This allows gdb to trace - * back to the original stack. - */ - movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp -#else SAVE_ARGS leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler -#endif + pushq %rbp + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbp, 0 + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp testl $3,CS(%rdi) je 1f swapgs 1: incl %gs:pda_irqcount # RED-PEN should check preempt count - movq %gs:pda_irqstackptr,%rax - cmoveq %rax,%rsp /*todo This needs CFI annotation! */ - pushq %rdi # save old stack -#ifndef CONFIG_DEBUG_INFO - CFI_ADJUST_CFA_OFFSET 8 -#endif + cmoveq %gs:pda_irqstackptr,%rsp call \func .endm @@ -497,17 +493,11 @@ ENTRY(common_interrupt) interrupt do_IRQ /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: - popq %rdi -#ifndef CONFIG_DEBUG_INFO - CFI_ADJUST_CFA_OFFSET -8 -#endif cli decl %gs:pda_irqcount -#ifdef CONFIG_DEBUG_INFO - movq RBP(%rdi),%rbp + leaveq CFI_DEF_CFA_REGISTER rsp -#endif - leaq ARGOFFSET(%rdi),%rsp /*todo This needs CFI annotation! */ + CFI_ADJUST_CFA_OFFSET -8 exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) @@ -589,7 +579,9 @@ retint_kernel: call preempt_schedule_irq jmp exit_intr #endif + CFI_ENDPROC +END(common_interrupt) /* * APIC interrupts. @@ -605,17 +597,21 @@ retint_kernel: ENTRY(thermal_interrupt) apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt +END(thermal_interrupt) ENTRY(threshold_interrupt) apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt +END(threshold_interrupt) #ifdef CONFIG_SMP ENTRY(reschedule_interrupt) apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt +END(reschedule_interrupt) .macro INVALIDATE_ENTRY num ENTRY(invalidate_interrupt\num) apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt +END(invalidate_interrupt\num) .endm INVALIDATE_ENTRY 0 @@ -629,17 +625,21 @@ ENTRY(invalidate_interrupt\num) ENTRY(call_function_interrupt) apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt +END(call_function_interrupt) #endif #ifdef CONFIG_X86_LOCAL_APIC ENTRY(apic_timer_interrupt) apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt +END(apic_timer_interrupt) ENTRY(error_interrupt) apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt +END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt +END(spurious_interrupt) #endif /* @@ -777,6 +777,7 @@ error_kernelspace: cmpq $gs_change,RIP(%rsp) je error_swapgs jmp error_sti +END(error_entry) /* Reload gs selector with exception handling */ /* edi: new selector */ @@ -794,6 +795,7 @@ gs_change: CFI_ADJUST_CFA_OFFSET -8 ret CFI_ENDPROC +ENDPROC(load_gs_index) .section __ex_table,"a" .align 8 @@ -847,7 +849,7 @@ ENTRY(kernel_thread) UNFAKE_STACK_FRAME ret CFI_ENDPROC - +ENDPROC(kernel_thread) child_rip: /* @@ -860,6 +862,7 @@ child_rip: # exit xorl %edi, %edi call do_exit +ENDPROC(child_rip) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. @@ -889,19 +892,24 @@ ENTRY(execve) UNFAKE_STACK_FRAME ret CFI_ENDPROC +ENDPROC(execve) KPROBE_ENTRY(page_fault) errorentry do_page_fault +END(page_fault) .previous .text ENTRY(coprocessor_error) zeroentry do_coprocessor_error +END(coprocessor_error) ENTRY(simd_coprocessor_error) zeroentry do_simd_coprocessor_error +END(simd_coprocessor_error) ENTRY(device_not_available) zeroentry math_state_restore +END(device_not_available) /* runs on exception stack */ KPROBE_ENTRY(debug) @@ -911,6 +919,7 @@ KPROBE_ENTRY(debug) paranoidentry do_debug, DEBUG_STACK jmp paranoid_exit CFI_ENDPROC +END(debug) .previous .text /* runs on exception stack */ @@ -961,6 +970,7 @@ paranoid_schedule: cli jmp paranoid_userspace CFI_ENDPROC +END(nmi) .previous .text KPROBE_ENTRY(int3) @@ -970,22 +980,28 @@ KPROBE_ENTRY(int3) paranoidentry do_int3, DEBUG_STACK jmp paranoid_exit CFI_ENDPROC +END(int3) .previous .text ENTRY(overflow) zeroentry do_overflow +END(overflow) ENTRY(bounds) zeroentry do_bounds +END(bounds) ENTRY(invalid_op) zeroentry do_invalid_op +END(invalid_op) ENTRY(coprocessor_segment_overrun) zeroentry do_coprocessor_segment_overrun +END(coprocessor_segment_overrun) ENTRY(reserved) zeroentry do_reserved +END(reserved) /* runs on exception stack */ ENTRY(double_fault) @@ -993,12 +1009,15 @@ ENTRY(double_fault) paranoidentry do_double_fault jmp paranoid_exit CFI_ENDPROC +END(double_fault) ENTRY(invalid_TSS) errorentry do_invalid_TSS +END(invalid_TSS) ENTRY(segment_not_present) errorentry do_segment_not_present +END(segment_not_present) /* runs on exception stack */ ENTRY(stack_segment) @@ -1006,19 +1025,24 @@ ENTRY(stack_segment) paranoidentry do_stack_segment jmp paranoid_exit CFI_ENDPROC +END(stack_segment) KPROBE_ENTRY(general_protection) errorentry do_general_protection +END(general_protection) .previous .text ENTRY(alignment_check) errorentry do_alignment_check +END(alignment_check) ENTRY(divide_error) zeroentry do_divide_error +END(divide_error) ENTRY(spurious_interrupt_bug) zeroentry do_spurious_interrupt_bug +END(spurious_interrupt_bug) #ifdef CONFIG_X86_MCE /* runs on exception stack */ @@ -1029,6 +1053,7 @@ ENTRY(machine_check) paranoidentry do_machine_check jmp paranoid_exit CFI_ENDPROC +END(machine_check) #endif ENTRY(call_softirq) @@ -1046,3 +1071,37 @@ ENTRY(call_softirq) decl %gs:pda_irqcount ret CFI_ENDPROC +ENDPROC(call_softirq) + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movq %r15, R15(%rdi) + movq %r14, R14(%rdi) + xchgq %rsi, %rdx + movq %r13, R13(%rdi) + movq %r12, R12(%rdi) + xorl %eax, %eax + movq %rbp, RBP(%rdi) + movq %rbx, RBX(%rdi) + movq (%rsp), %rcx + movq %rax, R11(%rdi) + movq %rax, R10(%rdi) + movq %rax, R9(%rdi) + movq %rax, R8(%rdi) + movq %rax, RAX(%rdi) + movq %rax, RCX(%rdi) + movq %rax, RDX(%rdi) + movq %rax, RSI(%rdi) + movq %rax, RDI(%rdi) + movq %rax, ORIG_RAX(%rdi) + movq %rcx, RIP(%rdi) + leaq 8(%rsp), %rcx + movq $__KERNEL_CS, CS(%rdi) + movq %rax, EFLAGS(%rdi) + movq %rcx, RSP(%rdi) + movq $__KERNEL_DS, SS(%rdi) + jmpq *%rdx + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 1a2ab825be98..21c7066e236a 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -78,22 +78,29 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) static void flat_send_IPI_allbutself(int vector) { -#ifndef CONFIG_HOTPLUG_CPU - if (((num_online_cpus()) - 1) >= 1) - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); +#ifdef CONFIG_HOTPLUG_CPU + int hotplug = 1; #else - cpumask_t allbutme = cpu_online_map; + int hotplug = 0; +#endif + if (hotplug || vector == NMI_VECTOR) { + cpumask_t allbutme = cpu_online_map; - cpu_clear(smp_processor_id(), allbutme); + cpu_clear(smp_processor_id(), allbutme); - if (!cpus_empty(allbutme)) - flat_send_IPI_mask(allbutme, vector); -#endif + if (!cpus_empty(allbutme)) + flat_send_IPI_mask(allbutme, vector); + } else if (num_online_cpus() > 1) { + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); + } } static void flat_send_IPI_all(int vector) { - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); + if (vector == NMI_VECTOR) + flat_send_IPI_mask(cpu_online_map, vector); + else + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } static int flat_apic_id_registered(void) @@ -108,10 +115,7 @@ static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) static unsigned int phys_pkg_id(int index_msb) { - u32 ebx; - - ebx = cpuid_ebx(1); - return ((ebx >> 24) & 0xFF) >> index_msb; + return hard_smp_processor_id() >> index_msb; } struct genapic apic_flat = { diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index cea20a66c150..e6a71c9556d9 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -2,8 +2,6 @@ * linux/arch/x86_64/kernel/head64.c -- prepare to run common code * * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE - * - * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $ */ #include <linux/init.h> diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 5ecd34ab8c2b..9b1a4e147321 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -44,11 +44,11 @@ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ BI(x,c) BI(x,d) BI(x,e) BI(x,f) -#define BUILD_14_IRQS(x) \ +#define BUILD_15_IRQS(x) \ BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) + BI(x,c) BI(x,d) BI(x,e) /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: @@ -73,13 +73,13 @@ BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) #ifdef CONFIG_PCI_MSI - BUILD_14_IRQS(0xe) + BUILD_15_IRQS(0xe) #endif #endif #undef BUILD_16_IRQS -#undef BUILD_14_IRQS +#undef BUILD_15_IRQS #undef BI @@ -92,11 +92,11 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) -#define IRQLIST_14(x) \ +#define IRQLIST_15(x) \ IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d) + IRQ(x,c), IRQ(x,d), IRQ(x,e) void (*interrupt[NR_IRQS])(void) = { IRQLIST_16(0x0), @@ -108,7 +108,7 @@ void (*interrupt[NR_IRQS])(void) = { IRQLIST_16(0xc), IRQLIST_16(0xd) #ifdef CONFIG_PCI_MSI - , IRQLIST_14(0xe) + , IRQLIST_15(0xe) #endif #endif diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 9cc7031b7151..c768d8a036d0 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -41,6 +41,7 @@ #include <asm/mach_apic.h> #include <asm/acpi.h> #include <asm/dma.h> +#include <asm/nmi.h> #define __apicdebuginit __init @@ -56,6 +57,7 @@ int timer_over_8254 __initdata = 0; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -317,7 +319,7 @@ void __init check_ioapic(void) vendor &= 0xffff; switch (vendor) { case PCI_VENDOR_ID_VIA: -#ifdef CONFIG_GART_IOMMU +#ifdef CONFIG_IOMMU if ((end_pfn > MAX_DMA32_PFN || force_iommu) && !iommu_aperture_allowed) { @@ -834,10 +836,17 @@ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; int assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; + unsigned long flags; + int vector; BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) + + spin_lock_irqsave(&vector_lock, flags); + + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { + spin_unlock_irqrestore(&vector_lock, flags); return IO_APIC_VECTOR(irq); + } next: current_vector += 8; if (current_vector == IA32_SYSCALL_VECTOR) @@ -849,11 +858,14 @@ next: current_vector = FIRST_DEVICE_VECTOR + offset; } - vector_irq[current_vector] = irq; + vector = current_vector; + vector_irq[vector] = irq; if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = current_vector; + IO_APIC_VECTOR(irq) = vector; + + spin_unlock_irqrestore(&vector_lock, flags); - return current_vector; + return vector; } extern void (*interrupt[NR_IRQS])(void); @@ -866,21 +878,14 @@ static struct hw_interrupt_type ioapic_edge_type; static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) { - if (use_pci_vector() && !platform_legacy_irq(irq)) { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[vector].handler = &ioapic_level_type; - else - irq_desc[vector].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[vector]); - } else { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[irq].handler = &ioapic_level_type; - else - irq_desc[irq].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[irq]); - } + unsigned idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[idx].handler = &ioapic_level_type; + else + irq_desc[idx].handler = &ioapic_edge_type; + set_intr_gate(vector, interrupt[idx]); } static void __init setup_IO_APIC_irqs(void) diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index d8bd0b345b1e..59518d4d4358 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -26,6 +26,30 @@ atomic_t irq_mis_count; #endif #endif +#ifdef CONFIG_DEBUG_STACKOVERFLOW +/* + * Probabilistic stack overflow check: + * + * Only check the stack in process context, because everything else + * runs on the big interrupt stacks. Checking reliably is too expensive, + * so we just check from interrupts. + */ +static inline void stack_overflow_check(struct pt_regs *regs) +{ + u64 curbase = (u64) current->thread_info; + static unsigned long warned = -60*HZ; + + if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && + regs->rsp < curbase + sizeof(struct thread_info) + 128 && + time_after(jiffies, warned + 60*HZ)) { + printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", + current->comm, curbase, regs->rsp); + show_stack(NULL,NULL); + warned = jiffies; + } +} +#endif + /* * Generic, controller-independent functions: */ @@ -39,7 +63,7 @@ int show_interrupts(struct seq_file *p, void *v) if (i == 0) { seq_printf(p, " "); for_each_online_cpu(j) - seq_printf(p, "CPU%d ",j); + seq_printf(p, "CPU%-8d",j); seq_putc(p, '\n'); } @@ -96,7 +120,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) exit_idle(); irq_enter(); - +#ifdef CONFIG_DEBUG_STACKOVERFLOW + stack_overflow_check(regs); +#endif __do_IRQ(irq, regs); irq_exit(); diff --git a/arch/x86_64/kernel/k8.c b/arch/x86_64/kernel/k8.c new file mode 100644 index 000000000000..6416682d33d0 --- /dev/null +++ b/arch/x86_64/kernel/k8.c @@ -0,0 +1,118 @@ +/* + * Shared support code for AMD K8 northbridges and derivates. + * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. + */ +#include <linux/gfp.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <asm/k8.h> + +int num_k8_northbridges; +EXPORT_SYMBOL(num_k8_northbridges); + +static u32 *flush_words; + +struct pci_device_id k8_nb_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, + {} +}; +EXPORT_SYMBOL(k8_nb_ids); + +struct pci_dev **k8_northbridges; +EXPORT_SYMBOL(k8_northbridges); + +static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) +{ + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(&k8_nb_ids[0], dev)); + return dev; +} + +int cache_k8_northbridges(void) +{ + int i; + struct pci_dev *dev; + if (num_k8_northbridges) + return 0; + + num_k8_northbridges = 0; + dev = NULL; + while ((dev = next_k8_northbridge(dev)) != NULL) + num_k8_northbridges++; + + k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), + GFP_KERNEL); + if (!k8_northbridges) + return -ENOMEM; + + flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + kfree(k8_northbridges); + return -ENOMEM; + } + + dev = NULL; + i = 0; + while ((dev = next_k8_northbridge(dev)) != NULL) { + k8_northbridges[i++] = dev; + pci_read_config_dword(dev, 0x9c, &flush_words[i]); + } + k8_northbridges[i] = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(cache_k8_northbridges); + +/* Ignores subdevice/subvendor but as far as I can figure out + they're useless anyways */ +int __init early_is_k8_nb(u32 device) +{ + struct pci_device_id *id; + u32 vendor = device & 0xffff; + device >>= 16; + for (id = k8_nb_ids; id->vendor; id++) + if (vendor == id->vendor && device == id->device) + return 1; + return 0; +} + +void k8_flush_garts(void) +{ + int flushed, i; + unsigned long flags; + static DEFINE_SPINLOCK(gart_lock); + + /* Avoid races between AGP and IOMMU. In theory it's not needed + but I'm not sure if the hardware won't lose flush requests + when another is pending. This whole thing is so expensive anyways + that it doesn't matter to serialize more. -AK */ + spin_lock_irqsave(&gart_lock, flags); + flushed = 0; + for (i = 0; i < num_k8_northbridges; i++) { + pci_write_config_dword(k8_northbridges[i], 0x9c, + flush_words[i]|1); + flushed++; + } + for (i = 0; i < num_k8_northbridges; i++) { + u32 w; + /* Make sure the hardware actually executed the flush*/ + for (;;) { + pci_read_config_dword(k8_northbridges[i], + 0x9c, &w); + if (!(w & 1)) + break; + cpu_relax(); + } + } + spin_unlock_irqrestore(&gart_lock, flags); + if (!flushed) + printk("nothing to flush?\n"); +} +EXPORT_SYMBOL_GPL(k8_flush_garts); + diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index c69fc43cee7b..acd5816b1a6f 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -562,7 +562,7 @@ static struct sysdev_class mce_sysclass = { set_kset_name("machinecheck"), }; -static DEFINE_PER_CPU(struct sys_device, device_mce); +DEFINE_PER_CPU(struct sys_device, device_mce); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index d13b241ad094..335200aa2737 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -1,5 +1,5 @@ /* - * (c) 2005 Advanced Micro Devices, Inc. + * (c) 2005, 2006 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -8,9 +8,10 @@ * * Support : jacob.shin@amd.com * - * MC4_MISC0 DRAM ECC Error Threshold available under AMD K8 Rev F. - * MC4_MISC0 exists per physical processor. + * April 2006 + * - added support for AMD Family 0x10 processors * + * All MC4_MISCi registers are shared between multi-cores */ #include <linux/cpu.h> @@ -29,32 +30,45 @@ #include <asm/percpu.h> #include <asm/idle.h> -#define PFX "mce_threshold: " -#define VERSION "version 1.00.9" -#define NR_BANKS 5 -#define THRESHOLD_MAX 0xFFF -#define INT_TYPE_APIC 0x00020000 -#define MASK_VALID_HI 0x80000000 -#define MASK_LVTOFF_HI 0x00F00000 -#define MASK_COUNT_EN_HI 0x00080000 -#define MASK_INT_TYPE_HI 0x00060000 -#define MASK_OVERFLOW_HI 0x00010000 +#define PFX "mce_threshold: " +#define VERSION "version 1.1.1" +#define NR_BANKS 6 +#define NR_BLOCKS 9 +#define THRESHOLD_MAX 0xFFF +#define INT_TYPE_APIC 0x00020000 +#define MASK_VALID_HI 0x80000000 +#define MASK_LVTOFF_HI 0x00F00000 +#define MASK_COUNT_EN_HI 0x00080000 +#define MASK_INT_TYPE_HI 0x00060000 +#define MASK_OVERFLOW_HI 0x00010000 #define MASK_ERR_COUNT_HI 0x00000FFF -#define MASK_OVERFLOW 0x0001000000000000L +#define MASK_BLKPTR_LO 0xFF000000 +#define MCG_XBLK_ADDR 0xC0000400 -struct threshold_bank { +struct threshold_block { + unsigned int block; + unsigned int bank; unsigned int cpu; - u8 bank; - u8 interrupt_enable; + u32 address; + u16 interrupt_enable; u16 threshold_limit; struct kobject kobj; + struct list_head miscj; }; -static struct threshold_bank threshold_defaults = { +/* defaults used early on boot */ +static struct threshold_block threshold_defaults = { .interrupt_enable = 0, .threshold_limit = THRESHOLD_MAX, }; +struct threshold_bank { + struct kobject kobj; + struct threshold_block *blocks; + cpumask_t cpus; +}; +static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); + #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { 0, 0, 0, 0, 1 @@ -68,12 +82,12 @@ static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ */ /* must be called with correct cpu affinity */ -static void threshold_restart_bank(struct threshold_bank *b, +static void threshold_restart_bank(struct threshold_block *b, int reset, u16 old_limit) { u32 mci_misc_hi, mci_misc_lo; - rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi); + rdmsr(b->address, mci_misc_lo, mci_misc_hi); if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) reset = 1; /* limit cannot be lower than err count */ @@ -94,35 +108,57 @@ static void threshold_restart_bank(struct threshold_bank *b, (mci_misc_hi &= ~MASK_INT_TYPE_HI); mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi); + wrmsr(b->address, mci_misc_lo, mci_misc_hi); } +/* cpu init entry point, called from mce.c with preempt off */ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) { - int bank; - u32 mci_misc_lo, mci_misc_hi; + unsigned int bank, block; unsigned int cpu = smp_processor_id(); + u32 low = 0, high = 0, address = 0; for (bank = 0; bank < NR_BANKS; ++bank) { - rdmsr(MSR_IA32_MC0_MISC + bank * 4, mci_misc_lo, mci_misc_hi); + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) + address = MSR_IA32_MC0_MISC + bank * 4; + else if (block == 1) + address = MCG_XBLK_ADDR + + ((low & MASK_BLKPTR_LO) >> 21); + else + ++address; + + if (rdmsr_safe(address, &low, &high)) + continue; - /* !valid, !counter present, bios locked */ - if (!(mci_misc_hi & MASK_VALID_HI) || - !(mci_misc_hi & MASK_VALID_HI >> 1) || - (mci_misc_hi & MASK_VALID_HI >> 2)) - continue; + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } - per_cpu(bank_map, cpu) |= (1 << bank); + if (!(high & MASK_VALID_HI >> 1) || + (high & MASK_VALID_HI >> 2)) + continue; + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); #ifdef CONFIG_SMP - if (shared_bank[bank] && cpu_core_id[cpu]) - continue; + if (shared_bank[bank] && c->cpu_core_id) + break; #endif + high &= ~MASK_LVTOFF_HI; + high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; + wrmsr(address, low, high); - setup_threshold_lvt((mci_misc_hi & MASK_LVTOFF_HI) >> 20); - threshold_defaults.cpu = cpu; - threshold_defaults.bank = bank; - threshold_restart_bank(&threshold_defaults, 0, 0); + setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, + THRESHOLD_APIC_VECTOR, + K8_APIC_EXT_INT_MSG_FIX, 0); + + threshold_defaults.address = address; + threshold_restart_bank(&threshold_defaults, 0, 0); + } } } @@ -137,8 +173,9 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) */ asmlinkage void mce_threshold_interrupt(void) { - int bank; + unsigned int bank, block; struct mce m; + u32 low = 0, high = 0, address = 0; ack_APIC_irq(); exit_idle(); @@ -150,15 +187,42 @@ asmlinkage void mce_threshold_interrupt(void) /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { - m.bank = MCE_THRESHOLD_BASE + bank; - rdmsrl(MSR_IA32_MC0_MISC + bank * 4, m.misc); + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) + address = MSR_IA32_MC0_MISC + bank * 4; + else if (block == 1) + address = MCG_XBLK_ADDR + + ((low & MASK_BLKPTR_LO) >> 21); + else + ++address; + + if (rdmsr_safe(address, &low, &high)) + continue; - if (m.misc & MASK_OVERFLOW) { - mce_log(&m); - goto out; + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } + + if (!(high & MASK_VALID_HI >> 1) || + (high & MASK_VALID_HI >> 2)) + continue; + + if (high & MASK_OVERFLOW_HI) { + rdmsrl(address, m.misc); + rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, + m.status); + m.bank = K8_MCE_THRESHOLD_BASE + + bank * NR_BLOCKS + + block; + mce_log(&m); + goto out; + } } } - out: +out: irq_exit(); } @@ -166,20 +230,12 @@ asmlinkage void mce_threshold_interrupt(void) * Sysfs Interface */ -static struct sysdev_class threshold_sysclass = { - set_kset_name("threshold"), -}; - -static DEFINE_PER_CPU(struct sys_device, device_threshold); - struct threshold_attr { - struct attribute attr; - ssize_t(*show) (struct threshold_bank *, char *); - ssize_t(*store) (struct threshold_bank *, const char *, size_t count); + struct attribute attr; + ssize_t(*show) (struct threshold_block *, char *); + ssize_t(*store) (struct threshold_block *, const char *, size_t count); }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); - static cpumask_t affinity_set(unsigned int cpu) { cpumask_t oldmask = current->cpus_allowed; @@ -194,15 +250,15 @@ static void affinity_restore(cpumask_t oldmask) set_cpus_allowed(current, oldmask); } -#define SHOW_FIELDS(name) \ - static ssize_t show_ ## name(struct threshold_bank * b, char *buf) \ - { \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ - } +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ +{ \ + return sprintf(buf, "%lx\n", (unsigned long) b->name); \ +} SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) -static ssize_t store_interrupt_enable(struct threshold_bank *b, +static ssize_t store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) { char *end; @@ -219,7 +275,7 @@ static ssize_t store_interrupt_enable(struct threshold_bank *b, return end - buf; } -static ssize_t store_threshold_limit(struct threshold_bank *b, +static ssize_t store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) { char *end; @@ -242,18 +298,18 @@ static ssize_t store_threshold_limit(struct threshold_bank *b, return end - buf; } -static ssize_t show_error_count(struct threshold_bank *b, char *buf) +static ssize_t show_error_count(struct threshold_block *b, char *buf) { u32 high, low; cpumask_t oldmask; oldmask = affinity_set(b->cpu); - rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, low, high); /* ignore low 32 */ + rdmsr(b->address, low, high); affinity_restore(oldmask); return sprintf(buf, "%x\n", (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); } -static ssize_t store_error_count(struct threshold_bank *b, +static ssize_t store_error_count(struct threshold_block *b, const char *buf, size_t count) { cpumask_t oldmask; @@ -269,13 +325,13 @@ static ssize_t store_error_count(struct threshold_bank *b, .store = _store, \ }; -#define ATTR_FIELDS(name) \ - static struct threshold_attr name = \ +#define RW_ATTR(name) \ +static struct threshold_attr name = \ THRESHOLD_ATTR(name, 0644, show_## name, store_## name) -ATTR_FIELDS(interrupt_enable); -ATTR_FIELDS(threshold_limit); -ATTR_FIELDS(error_count); +RW_ATTR(interrupt_enable); +RW_ATTR(threshold_limit); +RW_ATTR(error_count); static struct attribute *default_attrs[] = { &interrupt_enable.attr, @@ -284,12 +340,12 @@ static struct attribute *default_attrs[] = { NULL }; -#define to_bank(k) container_of(k,struct threshold_bank,kobj) -#define to_attr(a) container_of(a,struct threshold_attr,attr) +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct threshold_bank *b = to_bank(kobj); + struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; ret = a->show ? a->show(b, buf) : -EIO; @@ -299,7 +355,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) static ssize_t store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { - struct threshold_bank *b = to_bank(kobj); + struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; ret = a->store ? a->store(b, buf, count) : -EIO; @@ -316,69 +372,174 @@ static struct kobj_type threshold_ktype = { .default_attrs = default_attrs, }; +static __cpuinit int allocate_threshold_blocks(unsigned int cpu, + unsigned int bank, + unsigned int block, + u32 address) +{ + int err; + u32 low, high; + struct threshold_block *b = NULL; + + if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) + return 0; + + if (rdmsr_safe(address, &low, &high)) + goto recurse; + + if (!(high & MASK_VALID_HI)) { + if (block) + goto recurse; + else + return 0; + } + + if (!(high & MASK_VALID_HI >> 1) || + (high & MASK_VALID_HI >> 2)) + goto recurse; + + b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); + if (!b) + return -ENOMEM; + memset(b, 0, sizeof(struct threshold_block)); + + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->threshold_limit = THRESHOLD_MAX; + + INIT_LIST_HEAD(&b->miscj); + + if (per_cpu(threshold_banks, cpu)[bank]->blocks) + list_add(&b->miscj, + &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); + else + per_cpu(threshold_banks, cpu)[bank]->blocks = b; + + kobject_set_name(&b->kobj, "misc%i", block); + b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj; + b->kobj.ktype = &threshold_ktype; + err = kobject_register(&b->kobj); + if (err) + goto out_free; +recurse: + if (!block) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + return 0; + address += MCG_XBLK_ADDR; + } else + ++address; + + err = allocate_threshold_blocks(cpu, bank, ++block, address); + if (err) + goto out_free; + + return err; + +out_free: + if (b) { + kobject_unregister(&b->kobj); + kfree(b); + } + return err; +} + /* symlinks sibling shared banks to first core. first core owns dir/files. */ -static __cpuinit int threshold_create_bank(unsigned int cpu, int bank) +static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { - int err = 0; + int i, err = 0; struct threshold_bank *b = NULL; + cpumask_t oldmask = CPU_MASK_NONE; + char name[32]; + + sprintf(name, "threshold_bank%i", bank); #ifdef CONFIG_SMP - if (cpu_core_id[cpu] && shared_bank[bank]) { /* symlink */ - char name[16]; - unsigned lcpu = first_cpu(cpu_core_map[cpu]); - if (cpu_core_id[lcpu]) - goto out; /* first core not up yet */ + if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */ + i = first_cpu(cpu_core_map[cpu]); + + /* first core not up yet */ + if (cpu_data[i].cpu_core_id) + goto out; + + /* already linked */ + if (per_cpu(threshold_banks, cpu)[bank]) + goto out; + + b = per_cpu(threshold_banks, i)[bank]; - b = per_cpu(threshold_banks, lcpu)[bank]; if (!b) goto out; - sprintf(name, "bank%i", bank); - err = sysfs_create_link(&per_cpu(device_threshold, cpu).kobj, + + err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, &b->kobj, name); if (err) goto out; + + b->cpus = cpu_core_map[cpu]; per_cpu(threshold_banks, cpu)[bank] = b; goto out; } #endif - b = kmalloc(sizeof(struct threshold_bank), GFP_KERNEL); + b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; goto out; } memset(b, 0, sizeof(struct threshold_bank)); - b->cpu = cpu; - b->bank = bank; - b->interrupt_enable = 0; - b->threshold_limit = THRESHOLD_MAX; - kobject_set_name(&b->kobj, "bank%i", bank); - b->kobj.parent = &per_cpu(device_threshold, cpu).kobj; - b->kobj.ktype = &threshold_ktype; - + kobject_set_name(&b->kobj, "threshold_bank%i", bank); + b->kobj.parent = &per_cpu(device_mce, cpu).kobj; +#ifndef CONFIG_SMP + b->cpus = CPU_MASK_ALL; +#else + b->cpus = cpu_core_map[cpu]; +#endif err = kobject_register(&b->kobj); - if (err) { - kfree(b); - goto out; - } + if (err) + goto out_free; + per_cpu(threshold_banks, cpu)[bank] = b; - out: + + oldmask = affinity_set(cpu); + err = allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); + affinity_restore(oldmask); + + if (err) + goto out_free; + + for_each_cpu_mask(i, b->cpus) { + if (i == cpu) + continue; + + err = sysfs_create_link(&per_cpu(device_mce, i).kobj, + &b->kobj, name); + if (err) + goto out; + + per_cpu(threshold_banks, i)[bank] = b; + } + + goto out; + +out_free: + per_cpu(threshold_banks, cpu)[bank] = NULL; + kfree(b); +out: return err; } /* create dir/files for all valid threshold banks */ static __cpuinit int threshold_create_device(unsigned int cpu) { - int bank; + unsigned int bank; int err = 0; - per_cpu(device_threshold, cpu).id = cpu; - per_cpu(device_threshold, cpu).cls = &threshold_sysclass; - err = sysdev_register(&per_cpu(device_threshold, cpu)); - if (err) - goto out; - for (bank = 0; bank < NR_BANKS; ++bank) { if (!(per_cpu(bank_map, cpu) & 1 << bank)) continue; @@ -386,7 +547,7 @@ static __cpuinit int threshold_create_device(unsigned int cpu) if (err) goto out; } - out: +out: return err; } @@ -397,92 +558,85 @@ static __cpuinit int threshold_create_device(unsigned int cpu) * of shared sysfs dir/files, and rest of the cores will be symlinked to it. */ -/* cpu hotplug call removes all symlinks before first core dies */ +static __cpuinit void deallocate_threshold_block(unsigned int cpu, + unsigned int bank) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; + + if (!head) + return; + + list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { + kobject_unregister(&pos->kobj); + list_del(&pos->miscj); + kfree(pos); + } + + kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); + per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; +} + static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank) { + int i = 0; struct threshold_bank *b; - char name[16]; + char name[32]; b = per_cpu(threshold_banks, cpu)[bank]; + if (!b) return; - if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) { - sprintf(name, "bank%i", bank); - sysfs_remove_link(&per_cpu(device_threshold, cpu).kobj, name); - per_cpu(threshold_banks, cpu)[bank] = NULL; - } else { - kobject_unregister(&b->kobj); - kfree(per_cpu(threshold_banks, cpu)[bank]); + + if (!b->blocks) + goto free_out; + + sprintf(name, "threshold_bank%i", bank); + + /* sibling symlink */ + if (shared_bank[bank] && b->blocks->cpu != cpu) { + sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); + per_cpu(threshold_banks, i)[bank] = NULL; + return; + } + + /* remove all sibling symlinks before unregistering */ + for_each_cpu_mask(i, b->cpus) { + if (i == cpu) + continue; + + sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); + per_cpu(threshold_banks, i)[bank] = NULL; } + + deallocate_threshold_block(cpu, bank); + +free_out: + kobject_unregister(&b->kobj); + kfree(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; } static __cpuinit void threshold_remove_device(unsigned int cpu) { - int bank; + unsigned int bank; for (bank = 0; bank < NR_BANKS; ++bank) { if (!(per_cpu(bank_map, cpu) & 1 << bank)) continue; threshold_remove_bank(cpu, bank); } - sysdev_unregister(&per_cpu(device_threshold, cpu)); } -/* link all existing siblings when first core comes up */ -static __cpuinit int threshold_create_symlinks(unsigned int cpu) -{ - int bank, err = 0; - unsigned int lcpu = 0; - - if (cpu_core_id[cpu]) - return 0; - for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { - if (lcpu == cpu) - continue; - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) - continue; - if (!shared_bank[bank]) - continue; - err = threshold_create_bank(lcpu, bank); - } - } - return err; -} - -/* remove all symlinks before first core dies. */ -static __cpuinit void threshold_remove_symlinks(unsigned int cpu) -{ - int bank; - unsigned int lcpu = 0; - if (cpu_core_id[cpu]) - return; - for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { - if (lcpu == cpu) - continue; - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) - continue; - if (!shared_bank[bank]) - continue; - threshold_remove_bank(lcpu, bank); - } - } -} #else /* !CONFIG_HOTPLUG_CPU */ -static __cpuinit void threshold_create_symlinks(unsigned int cpu) -{ -} -static __cpuinit void threshold_remove_symlinks(unsigned int cpu) -{ -} static void threshold_remove_device(unsigned int cpu) { } #endif /* get notified when a cpu comes on/off */ -static int threshold_cpu_callback(struct notifier_block *nfb, +static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { /* cpu was unsigned int to begin with */ @@ -494,13 +648,6 @@ static int threshold_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: threshold_create_device(cpu); - threshold_create_symlinks(cpu); - break; - case CPU_DOWN_PREPARE: - threshold_remove_symlinks(cpu); - break; - case CPU_DOWN_FAILED: - threshold_create_symlinks(cpu); break; case CPU_DEAD: threshold_remove_device(cpu); @@ -512,29 +659,22 @@ static int threshold_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block threshold_cpu_notifier = { +static struct notifier_block threshold_cpu_notifier __cpuinitdata = { .notifier_call = threshold_cpu_callback, }; static __init int threshold_init_device(void) { - int err; - int lcpu = 0; - - err = sysdev_class_register(&threshold_sysclass); - if (err) - goto out; + unsigned lcpu = 0; /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { - err = threshold_create_device(lcpu); + int err = threshold_create_device(lcpu); if (err) - goto out; + return err; } register_cpu_notifier(&threshold_cpu_notifier); - - out: - return err; + return 0; } device_initcall(threshold_init_device); diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c index bac195c74bcc..9d0958ff547f 100644 --- a/arch/x86_64/kernel/module.c +++ b/arch/x86_64/kernel/module.c @@ -145,26 +145,38 @@ int apply_relocate(Elf_Shdr *sechdrs, return -ENOSYS; } -extern void apply_alternatives(void *start, void *end); - int module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) + const Elf_Shdr *sechdrs, + struct module *me) { - const Elf_Shdr *s; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - /* look for .altinstructions to patch */ - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - void *seg; - if (strcmp(".altinstructions", secstrings + s->sh_name)) - continue; - seg = (void *)s->sh_addr; - apply_alternatives(seg, seg + s->sh_size); - } + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".text", secstrings + s->sh_name)) + text = s; + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks= s; + } + + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + if (locks && text) { + void *lseg = (void *)locks->sh_addr; + void *tseg = (void *)text->sh_addr; + alternatives_smp_module_add(me, me->name, + lseg, lseg + locks->sh_size, + tseg, tseg + text->sh_size); + } return 0; } void module_arch_cleanup(struct module *mod) { + alternatives_smp_module_del(mod); } diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 4e6357fe0ec3..399489c93132 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -15,11 +15,7 @@ #include <linux/config.h> #include <linux/mm.h> #include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/smp_lock.h> #include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/kernel_stat.h> #include <linux/module.h> #include <linux/sysdev.h> #include <linux/nmi.h> @@ -27,14 +23,11 @@ #include <linux/kprobes.h> #include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> #include <asm/nmi.h> -#include <asm/msr.h> #include <asm/proto.h> #include <asm/kdebug.h> -#include <asm/local.h> #include <asm/mce.h> +#include <asm/intel_arch_perfmon.h> /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: @@ -74,6 +67,9 @@ static unsigned int nmi_p4_cccr_val; #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + #define MSR_P4_MISC_ENABLE 0x1A0 #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) #define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) @@ -105,7 +101,10 @@ static __cpuinit inline int nmi_known_cpu(void) case X86_VENDOR_AMD: return boot_cpu_data.x86 == 15; case X86_VENDOR_INTEL: - return boot_cpu_data.x86 == 15; + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return 1; + else + return (boot_cpu_data.x86 == 15); } return 0; } @@ -211,6 +210,8 @@ int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); +static void disable_intel_arch_watchdog(void); + static void disable_lapic_nmi_watchdog(void) { if (nmi_active <= 0) @@ -223,6 +224,8 @@ static void disable_lapic_nmi_watchdog(void) if (boot_cpu_data.x86 == 15) { wrmsr(MSR_P4_IQ_CCCR0, 0, 0); wrmsr(MSR_P4_CRU_ESCR0, 0, 0); + } else if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + disable_intel_arch_watchdog(); } break; } @@ -375,6 +378,53 @@ static void setup_k7_watchdog(void) wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); } +static void disable_intel_arch_watchdog(void) +{ + unsigned ebx; + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebp indicates event present. + */ + ebx = cpuid_ebx(10); + if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0); +} + +static int setup_intel_arch_watchdog(void) +{ + unsigned int evntsel; + unsigned ebx; + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebp indicates event present. + */ + ebx = cpuid_ebx(10); + if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return 0; + + nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + + clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2); + clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); + wrmsrl(MSR_ARCH_PERFMON_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); + return 1; +} + static int setup_p4_watchdog(void) { @@ -428,10 +478,16 @@ void setup_apic_nmi_watchdog(void) setup_k7_watchdog(); break; case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 != 15) - return; - if (!setup_p4_watchdog()) + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + if (!setup_intel_arch_watchdog()) + return; + } else if (boot_cpu_data.x86 == 15) { + if (!setup_p4_watchdog()) + return; + } else { return; + } + break; default: @@ -516,7 +572,14 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) */ wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); apic_write(APIC_LVTPC, APIC_DM_NMI); - } + } else if (nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + /* + * For Intel based architectural perfmon + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + } wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); } } diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c new file mode 100644 index 000000000000..d91cb843f54d --- /dev/null +++ b/arch/x86_64/kernel/pci-calgary.c @@ -0,0 +1,1018 @@ +/* + * Derived from arch/powerpc/kernel/iommu.c + * + * Copyright (C) 2006 Jon Mason <jdmason@us.ibm.com>, IBM Corporation + * Copyright (C) 2006 Muli Ben-Yehuda <muli@il.ibm.com>, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/dma-mapping.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/pci_ids.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <asm/proto.h> +#include <asm/calgary.h> +#include <asm/tce.h> +#include <asm/pci-direct.h> +#include <asm/system.h> +#include <asm/dma.h> + +#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 +#define PCI_VENDOR_DEVICE_ID_CALGARY \ + (PCI_VENDOR_ID_IBM | PCI_DEVICE_ID_IBM_CALGARY << 16) + +/* we need these for register space address calculation */ +#define START_ADDRESS 0xfe000000 +#define CHASSIS_BASE 0 +#define ONE_BASED_CHASSIS_NUM 1 + +/* register offsets inside the host bridge space */ +#define PHB_CSR_OFFSET 0x0110 +#define PHB_PLSSR_OFFSET 0x0120 +#define PHB_CONFIG_RW_OFFSET 0x0160 +#define PHB_IOBASE_BAR_LOW 0x0170 +#define PHB_IOBASE_BAR_HIGH 0x0180 +#define PHB_MEM_1_LOW 0x0190 +#define PHB_MEM_1_HIGH 0x01A0 +#define PHB_IO_ADDR_SIZE 0x01B0 +#define PHB_MEM_1_SIZE 0x01C0 +#define PHB_MEM_ST_OFFSET 0x01D0 +#define PHB_AER_OFFSET 0x0200 +#define PHB_CONFIG_0_HIGH 0x0220 +#define PHB_CONFIG_0_LOW 0x0230 +#define PHB_CONFIG_0_END 0x0240 +#define PHB_MEM_2_LOW 0x02B0 +#define PHB_MEM_2_HIGH 0x02C0 +#define PHB_MEM_2_SIZE_HIGH 0x02D0 +#define PHB_MEM_2_SIZE_LOW 0x02E0 +#define PHB_DOSHOLE_OFFSET 0x08E0 + +/* PHB_CONFIG_RW */ +#define PHB_TCE_ENABLE 0x20000000 +#define PHB_SLOT_DISABLE 0x1C000000 +#define PHB_DAC_DISABLE 0x01000000 +#define PHB_MEM2_ENABLE 0x00400000 +#define PHB_MCSR_ENABLE 0x00100000 +/* TAR (Table Address Register) */ +#define TAR_SW_BITS 0x0000ffffffff800fUL +#define TAR_VALID 0x0000000000000008UL +/* CSR (Channel/DMA Status Register) */ +#define CSR_AGENT_MASK 0xffe0ffff + +#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ +#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */ +#define PHBS_PER_CALGARY 4 + +/* register offsets in Calgary's internal register space */ +static const unsigned long tar_offsets[] = { + 0x0580 /* TAR0 */, + 0x0588 /* TAR1 */, + 0x0590 /* TAR2 */, + 0x0598 /* TAR3 */ +}; + +static const unsigned long split_queue_offsets[] = { + 0x4870 /* SPLIT QUEUE 0 */, + 0x5870 /* SPLIT QUEUE 1 */, + 0x6870 /* SPLIT QUEUE 2 */, + 0x7870 /* SPLIT QUEUE 3 */ +}; + +static const unsigned long phb_offsets[] = { + 0x8000 /* PHB0 */, + 0x9000 /* PHB1 */, + 0xA000 /* PHB2 */, + 0xB000 /* PHB3 */ +}; + +void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES]; +unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; +static int translate_empty_slots __read_mostly = 0; +static int calgary_detected __read_mostly = 0; + +/* + * the bitmap of PHBs the user requested that we disable + * translation on. + */ +static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM); + +static void tce_cache_blast(struct iommu_table *tbl); + +/* enable this to stress test the chip's TCE cache */ +#ifdef CONFIG_IOMMU_DEBUG +static inline void tce_cache_blast_stress(struct iommu_table *tbl) +{ + tce_cache_blast(tbl); +} +#else +static inline void tce_cache_blast_stress(struct iommu_table *tbl) +{ +} +#endif /* BLAST_TCE_CACHE_ON_UNMAP */ + +static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) +{ + unsigned int npages; + + npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); + npages >>= PAGE_SHIFT; + + return npages; +} + +static inline int translate_phb(struct pci_dev* dev) +{ + int disabled = test_bit(dev->bus->number, translation_disabled); + return !disabled; +} + +static void iommu_range_reserve(struct iommu_table *tbl, + unsigned long start_addr, unsigned int npages) +{ + unsigned long index; + unsigned long end; + + index = start_addr >> PAGE_SHIFT; + + /* bail out if we're asked to reserve a region we don't cover */ + if (index >= tbl->it_size) + return; + + end = index + npages; + if (end > tbl->it_size) /* don't go off the table */ + end = tbl->it_size; + + while (index < end) { + if (test_bit(index, tbl->it_map)) + printk(KERN_ERR "Calgary: entry already allocated at " + "0x%lx tbl %p dma 0x%lx npages %u\n", + index, tbl, start_addr, npages); + ++index; + } + set_bit_string(tbl->it_map, start_addr >> PAGE_SHIFT, npages); +} + +static unsigned long iommu_range_alloc(struct iommu_table *tbl, + unsigned int npages) +{ + unsigned long offset; + + BUG_ON(npages == 0); + + offset = find_next_zero_string(tbl->it_map, tbl->it_hint, + tbl->it_size, npages); + if (offset == ~0UL) { + tce_cache_blast(tbl); + offset = find_next_zero_string(tbl->it_map, 0, + tbl->it_size, npages); + if (offset == ~0UL) { + printk(KERN_WARNING "Calgary: IOMMU full.\n"); + if (panic_on_overflow) + panic("Calgary: fix the allocator.\n"); + else + return bad_dma_address; + } + } + + set_bit_string(tbl->it_map, offset, npages); + tbl->it_hint = offset + npages; + BUG_ON(tbl->it_hint > tbl->it_size); + + return offset; +} + +static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, + unsigned int npages, int direction) +{ + unsigned long entry, flags; + dma_addr_t ret = bad_dma_address; + + spin_lock_irqsave(&tbl->it_lock, flags); + + entry = iommu_range_alloc(tbl, npages); + + if (unlikely(entry == bad_dma_address)) + goto error; + + /* set the return dma address */ + ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); + + /* put the TCEs in the HW table */ + tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, + direction); + + spin_unlock_irqrestore(&tbl->it_lock, flags); + + return ret; + +error: + spin_unlock_irqrestore(&tbl->it_lock, flags); + printk(KERN_WARNING "Calgary: failed to allocate %u pages in " + "iommu %p\n", npages, tbl); + return bad_dma_address; +} + +static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long entry; + unsigned long i; + + entry = dma_addr >> PAGE_SHIFT; + + BUG_ON(entry + npages > tbl->it_size); + + tce_free(tbl, entry, npages); + + for (i = 0; i < npages; ++i) { + if (!test_bit(entry + i, tbl->it_map)) + printk(KERN_ERR "Calgary: bit is off at 0x%lx " + "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", + entry + i, tbl, dma_addr, entry, npages); + } + + __clear_bit_string(tbl->it_map, entry, npages); + + tce_cache_blast_stress(tbl); +} + +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long flags; + + spin_lock_irqsave(&tbl->it_lock, flags); + + __iommu_free(tbl, dma_addr, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); +} + +static void __calgary_unmap_sg(struct iommu_table *tbl, + struct scatterlist *sglist, int nelems, int direction) +{ + while (nelems--) { + unsigned int npages; + dma_addr_t dma = sglist->dma_address; + unsigned int dmalen = sglist->dma_length; + + if (dmalen == 0) + break; + + npages = num_dma_pages(dma, dmalen); + __iommu_free(tbl, dma, npages); + sglist++; + } +} + +void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int direction) +{ + unsigned long flags; + struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + + if (!translate_phb(to_pci_dev(dev))) + return; + + spin_lock_irqsave(&tbl->it_lock, flags); + + __calgary_unmap_sg(tbl, sglist, nelems, direction); + + spin_unlock_irqrestore(&tbl->it_lock, flags); +} + +static int calgary_nontranslate_map_sg(struct device* dev, + struct scatterlist *sg, int nelems, int direction) +{ + int i; + + for (i = 0; i < nelems; i++ ) { + struct scatterlist *s = &sg[i]; + BUG_ON(!s->page); + s->dma_address = virt_to_bus(page_address(s->page) +s->offset); + s->dma_length = s->length; + } + return nelems; +} + +int calgary_map_sg(struct device *dev, struct scatterlist *sg, + int nelems, int direction) +{ + struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + unsigned long flags; + unsigned long vaddr; + unsigned int npages; + unsigned long entry; + int i; + + if (!translate_phb(to_pci_dev(dev))) + return calgary_nontranslate_map_sg(dev, sg, nelems, direction); + + spin_lock_irqsave(&tbl->it_lock, flags); + + for (i = 0; i < nelems; i++ ) { + struct scatterlist *s = &sg[i]; + BUG_ON(!s->page); + + vaddr = (unsigned long)page_address(s->page) + s->offset; + npages = num_dma_pages(vaddr, s->length); + + entry = iommu_range_alloc(tbl, npages); + if (entry == bad_dma_address) { + /* makes sure unmap knows to stop */ + s->dma_length = 0; + goto error; + } + + s->dma_address = (entry << PAGE_SHIFT) | s->offset; + + /* insert into HW table */ + tce_build(tbl, entry, npages, vaddr & PAGE_MASK, + direction); + + s->dma_length = s->length; + } + + spin_unlock_irqrestore(&tbl->it_lock, flags); + + return nelems; +error: + __calgary_unmap_sg(tbl, sg, nelems, direction); + for (i = 0; i < nelems; i++) { + sg[i].dma_address = bad_dma_address; + sg[i].dma_length = 0; + } + spin_unlock_irqrestore(&tbl->it_lock, flags); + return 0; +} + +dma_addr_t calgary_map_single(struct device *dev, void *vaddr, + size_t size, int direction) +{ + dma_addr_t dma_handle = bad_dma_address; + unsigned long uaddr; + unsigned int npages; + struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + + uaddr = (unsigned long)vaddr; + npages = num_dma_pages(uaddr, size); + + if (translate_phb(to_pci_dev(dev))) + dma_handle = iommu_alloc(tbl, vaddr, npages, direction); + else + dma_handle = virt_to_bus(vaddr); + + return dma_handle; +} + +void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, + size_t size, int direction) +{ + struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + unsigned int npages; + + if (!translate_phb(to_pci_dev(dev))) + return; + + npages = num_dma_pages(dma_handle, size); + iommu_free(tbl, dma_handle, npages); +} + +void* calgary_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret = NULL; + dma_addr_t mapping; + unsigned int npages, order; + struct iommu_table *tbl; + + tbl = to_pci_dev(dev)->bus->self->sysdata; + + size = PAGE_ALIGN(size); /* size rounded up to full pages */ + npages = size >> PAGE_SHIFT; + order = get_order(size); + + /* alloc enough pages (and possibly more) */ + ret = (void *)__get_free_pages(flag, order); + if (!ret) + goto error; + memset(ret, 0, size); + + if (translate_phb(to_pci_dev(dev))) { + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); + if (mapping == bad_dma_address) + goto free; + + *dma_handle = mapping; + } else /* non translated slot */ + *dma_handle = virt_to_bus(ret); + + return ret; + +free: + free_pages((unsigned long)ret, get_order(size)); + ret = NULL; +error: + return ret; +} + +static struct dma_mapping_ops calgary_dma_ops = { + .alloc_coherent = calgary_alloc_coherent, + .map_single = calgary_map_single, + .unmap_single = calgary_unmap_single, + .map_sg = calgary_map_sg, + .unmap_sg = calgary_unmap_sg, +}; + +static inline int busno_to_phbid(unsigned char num) +{ + return bus_to_phb(num) % PHBS_PER_CALGARY; +} + +static inline unsigned long split_queue_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return split_queue_offsets[idx]; +} + +static inline unsigned long tar_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return tar_offsets[idx]; +} + +static inline unsigned long phb_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return phb_offsets[idx]; +} + +static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) +{ + unsigned long target = ((unsigned long)bar) | offset; + return (void __iomem*)target; +} + +static void tce_cache_blast(struct iommu_table *tbl) +{ + u64 val; + u32 aer; + int i = 0; + void __iomem *bbar = tbl->bbar; + void __iomem *target; + + /* disable arbitration on the bus */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); + aer = readl(target); + writel(0, target); + + /* read plssr to ensure it got there */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); + val = readl(target); + + /* poll split queues until all DMA activity is done */ + target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); + do { + val = readq(target); + i++; + } while ((val & 0xff) != 0xff && i < 100); + if (i == 100) + printk(KERN_WARNING "Calgary: PCI bus not quiesced, " + "continuing anyway\n"); + + /* invalidate TCE cache */ + target = calgary_reg(bbar, tar_offset(tbl->it_busno)); + writeq(tbl->tar_val, target); + + /* enable arbitration */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); + writel(aer, target); + (void)readl(target); /* flush */ +} + +static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, + u64 limit) +{ + unsigned int numpages; + + limit = limit | 0xfffff; + limit++; + + numpages = ((limit - start) >> PAGE_SHIFT); + iommu_range_reserve(dev->sysdata, start, numpages); +} + +static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) +{ + void __iomem *target; + u64 low, high, sizelow; + u64 start, limit; + struct iommu_table *tbl = dev->sysdata; + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + + /* peripheral MEM_1 region */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW); + low = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); + high = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); + sizelow = be32_to_cpu(readl(target)); + + start = (high << 32) | low; + limit = sizelow; + + calgary_reserve_mem_region(dev, start, limit); +} + +static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) +{ + void __iomem *target; + u32 val32; + u64 low, high, sizelow, sizehigh; + u64 start, limit; + struct iommu_table *tbl = dev->sysdata; + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + + /* is it enabled? */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + if (!(val32 & PHB_MEM2_ENABLE)) + return; + + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); + low = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); + high = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); + sizelow = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); + sizehigh = be32_to_cpu(readl(target)); + + start = (high << 32) | low; + limit = (sizehigh << 32) | sizelow; + + calgary_reserve_mem_region(dev, start, limit); +} + +/* + * some regions of the IO address space do not get translated, so we + * must not give devices IO addresses in those regions. The regions + * are the 640KB-1MB region and the two PCI peripheral memory holes. + * Reserve all of them in the IOMMU bitmap to avoid giving them out + * later. + */ +static void __init calgary_reserve_regions(struct pci_dev *dev) +{ + unsigned int npages; + void __iomem *bbar; + unsigned char busnum; + u64 start; + struct iommu_table *tbl = dev->sysdata; + + bbar = tbl->bbar; + busnum = dev->bus->number; + + /* reserve bad_dma_address in case it's a legal address */ + iommu_range_reserve(tbl, bad_dma_address, 1); + + /* avoid the BIOS/VGA first 640KB-1MB region */ + start = (640 * 1024); + npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; + iommu_range_reserve(tbl, start, npages); + + /* reserve the two PCI peripheral memory regions in IO space */ + calgary_reserve_peripheral_mem_1(dev); + calgary_reserve_peripheral_mem_2(dev); +} + +static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) +{ + u64 val64; + u64 table_phys; + void __iomem *target; + int ret; + struct iommu_table *tbl; + + /* build TCE tables for each PHB */ + ret = build_tce_table(dev, bbar); + if (ret) + return ret; + + calgary_reserve_regions(dev); + + /* set TARs for each PHB */ + target = calgary_reg(bbar, tar_offset(dev->bus->number)); + val64 = be64_to_cpu(readq(target)); + + /* zero out all TAR bits under sw control */ + val64 &= ~TAR_SW_BITS; + + tbl = dev->sysdata; + table_phys = (u64)__pa(tbl->it_base); + val64 |= table_phys; + + BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); + val64 |= (u64) specified_table_size; + + tbl->tar_val = cpu_to_be64(val64); + writeq(tbl->tar_val, target); + readq(target); /* flush */ + + return 0; +} + +static void __init calgary_free_tar(struct pci_dev *dev) +{ + u64 val64; + struct iommu_table *tbl = dev->sysdata; + void __iomem *target; + + target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); + val64 = be64_to_cpu(readq(target)); + val64 &= ~TAR_SW_BITS; + writeq(cpu_to_be64(val64), target); + readq(target); /* flush */ + + kfree(tbl); + dev->sysdata = NULL; +} + +static void calgary_watchdog(unsigned long data) +{ + struct pci_dev *dev = (struct pci_dev *)data; + struct iommu_table *tbl = dev->sysdata; + void __iomem *bbar = tbl->bbar; + u32 val32; + void __iomem *target; + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); + val32 = be32_to_cpu(readl(target)); + + /* If no error, the agent ID in the CSR is not valid */ + if (val32 & CSR_AGENT_MASK) { + printk(KERN_EMERG "calgary_watchdog: DMA error on bus %d, " + "CSR = %#x\n", dev->bus->number, val32); + writel(0, target); + + /* Disable bus that caused the error */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | + PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 |= PHB_SLOT_DISABLE; + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + } else { + /* Reset the timer */ + mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); + } +} + +static void __init calgary_enable_translation(struct pci_dev *dev) +{ + u32 val32; + unsigned char busnum; + void __iomem *target; + void __iomem *bbar; + struct iommu_table *tbl; + + busnum = dev->bus->number; + tbl = dev->sysdata; + bbar = tbl->bbar; + + /* enable TCE in PHB Config Register */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; + + printk(KERN_INFO "Calgary: enabling translation on PHB %d\n", busnum); + printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " + "bus.\n"); + + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + + init_timer(&tbl->watchdog_timer); + tbl->watchdog_timer.function = &calgary_watchdog; + tbl->watchdog_timer.data = (unsigned long)dev; + mod_timer(&tbl->watchdog_timer, jiffies); +} + +static void __init calgary_disable_translation(struct pci_dev *dev) +{ + u32 val32; + unsigned char busnum; + void __iomem *target; + void __iomem *bbar; + struct iommu_table *tbl; + + busnum = dev->bus->number; + tbl = dev->sysdata; + bbar = tbl->bbar; + + /* disable TCE in PHB Config Register */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); + + printk(KERN_INFO "Calgary: disabling translation on PHB %d!\n", busnum); + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + + del_timer_sync(&tbl->watchdog_timer); +} + +static inline unsigned int __init locate_register_space(struct pci_dev *dev) +{ + int rionodeid; + u32 address; + + rionodeid = (dev->bus->number % 15 > 4) ? 3 : 2; + /* + * register space address calculation as follows: + * FE0MB-8MB*OneBasedChassisNumber+1MB*(RioNodeId-ChassisBase) + * ChassisBase is always zero for x366/x260/x460 + * RioNodeId is 2 for first Calgary, 3 for second Calgary + */ + address = START_ADDRESS - + (0x800000 * (ONE_BASED_CHASSIS_NUM + dev->bus->number / 15)) + + (0x100000) * (rionodeid - CHASSIS_BASE); + return address; +} + +static int __init calgary_init_one_nontraslated(struct pci_dev *dev) +{ + dev->sysdata = NULL; + dev->bus->self = dev; + + return 0; +} + +static int __init calgary_init_one(struct pci_dev *dev) +{ + u32 address; + void __iomem *bbar; + int ret; + + address = locate_register_space(dev); + /* map entire 1MB of Calgary config space */ + bbar = ioremap_nocache(address, 1024 * 1024); + if (!bbar) { + ret = -ENODATA; + goto done; + } + + ret = calgary_setup_tar(dev, bbar); + if (ret) + goto iounmap; + + dev->bus->self = dev; + calgary_enable_translation(dev); + + return 0; + +iounmap: + iounmap(bbar); +done: + return ret; +} + +static int __init calgary_init(void) +{ + int i, ret = -ENODEV; + struct pci_dev *dev = NULL; + + for (i = 0; i <= num_online_nodes() * MAX_NUM_OF_PHBS; i++) { + dev = pci_get_device(PCI_VENDOR_ID_IBM, + PCI_DEVICE_ID_IBM_CALGARY, + dev); + if (!dev) + break; + if (!translate_phb(dev)) { + calgary_init_one_nontraslated(dev); + continue; + } + if (!tce_table_kva[i] && !translate_empty_slots) { + pci_dev_put(dev); + continue; + } + ret = calgary_init_one(dev); + if (ret) + goto error; + } + + return ret; + +error: + for (i--; i >= 0; i--) { + dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM, + PCI_DEVICE_ID_IBM_CALGARY, + dev); + if (!translate_phb(dev)) { + pci_dev_put(dev); + continue; + } + if (!tce_table_kva[i] && !translate_empty_slots) + continue; + calgary_disable_translation(dev); + calgary_free_tar(dev); + pci_dev_put(dev); + } + + return ret; +} + +static inline int __init determine_tce_table_size(u64 ram) +{ + int ret; + + if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) + return specified_table_size; + + /* + * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to + * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each + * larger table size has twice as many entries, so shift the + * max ram address by 13 to divide by 8K and then look at the + * order of the result to choose between 0-7. + */ + ret = get_order(ram >> 13); + if (ret > TCE_TABLE_SIZE_8M) + ret = TCE_TABLE_SIZE_8M; + + return ret; +} + +void __init detect_calgary(void) +{ + u32 val; + int bus, table_idx; + void *tbl; + int detected = 0; + + /* + * if the user specified iommu=off or iommu=soft or we found + * another HW IOMMU already, bail out. + */ + if (swiotlb || no_iommu || iommu_detected) + return; + + specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); + + for (bus = 0, table_idx = 0; + bus <= num_online_nodes() * MAX_PHB_BUS_NUM; + bus++) { + BUG_ON(bus > MAX_NUMNODES * MAX_PHB_BUS_NUM); + if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) + continue; + if (test_bit(bus, translation_disabled)) { + printk(KERN_INFO "Calgary: translation is disabled for " + "PHB 0x%x\n", bus); + /* skip this phb, don't allocate a tbl for it */ + tce_table_kva[table_idx] = NULL; + table_idx++; + continue; + } + /* + * scan the first slot of the PCI bus to see if there + * are any devices present + */ + val = read_pci_config(bus, 1, 0, 0); + if (val != 0xffffffff || translate_empty_slots) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + detected = 1; + } else + tbl = NULL; + + tce_table_kva[table_idx] = tbl; + table_idx++; + } + + if (detected) { + iommu_detected = 1; + calgary_detected = 1; + printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " + "TCE table spec is %d.\n", specified_table_size); + } + return; + +cleanup: + for (--table_idx; table_idx >= 0; --table_idx) + if (tce_table_kva[table_idx]) + free_tce_table(tce_table_kva[table_idx]); +} + +int __init calgary_iommu_init(void) +{ + int ret; + + if (no_iommu || swiotlb) + return -ENODEV; + + if (!calgary_detected) + return -ENODEV; + + /* ok, we're trying to use Calgary - let's roll */ + printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); + + ret = calgary_init(); + if (ret) { + printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " + "falling back to no_iommu\n", ret); + if (end_pfn > MAX_DMA32_PFN) + printk(KERN_ERR "WARNING more than 4GB of memory, " + "32bit PCI may malfunction.\n"); + return ret; + } + + force_iommu = 1; + dma_ops = &calgary_dma_ops; + + return 0; +} + +static int __init calgary_parse_options(char *p) +{ + unsigned int bridge; + size_t len; + char* endp; + + while (*p) { + if (!strncmp(p, "64k", 3)) + specified_table_size = TCE_TABLE_SIZE_64K; + else if (!strncmp(p, "128k", 4)) + specified_table_size = TCE_TABLE_SIZE_128K; + else if (!strncmp(p, "256k", 4)) + specified_table_size = TCE_TABLE_SIZE_256K; + else if (!strncmp(p, "512k", 4)) + specified_table_size = TCE_TABLE_SIZE_512K; + else if (!strncmp(p, "1M", 2)) + specified_table_size = TCE_TABLE_SIZE_1M; + else if (!strncmp(p, "2M", 2)) + specified_table_size = TCE_TABLE_SIZE_2M; + else if (!strncmp(p, "4M", 2)) + specified_table_size = TCE_TABLE_SIZE_4M; + else if (!strncmp(p, "8M", 2)) + specified_table_size = TCE_TABLE_SIZE_8M; + + len = strlen("translate_empty_slots"); + if (!strncmp(p, "translate_empty_slots", len)) + translate_empty_slots = 1; + + len = strlen("disable"); + if (!strncmp(p, "disable", len)) { + p += len; + if (*p == '=') + ++p; + if (*p == '\0') + break; + bridge = simple_strtol(p, &endp, 0); + if (p == endp) + break; + + if (bridge <= (num_online_nodes() * MAX_PHB_BUS_NUM)) { + printk(KERN_INFO "Calgary: disabling " + "translation for PHB 0x%x\n", bridge); + set_bit(bridge, translation_disabled); + } + } + + p = strpbrk(p, ","); + if (!p) + break; + + p++; /* skip ',' */ + } + return 1; +} +__setup("calgary=", calgary_parse_options); diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index a9275c9557cf..9c44f4f2433d 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <asm/io.h> #include <asm/proto.h> +#include <asm/calgary.h> int iommu_merge __read_mostly = 0; EXPORT_SYMBOL(iommu_merge); @@ -33,12 +34,15 @@ int panic_on_overflow __read_mostly = 0; int force_iommu __read_mostly= 0; #endif +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; + /* Dummy device used for NULL arguments (normally ISA). Better would be probably a smaller DMA mask, but this is bug-to-bug compatible to i386. */ struct device fallback_dev = { .bus_id = "fallback device", - .coherent_dma_mask = 0xffffffff, + .coherent_dma_mask = DMA_32BIT_MASK, .dma_mask = &fallback_dev.coherent_dma_mask, }; @@ -77,7 +81,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, dev = &fallback_dev; dma_mask = dev->coherent_dma_mask; if (dma_mask == 0) - dma_mask = 0xffffffff; + dma_mask = DMA_32BIT_MASK; /* Don't invoke OOM killer */ gfp |= __GFP_NORETRY; @@ -90,7 +94,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, larger than 16MB and in this case we have a chance of finding fitting memory in the next higher zone first. If not retry with true GFP_DMA. -AK */ - if (dma_mask <= 0xffffffff) + if (dma_mask <= DMA_32BIT_MASK) gfp |= GFP_DMA32; again: @@ -111,7 +115,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* Don't use the 16MB ZONE_DMA unless absolutely needed. It's better to use remapping first. */ - if (dma_mask < 0xffffffff && !(gfp & GFP_DMA)) { + if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } @@ -174,7 +178,7 @@ int dma_supported(struct device *dev, u64 mask) /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. The caller just has to use GFP_DMA in this case. */ - if (mask < 0x00ffffff) + if (mask < DMA_24BIT_MASK) return 0; /* Tell the device to use SAC when IOMMU force is on. This @@ -189,7 +193,7 @@ int dma_supported(struct device *dev, u64 mask) SAC for these. Assume all masks <= 40 bits are of this type. Normally this doesn't make any difference, but gives more gentle handling of IOMMU overflow. */ - if (iommu_sac_force && (mask >= 0xffffffffffULL)) { + if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); return 0; } @@ -266,7 +270,7 @@ __init int iommu_setup(char *p) swiotlb = 1; #endif -#ifdef CONFIG_GART_IOMMU +#ifdef CONFIG_IOMMU gart_parse_options(p); #endif @@ -276,3 +280,40 @@ __init int iommu_setup(char *p) } return 1; } +__setup("iommu=", iommu_setup); + +void __init pci_iommu_alloc(void) +{ + /* + * The order of these functions is important for + * fall-back/fail-over reasons + */ +#ifdef CONFIG_IOMMU + iommu_hole_init(); +#endif + +#ifdef CONFIG_CALGARY_IOMMU + detect_calgary(); +#endif + +#ifdef CONFIG_SWIOTLB + pci_swiotlb_init(); +#endif +} + +static int __init pci_iommu_init(void) +{ +#ifdef CONFIG_CALGARY_IOMMU + calgary_iommu_init(); +#endif + +#ifdef CONFIG_IOMMU + gart_iommu_init(); +#endif + + no_iommu_init(); + return 0; +} + +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 82a7c9bfdfa0..4ca674d16b09 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -32,6 +32,7 @@ #include <asm/kdebug.h> #include <asm/swiotlb.h> #include <asm/dma.h> +#include <asm/k8.h> unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -46,8 +47,6 @@ u32 *iommu_gatt_base; /* Remapping table */ also seen with Qlogic at least). */ int iommu_fullflush = 1; -#define MAX_NB 8 - /* Allocation bitmap for the remapping area */ static DEFINE_SPINLOCK(iommu_bitmap_lock); static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ @@ -63,13 +62,6 @@ static u32 gart_unmapped_entry; #define to_pages(addr,size) \ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) -#define for_all_nb(dev) \ - dev = NULL; \ - while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL) - -static struct pci_dev *northbridges[MAX_NB]; -static u32 northbridge_flush_word[MAX_NB]; - #define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP @@ -93,7 +85,7 @@ static unsigned long alloc_iommu(int size) offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); if (offset == -1) { need_flush = 1; - offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size); + offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); } if (offset != -1) { set_bit_string(iommu_gart_bitmap, offset, size); @@ -120,44 +112,17 @@ static void free_iommu(unsigned long offset, int size) /* * Use global flush state to avoid races with multiple flushers. */ -static void flush_gart(struct device *dev) +static void flush_gart(void) { unsigned long flags; - int flushed = 0; - int i, max; - spin_lock_irqsave(&iommu_bitmap_lock, flags); - if (need_flush) { - max = 0; - for (i = 0; i < MAX_NB; i++) { - if (!northbridges[i]) - continue; - pci_write_config_dword(northbridges[i], 0x9c, - northbridge_flush_word[i] | 1); - flushed++; - max = i; - } - for (i = 0; i <= max; i++) { - u32 w; - if (!northbridges[i]) - continue; - /* Make sure the hardware actually executed the flush. */ - for (;;) { - pci_read_config_dword(northbridges[i], 0x9c, &w); - if (!(w & 1)) - break; - cpu_relax(); - } - } - if (!flushed) - printk("nothing to flush?\n"); + if (need_flush) { + k8_flush_garts(); need_flush = 0; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } - - #ifdef CONFIG_IOMMU_LEAK #define SET_LEAK(x) if (iommu_leak_tab) \ @@ -266,7 +231,7 @@ static dma_addr_t gart_map_simple(struct device *dev, char *buf, size_t size, int dir) { dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); - flush_gart(dev); + flush_gart(); return map; } @@ -289,6 +254,28 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) } /* + * Free a DMA mapping. + */ +void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + unsigned long iommu_page; + int npages; + int i; + + if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || + dma_addr >= iommu_bus_base + iommu_size) + return; + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; + npages = to_pages(dma_addr, size); + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; + CLEAR_LEAK(iommu_page + i); + } + free_iommu(iommu_page, npages); +} + +/* * Wrapper for pci_unmap_single working with scatterlists. */ void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) @@ -299,7 +286,7 @@ void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int di struct scatterlist *s = &sg[i]; if (!s->dma_length || !s->length) break; - dma_unmap_single(dev, s->dma_address, s->dma_length, dir); + gart_unmap_single(dev, s->dma_address, s->dma_length, dir); } } @@ -329,7 +316,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, s->dma_address = addr; s->dma_length = s->length; } - flush_gart(dev); + flush_gart(); return nents; } @@ -436,13 +423,13 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) goto error; out++; - flush_gart(dev); + flush_gart(); if (out < nents) sg[out].dma_length = 0; return out; error: - flush_gart(NULL); + flush_gart(); gart_unmap_sg(dev, sg, nents, dir); /* When it was forced or merged try again in a dumb way */ if (force_iommu || iommu_merge) { @@ -458,28 +445,6 @@ error: return 0; } -/* - * Free a DMA mapping. - */ -void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - unsigned long iommu_page; - int npages; - int i; - - if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || - dma_addr >= iommu_bus_base + iommu_size) - return; - iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = to_pages(dma_addr, size); - for (i = 0; i < npages; i++) { - iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - CLEAR_LEAK(iommu_page + i); - } - free_iommu(iommu_page, npages); -} - static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -532,10 +497,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) void *gatt; unsigned aper_base, new_aper_base; unsigned aper_size, gatt_size, new_aper_size; - + int i; + printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; - for_all_nb(dev) { + dev = NULL; + for (i = 0; i < num_k8_northbridges; i++) { + dev = k8_northbridges[i]; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -558,11 +526,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info) panic("Cannot allocate GATT table"); memset(gatt, 0, gatt_size); agp_gatt_table = gatt; - - for_all_nb(dev) { + + for (i = 0; i < num_k8_northbridges; i++) { u32 ctl; u32 gatt_reg; + dev = k8_northbridges[i]; gatt_reg = __pa(gatt) >> 12; gatt_reg <<= 4; pci_write_config_dword(dev, 0x98, gatt_reg); @@ -573,7 +542,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) pci_write_config_dword(dev, 0x90, ctl); } - flush_gart(NULL); + flush_gart(); printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); return 0; @@ -602,15 +571,19 @@ static struct dma_mapping_ops gart_dma_ops = { .unmap_sg = gart_unmap_sg, }; -static int __init pci_iommu_init(void) +void __init gart_iommu_init(void) { struct agp_kern_info info; unsigned long aper_size; unsigned long iommu_start; - struct pci_dev *dev; unsigned long scratch; long i; + if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { + printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n"); + return; + } + #ifndef CONFIG_AGP_AMD64 no_agp = 1; #else @@ -622,7 +595,11 @@ static int __init pci_iommu_init(void) #endif if (swiotlb) - return -1; + return; + + /* Did we detect a different HW IOMMU? */ + if (iommu_detected && !iommu_aperture) + return; if (no_iommu || (!force_iommu && end_pfn <= MAX_DMA32_PFN) || @@ -634,15 +611,7 @@ static int __init pci_iommu_init(void) "but IOMMU not available.\n" KERN_ERR "WARNING 32bit PCI may malfunction.\n"); } - return -1; - } - - i = 0; - for_all_nb(dev) - i++; - if (i > MAX_NB) { - printk(KERN_ERR "PCI-GART: Too many northbridges (%ld). Disabled\n", i); - return -1; + return; } printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); @@ -707,26 +676,10 @@ static int __init pci_iommu_init(void) for (i = EMERGENCY_PAGES; i < iommu_pages; i++) iommu_gatt_base[i] = gart_unmapped_entry; - for_all_nb(dev) { - u32 flag; - int cpu = PCI_SLOT(dev->devfn) - 24; - if (cpu >= MAX_NB) - continue; - northbridges[cpu] = dev; - pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */ - northbridge_flush_word[cpu] = flag; - } - - flush_gart(NULL); - + flush_gart(); dma_ops = &gart_dma_ops; - - return 0; } -/* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); - void gart_parse_options(char *p) { int arg; diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index 1f6ecc62061d..c4c3cc36ac5b 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -4,6 +4,8 @@ #include <linux/init.h> #include <linux/pci.h> #include <linux/string.h> +#include <linux/dma-mapping.h> + #include <asm/proto.h> #include <asm/processor.h> #include <asm/dma.h> @@ -12,10 +14,11 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { if (hwdev && bus + size > *hwdev->dma_mask) { - if (*hwdev->dma_mask >= 0xffffffffULL) + if (*hwdev->dma_mask >= DMA_32BIT_MASK) printk(KERN_ERR - "nommu_%s: overflow %Lx+%lu of device mask %Lx\n", - name, (long long)bus, size, (long long)*hwdev->dma_mask); + "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", + name, (long long)bus, size, + (long long)*hwdev->dma_mask); return 0; } return 1; diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index 990ed67896f2..ebdb77fe2057 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -31,7 +31,7 @@ struct dma_mapping_ops swiotlb_dma_ops = { void pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!iommu_aperture && !no_iommu && + if (!iommu_detected && !no_iommu && (end_pfn > MAX_DMA32_PFN || force_iommu)) swiotlb = 1; if (swiotlb) { diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index fb903e65e079..ca56e19b8b6e 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -10,7 +10,6 @@ * Andi Kleen. * * CPU hotplug support - ashok.raj@intel.com - * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ */ /* @@ -64,6 +63,7 @@ EXPORT_SYMBOL(boot_option_idle_override); * Powermanagement idle function, if any.. */ void (*pm_idle)(void); +EXPORT_SYMBOL(pm_idle); static DEFINE_PER_CPU(unsigned int, cpu_idle_state); static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -111,7 +111,7 @@ static void default_idle(void) { local_irq_enable(); - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); while (!need_resched()) { local_irq_disable(); @@ -120,7 +120,7 @@ static void default_idle(void) else local_irq_enable(); } - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; } /* @@ -203,8 +203,7 @@ static inline void play_dead(void) */ void cpu_idle (void) { - set_thread_flag(TIF_POLLING_NRFLAG); - + current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { @@ -335,7 +334,7 @@ void show_regs(struct pt_regs *regs) { printk("CPU %d:", smp_processor_id()); __show_regs(regs); - show_trace(®s->rsp); + show_trace(NULL, regs, (void *)(regs + 1)); } /* @@ -365,8 +364,11 @@ void flush_thread(void) struct task_struct *tsk = current; struct thread_info *t = current_thread_info(); - if (t->flags & _TIF_ABI_PENDING) + if (t->flags & _TIF_ABI_PENDING) { t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); + if (t->flags & _TIF_IA32) + current_thread_info()->status |= TS_COMPAT; + } tsk->thread.debugreg0 = 0; tsk->thread.debugreg1 = 0; diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 57117b8beb2b..2d6769847456 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -20,6 +20,7 @@ * Power off function, if any */ void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); static long no_idt[3]; static enum { diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 143c65031539..1129918ede82 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -5,8 +5,6 @@ * * Nov 2001 Dave Jones <davej@suse.de> * Forked from i386 setup code. - * - * $Id$ */ /* @@ -65,9 +63,7 @@ #include <asm/setup.h> #include <asm/mach_apic.h> #include <asm/numa.h> -#include <asm/swiotlb.h> #include <asm/sections.h> -#include <asm/gart-mapping.h> #include <asm/dmi.h> /* @@ -75,6 +71,7 @@ */ struct cpuinfo_x86 boot_cpu_data __read_mostly; +EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; @@ -103,6 +100,7 @@ char dmi_alloc_data[DMI_MAX_DATA]; * Setup options */ struct screen_info screen_info; +EXPORT_SYMBOL(screen_info); struct sys_desc_table_struct { unsigned short length; unsigned char table[0]; @@ -474,80 +472,6 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) } #endif -/* Use inline assembly to define this because the nops are defined - as inline assembly strings in the include files and we cannot - get them easily into strings. */ -asm("\t.data\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8); - -extern unsigned char k8nops[]; -static unsigned char *k8_nops[ASM_NOP_MAX+1] = { - NULL, - k8nops, - k8nops + 1, - k8nops + 1 + 2, - k8nops + 1 + 2 + 3, - k8nops + 1 + 2 + 3 + 4, - k8nops + 1 + 2 + 3 + 4 + 5, - k8nops + 1 + 2 + 3 + 4 + 5 + 6, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, -}; - -extern char __vsyscall_0; - -/* Replace instructions with better alternatives for this CPU type. - - This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that assymetric systems where - APs have less capabilities than the boot processor are not handled. - In this case boot with "noreplacement". */ -void apply_alternatives(void *start, void *end) -{ - struct alt_instr *a; - int diff, i, k; - for (a = start; (void *)a < end; a++) { - u8 *instr; - - if (!boot_cpu_has(a->cpuid)) - continue; - - BUG_ON(a->replacementlen > a->instrlen); - instr = a->instr; - /* vsyscall code is not mapped yet. resolve it manually. */ - if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) - instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); - __inline_memcpy(instr, a->replacement, a->replacementlen); - diff = a->instrlen - a->replacementlen; - - /* Pad the rest with nops */ - for (i = a->replacementlen; diff > 0; diff -= k, i += k) { - k = diff; - if (k > ASM_NOP_MAX) - k = ASM_NOP_MAX; - __inline_memcpy(instr + i, k8_nops[k], k); - } - } -} - -static int no_replacement __initdata = 0; - -void __init alternative_instructions(void) -{ - extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; - if (no_replacement) - return; - apply_alternatives(__alt_instructions, __alt_instructions_end); -} - -static int __init noreplacement_setup(char *s) -{ - no_replacement = 1; - return 1; -} - -__setup("noreplacement", noreplacement_setup); - #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; #ifdef CONFIG_EDD_MODULE @@ -780,10 +704,6 @@ void __init setup_arch(char **cmdline_p) e820_setup_gap(); -#ifdef CONFIG_GART_IOMMU - iommu_hole_init(); -#endif - #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; @@ -868,24 +788,32 @@ static int nearby_node(int apicid) static void __init amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - int cpu = smp_processor_id(); unsigned bits; #ifdef CONFIG_NUMA + int cpu = smp_processor_id(); int node = 0; unsigned apicid = hard_smp_processor_id(); #endif + unsigned ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; - bits = 0; - while ((1 << bits) < c->x86_max_cores) - bits++; + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } /* Low order bits define the core id (index of core in socket) */ - cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1); + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); /* Convert the APIC ID into the socket ID */ - phys_proc_id[cpu] = phys_pkg_id(bits); + c->phys_proc_id = phys_pkg_id(bits); #ifdef CONFIG_NUMA - node = phys_proc_id[cpu]; + node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) node = apicid_to_node[apicid]; if (!node_online(node)) { @@ -898,7 +826,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) but in the same order as the HT nodeids. If that doesn't result in a usable node fall back to the path for the previous case. */ - int ht_nodeid = apicid - (phys_proc_id[0] << bits); + int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits); if (ht_nodeid >= 0 && apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = apicid_to_node[ht_nodeid]; @@ -908,15 +836,13 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) } numa_set_node(cpu, node); - printk(KERN_INFO "CPU %d/%x(%d) -> Node %d -> Core %d\n", - cpu, apicid, c->x86_max_cores, node, cpu_core_id[cpu]); + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); #endif #endif } -static int __init init_amd(struct cpuinfo_x86 *c) +static void __init init_amd(struct cpuinfo_x86 *c) { - int r; unsigned level; #ifdef CONFIG_SMP @@ -949,8 +875,8 @@ static int __init init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); - r = get_model_name(c); - if (!r) { + level = get_model_name(c); + if (!level) { switch (c->x86) { case 15: /* Should distinguish Models here, but this is only @@ -965,13 +891,12 @@ static int __init init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1<<8)) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - if (c->extended_cpuid_level >= 0x80000008) { - c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - + /* Multi core CPU? */ + if (c->extended_cpuid_level >= 0x80000008) amd_detect_cmp(c); - } - return r; + /* Fix cpuid4 emulation for more */ + num_cache_leaves = 3; } static void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -979,13 +904,14 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; int index_msb, core_bits; - int cpu = smp_processor_id(); cpuid(1, &eax, &ebx, &ecx, &edx); - if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) + if (!cpu_has(c, X86_FEATURE_HT)) return; + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; smp_num_siblings = (ebx & 0xff0000) >> 16; @@ -1000,10 +926,7 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) } index_msb = get_count_order(smp_num_siblings); - phys_proc_id[cpu] = phys_pkg_id(index_msb); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - phys_proc_id[cpu]); + c->phys_proc_id = phys_pkg_id(index_msb); smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -1011,13 +934,15 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) core_bits = get_count_order(c->x86_max_cores); - cpu_core_id[cpu] = phys_pkg_id(index_msb) & + c->cpu_core_id = phys_pkg_id(index_msb) & ((1 << core_bits) - 1); - - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - cpu_core_id[cpu]); } +out: + if ((c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + } + #endif } @@ -1026,15 +951,12 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) */ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) { - unsigned int eax; + unsigned int eax, t; if (c->cpuid_level < 4) return 1; - __asm__("cpuid" - : "=a" (eax) - : "0" (4), "c" (0) - : "bx", "dx"); + cpuid_count(4, 0, &eax, &t, &t, &t); if (eax & 0x1f) return ((eax >> 26) + 1); @@ -1047,16 +969,17 @@ static void srat_detect_node(void) #ifdef CONFIG_NUMA unsigned node; int cpu = smp_processor_id(); + int apicid = hard_smp_processor_id(); /* Don't do the funky fallback heuristics the AMD version employs for now. */ - node = apicid_to_node[hard_smp_processor_id()]; + node = apicid_to_node[apicid]; if (node == NUMA_NO_NODE) node = first_node(node_online_map); numa_set_node(cpu, node); if (acpi_numa > 0) - printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node); + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); #endif } @@ -1066,6 +989,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) unsigned n; init_intel_cacheinfo(c); + if (c->cpuid_level > 9 ) { + unsigned eax = cpuid_eax(10); + /* Check for version and the number of counters */ + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) + set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); + } + n = c->extended_cpuid_level; if (n >= 0x80000008) { unsigned eax = cpuid_eax(0x80000008); @@ -1157,7 +1087,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) } #ifdef CONFIG_SMP - phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif } @@ -1284,7 +1214,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow", + NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow", /* Transmeta-defined */ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, @@ -1295,7 +1225,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Other (Linux-defined) */ "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL, "constant_tsc", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1365,9 +1295,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) #ifdef CONFIG_SMP if (smp_num_siblings * c->x86_max_cores > 1) { int cpu = c - cpu_data; - seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]); + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); - seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } #endif @@ -1441,7 +1371,7 @@ struct seq_operations cpuinfo_op = { .show = show_cpuinfo, }; -#ifdef CONFIG_INPUT_PCSPKR +#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE) #include <linux/platform_device.h> static __init int add_pcspkr(void) { diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 8a691fa6d393..f5934cb4a2b6 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -3,7 +3,6 @@ * Copyright (C) 1995 Linus Torvalds * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. * See setup.c for older changelog. - * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $ */ #include <linux/config.h> #include <linux/init.h> @@ -31,6 +30,7 @@ char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(_cpu_pda); struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; @@ -38,6 +38,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); unsigned long __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL(__supported_pte_mask); static int do_not_nx __cpuinitdata = 0; /* noexec=on|off diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index e5f5ce7909a3..28161170fb0a 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -7,8 +7,6 @@ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen - * - * $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $ */ #include <linux/sched.h> @@ -239,7 +237,6 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) rsp = regs->rsp - 128; /* This is the X/Open sanctioned signal stack switching. */ - /* RED-PEN: redzone on that stack? */ if (ka->sa.sa_flags & SA_ONSTACK) { if (sas_ss_flags(rsp) == 0) rsp = current->sas_ss_sp + current->sas_ss_size; diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 4a6628b14d99..8188bae9c6d5 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -224,6 +224,7 @@ void flush_tlb_current_task(void) flush_tlb_others(cpu_mask, mm, FLUSH_ALL); preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_current_task); void flush_tlb_mm (struct mm_struct * mm) { @@ -244,6 +245,7 @@ void flush_tlb_mm (struct mm_struct * mm) preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_mm); void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) { @@ -266,6 +268,7 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_page); static void do_flush_tlb_all(void* info) { @@ -443,6 +446,7 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, spin_unlock(&call_lock); return 0; } +EXPORT_SYMBOL(smp_call_function); void smp_stop_cpu(void) { @@ -460,7 +464,7 @@ static void smp_really_stop_cpu(void *dummy) { smp_stop_cpu(); for (;;) - asm("hlt"); + halt(); } void smp_send_stop(void) @@ -520,13 +524,13 @@ asmlinkage void smp_call_function_interrupt(void) int safe_smp_processor_id(void) { - int apicid, i; + unsigned apicid, i; if (disable_apic) return 0; apicid = hard_smp_processor_id(); - if (x86_cpu_to_apicid[apicid] == apicid) + if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid) return apicid; for (i = 0; i < NR_CPUS; ++i) { diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 71a7222cf9ce..4e9755179ecf 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -63,13 +63,11 @@ /* Number of siblings per CPU package */ int smp_num_siblings = 1; -/* Package ID of each logical CPU */ -u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; -/* core ID of each logical CPU */ -u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; +EXPORT_SYMBOL(cpu_llc_id); /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -82,18 +80,21 @@ EXPORT_SYMBOL(cpu_online_map); */ cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; +EXPORT_SYMBOL(cpu_callout_map); cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_data); /* Set when the idlers are all forked */ int smp_threads_ready; /* representing HT siblings of each logical CPU */ cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(cpu_sibling_map); /* representing HT and core siblings of each logical CPU */ cpumask_t cpu_core_map[NR_CPUS] __read_mostly; @@ -472,8 +473,8 @@ static inline void set_cpu_sibling_map(int cpu) if (smp_num_siblings > 1) { for_each_cpu_mask(i, cpu_sibling_setup_map) { - if (phys_proc_id[cpu] == phys_proc_id[i] && - cpu_core_id[cpu] == cpu_core_id[i]) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id && + c[cpu].cpu_core_id == c[i].cpu_core_id) { cpu_set(i, cpu_sibling_map[cpu]); cpu_set(cpu, cpu_sibling_map[i]); cpu_set(i, cpu_core_map[cpu]); @@ -500,7 +501,7 @@ static inline void set_cpu_sibling_map(int cpu) cpu_set(i, c[cpu].llc_shared_map); cpu_set(cpu, c[i].llc_shared_map); } - if (phys_proc_id[cpu] == phys_proc_id[i]) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id) { cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); /* @@ -797,6 +798,8 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) } + alternatives_smp_switch(1); + c_idle.idle = get_idle_for_cpu(cpu); if (c_idle.idle) { @@ -1199,8 +1202,8 @@ static void remove_siblinginfo(int cpu) cpu_clear(cpu, cpu_sibling_map[sibling]); cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); - phys_proc_id[cpu] = BAD_APICID; - cpu_core_id[cpu] = BAD_APICID; + c[cpu].phys_proc_id = 0; + c[cpu].cpu_core_id = 0; cpu_clear(cpu, cpu_sibling_setup_map); } @@ -1259,6 +1262,8 @@ void __cpu_die(unsigned int cpu) /* They ack this in play_dead by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { printk ("CPU %d is now offline\n", cpu); + if (1 == num_online_cpus()) + alternatives_smp_switch(0); return; } msleep(100); diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c new file mode 100644 index 000000000000..8d4c67f61b8e --- /dev/null +++ b/arch/x86_64/kernel/tce.c @@ -0,0 +1,202 @@ +/* + * Derived from arch/powerpc/platforms/pseries/iommu.c + * + * Copyright (C) 2006 Jon Mason <jdmason@us.ibm.com>, IBM Corporation + * Copyright (C) 2006 Muli Ben-Yehuda <muli@il.ibm.com>, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/bootmem.h> +#include <asm/tce.h> +#include <asm/calgary.h> +#include <asm/proto.h> + +/* flush a tce at 'tceaddr' to main memory */ +static inline void flush_tce(void* tceaddr) +{ + /* a single tce can't cross a cache line */ + if (cpu_has_clflush) + asm volatile("clflush (%0)" :: "r" (tceaddr)); + else + asm volatile("wbinvd":::"memory"); +} + +void tce_build(struct iommu_table *tbl, unsigned long index, + unsigned int npages, unsigned long uaddr, int direction) +{ + u64* tp; + u64 t; + u64 rpn; + + t = (1 << TCE_READ_SHIFT); + if (direction != DMA_TO_DEVICE) + t |= (1 << TCE_WRITE_SHIFT); + + tp = ((u64*)tbl->it_base) + index; + + while (npages--) { + rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT; + t &= ~TCE_RPN_MASK; + t |= (rpn << TCE_RPN_SHIFT); + + *tp = cpu_to_be64(t); + flush_tce(tp); + + uaddr += PAGE_SIZE; + tp++; + } +} + +void tce_free(struct iommu_table *tbl, long index, unsigned int npages) +{ + u64* tp; + + tp = ((u64*)tbl->it_base) + index; + + while (npages--) { + *tp = cpu_to_be64(0); + flush_tce(tp); + tp++; + } +} + +static inline unsigned int table_size_to_number_of_entries(unsigned char size) +{ + /* + * size is the order of the table, 0-7 + * smallest table is 8K entries, so shift result by 13 to + * multiply by 8K + */ + return (1 << size) << 13; +} + +static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) +{ + unsigned int bitmapsz; + unsigned int tce_table_index; + unsigned long bmppages; + int ret; + + tbl->it_busno = dev->bus->number; + + /* set the tce table size - measured in entries */ + tbl->it_size = table_size_to_number_of_entries(specified_table_size); + + tce_table_index = bus_to_phb(tbl->it_busno); + tbl->it_base = (unsigned long)tce_table_kva[tce_table_index]; + if (!tbl->it_base) { + printk(KERN_ERR "Calgary: iommu_table_setparms: " + "no table allocated?!\n"); + ret = -ENOMEM; + goto done; + } + + /* + * number of bytes needed for the bitmap size in number of + * entries; we need one bit per entry + */ + bitmapsz = tbl->it_size / BITS_PER_BYTE; + bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz)); + if (!bmppages) { + printk(KERN_ERR "Calgary: cannot allocate bitmap\n"); + ret = -ENOMEM; + goto done; + } + + tbl->it_map = (unsigned long*)bmppages; + + memset(tbl->it_map, 0, bitmapsz); + + tbl->it_hint = 0; + + spin_lock_init(&tbl->it_lock); + + return 0; + +done: + return ret; +} + +int build_tce_table(struct pci_dev *dev, void __iomem *bbar) +{ + struct iommu_table *tbl; + int ret; + + if (dev->sysdata) { + printk(KERN_ERR "Calgary: dev %p has sysdata %p\n", + dev, dev->sysdata); + BUG(); + } + + tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); + if (!tbl) { + printk(KERN_ERR "Calgary: error allocating iommu_table\n"); + ret = -ENOMEM; + goto done; + } + + ret = tce_table_setparms(dev, tbl); + if (ret) + goto free_tbl; + + tce_free(tbl, 0, tbl->it_size); + + tbl->bbar = bbar; + + /* + * NUMA is already using the bus's sysdata pointer, so we use + * the bus's pci_dev's sysdata instead. + */ + dev->sysdata = tbl; + + return 0; + +free_tbl: + kfree(tbl); +done: + return ret; +} + +void* alloc_tce_table(void) +{ + unsigned int size; + + size = table_size_to_number_of_entries(specified_table_size); + size *= TCE_ENTRY_SIZE; + + return __alloc_bootmem_low(size, size, 0); +} + +void free_tce_table(void *tbl) +{ + unsigned int size; + + if (!tbl) + return; + + size = table_size_to_number_of_entries(specified_table_size); + size *= TCE_ENTRY_SIZE; + + free_bootmem(__pa(tbl), size); +} diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 7392570f975d..ebbee6f59ff5 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -8,7 +8,7 @@ * Copyright (c) 1995 Markus Kuhn * Copyright (c) 1996 Ingo Molnar * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002 Vojtech Pavlik + * Copyright (c) 2002,2006 Vojtech Pavlik * Copyright (c) 2003 Andi Kleen * RTC support code taken from arch/i386/kernel/timers/time_hpet.c */ @@ -51,14 +51,21 @@ extern int using_apic_timer; static char *time_init_gtod(void); DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); int nohpet __initdata = 0; static int notsc __initdata = 0; -#undef HPET_HACK_ENABLE_DANGEROUS +#define USEC_PER_TICK (USEC_PER_SEC / HZ) +#define NSEC_PER_TICK (NSEC_PER_SEC / HZ) +#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) + +#define NS_SCALE 10 /* 2^10, carefully chosen */ +#define US_SCALE 32 /* 2^32, arbitralrily chosen */ unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ @@ -90,7 +97,7 @@ static inline unsigned int do_gettimeoffset_tsc(void) t = get_cycles_sync(); if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; + x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE; return x; } @@ -98,7 +105,7 @@ static inline unsigned int do_gettimeoffset_hpet(void) { /* cap counter read to one tick to avoid inconsistencies */ unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> 32; + return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE; } unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; @@ -118,7 +125,7 @@ void do_gettimeofday(struct timeval *tv) seq = read_seqbegin(&xtime_lock); sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000; + usec = xtime.tv_nsec / NSEC_PER_USEC; /* i386 does some correction here to keep the clock monotonous even when ntpd is fixing drift. @@ -129,14 +136,14 @@ void do_gettimeofday(struct timeval *tv) in arch/x86_64/kernel/vsyscall.c and export all needed variables in vmlinux.lds. -AK */ - t = (jiffies - wall_jiffies) * (1000000L / HZ) + + t = (jiffies - wall_jiffies) * USEC_PER_TICK + do_gettimeoffset(); usec += t; } while (read_seqretry(&xtime_lock, seq)); - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + tv->tv_sec = sec + usec / USEC_PER_SEC; + tv->tv_usec = usec % USEC_PER_SEC; } EXPORT_SYMBOL(do_gettimeofday); @@ -157,8 +164,8 @@ int do_settimeofday(struct timespec *tv) write_seqlock_irq(&xtime_lock); - nsec -= do_gettimeoffset() * 1000 + - (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); + nsec -= do_gettimeoffset() * NSEC_PER_USEC + + (jiffies - wall_jiffies) * NSEC_PER_TICK; wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); @@ -288,7 +295,7 @@ unsigned long long monotonic_clock(void) this_offset = hpet_readl(HPET_COUNTER); } while (read_seqretry(&xtime_lock, seq)); offset = (this_offset - last_offset); - offset *= (NSEC_PER_SEC/HZ) / hpet_tick; + offset *= NSEC_PER_TICK / hpet_tick; } else { do { seq = read_seqbegin(&xtime_lock); @@ -297,7 +304,8 @@ unsigned long long monotonic_clock(void) base = monotonic_base; } while (read_seqretry(&xtime_lock, seq)); this_offset = get_cycles_sync(); - offset = (this_offset - last_offset)*1000 / cpu_khz; + /* FIXME: 1000 or 1000000? */ + offset = (this_offset - last_offset)*1000 / cpu_khz; } return base + offset; } @@ -382,7 +390,7 @@ void main_timer_handler(struct pt_regs *regs) } monotonic_base += - (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; + (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick; vxtime.last = offset; #ifdef CONFIG_X86_PM_TIMER @@ -391,24 +399,25 @@ void main_timer_handler(struct pt_regs *regs) #endif } else { offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); + vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK; if (offset < 0) offset = 0; - if (offset > (USEC_PER_SEC / HZ)) { - lost = offset / (USEC_PER_SEC / HZ); - offset %= (USEC_PER_SEC / HZ); + if (offset > USEC_PER_TICK) { + lost = offset / USEC_PER_TICK; + offset %= USEC_PER_TICK; } - monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ; + /* FIXME: 1000 or 1000000? */ + monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz; vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) < offset) + vxtime.tsc_quot) >> US_SCALE) < offset) vxtime.last_tsc = tsc - - (((long) offset << 32) / vxtime.tsc_quot) - 1; + (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1; } if (lost > 0) { @@ -468,16 +477,15 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) } static unsigned int cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) { - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; + cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; } static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; + return (cyc * cyc2ns_scale) >> NS_SCALE; } unsigned long long sched_clock(void) @@ -490,7 +498,7 @@ unsigned long long sched_clock(void) Disadvantage is a small drift between CPUs in some configurations, but that should be tolerable. */ if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32; + return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE; #endif /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, @@ -633,7 +641,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (1000L << 32) / cpu_khz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; } set_cyc2ns_scale(cpu_khz_ref); @@ -789,8 +797,8 @@ static int hpet_timer_stop_set_go(unsigned long tick) if (hpet_use_timer) { hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); - hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ cfg |= HPET_CFG_LEGACY; } /* @@ -825,8 +833,7 @@ static int hpet_init(void) if (hpet_period < 100000 || hpet_period > 100000000) return -1; - hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / - hpet_period; + hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; hpet_use_timer = (id & HPET_ID_LEGSUP); @@ -890,18 +897,6 @@ void __init time_init(void) char *timename; char *gtod; -#ifdef HPET_HACK_ENABLE_DANGEROUS - if (!vxtime.hpet_address) { - printk(KERN_WARNING "time.c: WARNING: Enabling HPET base " - "manually!\n"); - outl(0x800038a0, 0xcf8); - outl(0xff000001, 0xcfc); - outl(0x800038a0, 0xcf8); - vxtime.hpet_address = inl(0xcfc) & 0xfffffffe; - printk(KERN_WARNING "time.c: WARNING: Enabled HPET " - "at %#lx.\n", vxtime.hpet_address); - } -#endif if (nohpet) vxtime.hpet_address = 0; @@ -912,7 +907,7 @@ void __init time_init(void) -xtime.tv_sec, -xtime.tv_nsec); if (!hpet_init()) - vxtime_hz = (1000000000000000L + hpet_period / 2) / hpet_period; + vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period; else vxtime.hpet_address = 0; @@ -941,8 +936,8 @@ void __init time_init(void) vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod); printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); - vxtime.quot = (1000000L << 32) / vxtime_hz; - vxtime.tsc_quot = (1000L << 32) / cpu_khz; + vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; vxtime.last_tsc = get_cycles_sync(); setup_irq(0, &irq0); @@ -956,10 +951,10 @@ void __init time_init(void) __cpuinit int unsynchronized_tsc(void) { #ifdef CONFIG_SMP - if (oem_force_hpet_timer()) + if (apic_is_clustered_box()) return 1; /* Intel systems are normally all synchronized. Exceptions - are handled in the OEM check above. */ + are handled in the check above. */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) return 0; #endif diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index cea335e8746c..3d11a2fe45b7 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -6,8 +6,6 @@ * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $ */ /* @@ -31,6 +29,7 @@ #include <linux/nmi.h> #include <linux/kprobes.h> #include <linux/kexec.h> +#include <linux/unwind.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -41,7 +40,7 @@ #include <asm/i387.h> #include <asm/kdebug.h> #include <asm/processor.h> - +#include <asm/unwind.h> #include <asm/smp.h> #include <asm/pgalloc.h> #include <asm/pda.h> @@ -71,6 +70,7 @@ asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); ATOMIC_NOTIFIER_HEAD(die_chain); +EXPORT_SYMBOL(die_chain); int register_die_notifier(struct notifier_block *nb) { @@ -107,7 +107,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) preempt_enable_no_resched(); } -static int kstack_depth_to_print = 10; +static int kstack_depth_to_print = 12; +static int call_trace = 1; #ifdef CONFIG_KALLSYMS #include <linux/kallsyms.h> @@ -191,6 +192,25 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } +static int show_trace_unwind(struct unwind_frame_info *info, void *context) +{ + int i = 11, n = 0; + + while (unwind(info) == 0 && UNW_PC(info)) { + ++n; + if (i > 50) { + printk("\n "); + i = 7; + } else + i += printk(" "); + i += printk_address(UNW_PC(info)); + if (arch_unw_user_mode(info)) + break; + } + printk("\n"); + return n; +} + /* * x86-64 can have upto three kernel stacks: * process stack @@ -198,15 +218,39 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void show_trace(unsigned long *stack) +void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack) { const unsigned cpu = safe_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; - int i; + int i = 11; unsigned used = 0; printk("\nCall Trace:"); + if (!tsk) + tsk = current; + + if (call_trace >= 0) { + int unw_ret = 0; + struct unwind_frame_info info; + + if (regs) { + if (unwind_init_frame_info(&info, tsk, regs) == 0) + unw_ret = show_trace_unwind(&info, NULL); + } else if (tsk == current) + unw_ret = unwind_init_running(&info, show_trace_unwind, NULL); + else { + if (unwind_init_blocked(&info, tsk) == 0) + unw_ret = show_trace_unwind(&info, NULL); + } + if (unw_ret > 0) { + if (call_trace > 0) + return; + printk("Legacy call trace:"); + i = 18; + } + } + #define HANDLE_STACK(cond) \ do while (cond) { \ unsigned long addr = *stack++; \ @@ -229,7 +273,7 @@ void show_trace(unsigned long *stack) } \ } while (0) - for(i = 11; ; ) { + for(; ; ) { const char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, @@ -264,7 +308,7 @@ void show_trace(unsigned long *stack) printk("\n"); } -void show_stack(struct task_struct *tsk, unsigned long * rsp) +static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) { unsigned long *stack; int i; @@ -298,7 +342,12 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp) printk("%016lx ", *stack++); touch_nmi_watchdog(); } - show_trace((unsigned long *)rsp); + show_trace(tsk, regs, rsp); +} + +void show_stack(struct task_struct *tsk, unsigned long * rsp) +{ + _show_stack(tsk, NULL, rsp); } /* @@ -307,7 +356,7 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp) void dump_stack(void) { unsigned long dummy; - show_trace(&dummy); + show_trace(NULL, NULL, &dummy); } EXPORT_SYMBOL(dump_stack); @@ -334,7 +383,7 @@ void show_registers(struct pt_regs *regs) if (in_kernel) { printk("Stack: "); - show_stack(NULL, (unsigned long*)rsp); + _show_stack(NULL, regs, (unsigned long*)rsp); printk("\nCode: "); if (regs->rip < PAGE_OFFSET) @@ -383,6 +432,7 @@ void out_of_line_bug(void) { BUG(); } +EXPORT_SYMBOL(out_of_line_bug); #endif static DEFINE_SPINLOCK(die_lock); @@ -1012,3 +1062,14 @@ static int __init kstack_setup(char *s) } __setup("kstack=", kstack_setup); +static int __init call_trace_setup(char *s) +{ + if (strcmp(s, "old") == 0) + call_trace = -1; + else if (strcmp(s, "both") == 0) + call_trace = 0; + else if (strcmp(s, "new") == 0) + call_trace = 1; + return 1; +} +__setup("call_trace=", call_trace_setup); diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index b81f473c4a19..1c6a5f322919 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -45,6 +45,15 @@ SECTIONS RODATA +#ifdef CONFIG_STACK_UNWIND + . = ALIGN(8); + .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { + __start_unwind = .; + *(.eh_frame) + __end_unwind = .; + } +#endif + /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) @@ -131,6 +140,26 @@ SECTIONS *(.data.page_aligned) } + /* might get freed after init */ + . = ALIGN(4096); + __smp_alt_begin = .; + __smp_alt_instructions = .; + .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { + *(.smp_altinstructions) + } + __smp_alt_instructions_end = .; + . = ALIGN(8); + __smp_locks = .; + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + *(.smp_locks) + } + __smp_locks_end = .; + .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { + *(.smp_altinstr_replacement) + } + . = ALIGN(4096); + __smp_alt_end = .; + . = ALIGN(4096); /* Init code and data */ __init_begin = .; .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 9468fb20b0bc..f603037df162 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -107,7 +107,7 @@ static __always_inline long time_syscall(long *t) int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) { - if (unlikely(!__sysctl_vsyscall)) + if (!__sysctl_vsyscall) return gettimeofday(tv,tz); if (tv) do_vgettimeofday(tv); @@ -120,7 +120,7 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - if (unlikely(!__sysctl_vsyscall)) + if (!__sysctl_vsyscall) return time_syscall(t); else if (t) *t = __xtime.tv_sec; diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 1def21c9f7cd..370952c4ff22 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -1,66 +1,21 @@ +/* Exports for assembly files. + All C exports should go in the respective C files. */ + #include <linux/config.h> #include <linux/module.h> #include <linux/smp.h> -#include <linux/user.h> -#include <linux/sched.h> -#include <linux/in6.h> -#include <linux/interrupt.h> -#include <linux/smp_lock.h> -#include <linux/pm.h> -#include <linux/pci.h> -#include <linux/apm_bios.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/syscalls.h> -#include <linux/tty.h> #include <asm/semaphore.h> #include <asm/processor.h> -#include <asm/i387.h> #include <asm/uaccess.h> -#include <asm/checksum.h> -#include <asm/io.h> -#include <asm/delay.h> -#include <asm/irq.h> -#include <asm/mmx.h> -#include <asm/desc.h> #include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/nmi.h> -#include <asm/kdebug.h> -#include <asm/unistd.h> -#include <asm/tlbflush.h> -#include <asm/kdebug.h> - -extern spinlock_t rtc_lock; -#ifdef CONFIG_SMP -extern void __write_lock_failed(rwlock_t *rw); -extern void __read_lock_failed(rwlock_t *rw); -#endif - -/* platform dependent support */ -EXPORT_SYMBOL(boot_cpu_data); -//EXPORT_SYMBOL(dump_fpu); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(ioremap_nocache); -EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(pm_idle); -EXPORT_SYMBOL(pm_power_off); EXPORT_SYMBOL(__down_failed); EXPORT_SYMBOL(__down_failed_interruptible); EXPORT_SYMBOL(__down_failed_trylock); EXPORT_SYMBOL(__up_wakeup); -/* Networking helper routines. */ -EXPORT_SYMBOL(csum_partial_copy_nocheck); -EXPORT_SYMBOL(ip_compute_csum); -/* Delay loops */ -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__ndelay); -EXPORT_SYMBOL(__delay); -EXPORT_SYMBOL(__const_udelay); EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); @@ -71,42 +26,20 @@ EXPORT_SYMBOL(__put_user_2); EXPORT_SYMBOL(__put_user_4); EXPORT_SYMBOL(__put_user_8); -EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); -EXPORT_SYMBOL(clear_user); -EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(copy_from_user); EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(copy_in_user); -EXPORT_SYMBOL(strnlen_user); - -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); -EXPORT_SYMBOL(_cpu_pda); #ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_data); +extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); +extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); EXPORT_SYMBOL(__read_lock_failed); - -EXPORT_SYMBOL(smp_call_function); -EXPORT_SYMBOL(cpu_callout_map); -#endif - -#ifdef CONFIG_VT -EXPORT_SYMBOL(screen_info); #endif -EXPORT_SYMBOL(rtc_lock); - -EXPORT_SYMBOL_GPL(set_nmi_callback); -EXPORT_SYMBOL_GPL(unset_nmi_callback); - /* Export string functions. We normally rely on gcc builtin for most of these, but gcc sometimes decides not to inline them. */ #undef memcpy @@ -114,51 +47,14 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback); #undef memmove extern void * memset(void *,int,__kernel_size_t); -extern size_t strlen(const char *); -extern void * memmove(void * dest,const void *src,size_t count); extern void * memcpy(void *,const void *,__kernel_size_t); extern void * __memcpy(void *,const void *,__kernel_size_t); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM -/* prototypes are wrong, these are assembly with custom calling functions */ -extern void rwsem_down_read_failed_thunk(void); -extern void rwsem_wake_thunk(void); -extern void rwsem_downgrade_thunk(void); -extern void rwsem_down_write_failed_thunk(void); -EXPORT_SYMBOL(rwsem_down_read_failed_thunk); -EXPORT_SYMBOL(rwsem_wake_thunk); -EXPORT_SYMBOL(rwsem_downgrade_thunk); -EXPORT_SYMBOL(rwsem_down_write_failed_thunk); -#endif - EXPORT_SYMBOL(empty_zero_page); - -EXPORT_SYMBOL(die_chain); - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_sibling_map); -EXPORT_SYMBOL(smp_num_siblings); -#endif - -#ifdef CONFIG_BUG -EXPORT_SYMBOL(out_of_line_bug); -#endif - EXPORT_SYMBOL(init_level4_pgt); - -extern unsigned long __supported_pte_mask; -EXPORT_SYMBOL(__supported_pte_mask); - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(flush_tlb_page); -#endif - -EXPORT_SYMBOL(cpu_khz); - EXPORT_SYMBOL(load_gs_index); diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c index 5384e227cdf6..c493735218da 100644 --- a/arch/x86_64/lib/csum-partial.c +++ b/arch/x86_64/lib/csum-partial.c @@ -147,4 +147,5 @@ unsigned short ip_compute_csum(unsigned char * buff, int len) { return csum_fold(csum_partial(buff,len,0)); } +EXPORT_SYMBOL(ip_compute_csum); diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c index 94323f20816e..b1320ec58428 100644 --- a/arch/x86_64/lib/csum-wrappers.c +++ b/arch/x86_64/lib/csum-wrappers.c @@ -109,6 +109,7 @@ csum_partial_copy_nocheck(const unsigned char *src, unsigned char *dst, int len, { return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); } +EXPORT_SYMBOL(csum_partial_copy_nocheck); unsigned short csum_ipv6_magic(struct in6_addr *saddr, struct in6_addr *daddr, __u32 len, unsigned short proto, unsigned int sum) diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c index 03c460cbdd1c..b6cd3cca2f45 100644 --- a/arch/x86_64/lib/delay.c +++ b/arch/x86_64/lib/delay.c @@ -9,6 +9,7 @@ */ #include <linux/config.h> +#include <linux/module.h> #include <linux/sched.h> #include <linux/delay.h> #include <asm/delay.h> @@ -36,18 +37,22 @@ void __delay(unsigned long loops) } while((now-bclock) < loops); } +EXPORT_SYMBOL(__delay); inline void __const_udelay(unsigned long xloops) { __delay((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32); } +EXPORT_SYMBOL(__const_udelay); void __udelay(unsigned long usecs) { __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ } +EXPORT_SYMBOL(__udelay); void __ndelay(unsigned long nsecs) { __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ } +EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86_64/lib/memmove.c b/arch/x86_64/lib/memmove.c index e93d5255fdc9..751ebae8ec42 100644 --- a/arch/x86_64/lib/memmove.c +++ b/arch/x86_64/lib/memmove.c @@ -3,12 +3,13 @@ */ #define _STRING_C #include <linux/string.h> +#include <linux/module.h> #undef memmove void *memmove(void * dest,const void *src,size_t count) { if (dest < src) { - __inline_memcpy(dest,src,count); + return memcpy(dest,src,count); } else { char *p = (char *) dest + count; char *s = (char *) src + count; @@ -17,3 +18,4 @@ void *memmove(void * dest,const void *src,size_t count) } return dest; } +EXPORT_SYMBOL(memmove); diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c index 9bc2c295818e..893d43f838cc 100644 --- a/arch/x86_64/lib/usercopy.c +++ b/arch/x86_64/lib/usercopy.c @@ -5,6 +5,7 @@ * Copyright 1997 Linus Torvalds * Copyright 2002 Andi Kleen <ak@suse.de> */ +#include <linux/module.h> #include <asm/uaccess.h> /* @@ -47,15 +48,17 @@ __strncpy_from_user(char *dst, const char __user *src, long count) __do_strncpy_from_user(dst, src, count, res); return res; } +EXPORT_SYMBOL(__strncpy_from_user); long strncpy_from_user(char *dst, const char __user *src, long count) { long res = -EFAULT; if (access_ok(VERIFY_READ, src, 1)) - __do_strncpy_from_user(dst, src, count, res); + return __strncpy_from_user(dst, src, count); return res; } +EXPORT_SYMBOL(strncpy_from_user); /* * Zero Userspace @@ -94,7 +97,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size) [zero] "r" (0UL), [eight] "r" (8UL)); return size; } - +EXPORT_SYMBOL(__clear_user); unsigned long clear_user(void __user *to, unsigned long n) { @@ -102,6 +105,7 @@ unsigned long clear_user(void __user *to, unsigned long n) return __clear_user(to, n); return n; } +EXPORT_SYMBOL(clear_user); /* * Return the size of a string (including the ending 0) @@ -125,6 +129,7 @@ long __strnlen_user(const char __user *s, long n) s++; } } +EXPORT_SYMBOL(__strnlen_user); long strnlen_user(const char __user *s, long n) { @@ -132,6 +137,7 @@ long strnlen_user(const char __user *s, long n) return 0; return __strnlen_user(s, n); } +EXPORT_SYMBOL(strnlen_user); long strlen_user(const char __user *s) { @@ -147,6 +153,7 @@ long strlen_user(const char __user *s) s++; } } +EXPORT_SYMBOL(strlen_user); unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) { @@ -155,3 +162,5 @@ unsigned long copy_in_user(void __user *to, const void __user *from, unsigned le } return len; } +EXPORT_SYMBOL(copy_in_user); + diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 0803d3858af1..08dc696f54ee 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -195,7 +195,7 @@ void dump_pagetable(unsigned long address) printk("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto ret; - pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address); + pud = pud_offset(pgd, address); if (bad_address(pud)) goto bad; printk("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud)) goto ret; @@ -445,8 +445,10 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (error_code & 4) { - // XXX: align red zone size with ABI - if (address + 128 < regs->rsp) + /* Allow userspace just enough access below the stack pointer + * to let the 'enter' instruction work. + */ + if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) goto bad_area; } if (expand_stack(vma, address)) diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 4ba34e95d835..02add1d1dfa8 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -41,8 +41,6 @@ #include <asm/proto.h> #include <asm/smp.h> #include <asm/sections.h> -#include <asm/dma-mapping.h> -#include <asm/swiotlb.h> #ifndef Dprintk #define Dprintk(x...) @@ -90,8 +88,6 @@ void show_mem(void) printk(KERN_INFO "%lu pages swap cached\n",cached); } -/* References to section boundaries */ - int after_bootmem; static __init void *spp_getpage(void) @@ -261,9 +257,10 @@ phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) { unsigned long entry; - if (address > end) { - for (; i < PTRS_PER_PMD; i++, pmd++) - set_pmd(pmd, __pmd(0)); + if (address >= end) { + if (!after_bootmem) + for (; i < PTRS_PER_PMD; i++, pmd++) + set_pmd(pmd, __pmd(0)); break; } entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; @@ -341,7 +338,8 @@ static void __init find_early_table_space(unsigned long end) table_end = table_start; early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT); + end, table_start << PAGE_SHIFT, + (table_start << PAGE_SHIFT) + tables); } /* Setup the direct mapping of the physical memory at PAGE_OFFSET. @@ -372,7 +370,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) pud_t *pud; if (after_bootmem) - pud = pud_offset_k(pgd, start & PGDIR_MASK); + pud = pud_offset(pgd, start & PGDIR_MASK); else pud = alloc_low_page(&map, &pud_phys); @@ -587,10 +585,7 @@ void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; -#ifdef CONFIG_SWIOTLB - pci_swiotlb_init(); -#endif - no_iommu_init(); + pci_iommu_alloc(); /* How many end-of-memory variables you have, grandma! */ max_low_pfn = end_pfn; @@ -644,20 +639,29 @@ void __init mem_init(void) #endif } -void free_initmem(void) +void free_init_pages(char *what, unsigned long begin, unsigned long end) { unsigned long addr; - addr = (unsigned long)(&__init_begin); - for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { + if (begin >= end) + return; + + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); free_page(addr); totalram_pages++; } +} + +void free_initmem(void) +{ memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin); - printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10); + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); } #ifdef CONFIG_DEBUG_RODATA @@ -686,15 +690,7 @@ void mark_rodata_ro(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - if (start >= end) - return; - printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - init_page_count(virt_to_page(start)); - free_page(start); - totalram_pages++; - } + free_init_pages("initrd memory", start, end); } #endif diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index ae207064201e..45d7d823c3b8 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c @@ -11,6 +11,7 @@ #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/module.h> #include <asm/io.h> #include <asm/pgalloc.h> #include <asm/fixmap.h> @@ -219,6 +220,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned l } return (__force void __iomem *) (offset + (char *)addr); } +EXPORT_SYMBOL(__ioremap); /** * ioremap_nocache - map bus memory into CPU space @@ -246,6 +248,7 @@ void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) { return __ioremap(phys_addr, size, _PAGE_PCD); } +EXPORT_SYMBOL(ioremap_nocache); /** * iounmap - Free a IO remapping @@ -291,3 +294,5 @@ void iounmap(volatile void __iomem *addr) BUG_ON(p != o || o == NULL); kfree(p); } +EXPORT_SYMBOL(iounmap); + diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c index 3acf60ded2a0..b50a7c7c47f8 100644 --- a/arch/x86_64/pci/k8-bus.c +++ b/arch/x86_64/pci/k8-bus.c @@ -2,6 +2,7 @@ #include <linux/pci.h> #include <asm/mpspec.h> #include <linux/cpumask.h> +#include <asm/k8.h> /* * This discovers the pcibus <-> node mapping on AMD K8. @@ -18,7 +19,6 @@ #define NR_LDT_BUS_NUMBER_REGISTERS 3 #define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 8) & 0xFF) #define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF) -#define PCI_DEVICE_ID_K8HTCONFIG 0x1100 /** * fill_mp_bus_to_cpumask() @@ -28,8 +28,7 @@ __init static int fill_mp_bus_to_cpumask(void) { - struct pci_dev *nb_dev = NULL; - int i, j; + int i, j, k; u32 ldtbus, nid; static int lbnr[3] = { LDT_BUS_NUMBER_REGISTER_0, @@ -37,8 +36,9 @@ fill_mp_bus_to_cpumask(void) LDT_BUS_NUMBER_REGISTER_2 }; - while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD, - PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) { + cache_k8_northbridges(); + for (k = 0; k < num_k8_northbridges; k++) { + struct pci_dev *nb_dev = k8_northbridges[k]; pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid); for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) { |