From 407984f1af259b31957c7c05075a454a751bb801 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: Add abilty to enable/disable nmi watchdog with sysctl Adds a new /proc/sys/kernel/nmi call that will enable/disable the nmi watchdog. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- include/linux/sysctl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 736ed917a4f8..ecb79ba52ae1 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -150,6 +150,7 @@ enum KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ KERN_COMPAT_LOG=73, /* int: print compat layer messages */ KERN_MAX_LOCK_DEPTH=74, + KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ }; -- cgit v1.2.3 From 8da5adda91df3d2fcc5300e68da491694c9af019 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: Allow users to force a panic on NMI To quote Alan Cox: The default Linux behaviour on an NMI of either memory or unknown is to continue operation. For many environments such as scientific computing it is preferable that the box is taken out and the error dealt with than an uncorrected parity/ECC error get propogated. A small number of systems do generate NMI's for bizarre random reasons such as power management so the default is unchanged. In other respects the new proc/sys entry works like the existing panic controls already in that directory. This is separate to the edac support - EDAC allows supported chipsets to handle ECC errors well, this change allows unsupported cases to at least panic rather than cause problems further down the line. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 6 ++++++ arch/x86_64/kernel/traps.c | 6 ++++++ include/linux/kernel.h | 1 + include/linux/sysctl.h | 1 + kernel/panic.c | 1 + kernel/sysctl.c | 8 ++++++++ 6 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 7db664d0b25c..2f6cb8276480 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -635,6 +635,8 @@ static void mem_parity_error(unsigned char reason, struct pt_regs * regs) "to continue\n"); printk(KERN_EMERG "You probably have a hardware problem with your RAM " "chips\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); /* Clear and disable the memory parity error line. */ clear_mem_error(reason); @@ -670,6 +672,10 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) reason, smp_processor_id()); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + } static DEFINE_SPINLOCK(nmi_print_lock); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 42bc070fdf11..b18829db2a6a 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -732,6 +732,8 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); printk("You probably have a hardware problem with your RAM chips\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); /* Clear and disable the memory parity error line. */ reason = (reason & 0xf) | 4; @@ -757,6 +759,10 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + } /* Runs on IST stack. This code must keep interrupts off all the time. diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2b2ae4fdce8b..1ff9609300b4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -186,6 +186,7 @@ extern void bust_spinlocks(int yes); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; +extern int panic_on_unrecovered_nmi; extern int tainted; extern const char *print_tainted(void); extern void add_taint(unsigned); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index ecb79ba52ae1..432778446ad2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -151,6 +151,7 @@ enum KERN_COMPAT_LOG=73, /* int: print compat layer messages */ KERN_MAX_LOCK_DEPTH=74, KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ + KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ }; diff --git a/kernel/panic.c b/kernel/panic.c index 8010b9b17aca..d2db3e2209e0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -21,6 +21,7 @@ #include int panic_on_oops; +int panic_on_unrecovered_nmi; int tainted; static int pause_on_oops; static int pause_on_oops_flag; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 040de6bd74dd..220e20564124 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -641,6 +641,14 @@ static ctl_table kern_table[] = { }, #endif #if defined(CONFIG_X86) + { + .ctl_name = KERN_PANIC_ON_NMI, + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = KERN_BOOTLOADER_TYPE, .procname = "bootloader_type", -- cgit v1.2.3 From c08c820508233b424deab3302bc404bbecc6493a Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Add the vgetcpu vsyscall This patch adds a vgetcpu vsyscall, which depending on the CPU RDTSCP capability uses either the RDTSCP or CPUID to obtain a CPU and node numbers and pass them to the program. AK: Lots of changes over Vojtech's original code: Better prototype for vgetcpu() It's better to pass the cpu / node numbers as separate arguments to avoid mistakes when going from SMP to NUMA. Also add a fast time stamp based cache using a user supplied argument to speed things more up. Use fast method from Chuck Ebbert to retrieve node/cpu from GDT limit instead of CPUID Made sure RDTSCP init is always executed after node is known. Drop printk Signed-off-by: Vojtech Pavlik Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 2 +- arch/x86_64/kernel/time.c | 13 ++++--- arch/x86_64/kernel/vmlinux.lds.S | 3 ++ arch/x86_64/kernel/vsyscall.c | 84 +++++++++++++++++++++++++++++++++++++++- include/asm-x86_64/segment.h | 5 ++- include/asm-x86_64/smp.h | 12 ++++-- include/asm-x86_64/vsyscall.h | 9 +++++ include/linux/getcpu.h | 16 ++++++++ 8 files changed, 130 insertions(+), 14 deletions(-) create mode 100644 include/linux/getcpu.h (limited to 'include/linux') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index c9739ca81d06..505ec4a57506 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -371,7 +371,7 @@ ENTRY(cpu_gdt_table) .quad 0,0 /* TSS */ .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ - .quad 0 /* unused */ + .quad 0x0000f40000000000 /* node/CPU stored in limit */ gdt_end: /* asm/segment.h:GDT_ENTRIES must match this */ /* This should be a multiple of the cache line size */ diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 97b9e46d1992..560ed944dc0e 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -899,12 +899,8 @@ static int __cpuinit time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu) { unsigned cpu = (unsigned long) hcpu; - if (action == CPU_ONLINE && - cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { - unsigned p; - p = smp_processor_id() | (cpu_to_node(smp_processor_id())<<12); - write_rdtscp_aux(p); - } + if (action == CPU_ONLINE) + vsyscall_set_cpu(cpu); return NOTIFY_DONE; } @@ -993,6 +989,11 @@ void time_init_gtod(void) if (unsynchronized_tsc()) notsc = 1; + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; + if (vxtime.hpet_address && notsc) { timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; if (hpet_use_timer) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 7c4de31471d4..8d5a5149bb3a 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -99,6 +99,9 @@ SECTIONS .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } vxtime = VVIRT(.vxtime); + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } wall_jiffies = VVIRT(.wall_jiffies); diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index f603037df162..902783bc4d53 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -33,11 +34,15 @@ #include #include #include +#include +#include +#include #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) int __sysctl_vsyscall __section_sysctl_vsyscall = 1; seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +int __vgetcpu_mode __section_vgetcpu_mode; #include @@ -127,9 +132,46 @@ time_t __vsyscall(1) vtime(time_t *t) return __xtime.tv_sec; } -long __vsyscall(2) venosys_0(void) +/* Fast way to get current CPU and node. + This helps to do per node and per CPU caches in user space. + The result is not guaranteed without CPU affinity, but usually + works out because the scheduler tries to keep a thread on the same + CPU. + + tcache must point to a two element sized long array. + All arguments can be NULL. */ +long __vsyscall(2) +vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) { - return -ENOSYS; + unsigned int dummy, p; + unsigned long j = 0; + + /* Fast cache - only recompute value once per jiffies and avoid + relatively costly rdtscp/cpuid otherwise. + This works because the scheduler usually keeps the process + on the same CPU and this syscall doesn't guarantee its + results anyways. + We do this here because otherwise user space would do it on + its own in a likely inferior way (no access to jiffies). + If you don't like it pass NULL. */ + if (tcache && tcache->t0 == (j = __jiffies)) { + p = tcache->t1; + } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + rdtscp(dummy, dummy, p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + if (tcache) { + tcache->t0 = j; + tcache->t1 = p; + } + if (cpu) + *cpu = p & 0xfff; + if (node) + *node = p >> 12; + return 0; } long __vsyscall(3) venosys_1(void) @@ -200,6 +242,43 @@ static ctl_table kernel_root_table2[] = { #endif +static void __cpuinit write_rdtscp_cb(void *info) +{ + write_rdtscp_aux((unsigned long)info); +} + +void __cpuinit vsyscall_set_cpu(int cpu) +{ + unsigned long *d; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node[cpu]; +#endif + if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { + void *info = (void *)((node << 12) | cpu); + /* Can happen on preemptive kernel */ + if (get_cpu() == cpu) + write_rdtscp_cb(info); +#ifdef CONFIG_SMP + else { + /* the notifier is unfortunately not executed on the + target CPU */ + smp_call_function_single(cpu,write_rdtscp_cb,info,0,1); + } +#endif + put_cpu(); + } + + /* Store cpu number in limit so that it can be loaded quickly + in user space in vgetcpu. + 12 bits for the CPU and 8 bits for the node. */ + d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); + *d = 0x0f40000000000ULL; + *d |= cpu; + *d |= (node & 0xf) << 12; + *d |= (node >> 4) << 48; +} + static void __init map_vsyscall(void) { extern char __vsyscall_0; @@ -214,6 +293,7 @@ static int __init vsyscall_init(void) VSYSCALL_ADDR(__NR_vgettimeofday))); BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); + BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); map_vsyscall(); #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); diff --git a/include/asm-x86_64/segment.h b/include/asm-x86_64/segment.h index d4bed33fb32c..334ddcdd8f92 100644 --- a/include/asm-x86_64/segment.h +++ b/include/asm-x86_64/segment.h @@ -20,15 +20,16 @@ #define __USER_CS 0x33 /* 6*8+3 */ #define __USER32_DS __USER_DS -#define GDT_ENTRY_TLS 1 #define GDT_ENTRY_TSS 8 /* needs two entries */ #define GDT_ENTRY_LDT 10 /* needs two entries */ #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 -/* 15 free */ #define GDT_ENTRY_TLS_ENTRIES 3 +#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */ +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3) + /* TLS indexes for 64bit - hardcoded in arch_prctl */ #define FS_TLS 0 #define GS_TLS 1 diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 6805e1feb300..d61547fd833b 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -133,13 +133,19 @@ static __inline int logical_smp_processor_id(void) /* we don't want to mark this access volatile - bad code generation */ return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); } -#endif #ifdef CONFIG_SMP #define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] #else #define cpu_physical_id(cpu) boot_cpu_id -#endif - +static inline int smp_call_function_single(int cpuid, void (*func) (void *info), + void *info, int retry, int wait) +{ + /* Disable interrupts here? */ + func(info); + return 0; +} +#endif /* !CONFIG_SMP */ +#endif /* !__ASSEMBLY */ #endif diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h index 146b24402a5f..2281e9399b96 100644 --- a/include/asm-x86_64/vsyscall.h +++ b/include/asm-x86_64/vsyscall.h @@ -4,6 +4,7 @@ enum vsyscall_num { __NR_vgettimeofday, __NR_vtime, + __NR_vgetcpu, }; #define VSYSCALL_START (-10UL << 20) @@ -15,6 +16,7 @@ enum vsyscall_num { #include #define __section_vxtime __attribute__ ((unused, __section__ (".vxtime"), aligned(16))) +#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16))) #define __section_wall_jiffies __attribute__ ((unused, __section__ (".wall_jiffies"), aligned(16))) #define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16))) #define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz"), aligned(16))) @@ -26,6 +28,9 @@ enum vsyscall_num { #define VXTIME_HPET 2 #define VXTIME_PMTMR 3 +#define VGETCPU_RDTSCP 1 +#define VGETCPU_LSL 2 + struct vxtime_data { long hpet_address; /* HPET base address */ int last; @@ -40,6 +45,7 @@ struct vxtime_data { /* vsyscall space (readonly) */ extern struct vxtime_data __vxtime; +extern int __vgetcpu_mode; extern struct timespec __xtime; extern volatile unsigned long __jiffies; extern unsigned long __wall_jiffies; @@ -48,6 +54,7 @@ extern seqlock_t __xtime_lock; /* kernel space (writeable) */ extern struct vxtime_data vxtime; +extern int vgetcpu_mode; extern unsigned long wall_jiffies; extern struct timezone sys_tz; extern int sysctl_vsyscall; @@ -55,6 +62,8 @@ extern seqlock_t xtime_lock; extern int sysctl_vsyscall; +extern void vsyscall_set_cpu(int cpu); + #define ARCH_HAVE_XTIME_LOCK 1 #endif /* __KERNEL__ */ diff --git a/include/linux/getcpu.h b/include/linux/getcpu.h new file mode 100644 index 000000000000..031ed3780e45 --- /dev/null +++ b/include/linux/getcpu.h @@ -0,0 +1,16 @@ +#ifndef _LINUX_GETCPU_H +#define _LINUX_GETCPU_H 1 + +/* Cache for getcpu() to speed it up. Results might be upto a jiffie + out of date, but will be faster. + User programs should not refer to the contents of this structure. + It is only a cache for vgetcpu(). It might change in future kernels. + The user program must store this information per thread (__thread) + If you want 100% accurate information pass NULL instead. */ +struct getcpu_cache { + unsigned long t0; + unsigned long t1; + unsigned long res[4]; +}; + +#endif -- cgit v1.2.3 From 3cfc348bf90ffaa777c188652aa297f04eb94de8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] x86: Add portable getcpu call For NUMA optimization and some other algorithms it is useful to have a fast to get the current CPU and node numbers in user space. x86-64 added a fast way to do this in a vsyscall. This adds a generic syscall for other architectures to make it a generic portable facility. I expect some of them will also implement it as a faster vsyscall. The cache is an optimization for the x86-64 vsyscall optimization. Since what the syscall returns is an approximation anyways and user space often wants very fast results it can be cached for some time. The norma methods to get this information in user space are relatively slow The vsyscall is in a better position to manage the cache because it has direct access to a fast time stamp (jiffies). For the generic syscall optimization it doesn't help much, but enforce a valid argument to keep programs portable I only added an i386 syscall entry for now. Other architectures can follow as needed. AK: Also added some cleanups from Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/syscall_table.S | 1 + arch/x86_64/ia32/ia32entry.S | 1 + include/asm-i386/unistd.h | 3 ++- include/linux/syscalls.h | 2 ++ kernel/sys.c | 31 +++++++++++++++++++++++++++++++ 5 files changed, 37 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d4775398..7e639f78b0b9 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,4 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_getcpu diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 30ed0f6f4a2b..32fd32bea07c 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -713,4 +713,5 @@ ia32_sys_call_table: .quad sys_tee .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_getcpu ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fc1c8ddae149..565d0897b205 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -323,10 +323,11 @@ #define __NR_tee 315 #define __NR_vmsplice 316 #define __NR_move_pages 317 +#define __NR_getcpu 318 #ifdef __KERNEL__ -#define NR_syscalls 318 +#define NR_syscalls 319 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 008f04c56737..3f0f716225ec 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -53,6 +53,7 @@ struct mq_attr; struct compat_stat; struct compat_timeval; struct robust_list_head; +struct getcpu_cache; #include #include @@ -596,5 +597,6 @@ asmlinkage long sys_get_robust_list(int pid, size_t __user *len_ptr); asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); +asmlinkage long sys_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *cache); #endif diff --git a/kernel/sys.c b/kernel/sys.c index e236f98f7ec5..3f894775488d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -2062,3 +2063,33 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, } return error; } + +asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, + struct getcpu_cache __user *cache) +{ + int err = 0; + int cpu = raw_smp_processor_id(); + if (cpup) + err |= put_user(cpu, cpup); + if (nodep) + err |= put_user(cpu_to_node(cpu), nodep); + if (cache) { + /* + * The cache is not needed for this implementation, + * but make sure user programs pass something + * valid. vsyscall implementations can instead make + * good use of the cache. Only use t0 and t1 because + * these are available in both 32bit and 64bit ABI (no + * need for a compat_getcpu). 32bit has enough + * padding + */ + unsigned long t0, t1; + get_user(t0, &cache->t0); + get_user(t1, &cache->t1); + t0++; + t1++; + put_user(t0, &cache->t0); + put_user(t1, &cache->t1); + } + return err ? -EFAULT : 0; +} -- cgit v1.2.3 From 5a1b3999d6cb7ab87f1f3b1700bc91839fd6fa29 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] x86: Some preparationary cleanup for stack trace - Remove unused all_contexts parameter No caller used it - Move skip argument into the structure (needed for followon patches) Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/i386/kernel/stacktrace.c | 11 +++-------- arch/s390/kernel/stacktrace.c | 17 ++++++++--------- arch/x86_64/kernel/stacktrace.c | 14 +++++--------- include/linux/stacktrace.h | 7 ++++--- kernel/lockdep.c | 5 ++++- 5 files changed, 24 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/stacktrace.c b/arch/i386/kernel/stacktrace.c index e62a037ab399..ae3c32a87add 100644 --- a/arch/i386/kernel/stacktrace.c +++ b/arch/i386/kernel/stacktrace.c @@ -61,12 +61,8 @@ save_context_stack(struct stack_trace *trace, unsigned int skip, /* * Save stack-backtrace addresses into a stack_trace buffer. - * If all_contexts is set, all contexts (hardirq, softirq and process) - * are saved. If not set then only the current context is saved. */ -void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip) +void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { unsigned long ebp; unsigned long *stack = &ebp; @@ -85,10 +81,9 @@ void save_stack_trace(struct stack_trace *trace, struct thread_info *context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = save_context_stack(trace, skip, context, stack, ebp); + ebp = save_context_stack(trace, trace->skip, context, stack, ebp); stack = (unsigned long *)context->previous_esp; - if (!all_contexts || !stack || - trace->nr_entries >= trace->max_entries) + if (!stack || trace->nr_entries >= trace->max_entries) break; trace->entries[trace->nr_entries++] = ULONG_MAX; if (trace->nr_entries >= trace->max_entries) diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index de83f38288d0..d9428a0fc8fb 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -59,9 +59,7 @@ static inline unsigned long save_context_stack(struct stack_trace *trace, } } -void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip) +void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { register unsigned long sp asm ("15"); unsigned long orig_sp; @@ -69,22 +67,23 @@ void save_stack_trace(struct stack_trace *trace, sp &= PSW_ADDR_INSN; orig_sp = sp; - sp = save_context_stack(trace, &skip, sp, + sp = save_context_stack(trace, &trace->skip, sp, S390_lowcore.panic_stack - PAGE_SIZE, S390_lowcore.panic_stack); - if ((sp != orig_sp) && !all_contexts) + if ((sp != orig_sp) && !trace->all_contexts) return; - sp = save_context_stack(trace, &skip, sp, + sp = save_context_stack(trace, &trace->skip, sp, S390_lowcore.async_stack - ASYNC_SIZE, S390_lowcore.async_stack); - if ((sp != orig_sp) && !all_contexts) + if ((sp != orig_sp) && !trace->all_contexts) return; if (task) - save_context_stack(trace, &skip, sp, + save_context_stack(trace, &trace->skip, sp, (unsigned long) task_stack_page(task), (unsigned long) task_stack_page(task) + THREAD_SIZE); else - save_context_stack(trace, &skip, sp, S390_lowcore.thread_info, + save_context_stack(trace, &trace->skip, sp, + S390_lowcore.thread_info, S390_lowcore.thread_info + THREAD_SIZE); return; } diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c index 32cf55eb9af8..1c022af8fe1e 100644 --- a/arch/x86_64/kernel/stacktrace.c +++ b/arch/x86_64/kernel/stacktrace.c @@ -109,9 +109,10 @@ out_restore: * Save stack-backtrace addresses into a stack_trace buffer: */ static inline unsigned long -save_context_stack(struct stack_trace *trace, unsigned int skip, +save_context_stack(struct stack_trace *trace, unsigned long stack, unsigned long stack_end) { + int skip = trace->skip; unsigned long addr; #ifdef CONFIG_FRAME_POINTER @@ -159,12 +160,8 @@ save_context_stack(struct stack_trace *trace, unsigned int skip, /* * Save stack-backtrace addresses into a stack_trace buffer. - * If all_contexts is set, all contexts (hardirq, softirq and process) - * are saved. If not set then only the current context is saved. */ -void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip) +void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { unsigned long stack = (unsigned long)&stack; int i, nr_stacks = 0, stacks_done[MAX_STACKS]; @@ -207,9 +204,8 @@ void save_stack_trace(struct stack_trace *trace, return; stacks_done[nr_stacks] = stack_end; - stack = save_context_stack(trace, skip, stack, stack_end); - if (!all_contexts || !stack || - trace->nr_entries >= trace->max_entries) + stack = save_context_stack(trace, stack, stack_end); + if (!stack || trace->nr_entries >= trace->max_entries) return; trace->entries[trace->nr_entries++] = ULONG_MAX; if (trace->nr_entries >= trace->max_entries) diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 9cc81e572224..50e2b01e517c 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -5,15 +5,16 @@ struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; + int skip; /* input argument: How many entries to skip */ + int all_contexts; /* input argument: if true do than one stack */ }; extern void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip); + struct task_struct *task); extern void print_stack_trace(struct stack_trace *trace, int spaces); #else -# define save_stack_trace(trace, task, all, skip) do { } while (0) +# define save_stack_trace(trace, task) do { } while (0) # define print_stack_trace(trace) do { } while (0) #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 9bad17884513..900b4cb1a024 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -224,7 +224,10 @@ static int save_trace(struct stack_trace *trace) trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; trace->entries = stack_trace + nr_stack_trace_entries; - save_stack_trace(trace, NULL, 0, 3); + trace->skip = 3; + trace->all_contexts = 0; + + save_stack_trace(trace, NULL); trace->max_entries = trace->nr_entries; -- cgit v1.2.3 From d28c4393a7bf558538e9def269c1caeab6ec056f Mon Sep 17 00:00:00 2001 From: "Prasanna S.P" Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] x86: error_code is not safe for kprobes This patch moves the entry.S:error_entry to .kprobes.text section, since code marked unsafe for kprobes jumps directly to entry.S::error_entry, that must be marked unsafe as well. This patch also moves all the ".previous.text" asm directives to ".previous" for kprobes section. AK: Following a similar i386 patch from Chuck Ebbert AK: Also merged Jeremy's fix in. +From: Jeremy Fitzhardinge KPROBE_ENTRY does a .section .kprobes.text, and expects its users to do a .previous at the end of the function. Unfortunately, if any code within the function switches sections, for example .fixup, then the .previous ends up putting all subsequent code into .fixup. Worse, any subsequent .fixup code gets intermingled with the code its supposed to be fixing (which is also in .fixup). It's surprising this didn't cause more havok. The fix is to use .pushsection/.popsection, so this stuff nests properly. A further cleanup would be to get rid of all .section/.previous pairs, since they're inherently fragile. +From: Chuck Ebbert <76306.1226@compuserve.com> Because code marked unsafe for kprobes jumps directly to entry.S::error_code, that must be marked unsafe as well. The easiest way to do that is to move the page fault entry point to just before error_code and let it inherit the same section. Also moved all the ".previous" asm directives for kprobes sections to column 1 and removed ".text" from them. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 25 +++++++++++++------------ arch/x86_64/kernel/entry.S | 19 +++++++------------ include/linux/linkage.h | 6 +++++- 3 files changed, 25 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 87f9f60b803b..ba22ec8fab54 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -591,11 +591,9 @@ ENTRY(name) \ /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -ENTRY(divide_error) - RING0_INT_FRAME - pushl $0 # no error code - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_divide_error +KPROBE_ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: @@ -645,6 +643,7 @@ error_code: call *%edi jmp ret_from_exception CFI_ENDPROC +KPROBE_END(page_fault) ENTRY(coprocessor_error) RING0_INT_FRAME @@ -720,7 +719,8 @@ debug_stack_correct: call do_debug jmp ret_from_exception CFI_ENDPROC - .previous .text +KPROBE_END(debug) + /* * NMI is doubly nasty. It can happen _while_ we're handling * a debug fault, and the debug fault hasn't yet been able to @@ -816,7 +816,7 @@ KPROBE_ENTRY(int3) call do_int3 jmp ret_from_exception CFI_ENDPROC - .previous .text +KPROBE_END(int3) ENTRY(overflow) RING0_INT_FRAME @@ -881,7 +881,7 @@ KPROBE_ENTRY(general_protection) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC - .previous .text +KPROBE_END(general_protection) ENTRY(alignment_check) RING0_EC_FRAME @@ -890,13 +890,14 @@ ENTRY(alignment_check) jmp error_code CFI_ENDPROC -KPROBE_ENTRY(page_fault) - RING0_EC_FRAME - pushl $do_page_fault +ENTRY(divide_error) + RING0_INT_FRAME + pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_divide_error CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC - .previous .text #ifdef CONFIG_X86_MCE ENTRY(machine_check) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 2092f565aa87..780f9b26169f 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -819,7 +819,7 @@ paranoid_schedule\trace: * Exception entry point. This expects an error code/orig_rax on the stack * and the exception handler in %rax. */ -ENTRY(error_entry) +KPROBE_ENTRY(error_entry) _frame RDI /* rdi slot contains rax, oldrax contains error code */ cld @@ -903,7 +903,7 @@ error_kernelspace: cmpq $gs_change,RIP(%rsp) je error_swapgs jmp error_sti -END(error_entry) +KPROBE_END(error_entry) /* Reload gs selector with exception handling */ /* edi: new selector */ @@ -1025,8 +1025,7 @@ ENDPROC(execve) KPROBE_ENTRY(page_fault) errorentry do_page_fault -END(page_fault) - .previous .text +KPROBE_END(page_fault) ENTRY(coprocessor_error) zeroentry do_coprocessor_error @@ -1047,8 +1046,7 @@ KPROBE_ENTRY(debug) CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK paranoidexit -END(debug) - .previous .text +KPROBE_END(debug) /* runs on exception stack */ KPROBE_ENTRY(nmi) @@ -1062,8 +1060,7 @@ KPROBE_ENTRY(nmi) jmp paranoid_exit1 CFI_ENDPROC #endif -END(nmi) - .previous .text +KPROBE_END(nmi) KPROBE_ENTRY(int3) INTR_FRAME @@ -1072,8 +1069,7 @@ KPROBE_ENTRY(int3) paranoidentry do_int3, DEBUG_STACK jmp paranoid_exit1 CFI_ENDPROC -END(int3) - .previous .text +KPROBE_END(int3) ENTRY(overflow) zeroentry do_overflow @@ -1121,8 +1117,7 @@ END(stack_segment) KPROBE_ENTRY(general_protection) errorentry do_general_protection -END(general_protection) - .previous .text +KPROBE_END(general_protection) ENTRY(alignment_check) errorentry do_alignment_check diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 932021f872d5..6c9873f88287 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -35,9 +35,13 @@ #endif #define KPROBE_ENTRY(name) \ - .section .kprobes.text, "ax"; \ + .pushsection .kprobes.text, "ax"; \ ENTRY(name) +#define KPROBE_END(name) \ + END(name); \ + .popsection + #ifndef END #define END(name) \ .size name, .-name -- cgit v1.2.3 From e07e23e1fd3000289fc7ccc6c71879070d3b19e0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] non lazy "sleazy" fpu implementation Right now the kernel on x86-64 has a 100% lazy fpu behavior: after *every* context switch a trap is taken for the first FPU use to restore the FPU context lazily. This is of course great for applications that have very sporadic or no FPU use (since then you avoid doing the expensive save/restore all the time). However for very frequent FPU users... you take an extra trap every context switch. The patch below adds a simple heuristic to this code: After 5 consecutive context switches of FPU use, the lazy behavior is disabled and the context gets restored every context switch. If the app indeed uses the FPU, the trap is avoided. (the chance of the 6th time slice using FPU after the previous 5 having done so are quite high obviously). After 256 switches, this is reset and lazy behavior is returned (until there are 5 consecutive ones again). The reason for this is to give apps that do longer bursts of FPU use still the lazy behavior back after some time. [akpm@osdl.org: place new task_struct field next to jit_keyring to save space] Signed-off-by: Arjan van de Ven Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/process.c | 10 ++++++++++ arch/x86_64/kernel/traps.c | 1 + include/asm-x86_64/i387.h | 5 ++++- include/linux/sched.h | 9 +++++++++ 4 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 6fbd19564e4e..9e9a70e50c72 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -552,6 +552,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter>5) + prefetch(&next->i387.fxsave); + /* * Reload esp0, LDT and the page table pointer: */ @@ -629,6 +633,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) __switch_to_xtra(prev_p, next_p, tss); + /* If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + if (next_p->fpu_counter>5) + math_state_restore(); return prev_p; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 28e53342f294..ffc40cff1e07 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -1136,6 +1136,7 @@ asmlinkage void math_state_restore(void) init_fpu(me); restore_fpu_checking(&me->thread.i387.fxsave); task_thread_info(me)->status |= TS_USEDFPU; + me->fpu_counter++; } void __init trap_init(void) diff --git a/include/asm-x86_64/i387.h b/include/asm-x86_64/i387.h index cba8a3b0cded..60c0f4853fdb 100644 --- a/include/asm-x86_64/i387.h +++ b/include/asm-x86_64/i387.h @@ -24,6 +24,7 @@ extern unsigned int mxcsr_feature_mask; extern void mxcsr_feature_mask_init(void); extern void init_fpu(struct task_struct *child); extern int save_i387(struct _fpstate __user *buf); +extern asmlinkage void math_state_restore(void); /* * FPU lazy state save handling... @@ -31,7 +32,9 @@ extern int save_i387(struct _fpstate __user *buf); #define unlazy_fpu(tsk) do { \ if (task_thread_info(tsk)->status & TS_USEDFPU) \ - save_init_fpu(tsk); \ + save_init_fpu(tsk); \ + else \ + tsk->fpu_counter = 0; \ } while (0) /* Ignore delayed exceptions from user space */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 34ed0d99b1bd..807556c5bcd2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -865,6 +865,15 @@ struct task_struct { struct key *thread_keyring; /* keyring private to this thread */ unsigned char jit_keyring; /* default keyring to attach requested keys to */ #endif + /* + * fpu_counter contains the number of consecutive context switches + * that the FPU is used. If this is over a threshold, the lazy fpu + * saving becomes unlazy to save the trap. This is an unsigned char + * so that after 256 times the counter wraps and the behavior turns + * lazy again; this to deal with bursty apps that only use FPU for + * a short time + */ + unsigned char fpu_counter; int oomkilladj; /* OOM kill score adjustment (bit shift). */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock -- cgit v1.2.3 From 1bb4996bcebca1cde49d964b4e012699ce180e61 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Move compiler check for modules to ia64 only Apparently IA64 needs it, but i386/x86-64 don't anymore since gcc 2.95 support was dropped. Nobody else on linux-arch requested keeping it generically Cc: tony.luck@intel.com Cc: kaos@sgi.com Signed-off-by: Andi Kleen --- include/asm-ia64/module.h | 3 ++- include/linux/vermagic.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/asm-ia64/module.h b/include/asm-ia64/module.h index 85c82bd819f2..d2da61e4c49b 100644 --- a/include/asm-ia64/module.h +++ b/include/asm-ia64/module.h @@ -28,7 +28,8 @@ struct mod_arch_specific { #define Elf_Ehdr Elf64_Ehdr #define MODULE_PROC_FAMILY "ia64" -#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY +#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY \ + "gcc-" __stringify(__GNUC__) "." __stringify(__GNUC_MINOR__) #define ARCH_SHF_SMALL SHF_IA_64_SHORT diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index 46919f9f5eb3..4d0909e53595 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -24,5 +24,5 @@ #define VERMAGIC_STRING \ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ - MODULE_VERMAGIC_MODULE_UNLOAD MODULE_ARCH_VERMAGIC \ - "gcc-" __stringify(__GNUC__) "." __stringify(__GNUC_MINOR__) + MODULE_VERMAGIC_MODULE_UNLOAD MODULE_ARCH_VERMAGIC + -- cgit v1.2.3 From 575400d1b483fbe9e03c68758059bfaf4e4768d1 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] i386: Fix the EDD code misparsing the command line The EDD code would scan the command line as a fixed array, without taking account of either whitespace, null-termination, the old command-line protocol, late overrides early, or the fact that the command line may not be reachable from INITSEG. This should fix those problems, and enable us to use a longer command line. Signed-off-by: H. Peter Anvin Signed-off-by: Andi Kleen --- arch/i386/boot/edd.S | 97 ++++++++++++++++++++++++++++++++++++++++------------ include/linux/edd.h | 1 + 2 files changed, 76 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/boot/edd.S b/arch/i386/boot/edd.S index 4b84ea216f2b..34321368011a 100644 --- a/arch/i386/boot/edd.S +++ b/arch/i386/boot/edd.S @@ -15,42 +15,95 @@ #include #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) + +# It is assumed that %ds == INITSEG here + movb $0, (EDD_MBR_SIG_NR_BUF) movb $0, (EDDNR) -# Check the command line for two options: +# Check the command line for options: # edd=of disables EDD completely (edd=off) # edd=sk skips the MBR test (edd=skipmbr) +# edd=on re-enables EDD (edd=on) + pushl %esi - cmpl $0, %cs:cmd_line_ptr - jz done_cl + movw $edd_mbr_sig_start, %di # Default to edd=on + movl %cs:(cmd_line_ptr), %esi -# ds:esi has the pointer to the command line now - movl $(COMMAND_LINE_SIZE-7), %ecx -# loop through kernel command line one byte at a time -cl_loop: - cmpl $EDD_CL_EQUALS, (%si) + andl %esi, %esi + jz old_cl # Old boot protocol? + +# Convert to a real-mode pointer in fs:si + movl %esi, %eax + shrl $4, %eax + movw %ax, %fs + andw $0xf, %si + jmp have_cl_pointer + +# Old-style boot protocol? +old_cl: + push %ds # aka INITSEG + pop %fs + + cmpw $0xa33f, (0x20) + jne done_cl # No command line at all? + movw (0x22), %si # Pointer relative to INITSEG + +# fs:si has the pointer to the command line now +have_cl_pointer: + +# Loop through kernel command line one byte at a time. Just in +# case the loader is buggy and failed to null-terminate the command line +# terminate if we get close enough to the end of the segment that we +# cannot fit "edd=XX"... +cl_atspace: + cmpw $-5, %si # Watch for segment wraparound + jae done_cl + movl %fs:(%si), %eax + andb %al, %al # End of line? + jz done_cl + cmpl $EDD_CL_EQUALS, %eax jz found_edd_equals - incl %esi - loop cl_loop - jmp done_cl + cmpb $0x20, %al # <= space consider whitespace + ja cl_skipword + incw %si + jmp cl_atspace + +cl_skipword: + cmpw $-5, %si # Watch for segment wraparound + jae done_cl + movb %fs:(%si), %al # End of string? + andb %al, %al + jz done_cl + cmpb $0x20, %al + jbe cl_atspace + incw %si + jmp cl_skipword + found_edd_equals: # only looking at first two characters after equals - addl $4, %esi - cmpw $EDD_CL_OFF, (%si) # edd=of - jz do_edd_off - cmpw $EDD_CL_SKIP, (%si) # edd=sk - jz do_edd_skipmbr - jmp done_cl +# late overrides early on the command line, so keep going after finding something + movw %fs:4(%si), %ax + cmpw $EDD_CL_OFF, %ax # edd=of + je do_edd_off + cmpw $EDD_CL_SKIP, %ax # edd=sk + je do_edd_skipmbr + cmpw $EDD_CL_ON, %ax # edd=on + je do_edd_on + jmp cl_skipword do_edd_skipmbr: - popl %esi - jmp edd_start + movw $edd_start, %di + jmp cl_skipword do_edd_off: - popl %esi - jmp edd_done + movw $edd_done, %di + jmp cl_skipword +do_edd_on: + movw $edd_mbr_sig_start, %di + jmp cl_skipword + done_cl: popl %esi - + jmpw *%di # Read the first sector of each BIOS disk device and store the 4-byte signature edd_mbr_sig_start: diff --git a/include/linux/edd.h b/include/linux/edd.h index 162512b886f7..b2b3e68aa512 100644 --- a/include/linux/edd.h +++ b/include/linux/edd.h @@ -52,6 +52,7 @@ #define EDD_CL_EQUALS 0x3d646465 /* "edd=" */ #define EDD_CL_OFF 0x666f /* "of" for off */ #define EDD_CL_SKIP 0x6b73 /* "sk" for skipmbr */ +#define EDD_CL_ON 0x6e6f /* "on" for on */ #ifndef __ASSEMBLY__ -- cgit v1.2.3 From 0a4254058037eb172758961d0a5b94f4320a1425 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Add the canary field to the PDA area and the task struct This patch adds the per thread cookie field to the task struct and the PDA. Also it makes sure that the PDA value gets the new cookie value at context switch, and that a new task gets a new cookie at task creation time. Signed-off-by: Arjan van Ven Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen CC: Andi Kleen --- arch/x86_64/kernel/process.c | 8 ++++++++ include/asm-x86_64/pda.h | 7 ++++++- include/linux/sched.h | 5 +++++ kernel/fork.c | 5 +++++ 4 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 9e9a70e50c72..fba8dfeda67c 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -625,6 +625,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) unlazy_fpu(prev_p); write_pda(kernelstack, task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); +#ifdef CONFIG_CC_STACKPROTECTOR + write_pda(stack_canary, next_p->stack_canary); + /* + * Build time only check to make sure the stack_canary is at + * offset 40 in the pda; this is a gcc ABI requirement + */ + BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); +#endif /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index 6794ffaae433..e7773e0af865 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -16,7 +16,12 @@ struct x8664_pda { unsigned long oldrsp; /* 24 user rsp for system call */ int irqcount; /* 32 Irq nesting counter. Starts with -1 */ int cpunumber; /* 36 Logical CPU number */ - char *irqstackptr; /* 40 top of irqstack */ +#ifdef CONFIG_CC_STACKPROTECTOR + unsigned long stack_canary; /* 40 stack canary value */ + /* gcc-ABI: this canary MUST be at + offset 40!!! */ +#endif + char *irqstackptr; int nodenumber; /* number of current node */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 807556c5bcd2..9d4aa7f95bc8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -819,6 +819,11 @@ struct task_struct { unsigned did_exec:1; pid_t pid; pid_t tgid; + +#ifdef CONFIG_CC_STACKPROTECTOR + /* Canary value for the -fstack-protector gcc feature */ + unsigned long stack_canary; +#endif /* * pointers to (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with diff --git a/kernel/fork.c b/kernel/fork.c index f9b014e3e700..a0dad84567c9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -175,6 +176,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->thread_info = ti; setup_thread_stack(tsk, orig); +#ifdef CONFIG_CC_STACKPROTECTOR + tsk->stack_canary = get_random_int(); +#endif + /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); atomic_set(&tsk->fs_excl, 0); -- cgit v1.2.3 From 3b171672831b9633c2ed8fa94805255cd4d5af19 Mon Sep 17 00:00:00 2001 From: Dmitriy Zavin Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] Add 64bit jiffies compares (for use with get_jiffies_64) The current time_before/time_after macros will fail typechecks when passed u64 values (as returned by get_jiffies_64()). On 64bit systems, this will just result in a warning about mismatching types without explicit casts, but since unsigned long and u64 (unsigned long long) are of same size, it will still work. On 32bit systems, a long is 32bits, so the value from get_jiffies_64() will be truncated by the cast and thus lose all the precision gained by 64bit jiffies. Signed-off-by: Dmitriy Zavin Signed-off-by: Andi Kleen --- include/linux/jiffies.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 329ebcffa106..c8d5f207c3d4 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -115,6 +115,21 @@ static inline u64 get_jiffies_64(void) ((long)(a) - (long)(b) >= 0)) #define time_before_eq(a,b) time_after_eq(b,a) +/* Same as above, but does so with platform independent 64bit types. + * These must be used when utilizing jiffies_64 (i.e. return value of + * get_jiffies_64() */ +#define time_after64(a,b) \ + (typecheck(__u64, a) && \ + typecheck(__u64, b) && \ + ((__s64)(b) - (__s64)(a) < 0)) +#define time_before64(a,b) time_after64(b,a) + +#define time_after_eq64(a,b) \ + (typecheck(__u64, a) && \ + typecheck(__u64, b) && \ + ((__s64)(a) - (__s64)(b) >= 0)) +#define time_before_eq64(a,b) time_after_eq64(b,a) + /* * Have the 32 bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. -- cgit v1.2.3