From 2de1f33e99cec5fd79542a1d0e26efb9c36a98bb Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 11 Apr 2009 12:55:26 +0530 Subject: x86: apic/x2apic_cluster.c x86_cpu_to_logical_apicid should be static Impact: reduce kernel size a bit, address sparse warning Addresses the problem pointed out by this sparse warning: arch/x86/kernel/apic/x2apic_cluster.c:13:1: warning: symbol 'per_cpu__x86_cpu_to_logical_apicid' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Rajput Cc: Suresh Siddha LKML-Reference: <1239434726.4418.24.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 4a903e2f0d17..8e4cbb255c38 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -10,7 +10,7 @@ #include #include -DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { -- cgit v1.2.3 From 2c1b284e4fa260fd922b9a65c99169e2630c6862 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 11 Apr 2009 00:03:10 +0530 Subject: x86: clean up declarations and variables Impact: cleanup, no code changed - syscalls.h update declarations due to unifications - irq.c declare smp_generic_interrupt() before it gets used - process.c declare sys_fork() and sys_vfork() before they get used - tsc.c rename tsc_khz shadowed variable - apic/probe_32.c declare apic_default before it gets used - apic/nmi.c prev_nmi_count should be unsigned - apic/io_apic.c declare smp_irq_move_cleanup_interrupt() before it gets used - mm/init.c declare direct_gbpages and free_initrd_mem before they get used Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 3 +++ arch/x86/include/asm/hw_irq.h | 4 ++++ arch/x86/include/asm/pgtable.h | 2 ++ arch/x86/include/asm/pgtable_64.h | 6 ------ arch/x86/include/asm/syscalls.h | 45 +++++++++++++++++++++------------------ arch/x86/kernel/apic/io_apic.c | 1 + arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/irq.c | 1 + arch/x86/kernel/process.c | 1 + arch/x86/kernel/tsc.c | 8 +++---- arch/x86/mm/init.c | 1 + 12 files changed, 42 insertions(+), 33 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 42f2f8377422..5773660c8cd5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -478,6 +478,9 @@ static inline unsigned int read_apic_id(void) extern void default_setup_apic_routing(void); #ifdef CONFIG_X86_32 + +extern struct apic apic_default; + /* * Set up the logical destination ID. * diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b762ea49bd70..be9ae4111c94 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -78,7 +78,11 @@ extern void eisa_set_level_irq(unsigned int irq); /* SMP */ extern void smp_apic_timer_interrupt(struct pt_regs *); extern void smp_spurious_interrupt(struct pt_regs *); +extern void smp_generic_interrupt(struct pt_regs *); extern void smp_error_interrupt(struct pt_regs *); +#ifdef CONFIG_X86_IO_APIC +extern asmlinkage void smp_irq_move_cleanup_interrupt(void); +#endif #ifdef CONFIG_SMP extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 29d96d168bc0..3f8d09d94eb3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -503,6 +503,8 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ +extern int direct_gbpages; + /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) { diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 6b87bc6d5018..abde308fdb0f 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[]; extern void paging_init(void); -#endif /* !__ASSEMBLY__ */ - -#ifndef __ASSEMBLY__ - #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%016lx).\n", \ __FILE__, __LINE__, &(e), pte_val(e)) @@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define update_mmu_cache(vma, address, pte) do { } while (0) -extern int direct_gbpages; - /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 7043408f6904..372b76edd63f 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -1,7 +1,7 @@ /* * syscalls.h - Linux syscall interfaces (arch-specific) * - * Copyright (c) 2008 Jaswinder Singh + * Copyright (c) 2008 Jaswinder Singh Rajput * * This file is released under the GPLv2. * See the file COPYING for more details. @@ -12,50 +12,55 @@ #include #include -#include #include +#include /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); +/* kernel/process.c */ +int sys_fork(struct pt_regs *); +int sys_vfork(struct pt_regs *); + /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +/* kernel/signal.c */ +long sys_rt_sigreturn(struct pt_regs *); + /* kernel/tls.c */ asmlinkage int sys_set_thread_area(struct user_desc __user *); asmlinkage int sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 +/* kernel/ioport.c */ +long sys_iopl(struct pt_regs *); + /* kernel/process_32.c */ -int sys_fork(struct pt_regs *); int sys_clone(struct pt_regs *); -int sys_vfork(struct pt_regs *); int sys_execve(struct pt_regs *); -/* kernel/signal_32.c */ +/* kernel/signal.c */ asmlinkage int sys_sigsuspend(int, int, old_sigset_t); asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); int sys_sigaltstack(struct pt_regs *); unsigned long sys_sigreturn(struct pt_regs *); -long sys_rt_sigreturn(struct pt_regs *); - -/* kernel/ioport.c */ -long sys_iopl(struct pt_regs *); /* kernel/sys_i386_32.c */ +struct mmap_arg_struct; +struct sel_arg_struct; +struct oldold_utsname; +struct old_utsname; + asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -struct mmap_arg_struct; asmlinkage int old_mmap(struct mmap_arg_struct __user *); -struct sel_arg_struct; asmlinkage int old_select(struct sel_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); -struct old_utsname; asmlinkage int sys_uname(struct old_utsname __user *); -struct oldold_utsname; asmlinkage int sys_olduname(struct oldold_utsname __user *); /* kernel/vm86_32.c */ @@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *); #else /* CONFIG_X86_32 */ /* X86_64 only */ +/* kernel/ioport.c */ +asmlinkage long sys_iopl(unsigned int, struct pt_regs *); + /* kernel/process_64.c */ -asmlinkage long sys_fork(struct pt_regs *); asmlinkage long sys_clone(unsigned long, unsigned long, void __user *, void __user *, struct pt_regs *); -asmlinkage long sys_vfork(struct pt_regs *); asmlinkage long sys_execve(char __user *, char __user * __user *, char __user * __user *, struct pt_regs *); long sys_arch_prctl(int, unsigned long); -/* kernel/ioport.c */ -asmlinkage long sys_iopl(unsigned int, struct pt_regs *); - -/* kernel/signal_64.c */ +/* kernel/signal.c */ asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, struct pt_regs *); -long sys_rt_sigreturn(struct pt_regs *); /* kernel/sys_x86_64.c */ +struct new_utsname; + asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -struct new_utsname; asmlinkage long sys_uname(struct new_utsname __user *); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 767fe7e46d68..870c92ddaf9c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index d6bd62407152..02056310f2f5 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data) } #endif -static void report_broken_nmi(int cpu, int *prev_nmi_count) +static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) { printk(KERN_CONT "\n"); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 01eda2ac65e4..440a8bccd91a 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -160,7 +160,6 @@ extern struct apic apic_summit; extern struct apic apic_bigsmp; extern struct apic apic_es7000; extern struct apic apic_es7000_cluster; -extern struct apic apic_default; struct apic *apic = &apic_default; EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3aaf7b9e3a8b..2188267f523b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include #include #include +#include atomic_t irq_err_count; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e847..3e21e38d7e37 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7a567ebe6361..a8dc0d00b830 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, tsc_khz; + unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; - tsc_khz = get_hypervisor_tsc_freq(); - if (tsc_khz) { + hv_tsc_khz = get_hypervisor_tsc_freq(); + if (hv_tsc_khz) { printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return tsc_khz; + return hv_tsc_khz; } local_irq_save(flags); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index fd3da1dda1c9..40924e445f57 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1,3 +1,4 @@ +#include #include #include -- cgit v1.2.3 From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing is this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 + include/linux/init_task.h | 13 --- include/linux/perf_counter.h | 4 +- include/linux/sched.h | 6 +- kernel/exit.c | 3 +- kernel/fork.c | 1 + kernel/perf_counter.c | 218 +++++++++++++++++++++++++++---------------- 7 files changed, 145 insertions(+), 101 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e9021a908020..b4f64402a82a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ +#include #include #include #include diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 503afaa0afa7..d87247d2641f 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -108,18 +108,6 @@ extern struct group_info init_groups; extern struct cred init_cred; -#ifdef CONFIG_PERF_COUNTERS -# define INIT_PERF_COUNTERS(tsk) \ - .perf_counter_ctx.counter_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ - .perf_counter_ctx.event_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ - .perf_counter_ctx.lock = \ - __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), -#else -# define INIT_PERF_COUNTERS(tsk) -#endif - /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -183,7 +171,6 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ - INIT_PERF_COUNTERS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f612941ef46e..071309005468 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -449,7 +449,6 @@ struct perf_counter { struct hw_perf_counter hw; struct perf_counter_context *ctx; - struct task_struct *task; struct file *filp; struct perf_counter *parent; @@ -498,7 +497,6 @@ struct perf_counter { * Used as a container for task counters and CPU counters as well: */ struct perf_counter_context { -#ifdef CONFIG_PERF_COUNTERS /* * Protect the states of the counters in the list, * nr_active, and the list: @@ -516,6 +514,7 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; + atomic_t refcount; struct task_struct *task; /* @@ -523,7 +522,6 @@ struct perf_counter_context { */ u64 time; u64 timestamp; -#endif }; /** diff --git a/include/linux/sched.h b/include/linux/sched.h index ff59d1231519..9714d450f417 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,7 +71,6 @@ struct sched_param { #include #include #include -#include #include #include #include @@ -99,6 +98,7 @@ struct robust_list_head; struct bio; struct bts_tracer; struct fs_struct; +struct perf_counter_context; /* * List of flags we want to share for kernel threads, @@ -1387,7 +1387,9 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif - struct perf_counter_context perf_counter_ctx; +#ifdef CONFIG_PERF_COUNTERS + struct perf_counter_context *perf_counter_ctxp; +#endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; diff --git a/kernel/exit.c b/kernel/exit.c index f9dfedd94af0..99ad4063ee4a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -159,7 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); #ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); + WARN_ON_ONCE(tsk->perf_counter_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index d32fef4d38e5..e72a09f5355b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 08584c16049f..06ea3eae886e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -97,6 +97,17 @@ void perf_enable(void) hw_perf_enable(); } +static void get_ctx(struct perf_counter_context *ctx) +{ + atomic_inc(&ctx->refcount); +} + +static void put_ctx(struct perf_counter_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) + kfree(ctx); +} + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -118,11 +129,17 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) ctx->nr_counters++; } +/* + * Remove a counter from the lists for its context. + * Must be called with counter->mutex and ctx->mutex held. + */ static void list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { struct perf_counter *sibling, *tmp; + if (list_empty(&counter->list_entry)) + return; ctx->nr_counters--; list_del_init(&counter->list_entry); @@ -216,8 +233,6 @@ static void __perf_counter_remove_from_context(void *info) counter_sched_out(counter, cpuctx, ctx); - counter->task = NULL; - list_del_counter(counter, ctx); if (!ctx->task) { @@ -279,7 +294,6 @@ retry: */ if (!list_empty(&counter->list_entry)) { list_del_counter(counter, ctx); - counter->task = NULL; } spin_unlock_irq(&ctx->lock); } @@ -568,11 +582,17 @@ static void __perf_install_in_context(void *info) * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no counters. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); /* @@ -653,7 +673,6 @@ perf_install_in_context(struct perf_counter_context *ctx, return; } - counter->task = task; retry: task_oncpu_function_call(task, __perf_install_in_context, counter); @@ -693,10 +712,14 @@ static void __perf_counter_enable(void *info) * If this is a per-task counter, need to check whether this * counter's task is the current task on this cpu. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); counter->prev_state = counter->state; @@ -852,10 +875,10 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; struct pt_regs *regs; - if (likely(!cpuctx->task_ctx)) + if (likely(!ctx || !cpuctx->task_ctx)) return; update_context_time(ctx); @@ -871,6 +894,8 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + if (!cpuctx->task_ctx) + return; __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } @@ -969,8 +994,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, void perf_counter_task_sched_in(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; + if (likely(!ctx)) + return; __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } @@ -985,11 +1012,11 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) int perf_counter_task_disable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1020,12 +1047,12 @@ int perf_counter_task_disable(void) int perf_counter_task_enable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; int cpu; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1128,19 +1155,23 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) return; cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &curr->perf_counter_ctx; + ctx = curr->perf_counter_ctxp; perf_adjust_freq(&cpuctx->ctx); - perf_adjust_freq(ctx); + if (ctx) + perf_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); - __perf_counter_task_sched_out(ctx); + if (ctx) + __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - rotate_ctx(ctx); + if (ctx) + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); - perf_counter_task_sched_in(curr, cpu); + if (ctx) + perf_counter_task_sched_in(curr, cpu); } /* @@ -1176,6 +1207,22 @@ static u64 perf_counter_read(struct perf_counter *counter) return atomic64_read(&counter->count); } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + static void put_context(struct perf_counter_context *ctx) { if (ctx->task) @@ -1186,6 +1233,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; + struct perf_counter_context *tctx; struct task_struct *task; /* @@ -1225,15 +1273,36 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!task) return ERR_PTR(-ESRCH); - ctx = &task->perf_counter_ctx; - ctx->task = task; - /* Reuse ptrace permission checks for now. */ if (!ptrace_may_access(task, PTRACE_MODE_READ)) { - put_context(ctx); + put_task_struct(task); return ERR_PTR(-EACCES); } + ctx = task->perf_counter_ctxp; + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!ctx) { + put_task_struct(task); + return ERR_PTR(-ENOMEM); + } + __perf_counter_init_context(ctx, task); + /* + * Make sure other cpus see correct values for *ctx + * once task->perf_counter_ctxp is visible to them. + */ + smp_wmb(); + tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx); + if (tctx) { + /* + * We raced with some other task; use + * the context they set. + */ + kfree(ctx); + ctx = tctx; + } + } + return ctx; } @@ -1242,6 +1311,7 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); + put_ctx(counter->ctx); kfree(counter); } @@ -2247,7 +2317,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) perf_counter_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(¤t->perf_counter_ctx, comm_event); + perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event); } void perf_counter_comm(struct task_struct *task) @@ -2256,7 +2326,9 @@ void perf_counter_comm(struct task_struct *task) if (!atomic_read(&nr_comm_tracking)) return; - + if (!current->perf_counter_ctxp) + return; + comm_event = (struct perf_comm_event){ .task = task, .event = { @@ -2372,7 +2444,7 @@ got_name: perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event); kfree(buf); } @@ -2384,6 +2456,8 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, if (!atomic_read(&nr_mmap_tracking)) return; + if (!current->perf_counter_ctxp) + return; mmap_event = (struct perf_mmap_event){ .file = file, @@ -2985,6 +3059,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; + get_ctx(ctx); counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) @@ -3149,21 +3224,6 @@ err_put_context: goto out_fput; } -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->counter_list); - INIT_LIST_HEAD(&ctx->event_list); - ctx->task = task; -} - /* * inherit a counter from parent task to child task: */ @@ -3195,7 +3255,6 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link it up in the child's context: */ - child_counter->task = child; add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; @@ -3294,40 +3353,15 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter *parent_counter; /* - * If we do not self-reap then we have to wait for the - * child task to unschedule (it will happen for sure), - * so that its counter is at its final count. (This - * condition triggers rarely - child tasks usually get - * off their CPU before the parent has a chance to - * get this far into the reaping action) + * Protect against concurrent operations on child_counter + * due its fd getting closed, etc. */ - if (child != current) { - wait_task_inactive(child, 0); - update_counter_times(child_counter); - list_del_counter(child_counter, child_ctx); - } else { - struct perf_cpu_context *cpuctx; - unsigned long flags; - - /* - * Disable and unlink this counter. - * - * Be careful about zapping the list - IRQ/NMI context - * could still be processing it: - */ - local_irq_save(flags); - perf_disable(); - - cpuctx = &__get_cpu_var(perf_cpu_context); + mutex_lock(&child_counter->mutex); - group_sched_out(child_counter, cpuctx, child_ctx); - update_counter_times(child_counter); + update_counter_times(child_counter); + list_del_counter(child_counter, child_ctx); - list_del_counter(child_counter, child_ctx); - - perf_enable(); - local_irq_restore(flags); - } + mutex_unlock(&child_counter->mutex); parent_counter = child_counter->parent; /* @@ -3346,19 +3380,29 @@ __perf_counter_exit_task(struct task_struct *child, * * Note: we may be running in child context, but the PID is not hashed * anymore so new counters will not be added. + * (XXX not sure that is true when we get called from flush_old_exec. + * -- paulus) */ void perf_counter_exit_task(struct task_struct *child) { struct perf_counter *child_counter, *tmp; struct perf_counter_context *child_ctx; + unsigned long flags; WARN_ON_ONCE(child != current); - child_ctx = &child->perf_counter_ctx; + child_ctx = child->perf_counter_ctxp; - if (likely(!child_ctx->nr_counters)) + if (likely(!child_ctx)) return; + local_irq_save(flags); + __perf_counter_task_sched_out(child_ctx); + child->perf_counter_ctxp = NULL; + local_irq_restore(flags); + + mutex_lock(&child_ctx->mutex); + again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) @@ -3371,6 +3415,10 @@ again: */ if (!list_empty(&child_ctx->counter_list)) goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); } /* @@ -3382,19 +3430,25 @@ void perf_counter_init_task(struct task_struct *child) struct perf_counter *counter; struct task_struct *parent = current; - child_ctx = &child->perf_counter_ctx; - parent_ctx = &parent->perf_counter_ctx; - - __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = NULL; /* * This is executed from the parent task context, so inherit - * counters that have been marked for cloning: + * counters that have been marked for cloning. + * First allocate and initialize a context for the child. */ - if (likely(!parent_ctx->nr_counters)) + child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!child_ctx) + return; + + parent_ctx = parent->perf_counter_ctxp; + if (likely(!parent_ctx || !parent_ctx->nr_counters)) return; + __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = child_ctx; + /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. -- cgit v1.2.3 From 48e22d56ecdeddd1ffb42a02fccba5c6ef42b133 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:04 +0200 Subject: perf_counter: x86: Remove interrupt throttle remove the x86 specific interrupt throttle Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.616671838@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/cpu/perf_counter.c | 47 ++++---------------------------------- include/linux/perf_counter.h | 2 -- 3 files changed, 5 insertions(+), 46 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b4f64402a82a..89b63b5fad33 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -763,8 +763,6 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); - - perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c14437faf5d2..8c8177f859fe 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -718,11 +718,6 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } -/* - * Maximum interrupt frequency of 100KHz per CPU - */ -#define PERFMON_MAX_INTERRUPTS (100000/HZ) - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -775,15 +770,14 @@ again: if (status) goto again; - if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) - perf_enable(); + perf_enable(); return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu, idx, throttle = 0, handled = 0; + int cpu, idx, handled = 0; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; @@ -792,16 +786,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { - throttle = 1; - __perf_disable(); - cpuc->enabled = 0; - barrier(); - } - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - int disable = 0; - if (!test_bit(idx, cpuc->active_mask)) continue; @@ -809,45 +794,23 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) hwc = &counter->hw; if (counter->hw_event.nmi != nmi) - goto next; + continue; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - goto next; + continue; /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - disable = perf_counter_overflow(counter, nmi, regs, 0); - -next: - if (disable || throttle) + if (perf_counter_overflow(counter, nmi, regs, 0)) amd_pmu_disable_counter(hwc, idx); } return handled; } -void perf_counter_unthrottle(void) -{ - struct cpu_hw_counters *cpuc; - - if (!x86_pmu_initialized()) - return; - - cpuc = &__get_cpu_var(cpu_hw_counters); - if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - /* - * Clear them before re-enabling irqs/NMIs again: - */ - cpuc->interrupts = 0; - perf_enable(); - } else { - cpuc->interrupts = 0; - } -} - void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d3e85de9bf1e..0c160be2078f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -570,7 +570,6 @@ extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); -extern void perf_counter_unthrottle(void); extern void __perf_disable(void); extern bool __perf_enable(void); extern void perf_disable(void); @@ -635,7 +634,6 @@ static inline int perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } -static inline void perf_counter_unthrottle(void) { } static inline void perf_disable(void) { } static inline void perf_enable(void) { } static inline int perf_counter_task_disable(void) { return -EINVAL; } -- cgit v1.2.3 From c323d95fa4dbe0b6bf6d59e24a0b7db067dd08a7 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Fri, 29 May 2009 13:28:35 +0800 Subject: perf_counter/x86: Always use NMI for performance-monitoring interrupt Always use NMI for performance-monitoring interrupt as there could be racy situations if we switch between irq and nmi mode frequently. Signed-off-by: Yong Wang LKML-Reference: <20090529052835.GA13657@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 4 ++-- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 19 +++++-------------- 3 files changed, 8 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index d08dd52cb8ff..876ed97147b3 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -91,10 +91,10 @@ extern void set_perf_counter_pending(void); #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); -extern void perf_counters_lapic_init(int nmi); +extern void perf_counters_lapic_init(void); #else static inline void init_hw_perf_counters(void) { } -static inline void perf_counters_lapic_init(int nmi) { } +static inline void perf_counters_lapic_init(void) { } #endif #endif /* _ASM_X86_PERF_COUNTER_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 89b63b5fad33..60df2efd7c80 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1135,7 +1135,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_counters_lapic_init(0); + perf_counters_lapic_init(); preempt_disable(); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2eeaa99add1c..316b0c995f38 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -604,7 +604,7 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } - perf_counters_lapic_init(hwc->nmi); + perf_counters_lapic_init(); x86_pmu.disable(hwc, idx); @@ -863,24 +863,15 @@ void set_perf_counter_pending(void) apic->send_IPI_self(LOCAL_PENDING_VECTOR); } -void perf_counters_lapic_init(int nmi) +void perf_counters_lapic_init(void) { - u32 apic_val; - if (!x86_pmu_initialized()) return; /* - * Enable the performance counter vector in the APIC LVT: + * Always use NMI for PMU */ - apic_val = apic_read(APIC_LVTERR); - - apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED); - if (nmi) - apic_write(APIC_LVTPC, APIC_DM_NMI); - else - apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); - apic_write(APIC_LVTERR, apic_val); + apic_write(APIC_LVTPC, APIC_DM_NMI); } static int __kprobes @@ -1054,7 +1045,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(0); + perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.2.3 From 58f892e022e88438183c48661dcdc6a2997dab99 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Tue, 26 May 2009 21:48:07 +0000 Subject: x86: Print real IOAPIC version for x86-64 Fix the fact that the IOAPIC version number in the x86_64 code path always gets assigned to 0, instead of the correct value. Before the patch: (from "dmesg" output): ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0]) IOAPIC[0]: apic_id 8, version 0, address 0xfec00000, GSI 0-23 <--- After the patch: ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0]) IOAPIC[0]: apic_id 8, version 32, address 0xfec00000, GSI 0-23 <--- History: io_apic_get_version() was compiled out of the x86_64 code path in the commit f2c2cca3acef8b253a36381d9b469ad4fb08563a: Author: Andi Kleen Date: Tue Sep 26 10:52:37 2006 +0200 [PATCH] Remove APIC version/cpu capability mpparse checking/printing ACPI went to great trouble to get the APIC version and CPU capabilities of different CPUs before passing them to the mpparser. But all that data was used was to print it out. Actually it even faked some data based on the boot cpu, not on the actual CPU being booted. Remove all this code because it's not needed. Cc: len.brown@intel.com At the time, the IOAPIC version number was deliberately not printed in the x86_64 code path. However, after the x86 and x86_64 files were merged, the net result is that the IOAPIC version is printed incorrectly in the x86_64 code path. The patch below provides a fix. I have tested it with acpi, and with acpi=off, and did not see any problems. Signed-off-by: Naga Chumbalkar Acked-by: Yinghai Lu LKML-Reference: <20090416014230.4885.94926.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar ************************* --- arch/x86/kernel/acpi/boot.c | 5 +---- arch/x86/kernel/apic/io_apic.c | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 844e5e25213b..631086159c53 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -985,11 +985,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); mp_ioapics[idx].apicid = uniq_ioapic_id(id); -#ifdef CONFIG_X86_32 mp_ioapics[idx].apicver = io_apic_get_version(idx); -#else - mp_ioapics[idx].apicver = 0; -#endif + /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ac7f3b6ad583..f712f8ff403c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4012,6 +4012,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } +#endif int __init io_apic_get_version(int ioapic) { @@ -4024,7 +4025,6 @@ int __init io_apic_get_version(int ioapic) return reg_01.bits.version; } -#endif int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { -- cgit v1.2.3 From 3d58829b0510244596079c1d2f1762c53aef2e97 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 28 May 2009 09:54:47 +0200 Subject: x86, apic: Restore irqs on fail paths lapic_resume forgets to restore interrupts on fail paths. Fix that. Signed-off-by: Jiri Slaby Acked-by: Cyrill Gorcunov LKML-Reference: <1243497289-18591-1-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b0fd26442c41..e82488d3f0ba 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2027,7 +2027,7 @@ static int lapic_resume(struct sys_device *dev) unsigned int l, h; unsigned long flags; int maxlvt; - int ret; + int ret = 0; struct IO_APIC_route_entry **ioapic_entries = NULL; if (!apic_pm_state.active) @@ -2038,14 +2038,15 @@ static int lapic_resume(struct sys_device *dev) ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); - return -ENOMEM; + ret = -ENOMEM; + goto restore; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { WARN(1, "Saving IO-APIC state failed: %d\n", ret); free_ioapic_entries(ioapic_entries); - return ret; + goto restore; } mask_IO_APIC_setup(ioapic_entries); @@ -2097,10 +2098,10 @@ static int lapic_resume(struct sys_device *dev) restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } - +restore: local_irq_restore(flags); - return 0; + return ret; } /* -- cgit v1.2.3 From 103428e57be323c3c5545db8ad12667099bc6005 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 7 Jun 2009 16:48:40 +0400 Subject: x86, apic: Fix dummy apic read operation together with broken MP handling Ingo Molnar reported that read_apic is buggy novadays: [ 0.000000] Using APIC driver default [ 0.000000] SMP: Allowing 1 CPUs, 0 hotplug CPUs [ 0.000000] Local APIC disabled by BIOS -- you can enable it with "lapic" [ 0.000000] APIC: disable apic facility [ 0.000000] ------------[ cut here ]------------ [ 0.000000] WARNING: at arch/x86/kernel/apic/apic.c:254 native_apic_read_dummy+0x2d/0x3b() [ 0.000000] Hardware name: HP OmniBook PC Indeed we still rely on apic->read operation for SMP compiled kernel. And instead of disfigure the SMP code with #ifdef we allow to call apic->read. To capture any unexpected results we check for apic->read being called for sane reason via WARN_ON_ONCE but(!) instead of OR we should use AND logical operation (thanks Yinghai for spotting the root of the problem). Along with that we could be have bad MP table and we are to fix it that way no SMP started and no complains about BIOS bug if apic was just disabled via command line. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090607124840.GD4547@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 9 ++++++++- arch/x86/kernel/smpboot.c | 8 +++++--- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e82488d3f0ba..a4c9cf0bf70b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -249,7 +249,7 @@ static void native_apic_write_dummy(u32 reg, u32 v) static u32 native_apic_read_dummy(u32 reg) { - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); return 0; } @@ -1609,6 +1609,13 @@ void __init init_apic_mappings(void) new_apicid = read_apic_id(); if (boot_cpu_physical_apicid != new_apicid) { boot_cpu_physical_apicid = new_apicid; + /* + * yeah -- we lie about apic_version + * in case if apic was disabled via boot option + * but it's not a problem for SMP compiled kernel + * since smp_sanity_check is prepared for such a case + * and disable smp mode + */ apic_version[new_apicid] = GET_APIC_VERSION(apic_read(APIC_LVR)); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d2e8de958156..7c80007ea5f7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -992,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - printk(KERN_ERR "... forcing use of dummy APIC emulation." + if (!disable_apic) { + pr_err("BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + pr_err("... forcing use of dummy APIC emulation." "(tell your hw vendor)\n"); + } smpboot_clear_io_apic(); arch_disable_smp_support(); return -1; -- cgit v1.2.3 From c4ed3f04ba9defe22aa729d1646f970f791c03d7 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 8 Jun 2009 10:44:05 -0500 Subject: x86, UV: Fix macros for multiple coherency domains Fix bug in the SGI UV macros that support systems with multiple coherency domains. The macros used for referencing global MMR (chipset registers) are failing to correctly "or" the NASID (node identifier) bits that reside above M+N. These high bits are supplied automatically by the chipset for memory accesses coming from the processor socket. However, the bits must be present for references to the special global MMR space used to map chipset registers. (See uv_hub.h for more details ...) The bug results in references to invalid/incorrect nodes. Signed-off-by: Jack Steiner Cc: LKML-Reference: <20090608154405.GA16395@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 6 ++++-- arch/x86/kernel/apic/x2apic_uv_x.c | 15 +++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index d3a98ea1062e..341070f7ad5c 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -133,6 +133,7 @@ struct uv_scir_s { struct uv_hub_info_s { unsigned long global_mmr_base; unsigned long gpa_mask; + unsigned int gnode_extra; unsigned long gnode_upper; unsigned long lowmem_remap_top; unsigned long lowmem_remap_base; @@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); * p - PNODE (local part of nsids, right shifted 1) */ #define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) -#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper) +#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) +#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1) #define UV_LOCAL_MMR_BASE 0xf4000000UL #define UV_GLOBAL_MMR32_BASE 0xf8000000UL @@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) #define UV_GLOBAL_MMR64_PNODE_BITS(p) \ - ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT) + ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) #define UV_APIC_PNODE_SHIFT 6 diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2bda69352976..39f2af4b5467 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -562,7 +562,7 @@ void __init uv_system_init(void) union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; - int max_pnode = 0; + int gnode_extra, max_pnode = 0; unsigned long mmr_base, present, paddr; unsigned short pnode_mask; @@ -574,6 +574,13 @@ void __init uv_system_init(void) mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + pnode_mask = (1 << n_val) - 1; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; + gnode_upper = ((unsigned long)gnode_extra << m_val); + printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", + n_val, m_val, gnode_upper, gnode_extra); + printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) @@ -607,11 +614,6 @@ void __init uv_system_init(void) } } - pnode_mask = (1 << n_val) - 1; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); - gnode_upper = (((unsigned long)node_id.s.node_id) & - ~((1 << n_val) - 1)) << m_val; - uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size); @@ -634,6 +636,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; + uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; -- cgit v1.2.3 From 38c7fed2f5ffee17e1fa3e0f78b0e1bf43d52d13 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 25 May 2009 15:10:58 +0300 Subject: x86: remove some alloc_bootmem_cpumask_var calling Now that we set up the slab allocator earlier, we can get rid of some alloc_bootmem_cpumask_var() calls in boot code. Cc: Ingo Molnar Cc: Johannes Weiner Cc: Linus Torvalds Signed-off-by: Yinghai Lu Signed-off-by: Pekka Enberg --- arch/x86/kernel/apic/io_apic.c | 4 ++-- include/linux/irq.h | 18 +++++++----------- kernel/cpuset.c | 2 +- kernel/profile.c | 6 ------ lib/cpumask.c | 11 ++--------- 5 files changed, 12 insertions(+), 29 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1946fac42ab3..139201a562ba 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -185,8 +185,8 @@ int __init arch_early_irq_init(void) for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - alloc_bootmem_cpumask_var(&cfg[i].domain); - alloc_bootmem_cpumask_var(&cfg[i].old_domain); + alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT); + alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } diff --git a/include/linux/irq.h b/include/linux/irq.h index eedbb8e5e0cc..1e50c34f0062 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -430,23 +430,19 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); * Returns true if successful (or not required). */ static inline bool alloc_desc_masks(struct irq_desc *desc, int node, - bool boot) + bool boot) { -#ifdef CONFIG_CPUMASK_OFFSTACK - if (boot) { - alloc_bootmem_cpumask_var(&desc->affinity); + gfp_t gfp = GFP_ATOMIC; -#ifdef CONFIG_GENERIC_PENDING_IRQ - alloc_bootmem_cpumask_var(&desc->pending_mask); -#endif - return true; - } + if (boot) + gfp = GFP_NOWAIT; - if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node)) +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!alloc_cpumask_var_node(&desc->affinity, gfp, node)) return false; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) { + if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { free_cpumask_var(desc->affinity); return false; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 026faccca869..d5a7e17474ee 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = { int __init cpuset_init_early(void) { - alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); + alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT); top_cpuset.mems_generation = cpuset_mems_generation++; return 0; diff --git a/kernel/profile.c b/kernel/profile.c index 7724e0409bae..28cf26ad2d24 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -111,12 +111,6 @@ int __ref profile_init(void) /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; buffer_bytes = prof_len*sizeof(atomic_t); - if (!slab_is_available()) { - prof_buffer = alloc_bootmem(buffer_bytes); - alloc_bootmem_cpumask_var(&prof_cpu_mask); - cpumask_copy(prof_cpu_mask, cpu_possible_mask); - return 0; - } if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) return -ENOMEM; diff --git a/lib/cpumask.c b/lib/cpumask.c index eb23aaa0c7b8..7bb4142a502f 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -92,15 +92,8 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) */ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { - if (likely(slab_is_available())) - *mask = kmalloc_node(cpumask_size(), flags, node); - else { -#ifdef CONFIG_DEBUG_PER_CPU_MAPS - printk(KERN_ERR - "=> alloc_cpumask_var: kmalloc not available!\n"); -#endif - *mask = NULL; - } + *mask = kmalloc_node(cpumask_size(), flags, node); + #ifdef CONFIG_DEBUG_PER_CPU_MAPS if (!*mask) { printk(KERN_ERR "=> alloc_cpumask_var: failed!\n"); -- cgit v1.2.3 From dad213aeb59718623fc59defeff95fe8c3feb8a0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 28 May 2009 18:14:40 -0700 Subject: irq/cpumask: make memoryless node zero happy Don't hardcode to node zero for early boot IRQ setup memory allocations. [ penberg@cs.helsinki.fi: minor cleanups ] Cc: Ingo Molnar Cc: Johannes Weiner Cc: Linus Torvalds Signed-off-by: Yinghai Lu Signed-off-by: Pekka Enberg --- arch/x86/kernel/apic/io_apic.c | 6 ++++-- kernel/irq/handle.c | 9 +++++---- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 139201a562ba..94605e7f6a54 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -177,16 +177,18 @@ int __init arch_early_irq_init(void) struct irq_cfg *cfg; struct irq_desc *desc; int count; + int node; int i; cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); + node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT); - alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT); + alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); + alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a60018402f42..e161999b6683 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -150,6 +150,7 @@ int __init early_irq_init(void) { struct irq_desc *desc; int legacy_count; + int node; int i; init_irq_default_affinity(); @@ -160,20 +161,20 @@ int __init early_irq_init(void) desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); + node = first_online_node; /* allocate irq_desc_ptrs array based on nr_irqs */ irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); /* allocate based on nr_cpu_ids */ - /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ - kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * - sizeof(int)); + kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * + sizeof(int), GFP_NOWAIT, node); for (i = 0; i < legacy_count; i++) { desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], node, true); init_desc_masks(&desc[i]); irq_desc_ptrs[i] = desc + i; } -- cgit v1.2.3