From 9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 13:49:45 +0100 Subject: perfcounters: implement "counter inheritance" Impact: implement new performance feature Counter inheritance can be used to run performance counters in a workload, transparently - and pipe back the counter results to the parent counter. Inheritance for performance counters works the following way: when creating a counter it can be marked with the .inherit=1 flag. Such counters are then 'inherited' by all child tasks (be they fork()-ed or clone()-ed). These counters get inherited through exec() boundaries as well (except through setuid boundaries). The counter values get added back to the parent counter(s) when the child task(s) exit - much like stime/utime statistics are gathered. So inherited counters are ideal to gather summary statistics about an application's behavior via shell commands, without having to modify that application. The timec.c command utilizes counter inheritance: http://redhat.com/~mingo/perfcounters/timec.c Sample output: $ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null Performance counter stats for 'ls': 163516953 instructions 2295 cache-misses 2855182 branch-misses Signed-off-by: Ingo Molnar --- kernel/exit.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 2d8be7ebb0f7..d336c90a5f13 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif -#ifdef CONFIG_FUTEX /* - * This must happen late, after the PID is not - * hashed anymore: + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may sleep: */ + perf_counter_exit_task(tsk); +#ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); if (unlikely(current->pi_state_cache)) -- cgit v1.2.3 From aa9c4c0f967fdb482ea95e8473ec3d201e6e0781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:10:57 +0100 Subject: perfcounters: fix task clock counter Impact: fix per task clock counter precision Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 8 ++++++ kernel/exit.c | 17 +++++++---- kernel/perf_counter.c | 70 ++++++++++++++++++++++++++++++++++----------- kernel/sched.c | 49 +++++++++++++++++++++++++++++-- 4 files changed, 120 insertions(+), 24 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 4a145caeee07..1b2e3242497c 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -66,7 +66,15 @@ static inline unsigned int kstat_irqs(unsigned int irq) return sum; } + +/* + * Lock/unlock the current runqueue - to extract task statistics: + */ +extern void curr_rq_lock_irq_save(unsigned long *flags); +extern void curr_rq_unlock_irq_restore(unsigned long *flags); +extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); + extern void account_user_time(struct task_struct *, cputime_t); extern void account_user_time_scaled(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); diff --git a/kernel/exit.c b/kernel/exit.c index d336c90a5f13..244edfd96865 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -922,6 +922,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1093,11 +1099,6 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); #ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); @@ -1121,6 +1122,12 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + /* + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may sleep: + */ + perf_counter_exit_task(tsk); + preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 961d651aa574..f1110ac1267b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* @@ -106,7 +107,8 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->disable(counter); @@ -135,7 +137,8 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); } @@ -209,7 +212,8 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the @@ -232,7 +236,8 @@ static void __perf_install_in_context(void *info) if (!ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); } /* @@ -438,15 +443,19 @@ int perf_counter_task_disable(void) struct task_struct *curr = current; struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + unsigned long flags; u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - local_irq_disable(); + curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); + /* force the update of the task clock: */ + __task_delta_exec(curr, 1); + perf_counter_task_sched_out(curr, cpu); spin_lock(&ctx->lock); @@ -463,7 +472,7 @@ int perf_counter_task_disable(void) spin_unlock(&ctx->lock); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); return 0; } @@ -473,15 +482,19 @@ int perf_counter_task_enable(void) struct task_struct *curr = current; struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + unsigned long flags; u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - local_irq_disable(); + curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); + /* force the update of the task clock: */ + __task_delta_exec(curr, 1); + spin_lock(&ctx->lock); /* @@ -493,6 +506,7 @@ int perf_counter_task_enable(void) if (counter->state != PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -500,7 +514,7 @@ int perf_counter_task_enable(void) perf_counter_task_sched_in(curr, cpu); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); return 0; } @@ -540,8 +554,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) static void __read(void *info) { struct perf_counter *counter = info; + unsigned long flags; + curr_rq_lock_irq_save(&flags); counter->hw_ops->read(counter); + curr_rq_unlock_irq_restore(&flags); } static u64 perf_counter_read(struct perf_counter *counter) @@ -860,13 +877,27 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .read = cpu_clock_perf_counter_read, }; -static void task_clock_perf_counter_update(struct perf_counter *counter) +/* + * Called from within the scheduler: + */ +static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) { - u64 prev, now; + struct task_struct *curr = counter->task; + u64 delta; + + WARN_ON_ONCE(counter->task != current); + + delta = __task_delta_exec(curr, update); + + return curr->se.sum_exec_runtime + delta; +} + +static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) +{ + u64 prev; s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = current->se.sum_exec_runtime; atomic64_set(&counter->hw.prev_count, now); @@ -877,17 +908,23 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { - task_clock_perf_counter_update(counter); + u64 now = task_clock_perf_counter_val(counter, 1); + + task_clock_perf_counter_update(counter, now); } static void task_clock_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime); + u64 now = task_clock_perf_counter_val(counter, 0); + + atomic64_set(&counter->hw.prev_count, now); } static void task_clock_perf_counter_disable(struct perf_counter *counter) { - task_clock_perf_counter_update(counter); + u64 now = task_clock_perf_counter_val(counter, 0); + + task_clock_perf_counter_update(counter, now); } static const struct hw_perf_counter_ops perf_ops_task_clock = { @@ -1267,6 +1304,7 @@ __perf_counter_exit_task(struct task_struct *child, { struct perf_counter *parent_counter; u64 parent_val, child_val; + unsigned long flags; u64 perf_flags; /* @@ -1275,7 +1313,7 @@ __perf_counter_exit_task(struct task_struct *child, * Be careful about zapping the list - IRQ/NMI context * could still be processing it: */ - local_irq_disable(); + curr_rq_lock_irq_save(&flags); perf_flags = hw_perf_save_disable(); if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { @@ -1294,7 +1332,7 @@ __perf_counter_exit_task(struct task_struct *child, list_del_init(&child_counter->list_entry); hw_perf_restore(perf_flags); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); parent_counter = child_counter->parent; /* diff --git a/kernel/sched.c b/kernel/sched.c index 382cfdb5e38d..4d84ff4c8774 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -638,7 +638,7 @@ static inline int cpu_of(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -static inline void update_rq_clock(struct rq *rq) +inline void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_cpu(cpu_of(rq)); } @@ -969,6 +969,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } +void curr_rq_lock_irq_save(unsigned long *flags) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_save(*flags); + rq = cpu_rq(smp_processor_id()); + spin_lock(&rq->lock); +} + +void curr_rq_unlock_irq_restore(unsigned long *flags) + __releases(rq->lock) +{ + struct rq *rq; + + rq = cpu_rq(smp_processor_id()); + spin_unlock(&rq->lock); + local_irq_restore(*flags); +} + void task_rq_unlock_wait(struct task_struct *p) { struct rq *rq = task_rq(p); @@ -2558,7 +2578,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { fire_sched_out_preempt_notifiers(prev, next); - perf_counter_task_sched_out(prev, cpu_of(rq)); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -4089,6 +4108,29 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); +/* + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. + */ +unsigned long long __task_delta_exec(struct task_struct *p, int update) +{ + s64 delta_exec; + struct rq *rq; + + rq = task_rq(p); + WARN_ON_ONCE(!runqueue_is_locked()); + WARN_ON_ONCE(!task_current(rq, p)); + + if (update) + update_rq_clock(rq); + + delta_exec = rq->clock - p->se.exec_start; + + WARN_ON_ONCE(delta_exec < 0); + + return delta_exec; +} + /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. @@ -4316,13 +4358,13 @@ void scheduler_tick(void) update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); + perf_counter_task_tick(curr, cpu); spin_unlock(&rq->lock); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif - perf_counter_task_tick(curr, cpu); } #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ @@ -4512,6 +4554,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); + perf_counter_task_sched_out(prev, cpu); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3 From eef6cbf5844c620d9db9be99e4908cdf92492fb9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 10:20:42 +0100 Subject: perfcounters: pull inherited counters Change counter inheritance from a 'push' to a 'pull' model: instead of child tasks pushing their final counts to the parent, reuse the wait4 infrastructure to pull counters as child tasks are exit-processed, much like how cutime/cstime is collected. Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 9 +++++++++ kernel/exit.c | 21 +++++++++------------ 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd8909b9e5..54fa2fa2c8e4 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -113,6 +113,14 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_INIT_EFF_SET #endif +#ifdef CONFIG_PERF_COUNTERS +# define INIT_PERF_COUNTERS(tsk) \ + .perf_counter_ctx.counter_list = \ + LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), +#else +# define INIT_PERF_COUNTERS(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -180,6 +188,7 @@ extern struct group_info init_groups; INIT_IDS \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ + INIT_PERF_COUNTERS(tsk) \ } diff --git a/kernel/exit.c b/kernel/exit.c index 244edfd96865..101b7eeff44c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -153,6 +153,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); +#ifdef CONFIG_PERF_COUNTERS + WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); +#endif trace_sched_process_free(tsk); put_task_struct(tsk); } @@ -922,12 +925,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(tsk); - write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1122,12 +1119,6 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); - preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; @@ -1371,6 +1362,12 @@ static int wait_task_zombie(struct task_struct *p, int options, */ read_unlock(&tasklist_lock); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(p); + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; -- cgit v1.2.3 From 856d56b9e5de650a64a6c41c17aaed702b55d578 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 20:45:59 +0200 Subject: perf_counter: Fix counter inheritance Srivatsa Vaddagiri reported that a Java workload triggers this warning in kernel/exit.c: WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); Add the inherited counter propagation on self-detach, this could cause counter leaks and incomplete stats in threaded code like the below: #include #include void *thread(void *arg) { sleep(5); return NULL; } void main(void) { pthread_t thr; pthread_create(&thr, NULL, thread, NULL); } Reported-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- kernel/exit.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 4741376c8dec..16d74f13a3e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -128,6 +128,12 @@ static void __exit_signal(struct task_struct *tsk) sig = NULL; /* Marker for below. */ } + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + __unhash_process(tsk); /* -- cgit v1.2.3 From 0203026b58b4299ba7281c0b4b417207c1f05d0e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:24:08 +0200 Subject: perf_counter: fix threaded task exit Flushing counters in __exit_signal() with irqs disabled is not a good idea as perf_counter_exit_task() acquires mutexes. So flush it before acquiring the tasklist lock. (Note, we still need a fix for when the PID has been unhashed.) [ Impact: fix crash with inherited counters ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- kernel/exit.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 16d74f13a3e7..73affd35e76d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -128,12 +128,6 @@ static void __exit_signal(struct task_struct *tsk) sig = NULL; /* Marker for below. */ } - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(tsk); - __unhash_process(tsk); /* @@ -183,6 +177,13 @@ repeat: atomic_dec(&__task_cred(p)->user->processes); proc_flush_task(p); + + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(p); + write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); __exit_signal(p); -- cgit v1.2.3 From 33b2fb303fe7f6b08bbb32f708e67b96eaa94a7a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:08:41 +0200 Subject: perf_counter: fix counter freeing logic Fix counter lifetime bugs which explain the crashes reported by Marcelo Tosatti and Arnaldo Carvalho de Melo. The new rule is: flushing + freeing is only done for a task's own counters, never for other tasks. [ Impact: fix crashes/lockups with inherited counters ] Reported-by: Arnaldo Carvalho de Melo Reported-by: Marcelo Tosatti Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Signed-off-by: Ingo Molnar --- kernel/exit.c | 19 +++++++------------ kernel/perf_counter.c | 2 ++ 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 73affd35e76d..f9dfedd94af0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -178,12 +178,6 @@ repeat: proc_flush_task(p); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(p); - write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); __exit_signal(p); @@ -985,6 +979,13 @@ NORET_TYPE void do_exit(long code) module_put(tsk->binfmt->module); proc_exit_connector(tsk); + + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA mpol_put(tsk->mempolicy); @@ -1257,12 +1258,6 @@ static int wait_task_zombie(struct task_struct *p, int options, */ read_unlock(&tasklist_lock); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(p); - retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 59a926d04baf..7af16d1c480f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3299,6 +3299,8 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter *child_counter, *tmp; struct perf_counter_context *child_ctx; + WARN_ON_ONCE(child != current); + child_ctx = &child->perf_counter_ctx; if (likely(!child_ctx->nr_counters)) -- cgit v1.2.3 From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing is this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 + include/linux/init_task.h | 13 --- include/linux/perf_counter.h | 4 +- include/linux/sched.h | 6 +- kernel/exit.c | 3 +- kernel/fork.c | 1 + kernel/perf_counter.c | 218 +++++++++++++++++++++++++++---------------- 7 files changed, 145 insertions(+), 101 deletions(-) (limited to 'kernel/exit.c') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e9021a908020..b4f64402a82a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ +#include #include #include #include diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 503afaa0afa7..d87247d2641f 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -108,18 +108,6 @@ extern struct group_info init_groups; extern struct cred init_cred; -#ifdef CONFIG_PERF_COUNTERS -# define INIT_PERF_COUNTERS(tsk) \ - .perf_counter_ctx.counter_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ - .perf_counter_ctx.event_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ - .perf_counter_ctx.lock = \ - __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), -#else -# define INIT_PERF_COUNTERS(tsk) -#endif - /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -183,7 +171,6 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ - INIT_PERF_COUNTERS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f612941ef46e..071309005468 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -449,7 +449,6 @@ struct perf_counter { struct hw_perf_counter hw; struct perf_counter_context *ctx; - struct task_struct *task; struct file *filp; struct perf_counter *parent; @@ -498,7 +497,6 @@ struct perf_counter { * Used as a container for task counters and CPU counters as well: */ struct perf_counter_context { -#ifdef CONFIG_PERF_COUNTERS /* * Protect the states of the counters in the list, * nr_active, and the list: @@ -516,6 +514,7 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; + atomic_t refcount; struct task_struct *task; /* @@ -523,7 +522,6 @@ struct perf_counter_context { */ u64 time; u64 timestamp; -#endif }; /** diff --git a/include/linux/sched.h b/include/linux/sched.h index ff59d1231519..9714d450f417 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,7 +71,6 @@ struct sched_param { #include #include #include -#include #include #include #include @@ -99,6 +98,7 @@ struct robust_list_head; struct bio; struct bts_tracer; struct fs_struct; +struct perf_counter_context; /* * List of flags we want to share for kernel threads, @@ -1387,7 +1387,9 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif - struct perf_counter_context perf_counter_ctx; +#ifdef CONFIG_PERF_COUNTERS + struct perf_counter_context *perf_counter_ctxp; +#endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; diff --git a/kernel/exit.c b/kernel/exit.c index f9dfedd94af0..99ad4063ee4a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -159,7 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); #ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); + WARN_ON_ONCE(tsk->perf_counter_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index d32fef4d38e5..e72a09f5355b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 08584c16049f..06ea3eae886e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -97,6 +97,17 @@ void perf_enable(void) hw_perf_enable(); } +static void get_ctx(struct perf_counter_context *ctx) +{ + atomic_inc(&ctx->refcount); +} + +static void put_ctx(struct perf_counter_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) + kfree(ctx); +} + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -118,11 +129,17 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) ctx->nr_counters++; } +/* + * Remove a counter from the lists for its context. + * Must be called with counter->mutex and ctx->mutex held. + */ static void list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { struct perf_counter *sibling, *tmp; + if (list_empty(&counter->list_entry)) + return; ctx->nr_counters--; list_del_init(&counter->list_entry); @@ -216,8 +233,6 @@ static void __perf_counter_remove_from_context(void *info) counter_sched_out(counter, cpuctx, ctx); - counter->task = NULL; - list_del_counter(counter, ctx); if (!ctx->task) { @@ -279,7 +294,6 @@ retry: */ if (!list_empty(&counter->list_entry)) { list_del_counter(counter, ctx); - counter->task = NULL; } spin_unlock_irq(&ctx->lock); } @@ -568,11 +582,17 @@ static void __perf_install_in_context(void *info) * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no counters. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); /* @@ -653,7 +673,6 @@ perf_install_in_context(struct perf_counter_context *ctx, return; } - counter->task = task; retry: task_oncpu_function_call(task, __perf_install_in_context, counter); @@ -693,10 +712,14 @@ static void __perf_counter_enable(void *info) * If this is a per-task counter, need to check whether this * counter's task is the current task on this cpu. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); counter->prev_state = counter->state; @@ -852,10 +875,10 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; struct pt_regs *regs; - if (likely(!cpuctx->task_ctx)) + if (likely(!ctx || !cpuctx->task_ctx)) return; update_context_time(ctx); @@ -871,6 +894,8 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + if (!cpuctx->task_ctx) + return; __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } @@ -969,8 +994,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, void perf_counter_task_sched_in(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; + if (likely(!ctx)) + return; __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } @@ -985,11 +1012,11 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) int perf_counter_task_disable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1020,12 +1047,12 @@ int perf_counter_task_disable(void) int perf_counter_task_enable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; int cpu; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1128,19 +1155,23 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) return; cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &curr->perf_counter_ctx; + ctx = curr->perf_counter_ctxp; perf_adjust_freq(&cpuctx->ctx); - perf_adjust_freq(ctx); + if (ctx) + perf_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); - __perf_counter_task_sched_out(ctx); + if (ctx) + __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - rotate_ctx(ctx); + if (ctx) + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); - perf_counter_task_sched_in(curr, cpu); + if (ctx) + perf_counter_task_sched_in(curr, cpu); } /* @@ -1176,6 +1207,22 @@ static u64 perf_counter_read(struct perf_counter *counter) return atomic64_read(&counter->count); } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + static void put_context(struct perf_counter_context *ctx) { if (ctx->task) @@ -1186,6 +1233,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; + struct perf_counter_context *tctx; struct task_struct *task; /* @@ -1225,15 +1273,36 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!task) return ERR_PTR(-ESRCH); - ctx = &task->perf_counter_ctx; - ctx->task = task; - /* Reuse ptrace permission checks for now. */ if (!ptrace_may_access(task, PTRACE_MODE_READ)) { - put_context(ctx); + put_task_struct(task); return ERR_PTR(-EACCES); } + ctx = task->perf_counter_ctxp; + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!ctx) { + put_task_struct(task); + return ERR_PTR(-ENOMEM); + } + __perf_counter_init_context(ctx, task); + /* + * Make sure other cpus see correct values for *ctx + * once task->perf_counter_ctxp is visible to them. + */ + smp_wmb(); + tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx); + if (tctx) { + /* + * We raced with some other task; use + * the context they set. + */ + kfree(ctx); + ctx = tctx; + } + } + return ctx; } @@ -1242,6 +1311,7 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); + put_ctx(counter->ctx); kfree(counter); } @@ -2247,7 +2317,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) perf_counter_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(¤t->perf_counter_ctx, comm_event); + perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event); } void perf_counter_comm(struct task_struct *task) @@ -2256,7 +2326,9 @@ void perf_counter_comm(struct task_struct *task) if (!atomic_read(&nr_comm_tracking)) return; - + if (!current->perf_counter_ctxp) + return; + comm_event = (struct perf_comm_event){ .task = task, .event = { @@ -2372,7 +2444,7 @@ got_name: perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event); kfree(buf); } @@ -2384,6 +2456,8 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, if (!atomic_read(&nr_mmap_tracking)) return; + if (!current->perf_counter_ctxp) + return; mmap_event = (struct perf_mmap_event){ .file = file, @@ -2985,6 +3059,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; + get_ctx(ctx); counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) @@ -3149,21 +3224,6 @@ err_put_context: goto out_fput; } -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->counter_list); - INIT_LIST_HEAD(&ctx->event_list); - ctx->task = task; -} - /* * inherit a counter from parent task to child task: */ @@ -3195,7 +3255,6 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link it up in the child's context: */ - child_counter->task = child; add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; @@ -3294,40 +3353,15 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter *parent_counter; /* - * If we do not self-reap then we have to wait for the - * child task to unschedule (it will happen for sure), - * so that its counter is at its final count. (This - * condition triggers rarely - child tasks usually get - * off their CPU before the parent has a chance to - * get this far into the reaping action) + * Protect against concurrent operations on child_counter + * due its fd getting closed, etc. */ - if (child != current) { - wait_task_inactive(child, 0); - update_counter_times(child_counter); - list_del_counter(child_counter, child_ctx); - } else { - struct perf_cpu_context *cpuctx; - unsigned long flags; - - /* - * Disable and unlink this counter. - * - * Be careful about zapping the list - IRQ/NMI context - * could still be processing it: - */ - local_irq_save(flags); - perf_disable(); - - cpuctx = &__get_cpu_var(perf_cpu_context); + mutex_lock(&child_counter->mutex); - group_sched_out(child_counter, cpuctx, child_ctx); - update_counter_times(child_counter); + update_counter_times(child_counter); + list_del_counter(child_counter, child_ctx); - list_del_counter(child_counter, child_ctx); - - perf_enable(); - local_irq_restore(flags); - } + mutex_unlock(&child_counter->mutex); parent_counter = child_counter->parent; /* @@ -3346,19 +3380,29 @@ __perf_counter_exit_task(struct task_struct *child, * * Note: we may be running in child context, but the PID is not hashed * anymore so new counters will not be added. + * (XXX not sure that is true when we get called from flush_old_exec. + * -- paulus) */ void perf_counter_exit_task(struct task_struct *child) { struct perf_counter *child_counter, *tmp; struct perf_counter_context *child_ctx; + unsigned long flags; WARN_ON_ONCE(child != current); - child_ctx = &child->perf_counter_ctx; + child_ctx = child->perf_counter_ctxp; - if (likely(!child_ctx->nr_counters)) + if (likely(!child_ctx)) return; + local_irq_save(flags); + __perf_counter_task_sched_out(child_ctx); + child->perf_counter_ctxp = NULL; + local_irq_restore(flags); + + mutex_lock(&child_ctx->mutex); + again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) @@ -3371,6 +3415,10 @@ again: */ if (!list_empty(&child_ctx->counter_list)) goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); } /* @@ -3382,19 +3430,25 @@ void perf_counter_init_task(struct task_struct *child) struct perf_counter *counter; struct task_struct *parent = current; - child_ctx = &child->perf_counter_ctx; - parent_ctx = &parent->perf_counter_ctx; - - __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = NULL; /* * This is executed from the parent task context, so inherit - * counters that have been marked for cloning: + * counters that have been marked for cloning. + * First allocate and initialize a context for the child. */ - if (likely(!parent_ctx->nr_counters)) + child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!child_ctx) + return; + + parent_ctx = parent->perf_counter_ctxp; + if (likely(!parent_ctx || !parent_ctx->nr_counters)) return; + __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = child_ctx; + /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. -- cgit v1.2.3