Diffstat (limited to 'kernel/events/core.c'):
 kernel/events/core.c | 1083 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 686 insertions(+), 397 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 823aa0824916..0bb21659e252 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -55,6 +55,7 @@
#include <linux/pgtable.h>
#include <linux/buildid.h>
#include <linux/task_work.h>
+#include <linux/percpu-rwsem.h>
#include "internal.h"
@@ -452,8 +453,8 @@ static struct kmem_cache *perf_event_cache;
*/
int sysctl_perf_event_paranoid __read_mostly = 2;
-/* Minimum for 512 kiB + 1 user control page */
-int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
+/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */
+static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
/*
* max perf event sample rate
@@ -463,6 +464,7 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
#define DEFAULT_CPU_TIME_MAX_PERCENT 25
int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
@@ -484,7 +486,7 @@ static void update_perf_cpu_limits(void)
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
-int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
+static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -506,9 +508,7 @@ int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
return 0;
}
-int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
-
-int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
+static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -528,6 +528,52 @@ int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
return 0;
}
+static const struct ctl_table events_core_sysctl_table[] = {
+ /*
+ * User-space relies on this file as a feature check for
+ * perf_events being enabled. It's an ABI, do not remove!
+ */
+ {
+ .procname = "perf_event_paranoid",
+ .data = &sysctl_perf_event_paranoid,
+ .maxlen = sizeof(sysctl_perf_event_paranoid),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "perf_event_mlock_kb",
+ .data = &sysctl_perf_event_mlock,
+ .maxlen = sizeof(sysctl_perf_event_mlock),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "perf_event_max_sample_rate",
+ .data = &sysctl_perf_event_sample_rate,
+ .maxlen = sizeof(sysctl_perf_event_sample_rate),
+ .mode = 0644,
+ .proc_handler = perf_event_max_sample_rate_handler,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "perf_cpu_time_max_percent",
+ .data = &sysctl_perf_cpu_time_max_percent,
+ .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
+ .mode = 0644,
+ .proc_handler = perf_cpu_time_max_percent_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+};
+
+static int __init init_events_core_sysctls(void)
+{
+ register_sysctl_init("kernel", events_core_sysctl_table);
+ return 0;
+}
+core_initcall(init_events_core_sysctls);
+
+
/*
* perf samples are done in some very critical code paths (NMIs).
* If they take too much CPU time, the system can lock up and not
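
For orientation: the sysctl table above moves these knobs out of kernel/sysctl.c, but their /proc/sys/kernel/* paths are unchanged. As the "do not remove" comment notes, user space treats the mere existence of perf_event_paranoid as the feature check for perf_events. A minimal user-space probe in that spirit (an illustrative sketch only, not part of this patch) could look like:

	/* Illustrative user-space feature check; not part of this patch. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/kernel/perf_event_paranoid", "r");
		int level;

		if (!f) {
			printf("perf_events: not available\n");
			return 1;
		}
		if (fscanf(f, "%d", &level) != 1)
			level = 2;	/* assume default paranoia if the read fails */
		fclose(f);
		printf("perf_events: available, paranoid level %d\n", level);
		return 0;
	}
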
@@ -1147,8 +1193,8 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
raw_spin_lock_init(&cpc->hrtimer_lock);
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- timer->function = perf_mux_hrtimer_handler;
+ hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED_HARD);
}
static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
@@ -1172,42 +1218,40 @@ static int perf_mux_hrtimer_restart_ipi(void *arg)
return perf_mux_hrtimer_restart(arg);
}
+static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu)
+{
+ return *this_cpu_ptr(pmu->cpu_pmu_context);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
- int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
if (!(*count)++)
pmu->pmu_disable(pmu);
}
void perf_pmu_enable(struct pmu *pmu)
{
- int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
if (!--(*count))
pmu->pmu_enable(pmu);
}
static void perf_assert_pmu_disabled(struct pmu *pmu)
{
- WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
-}
-
-static void get_ctx(struct perf_event_context *ctx)
-{
- refcount_inc(&ctx->refcount);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
+ WARN_ON_ONCE(*count == 0);
}
-static void *alloc_task_ctx_data(struct pmu *pmu)
+static inline void perf_pmu_read(struct perf_event *event)
{
- if (pmu->task_ctx_cache)
- return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
-
- return NULL;
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
}
-static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
+static void get_ctx(struct perf_event_context *ctx)
{
- if (pmu->task_ctx_cache && task_ctx_data)
- kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
+ refcount_inc(&ctx->refcount);
}
static void free_ctx(struct rcu_head *head)
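
The hunk above is part of switching struct pmu's per-CPU state from a flat per-CPU struct to a per-CPU pointer: each CPU's perf_cpu_pmu_context becomes a separate, node-local allocation (done in the perf_pmu_register() hunk further down), and this_cpc() hides the extra dereference on the fast path. A standalone sketch of the same layout, using hypothetical names (struct thing, things_init(), this_thing()):

	/* Sketch of the per-CPU-pointer layout; names here are hypothetical. */
	#include <linux/percpu.h>
	#include <linux/slab.h>
	#include <linux/cpumask.h>

	struct thing { int val; };

	static struct thing * __percpu *things;	/* per-CPU slot holding a pointer */

	static int things_init(void)
	{
		int cpu;

		things = alloc_percpu(struct thing *);
		if (!things)
			return -ENOMEM;

		for_each_possible_cpu(cpu) {
			struct thing *t = kzalloc_node(sizeof(*t), GFP_KERNEL,
						       cpu_to_node(cpu));
			if (!t)
				return -ENOMEM;		/* caller unwinds, as perf_pmu_free() does */
			*per_cpu_ptr(things, cpu) = t;	/* node-local backing object */
		}
		return 0;
	}

	static struct thing *this_thing(void)
	{
		/* one extra dereference on the fast path, like this_cpc() */
		return *this_cpu_ptr(things);
	}
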
@@ -2303,7 +2347,7 @@ static void
event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_event_pmu_context *epc = event->pmu_ctx;
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
// XXX cpc serialization, probably per-cpu IRQ disabled
@@ -2444,9 +2488,8 @@ __perf_remove_from_context(struct perf_event *event,
pmu_ctx->rotate_necessary = 0;
if (ctx->task && ctx->is_active) {
- struct perf_cpu_pmu_context *cpc;
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu);
- cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
cpc->task_epc = NULL;
}
@@ -2584,7 +2627,7 @@ static int
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_event_pmu_context *epc = event->pmu_ctx;
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
int ret = 0;
WARN_ON_ONCE(event->ctx != ctx);
@@ -2691,7 +2734,7 @@ error:
static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
struct perf_event_pmu_context *epc = event->pmu_ctx;
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
/*
* Groups consisting entirely of software events can always go on.
@@ -3314,9 +3357,8 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
struct pmu *pmu = pmu_ctx->pmu;
if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
- struct perf_cpu_pmu_context *cpc;
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
- cpc = this_cpu_ptr(pmu->cpu_pmu_context);
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
cpc->task_epc = NULL;
}
@@ -3473,8 +3515,7 @@ static void __perf_event_sync_stat(struct perf_event *event,
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- event->pmu->read(event);
+ perf_pmu_read(event);
perf_event_update_time(event);
@@ -3522,52 +3563,17 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \
- for (pos1 = list_first_entry(head1, typeof(*pos1), member), \
- pos2 = list_first_entry(head2, typeof(*pos2), member); \
- !list_entry_is_head(pos1, head1, member) && \
- !list_entry_is_head(pos2, head2, member); \
- pos1 = list_next_entry(pos1, member), \
- pos2 = list_next_entry(pos2, member))
-
-static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
- struct perf_event_context *next_ctx)
-{
- struct perf_event_pmu_context *prev_epc, *next_epc;
-
- if (!prev_ctx->nr_task_data)
- return;
-
- double_list_for_each_entry(prev_epc, next_epc,
- &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
- pmu_ctx_entry) {
-
- if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
- continue;
-
- /*
- * PMU specific parts of task perf context can require
- * additional synchronization. As an example of such
- * synchronization see implementation details of Intel
- * LBR call stack data profiling;
- */
- if (prev_epc->pmu->swap_task_ctx)
- prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
- else
- swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
- }
-}
-
-static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
+static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
+ struct task_struct *task, bool sched_in)
{
struct perf_event_pmu_context *pmu_ctx;
struct perf_cpu_pmu_context *cpc;
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ cpc = this_cpc(pmu_ctx->pmu);
if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
- pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
+ pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in);
}
}
@@ -3630,17 +3636,16 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- perf_ctx_sched_task_cb(ctx, false);
- perf_event_swap_task_ctx_data(ctx, next_ctx);
+ perf_ctx_sched_task_cb(ctx, task, false);
perf_ctx_enable(ctx, false);
/*
* RCU_INIT_POINTER here is safe because we've not
* modified the ctx and the above modification of
- * ctx->task and ctx->task_ctx_data are immaterial
- * since those values are always verified under
- * ctx->lock which we're now holding.
+ * ctx->task is immaterial since this value is
+ * always verified under ctx->lock which we're now
+ * holding.
*/
RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
@@ -3660,7 +3665,7 @@ unlock:
perf_ctx_disable(ctx, false);
inside_switch:
- perf_ctx_sched_task_cb(ctx, false);
+ perf_ctx_sched_task_cb(ctx, task, false);
task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false);
@@ -3673,7 +3678,7 @@ static DEFINE_PER_CPU(int, perf_sched_cb_usages);
void perf_sched_cb_dec(struct pmu *pmu)
{
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
this_cpu_dec(perf_sched_cb_usages);
barrier();
@@ -3685,7 +3690,7 @@ void perf_sched_cb_dec(struct pmu *pmu)
void perf_sched_cb_inc(struct pmu *pmu)
{
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
if (!cpc->sched_cb_usage++)
list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
@@ -3702,7 +3707,8 @@ void perf_sched_cb_inc(struct pmu *pmu)
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
-static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc,
+ struct task_struct *task, bool sched_in)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu;
@@ -3716,7 +3722,7 @@ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_i
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
- pmu->sched_task(cpc->task_epc, sched_in);
+ pmu->sched_task(cpc->task_epc, task, sched_in);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3734,7 +3740,7 @@ static void perf_pmu_sched_task(struct task_struct *prev,
return;
list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
- __perf_pmu_sched_task(cpc, sched_in);
+ __perf_pmu_sched_task(cpc, sched_in ? next : prev, sched_in);
}
static void perf_event_switch(struct task_struct *task,
@@ -3802,7 +3808,7 @@ static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
if (!pmu_ctx->ctx->task)
return;
- cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ cpc = this_cpc(pmu_ctx->pmu);
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
cpc->task_epc = pmu_ctx;
}
@@ -3930,11 +3936,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+
+ if (*perf_event_fasync(event))
+ event->pending_kill = POLL_HUP;
+
+ perf_event_wakeup(event);
} else {
- struct perf_cpu_pmu_context *cpc;
+ struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);
event->pmu_ctx->rotate_necessary = 1;
- cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
perf_mux_hrtimer_restart(cpc);
group_update_userpage(event);
}
@@ -4029,7 +4039,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
perf_ctx_lock(cpuctx, ctx);
perf_ctx_disable(ctx, false);
- perf_ctx_sched_task_cb(ctx, true);
+ perf_ctx_sched_task_cb(ctx, task, true);
perf_ctx_enable(ctx, false);
perf_ctx_unlock(cpuctx, ctx);
@@ -4060,7 +4070,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
perf_event_sched_in(cpuctx, ctx, NULL);
- perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
+ perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true);
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
perf_ctx_enable(&cpuctx->ctx, false);
@@ -4618,15 +4628,8 @@ static void __perf_event_read(void *info)
pmu->read(event);
- for_each_sibling_event(sub, event) {
- if (sub->state == PERF_EVENT_STATE_ACTIVE) {
- /*
- * Use sibling's PMU rather than @event's since
- * sibling could be on different (eg: software) PMU.
- */
- sub->pmu->read(sub);
- }
- }
+ for_each_sibling_event(sub, event)
+ perf_pmu_read(sub);
data->ret = pmu->commit_txn(pmu);
@@ -4883,7 +4886,7 @@ find_get_context(struct task_struct *task, struct perf_event *event)
if (!task) {
/* Must be root to operate on a CPU event: */
- err = perf_allow_cpu(&event->attr);
+ err = perf_allow_cpu();
if (err)
return ERR_PTR(err);
@@ -4951,7 +4954,6 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
struct perf_event *event)
{
struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc;
- void *task_ctx_data = NULL;
if (!ctx->task) {
/*
@@ -4961,11 +4963,14 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
*/
struct perf_cpu_pmu_context *cpc;
- cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
epc = &cpc->epc;
raw_spin_lock_irq(&ctx->lock);
if (!epc->ctx) {
- atomic_set(&epc->refcount, 1);
+ /*
+ * One extra reference for the pmu; see perf_pmu_free().
+ */
+ atomic_set(&epc->refcount, 2);
epc->embedded = 1;
list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
epc->ctx = ctx;
@@ -4981,14 +4986,6 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
if (!new)
return ERR_PTR(-ENOMEM);
- if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = alloc_task_ctx_data(pmu);
- if (!task_ctx_data) {
- kfree(new);
- return ERR_PTR(-ENOMEM);
- }
- }
-
__perf_init_event_pmu_context(new, pmu);
/*
@@ -5023,14 +5020,7 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
epc->ctx = ctx;
found_epc:
- if (task_ctx_data && !epc->task_ctx_data) {
- epc->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- ctx->nr_task_data++;
- }
raw_spin_unlock_irq(&ctx->lock);
-
- free_task_ctx_data(pmu, task_ctx_data);
kfree(new);
return epc;
@@ -5041,11 +5031,18 @@ static void get_pmu_ctx(struct perf_event_pmu_context *epc)
WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
}
+static void free_cpc_rcu(struct rcu_head *head)
+{
+ struct perf_cpu_pmu_context *cpc =
+ container_of(head, typeof(*cpc), epc.rcu_head);
+
+ kfree(cpc);
+}
+
static void free_epc_rcu(struct rcu_head *head)
{
struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
- kfree(epc->task_ctx_data);
kfree(epc);
}
@@ -5075,8 +5072,10 @@ static void put_pmu_ctx(struct perf_event_pmu_context *epc)
raw_spin_unlock_irqrestore(&ctx->lock, flags);
- if (epc->embedded)
+ if (epc->embedded) {
+ call_rcu(&epc->rcu_head, free_cpc_rcu);
return;
+ }
call_rcu(&epc->rcu_head, free_epc_rcu);
}
@@ -5152,6 +5151,225 @@ static void unaccount_freq_event(void)
atomic_dec(&nr_freq_events);
}
+
+static struct perf_ctx_data *
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
+{
+ struct perf_ctx_data *cd;
+
+ cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+ if (!cd)
+ return NULL;
+
+ cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
+ if (!cd->data) {
+ kfree(cd);
+ return NULL;
+ }
+
+ cd->global = global;
+ cd->ctx_cache = ctx_cache;
+ refcount_set(&cd->refcount, 1);
+
+ return cd;
+}
+
+static void free_perf_ctx_data(struct perf_ctx_data *cd)
+{
+ kmem_cache_free(cd->ctx_cache, cd->data);
+ kfree(cd);
+}
+
+static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
+{
+ struct perf_ctx_data *cd;
+
+ cd = container_of(rcu_head, struct perf_ctx_data, rcu_head);
+ free_perf_ctx_data(cd);
+}
+
+static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
+{
+ call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu);
+}
+
+static int
+attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
+ bool global)
+{
+ struct perf_ctx_data *cd, *old = NULL;
+
+ cd = alloc_perf_ctx_data(ctx_cache, global);
+ if (!cd)
+ return -ENOMEM;
+
+ for (;;) {
+ if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
+ if (old)
+ perf_free_ctx_data_rcu(old);
+ return 0;
+ }
+
+ if (!old) {
+ /*
+ * After seeing a dead @old, we raced with
+ * removal and lost; try again to install @cd.
+ */
+ continue;
+ }
+
+ if (refcount_inc_not_zero(&old->refcount)) {
+ free_perf_ctx_data(cd); /* unused */
+ return 0;
+ }
+
+ /*
+ * @old is a dead object, refcount==0 is stable, try and
+ * replace it with @cd.
+ */
+ }
+ return 0;
+}
+
+static void __detach_global_ctx_data(void);
+DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem);
+static refcount_t global_ctx_data_ref;
+
+static int
+attach_global_ctx_data(struct kmem_cache *ctx_cache)
+{
+ struct task_struct *g, *p;
+ struct perf_ctx_data *cd;
+ int ret;
+
+ if (refcount_inc_not_zero(&global_ctx_data_ref))
+ return 0;
+
+ guard(percpu_write)(&global_ctx_data_rwsem);
+ if (refcount_inc_not_zero(&global_ctx_data_ref))
+ return 0;
+again:
+ /* Allocate everything */
+ scoped_guard (rcu) {
+ for_each_process_thread(g, p) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (cd && !cd->global) {
+ cd->global = 1;
+ if (!refcount_inc_not_zero(&cd->refcount))
+ cd = NULL;
+ }
+ if (!cd) {
+ get_task_struct(p);
+ goto alloc;
+ }
+ }
+ }
+
+ refcount_set(&global_ctx_data_ref, 1);
+
+ return 0;
+alloc:
+ ret = attach_task_ctx_data(p, ctx_cache, true);
+ put_task_struct(p);
+ if (ret) {
+ __detach_global_ctx_data();
+ return ret;
+ }
+ goto again;
+}
+
+static int
+attach_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+ struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache;
+ int ret;
+
+ if (!ctx_cache)
+ return -ENOMEM;
+
+ if (task)
+ return attach_task_ctx_data(task, ctx_cache, false);
+
+ ret = attach_global_ctx_data(ctx_cache);
+ if (ret)
+ return ret;
+
+ event->attach_state |= PERF_ATTACH_GLOBAL_DATA;
+ return 0;
+}
+
+static void
+detach_task_ctx_data(struct task_struct *p)
+{
+ struct perf_ctx_data *cd;
+
+ scoped_guard (rcu) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (!cd || !refcount_dec_and_test(&cd->refcount))
+ return;
+ }
+
+ /*
+ * The old ctx_data may already have been swapped out by a
+ * racing attach; nothing needs to be done in that case.
+ * See attach_task_ctx_data().
+ */
+ if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
+ perf_free_ctx_data_rcu(cd);
+}
+
+static void __detach_global_ctx_data(void)
+{
+ struct task_struct *g, *p;
+ struct perf_ctx_data *cd;
+
+again:
+ scoped_guard (rcu) {
+ for_each_process_thread(g, p) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (!cd || !cd->global)
+ continue;
+ cd->global = 0;
+ get_task_struct(p);
+ goto detach;
+ }
+ }
+ return;
+detach:
+ detach_task_ctx_data(p);
+ put_task_struct(p);
+ goto again;
+}
+
+static void detach_global_ctx_data(void)
+{
+ if (refcount_dec_not_one(&global_ctx_data_ref))
+ return;
+
+ guard(percpu_write)(&global_ctx_data_rwsem);
+ if (!refcount_dec_and_test(&global_ctx_data_ref))
+ return;
+
+ /* remove everything */
+ __detach_global_ctx_data();
+}
+
+static void detach_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+
+ event->attach_state &= ~PERF_ATTACH_TASK_DATA;
+
+ if (task)
+ return detach_task_ctx_data(task);
+
+ if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) {
+ detach_global_ctx_data();
+ event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA;
+ }
+}
+
static void unaccount_event(struct perf_event *event)
{
bool dec = false;
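
attach_task_ctx_data() above installs a refcounted perf_ctx_data with a try_cmpxchg() loop: a racing attacher either installs its own object into an empty (or dead) slot, or pins the live object it finds there; a dead object (refcount already zero) is swapped out and freed. A reduced sketch of that lockless install-or-share loop, with hypothetical names (struct shared_data, shared_slot, shared_attach()):

	/* Reduced sketch of the install-or-share loop; names are hypothetical. */
	#include <linux/refcount.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct shared_data {
		refcount_t	refcount;
		struct rcu_head	rcu_head;
	};

	static struct shared_data *shared_slot;	/* stands in for task->perf_ctx_data */

	static int shared_attach(void)
	{
		struct shared_data *cd, *old = NULL;

		cd = kzalloc(sizeof(*cd), GFP_KERNEL);
		if (!cd)
			return -ENOMEM;
		refcount_set(&cd->refcount, 1);

		for (;;) {
			if (try_cmpxchg(&shared_slot, &old, cd)) {
				if (old)			/* replaced a dead object */
					kfree_rcu(old, rcu_head);
				return 0;
			}

			if (!old)		/* raced with a removal; retry the install */
				continue;

			if (refcount_inc_not_zero(&old->refcount)) {
				kfree(cd);	/* live object found; share it instead */
				return 0;
			}

			/* @old is dead (refcount==0 is stable); retry the cmpxchg against it */
		}
	}

The detach side above is the mirror image: drop the reference under RCU and clear the slot only when the caller held the last reference; losing that final cmpxchg just means an attacher already took over the dead object.
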
@@ -5246,6 +5464,8 @@ static int exclusive_event_init(struct perf_event *event)
return -EBUSY;
}
+ event->attach_state |= PERF_ATTACH_EXCLUSIVE;
+
return 0;
}
@@ -5253,14 +5473,13 @@ static void exclusive_event_destroy(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
- if (!is_exclusive_pmu(pmu))
- return;
-
/* see comment in exclusive_event_init() */
if (event->attach_state & PERF_ATTACH_TASK)
atomic_dec(&pmu->exclusive_cnt);
else
atomic_inc(&pmu->exclusive_cnt);
+
+ event->attach_state &= ~PERF_ATTACH_EXCLUSIVE;
}
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
@@ -5292,8 +5511,7 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
-static void perf_addr_filters_splice(struct perf_event *event,
- struct list_head *head);
+static void perf_free_addr_filters(struct perf_event *event);
static void perf_pending_task_sync(struct perf_event *event)
{
@@ -5319,39 +5537,22 @@ static void perf_pending_task_sync(struct perf_event *event)
rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
}
-static void _free_event(struct perf_event *event)
+/* vs perf_event_alloc() error */
+static void __free_event(struct perf_event *event)
{
- irq_work_sync(&event->pending_irq);
- irq_work_sync(&event->pending_disable_irq);
- perf_pending_task_sync(event);
-
- unaccount_event(event);
+ if (event->attach_state & PERF_ATTACH_CALLCHAIN)
+ put_callchain_buffers();
- security_perf_event_free(event);
+ kfree(event->addr_filter_ranges);
- if (event->rb) {
- /*
- * Can happen when we close an event with re-directed output.
- *
- * Since we have a 0 refcount, perf_mmap_close() will skip
- * over us; possibly making our ring_buffer_put() the last.
- */
- mutex_lock(&event->mmap_mutex);
- ring_buffer_attach(event, NULL);
- mutex_unlock(&event->mmap_mutex);
- }
+ if (event->attach_state & PERF_ATTACH_EXCLUSIVE)
+ exclusive_event_destroy(event);
if (is_cgroup_event(event))
perf_detach_cgroup(event);
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-
- perf_event_free_bpf_prog(event);
- perf_addr_filters_splice(event, NULL);
- kfree(event->addr_filter_ranges);
+ if (event->attach_state & PERF_ATTACH_TASK_DATA)
+ detach_perf_ctx_data(event);
if (event->destroy)
event->destroy(event);
@@ -5363,22 +5564,60 @@ static void _free_event(struct perf_event *event)
if (event->hw.target)
put_task_struct(event->hw.target);
- if (event->pmu_ctx)
+ if (event->pmu_ctx) {
+ /*
+ * put_pmu_ctx() needs an event->ctx reference, because of
+ * epc->ctx.
+ */
+ WARN_ON_ONCE(!event->ctx);
+ WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
put_pmu_ctx(event->pmu_ctx);
+ }
/*
- * perf_event_free_task() relies on put_ctx() being 'last', in particular
- * all task references must be cleaned up.
+ * perf_event_free_task() relies on put_ctx() being 'last', in
+ * particular all task references must be cleaned up.
*/
if (event->ctx)
put_ctx(event->ctx);
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
+ if (event->pmu)
+ module_put(event->pmu->module);
call_rcu(&event->rcu_head, free_event_rcu);
}
+DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T))
+
+/* vs perf_event_alloc() success */
+static void _free_event(struct perf_event *event)
+{
+ irq_work_sync(&event->pending_irq);
+ irq_work_sync(&event->pending_disable_irq);
+ perf_pending_task_sync(event);
+
+ unaccount_event(event);
+
+ security_perf_event_free(event);
+
+ if (event->rb) {
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ __free_event(event);
+}
+
/*
* Used to free events which have a known refcount of 1, such as in error paths
* where the event isn't exposed yet and inherited events.
@@ -5847,6 +6086,10 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
if (is_event_hup(event))
return events;
+ if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
+ event->attr.pinned))
+ return events;
+
/*
* Pin the event->rb by taking event->mmap_mutex; otherwise
* perf_event_set_output() can swizzle our rb and make us miss wakeups.
@@ -6616,7 +6859,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned long vma_size;
unsigned long nr_pages;
long user_extra = 0, extra = 0;
- int ret = 0, flags = 0;
+ int ret, flags = 0;
/*
* Don't allow mmap() of inherited per-task counters. This would
@@ -6634,9 +6877,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
return ret;
vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = vma_size / PAGE_SIZE;
+
+ if (nr_pages > INT_MAX)
+ return -ENOMEM;
+
+ if (vma_size != PAGE_SIZE * nr_pages)
+ return -EINVAL;
+
+ user_extra = nr_pages;
+
+ mutex_lock(&event->mmap_mutex);
+ ret = -EINVAL;
if (vma->vm_pgoff == 0) {
- nr_pages = (vma_size / PAGE_SIZE) - 1;
+ nr_pages -= 1;
+
+ /*
+ * If we have rb pages ensure they're a power-of-two number, so we
+ * can do bitmasks instead of modulo.
+ */
+ if (nr_pages != 0 && !is_power_of_2(nr_pages))
+ goto unlock;
+
+ WARN_ON_ONCE(event->ctx->parent_ctx);
+
+ if (event->rb) {
+ if (data_page_nr(event->rb) != nr_pages)
+ goto unlock;
+
+ if (atomic_inc_not_zero(&event->rb->mmap_count)) {
+ /*
+ * Success -- managed to mmap() the same buffer
+ * multiple times.
+ */
+ ret = 0;
+ /* We need the rb to map pages. */
+ rb = event->rb;
+ goto unlock;
+ }
+
+ /*
+ * Raced against perf_mmap_close()'s atomic_dec_and_mutex_lock();
+ * remove the event and continue as if !event->rb.
+ */
+ ring_buffer_attach(event, NULL);
+ }
+
} else {
/*
* AUX area mapping: if rb->aux_nr_pages != 0, it's already
@@ -6645,16 +6933,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
*/
u64 aux_offset, aux_size;
- if (!event->rb)
- return -EINVAL;
-
- nr_pages = vma_size / PAGE_SIZE;
- if (nr_pages > INT_MAX)
- return -ENOMEM;
-
- mutex_lock(&event->mmap_mutex);
- ret = -EINVAL;
-
rb = event->rb;
if (!rb)
goto aux_unlock;
@@ -6695,48 +6973,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
}
atomic_set(&rb->aux_mmap_count, 1);
- user_extra = nr_pages;
-
- goto accounting;
- }
-
- /*
- * If we have rb pages ensure they're a power-of-two number, so we
- * can do bitmasks instead of modulo.
- */
- if (nr_pages != 0 && !is_power_of_2(nr_pages))
- return -EINVAL;
-
- if (vma_size != PAGE_SIZE * (1 + nr_pages))
- return -EINVAL;
-
- WARN_ON_ONCE(event->ctx->parent_ctx);
-again:
- mutex_lock(&event->mmap_mutex);
- if (event->rb) {
- if (data_page_nr(event->rb) != nr_pages) {
- ret = -EINVAL;
- goto unlock;
- }
-
- if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
- /*
- * Raced against perf_mmap_close(); remove the
- * event and try again.
- */
- ring_buffer_attach(event, NULL);
- mutex_unlock(&event->mmap_mutex);
- goto again;
- }
-
- /* We need the rb to map pages. */
- rb = event->rb;
- goto unlock;
}
- user_extra = nr_pages + 1;
-
-accounting:
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
/*
@@ -6804,6 +7042,8 @@ accounting:
rb->aux_mmap_locked = extra;
}
+ ret = 0;
+
unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
@@ -6828,7 +7068,7 @@ aux_unlock:
if (!ret)
ret = map_range(rb, vma);
- if (event->pmu->event_mapped)
+ if (!ret && event->pmu->event_mapped)
event->pmu->event_mapped(event, vma->vm_mm);
return ret;
@@ -7452,9 +7692,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
- if ((leader != event) &&
- (leader->state == PERF_EVENT_STATE_ACTIVE))
- leader->pmu->read(leader);
+ if ((leader != event) && !handle->skip_read)
+ perf_pmu_read(leader);
values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
@@ -7467,9 +7706,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
for_each_sibling_event(sub, leader) {
n = 0;
- if ((sub != event) &&
- (sub->state == PERF_EVENT_STATE_ACTIVE))
- sub->pmu->read(sub);
+ if ((sub != event) && !handle->skip_read)
+ perf_pmu_read(sub);
values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
@@ -7528,6 +7766,9 @@ void perf_output_sample(struct perf_output_handle *handle,
{
u64 sample_type = data->type;
+ if (data->sample_flags & PERF_SAMPLE_READ)
+ handle->skip_read = 1;
+
perf_output_put(handle, *header);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
@@ -8522,10 +8763,58 @@ static void perf_event_task(struct task_struct *task,
task_ctx);
}
+/*
+ * Allocate data for a new task when profiling system-wide
+ * events which require PMU specific data
+ */
+static void
+perf_event_alloc_task_data(struct task_struct *child,
+ struct task_struct *parent)
+{
+ struct kmem_cache *ctx_cache = NULL;
+ struct perf_ctx_data *cd;
+
+ if (!refcount_read(&global_ctx_data_ref))
+ return;
+
+ scoped_guard (rcu) {
+ cd = rcu_dereference(parent->perf_ctx_data);
+ if (cd)
+ ctx_cache = cd->ctx_cache;
+ }
+
+ if (!ctx_cache)
+ return;
+
+ guard(percpu_read)(&global_ctx_data_rwsem);
+ scoped_guard (rcu) {
+ cd = rcu_dereference(child->perf_ctx_data);
+ if (!cd) {
+ /*
+ * The system-wide event may have been unaccounted
+ * while attaching the perf_ctx_data.
+ */
+ if (!refcount_read(&global_ctx_data_ref))
+ return;
+ goto attach;
+ }
+
+ if (!cd->global) {
+ cd->global = 1;
+ refcount_inc(&cd->refcount);
+ }
+ }
+
+ return;
+attach:
+ attach_task_ctx_data(child, ctx_cache, true);
+}
+
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
perf_event_namespaces(task);
+ perf_event_alloc_task_data(task, current);
}
/*
@@ -8589,7 +8878,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
unsigned int size;
memset(comm, 0, sizeof(comm));
- strscpy(comm, comm_event->task->comm, sizeof(comm));
+ strscpy(comm, comm_event->task->comm);
size = ALIGN(strlen(comm)+1, sizeof(u64));
comm_event->comm = comm;
@@ -9033,7 +9322,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
}
cpy_name:
- strscpy(tmp, name, sizeof(tmp));
+ strscpy(tmp, name);
name = tmp;
got_name:
/*
@@ -9457,7 +9746,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
goto err;
- strscpy(name, sym, KSYM_NAME_LEN);
+ strscpy(name, sym);
name_len = strlen(name) + 1;
while (!IS_ALIGNED(name_len, sizeof(u64)))
name[name_len++] = '\0';
@@ -10840,6 +11129,9 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
void perf_event_free_bpf_prog(struct perf_event *event)
{
+ if (!event->prog)
+ return;
+
if (!perf_event_is_tracing(event)) {
perf_event_free_bpf_handler(event);
return;
@@ -10938,6 +11230,17 @@ static void perf_addr_filters_splice(struct perf_event *event,
free_filters_list(&list);
}
+static void perf_free_addr_filters(struct perf_event *event)
+{
+ /*
+ * Used during free paths, there is no concurrency.
+ */
+ if (list_empty(&event->addr_filters.list))
+ return;
+
+ perf_addr_filters_splice(event, NULL);
+}
+
/*
* Scan through mm's vmas and see if one of them matches the
* @filter; if so, adjust filter's address range.
@@ -11376,8 +11679,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
if (!is_sampling_event(event))
return;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- hwc->hrtimer.function = perf_swevent_hrtimer;
+ hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
/*
* Since hrtimers have a fixed rate, we can do a static freq->period
@@ -11614,11 +11916,6 @@ static int perf_event_idx_default(struct perf_event *event)
return 0;
}
-static void free_pmu_context(struct pmu *pmu)
-{
- free_percpu(pmu->cpu_pmu_context);
-}
-
/*
* Let userspace know that this PMU supports address range filtering:
*/
@@ -11628,7 +11925,7 @@ static ssize_t nr_addr_filters_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+ return sysfs_emit(page, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);
@@ -11639,7 +11936,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct pmu *pmu = dev_get_drvdata(dev);
- return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type);
+ return sysfs_emit(page, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);
@@ -11650,7 +11947,7 @@ perf_event_mux_interval_ms_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms);
+ return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms);
}
static DEFINE_MUTEX(mux_interval_mutex);
@@ -11681,7 +11978,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
cpus_read_lock();
for_each_online_cpu(cpu) {
struct perf_cpu_pmu_context *cpc;
- cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
@@ -11824,6 +12121,7 @@ del_dev:
free_dev:
put_device(pmu->dev);
+ pmu->dev = NULL;
goto out;
}
@@ -11845,57 +12143,85 @@ static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new)
return true;
}
-int perf_pmu_register(struct pmu *pmu, const char *name, int type)
+static void perf_pmu_free(struct pmu *pmu)
{
- int cpu, ret, max = PERF_TYPE_MAX;
+ if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
+ if (pmu->nr_addr_filters)
+ device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ }
- mutex_lock(&pmus_lock);
- ret = -ENOMEM;
- pmu->pmu_disable_count = alloc_percpu(int);
- if (!pmu->pmu_disable_count)
- goto unlock;
+ if (pmu->cpu_pmu_context) {
+ int cpu;
- pmu->type = -1;
- if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
- ret = -EINVAL;
- goto free_pdc;
- }
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_pmu_context *cpc;
- if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
- ret = -EINVAL;
- goto free_pdc;
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ if (!cpc)
+ continue;
+ if (cpc->epc.embedded) {
+ /* refcount managed */
+ put_pmu_ctx(&cpc->epc);
+ continue;
+ }
+ kfree(cpc);
+ }
+ free_percpu(pmu->cpu_pmu_context);
}
+}
+
+DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T))
+
+int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
+{
+ int cpu, max = PERF_TYPE_MAX;
+
+ struct pmu *pmu __free(pmu_unregister) = _pmu;
+ guard(mutex)(&pmus_lock);
+
+ if (WARN_ONCE(!name, "Can not register anonymous pmu.\n"))
+ return -EINVAL;
+
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE,
+ "Can not register a pmu with an invalid scope.\n"))
+ return -EINVAL;
pmu->name = name;
if (type >= 0)
max = type;
- ret = idr_alloc(&pmu_idr, NULL, max, 0, GFP_KERNEL);
- if (ret < 0)
- goto free_pdc;
+ CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL);
+ if (pmu_type.id < 0)
+ return pmu_type.id;
- WARN_ON(type >= 0 && ret != type);
+ WARN_ON(type >= 0 && pmu_type.id != type);
- type = ret;
- pmu->type = type;
+ pmu->type = pmu_type.id;
atomic_set(&pmu->exclusive_cnt, 0);
if (pmu_bus_running && !pmu->dev) {
- ret = pmu_dev_alloc(pmu);
+ int ret = pmu_dev_alloc(pmu);
if (ret)
- goto free_idr;
+ return ret;
}
- ret = -ENOMEM;
- pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
+ pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *);
if (!pmu->cpu_pmu_context)
- goto free_dev;
+ return -ENOMEM;
for_each_possible_cpu(cpu) {
- struct perf_cpu_pmu_context *cpc;
+ struct perf_cpu_pmu_context *cpc =
+ kmalloc_node(sizeof(struct perf_cpu_pmu_context),
+ GFP_KERNEL | __GFP_ZERO,
+ cpu_to_node(cpu));
- cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ if (!cpc)
+ return -ENOMEM;
+
+ *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc;
__perf_init_event_pmu_context(&cpc->epc, pmu);
__perf_mux_hrtimer_init(cpc, cpu);
}
@@ -11932,39 +12258,21 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
* Now that the PMU is complete, make it visible to perf_try_init_event().
*/
if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu))
- goto free_context;
+ return -EINVAL;
list_add_rcu(&pmu->entry, &pmus);
- ret = 0;
-unlock:
- mutex_unlock(&pmus_lock);
-
- return ret;
-
-free_context:
- free_percpu(pmu->cpu_pmu_context);
-
-free_dev:
- if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
- device_del(pmu->dev);
- put_device(pmu->dev);
- }
-
-free_idr:
- idr_remove(&pmu_idr, pmu->type);
-
-free_pdc:
- free_percpu(pmu->pmu_disable_count);
- goto unlock;
+ take_idr_id(pmu_type);
+ _pmu = no_free_ptr(pmu); // let it rip
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
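
The rewritten perf_pmu_register() above leans on the scope-based cleanup helpers from <linux/cleanup.h>: the pmu is wrapped in __free(pmu_unregister) so every early return unwinds through perf_pmu_free(), guard(mutex) replaces the unlock label, and no_free_ptr() disarms the cleanup once registration succeeds. A minimal sketch of that idiom, with hypothetical names (struct widget, widget_create()):

	/* Minimal sketch of the scoped-cleanup idiom; names are hypothetical. */
	#include <linux/cleanup.h>
	#include <linux/slab.h>

	struct widget { int cfg; };

	DEFINE_FREE(widget_free, struct widget *, if (_T) kfree(_T))

	static struct widget *widget_create(int cfg)
	{
		struct widget *w __free(widget_free) = kzalloc(sizeof(*w), GFP_KERNEL);

		if (!w)
			return NULL;

		if (cfg < 0)		/* any early error return frees @w automatically */
			return NULL;

		w->cfg = cfg;
		return_ptr(w);		/* i.e. return no_free_ptr(w): hand ownership out */
	}

This is why the old free_context/free_dev/free_idr/free_pdc label chain disappears: any partially constructed pmu is handed to perf_pmu_free() on every early return.
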
void perf_pmu_unregister(struct pmu *pmu)
{
- mutex_lock(&pmus_lock);
- list_del_rcu(&pmu->entry);
- idr_remove(&pmu_idr, pmu->type);
- mutex_unlock(&pmus_lock);
+ scoped_guard (mutex, &pmus_lock) {
+ list_del_rcu(&pmu->entry);
+ idr_remove(&pmu_idr, pmu->type);
+ }
/*
* We dereference the pmu list under both SRCU and regular RCU, so
@@ -11973,14 +12281,7 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
- free_percpu(pmu->pmu_disable_count);
- if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
- if (pmu->nr_addr_filters)
- device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
- device_del(pmu->dev);
- put_device(pmu->dev);
- }
- free_pmu_context(pmu);
+ perf_pmu_free(pmu);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -12020,48 +12321,61 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (ctx)
perf_event_ctx_unlock(event->group_leader, ctx);
- if (!ret) {
- if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
- has_extended_regs(event))
- ret = -EOPNOTSUPP;
+ if (ret)
+ goto err_pmu;
- if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
- event_has_any_exclude_flag(event))
- ret = -EINVAL;
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
+ has_extended_regs(event)) {
+ ret = -EOPNOTSUPP;
+ goto err_destroy;
+ }
- if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
- const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
- struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
- int cpu;
-
- if (pmu_cpumask && cpumask) {
- cpu = cpumask_any_and(pmu_cpumask, cpumask);
- if (cpu >= nr_cpu_ids)
- ret = -ENODEV;
- else
- event->event_caps |= PERF_EV_CAP_READ_SCOPE;
- } else {
- ret = -ENODEV;
- }
- }
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
+ event_has_any_exclude_flag(event)) {
+ ret = -EINVAL;
+ goto err_destroy;
+ }
+
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask;
+ struct cpumask *pmu_cpumask;
+ int cpu;
- if (ret && event->destroy)
- event->destroy(event);
+ cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ pmu_cpumask = perf_scope_cpumask(pmu->scope);
+
+ ret = -ENODEV;
+ if (!pmu_cpumask || !cpumask)
+ goto err_destroy;
+
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ goto err_destroy;
+
+ event->event_caps |= PERF_EV_CAP_READ_SCOPE;
}
- if (ret)
- module_put(pmu->module);
+ return 0;
+
+err_destroy:
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+err_pmu:
+ event->pmu = NULL;
+ module_put(pmu->module);
return ret;
}
static struct pmu *perf_init_event(struct perf_event *event)
{
bool extended_type = false;
- int idx, type, ret;
struct pmu *pmu;
+ int type, ret;
- idx = srcu_read_lock(&pmus_srcu);
+ guard(srcu)(&pmus_srcu);
/*
* Save original type before calling pmu->event_init() since certain
@@ -12074,7 +12388,7 @@ static struct pmu *perf_init_event(struct perf_event *event)
pmu = event->parent->pmu;
ret = perf_try_init_event(pmu, event);
if (!ret)
- goto unlock;
+ return pmu;
}
/*
@@ -12093,13 +12407,12 @@ static struct pmu *perf_init_event(struct perf_event *event)
}
again:
- rcu_read_lock();
- pmu = idr_find(&pmu_idr, type);
- rcu_read_unlock();
+ scoped_guard (rcu)
+ pmu = idr_find(&pmu_idr, type);
if (pmu) {
if (event->attr.type != type && type != PERF_TYPE_RAW &&
!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
- goto fail;
+ return ERR_PTR(-ENOENT);
ret = perf_try_init_event(pmu, event);
if (ret == -ENOENT && event->attr.type != type && !extended_type) {
@@ -12108,27 +12421,21 @@ again:
}
if (ret)
- pmu = ERR_PTR(ret);
+ return ERR_PTR(ret);
- goto unlock;
+ return pmu;
}
list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
ret = perf_try_init_event(pmu, event);
if (!ret)
- goto unlock;
+ return pmu;
- if (ret != -ENOENT) {
- pmu = ERR_PTR(ret);
- goto unlock;
- }
+ if (ret != -ENOENT)
+ return ERR_PTR(ret);
}
-fail:
- pmu = ERR_PTR(-ENOENT);
-unlock:
- srcu_read_unlock(&pmus_srcu, idx);
- return pmu;
+ return ERR_PTR(-ENOENT);
}
static void attach_sb_event(struct perf_event *event)
@@ -12255,7 +12562,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
void *context, int cgroup_fd)
{
struct pmu *pmu;
- struct perf_event *event;
struct hw_perf_event *hwc;
long err = -EINVAL;
int node;
@@ -12270,8 +12576,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
- event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
- node);
+ struct perf_event *event __free(__free_event) =
+ kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node);
if (!event)
return ERR_PTR(-ENOMEM);
@@ -12378,15 +12684,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* See perf_output_read().
*/
if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
- goto err_ns;
+ return ERR_PTR(-EINVAL);
if (!has_branch_stack(event))
event->attr.branch_sample_type = 0;
pmu = perf_init_event(event);
- if (IS_ERR(pmu)) {
- err = PTR_ERR(pmu);
- goto err_ns;
+ if (IS_ERR(pmu))
+ return (void*)pmu;
+
+ /*
+ * PERF_ATTACH_TASK_DATA is set in event_init()->hw_config().
+ * The attach must happen right after perf_init_event();
+ * otherwise, if an error occurs in between, __free_event()
+ * would mistakenly try to detach a non-existent perf_ctx_data.
+ */
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ err = attach_perf_ctx_data(event);
+ if (err)
+ return ERR_PTR(err);
}
/*
@@ -12394,49 +12710,39 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* events (they don't make sense as the cgroup will be different
* on other CPUs in the uncore mask).
*/
- if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
- err = -EINVAL;
- goto err_pmu;
- }
+ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1))
+ return ERR_PTR(-EINVAL);
if (event->attr.aux_output &&
(!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
- event->attr.aux_pause || event->attr.aux_resume)) {
- err = -EOPNOTSUPP;
- goto err_pmu;
- }
+ event->attr.aux_pause || event->attr.aux_resume))
+ return ERR_PTR(-EOPNOTSUPP);
- if (event->attr.aux_pause && event->attr.aux_resume) {
- err = -EINVAL;
- goto err_pmu;
- }
+ if (event->attr.aux_pause && event->attr.aux_resume)
+ return ERR_PTR(-EINVAL);
if (event->attr.aux_start_paused) {
- if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) {
- err = -EOPNOTSUPP;
- goto err_pmu;
- }
+ if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
+ return ERR_PTR(-EOPNOTSUPP);
event->hw.aux_paused = 1;
}
if (cgroup_fd != -1) {
err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
if (err)
- goto err_pmu;
+ return ERR_PTR(err);
}
err = exclusive_event_init(event);
if (err)
- goto err_pmu;
+ return ERR_PTR(err);
if (has_addr_filter(event)) {
event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
sizeof(struct perf_addr_filter_range),
GFP_KERNEL);
- if (!event->addr_filter_ranges) {
- err = -ENOMEM;
- goto err_per_task;
- }
+ if (!event->addr_filter_ranges)
+ return ERR_PTR(-ENOMEM);
/*
* Clone the parent's vma offsets: they are valid until exec()
@@ -12460,42 +12766,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers(attr->sample_max_stack);
if (err)
- goto err_addr_filters;
+ return ERR_PTR(err);
+ event->attach_state |= PERF_ATTACH_CALLCHAIN;
}
}
err = security_perf_event_alloc(event);
if (err)
- goto err_callchain_buffer;
+ return ERR_PTR(err);
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
- return event;
-
-err_callchain_buffer:
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-err_addr_filters:
- kfree(event->addr_filter_ranges);
-
-err_per_task:
- exclusive_event_destroy(event);
-
-err_pmu:
- if (is_cgroup_event(event))
- perf_detach_cgroup(event);
- if (event->destroy)
- event->destroy(event);
- module_put(pmu->module);
-err_ns:
- if (event->hw.target)
- put_task_struct(event->hw.target);
- call_rcu(&event->rcu_head, free_event_rcu);
-
- return ERR_PTR(err);
+ return_ptr(event);
}
static int perf_copy_attr(struct perf_event_attr __user *uattr,
@@ -12565,7 +12848,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
}
/* privileged levels capture (kernel, hv): check permissions */
if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
- ret = perf_allow_kernel(attr);
+ ret = perf_allow_kernel();
if (ret)
return ret;
}
@@ -12822,12 +13105,12 @@ SYSCALL_DEFINE5(perf_event_open,
return err;
/* Do we allow access to perf_event_open(2) ? */
- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ err = security_perf_event_open(PERF_SECURITY_OPEN);
if (err)
return err;
if (!attr.exclude_kernel) {
- err = perf_allow_kernel(&attr);
+ err = perf_allow_kernel();
if (err)
return err;
}
@@ -12847,7 +13130,7 @@ SYSCALL_DEFINE5(perf_event_open,
/* Only privileged users can get physical addresses */
if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
- err = perf_allow_kernel(&attr);
+ err = perf_allow_kernel();
if (err)
return err;
}
@@ -13569,6 +13852,12 @@ void perf_event_exit_task(struct task_struct *child)
* At this point we need to send EXIT events to cpu contexts.
*/
perf_event_task(child, NULL, 0);
+
+ /*
+ * Detach the perf_ctx_data for the system-wide event.
+ */
+ guard(percpu_read)(&global_ctx_data_rwsem);
+ detach_task_ctx_data(child);
}
static void perf_free_event(struct perf_event *event,
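
For context on the locking above: perf_event_exit_task() drops the exiting task's perf_ctx_data under the read side of global_ctx_data_rwsem, while attach_global_ctx_data()/detach_global_ctx_data() take the write side and keep a refcount so that only the first attach and the last detach pay for the process-wide walk. A condensed sketch of that "refcount fast path, percpu-rwsem write-locked slow path" shape, with hypothetical names (thing_rwsem, thing_users, thing_enable(), thing_disable()):

	/* Condensed sketch of the global-enable pattern; names are hypothetical. */
	#include <linux/percpu-rwsem.h>
	#include <linux/refcount.h>
	#include <linux/cleanup.h>

	static DEFINE_STATIC_PERCPU_RWSEM(thing_rwsem);
	static refcount_t thing_users;	/* implicitly zero: feature disabled */

	static int thing_enable(void)
	{
		/* fast path: already enabled, just take another reference */
		if (refcount_inc_not_zero(&thing_users))
			return 0;

		/* slow path: serialize the 0 -> 1 transition against all readers */
		guard(percpu_write)(&thing_rwsem);
		if (refcount_inc_not_zero(&thing_users))
			return 0;		/* lost the race; someone else set it up */

		/* ... expensive one-time setup (the for_each_process_thread() walk) ... */

		refcount_set(&thing_users, 1);
		return 0;
	}

	static void thing_disable(void)
	{
		/* fast path: not the last user */
		if (refcount_dec_not_one(&thing_users))
			return;

		/* slow path: serialize the final 1 -> 0 transition and tear down */
		guard(percpu_write)(&thing_rwsem);
		if (!refcount_dec_and_test(&thing_users))
			return;

		/* ... undo the one-time setup ... */
	}
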
@@ -13680,12 +13969,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
return &event->attr;
}
-int perf_allow_kernel(struct perf_event_attr *attr)
+int perf_allow_kernel(void)
{
if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
return -EACCES;
- return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
+ return security_perf_event_open(PERF_SECURITY_KERNEL);
}
EXPORT_SYMBOL_GPL(perf_allow_kernel);
@@ -13744,7 +14033,6 @@ inherit_event(struct perf_event *parent_event,
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
mutex_unlock(&parent_event->child_mutex);
- /* task_ctx_data is freed with child_ctx */
free_event(child_event);
return NULL;
}
@@ -14002,6 +14290,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags)
child->perf_event_ctxp = NULL;
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
+ child->perf_ctx_data = NULL;
ret = perf_event_init_context(child, clone_flags);
if (ret) {