diff options
Diffstat (limited to 'kernel/events')
-rw-r--r-- | kernel/events/core.c | 266 |
1 files changed, 157 insertions, 109 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index e235bb991bdd..77a932b54a64 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -355,6 +355,8 @@ enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_TIME = 0x4, + /* see ctx_resched() for details */ + EVENT_CPU = 0x8, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -678,6 +680,8 @@ perf_cgroup_set_timestamp(struct task_struct *task, info->timestamp = ctx->timestamp; } +static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); + #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ @@ -690,61 +694,46 @@ perf_cgroup_set_timestamp(struct task_struct *task, static void perf_cgroup_switch(struct task_struct *task, int mode) { struct perf_cpu_context *cpuctx; - struct pmu *pmu; + struct list_head *list; unsigned long flags; /* - * disable interrupts to avoid geting nr_cgroup - * changes via __perf_event_disable(). Also - * avoids preemption. + * Disable interrupts and preemption to avoid this CPU's + * cgrp_cpuctx_entry to change under us. */ local_irq_save(flags); - /* - * we reschedule only in the presence of cgroup - * constrained events. - */ + list = this_cpu_ptr(&cgrp_cpuctx_list); + list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - continue; /* ensure we process each cpuctx once */ - - /* - * perf_cgroup_events says at least one - * context on this CPU has cgroup events. - * - * ctx->nr_cgroups reports the number of cgroup - * events for a context. - */ - if (cpuctx->ctx.nr_cgroups > 0) { - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(cpuctx->ctx.pmu); - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* - * set cgrp before ctxsw in to allow - * event_filter_match() to not have to pass - * task around - * we pass the cpuctx->ctx to perf_cgroup_from_task() - * because cgorup events are only per-cpu - */ - cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); + /* + * set cgrp before ctxsw in to allow + * event_filter_match() to not have to pass + * task around + * we pass the cpuctx->ctx to perf_cgroup_from_task() + * because cgorup events are only per-cpu + */ + cpuctx->cgrp = perf_cgroup_from_task(task, + &cpuctx->ctx); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } local_irq_restore(flags); @@ -889,6 +878,7 @@ list_update_cgroup_event(struct perf_event *event, struct perf_event_context *ctx, bool add) { struct perf_cpu_context *cpuctx; + struct list_head *cpuctx_entry; if (!is_cgroup_event(event)) return; @@ -902,15 +892,16 @@ list_update_cgroup_event(struct perf_event *event, * this will always be called from the right CPU. */ cpuctx = __get_cpu_context(ctx); - - /* - * cpuctx->cgrp is NULL until a cgroup event is sched in or - * ctx->nr_cgroup == 0 . - */ - if (add && perf_cgroup_from_task(current, ctx) == event->cgrp) - cpuctx->cgrp = event->cgrp; - else if (!add) + cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; + /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ + if (add) { + list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); + if (perf_cgroup_from_task(current, ctx) == event->cgrp) + cpuctx->cgrp = event->cgrp; + } else { + list_del(cpuctx_entry); cpuctx->cgrp = NULL; + } } #else /* !CONFIG_CGROUP_PERF */ @@ -1453,6 +1444,20 @@ static void update_group_times(struct perf_event *leader) update_event_times(event); } +static enum event_type_t get_event_type(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + enum event_type_t event_type; + + lockdep_assert_held(&ctx->lock); + + event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; + if (!ctx->task) + event_type |= EVENT_CPU; + + return event_type; +} + static struct list_head * ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) { @@ -2226,7 +2231,8 @@ ctx_sched_in(struct perf_event_context *ctx, struct task_struct *task); static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) + struct perf_event_context *ctx, + enum event_type_t event_type) { if (!cpuctx->task_ctx) return; @@ -2234,7 +2240,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, EVENT_ALL); + ctx_sched_out(ctx, cpuctx, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, @@ -2249,13 +2255,51 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } +/* + * We want to maintain the following priority of scheduling: + * - CPU pinned (EVENT_CPU | EVENT_PINNED) + * - task pinned (EVENT_PINNED) + * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) + * - task flexible (EVENT_FLEXIBLE). + * + * In order to avoid unscheduling and scheduling back in everything every + * time an event is added, only do it for the groups of equal priority and + * below. + * + * This can be called after a batch operation on task events, in which case + * event_type is a bit mask of the types of events involved. For CPU events, + * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. + */ static void ctx_resched(struct perf_cpu_context *cpuctx, - struct perf_event_context *task_ctx) + struct perf_event_context *task_ctx, + enum event_type_t event_type) { + enum event_type_t ctx_event_type = event_type & EVENT_ALL; + bool cpu_event = !!(event_type & EVENT_CPU); + + /* + * If pinned groups are involved, flexible groups also need to be + * scheduled out. + */ + if (event_type & EVENT_PINNED) + event_type |= EVENT_FLEXIBLE; + perf_pmu_disable(cpuctx->ctx.pmu); if (task_ctx) - task_ctx_sched_out(cpuctx, task_ctx); - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + task_ctx_sched_out(cpuctx, task_ctx, event_type); + + /* + * Decide which cpu ctx groups to schedule out based on the types + * of events that caused rescheduling: + * - EVENT_CPU: schedule out corresponding groups; + * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; + * - otherwise, do nothing more. + */ + if (cpu_event) + cpu_ctx_sched_out(cpuctx, ctx_event_type); + else if (ctx_event_type & EVENT_PINNED) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + perf_event_sched_in(cpuctx, task_ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); } @@ -2302,7 +2346,7 @@ static int __perf_install_in_context(void *info) if (reprogram) { ctx_sched_out(ctx, cpuctx, EVENT_TIME); add_event_to_ctx(event, ctx); - ctx_resched(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, get_event_type(event)); } else { add_event_to_ctx(event, ctx); } @@ -2469,7 +2513,7 @@ static void __perf_event_enable(struct perf_event *event, if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); - ctx_resched(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, get_event_type(event)); } /* @@ -2896,7 +2940,7 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - task_ctx_sched_out(cpuctx, ctx); + task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); raw_spin_unlock(&ctx->lock); } } @@ -2943,7 +2987,7 @@ static void perf_pmu_sched_task(struct task_struct *prev, return; list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */ + pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) continue; @@ -3133,8 +3177,12 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, * cpu flexible, task flexible. + * + * However, if task's ctx is not carrying any pinned + * events, no need to flip the cpuctx's events around. */ - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (!list_empty(&ctx->pinned_groups)) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); @@ -3449,6 +3497,7 @@ static int event_enable_on_exec(struct perf_event *event, static void perf_event_enable_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; + enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; @@ -3462,15 +3511,17 @@ static void perf_event_enable_on_exec(int ctxn) cpuctx = __get_cpu_context(ctx); perf_ctx_lock(cpuctx, ctx); ctx_sched_out(ctx, cpuctx, EVENT_TIME); - list_for_each_entry(event, &ctx->event_list, event_entry) + list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); + event_type |= get_event_type(event); + } /* * Unclone and reschedule this context if we enabled any event. */ if (enabled) { clone_ctx = unclone_ctx(ctx); - ctx_resched(cpuctx, ctx); + ctx_resched(cpuctx, ctx, event_type); } perf_ctx_unlock(cpuctx, ctx); @@ -8044,6 +8095,9 @@ static void perf_event_addr_filters_apply(struct perf_event *event) if (task == TASK_TOMBSTONE) return; + if (!ifh->nr_file_filters) + return; + mm = get_task_mm(event->ctx->task); if (!mm) goto restart; @@ -8214,6 +8268,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, * attribute. */ if (state == IF_STATE_END) { + ret = -EINVAL; if (kernel && event->attr.exclude_kernel) goto fail; @@ -8221,6 +8276,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (!filename) goto fail; + /* + * For now, we only support file-based filters + * in per-task events; doing so for CPU-wide + * events requires additional context switching + * trickery, since same object code will be + * mapped at different virtual addresses in + * different processes. + */ + ret = -EOPNOTSUPP; + if (!event->ctx->task) + goto fail_free_name; + /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) @@ -8236,6 +8303,8 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, !S_ISREG(filter->inode->i_mode)) /* free_filters_list() will iput() */ goto fail; + + event->addr_filters.nr_file_filters++; } /* ready to consume more filters */ @@ -8275,24 +8344,13 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str) if (WARN_ON_ONCE(event->parent)) return -EINVAL; - /* - * For now, we only support filtering in per-task events; doing so - * for CPU-wide events requires additional context switching trickery, - * since same object code will be mapped at different virtual - * addresses in different processes. - */ - if (!event->ctx->task) - return -EOPNOTSUPP; - ret = perf_event_parse_addr_filter(event, filter_str, &filters); if (ret) - return ret; + goto fail_clear_files; ret = event->pmu->addr_filters_validate(&filters); - if (ret) { - free_filters_list(&filters); - return ret; - } + if (ret) + goto fail_free_filters; /* remove existing filters, if any */ perf_addr_filters_splice(event, &filters); @@ -8301,6 +8359,14 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str) perf_event_for_each_child(event, perf_event_addr_filters_apply); return ret; + +fail_free_filters: + free_filters_list(&filters); + +fail_clear_files: + event->addr_filters.nr_file_filters = 0; + + return ret; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -8652,37 +8718,10 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) return NULL; } -static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - - if (cpuctx->unique_pmu == old_pmu) - cpuctx->unique_pmu = pmu; - } -} - static void free_pmu_context(struct pmu *pmu) { - struct pmu *i; - mutex_lock(&pmus_lock); - /* - * Like a real lame refcount. - */ - list_for_each_entry(i, &pmus, entry) { - if (i->pmu_cpu_context == pmu->pmu_cpu_context) { - update_pmu_context(i, pmu); - goto out; - } - } - free_percpu(pmu->pmu_cpu_context); -out: mutex_unlock(&pmus_lock); } @@ -8886,8 +8925,6 @@ skip_type: cpuctx->ctx.pmu = pmu; __perf_mux_hrtimer_init(cpuctx, cpu); - - cpuctx->unique_pmu = pmu; } got_cpu_context: @@ -9005,6 +9042,14 @@ static struct pmu *perf_init_event(struct perf_event *event) idx = srcu_read_lock(&pmus_srcu); + /* Try parent's PMU first: */ + if (event->parent && event->parent->pmu) { + pmu = event->parent->pmu; + ret = perf_try_init_event(pmu, event); + if (!ret) + goto unlock; + } + rcu_read_lock(); pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); @@ -10265,7 +10310,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation @@ -10714,6 +10759,9 @@ static void __init perf_event_init_all_cpus(void) INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); +#ifdef CONFIG_CGROUP_PERF + INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); +#endif INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } |