From bd1a5c849bdcc5c89e4a6a18216cd2b9a7a8a78f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:34:53 -0400 Subject: tracing: Ftrace dynamic ftrace_event_call support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add dynamic ftrace_event_call support to ftrace. Trace engines can add new ftrace_event_call to ftrace on the fly. Each operator function of the call takes an ftrace_event_call data structure as an argument, because these functions may be shared among several ftrace_event_calls. Changes from v13: - Define remove_subsystem_dir() always (revert a2ca5e03), because trace_remove_event_call() uses it. - Modify syscall tracer because of ftrace_event_call change. [fweisbec@gmail.com: Fixed conflict against latest tracing/core] Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203453.31965.71901.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux/ftrace_event.h') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index ace2da9e0a0d..1ab3089b5c59 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -112,12 +112,12 @@ struct ftrace_event_call { struct dentry *dir; struct trace_event *event; int enabled; - int (*regfunc)(void *); - void (*unregfunc)(void *); + int (*regfunc)(struct ftrace_event_call *); + void (*unregfunc)(struct ftrace_event_call *); int id; - int (*raw_init)(void); - int (*show_format)(struct ftrace_event_call *call, - struct trace_seq 
*s); + int (*raw_init)(struct ftrace_event_call *); + int (*show_format)(struct ftrace_event_call *, + struct trace_seq *); int (*define_fields)(struct ftrace_event_call *); struct list_head fields; int filter_active; @@ -147,11 +147,12 @@ enum { FILTER_PTR_STRING, }; -extern int trace_define_field(struct ftrace_event_call *call, - const char *type, const char *name, - int offset, int size, int is_signed, - int filter_type); extern int trace_define_common_fields(struct ftrace_event_call *call); +extern int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size, int is_signed, + int filter_type); +extern int trace_add_event_call(struct ftrace_event_call *call); +extern void trace_remove_event_call(struct ftrace_event_call *call); #define is_signed_type(type) (((type)(-1)) < 0) -- cgit v1.2.3 From aeaeae1187d7520f1c5559623f0a149da6a1c96e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 27 Aug 2009 05:09:51 +0200 Subject: tracing: Restore the const qualifier for field names and types definition Restore the const qualifier in field's name and type parameters of trace_define_field that was lost while solving a conflict. Fields names and types are defined as builtin constant strings in static TRACE_EVENTs. But kprobes allocates these dynamically. That said, we still want to always pass these strings as const char * in trace_define_fields() to avoid any further accidental writes on the pointed strings. 
Reported-by: Li Zefan Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- include/linux/ftrace_event.h | 6 +++--- kernel/trace/trace_events.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux/ftrace_event.h') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 1ab3089b5c59..73edf5a52e31 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -148,9 +148,9 @@ enum { }; extern int trace_define_common_fields(struct ftrace_event_call *call); -extern int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size, int is_signed, - int filter_type); +extern int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type); extern int trace_add_event_call(struct ftrace_event_call *call); extern void trace_remove_event_call(struct ftrace_event_call *call); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8079bb511c43..197cdaa96c43 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,8 +27,8 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); -int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size, int is_signed, +int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, int filter_type) { struct ftrace_event_field *field; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5931933587e9..a928dd004535 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -193,8 +193,8 @@ int syscall_enter_define_fields(struct ftrace_event_call *call) return ret; for (i = 0; i < meta->nb_args; i++) { - ret = trace_define_field(call, (char *)meta->types[i], - (char *)meta->args[i], offset, + ret = 
trace_define_field(call, meta->types[i], + meta->args[i], offset, sizeof(unsigned long), 0, FILTER_OTHER); offset += sizeof(unsigned long); -- cgit v1.2.3 From 6fb2915df7f0747d9044da9dbff5b46dc2e20830 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 15 Oct 2009 11:21:42 +0800 Subject: tracing/profile: Add filter support - Add an ioctl to allocate a filter for a perf event. - Free the filter when the associated perf event is to be freed. - Do the filtering in perf_swevent_match(). Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <4AD69546.8050401@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 11 ++- include/linux/perf_counter.h | 1 + include/linux/perf_event.h | 6 ++ kernel/perf_event.c | 80 ++++++++++++++++++++-- kernel/trace/trace.h | 3 +- kernel/trace/trace_events_filter.c | 133 +++++++++++++++++++++++++++++-------- 6 files changed, 199 insertions(+), 35 deletions(-) (limited to 'include/linux/ftrace_event.h') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 4ec5e67e18cf..d11770472bc8 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -144,7 +144,7 @@ extern char *trace_profile_buf_nmi; #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ extern void destroy_preds(struct ftrace_event_call *call); -extern int filter_match_preds(struct ftrace_event_call *call, void *rec); +extern int filter_match_preds(struct event_filter *filter, void *rec); extern int filter_current_check_discard(struct ring_buffer *buffer, struct ftrace_event_call *call, void *rec, @@ -186,4 +186,13 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) +#ifdef CONFIG_EVENT_PROFILE +struct perf_event; +extern int ftrace_profile_enable(int event_id); +extern void ftrace_profile_disable(int event_id); +extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str); +extern void 
ftrace_profile_free_filter(struct perf_event *event); +#endif + #endif /* _LINUX_FTRACE_EVENT_H */ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7b7fbf433cff..91a2b4309e7a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -225,6 +225,7 @@ struct perf_counter_attr { #define PERF_COUNTER_IOC_RESET _IO ('$', 3) #define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) #define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5) +#define PERF_COUNTER_IOC_SET_FILTER _IOW('$', 6, char *) enum perf_counter_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2e6d95f97419..df9d964c15fc 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -221,6 +221,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_RESET _IO ('$', 3) #define PERF_EVENT_IOC_PERIOD _IOW('$', 4, u64) #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) +#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, @@ -633,7 +634,12 @@ struct perf_event { struct pid_namespace *ns; u64 id; + +#ifdef CONFIG_EVENT_PROFILE + struct event_filter *filter; #endif + +#endif /* CONFIG_PERF_EVENTS */ }; /** diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 9d0b5c665883..12b5ec39bf97 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -1658,6 +1659,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) return ERR_PTR(err); } +static void perf_event_free_filter(struct perf_event *event); + static void free_event_rcu(struct rcu_head *head) { struct perf_event *event; @@ -1665,6 +1668,7 @@ static void free_event_rcu(struct rcu_head *head) event = container_of(head, struct perf_event, rcu_head); if (event->ns) put_pid_ns(event->ns); + perf_event_free_filter(event); kfree(event); } @@ -1974,7 +1978,8 @@ unlock: return ret; } -int 
perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_filter(struct perf_event *event, void __user *arg); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2002,6 +2007,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: return perf_event_set_output(event, arg); + case PERF_EVENT_IOC_SET_FILTER: + return perf_event_set_filter(event, (void __user *)arg); + default: return -ENOTTY; } @@ -3806,9 +3814,14 @@ static int perf_swevent_is_counting(struct perf_event *event) return 1; } +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data); + static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, - u32 event_id, struct pt_regs *regs) + u32 event_id, + struct perf_sample_data *data, + struct pt_regs *regs) { if (!perf_swevent_is_counting(event)) return 0; @@ -3826,6 +3839,10 @@ static int perf_swevent_match(struct perf_event *event, return 0; } + if (event->attr.type == PERF_TYPE_TRACEPOINT && + !perf_tp_event_match(event, data)) + return 0; + return 1; } @@ -3842,7 +3859,7 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_swevent_match(event, type, event_id, regs)) + if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } rcu_read_unlock(); @@ -4086,6 +4103,7 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE + void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { @@ -4109,8 +4127,15 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, } EXPORT_SYMBOL_GPL(perf_tp_event); -extern int ftrace_profile_enable(int); -extern void ftrace_profile_disable(int); +static int 
perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} static void tp_perf_event_destroy(struct perf_event *event) { @@ -4135,12 +4160,53 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) return &perf_ops_generic; } + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + + kfree(filter_str); + return ret; +} + +static void perf_event_free_filter(struct perf_event *event) +{ + ftrace_profile_free_filter(event); +} + #else + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + return 1; +} + static const struct pmu *tp_perf_event_init(struct perf_event *event) { return NULL; } -#endif + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + return -ENOENT; +} + +static void perf_event_free_filter(struct perf_event *event) +{ +} + +#endif /* CONFIG_EVENT_PROFILE */ atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; @@ -4394,7 +4460,7 @@ err_size: goto out; } -int perf_event_set_output(struct perf_event *event, int output_fd) +static int perf_event_set_output(struct perf_event *event, int output_fd) { struct perf_event *output_event = NULL; struct file *output_file = NULL; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ffe53ddbe67a..4959ada9e0bb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -743,7 +743,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->filter_active) && 
!filter_match_preds(call, rec)) { + if (unlikely(call->filter_active) && + !filter_match_preds(call->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 273845fce393..e27bb6acc2dd 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" @@ -363,9 +364,8 @@ static void filter_build_regex(struct filter_pred *pred) } /* return 1 if event matches, 0 otherwise (discard) */ -int filter_match_preds(struct ftrace_event_call *call, void *rec) +int filter_match_preds(struct event_filter *filter, void *rec) { - struct event_filter *filter = call->filter; int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; @@ -538,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } -void destroy_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter) { - struct event_filter *filter = call->filter; int i; if (!filter) @@ -553,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call) kfree(filter->preds); kfree(filter->filter_string); kfree(filter); +} + +void destroy_preds(struct ftrace_event_call *call) +{ + __free_preds(call->filter); call->filter = NULL; + call->filter_active = 0; } -static int init_preds(struct ftrace_event_call *call) +static struct event_filter *__alloc_preds(void) { struct event_filter *filter; struct filter_pred *pred; int i; - if (call->filter) - return 0; - - filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!call->filter) - return -ENOMEM; + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); filter->n_preds = 0; @@ -583,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call) filter->preds[i] = pred; } - return 
0; + return filter; oom: - destroy_preds(call); + __free_preds(filter); + return ERR_PTR(-ENOMEM); +} + +static int init_preds(struct ftrace_event_call *call) +{ + if (call->filter) + return 0; + + call->filter_active = 0; + call->filter = __alloc_preds(); + if (IS_ERR(call->filter)) + return PTR_ERR(call->filter); - return -ENOMEM; + return 0; } static int init_subsystem_preds(struct event_subsystem *system) @@ -629,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, filter_pred_fn_t fn) { - struct event_filter *filter = call->filter; int idx, err; if (filter->n_preds == MAX_FILTER_PRED) { @@ -647,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return err; filter->n_preds++; - call->filter_active = 1; return 0; } @@ -726,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, bool dry_run) { @@ -795,7 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, pred, fn); + return filter_add_pred_fn(ps, call, filter, pred, fn); return 0; } @@ -1154,6 +1168,7 @@ static int check_preds(struct filter_parse_state *ps) } static int replace_preds(struct ftrace_event_call *call, + struct event_filter *filter, struct filter_parse_state *ps, char *filter_string, bool dry_run) @@ -1200,7 +1215,7 @@ static int replace_preds(struct ftrace_event_call *call, add_pred: if (!pred) return -ENOMEM; - err = filter_add_pred(ps, call, pred, dry_run); + err = filter_add_pred(ps, call, filter, pred, dry_run); filter_free_pred(pred); if (err) return err; @@ -1216,6 +1231,7 @@ static int replace_system_preds(struct event_subsystem *system, char 
*filter_string) { struct ftrace_event_call *call; + struct event_filter *filter; int err; bool fail = true; @@ -1228,17 +1244,19 @@ static int replace_system_preds(struct event_subsystem *system, continue; /* try to see if the filter can be applied */ - err = replace_preds(call, ps, filter_string, true); + err = replace_preds(call, filter, ps, filter_string, true); if (err) continue; /* really apply the filter */ filter_disable_preds(call); - err = replace_preds(call, ps, filter_string, false); + err = replace_preds(call, filter, ps, filter_string, false); if (err) filter_disable_preds(call); - else - replace_filter_string(call->filter, filter_string); + else { + call->filter_active = 1; + replace_filter_string(filter, filter_string); + } fail = false; } @@ -1252,7 +1270,6 @@ static int replace_system_preds(struct event_subsystem *system, int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { int err; - struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1283,10 +1300,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } - err = replace_preds(call, ps, filter_string, false); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); - + else + call->filter_active = 1; out: filter_opstack_clear(ps); postfix_clear(ps); @@ -1301,7 +1319,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { int err; - struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1345,3 +1362,67 @@ out_unlock: return err; } +#ifdef CONFIG_EVENT_PROFILE + +void ftrace_profile_free_filter(struct perf_event *event) +{ + struct event_filter *filter = event->filter; + + event->filter = NULL; + __free_preds(filter); +} + +int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str) +{ + int err; + struct event_filter *filter; + struct filter_parse_state *ps; + struct ftrace_event_call *call = NULL; + 
+ mutex_lock(&event_mutex); + + list_for_each_entry(call, &ftrace_events, list) { + if (call->id == event_id) + break; + } + if (!call) + return -EINVAL; + + if (event->filter) + return -EEXIST; + + filter = __alloc_preds(); + if (IS_ERR(filter)) + return PTR_ERR(filter); + + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_preds; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + event->filter = filter; + +free_ps: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + +free_preds: + if (err) + __free_preds(filter); + + mutex_unlock(&event_mutex); + + return err; +} + +#endif /* CONFIG_EVENT_PROFILE */ + -- cgit v1.2.3 From 444a2a3bcd6d5bed5c823136f68fcc93c0fe283f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 6 Nov 2009 04:13:05 +0100 Subject: tracing, perf_events: Protect the buffer from recursion in perf While tracing using events with perf, if one enables the lockdep:lock_acquire event, it will infect every other perf trace events. Basically, you can enable whatever set of trace events through perf but if this event is part of the set, the only result we can get is a long list of lock_acquire events of rcu read lock, and only that. This is because of a recursion inside perf. 1) When a trace event is triggered, it will fill a per cpu buffer and submit it to perf. 2) Perf will commit this event but will also protect some data using rcu_read_lock 3) A recursion appears: rcu_read_lock triggers a lock_acquire event that will fill the per cpu event and then submit the buffer to perf. 4) Perf detects a recursion and ignores it 5) Perf continues its work on the previous event, but its buffer has been overwritten by the lock_acquire event, it has then been turned into a lock_acquire event of rcu read lock Such scenario also happens with lock_release with rcu_read_unlock(). 
We could turn the rcu_read_lock() into __rcu_read_lock() to drop the lock debugging from the perf fast path, but that would make us lose the rcu debugging and that wouldn't prevent other possible kinds of recursion from perf in the future. This patch adds a recursion protection based on a counter on the perf trace per cpu buffers to solve the problem. -v2: Fixed lost whitespace, added reviewed-by tag Signed-off-by: Frederic Weisbecker Reviewed-by: Masami Hiramatsu Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras Cc: Steven Rostedt Cc: Li Zefan Cc: Jason Baron LKML-Reference: <1257477185-7838-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 9 +++++-- include/trace/ftrace.h | 39 ++++++++++++++++++++++------- kernel/trace/trace_event_profile.c | 41 ++++++++++++++----------------- kernel/trace/trace_kprobe.c | 50 ++++++++++++++++++++++++++++++++------ kernel/trace/trace_syscalls.c | 44 +++++++++++++++++++++++++++------ 5 files changed, 133 insertions(+), 50 deletions(-) (limited to 'include/linux/ftrace_event.h') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index f7b47c336703..43360c1d8f70 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -137,8 +137,13 @@ struct ftrace_event_call { #define FTRACE_MAX_PROFILE_SIZE 2048 -extern char *trace_profile_buf; -extern char *trace_profile_buf_nmi; +struct perf_trace_buf { + char buf[FTRACE_MAX_PROFILE_SIZE]; + int recursion; +}; + +extern struct perf_trace_buf *perf_trace_buf; +extern struct perf_trace_buf *perf_trace_buf_nmi; #define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index a7f946094128..4945d1c99864 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -649,6 +649,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * struct ftrace_event_call 
*event_call = &event_; * extern void perf_tp_event(int, u64, u64, void *, int); * struct ftrace_raw_##call *entry; + * struct perf_trace_buf *trace_buf; * u64 __addr = 0, __count = 1; * unsigned long irq_flags; * struct trace_entry *ent; @@ -673,14 +674,25 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * __cpu = smp_processor_id(); * * if (in_nmi()) - * raw_data = rcu_dereference(trace_profile_buf_nmi); + * trace_buf = rcu_dereference(perf_trace_buf_nmi); * else - * raw_data = rcu_dereference(trace_profile_buf); + * trace_buf = rcu_dereference(perf_trace_buf); * - * if (!raw_data) + * if (!trace_buf) * goto end; * - * raw_data = per_cpu_ptr(raw_data, __cpu); + * trace_buf = per_cpu_ptr(trace_buf, __cpu); + * + * // Avoid recursion from perf that could mess up the buffer + * if (trace_buf->recursion++) + * goto end_recursion; + * + * raw_data = trace_buf->buf; + * + * // Make recursion update visible before entering perf_tp_event + * // so that we protect from perf recursions. 
+ * + * barrier(); * * //zero dead bytes from alignment to avoid stack leak to userspace: * *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; @@ -713,8 +725,9 @@ static void ftrace_profile_##call(proto) \ { \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ struct ftrace_event_call *event_call = &event_##call; \ - extern void perf_tp_event(int, u64, u64, void *, int); \ + extern void perf_tp_event(int, u64, u64, void *, int); \ struct ftrace_raw_##call *entry; \ + struct perf_trace_buf *trace_buf; \ u64 __addr = 0, __count = 1; \ unsigned long irq_flags; \ struct trace_entry *ent; \ @@ -739,14 +752,20 @@ static void ftrace_profile_##call(proto) \ __cpu = smp_processor_id(); \ \ if (in_nmi()) \ - raw_data = rcu_dereference(trace_profile_buf_nmi); \ + trace_buf = rcu_dereference(perf_trace_buf_nmi); \ else \ - raw_data = rcu_dereference(trace_profile_buf); \ + trace_buf = rcu_dereference(perf_trace_buf); \ \ - if (!raw_data) \ + if (!trace_buf) \ goto end; \ \ - raw_data = per_cpu_ptr(raw_data, __cpu); \ + trace_buf = per_cpu_ptr(trace_buf, __cpu); \ + if (trace_buf->recursion++) \ + goto end_recursion; \ + \ + barrier(); \ + \ + raw_data = trace_buf->buf; \ \ *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \ entry = (struct ftrace_raw_##call *)raw_data; \ @@ -761,6 +780,8 @@ static void ftrace_profile_##call(proto) \ perf_tp_event(event_call->id, __addr, __count, entry, \ __entry_size); \ \ +end_recursion: \ + trace_buf->recursion--; \ end: \ local_irq_restore(irq_flags); \ \ diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index c9f687ab0d4f..e0d351b01f5a 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -8,41 +8,36 @@ #include #include "trace.h" -/* - * We can't use a size but a type in alloc_percpu() - * So let's create a dummy type that matches the desired size - */ -typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t; -char 
*trace_profile_buf; -EXPORT_SYMBOL_GPL(trace_profile_buf); +struct perf_trace_buf *perf_trace_buf; +EXPORT_SYMBOL_GPL(perf_trace_buf); -char *trace_profile_buf_nmi; -EXPORT_SYMBOL_GPL(trace_profile_buf_nmi); +struct perf_trace_buf *perf_trace_buf_nmi; +EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); /* Count the events in use (per event id, not per instance) */ static int total_profile_count; static int ftrace_profile_enable_event(struct ftrace_event_call *event) { - char *buf; + struct perf_trace_buf *buf; int ret = -ENOMEM; if (atomic_inc_return(&event->profile_count)) return 0; if (!total_profile_count) { - buf = (char *)alloc_percpu(profile_buf_t); + buf = alloc_percpu(struct perf_trace_buf); if (!buf) goto fail_buf; - rcu_assign_pointer(trace_profile_buf, buf); + rcu_assign_pointer(perf_trace_buf, buf); - buf = (char *)alloc_percpu(profile_buf_t); + buf = alloc_percpu(struct perf_trace_buf); if (!buf) goto fail_buf_nmi; - rcu_assign_pointer(trace_profile_buf_nmi, buf); + rcu_assign_pointer(perf_trace_buf_nmi, buf); } ret = event->profile_enable(event); @@ -53,10 +48,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) fail_buf_nmi: if (!total_profile_count) { - free_percpu(trace_profile_buf_nmi); - free_percpu(trace_profile_buf); - trace_profile_buf_nmi = NULL; - trace_profile_buf = NULL; + free_percpu(perf_trace_buf_nmi); + free_percpu(perf_trace_buf); + perf_trace_buf_nmi = NULL; + perf_trace_buf = NULL; } fail_buf: atomic_dec(&event->profile_count); @@ -84,7 +79,7 @@ int ftrace_profile_enable(int event_id) static void ftrace_profile_disable_event(struct ftrace_event_call *event) { - char *buf, *nmi_buf; + struct perf_trace_buf *buf, *nmi_buf; if (!atomic_add_negative(-1, &event->profile_count)) return; @@ -92,11 +87,11 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event) event->profile_disable(event); if (!--total_profile_count) { - buf = trace_profile_buf; - rcu_assign_pointer(trace_profile_buf, NULL); + buf = 
perf_trace_buf; + rcu_assign_pointer(perf_trace_buf, NULL); - nmi_buf = trace_profile_buf_nmi; - rcu_assign_pointer(trace_profile_buf_nmi, NULL); + nmi_buf = perf_trace_buf_nmi; + rcu_assign_pointer(perf_trace_buf_nmi, NULL); /* * Ensure every events in profiling have finished before diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index cf17a6694f32..3696476f307d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1208,6 +1208,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry *entry; + struct perf_trace_buf *trace_buf; struct trace_entry *ent; int size, __size, i, pc, __cpu; unsigned long irq_flags; @@ -1229,14 +1230,26 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, __cpu = smp_processor_id(); if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); + trace_buf = rcu_dereference(perf_trace_buf_nmi); else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, __cpu); + trace_buf = per_cpu_ptr(trace_buf, __cpu); + + if (trace_buf->recursion++) + goto end_recursion; + + /* + * Make recursion update visible before entering perf_tp_event + * so that we protect from perf recursions. 
+ */ + barrier(); + + raw_data = trace_buf->buf; + /* Zero dead bytes from alignment to avoid buffer leak to userspace */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; entry = (struct kprobe_trace_entry *)raw_data; @@ -1249,8 +1262,12 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); perf_tp_event(call->id, entry->ip, 1, entry, size); + +end_recursion: + trace_buf->recursion--; end: local_irq_restore(irq_flags); + return 0; } @@ -1261,6 +1278,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry *entry; + struct perf_trace_buf *trace_buf; struct trace_entry *ent; int size, __size, i, pc, __cpu; unsigned long irq_flags; @@ -1282,14 +1300,26 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, __cpu = smp_processor_id(); if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); + trace_buf = rcu_dereference(perf_trace_buf_nmi); else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, __cpu); + trace_buf = per_cpu_ptr(trace_buf, __cpu); + + if (trace_buf->recursion++) + goto end_recursion; + + /* + * Make recursion update visible before entering perf_tp_event + * so that we protect from perf recursions. 
+ */ + barrier(); + + raw_data = trace_buf->buf; + /* Zero dead bytes from alignment to avoid buffer leak to userspace */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; entry = (struct kretprobe_trace_entry *)raw_data; @@ -1303,8 +1333,12 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); perf_tp_event(call->id, entry->ret_ip, 1, entry, size); + +end_recursion: + trace_buf->recursion--; end: local_irq_restore(irq_flags); + return 0; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 58b8e5370767..51213b0aa81b 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -477,6 +477,7 @@ static int sys_prof_refcount_exit; static void prof_syscall_enter(struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; + struct perf_trace_buf *trace_buf; struct syscall_trace_enter *rec; unsigned long flags; char *raw_data; @@ -507,14 +508,25 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) cpu = smp_processor_id(); if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); + trace_buf = rcu_dereference(perf_trace_buf_nmi); else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, cpu); + trace_buf = per_cpu_ptr(trace_buf, cpu); + + if (trace_buf->recursion++) + goto end_recursion; + + /* + * Make recursion update visible before entering perf_tp_event + * so that we protect from perf recursions. 
+ */ + barrier(); + + raw_data = trace_buf->buf; /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -527,6 +539,8 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) (unsigned long *)&rec->args); perf_tp_event(sys_data->enter_id, 0, 1, rec, size); +end_recursion: + trace_buf->recursion--; end: local_irq_restore(flags); } @@ -574,6 +588,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; + struct perf_trace_buf *trace_buf; unsigned long flags; int syscall_nr; char *raw_data; @@ -605,14 +620,25 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) cpu = smp_processor_id(); if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); + trace_buf = rcu_dereference(perf_trace_buf_nmi); else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, cpu); + trace_buf = per_cpu_ptr(trace_buf, cpu); + + if (trace_buf->recursion++) + goto end_recursion; + + /* + * Make recursion update visible before entering perf_tp_event + * so that we protect from perf recursions. + */ + barrier(); + + raw_data = trace_buf->buf; /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -626,6 +652,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) perf_tp_event(sys_data->exit_id, 0, 1, rec, size); +end_recursion: + trace_buf->recursion--; end: local_irq_restore(flags); } -- cgit v1.2.3 From ce71b9df8893ec954e56c5979df6da274f20f65e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Nov 2009 05:26:55 +0100 Subject: tracing: Use the perf recursion protection from trace event When we commit a trace to perf, we first check if we are recursing in the same buffer so that we don't mess-up the buffer with a recursing trace. 
But later on, we do the same check again from perf to avoid commit recursion. The recursion check is needed early, before we touch the buffer, but we want to perform it only once. So export the recursion protection from perf and use it from the trace events before submitting a trace. v2: Put appropriate Reported-by tag Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Jason Baron LKML-Reference: <1258864015-10579-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 9 ++--- include/linux/perf_event.h | 4 +++ include/trace/ftrace.h | 23 +++++++------ kernel/perf_event.c | 68 +++++++++++++++++++++++++------------- kernel/trace/trace_event_profile.c | 14 ++++---- kernel/trace/trace_kprobe.c | 48 ++++++++++----------------- kernel/trace/trace_syscalls.c | 47 ++++++++++---------------- 7 files changed, 106 insertions(+), 107 deletions(-) (limited to 'include/linux/ftrace_event.h') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 43360c1d8f70..47bbdf9c38d0 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -137,13 +137,8 @@ struct ftrace_event_call { #define FTRACE_MAX_PROFILE_SIZE 2048 -struct perf_trace_buf { - char buf[FTRACE_MAX_PROFILE_SIZE]; - int recursion; -}; - -extern struct perf_trace_buf *perf_trace_buf; -extern struct perf_trace_buf *perf_trace_buf_nmi; +extern char *perf_trace_buf; +extern char *perf_trace_buf_nmi; #define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 36fe89f72641..74e98b1d3391 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -874,6 +874,8 @@ extern int perf_output_begin(struct perf_output_handle *handle, extern void perf_output_end(struct perf_output_handle *handle); extern void
perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); +extern int perf_swevent_get_recursion_context(int **recursion); +extern void perf_swevent_put_recursion_context(int *recursion); #else static inline void perf_event_task_sched_in(struct task_struct *task, int cpu) { } @@ -902,6 +904,8 @@ static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_comm(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } +static int perf_swevent_get_recursion_context(int **recursion) { return -1; } +static void perf_swevent_put_recursion_context(int *recursion) { } #endif diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 4945d1c99864..c222ef5238bf 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -724,16 +724,19 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ static void ftrace_profile_##call(proto) \ { \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ + extern int perf_swevent_get_recursion_context(int **recursion); \ + extern void perf_swevent_put_recursion_context(int *recursion); \ struct ftrace_event_call *event_call = &event_##call; \ extern void perf_tp_event(int, u64, u64, void *, int); \ struct ftrace_raw_##call *entry; \ - struct perf_trace_buf *trace_buf; \ u64 __addr = 0, __count = 1; \ unsigned long irq_flags; \ struct trace_entry *ent; \ int __entry_size; \ int __data_size; \ + char *trace_buf; \ char *raw_data; \ + int *recursion; \ int __cpu; \ int pc; \ \ @@ -749,6 +752,10 @@ static void ftrace_profile_##call(proto) \ return; \ \ local_irq_save(irq_flags); \ + \ + if (perf_swevent_get_recursion_context(&recursion)) \ + goto end_recursion; \ + \ __cpu = smp_processor_id(); \ \ if (in_nmi()) \ @@ -759,13 +766,7 @@ static void ftrace_profile_##call(proto) \ if (!trace_buf) \ goto end; \ \ - trace_buf = per_cpu_ptr(trace_buf, __cpu); \ - if 
(trace_buf->recursion++) \ - goto end_recursion; \ - \ - barrier(); \ - \ - raw_data = trace_buf->buf; \ + raw_data = per_cpu_ptr(trace_buf, __cpu); \ \ *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \ entry = (struct ftrace_raw_##call *)raw_data; \ @@ -780,9 +781,9 @@ static void ftrace_profile_##call(proto) \ perf_tp_event(event_call->id, __addr, __count, entry, \ __entry_size); \ \ -end_recursion: \ - trace_buf->recursion--; \ -end: \ +end: \ + perf_swevent_put_recursion_context(recursion); \ +end_recursion: \ local_irq_restore(irq_flags); \ \ } diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 718fa939b1a7..aba822722300 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3880,34 +3880,42 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx, } } -static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) +/* + * Must be called with preemption disabled + */ +int perf_swevent_get_recursion_context(int **recursion) { + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + if (in_nmi()) - return &cpuctx->recursion[3]; + *recursion = &cpuctx->recursion[3]; + else if (in_irq()) + *recursion = &cpuctx->recursion[2]; + else if (in_softirq()) + *recursion = &cpuctx->recursion[1]; + else + *recursion = &cpuctx->recursion[0]; - if (in_irq()) - return &cpuctx->recursion[2]; + if (**recursion) + return -1; - if (in_softirq()) - return &cpuctx->recursion[1]; + (**recursion)++; - return &cpuctx->recursion[0]; + return 0; } -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) +void perf_swevent_put_recursion_context(int *recursion) { - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swevent_recursion_context(cpuctx); - struct perf_event_context *ctx; - - if (*recursion) - goto out; + (*recursion)--; +} - (*recursion)++; - barrier(); +static void __do_perf_sw_event(enum 
perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); rcu_read_lock(); perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, @@ -3920,12 +3928,25 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, if (ctx) perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); rcu_read_unlock(); +} - barrier(); - (*recursion)--; +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + int *recursion; + + preempt_disable(); + + if (perf_swevent_get_recursion_context(&recursion)) + goto out; + + __do_perf_sw_event(type, event_id, nr, nmi, data, regs); + perf_swevent_put_recursion_context(recursion); out: - put_cpu_var(perf_cpu_context); + preempt_enable(); } void __perf_sw_event(u32 event_id, u64 nr, int nmi, @@ -4159,7 +4180,8 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, if (!regs) regs = task_pt_regs(current); - do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + /* Trace events already protected against recursion */ + __do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data, regs); } EXPORT_SYMBOL_GPL(perf_tp_event); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index e0d351b01f5a..d9c60f80aa0d 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -9,31 +9,33 @@ #include "trace.h" -struct perf_trace_buf *perf_trace_buf; +char *perf_trace_buf; EXPORT_SYMBOL_GPL(perf_trace_buf); -struct perf_trace_buf *perf_trace_buf_nmi; +char *perf_trace_buf_nmi; EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); +typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; + /* Count the events in use (per event id, not per instance) */ static int total_profile_count; static int 
ftrace_profile_enable_event(struct ftrace_event_call *event) { - struct perf_trace_buf *buf; + char *buf; int ret = -ENOMEM; if (atomic_inc_return(&event->profile_count)) return 0; if (!total_profile_count) { - buf = alloc_percpu(struct perf_trace_buf); + buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf; rcu_assign_pointer(perf_trace_buf, buf); - buf = alloc_percpu(struct perf_trace_buf); + buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf_nmi; @@ -79,7 +81,7 @@ int ftrace_profile_enable(int event_id) static void ftrace_profile_disable_event(struct ftrace_event_call *event) { - struct perf_trace_buf *buf, *nmi_buf; + char *buf, *nmi_buf; if (!atomic_add_negative(-1, &event->profile_count)) return; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3696476f307d..22e6f68b05b3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1208,11 +1208,12 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry *entry; - struct perf_trace_buf *trace_buf; struct trace_entry *ent; int size, __size, i, pc, __cpu; unsigned long irq_flags; + char *trace_buf; char *raw_data; + int *recursion; pc = preempt_count(); __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); @@ -1227,6 +1228,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, * This also protects the rcu read side */ local_irq_save(irq_flags); + + if (perf_swevent_get_recursion_context(&recursion)) + goto end_recursion; + __cpu = smp_processor_id(); if (in_nmi()) @@ -1237,18 +1242,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, if (!trace_buf) goto end; - trace_buf = per_cpu_ptr(trace_buf, __cpu); - - if (trace_buf->recursion++) - goto end_recursion; - - /* - * Make recursion update visible before entering perf_tp_event - * so that we protect from perf recursions. 
- */ - barrier(); - - raw_data = trace_buf->buf; + raw_data = per_cpu_ptr(trace_buf, __cpu); /* Zero dead bytes from alignment to avoid buffer leak to userspace */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -1263,9 +1257,9 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, entry->args[i] = call_fetch(&tp->args[i].fetch, regs); perf_tp_event(call->id, entry->ip, 1, entry, size); -end_recursion: - trace_buf->recursion--; end: + perf_swevent_put_recursion_context(recursion); +end_recursion: local_irq_restore(irq_flags); return 0; @@ -1278,10 +1272,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry *entry; - struct perf_trace_buf *trace_buf; struct trace_entry *ent; int size, __size, i, pc, __cpu; unsigned long irq_flags; + char *trace_buf; + int *recursion; char *raw_data; pc = preempt_count(); @@ -1297,6 +1292,10 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, * This also protects the rcu read side */ local_irq_save(irq_flags); + + if (perf_swevent_get_recursion_context(&recursion)) + goto end_recursion; + __cpu = smp_processor_id(); if (in_nmi()) @@ -1307,18 +1306,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, if (!trace_buf) goto end; - trace_buf = per_cpu_ptr(trace_buf, __cpu); - - if (trace_buf->recursion++) - goto end_recursion; - - /* - * Make recursion update visible before entering perf_tp_event - * so that we protect from perf recursions. 
- */ - barrier(); - - raw_data = trace_buf->buf; + raw_data = per_cpu_ptr(trace_buf, __cpu); /* Zero dead bytes from alignment to avoid buffer leak to userspace */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -1334,9 +1322,9 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, entry->args[i] = call_fetch(&tp->args[i].fetch, regs); perf_tp_event(call->id, entry->ret_ip, 1, entry, size); -end_recursion: - trace_buf->recursion--; end: + perf_swevent_put_recursion_context(recursion); +end_recursion: local_irq_restore(irq_flags); return 0; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 51213b0aa81b..0bb934875263 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -477,10 +477,11 @@ static int sys_prof_refcount_exit; static void prof_syscall_enter(struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; - struct perf_trace_buf *trace_buf; struct syscall_trace_enter *rec; unsigned long flags; + char *trace_buf; char *raw_data; + int *recursion; int syscall_nr; int size; int cpu; @@ -505,6 +506,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) /* Protect the per cpu buffer, begin the rcu read side */ local_irq_save(flags); + if (perf_swevent_get_recursion_context(&recursion)) + goto end_recursion; + cpu = smp_processor_id(); if (in_nmi()) @@ -515,18 +519,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) if (!trace_buf) goto end; - trace_buf = per_cpu_ptr(trace_buf, cpu); - - if (trace_buf->recursion++) - goto end_recursion; - - /* - * Make recursion update visible before entering perf_tp_event - * so that we protect from perf recursions. 
- */ - barrier(); - - raw_data = trace_buf->buf; + raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -539,9 +532,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) (unsigned long *)&rec->args); perf_tp_event(sys_data->enter_id, 0, 1, rec, size); -end_recursion: - trace_buf->recursion--; end: + perf_swevent_put_recursion_context(recursion); +end_recursion: local_irq_restore(flags); } @@ -588,10 +581,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; - struct perf_trace_buf *trace_buf; unsigned long flags; int syscall_nr; + char *trace_buf; char *raw_data; + int *recursion; int size; int cpu; @@ -617,6 +611,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) /* Protect the per cpu buffer, begin the rcu read side */ local_irq_save(flags); + + if (perf_swevent_get_recursion_context(&recursion)) + goto end_recursion; + cpu = smp_processor_id(); if (in_nmi()) @@ -627,18 +625,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) if (!trace_buf) goto end; - trace_buf = per_cpu_ptr(trace_buf, cpu); - - if (trace_buf->recursion++) - goto end_recursion; - - /* - * Make recursion update visible before entering perf_tp_event - * so that we protect from perf recursions. - */ - barrier(); - - raw_data = trace_buf->buf; + raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -652,9 +639,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) perf_tp_event(sys_data->exit_id, 0, 1, rec, size); -end_recursion: - trace_buf->recursion--; end: + perf_swevent_put_recursion_context(recursion); +end_recursion: local_irq_restore(flags); } -- cgit v1.2.3