diff options
Diffstat (limited to 'kernel/trace/trace_syscalls.c')
| -rw-r--r-- | kernel/trace/trace_syscalls.c | 110 |
1 files changed, 58 insertions, 52 deletions
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8ad72e17d8eb..e98ee7e1e66f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1371,33 +1371,33 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, +static int perf_call_bpf_enter(struct trace_event_call *call, struct syscall_metadata *sys_data, - struct syscall_trace_enter *rec) + int syscall_nr, unsigned long *args) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long args[SYSCALL_DEFINE_MAXARGS]; } __aligned(8) param; + struct pt_regs regs = {}; int i; BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); - /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ - perf_fetch_caller_regs(regs); - *(struct pt_regs **)¶m = regs; - param.syscall_nr = rec->nr; + /* bpf prog requires 'regs' to be the first member in the ctx */ + perf_fetch_caller_regs(®s); + *(struct pt_regs **)¶m = ®s; + param.syscall_nr = syscall_nr; for (i = 0; i < sys_data->nb_args; i++) - param.args[i] = rec->args[i]; - return trace_call_bpf(call, ¶m); + param.args[i] = args[i]; + return trace_call_bpf_faultable(call, ¶m); } static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; - struct pt_regs *fake_regs; struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; @@ -1410,12 +1410,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size = 0; int uargs = 0; - /* - * Syscall probe called with preemption enabled, but the ring - * buffer and per-cpu data require preemption to be disabled. - */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -1429,6 +1424,26 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, args); + /* + * Run BPF program in faultable context before per-cpu buffer + * allocation, allowing sleepable BPF programs to execute. + */ + valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); + if (valid_prog_array && + !perf_call_bpf_enter(sys_data->enter_event, sys_data, + syscall_nr, args)) + return; + + /* + * Per-cpu ring buffer and perf event list operations require + * preemption to be disabled. + */ + guard(preempt_notrace)(); + + head = this_cpu_ptr(sys_data->enter_event->perf_events); + if (hlist_empty(head)) + return; + /* Check if this syscall event faults in user space memory */ mayfault = sys_data->user_mask != 0; @@ -1438,17 +1453,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; } - head = this_cpu_ptr(sys_data->enter_event->perf_events); - valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); - if (!valid_prog_array && hlist_empty(head)) - return; - /* get the size after alignment with the u32 buffer size field */ size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; @@ -1458,13 +1468,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (mayfault) syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs); - if ((valid_prog_array && - !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || - hlist_empty(head)) { - perf_swevent_put_recursion_context(rctx); - return; - } - perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, head, NULL); @@ -1514,40 +1517,35 @@ static void perf_sysenter_disable(struct trace_event_call *call) syscall_fault_buffer_disable(); } -static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, - struct syscall_trace_exit *rec) +static int perf_call_bpf_exit(struct trace_event_call *call, + int syscall_nr, long ret_val) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long ret; } __aligned(8) param; - - /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ - perf_fetch_caller_regs(regs); - *(struct pt_regs **)¶m = regs; - param.syscall_nr = rec->nr; - param.ret = rec->ret; - return trace_call_bpf(call, ¶m); + struct pt_regs regs = {}; + + /* bpf prog requires 'regs' to be the first member in the ctx */ + perf_fetch_caller_regs(®s); + *(struct pt_regs **)¶m = ®s; + param.syscall_nr = syscall_nr; + param.ret = ret_val; + return trace_call_bpf_faultable(call, ¶m); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; - struct pt_regs *fake_regs; struct hlist_head *head; bool valid_prog_array; int syscall_nr; int rctx; int size; - /* - * Syscall probe called with preemption enabled, but the ring - * buffer and per-cpu data require preemption to be disabled. - */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -1559,29 +1557,37 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; - head = this_cpu_ptr(sys_data->exit_event->perf_events); + /* + * Run BPF program in faultable context before per-cpu buffer + * allocation, allowing sleepable BPF programs to execute. + */ valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); - if (!valid_prog_array && hlist_empty(head)) + if (valid_prog_array && + !perf_call_bpf_exit(sys_data->exit_event, syscall_nr, + syscall_get_return_value(current, regs))) + return; + + /* + * Per-cpu ring buffer and perf event list operations require + * preemption to be disabled. + */ + guard(preempt_notrace)(); + + head = this_cpu_ptr(sys_data->exit_event->perf_events); + if (hlist_empty(head)) return; /* We can probably do that at build time */ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - if ((valid_prog_array && - !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) || - hlist_empty(head)) { - perf_swevent_put_recursion_context(rctx); - return; - } - perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, 1, regs, head, NULL); } |
