diff options
Diffstat (limited to 'kernel/trace')
26 files changed, 1250 insertions, 475 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e130da35808f..084f34dc6c9f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1202,6 +1202,40 @@ config RING_BUFFER_VALIDATE_TIME_DELTAS Only say Y if you understand what this does, and you still want it enabled. Otherwise say N +config RING_BUFFER_PERSISTENT_INJECT + bool "Enable persistent ring buffer error injection test" + depends on RING_BUFFER + help + This option will have the kernel check if the persistent ring + buffer is named "ptracingtest", and if so, it will corrupt some + of its pages on a kernel panic. This is used to test if the + persistent ring buffer can recover from some of its sub-buffers + being corrupted. + To use this, boot a kernel with a "ptracingtest" persistent + ring buffer, e.g. + + reserve_mem=20M:2M:trace trace_instance=ptracingtest@trace panic=1 + + And after the 1st boot, run the following commands: + + cd /sys/kernel/tracing/instances/ptracingtest + echo 1 > events/enable + echo 1 > tracing_on + sleep 3 + echo c > /proc/sysrq-trigger + + After the panic message, the kernel will reboot and will show + the test results in the console output. + + Note that events for the test ring buffer need to be enabled + prior to crashing the kernel so that the ring buffer has content + that the test will corrupt. + As the test will corrupt events in the "ptracingtest" persistent + ring buffer, it should not be used for any purpose other + than this test. + + If unsure, say N + config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1decdce8cbef..f934ff586bd4 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -48,9 +48,10 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE GCOV_PROFILE := y endif -# Functions in this file could be invoked from early interrupt -# code and produce random code coverage.
+# Functions in these files can run from IRQ entry before hardirq context +# is visible to KCOV, and produce coverage unrelated to syscall inputs. KCOV_INSTRUMENT_trace_preemptirq.o := n +KCOV_INSTRUMENT_trace_irqsoff.o := n CFLAGS_bpf_trace.o := -I$(src) @@ -143,8 +144,8 @@ obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o targets += undefsyms_base.o KASAN_SANITIZE_undefsyms_base.o := y -UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \ - __msan simple_ring_buffer \ +UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \ + __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \ $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}') quiet_cmd_check_undefined = NM $< @@ -154,7 +155,8 @@ quiet_cmd_check_undefined = NM $< echo "Unexpected symbols in $<:" >&2; \ echo "$$undefsyms" >&2; \ false; \ - fi + fi; \ + touch $@ $(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE $(call if_changed,check_undefined) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index af7079aa0f36..82f8feea6931 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -23,6 +23,7 @@ #include <linux/sort.h> #include <linux/key.h> #include <linux/namei.h> +#include <linux/file.h> #include <net/bpf_sk_storage.h> @@ -42,6 +43,7 @@ #define MAX_UPROBE_MULTI_CNT (1U << 20) #define MAX_KPROBE_MULTI_CNT (1U << 20) +#define MAX_TRACING_MULTI_CNT (1U << 20) #ifdef CONFIG_MODULES struct bpf_trace_module { @@ -152,6 +154,34 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) return ret; } +/** + * trace_call_bpf_faultable - invoke BPF program in faultable context + * @call: tracepoint event + * @ctx: opaque context pointer + * + * Variant of trace_call_bpf() for faultable tracepoints (syscall + * tracepoints). 
Supports sleepable BPF programs by using rcu_tasks_trace + * for lifetime protection and bpf_prog_run_array_sleepable() for per-program + * RCU flavor selection, following the uprobe pattern. + * + * Per-program recursion protection is provided by + * bpf_prog_run_array_sleepable(). Global bpf_prog_active is not + * needed because syscall tracepoints cannot self-recurse. + * + * Must be called from a faultable/preemptible context. + */ +unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx) +{ + struct bpf_prog_array *prog_array; + + might_fault(); + guard(rcu_tasks_trace)(); + + prog_array = rcu_dereference_check(call->prog_array, + rcu_read_lock_trace_held()); + return bpf_prog_run_array_sleepable(prog_array, ctx, bpf_prog_run); +} + #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { @@ -1305,7 +1335,8 @@ static inline bool is_uprobe_session(const struct bpf_prog *prog) static inline bool is_trace_fsession(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_TRACING && - prog->expected_attach_type == BPF_TRACE_FSESSION; + (prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI); } static const struct bpf_func_proto * @@ -2072,11 +2103,19 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) static __always_inline void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { + struct srcu_ctr __percpu *scp = NULL; struct bpf_prog *prog = link->link.prog; + bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; - rcu_read_lock_dont_migrate(); + if (sleepable) { + scp = rcu_read_lock_tasks_trace(); + migrate_disable(); + } else { + rcu_read_lock_dont_migrate(); + } + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; @@ -2085,12 +2124,18 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) 
run_ctx.bpf_cookie = link->cookie; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - (void) bpf_prog_run(prog, args); + (void)bpf_prog_run(prog, args); bpf_reset_run_ctx(old_run_ctx); out: bpf_prog_put_recursion_context(prog); - rcu_read_unlock_migrate(); + + if (sleepable) { + migrate_enable(); + rcu_read_unlock_tasks_trace(scp); + } else { + rcu_read_unlock_migrate(); + } } #define UNPACK(...) __VA_ARGS__ @@ -2384,7 +2429,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link) struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); - unregister_fprobe(&kmulti_link->fp); + /* Don't wait for RCU GP here. */ + unregister_fprobe_async(&kmulti_link->fp); kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } @@ -3169,6 +3215,38 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) return run_ctx->uprobe->cookie; } +static int bpf_uprobe_multi_get_path(const union bpf_attr *attr, struct path *path) +{ + void __user *upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); + u32 path_fd = attr->link_create.uprobe_multi.path_fd; + u32 flags = attr->link_create.uprobe_multi.flags; + + if (flags & BPF_F_UPROBE_MULTI_PATH_FD) { + /* + * When BPF_F_UPROBE_MULTI_PATH_FD is set, the executable is + * identified by path_fd, upath must be NULL. + */ + if (upath) + return -EINVAL; + + CLASS(fd, f)(path_fd); + if (fd_empty(f)) + return -EBADF; + *path = fd_file(f)->f_path; + path_get(path); + return 0; + } + + /* + * When BPF_F_UPROBE_MULTI_PATH_FD is not set, the path is resolved + * relative to the cwd (AT_FDCWD) or absolute using the upath string. 
+ */ + if (!upath || path_fd) + return -EINVAL; + + return user_path_at(AT_FDCWD, upath, LOOKUP_FOLLOW, path); +} + int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_uprobe_multi_link *link = NULL; @@ -3178,10 +3256,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr struct task_struct *task = NULL; unsigned long __user *uoffsets; u64 __user *ucookies; - void __user *upath; + unsigned long size; u32 flags, cnt, i; struct path path; - char *name; pid_t pid; int err; @@ -3196,19 +3273,18 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr return -EINVAL; flags = attr->link_create.uprobe_multi.flags; - if (flags & ~BPF_F_UPROBE_MULTI_RETURN) + if (flags & ~(BPF_F_UPROBE_MULTI_RETURN | BPF_F_UPROBE_MULTI_PATH_FD)) return -EINVAL; /* - * path, offsets and cnt are mandatory, + * offsets and cnt are mandatory, * ref_ctr_offsets and cookies are optional */ - upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); cnt = attr->link_create.uprobe_multi.cnt; pid = attr->link_create.uprobe_multi.pid; - if (!upath || !uoffsets || !cnt || pid < 0) + if (!uoffsets || !cnt || pid < 0) return -EINVAL; if (cnt > MAX_UPROBE_MULTI_CNT) return -E2BIG; @@ -3216,14 +3292,17 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies); - name = strndup_user(upath, PATH_MAX); - if (IS_ERR(name)) { - err = PTR_ERR(name); - return err; - } + /* + * All uoffsets/uref_ctr_offsets/ucookies arrays have the same value + * size, we need to check their address range is safe for __get_user + * calls. 
+ */ + size = sizeof(*uoffsets) * cnt; + if (!access_ok(uoffsets, size) || !access_ok(uref_ctr_offsets, size) || + !access_ok(ucookies, size)) + return -EFAULT; - err = kern_path(name, LOOKUP_FOLLOW, &path); - kfree(name); + err = bpf_uprobe_multi_get_path(attr, &path); if (err) return err; @@ -3397,12 +3476,12 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc * direct calls into all the specific callback implementations * (copy_user_data_sleepable, copy_user_data_nofault, and so on) */ -static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, +static __always_inline int __bpf_dynptr_copy_str(const struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t str_copy_fn, struct task_struct *tsk) { - struct bpf_dynptr_kern *dst; + const struct bpf_dynptr_kern *dst; u64 chunk_sz, off; void *dst_slice; int cnt, err; @@ -3438,7 +3517,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 u64 size, const void *unsafe_src, copy_fn_t copy_fn, struct task_struct *tsk) { - struct bpf_dynptr_kern *dst; + const struct bpf_dynptr_kern *dst; void *dst_slice; char buf[256]; u64 off, chunk_sz; @@ -3539,49 +3618,49 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } -__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_user_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_kernel_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, copy_kernel_data_nofault, NULL); 
} -__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, copy_kernel_str_nofault, NULL); } -__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_task_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { @@ -3589,7 +3668,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, copy_user_data_sleepable, tsk); } -__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { @@ -3598,3 +3677,203 @@ __bpf_kfunc int 
bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 } __bpf_kfunc_end_defs(); + +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \ + defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS) + +static void bpf_tracing_multi_link_release(struct bpf_link *link) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + + WARN_ON_ONCE(bpf_trampoline_multi_detach(link->prog, tr_link)); +} + +static void bpf_tracing_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + + kvfree(tr_link->fexits); + kvfree(tr_link->cookies); + kvfree(tr_link); +} + +#ifdef CONFIG_PROC_FS +static void bpf_tracing_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + bool has_cookies = !!tr_link->cookies; + + seq_printf(seq, "attach_type:\t%u\n", tr_link->link.attach_type); + seq_printf(seq, "cnt:\t%u\n", tr_link->nodes_cnt); + + seq_printf(seq, "%s\t %s\t %s\t %s\n", "obj-id", "btf-id", "cookie", "func"); + for (int i = 0; i < tr_link->nodes_cnt; i++) { + struct bpf_tracing_multi_node *mnode = &tr_link->nodes[i]; + u32 btf_id, obj_id; + + bpf_trampoline_unpack_key(mnode->trampoline->key, &obj_id, &btf_id); + seq_printf(seq, "%u\t %u\t %llu\t %pS\n", + obj_id, btf_id, + has_cookies ? 
tr_link->cookies[i] : 0, + (void *) mnode->trampoline->ip); + + cond_resched(); + } +} +#endif + +static const struct bpf_link_ops bpf_tracing_multi_link_lops = { + .release = bpf_tracing_multi_link_release, + .dealloc_deferred = bpf_tracing_multi_link_dealloc, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_tracing_multi_show_fdinfo, +#endif +}; + +static int ids_cmp_r(const void *pa, const void *pb, const void *priv __maybe_unused) +{ + u32 a = *(u32 *) pa; + u32 b = *(u32 *) pb; + + return (a > b) - (a < b); +} + +static void ids_swap_r(void *a, void *b, int size __maybe_unused, + const void *priv __maybe_unused) +{ + u64 *cookie_a, *cookie_b, *cookies; + u32 *id_a = a, *id_b = b, *ids; + void **data = (void **) priv; + + ids = data[0]; + cookies = data[1]; + + if (cookies) { + cookie_a = cookies + (id_a - ids); + cookie_b = cookies + (id_b - ids); + swap(*cookie_a, *cookie_b); + } + swap(*id_a, *id_b); +} + +static int check_dup_ids(u32 *ids, u64 *cookies, u32 cnt) +{ + void *data[2] = { ids, cookies }; + int err = 0; + + /* + * Sort ids array (together with cookies array if defined) + * and check it for duplicates. The ids and cookies arrays + * are left sorted. 
+ */ + sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, data); + + for (int i = 1; i < cnt; i++) { + if (ids[i] == ids[i - 1]) { + err = -EINVAL; + break; + } + } + return err; +} + +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + struct bpf_tracing_multi_link *link = NULL; + struct bpf_tramp_node *fexits = NULL; + struct bpf_link_primer link_primer; + u32 cnt, *ids = NULL; + u64 __user *ucookies; + u64 *cookies = NULL; + u32 __user *uids; + int err; + + uids = u64_to_user_ptr(attr->link_create.tracing_multi.ids); + cnt = attr->link_create.tracing_multi.cnt; + + if (!cnt || !uids) + return -EINVAL; + if (cnt > MAX_TRACING_MULTI_CNT) + return -E2BIG; + if (attr->link_create.flags || attr->link_create.target_fd) + return -EINVAL; + + ids = kvmalloc_objs(*ids, cnt); + if (!ids) + return -ENOMEM; + + if (copy_from_user(ids, uids, cnt * sizeof(*ids))) { + err = -EFAULT; + goto error; + } + + ucookies = u64_to_user_ptr(attr->link_create.tracing_multi.cookies); + if (ucookies) { + cookies = kvmalloc_objs(*cookies, cnt); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, cnt * sizeof(*cookies))) { + err = -EFAULT; + goto error; + } + } + + err = check_dup_ids(ids, cookies, cnt); + if (err) + goto error; + + if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) { + fexits = kvmalloc_objs(*fexits, cnt); + if (!fexits) { + err = -ENOMEM; + goto error; + } + } + + link = kvzalloc_flex(*link, nodes, cnt); + if (!link) { + err = -ENOMEM; + goto error; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING_MULTI, + &bpf_tracing_multi_link_lops, prog, prog->expected_attach_type); + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto error; + + link->nodes_cnt = cnt; + link->cookies = cookies; + link->fexits = fexits; + + err = bpf_trampoline_multi_attach(prog, ids, link); + kvfree(ids); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + return 
bpf_link_settle(&link_primer); + +error: + kvfree(fexits); + kvfree(cookies); + kvfree(ids); + kvfree(link); + return err; +} + +#else + +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */ diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index cc49ebd2a773..f378613ad120 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -1093,14 +1093,15 @@ static int unregister_fprobe_nolock(struct fprobe *fp) } /** - * unregister_fprobe() - Unregister fprobe. + * unregister_fprobe_async() - Unregister fprobe without RCU GP wait * @fp: A fprobe data structure to be unregistered. * * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will NOT wait until the fprobe is no longer used. * * Return 0 if @fp is unregistered successfully, -errno if not. */ -int unregister_fprobe(struct fprobe *fp) +int unregister_fprobe_async(struct fprobe *fp) { guard(mutex)(&fprobe_mutex); if (!fp || !fprobe_registered(fp)) @@ -1108,6 +1109,24 @@ int unregister_fprobe(struct fprobe *fp) return unregister_fprobe_nolock(fp); } + +/** + * unregister_fprobe() - Unregister fprobe with RCU GP wait + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will block until the fprobe is no longer used. + * + * Return 0 if @fp is unregistered successfully, -errno if not. 
+ */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret = unregister_fprobe_async(fp); + + if (!ret) + synchronize_rcu(); + return ret; +} EXPORT_SYMBOL_GPL(unregister_fprobe); static int __init fprobe_initcall(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b2611de3f594..f93e34dd2328 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1198,8 +1198,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) return __ftrace_lookup_ip(hash, ip); } -static void __add_hash_entry(struct ftrace_hash *hash, - struct ftrace_func_entry *entry) +void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { struct hlist_head *hhd; unsigned long key; @@ -1221,7 +1220,7 @@ add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigne entry->ip = ip; entry->direct = direct; - __add_hash_entry(hash, entry); + add_ftrace_hash_entry(hash, entry); return entry; } @@ -1249,6 +1248,25 @@ remove_hash_entry(struct ftrace_hash *hash, hash->count--; } +void ftrace_hash_remove(struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct hlist_head *hhd; + struct hlist_node *tn; + int size; + int i; + + if (!hash || !hash->count) + return; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hhd = &hash->buckets[i]; + hlist_for_each_entry_safe(entry, tn, hhd, hlist) + remove_hash_entry(hash, entry); + } + FTRACE_WARN_ON(hash->count); +} + static void ftrace_hash_clear(struct ftrace_hash *hash) { struct hlist_head *hhd; @@ -1458,7 +1476,7 @@ static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size) hhd = &src->buckets[i]; hlist_for_each_entry_safe(entry, tn, hhd, hlist) { remove_hash_entry(src, entry); - __add_hash_entry(new_hash, entry); + add_ftrace_hash_entry(new_hash, entry); } } return new_hash; @@ -5341,7 +5359,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, map->entry.ip = ip; map->data = data; - __add_hash_entry(&mapper->hash, 
&map->entry); + add_ftrace_hash_entry(&mapper->hash, &map->entry); return 0; } @@ -6288,11 +6306,16 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) } EXPORT_SYMBOL_GPL(modify_ftrace_direct); -static unsigned long hash_count(struct ftrace_hash *hash) +static inline unsigned long hash_count(struct ftrace_hash *hash) { return hash ? hash->count : 0; } +unsigned long ftrace_hash_count(struct ftrace_hash *hash) +{ + return hash_count(hash); +} + /** * hash_add - adds two struct ftrace_hash and returns the result * @a: struct ftrace_hash object diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c index 6c1b7701ddae..a3e2c9b606eb 100644 --- a/kernel/trace/remote_test.c +++ b/kernel/trace/remote_test.c @@ -110,9 +110,9 @@ static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unus return remote_test_buffer_desc; err_unload: - for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc) + for_each_ring_buffer_desc(rb_desc, cpu, desc) remote_test_unload_simple_rb(rb_desc->cpu); - trace_remote_free_buffer(remote_test_buffer_desc); + trace_remote_free_buffer(desc); err_free_desc: kfree(desc); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5326924615a4..56a328e94395 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7,6 +7,7 @@ #include <linux/ring_buffer_types.h> #include <linux/sched/isolation.h> #include <linux/trace_recursion.h> +#include <linux/panic_notifier.h> #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> @@ -31,6 +32,7 @@ #include <linux/oom.h> #include <linux/mm.h> +#include <asm/ring_buffer.h> #include <asm/local64.h> #include <asm/local.h> #include <asm/setup.h> @@ -62,6 +64,10 @@ struct ring_buffer_cpu_meta { unsigned long commit_buffer; __u32 subbuf_size; __u32 nr_subbufs; +#ifdef CONFIG_RING_BUFFER_PERSISTENT_INJECT + __u32 nr_invalid; + __u32 entry_bytes; +#endif int buffers[]; }; @@ -358,14 +364,30 
@@ struct buffer_page { #define RB_WRITE_MASK 0xfffff #define RB_WRITE_INTCNT (1 << 20) -static void rb_init_page(struct buffer_data_page *bpage) +static void rb_init_data_page(struct buffer_data_page *bpage) { local_set(&bpage->commit, 0); + bpage->time_stamp = 0; +} + +static __always_inline long rb_data_page_commit(struct buffer_data_page *dpage) +{ + return local_read(&dpage->commit); +} + +static __always_inline long rb_data_page_size(struct buffer_data_page *dpage) +{ + return rb_data_page_commit(dpage) & ~RB_MISSED_MASK; } static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) { - return local_read(&bpage->page->commit); + return rb_data_page_commit(bpage->page); +} + +static __always_inline unsigned int rb_page_size(struct buffer_page *bpage) +{ + return rb_data_page_size(bpage->page); } static void free_buffer_page(struct buffer_page *bpage) @@ -406,7 +428,7 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order) return NULL; dpage = page_address(page); - rb_init_page(dpage); + rb_init_data_page(dpage); return dpage; } @@ -559,6 +581,7 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; + struct notifier_block flush_nb; struct ring_buffer_meta *meta; @@ -645,7 +668,7 @@ static void verify_event(struct ring_buffer_per_cpu *cpu_buffer, do { if (page == tail_page || WARN_ON_ONCE(stop++ > 100)) done = true; - commit = local_read(&page->page->commit); + commit = rb_page_commit(page); write = local_read(&page->write); if (addr >= (unsigned long)&page->page->data[commit] && addr < (unsigned long)&page->page->data[write]) @@ -1759,7 +1782,6 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu, unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; - struct buffer_data_page *subbuf; unsigned long buffers_start; unsigned long buffers_end; int i; @@ -1767,6 +1789,11 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu, if (!subbuf_mask) return 
false; + if (meta->subbuf_size != PAGE_SIZE) { + pr_info("Ring buffer boot meta [%d] invalid subbuf_size\n", cpu); + return false; + } + buffers_start = meta->first_buffer; buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs); @@ -1783,11 +1810,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu, return false; } - subbuf = rb_subbufs_from_meta(meta); - bitmap_clear(subbuf_mask, 0, meta->nr_subbufs); - /* Is the meta buffers and the subbufs themselves have correct data? */ + /* + * Ensure the meta::buffers array has correct data. The data in each subbuf + * is checked later in rb_meta_validate_events(). + */ for (i = 0; i < meta->nr_subbufs; i++) { if (meta->buffers[i] < 0 || meta->buffers[i] >= meta->nr_subbufs) { @@ -1795,18 +1823,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu, return false; } - if ((unsigned)local_read(&subbuf->commit) > subbuf_size) { - pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu); - return false; - } - if (test_bit(meta->buffers[i], subbuf_mask)) { pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu); return false; } set_bit(meta->buffers[i], subbuf_mask); - subbuf = (void *)subbuf + subbuf_size; } return true; @@ -1870,14 +1892,138 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu return events; } -static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) +struct rb_validation_state { + unsigned long entries; + unsigned long entry_bytes; + int discarded; + u64 ts; +}; + +static int __rb_validate_buffer(struct buffer_page *bpage, int cpu, + struct ring_buffer_cpu_meta *meta, + u64 prev_ts, u64 next_ts) { + struct buffer_data_page *dpage = bpage->page; unsigned long long ts; + unsigned long tail; u64 delta; - int tail; + int ret; + + /* + * When a sub-buffer is recovered from a read, the commit value may + * have RB_MISSED_* bits set, as these bits are reset on reuse.
+ * Even after clearing these bits, a commit value greater than the + * subbuf_size is considered invalid. + */ + tail = rb_data_page_size(dpage); + if (tail <= meta->subbuf_size - BUF_PAGE_HDR_SIZE) + ret = rb_read_data_buffer(dpage, tail, cpu, &ts, &delta); + else + ret = -1; + + /* + * The timestamp must be greater than @prev_ts and smaller than @next_ts. + * Since this function works in both forward (verify) and reverse (unwind) + * loop, we don't know both @prev_ts and @next_ts at the same time. + * So use the known boundary as the boundary. + */ + if (ret < 0 || (prev_ts && prev_ts > ts) || (next_ts && ts > next_ts)) { + local_set(&bpage->entries, 0); + /* + * Note, the RB_MISSED_EVENTS is only set inside the main write + * buffer by this verification logic. The normal ring buffer + * has this bit set when the page is read and passed to the + * consumers. + */ + local_set(&dpage->commit, RB_MISSED_EVENTS); + dpage->time_stamp = prev_ts ? prev_ts : next_ts; + ret = -1; + } else { + local_set(&bpage->entries, ret); + } + + return ret; +} + +/** + * rb_validate_buffer - validates a single buffer page and updates the state. + * @bpage: buffer page to validate + * @cpu_buffer: cpu_buffer this page belongs to + * @meta: meta of the cpu_buffer + * @state: validation state + * @prev_ts: previous buffer's timestamp (optional) + * @next_ts: next buffer's timestamp (optional) + * + * If the page is invalid (wrong event length or timestamp), it increments the + * discarded counter and logs the first occurrence. Otherwise, it updates the validation state.
+ */ +static void rb_validate_buffer(struct buffer_page *bpage, + struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_cpu_meta *meta, + struct rb_validation_state *state, + u64 prev_ts, u64 next_ts) +{ + int ret; + + ret = __rb_validate_buffer(bpage, cpu_buffer->cpu, meta, prev_ts, next_ts); + if (ret < 0) { + if (!state->discarded) + pr_info("Ring buffer meta [%d] invalid buffer page detected\n", + cpu_buffer->cpu); + state->discarded++; + } else { + /* If the buffer has content, update pages_touched */ + if (ret) + local_inc(&cpu_buffer->pages_touched); + + state->entries += ret; + state->entry_bytes += rb_page_size(bpage); + state->ts = bpage->page->time_stamp; + } +} - tail = local_read(&dpage->commit); - return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta); +static void rb_meta_inject_reader_page(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_cpu_meta *meta, + struct buffer_page *orig_head, + struct buffer_page *head_page) +{ + struct buffer_page *bpage = orig_head; + int i; + + rb_dec_page(&bpage); + /* + * Insert the reader_page before the original head page. + * Since the list encode RB_PAGE flags, general list + * operations should be avoided. 
+ */ + cpu_buffer->reader_page->list.next = &orig_head->list; + cpu_buffer->reader_page->list.prev = orig_head->list.prev; + orig_head->list.prev = &cpu_buffer->reader_page->list; + bpage->list.next = &cpu_buffer->reader_page->list; + + /* Make the head_page the reader page */ + cpu_buffer->reader_page = head_page; + bpage = head_page; + rb_inc_page(&head_page); + head_page->list.prev = bpage->list.prev; + rb_dec_page(&bpage); + bpage->list.next = &head_page->list; + rb_set_list_to_head(&bpage->list); + cpu_buffer->pages = &head_page->list; + + cpu_buffer->head_page = head_page; + meta->head_buffer = (unsigned long)head_page->page; + + /* Reset all the indexes */ + bpage = cpu_buffer->reader_page; + meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = 0; + + for (i = 1, bpage = head_page; i < meta->nr_subbufs; + i++, rb_inc_page(&bpage)) { + meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = i; + } } /* If the meta data has been validated, now validate the events */ @@ -1885,10 +2031,9 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; struct buffer_page *head_page, *orig_head, *orig_reader; - unsigned long entry_bytes = 0; - unsigned long entries = 0; + struct rb_validation_state state = { 0 }; + bool skip = false; int ret; - u64 ts; int i; if (!meta || !meta->head_buffer) @@ -1897,20 +2042,26 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) orig_head = head_page = cpu_buffer->head_page; orig_reader = cpu_buffer->reader_page; - /* Do the reader page first */ - ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu); + /* Do the head page first */ + ret = __rb_validate_buffer(head_page, cpu_buffer->cpu, meta, 0, 0); if (ret < 0) { - pr_info("Ring buffer reader page is invalid\n"); - goto invalid; + pr_info("Ring buffer meta [%d] invalid head page detected\n", + cpu_buffer->cpu); + /* Don't bother 
rewinding */ + skip = true; + state.ts = 0; + } else { + state.ts = head_page->page->time_stamp; } - entries += ret; - entry_bytes += local_read(&orig_reader->page->commit); - local_set(&orig_reader->entries, ret); - ts = head_page->page->time_stamp; + /* Do the reader page - reader must be previous to head. */ + rb_validate_buffer(orig_reader, cpu_buffer, meta, &state, 0, state.ts); + + if (skip) + goto skip_rewind; /* - * Try to rewind the head so that we can read the pages which already + * Try to rewind the head so that we can read the pages which are already * read in the previous boot. */ if (head_page == cpu_buffer->tail_page) @@ -1923,26 +2074,15 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) if (head_page == cpu_buffer->tail_page) break; - /* Ensure the page has older data than head. */ - if (ts < head_page->page->time_stamp) - break; - - ts = head_page->page->time_stamp; - /* Ensure the page has correct timestamp and some data. */ - if (!ts || rb_page_commit(head_page) == 0) - break; - - /* Stop rewind if the page is invalid. */ - ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); - if (ret < 0) + /* Rewind until unused page (no timestamp, no commit). */ + if (!head_page->page->time_stamp && rb_page_commit(head_page) == 0) break; - /* Recover the number of entries and update stats. */ - local_set(&head_page->entries, ret); - if (ret) - local_inc(&cpu_buffer->pages_touched); - entries += ret; - entry_bytes += rb_page_commit(head_page); + /* + * Skip if the page is invalid, or its timestamp is newer than the + * previous valid page. + */ + rb_validate_buffer(head_page, cpu_buffer, meta, &state, 0, state.ts); } if (i) pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i); @@ -1956,43 +2096,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) * into the location just before the original head page. 
*/ if (head_page != orig_head) { - struct buffer_page *bpage = orig_head; - - rb_dec_page(&bpage); - /* - * Insert the reader_page before the original head page. - * Since the list encode RB_PAGE flags, general list - * operations should be avoided. - */ - cpu_buffer->reader_page->list.next = &orig_head->list; - cpu_buffer->reader_page->list.prev = orig_head->list.prev; - orig_head->list.prev = &cpu_buffer->reader_page->list; - bpage->list.next = &cpu_buffer->reader_page->list; - - /* Make the head_page the reader page */ - cpu_buffer->reader_page = head_page; - bpage = head_page; - rb_inc_page(&head_page); - head_page->list.prev = bpage->list.prev; - rb_dec_page(&bpage); - bpage->list.next = &head_page->list; - rb_set_list_to_head(&bpage->list); - cpu_buffer->pages = &head_page->list; - - cpu_buffer->head_page = head_page; - meta->head_buffer = (unsigned long)head_page->page; - - /* Reset all the indexes */ - bpage = cpu_buffer->reader_page; - meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page); - bpage->id = 0; - - for (i = 1, bpage = head_page; i < meta->nr_subbufs; - i++, rb_inc_page(&bpage)) { - meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page); - bpage->id = i; - } - + rb_meta_inject_reader_page(cpu_buffer, meta, orig_head, head_page); /* We'll restart verifying from orig_head */ head_page = orig_head; } @@ -2004,6 +2108,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) /* Nothing more to do, the only page is the reader page */ goto done; } + state.ts = head_page->page->time_stamp; /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { @@ -2012,20 +2117,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) if (head_page == orig_reader) continue; - ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); - if (ret < 0) { - pr_info("Ring buffer meta [%d] invalid buffer page\n", - cpu_buffer->cpu); - goto invalid; - } - - /* If the 
buffer has content, update pages_touched */ - if (ret) - local_inc(&cpu_buffer->pages_touched); - - entries += ret; - entry_bytes += local_read(&head_page->page->commit); - local_set(&head_page->entries, ret); + rb_validate_buffer(head_page, cpu_buffer, meta, &state, state.ts, 0); if (head_page == cpu_buffer->commit_page) break; @@ -2037,10 +2129,28 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) goto invalid; } done: - local_set(&cpu_buffer->entries, entries); - local_set(&cpu_buffer->entries_bytes, entry_bytes); - - pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu); + local_set(&cpu_buffer->entries, state.entries); + local_set(&cpu_buffer->entries_bytes, state.entry_bytes); + + pr_info("Ring buffer meta [%d] is from previous boot!", cpu_buffer->cpu); + if (state.discarded) + pr_cont(" (%d pages discarded)", state.discarded); + pr_cont("\n"); + +#ifdef CONFIG_RING_BUFFER_PERSISTENT_INJECT + if (meta->nr_invalid) + pr_warn("Ring buffer testing [%d] invalid pages: %s (%d/%d)\n", + cpu_buffer->cpu, + (state.discarded == meta->nr_invalid) ? "PASSED" : "FAILED", + state.discarded, meta->nr_invalid); + if (meta->entry_bytes) + pr_warn("Ring buffer testing [%d] entry_bytes: %s (%ld/%ld)\n", + cpu_buffer->cpu, + (state.entry_bytes == meta->entry_bytes) ? 
"PASSED" : "FAILED", + (long)state.entry_bytes, (long)meta->entry_bytes); + meta->nr_invalid = 0; + meta->entry_bytes = 0; +#endif return; invalid: @@ -2050,12 +2160,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) /* Reset the reader page */ local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); + rb_init_data_page(cpu_buffer->reader_page->page); /* Reset all the subbuffers */ for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) { local_set(&head_page->entries, 0); - local_set(&head_page->page->commit, 0); + rb_init_data_page(head_page->page); } } @@ -2115,7 +2225,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int sc */ for (i = 0; i < meta->nr_subbufs; i++) { meta->buffers[i] = i; - rb_init_page(subbuf); + rb_init_data_page(subbuf); subbuf += meta->subbuf_size; } } @@ -2152,6 +2262,7 @@ static int rbm_show(struct seq_file *m, void *v) struct ring_buffer_per_cpu *cpu_buffer = m->private; struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val = (unsigned long)v; + struct buffer_data_page *dpage; if (val == 1) { seq_printf(m, "head_buffer: %d\n", @@ -2164,7 +2275,9 @@ static int rbm_show(struct seq_file *m, void *v) } val -= 2; - seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]); + dpage = rb_range_buffer(cpu_buffer, val); + seq_printf(m, "buffer[%ld]: %d (commit: %ld)\n", + val, meta->buffers[val], dpage ? 
rb_data_page_commit(dpage) : -1); return 0; } @@ -2521,6 +2634,76 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } +#ifdef CONFIG_RING_BUFFER_PERSISTENT_INJECT +static void rb_test_inject_invalid_pages(struct trace_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_cpu_meta *meta; + struct buffer_data_page *dpage; + unsigned long entry_bytes = 0; + unsigned long ptr; + int subbuf_size; + int invalid = 0; + int cpu; + int i; + + if (!(buffer->flags & RB_FL_TESTING)) + return; + + guard(preempt)(); + cpu = smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + if (!cpu_buffer) + return; + meta = cpu_buffer->ring_meta; + if (!meta) + return; + + ptr = (unsigned long)rb_subbufs_from_meta(meta); + subbuf_size = meta->subbuf_size; + + for (i = 0; i < meta->nr_subbufs; i++) { + unsigned long idx = meta->buffers[i]; + + dpage = (void *)(ptr + idx * subbuf_size); + /* Skip unused pages */ + if (!rb_data_page_commit(dpage)) + continue; + + /* + * Invalidate even pages or multiples of 5. This will cause 3 + * contiguous invalidated(empty) pages. + */ + if (!(i & 0x1) || !(i % 5)) { + local_add(subbuf_size + 1, &dpage->commit); + invalid++; + } else { + /* Count total commit bytes. */ + entry_bytes += rb_data_page_size(dpage); + } + } + + pr_info("Inject invalidated %d pages on CPU%d, total size: %ld\n", + invalid, cpu, (long)entry_bytes); + meta->nr_invalid = invalid; + meta->entry_bytes = entry_bytes; +} +#else /* !CONFIG_RING_BUFFER_PERSISTENT_INJECT */ +#define rb_test_inject_invalid_pages(buffer) do { } while (0) +#endif + +/* Stop recording on a persistent buffer and flush cache if needed. 
*/ +static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data) +{ + struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb); + + ring_buffer_record_off(buffer); + rb_test_inject_invalid_pages(buffer); + arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end); + return NOTIFY_DONE; +} + static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, @@ -2651,6 +2834,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); + /* Persistent ring buffer needs to flush cache before reboot. */ + if (start && end) { + buffer->flush_nb.notifier_call = rb_flush_buffer_cb; + atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb); + } + return_ptr(buffer); fail_free_buffers: @@ -2749,6 +2938,9 @@ ring_buffer_free(struct trace_buffer *buffer) { int cpu; + if (buffer->range_addr_start && buffer->range_addr_end) + atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb); + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); irq_work_sync(&buffer->irq_work.work); @@ -3265,7 +3457,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) * is a mb(), which will synchronize with the rmb here. 
* (see rb_tail_page_update() and __rb_reserve_next()) */ - commit = rb_page_commit(iter_head_page); + commit = rb_page_size(iter_head_page); smp_rmb(); /* An event needs to be at least 8 bytes in size */ @@ -3294,7 +3486,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) /* Make sure the page didn't change since we read this */ if (iter->page_stamp != iter_head_page->page->time_stamp || - commit > rb_page_commit(iter_head_page)) + commit > rb_page_size(iter_head_page)) goto reset; iter->next_event = iter->head + length; @@ -3308,12 +3500,6 @@ rb_iter_head_event(struct ring_buffer_iter *iter) return NULL; } -/* Size is determined by what has been committed */ -static __always_inline unsigned rb_page_size(struct buffer_page *bpage) -{ - return rb_page_commit(bpage) & ~RB_MISSED_MASK; -} - static __always_inline unsigned rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) { @@ -3345,6 +3531,9 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) else rb_inc_page(&iter->head_page); + if (rb_page_commit(iter->head_page) & RB_MISSED_EVENTS) + iter->missed_events = -1; + iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; iter->next_event = 0; @@ -3769,13 +3958,6 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return skip_time_extend(event); } -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline bool sched_clock_stable(void) -{ - return true; -} -#endif - static void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) @@ -4023,8 +4205,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); RB_WARN_ON(cpu_buffer, - local_read(&cpu_buffer->commit_page->page->commit) & - ~RB_WRITE_MASK); + rb_page_commit(cpu_buffer->commit_page) & ~RB_WRITE_MASK); barrier(); } @@ -4396,7 +4577,7 @@ static const char *show_interrupt_level(void) return show_irq_str(level); } -static void 
dump_buffer_page(struct buffer_data_page *bpage, +static void dump_buffer_page(struct buffer_data_page *dpage, struct rb_event_info *info, unsigned long tail) { @@ -4404,12 +4585,12 @@ static void dump_buffer_page(struct buffer_data_page *bpage, u64 ts, delta; int e; - ts = bpage->time_stamp; + ts = dpage->time_stamp; pr_warn(" [%lld] PAGE TIME STAMP\n", ts); for (e = 0; e < tail; e += rb_event_length(event)) { - event = (struct ring_buffer_event *)(bpage->data + e); + event = (struct ring_buffer_event *)(dpage->data + e); switch (event->type_len) { @@ -4459,7 +4640,7 @@ static atomic_t ts_dump; } \ atomic_inc(&cpu_buffer->record_disabled); \ pr_warn(fmt, ##__VA_ARGS__); \ - dump_buffer_page(bpage, info, tail); \ + dump_buffer_page(dpage, info, tail); \ atomic_dec(&ts_dump); \ /* There's some cases in boot up that this can happen */ \ if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \ @@ -4475,16 +4656,16 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info, unsigned long tail) { - struct buffer_data_page *bpage; + struct buffer_data_page *dpage; u64 ts, delta; bool full = false; int ret; - bpage = info->tail_page->page; + dpage = info->tail_page->page; if (tail == CHECK_FULL_PAGE) { full = true; - tail = local_read(&bpage->commit); + tail = rb_data_page_commit(dpage); } else if (info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) { /* Ignore events with absolute time stamps */ @@ -4495,7 +4676,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, * Do not check the first event (skip possible extends too). * Also do not check if previous events have not been committed. 
*/ - if (tail <= 8 || tail > local_read(&bpage->commit)) + if (tail <= 8 || tail > rb_data_page_commit(dpage)) return; /* @@ -4504,7 +4685,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) goto out; - ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta); + ret = rb_read_data_buffer(dpage, tail, cpu_buffer->cpu, &ts, &delta); if (ret < 0) { if (delta < ts) { buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n", @@ -5407,6 +5588,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; iter->next_event = iter->head; + iter->missed_events = 0; iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; @@ -5471,7 +5653,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) * (see rb_tail_page_update()) */ smp_rmb(); - commit = rb_page_commit(commit_page); + commit = rb_page_size(commit_page); /* We want to make sure that the commit page doesn't change */ smp_rmb(); @@ -5613,10 +5795,12 @@ __rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer) static struct buffer_page * __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { - struct buffer_page *reader = NULL; + int max_loops = cpu_buffer->ring_meta ? cpu_buffer->nr_pages : 3; unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); + struct buffer_page *reader = NULL; unsigned long overwrite; unsigned long flags; + int missed_events = 0; int nr_loops = 0; bool ret; @@ -5626,11 +5810,14 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) again: /* * This should normally only loop twice. But because the - * start of the reader inserts an empty page, it causes - * a case where we will loop three times. There should be no - * reason to loop four times (that I know of). 
+ * start of the reader inserts an empty page, it causes a + * case where we will loop three times. There should be no + * reason to loop four times unless the ring buffer is a + * recovered persistent ring buffer. For persistent ring buffers, + * invalid pages are reset during recovery, so more + * than 3 contiguous pages can be empty, but fewer than nr_pages. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { + if (RB_WARN_ON(cpu_buffer, ++nr_loops > max_loops)) { reader = NULL; goto out; } @@ -5660,6 +5847,7 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); + rb_init_data_page(cpu_buffer->reader_page->page); cpu_buffer->reader_page->real_end = 0; spin: @@ -5713,6 +5901,9 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) if (!ret) goto spin; + if (rb_page_commit(reader) & RB_MISSED_EVENTS) + missed_events = -1; + if (cpu_buffer->ring_meta) rb_update_meta_reader(cpu_buffer, reader); @@ -5777,6 +5968,8 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ smp_rmb(); + if (!cpu_buffer->lost_events) + cpu_buffer->lost_events = missed_events; return reader; } @@ -5927,12 +6120,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int nr_loops = 0; + int max_loops; if (ts) *ts = 0; cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; + max_loops = cpu_buffer->ring_meta ? cpu_buffer->nr_pages : 3; /* * Check if someone performed a consuming read to the buffer @@ -5955,7 +6150,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) * the ring buffer with an active write as the consumer is. * Do not warn if the three failures is reached.
*/ - if (++nr_loops > 3) + if (++nr_loops > max_loops) return NULL; if (rb_per_cpu_empty(cpu_buffer)) @@ -6086,10 +6281,7 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, */ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter) { - bool ret = iter->missed_events != 0; - - iter->missed_events = 0; - return ret; + return iter->missed_events != 0; } EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped); @@ -6251,7 +6443,7 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter) unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - + iter->missed_events = 0; rb_advance_iter(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); @@ -6291,7 +6483,7 @@ static void rb_clear_buffer_page(struct buffer_page *page) { local_set(&page->write, 0); local_set(&page->entries, 0); - rb_init_page(page->page); + rb_init_data_page(page->page); page->read = 0; } @@ -6776,7 +6968,7 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) local_irq_restore(flags); if (bpage->data) { - rb_init_page(bpage->data); + rb_init_data_page(bpage->data); } else { bpage->data = alloc_cpu_data(cpu, cpu_buffer->buffer->subbuf_order); if (!bpage->data) { @@ -6801,8 +6993,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, struct buffer_data_read_page *data_page) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_data_page *bpage = data_page->data; - struct page *page = virt_to_page(bpage); + struct buffer_data_page *dpage = data_page->data; + struct page *page = virt_to_page(dpage); unsigned long flags; if (!buffer || !buffer->buffers || !buffer->buffers[cpu]) @@ -6822,15 +7014,15 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, arch_spin_lock(&cpu_buffer->lock); if (!cpu_buffer->free_page) { - cpu_buffer->free_page = bpage; - bpage = NULL; + cpu_buffer->free_page = dpage; + dpage = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); out: - free_pages((unsigned 
long)bpage, data_page->order); + free_pages((unsigned long)dpage, data_page->order); kfree(data_page); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); @@ -6875,10 +7067,11 @@ int ring_buffer_read_page(struct trace_buffer *buffer, { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; - struct buffer_data_page *bpage; + struct buffer_data_page *dpage; struct buffer_page *reader; - unsigned long missed_events; + long missed_events; unsigned int commit; + unsigned int size; unsigned int read; u64 save_timestamp; bool force_memcpy; @@ -6901,8 +7094,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (data_page->order != buffer->subbuf_order) return -1; - bpage = data_page->data; - if (!bpage) + dpage = data_page->data; + if (!dpage) return -1; guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); @@ -6914,7 +7107,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, event = rb_reader_event(cpu_buffer); read = reader->read; - commit = rb_page_size(reader); + commit = rb_page_commit(reader); + size = rb_page_size(reader); /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; @@ -6928,13 +7122,14 @@ int ring_buffer_read_page(struct trace_buffer *buffer, * we must copy the data from the page to the buffer. * Otherwise, we can simply swap the page with the one passed in. */ - if (read || (len < (commit - read)) || + if (read || (len < (size - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || force_memcpy) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; - unsigned int size; + unsigned int event_size; + unsigned int flags = 0; /* * If a full page is expected, this can still be returned @@ -6943,19 +7138,22 @@ int ring_buffer_read_page(struct trace_buffer *buffer, * the reader page. 
*/ if (full && - (!read || (len < (commit - read)) || + (!read || (len < (size - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page)) return -1; - if (len > (commit - read)) - len = (commit - read); + if (len > (size - read)) + len = (size - read); /* Always keep the time extend and data together */ - size = rb_event_ts_length(event); + event_size = rb_event_ts_length(event); - if (len < size) + if (len < event_size) return -1; + if (commit & RB_MISSED_EVENTS) + flags = RB_MISSED_EVENTS; + /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; @@ -6967,26 +7165,26 @@ int ring_buffer_read_page(struct trace_buffer *buffer, * one or two events. * We have already ensured there's enough space if this * is a time extend. */ - size = rb_event_length(event); - memcpy(bpage->data + pos, rpage->data + rpos, size); + event_size = rb_event_length(event); + memcpy(dpage->data + pos, rpage->data + rpos, event_size); - len -= size; + len -= event_size; rb_advance_reader(cpu_buffer); rpos = reader->read; - pos += size; + pos += event_size; - if (rpos >= commit) + if (rpos >= size) break; event = rb_reader_event(cpu_buffer); /* Always keep the time extend and data together */ - size = rb_event_ts_length(event); - } while (len >= size); + event_size = rb_event_ts_length(event); + } while (len >= event_size); - /* update bpage */ - local_set(&bpage->commit, pos); - bpage->time_stamp = save_timestamp; + /* update dpage */ + local_set(&dpage->commit, pos | flags); + dpage->time_stamp = save_timestamp; /* we copied everything to the beginning */ read = 0; @@ -6996,13 +7194,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer, cpu_buffer->read_bytes += rb_page_size(reader); /* swap the pages */ - rb_init_page(bpage); - bpage = reader->page; + rb_init_data_page(dpage); + dpage = reader->page; reader->page = data_page->data; local_set(&reader->write, 0); local_set(&reader->entries, 0); reader->read = 0; -
data_page->data = bpage; + data_page->data = dpage; + if (!missed_events && rb_data_page_commit(dpage) & RB_MISSED_EVENTS) + missed_events = -1; /* * Use the real_end for the data size, @@ -7010,33 +7210,43 @@ int ring_buffer_read_page(struct trace_buffer *buffer, * on the page. */ if (reader->real_end) - local_set(&bpage->commit, reader->real_end); + local_set(&dpage->commit, reader->real_end); } cpu_buffer->lost_events = 0; - commit = local_read(&bpage->commit); + size = rb_data_page_size(dpage); /* * Set a flag in the commit field if we lost events */ if (missed_events) { - /* If there is room at the end of the page to save the + /* + * If there is room at the end of the page to save the * missed events, then record it there. */ - if (buffer->subbuf_size - commit >= sizeof(missed_events)) { - memcpy(&bpage->data[commit], &missed_events, + if (missed_events > 0 && + buffer->subbuf_size - size >= sizeof(missed_events)) { + memcpy(&dpage->data[size], &missed_events, sizeof(missed_events)); - local_add(RB_MISSED_STORED, &bpage->commit); - commit += sizeof(missed_events); + local_add(RB_MISSED_STORED, &dpage->commit); + size += sizeof(missed_events); } - local_add(RB_MISSED_EVENTS, &bpage->commit); + /* + * Note, for the persistent ring buffer, the RB_MISSED_EVENTS + * may have been set in the main buffer via the verification code. + * But here, dpage is a copy of that page and has not yet had + * the RB_MISSED_EVENTS set. As for the normal buffers, + * the main write buffer does not set these bits and it needs + * to be set here. + */ + local_add(RB_MISSED_EVENTS, &dpage->commit); } /* * This page may be off to user land. Zero it out here. 
*/ - if (commit < buffer->subbuf_size) - memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); + if (size < buffer->subbuf_size) + memset(&dpage->data[size], 0, buffer->subbuf_size - size); return read; } @@ -7667,7 +7877,7 @@ consume: if (missed_events) { if (cpu_buffer->reader_page != cpu_buffer->commit_page) { - struct buffer_data_page *bpage = reader->page; + struct buffer_data_page *dpage = reader->page; unsigned int commit; /* * Use the real_end for the data size, @@ -7675,18 +7885,18 @@ consume: * on the page. */ if (reader->real_end) - local_set(&bpage->commit, reader->real_end); + local_set(&dpage->commit, reader->real_end); /* * If there is room at the end of the page to save the * missed events, then record it there. */ commit = rb_page_size(reader); if (buffer->subbuf_size - commit >= sizeof(missed_events)) { - memcpy(&bpage->data[commit], &missed_events, + memcpy(&dpage->data[commit], &missed_events, sizeof(missed_events)); - local_add(RB_MISSED_STORED, &bpage->commit); + local_add(RB_MISSED_STORED, &dpage->commit); } - local_add(RB_MISSED_EVENTS, &bpage->commit); + local_add(RB_MISSED_EVENTS, &dpage->commit); } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page, "Reader on commit with %ld missed events", missed_events)) { diff --git a/kernel/trace/rv/monitors/deadline/deadline.h b/kernel/trace/rv/monitors/deadline/deadline.h index 0bbfd2543329..78fca873d61e 100644 --- a/kernel/trace/rv/monitors/deadline/deadline.h +++ b/kernel/trace/rv/monitors/deadline/deadline.h @@ -95,7 +95,8 @@ static inline u8 get_server_type(struct task_struct *tsk) static inline int extract_params(struct pt_regs *regs, long id, pid_t *pid_out) { size_t size = offsetofend(struct sched_attr, sched_flags); - struct sched_attr __user *uattr, attr; + struct sched_attr __user *uattr; + struct sched_attr attr; int new_policy = -1, ret; unsigned long args[6]; diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.c b/kernel/trace/rv/monitors/nomiss/nomiss.c 
index 31f90f3638d8..8ead8783c29f 100644 --- a/kernel/trace/rv/monitors/nomiss/nomiss.c +++ b/kernel/trace/rv/monitors/nomiss/nomiss.c @@ -227,7 +227,7 @@ static int enable_nomiss(void) { int retval; - retval = da_monitor_init(); + retval = ha_monitor_init(); if (retval) return retval; @@ -263,7 +263,7 @@ static void disable_nomiss(void) rv_detach_trace_probe("nomiss", sched_switch, handle_sched_switch); rv_detach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup); - da_monitor_destroy(); + ha_monitor_destroy(); } static struct rv_monitor rv_this = { diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c index 4594c7c46601..3b6a85e815b8 100644 --- a/kernel/trace/rv/monitors/opid/opid.c +++ b/kernel/trace/rv/monitors/opid/opid.c @@ -22,14 +22,8 @@ static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_opid env, u64 time_ns if (env == irq_off_opid) return irqs_disabled(); else if (env == preempt_off_opid) { - /* - * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables - * preemption (adding one to the preempt_count). Since we are - * interested in the preempt_count at the time the tracepoint was - * hit, we consider 1 as still enabled. 
- */ if (IS_ENABLED(CONFIG_PREEMPTION)) - return (preempt_count() & PREEMPT_MASK) > 1; + return (preempt_count() & PREEMPT_MASK) > 0; return true; } return ENV_INVALID_VALUE; @@ -73,7 +67,7 @@ static int enable_opid(void) { int retval; - retval = da_monitor_init(); + retval = ha_monitor_init(); if (retval) return retval; @@ -90,7 +84,7 @@ static void disable_opid(void) rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); rv_detach_trace_probe("opid", sched_waking, handle_sched_waking); - da_monitor_destroy(); + ha_monitor_destroy(); } /* diff --git a/kernel/trace/rv/monitors/stall/stall.c b/kernel/trace/rv/monitors/stall/stall.c index 9ccfda6b0e73..3c38fb1a0159 100644 --- a/kernel/trace/rv/monitors/stall/stall.c +++ b/kernel/trace/rv/monitors/stall/stall.c @@ -103,7 +103,7 @@ static int enable_stall(void) { int retval; - retval = da_monitor_init(); + retval = ha_monitor_init(); if (retval) return retval; @@ -120,7 +120,7 @@ static void disable_stall(void) rv_detach_trace_probe("stall", sched_switch, handle_sched_switch); rv_detach_trace_probe("stall", sched_wakeup, handle_sched_wakeup); - da_monitor_destroy(); + ha_monitor_destroy(); } static struct rv_monitor rv_this = { diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c index 02af2297ae5a..f4642f5adda3 100644 --- a/kernel/trace/simple_ring_buffer.c +++ b/kernel/trace/simple_ring_buffer.c @@ -395,7 +395,6 @@ int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); cpu_buffer->meta->meta_page_size = PAGE_SIZE; - cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; /* The reader page is not part of the ring initially */ page = load_page(desc->page_va[0]); @@ -431,12 +430,13 @@ int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, if (ret) { for (i--; i >= 0; i--) - unload_page((void *)desc->page_va[i]); + unload_page(bpages[i].page); unload_page(cpu_buffer->meta); return 
ret; } + cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; /* Close the ring */ bpage->link.next = &cpu_buffer->tail_page->link; cpu_buffer->tail_page->link.prev = &bpage->link; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6eb4d3097a4d..1146b83b711a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2338,15 +2338,6 @@ void trace_last_func_repeats(struct trace_array *tr, __buffer_unlock_commit(buffer, event); } -static void trace_iterator_increment(struct trace_iterator *iter) -{ - struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); - - iter->idx++; - if (buf_iter) - ring_buffer_iter_advance(buf_iter); -} - static struct trace_entry * peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) @@ -2676,11 +2667,17 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, /* Find the next real entry, and increment the iterator to the next entry */ void *trace_find_next_entry_inc(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter; + iter->ent = __find_next_entry(iter, &iter->cpu, &iter->lost_events, &iter->ts); - if (iter->ent) - trace_iterator_increment(iter); + if (iter->ent) { + iter->idx++; + buf_iter = trace_buffer_iter(iter, iter->cpu); + if (buf_iter) + ring_buffer_iter_advance(buf_iter); + } return iter->ent ? 
iter : NULL; } @@ -4474,7 +4471,7 @@ static const char readme_msg[] = "\t snapshot() - snapshot the trace buffer\n\n" #endif #ifdef CONFIG_SYNTH_EVENTS - " events/synthetic_events\t- Create/append/remove/show synthetic events\n" + " synthetic_events\t- Create/append/remove/show synthetic events\n" "\t Write into this file to define/undefine new synthetic events.\n" "\t example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n" #endif @@ -7928,8 +7925,8 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer, if (!topts) return 0; - tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), - GFP_KERNEL); + tr_topts = krealloc_array(tr->topts, tr->nr_topts + 1, sizeof(*tr->topts), + GFP_KERNEL); if (!tr_topts) { kfree(topts); return -ENOMEM; @@ -8383,6 +8380,8 @@ static void setup_trace_scratch(struct trace_array *tr, memset(tscratch, 0, size); } +#define TRACE_TEST_PTRACING_NAME "ptracingtest" + int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; @@ -8394,6 +8393,8 @@ int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int buf->tr = tr; if (tr->range_addr_start && tr->range_addr_size) { + if (tr->name && !strcmp(tr->name, TRACE_TEST_PTRACING_NAME)) + rb_flags |= RB_FL_TESTING; /* Add scratch buffer to handle 128 modules */ buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index d1564db95a8f..d8e97ad798f0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -181,8 +181,7 @@ __init static int init_branch_tracer(void) ret = register_trace_event(&trace_branch_event); if (!ret) { - printk(KERN_WARNING "Warning: could not register " - "branch events\n"); + pr_warn("Warning: could not register branch events\n"); return 1; } return register_tracer(&branch_trace); @@ -374,8 +373,7 @@ __init static int 
init_annotated_branch_stats(void) ret = register_stat_tracer(&annotated_branch_stats); if (ret) { - printk(KERN_WARNING "Warning: could not register " - "annotated branches stats\n"); + pr_warn("Warning: could not register annotated branches stats\n"); return ret; } return 0; @@ -439,8 +437,7 @@ __init static int all_annotated_branch_stats(void) ret = register_stat_tracer(&all_branch_stats); if (ret) { - printk(KERN_WARNING "Warning: could not register " - "all branches stats\n"); + pr_warn("Warning: could not register all branches stats\n"); return ret; } return 0; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a6bb7577e8c5..5b272856e5ab 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -497,7 +497,17 @@ static int perf_ftrace_function_register(struct perf_event *event) static int perf_ftrace_function_unregister(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - int ret = unregister_ftrace_function(ops); + int ret = 0; + + /* + * Perf will call this unconditionally even if the ops is not + * enabled. The unregister_ftrace_function() will warn if called + * when not enabled. Just bypass the unregistering if ops isn't + * enabled here. 
+ */ + if (ops->flags & FTRACE_OPS_FL_ENABLED) + ret = unregister_ftrace_function(ops); + ftrace_free_filter(ops); return ret; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0dbbf6cca9bc..82ce492ab268 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/security.h> +#include <linux/seq_buf.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/stacktrace.h> @@ -682,8 +683,8 @@ struct track_data { struct hist_elt_data { char *comm; u64 *var_ref_vals; - char **field_var_str; int n_field_var_str; + char *field_var_str[] __counted_by(n_field_var_str); }; struct snapshot_context { @@ -1369,10 +1370,8 @@ static const char *hist_field_name(struct hist_field *field, len = snprintf(full_name, sizeof(full_name), fmt, field->system, field->event_name, field->name); - if (len >= sizeof(full_name)) - return NULL; - - field_name = full_name; + if (len < sizeof(full_name)) + field_name = full_name; } else field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) @@ -1630,8 +1629,6 @@ static void hist_elt_data_free(struct hist_elt_data *elt_data) for (i = 0; i < elt_data->n_field_var_str; i++) kfree(elt_data->field_var_str[i]); - kfree(elt_data->field_var_str); - kfree(elt_data->comm); kfree(elt_data); } @@ -1651,10 +1648,19 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) struct hist_field *hist_field; unsigned int i, n_str; - elt_data = kzalloc_obj(*elt_data); + BUILD_BUG_ON(STR_VAR_LEN_MAX & (sizeof(u64) - 1)); + + n_str = hist_data->n_field_var_str + hist_data->n_save_var_str + + hist_data->n_var_str; + if (n_str > SYNTH_FIELDS_MAX) + return -EINVAL; + + elt_data = kzalloc_flex(*elt_data, field_var_str, n_str); if (!elt_data) return -ENOMEM; + elt_data->n_field_var_str = n_str; + for_each_hist_field(i, hist_data) { hist_field = hist_data->fields[i]; @@ -1668,24 
+1674,8 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) } } - n_str = hist_data->n_field_var_str + hist_data->n_save_var_str + - hist_data->n_var_str; - if (n_str > SYNTH_FIELDS_MAX) { - hist_elt_data_free(elt_data); - return -EINVAL; - } - - BUILD_BUG_ON(STR_VAR_LEN_MAX & (sizeof(u64) - 1)); - size = STR_VAR_LEN_MAX; - elt_data->field_var_str = kcalloc(n_str, sizeof(char *), GFP_KERNEL); - if (!elt_data->field_var_str) { - hist_elt_data_free(elt_data); - return -EINVAL; - } - elt_data->n_field_var_str = n_str; - for (i = 0; i < n_str; i++) { elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL); if (!elt_data->field_var_str[i]) { @@ -2969,13 +2959,22 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data, { struct hist_field *event_var; char *synthetic_name; + struct seq_buf s; synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); if (!synthetic_name) return ERR_PTR(-ENOMEM); - strcpy(synthetic_name, "synthetic_"); - strcat(synthetic_name, field_name); + seq_buf_init(&s, synthetic_name, MAX_FILTER_STR_VAL); + seq_buf_printf(&s, "synthetic_%s", field_name); + + /* Terminate synthetic_name with a NUL. 
*/ + seq_buf_str(&s); + + if (seq_buf_has_overflowed(&s)) { + kfree(synthetic_name); + return ERR_PTR(-E2BIG); + } event_var = find_event_var(target_hist_data, system, event_name, synthetic_name); @@ -3021,6 +3020,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, struct hist_field *key_field; struct hist_field *event_var; char *saved_filter; + struct seq_buf s; char *cmd; int ret; @@ -3065,28 +3065,34 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, return ERR_PTR(-ENOMEM); } + seq_buf_init(&s, cmd, MAX_FILTER_STR_VAL); + /* Use the same keys as the compatible histogram */ - strcat(cmd, "keys="); + seq_buf_puts(&s, "keys="); for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; if (!first) - strcat(cmd, ","); - strcat(cmd, key_field->field->name); + seq_buf_putc(&s, ','); + seq_buf_puts(&s, key_field->field->name); first = false; } /* Create the synthetic field variable specification */ - strcat(cmd, ":synthetic_"); - strcat(cmd, field_name); - strcat(cmd, "="); - strcat(cmd, field_name); + seq_buf_printf(&s, ":synthetic_%s=%s", field_name, field_name); /* Use the same filter as the compatible histogram */ saved_filter = find_trigger_filter(hist_data, file); - if (saved_filter) { - strcat(cmd, " if "); - strcat(cmd, saved_filter); + if (saved_filter) + seq_buf_printf(&s, " if %s", saved_filter); + + /* Terminate cmd with a NUL. 
*/ + seq_buf_str(&s); + + if (seq_buf_has_overflowed(&s)) { + kfree(cmd); + kfree(var_hist); + return ERR_PTR(-E2BIG); } var_hist->cmd = kstrdup(cmd, GFP_KERNEL); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 39ac4eba0702..e6871230bde9 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -499,28 +499,19 @@ static unsigned int trace_stack(struct synth_trace_event *entry, return len; } -static void trace_event_raw_event_synth(void *__data, - u64 *var_ref_vals, - unsigned int *var_ref_idx) +static __always_inline int get_field_size(struct synth_event *event, + u64 *var_ref_vals, + unsigned int *var_ref_idx) { - unsigned int i, n_u64, val_idx, len, data_size = 0; - struct trace_event_file *trace_file = __data; - struct synth_trace_event *entry; - struct trace_event_buffer fbuffer; - struct trace_buffer *buffer; - struct synth_event *event; - int fields_size = 0; - - event = trace_file->event_call->data; - - if (trace_trigger_soft_disabled(trace_file)) - return; + int fields_size; fields_size = event->n_u64 * sizeof(u64); - for (i = 0; i < event->n_dynamic_fields; i++) { + for (int i = 0; i < event->n_dynamic_fields; i++) { unsigned int field_pos = event->dynamic_fields[i]->field_pos; char *str_val; + int val_idx; + int len; val_idx = var_ref_idx[field_pos]; str_val = (char *)(long)var_ref_vals[val_idx]; @@ -535,18 +526,18 @@ static void trace_event_raw_event_synth(void *__data, fields_size += len; } + return fields_size; +} - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. 
- */ - buffer = trace_file->tr->array_buffer.buffer; - guard(ring_buffer_nest)(buffer); - - entry = trace_event_buffer_reserve(&fbuffer, trace_file, - sizeof(*entry) + fields_size); - if (!entry) - return; +static __always_inline void write_synth_entry(struct synth_event *event, + struct synth_trace_event *entry, + u64 *var_ref_vals, + unsigned int *var_ref_idx) +{ + int data_size = 0; + int i, n_u64; + int val_idx; + int len; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { val_idx = var_ref_idx[i]; @@ -587,10 +578,83 @@ static void trace_event_raw_event_synth(void *__data, n_u64++; } } +} + +static void trace_event_raw_event_synth(void *__data, + u64 *var_ref_vals, + unsigned int *var_ref_idx) +{ + struct trace_event_file *trace_file = __data; + struct synth_trace_event *entry; + struct trace_event_buffer fbuffer; + struct trace_buffer *buffer; + struct synth_event *event; + int fields_size; + + event = trace_file->event_call->data; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + fields_size = get_field_size(event, var_ref_vals, var_ref_idx); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. 
+ */ + buffer = trace_file->tr->array_buffer.buffer; + guard(ring_buffer_nest)(buffer); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + fields_size); + if (!entry) + return; + + write_synth_entry(event, entry, var_ref_vals, var_ref_idx); trace_event_buffer_commit(&fbuffer); } +#ifdef CONFIG_PERF_EVENTS +static void perf_event_raw_event_synth(void *__data, + u64 *var_ref_vals, + unsigned int *var_ref_idx) +{ + struct trace_event_call *call = __data; + struct synth_trace_event *entry; + struct hlist_head *perf_head; + struct synth_event *event; + struct pt_regs *regs; + int fields_size; + size_t size; + int context; + + event = call->data; + + perf_head = this_cpu_ptr(call->perf_events); + + if (!perf_head || hlist_empty(perf_head)) + return; + + fields_size = get_field_size(event, var_ref_vals, var_ref_idx); + + size = ALIGN(sizeof(*entry) + fields_size, 8); + + entry = perf_trace_buf_alloc(size, &regs, &context); + + if (unlikely(!entry)) + return; + + write_synth_entry(event, entry, var_ref_vals, var_ref_idx); + + perf_fetch_caller_regs(regs); + + perf_trace_buf_submit(entry, size, context, + call->event.type, 1, regs, + perf_head, NULL); +} +#endif + static void free_synth_event_print_fmt(struct trace_event_call *call) { if (call) { @@ -917,6 +981,9 @@ static int register_synth_event(struct synth_event *event) call->flags = TRACE_EVENT_FL_TRACEPOINT; call->class->reg = synth_event_reg; call->class->probe = trace_event_raw_event_synth; +#ifdef CONFIG_PERF_EVENTS + call->class->perf_probe = perf_event_raw_event_synth; +#endif call->data = event; call->tp = event->tp; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 75678053b21c..5e83c4f6f2b4 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -83,6 +83,22 @@ struct osnoise_instance { static struct list_head osnoise_instances; +static void osnoise_print(const char *fmt, ...) 
+{ + struct osnoise_instance *inst; + struct trace_array *tr; + va_list ap; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + va_start(ap, fmt); + trace_array_vprintk(tr, _RET_IP_, fmt, ap); + va_end(ap); + } + rcu_read_unlock(); +} + static bool osnoise_has_registered_instances(void) { return !!list_first_or_null_rcu(&osnoise_instances, @@ -123,6 +139,7 @@ static int osnoise_register_instance(struct trace_array *tr) * trace_types_lock. */ lockdep_assert_held(&trace_types_lock); + trace_array_init_printk(tr); inst = kmalloc_obj(*inst); if (!inst) @@ -471,15 +488,7 @@ static void print_osnoise_headers(struct seq_file *s) * osnoise_taint - report an osnoise error. */ #define osnoise_taint(msg) ({ \ - struct osnoise_instance *inst; \ - struct trace_buffer *buffer; \ - \ - rcu_read_lock(); \ - list_for_each_entry_rcu(inst, &osnoise_instances, list) { \ - buffer = inst->tr->array_buffer.buffer; \ - trace_array_printk_buf(buffer, _THIS_IP_, msg); \ - } \ - rcu_read_unlock(); \ + osnoise_print(msg); \ osnoise_data.tainted = true; \ }) @@ -1189,10 +1198,10 @@ static __always_inline void osnoise_stop_exception(char *msg, int cpu) rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { tr = inst->tr; - trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, - "stop tracing hit on cpu %d due to exception: %s\n", - smp_processor_id(), - msg); + trace_array_printk(tr, _THIS_IP_, + "stop tracing hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) panic("tracer hit on cpu %d due to exception: %s\n", @@ -1362,8 +1371,8 @@ static __always_inline void osnoise_stop_tracing(void) rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { tr = inst->tr; - trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, - "stop tracing hit on cpu %d\n", smp_processor_id()); + trace_array_printk(tr, _THIS_IP_, + "stop tracing hit on cpu 
%d\n", smp_processor_id()); if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) panic("tracer hit stop condition on CPU %d\n", smp_processor_id()); @@ -2544,9 +2553,12 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, notify_new_max_latency(diff); tlat->tracing_thread = false; - if (osnoise_data.stop_tracing_total) - if (time_to_us(diff) >= osnoise_data.stop_tracing_total) + if (osnoise_data.stop_tracing_total) { + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { + timerlat_dump_stack(time_to_us(diff)); osnoise_stop_tracing(); + } + } } else { tlat->tracing_thread = false; tlat->kthread = current; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e0d3a0da26af..fd1caa1f9723 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -332,6 +332,23 @@ static int parse_trace_event_arg(char *arg, struct fetch_insn *code, return -ENOENT; } +static int parse_trace_event(char *arg, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + int ret; + + if (code->data) + return -EFAULT; + ret = parse_trace_event_arg(arg, code, ctx); + if (!ret) + return 0; + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_COMM; + return 0; + } + return -EINVAL; +} + #ifdef CONFIG_PROBE_EVENTS_BTF_ARGS static u32 btf_type_int(const struct btf_type *t) @@ -376,11 +393,16 @@ static bool btf_type_is_char_array(struct btf *btf, const struct btf_type *type) && BTF_INT_BITS(intdata) == 8; } +static struct btf *ctx_btf(struct traceprobe_parse_context *ctx) +{ + return ctx->struct_btf ? 
: ctx->btf; +} + static int check_prepare_btf_string_fetch(char *typename, struct fetch_insn **pcode, struct traceprobe_parse_context *ctx) { - struct btf *btf = ctx->btf; + struct btf *btf = ctx_btf(ctx); if (!btf || !ctx->last_type) return 0; @@ -506,6 +528,15 @@ static int query_btf_context(struct traceprobe_parse_context *ctx) return 0; } +static void clear_struct_btf(struct traceprobe_parse_context *ctx) +{ + if (ctx->struct_btf) { + btf_put(ctx->struct_btf); + ctx->struct_btf = NULL; + ctx->last_struct = NULL; + } +} + static void clear_btf_context(struct traceprobe_parse_context *ctx) { if (ctx->btf) { @@ -554,22 +585,29 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type, struct fetch_insn *code = *pcode; const struct btf_member *field; u32 bitoffs, anon_offs; + bool is_struct = ctx->struct_btf != NULL; + struct btf *btf = ctx_btf(ctx); char *next; int is_ptr; s32 tid; do { - /* Outer loop for solving arrow operator ('->') */ - if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) { - trace_probe_log_err(ctx->offset, NO_PTR_STRCT); - return -EINVAL; - } - /* Convert a struct pointer type to a struct type */ - type = btf_type_skip_modifiers(ctx->btf, type->type, &tid); - if (!type) { - trace_probe_log_err(ctx->offset, BAD_BTF_TID); - return -EINVAL; + if (!is_struct) { + /* Outer loop for solving arrow operator ('->') */ + if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) { + trace_probe_log_err(ctx->offset, NO_PTR_STRCT); + return -EINVAL; + } + + /* Convert a struct pointer type to a struct type */ + type = btf_type_skip_modifiers(btf, type->type, &tid); + if (!type) { + trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return -EINVAL; + } } + /* Only the first type can skip being a pointer */ + is_struct = false; bitoffs = 0; do { @@ -580,7 +618,7 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type, return is_ptr; anon_offs = 0; - field = btf_find_struct_member(ctx->btf, type, fieldname, + field = 
btf_find_struct_member(btf, type, fieldname, &anon_offs); if (IS_ERR(field)) { trace_probe_log_err(ctx->offset, BAD_BTF_TID); @@ -602,7 +640,7 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type, ctx->last_bitsize = 0; } - type = btf_type_skip_modifiers(ctx->btf, field->type, &tid); + type = btf_type_skip_modifiers(btf, field->type, &tid); if (!type) { trace_probe_log_err(ctx->offset, BAD_BTF_TID); return -EINVAL; @@ -640,7 +678,7 @@ static int parse_btf_arg(char *varname, int i, is_ptr, ret; u32 tid; - if (WARN_ON_ONCE(!ctx->funcname)) + if (WARN_ON_ONCE(!ctx->funcname && !(ctx->flags & TPARG_FL_TEVENT))) return -EINVAL; is_ptr = split_next_field(varname, &field, ctx); @@ -653,6 +691,19 @@ static int parse_btf_arg(char *varname, return -EOPNOTSUPP; } + if (ctx->flags & TPARG_FL_TEVENT) { + ret = parse_trace_event(varname, code, ctx); + if (ret < 0) { + trace_probe_log_err(ctx->offset, BAD_ATTACH_ARG); + return ret; + } + /* TEVENT is only here via a typecast */ + if (WARN_ON_ONCE(ctx->struct_btf == NULL)) + return -EINVAL; + type = ctx->last_struct; + goto found_type; + } + if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) { code->op = FETCH_OP_RETVAL; /* Check whether the function return type is not void */ @@ -709,6 +760,7 @@ static int parse_btf_arg(char *varname, found: type = btf_type_skip_modifiers(ctx->btf, tid, &tid); +found_type: if (!type) { trace_probe_log_err(ctx->offset, BAD_BTF_TID); return -EINVAL; @@ -727,7 +779,7 @@ found: static const struct fetch_type *find_fetch_type_from_btf_type( struct traceprobe_parse_context *ctx) { - struct btf *btf = ctx->btf; + struct btf *btf = ctx_btf(ctx); const char *typestr = NULL; if (btf && ctx->last_type) @@ -758,7 +810,67 @@ static int parse_btf_bitfield(struct fetch_insn **pcode, return 0; } -#else +static int query_btf_struct(const char *sname, struct traceprobe_parse_context *ctx) +{ + struct btf *btf = NULL; + int id; + + /* A struct_btf should only be used by a single 
argument */ + if (WARN_ON_ONCE(ctx->struct_btf)) { + btf_put(ctx->struct_btf); + ctx->struct_btf = NULL; + } + + id = bpf_find_btf_id(sname, BTF_KIND_STRUCT, &btf); + if (id < 0) + return id; + ctx->struct_btf = btf; + ctx->last_struct = btf_type_by_id(ctx->struct_btf, id); + return 0; +} + +static int handle_typecast(char *arg, struct fetch_insn **pcode, + struct fetch_insn *end, + struct traceprobe_parse_context *ctx) +{ + char *tmp; + int ret; + + /* Currently this only works for eprobes */ + if (!(ctx->flags & TPARG_FL_TEVENT)) { + trace_probe_log_err(ctx->offset, TYPECAST_NOT_EVENT); + return -EINVAL; + } + + tmp = strchr(arg, ')'); + if (!tmp) { + trace_probe_log_err(ctx->offset + strlen(arg), + DEREF_OPEN_BRACE); + return -EINVAL; + } + *tmp = '\0'; + ret = query_btf_struct(arg + 1, ctx); + *tmp = ')'; + + if (ret < 0) { + trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT); + return -EINVAL; + } + + tmp++; + + ctx->offset += tmp - arg; + ret = parse_btf_arg(tmp, pcode, end, ctx); + return ret; +} + +#else /* !CONFIG_PROBE_EVENTS_BTF_ARGS */ + +static void clear_struct_btf(struct traceprobe_parse_context *ctx) +{ + ctx->struct_btf = NULL; +} + static void clear_btf_context(struct traceprobe_parse_context *ctx) { ctx->btf = NULL; @@ -794,7 +906,15 @@ static int check_prepare_btf_string_fetch(char *typename, return 0; } -#endif +static int handle_typecast(char *arg, struct fetch_insn **pcode, + struct fetch_insn *end, + struct traceprobe_parse_context *ctx) +{ + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; +} + +#endif /* CONFIG_PROBE_EVENTS_BTF_ARGS */ #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API @@ -838,15 +958,10 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum) int i, offset, last_offset = 0; if (!earg) { - earg = kzalloc_obj(*tp->entry_arg); + earg = kzalloc_flex(*earg, code, 2 * tp->nr_args + 1); if (!earg) return -ENOMEM; earg->size = 2 * tp->nr_args + 1; - earg->code = kzalloc_objs(struct fetch_insn, earg->size); 
- if (!earg->code) { - kfree(earg); - return -ENOMEM; - } /* Fill the code buffer with 'end' to simplify it */ for (i = 0; i < earg->size; i++) earg->code[i].op = FETCH_OP_END; @@ -953,18 +1068,9 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t, int len; if (ctx->flags & TPARG_FL_TEVENT) { - if (code->data) - return -EFAULT; - ret = parse_trace_event_arg(arg, code, ctx); - if (!ret) - return 0; - if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { - code->op = FETCH_OP_COMM; - return 0; - } - /* backward compatibility */ - ctx->offset = 0; - goto inval; + if (parse_trace_event(arg, code, ctx) < 0) + goto inval; + return 0; } if (str_has_prefix(arg, "retval")) { @@ -1231,6 +1337,9 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->op = FETCH_OP_IMM; } break; + case '(': + ret = handle_typecast(arg, pcode, end, ctx); + break; default: if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */ if (!tparg_is_function_entry(ctx->flags) && @@ -1563,6 +1672,9 @@ fail: } kfree(tmp); + /* struct_btf should not be passed to other arguments */ + clear_struct_btf(ctx); + return ret; } @@ -2051,7 +2163,6 @@ void trace_probe_cleanup(struct trace_probe *tp) traceprobe_free_probe_arg(&tp->args[i]); if (tp->entry_arg) { - kfree(tp->entry_arg->code); kfree(tp->entry_arg); tp->entry_arg = NULL; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 262d8707a3df..15758cc11fc6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -238,8 +238,8 @@ struct probe_arg { }; struct probe_entry_arg { - struct fetch_insn *code; unsigned int size; /* The entry data size */ + struct fetch_insn code[] __counted_by(size); }; struct trace_uprobe_filter { @@ -422,7 +422,9 @@ struct traceprobe_parse_context { const struct btf_param *params; /* Parameter of the function */ s32 nr_params; /* The number of the parameters */ struct btf *btf; /* The BTF to be used */ + struct btf *struct_btf; /* The BTF to be used 
for structs */ const struct btf_type *last_type; /* Saved type */ + const struct btf_type *last_struct; /* Saved structure */ u32 last_bitoffs; /* Saved bitoffs */ u32 last_bitsize; /* Saved bitsize */ struct trace_probe *tp; @@ -563,7 +565,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ C(TOO_MANY_ARGS, "Too many arguments are specified"), \ C(TOO_MANY_EARGS, "Too many entry arguments specified"), \ - C(EVENT_TOO_BIG, "Event too big (too many fields?)"), + C(EVENT_TOO_BIG, "Event too big (too many fields?)"), \ + C(TYPECAST_NOT_EVENT, "Typecasts are only for eprobe fields"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c index 784fe1fbb866..bac4bc844ccd 100644 --- a/kernel/trace/trace_recursion_record.c +++ b/kernel/trace/trace_recursion_record.c @@ -180,9 +180,8 @@ static const struct seq_operations recursed_function_seq_ops = { static int recursed_function_open(struct inode *inode, struct file *file) { - int ret = 0; + guard(mutex)(&recursed_function_lock); - mutex_lock(&recursed_function_lock); /* If this file was opened for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { /* disable updating records */ @@ -194,10 +193,9 @@ static int recursed_function_open(struct inode *inode, struct file *file) atomic_set(&nr_records, 0); } if (file->f_mode & FMODE_READ) - ret = seq_open(file, &recursed_function_seq_ops); - mutex_unlock(&recursed_function_lock); + return seq_open(file, &recursed_function_seq_ops); - return ret; + return 0; } static ssize_t recursed_function_write(struct file *file, diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c index d6c3f94d67cd..2a6cc000ec98 100644 --- a/kernel/trace/trace_remote.c +++ b/kernel/trace/trace_remote.c @@ -602,7 +602,7 @@ static int trace_pipe_open(struct inode *inode, 
struct file *filp) filp->private_data = iter; - return IS_ERR(iter) ? PTR_ERR(iter) : 0; + return 0; } static int trace_pipe_release(struct inode *inode, struct file *filp) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8ad72e17d8eb..e98ee7e1e66f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1371,33 +1371,33 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, +static int perf_call_bpf_enter(struct trace_event_call *call, struct syscall_metadata *sys_data, - struct syscall_trace_enter *rec) + int syscall_nr, unsigned long *args) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long args[SYSCALL_DEFINE_MAXARGS]; } __aligned(8) param; + struct pt_regs regs = {}; int i; BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); - /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. 
&param) */ - perf_fetch_caller_regs(regs); - *(struct pt_regs **)&param = regs; - param.syscall_nr = rec->nr; + /* bpf prog requires 'regs' to be the first member in the ctx */ + perf_fetch_caller_regs(&regs); + *(struct pt_regs **)&param = &regs; + param.syscall_nr = syscall_nr; for (i = 0; i < sys_data->nb_args; i++) - param.args[i] = rec->args[i]; - return trace_call_bpf(call, &param); + param.args[i] = args[i]; + return trace_call_bpf_faultable(call, &param); } static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; - struct pt_regs *fake_regs; struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; @@ -1410,12 +1410,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size = 0; int uargs = 0; - /* - * Syscall probe called with preemption enabled, but the ring - * buffer and per-cpu data require preemption to be disabled. - */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -1429,6 +1424,26 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, args); + /* + * Run BPF program in faultable context before per-cpu buffer + * allocation, allowing sleepable BPF programs to execute. + */ + valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); + if (valid_prog_array && + !perf_call_bpf_enter(sys_data->enter_event, sys_data, + syscall_nr, args)) + return; + + /* + * Per-cpu ring buffer and perf event list operations require + * preemption to be disabled. 
+ */ + guard(preempt_notrace)(); + + head = this_cpu_ptr(sys_data->enter_event->perf_events); + if (hlist_empty(head)) + return; + /* Check if this syscall event faults in user space memory */ mayfault = sys_data->user_mask != 0; @@ -1438,17 +1453,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; } - head = this_cpu_ptr(sys_data->enter_event->perf_events); - valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); - if (!valid_prog_array && hlist_empty(head)) - return; - /* get the size after alignment with the u32 buffer size field */ size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; @@ -1458,13 +1468,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (mayfault) syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs); - if ((valid_prog_array && - !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || - hlist_empty(head)) { - perf_swevent_put_recursion_context(rctx); - return; - } - perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, head, NULL); @@ -1514,40 +1517,35 @@ static void perf_sysenter_disable(struct trace_event_call *call) syscall_fault_buffer_disable(); } -static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, - struct syscall_trace_exit *rec) +static int perf_call_bpf_exit(struct trace_event_call *call, + int syscall_nr, long ret_val) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long ret; } __aligned(8) param; - - /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. 
&param) */ - perf_fetch_caller_regs(regs); - *(struct pt_regs **)&param = regs; - param.syscall_nr = rec->nr; - param.ret = rec->ret; - return trace_call_bpf(call, &param); + struct pt_regs regs = {}; + + /* bpf prog requires 'regs' to be the first member in the ctx */ + perf_fetch_caller_regs(&regs); + *(struct pt_regs **)&param = &regs; + param.syscall_nr = syscall_nr; + param.ret = ret_val; + return trace_call_bpf_faultable(call, &param); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; - struct pt_regs *fake_regs; struct hlist_head *head; bool valid_prog_array; int syscall_nr; int rctx; int size; - /* - * Syscall probe called with preemption enabled, but the ring - * buffer and per-cpu data require preemption to be disabled. - */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -1559,29 +1557,37 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; - head = this_cpu_ptr(sys_data->exit_event->perf_events); + /* + * Run BPF program in faultable context before per-cpu buffer + * allocation, allowing sleepable BPF programs to execute. + */ valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); - if (!valid_prog_array && hlist_empty(head)) + if (valid_prog_array && + !perf_call_bpf_exit(sys_data->exit_event, syscall_nr, + syscall_get_return_value(current, regs))) + return; + + /* + * Per-cpu ring buffer and perf event list operations require + * preemption to be disabled. 
+ */ + guard(preempt_notrace)(); + + head = this_cpu_ptr(sys_data->exit_event->perf_events); + if (hlist_empty(head)) return; /* We can probably do that at build time */ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - if ((valid_prog_array && - !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) || - hlist_empty(head)) { - perf_swevent_put_recursion_context(rctx); - return; - } - perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, 1, regs, head, NULL); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2cabf8a23ec5..c274346853d1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -912,7 +912,7 @@ static int uprobe_buffer_enable(void) { int ret = 0; - BUG_ON(!mutex_is_locked(&event_mutex)); + lockdep_assert_held(&event_mutex); if (uprobe_buffer_refcnt++ == 0) { ret = uprobe_buffer_init(); @@ -927,7 +927,7 @@ static void uprobe_buffer_disable(void) { int cpu; - BUG_ON(!mutex_is_locked(&event_mutex)); + lockdep_assert_held(&event_mutex); if (--uprobe_buffer_refcnt == 0) { for_each_possible_cpu(cpu) @@ -979,6 +979,7 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, ucb = uprobe_buffer_get(); ucb->dsize = tu->tp.size + dsize; + BUILD_BUG_ON(MAX_UCB_BUFFER_SIZE < MAX_PROBE_EVENT_SIZE); if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) { ucb->dsize = MAX_UCB_BUFFER_SIZE; dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size; diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index bf1a507695b6..d7922f40dbe2 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -288,9 +288,6 @@ static void tracing_map_array_clear(struct tracing_map_array *a) { unsigned int i; - if (!a->pages) - return; - for (i = 0; i < 
a->n_pages; i++) memset(a->pages[i], 0, PAGE_SIZE); } @@ -302,9 +299,6 @@ static void tracing_map_array_free(struct tracing_map_array *a) if (!a) return; - if (!a->pages) - goto free; - for (i = 0; i < a->n_pages; i++) { if (!a->pages[i]) break; @@ -312,9 +306,6 @@ static void tracing_map_array_free(struct tracing_map_array *a) free_page((unsigned long)a->pages[i]); } - kfree(a->pages); - - free: kfree(a); } @@ -322,24 +313,25 @@ static struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, unsigned int entry_size) { struct tracing_map_array *a; + unsigned int entry_size_shift; + unsigned int entries_per_page; + unsigned int n_pages; unsigned int i; - a = kzalloc_obj(*a); + entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1); + entries_per_page = PAGE_SIZE / (1 << entry_size_shift); + n_pages = max(1, n_elts / entries_per_page); + + a = kzalloc_flex(*a, pages, n_pages); if (!a) return NULL; - a->entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1); - a->entries_per_page = PAGE_SIZE / (1 << a->entry_size_shift); - a->n_pages = n_elts / a->entries_per_page; - if (!a->n_pages) - a->n_pages = 1; + a->entry_size_shift = entry_size_shift; + a->entries_per_page = entries_per_page; + a->n_pages = n_pages; a->entry_shift = fls(a->entries_per_page) - 1; a->entry_mask = (1 << a->entry_shift) - 1; - a->pages = kcalloc(a->n_pages, sizeof(void *), GFP_KERNEL); - if (!a->pages) - goto free; - for (i = 0; i < a->n_pages; i++) { a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL); if (!a->pages[i]) @@ -386,13 +378,11 @@ static void tracing_map_elt_init_fields(struct tracing_map_elt *elt) } } -static void tracing_map_elt_free(struct tracing_map_elt *elt) +static void __tracing_map_elt_free(struct tracing_map_elt *elt) { if (!elt) return; - if (elt->map->ops && elt->map->ops->elt_free) - elt->map->ops->elt_free(elt); kfree(elt->fields); kfree(elt->vars); kfree(elt->var_set); @@ -400,6 +390,17 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt) 
kfree(elt); } +static void tracing_map_elt_free(struct tracing_map_elt *elt) +{ + if (!elt) + return; + + /* Only objects initialized with alloc_elt() should be passed to free_elt().*/ + if (elt->map->ops && elt->map->ops->elt_free) + elt->map->ops->elt_free(elt); + __tracing_map_elt_free(elt); +} + static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) { struct tracing_map_elt *elt; @@ -444,7 +445,7 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) } return elt; free: - tracing_map_elt_free(elt); + __tracing_map_elt_free(elt); return ERR_PTR(err); } diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 99c37eeebc16..18a02959d77b 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -167,7 +167,7 @@ struct tracing_map_array { unsigned int entry_shift; unsigned int entry_mask; unsigned int n_pages; - void **pages; + void *pages[] __counted_by(n_pages); }; #define TRACING_MAP_ARRAY_ELT(array, idx) \ |
