summaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig34
-rw-r--r--kernel/trace/Makefile12
-rw-r--r--kernel/trace/bpf_trace.c337
-rw-r--r--kernel/trace/fprobe.c23
-rw-r--r--kernel/trace/ftrace.c35
-rw-r--r--kernel/trace/remote_test.c4
-rw-r--r--kernel/trace/ring_buffer.c594
-rw-r--r--kernel/trace/rv/monitors/deadline/deadline.h3
-rw-r--r--kernel/trace/rv/monitors/nomiss/nomiss.c4
-rw-r--r--kernel/trace/rv/monitors/opid/opid.c12
-rw-r--r--kernel/trace/rv/monitors/stall/stall.c4
-rw-r--r--kernel/trace/simple_ring_buffer.c4
-rw-r--r--kernel/trace/trace.c29
-rw-r--r--kernel/trace/trace_branch.c9
-rw-r--r--kernel/trace/trace_event_perf.c12
-rw-r--r--kernel/trace/trace_events_hist.c78
-rw-r--r--kernel/trace/trace_events_synth.c121
-rw-r--r--kernel/trace/trace_osnoise.c46
-rw-r--r--kernel/trace/trace_probe.c183
-rw-r--r--kernel/trace/trace_probe.h7
-rw-r--r--kernel/trace/trace_recursion_record.c8
-rw-r--r--kernel/trace/trace_remote.c2
-rw-r--r--kernel/trace/trace_syscalls.c110
-rw-r--r--kernel/trace/trace_uprobe.c5
-rw-r--r--kernel/trace/tracing_map.c47
-rw-r--r--kernel/trace/tracing_map.h2
26 files changed, 1250 insertions, 475 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e130da35808f..084f34dc6c9f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1202,6 +1202,40 @@ config RING_BUFFER_VALIDATE_TIME_DELTAS
Only say Y if you understand what this does, and you
still want it enabled. Otherwise say N
+config RING_BUFFER_PERSISTENT_INJECT
+ bool "Enable persistent ring buffer error injection test"
+ depends on RING_BUFFER
+ help
+ This option will have the kernel check if the persistent ring
+ buffer is named "ptracingtest", and if so, it will corrupt some
+ of its pages on a kernel panic. This is used to test if the
+ persistent ring buffer can recover from some of its sub-buffers
+ being corrupted.
+ To use this, boot a kernel with a "ptracingtest" persistent
+ ring buffer, e.g.
+
+ reserve_mem=20M:2M:trace trace_instance=ptracingtest@trace panic=1
+
+ And after the 1st boot, run the following commands:
+
+ cd /sys/kernel/tracing/instances/ptracingtest
+ echo 1 > events/enable
+ echo 1 > tracing_on
+ sleep 3
+ echo c > /proc/sysrq-trigger
+
+ After the panic message, the kernel will reboot and will show
+ the test results in the console output.
+
+ Note that events for the test ring buffer need to be enabled
+ prior to crashing the kernel so that the ring buffer has content
+ that the test will corrupt.
+ As the test will corrupt events in the "ptracingtest" persistent
+ ring buffer, it should not be used for any purpose other
+ than this test.
+
+ If unsure, say N
+
config MMIOTRACE_TEST
tristate "Test module for mmiotrace"
depends on MMIOTRACE && m
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1decdce8cbef..f934ff586bd4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -48,9 +48,10 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE
GCOV_PROFILE := y
endif
-# Functions in this file could be invoked from early interrupt
-# code and produce random code coverage.
+# Functions in these files can run from IRQ entry before hardirq context
+# is visible to KCOV, and produce coverage unrelated to syscall inputs.
KCOV_INSTRUMENT_trace_preemptirq.o := n
+KCOV_INSTRUMENT_trace_irqsoff.o := n
CFLAGS_bpf_trace.o := -I$(src)
@@ -143,8 +144,8 @@ obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
targets += undefsyms_base.o
KASAN_SANITIZE_undefsyms_base.o := y
-UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \
- __msan simple_ring_buffer \
+UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \
+ __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \
$(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
quiet_cmd_check_undefined = NM $<
@@ -154,7 +155,8 @@ quiet_cmd_check_undefined = NM $<
echo "Unexpected symbols in $<:" >&2; \
echo "$$undefsyms" >&2; \
false; \
- fi
+ fi; \
+ touch $@
$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE
$(call if_changed,check_undefined)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index af7079aa0f36..82f8feea6931 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -23,6 +23,7 @@
#include <linux/sort.h>
#include <linux/key.h>
#include <linux/namei.h>
+#include <linux/file.h>
#include <net/bpf_sk_storage.h>
@@ -42,6 +43,7 @@
#define MAX_UPROBE_MULTI_CNT (1U << 20)
#define MAX_KPROBE_MULTI_CNT (1U << 20)
+#define MAX_TRACING_MULTI_CNT (1U << 20)
#ifdef CONFIG_MODULES
struct bpf_trace_module {
@@ -152,6 +154,34 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
return ret;
}
+/**
+ * trace_call_bpf_faultable - invoke BPF program in faultable context
+ * @call: tracepoint event
+ * @ctx: opaque context pointer
+ *
+ * Variant of trace_call_bpf() for faultable tracepoints (syscall
+ * tracepoints). Supports sleepable BPF programs by using rcu_tasks_trace
+ * for lifetime protection and bpf_prog_run_array_sleepable() for per-program
+ * RCU flavor selection, following the uprobe pattern.
+ *
+ * Per-program recursion protection is provided by
+ * bpf_prog_run_array_sleepable(). Global bpf_prog_active is not
+ * needed because syscall tracepoints cannot self-recurse.
+ *
+ * Must be called from a faultable/preemptible context.
+ */
+unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
+{
+ struct bpf_prog_array *prog_array;
+
+ might_fault();
+ guard(rcu_tasks_trace)();
+
+ prog_array = rcu_dereference_check(call->prog_array,
+ rcu_read_lock_trace_held());
+ return bpf_prog_run_array_sleepable(prog_array, ctx, bpf_prog_run);
+}
+
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
@@ -1305,7 +1335,8 @@ static inline bool is_uprobe_session(const struct bpf_prog *prog)
static inline bool is_trace_fsession(const struct bpf_prog *prog)
{
return prog->type == BPF_PROG_TYPE_TRACING &&
- prog->expected_attach_type == BPF_TRACE_FSESSION;
+ (prog->expected_attach_type == BPF_TRACE_FSESSION ||
+ prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI);
}
static const struct bpf_func_proto *
@@ -2072,11 +2103,19 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
static __always_inline
void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
{
+ struct srcu_ctr __percpu *scp = NULL;
struct bpf_prog *prog = link->link.prog;
+ bool sleepable = prog->sleepable;
struct bpf_run_ctx *old_run_ctx;
struct bpf_trace_run_ctx run_ctx;
- rcu_read_lock_dont_migrate();
+ if (sleepable) {
+ scp = rcu_read_lock_tasks_trace();
+ migrate_disable();
+ } else {
+ rcu_read_lock_dont_migrate();
+ }
+
if (unlikely(!bpf_prog_get_recursion_context(prog))) {
bpf_prog_inc_misses_counter(prog);
goto out;
@@ -2085,12 +2124,18 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
run_ctx.bpf_cookie = link->cookie;
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
- (void) bpf_prog_run(prog, args);
+ (void)bpf_prog_run(prog, args);
bpf_reset_run_ctx(old_run_ctx);
out:
bpf_prog_put_recursion_context(prog);
- rcu_read_unlock_migrate();
+
+ if (sleepable) {
+ migrate_enable();
+ rcu_read_unlock_tasks_trace(scp);
+ } else {
+ rcu_read_unlock_migrate();
+ }
}
#define UNPACK(...) __VA_ARGS__
@@ -2384,7 +2429,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link)
struct bpf_kprobe_multi_link *kmulti_link;
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
- unregister_fprobe(&kmulti_link->fp);
+ /* Don't wait for RCU GP here. */
+ unregister_fprobe_async(&kmulti_link->fp);
kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt);
}
@@ -3169,6 +3215,38 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
return run_ctx->uprobe->cookie;
}
+static int bpf_uprobe_multi_get_path(const union bpf_attr *attr, struct path *path)
+{
+ void __user *upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
+ u32 path_fd = attr->link_create.uprobe_multi.path_fd;
+ u32 flags = attr->link_create.uprobe_multi.flags;
+
+ if (flags & BPF_F_UPROBE_MULTI_PATH_FD) {
+ /*
+ * When BPF_F_UPROBE_MULTI_PATH_FD is set, the executable is
+ * identified by path_fd and upath must be NULL.
+ */
+ if (upath)
+ return -EINVAL;
+
+ CLASS(fd, f)(path_fd);
+ if (fd_empty(f))
+ return -EBADF;
+ *path = fd_file(f)->f_path;
+ path_get(path);
+ return 0;
+ }
+
+ /*
+ * When BPF_F_UPROBE_MULTI_PATH_FD is not set, the path is resolved
+ * relative to the cwd (AT_FDCWD) or absolute using the upath string.
+ */
+ if (!upath || path_fd)
+ return -EINVAL;
+
+ return user_path_at(AT_FDCWD, upath, LOOKUP_FOLLOW, path);
+}
+
int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_uprobe_multi_link *link = NULL;
@@ -3178,10 +3256,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
struct task_struct *task = NULL;
unsigned long __user *uoffsets;
u64 __user *ucookies;
- void __user *upath;
+ unsigned long size;
u32 flags, cnt, i;
struct path path;
- char *name;
pid_t pid;
int err;
@@ -3196,19 +3273,18 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
return -EINVAL;
flags = attr->link_create.uprobe_multi.flags;
- if (flags & ~BPF_F_UPROBE_MULTI_RETURN)
+ if (flags & ~(BPF_F_UPROBE_MULTI_RETURN | BPF_F_UPROBE_MULTI_PATH_FD))
return -EINVAL;
/*
- * path, offsets and cnt are mandatory,
+ * offsets and cnt are mandatory,
* ref_ctr_offsets and cookies are optional
*/
- upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets);
cnt = attr->link_create.uprobe_multi.cnt;
pid = attr->link_create.uprobe_multi.pid;
- if (!upath || !uoffsets || !cnt || pid < 0)
+ if (!uoffsets || !cnt || pid < 0)
return -EINVAL;
if (cnt > MAX_UPROBE_MULTI_CNT)
return -E2BIG;
@@ -3216,14 +3292,17 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets);
ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies);
- name = strndup_user(upath, PATH_MAX);
- if (IS_ERR(name)) {
- err = PTR_ERR(name);
- return err;
- }
+ /*
+ * All uoffsets/uref_ctr_offsets/ucookies arrays have the same value
+ * size, so we need to check that their address ranges are safe for
+ * __get_user calls.
+ */
+ size = sizeof(*uoffsets) * cnt;
+ if (!access_ok(uoffsets, size) || !access_ok(uref_ctr_offsets, size) ||
+ !access_ok(ucookies, size))
+ return -EFAULT;
- err = kern_path(name, LOOKUP_FOLLOW, &path);
- kfree(name);
+ err = bpf_uprobe_multi_get_path(attr, &path);
if (err)
return err;
@@ -3397,12 +3476,12 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc
* direct calls into all the specific callback implementations
* (copy_user_data_sleepable, copy_user_data_nofault, and so on)
*/
-static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size,
+static __always_inline int __bpf_dynptr_copy_str(const struct bpf_dynptr *dptr, u64 doff, u64 size,
const void *unsafe_src,
copy_fn_t str_copy_fn,
struct task_struct *tsk)
{
- struct bpf_dynptr_kern *dst;
+ const struct bpf_dynptr_kern *dst;
u64 chunk_sz, off;
void *dst_slice;
int cnt, err;
@@ -3438,7 +3517,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64
u64 size, const void *unsafe_src,
copy_fn_t copy_fn, struct task_struct *tsk)
{
- struct bpf_dynptr_kern *dst;
+ const struct bpf_dynptr_kern *dst;
void *dst_slice;
char buf[256];
u64 off, chunk_sz;
@@ -3539,49 +3618,49 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
return bpf_send_signal_common(sig, type, task, value);
}
-__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_user_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_data_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_kernel_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign,
copy_kernel_data_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_str_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign,
copy_kernel_str_nofault, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_data_sleepable, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_str_sleepable, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_task_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
@@ -3589,7 +3668,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
copy_user_data_sleepable, tsk);
}
-__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
@@ -3598,3 +3677,203 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64
}
__bpf_kfunc_end_defs();
+
+#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \
+ defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS)
+
+static void bpf_tracing_multi_link_release(struct bpf_link *link)
+{
+ struct bpf_tracing_multi_link *tr_link =
+ container_of(link, struct bpf_tracing_multi_link, link);
+
+ WARN_ON_ONCE(bpf_trampoline_multi_detach(link->prog, tr_link));
+}
+
+static void bpf_tracing_multi_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_tracing_multi_link *tr_link =
+ container_of(link, struct bpf_tracing_multi_link, link);
+
+ kvfree(tr_link->fexits);
+ kvfree(tr_link->cookies);
+ kvfree(tr_link);
+}
+
+#ifdef CONFIG_PROC_FS
+static void bpf_tracing_multi_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_tracing_multi_link *tr_link =
+ container_of(link, struct bpf_tracing_multi_link, link);
+ bool has_cookies = !!tr_link->cookies;
+
+ seq_printf(seq, "attach_type:\t%u\n", tr_link->link.attach_type);
+ seq_printf(seq, "cnt:\t%u\n", tr_link->nodes_cnt);
+
+ seq_printf(seq, "%s\t %s\t %s\t %s\n", "obj-id", "btf-id", "cookie", "func");
+ for (int i = 0; i < tr_link->nodes_cnt; i++) {
+ struct bpf_tracing_multi_node *mnode = &tr_link->nodes[i];
+ u32 btf_id, obj_id;
+
+ bpf_trampoline_unpack_key(mnode->trampoline->key, &obj_id, &btf_id);
+ seq_printf(seq, "%u\t %u\t %llu\t %pS\n",
+ obj_id, btf_id,
+ has_cookies ? tr_link->cookies[i] : 0,
+ (void *) mnode->trampoline->ip);
+
+ cond_resched();
+ }
+}
+#endif
+
+static const struct bpf_link_ops bpf_tracing_multi_link_lops = {
+ .release = bpf_tracing_multi_link_release,
+ .dealloc_deferred = bpf_tracing_multi_link_dealloc,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_tracing_multi_show_fdinfo,
+#endif
+};
+
+static int ids_cmp_r(const void *pa, const void *pb, const void *priv __maybe_unused)
+{
+ u32 a = *(u32 *) pa;
+ u32 b = *(u32 *) pb;
+
+ return (a > b) - (a < b);
+}
+
+static void ids_swap_r(void *a, void *b, int size __maybe_unused,
+ const void *priv __maybe_unused)
+{
+ u64 *cookie_a, *cookie_b, *cookies;
+ u32 *id_a = a, *id_b = b, *ids;
+ void **data = (void **) priv;
+
+ ids = data[0];
+ cookies = data[1];
+
+ if (cookies) {
+ cookie_a = cookies + (id_a - ids);
+ cookie_b = cookies + (id_b - ids);
+ swap(*cookie_a, *cookie_b);
+ }
+ swap(*id_a, *id_b);
+}
+
+static int check_dup_ids(u32 *ids, u64 *cookies, u32 cnt)
+{
+ void *data[2] = { ids, cookies };
+ int err = 0;
+
+ /*
+ * Sort ids array (together with cookies array if defined)
+ * and check it for duplicates. The ids and cookies arrays
+ * are left sorted.
+ */
+ sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, data);
+
+ for (int i = 1; i < cnt; i++) {
+ if (ids[i] == ids[i - 1]) {
+ err = -EINVAL;
+ break;
+ }
+ }
+ return err;
+}
+
+int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr)
+{
+ struct bpf_tracing_multi_link *link = NULL;
+ struct bpf_tramp_node *fexits = NULL;
+ struct bpf_link_primer link_primer;
+ u32 cnt, *ids = NULL;
+ u64 __user *ucookies;
+ u64 *cookies = NULL;
+ u32 __user *uids;
+ int err;
+
+ uids = u64_to_user_ptr(attr->link_create.tracing_multi.ids);
+ cnt = attr->link_create.tracing_multi.cnt;
+
+ if (!cnt || !uids)
+ return -EINVAL;
+ if (cnt > MAX_TRACING_MULTI_CNT)
+ return -E2BIG;
+ if (attr->link_create.flags || attr->link_create.target_fd)
+ return -EINVAL;
+
+ ids = kvmalloc_objs(*ids, cnt);
+ if (!ids)
+ return -ENOMEM;
+
+ if (copy_from_user(ids, uids, cnt * sizeof(*ids))) {
+ err = -EFAULT;
+ goto error;
+ }
+
+ ucookies = u64_to_user_ptr(attr->link_create.tracing_multi.cookies);
+ if (ucookies) {
+ cookies = kvmalloc_objs(*cookies, cnt);
+ if (!cookies) {
+ err = -ENOMEM;
+ goto error;
+ }
+ if (copy_from_user(cookies, ucookies, cnt * sizeof(*cookies))) {
+ err = -EFAULT;
+ goto error;
+ }
+ }
+
+ err = check_dup_ids(ids, cookies, cnt);
+ if (err)
+ goto error;
+
+ if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) {
+ fexits = kvmalloc_objs(*fexits, cnt);
+ if (!fexits) {
+ err = -ENOMEM;
+ goto error;
+ }
+ }
+
+ link = kvzalloc_flex(*link, nodes, cnt);
+ if (!link) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING_MULTI,
+ &bpf_tracing_multi_link_lops, prog, prog->expected_attach_type);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto error;
+
+ link->nodes_cnt = cnt;
+ link->cookies = cookies;
+ link->fexits = fexits;
+
+ err = bpf_trampoline_multi_attach(prog, ids, link);
+ kvfree(ids);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ return bpf_link_settle(&link_primer);
+
+error:
+ kvfree(fexits);
+ kvfree(cookies);
+ kvfree(ids);
+ kvfree(link);
+ return err;
+}
+
+#else
+
+int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index cc49ebd2a773..f378613ad120 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -1093,14 +1093,15 @@ static int unregister_fprobe_nolock(struct fprobe *fp)
}
/**
- * unregister_fprobe() - Unregister fprobe.
+ * unregister_fprobe_async() - Unregister fprobe without RCU GP wait
* @fp: A fprobe data structure to be unregistered.
*
* Unregister fprobe (and remove ftrace hooks from the function entries).
+ * This function will NOT wait until the fprobe is no longer used.
*
* Return 0 if @fp is unregistered successfully, -errno if not.
*/
-int unregister_fprobe(struct fprobe *fp)
+int unregister_fprobe_async(struct fprobe *fp)
{
guard(mutex)(&fprobe_mutex);
if (!fp || !fprobe_registered(fp))
@@ -1108,6 +1109,24 @@ int unregister_fprobe(struct fprobe *fp)
return unregister_fprobe_nolock(fp);
}
+
+/**
+ * unregister_fprobe() - Unregister fprobe with RCU GP wait
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ * This function will block until the fprobe is no longer used.
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+ int ret = unregister_fprobe_async(fp);
+
+ if (!ret)
+ synchronize_rcu();
+ return ret;
+}
EXPORT_SYMBOL_GPL(unregister_fprobe);
static int __init fprobe_initcall(void)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b2611de3f594..f93e34dd2328 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1198,8 +1198,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
return __ftrace_lookup_ip(hash, ip);
}
-static void __add_hash_entry(struct ftrace_hash *hash,
- struct ftrace_func_entry *entry)
+void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry)
{
struct hlist_head *hhd;
unsigned long key;
@@ -1221,7 +1220,7 @@ add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigne
entry->ip = ip;
entry->direct = direct;
- __add_hash_entry(hash, entry);
+ add_ftrace_hash_entry(hash, entry);
return entry;
}
@@ -1249,6 +1248,25 @@ remove_hash_entry(struct ftrace_hash *hash,
hash->count--;
}
+void ftrace_hash_remove(struct ftrace_hash *hash)
+{
+ struct ftrace_func_entry *entry;
+ struct hlist_head *hhd;
+ struct hlist_node *tn;
+ int size;
+ int i;
+
+ if (!hash || !hash->count)
+ return;
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hhd = &hash->buckets[i];
+ hlist_for_each_entry_safe(entry, tn, hhd, hlist)
+ remove_hash_entry(hash, entry);
+ }
+ FTRACE_WARN_ON(hash->count);
+}
+
static void ftrace_hash_clear(struct ftrace_hash *hash)
{
struct hlist_head *hhd;
@@ -1458,7 +1476,7 @@ static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size)
hhd = &src->buckets[i];
hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
remove_hash_entry(src, entry);
- __add_hash_entry(new_hash, entry);
+ add_ftrace_hash_entry(new_hash, entry);
}
}
return new_hash;
@@ -5341,7 +5359,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
map->entry.ip = ip;
map->data = data;
- __add_hash_entry(&mapper->hash, &map->entry);
+ add_ftrace_hash_entry(&mapper->hash, &map->entry);
return 0;
}
@@ -6288,11 +6306,16 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
-static unsigned long hash_count(struct ftrace_hash *hash)
+static inline unsigned long hash_count(struct ftrace_hash *hash)
{
return hash ? hash->count : 0;
}
+unsigned long ftrace_hash_count(struct ftrace_hash *hash)
+{
+ return hash_count(hash);
+}
+
/**
* hash_add - adds two struct ftrace_hash and returns the result
* @a: struct ftrace_hash object
diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c
index 6c1b7701ddae..a3e2c9b606eb 100644
--- a/kernel/trace/remote_test.c
+++ b/kernel/trace/remote_test.c
@@ -110,9 +110,9 @@ static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unus
return remote_test_buffer_desc;
err_unload:
- for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc)
+ for_each_ring_buffer_desc(rb_desc, cpu, desc)
remote_test_unload_simple_rb(rb_desc->cpu);
- trace_remote_free_buffer(remote_test_buffer_desc);
+ trace_remote_free_buffer(desc);
err_free_desc:
kfree(desc);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5326924615a4..56a328e94395 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7,6 +7,7 @@
#include <linux/ring_buffer_types.h>
#include <linux/sched/isolation.h>
#include <linux/trace_recursion.h>
+#include <linux/panic_notifier.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
@@ -31,6 +32,7 @@
#include <linux/oom.h>
#include <linux/mm.h>
+#include <asm/ring_buffer.h>
#include <asm/local64.h>
#include <asm/local.h>
#include <asm/setup.h>
@@ -62,6 +64,10 @@ struct ring_buffer_cpu_meta {
unsigned long commit_buffer;
__u32 subbuf_size;
__u32 nr_subbufs;
+#ifdef CONFIG_RING_BUFFER_PERSISTENT_INJECT
+ __u32 nr_invalid;
+ __u32 entry_bytes;
+#endif
int buffers[];
};
@@ -358,14 +364,30 @@ struct buffer_page {
#define RB_WRITE_MASK 0xfffff
#define RB_WRITE_INTCNT (1 << 20)
-static void rb_init_page(struct buffer_data_page *bpage)
+static void rb_init_data_page(struct buffer_data_page *bpage)
{
local_set(&bpage->commit, 0);
+ bpage->time_stamp = 0;
+}
+
+static __always_inline long rb_data_page_commit(struct buffer_data_page *dpage)
+{
+ return local_read(&dpage->commit);
+}
+
+static __always_inline long rb_data_page_size(struct buffer_data_page *dpage)
+{
+ return rb_data_page_commit(dpage) & ~RB_MISSED_MASK;
}
static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
{
- return local_read(&bpage->page->commit);
+ return rb_data_page_commit(bpage->page);
+}
+
+static __always_inline unsigned int rb_page_size(struct buffer_page *bpage)
+{
+ return rb_data_page_size(bpage->page);
}
static void free_buffer_page(struct buffer_page *bpage)
@@ -406,7 +428,7 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order)
return NULL;
dpage = page_address(page);
- rb_init_page(dpage);
+ rb_init_data_page(dpage);
return dpage;
}
@@ -559,6 +581,7 @@ struct trace_buffer {
unsigned long range_addr_start;
unsigned long range_addr_end;
+ struct notifier_block flush_nb;
struct ring_buffer_meta *meta;
@@ -645,7 +668,7 @@ static void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
do {
if (page == tail_page || WARN_ON_ONCE(stop++ > 100))
done = true;
- commit = local_read(&page->page->commit);
+ commit = rb_page_commit(page);
write = local_read(&page->write);
if (addr >= (unsigned long)&page->page->data[commit] &&
addr < (unsigned long)&page->page->data[write])
@@ -1759,7 +1782,6 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
unsigned long *subbuf_mask)
{
int subbuf_size = PAGE_SIZE;
- struct buffer_data_page *subbuf;
unsigned long buffers_start;
unsigned long buffers_end;
int i;
@@ -1767,6 +1789,11 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
if (!subbuf_mask)
return false;
+ if (meta->subbuf_size != PAGE_SIZE) {
+ pr_info("Ring buffer boot meta [%d] invalid subbuf_size\n", cpu);
+ return false;
+ }
+
buffers_start = meta->first_buffer;
buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
@@ -1783,11 +1810,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
return false;
}
- subbuf = rb_subbufs_from_meta(meta);
-
bitmap_clear(subbuf_mask, 0, meta->nr_subbufs);
- /* Is the meta buffers and the subbufs themselves have correct data? */
+ /*
+ * Ensure the meta::buffers array has correct data. The data in each subbuf
+ * is checked later in rb_meta_validate_events().
+ */
for (i = 0; i < meta->nr_subbufs; i++) {
if (meta->buffers[i] < 0 ||
meta->buffers[i] >= meta->nr_subbufs) {
@@ -1795,18 +1823,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
return false;
}
- if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
- pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
- return false;
- }
-
if (test_bit(meta->buffers[i], subbuf_mask)) {
pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu);
return false;
}
set_bit(meta->buffers[i], subbuf_mask);
- subbuf = (void *)subbuf + subbuf_size;
}
return true;
@@ -1870,14 +1892,138 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
return events;
}
-static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+struct rb_validation_state {
+ unsigned long entries;
+ unsigned long entry_bytes;
+ int discarded;
+ u64 ts;
+};
+
+static int __rb_validate_buffer(struct buffer_page *bpage, int cpu,
+ struct ring_buffer_cpu_meta *meta,
+ u64 prev_ts, u64 next_ts)
{
+ struct buffer_data_page *dpage = bpage->page;
unsigned long long ts;
+ unsigned long tail;
u64 delta;
- int tail;
+ int ret;
+
+ /*
+ * When a sub-buffer is recovered from a read, the commit value may
+ * have RB_MISSED_* bits set, as these bits are reset on reuse.
+ * Even after clearing these bits, a commit value greater than the
+ * subbuf_size is considered invalid.
+ */
+ tail = rb_data_page_commit(dpage);
+ if (tail <= meta->subbuf_size - BUF_PAGE_HDR_SIZE)
+ ret = rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+ else
+ ret = -1;
+
+ /*
+ * The timestamp must not be smaller than @prev_ts nor greater than @next_ts.
+ * Since this function works in both forward (verify) and reverse (unwind)
+ * loops, we don't know both @prev_ts and @next_ts at the same time.
+ * So only the boundary that is known (non-zero) is checked.
+ */
+ if (ret < 0 || (prev_ts && prev_ts > ts) || (next_ts && ts > next_ts)) {
+ local_set(&bpage->entries, 0);
+ /*
+ * Note, the RB_MISSED_EVENTS is only set inside the main write
+ * buffer by this verification logic. The normal ring buffer
+ * has this bit set when the page is read and passed to the
+ * consumers.
+ */
+ local_set(&dpage->commit, RB_MISSED_EVENTS);
+ dpage->time_stamp = prev_ts ? prev_ts : next_ts;
+ ret = -1;
+ } else {
+ local_set(&bpage->entries, ret);
+ }
+
+ return ret;
+}
+
+/**
+ * rb_validate_buffer - validates a single buffer page and updates the state.
+ * @bpage: buffer page to validate
+ * @cpu_buffer: cpu_buffer this page belongs to
+ * @meta: meta of the cpu_buffer
+ * @state: validation state
+ * @prev_ts: previous buffer's timestamp (optional)
+ * @next_ts: next buffer's timestamp (optional)
+ *
+ * If the page is invalid (wrong event length or timestamp), it increments the
+ * discarded counter and logs the first occurrence. Otherwise, it updates the state.
+ */
+static void rb_validate_buffer(struct buffer_page *bpage,
+ struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_cpu_meta *meta,
+ struct rb_validation_state *state,
+ u64 prev_ts, u64 next_ts)
+{
+ int ret;
+
+ ret = __rb_validate_buffer(bpage, cpu_buffer->cpu, meta, prev_ts, next_ts);
+ if (ret < 0) {
+ if (!state->discarded)
+ pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
+ cpu_buffer->cpu);
+ state->discarded++;
+ } else {
+ /* If the buffer has content, update pages_touched */
+ if (ret)
+ local_inc(&cpu_buffer->pages_touched);
+
+ state->entries += ret;
+ state->entry_bytes += rb_page_size(bpage);
+ state->ts = bpage->page->time_stamp;
+ }
+}
- tail = local_read(&dpage->commit);
- return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+static void rb_meta_inject_reader_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_cpu_meta *meta,
+ struct buffer_page *orig_head,
+ struct buffer_page *head_page)
+{
+ struct buffer_page *bpage = orig_head;
+ int i;
+
+ rb_dec_page(&bpage);
+ /*
+ * Insert the reader_page before the original head page.
+ * Since the list encodes RB_PAGE flags, general list
+ * operations should be avoided.
+ */
+ cpu_buffer->reader_page->list.next = &orig_head->list;
+ cpu_buffer->reader_page->list.prev = orig_head->list.prev;
+ orig_head->list.prev = &cpu_buffer->reader_page->list;
+ bpage->list.next = &cpu_buffer->reader_page->list;
+
+ /* Make the head_page the reader page */
+ cpu_buffer->reader_page = head_page;
+ bpage = head_page;
+ rb_inc_page(&head_page);
+ head_page->list.prev = bpage->list.prev;
+ rb_dec_page(&bpage);
+ bpage->list.next = &head_page->list;
+ rb_set_list_to_head(&bpage->list);
+ cpu_buffer->pages = &head_page->list;
+
+ cpu_buffer->head_page = head_page;
+ meta->head_buffer = (unsigned long)head_page->page;
+
+ /* Reset all the indexes */
+ bpage = cpu_buffer->reader_page;
+ meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
+ bpage->id = 0;
+
+ for (i = 1, bpage = head_page; i < meta->nr_subbufs;
+ i++, rb_inc_page(&bpage)) {
+ meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
+ bpage->id = i;
+ }
}
/* If the meta data has been validated, now validate the events */
@@ -1885,10 +2031,9 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
{
struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
struct buffer_page *head_page, *orig_head, *orig_reader;
- unsigned long entry_bytes = 0;
- unsigned long entries = 0;
+ struct rb_validation_state state = { 0 };
+ bool skip = false;
int ret;
- u64 ts;
int i;
if (!meta || !meta->head_buffer)
@@ -1897,20 +2042,26 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
orig_head = head_page = cpu_buffer->head_page;
orig_reader = cpu_buffer->reader_page;
- /* Do the reader page first */
- ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu);
+ /* Do the head page first */
+ ret = __rb_validate_buffer(head_page, cpu_buffer->cpu, meta, 0, 0);
if (ret < 0) {
- pr_info("Ring buffer reader page is invalid\n");
- goto invalid;
+ pr_info("Ring buffer meta [%d] invalid head page detected\n",
+ cpu_buffer->cpu);
+ /* Don't bother rewinding */
+ skip = true;
+ state.ts = 0;
+ } else {
+ state.ts = head_page->page->time_stamp;
}
- entries += ret;
- entry_bytes += local_read(&orig_reader->page->commit);
- local_set(&orig_reader->entries, ret);
- ts = head_page->page->time_stamp;
+ /* Do the reader page - reader must be previous to head. */
+ rb_validate_buffer(orig_reader, cpu_buffer, meta, &state, 0, state.ts);
+
+ if (skip)
+ goto skip_rewind;
/*
- * Try to rewind the head so that we can read the pages which already
+ * Try to rewind the head so that we can read the pages which are already
* read in the previous boot.
*/
if (head_page == cpu_buffer->tail_page)
@@ -1923,26 +2074,15 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
if (head_page == cpu_buffer->tail_page)
break;
- /* Ensure the page has older data than head. */
- if (ts < head_page->page->time_stamp)
- break;
-
- ts = head_page->page->time_stamp;
- /* Ensure the page has correct timestamp and some data. */
- if (!ts || rb_page_commit(head_page) == 0)
- break;
-
- /* Stop rewind if the page is invalid. */
- ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
- if (ret < 0)
+ /* Rewind until unused page (no timestamp, no commit). */
+ if (!head_page->page->time_stamp && rb_page_commit(head_page) == 0)
break;
- /* Recover the number of entries and update stats. */
- local_set(&head_page->entries, ret);
- if (ret)
- local_inc(&cpu_buffer->pages_touched);
- entries += ret;
- entry_bytes += rb_page_commit(head_page);
+ /*
+ * Skip if the page is invalid, or its timestamp is newer than the
+ * previous valid page.
+ */
+ rb_validate_buffer(head_page, cpu_buffer, meta, &state, 0, state.ts);
}
if (i)
pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
@@ -1956,43 +2096,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
* into the location just before the original head page.
*/
if (head_page != orig_head) {
- struct buffer_page *bpage = orig_head;
-
- rb_dec_page(&bpage);
- /*
- * Insert the reader_page before the original head page.
- * Since the list encode RB_PAGE flags, general list
- * operations should be avoided.
- */
- cpu_buffer->reader_page->list.next = &orig_head->list;
- cpu_buffer->reader_page->list.prev = orig_head->list.prev;
- orig_head->list.prev = &cpu_buffer->reader_page->list;
- bpage->list.next = &cpu_buffer->reader_page->list;
-
- /* Make the head_page the reader page */
- cpu_buffer->reader_page = head_page;
- bpage = head_page;
- rb_inc_page(&head_page);
- head_page->list.prev = bpage->list.prev;
- rb_dec_page(&bpage);
- bpage->list.next = &head_page->list;
- rb_set_list_to_head(&bpage->list);
- cpu_buffer->pages = &head_page->list;
-
- cpu_buffer->head_page = head_page;
- meta->head_buffer = (unsigned long)head_page->page;
-
- /* Reset all the indexes */
- bpage = cpu_buffer->reader_page;
- meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
- bpage->id = 0;
-
- for (i = 1, bpage = head_page; i < meta->nr_subbufs;
- i++, rb_inc_page(&bpage)) {
- meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
- bpage->id = i;
- }
-
+ rb_meta_inject_reader_page(cpu_buffer, meta, orig_head, head_page);
/* We'll restart verifying from orig_head */
head_page = orig_head;
}
@@ -2004,6 +2108,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
/* Nothing more to do, the only page is the reader page */
goto done;
}
+ state.ts = head_page->page->time_stamp;
/* Iterate until finding the commit page */
for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
@@ -2012,20 +2117,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
if (head_page == orig_reader)
continue;
- ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
- if (ret < 0) {
- pr_info("Ring buffer meta [%d] invalid buffer page\n",
- cpu_buffer->cpu);
- goto invalid;
- }
-
- /* If the buffer has content, update pages_touched */
- if (ret)
- local_inc(&cpu_buffer->pages_touched);
-
- entries += ret;
- entry_bytes += local_read(&head_page->page->commit);
- local_set(&head_page->entries, ret);
+ rb_validate_buffer(head_page, cpu_buffer, meta, &state, state.ts, 0);
if (head_page == cpu_buffer->commit_page)
break;
@@ -2037,10 +2129,28 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
goto invalid;
}
done:
- local_set(&cpu_buffer->entries, entries);
- local_set(&cpu_buffer->entries_bytes, entry_bytes);
-
- pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu);
+ local_set(&cpu_buffer->entries, state.entries);
+ local_set(&cpu_buffer->entries_bytes, state.entry_bytes);
+
+ pr_info("Ring buffer meta [%d] is from previous boot!", cpu_buffer->cpu);
+ if (state.discarded)
+ pr_cont(" (%d pages discarded)", state.discarded);
+ pr_cont("\n");
+
+#ifdef CONFIG_RING_BUFFER_PERSISTENT_INJECT
+ if (meta->nr_invalid)
+ pr_warn("Ring buffer testing [%d] invalid pages: %s (%d/%d)\n",
+ cpu_buffer->cpu,
+ (state.discarded == meta->nr_invalid) ? "PASSED" : "FAILED",
+ state.discarded, meta->nr_invalid);
+ if (meta->entry_bytes)
+ pr_warn("Ring buffer testing [%d] entry_bytes: %s (%ld/%ld)\n",
+ cpu_buffer->cpu,
+ (state.entry_bytes == meta->entry_bytes) ? "PASSED" : "FAILED",
+ (long)state.entry_bytes, (long)meta->entry_bytes);
+ meta->nr_invalid = 0;
+ meta->entry_bytes = 0;
+#endif
return;
invalid:
@@ -2050,12 +2160,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
/* Reset the reader page */
local_set(&cpu_buffer->reader_page->entries, 0);
- local_set(&cpu_buffer->reader_page->page->commit, 0);
+ rb_init_data_page(cpu_buffer->reader_page->page);
/* Reset all the subbuffers */
for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
local_set(&head_page->entries, 0);
- local_set(&head_page->page->commit, 0);
+ rb_init_data_page(head_page->page);
}
}
@@ -2115,7 +2225,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int sc
*/
for (i = 0; i < meta->nr_subbufs; i++) {
meta->buffers[i] = i;
- rb_init_page(subbuf);
+ rb_init_data_page(subbuf);
subbuf += meta->subbuf_size;
}
}
@@ -2152,6 +2262,7 @@ static int rbm_show(struct seq_file *m, void *v)
struct ring_buffer_per_cpu *cpu_buffer = m->private;
struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
unsigned long val = (unsigned long)v;
+ struct buffer_data_page *dpage;
if (val == 1) {
seq_printf(m, "head_buffer: %d\n",
@@ -2164,7 +2275,9 @@ static int rbm_show(struct seq_file *m, void *v)
}
val -= 2;
- seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]);
+ dpage = rb_range_buffer(cpu_buffer, val);
+ seq_printf(m, "buffer[%ld]: %d (commit: %ld)\n",
+ val, meta->buffers[val], dpage ? rb_data_page_commit(dpage) : -1);
return 0;
}
@@ -2521,6 +2634,76 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
+#ifdef CONFIG_RING_BUFFER_PERSISTENT_INJECT
+static void rb_test_inject_invalid_pages(struct trace_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_cpu_meta *meta;
+ struct buffer_data_page *dpage;
+ unsigned long entry_bytes = 0;
+ unsigned long ptr;
+ int subbuf_size;
+ int invalid = 0;
+ int cpu;
+ int i;
+
+ if (!(buffer->flags & RB_FL_TESTING))
+ return;
+
+ guard(preempt)();
+ cpu = smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+ if (!cpu_buffer)
+ return;
+ meta = cpu_buffer->ring_meta;
+ if (!meta)
+ return;
+
+ ptr = (unsigned long)rb_subbufs_from_meta(meta);
+ subbuf_size = meta->subbuf_size;
+
+ for (i = 0; i < meta->nr_subbufs; i++) {
+ unsigned long idx = meta->buffers[i];
+
+ dpage = (void *)(ptr + idx * subbuf_size);
+ /* Skip unused pages */
+ if (!rb_data_page_commit(dpage))
+ continue;
+
+ /*
+ * Invalidate even pages or multiples of 5. This will cause 3
+ * contiguous invalidated (empty) pages.
+ */
+ if (!(i & 0x1) || !(i % 5)) {
+ local_add(subbuf_size + 1, &dpage->commit);
+ invalid++;
+ } else {
+ /* Count total commit bytes. */
+ entry_bytes += rb_data_page_size(dpage);
+ }
+ }
+
+ pr_info("Inject invalidated %d pages on CPU%d, total size: %ld\n",
+ invalid, cpu, (long)entry_bytes);
+ meta->nr_invalid = invalid;
+ meta->entry_bytes = entry_bytes;
+}
+#else /* !CONFIG_RING_BUFFER_PERSISTENT_INJECT */
+#define rb_test_inject_invalid_pages(buffer) do { } while (0)
+#endif
+
+/* Stop recording on a persistent buffer and flush cache if needed. */
+static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
+{
+ struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb);
+
+ ring_buffer_record_off(buffer);
+ rb_test_inject_invalid_pages(buffer);
+ arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);
+ return NOTIFY_DONE;
+}
+
static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long end,
@@ -2651,6 +2834,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
mutex_init(&buffer->mutex);
+ /* Persistent ring buffer needs to flush cache before reboot. */
+ if (start && end) {
+ buffer->flush_nb.notifier_call = rb_flush_buffer_cb;
+ atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb);
+ }
+
return_ptr(buffer);
fail_free_buffers:
@@ -2749,6 +2938,9 @@ ring_buffer_free(struct trace_buffer *buffer)
{
int cpu;
+ if (buffer->range_addr_start && buffer->range_addr_end)
+ atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb);
+
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
irq_work_sync(&buffer->irq_work.work);
@@ -3265,7 +3457,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
* is a mb(), which will synchronize with the rmb here.
* (see rb_tail_page_update() and __rb_reserve_next())
*/
- commit = rb_page_commit(iter_head_page);
+ commit = rb_page_size(iter_head_page);
smp_rmb();
/* An event needs to be at least 8 bytes in size */
@@ -3294,7 +3486,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
/* Make sure the page didn't change since we read this */
if (iter->page_stamp != iter_head_page->page->time_stamp ||
- commit > rb_page_commit(iter_head_page))
+ commit > rb_page_size(iter_head_page))
goto reset;
iter->next_event = iter->head + length;
@@ -3308,12 +3500,6 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
return NULL;
}
-/* Size is determined by what has been committed */
-static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
-{
- return rb_page_commit(bpage) & ~RB_MISSED_MASK;
-}
-
static __always_inline unsigned
rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -3345,6 +3531,9 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
else
rb_inc_page(&iter->head_page);
+ if (rb_page_commit(iter->head_page) & RB_MISSED_EVENTS)
+ iter->missed_events = -1;
+
iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
iter->head = 0;
iter->next_event = 0;
@@ -3769,13 +3958,6 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return skip_time_extend(event);
}
-#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline bool sched_clock_stable(void)
-{
- return true;
-}
-#endif
-
static void
rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info)
@@ -4023,8 +4205,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
RB_WARN_ON(cpu_buffer,
- local_read(&cpu_buffer->commit_page->page->commit) &
- ~RB_WRITE_MASK);
+ rb_page_commit(cpu_buffer->commit_page) & ~RB_WRITE_MASK);
barrier();
}
@@ -4396,7 +4577,7 @@ static const char *show_interrupt_level(void)
return show_irq_str(level);
}
-static void dump_buffer_page(struct buffer_data_page *bpage,
+static void dump_buffer_page(struct buffer_data_page *dpage,
struct rb_event_info *info,
unsigned long tail)
{
@@ -4404,12 +4585,12 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
u64 ts, delta;
int e;
- ts = bpage->time_stamp;
+ ts = dpage->time_stamp;
pr_warn(" [%lld] PAGE TIME STAMP\n", ts);
for (e = 0; e < tail; e += rb_event_length(event)) {
- event = (struct ring_buffer_event *)(bpage->data + e);
+ event = (struct ring_buffer_event *)(dpage->data + e);
switch (event->type_len) {
@@ -4459,7 +4640,7 @@ static atomic_t ts_dump;
} \
atomic_inc(&cpu_buffer->record_disabled); \
pr_warn(fmt, ##__VA_ARGS__); \
- dump_buffer_page(bpage, info, tail); \
+ dump_buffer_page(dpage, info, tail); \
atomic_dec(&ts_dump); \
/* There's some cases in boot up that this can happen */ \
if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \
@@ -4475,16 +4656,16 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info,
unsigned long tail)
{
- struct buffer_data_page *bpage;
+ struct buffer_data_page *dpage;
u64 ts, delta;
bool full = false;
int ret;
- bpage = info->tail_page->page;
+ dpage = info->tail_page->page;
if (tail == CHECK_FULL_PAGE) {
full = true;
- tail = local_read(&bpage->commit);
+ tail = rb_data_page_commit(dpage);
} else if (info->add_timestamp &
(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) {
/* Ignore events with absolute time stamps */
@@ -4495,7 +4676,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
* Do not check the first event (skip possible extends too).
* Also do not check if previous events have not been committed.
*/
- if (tail <= 8 || tail > local_read(&bpage->commit))
+ if (tail <= 8 || tail > rb_data_page_commit(dpage))
return;
/*
@@ -4504,7 +4685,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
goto out;
- ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
+ ret = rb_read_data_buffer(dpage, tail, cpu_buffer->cpu, &ts, &delta);
if (ret < 0) {
if (delta < ts) {
buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n",
@@ -5407,6 +5588,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
iter->next_event = iter->head;
+ iter->missed_events = 0;
iter->cache_reader_page = iter->head_page;
iter->cache_read = cpu_buffer->read;
@@ -5471,7 +5653,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
* (see rb_tail_page_update())
*/
smp_rmb();
- commit = rb_page_commit(commit_page);
+ commit = rb_page_size(commit_page);
/* We want to make sure that the commit page doesn't change */
smp_rmb();
@@ -5613,10 +5795,12 @@ __rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer)
static struct buffer_page *
__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct buffer_page *reader = NULL;
+ int max_loops = cpu_buffer->ring_meta ? cpu_buffer->nr_pages : 3;
unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
+ struct buffer_page *reader = NULL;
unsigned long overwrite;
unsigned long flags;
+ int missed_events = 0;
int nr_loops = 0;
bool ret;
@@ -5626,11 +5810,14 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
again:
/*
* This should normally only loop twice. But because the
- * start of the reader inserts an empty page, it causes
- * a case where we will loop three times. There should be no
- * reason to loop four times (that I know of).
+ * start of the reader inserts an empty page, it causes a
+ * case where we will loop three times. There should be no
+ * reason to loop four times unless the ring buffer is a
+ * recovered persistent ring buffer. For persistent ring buffers,
+ * invalid pages are reset during recovery, so more than 3
+ * contiguous pages may be empty, but fewer than nr_pages.
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > max_loops)) {
reader = NULL;
goto out;
}
@@ -5660,6 +5847,7 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
*/
local_set(&cpu_buffer->reader_page->write, 0);
local_set(&cpu_buffer->reader_page->entries, 0);
+ rb_init_data_page(cpu_buffer->reader_page->page);
cpu_buffer->reader_page->real_end = 0;
spin:
@@ -5713,6 +5901,9 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
if (!ret)
goto spin;
+ if (rb_page_commit(reader) & RB_MISSED_EVENTS)
+ missed_events = -1;
+
if (cpu_buffer->ring_meta)
rb_update_meta_reader(cpu_buffer, reader);
@@ -5777,6 +5968,8 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
*/
smp_rmb();
+ if (!cpu_buffer->lost_events)
+ cpu_buffer->lost_events = missed_events;
return reader;
}
@@ -5927,12 +6120,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
int nr_loops = 0;
+ int max_loops;
if (ts)
*ts = 0;
cpu_buffer = iter->cpu_buffer;
buffer = cpu_buffer->buffer;
+ max_loops = cpu_buffer->ring_meta ? cpu_buffer->nr_pages : 3;
/*
* Check if someone performed a consuming read to the buffer
@@ -5955,7 +6150,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
* the ring buffer with an active write as the consumer is.
* Do not warn if the three failures is reached.
*/
- if (++nr_loops > 3)
+ if (++nr_loops > max_loops)
return NULL;
if (rb_per_cpu_empty(cpu_buffer))
@@ -6086,10 +6281,7 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
*/
bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter)
{
- bool ret = iter->missed_events != 0;
-
- iter->missed_events = 0;
- return ret;
+ return iter->missed_events != 0;
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped);
@@ -6251,7 +6443,7 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
unsigned long flags;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-
+ iter->missed_events = 0;
rb_advance_iter(iter);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -6291,7 +6483,7 @@ static void rb_clear_buffer_page(struct buffer_page *page)
{
local_set(&page->write, 0);
local_set(&page->entries, 0);
- rb_init_page(page->page);
+ rb_init_data_page(page->page);
page->read = 0;
}
@@ -6776,7 +6968,7 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
local_irq_restore(flags);
if (bpage->data) {
- rb_init_page(bpage->data);
+ rb_init_data_page(bpage->data);
} else {
bpage->data = alloc_cpu_data(cpu, cpu_buffer->buffer->subbuf_order);
if (!bpage->data) {
@@ -6801,8 +6993,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
struct buffer_data_read_page *data_page)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = data_page->data;
- struct page *page = virt_to_page(bpage);
+ struct buffer_data_page *dpage = data_page->data;
+ struct page *page = virt_to_page(dpage);
unsigned long flags;
if (!buffer || !buffer->buffers || !buffer->buffers[cpu])
@@ -6822,15 +7014,15 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
arch_spin_lock(&cpu_buffer->lock);
if (!cpu_buffer->free_page) {
- cpu_buffer->free_page = bpage;
- bpage = NULL;
+ cpu_buffer->free_page = dpage;
+ dpage = NULL;
}
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
out:
- free_pages((unsigned long)bpage, data_page->order);
+ free_pages((unsigned long)dpage, data_page->order);
kfree(data_page);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
@@ -6875,10 +7067,11 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
- struct buffer_data_page *bpage;
+ struct buffer_data_page *dpage;
struct buffer_page *reader;
- unsigned long missed_events;
+ long missed_events;
unsigned int commit;
+ unsigned int size;
unsigned int read;
u64 save_timestamp;
bool force_memcpy;
@@ -6901,8 +7094,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (data_page->order != buffer->subbuf_order)
return -1;
- bpage = data_page->data;
- if (!bpage)
+ dpage = data_page->data;
+ if (!dpage)
return -1;
guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
@@ -6914,7 +7107,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
event = rb_reader_event(cpu_buffer);
read = reader->read;
- commit = rb_page_size(reader);
+ commit = rb_page_commit(reader);
+ size = rb_page_size(reader);
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
@@ -6928,13 +7122,14 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
* we must copy the data from the page to the buffer.
* Otherwise, we can simply swap the page with the one passed in.
*/
- if (read || (len < (commit - read)) ||
+ if (read || (len < (size - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page ||
force_memcpy) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
- unsigned int size;
+ unsigned int event_size;
+ unsigned int flags = 0;
/*
* If a full page is expected, this can still be returned
@@ -6943,19 +7138,22 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
* the reader page.
*/
if (full &&
- (!read || (len < (commit - read)) ||
+ (!read || (len < (size - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page))
return -1;
- if (len > (commit - read))
- len = (commit - read);
+ if (len > (size - read))
+ len = (size - read);
/* Always keep the time extend and data together */
- size = rb_event_ts_length(event);
+ event_size = rb_event_ts_length(event);
- if (len < size)
+ if (len < event_size)
return -1;
+ if (commit & RB_MISSED_EVENTS)
+ flags = RB_MISSED_EVENTS;
+
/* save the current timestamp, since the user will need it */
save_timestamp = cpu_buffer->read_stamp;
@@ -6967,26 +7165,26 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
* one or two events.
* We have already ensured there's enough space if this
* is a time extend. */
- size = rb_event_length(event);
- memcpy(bpage->data + pos, rpage->data + rpos, size);
+ event_size = rb_event_length(event);
+ memcpy(dpage->data + pos, rpage->data + rpos, event_size);
- len -= size;
+ len -= event_size;
rb_advance_reader(cpu_buffer);
rpos = reader->read;
- pos += size;
+ pos += event_size;
- if (rpos >= commit)
+ if (rpos >= size)
break;
event = rb_reader_event(cpu_buffer);
/* Always keep the time extend and data together */
- size = rb_event_ts_length(event);
- } while (len >= size);
+ event_size = rb_event_ts_length(event);
+ } while (len >= event_size);
- /* update bpage */
- local_set(&bpage->commit, pos);
- bpage->time_stamp = save_timestamp;
+ /* update dpage */
+ local_set(&dpage->commit, pos | flags);
+ dpage->time_stamp = save_timestamp;
/* we copied everything to the beginning */
read = 0;
@@ -6996,13 +7194,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
cpu_buffer->read_bytes += rb_page_size(reader);
/* swap the pages */
- rb_init_page(bpage);
- bpage = reader->page;
+ rb_init_data_page(dpage);
+ dpage = reader->page;
reader->page = data_page->data;
local_set(&reader->write, 0);
local_set(&reader->entries, 0);
reader->read = 0;
- data_page->data = bpage;
+ data_page->data = dpage;
+ if (!missed_events && rb_data_page_commit(dpage) & RB_MISSED_EVENTS)
+ missed_events = -1;
/*
* Use the real_end for the data size,
@@ -7010,33 +7210,43 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
* on the page.
*/
if (reader->real_end)
- local_set(&bpage->commit, reader->real_end);
+ local_set(&dpage->commit, reader->real_end);
}
cpu_buffer->lost_events = 0;
- commit = local_read(&bpage->commit);
+ size = rb_data_page_size(dpage);
/*
* Set a flag in the commit field if we lost events
*/
if (missed_events) {
- /* If there is room at the end of the page to save the
+ /*
+ * If there is room at the end of the page to save the
* missed events, then record it there.
*/
- if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
- memcpy(&bpage->data[commit], &missed_events,
+ if (missed_events > 0 &&
+ buffer->subbuf_size - size >= sizeof(missed_events)) {
+ memcpy(&dpage->data[size], &missed_events,
sizeof(missed_events));
- local_add(RB_MISSED_STORED, &bpage->commit);
- commit += sizeof(missed_events);
+ local_add(RB_MISSED_STORED, &dpage->commit);
+ size += sizeof(missed_events);
}
- local_add(RB_MISSED_EVENTS, &bpage->commit);
+ /*
+ * Note, for the persistent ring buffer, the RB_MISSED_EVENTS
+ * may have been set in the main buffer via the verification code.
+ * But here, dpage is a copy of that page and has not yet had
+ * the RB_MISSED_EVENTS set. As for the normal buffers,
+ * the main write buffer does not set these bits and it needs
+ * to be set here.
+ */
+ local_add(RB_MISSED_EVENTS, &dpage->commit);
}
/*
* This page may be off to user land. Zero it out here.
*/
- if (commit < buffer->subbuf_size)
- memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
+ if (size < buffer->subbuf_size)
+ memset(&dpage->data[size], 0, buffer->subbuf_size - size);
return read;
}
@@ -7667,7 +7877,7 @@ consume:
if (missed_events) {
if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
- struct buffer_data_page *bpage = reader->page;
+ struct buffer_data_page *dpage = reader->page;
unsigned int commit;
/*
* Use the real_end for the data size,
@@ -7675,18 +7885,18 @@ consume:
* on the page.
*/
if (reader->real_end)
- local_set(&bpage->commit, reader->real_end);
+ local_set(&dpage->commit, reader->real_end);
/*
* If there is room at the end of the page to save the
* missed events, then record it there.
*/
commit = rb_page_size(reader);
if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
- memcpy(&bpage->data[commit], &missed_events,
+ memcpy(&dpage->data[commit], &missed_events,
sizeof(missed_events));
- local_add(RB_MISSED_STORED, &bpage->commit);
+ local_add(RB_MISSED_STORED, &dpage->commit);
}
- local_add(RB_MISSED_EVENTS, &bpage->commit);
+ local_add(RB_MISSED_EVENTS, &dpage->commit);
} else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page,
"Reader on commit with %ld missed events",
missed_events)) {
diff --git a/kernel/trace/rv/monitors/deadline/deadline.h b/kernel/trace/rv/monitors/deadline/deadline.h
index 0bbfd2543329..78fca873d61e 100644
--- a/kernel/trace/rv/monitors/deadline/deadline.h
+++ b/kernel/trace/rv/monitors/deadline/deadline.h
@@ -95,7 +95,8 @@ static inline u8 get_server_type(struct task_struct *tsk)
static inline int extract_params(struct pt_regs *regs, long id, pid_t *pid_out)
{
size_t size = offsetofend(struct sched_attr, sched_flags);
- struct sched_attr __user *uattr, attr;
+ struct sched_attr __user *uattr;
+ struct sched_attr attr;
int new_policy = -1, ret;
unsigned long args[6];
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.c b/kernel/trace/rv/monitors/nomiss/nomiss.c
index 31f90f3638d8..8ead8783c29f 100644
--- a/kernel/trace/rv/monitors/nomiss/nomiss.c
+++ b/kernel/trace/rv/monitors/nomiss/nomiss.c
@@ -227,7 +227,7 @@ static int enable_nomiss(void)
{
int retval;
- retval = da_monitor_init();
+ retval = ha_monitor_init();
if (retval)
return retval;
@@ -263,7 +263,7 @@ static void disable_nomiss(void)
rv_detach_trace_probe("nomiss", sched_switch, handle_sched_switch);
rv_detach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup);
- da_monitor_destroy();
+ ha_monitor_destroy();
}
static struct rv_monitor rv_this = {
diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c
index 4594c7c46601..3b6a85e815b8 100644
--- a/kernel/trace/rv/monitors/opid/opid.c
+++ b/kernel/trace/rv/monitors/opid/opid.c
@@ -22,14 +22,8 @@ static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_opid env, u64 time_ns
if (env == irq_off_opid)
return irqs_disabled();
else if (env == preempt_off_opid) {
- /*
- * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
- * preemption (adding one to the preempt_count). Since we are
- * interested in the preempt_count at the time the tracepoint was
- * hit, we consider 1 as still enabled.
- */
if (IS_ENABLED(CONFIG_PREEMPTION))
- return (preempt_count() & PREEMPT_MASK) > 1;
+ return (preempt_count() & PREEMPT_MASK) > 0;
return true;
}
return ENV_INVALID_VALUE;
@@ -73,7 +67,7 @@ static int enable_opid(void)
{
int retval;
- retval = da_monitor_init();
+ retval = ha_monitor_init();
if (retval)
return retval;
@@ -90,7 +84,7 @@ static void disable_opid(void)
rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
rv_detach_trace_probe("opid", sched_waking, handle_sched_waking);
- da_monitor_destroy();
+ ha_monitor_destroy();
}
/*
diff --git a/kernel/trace/rv/monitors/stall/stall.c b/kernel/trace/rv/monitors/stall/stall.c
index 9ccfda6b0e73..3c38fb1a0159 100644
--- a/kernel/trace/rv/monitors/stall/stall.c
+++ b/kernel/trace/rv/monitors/stall/stall.c
@@ -103,7 +103,7 @@ static int enable_stall(void)
{
int retval;
- retval = da_monitor_init();
+ retval = ha_monitor_init();
if (retval)
return retval;
@@ -120,7 +120,7 @@ static void disable_stall(void)
rv_detach_trace_probe("stall", sched_switch, handle_sched_switch);
rv_detach_trace_probe("stall", sched_wakeup, handle_sched_wakeup);
- da_monitor_destroy();
+ ha_monitor_destroy();
}
static struct rv_monitor rv_this = {
diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c
index 02af2297ae5a..f4642f5adda3 100644
--- a/kernel/trace/simple_ring_buffer.c
+++ b/kernel/trace/simple_ring_buffer.c
@@ -395,7 +395,6 @@ int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta));
cpu_buffer->meta->meta_page_size = PAGE_SIZE;
- cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages;
/* The reader page is not part of the ring initially */
page = load_page(desc->page_va[0]);
@@ -431,12 +430,13 @@ int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
if (ret) {
for (i--; i >= 0; i--)
- unload_page((void *)desc->page_va[i]);
+ unload_page(bpages[i].page);
unload_page(cpu_buffer->meta);
return ret;
}
+ cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages;
/* Close the ring */
bpage->link.next = &cpu_buffer->tail_page->link;
cpu_buffer->tail_page->link.prev = &bpage->link;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6eb4d3097a4d..1146b83b711a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2338,15 +2338,6 @@ void trace_last_func_repeats(struct trace_array *tr,
__buffer_unlock_commit(buffer, event);
}
-static void trace_iterator_increment(struct trace_iterator *iter)
-{
- struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);
-
- iter->idx++;
- if (buf_iter)
- ring_buffer_iter_advance(buf_iter);
-}
-
static struct trace_entry *
peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
unsigned long *lost_events)
@@ -2676,11 +2667,17 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
/* Find the next real entry, and increment the iterator to the next entry */
void *trace_find_next_entry_inc(struct trace_iterator *iter)
{
+ struct ring_buffer_iter *buf_iter;
+
iter->ent = __find_next_entry(iter, &iter->cpu,
&iter->lost_events, &iter->ts);
- if (iter->ent)
- trace_iterator_increment(iter);
+ if (iter->ent) {
+ iter->idx++;
+ buf_iter = trace_buffer_iter(iter, iter->cpu);
+ if (buf_iter)
+ ring_buffer_iter_advance(buf_iter);
+ }
return iter->ent ? iter : NULL;
}
@@ -4474,7 +4471,7 @@ static const char readme_msg[] =
"\t snapshot() - snapshot the trace buffer\n\n"
#endif
#ifdef CONFIG_SYNTH_EVENTS
- " events/synthetic_events\t- Create/append/remove/show synthetic events\n"
+ " synthetic_events\t- Create/append/remove/show synthetic events\n"
"\t Write into this file to define/undefine new synthetic events.\n"
"\t example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n"
#endif
@@ -7928,8 +7925,8 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer,
if (!topts)
return 0;
- tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1),
- GFP_KERNEL);
+ tr_topts = krealloc_array(tr->topts, tr->nr_topts + 1, sizeof(*tr->topts),
+ GFP_KERNEL);
if (!tr_topts) {
kfree(topts);
return -ENOMEM;
@@ -8383,6 +8380,8 @@ static void setup_trace_scratch(struct trace_array *tr,
memset(tscratch, 0, size);
}
+#define TRACE_TEST_PTRACING_NAME "ptracingtest"
+
int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size)
{
enum ring_buffer_flags rb_flags;
@@ -8394,6 +8393,8 @@ int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int
buf->tr = tr;
if (tr->range_addr_start && tr->range_addr_size) {
+ if (tr->name && !strcmp(tr->name, TRACE_TEST_PTRACING_NAME))
+ rb_flags |= RB_FL_TESTING;
/* Add scratch buffer to handle 128 modules */
buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
tr->range_addr_start,
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index d1564db95a8f..d8e97ad798f0 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -181,8 +181,7 @@ __init static int init_branch_tracer(void)
ret = register_trace_event(&trace_branch_event);
if (!ret) {
- printk(KERN_WARNING "Warning: could not register "
- "branch events\n");
+ pr_warn("Warning: could not register branch events\n");
return 1;
}
return register_tracer(&branch_trace);
@@ -374,8 +373,7 @@ __init static int init_annotated_branch_stats(void)
ret = register_stat_tracer(&annotated_branch_stats);
if (ret) {
- printk(KERN_WARNING "Warning: could not register "
- "annotated branches stats\n");
+ pr_warn("Warning: could not register annotated branches stats\n");
return ret;
}
return 0;
@@ -439,8 +437,7 @@ __init static int all_annotated_branch_stats(void)
ret = register_stat_tracer(&all_branch_stats);
if (ret) {
- printk(KERN_WARNING "Warning: could not register "
- "all branches stats\n");
+ pr_warn("Warning: could not register all branches stats\n");
return ret;
}
return 0;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a6bb7577e8c5..5b272856e5ab 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -497,7 +497,17 @@ static int perf_ftrace_function_register(struct perf_event *event)
static int perf_ftrace_function_unregister(struct perf_event *event)
{
struct ftrace_ops *ops = &event->ftrace_ops;
- int ret = unregister_ftrace_function(ops);
+ int ret = 0;
+
+ /*
+ * Perf will call this unconditionally even if the ops is not
+ * enabled. The unregister_ftrace_function() will warn if called
+ * when not enabled. Just bypass the unregistering if ops isn't
+ * enabled here.
+ */
+ if (ops->flags & FTRACE_OPS_FL_ENABLED)
+ ret = unregister_ftrace_function(ops);
+
ftrace_free_filter(ops);
return ret;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0dbbf6cca9bc..82ce492ab268 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/security.h>
+#include <linux/seq_buf.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
@@ -682,8 +683,8 @@ struct track_data {
struct hist_elt_data {
char *comm;
u64 *var_ref_vals;
- char **field_var_str;
int n_field_var_str;
+ char *field_var_str[] __counted_by(n_field_var_str);
};
struct snapshot_context {
@@ -1369,10 +1370,8 @@ static const char *hist_field_name(struct hist_field *field,
len = snprintf(full_name, sizeof(full_name), fmt,
field->system, field->event_name,
field->name);
- if (len >= sizeof(full_name))
- return NULL;
-
- field_name = full_name;
+ if (len < sizeof(full_name))
+ field_name = full_name;
} else
field_name = field->name;
} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
@@ -1630,8 +1629,6 @@ static void hist_elt_data_free(struct hist_elt_data *elt_data)
for (i = 0; i < elt_data->n_field_var_str; i++)
kfree(elt_data->field_var_str[i]);
- kfree(elt_data->field_var_str);
-
kfree(elt_data->comm);
kfree(elt_data);
}
@@ -1651,10 +1648,19 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
struct hist_field *hist_field;
unsigned int i, n_str;
- elt_data = kzalloc_obj(*elt_data);
+ BUILD_BUG_ON(STR_VAR_LEN_MAX & (sizeof(u64) - 1));
+
+ n_str = hist_data->n_field_var_str + hist_data->n_save_var_str +
+ hist_data->n_var_str;
+ if (n_str > SYNTH_FIELDS_MAX)
+ return -EINVAL;
+
+ elt_data = kzalloc_flex(*elt_data, field_var_str, n_str);
if (!elt_data)
return -ENOMEM;
+ elt_data->n_field_var_str = n_str;
+
for_each_hist_field(i, hist_data) {
hist_field = hist_data->fields[i];
@@ -1668,24 +1674,8 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
}
}
- n_str = hist_data->n_field_var_str + hist_data->n_save_var_str +
- hist_data->n_var_str;
- if (n_str > SYNTH_FIELDS_MAX) {
- hist_elt_data_free(elt_data);
- return -EINVAL;
- }
-
- BUILD_BUG_ON(STR_VAR_LEN_MAX & (sizeof(u64) - 1));
-
size = STR_VAR_LEN_MAX;
- elt_data->field_var_str = kcalloc(n_str, sizeof(char *), GFP_KERNEL);
- if (!elt_data->field_var_str) {
- hist_elt_data_free(elt_data);
- return -EINVAL;
- }
- elt_data->n_field_var_str = n_str;
-
for (i = 0; i < n_str; i++) {
elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
if (!elt_data->field_var_str[i]) {
@@ -2969,13 +2959,22 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
{
struct hist_field *event_var;
char *synthetic_name;
+ struct seq_buf s;
synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
if (!synthetic_name)
return ERR_PTR(-ENOMEM);
- strcpy(synthetic_name, "synthetic_");
- strcat(synthetic_name, field_name);
+ seq_buf_init(&s, synthetic_name, MAX_FILTER_STR_VAL);
+ seq_buf_printf(&s, "synthetic_%s", field_name);
+
+ /* Terminate synthetic_name with a NUL. */
+ seq_buf_str(&s);
+
+ if (seq_buf_has_overflowed(&s)) {
+ kfree(synthetic_name);
+ return ERR_PTR(-E2BIG);
+ }
event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
@@ -3021,6 +3020,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
struct hist_field *key_field;
struct hist_field *event_var;
char *saved_filter;
+ struct seq_buf s;
char *cmd;
int ret;
@@ -3065,28 +3065,34 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
return ERR_PTR(-ENOMEM);
}
+ seq_buf_init(&s, cmd, MAX_FILTER_STR_VAL);
+
/* Use the same keys as the compatible histogram */
- strcat(cmd, "keys=");
+ seq_buf_puts(&s, "keys=");
for_each_hist_key_field(i, hist_data) {
key_field = hist_data->fields[i];
if (!first)
- strcat(cmd, ",");
- strcat(cmd, key_field->field->name);
+ seq_buf_putc(&s, ',');
+ seq_buf_puts(&s, key_field->field->name);
first = false;
}
/* Create the synthetic field variable specification */
- strcat(cmd, ":synthetic_");
- strcat(cmd, field_name);
- strcat(cmd, "=");
- strcat(cmd, field_name);
+ seq_buf_printf(&s, ":synthetic_%s=%s", field_name, field_name);
/* Use the same filter as the compatible histogram */
saved_filter = find_trigger_filter(hist_data, file);
- if (saved_filter) {
- strcat(cmd, " if ");
- strcat(cmd, saved_filter);
+ if (saved_filter)
+ seq_buf_printf(&s, " if %s", saved_filter);
+
+ /* Terminate cmd with a NUL. */
+ seq_buf_str(&s);
+
+ if (seq_buf_has_overflowed(&s)) {
+ kfree(cmd);
+ kfree(var_hist);
+ return ERR_PTR(-E2BIG);
}
var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 39ac4eba0702..e6871230bde9 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -499,28 +499,19 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
return len;
}
-static void trace_event_raw_event_synth(void *__data,
- u64 *var_ref_vals,
- unsigned int *var_ref_idx)
+static __always_inline int get_field_size(struct synth_event *event,
+ u64 *var_ref_vals,
+ unsigned int *var_ref_idx)
{
- unsigned int i, n_u64, val_idx, len, data_size = 0;
- struct trace_event_file *trace_file = __data;
- struct synth_trace_event *entry;
- struct trace_event_buffer fbuffer;
- struct trace_buffer *buffer;
- struct synth_event *event;
- int fields_size = 0;
-
- event = trace_file->event_call->data;
-
- if (trace_trigger_soft_disabled(trace_file))
- return;
+ int fields_size;
fields_size = event->n_u64 * sizeof(u64);
- for (i = 0; i < event->n_dynamic_fields; i++) {
+ for (int i = 0; i < event->n_dynamic_fields; i++) {
unsigned int field_pos = event->dynamic_fields[i]->field_pos;
char *str_val;
+ int val_idx;
+ int len;
val_idx = var_ref_idx[field_pos];
str_val = (char *)(long)var_ref_vals[val_idx];
@@ -535,18 +526,18 @@ static void trace_event_raw_event_synth(void *__data,
fields_size += len;
}
+ return fields_size;
+}
- /*
- * Avoid ring buffer recursion detection, as this event
- * is being performed within another event.
- */
- buffer = trace_file->tr->array_buffer.buffer;
- guard(ring_buffer_nest)(buffer);
-
- entry = trace_event_buffer_reserve(&fbuffer, trace_file,
- sizeof(*entry) + fields_size);
- if (!entry)
- return;
+static __always_inline void write_synth_entry(struct synth_event *event,
+ struct synth_trace_event *entry,
+ u64 *var_ref_vals,
+ unsigned int *var_ref_idx)
+{
+ int data_size = 0;
+ int i, n_u64;
+ int val_idx;
+ int len;
for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
val_idx = var_ref_idx[i];
@@ -587,10 +578,83 @@ static void trace_event_raw_event_synth(void *__data,
n_u64++;
}
}
+}
+
+static void trace_event_raw_event_synth(void *__data,
+ u64 *var_ref_vals,
+ unsigned int *var_ref_idx)
+{
+ struct trace_event_file *trace_file = __data;
+ struct synth_trace_event *entry;
+ struct trace_event_buffer fbuffer;
+ struct trace_buffer *buffer;
+ struct synth_event *event;
+ int fields_size;
+
+ event = trace_file->event_call->data;
+
+ if (trace_trigger_soft_disabled(trace_file))
+ return;
+
+ fields_size = get_field_size(event, var_ref_vals, var_ref_idx);
+
+ /*
+ * Avoid ring buffer recursion detection, as this event
+ * is being performed within another event.
+ */
+ buffer = trace_file->tr->array_buffer.buffer;
+ guard(ring_buffer_nest)(buffer);
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + fields_size);
+ if (!entry)
+ return;
+
+ write_synth_entry(event, entry, var_ref_vals, var_ref_idx);
trace_event_buffer_commit(&fbuffer);
}
+#ifdef CONFIG_PERF_EVENTS
+static void perf_event_raw_event_synth(void *__data,
+ u64 *var_ref_vals,
+ unsigned int *var_ref_idx)
+{
+ struct trace_event_call *call = __data;
+ struct synth_trace_event *entry;
+ struct hlist_head *perf_head;
+ struct synth_event *event;
+ struct pt_regs *regs;
+ int fields_size;
+ size_t size;
+ int context;
+
+ event = call->data;
+
+ perf_head = this_cpu_ptr(call->perf_events);
+
+ if (!perf_head || hlist_empty(perf_head))
+ return;
+
+ fields_size = get_field_size(event, var_ref_vals, var_ref_idx);
+
+ size = ALIGN(sizeof(*entry) + fields_size, 8);
+
+ entry = perf_trace_buf_alloc(size, &regs, &context);
+
+ if (unlikely(!entry))
+ return;
+
+ write_synth_entry(event, entry, var_ref_vals, var_ref_idx);
+
+ perf_fetch_caller_regs(regs);
+
+ perf_trace_buf_submit(entry, size, context,
+ call->event.type, 1, regs,
+ perf_head, NULL);
+}
+#endif
+
static void free_synth_event_print_fmt(struct trace_event_call *call)
{
if (call) {
@@ -917,6 +981,9 @@ static int register_synth_event(struct synth_event *event)
call->flags = TRACE_EVENT_FL_TRACEPOINT;
call->class->reg = synth_event_reg;
call->class->probe = trace_event_raw_event_synth;
+#ifdef CONFIG_PERF_EVENTS
+ call->class->perf_probe = perf_event_raw_event_synth;
+#endif
call->data = event;
call->tp = event->tp;
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 75678053b21c..5e83c4f6f2b4 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -83,6 +83,22 @@ struct osnoise_instance {
static struct list_head osnoise_instances;
+static void osnoise_print(const char *fmt, ...)
+{
+ struct osnoise_instance *inst;
+ struct trace_array *tr;
+ va_list ap;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ tr = inst->tr;
+ va_start(ap, fmt);
+ trace_array_vprintk(tr, _RET_IP_, fmt, ap);
+ va_end(ap);
+ }
+ rcu_read_unlock();
+}
+
static bool osnoise_has_registered_instances(void)
{
return !!list_first_or_null_rcu(&osnoise_instances,
@@ -123,6 +139,7 @@ static int osnoise_register_instance(struct trace_array *tr)
* trace_types_lock.
*/
lockdep_assert_held(&trace_types_lock);
+ trace_array_init_printk(tr);
inst = kmalloc_obj(*inst);
if (!inst)
@@ -471,15 +488,7 @@ static void print_osnoise_headers(struct seq_file *s)
* osnoise_taint - report an osnoise error.
*/
#define osnoise_taint(msg) ({ \
- struct osnoise_instance *inst; \
- struct trace_buffer *buffer; \
- \
- rcu_read_lock(); \
- list_for_each_entry_rcu(inst, &osnoise_instances, list) { \
- buffer = inst->tr->array_buffer.buffer; \
- trace_array_printk_buf(buffer, _THIS_IP_, msg); \
- } \
- rcu_read_unlock(); \
+ osnoise_print(msg); \
osnoise_data.tainted = true; \
})
@@ -1189,10 +1198,10 @@ static __always_inline void osnoise_stop_exception(char *msg, int cpu)
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
tr = inst->tr;
- trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
- "stop tracing hit on cpu %d due to exception: %s\n",
- smp_processor_id(),
- msg);
+ trace_array_printk(tr, _THIS_IP_,
+ "stop tracing hit on cpu %d due to exception: %s\n",
+ smp_processor_id(),
+ msg);
if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
panic("tracer hit on cpu %d due to exception: %s\n",
@@ -1362,8 +1371,8 @@ static __always_inline void osnoise_stop_tracing(void)
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
tr = inst->tr;
- trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
- "stop tracing hit on cpu %d\n", smp_processor_id());
+ trace_array_printk(tr, _THIS_IP_,
+ "stop tracing hit on cpu %d\n", smp_processor_id());
if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
panic("tracer hit stop condition on CPU %d\n", smp_processor_id());
@@ -2544,9 +2553,12 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
notify_new_max_latency(diff);
tlat->tracing_thread = false;
- if (osnoise_data.stop_tracing_total)
- if (time_to_us(diff) >= osnoise_data.stop_tracing_total)
+ if (osnoise_data.stop_tracing_total) {
+ if (time_to_us(diff) >= osnoise_data.stop_tracing_total) {
+ timerlat_dump_stack(time_to_us(diff));
osnoise_stop_tracing();
+ }
+ }
} else {
tlat->tracing_thread = false;
tlat->kthread = current;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index e0d3a0da26af..fd1caa1f9723 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -332,6 +332,23 @@ static int parse_trace_event_arg(char *arg, struct fetch_insn *code,
return -ENOENT;
}
+static int parse_trace_event(char *arg, struct fetch_insn *code,
+ struct traceprobe_parse_context *ctx)
+{
+ int ret;
+
+ if (code->data)
+ return -EFAULT;
+ ret = parse_trace_event_arg(arg, code, ctx);
+ if (!ret)
+ return 0;
+ if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
+ code->op = FETCH_OP_COMM;
+ return 0;
+ }
+ return -EINVAL;
+}
+
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
static u32 btf_type_int(const struct btf_type *t)
@@ -376,11 +393,16 @@ static bool btf_type_is_char_array(struct btf *btf, const struct btf_type *type)
&& BTF_INT_BITS(intdata) == 8;
}
+static struct btf *ctx_btf(struct traceprobe_parse_context *ctx)
+{
+ return ctx->struct_btf ? : ctx->btf;
+}
+
static int check_prepare_btf_string_fetch(char *typename,
struct fetch_insn **pcode,
struct traceprobe_parse_context *ctx)
{
- struct btf *btf = ctx->btf;
+ struct btf *btf = ctx_btf(ctx);
if (!btf || !ctx->last_type)
return 0;
@@ -506,6 +528,15 @@ static int query_btf_context(struct traceprobe_parse_context *ctx)
return 0;
}
+static void clear_struct_btf(struct traceprobe_parse_context *ctx)
+{
+ if (ctx->struct_btf) {
+ btf_put(ctx->struct_btf);
+ ctx->struct_btf = NULL;
+ ctx->last_struct = NULL;
+ }
+}
+
static void clear_btf_context(struct traceprobe_parse_context *ctx)
{
if (ctx->btf) {
@@ -554,22 +585,29 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
struct fetch_insn *code = *pcode;
const struct btf_member *field;
u32 bitoffs, anon_offs;
+ bool is_struct = ctx->struct_btf != NULL;
+ struct btf *btf = ctx_btf(ctx);
char *next;
int is_ptr;
s32 tid;
do {
- /* Outer loop for solving arrow operator ('->') */
- if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
- trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
- return -EINVAL;
- }
- /* Convert a struct pointer type to a struct type */
- type = btf_type_skip_modifiers(ctx->btf, type->type, &tid);
- if (!type) {
- trace_probe_log_err(ctx->offset, BAD_BTF_TID);
- return -EINVAL;
+ if (!is_struct) {
+ /* Outer loop for solving arrow operator ('->') */
+ if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
+ trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
+ return -EINVAL;
+ }
+
+ /* Convert a struct pointer type to a struct type */
+ type = btf_type_skip_modifiers(btf, type->type, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
+ }
}
+ /* Only the first type can skip being a pointer */
+ is_struct = false;
bitoffs = 0;
do {
@@ -580,7 +618,7 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
return is_ptr;
anon_offs = 0;
- field = btf_find_struct_member(ctx->btf, type, fieldname,
+ field = btf_find_struct_member(btf, type, fieldname,
&anon_offs);
if (IS_ERR(field)) {
trace_probe_log_err(ctx->offset, BAD_BTF_TID);
@@ -602,7 +640,7 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
ctx->last_bitsize = 0;
}
- type = btf_type_skip_modifiers(ctx->btf, field->type, &tid);
+ type = btf_type_skip_modifiers(btf, field->type, &tid);
if (!type) {
trace_probe_log_err(ctx->offset, BAD_BTF_TID);
return -EINVAL;
@@ -640,7 +678,7 @@ static int parse_btf_arg(char *varname,
int i, is_ptr, ret;
u32 tid;
- if (WARN_ON_ONCE(!ctx->funcname))
+ if (WARN_ON_ONCE(!ctx->funcname && !(ctx->flags & TPARG_FL_TEVENT)))
return -EINVAL;
is_ptr = split_next_field(varname, &field, ctx);
@@ -653,6 +691,19 @@ static int parse_btf_arg(char *varname,
return -EOPNOTSUPP;
}
+ if (ctx->flags & TPARG_FL_TEVENT) {
+ ret = parse_trace_event(varname, code, ctx);
+ if (ret < 0) {
+ trace_probe_log_err(ctx->offset, BAD_ATTACH_ARG);
+ return ret;
+ }
+ /* TEVENT is only here via a typecast */
+ if (WARN_ON_ONCE(ctx->struct_btf == NULL))
+ return -EINVAL;
+ type = ctx->last_struct;
+ goto found_type;
+ }
+
if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
code->op = FETCH_OP_RETVAL;
/* Check whether the function return type is not void */
@@ -709,6 +760,7 @@ static int parse_btf_arg(char *varname,
found:
type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
+found_type:
if (!type) {
trace_probe_log_err(ctx->offset, BAD_BTF_TID);
return -EINVAL;
@@ -727,7 +779,7 @@ found:
static const struct fetch_type *find_fetch_type_from_btf_type(
struct traceprobe_parse_context *ctx)
{
- struct btf *btf = ctx->btf;
+ struct btf *btf = ctx_btf(ctx);
const char *typestr = NULL;
if (btf && ctx->last_type)
@@ -758,7 +810,67 @@ static int parse_btf_bitfield(struct fetch_insn **pcode,
return 0;
}
-#else
+static int query_btf_struct(const char *sname, struct traceprobe_parse_context *ctx)
+{
+ struct btf *btf = NULL;
+ int id;
+
+ /* A struct_btf should only be used by a single argument */
+ if (WARN_ON_ONCE(ctx->struct_btf)) {
+ btf_put(ctx->struct_btf);
+ ctx->struct_btf = NULL;
+ }
+
+ id = bpf_find_btf_id(sname, BTF_KIND_STRUCT, &btf);
+ if (id < 0)
+ return id;
+ ctx->struct_btf = btf;
+ ctx->last_struct = btf_type_by_id(ctx->struct_btf, id);
+ return 0;
+}
+
+static int handle_typecast(char *arg, struct fetch_insn **pcode,
+ struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
+{
+ char *tmp;
+ int ret;
+
+ /* Currently this only works for eprobes */
+ if (!(ctx->flags & TPARG_FL_TEVENT)) {
+ trace_probe_log_err(ctx->offset, TYPECAST_NOT_EVENT);
+ return -EINVAL;
+ }
+
+ tmp = strchr(arg, ')');
+ if (!tmp) {
+ trace_probe_log_err(ctx->offset + strlen(arg),
+ DEREF_OPEN_BRACE);
+ return -EINVAL;
+ }
+ *tmp = '\0';
+ ret = query_btf_struct(arg + 1, ctx);
+ *tmp = ')';
+
+ if (ret < 0) {
+ trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
+ return -EINVAL;
+ }
+
+ tmp++;
+
+ ctx->offset += tmp - arg;
+ ret = parse_btf_arg(tmp, pcode, end, ctx);
+ return ret;
+}
+
+#else /* !CONFIG_PROBE_EVENTS_BTF_ARGS */
+
+static void clear_struct_btf(struct traceprobe_parse_context *ctx)
+{
+ ctx->struct_btf = NULL;
+}
+
static void clear_btf_context(struct traceprobe_parse_context *ctx)
{
ctx->btf = NULL;
@@ -794,7 +906,15 @@ static int check_prepare_btf_string_fetch(char *typename,
return 0;
}
-#endif
+static int handle_typecast(char *arg, struct fetch_insn **pcode,
+ struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
+{
+ trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
+ return -EOPNOTSUPP;
+}
+
+#endif /* CONFIG_PROBE_EVENTS_BTF_ARGS */
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
@@ -838,15 +958,10 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum)
int i, offset, last_offset = 0;
if (!earg) {
- earg = kzalloc_obj(*tp->entry_arg);
+ earg = kzalloc_flex(*earg, code, 2 * tp->nr_args + 1);
if (!earg)
return -ENOMEM;
earg->size = 2 * tp->nr_args + 1;
- earg->code = kzalloc_objs(struct fetch_insn, earg->size);
- if (!earg->code) {
- kfree(earg);
- return -ENOMEM;
- }
/* Fill the code buffer with 'end' to simplify it */
for (i = 0; i < earg->size; i++)
earg->code[i].op = FETCH_OP_END;
@@ -953,18 +1068,9 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
int len;
if (ctx->flags & TPARG_FL_TEVENT) {
- if (code->data)
- return -EFAULT;
- ret = parse_trace_event_arg(arg, code, ctx);
- if (!ret)
- return 0;
- if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
- code->op = FETCH_OP_COMM;
- return 0;
- }
- /* backward compatibility */
- ctx->offset = 0;
- goto inval;
+ if (parse_trace_event(arg, code, ctx) < 0)
+ goto inval;
+ return 0;
}
if (str_has_prefix(arg, "retval")) {
@@ -1231,6 +1337,9 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
code->op = FETCH_OP_IMM;
}
break;
+ case '(':
+ ret = handle_typecast(arg, pcode, end, ctx);
+ break;
default:
if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */
if (!tparg_is_function_entry(ctx->flags) &&
@@ -1563,6 +1672,9 @@ fail:
}
kfree(tmp);
+ /* struct_btf should not be passed to other arguments */
+ clear_struct_btf(ctx);
+
return ret;
}
@@ -2051,7 +2163,6 @@ void trace_probe_cleanup(struct trace_probe *tp)
traceprobe_free_probe_arg(&tp->args[i]);
if (tp->entry_arg) {
- kfree(tp->entry_arg->code);
kfree(tp->entry_arg);
tp->entry_arg = NULL;
}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 262d8707a3df..15758cc11fc6 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -238,8 +238,8 @@ struct probe_arg {
};
struct probe_entry_arg {
- struct fetch_insn *code;
unsigned int size; /* The entry data size */
+ struct fetch_insn code[] __counted_by(size);
};
struct trace_uprobe_filter {
@@ -422,7 +422,9 @@ struct traceprobe_parse_context {
const struct btf_param *params; /* Parameter of the function */
s32 nr_params; /* The number of the parameters */
struct btf *btf; /* The BTF to be used */
+ struct btf *struct_btf; /* The BTF to be used for structs */
const struct btf_type *last_type; /* Saved type */
+ const struct btf_type *last_struct; /* Saved structure */
u32 last_bitoffs; /* Saved bitoffs */
u32 last_bitsize; /* Saved bitsize */
struct trace_probe *tp;
@@ -563,7 +565,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\
C(TOO_MANY_ARGS, "Too many arguments are specified"), \
C(TOO_MANY_EARGS, "Too many entry arguments specified"), \
- C(EVENT_TOO_BIG, "Event too big (too many fields?)"),
+ C(EVENT_TOO_BIG, "Event too big (too many fields?)"), \
+ C(TYPECAST_NOT_EVENT, "Typecasts are only for eprobe fields"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c
index 784fe1fbb866..bac4bc844ccd 100644
--- a/kernel/trace/trace_recursion_record.c
+++ b/kernel/trace/trace_recursion_record.c
@@ -180,9 +180,8 @@ static const struct seq_operations recursed_function_seq_ops = {
static int recursed_function_open(struct inode *inode, struct file *file)
{
- int ret = 0;
+ guard(mutex)(&recursed_function_lock);
- mutex_lock(&recursed_function_lock);
/* If this file was opened for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
/* disable updating records */
@@ -194,10 +193,9 @@ static int recursed_function_open(struct inode *inode, struct file *file)
atomic_set(&nr_records, 0);
}
if (file->f_mode & FMODE_READ)
- ret = seq_open(file, &recursed_function_seq_ops);
- mutex_unlock(&recursed_function_lock);
+ return seq_open(file, &recursed_function_seq_ops);
- return ret;
+ return 0;
}
static ssize_t recursed_function_write(struct file *file,
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index d6c3f94d67cd..2a6cc000ec98 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -602,7 +602,7 @@ static int trace_pipe_open(struct inode *inode, struct file *filp)
filp->private_data = iter;
- return IS_ERR(iter) ? PTR_ERR(iter) : 0;
+ return 0;
}
static int trace_pipe_release(struct inode *inode, struct file *filp)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8ad72e17d8eb..e98ee7e1e66f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1371,33 +1371,33 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
-static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
+static int perf_call_bpf_enter(struct trace_event_call *call,
struct syscall_metadata *sys_data,
- struct syscall_trace_enter *rec)
+ int syscall_nr, unsigned long *args)
{
struct syscall_tp_t {
struct trace_entry ent;
int syscall_nr;
unsigned long args[SYSCALL_DEFINE_MAXARGS];
} __aligned(8) param;
+ struct pt_regs regs = {};
int i;
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
- /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
- perf_fetch_caller_regs(regs);
- *(struct pt_regs **)&param = regs;
- param.syscall_nr = rec->nr;
+ /* bpf prog requires 'regs' to be the first member in the ctx */
+ perf_fetch_caller_regs(&regs);
+ *(struct pt_regs **)&param = &regs;
+ param.syscall_nr = syscall_nr;
for (i = 0; i < sys_data->nb_args; i++)
- param.args[i] = rec->args[i];
- return trace_call_bpf(call, &param);
+ param.args[i] = args[i];
+ return trace_call_bpf_faultable(call, &param);
}
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
- struct pt_regs *fake_regs;
struct hlist_head *head;
unsigned long args[6];
bool valid_prog_array;
@@ -1410,12 +1410,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int size = 0;
int uargs = 0;
- /*
- * Syscall probe called with preemption enabled, but the ring
- * buffer and per-cpu data require preemption to be disabled.
- */
might_fault();
- guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -1429,6 +1424,26 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
syscall_get_arguments(current, regs, args);
+ /*
+ * Run BPF program in faultable context before per-cpu buffer
+ * allocation, allowing sleepable BPF programs to execute.
+ */
+ valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+ if (valid_prog_array &&
+ !perf_call_bpf_enter(sys_data->enter_event, sys_data,
+ syscall_nr, args))
+ return;
+
+ /*
+ * Per-cpu ring buffer and perf event list operations require
+ * preemption to be disabled.
+ */
+ guard(preempt_notrace)();
+
+ head = this_cpu_ptr(sys_data->enter_event->perf_events);
+ if (hlist_empty(head))
+ return;
+
/* Check if this syscall event faults in user space memory */
mayfault = sys_data->user_mask != 0;
@@ -1438,17 +1453,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
}
- head = this_cpu_ptr(sys_data->enter_event->perf_events);
- valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
- if (!valid_prog_array && hlist_empty(head))
- return;
-
/* get the size after alignment with the u32 buffer size field */
size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
+ rec = perf_trace_buf_alloc(size, NULL, &rctx);
if (!rec)
return;
@@ -1458,13 +1468,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (mayfault)
syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);
- if ((valid_prog_array &&
- !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
- hlist_empty(head)) {
- perf_swevent_put_recursion_context(rctx);
- return;
- }
-
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
head, NULL);
@@ -1514,40 +1517,35 @@ static void perf_sysenter_disable(struct trace_event_call *call)
syscall_fault_buffer_disable();
}
-static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
- struct syscall_trace_exit *rec)
+static int perf_call_bpf_exit(struct trace_event_call *call,
+ int syscall_nr, long ret_val)
{
struct syscall_tp_t {
struct trace_entry ent;
int syscall_nr;
unsigned long ret;
} __aligned(8) param;
-
- /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
- perf_fetch_caller_regs(regs);
- *(struct pt_regs **)&param = regs;
- param.syscall_nr = rec->nr;
- param.ret = rec->ret;
- return trace_call_bpf(call, &param);
+ struct pt_regs regs = {};
+
+ /* bpf prog requires 'regs' to be the first member in the ctx */
+ perf_fetch_caller_regs(&regs);
+ *(struct pt_regs **)&param = &regs;
+ param.syscall_nr = syscall_nr;
+ param.ret = ret_val;
+ return trace_call_bpf_faultable(call, &param);
}
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
- struct pt_regs *fake_regs;
struct hlist_head *head;
bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
- /*
- * Syscall probe called with preemption enabled, but the ring
- * buffer and per-cpu data require preemption to be disabled.
- */
might_fault();
- guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -1559,29 +1557,37 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
- head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ /*
+ * Run BPF program in faultable context before per-cpu buffer
+ * allocation, allowing sleepable BPF programs to execute.
+ */
valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
- if (!valid_prog_array && hlist_empty(head))
+ if (valid_prog_array &&
+ !perf_call_bpf_exit(sys_data->exit_event, syscall_nr,
+ syscall_get_return_value(current, regs)))
+ return;
+
+ /*
+ * Per-cpu ring buffer and perf event list operations require
+ * preemption to be disabled.
+ */
+ guard(preempt_notrace)();
+
+ head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ if (hlist_empty(head))
return;
/* We can probably do that at build time */
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
+ rec = perf_trace_buf_alloc(size, NULL, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- if ((valid_prog_array &&
- !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
- hlist_empty(head)) {
- perf_swevent_put_recursion_context(rctx);
- return;
- }
-
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
1, regs, head, NULL);
}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2cabf8a23ec5..c274346853d1 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -912,7 +912,7 @@ static int uprobe_buffer_enable(void)
{
int ret = 0;
- BUG_ON(!mutex_is_locked(&event_mutex));
+ lockdep_assert_held(&event_mutex);
if (uprobe_buffer_refcnt++ == 0) {
ret = uprobe_buffer_init();
@@ -927,7 +927,7 @@ static void uprobe_buffer_disable(void)
{
int cpu;
- BUG_ON(!mutex_is_locked(&event_mutex));
+ lockdep_assert_held(&event_mutex);
if (--uprobe_buffer_refcnt == 0) {
for_each_possible_cpu(cpu)
@@ -979,6 +979,7 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
ucb = uprobe_buffer_get();
ucb->dsize = tu->tp.size + dsize;
+ BUILD_BUG_ON(MAX_UCB_BUFFER_SIZE < MAX_PROBE_EVENT_SIZE);
if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) {
ucb->dsize = MAX_UCB_BUFFER_SIZE;
dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size;
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index bf1a507695b6..d7922f40dbe2 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -288,9 +288,6 @@ static void tracing_map_array_clear(struct tracing_map_array *a)
{
unsigned int i;
- if (!a->pages)
- return;
-
for (i = 0; i < a->n_pages; i++)
memset(a->pages[i], 0, PAGE_SIZE);
}
@@ -302,9 +299,6 @@ static void tracing_map_array_free(struct tracing_map_array *a)
if (!a)
return;
- if (!a->pages)
- goto free;
-
for (i = 0; i < a->n_pages; i++) {
if (!a->pages[i])
break;
@@ -312,9 +306,6 @@ static void tracing_map_array_free(struct tracing_map_array *a)
free_page((unsigned long)a->pages[i]);
}
- kfree(a->pages);
-
- free:
kfree(a);
}
@@ -322,24 +313,25 @@ static struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
unsigned int entry_size)
{
struct tracing_map_array *a;
+ unsigned int entry_size_shift;
+ unsigned int entries_per_page;
+ unsigned int n_pages;
unsigned int i;
- a = kzalloc_obj(*a);
+ entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1);
+ entries_per_page = PAGE_SIZE / (1 << entry_size_shift);
+ n_pages = max(1, n_elts / entries_per_page);
+
+ a = kzalloc_flex(*a, pages, n_pages);
if (!a)
return NULL;
- a->entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1);
- a->entries_per_page = PAGE_SIZE / (1 << a->entry_size_shift);
- a->n_pages = n_elts / a->entries_per_page;
- if (!a->n_pages)
- a->n_pages = 1;
+ a->entry_size_shift = entry_size_shift;
+ a->entries_per_page = entries_per_page;
+ a->n_pages = n_pages;
a->entry_shift = fls(a->entries_per_page) - 1;
a->entry_mask = (1 << a->entry_shift) - 1;
- a->pages = kcalloc(a->n_pages, sizeof(void *), GFP_KERNEL);
- if (!a->pages)
- goto free;
-
for (i = 0; i < a->n_pages; i++) {
a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
if (!a->pages[i])
@@ -386,13 +378,11 @@ static void tracing_map_elt_init_fields(struct tracing_map_elt *elt)
}
}
-static void tracing_map_elt_free(struct tracing_map_elt *elt)
+static void __tracing_map_elt_free(struct tracing_map_elt *elt)
{
if (!elt)
return;
- if (elt->map->ops && elt->map->ops->elt_free)
- elt->map->ops->elt_free(elt);
kfree(elt->fields);
kfree(elt->vars);
kfree(elt->var_set);
@@ -400,6 +390,17 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
kfree(elt);
}
+static void tracing_map_elt_free(struct tracing_map_elt *elt)
+{
+ if (!elt)
+ return;
+
+ /* Only objects initialized with elt_alloc() should be passed to elt_free(). */
+ if (elt->map->ops && elt->map->ops->elt_free)
+ elt->map->ops->elt_free(elt);
+ __tracing_map_elt_free(elt);
+}
+
static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
{
struct tracing_map_elt *elt;
@@ -444,7 +445,7 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
}
return elt;
free:
- tracing_map_elt_free(elt);
+ __tracing_map_elt_free(elt);
return ERR_PTR(err);
}
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 99c37eeebc16..18a02959d77b 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -167,7 +167,7 @@ struct tracing_map_array {
unsigned int entry_shift;
unsigned int entry_mask;
unsigned int n_pages;
- void **pages;
+ void *pages[] __counted_by(n_pages);
};
#define TRACING_MAP_ARRAY_ELT(array, idx) \