diff options
Diffstat (limited to 'kernel/trace')
136 files changed, 19593 insertions, 7237 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 74c2b1d43bb9..e130da35808f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,9 +31,14 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.rst -config HAVE_FUNCTION_GRAPH_RETVAL +config HAVE_FUNCTION_GRAPH_FREGS bool +config HAVE_FTRACE_GRAPH_FUNC + bool + help + True if ftrace_graph_func() is defined. + config HAVE_DYNAMIC_FTRACE bool help @@ -45,9 +50,18 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS bool +config HAVE_SINGLE_FTRACE_DIRECT_OPS + bool + config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS bool +config HAVE_EXTRA_IPI_TRACEPOINTS + bool + help + For architectures that use ipi_raise, ipi_entry and ipi_exit + tracepoints. + config HAVE_DYNAMIC_FTRACE_WITH_ARGS bool help @@ -57,16 +71,23 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS This allows for use of ftrace_regs_get_argument() and ftrace_regs_get_stack_pointer(). +config HAVE_FTRACE_REGS_HAVING_PT_REGS + bool + help + If this is set, ftrace_regs has pt_regs, thus it can convert to + pt_regs without allocating memory. + config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE bool help If the architecture generates __patchable_function_entries sections but does not want them included in the ftrace locations. -config HAVE_FTRACE_MCOUNT_RECORD +config HAVE_DYNAMIC_FTRACE_WITH_JMP bool help - See Documentation/trace/ftrace-design.rst + If the architecture supports to replace the __fentry__ with a + "jmp" instruction. config HAVE_SYSCALL_TRACEPOINTS bool @@ -115,6 +136,7 @@ config BUILDTIME_MCOUNT_SORT config TRACER_MAX_TRACE bool + select TRACER_SNAPSHOT config TRACE_CLOCK bool @@ -188,6 +210,19 @@ menuconfig FTRACE if FTRACE +config TRACEFS_AUTOMOUNT_DEPRECATED + bool "Automount tracefs on debugfs [DEPRECATED]" + depends on TRACING + default y + help + The tracing interface was moved from /sys/kernel/debug/tracing + to /sys/kernel/tracing in 2015, but the tracing file system + was still automounted in /sys/kernel/debug for backward + compatibility with tooling. + + The new interface has been around for more than 10 years and + the old debug mount will soon be removed. + config BOOTTIME_TRACING bool "Boot-time Tracing support" depends on TRACING @@ -232,7 +267,7 @@ config FUNCTION_GRAPH_TRACER config FUNCTION_GRAPH_RETVAL bool "Kernel Function Graph Return Value" - depends on HAVE_FUNCTION_GRAPH_RETVAL + depends on HAVE_FUNCTION_GRAPH_FREGS depends on FUNCTION_GRAPH_TRACER default n help @@ -252,8 +287,19 @@ config FUNCTION_GRAPH_RETADDR the function is called. This feature is off by default, and you can enable it via the trace option funcgraph-retaddr. +config FUNCTION_TRACE_ARGS + bool + depends on PROBE_EVENTS_BTF_ARGS + default y + help + If supported with function argument access API and BTF, then + the function tracer and function graph tracer will support printing + of function arguments. This feature is off by default, and can be + enabled via the trace option func-args (for the function tracer) and + funcgraph-args (for the function graph tracer) + config DYNAMIC_FTRACE - bool "enable/disable function tracing dynamically" + bool depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE default y @@ -294,12 +340,31 @@ config DYNAMIC_FTRACE_WITH_ARGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS +config DYNAMIC_FTRACE_WITH_JMP + def_bool y + depends on DYNAMIC_FTRACE + depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS + depends on HAVE_DYNAMIC_FTRACE_WITH_JMP + +config FUNCTION_SELF_TRACING + bool "Function trace tracing code" + depends on FUNCTION_TRACER + help + Normally all the tracing code is set to notrace, where the function + tracer will ignore all the tracing functions. Sometimes it is useful + for debugging to trace some of the tracing infratructure itself. + Enable this to allow some of the tracing infrastructure to be traced + by the function tracer. Note, this will likely add noise to function + tracing if events and other tracing features are enabled along with + function tracing. + + If unsure, say N. + config FPROBE bool "Kernel Function Probe (fprobe)" - depends on FUNCTION_TRACER - depends on DYNAMIC_FTRACE_WITH_REGS - depends on HAVE_RETHOOK - select RETHOOK + depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC + depends on DYNAMIC_FTRACE_WITH_ARGS + select FUNCTION_GRAPH_TRACER default n help This option enables kernel function probe (fprobe) based on ftrace. @@ -361,7 +426,6 @@ config IRQSOFF_TRACER select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP - select TRACER_SNAPSHOT select TRACER_SNAPSHOT_PER_CPU_SWAP help This option measures the time spent in irqs-off critical @@ -384,7 +448,6 @@ config PREEMPT_TRACER select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP - select TRACER_SNAPSHOT select TRACER_SNAPSHOT_PER_CPU_SWAP select TRACE_PREEMPT_TOGGLE help @@ -406,7 +469,6 @@ config SCHED_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE - select TRACER_SNAPSHOT help This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. @@ -540,9 +602,22 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. +config TRACE_SYSCALL_BUF_SIZE_DEFAULT + int "System call user read max size" + range 0 165 + default 63 + depends on FTRACE_SYSCALLS + help + Some system call trace events will record the data from a user + space address that one of the parameters point to. The amount of + data per event is limited. That limit is set by this config and + this config also affects how much user space data perf can read. + + For a tracing instance, this size may be changed by writing into + its syscall_user_buf_size file. + config TRACER_SNAPSHOT bool "Create a snapshot trace buffer" - select TRACER_MAX_TRACE help Allow tracing users to take snapshot of the current buffer using the ftrace interface, e.g.: @@ -550,6 +625,9 @@ config TRACER_SNAPSHOT echo 1 > /sys/kernel/tracing/snapshot cat snapshot + Note, the latency tracers select this option. To disable it, + all the latency tracers need to be disabled. + config TRACER_SNAPSHOT_PER_CPU_SWAP bool "Allow snapshot to swap per CPU" depends on TRACER_SNAPSHOT @@ -758,6 +836,20 @@ config UPROBE_EVENTS This option is required if you plan to use perf-probe subcommand of perf tools on user space applications. +config EPROBE_EVENTS + bool "Enable event-based dynamic events" + depends on TRACING + depends on HAVE_REGS_AND_STACK_ACCESS_API + select PROBE_EVENTS + select DYNAMIC_EVENTS + default y + help + Eprobes are dynamic events that can be placed on other existing + events. It can be used to limit what fields are recorded in + an event or even dereference a field of an event. It can + convert the type of an event field. For example, turn an + address into a string. + config BPF_EVENTS depends on BPF_SYSCALL depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS @@ -782,27 +874,22 @@ config BPF_KPROBE_OVERRIDE Allows BPF to override the execution of a probed function and set a different return value. This is used for error injection. -config FTRACE_MCOUNT_RECORD - def_bool y - depends on DYNAMIC_FTRACE - depends on HAVE_FTRACE_MCOUNT_RECORD - config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY bool - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config FTRACE_MCOUNT_USE_CC def_bool y depends on $(cc-option,-mrecord-mcount) depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config FTRACE_MCOUNT_USE_OBJTOOL def_bool y depends on HAVE_OBJTOOL_MCOUNT depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE select OBJTOOL config FTRACE_MCOUNT_USE_RECORDMCOUNT @@ -810,7 +897,7 @@ config FTRACE_MCOUNT_USE_RECORDMCOUNT depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC depends on !FTRACE_MCOUNT_USE_OBJTOOL - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config TRACING_MAP bool @@ -1194,4 +1281,18 @@ config HIST_TRIGGERS_DEBUG source "kernel/trace/rv/Kconfig" +config TRACE_REMOTE + bool + +config SIMPLE_RING_BUFFER + bool + +config TRACE_REMOTE_TEST + tristate "Test module for remote tracing" + select TRACE_REMOTE + select SIMPLE_RING_BUFFER + help + This trace remote includes a ring-buffer writer implementation using + "simple_ring_buffer". This is solely intending for testing. + endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 057cd975d014..8d3d96e847d8 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -16,6 +16,23 @@ obj-y += trace_selftest_dynamic.o endif endif +# Allow some files to be function traced +ifdef CONFIG_FUNCTION_SELF_TRACING +CFLAGS_trace_output.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_seq.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_stat.o = $(CC_FLAGS_FTRACE) +CFLAGS_tracing_map.o = $(CC_FLAGS_FTRACE) +CFLAGS_synth_event_gen_test.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_syscalls.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_filter.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_trigger.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_synth.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_hist.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_user.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_dynevent.o = $(CC_FLAGS_FTRACE) +endif + ifdef CONFIG_FTRACE_STARTUP_TEST CFLAGS_trace_kprobe_selftest.o = $(CC_FLAGS_FTRACE) obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe_selftest.o @@ -51,6 +68,8 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o +obj-$(CONFIG_TRACING) += trace_pid.o +obj-$(CONFIG_TRACER_SNAPSHOT) += trace_snapshot.o obj-$(CONFIG_TRACING) += pid_list.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o @@ -82,7 +101,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o -obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o +obj-$(CONFIG_EPROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o @@ -110,4 +129,37 @@ obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ +obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o +obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o +obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o + +# simple_ring_buffer is used by the pKVM hypervisor which does not have access +# to all kernel symbols. Fail the build if forbidden symbols are found. + +# Basic compiler and tooling-generated symbols that can safely be left +# undefined. Ensure KASAN is enabled to avoid logic that may disable +# FORTIFY_SOURCE when KASAN is not enabled. undefsyms_base.o does not +# automatically get KASAN flags because it is not linked into vmlinux. +targets += undefsyms_base.o +KASAN_SANITIZE_undefsyms_base.o := y + +UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \ + __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \ + $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}') + +quiet_cmd_check_undefined = NM $< + cmd_check_undefined = \ + undefsyms=$$($(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST)) || true); \ + if [ -n "$$undefsyms" ]; then \ + echo "Unexpected symbols in $<:" >&2; \ + echo "$$undefsyms" >&2; \ + false; \ + fi; \ + touch $@ + +$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE + $(call if_changed,check_undefined) + +always-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o.checked + libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8fd292d34d89..8cd2520b4c99 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -63,13 +63,116 @@ static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); +static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, + sector_t sector, int bytes, u64 what, + dev_t dev, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) + +{ + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. + */ + t->cpu = cpu; + t->pid = pid; + + t->sector = sector; + t->bytes = bytes; + t->action = lower_32_bits(what); + t->device = dev; + t->error = error; + t->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); +} + +static void record_blktrace_event2(struct blk_io_trace2 *t2, pid_t pid, int cpu, + sector_t sector, int bytes, u64 what, + dev_t dev, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, + int pdu_len) +{ + t2->pid = pid; + t2->cpu = cpu; + + t2->sector = sector; + t2->bytes = bytes; + t2->action = what; + t2->device = dev; + t2->error = error; + t2->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t2 + sizeof(*t2), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t2 + sizeof(*t2) + cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event1(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, int bytes, + u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + struct blk_io_trace *t; + size_t trace_len = sizeof(*t) + pdu_len + cgid_len; + + t = relay_reserve(bt->rchan, trace_len); + if (!t) + return; + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = sequence; + t->time = ktime_to_ns(ktime_get()); + + record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, error, + cgid, cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event2(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, + int bytes, u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + struct blk_io_trace2 *t; + size_t trace_len = sizeof(struct blk_io_trace2) + pdu_len + cgid_len; + + t = relay_reserve(bt->rchan, trace_len); + if (!t) + return; + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE2_VERSION; + t->sequence = sequence; + t->time = ktime_to_ns(ktime_get()); + + record_blktrace_event2(t, pid, cpu, sector, bytes, what, bt->dev, error, + cgid, cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, int bytes, + u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + if (bt->version == 2) + return relay_blktrace_event2(bt, sequence, pid, cpu, sector, + bytes, what, error, cgid, cgid_len, + pdu_data, pdu_len); + return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes, + what, error, cgid, cgid_len, pdu_data, + pdu_len); +} + /* * Send out a notify message. */ -static void trace_note(struct blk_trace *bt, pid_t pid, int action, +static void trace_note(struct blk_trace *bt, pid_t pid, u64 action, const void *data, size_t len, u64 cgid) { - struct blk_io_trace *t; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; unsigned int trace_ctx = 0; @@ -77,38 +180,30 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + action = lower_32_bits(action | (cgid ? __BLK_TN_CGROUP : 0)); if (blk_tracer) { + struct blk_io_trace2 *t; + size_t trace_len = sizeof(*t) + cgid_len + len; + buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len + cgid_len, - trace_ctx); + trace_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); - goto record_it; + record_blktrace_event2(t, pid, cpu, 0, 0, + action, bt->dev, 0, cgid, cgid_len, + (void *)data, len); + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; } if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); - if (t) { - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->time = ktime_to_ns(ktime_get()); -record_it: - t->device = bt->dev; - t->action = action | (cgid ? __BLK_TN_CGROUP : 0); - t->pid = pid; - t->cpu = cpu; - t->pdu_len = len + cgid_len; - if (cgid_len) - memcpy((void *)t + sizeof(*t), &cgid, cgid_len); - memcpy((void *) t + sizeof(*t) + cgid_len, data, len); - - if (blk_tracer) - trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); - } + relay_blktrace_event(bt, 0, pid, cpu, 0, 0, action, 0, cgid, + cgid_len, (void *)data, len); } /* @@ -182,7 +277,7 @@ void __blk_trace_note_message(struct blk_trace *bt, } EXPORT_SYMBOL_GPL(__blk_trace_note_message); -static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, +static int act_log_check(struct blk_trace *bt, u64 what, sector_t sector, pid_t pid) { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) @@ -213,13 +308,12 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - const blk_opf_t opf, u32 what, int error, + const blk_opf_t opf, u64 what, int error, int pdu_len, void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; - struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; unsigned int trace_ctx = 0; @@ -228,6 +322,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; const enum req_op op = opf & REQ_OP_MASK; + size_t trace_len; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -238,10 +333,47 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(opf, META); what |= MASK_TC_BIT(opf, PREFLUSH); what |= MASK_TC_BIT(opf, FUA); - if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) + + switch (op) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: what |= BLK_TC_ACT(BLK_TC_DISCARD); - if (op == REQ_OP_FLUSH) + break; + case REQ_OP_FLUSH: what |= BLK_TC_ACT(BLK_TC_FLUSH); + break; + case REQ_OP_ZONE_APPEND: + what |= BLK_TC_ACT(BLK_TC_ZONE_APPEND); + break; + case REQ_OP_ZONE_RESET: + what |= BLK_TC_ACT(BLK_TC_ZONE_RESET); + break; + case REQ_OP_ZONE_RESET_ALL: + what |= BLK_TC_ACT(BLK_TC_ZONE_RESET_ALL); + break; + case REQ_OP_ZONE_FINISH: + what |= BLK_TC_ACT(BLK_TC_ZONE_FINISH); + break; + case REQ_OP_ZONE_OPEN: + what |= BLK_TC_ACT(BLK_TC_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE); + break; + case REQ_OP_WRITE_ZEROES: + what |= BLK_TC_ACT(BLK_TC_WRITE_ZEROES); + break; + default: + break; + } + + /* Drop trace events for zone operations with blktrace v1 */ + if (bt->version == 1 && (what >> BLK_TC_SHIFT) > BLK_TC_END_V1) { + pr_debug_ratelimited("blktrace v1 cannot trace zone operation 0x%llx\n", + (unsigned long long)what); + return; + } + if (cgid) what |= __BLK_TA_CGROUP; @@ -251,17 +383,71 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, cpu = raw_smp_processor_id(); if (blk_tracer) { - tracing_record_cmdline(current); - buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); + switch (bt->version) { + case 1: + trace_len = sizeof(struct blk_io_trace); + break; + case 2: + default: + /* + * ftrace always uses v2 (blk_io_trace2) format. + * + * For sysfs-enabled tracing path (enabled via + * /sys/block/DEV/trace/enable), blk_trace_setup_queue() + * never initializes bt->version, leaving it 0 from + * kzalloc(). We must handle version==0 safely here. + * + * Fall through to default to ensure we never hit the + * old bug where default set trace_len=0, causing + * buffer underflow and memory corruption. + * + * Always use v2 format for ftrace and normalize + * bt->version to 2 when uninitialized. + */ + trace_len = sizeof(struct blk_io_trace2); + if (bt->version == 0) + bt->version = 2; + break; + } + trace_len += pdu_len + cgid_len; event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len + cgid_len, - trace_ctx); + trace_len, trace_ctx); if (!event) return; - t = ring_buffer_event_data(event); - goto record_it; + + tracing_record_cmdline(current); + switch (bt->version) { + case 1: + record_blktrace_event(ring_buffer_event_data(event), + pid, cpu, sector, bytes, + what, bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); + break; + case 2: + default: + /* + * Use v2 recording function (record_blktrace_event2) + * which writes blk_io_trace2 structure with correct + * field layout: + * - 32-bit pid at offset 28 + * - 64-bit action at offset 32 + * + * Fall through to default handles version==0 case + * (from sysfs path), ensuring we always use correct + * v2 recording function to match the v2 buffer + * allocated above. + */ + record_blktrace_event2(ring_buffer_event_data(event), + pid, cpu, sector, bytes, + what, bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); + break; + } + + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; } if (unlikely(tsk->btrace_seq != blktrace_seq)) @@ -273,41 +459,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. */ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); - if (t) { - sequence = per_cpu_ptr(bt->sequence, cpu); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->sequence = ++(*sequence); - t->time = ktime_to_ns(ktime_get()); -record_it: - /* - * These two are not needed in ftrace as they are in the - * generic trace_entry, filled by tracing_generic_entry_update, - * but for the trace_event->bin() synthesizer benefit we do it - * here too. - */ - t->cpu = cpu; - t->pid = pid; - - t->sector = sector; - t->bytes = bytes; - t->action = what; - t->device = bt->dev; - t->error = error; - t->pdu_len = pdu_len + cgid_len; - - if (cgid_len) - memcpy((void *)t + sizeof(*t), &cgid, cgid_len); - if (pdu_len) - memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); - - if (blk_tracer) { - trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); - return; - } - } - + sequence = per_cpu_ptr(bt->sequence, cpu); + (*sequence)++; + relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes, + what, error, cgid, cgid_len, pdu_data, pdu_len); local_irq_restore(flags); } @@ -403,9 +558,9 @@ int blk_trace_remove(struct request_queue *q) { int ret; - mutex_lock(&q->debugfs_mutex); + blk_debugfs_lock_nomemsave(q); ret = __blk_trace_remove(q); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock_nomemrestore(q); return ret; } @@ -415,9 +570,10 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct blk_trace *bt = filp->private_data; + size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL); char buf[16]; - snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + snprintf(buf, sizeof(buf), "%zu\n", dropped); return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); } @@ -456,23 +612,6 @@ static const struct file_operations blk_msg_fops = { .llseek = noop_llseek, }; -/* - * Keep track of how many times we encountered a full subbuffer, to aid - * the user space app in telling how many lost events there were. - */ -static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct blk_trace *bt; - - if (!relay_buf_full(buf)) - return 1; - - bt = buf->chan->private_data; - atomic_inc(&bt->dropped); - return 0; -} - static int blk_remove_buf_file_callback(struct dentry *dentry) { debugfs_remove(dentry); @@ -491,7 +630,6 @@ static struct dentry *blk_create_buf_file_callback(const char *filename, } static const struct rchan_callbacks blk_relay_callbacks = { - .subbuf_start = blk_subbuf_start_callback, .create_buf_file = blk_create_buf_file_callback, .remove_buf_file = blk_remove_buf_file_callback, }; @@ -511,9 +649,10 @@ static void blk_trace_setup_lba(struct blk_trace *bt, /* * Setup everything required to start tracing */ -static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - struct blk_user_trace_setup *buts) +static struct blk_trace *blk_trace_setup_prepare(struct request_queue *q, + char *name, dev_t dev, + u32 buf_size, u32 buf_nr, + struct block_device *bdev) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; @@ -521,31 +660,19 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, lockdep_assert_held(&q->debugfs_mutex); - if (!buts->buf_size || !buts->buf_nr) - return -EINVAL; - - strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); - - /* - * some device names have larger paths - convert the slashes - * to underscores for this to work as expected - */ - strreplace(buts->name, '/', '_'); - /* * bdev can be NULL, as with scsi-generic, this is a helpful as * we can be. */ if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { - pr_warn("Concurrent blktraces are not allowed on %s\n", - buts->name); - return -EBUSY; + pr_warn("Concurrent blktraces are not allowed on %s\n", name); + return ERR_PTR(-EBUSY); } - bt = kzalloc(sizeof(*bt), GFP_KERNEL); + bt = kzalloc_obj(*bt); if (!bt) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ret = -ENOMEM; bt->sequence = alloc_percpu(unsigned long); @@ -565,7 +692,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else - bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); + bt->dir = dir = debugfs_create_dir(name, blk_debugfs_root); /* * As blktrace relies on debugfs for its interface the debugfs directory @@ -573,31 +700,52 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * files or directories. */ if (IS_ERR_OR_NULL(dir)) { - pr_warn("debugfs_dir not present for %s so skipping\n", - buts->name); + pr_warn("debugfs_dir not present for %s so skipping\n", name); ret = -ENOENT; goto err; } bt->dev = dev; - atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); ret = -EIO; debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - bt->rchan = relay_open("trace", dir, buts->buf_size, - buts->buf_nr, &blk_relay_callbacks, bt); + bt->rchan = relay_open("trace", dir, buf_size, buf_nr, + &blk_relay_callbacks, bt); if (!bt->rchan) goto err; + blk_trace_setup_lba(bt, bdev); + + return bt; + +err: + blk_trace_free(q, bt); + + return ERR_PTR(ret); +} + +static void blk_trace_setup_finalize(struct request_queue *q, + char *name, int version, + struct blk_trace *bt, + struct blk_user_trace_setup2 *buts) + +{ + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE2); + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + strreplace(buts->name, '/', '_'); + + bt->version = version; bt->act_mask = buts->act_mask; if (!bt->act_mask) bt->act_mask = (u16) -1; - blk_trace_setup_lba(bt, bdev); - /* overwrite with user settings */ if (buts->start_lba) bt->start_lba = buts->start_lba; @@ -609,62 +757,103 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); - - ret = 0; -err: - if (ret) - blk_trace_free(q, bt); - return ret; } -static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, char __user *arg) +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + char __user *arg) { + struct blk_user_trace_setup2 buts2; struct blk_user_trace_setup buts; + struct blk_trace *bt; + unsigned int memflags; int ret; ret = copy_from_user(&buts, arg, sizeof(buts)); if (ret) return -EFAULT; - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); - if (ret) - return ret; + if (!buts.buf_size || !buts.buf_nr) + return -EINVAL; + + buts2 = (struct blk_user_trace_setup2) { + .act_mask = buts.act_mask, + .buf_size = buts.buf_size, + .buf_nr = buts.buf_nr, + .start_lba = buts.start_lba, + .end_lba = buts.end_lba, + .pid = buts.pid, + }; + + memflags = blk_debugfs_lock(q); + bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, + bdev); + if (IS_ERR(bt)) { + blk_debugfs_unlock(q, memflags); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 1, bt, &buts2); + strscpy(buts.name, buts2.name, BLKTRACE_BDEV_SIZE); + blk_debugfs_unlock(q, memflags); if (copy_to_user(arg, &buts, sizeof(buts))) { - __blk_trace_remove(q); + blk_trace_remove(q); return -EFAULT; } return 0; } +EXPORT_SYMBOL_GPL(blk_trace_setup); -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - char __user *arg) +static int blk_trace_setup2(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { - int ret; + struct blk_user_trace_setup2 buts2; + struct blk_trace *bt; + unsigned int memflags; + + if (copy_from_user(&buts2, arg, sizeof(buts2))) + return -EFAULT; - mutex_lock(&q->debugfs_mutex); - ret = __blk_trace_setup(q, name, dev, bdev, arg); - mutex_unlock(&q->debugfs_mutex); + if (!buts2.buf_size || !buts2.buf_nr) + return -EINVAL; - return ret; + if (buts2.flags != 0) + return -EINVAL; + + memflags = blk_debugfs_lock(q); + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, + bdev); + if (IS_ERR(bt)) { + blk_debugfs_unlock(q, memflags); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 2, bt, &buts2); + blk_debugfs_unlock(q, memflags); + + if (copy_to_user(arg, &buts2, sizeof(buts2))) { + blk_trace_remove(q); + return -EFAULT; + } + return 0; } -EXPORT_SYMBOL_GPL(blk_trace_setup); #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) static int compat_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { - struct blk_user_trace_setup buts; + struct blk_user_trace_setup2 buts2; struct compat_blk_user_trace_setup cbuts; - int ret; + struct blk_trace *bt; + unsigned int memflags; if (copy_from_user(&cbuts, arg, sizeof(cbuts))) return -EFAULT; - buts = (struct blk_user_trace_setup) { + if (!cbuts.buf_size || !cbuts.buf_nr) + return -EINVAL; + + buts2 = (struct blk_user_trace_setup2) { .act_mask = cbuts.act_mask, .buf_size = cbuts.buf_size, .buf_nr = cbuts.buf_nr, @@ -673,12 +862,18 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, .pid = cbuts.pid, }; - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); - if (ret) - return ret; + memflags = blk_debugfs_lock(q); + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, + bdev); + if (IS_ERR(bt)) { + blk_debugfs_unlock(q, memflags); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 1, bt, &buts2); + blk_debugfs_unlock(q, memflags); - if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { - __blk_trace_remove(q); + if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) { + blk_trace_remove(q); return -EFAULT; } @@ -705,9 +900,9 @@ int blk_trace_startstop(struct request_queue *q, int start) { int ret; - mutex_lock(&q->debugfs_mutex); + blk_debugfs_lock_nomemsave(q); ret = __blk_trace_startstop(q, start); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock_nomemrestore(q); return ret; } @@ -732,12 +927,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) int ret, start = 0; char b[BDEVNAME_SIZE]; - mutex_lock(&q->debugfs_mutex); - switch (cmd) { + case BLKTRACESETUP2: + snprintf(b, sizeof(b), "%pg", bdev); + ret = blk_trace_setup2(q, b, bdev->bd_dev, bdev, arg); + break; case BLKTRACESETUP: snprintf(b, sizeof(b), "%pg", bdev); - ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: @@ -749,17 +946,15 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) start = 1; fallthrough; case BLKTRACESTOP: - ret = __blk_trace_startstop(q, start); + ret = blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: - ret = __blk_trace_remove(q); + ret = blk_trace_remove(q); break; default: ret = -ENOTTY; break; } - - mutex_unlock(&q->debugfs_mutex); return ret; } @@ -824,7 +1019,7 @@ blk_trace_request_get_cgid(struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, blk_status_t error, - unsigned int nr_bytes, u32 what, u64 cgid) + unsigned int nr_bytes, u64 what, u64 cgid) { struct blk_trace *bt; @@ -876,6 +1071,22 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, blk_trace_request_get_cgid(rq)); } +static void blk_add_trace_zone_update_request(void *ignore, struct request *rq) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt) || bt->version < 2) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ZONE_APPEND, + blk_trace_request_get_cgid(rq)); +} + /** * blk_add_trace_bio - Add a trace for a bio oriented action * @q: queue the io is for @@ -888,7 +1099,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u64 what, int error) { struct blk_trace *bt; @@ -905,11 +1116,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, rcu_read_unlock(); } -static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) -{ - blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); -} - static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio) { @@ -959,7 +1165,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); - u32 what; + u64 what; if (explicit) what = BLK_TA_UNPLUG_IO; @@ -971,6 +1177,37 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, rcu_read_unlock(); } +static void blk_add_trace_zone_plug(void *ignore, struct request_queue *q, + unsigned int zno, sector_t sector, + unsigned int sectors) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt && bt->version >= 2) + __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0, + BLK_TA_ZONE_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); + + return; +} + +static void blk_add_trace_zone_unplug(void *ignore, struct request_queue *q, + unsigned int zno, sector_t sector, + unsigned int sectors) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt && bt->version >= 2) + __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0, + BLK_TA_ZONE_UNPLUG, 0, 0, NULL, 0); + rcu_read_unlock(); + return; +} + static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; @@ -1101,8 +1338,6 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); - WARN_ON(ret); ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); WARN_ON(ret); ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); @@ -1113,6 +1348,15 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); + ret = register_trace_blk_zone_append_update_request_bio( + blk_add_trace_zone_update_request, NULL); + WARN_ON(ret); + ret = register_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, + NULL); + WARN_ON(ret); + ret = register_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, + NULL); + WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); @@ -1132,12 +1376,15 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, NULL); + unregister_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, NULL); + unregister_trace_blk_zone_append_update_request_bio( + blk_add_trace_zone_update_request, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); @@ -1151,7 +1398,7 @@ static void blk_unregister_tracepoints(void) * struct blk_io_tracer formatting routines */ -static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +static void fill_rwbs(char *rwbs, const struct blk_io_trace2 *t) { int i = 0; int tc = t->action >> BLK_TC_SHIFT; @@ -1166,7 +1413,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; - else if (tc & BLK_TC_WRITE) + else if (tc & BLK_TC_WRITE_ZEROES) { + rwbs[i++] = 'W'; + rwbs[i++] = 'Z'; + } else if (tc & BLK_TC_WRITE) rwbs[i++] = 'W'; else if (t->bytes) rwbs[i++] = 'R'; @@ -1186,9 +1436,9 @@ out: } static inline -const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +const struct blk_io_trace2 *te_blk_io_trace(const struct trace_entry *ent) { - return (const struct blk_io_trace *)ent; + return (const struct blk_io_trace2 *)ent; } static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) @@ -1247,7 +1497,7 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act, unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); @@ -1261,7 +1511,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, bool has_cg) { char rwbs[RWBS_LEN]; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); if (has_cg) { @@ -1474,7 +1724,6 @@ static const struct { [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, - [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; @@ -1483,7 +1732,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t; + const struct blk_io_trace2 *t; u16 what; bool long_act; blk_log_action_t *log_action; @@ -1491,7 +1740,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, t = te_blk_io_trace(iter->ent); what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; - long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); + long_act = !!(tr->trace_flags & TRACE_ITER(VERBOSE)); log_action = classic ? &blk_log_action_classic : &blk_log_action; has_cg = t->action & __BLK_TA_CGROUP; @@ -1520,8 +1769,8 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; - struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace2 *t = (struct blk_io_trace2 *)iter->ent; + const int offset = offsetof(struct blk_io_trace2, sector); struct blk_io_trace old = { .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, .time = iter->ts, @@ -1556,9 +1805,9 @@ blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { if (set) - tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + tr->trace_flags &= ~TRACE_ITER(CONTEXT_INFO); else - tr->trace_flags |= TRACE_ITER_CONTEXT_INFO; + tr->trace_flags |= TRACE_ITER(CONTEXT_INFO); } return 0; } @@ -1585,7 +1834,9 @@ static struct trace_event trace_blk_event = { .funcs = &trace_blk_event_funcs, }; -static int __init init_blk_tracer(void) +static struct work_struct blktrace_works __initdata; + +static int __init __init_blk_tracer(void) { if (!register_trace_event(&trace_blk_event)) { pr_warn("Warning: could not register block events\n"); @@ -1598,9 +1849,32 @@ static int __init init_blk_tracer(void) return 1; } + BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) % + __alignof__(long)); + BUILD_BUG_ON(__alignof__(struct blk_io_trace2) % __alignof__(long)); + return 0; } +static void __init blktrace_works_func(struct work_struct *work) +{ + __init_blk_tracer(); +} + +static int __init init_blk_tracer(void) +{ + int ret = 0; + + if (trace_init_wq) { + INIT_WORK(&blktrace_works, blktrace_works_func); + queue_work(trace_init_wq, &blktrace_works); + } else { + ret = __init_blk_tracer(); + } + + return ret; +} + device_initcall(init_blk_tracer); static int blk_trace_remove_queue(struct request_queue *q) @@ -1629,7 +1903,7 @@ static int blk_trace_setup_queue(struct request_queue *q, struct blk_trace *bt = NULL; int ret = -ENOMEM; - bt = kzalloc(sizeof(*bt), GFP_KERNEL); + bt = kzalloc_obj(*bt); if (!bt) return -ENOMEM; @@ -1706,6 +1980,7 @@ static const struct { { BLK_TC_DISCARD, "discard" }, { BLK_TC_DRV_DATA, "drv_data" }, { BLK_TC_FUA, "fua" }, + { BLK_TC_WRITE_ZEROES, "write-zeroes" }, }; static int blk_trace_str2mask(const char *str) @@ -1768,7 +2043,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct blk_trace *bt; ssize_t ret = -ENXIO; - mutex_lock(&q->debugfs_mutex); + blk_debugfs_lock_nomemsave(q); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); @@ -1789,7 +2064,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock_nomemrestore(q); return ret; } @@ -1800,6 +2075,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; + unsigned int memflags; u64 value; ssize_t ret = -EINVAL; @@ -1819,7 +2095,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out; } - mutex_lock(&q->debugfs_mutex); + memflags = blk_debugfs_lock(q); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); @@ -1854,7 +2130,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, memflags); out: return ret ? ret : count; } @@ -1896,6 +2172,33 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) case REQ_OP_READ: rwbs[i++] = 'R'; break; + case REQ_OP_ZONE_APPEND: + rwbs[i++] = 'Z'; + rwbs[i++] = 'A'; + break; + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + rwbs[i++] = 'Z'; + rwbs[i++] = 'R'; + if ((opf & REQ_OP_MASK) == REQ_OP_ZONE_RESET_ALL) + rwbs[i++] = 'A'; + break; + case REQ_OP_ZONE_FINISH: + rwbs[i++] = 'Z'; + rwbs[i++] = 'F'; + break; + case REQ_OP_ZONE_OPEN: + rwbs[i++] = 'Z'; + rwbs[i++] = 'O'; + break; + case REQ_OP_ZONE_CLOSE: + rwbs[i++] = 'Z'; + rwbs[i++] = 'C'; + break; + case REQ_OP_WRITE_ZEROES: + rwbs[i++] = 'W'; + rwbs[i++] = 'Z'; + break; default: rwbs[i++] = 'N'; } @@ -1908,6 +2211,10 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) rwbs[i++] = 'S'; if (opf & REQ_META) rwbs[i++] = 'M'; + if (opf & REQ_ATOMIC) + rwbs[i++] = 'U'; + + WARN_ON_ONCE(i >= RWBS_LEN); rwbs[i] = '\0'; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1b8db5aee9d3..a02bd258677e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -22,7 +22,6 @@ #include <linux/bsearch.h> #include <linux/sort.h> #include <linux/key.h> -#include <linux/verification.h> #include <linux/namei.h> #include <net/bpf_sk_storage.h> @@ -357,17 +356,6 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *bpf_get_probe_write_proto(void) -{ - if (!capable(CAP_SYS_ADMIN)) - return NULL; - - pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", - current->comm, task_pid_nr(current)); - - return &bpf_probe_write_user_proto; -} - #define MAX_TRACE_PRINTK_VARARGS 3 #define BPF_TRACE_PRINTK_SIZE 1024 @@ -403,7 +391,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .arg2_type = ARG_CONST_SIZE, }; -static void __set_printk_clr_event(void) +static void __set_printk_clr_event(struct work_struct *work) { /* * This program might be calling bpf_trace_printk, @@ -416,10 +404,11 @@ static void __set_printk_clr_event(void) if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) pr_warn_ratelimited("could not enable bpf_trace_printk events"); } +static DECLARE_WORK(set_printk_work, __set_printk_clr_event); const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_printk_proto; } @@ -462,7 +451,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = { const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_vprintk_proto; } @@ -582,7 +571,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) return value; } -static const struct bpf_func_proto bpf_perf_event_read_proto = { +const struct bpf_func_proto bpf_perf_event_read_proto = { .func = bpf_perf_event_read, .gpl_only = true, .ret_type = RET_INTEGER, @@ -617,9 +606,15 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; +const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void) +{ + return &bpf_perf_event_read_value_proto; +} + static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, - u64 flags, struct perf_sample_data *sd) + u64 flags, struct perf_raw_record *raw, + struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); @@ -644,6 +639,8 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; + perf_sample_save_raw_data(sd, event, raw); + return perf_event_output(event, sd, regs); } @@ -687,9 +684,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, } perf_sample_data_init(sd, 0, 0); - perf_sample_save_raw_data(sd, &raw); - err = __bpf_perf_event_output(regs, map, flags, sd); + err = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_trace_nest_level); preempt_enable(); @@ -748,9 +744,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); - perf_sample_save_raw_data(sd, &raw); - ret = __bpf_perf_event_output(regs, map, flags, sd); + ret = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_event_output_nest_level); preempt_enable(); @@ -785,8 +780,7 @@ BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) return (unsigned long) task_pt_regs(task); } -BTF_ID_LIST(bpf_task_pt_regs_ids) -BTF_ID(struct, pt_regs) +BTF_ID_LIST_SINGLE(bpf_task_pt_regs_ids, struct, pt_regs) const struct bpf_func_proto bpf_task_pt_regs_proto = { .func = bpf_task_pt_regs, @@ -836,7 +830,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; - info.si_value.sival_ptr = (void *)(unsigned long)value; + info.si_value.sival_ptr = (void __user __force *)(unsigned long)value; siginfo = &info; } @@ -853,7 +847,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc if (unlikely(is_global_init(task))) return -EPERM; - if (irqs_disabled()) { + if (preempt_count() != 0 || irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ @@ -886,7 +880,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0); } -static const struct bpf_func_proto bpf_send_signal_proto = { +const struct bpf_func_proto bpf_send_signal_proto = { .func = bpf_send_signal, .gpl_only = false, .ret_type = RET_INTEGER, @@ -898,14 +892,14 @@ BPF_CALL_1(bpf_send_signal_thread, u32, sig) return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0); } -static const struct bpf_func_proto bpf_send_signal_thread_proto = { +const struct bpf_func_proto bpf_send_signal_thread_proto = { .func = bpf_send_signal_thread, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) +BPF_CALL_3(bpf_d_path, const struct path *, path, char *, buf, u32, sz) { struct path copy; long len; @@ -971,7 +965,7 @@ static const struct bpf_func_proto bpf_d_path_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_d_path_btf_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .allowed = bpf_d_path_allowed, }; @@ -1028,7 +1022,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .func = bpf_snprintf_btf, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, @@ -1048,27 +1042,14 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_X86_KERNEL_IBT -static unsigned long get_entry_ip(unsigned long fentry_ip) +static inline unsigned long get_entry_ip(unsigned long fentry_ip) { - u32 instr; - - /* We want to be extra safe in case entry ip is on the page edge, - * but otherwise we need to avoid get_kernel_nofault()'s overhead. - */ - if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { - if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) - return fentry_ip; - } else { - instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); - } - if (is_endbr(instr)) +#ifdef CONFIG_X86_KERNEL_IBT + if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; +#endif return fentry_ip; } -#else -#define get_entry_ip(fentry_ip) fentry_ip -#endif BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { @@ -1202,7 +1183,7 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) return entry_cnt * br_entry_size; } -static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { +const struct bpf_func_proto bpf_get_branch_snapshot_proto = { .func = bpf_get_branch_snapshot, .gpl_only = true, .ret_type = RET_INTEGER, @@ -1213,7 +1194,7 @@ static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value) { /* This helper call is inlined by verifier. */ - u64 nr_args = ((u64 *)ctx)[-1]; + u64 nr_args = ((u64 *)ctx)[-1] & 0xFF; if ((u64) n >= nr_args) return -EINVAL; @@ -1233,7 +1214,7 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = { BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value) { /* This helper call is inlined by verifier. */ - u64 nr_args = ((u64 *)ctx)[-1]; + u64 nr_args = ((u64 *)ctx)[-1] & 0xFF; *value = ((u64 *)ctx)[nr_args]; return 0; @@ -1250,7 +1231,7 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = { BPF_CALL_1(get_func_arg_cnt, void *, ctx) { /* This helper call is inlined by verifier. */ - return ((u64 *)ctx)[-1]; + return ((u64 *)ctx)[-1] & 0xFF; } static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { @@ -1259,245 +1240,14 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_KEYS -__bpf_kfunc_start_defs(); - -/** - * bpf_lookup_user_key - lookup a key by its serial - * @serial: key handle serial number - * @flags: lookup-specific flags - * - * Search a key with a given *serial* and the provided *flags*. - * If found, increment the reference count of the key by one, and - * return it in the bpf_key structure. - * - * The bpf_key structure must be passed to bpf_key_put() when done - * with it, so that the key reference count is decremented and the - * bpf_key structure is freed. - * - * Permission checks are deferred to the time the key is used by - * one of the available key-specific kfuncs. - * - * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested - * special keyring (e.g. session keyring), if it doesn't yet exist. - * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting - * for the key construction, and to retrieve uninstantiated keys (keys - * without data attached to them). - * - * Return: a bpf_key pointer with a valid key pointer if the key is found, a - * NULL pointer otherwise. - */ -__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) -{ - key_ref_t key_ref; - struct bpf_key *bkey; - - if (flags & ~KEY_LOOKUP_ALL) - return NULL; - - /* - * Permission check is deferred until the key is used, as the - * intent of the caller is unknown here. - */ - key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); - if (IS_ERR(key_ref)) - return NULL; - - bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); - if (!bkey) { - key_put(key_ref_to_ptr(key_ref)); - return NULL; - } - - bkey->key = key_ref_to_ptr(key_ref); - bkey->has_ref = true; - - return bkey; -} - -/** - * bpf_lookup_system_key - lookup a key by a system-defined ID - * @id: key ID - * - * Obtain a bpf_key structure with a key pointer set to the passed key ID. - * The key pointer is marked as invalid, to prevent bpf_key_put() from - * attempting to decrement the key reference count on that pointer. The key - * pointer set in such way is currently understood only by - * verify_pkcs7_signature(). - * - * Set *id* to one of the values defined in include/linux/verification.h: - * 0 for the primary keyring (immutable keyring of system keys); - * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring - * (where keys can be added only if they are vouched for by existing keys - * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform - * keyring (primarily used by the integrity subsystem to verify a kexec'ed - * kerned image and, possibly, the initramfs signature). - * - * Return: a bpf_key pointer with an invalid key pointer set from the - * pre-determined ID on success, a NULL pointer otherwise - */ -__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) -{ - struct bpf_key *bkey; - - if (system_keyring_id_check(id) < 0) - return NULL; - - bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); - if (!bkey) - return NULL; - - bkey->key = (struct key *)(unsigned long)id; - bkey->has_ref = false; - - return bkey; -} - -/** - * bpf_key_put - decrement key reference count if key is valid and free bpf_key - * @bkey: bpf_key structure - * - * Decrement the reference count of the key inside *bkey*, if the pointer - * is valid, and free *bkey*. - */ -__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) -{ - if (bkey->has_ref) - key_put(bkey->key); - - kfree(bkey); -} - -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -/** - * bpf_verify_pkcs7_signature - verify a PKCS#7 signature - * @data_p: data to verify - * @sig_p: signature of the data - * @trusted_keyring: keyring with keys trusted for signature verification - * - * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* - * with keys in a keyring referenced by *trusted_keyring*. - * - * Return: 0 on success, a negative value on error. - */ -__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, - struct bpf_key *trusted_keyring) -{ - struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; - struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; - const void *data, *sig; - u32 data_len, sig_len; - int ret; - - if (trusted_keyring->has_ref) { - /* - * Do the permission check deferred in bpf_lookup_user_key(). - * See bpf_lookup_user_key() for more details. - * - * A call to key_task_permission() here would be redundant, as - * it is already done by keyring_search() called by - * find_asymmetric_key(). - */ - ret = key_validate(trusted_keyring->key); - if (ret < 0) - return ret; - } - - data_len = __bpf_dynptr_size(data_ptr); - data = __bpf_dynptr_data(data_ptr, data_len); - sig_len = __bpf_dynptr_size(sig_ptr); - sig = __bpf_dynptr_data(sig_ptr, sig_len); - - return verify_pkcs7_signature(data, data_len, sig, sig_len, - trusted_keyring->key, - VERIFYING_UNSPECIFIED_SIGNATURE, NULL, - NULL); -} -#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(key_sig_kfunc_set) -BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) -#endif -BTF_KFUNCS_END(key_sig_kfunc_set) - -static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { - .owner = THIS_MODULE, - .set = &key_sig_kfunc_set, -}; - -static int __init bpf_key_sig_kfuncs_init(void) -{ - return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, - &bpf_key_sig_kfunc_set); -} - -late_initcall(bpf_key_sig_kfuncs_init); -#endif /* CONFIG_KEYS */ - static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_map_lookup_percpu_elem: - return &bpf_map_lookup_percpu_elem_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - case BPF_FUNC_ktime_get_boot_ns: - return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_get_current_task: - return &bpf_get_current_task_proto; - case BPF_FUNC_get_current_task_btf: - return &bpf_get_current_task_btf_proto; - case BPF_FUNC_task_pt_regs: - return &bpf_task_pt_regs_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; - case BPF_FUNC_perf_event_read: - return &bpf_perf_event_read_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_probe_write_user: - return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? - NULL : bpf_get_probe_write_proto(); - case BPF_FUNC_probe_read_user: - return &bpf_probe_read_user_proto; - case BPF_FUNC_probe_read_kernel: - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? - NULL : &bpf_probe_read_kernel_proto; - case BPF_FUNC_probe_read_user_str: - return &bpf_probe_read_user_str_proto; - case BPF_FUNC_probe_read_kernel_str: - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? - NULL : &bpf_probe_read_kernel_str_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? @@ -1506,67 +1256,25 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_str_proto; #endif -#ifdef CONFIG_CGROUPS - case BPF_FUNC_cgrp_storage_get: - return &bpf_cgrp_storage_get_proto; - case BPF_FUNC_cgrp_storage_delete: - return &bpf_cgrp_storage_delete_proto; - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; -#endif - case BPF_FUNC_send_signal: - return &bpf_send_signal_proto; - case BPF_FUNC_send_signal_thread: - return &bpf_send_signal_thread_proto; - case BPF_FUNC_perf_event_read_value: - return &bpf_perf_event_read_value_proto; - case BPF_FUNC_ringbuf_output: - return &bpf_ringbuf_output_proto; - case BPF_FUNC_ringbuf_reserve: - return &bpf_ringbuf_reserve_proto; - case BPF_FUNC_ringbuf_submit: - return &bpf_ringbuf_submit_proto; - case BPF_FUNC_ringbuf_discard: - return &bpf_ringbuf_discard_proto; - case BPF_FUNC_ringbuf_query: - return &bpf_ringbuf_query_proto; - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; - case BPF_FUNC_get_task_stack: - return prog->sleepable ? &bpf_get_task_stack_sleepable_proto - : &bpf_get_task_stack_proto; - case BPF_FUNC_copy_from_user: - return &bpf_copy_from_user_proto; - case BPF_FUNC_copy_from_user_task: - return &bpf_copy_from_user_task_proto; - case BPF_FUNC_snprintf_btf: - return &bpf_snprintf_btf_proto; - case BPF_FUNC_per_cpu_ptr: - return &bpf_per_cpu_ptr_proto; - case BPF_FUNC_this_cpu_ptr: - return &bpf_this_cpu_ptr_proto; - case BPF_FUNC_task_storage_get: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_get_recur_proto; - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_delete_recur_proto; - return &bpf_task_storage_delete_proto; - case BPF_FUNC_for_each_map_elem: - return &bpf_for_each_map_elem_proto; - case BPF_FUNC_snprintf: - return &bpf_snprintf_proto; case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_tracing; - case BPF_FUNC_get_branch_snapshot: - return &bpf_get_branch_snapshot_proto; - case BPF_FUNC_find_vma: - return &bpf_find_vma_proto; - case BPF_FUNC_trace_vprintk: - return bpf_get_trace_vprintk_proto(); default: - return bpf_base_func_proto(func_id, prog); + break; + } + + func_proto = bpf_base_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + if (!bpf_token_capable(prog->aux->token, CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_probe_write_user: + return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? + NULL : &bpf_probe_write_user_proto; + default: + return NULL; } } @@ -1578,7 +1286,8 @@ static bool is_kprobe_multi(const struct bpf_prog *prog) static inline bool is_kprobe_session(const struct bpf_prog *prog) { - return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; + return prog->type == BPF_PROG_TYPE_KPROBE && + prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } static inline bool is_uprobe_multi(const struct bpf_prog *prog) @@ -1589,7 +1298,14 @@ static inline bool is_uprobe_multi(const struct bpf_prog *prog) static inline bool is_uprobe_session(const struct bpf_prog *prog) { - return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; + return prog->type == BPF_PROG_TYPE_KPROBE && + prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; +} + +static inline bool is_trace_fsession(const struct bpf_prog *prog) +{ + return prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_FSESSION; } static const struct bpf_func_proto * @@ -1630,8 +1346,6 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type { if (off < 0 || off >= sizeof(struct pt_regs)) return false; - if (type != BPF_READ) - return false; if (off % size != 0) return false; /* @@ -1641,6 +1355,9 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type if (off + size > sizeof(struct pt_regs)) return false; + if (type == BPF_WRITE) + prog->aux->kprobe_write_ctx = true; + return true; } @@ -1817,7 +1534,7 @@ static const struct bpf_func_proto bpf_read_branch_records_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -1861,7 +1578,7 @@ static struct pt_regs *get_bpf_raw_tp_regs(void) struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); - if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + if (nest_level > ARRAY_SIZE(tp_regs->regs)) { this_cpu_dec(bpf_raw_tp_nest_level); return ERR_PTR(-EBUSY); } @@ -1952,7 +1669,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -2025,11 +1742,17 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_d_path: return &bpf_d_path_proto; case BPF_FUNC_get_func_arg: - return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL; + if (bpf_prog_has_trampoline(prog) || + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_func_arg_proto; + return NULL; case BPF_FUNC_get_func_ret: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; case BPF_FUNC_get_func_arg_cnt: - return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; + if (bpf_prog_has_trampoline(prog) || + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_func_arg_cnt_proto; + return NULL; case BPF_FUNC_get_attach_cookie: if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) @@ -2242,6 +1965,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; + struct bpf_prog *prog = NULL; int ret; mutex_lock(&bpf_event_mutex); @@ -2262,18 +1986,22 @@ void perf_event_detach_bpf_prog(struct perf_event *event) } put: - /* - * It could be that the bpf_prog is not sleepable (and will be freed - * via normal RCU), but is called from a point that supports sleepable - * programs and uses tasks-trace-RCU. - */ - synchronize_rcu_tasks_trace(); - - bpf_prog_put(event->prog); + prog = event->prog; event->prog = NULL; unlock: mutex_unlock(&bpf_event_mutex); + + if (prog) { + /* + * It could be that the bpf_prog is not sleepable (and will be freed + * via normal RCU), but is called from a point that supports sleepable + * programs and uses tasks-trace-RCU. + */ + synchronize_rcu_tasks_trace(); + + bpf_prog_put(prog); + } } int perf_event_query_prog_array(struct perf_event *event, void __user *info) @@ -2336,10 +2064,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) { struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address((unsigned long)btp); module_put(mod); - preempt_enable(); } static __always_inline @@ -2349,8 +2076,8 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; - cant_sleep(); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + rcu_read_lock_dont_migrate(); + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; } @@ -2358,13 +2085,12 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) run_ctx.bpf_cookie = link->cookie; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - rcu_read_lock(); (void) bpf_prog_run(prog, args); - rcu_read_unlock(); bpf_reset_run_ctx(old_run_ctx); out: - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); + rcu_read_unlock_migrate(); } #define UNPACK(...) __VA_ARGS__ @@ -2517,7 +2243,7 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op, switch (op) { case MODULE_STATE_COMING: - btm = kzalloc(sizeof(*btm), GFP_KERNEL); + btm = kzalloc_obj(*btm); if (btm) { btm->module = module; list_add(&btm->list, &bpf_trace_modules); @@ -2570,7 +2296,6 @@ struct bpf_kprobe_multi_link { u32 cnt; u32 mods_cnt; struct module **mods; - u32 flags; }; struct bpf_kprobe_multi_run_ctx { @@ -2584,6 +2309,20 @@ struct user_syms { char *buf; }; +#ifndef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS +static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs); +#define bpf_kprobe_multi_pt_regs_ptr() this_cpu_ptr(&bpf_kprobe_multi_pt_regs) +#else +#define bpf_kprobe_multi_pt_regs_ptr() (NULL) +#endif + +static unsigned long ftrace_get_entry_ip(unsigned long fentry_ip) +{ + unsigned long ip = ftrace_get_symaddr(fentry_ip); + + return ip ? : fentry_ip; +} + static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt) { unsigned long __user usymbol; @@ -2645,7 +2384,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link) struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); - unregister_fprobe(&kmulti_link->fp); + /* Don't wait for RCU GP here. */ + unregister_fprobe_async(&kmulti_link->fp); kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } @@ -2676,7 +2416,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; - info->kprobe_multi.flags = kmulti_link->flags; + info->kprobe_multi.flags = kmulti_link->link.flags; info->kprobe_multi.missed = kmulti_link->fp.nmissed; if (!uaddrs) @@ -2710,10 +2450,39 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, return err; } +#ifdef CONFIG_PROC_FS +static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_kprobe_multi_link *kmulti_link; + bool has_cookies; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + has_cookies = !!kmulti_link->cookies; + + seq_printf(seq, + "kprobe_cnt:\t%u\n" + "missed:\t%lu\n", + kmulti_link->cnt, + kmulti_link->fp.nmissed); + + seq_printf(seq, "%s\t %s\n", "cookie", "func"); + for (int i = 0; i < kmulti_link->cnt; i++) { + seq_printf(seq, + "%llu\t %pS\n", + has_cookies ? kmulti_link->cookies[i] : 0, + (void *)kmulti_link->addrs[i]); + } +} +#endif + static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_kprobe_multi_show_fdinfo, +#endif }; static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) @@ -2776,9 +2545,9 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) return run_ctx->entry_ip; } -static int +static __always_inline int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, - unsigned long entry_ip, struct pt_regs *regs, + unsigned long entry_ip, struct ftrace_regs *fregs, bool is_return, void *data) { struct bpf_kprobe_multi_run_ctx run_ctx = { @@ -2790,21 +2559,29 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, .entry_ip = entry_ip, }; struct bpf_run_ctx *old_run_ctx; + struct pt_regs *regs; int err; + /* + * graph tracer framework ensures we won't migrate, so there is no need + * to use migrate_disable for bpf_prog_run again. The check here just for + * __this_cpu_inc_return. + */ + cant_sleep(); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { bpf_prog_inc_misses_counter(link->link.prog); - err = 0; + err = 1; goto out; } - migrate_disable(); rcu_read_lock(); + regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr()); old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); + ftrace_partial_regs_update(fregs, bpf_kprobe_multi_pt_regs_ptr()); rcu_read_unlock(); - migrate_enable(); out: __this_cpu_dec(bpf_prog_active); @@ -2813,26 +2590,28 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false, data); + err = kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), + fregs, false, data); return is_kprobe_session(link->link.prog) ? err : 0; } static void kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true, data); + kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), + fregs, true, data); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) @@ -2905,18 +2684,21 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3 u32 i, err = 0; for (i = 0; i < addrs_cnt; i++) { + bool skip_add = false; struct module *mod; - preempt_disable(); - mod = __module_address(addrs[i]); - /* Either no module or we it's already stored */ - if (!mod || has_module(&arr, mod)) { - preempt_enable(); - continue; + scoped_guard(rcu) { + mod = __module_address(addrs[i]); + /* Either no module or it's already stored */ + if (!mod || has_module(&arr, mod)) { + skip_add = true; + break; /* scoped_guard */ + } + if (!try_module_get(mod)) + err = -EINVAL; } - if (!try_module_get(mod)) - err = -EINVAL; - preempt_enable(); + if (skip_add) + continue; if (err) break; err = add_module(&arr, mod); @@ -2965,9 +2747,20 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_kprobe_multi(prog)) return -EINVAL; + /* kprobe_multi is not allowed to be sleepable. */ + if (prog->sleepable) + return -EINVAL; + + /* Writing to context is not allowed for kprobes. */ + if (prog->aux->kprobe_write_ctx) + return -EINVAL; + flags = attr->link_create.kprobe_multi.flags; if (flags & ~BPF_F_KPROBE_MULTI_RETURN) return -EINVAL; @@ -3033,14 +2826,14 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error; } - link = kzalloc(sizeof(*link), GFP_KERNEL); + link = kzalloc_obj(*link); if (!link) { err = -ENOMEM; goto error; } bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, - &bpf_kprobe_multi_link_lops, prog); + &bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) @@ -3056,7 +2849,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->addrs = addrs; link->cookies = cookies; link->cnt = cnt; - link->flags = flags; + link->link.flags = flags; if (cookies) { /* @@ -3125,7 +2918,6 @@ struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; - u32 flags; struct bpf_uprobe *uprobes; struct task_struct *task; }; @@ -3189,7 +2981,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); info->uprobe_multi.count = umulti_link->cnt; - info->uprobe_multi.flags = umulti_link->flags; + info->uprobe_multi.flags = umulti_link->link.flags; info->uprobe_multi.pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; @@ -3234,10 +3026,54 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, return err; } +#ifdef CONFIG_PROC_FS +static void bpf_uprobe_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_uprobe_multi_link *umulti_link; + char *p, *buf; + pid_t pid; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + return; + + p = d_path(&umulti_link->path, buf, PATH_MAX); + if (IS_ERR(p)) { + kfree(buf); + return; + } + + pid = umulti_link->task ? + task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; + seq_printf(seq, + "uprobe_cnt:\t%u\n" + "pid:\t%u\n" + "path:\t%s\n", + umulti_link->cnt, pid, p); + + seq_printf(seq, "%s\t %s\t %s\n", "cookie", "offset", "ref_ctr_offset"); + for (int i = 0; i < umulti_link->cnt; i++) { + seq_printf(seq, + "%llu\t %#llx\t %#lx\n", + umulti_link->uprobes[i].cookie, + umulti_link->uprobes[i].offset, + umulti_link->uprobes[i].ref_ctr_offset); + } + + kfree(buf); +} +#endif + static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_uprobe_multi_show_fdinfo, +#endif }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, @@ -3354,6 +3190,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_uprobe_multi(prog)) return -EINVAL; @@ -3395,7 +3234,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } if (pid) { + rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); + rcu_read_unlock(); if (!task) { err = -ESRCH; goto error_path_put; @@ -3404,8 +3245,8 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr err = -ENOMEM; - link = kzalloc(sizeof(*link), GFP_KERNEL); - uprobes = kvcalloc(cnt, sizeof(*uprobes), GFP_KERNEL); + link = kzalloc_obj(*link); + uprobes = kvzalloc_objs(*uprobes, cnt); if (!uprobes || !link) goto error_free; @@ -3444,10 +3285,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->uprobes = uprobes; link->path = path; link->task = task; - link->flags = flags; + link->link.flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, - &bpf_uprobe_multi_link_lops, prog); + &bpf_uprobe_multi_link_lops, prog, attr->link_create.attach_type); for (i = 0; i < cnt; i++) { uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), @@ -3496,7 +3337,7 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) __bpf_kfunc_start_defs(); -__bpf_kfunc bool bpf_session_is_return(void) +__bpf_kfunc bool bpf_session_is_return(void *ctx) { struct bpf_session_run_ctx *session_ctx; @@ -3504,7 +3345,7 @@ __bpf_kfunc bool bpf_session_is_return(void) return session_ctx->is_return; } -__bpf_kfunc __u64 *bpf_session_cookie(void) +__bpf_kfunc __u64 *bpf_session_cookie(void *ctx) { struct bpf_session_run_ctx *session_ctx; @@ -3514,34 +3355,179 @@ __bpf_kfunc __u64 *bpf_session_cookie(void) __bpf_kfunc_end_defs(); -BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) +BTF_KFUNCS_START(session_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_session_is_return) BTF_ID_FLAGS(func, bpf_session_cookie) -BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) +BTF_KFUNCS_END(session_kfunc_set_ids) -static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) +static int bpf_session_filter(const struct bpf_prog *prog, u32 kfunc_id) { - if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) + if (!btf_id_set8_contains(&session_kfunc_set_ids, kfunc_id)) return 0; - if (!is_kprobe_session(prog) && !is_uprobe_session(prog)) + if (!is_kprobe_session(prog) && !is_uprobe_session(prog) && !is_trace_fsession(prog)) return -EACCES; return 0; } -static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = { +static const struct btf_kfunc_id_set bpf_session_kfunc_set = { .owner = THIS_MODULE, - .set = &kprobe_multi_kfunc_set_ids, - .filter = bpf_kprobe_multi_filter, + .set = &session_kfunc_set_ids, + .filter = bpf_session_filter, }; -static int __init bpf_kprobe_multi_kfuncs_init(void) +static int __init bpf_trace_kfuncs_init(void) { - return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set); + int err = 0; + + err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_session_kfunc_set); + err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_session_kfunc_set); + + return err; } -late_initcall(bpf_kprobe_multi_kfuncs_init); +late_initcall(bpf_trace_kfuncs_init); + +typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); + +/* + * The __always_inline is to make sure the compiler doesn't + * generate indirect calls into callbacks, which is expensive, + * on some kernel configurations. This allows compiler to put + * direct calls into all the specific callback implementations + * (copy_user_data_sleepable, copy_user_data_nofault, and so on) + */ +static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, + const void *unsafe_src, + copy_fn_t str_copy_fn, + struct task_struct *tsk) +{ + struct bpf_dynptr_kern *dst; + u64 chunk_sz, off; + void *dst_slice; + int cnt, err; + char buf[256]; + + dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); + if (likely(dst_slice)) + return str_copy_fn(dst_slice, unsafe_src, size, tsk); + + dst = (struct bpf_dynptr_kern *)dptr; + if (bpf_dynptr_check_off_len(dst, doff, size)) + return -E2BIG; + + for (off = 0; off < size; off += chunk_sz - 1) { + chunk_sz = min_t(u64, sizeof(buf), size - off); + /* Expect str_copy_fn to return count of copied bytes, including + * zero terminator. Next iteration increment off by chunk_sz - 1 to + * overwrite NUL. + */ + cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk); + if (cnt < 0) + return cnt; + err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0); + if (err) + return err; + if (cnt < chunk_sz || chunk_sz == 1) /* we are done */ + return off + cnt; + } + return off; +} + +static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff, + u64 size, const void *unsafe_src, + copy_fn_t copy_fn, struct task_struct *tsk) +{ + struct bpf_dynptr_kern *dst; + void *dst_slice; + char buf[256]; + u64 off, chunk_sz; + int err; + + dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); + if (likely(dst_slice)) + return copy_fn(dst_slice, unsafe_src, size, tsk); + + dst = (struct bpf_dynptr_kern *)dptr; + if (bpf_dynptr_check_off_len(dst, doff, size)) + return -E2BIG; + + for (off = 0; off < size; off += chunk_sz) { + chunk_sz = min_t(u64, sizeof(buf), size - off); + err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); + if (err) + return err; + err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0); + if (err) + return err; + } + return 0; +} + +static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size); +} + +static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + int ret; + + if (!tsk) { /* Read from the current task */ + ret = copy_from_user(dst, (const void __user *)unsafe_src, size); + if (ret) + return -EFAULT; + return 0; + } + + ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0); + if (ret != size) + return -EFAULT; + return 0; +} + +static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return copy_from_kernel_nofault(dst, unsafe_src, size); +} + +static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size); +} + +static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + int ret; + + if (unlikely(size == 0)) + return 0; + + if (tsk) { + ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0); + } else { + ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1); + /* strncpy_from_user does not guarantee NUL termination */ + if (ret >= 0) + ((char *)dst)[ret] = '\0'; + } + + if (ret < 0) + return ret; + return ret + 1; +} + +static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return strncpy_from_kernel_nofault(dst, unsafe_src, size); +} __bpf_kfunc_start_defs(); @@ -3554,4 +3540,62 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } +__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, + copy_user_data_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, + copy_kernel_data_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, + copy_user_str_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, + copy_kernel_str_nofault, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, + copy_user_data_sleepable, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, + copy_user_str_sleepable, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, + struct task_struct *tsk) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, + copy_user_data_sleepable, tsk); +} + +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, + struct task_struct *tsk) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, + copy_user_str_sleepable, tsk); +} + __bpf_kfunc_end_defs(); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 30e3ddc8a8a8..40d373d65f9b 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -163,7 +163,7 @@ enum { #define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset])) /* - * Each fgraph_ops has a reservered unsigned long at the end (top) of the + * Each fgraph_ops has a reserved unsigned long at the end (top) of the * ret_stack to store task specific state. */ #define SHADOW_STACK_TASK_VARS(ret_stack) \ @@ -292,13 +292,15 @@ static inline unsigned long make_data_type_val(int idx, int size, int offset) } /* ftrace_graph_entry set to this to tell some archs to run function graph */ -static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops) +static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops, + struct ftrace_regs *fregs) { return 0; } /* ftrace_graph_return set to this to tell some archs to run function graph */ -static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops) +static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops, + struct ftrace_regs *fregs) { } @@ -496,9 +498,6 @@ found: return get_data_type_data(current, offset); } -/* Both enabled by default (can be cleared by function_graph tracer flags */ -bool fgraph_sleep_time = true; - #ifdef CONFIG_DYNAMIC_FTRACE /* * archs can override this function if they must do something @@ -520,13 +519,15 @@ int __weak ftrace_disable_ftrace_graph_caller(void) #endif int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { return 0; } static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { } @@ -538,7 +539,11 @@ static struct fgraph_ops fgraph_stub = { static struct fgraph_ops *fgraph_direct_gops = &fgraph_stub; DEFINE_STATIC_CALL(fgraph_func, ftrace_graph_entry_stub); DEFINE_STATIC_CALL(fgraph_retfunc, ftrace_graph_ret_stub); +#if FGRAPH_NO_DIRECT +static DEFINE_STATIC_KEY_FALSE(fgraph_do_direct); +#else static DEFINE_STATIC_KEY_TRUE(fgraph_do_direct); +#endif /** * ftrace_graph_stop - set to permanently disable function graph tracing @@ -644,14 +649,20 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, #endif /* If the caller does not use ftrace, call this function. */ -int function_graph_enter(unsigned long ret, unsigned long func, - unsigned long frame_pointer, unsigned long *retp) +int function_graph_enter_regs(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp, + struct ftrace_regs *fregs) { struct ftrace_graph_ent trace; unsigned long bitmap = 0; int offset; + int bit; int i; + bit = ftrace_test_recursion_trylock(func, ret); + if (bit < 0) + return -EBUSY; + trace.func = func; trace.depth = ++current->curr_ret_depth; @@ -663,7 +674,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, if (static_branch_likely(&fgraph_do_direct)) { int save_curr_ret_stack = current->curr_ret_stack; - if (static_call(fgraph_func)(&trace, fgraph_direct_gops)) + if (static_call(fgraph_func)(&trace, fgraph_direct_gops, fregs)) bitmap |= BIT(fgraph_direct_gops->idx); else /* Clear out any saved storage */ @@ -681,7 +692,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, save_curr_ret_stack = current->curr_ret_stack; if (ftrace_ops_test(&gops->ops, func, NULL) && - gops->entryfunc(&trace, gops)) + gops->entryfunc(&trace, gops, fregs)) bitmap |= BIT(i); else /* Clear out any saved storage */ @@ -697,12 +708,13 @@ int function_graph_enter(unsigned long ret, unsigned long func, * flag, set that bit always. */ set_bitmap(current, offset, bitmap | BIT(0)); - + ftrace_test_recursion_unlock(bit); return 0; out_ret: current->curr_ret_stack -= FGRAPH_FRAME_OFFSET + 1; out: current->curr_ret_depth--; + ftrace_test_recursion_unlock(bit); return -EBUSY; } @@ -792,21 +804,19 @@ static struct notifier_block ftrace_suspend_notifier = { .notifier_call = ftrace_suspend_notifier_call, }; -/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */ -struct fgraph_ret_regs; - /* * Send the trace to the ring-buffer. * @return the original return address. */ -static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs, - unsigned long frame_pointer) +static inline unsigned long +__ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointer) { struct ftrace_ret_stack *ret_stack; struct ftrace_graph_ret trace; unsigned long bitmap; unsigned long ret; int offset; + int bit; int i; ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset); @@ -818,17 +828,28 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs return (unsigned long)panic; } - trace.rettime = trace_clock_local(); + if (fregs) + ftrace_regs_set_instruction_pointer(fregs, ret); + + bit = ftrace_test_recursion_trylock(trace.func, ret); + /* + * This can fail because ftrace_test_recursion_trylock() allows one nest + * call. If we are already in a nested call, then we don't probe this and + * just return the original return address. + */ + if (unlikely(bit < 0)) + goto out; + #ifdef CONFIG_FUNCTION_GRAPH_RETVAL - trace.retval = fgraph_ret_regs_return_value(ret_regs); + trace.retval = ftrace_regs_get_return_value(fregs); #endif bitmap = get_bitmap_bits(current, offset); #ifdef CONFIG_HAVE_STATIC_CALL - if (static_branch_likely(&fgraph_do_direct)) { + if (!FGRAPH_NO_DIRECT && static_branch_likely(&fgraph_do_direct)) { if (test_bit(fgraph_direct_gops->idx, &bitmap)) - static_call(fgraph_retfunc)(&trace, fgraph_direct_gops); + static_call(fgraph_retfunc)(&trace, fgraph_direct_gops, fregs); } else #endif { @@ -838,10 +859,12 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs if (gops == &fgraph_stub) continue; - gops->retfunc(&trace, gops); + gops->retfunc(&trace, gops, fregs); } } + ftrace_test_recursion_unlock(bit); +out: /* * The ftrace_graph_return() may still access the current * ret_stack structure, we need to make sure the update of @@ -855,14 +878,14 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs } /* - * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can - * leave only ftrace_return_to_handler(ret_regs). + * After all architectures have selected HAVE_FUNCTION_GRAPH_FREGS, we can + * leave only ftrace_return_to_handler(fregs). */ -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL -unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs) +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS +unsigned long ftrace_return_to_handler(struct ftrace_regs *fregs) { - return __ftrace_return_to_handler(ret_regs, - fgraph_ret_regs_frame_pointer(ret_regs)); + return __ftrace_return_to_handler(fregs, + ftrace_regs_get_frame_pointer(fregs)); } #else unsigned long ftrace_return_to_handler(unsigned long frame_pointer) @@ -997,20 +1020,17 @@ void fgraph_init_ops(struct ftrace_ops *dst_ops, mutex_init(&dst_ops->local_hash.regex_lock); INIT_LIST_HEAD(&dst_ops->subop_list); dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED; + dst_ops->private = src_ops->private; } #endif } -void ftrace_graph_sleep_time_control(bool enable) -{ - fgraph_sleep_time = enable; -} - /* * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h */ -void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops); +void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); /* The callbacks that hook a function */ trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph; @@ -1075,7 +1095,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt, * Does the user want to count the time a function was asleep. * If so, do not update the time stamps. */ - if (fgraph_sleep_time) + if (!fgraph_no_sleep_time) return; timestamp = trace_clock_local(); @@ -1174,7 +1194,8 @@ void ftrace_graph_exit_task(struct task_struct *t) #ifdef CONFIG_DYNAMIC_FTRACE static int fgraph_pid_func(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = gops->ops.private; int pid; @@ -1188,7 +1209,7 @@ static int fgraph_pid_func(struct ftrace_graph_ent *trace, return 0; } - return gops->saved_func(trace, gops); + return gops->saved_func(trace, gops, fregs); } void fgraph_update_pid_func(void) @@ -1268,6 +1289,9 @@ static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *go trace_func_graph_ret_t retfunc = NULL; int i; + if (FGRAPH_NO_DIRECT) + return; + if (gops) { func = gops->entryfunc; retfunc = gops->retfunc; @@ -1286,11 +1310,14 @@ static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *go static_call_update(fgraph_func, func); static_call_update(fgraph_retfunc, retfunc); if (enable_branch) - static_branch_disable(&fgraph_do_direct); + static_branch_enable(&fgraph_do_direct); } static void ftrace_graph_disable_direct(bool disable_branch) { + if (FGRAPH_NO_DIRECT) + return; + if (disable_branch) static_branch_disable(&fgraph_do_direct); static_call_update(fgraph_func, ftrace_graph_entry_stub); @@ -1313,6 +1340,10 @@ int register_ftrace_graph(struct fgraph_ops *gops) int ret = 0; int i = -1; + if (WARN_ONCE(gops->ops.flags & FTRACE_OPS_FL_GRAPH, + "function graph ops registered again")) + return -EBUSY; + guard(mutex)(&ftrace_lock); if (!fgraph_stack_cachep) { @@ -1348,6 +1379,13 @@ int register_ftrace_graph(struct fgraph_ops *gops) ftrace_graph_active++; + /* Always save the function, and reset at unregistering */ + gops->saved_func = gops->entryfunc; +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_pids_enabled(&gops->ops)) + gops->entryfunc = fgraph_pid_func; +#endif + if (ftrace_graph_active == 2) ftrace_graph_disable_direct(true); @@ -1367,8 +1405,8 @@ int register_ftrace_graph(struct fgraph_ops *gops) } else { init_task_vars(gops->idx); } - /* Always save the function, and reset at unregistering */ - gops->saved_func = gops->entryfunc; + + gops->ops.flags |= FTRACE_OPS_FL_GRAPH; ret = ftrace_startup_subops(&graph_ops, &gops->ops, command); if (!ret) @@ -1379,6 +1417,8 @@ error: ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); + if (!ftrace_graph_active) + unregister_pm_notifier(&ftrace_suspend_notifier); } return ret; } @@ -1387,17 +1427,21 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) { int command = 0; + if (WARN_ONCE(!(gops->ops.flags & FTRACE_OPS_FL_GRAPH), + "function graph ops unregistered without registering")) + return; + guard(mutex)(&ftrace_lock); if (unlikely(!ftrace_graph_active)) - return; + goto out; if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE || fgraph_array[gops->idx] != gops)) - return; + goto out; if (fgraph_lru_release_index(gops->idx) < 0) - return; + goto out; fgraph_array[gops->idx] = &fgraph_stub; @@ -1420,4 +1464,6 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); } gops->saved_func = NULL; + out: + gops->ops.flags &= ~FTRACE_OPS_FL_GRAPH; } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 9ff018245840..f378613ad120 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -4,102 +4,235 @@ */ #define pr_fmt(fmt) "fprobe: " fmt +#include <linux/cleanup.h> #include <linux/err.h> #include <linux/fprobe.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> -#include <linux/rethook.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/rhashtable.h> #include <linux/slab.h> #include <linux/sort.h> +#include <asm/fprobe.h> + #include "trace.h" -struct fprobe_rethook_node { - struct rethook_node node; - unsigned long entry_ip; - unsigned long entry_parent_ip; - char data[]; +#define FPROBE_IP_HASH_BITS 8 +#define FPROBE_IP_TABLE_SIZE (1 << FPROBE_IP_HASH_BITS) + +#define FPROBE_HASH_BITS 6 +#define FPROBE_TABLE_SIZE (1 << FPROBE_HASH_BITS) + +#define SIZE_IN_LONG(x) ((x + sizeof(long) - 1) >> (sizeof(long) == 8 ? 3 : 2)) + +/* + * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still + * exists. The key is the address of fprobe instance. + * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe + * instance related to the function address. The key is the ftrace IP + * address. + * + * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp + * are set NULL and delete those from both hash tables (by hlist_del_rcu). + * After an RCU grace period, the fprobe_hlist itself will be released. + * + * fprobe_table and fprobe_ip_table can be accessed from either + * - Normal hlist traversal and RCU add/del under 'fprobe_mutex' is held. + * - RCU hlist traversal under disabling preempt + */ +static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE]; +static struct rhltable fprobe_ip_table; +static DEFINE_MUTEX(fprobe_mutex); +static struct fgraph_ops fprobe_graph_ops; + +static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed) +{ + return hash_ptr(*(unsigned long **)data, 32); +} + +static int fprobe_node_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + unsigned long key = *(unsigned long *)arg->key; + const struct fprobe_hlist_node *n = ptr; + + return n->addr != key; +} + +static u32 fprobe_node_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct fprobe_hlist_node *n = data; + + return hash_ptr((void *)n->addr, 32); +} + +static const struct rhashtable_params fprobe_rht_params = { + .head_offset = offsetof(struct fprobe_hlist_node, hlist), + .key_offset = offsetof(struct fprobe_hlist_node, addr), + .key_len = sizeof_field(struct fprobe_hlist_node, addr), + .hashfn = fprobe_node_hashfn, + .obj_hashfn = fprobe_node_obj_hashfn, + .obj_cmpfn = fprobe_node_cmp, + .automatic_shrinking = true, }; -static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) +/* Node insertion and deletion requires the fprobe_mutex */ +static int __insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp) { - struct fprobe_rethook_node *fpr; - struct rethook_node *rh = NULL; - struct fprobe *fp; - void *entry_data = NULL; - int ret = 0; + int ret; - fp = container_of(ops, struct fprobe, ops); + lockdep_assert_held(&fprobe_mutex); - if (fp->exit_handler) { - rh = rethook_try_get(fp->rethook); - if (!rh) { - fp->nmissed++; - return; - } - fpr = container_of(rh, struct fprobe_rethook_node, node); - fpr->entry_ip = ip; - fpr->entry_parent_ip = parent_ip; - if (fp->entry_data_size) - entry_data = fpr->data; - } + ret = rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params); + /* Set the fprobe pointer if insertion was successful. */ + if (!ret) + WRITE_ONCE(node->fp, fp); + return ret; +} - if (fp->entry_handler) - ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data); +static void __delete_fprobe_node(struct fprobe_hlist_node *node) +{ + lockdep_assert_held(&fprobe_mutex); - /* If entry_handler returns !0, nmissed is not counted. */ - if (rh) { - if (ret) - rethook_recycle(rh); - else - rethook_hook(rh, ftrace_get_regs(fregs), true); + /* Avoid double deleting and non-inserted nodes */ + if (READ_ONCE(node->fp) != NULL) { + WRITE_ONCE(node->fp, NULL); + rhltable_remove(&fprobe_ip_table, &node->hlist, + fprobe_rht_params); } } -static void fprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) +/* Check existence of the fprobe */ +static bool fprobe_registered(struct fprobe *fp) { - struct fprobe *fp; - int bit; + struct hlist_head *head; + struct fprobe_hlist *fph; + + head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)]; + hlist_for_each_entry_rcu(fph, head, hlist, + lockdep_is_held(&fprobe_mutex)) { + if (fph->fp == fp) + return true; + } + return false; +} +NOKPROBE_SYMBOL(fprobe_registered); - fp = container_of(ops, struct fprobe, ops); - if (fprobe_disabled(fp)) - return; +static int add_fprobe_hash(struct fprobe *fp) +{ + struct fprobe_hlist *fph = fp->hlist_array; + struct hlist_head *head; - /* recursion detection has to go before any traceable function and - * all functions before this point should be marked as notrace - */ - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) { - fp->nmissed++; - return; - } - __fprobe_handler(ip, parent_ip, ops, fregs); - ftrace_test_recursion_unlock(bit); + lockdep_assert_held(&fprobe_mutex); + + if (WARN_ON_ONCE(!fph)) + return -EINVAL; + + head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)]; + hlist_add_head_rcu(&fp->hlist_array->hlist, head); + return 0; +} + +static int del_fprobe_hash(struct fprobe *fp) +{ + struct fprobe_hlist *fph = fp->hlist_array; + + lockdep_assert_held(&fprobe_mutex); + + if (WARN_ON_ONCE(!fph)) + return -EINVAL; + + if (!fprobe_registered(fp)) + return -ENOENT; + + fph->fp = NULL; + hlist_del_rcu(&fph->hlist); + return 0; +} + +#ifdef ARCH_DEFINE_ENCODE_FPROBE_HEADER + +/* The arch should encode fprobe_header info into one unsigned long */ +#define FPROBE_HEADER_SIZE_IN_LONG 1 + +static inline bool write_fprobe_header(unsigned long *stack, + struct fprobe *fp, unsigned int size_words) +{ + if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD || + !arch_fprobe_header_encodable(fp))) + return false; + *stack = arch_encode_fprobe_header(fp, size_words); + return true; } -NOKPROBE_SYMBOL(fprobe_handler); -static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) +static inline void read_fprobe_header(unsigned long *stack, + struct fprobe **fp, unsigned int *size_words) { + *fp = arch_decode_fprobe_header_fp(*stack); + *size_words = arch_decode_fprobe_header_size(*stack); +} + +#else + +/* Generic fprobe_header */ +struct __fprobe_header { struct fprobe *fp; - int bit; + unsigned long size_words; +} __packed; - fp = container_of(ops, struct fprobe, ops); - if (fprobe_disabled(fp)) - return; +#define FPROBE_HEADER_SIZE_IN_LONG SIZE_IN_LONG(sizeof(struct __fprobe_header)) - /* recursion detection has to go before any traceable function and - * all functions called before this point should be marked as notrace - */ - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) { - fp->nmissed++; - return; - } +static inline bool write_fprobe_header(unsigned long *stack, + struct fprobe *fp, unsigned int size_words) +{ + struct __fprobe_header *fph = (struct __fprobe_header *)stack; + + if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD)) + return false; + + fph->fp = fp; + fph->size_words = size_words; + return true; +} +static inline void read_fprobe_header(unsigned long *stack, + struct fprobe **fp, unsigned int *size_words) +{ + struct __fprobe_header *fph = (struct __fprobe_header *)stack; + + *fp = fph->fp; + *size_words = fph->size_words; +} + +#endif + +/* + * fprobe shadow stack management: + * Since fprobe shares a single fgraph_ops, it needs to share the stack entry + * among the probes on the same function exit. Note that a new probe can be + * registered before a target function is returning, we can not use the hash + * table to find the corresponding probes. Thus the probe address is stored on + * the shadow stack with its entry data size. + * + */ +static inline int __fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct fprobe *fp, struct ftrace_regs *fregs, + void *data) +{ + if (!fp->entry_handler) + return 0; + + return fp->entry_handler(fp, ip, parent_ip, fregs, data); +} + +static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, + struct fprobe *fp, struct ftrace_regs *fregs, + void *data) +{ + int ret; /* * This user handler is shared with other kprobes and is not expected to be * called recursively. So if any other kprobe handler is running, this will @@ -108,44 +241,541 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, */ if (unlikely(kprobe_running())) { fp->nmissed++; - goto recursion_unlock; + return 0; } kprobe_busy_begin(); - __fprobe_handler(ip, parent_ip, ops, fregs); + ret = __fprobe_handler(ip, parent_ip, fp, fregs, data); kprobe_busy_end(); + return ret; +} -recursion_unlock: - ftrace_test_recursion_unlock(bit); +static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); +static void fprobe_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs); + +static struct fgraph_ops fprobe_graph_ops = { + .entryfunc = fprobe_fgraph_entry, + .retfunc = fprobe_return, +}; +/* Number of fgraph fprobe nodes */ +static int nr_fgraph_fprobes; +/* Is fprobe_graph_ops registered? */ +static bool fprobe_graph_registered; + +/* Add @addrs to the ftrace filter and register fgraph if needed. */ +static int fprobe_graph_add_ips(unsigned long *addrs, int num) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0); + if (ret) + return ret; + + if (!fprobe_graph_registered) { + ret = register_ftrace_graph(&fprobe_graph_ops); + if (WARN_ON_ONCE(ret)) { + ftrace_free_filter(&fprobe_graph_ops.ops); + return ret; + } + fprobe_graph_registered = true; + } + return 0; +} + +static void __fprobe_graph_unregister(void) +{ + if (fprobe_graph_registered) { + unregister_ftrace_graph(&fprobe_graph_ops); + ftrace_free_filter(&fprobe_graph_ops.ops); + fprobe_graph_registered = false; + } +} + +/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */ +static void fprobe_graph_remove_ips(unsigned long *addrs, int num) +{ + lockdep_assert_held(&fprobe_mutex); + + if (!nr_fgraph_fprobes) + __fprobe_graph_unregister(); + else if (num) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); } -static void fprobe_exit_handler(struct rethook_node *rh, void *data, - unsigned long ret_ip, struct pt_regs *regs) +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) || defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS) + +/* ftrace_ops callback, this processes fprobes which have only entry_handler. */ +static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) { - struct fprobe *fp = (struct fprobe *)data; - struct fprobe_rethook_node *fpr; + struct fprobe_hlist_node *node; + struct rhlist_head *head, *pos; + struct fprobe *fp; int bit; - if (!fp || fprobe_disabled(fp)) + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) return; - fpr = container_of(rh, struct fprobe_rethook_node, node); + /* + * ftrace_test_recursion_trylock() disables preemption, but + * rhltable_lookup() checks whether rcu_read_lcok is held. + * So we take rcu_read_lock() here. + */ + rcu_read_lock(); + head = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params); + + rhl_for_each_entry_rcu(node, pos, head, hlist) { + if (node->addr != ip) + break; + fp = READ_ONCE(node->fp); + if (unlikely(!fp || fprobe_disabled(fp) || fp->exit_handler)) + continue; + + if (fprobe_shared_with_kprobes(fp)) + __fprobe_kprobe_handler(ip, parent_ip, fp, fregs, NULL); + else + __fprobe_handler(ip, parent_ip, fp, fregs, NULL); + } + rcu_read_unlock(); + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(fprobe_ftrace_entry); + +static struct ftrace_ops fprobe_ftrace_ops = { + .func = fprobe_ftrace_entry, + .flags = FTRACE_OPS_FL_SAVE_ARGS, +}; +/* Number of ftrace fprobe nodes */ +static int nr_ftrace_fprobes; +/* Is fprobe_ftrace_ops registered? */ +static bool fprobe_ftrace_registered; + +static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 0, 0); + if (ret) + return ret; + + if (!fprobe_ftrace_registered) { + ret = register_ftrace_function(&fprobe_ftrace_ops); + if (ret) { + ftrace_free_filter(&fprobe_ftrace_ops); + return ret; + } + fprobe_ftrace_registered = true; + } + return 0; +} + +static void __fprobe_ftrace_unregister(void) +{ + if (fprobe_ftrace_registered) { + unregister_ftrace_function(&fprobe_ftrace_ops); + ftrace_free_filter(&fprobe_ftrace_ops); + fprobe_ftrace_registered = false; + } +} + +static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num) +{ + lockdep_assert_held(&fprobe_mutex); + + if (!nr_ftrace_fprobes) + __fprobe_ftrace_unregister(); + else if (num) + ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0); +} + +static bool fprobe_is_ftrace(struct fprobe *fp) +{ + return !fp->exit_handler; +} + +/* Node insertion and deletion requires the fprobe_mutex */ +static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = __insert_fprobe_node(node, fp); + if (!ret) { + if (fprobe_is_ftrace(fp)) + nr_ftrace_fprobes++; + else + nr_fgraph_fprobes++; + } + + return ret; +} + +static void delete_fprobe_node(struct fprobe_hlist_node *node) +{ + struct fprobe *fp; + + lockdep_assert_held(&fprobe_mutex); + + fp = READ_ONCE(node->fp); + if (fp) { + if (fprobe_is_ftrace(fp)) + nr_ftrace_fprobes--; + else + nr_fgraph_fprobes--; + } + __delete_fprobe_node(node); +} + +static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace) +{ + struct rhlist_head *head, *pos; + struct fprobe_hlist_node *node; + struct fprobe *fp; + + guard(rcu)(); + head = rhltable_lookup(&fprobe_ip_table, &ip, + fprobe_rht_params); + if (!head) + return false; + /* We have to check the same type on the list. */ + rhl_for_each_entry_rcu(node, pos, head, hlist) { + if (node->addr != ip) + break; + fp = READ_ONCE(node->fp); + if (likely(fp)) { + if ((!ftrace && fp->exit_handler) || + (ftrace && !fp->exit_handler)) + return true; + } + } + + return false; +} + +#ifdef CONFIG_MODULES +static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt) +{ + if (!nr_fgraph_fprobes) + __fprobe_graph_unregister(); + else if (cnt) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0); + + if (!nr_ftrace_fprobes) + __fprobe_ftrace_unregister(); + else if (cnt) + ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, 1, 0); +} +#endif +#else +static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) +{ + return -ENOENT; +} + +static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num) +{ +} + +static bool fprobe_is_ftrace(struct fprobe *fp) +{ + return false; +} + +/* Node insertion and deletion requires the fprobe_mutex */ +static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = __insert_fprobe_node(node, fp); + if (!ret) + nr_fgraph_fprobes++; + + return ret; +} + +static void delete_fprobe_node(struct fprobe_hlist_node *node) +{ + struct fprobe *fp; + + lockdep_assert_held(&fprobe_mutex); + + fp = READ_ONCE(node->fp); + if (fp) + nr_fgraph_fprobes--; + __delete_fprobe_node(node); +} + +static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace __maybe_unused) +{ + struct rhlist_head *head, *pos; + struct fprobe_hlist_node *node; + struct fprobe *fp; + + guard(rcu)(); + head = rhltable_lookup(&fprobe_ip_table, &ip, + fprobe_rht_params); + if (!head) + return false; + /* We only need to check fp is there. */ + rhl_for_each_entry_rcu(node, pos, head, hlist) { + if (node->addr != ip) + break; + fp = READ_ONCE(node->fp); + if (likely(fp)) + return true; + } + + return false; +} + +#ifdef CONFIG_MODULES +static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt) +{ + if (!nr_fgraph_fprobes) + __fprobe_graph_unregister(); + else if (cnt) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0); +} +#endif +#endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */ + +/* fgraph_ops callback, this processes fprobes which have exit_handler. */ +static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + unsigned long *fgraph_data = NULL; + unsigned long func = trace->func; + struct fprobe_hlist_node *node; + struct rhlist_head *head, *pos; + unsigned long ret_ip; + int reserved_words; + struct fprobe *fp; + int used, ret; + + if (WARN_ON_ONCE(!fregs)) + return 0; + + guard(rcu)(); + head = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params); + reserved_words = 0; + rhl_for_each_entry_rcu(node, pos, head, hlist) { + if (node->addr != func) + continue; + fp = READ_ONCE(node->fp); + if (!fp || !fp->exit_handler) + continue; + /* + * Since fprobe can be enabled until the next loop, we ignore the + * fprobe's disabled flag in this loop. + */ + reserved_words += + FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size); + } + if (reserved_words) { + fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long)); + if (unlikely(!fgraph_data)) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { + if (node->addr != func) + continue; + fp = READ_ONCE(node->fp); + if (fp && !fprobe_disabled(fp) && !fprobe_is_ftrace(fp)) + fp->nmissed++; + } + return 0; + } + } /* - * we need to assure no calls to traceable functions in-between the - * end of fprobe_handler and the beginning of fprobe_exit_handler. + * TODO: recursion detection has been done in the fgraph. Thus we need + * to add a callback to increment missed counter. */ - bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip); - if (bit < 0) { - fp->nmissed++; + ret_ip = ftrace_regs_get_return_address(fregs); + used = 0; + rhl_for_each_entry_rcu(node, pos, head, hlist) { + int data_size; + void *data; + + if (node->addr != func) + continue; + fp = READ_ONCE(node->fp); + if (unlikely(!fp || fprobe_disabled(fp) || fprobe_is_ftrace(fp))) + continue; + + data_size = fp->entry_data_size; + if (data_size && fp->exit_handler) + data = fgraph_data + used + FPROBE_HEADER_SIZE_IN_LONG; + else + data = NULL; + + if (fprobe_shared_with_kprobes(fp)) + ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data); + else + ret = __fprobe_handler(func, ret_ip, fp, fregs, data); + + /* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */ + if (!ret && fp->exit_handler) { + int size_words = SIZE_IN_LONG(data_size); + + if (write_fprobe_header(&fgraph_data[used], fp, size_words)) + used += FPROBE_HEADER_SIZE_IN_LONG + size_words; + } + } + + /* If any exit_handler is set, data must be used. */ + return used != 0; +} +NOKPROBE_SYMBOL(fprobe_fgraph_entry); + +static void fprobe_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + unsigned long *fgraph_data = NULL; + unsigned long ret_ip; + struct fprobe *fp; + int size, curr; + int size_words; + + fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size); + if (WARN_ON_ONCE(!fgraph_data)) return; + size_words = SIZE_IN_LONG(size); + ret_ip = ftrace_regs_get_instruction_pointer(fregs); + + preempt_disable_notrace(); + + curr = 0; + while (size_words > curr) { + read_fprobe_header(&fgraph_data[curr], &fp, &size); + if (!fp) + break; + curr += FPROBE_HEADER_SIZE_IN_LONG; + if (fprobe_registered(fp) && !fprobe_disabled(fp)) { + if (WARN_ON_ONCE(curr + size > size_words)) + break; + fp->exit_handler(fp, trace->func, ret_ip, fregs, + size ? fgraph_data + curr : NULL); + } + curr += size; } + preempt_enable_notrace(); +} +NOKPROBE_SYMBOL(fprobe_return); - fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs, - fp->entry_data_size ? (void *)fpr->data : NULL); - ftrace_test_recursion_unlock(bit); +#ifdef CONFIG_MODULES + +#define FPROBE_IPS_BATCH_INIT 128 +/* instruction pointer address list */ +struct fprobe_addr_list { + int index; + int size; + unsigned long *addrs; +}; + +static int fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node, + struct fprobe_addr_list *alist) +{ + lockdep_assert_in_rcu_read_lock(); + + if (!within_module(node->addr, mod)) + return 0; + + delete_fprobe_node(node); + /* If no address list is available, we can't track this address. */ + if (!alist->addrs) + return 0; + /* + * Don't care the type here, because all fprobes on the same + * address must be removed eventually. + */ + if (!rhltable_lookup(&fprobe_ip_table, &node->addr, fprobe_rht_params)) { + alist->addrs[alist->index++] = node->addr; + if (alist->index == alist->size) + return -ENOSPC; + } + + return 0; } -NOKPROBE_SYMBOL(fprobe_exit_handler); + +/* Handle module unloading to manage fprobe_ip_table. */ +static int fprobe_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT}; + struct fprobe_hlist_node *node; + struct rhashtable_iter iter; + struct module *mod = data; + bool retry; + + if (val != MODULE_STATE_GOING) + return NOTIFY_DONE; + + alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL); + /* + * If failed to alloc memory, ftrace_ops will not be able to remove ips from + * hash, but we can still remove nodes from fprobe_ip_table, so we can avoid + * the potential wrong callback. So just print a warning here and try to + * continue without address list. + */ + WARN_ONCE(!alist.addrs, + "Failed to allocate memory for fprobe_addr_list, ftrace_ops will not be updated"); + + mutex_lock(&fprobe_mutex); +again: + retry = false; + alist.index = 0; + rhltable_walk_enter(&fprobe_ip_table, &iter); + do { + rhashtable_walk_start(&iter); + + while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node)) + if (fprobe_remove_node_in_module(mod, node, &alist) < 0) { + retry = true; + break; + } + + rhashtable_walk_stop(&iter); + } while (node == ERR_PTR(-EAGAIN) && !retry); + rhashtable_walk_exit(&iter); + /* Remove any ips from hash table(s) */ + fprobe_remove_ips(alist.addrs, alist.index); + /* + * If we break rhashtable walk loop except for -EAGAIN, we need + * to restart looping from start for safety. Anyway, this is + * not a hotpath. + */ + if (retry) + goto again; + + mutex_unlock(&fprobe_mutex); + + kfree(alist.addrs); + + return NOTIFY_DONE; +} + +static struct notifier_block fprobe_module_nb = { + .notifier_call = fprobe_module_callback, + .priority = 0, +}; + +static int __init init_fprobe_module(void) +{ + return register_module_notifier(&fprobe_module_nb); +} +early_initcall(init_fprobe_module); +#endif static int symbols_cmp(const void *a, const void *b) { @@ -175,51 +805,117 @@ static unsigned long *get_ftrace_locations(const char **syms, int num) return ERR_PTR(-ENOENT); } -static void fprobe_init(struct fprobe *fp) +struct filter_match_data { + const char *filter; + const char *notfilter; + size_t index; + size_t size; + unsigned long *addrs; + struct module **mods; +}; + +static int filter_match_callback(void *data, const char *name, unsigned long addr) { - fp->nmissed = 0; - if (fprobe_shared_with_kprobes(fp)) - fp->ops.func = fprobe_kprobe_handler; - else - fp->ops.func = fprobe_handler; - fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS; + struct filter_match_data *match = data; + + if (!glob_match(match->filter, name) || + (match->notfilter && glob_match(match->notfilter, name))) + return 0; + + if (!ftrace_location(addr)) + return 0; + + if (match->addrs) { + struct module *mod = __module_text_address(addr); + + if (mod && !try_module_get(mod)) + return 0; + + match->mods[match->index] = mod; + match->addrs[match->index] = addr; + } + match->index++; + return match->index == match->size; } -static int fprobe_init_rethook(struct fprobe *fp, int num) +/* + * Make IP list from the filter/no-filter glob patterns. + * Return the number of matched symbols, or errno. + * If @addrs == NULL, this just counts the number of matched symbols. If @addrs + * is passed with an array, we need to pass the an @mods array of the same size + * to increment the module refcount for each symbol. + * This means we also need to call `module_put` for each element of @mods after + * using the @addrs. + */ +static int get_ips_from_filter(const char *filter, const char *notfilter, + unsigned long *addrs, struct module **mods, + size_t size) { - int size; + struct filter_match_data match = { .filter = filter, .notfilter = notfilter, + .index = 0, .size = size, .addrs = addrs, .mods = mods}; + int ret; - if (!fp->exit_handler) { - fp->rethook = NULL; - return 0; + if (addrs && !mods) + return -EINVAL; + + ret = kallsyms_on_each_symbol(filter_match_callback, &match); + if (ret < 0) + return ret; + if (IS_ENABLED(CONFIG_MODULES)) { + ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match); + if (ret < 0) + return ret; } - /* Initialize rethook if needed */ - if (fp->nr_maxactive) - num = fp->nr_maxactive; - else - num *= num_possible_cpus() * 2; - if (num <= 0) + return match.index ?: -ENOENT; +} + +static void fprobe_fail_cleanup(struct fprobe *fp) +{ + kfree(fp->hlist_array); + fp->hlist_array = NULL; +} + +/* Initialize the fprobe data structure. */ +static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) +{ + struct fprobe_hlist *hlist_array; + unsigned long addr; + int size, i; + + if (!fp || !addrs || num <= 0) return -EINVAL; - size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size; + size = ALIGN(fp->entry_data_size, sizeof(long)); + if (size > MAX_FPROBE_DATA_SIZE) + return -E2BIG; + fp->entry_data_size = size; - /* Initialize rethook */ - fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num); - if (IS_ERR(fp->rethook)) - return PTR_ERR(fp->rethook); + hlist_array = kzalloc_flex(*hlist_array, array, num); + if (!hlist_array) + return -ENOMEM; + fp->nmissed = 0; + + hlist_array->size = num; + fp->hlist_array = hlist_array; + hlist_array->fp = fp; + for (i = 0; i < num; i++) { + addr = ftrace_location(addrs[i]); + if (!addr) { + fprobe_fail_cleanup(fp); + return -ENOENT; + } + hlist_array->array[i].addr = addr; + } return 0; } -static void fprobe_fail_cleanup(struct fprobe *fp) +#define FPROBE_IPS_MAX INT_MAX + +int fprobe_count_ips_from_filter(const char *filter, const char *notfilter) { - if (!IS_ERR_OR_NULL(fp->rethook)) { - /* Don't need to cleanup rethook->handler because this is not used. */ - rethook_free(fp->rethook); - fp->rethook = NULL; - } - ftrace_free_filter(&fp->ops); + return get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX); } /** @@ -235,54 +931,45 @@ static void fprobe_fail_cleanup(struct fprobe *fp) */ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) { - struct ftrace_hash *hash; - unsigned char *str; - int ret, len; + unsigned long *addrs __free(kfree) = NULL; + struct module **mods __free(kfree) = NULL; + int ret, num; if (!fp || !filter) return -EINVAL; - fprobe_init(fp); + num = get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX); + if (num < 0) + return num; - len = strlen(filter); - str = kstrdup(filter, GFP_KERNEL); - ret = ftrace_set_filter(&fp->ops, str, len, 0); - kfree(str); - if (ret) - return ret; + addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return -ENOMEM; - if (notfilter) { - len = strlen(notfilter); - str = kstrdup(notfilter, GFP_KERNEL); - ret = ftrace_set_notrace(&fp->ops, str, len, 0); - kfree(str); - if (ret) - goto out; - } + mods = kzalloc_objs(*mods, num); + if (!mods) + return -ENOMEM; - /* TODO: - * correctly calculate the total number of filtered symbols - * from both filter and notfilter. - */ - hash = rcu_access_pointer(fp->ops.local_hash.filter_hash); - if (WARN_ON_ONCE(!hash)) - goto out; + ret = get_ips_from_filter(filter, notfilter, addrs, mods, num); + if (ret < 0) + return ret; - ret = fprobe_init_rethook(fp, (int)hash->count); - if (!ret) - ret = register_ftrace_function(&fp->ops); + ret = register_fprobe_ips(fp, addrs, ret); -out: - if (ret) - fprobe_fail_cleanup(fp); + for (int i = 0; i < num; i++) { + if (mods[i]) + module_put(mods[i]); + } return ret; } EXPORT_SYMBOL_GPL(register_fprobe); +static int unregister_fprobe_nolock(struct fprobe *fp); + /** * register_fprobe_ips() - Register fprobe to ftrace by address. * @fp: A fprobe data structure to be registered. - * @addrs: An array of target ftrace location addresses. + * @addrs: An array of target function address. * @num: The number of entries of @addrs. * * Register @fp to ftrace for enabling the probe on the address given by @addrs. @@ -294,23 +981,37 @@ EXPORT_SYMBOL_GPL(register_fprobe); */ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) { - int ret; - - if (!fp || !addrs || num <= 0) - return -EINVAL; + struct fprobe_hlist *hlist_array; + int ret, i; - fprobe_init(fp); + guard(mutex)(&fprobe_mutex); + if (fprobe_registered(fp)) + return -EEXIST; - ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0); + ret = fprobe_init(fp, addrs, num); if (ret) return ret; - ret = fprobe_init_rethook(fp, num); - if (!ret) - ret = register_ftrace_function(&fp->ops); - - if (ret) + if (fprobe_is_ftrace(fp)) + ret = fprobe_ftrace_add_ips(addrs, num); + else + ret = fprobe_graph_add_ips(addrs, num); + if (ret) { fprobe_fail_cleanup(fp); + return ret; + } + + hlist_array = fp->hlist_array; + ret = add_fprobe_hash(fp); + for (i = 0; i < hlist_array->size && !ret; i++) + ret = insert_fprobe_node(&hlist_array->array[i], fp); + + if (ret) { + unregister_fprobe_nolock(fp); + /* In error case, wait for clean up safely. */ + synchronize_rcu(); + } + return ret; } EXPORT_SYMBOL_GPL(register_fprobe_ips); @@ -348,39 +1049,89 @@ EXPORT_SYMBOL_GPL(register_fprobe_syms); bool fprobe_is_registered(struct fprobe *fp) { - if (!fp || (fp->ops.saved_func != fprobe_handler && - fp->ops.saved_func != fprobe_kprobe_handler)) + if (!fp || !fp->hlist_array) return false; return true; } +static int unregister_fprobe_nolock(struct fprobe *fp) +{ + struct fprobe_hlist *hlist_array = fp->hlist_array; + unsigned long *addrs = NULL; + int i, count; + + addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL); + /* + * This will remove fprobe_hash_node from the hash table even if + * memory allocation fails. However, ftrace_ops will not be updated. + * Anyway, when the last fprobe is unregistered, ftrace_ops is also + * unregistered. + */ + if (!addrs) + pr_warn("Failed to allocate working array. ftrace_ops may not sync.\n"); + + /* Remove non-synonim ips from table and hash */ + count = 0; + for (i = 0; i < hlist_array->size; i++) { + delete_fprobe_node(&hlist_array->array[i]); + if (addrs && !fprobe_exists_on_hash(hlist_array->array[i].addr, + fprobe_is_ftrace(fp))) + addrs[count++] = hlist_array->array[i].addr; + } + del_fprobe_hash(fp); + + if (fprobe_is_ftrace(fp)) + fprobe_ftrace_remove_ips(addrs, count); + else + fprobe_graph_remove_ips(addrs, count); + + kfree_rcu(hlist_array, rcu); + fp->hlist_array = NULL; + kfree(addrs); + + return 0; +} + /** - * unregister_fprobe() - Unregister fprobe from ftrace + * unregister_fprobe_async() - Unregister fprobe without RCU GP wait * @fp: A fprobe data structure to be unregistered. * * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will NOT wait until the fprobe is no longer used. * * Return 0 if @fp is unregistered successfully, -errno if not. */ -int unregister_fprobe(struct fprobe *fp) +int unregister_fprobe_async(struct fprobe *fp) { - int ret; - - if (!fprobe_is_registered(fp)) + guard(mutex)(&fprobe_mutex); + if (!fp || !fprobe_registered(fp)) return -EINVAL; - if (!IS_ERR_OR_NULL(fp->rethook)) - rethook_stop(fp->rethook); - - ret = unregister_ftrace_function(&fp->ops); - if (ret < 0) - return ret; - - if (!IS_ERR_OR_NULL(fp->rethook)) - rethook_free(fp->rethook); + return unregister_fprobe_nolock(fp); +} - ftrace_free_filter(&fp->ops); +/** + * unregister_fprobe() - Unregister fprobe with RCU GP wait + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will block until the fprobe is no longer used. + * + * Return 0 if @fp is unregistered successfully, -errno if not. + */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret = unregister_fprobe_async(fp); + if (!ret) + synchronize_rcu(); return ret; } EXPORT_SYMBOL_GPL(unregister_fprobe); + +static int __init fprobe_initcall(void) +{ + rhltable_init(&fprobe_ip_table, &fprobe_rht_params); + return 0; +} +core_initcall(fprobe_initcall); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2e113f8b13a2..b2611de3f594 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -68,7 +68,6 @@ }) /* hash bits for specific function selection */ -#define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 #ifdef CONFIG_DYNAMIC_FTRACE @@ -188,7 +187,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, fregs); } -static void ftrace_sync_ipi(void *data) +void ftrace_sync_ipi(void *data) { /* Probably not needed, but do it anyway */ smp_rmb(); @@ -534,51 +533,69 @@ static int function_stat_headers(struct seq_file *m) static int function_stat_show(struct seq_file *m, void *v) { + struct trace_array *tr = trace_get_global_array(); struct ftrace_profile *rec = v; + const char *refsymbol = NULL; char str[KSYM_SYMBOL_LEN]; - int ret = 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER static struct trace_seq s; unsigned long long avg; unsigned long long stddev; + unsigned long long stddev_denom; #endif - mutex_lock(&ftrace_profile_lock); + guard(mutex)(&ftrace_profile_lock); /* we raced with function_profile_reset() */ - if (unlikely(rec->counter == 0)) { - ret = -EBUSY; - goto out; - } + if (unlikely(rec->counter == 0)) + return -EBUSY; #ifdef CONFIG_FUNCTION_GRAPH_TRACER avg = div64_ul(rec->time, rec->counter); if (tracing_thresh && (avg < tracing_thresh)) - goto out; + return 0; #endif - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + if (tr->trace_flags & TRACE_ITER(PROF_TEXT_OFFSET)) { + unsigned long offset; + + if (core_kernel_text(rec->ip)) { + refsymbol = "_text"; + offset = rec->ip - (unsigned long)_text; + } else { + struct module *mod; + + guard(rcu)(); + mod = __module_text_address(rec->ip); + if (mod) { + refsymbol = mod->name; + /* Calculate offset from module's text entry address. */ + offset = rec->ip - (unsigned long)mod->mem[MOD_TEXT].base; + } + } + if (refsymbol) + snprintf(str, sizeof(str), " %s+%#lx", refsymbol, offset); + } + if (!refsymbol) + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - /* Sample standard deviation (s^2) */ - if (rec->counter <= 1) - stddev = 0; - else { - /* - * Apply Welford's method: - * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) - */ + /* + * Variance formula: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + * Maybe Welford's method is better here? + * Divide only by 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide by 1000 again. + */ + stddev = 0; + stddev_denom = rec->counter * (rec->counter - 1) * 1000; + if (stddev_denom) { stddev = rec->counter * rec->time_squared - rec->time * rec->time; - - /* - * Divide only 1000 for ns^2 -> us^2 conversion. - * trace_print_graph_duration will divide 1000 again. - */ - stddev = div64_ul(stddev, - rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, stddev_denom); } trace_seq_init(&s); @@ -590,10 +607,8 @@ static int function_stat_show(struct seq_file *m, void *v) trace_print_seq(m, &s); #endif seq_putc(m, '\n'); -out: - mutex_unlock(&ftrace_profile_lock); - return ret; + return 0; } static void ftrace_profile_reset(struct ftrace_profile_stat *stat) @@ -687,7 +702,7 @@ static int ftrace_profile_init_cpu(int cpu) */ size = FTRACE_PROFILE_HASH_SIZE; - stat->hash = kcalloc(size, sizeof(struct hlist_head), GFP_KERNEL); + stat->hash = kzalloc_objs(struct hlist_head, size); if (!stat->hash) return -ENOMEM; @@ -789,27 +804,24 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; - unsigned long flags; if (!ftrace_profile_enabled) return; - local_irq_save(flags); + guard(preempt_notrace)(); stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) - goto out; + return; rec = ftrace_find_profiled_func(stat, ip); if (!rec) { rec = ftrace_profile_alloc(stat, ip); if (!rec) - goto out; + return; } rec->counter++; - out: - local_irq_restore(flags); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -827,7 +839,8 @@ struct profile_fgraph_data { }; static int profile_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct profile_fgraph_data *profile_data; @@ -848,31 +861,34 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace, return 1; } +bool fprofile_no_sleep_time; + static void profile_graph_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct profile_fgraph_data *profile_data; struct ftrace_profile_stat *stat; unsigned long long calltime; unsigned long long rettime = trace_clock_local(); struct ftrace_profile *rec; - unsigned long flags; int size; - local_irq_save(flags); + guard(preempt_notrace)(); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) - goto out; + return; profile_data = fgraph_retrieve_data(gops->idx, &size); /* If the calltime was zero'd ignore it */ if (!profile_data || !profile_data->calltime) - goto out; + return; calltime = rettime - profile_data->calltime; - if (!fgraph_sleep_time) { + if (fprofile_no_sleep_time) { if (current->ftrace_sleeptime) calltime -= current->ftrace_sleeptime - profile_data->sleeptime; } @@ -896,9 +912,6 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, rec->time += calltime; rec->time_squared += calltime * calltime; } - - out: - local_irq_restore(flags); } static struct fgraph_ops fprofiler_ops = { @@ -946,20 +959,16 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, val = !!val; - mutex_lock(&ftrace_profile_lock); + guard(mutex)(&ftrace_profile_lock); if (ftrace_profile_enabled ^ val) { if (val) { ret = ftrace_profile_init(); - if (ret < 0) { - cnt = ret; - goto out; - } + if (ret < 0) + return ret; ret = register_ftrace_profiler(); - if (ret < 0) { - cnt = ret; - goto out; - } + if (ret < 0) + return ret; ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; @@ -970,8 +979,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, unregister_ftrace_profiler(); } } - out: - mutex_unlock(&ftrace_profile_lock); *ppos += cnt; @@ -1060,10 +1067,6 @@ static struct ftrace_ops *removed_ops; */ static bool update_all_ops; -#ifndef CONFIG_FTRACE_MCOUNT_RECORD -# error Dynamic ftrace depends on MCOUNT_RECORD -#endif - struct ftrace_func_probe { struct ftrace_probe_ops *probe_ops; struct ftrace_ops ops; @@ -1144,7 +1147,7 @@ struct ftrace_page { }; #define ENTRY_SIZE sizeof(struct dyn_ftrace) -#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) +#define ENTRIES_PER_PAGE_GROUP(order) ((PAGE_SIZE << (order)) / ENTRY_SIZE) static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; @@ -1207,21 +1210,28 @@ static void __add_hash_entry(struct ftrace_hash *hash, hash->count++; } -static struct ftrace_func_entry * -add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +struct ftrace_func_entry * +add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct) { struct ftrace_func_entry *entry; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); + entry = kmalloc_obj(*entry); if (!entry) return NULL; entry->ip = ip; + entry->direct = direct; __add_hash_entry(hash, entry); return entry; } +static struct ftrace_func_entry * +add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ + return add_ftrace_hash_entry_direct(hash, ip, 0); +} + static void free_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) @@ -1280,7 +1290,7 @@ static void clear_ftrace_mod_list(struct list_head *head) mutex_unlock(&ftrace_lock); } -static void free_ftrace_hash(struct ftrace_hash *hash) +void free_ftrace_hash(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) return; @@ -1311,22 +1321,26 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) void ftrace_free_filter(struct ftrace_ops *ops) { ftrace_ops_init(ops); + if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) + return; free_ftrace_hash(ops->func_hash->filter_hash); free_ftrace_hash(ops->func_hash->notrace_hash); + ops->func_hash->filter_hash = EMPTY_HASH; + ops->func_hash->notrace_hash = EMPTY_HASH; } EXPORT_SYMBOL_GPL(ftrace_free_filter); -static struct ftrace_hash *alloc_ftrace_hash(int size_bits) +struct ftrace_hash *alloc_ftrace_hash(int size_bits) { struct ftrace_hash *hash; int size; - hash = kzalloc(sizeof(*hash), GFP_KERNEL); + hash = kzalloc_obj(*hash); if (!hash) return NULL; size = 1 << size_bits; - hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); + hash->buckets = kzalloc_objs(*hash->buckets, size); if (!hash->buckets) { kfree(hash); @@ -1346,7 +1360,7 @@ static int ftrace_add_mod(struct trace_array *tr, struct ftrace_mod_load *ftrace_mod; struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace; - ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL); + ftrace_mod = kzalloc_obj(*ftrace_mod); if (!ftrace_mod) return -ENOMEM; @@ -1390,7 +1404,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - if (add_hash_entry(new_hash, entry->ip) == NULL) + if (add_ftrace_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL) goto free_hash; } } @@ -1671,14 +1685,12 @@ unsigned long ftrace_location(unsigned long ip) loc = ftrace_location_range(ip, ip); if (!loc) { if (!kallsyms_lookup_size_offset(ip, &size, &offset)) - goto out; + return 0; /* map sym+0 to __fentry__ */ if (!offset) loc = ftrace_location_range(ip, ip + size - 1); } - -out: return loc; } @@ -1991,7 +2003,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops) */ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_hash *old_hash, - struct ftrace_hash *new_hash) + struct ftrace_hash *new_hash, + bool update_target) { struct ftrace_page *pg; struct dyn_ftrace *rec, *end = NULL; @@ -2026,10 +2039,13 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (rec->flags & FTRACE_FL_DISABLED) continue; - /* We need to update only differences of filter_hash */ + /* + * Unless we are updating the target of a direct function, + * we only need to update differences of filter_hash + */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); - if (in_old == in_new) + if (!update_target && (in_old == in_new)) continue; if (in_new) { @@ -2040,7 +2056,16 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (is_ipmodify) goto rollback; - FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT); + /* + * If this is called by __modify_ftrace_direct() + * then it is only changing where the direct + * pointer is jumping to, and the record already + * points to a direct trampoline. If it isn't, + * then it is a bug to update ipmodify on a direct + * caller. + */ + FTRACE_WARN_ON(!update_target && + (rec->flags & FTRACE_FL_DIRECT)); /* * Another ops with IPMODIFY is already @@ -2050,7 +2075,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, */ if (!ops->ops_func) return -EBUSY; - ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); + ret = ops->ops_func(ops, rec->ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); if (ret) return ret; } else if (is_ipmodify) { @@ -2073,7 +2098,7 @@ rollback: continue; if (rec == end) - goto err_out; + return -EBUSY; in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); @@ -2086,7 +2111,6 @@ rollback: rec->flags |= FTRACE_FL_IPMODIFY; } while_for_each_ftrace_rec(); -err_out: return -EBUSY; } @@ -2097,7 +2121,7 @@ static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); + return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash, false); } /* Disabling always succeeds */ @@ -2108,7 +2132,7 @@ static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH, false); } static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, @@ -2122,7 +2146,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, if (ftrace_hash_empty(new_hash)) new_hash = NULL; - return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash, false); } static void print_ip_ins(const char *fmt, const unsigned char *p) @@ -2607,8 +2631,13 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { - unsigned long addr = READ_ONCE(ops->direct_call); + unsigned long addr; +#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS + addr = ftrace_find_rec_direct(ip); +#else + addr = READ_ONCE(ops->direct_call); +#endif if (!addr) return; @@ -3238,15 +3267,22 @@ static struct ftrace_hash *copy_hash(struct ftrace_hash *src) * The filter_hash updates uses just the append_hash() function * and the notrace_hash does not. */ -static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash) +static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash, + int size_bits) { struct ftrace_func_entry *entry; int size; int i; - /* An empty hash does everything */ - if (ftrace_hash_empty(*hash)) - return 0; + if (*hash) { + /* An empty hash does everything */ + if (ftrace_hash_empty(*hash)) + return 0; + } else { + *hash = alloc_ftrace_hash(size_bits); + if (!*hash) + return -ENOMEM; + } /* If new_hash has everything make hash have everything */ if (ftrace_hash_empty(new_hash)) { @@ -3268,6 +3304,31 @@ static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash) } /* + * Remove functions from @hash that are in @notrace_hash + */ +static void remove_hash(struct ftrace_hash *hash, struct ftrace_hash *notrace_hash) +{ + struct ftrace_func_entry *entry; + struct hlist_node *tmp; + int size; + int i; + + /* If the notrace hash is empty, there's nothing to do */ + if (ftrace_hash_empty(notrace_hash)) + return; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { + if (!__ftrace_lookup_ip(notrace_hash, entry->ip)) + continue; + remove_hash_entry(hash, entry); + kfree(entry); + } + } +} + +/* * Add to @hash only those that are in both @new_hash1 and @new_hash2 * * The notrace_hash updates uses just the intersect_hash() function @@ -3307,64 +3368,6 @@ static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_has return 0; } -/* Return a new hash that has a union of all @ops->filter_hash entries */ -static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) -{ - struct ftrace_hash *new_hash; - struct ftrace_ops *subops; - int ret; - - new_hash = alloc_ftrace_hash(ops->func_hash->filter_hash->size_bits); - if (!new_hash) - return NULL; - - list_for_each_entry(subops, &ops->subop_list, list) { - ret = append_hash(&new_hash, subops->func_hash->filter_hash); - if (ret < 0) { - free_ftrace_hash(new_hash); - return NULL; - } - /* Nothing more to do if new_hash is empty */ - if (ftrace_hash_empty(new_hash)) - break; - } - return new_hash; -} - -/* Make @ops trace evenything except what all its subops do not trace */ -static struct ftrace_hash *intersect_hashes(struct ftrace_ops *ops) -{ - struct ftrace_hash *new_hash = NULL; - struct ftrace_ops *subops; - int size_bits; - int ret; - - list_for_each_entry(subops, &ops->subop_list, list) { - struct ftrace_hash *next_hash; - - if (!new_hash) { - size_bits = subops->func_hash->notrace_hash->size_bits; - new_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->notrace_hash); - if (!new_hash) - return NULL; - continue; - } - size_bits = new_hash->size_bits; - next_hash = new_hash; - new_hash = alloc_ftrace_hash(size_bits); - ret = intersect_hash(&new_hash, next_hash, subops->func_hash->notrace_hash); - free_ftrace_hash(next_hash); - if (ret < 0) { - free_ftrace_hash(new_hash); - return NULL; - } - /* Nothing more to do if new_hash is empty */ - if (ftrace_hash_empty(new_hash)) - break; - } - return new_hash; -} - static bool ops_equal(struct ftrace_hash *A, struct ftrace_hash *B) { struct ftrace_func_entry *entry; @@ -3436,6 +3439,95 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_ return 0; } +static int add_first_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops_hash *func_hash) +{ + /* If the filter hash is not empty, simply remove the nohash from it */ + if (!ftrace_hash_empty(func_hash->filter_hash)) { + *filter_hash = copy_hash(func_hash->filter_hash); + if (!*filter_hash) + return -ENOMEM; + remove_hash(*filter_hash, func_hash->notrace_hash); + *notrace_hash = EMPTY_HASH; + + } else { + *notrace_hash = copy_hash(func_hash->notrace_hash); + if (!*notrace_hash) + return -ENOMEM; + *filter_hash = EMPTY_HASH; + } + return 0; +} + +static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops_hash *ops_hash, struct ftrace_ops_hash *subops_hash) +{ + int size_bits; + int ret; + + /* If the subops trace all functions so must the main ops */ + if (ftrace_hash_empty(ops_hash->filter_hash) || + ftrace_hash_empty(subops_hash->filter_hash)) { + *filter_hash = EMPTY_HASH; + } else { + /* + * The main ops filter hash is not empty, so its + * notrace_hash had better be, as the notrace hash + * is only used for empty main filter hashes. + */ + WARN_ON_ONCE(!ftrace_hash_empty(ops_hash->notrace_hash)); + + size_bits = max(ops_hash->filter_hash->size_bits, + subops_hash->filter_hash->size_bits); + + /* Copy the subops hash */ + *filter_hash = alloc_and_copy_ftrace_hash(size_bits, subops_hash->filter_hash); + if (!*filter_hash) + return -ENOMEM; + /* Remove any notrace functions from the copy */ + remove_hash(*filter_hash, subops_hash->notrace_hash); + + ret = append_hash(filter_hash, ops_hash->filter_hash, + size_bits); + if (ret < 0) { + free_ftrace_hash(*filter_hash); + *filter_hash = EMPTY_HASH; + return ret; + } + } + + /* + * Only process notrace hashes if the main filter hash is empty + * (tracing all functions), otherwise the filter hash will just + * remove the notrace hash functions, and the notrace hash is + * not needed. + */ + if (ftrace_hash_empty(*filter_hash)) { + /* + * Intersect the notrace functions. That is, if two + * subops are not tracing a set of functions, the + * main ops will only not trace the functions that are + * in both subops, but has to trace the functions that + * are only notrace in one of the subops, for the other + * subops to be able to trace them. + */ + size_bits = max(ops_hash->notrace_hash->size_bits, + subops_hash->notrace_hash->size_bits); + *notrace_hash = alloc_ftrace_hash(size_bits); + if (!*notrace_hash) + return -ENOMEM; + + ret = intersect_hash(notrace_hash, ops_hash->notrace_hash, + subops_hash->notrace_hash); + if (ret < 0) { + free_ftrace_hash(*notrace_hash); + *notrace_hash = EMPTY_HASH; + return ret; + } + } + return 0; +} + /** * ftrace_startup_subops - enable tracing for subops of an ops * @ops: Manager ops (used to pick all the functions of its subops) @@ -3448,11 +3540,10 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_ */ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; struct ftrace_hash *save_filter_hash; struct ftrace_hash *save_notrace_hash; - int size_bits; int ret; if (unlikely(ftrace_disabled)) @@ -3476,14 +3567,14 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int /* For the first subops to ops just enable it normally */ if (list_empty(&ops->subop_list)) { - /* Just use the subops hashes */ - filter_hash = copy_hash(subops->func_hash->filter_hash); - notrace_hash = copy_hash(subops->func_hash->notrace_hash); - if (!filter_hash || !notrace_hash) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - return -ENOMEM; - } + + /* The ops was empty, should have empty hashes */ + WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->filter_hash)); + WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->notrace_hash)); + + ret = add_first_hash(&filter_hash, ¬race_hash, subops->func_hash); + if (ret < 0) + return ret; save_filter_hash = ops->func_hash->filter_hash; save_notrace_hash = ops->func_hash->notrace_hash; @@ -3509,47 +3600,16 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int /* * Here there's already something attached. Here are the rules: - * o If either filter_hash is empty then the final stays empty - * o Otherwise, the final is a superset of both hashes - * o If either notrace_hash is empty then the final stays empty - * o Otherwise, the final is an intersection between the hashes + * If the new subops and main ops filter hashes are not empty: + * o Make a copy of the subops filter hash + * o Remove all functions in the nohash from it. + * o Add in the main hash filter functions + * o Remove any of these functions from the main notrace hash */ - if (ftrace_hash_empty(ops->func_hash->filter_hash) || - ftrace_hash_empty(subops->func_hash->filter_hash)) { - filter_hash = EMPTY_HASH; - } else { - size_bits = max(ops->func_hash->filter_hash->size_bits, - subops->func_hash->filter_hash->size_bits); - filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash); - if (!filter_hash) - return -ENOMEM; - ret = append_hash(&filter_hash, subops->func_hash->filter_hash); - if (ret < 0) { - free_ftrace_hash(filter_hash); - return ret; - } - } - if (ftrace_hash_empty(ops->func_hash->notrace_hash) || - ftrace_hash_empty(subops->func_hash->notrace_hash)) { - notrace_hash = EMPTY_HASH; - } else { - size_bits = max(ops->func_hash->filter_hash->size_bits, - subops->func_hash->filter_hash->size_bits); - notrace_hash = alloc_ftrace_hash(size_bits); - if (!notrace_hash) { - free_ftrace_hash(filter_hash); - return -ENOMEM; - } - - ret = intersect_hash(¬race_hash, ops->func_hash->filter_hash, - subops->func_hash->filter_hash); - if (ret < 0) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - return ret; - } - } + ret = add_next_hash(&filter_hash, ¬race_hash, ops->func_hash, subops->func_hash); + if (ret < 0) + return ret; list_add(&subops->list, &ops->subop_list); @@ -3565,6 +3625,45 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int return ret; } +static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops *ops) +{ + struct ftrace_ops_hash temp_hash; + struct ftrace_ops *subops; + bool first = true; + int ret; + + temp_hash.filter_hash = EMPTY_HASH; + temp_hash.notrace_hash = EMPTY_HASH; + + list_for_each_entry(subops, &ops->subop_list, list) { + *filter_hash = EMPTY_HASH; + *notrace_hash = EMPTY_HASH; + + if (first) { + ret = add_first_hash(filter_hash, notrace_hash, subops->func_hash); + if (ret < 0) + return ret; + first = false; + } else { + ret = add_next_hash(filter_hash, notrace_hash, + &temp_hash, subops->func_hash); + if (ret < 0) { + free_ftrace_hash(temp_hash.filter_hash); + free_ftrace_hash(temp_hash.notrace_hash); + return ret; + } + } + + free_ftrace_hash(temp_hash.filter_hash); + free_ftrace_hash(temp_hash.notrace_hash); + + temp_hash.filter_hash = *filter_hash; + temp_hash.notrace_hash = *notrace_hash; + } + return 0; +} + /** * ftrace_shutdown_subops - Remove a subops from a manager ops * @ops: A manager ops to remove @subops from @@ -3579,8 +3678,8 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int */ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; int ret; if (unlikely(ftrace_disabled)) @@ -3613,14 +3712,9 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in } /* Rebuild the hashes without subops */ - filter_hash = append_hashes(ops); - notrace_hash = intersect_hashes(ops); - if (!filter_hash || !notrace_hash) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - list_add(&subops->list, &ops->subop_list); - return -ENOMEM; - } + ret = rebuild_hashes(&filter_hash, ¬race_hash, ops); + if (ret < 0) + return ret; ret = ftrace_update_ops(ops, filter_hash, notrace_hash); if (ret < 0) { @@ -3636,11 +3730,11 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, struct ftrace_hash **orig_subhash, - struct ftrace_hash *hash, - int enable) + struct ftrace_hash *hash) { struct ftrace_ops *ops = subops->managed; - struct ftrace_hash **orig_hash; + struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash; struct ftrace_hash *save_hash; struct ftrace_hash *new_hash; int ret; @@ -3657,24 +3751,18 @@ static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, return -ENOMEM; } - /* Create a new_hash to hold the ops new functions */ - if (enable) { - orig_hash = &ops->func_hash->filter_hash; - new_hash = append_hashes(ops); - } else { - orig_hash = &ops->func_hash->notrace_hash; - new_hash = intersect_hashes(ops); + ret = rebuild_hashes(&filter_hash, ¬race_hash, ops); + if (!ret) { + ret = ftrace_update_ops(ops, filter_hash, notrace_hash); + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); } - /* Move the hash over to the new hash */ - ret = __ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable); - - free_ftrace_hash(new_hash); - if (ret) { /* Put back the original hash */ - free_ftrace_hash_rcu(*orig_subhash); + new_hash = *orig_subhash; *orig_subhash = save_hash; + free_ftrace_hash_rcu(new_hash); } else { free_ftrace_hash_rcu(save_hash); } @@ -3757,7 +3845,8 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) return 0; } -static int ftrace_allocate_records(struct ftrace_page *pg, int count) +static int ftrace_allocate_records(struct ftrace_page *pg, int count, + unsigned long *num_pages) { int order; int pages; @@ -3767,7 +3856,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) return -EINVAL; /* We want to fill as much as possible, with no empty pages */ - pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); + pages = DIV_ROUND_UP(count * ENTRY_SIZE, PAGE_SIZE); order = fls(pages) - 1; again: @@ -3782,9 +3871,10 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) } ftrace_number_of_pages += 1 << order; + *num_pages += 1 << order; ftrace_number_of_groups++; - cnt = (PAGE_SIZE << order) / ENTRY_SIZE; + cnt = ENTRIES_PER_PAGE_GROUP(order); pg->order = order; if (cnt > count) @@ -3810,16 +3900,18 @@ static void ftrace_free_pages(struct ftrace_page *pages) } static struct ftrace_page * -ftrace_allocate_pages(unsigned long num_to_init) +ftrace_allocate_pages(unsigned long num_to_init, unsigned long *num_pages) { struct ftrace_page *start_pg; struct ftrace_page *pg; int cnt; + *num_pages = 0; + if (!num_to_init) return NULL; - start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); + start_pg = pg = kzalloc_obj(*pg); if (!pg) return NULL; @@ -3829,7 +3921,7 @@ ftrace_allocate_pages(unsigned long num_to_init) * waste as little space as possible. */ for (;;) { - cnt = ftrace_allocate_records(pg, num_to_init); + cnt = ftrace_allocate_records(pg, num_to_init, num_pages); if (cnt < 0) goto free_pages; @@ -3837,7 +3929,7 @@ ftrace_allocate_pages(unsigned long num_to_init) if (!num_to_init) break; - pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); + pg->next = kzalloc_obj(*pg); if (!pg->next) goto free_pages; @@ -4331,6 +4423,42 @@ static inline int print_rec(struct seq_file *m, unsigned long ip) } #endif +static void print_subops(struct seq_file *m, struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + struct ftrace_ops *subops; + bool first = true; + + list_for_each_entry(subops, &ops->subop_list, list) { + if (!((subops->flags & FTRACE_OPS_FL_ENABLED) && + hash_contains_ip(rec->ip, subops->func_hash))) + continue; + if (first) { + seq_printf(m, "\tsubops:"); + first = false; + } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (subops->flags & FTRACE_OPS_FL_GRAPH) { + struct fgraph_ops *gops; + + gops = container_of(subops, struct fgraph_ops, ops); + seq_printf(m, " {ent:%pS ret:%pS}", + (void *)gops->entryfunc, + (void *)gops->retfunc); + continue; + } +#endif + if (subops->trampoline) { + seq_printf(m, " {%pS (%pS)}", + (void *)subops->trampoline, + (void *)subops->func); + add_trampoline_func(m, subops, rec); + } else { + seq_printf(m, " {%pS}", + (void *)subops->func); + } + } +} + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -4383,6 +4511,7 @@ static int t_show(struct seq_file *m, void *v) (void *)ops->trampoline, (void *)ops->func); add_trampoline_func(m, ops, rec); + print_subops(m, ops, rec); ops = ftrace_find_tramp_ops_next(rec, ops); } while (ops); } else @@ -4395,6 +4524,7 @@ static int t_show(struct seq_file *m, void *v) if (ops) { seq_printf(m, "\tops: %pS (%pS)", ops, ops->func); + print_subops(m, ops, rec); } else { seq_puts(m, "\tops: ERROR!"); } @@ -4403,8 +4533,11 @@ static int t_show(struct seq_file *m, void *v) unsigned long direct; direct = ftrace_find_rec_direct(rec->ip); - if (direct) - seq_printf(m, "\n\tdirect-->%pS", (void *)direct); + if (direct) { + seq_printf(m, "\n\tdirect%s-->%pS", + ftrace_is_jmp(direct) ? "(jmp)" : "", + (void *)ftrace_jmp_get(direct)); + } } } @@ -4553,7 +4686,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, if (tracing_check_open_get_tr(tr)) return -ENODEV; - iter = kzalloc(sizeof(*iter), GFP_KERNEL); + iter = kzalloc_obj(*iter); if (!iter) goto out; @@ -4585,13 +4718,17 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } else { iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); } + } else { + if (hash) + iter->hash = alloc_and_copy_ftrace_hash(hash->size_bits, hash); + else + iter->hash = EMPTY_HASH; + } - if (!iter->hash) { - trace_parser_put(&iter->parser); - goto out_unlock; - } - } else - iter->hash = hash; + if (!iter->hash) { + trace_parser_put(&iter->parser); + goto out_unlock; + } ret = 0; @@ -4898,7 +5035,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, int enable) { if (ops->flags & FTRACE_OPS_FL_SUBOP) - return ftrace_hash_move_and_update_subops(ops, orig_hash, hash, enable); + return ftrace_hash_move_and_update_subops(ops, orig_hash, hash); /* * If this ops is not enabled, it could be sharing its filters @@ -4917,7 +5054,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, list_for_each_entry(subops, &op->subop_list, list) { if ((subops->flags & FTRACE_OPS_FL_ENABLED) && subops->func_hash == ops->func_hash) { - return ftrace_hash_move_and_update_subops(subops, orig_hash, hash, enable); + return ftrace_hash_move_and_update_subops(subops, orig_hash, hash); } } } while_for_each_ftrace_op(op); @@ -4926,23 +5063,6 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, return __ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable); } -static bool module_exists(const char *module) -{ - /* All modules have the symbol __this_module */ - static const char this_mod[] = "__this_module"; - char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; - unsigned long val; - int n; - - n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); - - if (n > sizeof(modname) - 1) - return false; - - val = module_kallsyms_lookup_name(modname); - return val != 0; -} - static int cache_mod(struct trace_array *tr, const char *func, char *module, int enable) { @@ -4982,10 +5102,6 @@ static int cache_mod(struct trace_array *tr, return ftrace_add_mod(tr, func, module, enable); } -static int -ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, - int reset, int enable); - #ifdef CONFIG_MODULES static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, char *mod, bool enable) @@ -5149,8 +5265,12 @@ struct ftrace_func_map { void *data; }; +/* + * Note, ftrace_func_mapper is freed by free_ftrace_hash(&mapper->hash). + * The hash field must be the first field. + */ struct ftrace_func_mapper { - struct ftrace_hash hash; + struct ftrace_hash hash; /* Must be first! */ }; /** @@ -5214,7 +5334,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, if (entry) return -EBUSY; - map = kmalloc(sizeof(*map), GFP_KERNEL); + map = kmalloc_obj(*map); if (!map) return -ENOMEM; @@ -5285,6 +5405,7 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, } } } + /* This also frees the mapper itself */ free_ftrace_hash(&mapper->hash); } @@ -5353,7 +5474,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, } } if (!probe) { - probe = kzalloc(sizeof(*probe), GFP_KERNEL); + probe = kzalloc_obj(*probe); if (!probe) { mutex_unlock(&ftrace_lock); return -ENOMEM; @@ -5615,20 +5736,15 @@ static DEFINE_MUTEX(ftrace_cmd_mutex); __init int register_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p; - int ret = 0; - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { - if (strcmp(cmd->name, p->name) == 0) { - ret = -EBUSY; - goto out_unlock; - } + if (strcmp(cmd->name, p->name) == 0) + return -EBUSY; } list_add(&cmd->list, &ftrace_commands); - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return 0; } /* @@ -5638,20 +5754,17 @@ __init int register_ftrace_command(struct ftrace_func_command *cmd) __init int unregister_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p, *n; - int ret = -ENODEV; - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); + list_for_each_entry_safe(p, n, &ftrace_commands, list) { if (strcmp(cmd->name, p->name) == 0) { - ret = 0; list_del_init(&p->list); - goto out_unlock; + return 0; } } - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return -ENODEV; } static int ftrace_process_regex(struct ftrace_iterator *iter, @@ -5661,7 +5774,7 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, struct trace_array *tr = iter->ops->private; char *func, *command, *next = buff; struct ftrace_func_command *p; - int ret = -EINVAL; + int ret; func = strsep(&next, ":"); @@ -5678,17 +5791,14 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, command = strsep(&next, ":"); - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { - if (strcmp(p->name, command) == 0) { - ret = p->func(tr, hash, func, command, next, enable); - goto out_unlock; - } + if (strcmp(p->name, command) == 0) + return p->func(tr, hash, func, command, next, enable); } - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return -EINVAL; } static ssize_t @@ -5722,12 +5832,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, parser->idx, enable); trace_parser_clear(parser); if (ret < 0) - goto out; + return ret; } - ret = read; - out: - return ret; + return read; } ssize_t @@ -5759,6 +5867,9 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return -ENOENT; free_hash_entry(hash, entry); return 0; + } else if (__ftrace_lookup_ip(hash, ip) != NULL) { + /* Already exists */ + return 0; } entry = add_hash_entry(hash, ip); @@ -5788,7 +5899,7 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips, static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long *ips, unsigned int cnt, - int remove, int reset, int enable) + int remove, int reset, int enable, char *mod) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; @@ -5814,7 +5925,15 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, goto out_regex_unlock; } - if (buf && !ftrace_match_records(hash, buf, len)) { + if (buf && !match_records(hash, buf, len, mod)) { + /* If this was for a module and nothing was enabled, flag it */ + if (mod) + (*orig_hash)->flags |= FTRACE_HASH_FL_MOD; + + /* + * Even if it is a mod, return error to let caller know + * nothing was added + */ ret = -EINVAL; goto out_regex_unlock; } @@ -5839,7 +5958,7 @@ static int ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, int remove, int reset, int enable) { - return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable); + return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable, NULL); } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -5876,7 +5995,8 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { del = __ftrace_lookup_ip(direct_functions, entry->ip); - if (del && del->direct == addr) { + if (del && ftrace_jmp_get(del->direct) == + ftrace_jmp_get(addr)) { remove_hash_entry(direct_functions, del); kfree(del); } @@ -5891,6 +6011,17 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp) free_ftrace_hash(fhp); } +static void reset_direct(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + remove_direct_functions_hash(hash, addr); + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; +} + /** * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops @@ -5945,9 +6076,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) /* Make a copy hash to place the new and the old entries in */ size = hash->count + direct_functions->count; - if (size > 32) - size = 32; - new_hash = alloc_ftrace_hash(fls(size)); + size = fls(size); + if (size > FTRACE_HASH_MAX_BITS) + size = FTRACE_HASH_MAX_BITS; + new_hash = alloc_ftrace_hash(size); if (!new_hash) goto out_unlock; @@ -5980,11 +6112,13 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) new_hash = NULL; ops->func = call_direct_funcs; - ops->flags = MULTI_FLAGS; + ops->flags |= MULTI_FLAGS; ops->trampoline = FTRACE_REGS_ADDR; ops->direct_call = addr; err = register_ftrace_function_nolock(ops); + if (err) + reset_direct(ops, addr); out_unlock: mutex_unlock(&direct_mutex); @@ -6017,7 +6151,6 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, bool free_filters) { - struct ftrace_hash *hash = ops->func_hash->filter_hash; int err; if (check_direct_multi(ops)) @@ -6027,13 +6160,9 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, mutex_lock(&direct_mutex); err = unregister_ftrace_function(ops); - remove_direct_functions_hash(hash, addr); + reset_direct(ops, addr); mutex_unlock(&direct_mutex); - /* cleanup for possible another register call */ - ops->func = NULL; - ops->trampoline = 0; - if (free_filters) ftrace_free_filter(ops); return err; @@ -6043,7 +6172,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct); static int __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash; + struct ftrace_hash *hash = ops->func_hash->filter_hash; struct ftrace_func_entry *entry, *iter; static struct ftrace_ops tmp_ops = { .func = ftrace_stub, @@ -6064,12 +6193,20 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) return err; /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. + */ + err = __ftrace_hash_update_ipmodify(ops, hash, hash, true); + if (err) + goto out; + + /* * Now the ftrace_ops_list_func() is called to do the direct callers. * We can safely change the direct functions attached to each entry. */ mutex_lock(&ftrace_lock); - hash = ops->func_hash->filter_hash; size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(iter, &hash->buckets[i], hlist) { @@ -6084,6 +6221,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) mutex_unlock(&ftrace_lock); +out: /* Removing the tmp_ops will add the updated direct callers to the functions */ unregister_ftrace_function(&tmp_ops); @@ -6149,6 +6287,370 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) return err; } EXPORT_SYMBOL_GPL(modify_ftrace_direct); + +static unsigned long hash_count(struct ftrace_hash *hash) +{ + return hash ? hash->count : 0; +} + +/** + * hash_add - adds two struct ftrace_hash and returns the result + * @a: struct ftrace_hash object + * @b: struct ftrace_hash object + * + * Returns struct ftrace_hash object on success, NULL on error. + */ +static struct ftrace_hash *hash_add(struct ftrace_hash *a, struct ftrace_hash *b) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *add; + int size; + + size = hash_count(a) + hash_count(b); + if (size > 32) + size = 32; + + add = alloc_and_copy_ftrace_hash(fls(size), a); + if (!add) + return NULL; + + size = 1 << b->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &b->buckets[i], hlist) { + if (add_ftrace_hash_entry_direct(add, entry->ip, entry->direct) == NULL) { + free_ftrace_hash(add); + return NULL; + } + } + } + return add; +} + +/** + * update_ftrace_direct_add - Updates @ops by adding direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * + * This is used to add custom direct callers (ip -> addr) to @ops, + * specified in @hash. The @ops will be either registered or updated. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @hash is empty + */ +int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + struct ftrace_hash *old_direct_functions = NULL; + struct ftrace_hash *new_direct_functions; + struct ftrace_hash *old_filter_hash; + struct ftrace_hash *new_filter_hash = NULL; + struct ftrace_func_entry *entry; + int err = -EINVAL; + int size; + bool reg; + + if (!hash_count(hash)) + return -EINVAL; + + mutex_lock(&direct_mutex); + + /* Make sure requested entries are not already registered. */ + size = 1 << hash->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (__ftrace_lookup_ip(direct_functions, entry->ip)) + goto out_unlock; + } + } + + old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + + /* If there's nothing in filter_hash we need to register the ops. */ + reg = hash_count(old_filter_hash) == 0; + if (reg) { + if (ops->func || ops->trampoline) + goto out_unlock; + if (ops->flags & FTRACE_OPS_FL_ENABLED) + goto out_unlock; + } + + err = -ENOMEM; + new_filter_hash = hash_add(old_filter_hash, hash); + if (!new_filter_hash) + goto out_unlock; + + new_direct_functions = hash_add(direct_functions, hash); + if (!new_direct_functions) + goto out_unlock; + + old_direct_functions = direct_functions; + rcu_assign_pointer(direct_functions, new_direct_functions); + + if (reg) { + ops->func = call_direct_funcs; + ops->flags |= MULTI_FLAGS; + ops->trampoline = FTRACE_REGS_ADDR; + ops->local_hash.filter_hash = new_filter_hash; + + err = register_ftrace_function_nolock(ops); + if (err) { + /* restore old filter on error */ + ops->local_hash.filter_hash = old_filter_hash; + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; + } else { + new_filter_hash = old_filter_hash; + } + } else { + guard(mutex)(&ftrace_lock); + err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH); + /* + * new_filter_hash is dup-ed, so we need to release it anyway, + * old_filter_hash either stays on error or is already released + */ + } + + if (err) { + /* reset direct_functions and free the new one */ + rcu_assign_pointer(direct_functions, old_direct_functions); + old_direct_functions = new_direct_functions; + } + + out_unlock: + mutex_unlock(&direct_mutex); + + if (old_direct_functions && old_direct_functions != EMPTY_HASH) + call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb); + free_ftrace_hash(new_filter_hash); + + return err; +} + +/** + * hash_sub - substracts @b from @a and returns the result + * @a: struct ftrace_hash object + * @b: struct ftrace_hash object + * + * Returns struct ftrace_hash object on success, NULL on error. + */ +static struct ftrace_hash *hash_sub(struct ftrace_hash *a, struct ftrace_hash *b) +{ + struct ftrace_func_entry *entry, *del; + struct ftrace_hash *sub; + int size; + + sub = alloc_and_copy_ftrace_hash(a->size_bits, a); + if (!sub) + return NULL; + + size = 1 << b->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &b->buckets[i], hlist) { + del = __ftrace_lookup_ip(sub, entry->ip); + if (WARN_ON_ONCE(!del)) { + free_ftrace_hash(sub); + return NULL; + } + remove_hash_entry(sub, del); + kfree(del); + } + } + return sub; +} + +/** + * update_ftrace_direct_del - Updates @ops by removing its direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * + * This is used to delete custom direct callers (ip -> addr) in + * @ops specified via @hash. The @ops will be either unregistered + * updated. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @hash is empty + * -EINVAL - The @ops is not registered + */ +int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + struct ftrace_hash *old_direct_functions = NULL; + struct ftrace_hash *new_direct_functions; + struct ftrace_hash *new_filter_hash = NULL; + struct ftrace_hash *old_filter_hash; + struct ftrace_func_entry *entry; + struct ftrace_func_entry *del; + unsigned long size; + int err = -EINVAL; + + if (!hash_count(hash)) + return -EINVAL; + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + if (direct_functions == EMPTY_HASH) + return -EINVAL; + + mutex_lock(&direct_mutex); + + old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + + if (!hash_count(old_filter_hash)) + goto out_unlock; + + /* Make sure requested entries are already registered. */ + size = 1 << hash->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + del = __ftrace_lookup_ip(direct_functions, entry->ip); + if (!del || del->direct != entry->direct) + goto out_unlock; + } + } + + err = -ENOMEM; + new_filter_hash = hash_sub(old_filter_hash, hash); + if (!new_filter_hash) + goto out_unlock; + + new_direct_functions = hash_sub(direct_functions, hash); + if (!new_direct_functions) + goto out_unlock; + + /* If there's nothing left, we need to unregister the ops. */ + if (ftrace_hash_empty(new_filter_hash)) { + err = unregister_ftrace_function(ops); + if (!err) { + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; + ftrace_free_filter(ops); + ops->func_hash->filter_hash = NULL; + } + } else { + guard(mutex)(&ftrace_lock); + err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH); + /* + * new_filter_hash is dup-ed, so we need to release it anyway, + * old_filter_hash either stays on error or is already released + */ + } + + if (err) { + /* free the new_direct_functions */ + old_direct_functions = new_direct_functions; + } else { + old_direct_functions = direct_functions; + rcu_assign_pointer(direct_functions, new_direct_functions); + } + + out_unlock: + mutex_unlock(&direct_mutex); + + if (old_direct_functions && old_direct_functions != EMPTY_HASH) + call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb); + free_ftrace_hash(new_filter_hash); + + return err; +} + +/** + * update_ftrace_direct_mod - Updates @ops by modifing its direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * @do_direct_lock: If true lock the direct_mutex + * + * This is used to modify custom direct callers (ip -> addr) in + * @ops specified via @hash. + * + * This can be called from within ftrace ops_func callback with + * direct_mutex already locked, in which case @do_direct_lock + * needs to be false. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @hash is empty + * -EINVAL - The @ops is not registered + */ +int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock) +{ + struct ftrace_func_entry *entry, *tmp; + static struct ftrace_ops tmp_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB, + }; + struct ftrace_hash *orig_hash; + unsigned long size, i; + int err = -EINVAL; + + if (!hash_count(hash)) + return -EINVAL; + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + if (direct_functions == EMPTY_HASH) + return -EINVAL; + + /* + * We can be called from within ops_func callback with direct_mutex + * already taken. + */ + if (do_direct_lock) + mutex_lock(&direct_mutex); + + orig_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + if (!orig_hash) + goto unlock; + + /* Enable the tmp_ops to have the same functions as the hash object. */ + ftrace_ops_init(&tmp_ops); + tmp_ops.func_hash->filter_hash = hash; + + err = register_ftrace_function_nolock(&tmp_ops); + if (err) + goto unlock; + + /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. + */ + err = __ftrace_hash_update_ipmodify(ops, orig_hash, orig_hash, true); + if (err) + goto out; + + /* + * Now the ftrace_ops_list_func() is called to do the direct callers. + * We can safely change the direct functions attached to each entry. + */ + mutex_lock(&ftrace_lock); + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + tmp = __ftrace_lookup_ip(direct_functions, entry->ip); + if (!tmp) + continue; + tmp->direct = entry->direct; + } + } + + mutex_unlock(&ftrace_lock); + +out: + /* Removing the tmp_ops will add the updated direct callers to the functions */ + unregister_ftrace_function(&tmp_ops); + +unlock: + if (do_direct_lock) + mutex_unlock(&direct_mutex); + return err; +} + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** @@ -6217,7 +6719,38 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) { - return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable); + char *mod = NULL, *func, *command, *next = buf; + char *tmp __free(kfree) = NULL; + struct trace_array *tr = ops->private; + int ret; + + func = strsep(&next, ":"); + + /* This can also handle :mod: parsing */ + if (next) { + if (!tr) + return -EINVAL; + + command = strsep(&next, ":"); + if (strcmp(command, "mod") != 0) + return -EINVAL; + + mod = next; + len = command - func; + /* Save the original func as ftrace_set_hash() can modify it */ + tmp = kstrdup(func, GFP_KERNEL); + } + + ret = ftrace_set_hash(ops, func, len, NULL, 0, 0, reset, enable, mod); + + if (tr && mod && ret < 0) { + /* Did tmp fail to allocate? */ + if (!tmp) + return -ENOMEM; + ret = cache_mod(tr, tmp, mod, enable); + } + + return ret; } /** @@ -6308,7 +6841,8 @@ bool ftrace_filter_param __initdata; static int __init set_ftrace_notrace(char *str) { ftrace_filter_param = true; - strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + trace_append_boot_param(ftrace_notrace_buf, str, ',', + FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); @@ -6316,7 +6850,8 @@ __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { ftrace_filter_param = true; - strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + trace_append_boot_param(ftrace_filter_buf, str, ',', + FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); @@ -6328,14 +6863,16 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); static int __init set_graph_function(char *str) { - strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + trace_append_boot_param(ftrace_graph_buf, str, ',', + FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_filter=", set_graph_function); static int __init set_graph_notrace_function(char *str) { - strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); + trace_append_boot_param(ftrace_graph_notrace_buf, str, ',', + FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_notrace=", set_graph_notrace_function); @@ -6381,6 +6918,14 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) ftrace_ops_init(ops); + /* The trace_array is needed for caching module function filters */ + if (!ops->private) { + struct trace_array *tr = trace_get_global_array(); + + ops->private = tr; + ftrace_init_trace_array(tr); + } + while (buf) { func = strsep(&buf, ","); ftrace_set_regex(ops, func, strlen(func), 0, enable); @@ -6445,9 +6990,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) ftrace_hash_move_and_update_ops(iter->ops, orig_hash, iter->hash, filter_hash); mutex_unlock(&ftrace_lock); - } else { - /* For read only, the hash is the ops hash */ - iter->hash = NULL; } mutex_unlock(&iter->ops->func_hash->regex_lock); @@ -6687,7 +7229,7 @@ ftrace_graph_open(struct inode *inode, struct file *file) if (unlikely(ftrace_disabled)) return -ENODEV; - fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + fgd = kmalloc_obj(*fgd); if (fgd == NULL) return -ENOMEM; @@ -6715,7 +7257,7 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file) if (unlikely(ftrace_disabled)) return -ENODEV; - fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + fgd = kmalloc_obj(*fgd); if (fgd == NULL) return -ENOMEM; @@ -6847,6 +7389,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) } } } + cond_resched(); } while_for_each_ftrace_rec(); return fail ? -EINVAL : 0; @@ -7010,6 +7553,7 @@ static int ftrace_process_locs(struct module *mod, unsigned long *p; unsigned long addr; unsigned long flags = 0; /* Shut up gcc */ + unsigned long pages; int ret = -ENOMEM; count = end - start; @@ -7029,7 +7573,7 @@ static int ftrace_process_locs(struct module *mod, test_is_sorted(start, count); } - start_pg = ftrace_allocate_pages(count); + start_pg = ftrace_allocate_pages(count, &pages); if (!start_pg) return -ENOMEM; @@ -7061,7 +7605,9 @@ static int ftrace_process_locs(struct module *mod, pg = start_pg; while (p < end) { unsigned long end_offset; - addr = ftrace_call_adjust(*p++); + + addr = *p++; + /* * Some architecture linkers will pad between * the different mcount_loc sections of different @@ -7073,6 +7619,19 @@ static int ftrace_process_locs(struct module *mod, continue; } + /* + * If this is core kernel, make sure the address is in core + * or inittext, as weak functions get zeroed and KASLR can + * move them to something other than zero. It just will not + * move it to an area where kernel text is. + */ + if (!mod && !(is_kernel_text(addr) || is_kernel_inittext(addr))) { + skipped++; + continue; + } + + addr = ftrace_call_adjust(addr); + end_offset = (pg->index+1) * sizeof(pg->records[0]); if (end_offset > PAGE_SIZE << pg->order) { /* We should have allocated enough */ @@ -7112,11 +7671,41 @@ static int ftrace_process_locs(struct module *mod, /* We should have used all pages unless we skipped some */ if (pg_unuse) { - WARN_ON(!skipped); + unsigned long pg_remaining, remaining = 0; + long skip; + + /* Count the number of entries unused and compare it to skipped. */ + pg_remaining = ENTRIES_PER_PAGE_GROUP(pg->order) - pg->index; + + if (!WARN(skipped < pg_remaining, "Extra allocated pages for ftrace")) { + + skip = skipped - pg_remaining; + + for (pg = pg_unuse; pg && skip > 0; pg = pg->next) { + remaining += 1 << pg->order; + skip -= ENTRIES_PER_PAGE_GROUP(pg->order); + } + + pages -= remaining; + + /* + * Check to see if the number of pages remaining would + * just fit the number of entries skipped. + */ + WARN(pg || skip > 0, "Extra allocated pages for ftrace: %lu with %lu skipped", + remaining, skipped); + } /* Need to synchronize with ftrace_location_range() */ synchronize_rcu(); ftrace_free_pages(pg_unuse); } + + if (!mod) { + count -= skipped; + pr_info("ftrace: allocating %ld entries in %ld pages\n", + count, pages); + } + return ret; } @@ -7287,9 +7876,10 @@ void ftrace_release_mod(struct module *mod) mutex_lock(&ftrace_lock); - if (ftrace_disabled) - goto out_unlock; - + /* + * To avoid the UAF problem after the module is unloaded, the + * 'mod_map' resource needs to be released unconditionally. + */ list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { if (mod_map->mod == mod) { list_del_rcu(&mod_map->list); @@ -7298,6 +7888,9 @@ void ftrace_release_mod(struct module *mod) } } + if (ftrace_disabled) + goto out_unlock; + /* * Each module has its own ftrace_pages, remove * them from the list. @@ -7383,6 +7976,8 @@ void ftrace_module_enable(struct module *mod) if (!within_module(rec->ip, mod)) break; + cond_resched(); + /* Weak functions should still be ignored */ if (!test_for_valid_rec(rec)) { /* Clear all other flags. Should not be enabled anyway */ @@ -7452,7 +8047,7 @@ static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, if (!ret) return; - mod_func = kmalloc(sizeof(*mod_func), GFP_KERNEL); + mod_func = kmalloc_obj(*mod_func); if (!mod_func) return; @@ -7476,7 +8071,10 @@ allocate_ftrace_mod_map(struct module *mod, { struct ftrace_mod_map *mod_map; - mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL); + if (ftrace_disabled) + return NULL; + + mod_map = kmalloc_obj(*mod_map); if (!mod_map) return NULL; @@ -7521,7 +8119,8 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym) + unsigned long *off, char **modname, + const unsigned char **modbuildid, char *sym) { struct ftrace_mod_map *mod_map; int ret = 0; @@ -7533,6 +8132,8 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, if (ret) { if (modname) *modname = mod_map->mod->name; + if (modbuildid) + *modbuildid = module_buildid(mod_map->mod); break; } } @@ -7646,7 +8247,7 @@ static void add_to_clear_hash_list(struct list_head *clear_list, { struct ftrace_init_func *func; - func = kmalloc(sizeof(*func), GFP_KERNEL); + func = kmalloc_obj(*func); if (!func) { MEM_FAIL(1, "alloc failure, ftrace filter could be stale\n"); return; @@ -7762,9 +8363,6 @@ void __init ftrace_init(void) goto failed; } - pr_info("ftrace: allocating %ld entries in %ld pages\n", - count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); - ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); @@ -7814,9 +8412,14 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) void ftrace_init_trace_array(struct trace_array *tr) { + if (tr->flags & TRACE_ARRAY_FL_MOD_INIT) + return; + INIT_LIST_HEAD(&tr->func_probes); INIT_LIST_HEAD(&tr->mod_trace); INIT_LIST_HEAD(&tr->mod_notrace); + + tr->flags |= TRACE_ARRAY_FL_MOD_INIT; } #else @@ -7845,7 +8448,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) __init void ftrace_init_global_array_ops(struct trace_array *tr) { tr->ops = &global_ops; - tr->ops->private = tr; + if (!global_ops.private) + global_ops.private = tr; ftrace_init_trace_array(tr); init_array_fgraph_ops(tr, tr->ops); } @@ -8013,6 +8617,7 @@ ftrace_pid_follow_sched_process_fork(void *data, struct trace_pid_list *pid_list; struct trace_array *tr = data; + guard(preempt)(); pid_list = rcu_dereference_sched(tr->function_pids); trace_filter_add_remove_task(pid_list, self, task); @@ -8026,6 +8631,7 @@ ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task) struct trace_pid_list *pid_list; struct trace_array *tr = data; + guard(preempt)(); pid_list = rcu_dereference_sched(tr->function_pids); trace_filter_add_remove_task(pid_list, NULL, task); @@ -8287,7 +8893,7 @@ pid_write(struct file *filp, const char __user *ubuf, if (!cnt) return 0; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); switch (type) { case TRACE_PIDS: @@ -8303,14 +8909,13 @@ pid_write(struct file *filp, const char __user *ubuf, lockdep_is_held(&ftrace_lock)); break; default: - ret = -EINVAL; WARN_ON_ONCE(1); - goto out; + return -EINVAL; } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) - goto out; + return ret; switch (type) { case TRACE_PIDS: @@ -8339,11 +8944,8 @@ pid_write(struct file *filp, const char __user *ubuf, ftrace_update_pid_func(); ftrace_startup_all(0); - out: - mutex_unlock(&ftrace_lock); - if (ret > 0) - *ppos += ret; + *ppos += ret; return ret; } @@ -8478,7 +9080,7 @@ static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) if (!op->ops_func) return -EBUSY; - ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); + ret = op->ops_func(op, ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); if (ret) return ret; } @@ -8525,7 +9127,7 @@ static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) /* The cleanup is optional, ignore any errors */ if (found_op && op->ops_func) - op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); + op->ops_func(op, ip, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); } } mutex_unlock(&direct_mutex); @@ -8669,6 +9271,15 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr) * @addrs array, which needs to be big enough to store at least @cnt * addresses. * + * For a single symbol (cnt == 1), uses kallsyms_lookup_name() which + * performs an O(log N) binary search via the sorted kallsyms index. + * This avoids the full O(N) linear scan over all kernel symbols that + * the multi-symbol path requires. + * + * For multiple symbols, uses a single-pass linear scan via + * kallsyms_on_each_symbol() with binary search into the sorted input + * array. + * * Returns: 0 if all provided symbols are found, -ESRCH otherwise. */ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) @@ -8676,6 +9287,19 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a struct kallsyms_data args; int found_all; + /* Fast path: single symbol uses O(log N) binary search */ + if (cnt == 1) { + addrs[0] = kallsyms_lookup_name(sorted_syms[0]); + if (addrs[0] && ftrace_location(addrs[0])) + return 0; + /* + * Binary lookup can fail for duplicate symbol names + * where the first match is not ftrace-instrumented. + * Retry with linear scan. + */ + } + + /* Batch path: single-pass O(N) linear scan */ memset(addrs, 0, sizeof(*addrs) * cnt); args.addrs = addrs; args.syms = sorted_syms; @@ -8746,17 +9370,17 @@ static int ftrace_enable_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int ret = -ENODEV; + int ret; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); if (unlikely(ftrace_disabled)) - goto out; + return -ENODEV; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) - goto out; + return ret; if (ftrace_enabled) { @@ -8770,8 +9394,7 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write, } else { if (is_permanent_ops_registered()) { ftrace_enabled = true; - ret = -EBUSY; - goto out; + return -EBUSY; } /* stopping ftrace calls (just send to ftrace_stub) */ @@ -8781,12 +9404,10 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write, } last_ftrace_enabled = !!ftrace_enabled; - out: - mutex_unlock(&ftrace_lock); - return ret; + return 0; } -static struct ctl_table ftrace_sysctls[] = { +static const struct ctl_table ftrace_sysctls[] = { { .procname = "ftrace_enabled", .data = &ftrace_enabled, diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 4966e6bbdf6f..30e3fdae6a17 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -3,6 +3,7 @@ * Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org> */ #include <linux/spinlock.h> +#include <linux/seqlock.h> #include <linux/irq_work.h> #include <linux/slab.h> #include "trace.h" @@ -81,13 +82,9 @@ static inline bool upper_empty(union upper_chunk *chunk) { /* * If chunk->data has no lower chunks, it will be the same - * as a zeroed bitmask. Use find_first_bit() to test it - * and if it doesn't find any bits set, then the array - * is empty. + * as a zeroed bitmask. */ - int bit = find_first_bit((unsigned long *)chunk->data, - sizeof(chunk->data) * 8); - return bit >= sizeof(chunk->data) * 8; + return bitmap_empty((unsigned long *)chunk->data, BITS_PER_TYPE(chunk->data)); } static inline int pid_split(unsigned int pid, unsigned int *upper1, @@ -130,7 +127,7 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) { union upper_chunk *upper_chunk; union lower_chunk *lower_chunk; - unsigned long flags; + unsigned int seq; unsigned int upper1; unsigned int upper2; unsigned int lower; @@ -142,14 +139,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) if (pid_split(pid, &upper1, &upper2, &lower) < 0) return false; - raw_spin_lock_irqsave(&pid_list->lock, flags); - upper_chunk = pid_list->upper[upper1]; - if (upper_chunk) { - lower_chunk = upper_chunk->data[upper2]; - if (lower_chunk) - ret = test_bit(lower, lower_chunk->data); - } - raw_spin_unlock_irqrestore(&pid_list->lock, flags); + do { + seq = read_seqcount_begin(&pid_list->seqcount); + ret = false; + upper_chunk = pid_list->upper[upper1]; + if (upper_chunk) { + lower_chunk = upper_chunk->data[upper2]; + if (lower_chunk) + ret = test_bit(lower, lower_chunk->data); + } + } while (read_seqcount_retry(&pid_list->seqcount, seq)); return ret; } @@ -182,6 +181,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) return -EINVAL; raw_spin_lock_irqsave(&pid_list->lock, flags); + write_seqcount_begin(&pid_list->seqcount); upper_chunk = pid_list->upper[upper1]; if (!upper_chunk) { upper_chunk = get_upper_chunk(pid_list); @@ -203,6 +203,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) set_bit(lower, lower_chunk->data); ret = 0; out: + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock_irqrestore(&pid_list->lock, flags); return ret; } @@ -234,6 +235,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) return -EINVAL; raw_spin_lock_irqsave(&pid_list->lock, flags); + write_seqcount_begin(&pid_list->seqcount); upper_chunk = pid_list->upper[upper1]; if (!upper_chunk) goto out; @@ -254,6 +256,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) } } out: + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock_irqrestore(&pid_list->lock, flags); return 0; } @@ -344,8 +347,10 @@ static void pid_list_refill_irq(struct irq_work *iwork) again: raw_spin_lock(&pid_list->lock); + write_seqcount_begin(&pid_list->seqcount); upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks; lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks; + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock(&pid_list->lock); if (upper_count <= 0 && lower_count <= 0) @@ -354,7 +359,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) while (upper_count-- > 0) { union upper_chunk *chunk; - chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT); + chunk = kzalloc_obj(*chunk, GFP_NOWAIT); if (!chunk) break; *upper_next = chunk; @@ -365,7 +370,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) while (lower_count-- > 0) { union lower_chunk *chunk; - chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT); + chunk = kzalloc_obj(*chunk, GFP_NOWAIT); if (!chunk) break; *lower_next = chunk; @@ -374,6 +379,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) } raw_spin_lock(&pid_list->lock); + write_seqcount_begin(&pid_list->seqcount); if (upper) { *upper_next = pid_list->upper_list; pid_list->upper_list = upper; @@ -384,6 +390,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) pid_list->lower_list = lower; pid_list->free_lower_chunks += lcnt; } + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock(&pid_list->lock); /* @@ -414,20 +421,21 @@ struct trace_pid_list *trace_pid_list_alloc(void) int i; /* According to linux/thread.h, pids can be no bigger that 30 bits */ - WARN_ON_ONCE(pid_max > (1 << 30)); + WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30)); - pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); + pid_list = kzalloc_obj(*pid_list); if (!pid_list) return NULL; init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq); raw_spin_lock_init(&pid_list->lock); + seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock); for (i = 0; i < CHUNK_ALLOC; i++) { union upper_chunk *chunk; - chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + chunk = kzalloc_obj(*chunk); if (!chunk) break; chunk->next = pid_list->upper_list; @@ -438,7 +446,7 @@ struct trace_pid_list *trace_pid_list_alloc(void) for (i = 0; i < CHUNK_ALLOC; i++) { union lower_chunk *chunk; - chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + chunk = kzalloc_obj(*chunk); if (!chunk) break; chunk->next = pid_list->lower_list; diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h index 62e73f1ac85f..0b45fb0eadb9 100644 --- a/kernel/trace/pid_list.h +++ b/kernel/trace/pid_list.h @@ -76,6 +76,7 @@ union upper_chunk { }; struct trace_pid_list { + seqcount_raw_spinlock_t seqcount; raw_spinlock_t lock; struct irq_work refill_irqwork; union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 21bb161c2316..f2fe33573e54 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -17,5 +17,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency); -EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index 314ffc143039..acb0c971a408 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -117,12 +117,15 @@ static int preemptirq_delay_run(void *data) { int i; int s = MIN(burst_size, NR_TEST_FUNCS); - struct cpumask cpu_mask; + cpumask_var_t cpu_mask; + + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; if (cpu_affinity > -1) { - cpumask_clear(&cpu_mask); - cpumask_set_cpu(cpu_affinity, &cpu_mask); - if (set_cpus_allowed_ptr(current, &cpu_mask)) + cpumask_clear(cpu_mask); + cpumask_set_cpu(cpu_affinity, cpu_mask); + if (set_cpus_allowed_ptr(current, cpu_mask)) pr_err("cpu_affinity:%d, failed\n", cpu_affinity); } @@ -139,6 +142,8 @@ static int preemptirq_delay_run(void *data) __set_current_state(TASK_RUNNING); + free_cpumask_var(cpu_mask); + return 0; } diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c new file mode 100644 index 000000000000..a3e2c9b606eb --- /dev/null +++ b/kernel/trace/remote_test.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort <vdonnefort@google.com> + */ + +#include <linux/module.h> +#include <linux/simple_ring_buffer.h> +#include <linux/trace_remote.h> +#include <linux/tracefs.h> +#include <linux/types.h> + +#define REMOTE_EVENT_INCLUDE_FILE kernel/trace/remote_test_events.h +#include <trace/define_remote_events.h> + +static DEFINE_PER_CPU(struct simple_rb_per_cpu *, simple_rbs); +static struct trace_buffer_desc *remote_test_buffer_desc; + +/* + * The trace_remote lock already serializes accesses from the trace_remote_callbacks. + * However write_event can still race with load/unload. + */ +static DEFINE_MUTEX(simple_rbs_lock); + +static int remote_test_load_simple_rb(int cpu, struct ring_buffer_desc *rb_desc) +{ + struct simple_rb_per_cpu *cpu_buffer; + struct simple_buffer_page *bpages; + int ret = -ENOMEM; + + cpu_buffer = kmalloc_obj(*cpu_buffer); + if (!cpu_buffer) + return ret; + + bpages = kmalloc_objs(*bpages, rb_desc->nr_page_va); + if (!bpages) + goto err_free_cpu_buffer; + + ret = simple_ring_buffer_init(cpu_buffer, bpages, rb_desc); + if (ret) + goto err_free_bpages; + + scoped_guard(mutex, &simple_rbs_lock) { + WARN_ON(*per_cpu_ptr(&simple_rbs, cpu)); + *per_cpu_ptr(&simple_rbs, cpu) = cpu_buffer; + } + + return 0; + +err_free_bpages: + kfree(bpages); + +err_free_cpu_buffer: + kfree(cpu_buffer); + + return ret; +} + +static void remote_test_unload_simple_rb(int cpu) +{ + struct simple_rb_per_cpu *cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu); + struct simple_buffer_page *bpages; + + if (!cpu_buffer) + return; + + guard(mutex)(&simple_rbs_lock); + + bpages = cpu_buffer->bpages; + simple_ring_buffer_unload(cpu_buffer); + kfree(bpages); + kfree(cpu_buffer); + *per_cpu_ptr(&simple_rbs, cpu) = NULL; +} + +static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unused) +{ + struct ring_buffer_desc *rb_desc; + struct trace_buffer_desc *desc; + size_t desc_size; + int cpu, ret; + + if (WARN_ON(remote_test_buffer_desc)) + return ERR_PTR(-EINVAL); + + desc_size = trace_buffer_desc_size(size, num_possible_cpus()); + if (desc_size == SIZE_MAX) { + ret = -E2BIG; + goto err; + } + + desc = kmalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto err; + } + + ret = trace_remote_alloc_buffer(desc, desc_size, size, cpu_possible_mask); + if (ret) + goto err_free_desc; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) { + ret = remote_test_load_simple_rb(rb_desc->cpu, rb_desc); + if (ret) + goto err_unload; + } + + remote_test_buffer_desc = desc; + + return remote_test_buffer_desc; + +err_unload: + for_each_ring_buffer_desc(rb_desc, cpu, desc) + remote_test_unload_simple_rb(rb_desc->cpu); + trace_remote_free_buffer(desc); + +err_free_desc: + kfree(desc); + +err: + return ERR_PTR(ret); +} + +static void remote_test_unload(struct trace_buffer_desc *desc, void *unused) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + if (WARN_ON(desc != remote_test_buffer_desc)) + return; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) + remote_test_unload_simple_rb(rb_desc->cpu); + + remote_test_buffer_desc = NULL; + trace_remote_free_buffer(desc); + kfree(desc); +} + +static int remote_test_enable_tracing(bool enable, void *unused) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + if (!remote_test_buffer_desc) + return -ENODEV; + + for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc) + WARN_ON(simple_ring_buffer_enable_tracing(*per_cpu_ptr(&simple_rbs, rb_desc->cpu), + enable)); + return 0; +} + +static int remote_test_swap_reader_page(unsigned int cpu, void *unused) +{ + struct simple_rb_per_cpu *cpu_buffer; + + if (cpu >= NR_CPUS) + return -EINVAL; + + cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu); + if (!cpu_buffer) + return -EINVAL; + + return simple_ring_buffer_swap_reader_page(cpu_buffer); +} + +static int remote_test_reset(unsigned int cpu, void *unused) +{ + struct simple_rb_per_cpu *cpu_buffer; + + if (cpu >= NR_CPUS) + return -EINVAL; + + cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu); + if (!cpu_buffer) + return -EINVAL; + + return simple_ring_buffer_reset(cpu_buffer); +} + +static int remote_test_enable_event(unsigned short id, bool enable, void *unused) +{ + if (id != REMOTE_TEST_EVENT_ID) + return -EINVAL; + + /* + * Let's just use the struct remote_event enabled field that is turned on and off by + * trace_remote. This is a bit racy but good enough for a simple test module. + */ + return 0; +} + +static ssize_t +write_event_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *pos) +{ + struct remote_event_format_selftest *evt_test; + struct simple_rb_per_cpu *cpu_buffer; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&simple_rbs_lock); + + if (!remote_event_selftest.enabled) + return -ENODEV; + + guard(preempt)(); + + cpu_buffer = *this_cpu_ptr(&simple_rbs); + if (!cpu_buffer) + return -ENODEV; + + evt_test = simple_ring_buffer_reserve(cpu_buffer, + sizeof(struct remote_event_format_selftest), + trace_clock_global()); + if (!evt_test) + return -ENODEV; + + evt_test->hdr.id = REMOTE_TEST_EVENT_ID; + evt_test->id = val; + + simple_ring_buffer_commit(cpu_buffer); + + return cnt; +} + +static const struct file_operations write_event_fops = { + .write = write_event_write, +}; + +static int remote_test_init_tracefs(struct dentry *d, void *unused) +{ + return tracefs_create_file("write_event", 0200, d, NULL, &write_event_fops) ? + 0 : -ENOMEM; +} + +static struct trace_remote_callbacks trace_remote_callbacks = { + .init = remote_test_init_tracefs, + .load_trace_buffer = remote_test_load, + .unload_trace_buffer = remote_test_unload, + .enable_tracing = remote_test_enable_tracing, + .swap_reader_page = remote_test_swap_reader_page, + .reset = remote_test_reset, + .enable_event = remote_test_enable_event, +}; + +static int __init remote_test_init(void) +{ + return trace_remote_register("test", &trace_remote_callbacks, NULL, + &remote_event_selftest, 1); +} + +module_init(remote_test_init); + +MODULE_DESCRIPTION("Test module for the trace remote interface"); +MODULE_AUTHOR("Vincent Donnefort"); +MODULE_LICENSE("GPL"); diff --git a/kernel/trace/remote_test_events.h b/kernel/trace/remote_test_events.h new file mode 100644 index 000000000000..26b93b3406fc --- /dev/null +++ b/kernel/trace/remote_test_events.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define REMOTE_TEST_EVENT_ID 1 + +REMOTE_EVENT(selftest, REMOTE_TEST_EVENT_ID, + RE_STRUCT( + re_field(u64, id) + ), + RE_PRINTK("id=%llu", __entry->id) +); diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 30d224946881..5a8bdf88999a 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -108,7 +108,7 @@ struct rethook *rethook_alloc(void *data, rethook_handler_t handler, if (!handler || num <= 0 || size < sizeof(struct rethook_node)) return ERR_PTR(-EINVAL); - rh = kzalloc(sizeof(struct rethook), GFP_KERNEL); + rh = kzalloc_obj(struct rethook); if (!rh) return ERR_PTR(-ENOMEM); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 60210fb5b211..7b07d2004cc6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,7 +4,10 @@ * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ +#include <linux/ring_buffer_types.h> +#include <linux/sched/isolation.h> #include <linux/trace_recursion.h> +#include <linux/panic_notifier.h> #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> @@ -29,8 +32,10 @@ #include <linux/oom.h> #include <linux/mm.h> +#include <asm/ring_buffer.h> #include <asm/local64.h> #include <asm/local.h> +#include <asm/setup.h> #include "trace.h" @@ -48,9 +53,12 @@ static void update_pages_handler(struct work_struct *work); struct ring_buffer_meta { int magic; - int struct_size; - unsigned long text_addr; - unsigned long data_addr; + int struct_sizes; + unsigned long total_size; + unsigned long buffers_offset; +}; + +struct ring_buffer_cpu_meta { unsigned long first_buffer; unsigned long head_buffer; unsigned long commit_buffer; @@ -152,23 +160,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s) /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - -#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) -#define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) -#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ - -#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS -# define RB_FORCE_8BYTE_ALIGNMENT 0 -# define RB_ARCH_ALIGNMENT RB_ALIGNMENT -#else -# define RB_FORCE_8BYTE_ALIGNMENT 1 -# define RB_ARCH_ALIGNMENT 8U -#endif - -#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) - /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -311,10 +302,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) -#define TS_SHIFT 27 -#define TS_MASK ((1ULL << TS_SHIFT) - 1) -#define TS_DELTA_TEST (~TS_MASK) - static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; @@ -333,12 +320,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) #define RB_MISSED_MASK (3 << 30) -struct buffer_data_page { - u64 time_stamp; /* page time stamp */ - local_t commit; /* write committed index */ - unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ -}; - struct buffer_data_read_page { unsigned order; /* order of the page */ struct buffer_data_page *data; /* actual data, stored in this page */ @@ -398,11 +379,38 @@ static void free_buffer_page(struct buffer_page *bpage) } /* - * We need to fit the time_stamp delta into 27 bits. + * For best performance, allocate cpu buffer data cache line sized + * and per CPU. */ -static inline bool test_time_stamp(u64 delta) +#define alloc_cpu_buffer(cpu) (struct ring_buffer_per_cpu *) \ + kzalloc_node(ALIGN(sizeof(struct ring_buffer_per_cpu), \ + cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); + +#define alloc_cpu_page(cpu) (struct buffer_page *) \ + kzalloc_node(ALIGN(sizeof(struct buffer_page), \ + cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); + +static struct buffer_data_page *alloc_cpu_data(int cpu, int order) { - return !!(delta & TS_DELTA_TEST); + struct buffer_data_page *dpage; + struct page *page; + gfp_t mflags; + + /* + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. + */ + mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_COMP | __GFP_ZERO; + + page = alloc_pages_node(cpu_to_node(cpu), mflags, order); + if (!page) + return NULL; + + dpage = page_address(page); + rb_init_page(dpage); + + return dpage; } struct rb_irq_work { @@ -515,9 +523,11 @@ struct ring_buffer_per_cpu { unsigned int mapped; unsigned int user_mapped; /* user space mapping */ struct mutex mapping_lock; - unsigned long *subbuf_ids; /* ID to subbuf VA */ + struct buffer_page **subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; - struct ring_buffer_meta *ring_meta; + struct ring_buffer_cpu_meta *ring_meta; + + struct ring_buffer_remote *remote; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -541,6 +551,8 @@ struct trace_buffer { struct ring_buffer_per_cpu **buffers; + struct ring_buffer_remote *remote; + struct hlist_node node; u64 (*clock)(void); @@ -549,9 +561,9 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; + struct notifier_block flush_nb; - long last_text_delta; - long last_data_delta; + struct ring_buffer_meta *meta; unsigned int subbuf_size; unsigned int subbuf_order; @@ -588,16 +600,17 @@ int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); - trace_seq_printf(s, "\tfield: int overwrite;\t" + trace_seq_printf(s, "\tfield: char overwrite;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), 1, - (unsigned int)is_signed_type(long)); + (unsigned int)is_signed_type(char)); trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)buffer->subbuf_size, + (unsigned int)(buffer ? buffer->subbuf_size : + PAGE_SIZE - BUF_PAGE_HDR_SIZE), (unsigned int)is_signed_type(char)); return !trace_seq_has_overflowed(s); @@ -1271,7 +1284,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) rb_set_list_to_head(head->list.prev); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->head_buffer = (unsigned long)head->page; } } @@ -1355,6 +1368,13 @@ static inline void rb_inc_page(struct buffer_page **bpage) *bpage = list_entry(p, struct buffer_page, list); } +static inline void rb_dec_page(struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.prev); + + *bpage = list_entry(p, struct buffer_page, list); +} + static struct buffer_page * rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) { @@ -1569,7 +1589,7 @@ out_locked: static unsigned long rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) { - addr += sizeof(struct ring_buffer_meta) + + addr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_subbufs; return ALIGN(addr, subbuf_size); } @@ -1580,19 +1600,22 @@ rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) { int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE; - unsigned long ptr = buffer->range_addr_start; - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; + struct ring_buffer_meta *bmeta; + unsigned long ptr; int nr_subbufs; - if (!ptr) + bmeta = buffer->meta; + if (!bmeta) return NULL; + ptr = (unsigned long)bmeta + bmeta->buffers_offset; + meta = (struct ring_buffer_cpu_meta *)ptr; + /* When nr_pages passed in is zero, the first meta has already been initialized */ if (!nr_pages) { - meta = (struct ring_buffer_meta *)ptr; nr_subbufs = meta->nr_subbufs; } else { - meta = NULL; /* Include the reader page */ nr_subbufs = nr_pages + 1; } @@ -1624,7 +1647,7 @@ static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) } /* Return the start of subbufs given the meta pointer */ -static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) +static void *rb_subbufs_from_meta(struct ring_buffer_cpu_meta *meta) { int subbuf_size = meta->subbuf_size; unsigned long ptr; @@ -1640,7 +1663,7 @@ static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) */ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) { - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; unsigned long ptr; int subbuf_size; @@ -1666,13 +1689,77 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) } /* + * See if the existing memory contains a valid meta section. + * if so, use that, otherwise initialize it. + */ +static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size) +{ + unsigned long ptr = buffer->range_addr_start; + struct ring_buffer_meta *bmeta; + unsigned long total_size; + int struct_sizes; + + bmeta = (struct ring_buffer_meta *)ptr; + buffer->meta = bmeta; + + total_size = buffer->range_addr_end - buffer->range_addr_start; + + struct_sizes = sizeof(struct ring_buffer_cpu_meta); + struct_sizes |= sizeof(*bmeta) << 16; + + /* The first buffer will start word size after the meta page */ + ptr += sizeof(*bmeta); + ptr = ALIGN(ptr, sizeof(long)); + ptr += scratch_size; + + if (bmeta->magic != RING_BUFFER_META_MAGIC) { + pr_info("Ring buffer boot meta mismatch of magic\n"); + goto init; + } + + if (bmeta->struct_sizes != struct_sizes) { + pr_info("Ring buffer boot meta mismatch of struct size\n"); + goto init; + } + + if (bmeta->total_size != total_size) { + pr_info("Ring buffer boot meta mismatch of total size\n"); + goto init; + } + + if (bmeta->buffers_offset > bmeta->total_size) { + pr_info("Ring buffer boot meta mismatch of offset outside of total size\n"); + goto init; + } + + if (bmeta->buffers_offset != (void *)ptr - (void *)bmeta) { + pr_info("Ring buffer boot meta mismatch of first buffer offset\n"); + goto init; + } + + return true; + + init: + bmeta->magic = RING_BUFFER_META_MAGIC; + bmeta->struct_sizes = struct_sizes; + bmeta->total_size = total_size; + bmeta->buffers_offset = (void *)ptr - (void *)bmeta; + + /* Zero out the scratch pad */ + memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta)); + + return false; +} + +/* * See if the existing memory contains valid ring buffer data. * As the previous kernel must be the same as this kernel, all * the calculations (size of buffers and number of buffers) * must be the same. */ -static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, - struct trace_buffer *buffer, int nr_pages) +static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu, + struct trace_buffer *buffer, int nr_pages, + unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; @@ -1680,19 +1767,8 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, unsigned long buffers_end; int i; - /* Check the meta magic and meta struct size */ - if (meta->magic != RING_BUFFER_META_MAGIC || - meta->struct_size != sizeof(*meta)) { - pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu); + if (!subbuf_mask) return false; - } - - /* The subbuffer's size and number of subbuffers must match */ - if (meta->subbuf_size != subbuf_size || - meta->nr_subbufs != nr_pages + 1) { - pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu); - return false; - } buffers_start = meta->first_buffer; buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs); @@ -1712,6 +1788,8 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, subbuf = rb_subbufs_from_meta(meta); + bitmap_clear(subbuf_mask, 0, meta->nr_subbufs); + /* Is the meta buffers and the subbufs themselves have correct data? */ for (i = 0; i < meta->nr_subbufs; i++) { if (meta->buffers[i] < 0 || @@ -1725,13 +1803,19 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, return false; } + if (test_bit(meta->buffers[i], subbuf_mask)) { + pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu); + return false; + } + + set_bit(meta->buffers[i], subbuf_mask); subbuf = (void *)subbuf + subbuf_size; } return true; } -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf); +static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf); static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu, unsigned long long *timestamp, u64 *delta_ptr) @@ -1739,6 +1823,7 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu struct ring_buffer_event *event; u64 ts, delta; int events = 0; + int len; int e; *delta_ptr = 0; @@ -1746,9 +1831,12 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu ts = dpage->time_stamp; - for (e = 0; e < tail; e += rb_event_length(event)) { + for (e = 0; e < tail; e += len) { event = (struct ring_buffer_event *)(dpage->data + e); + len = rb_event_length(event); + if (len <= 0 || len > tail - e) + return -1; switch (event->type_len) { @@ -1798,38 +1886,133 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) /* If the meta data has been validated, now validate the events */ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; - struct buffer_page *head_page; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; + struct buffer_page *head_page, *orig_head, *orig_reader; unsigned long entry_bytes = 0; unsigned long entries = 0; int ret; + u64 ts; int i; if (!meta || !meta->head_buffer) return; + orig_head = head_page = cpu_buffer->head_page; + orig_reader = cpu_buffer->reader_page; + /* Do the reader page first */ - ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu); + ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu); if (ret < 0) { pr_info("Ring buffer reader page is invalid\n"); goto invalid; } entries += ret; - entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); - local_set(&cpu_buffer->reader_page->entries, ret); + entry_bytes += local_read(&orig_reader->page->commit); + local_set(&orig_reader->entries, ret); - head_page = cpu_buffer->head_page; + ts = head_page->page->time_stamp; + + /* + * Try to rewind the head so that we can read the pages which already + * read in the previous boot. + */ + if (head_page == cpu_buffer->tail_page) + goto skip_rewind; + + rb_dec_page(&head_page); + for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) { + + /* Rewind until tail (writer) page. */ + if (head_page == cpu_buffer->tail_page) + break; + + /* Ensure the page has older data than head. */ + if (ts < head_page->page->time_stamp) + break; + + ts = head_page->page->time_stamp; + /* Ensure the page has correct timestamp and some data. */ + if (!ts || rb_page_commit(head_page) == 0) + break; + + /* Stop rewind if the page is invalid. */ + ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); + if (ret < 0) + break; + + /* Recover the number of entries and update stats. */ + local_set(&head_page->entries, ret); + if (ret) + local_inc(&cpu_buffer->pages_touched); + entries += ret; + entry_bytes += rb_page_commit(head_page); + } + if (i) + pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i); - /* If both the head and commit are on the reader_page then we are done. */ - if (head_page == cpu_buffer->reader_page && - head_page == cpu_buffer->commit_page) + /* The last rewound page must be skipped. */ + if (head_page != orig_head) + rb_inc_page(&head_page); + + /* + * If the ring buffer was rewound, then inject the reader page + * into the location just before the original head page. + */ + if (head_page != orig_head) { + struct buffer_page *bpage = orig_head; + + rb_dec_page(&bpage); + /* + * Insert the reader_page before the original head page. + * Since the list encode RB_PAGE flags, general list + * operations should be avoided. + */ + cpu_buffer->reader_page->list.next = &orig_head->list; + cpu_buffer->reader_page->list.prev = orig_head->list.prev; + orig_head->list.prev = &cpu_buffer->reader_page->list; + bpage->list.next = &cpu_buffer->reader_page->list; + + /* Make the head_page the reader page */ + cpu_buffer->reader_page = head_page; + bpage = head_page; + rb_inc_page(&head_page); + head_page->list.prev = bpage->list.prev; + rb_dec_page(&bpage); + bpage->list.next = &head_page->list; + rb_set_list_to_head(&bpage->list); + cpu_buffer->pages = &head_page->list; + + cpu_buffer->head_page = head_page; + meta->head_buffer = (unsigned long)head_page->page; + + /* Reset all the indexes */ + bpage = cpu_buffer->reader_page; + meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = 0; + + for (i = 1, bpage = head_page; i < meta->nr_subbufs; + i++, rb_inc_page(&bpage)) { + meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = i; + } + + /* We'll restart verifying from orig_head */ + head_page = orig_head; + } + + skip_rewind: + /* If the commit_buffer is the reader page, update the commit page */ + if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { + cpu_buffer->commit_page = cpu_buffer->reader_page; + /* Nothing more to do, the only page is the reader page */ goto done; + } /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { - /* Reader page has already been done */ - if (head_page == cpu_buffer->reader_page) + /* The original reader page has already been checked/counted. */ + if (head_page == orig_reader) continue; ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); @@ -1838,9 +2021,14 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->cpu); goto invalid; } + + /* If the buffer has content, update pages_touched */ + if (ret) + local_inc(&cpu_buffer->pages_touched); + entries += ret; entry_bytes += local_read(&head_page->page->commit); - local_set(&cpu_buffer->head_page->entries, ret); + local_set(&head_page->entries, ret); if (head_page == cpu_buffer->commit_page) break; @@ -1874,40 +2062,35 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) } } -/* Used to calculate data delta */ -static char rb_data_ptr[] = ""; - -#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr) -#define THIS_DATA_PTR ((unsigned long)rb_data_ptr) - -static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) -{ - meta->text_addr = THIS_TEXT_PTR; - meta->data_addr = THIS_DATA_PTR; -} - -static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) +static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size) { - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; + unsigned long *subbuf_mask; unsigned long delta; void *subbuf; + bool valid = false; int cpu; int i; + /* Create a mask to test the subbuf array */ + subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); + /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ + + if (rb_meta_init(buffer, scratch_size)) + valid = true; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); - if (rb_meta_valid(meta, cpu, buffer, nr_pages)) { + if (valid && rb_cpu_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; meta->first_buffer += delta; meta->head_buffer += delta; meta->commit_buffer += delta; - buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr; - buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr; continue; } @@ -1918,16 +2101,12 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) memset(meta, 0, next_meta - (void *)meta); - meta->magic = RING_BUFFER_META_MAGIC; - meta->struct_size = sizeof(*meta); - meta->nr_subbufs = nr_pages + 1; meta->subbuf_size = PAGE_SIZE; subbuf = rb_subbufs_from_meta(meta); meta->first_buffer = (unsigned long)subbuf; - rb_meta_init_text_addr(meta); /* * The buffers[] array holds the order of the sub-buffers @@ -1943,12 +2122,13 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf += meta->subbuf_size; } } + bitmap_free(subbuf_mask); } static void *rbm_start(struct seq_file *m, loff_t *pos) { struct ring_buffer_per_cpu *cpu_buffer = m->private; - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val; if (!meta) @@ -1973,7 +2153,7 @@ static void *rbm_next(struct seq_file *m, void *v, loff_t *pos) static int rbm_show(struct seq_file *m, void *v) { struct ring_buffer_per_cpu *cpu_buffer = m->private; - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val = (unsigned long)v; if (val == 1) { @@ -2022,7 +2202,7 @@ int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, in static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; if (meta->head_buffer == (unsigned long)bpage->page) cpu_buffer->head_page = bpage; @@ -2033,14 +2213,48 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, } } +static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu) +{ + struct ring_buffer_desc *desc, *end; + size_t len; + int i; + + if (!trace_desc) + return NULL; + + if (cpu >= trace_desc->nr_cpus) + return NULL; + + end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len); + desc = __first_ring_buffer_desc(trace_desc); + len = struct_size(desc, page_va, desc->nr_page_va); + desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu)); + + if (desc < end && desc->cpu == cpu) + return desc; + + /* Missing CPUs, need to linear search */ + for_each_ring_buffer_desc(desc, i, trace_desc) { + if (desc->cpu == cpu) + return desc; + } + + return NULL; +} + +static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, unsigned int page_id) +{ + return page_id >= desc->nr_page_va ? NULL : (void *)desc->page_va[page_id]; +} + static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { struct trace_buffer *buffer = cpu_buffer->buffer; - struct ring_buffer_meta *meta = NULL; + struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; - gfp_t mflags; + struct ring_buffer_desc *desc = NULL; long i; /* @@ -2055,13 +2269,6 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, return -ENOMEM; /* - * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is not - * destabilized. - */ - mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL; - - /* * If a user thread allocates too much, and si_mem_available() * reports there's enough memory, even though there is not. * Make sure the OOM killer kills this thread. This can happen @@ -2076,11 +2283,15 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (buffer->range_addr_start) meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); + if (buffer->remote) { + desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu); + if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1))) + return -EINVAL; + } + for (i = 0; i < nr_pages; i++) { - struct page *page; - bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - mflags, cpu_to_node(cpu_buffer->cpu)); + bpage = alloc_cpu_page(cpu_buffer->cpu); if (!bpage) goto free_pages; @@ -2102,14 +2313,21 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; bpage->id = i + 1; + } else if (desc) { + void *p = ring_buffer_desc_page(desc, i + 1); + + if (WARN_ON(!p)) + goto free_pages; + + bpage->page = p; + bpage->range = 1; /* bpage->page can't be freed */ + bpage->id = i + 1; + cpu_buffer->subbuf_ids[i + 1] = bpage; } else { - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - mflags | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) + int order = cpu_buffer->buffer->subbuf_order; + bpage->page = alloc_cpu_data(cpu_buffer->cpu, order); + if (!bpage->page) goto free_pages; - bpage->page = page_address(page); - rb_init_page(bpage->page); } bpage->order = cpu_buffer->buffer->subbuf_order; @@ -2160,14 +2378,12 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_meta *meta; + struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = + alloc_cpu_buffer(cpu); + struct ring_buffer_cpu_meta *meta; struct buffer_page *bpage; - struct page *page; int ret; - cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); if (!cpu_buffer) return NULL; @@ -2183,10 +2399,9 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); mutex_init(&cpu_buffer->mapping_lock); - bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); + bpage = alloc_cpu_page(cpu); if (!bpage) - goto fail_free_buffer; + return NULL; rb_check_bpage(cpu_buffer, bpage); @@ -2205,14 +2420,35 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) if (cpu_buffer->ring_meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; + } else if (buffer->remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu); + + if (!desc) + goto fail_free_reader; + + cpu_buffer->remote = buffer->remote; + cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va; + cpu_buffer->nr_pages = nr_pages; + cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, + sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL); + if (!cpu_buffer->subbuf_ids) + goto fail_free_reader; + + /* Remote buffers are read-only and immutable */ + atomic_inc(&cpu_buffer->record_disabled); + atomic_inc(&cpu_buffer->resize_disabled); + + bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id); + if (!bpage->page) + goto fail_free_reader; + + bpage->range = 1; + cpu_buffer->subbuf_ids[0] = bpage; } else { - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) + int order = cpu_buffer->buffer->subbuf_order; + bpage->page = alloc_cpu_data(cpu, order); + if (!bpage->page) goto fail_free_reader; - bpage->page = page_address(page); - rb_init_page(bpage->page); } INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -2252,13 +2488,11 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_head_page_activate(cpu_buffer); } - return cpu_buffer; + return_ptr(cpu_buffer); fail_free_reader: free_buffer_page(cpu_buffer->reader_page); - fail_free_buffer: - kfree(cpu_buffer); return NULL; } @@ -2269,6 +2503,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) irq_work_sync(&cpu_buffer->irq_work.work); + if (cpu_buffer->remote) + kfree(cpu_buffer->subbuf_ids); + free_buffer_page(cpu_buffer->reader_page); if (head) { @@ -2287,12 +2524,24 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } +/* Stop recording on a persistent buffer and flush cache if needed. */ +static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data) +{ + struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb); + + ring_buffer_record_off(buffer); + arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end); + return NOTIFY_DONE; +} + static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, - struct lock_class_key *key) + unsigned long scratch_size, + struct lock_class_key *key, + struct ring_buffer_remote *remote) { - struct trace_buffer *buffer; + struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; int subbuf_size; int bsize; @@ -2306,7 +2555,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, return NULL; if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) - goto fail_free_buffer; + return NULL; buffer->subbuf_order = order; subbuf_size = (PAGE_SIZE << order); @@ -2330,12 +2579,27 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; + cpu = raw_smp_processor_id(); + /* If start/end are specified, then that overrides size */ if (start && end) { + unsigned long buffers_start; unsigned long ptr; int n; - size = end - start; + /* Make sure that start is word aligned */ + start = ALIGN(start, sizeof(long)); + + /* scratch_size needs to be aligned too */ + scratch_size = ALIGN(scratch_size, sizeof(long)); + + /* Subtract the buffer meta data and word aligned */ + buffers_start = start + sizeof(struct ring_buffer_cpu_meta); + buffers_start = ALIGN(buffers_start, sizeof(long)); + buffers_start += scratch_size; + + /* Calculate the size for the per CPU data */ + size = end - buffers_start; size = size / nr_cpu_ids; /* @@ -2345,7 +2609,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, * needed, plus account for the integer array index that * will be appended to the meta data. */ - nr_pages = (size - sizeof(struct ring_buffer_meta)) / + nr_pages = (size - sizeof(struct ring_buffer_cpu_meta)) / (subbuf_size + sizeof(int)); /* Need at least two pages plus the reader page */ if (nr_pages < 3) @@ -2353,8 +2617,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, again: /* Make sure that the size fits aligned */ - for (n = 0, ptr = start; n < nr_cpu_ids; n++) { - ptr += sizeof(struct ring_buffer_meta) + + for (n = 0, ptr = buffers_start; n < nr_cpu_ids; n++) { + ptr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_pages; ptr = ALIGN(ptr, subbuf_size); ptr += subbuf_size * nr_pages; @@ -2371,7 +2635,16 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, buffer->range_addr_start = start; buffer->range_addr_end = end; - rb_range_meta_init(buffer, nr_pages); + rb_range_meta_init(buffer, nr_pages, scratch_size); + } else if (remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu); + + buffer->remote = remote; + /* The writer is remote. This ring-buffer is read-only */ + atomic_inc(&buffer->record_disabled); + nr_pages = desc->nr_page_va - 1; + if (nr_pages < 2) + goto fail_free_buffers; } else { /* need at least two pages */ @@ -2380,7 +2653,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, nr_pages = 2; } - cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) @@ -2392,7 +2664,13 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); - return buffer; + /* Persistent ring buffer needs to flush cache before reboot. */ + if (start && end) { + buffer->flush_nb.notifier_call = rb_flush_buffer_cb; + atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb); + } + + return_ptr(buffer); fail_free_buffers: for_each_buffer_cpu(buffer, cpu) { @@ -2404,8 +2682,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - fail_free_buffer: - kfree(buffer); return NULL; } @@ -2424,7 +2700,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ - return alloc_buffer(size, flags, 0, 0, 0,key); + return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); @@ -2436,6 +2712,7 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); * @order: sub-buffer order * @start: start of allocated range * @range_size: size of allocated range + * @scratch_size: size of scratch area (for preallocated memory buffers) * @key: ring buffer reader_lock_key. * * Currently the only flag that is available is the RB_FL_OVERWRITE @@ -2446,32 +2723,40 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long range_size, + unsigned long scratch_size, struct lock_class_key *key) { - return alloc_buffer(size, flags, order, start, start + range_size, key); + return alloc_buffer(size, flags, order, start, start + range_size, + scratch_size, key, NULL); } /** - * ring_buffer_last_boot_delta - return the delta offset from last boot - * @buffer: The buffer to return the delta from - * @text: Return text delta - * @data: Return data delta - * - * Returns: The true if the delta is non zero + * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote + * @remote: Contains a description of the ring-buffer pages and remote callbacks. + * @key: ring buffer reader_lock_key. */ -bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, - long *data) +struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote, + struct lock_class_key *key) { - if (!buffer) - return false; + return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote); +} - if (!buffer->last_text_delta) - return false; +void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) +{ + struct ring_buffer_meta *meta; + void *ptr; - *text = buffer->last_text_delta; - *data = buffer->last_data_delta; + if (!buffer || !buffer->meta) + return NULL; - return true; + meta = buffer->meta; + + ptr = (void *)ALIGN((unsigned long)meta + sizeof(*meta), sizeof(long)); + + if (size) + *size = (void *)meta + meta->buffers_offset - ptr; + + return ptr; } /** @@ -2483,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer) { int cpu; + if (buffer->range_addr_start && buffer->range_addr_end) + atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb); + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); irq_work_sync(&buffer->irq_work.work); @@ -2771,6 +3059,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages < 2) nr_pages = 2; + /* + * Keep CPUs from coming online while resizing to synchronize + * with new per CPU buffers being created. + */ + guard(cpus_read_lock)(); + /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->resizing); @@ -2815,7 +3109,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cond_resched(); } - cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary @@ -2855,7 +3148,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cpu_buffer->nr_pages_to_update = 0; } - cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; @@ -2883,8 +3175,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, goto out_err; } - cpus_read_lock(); - /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); @@ -2903,7 +3193,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } cpu_buffer->nr_pages_to_update = 0; - cpus_read_unlock(); } out: @@ -2948,6 +3237,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, list) { list_del_init(&bpage->list); free_buffer_page(bpage); + + cond_resched(); } } out_err_unlock: @@ -3082,7 +3373,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) } /* Return the index into the sub-buffers for a given sub-buffer */ -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) +static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf) { void *subbuf_array; @@ -3094,7 +3385,7 @@ static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long old_head = (unsigned long)next_page->page; unsigned long new_head; @@ -3111,7 +3402,7 @@ static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *reader) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; void *old_reader = cpu_buffer->reader_page->page; void *new_reader = reader->page; int id; @@ -3740,7 +4031,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page; } /* add barrier to keep gcc from optimizing too much */ @@ -3822,19 +4113,36 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer) rb_end_commit(cpu_buffer); } +static bool +rb_irq_work_queue(struct rb_irq_work *irq_work) +{ + int cpu; + + /* irq_work_queue_on() is not NMI-safe */ + if (unlikely(in_nmi())) + return irq_work_queue(&irq_work->work); + + /* + * If CPU isolation is not active, cpu is always the current + * CPU, and the following is equivallent to irq_work_queue(). + */ + cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE); + return irq_work_queue_on(&irq_work->work, cpu); +} + static __always_inline void rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&buffer->irq_work.work); + rb_irq_work_queue(&buffer->irq_work); } if (cpu_buffer->irq_work.waiters_pending) { cpu_buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); + rb_irq_work_queue(&cpu_buffer->irq_work); } if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched)) @@ -3854,7 +4162,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->irq_work.wakeup_full = true; cpu_buffer->irq_work.full_waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); + rb_irq_work_queue(&cpu_buffer->irq_work); } #ifdef CONFIG_RING_BUFFER_RECORD_RECURSION @@ -4043,7 +4351,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static const char *show_irq_str(int bits) { - const char *type[] = { + static const char * type[] = { ".", // 0 "s", // 1 "h", // 2 @@ -4221,18 +4529,20 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta); if (ret < 0) { if (delta < ts) { - buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", - cpu_buffer->cpu, ts, delta); + buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n", + cpu_buffer->cpu, ts, delta, + cpu_buffer->buffer->clock); goto out; } } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { - buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", + buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\ntrace clock:%pS", cpu_buffer->cpu, ts + info->delta, info->ts, info->delta, info->before, info->after, - full ? " (full)" : "", show_interrupt_level()); + full ? " (full)" : "", show_interrupt_level(), + cpu_buffer->buffer->clock); } out: atomic_dec(this_cpu_ptr(&checking)); @@ -4398,8 +4708,13 @@ rb_reserve_next_event(struct trace_buffer *buffer, int nr_loops = 0; int add_ts_default; - /* ring buffer does cmpxchg, make sure it is safe in NMI context */ - if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && + /* + * ring buffer does cmpxchg as well as atomic64 operations + * (which some archs use locking for atomic64), make sure this + * is safe in NMI context + */ + if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || + IS_ENABLED(CONFIG_GENERIC_ATOMIC64)) && (unlikely(in_nmi()))) { return NULL; } @@ -4601,10 +4916,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); rb_decrement_entry(cpu_buffer, event); - if (rb_try_to_discard(cpu_buffer, event)) - goto out; - - out: + rb_try_to_discard(cpu_buffer, event); rb_end_commit(cpu_buffer); trace_recursive_unlock(cpu_buffer); @@ -4637,26 +4949,26 @@ int ring_buffer_write(struct trace_buffer *buffer, int ret = -EBUSY; int cpu; - preempt_disable_notrace(); + guard(preempt_notrace)(); if (atomic_read(&buffer->record_disabled)) - goto out; + return -EBUSY; cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -EBUSY; cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->record_disabled)) - goto out; + return -EBUSY; if (length > buffer->max_data_size) - goto out; + return -EBUSY; if (unlikely(trace_recursive_lock(cpu_buffer))) - goto out; + return -EBUSY; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) @@ -4674,48 +4986,26 @@ int ring_buffer_write(struct trace_buffer *buffer, out_unlock: trace_recursive_unlock(cpu_buffer); - - out: - preempt_enable_notrace(); - return ret; } EXPORT_SYMBOL_GPL(ring_buffer_write); -static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. + */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) { - struct buffer_page *reader = cpu_buffer->reader_page; - struct buffer_page *head = rb_set_head_page(cpu_buffer); - struct buffer_page *commit = cpu_buffer->commit_page; - - /* In case of error, head will be NULL */ - if (unlikely(!head)) - return true; - - /* Reader should exhaust content in reader page */ - if (reader->read != rb_page_size(reader)) - return false; - - /* - * If writers are committing on the reader page, knowing all - * committed content has been read, the ring buffer is empty. - */ - if (commit == reader) - return true; - - /* - * If writers are committing on a page other than reader page - * and head page, there should always be content to read. - */ - if (commit != head) - return false; + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} - /* - * Writers are committing on the head page, we just need - * to care about there're committed data, and the reader will - * swap reader page with head page when it is to read data. - */ - return rb_page_commit(commit) == 0; +static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +{ + return !rb_num_of_entries(cpu_buffer); } /** @@ -4820,6 +5110,24 @@ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer) } /** + * ring_buffer_record_is_on_cpu - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * @cpu: The CPU to test if the ring buffer can write too + * + * Returns true if the ring buffer is in a state that it accepts writes + * for a particular CPU. + */ +bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + cpu_buffer = buffer->buffers[cpu]; + + return ring_buffer_record_is_set_on(buffer) && + !atomic_read(&cpu_buffer->record_disabled); +} + +/** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop @@ -4861,19 +5169,6 @@ void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); -/* - * The total entries in the ring buffer is the running counter - * of entries entered into the ring buffer, minus the sum of - * the entries read from the ring buffer and the number of - * entries that were overwritten. - */ -static inline unsigned long -rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) -{ - return local_read(&cpu_buffer->entries) - - (local_read(&cpu_buffer->overrun) + cpu_buffer->read); -} - /** * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer * @buffer: The ring buffer @@ -5075,14 +5370,66 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_overruns); +static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries)); + local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun)); + local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched)); + local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost)); + + return rb_num_of_entries(cpu_buffer); +} + +static void rb_update_remote_head(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *next, *orig; + int retry = 3; + + orig = next = cpu_buffer->head_page; + rb_inc_page(&next); + + /* Run after the writer */ + while (cpu_buffer->head_page->page->time_stamp > next->page->time_stamp) { + rb_inc_page(&next); + + rb_list_head_clear(cpu_buffer->head_page->list.prev); + rb_inc_page(&cpu_buffer->head_page); + rb_set_list_to_head(cpu_buffer->head_page->list.prev); + + if (cpu_buffer->head_page == orig) { + if (WARN_ON_ONCE(!(--retry))) + return; + } + } + + orig = cpu_buffer->commit_page = cpu_buffer->head_page; + retry = 3; + + while (cpu_buffer->commit_page->page->time_stamp < next->page->time_stamp) { + rb_inc_page(&next); + rb_inc_page(&cpu_buffer->commit_page); + + if (cpu_buffer->commit_page == orig) { + if (WARN_ON_ONCE(!(--retry))) + return; + } + } +} + static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + if (cpu_buffer->remote) { + rb_read_remote_meta_page(cpu_buffer); + rb_update_remote_head(cpu_buffer); + } + /* Iterator usage is expected to have record disabled */ iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; iter->next_event = iter->head; + iter->missed_events = 0; iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; @@ -5229,7 +5576,65 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, } static struct buffer_page * -rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *new_reader, *prev_reader, *prev_head, *new_head, *last; + + if (!rb_read_remote_meta_page(cpu_buffer)) + return NULL; + + /* More to read on the reader page */ + if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) { + if (!cpu_buffer->reader_page->read) + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + return cpu_buffer->reader_page; + } + + prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu, + cpu_buffer->remote->priv)); + /* nr_pages doesn't include the reader page */ + if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages)) + return NULL; + + new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(prev_reader == new_reader); + + prev_head = new_reader; /* New reader was also the previous head */ + new_head = prev_head; + rb_inc_page(&new_head); + last = prev_head; + rb_dec_page(&last); + + /* Clear the old HEAD flag */ + rb_list_head_clear(cpu_buffer->head_page->list.prev); + + prev_reader->list.next = prev_head->list.next; + prev_reader->list.prev = prev_head->list.prev; + + /* Swap prev_reader with new_reader */ + last->list.next = &prev_reader->list; + new_head->list.prev = &prev_reader->list; + + new_reader->list.prev = &new_reader->list; + new_reader->list.next = &new_head->list; + + /* Reactivate the HEAD flag */ + rb_set_list_to_head(&last->list); + + cpu_buffer->head_page = new_head; + cpu_buffer->reader_page = new_reader; + cpu_buffer->pages = &new_head->list; + cpu_buffer->read_stamp = new_reader->page->time_stamp; + cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events; + + return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL; +} + +static struct buffer_page * +__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); @@ -5278,7 +5683,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->real_end = 0; spin: @@ -5321,7 +5725,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * moving it. The page before the header page has the * flag bit '1' set if it is pointing to the page we want. * but if the writer is in the process of moving it - * than it will be '2' or already moved '0'. + * then it will be '2' or already moved '0'. */ ret = rb_head_page_replace(reader, cpu_buffer->reader_page); @@ -5400,6 +5804,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) return reader; } +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->remote ? __rb_get_reader_page_from_remote(cpu_buffer) : + __rb_get_reader_page(cpu_buffer); +} + static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_event *event; @@ -5698,10 +6109,7 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, */ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter) { - bool ret = iter->missed_events != 0; - - iter->missed_events = 0; - return ret; + return iter->missed_events != 0; } EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped); @@ -5782,24 +6190,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, EXPORT_SYMBOL_GPL(ring_buffer_consume); /** - * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer + * ring_buffer_read_start - start a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * @flags: gfp flags to use for memory allocation * - * This performs the initial preparations necessary to iterate - * through the buffer. Memory is allocated, buffer resizing - * is disabled, and the iterator pointer is returned to the caller. + * This creates an iterator to allow non-consuming iteration through + * the buffer. If the buffer is disabled for writing, it will produce + * the same information each time, but if the buffer is still writing + * then the first hit of a write will cause the iteration to stop. * - * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_read_prepare_sync. - * Afterwards, ring_buffer_read_start is invoked to get things going - * for real. - * - * This overall must be paired with ring_buffer_read_finish. + * Must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) +ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -5807,7 +6211,7 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - iter = kzalloc(sizeof(*iter), flags); + iter = kzalloc_obj(*iter, flags); if (!iter) return NULL; @@ -5825,51 +6229,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) atomic_inc(&cpu_buffer->resize_disabled); - return iter; -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); - -/** - * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls - * - * All previously invoked ring_buffer_read_prepare calls to prepare - * iterators will be synchronized. Afterwards, read_buffer_read_start - * calls on those iterators are allowed. - */ -void -ring_buffer_read_prepare_sync(void) -{ - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); - -/** - * ring_buffer_read_start - start a non consuming read of the buffer - * @iter: The iterator returned by ring_buffer_read_prepare - * - * This finalizes the startup of an iteration through the buffer. - * The iterator comes from a call to ring_buffer_read_prepare and - * an intervening ring_buffer_read_prepare_sync must have been - * performed. - * - * Must be paired with ring_buffer_read_finish. - */ -void -ring_buffer_read_start(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags; - - if (!iter) - return; - - cpu_buffer = iter->cpu_buffer; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -5906,7 +6271,7 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter) unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - + iter->missed_events = 0; rb_advance_iter(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); @@ -5950,6 +6315,39 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +/* + * When the buffer is memory mapped to user space, each sub buffer + * has a unique id that is used by the meta data to tell the user + * where the current reader page is. + * + * For a normal allocated ring buffer, the id is saved in the buffer page + * id field, and updated via this function. + * + * But for a fixed memory mapped buffer, the id is already assigned for + * fixed memory ordering in the memory layout and can not be used. Instead + * the index of where the page lies in the memory layout is used. + * + * For the normal pages, set the buffer page id with the passed in @id + * value and return that. + * + * For fixed memory mapped pages, get the page index in the memory layout + * and return that as the id. + */ +static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage, int id) +{ + /* + * For boot buffers, the id is the index, + * otherwise, set the buffer page with this id + */ + if (cpu_buffer->ring_meta) + id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page); + else + bpage->id = id; + + return id; +} + static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; @@ -5958,15 +6356,19 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) return; meta->reader.read = cpu_buffer->reader_page->read; - meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, + cpu_buffer->reader_page->id); + meta->reader.lost_events = cpu_buffer->lost_events; meta->entries = local_read(&cpu_buffer->entries); meta->overrun = local_read(&cpu_buffer->overrun); meta->read = cpu_buffer->read; + meta->pages_lost = local_read(&cpu_buffer->pages_lost); + meta->pages_touched = local_read(&cpu_buffer->pages_touched); /* Some archs do not have data cache coherency between kernel and user-space */ - flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); + flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE); } static void @@ -5974,6 +6376,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *page; + if (cpu_buffer->remote) { + if (!cpu_buffer->remote->reset) + return; + + cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv); + rb_read_remote_meta_page(cpu_buffer); + + /* Read related values, not covered by the meta-page */ + local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; + cpu_buffer->last_overrun = 0; + cpu_buffer->reader_page->read = 0; + + return; + } + rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page @@ -6019,7 +6438,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->mapped) { rb_update_meta_page(cpu_buffer); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = meta->head_buffer; } } @@ -6028,21 +6447,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) /* Must have disabled the cpu buffer then done a synchronize_rcu */ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned long flags; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) - goto out; + return; arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); arch_spin_unlock(&cpu_buffer->lock); - - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** @@ -6053,7 +6467,6 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_meta *meta; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; @@ -6072,11 +6485,6 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); - /* Make sure persistent meta now uses this buffer's addresses */ - meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); - if (meta) - rb_meta_init_text_addr(meta); - mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -6091,7 +6499,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_meta *meta; int cpu; /* prevent another thread from changing buffer sizes */ @@ -6119,11 +6526,6 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) reset_disabled_cpu_buffer(cpu_buffer); - /* Make sure persistent meta now uses this buffer's addresses */ - meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); - if (meta) - rb_meta_init_text_addr(meta); - atomic_dec(&cpu_buffer->record_disabled); atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } @@ -6221,6 +6623,46 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); +int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + if (rb_read_remote_meta_page(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + + return 0; + } + + guard(cpus_read_lock)(); + + /* + * Make sure all the ring buffers are up to date before we start reading + * them. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + rb_read_remote_meta_page(cpu_buffer); + } + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + if (rb_num_of_entries(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + } + + return 0; +} + #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers @@ -6242,37 +6684,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) - goto out; + return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; /* It's up to the callers to not try to swap mapped buffers */ - if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { - ret = -EBUSY; - goto out; - } + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) + return -EBUSY; /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) - goto out; + return -EINVAL; if (buffer_a->subbuf_order != buffer_b->subbuf_order) - goto out; - - ret = -EAGAIN; + return -EINVAL; if (atomic_read(&buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&buffer_b->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_b->record_disabled)) - goto out; + return -EAGAIN; /* * We can't do a synchronize_rcu here because this @@ -6309,7 +6747,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); -out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); @@ -6337,12 +6774,11 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_read_page *bpage = NULL; unsigned long flags; - struct page *page; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-ENODEV); - bpage = kzalloc(sizeof(*bpage), GFP_KERNEL); + bpage = kzalloc_obj(*bpage); if (!bpage) return ERR_PTR(-ENOMEM); @@ -6359,22 +6795,16 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); - if (bpage->data) - goto out; - - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) { - kfree(bpage); - return ERR_PTR(-ENOMEM); + if (bpage->data) { + rb_init_page(bpage->data); + } else { + bpage->data = alloc_cpu_data(cpu, cpu_buffer->buffer->subbuf_order); + if (!bpage->data) { + kfree(bpage); + return ERR_PTR(-ENOMEM); + } } - bpage->data = page_address(page); - - out: - rb_init_page(bpage->data); - return bpage; } EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); @@ -6468,38 +6898,38 @@ int ring_buffer_read_page(struct trace_buffer *buffer, struct buffer_data_page *bpage; struct buffer_page *reader; unsigned long missed_events; - unsigned long flags; unsigned int commit; unsigned int read; u64 save_timestamp; - int ret = -1; + bool force_memcpy; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -1; /* * If len is not big enough to hold the page header, then * we can not copy anything. */ if (len <= BUF_PAGE_HDR_SIZE) - goto out; + return -1; len -= BUF_PAGE_HDR_SIZE; if (!data_page || !data_page->data) - goto out; + return -1; + if (data_page->order != buffer->subbuf_order) - goto out; + return -1; bpage = data_page->data; if (!bpage) - goto out; + return -1; - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); reader = rb_get_reader_page(cpu_buffer); if (!reader) - goto out_unlock; + return -1; event = rb_reader_event(cpu_buffer); @@ -6509,6 +6939,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; + force_memcpy = cpu_buffer->mapped || cpu_buffer->remote; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -6518,7 +6950,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, */ if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || - cpu_buffer->mapped) { + force_memcpy) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -6533,7 +6965,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (full && (!read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page)) - goto out_unlock; + return -1; if (len > (commit - read)) len = (commit - read); @@ -6542,7 +6974,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, size = rb_event_ts_length(event); if (len < size) - goto out_unlock; + return -1; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; @@ -6600,7 +7032,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (reader->real_end) local_set(&bpage->commit, reader->real_end); } - ret = read; cpu_buffer->lost_events = 0; @@ -6627,11 +7058,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (commit < buffer->subbuf_size) memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); - out_unlock: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - out: - return ret; + return read; } EXPORT_SYMBOL_GPL(ring_buffer_read_page); @@ -6724,7 +7151,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) old_size = buffer->subbuf_size; /* prevent another thread from changing buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); atomic_inc(&buffer->record_disabled); /* Make sure all commits have finished */ @@ -6829,7 +7256,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); return 0; @@ -6838,7 +7264,6 @@ error: buffer->subbuf_size = old_size; atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -6881,29 +7306,35 @@ static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) } static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long *subbuf_ids) + struct buffer_page **subbuf_ids) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; struct buffer_page *first_subbuf, *subbuf; + int cnt = 0; int id = 0; - subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; - cpu_buffer->reader_page->id = id++; + id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id); + subbuf_ids[id++] = cpu_buffer->reader_page; + cnt++; first_subbuf = subbuf = rb_set_head_page(cpu_buffer); do { + id = rb_page_id(cpu_buffer, subbuf, id); + if (WARN_ON(id >= nr_subbufs)) break; - subbuf_ids[id] = (unsigned long)subbuf->page; - subbuf->id = id; + subbuf_ids[id] = subbuf; rb_inc_page(&subbuf); id++; + cnt++; } while (subbuf != first_subbuf); - /* install subbuf ID to kern VA translation */ + WARN_ON(cnt != nr_subbufs); + + /* install subbuf ID to bpage translation */ cpu_buffer->subbuf_ids = subbuf_ids; meta->meta_struct_len = sizeof(*meta); @@ -6994,7 +7425,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, { unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff; unsigned int subbuf_pages, subbuf_order; - struct page **pages; + struct page **pages __free(kfree) = NULL; int p = 0, s = 0; int err; @@ -7031,7 +7462,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, nr_pages = nr_vma_pages; - pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + pages = kzalloc_objs(*pages, nr_pages); if (!pages) return -ENOMEM; @@ -7059,13 +7490,15 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, } while (p < nr_pages) { - struct page *page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); + struct buffer_page *subbuf; + struct page *page; int off = 0; - if (WARN_ON_ONCE(s >= nr_subbufs)) { - err = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(s >= nr_subbufs)) + return -EINVAL; + + subbuf = cpu_buffer->subbuf_ids[s]; + page = virt_to_page((void *)subbuf->page); for (; off < (1 << (subbuf_order)); off++, page++) { if (p >= nr_pages) @@ -7078,9 +7511,6 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); -out: - kfree(pages); - return err; } #else @@ -7095,37 +7525,36 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, struct vm_area_struct *vma) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags, *subbuf_ids; - int err = 0; + struct buffer_page **subbuf_ids; + unsigned long flags; + int err; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (cpu_buffer->user_mapped) { err = __rb_map_vma(cpu_buffer, vma); if (!err) err = __rb_inc_dec_mapped(cpu_buffer, true); - mutex_unlock(&cpu_buffer->mapping_lock); return err; } /* prevent another thread from changing buffer/sub-buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); err = rb_alloc_meta_page(cpu_buffer); if (err) - goto unlock; + return err; - /* subbuf_ids include the reader while nr_pages does not */ + /* subbuf_ids includes the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); - err = -ENOMEM; - goto unlock; + return -ENOMEM; } atomic_inc(&cpu_buffer->resize_disabled); @@ -7150,37 +7579,53 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, kfree(cpu_buffer->subbuf_ids); cpu_buffer->subbuf_ids = NULL; rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); } -unlock: - mutex_unlock(&buffer->mutex); - mutex_unlock(&cpu_buffer->mapping_lock); - return err; } +/* + * This is called when a VMA is duplicated (e.g., on fork()) to increment + * the user_mapped counter without remapping pages. + */ +void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (WARN_ON(!cpumask_test_cpu(cpu, buffer->cpumask))) + return; + + cpu_buffer = buffer->buffers[cpu]; + + guard(mutex)(&cpu_buffer->mapping_lock); + + if (cpu_buffer->user_mapped) + __rb_inc_dec_mapped(cpu_buffer, true); + else + WARN(1, "Unexpected buffer stat, it should be mapped"); +} + int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int err = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { - err = -ENODEV; - goto out; + return -ENODEV; } else if (cpu_buffer->user_mapped > 1) { __rb_inc_dec_mapped(cpu_buffer, false); - goto out; + return 0; } - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the last user space mapping */ @@ -7195,12 +7640,7 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) rb_free_meta_page(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); - mutex_unlock(&buffer->mutex); - -out: - mutex_unlock(&cpu_buffer->mapping_lock); - - return err; + return 0; } int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) @@ -7234,6 +7674,10 @@ consume: goto out; } + /* Did the reader catch up with the writer? */ + if (cpu_buffer->reader_page == cpu_buffer->commit_page) + goto out; + reader = rb_get_reader_page(cpu_buffer); if (WARN_ON(!reader)) goto out; @@ -7241,8 +7685,8 @@ consume: /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; - if (cpu_buffer->reader_page != cpu_buffer->commit_page) { - if (missed_events) { + if (missed_events) { + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { struct buffer_data_page *bpage = reader->page; unsigned int commit; /* @@ -7263,13 +7707,23 @@ consume: local_add(RB_MISSED_STORED, &bpage->commit); } local_add(RB_MISSED_EVENTS, &bpage->commit); + } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page, + "Reader on commit with %ld missed events", + missed_events)) { + /* + * There shouldn't be any missed events if the tail_page + * is on the reader page. But if the tail page is not on the + * reader page and the commit_page is, that would mean that + * there's a commit_overrun (an interrupt preempted an + * addition of an event and then filled the buffer + * with new events). In this case it's not an + * error, but it should still be reported. + * + * TODO: Add missed events to the page for user space to know. + */ + pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n", + cpu, missed_events, cpu_buffer->reader_page->page->time_stamp); } - } else { - /* - * There really shouldn't be any missed events if the commit - * is on the reader page. - */ - WARN_ON_ONCE(missed_events); } cpu_buffer->lost_events = 0; @@ -7278,7 +7732,8 @@ consume: out: /* Some archs do not have data cache coherency between kernel and user-space */ - flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); + flush_kernel_vmap_range(cpu_buffer->reader_page->page, + buffer->subbuf_size + BUF_PAGE_HDR_SIZE); rb_update_meta_page(cpu_buffer); @@ -7288,6 +7743,12 @@ out: return 0; } +static void rb_cpu_sync(void *data) +{ + /* Not really needed, but documents what is happening */ + smp_rmb(); +} + /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in @@ -7326,7 +7787,18 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) cpu); return -ENOMEM; } - smp_wmb(); + + /* + * Ensure trace_buffer readers observe the newly allocated + * ring_buffer_per_cpu before they check the cpumask. Instead of using a + * read barrier for all readers, send an IPI. + */ + if (unlikely(system_state == SYSTEM_RUNNING)) { + on_each_cpu(rb_cpu_sync, NULL, 1); + /* Not really needed, but documents what is happening */ + smp_wmb(); + } + cpumask_set_cpu(cpu, buffer->cpumask); return 0; } @@ -7411,9 +7883,9 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) /* Ignore dropped events before test starts. */ if (started) { if (nested) - data->bytes_dropped += len; - else data->bytes_dropped_nested += len; + else + data->bytes_dropped += len; } return len; } @@ -7535,7 +8007,7 @@ static __init int test_ringbuffer(void) /* * Show buffer is enabled before setting rb_test_started. * Yes there's a small race window where events could be - * dropped and the thread wont catch it. But when a ring + * dropped and the thread won't catch it. But when a ring * buffer gets enabled, there will always be some kind of * delay before other CPUs see it. Thus, we don't care about * those dropped events. We care about events dropped after @@ -7545,7 +8017,7 @@ static __init int test_ringbuffer(void) rb_test_started = true; set_current_state(TASK_INTERRUPTIBLE); - /* Just run for 10 seconds */; + /* Just run for 10 seconds */ schedule_timeout(10 * HZ); kthread_stop(rb_hammer); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index cdc3aea12c93..593e3b59e42e 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -433,7 +433,7 @@ static int __init ring_buffer_benchmark_init(void) { int ret; - /* make a one meg buffer in overwite mode */ + /* make a one meg buffer in overwrite mode */ buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); if (!buffer) return -ENOMEM; diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 831779607e84..3884b14df375 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -1,19 +1,44 @@ # SPDX-License-Identifier: GPL-2.0-only # -config DA_MON_EVENTS +config RV_MON_EVENTS + bool + +config RV_MON_MAINTENANCE_EVENTS bool config DA_MON_EVENTS_IMPLICIT - select DA_MON_EVENTS + select RV_MON_EVENTS + select RV_MON_MAINTENANCE_EVENTS bool config DA_MON_EVENTS_ID - select DA_MON_EVENTS + select RV_MON_EVENTS + select RV_MON_MAINTENANCE_EVENTS + bool + +config LTL_MON_EVENTS_ID + select RV_MON_EVENTS + bool + +config RV_LTL_MONITOR + bool + +config RV_HA_MONITOR + bool + +config HA_MON_EVENTS_IMPLICIT + select DA_MON_EVENTS_IMPLICIT + select RV_HA_MONITOR + bool + +config HA_MON_EVENTS_ID + select DA_MON_EVENTS_ID + select RV_HA_MONITOR bool menuconfig RV bool "Runtime Verification" - depends on TRACING + select TRACING help Enable the kernel runtime verification infrastructure. RV is a lightweight (yet rigorous) method that complements classical @@ -25,30 +50,40 @@ menuconfig RV For further information, see: Documentation/trace/rv/runtime-verification.rst -config RV_MON_WIP +config RV_PER_TASK_MONITORS + int "Maximum number of per-task monitor" depends on RV - depends on PREEMPT_TRACER - select DA_MON_EVENTS_IMPLICIT - bool "wip monitor" + range 1 8 + default 2 help - Enable wip (wakeup in preemptive) sample monitor that illustrates - the usage of per-cpu monitors, and one limitation of the - preempt_disable/enable events. + This option configures the maximum number of per-task RV monitors that can run + simultaneously. - For further information, see: - Documentation/trace/rv/monitor_wip.rst +source "kernel/trace/rv/monitors/wip/Kconfig" +source "kernel/trace/rv/monitors/wwnr/Kconfig" -config RV_MON_WWNR - depends on RV - select DA_MON_EVENTS_ID - bool "wwnr monitor" - help - Enable wwnr (wakeup while not running) sample monitor, this is a - sample monitor that illustrates the usage of per-task monitor. - The model is borken on purpose: it serves to test reactors. +source "kernel/trace/rv/monitors/sched/Kconfig" +source "kernel/trace/rv/monitors/sco/Kconfig" +source "kernel/trace/rv/monitors/snroc/Kconfig" +source "kernel/trace/rv/monitors/scpd/Kconfig" +source "kernel/trace/rv/monitors/snep/Kconfig" +source "kernel/trace/rv/monitors/sts/Kconfig" +source "kernel/trace/rv/monitors/nrp/Kconfig" +source "kernel/trace/rv/monitors/sssw/Kconfig" +source "kernel/trace/rv/monitors/opid/Kconfig" +# Add new sched monitors here - For further information, see: - Documentation/trace/rv/monitor_wwnr.rst +source "kernel/trace/rv/monitors/rtapp/Kconfig" +source "kernel/trace/rv/monitors/pagefault/Kconfig" +source "kernel/trace/rv/monitors/sleep/Kconfig" +# Add new rtapp monitors here + +source "kernel/trace/rv/monitors/stall/Kconfig" +source "kernel/trace/rv/monitors/deadline/Kconfig" +source "kernel/trace/rv/monitors/nomiss/Kconfig" +# Add new deadline monitors here + +# Add new monitors here config RV_REACTORS bool "Runtime verification reactors" diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 963d14875b45..94498da35b37 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -1,8 +1,26 @@ # SPDX-License-Identifier: GPL-2.0 +ccflags-y += -I $(src) # needed for trace events + obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o +obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o +obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o +obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o +obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o +obj-$(CONFIG_RV_MON_RTAPP) += monitors/rtapp/rtapp.o +obj-$(CONFIG_RV_MON_PAGEFAULT) += monitors/pagefault/pagefault.o +obj-$(CONFIG_RV_MON_SLEEP) += monitors/sleep/sleep.o +obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o +obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o +obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o +obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o +obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o +obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o +obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o +# Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o diff --git a/kernel/trace/rv/monitors/deadline/Kconfig b/kernel/trace/rv/monitors/deadline/Kconfig new file mode 100644 index 000000000000..38804a6ad91d --- /dev/null +++ b/kernel/trace/rv/monitors/deadline/Kconfig @@ -0,0 +1,10 @@ +config RV_MON_DEADLINE + depends on RV + bool "deadline monitor" + help + Collection of monitors to check the deadline scheduler and server + behave according to specifications. Enable this to enable all + scheduler specification supported by the current kernel. + + For further information, see: + Documentation/trace/rv/monitor_deadline.rst diff --git a/kernel/trace/rv/monitors/deadline/deadline.c b/kernel/trace/rv/monitors/deadline/deadline.c new file mode 100644 index 000000000000..d566d4542ebf --- /dev/null +++ b/kernel/trace/rv/monitors/deadline/deadline.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <linux/kallsyms.h> + +#define MODULE_NAME "deadline" + +#include "deadline.h" + +struct rv_monitor rv_deadline = { + .name = "deadline", + .description = "container for several deadline scheduler specifications.", + .enable = NULL, + .disable = NULL, + .reset = NULL, + .enabled = 0, +}; + +/* Used by other monitors */ +struct sched_class *rv_ext_sched_class; + +static int __init register_deadline(void) +{ + if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT)) { + rv_ext_sched_class = (void *)kallsyms_lookup_name("ext_sched_class"); + if (!rv_ext_sched_class) + pr_warn("rv: Missing ext_sched_class, monitors may not work.\n"); + } + return rv_register_monitor(&rv_deadline, NULL); +} + +static void __exit unregister_deadline(void) +{ + rv_unregister_monitor(&rv_deadline); +} + +module_init(register_deadline); +module_exit(unregister_deadline); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("deadline: container for several deadline scheduler specifications."); diff --git a/kernel/trace/rv/monitors/deadline/deadline.h b/kernel/trace/rv/monitors/deadline/deadline.h new file mode 100644 index 000000000000..78fca873d61e --- /dev/null +++ b/kernel/trace/rv/monitors/deadline/deadline.h @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/sched/deadline.h> +#include <asm/syscall.h> +#include <uapi/linux/sched/types.h> +#include <trace/events/sched.h> + +/* + * Dummy values if not available + */ +#ifndef __NR_sched_setscheduler +#define __NR_sched_setscheduler -__COUNTER__ +#endif +#ifndef __NR_sched_setattr +#define __NR_sched_setattr -__COUNTER__ +#endif + +extern struct rv_monitor rv_deadline; +/* Initialised when registering the deadline container */ +extern struct sched_class *rv_ext_sched_class; + +/* + * If both have dummy values, the syscalls are not supported and we don't even + * need to register the handler. + */ +static inline bool should_skip_syscall_handle(void) +{ + return __NR_sched_setattr < 0 && __NR_sched_setscheduler < 0; +} + +/* + * is_supported_type - return true if @type is supported by the deadline monitors + */ +static inline bool is_supported_type(u8 type) +{ + return type == DL_TASK || type == DL_SERVER_FAIR || type == DL_SERVER_EXT; +} + +/* + * is_server_type - return true if @type is a supported server + */ +static inline bool is_server_type(u8 type) +{ + return is_supported_type(type) && type != DL_TASK; +} + +/* + * Use negative numbers for the server. + * Currently only one fair server per CPU, may change in the future. + */ +#define fair_server_id(cpu) (-cpu) +#define ext_server_id(cpu) (-cpu - num_possible_cpus()) +#define NO_SERVER_ID (-2 * num_possible_cpus()) +/* + * Get a unique id used for dl entities + * + * The cpu is not required for tasks as the pid is used there, if this function + * is called on a dl_se that for sure corresponds to a task, DL_TASK can be + * used in place of cpu. + * We need the cpu for servers as it is provided in the tracepoint and we + * cannot easily retrieve it from the dl_se (requires the struct rq definition). + */ +static inline int get_entity_id(struct sched_dl_entity *dl_se, int cpu, u8 type) +{ + if (dl_server(dl_se) && type != DL_TASK) { + if (type == DL_SERVER_FAIR) + return fair_server_id(cpu); + if (type == DL_SERVER_EXT) + return ext_server_id(cpu); + return NO_SERVER_ID; + } + return dl_task_of(dl_se)->pid; +} + +static inline bool task_is_scx_enabled(struct task_struct *tsk) +{ + return IS_ENABLED(CONFIG_SCHED_CLASS_EXT) && + tsk->sched_class == rv_ext_sched_class; +} + +/* Expand id and target as arguments for da functions */ +#define EXPAND_ID(dl_se, cpu, type) get_entity_id(dl_se, cpu, type), dl_se +#define EXPAND_ID_TASK(tsk) get_entity_id(&tsk->dl, task_cpu(tsk), DL_TASK), &tsk->dl + +static inline u8 get_server_type(struct task_struct *tsk) +{ + if (tsk->policy == SCHED_NORMAL || tsk->policy == SCHED_EXT || + tsk->policy == SCHED_BATCH || tsk->policy == SCHED_IDLE) + return task_is_scx_enabled(tsk) ? DL_SERVER_EXT : DL_SERVER_FAIR; + return DL_OTHER; +} + +static inline int extract_params(struct pt_regs *regs, long id, pid_t *pid_out) +{ + size_t size = offsetofend(struct sched_attr, sched_flags); + struct sched_attr __user *uattr; + struct sched_attr attr; + int new_policy = -1, ret; + unsigned long args[6]; + + switch (id) { + case __NR_sched_setscheduler: + syscall_get_arguments(current, regs, args); + *pid_out = args[0]; + new_policy = args[1]; + break; + case __NR_sched_setattr: + syscall_get_arguments(current, regs, args); + *pid_out = args[0]; + uattr = (struct sched_attr __user *)args[1]; + /* + * Just copy up to sched_flags, we are not interested after that + */ + ret = copy_struct_from_user(&attr, size, uattr, size); + if (ret) + return ret; + if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) + return -EINVAL; + new_policy = attr.sched_policy; + break; + default: + return -EINVAL; + } + + return new_policy & ~SCHED_RESET_ON_FORK; +} + +/* Helper functions requiring DA/HA utilities */ +#ifdef RV_MON_TYPE + +/* + * get_fair_server - get the fair server associated to a task + * + * If the task is a boosted task, the server is available in the task_struct, + * otherwise grab the dl entity saved for the CPU where the task is enqueued. + * This function assumes the task is enqueued somewhere. + */ +static inline struct sched_dl_entity *get_server(struct task_struct *tsk, u8 type) +{ + if (tsk->dl_server && get_server_type(tsk) == type) + return tsk->dl_server; + if (type == DL_SERVER_FAIR) + return da_get_target_by_id(fair_server_id(task_cpu(tsk))); + if (type == DL_SERVER_EXT) + return da_get_target_by_id(ext_server_id(task_cpu(tsk))); + return NULL; +} + +/* + * Initialise monitors for all tasks and pre-allocate the storage for servers. + * This is necessary since we don't have access to the servers here and + * allocation can cause deadlocks from their tracepoints. We can only fill + * pre-initialised storage from there. + */ +static inline int init_storage(bool skip_tasks) +{ + struct task_struct *g, *p; + int cpu; + + for_each_possible_cpu(cpu) { + if (!da_create_empty_storage(fair_server_id(cpu))) + goto fail; + if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT) && + !da_create_empty_storage(ext_server_id(cpu))) + goto fail; + } + + if (skip_tasks) + return 0; + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p->policy == SCHED_DEADLINE) { + if (!da_create_storage(EXPAND_ID_TASK(p), NULL)) { + read_unlock(&tasklist_lock); + goto fail; + } + } + } + read_unlock(&tasklist_lock); + return 0; + +fail: + da_monitor_destroy(); + return -ENOMEM; +} + +static void __maybe_unused handle_newtask(void *data, struct task_struct *task, u64 flags) +{ + /* Might be superfluous as tasks are not started with this policy.. */ + if (task->policy == SCHED_DEADLINE) + da_create_storage(EXPAND_ID_TASK(task), NULL); +} + +static void __maybe_unused handle_exit(void *data, struct task_struct *p, bool group_dead) +{ + if (p->policy == SCHED_DEADLINE) + da_destroy_storage(get_entity_id(&p->dl, DL_TASK, DL_TASK)); +} + +#endif diff --git a/kernel/trace/rv/monitors/nomiss/Kconfig b/kernel/trace/rv/monitors/nomiss/Kconfig new file mode 100644 index 000000000000..e1886c3a0dd9 --- /dev/null +++ b/kernel/trace/rv/monitors/nomiss/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_NOMISS + depends on RV + depends on HAVE_SYSCALL_TRACEPOINTS + depends on RV_MON_DEADLINE + default y + select HA_MON_EVENTS_ID + bool "nomiss monitor" + help + Monitor to ensure dl entities run to completion before their deadiline. + This monitor is part of the deadline monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_deadline.rst diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.c b/kernel/trace/rv/monitors/nomiss/nomiss.c new file mode 100644 index 000000000000..8ead8783c29f --- /dev/null +++ b/kernel/trace/rv/monitors/nomiss/nomiss.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "nomiss" + +#include <uapi/linux/sched/types.h> +#include <trace/events/syscalls.h> +#include <trace/events/sched.h> +#include <trace/events/task.h> +#include <rv_trace.h> + +#define RV_MON_TYPE RV_MON_PER_OBJ +#define HA_TIMER_TYPE HA_TIMER_WHEEL +/* The start condition is on sched_switch, it's dangerous to allocate there */ +#define DA_SKIP_AUTO_ALLOC +typedef struct sched_dl_entity *monitor_target; +#include "nomiss.h" +#include <rv/ha_monitor.h> +#include <monitors/deadline/deadline.h> + +/* + * User configurable deadline threshold. If the total utilisation of deadline + * tasks is larger than 1, they are only guaranteed bounded tardiness. See + * Documentation/scheduler/sched-deadline.rst for more details. + * The minimum tardiness without sched_feat(HRTICK_DL) is 1 tick to accommodate + * for throttle enforced on the next tick. + */ +static u64 deadline_thresh = TICK_NSEC; +module_param(deadline_thresh, ullong, 0644); +#define DEADLINE_NS(ha_mon) (ha_get_target(ha_mon)->dl_deadline + deadline_thresh) + +static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_nomiss env, u64 time_ns) +{ + if (env == clk_nomiss) + return ha_get_clk_ns(ha_mon, env, time_ns); + else if (env == is_constr_dl_nomiss) + return !dl_is_implicit(ha_get_target(ha_mon)); + else if (env == is_defer_nomiss) + return ha_get_target(ha_mon)->dl_defer; + return ENV_INVALID_VALUE; +} + +static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_nomiss env, u64 time_ns) +{ + if (env == clk_nomiss) + ha_reset_clk_ns(ha_mon, env, time_ns); +} + +static inline bool ha_verify_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (curr_state == ready_nomiss) + return ha_check_invariant_ns(ha_mon, clk_nomiss, time_ns); + else if (curr_state == running_nomiss) + return ha_check_invariant_ns(ha_mon, clk_nomiss, time_ns); + return true; +} + +static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (curr_state == next_state) + return; + if (curr_state == ready_nomiss) + ha_inv_to_guard(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns); + else if (curr_state == running_nomiss) + ha_inv_to_guard(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns); +} + +static inline bool ha_verify_guards(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + bool res = true; + + if (curr_state == ready_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + else if (curr_state == ready_nomiss && event == dl_throttle_nomiss) + res = ha_get_env(ha_mon, is_defer_nomiss, time_ns) == 1ull; + else if (curr_state == idle_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + else if (curr_state == running_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + else if (curr_state == sleeping_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + else if (curr_state == sleeping_nomiss && event == dl_throttle_nomiss) + res = ha_get_env(ha_mon, is_constr_dl_nomiss, time_ns) == 1ull || + ha_get_env(ha_mon, is_defer_nomiss, time_ns) == 1ull; + else if (curr_state == throttled_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + return res; +} + +static inline void ha_setup_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (next_state == curr_state && event != dl_replenish_nomiss) + return; + if (next_state == ready_nomiss) + ha_start_timer_ns(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns); + else if (next_state == running_nomiss) + ha_start_timer_ns(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns); + else if (curr_state == ready_nomiss) + ha_cancel_timer(ha_mon); + else if (curr_state == running_nomiss) + ha_cancel_timer(ha_mon); +} + +static bool ha_verify_constraint(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + ha_convert_inv_guard(ha_mon, curr_state, event, next_state, time_ns); + + if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns); + + return true; +} + +static void handle_dl_replenish(void *data, struct sched_dl_entity *dl_se, + int cpu, u8 type) +{ + if (is_supported_type(type)) + da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_replenish_nomiss); +} + +static void handle_dl_throttle(void *data, struct sched_dl_entity *dl_se, + int cpu, u8 type) +{ + if (is_supported_type(type)) + da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_throttle_nomiss); +} + +static void handle_dl_server_stop(void *data, struct sched_dl_entity *dl_se, + int cpu, u8 type) +{ + /* + * This isn't the standard use of da_handle_start_run_event since this + * event cannot only occur from the initial state. + * It is fine to use here because it always brings to a known state and + * the fact we "pretend" the transition starts from the initial state + * has no side effect. + */ + if (is_supported_type(type)) + da_handle_start_run_event(EXPAND_ID(dl_se, cpu, type), dl_server_stop_nomiss); +} + +static inline void handle_server_switch(struct task_struct *next, int cpu, u8 type) +{ + struct sched_dl_entity *dl_se = get_server(next, type); + + if (dl_se && is_idle_task(next)) + da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_server_idle_nomiss); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + int cpu = task_cpu(next); + + if (prev_state != TASK_RUNNING && !preempt && prev->policy == SCHED_DEADLINE) + da_handle_event(EXPAND_ID_TASK(prev), sched_switch_suspend_nomiss); + if (next->policy == SCHED_DEADLINE) + da_handle_start_run_event(EXPAND_ID_TASK(next), sched_switch_in_nomiss); + + /* + * The server is available in next only if the next task is boosted, + * otherwise we need to retrieve it. + * Here the server continues in the state running/armed until actually + * stopped, this works since we continue expecting a throttle. + */ + if (next->dl_server) + da_handle_start_event(EXPAND_ID(next->dl_server, cpu, + get_server_type(next)), + sched_switch_in_nomiss); + else { + handle_server_switch(next, cpu, DL_SERVER_FAIR); + if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT)) + handle_server_switch(next, cpu, DL_SERVER_EXT); + } +} + +static void handle_sys_enter(void *data, struct pt_regs *regs, long id) +{ + struct task_struct *p; + int new_policy = -1; + pid_t pid = 0; + + new_policy = extract_params(regs, id, &pid); + if (new_policy < 0) + return; + guard(rcu)(); + p = pid ? find_task_by_vpid(pid) : current; + if (unlikely(!p) || new_policy == p->policy) + return; + + if (p->policy == SCHED_DEADLINE) + da_reset(EXPAND_ID_TASK(p)); + else if (new_policy == SCHED_DEADLINE) + da_create_or_get(EXPAND_ID_TASK(p)); +} + +static void handle_sched_wakeup(void *data, struct task_struct *tsk) +{ + if (tsk->policy == SCHED_DEADLINE) + da_handle_event(EXPAND_ID_TASK(tsk), sched_wakeup_nomiss); +} + +static int enable_nomiss(void) +{ + int retval; + + retval = ha_monitor_init(); + if (retval) + return retval; + + retval = init_storage(false); + if (retval) + return retval; + rv_attach_trace_probe("nomiss", sched_dl_replenish_tp, handle_dl_replenish); + rv_attach_trace_probe("nomiss", sched_dl_throttle_tp, handle_dl_throttle); + rv_attach_trace_probe("nomiss", sched_dl_server_stop_tp, handle_dl_server_stop); + rv_attach_trace_probe("nomiss", sched_switch, handle_sched_switch); + rv_attach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup); + if (!should_skip_syscall_handle()) + rv_attach_trace_probe("nomiss", sys_enter, handle_sys_enter); + rv_attach_trace_probe("nomiss", task_newtask, handle_newtask); + rv_attach_trace_probe("nomiss", sched_process_exit, handle_exit); + + return 0; +} + +static void disable_nomiss(void) +{ + rv_this.enabled = 0; + + /* Those are RCU writers, detach earlier hoping to close a bit faster */ + rv_detach_trace_probe("nomiss", task_newtask, handle_newtask); + rv_detach_trace_probe("nomiss", sched_process_exit, handle_exit); + if (!should_skip_syscall_handle()) + rv_detach_trace_probe("nomiss", sys_enter, handle_sys_enter); + + rv_detach_trace_probe("nomiss", sched_dl_replenish_tp, handle_dl_replenish); + rv_detach_trace_probe("nomiss", sched_dl_throttle_tp, handle_dl_throttle); + rv_detach_trace_probe("nomiss", sched_dl_server_stop_tp, handle_dl_server_stop); + rv_detach_trace_probe("nomiss", sched_switch, handle_sched_switch); + rv_detach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup); + + ha_monitor_destroy(); +} + +static struct rv_monitor rv_this = { + .name = "nomiss", + .description = "dl entities run to completion before their deadline.", + .enable = enable_nomiss, + .disable = disable_nomiss, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_nomiss(void) +{ + return rv_register_monitor(&rv_this, &rv_deadline); +} + +static void __exit unregister_nomiss(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_nomiss); +module_exit(unregister_nomiss); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("nomiss: dl entities run to completion before their deadline."); diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.h b/kernel/trace/rv/monitors/nomiss/nomiss.h new file mode 100644 index 000000000000..3d1b436194d7 --- /dev/null +++ b/kernel/trace/rv/monitors/nomiss/nomiss.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of nomiss automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME nomiss + +enum states_nomiss { + ready_nomiss, + idle_nomiss, + running_nomiss, + sleeping_nomiss, + throttled_nomiss, + state_max_nomiss, +}; + +#define INVALID_STATE state_max_nomiss + +enum events_nomiss { + dl_replenish_nomiss, + dl_server_idle_nomiss, + dl_server_stop_nomiss, + dl_throttle_nomiss, + sched_switch_in_nomiss, + sched_switch_suspend_nomiss, + sched_wakeup_nomiss, + event_max_nomiss, +}; + +enum envs_nomiss { + clk_nomiss, + is_constr_dl_nomiss, + is_defer_nomiss, + env_max_nomiss, + env_max_stored_nomiss = is_constr_dl_nomiss, +}; + +_Static_assert(env_max_stored_nomiss <= MAX_HA_ENV_LEN, "Not enough slots"); +#define HA_CLK_NS + +struct automaton_nomiss { + char *state_names[state_max_nomiss]; + char *event_names[event_max_nomiss]; + char *env_names[env_max_nomiss]; + unsigned char function[state_max_nomiss][event_max_nomiss]; + unsigned char initial_state; + bool final_states[state_max_nomiss]; +}; + +static const struct automaton_nomiss automaton_nomiss = { + .state_names = { + "ready", + "idle", + "running", + "sleeping", + "throttled", + }, + .event_names = { + "dl_replenish", + "dl_server_idle", + "dl_server_stop", + "dl_throttle", + "sched_switch_in", + "sched_switch_suspend", + "sched_wakeup", + }, + .env_names = { + "clk", + "is_constr_dl", + "is_defer", + }, + .function = { + { + ready_nomiss, + idle_nomiss, + sleeping_nomiss, + throttled_nomiss, + running_nomiss, + INVALID_STATE, + ready_nomiss, + }, + { + ready_nomiss, + idle_nomiss, + sleeping_nomiss, + throttled_nomiss, + running_nomiss, + INVALID_STATE, + INVALID_STATE, + }, + { + running_nomiss, + idle_nomiss, + sleeping_nomiss, + throttled_nomiss, + running_nomiss, + sleeping_nomiss, + running_nomiss, + }, + { + ready_nomiss, + sleeping_nomiss, + sleeping_nomiss, + throttled_nomiss, + running_nomiss, + INVALID_STATE, + ready_nomiss, + }, + { + ready_nomiss, + throttled_nomiss, + INVALID_STATE, + throttled_nomiss, + INVALID_STATE, + throttled_nomiss, + throttled_nomiss, + }, + }, + .initial_state = ready_nomiss, + .final_states = { 1, 0, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/nomiss/nomiss_trace.h b/kernel/trace/rv/monitors/nomiss/nomiss_trace.h new file mode 100644 index 000000000000..42e7efaca4e7 --- /dev/null +++ b/kernel/trace/rv/monitors/nomiss/nomiss_trace.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_NOMISS +DEFINE_EVENT(event_da_monitor_id, event_nomiss, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_nomiss, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); + +DEFINE_EVENT(error_env_da_monitor_id, error_env_nomiss, + TP_PROTO(int id, char *state, char *event, char *env), + TP_ARGS(id, state, event, env)); +#endif /* CONFIG_RV_MON_NOMISS */ diff --git a/kernel/trace/rv/monitors/nrp/Kconfig b/kernel/trace/rv/monitors/nrp/Kconfig new file mode 100644 index 000000000000..f5ec08f65535 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/Kconfig @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_NRP + depends on RV + depends on RV_MON_SCHED + default y if !ARM64 + select DA_MON_EVENTS_ID + bool "nrp monitor" + help + Monitor to ensure preemption requires need resched. + This monitor is part of the sched monitors collection. + + This monitor is unstable on arm64, say N unless you are testing it. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/nrp/nrp.c b/kernel/trace/rv/monitors/nrp/nrp.c new file mode 100644 index 000000000000..4b5646a70094 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "nrp" + +#include <trace/events/irq.h> +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#define RV_MON_TYPE RV_MON_PER_TASK +#include "nrp.h" +#include <rv/da_monitor.h> + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/trace/irq_vectors.h> + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event(current, irq_entry_nrp); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event(current, irq_entry_nrp); +} + +static void handle_sched_need_resched(void *data, struct task_struct *tsk, + int cpu, int tif) +{ + /* + * Although need_resched leads to both the rescheduling and preempt_irq + * states, it is safer to start the monitor always in preempt_irq, + * which may not mirror the system state but makes the monitor simpler, + */ + if (tif == TIF_NEED_RESCHED) + da_handle_start_event(tsk, sched_need_resched_nrp); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + if (preempt) + da_handle_event(current, schedule_entry_preempt_nrp); + else + da_handle_event(current, schedule_entry_nrp); +} + +static int enable_nrp(void) +{ + int retval; + + retval = da_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("nrp", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched); + rv_attach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry); + attach_vector_irq(); + + return 0; +} + +static void disable_nrp(void) +{ + rv_this.enabled = 0; + + rv_detach_trace_probe("nrp", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched); + rv_detach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry); + detach_vector_irq(); + + da_monitor_destroy(); +} + +static struct rv_monitor rv_this = { + .name = "nrp", + .description = "need resched preempts.", + .enable = enable_nrp, + .disable = disable_nrp, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_nrp(void) +{ + return rv_register_monitor(&rv_this, &rv_sched); +} + +static void __exit unregister_nrp(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_nrp); +module_exit(unregister_nrp); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("nrp: need resched preempts."); diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h new file mode 100644 index 000000000000..3270d4c0139f --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of nrp automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME nrp + +enum states_nrp { + preempt_irq_nrp, + any_thread_running_nrp, + nested_preempt_nrp, + rescheduling_nrp, + state_max_nrp, +}; + +#define INVALID_STATE state_max_nrp + +enum events_nrp { + irq_entry_nrp, + sched_need_resched_nrp, + schedule_entry_nrp, + schedule_entry_preempt_nrp, + event_max_nrp, +}; + +struct automaton_nrp { + char *state_names[state_max_nrp]; + char *event_names[event_max_nrp]; + unsigned char function[state_max_nrp][event_max_nrp]; + unsigned char initial_state; + bool final_states[state_max_nrp]; +}; + +static const struct automaton_nrp automaton_nrp = { + .state_names = { + "preempt_irq", + "any_thread_running", + "nested_preempt", + "rescheduling", + }, + .event_names = { + "irq_entry", + "sched_need_resched", + "schedule_entry", + "schedule_entry_preempt", + }, + .function = { + { + preempt_irq_nrp, + preempt_irq_nrp, + nested_preempt_nrp, + nested_preempt_nrp, + }, + { + any_thread_running_nrp, + rescheduling_nrp, + any_thread_running_nrp, + INVALID_STATE, + }, + { + nested_preempt_nrp, + preempt_irq_nrp, + any_thread_running_nrp, + any_thread_running_nrp, + }, + { + preempt_irq_nrp, + rescheduling_nrp, + any_thread_running_nrp, + any_thread_running_nrp, + }, + }, + .initial_state = preempt_irq_nrp, + .final_states = { 0, 1, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/nrp/nrp_trace.h b/kernel/trace/rv/monitors/nrp/nrp_trace.h new file mode 100644 index 000000000000..2e13497de3b6 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_NRP +DEFINE_EVENT(event_da_monitor_id, event_nrp, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_nrp, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_NRP */ diff --git a/kernel/trace/rv/monitors/opid/Kconfig b/kernel/trace/rv/monitors/opid/Kconfig new file mode 100644 index 000000000000..6d02e239b684 --- /dev/null +++ b/kernel/trace/rv/monitors/opid/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_OPID + depends on RV + depends on RV_MON_SCHED + default y + select HA_MON_EVENTS_IMPLICIT + bool "opid monitor" + help + Monitor to ensure operations like wakeup and need resched occur with + interrupts and preemption disabled. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c new file mode 100644 index 000000000000..3b6a85e815b8 --- /dev/null +++ b/kernel/trace/rv/monitors/opid/opid.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "opid" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#define RV_MON_TYPE RV_MON_PER_CPU +#include "opid.h" +#include <rv/ha_monitor.h> + +static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_opid env, u64 time_ns) +{ + if (env == irq_off_opid) + return irqs_disabled(); + else if (env == preempt_off_opid) { + if (IS_ENABLED(CONFIG_PREEMPTION)) + return (preempt_count() & PREEMPT_MASK) > 0; + return true; + } + return ENV_INVALID_VALUE; +} + +static inline bool ha_verify_guards(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + bool res = true; + + if (curr_state == any_opid && event == sched_need_resched_opid) + res = ha_get_env(ha_mon, irq_off_opid, time_ns) == 1ull; + else if (curr_state == any_opid && event == sched_waking_opid) + res = ha_get_env(ha_mon, irq_off_opid, time_ns) == 1ull && + ha_get_env(ha_mon, preempt_off_opid, time_ns) == 1ull; + return res; +} + +static bool ha_verify_constraint(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + return true; +} + +static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif) +{ + da_handle_start_run_event(sched_need_resched_opid); +} + +static void handle_sched_waking(void *data, struct task_struct *p) +{ + da_handle_start_run_event(sched_waking_opid); +} + +static int enable_opid(void) +{ + int retval; + + retval = ha_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); + rv_attach_trace_probe("opid", sched_waking, handle_sched_waking); + + return 0; +} + +static void disable_opid(void) +{ + rv_this.enabled = 0; + + rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); + rv_detach_trace_probe("opid", sched_waking, handle_sched_waking); + + ha_monitor_destroy(); +} + +/* + * This is the monitor register section. + */ +static struct rv_monitor rv_this = { + .name = "opid", + .description = "operations with preemption and irq disabled.", + .enable = enable_opid, + .disable = disable_opid, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_opid(void) +{ + return rv_register_monitor(&rv_this, &rv_sched); +} + +static void __exit unregister_opid(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_opid); +module_exit(unregister_opid); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("opid: operations with preemption and irq disabled."); diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h new file mode 100644 index 000000000000..fb0aa4c28aa6 --- /dev/null +++ b/kernel/trace/rv/monitors/opid/opid.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of opid automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME opid + +enum states_opid { + any_opid, + state_max_opid, +}; + +#define INVALID_STATE state_max_opid + +enum events_opid { + sched_need_resched_opid, + sched_waking_opid, + event_max_opid, +}; + +enum envs_opid { + irq_off_opid, + preempt_off_opid, + env_max_opid, + env_max_stored_opid = irq_off_opid, +}; + +_Static_assert(env_max_stored_opid <= MAX_HA_ENV_LEN, "Not enough slots"); + +struct automaton_opid { + char *state_names[state_max_opid]; + char *event_names[event_max_opid]; + char *env_names[env_max_opid]; + unsigned char function[state_max_opid][event_max_opid]; + unsigned char initial_state; + bool final_states[state_max_opid]; +}; + +static const struct automaton_opid automaton_opid = { + .state_names = { + "any", + }, + .event_names = { + "sched_need_resched", + "sched_waking", + }, + .env_names = { + "irq_off", + "preempt_off", + }, + .function = { + { any_opid, any_opid }, + }, + .initial_state = any_opid, + .final_states = { 1 }, +}; diff --git a/kernel/trace/rv/monitors/opid/opid_trace.h b/kernel/trace/rv/monitors/opid/opid_trace.h new file mode 100644 index 000000000000..b04005b64208 --- /dev/null +++ b/kernel/trace/rv/monitors/opid/opid_trace.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_OPID +DEFINE_EVENT(event_da_monitor, event_opid, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_opid, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); + +DEFINE_EVENT(error_env_da_monitor, error_env_opid, + TP_PROTO(char *state, char *event, char *env), + TP_ARGS(state, event, env)); +#endif /* CONFIG_RV_MON_OPID */ diff --git a/kernel/trace/rv/monitors/pagefault/Kconfig b/kernel/trace/rv/monitors/pagefault/Kconfig new file mode 100644 index 000000000000..0e013f00c33b --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/Kconfig @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_PAGEFAULT + depends on RV + select RV_LTL_MONITOR + depends on RV_MON_RTAPP + depends on X86 || RISCV + depends on MMU + default y + select LTL_MON_EVENTS_ID + bool "pagefault monitor" + help + Monitor that real-time tasks do not raise page faults, causing + undesirable latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. + + This monitor does not affect execution speed while it is not running, + therefore it is safe to enable this in production kernel. diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.c b/kernel/trace/rv/monitors/pagefault/pagefault.c new file mode 100644 index 000000000000..9fe6123b2200 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rv.h> +#include <linux/sched/deadline.h> +#include <linux/sched/rt.h> +#include <linux/tracepoint.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "pagefault" + +#include <rv_trace.h> +#include <trace/events/exceptions.h> +#include <monitors/rtapp/rtapp.h> + +#include "pagefault.h" +#include <rv/ltl_monitor.h> + +static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon) +{ + /* + * This includes "actual" real-time tasks and also PI-boosted + * tasks. A task being PI-boosted means it is blocking an "actual" + * real-task, therefore it should also obey the monitor's rule, + * otherwise the "actual" real-task may be delayed. + */ + ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task)); +} + +static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation) +{ + if (task_creation) + ltl_atom_set(mon, LTL_PAGEFAULT, false); +} + +static void handle_page_fault(void *data, unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + ltl_atom_pulse(current, LTL_PAGEFAULT, true); +} + +static int enable_pagefault(void) +{ + int retval; + + retval = ltl_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault); + rv_attach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault); + + return 0; +} + +static void disable_pagefault(void) +{ + rv_detach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault); + rv_detach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault); + + ltl_monitor_destroy(); +} + +static struct rv_monitor rv_pagefault = { + .name = "pagefault", + .description = "Monitor that RT tasks do not raise page faults", + .enable = enable_pagefault, + .disable = disable_pagefault, +}; + +static int __init register_pagefault(void) +{ + return rv_register_monitor(&rv_pagefault, &rv_rtapp); +} + +static void __exit unregister_pagefault(void) +{ + rv_unregister_monitor(&rv_pagefault); +} + +module_init(register_pagefault); +module_exit(unregister_pagefault); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>"); +MODULE_DESCRIPTION("pagefault: Monitor that RT tasks do not raise page faults"); diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.h b/kernel/trace/rv/monitors/pagefault/pagefault.h new file mode 100644 index 000000000000..c580ec194009 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * C implementation of Buchi automaton, automatically generated by + * tools/verification/rvgen from the linear temporal logic specification. + * For further information, see kernel documentation: + * Documentation/trace/rv/linear_temporal_logic.rst + */ + +#include <linux/rv.h> + +#define MONITOR_NAME pagefault + +enum ltl_atom { + LTL_PAGEFAULT, + LTL_RT, + LTL_NUM_ATOM +}; +static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM); + +static const char *ltl_atom_str(enum ltl_atom atom) +{ + static const char *const names[] = { + "pa", + "rt", + }; + + return names[atom]; +} + +enum ltl_buchi_state { + S0, + RV_NUM_BA_STATES +}; +static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES); + +static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) +{ + bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms); + bool val3 = !pagefault; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val4 = val1 || val3; + + if (val4) + __set_bit(S0, mon->states); +} + +static void +ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next) +{ + bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms); + bool val3 = !pagefault; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val4 = val1 || val3; + + switch (state) { + case S0: + if (val4) + __set_bit(S0, next); + break; + } +} diff --git a/kernel/trace/rv/monitors/pagefault/pagefault_trace.h b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h new file mode 100644 index 000000000000..fe1f82597b1a --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_PAGEFAULT +DEFINE_EVENT(event_ltl_monitor_id, event_pagefault, + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + TP_ARGS(task, states, atoms, next)); +DEFINE_EVENT(error_ltl_monitor_id, error_pagefault, + TP_PROTO(struct task_struct *task), + TP_ARGS(task)); +#endif /* CONFIG_RV_MON_PAGEFAULT */ diff --git a/kernel/trace/rv/monitors/rtapp/Kconfig b/kernel/trace/rv/monitors/rtapp/Kconfig new file mode 100644 index 000000000000..1ce9370a9ba8 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/Kconfig @@ -0,0 +1,11 @@ +config RV_MON_RTAPP + depends on RV + depends on RV_PER_TASK_MONITORS >= 2 + bool "rtapp monitor" + help + Collection of monitors to check for common problems with real-time + application that may cause unexpected latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c new file mode 100644 index 000000000000..17f271231c99 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/rtapp.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +#define MODULE_NAME "rtapp" + +#include "rtapp.h" + +struct rv_monitor rv_rtapp = { + .name = "rtapp", + .description = "Collection of monitors for detecting problems with real-time applications", +}; + +static int __init register_rtapp(void) +{ + return rv_register_monitor(&rv_rtapp, NULL); +} + +static void __exit unregister_rtapp(void) +{ + rv_unregister_monitor(&rv_rtapp); +} + +module_init(register_rtapp); +module_exit(unregister_rtapp); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>"); +MODULE_DESCRIPTION("Collection of monitors for detecting problems with real-time applications"); diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.h b/kernel/trace/rv/monitors/rtapp/rtapp.h new file mode 100644 index 000000000000..4c200d67c7f6 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/rtapp.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +extern struct rv_monitor rv_rtapp; diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig new file mode 100644 index 000000000000..aa16456da864 --- /dev/null +++ b/kernel/trace/rv/monitors/sched/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCHED + depends on RV + depends on RV_PER_TASK_MONITORS >= 3 + bool "sched monitor" + help + Collection of monitors to check the scheduler behaves according to specifications. + Enable this to enable all scheduler specification supported by the current kernel. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c new file mode 100644 index 000000000000..dd9d96fc6e21 --- /dev/null +++ b/kernel/trace/rv/monitors/sched/sched.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +#define MODULE_NAME "sched" + +#include "sched.h" + +struct rv_monitor rv_sched = { + .name = "sched", + .description = "container for several scheduler monitor specifications.", + .enable = NULL, + .disable = NULL, + .reset = NULL, + .enabled = 0, +}; + +static int __init register_sched(void) +{ + return rv_register_monitor(&rv_sched, NULL); +} + +static void __exit unregister_sched(void) +{ + rv_unregister_monitor(&rv_sched); +} + +module_init(register_sched); +module_exit(unregister_sched); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sched: container for several scheduler monitor specifications."); diff --git a/kernel/trace/rv/monitors/sched/sched.h b/kernel/trace/rv/monitors/sched/sched.h new file mode 100644 index 000000000000..ba148dd8d48b --- /dev/null +++ b/kernel/trace/rv/monitors/sched/sched.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +extern struct rv_monitor rv_sched; diff --git a/kernel/trace/rv/monitors/sco/Kconfig b/kernel/trace/rv/monitors/sco/Kconfig new file mode 100644 index 000000000000..097c96cccdd7 --- /dev/null +++ b/kernel/trace/rv/monitors/sco/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCO + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sco monitor" + help + Monitor to ensure sched_set_state happens only in thread context. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c new file mode 100644 index 000000000000..5a3bd5e16e62 --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "sco" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#define RV_MON_TYPE RV_MON_PER_CPU +#include "sco.h" +#include <rv/da_monitor.h> + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + da_handle_start_event(sched_set_state_sco); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + da_handle_event(schedule_entry_sco); +} + +static void handle_schedule_exit(void *data, bool is_switch) +{ + da_handle_start_event(schedule_exit_sco); +} + +static int enable_sco(void) +{ + int retval; + + retval = da_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("sco", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sco", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_sco(void) +{ + rv_this.enabled = 0; + + rv_detach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("sco", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sco", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy(); +} + +static struct rv_monitor rv_this = { + .name = "sco", + .description = "scheduling context operations.", + .enable = enable_sco, + .disable = disable_sco, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_sco(void) +{ + return rv_register_monitor(&rv_this, &rv_sched); +} + +static void __exit unregister_sco(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_sco); +module_exit(unregister_sco); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sco: scheduling context operations."); diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h new file mode 100644 index 000000000000..bac3beb51e72 --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sco automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME sco + +enum states_sco { + thread_context_sco, + scheduling_context_sco, + state_max_sco, +}; + +#define INVALID_STATE state_max_sco + +enum events_sco { + sched_set_state_sco, + schedule_entry_sco, + schedule_exit_sco, + event_max_sco, +}; + +struct automaton_sco { + char *state_names[state_max_sco]; + char *event_names[event_max_sco]; + unsigned char function[state_max_sco][event_max_sco]; + unsigned char initial_state; + bool final_states[state_max_sco]; +}; + +static const struct automaton_sco automaton_sco = { + .state_names = { + "thread_context", + "scheduling_context", + }, + .event_names = { + "sched_set_state", + "schedule_entry", + "schedule_exit", + }, + .function = { + { thread_context_sco, scheduling_context_sco, INVALID_STATE }, + { INVALID_STATE, INVALID_STATE, thread_context_sco }, + }, + .initial_state = thread_context_sco, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sco/sco_trace.h b/kernel/trace/rv/monitors/sco/sco_trace.h new file mode 100644 index 000000000000..b711cd9024ec --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SCO +DEFINE_EVENT(event_da_monitor, event_sco, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_sco, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SCO */ diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig new file mode 100644 index 000000000000..682d0416188b --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCPD + depends on RV + depends on TRACE_PREEMPT_TOGGLE + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "scpd monitor" + help + Monitor to ensure schedule is called with preemption disabled. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c new file mode 100644 index 000000000000..83b48627dc9f --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "scpd" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#define RV_MON_TYPE RV_MON_PER_CPU +#include "scpd.h" +#include <rv/da_monitor.h> + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event(preempt_disable_scpd); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event(preempt_enable_scpd); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + da_handle_event(schedule_entry_scpd); +} + +static void handle_schedule_exit(void *data, bool is_switch) +{ + da_handle_event(schedule_exit_scpd); +} + +static int enable_scpd(void) +{ + int retval; + + retval = da_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("scpd", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("scpd", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_scpd(void) +{ + rv_this.enabled = 0; + + rv_detach_trace_probe("scpd", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("scpd", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy(); +} + +static struct rv_monitor rv_this = { + .name = "scpd", + .description = "schedule called with preemption disabled.", + .enable = enable_scpd, + .disable = disable_scpd, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_scpd(void) +{ + return rv_register_monitor(&rv_this, &rv_sched); +} + +static void __exit unregister_scpd(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_scpd); +module_exit(unregister_scpd); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("scpd: schedule called with preemption disabled."); diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h new file mode 100644 index 000000000000..d6329da2671b --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of scpd automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME scpd + +enum states_scpd { + cant_sched_scpd, + can_sched_scpd, + state_max_scpd, +}; + +#define INVALID_STATE state_max_scpd + +enum events_scpd { + preempt_disable_scpd, + preempt_enable_scpd, + schedule_entry_scpd, + schedule_exit_scpd, + event_max_scpd, +}; + +struct automaton_scpd { + char *state_names[state_max_scpd]; + char *event_names[event_max_scpd]; + unsigned char function[state_max_scpd][event_max_scpd]; + unsigned char initial_state; + bool final_states[state_max_scpd]; +}; + +static const struct automaton_scpd automaton_scpd = { + .state_names = { + "cant_sched", + "can_sched", + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "schedule_entry", + "schedule_exit", + }, + .function = { + { can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE }, + { INVALID_STATE, cant_sched_scpd, can_sched_scpd, can_sched_scpd }, + }, + .initial_state = cant_sched_scpd, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/scpd/scpd_trace.h b/kernel/trace/rv/monitors/scpd/scpd_trace.h new file mode 100644 index 000000000000..6b0f4aa4732e --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SCPD +DEFINE_EVENT(event_da_monitor, event_scpd, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_scpd, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SCPD */ diff --git a/kernel/trace/rv/monitors/sleep/Kconfig b/kernel/trace/rv/monitors/sleep/Kconfig new file mode 100644 index 000000000000..6b7a122e7b47 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/Kconfig @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SLEEP + depends on RV + select RV_LTL_MONITOR + depends on HAVE_SYSCALL_TRACEPOINTS + depends on RV_MON_RTAPP + select TRACE_IRQFLAGS + default y + select LTL_MON_EVENTS_ID + bool "sleep monitor" + help + Monitor that real-time tasks do not sleep in a manner that may + cause undesirable latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. + + Enabling this monitor may have performance impact (due to select + TRACE_IRQFLAGS). Therefore, you probably should say N for + production kernel. diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c new file mode 100644 index 000000000000..8dfe5ec13e19 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/init.h> +#include <linux/irqflags.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rv.h> +#include <linux/sched/deadline.h> +#include <linux/sched/rt.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "sleep" + +#include <trace/events/syscalls.h> +#include <trace/events/sched.h> +#include <trace/events/lock.h> +#include <uapi/linux/futex.h> +#include <rv_trace.h> +#include <monitors/rtapp/rtapp.h> + +#include "sleep.h" +#include <rv/ltl_monitor.h> + +static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon) +{ + /* + * This includes "actual" real-time tasks and also PI-boosted + * tasks. A task being PI-boosted means it is blocking an "actual" + * real-task, therefore it should also obey the monitor's rule, + * otherwise the "actual" real-task may be delayed. + */ + ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task)); +} + +static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation) +{ + ltl_atom_set(mon, LTL_SLEEP, false); + ltl_atom_set(mon, LTL_WAKE, false); + ltl_atom_set(mon, LTL_ABORT_SLEEP, false); + ltl_atom_set(mon, LTL_WOKEN_BY_HARDIRQ, false); + ltl_atom_set(mon, LTL_WOKEN_BY_NMI, false); + ltl_atom_set(mon, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, false); + + if (task_creation) { + ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_EPOLL_WAIT, false); + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false); + } + + if (task->flags & PF_KTHREAD) { + ltl_atom_set(mon, LTL_KERNEL_THREAD, true); + + /* kernel tasks do not do syscall */ + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); + ltl_atom_set(mon, LTL_EPOLL_WAIT, false); + + if (strstarts(task->comm, "migration/")) + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true); + else + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false); + + if (strstarts(task->comm, "rcu")) + ltl_atom_set(mon, LTL_TASK_IS_RCU, true); + else + ltl_atom_set(mon, LTL_TASK_IS_RCU, false); + } else { + ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false); + ltl_atom_set(mon, LTL_KERNEL_THREAD, false); + ltl_atom_set(mon, LTL_TASK_IS_RCU, false); + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false); + } + +} + +static void handle_sched_set_state(void *data, struct task_struct *task, int state) +{ + if (state & TASK_INTERRUPTIBLE) + ltl_atom_pulse(task, LTL_SLEEP, true); + else if (state == TASK_RUNNING) + ltl_atom_pulse(task, LTL_ABORT_SLEEP, true); +} + +static void handle_sched_wakeup(void *data, struct task_struct *task) +{ + ltl_atom_pulse(task, LTL_WAKE, true); +} + +static void handle_sched_waking(void *data, struct task_struct *task) +{ + if (this_cpu_read(hardirq_context)) { + ltl_atom_pulse(task, LTL_WOKEN_BY_HARDIRQ, true); + } else if (in_task()) { + if (current->prio <= task->prio) + ltl_atom_pulse(task, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, true); + } else if (in_nmi()) { + ltl_atom_pulse(task, LTL_WOKEN_BY_NMI, true); + } +} + +static void handle_contention_begin(void *data, void *lock, unsigned int flags) +{ + if (flags & LCB_F_RT) + ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, true); +} + +static void handle_contention_end(void *data, void *lock, int ret) +{ + ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, false); +} + +static void handle_sys_enter(void *data, struct pt_regs *regs, long id) +{ + struct ltl_monitor *mon; + unsigned long args[6]; + int op, cmd; + + mon = ltl_get_monitor(current); + + switch (id) { +#ifdef __NR_clock_nanosleep + case __NR_clock_nanosleep: +#endif +#ifdef __NR_clock_nanosleep_time64 + case __NR_clock_nanosleep_time64: +#endif + syscall_get_arguments(current, regs, args); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, args[0] == CLOCK_MONOTONIC); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, args[0] == CLOCK_TAI); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, args[1] == TIMER_ABSTIME); + ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true); + break; + +#ifdef __NR_futex + case __NR_futex: +#endif +#ifdef __NR_futex_time64 + case __NR_futex_time64: +#endif + syscall_get_arguments(current, regs, args); + op = args[1]; + cmd = op & FUTEX_CMD_MASK; + + switch (cmd) { + case FUTEX_LOCK_PI: + case FUTEX_LOCK_PI2: + ltl_atom_update(current, LTL_FUTEX_LOCK_PI, true); + break; + case FUTEX_WAIT: + case FUTEX_WAIT_BITSET: + case FUTEX_WAIT_REQUEUE_PI: + ltl_atom_update(current, LTL_FUTEX_WAIT, true); + break; + } + break; +#ifdef __NR_epoll_wait + case __NR_epoll_wait: + ltl_atom_update(current, LTL_EPOLL_WAIT, true); + break; +#endif + } +} + +static void handle_sys_exit(void *data, struct pt_regs *regs, long ret) +{ + struct ltl_monitor *mon = ltl_get_monitor(current); + + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_EPOLL_WAIT, false); + ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false); +} + +static void handle_kthread_stop(void *data, struct task_struct *task) +{ + /* FIXME: this could race with other tracepoint handlers */ + ltl_atom_update(task, LTL_KTHREAD_SHOULD_STOP, true); +} + +static int enable_sleep(void) +{ + int retval; + + retval = ltl_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking); + rv_attach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup); + rv_attach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin); + rv_attach_trace_probe("rtapp_sleep", contention_end, handle_contention_end); + rv_attach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop); + rv_attach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter); + rv_attach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit); + return 0; +} + +static void disable_sleep(void) +{ + rv_detach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking); + rv_detach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup); + rv_detach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin); + rv_detach_trace_probe("rtapp_sleep", contention_end, handle_contention_end); + rv_detach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop); + rv_detach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter); + rv_detach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit); + + ltl_monitor_destroy(); +} + +static struct rv_monitor rv_sleep = { + .name = "sleep", + .description = "Monitor that RT tasks do not undesirably sleep", + .enable = enable_sleep, + .disable = disable_sleep, +}; + +static int __init register_sleep(void) +{ + return rv_register_monitor(&rv_sleep, &rv_rtapp); +} + +static void __exit unregister_sleep(void) +{ + rv_unregister_monitor(&rv_sleep); +} + +module_init(register_sleep); +module_exit(unregister_sleep); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>"); +MODULE_DESCRIPTION("sleep: Monitor that RT tasks do not undesirably sleep"); diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h new file mode 100644 index 000000000000..95dc2727c059 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep.h @@ -0,0 +1,263 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * C implementation of Buchi automaton, automatically generated by + * tools/verification/rvgen from the linear temporal logic specification. + * For further information, see kernel documentation: + * Documentation/trace/rv/linear_temporal_logic.rst + */ + +#include <linux/rv.h> + +#define MONITOR_NAME sleep + +enum ltl_atom { + LTL_ABORT_SLEEP, + LTL_BLOCK_ON_RT_MUTEX, + LTL_CLOCK_NANOSLEEP, + LTL_EPOLL_WAIT, + LTL_FUTEX_LOCK_PI, + LTL_FUTEX_WAIT, + LTL_KERNEL_THREAD, + LTL_KTHREAD_SHOULD_STOP, + LTL_NANOSLEEP_CLOCK_MONOTONIC, + LTL_NANOSLEEP_CLOCK_TAI, + LTL_NANOSLEEP_TIMER_ABSTIME, + LTL_RT, + LTL_SLEEP, + LTL_TASK_IS_MIGRATION, + LTL_TASK_IS_RCU, + LTL_WAKE, + LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + LTL_WOKEN_BY_HARDIRQ, + LTL_WOKEN_BY_NMI, + LTL_NUM_ATOM +}; +static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM); + +static const char *ltl_atom_str(enum ltl_atom atom) +{ + static const char *const names[] = { + "ab_sl", + "bl_on_rt_mu", + "cl_na", + "ep_wa", + "fu_lo_pi", + "fu_wa", + "ker_th", + "kth_sh_st", + "na_cl_mo", + "na_cl_ta", + "na_ti_ab", + "rt", + "sl", + "ta_mi", + "ta_rc", + "wak", + "wo_eq_hi_pr", + "wo_ha", + "wo_nm", + }; + + return names[atom]; +} + +enum ltl_buchi_state { + S0, + S1, + S2, + S3, + S4, + S5, + S6, + S7, + RV_NUM_BA_STATES +}; +static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES); + +static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) +{ + bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); + bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); + bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + mon->atoms); + bool wake = test_bit(LTL_WAKE, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool rt = test_bit(LTL_RT, mon->atoms); + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); + bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); + bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms); + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val42 = task_is_rcu || task_is_migration; + bool val43 = futex_lock_pi || val42; + bool val5 = block_on_rt_mutex || val43; + bool val34 = abort_sleep || kthread_should_stop; + bool val35 = woken_by_nmi || val34; + bool val36 = woken_by_hardirq || val35; + bool val14 = woken_by_equal_or_higher_prio || val36; + bool val13 = !wake; + bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai; + bool val27 = nanosleep_timer_abstime && val26; + bool val18 = clock_nanosleep && val27; + bool val20 = val18 || epoll_wait; + bool val9 = futex_wait || val20; + bool val11 = val9 || kernel_thread; + bool val2 = !sleep; + bool val1 = !rt; + bool val3 = val1 || val2; + + if (val3) + __set_bit(S0, mon->states); + if (val11 && val13) + __set_bit(S1, mon->states); + if (val11 && val14) + __set_bit(S4, mon->states); + if (val5) + __set_bit(S5, mon->states); +} + +static void +ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next) +{ + bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); + bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); + bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + mon->atoms); + bool wake = test_bit(LTL_WAKE, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool rt = test_bit(LTL_RT, mon->atoms); + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); + bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); + bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms); + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val42 = task_is_rcu || task_is_migration; + bool val43 = futex_lock_pi || val42; + bool val5 = block_on_rt_mutex || val43; + bool val34 = abort_sleep || kthread_should_stop; + bool val35 = woken_by_nmi || val34; + bool val36 = woken_by_hardirq || val35; + bool val14 = woken_by_equal_or_higher_prio || val36; + bool val13 = !wake; + bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai; + bool val27 = nanosleep_timer_abstime && val26; + bool val18 = clock_nanosleep && val27; + bool val20 = val18 || epoll_wait; + bool val9 = futex_wait || val20; + bool val11 = val9 || kernel_thread; + bool val2 = !sleep; + bool val1 = !rt; + bool val3 = val1 || val2; + + switch (state) { + case S0: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S1: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S2: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S3: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S4: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S5: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S6: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S7: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + } +} diff --git a/kernel/trace/rv/monitors/sleep/sleep_trace.h b/kernel/trace/rv/monitors/sleep/sleep_trace.h new file mode 100644 index 000000000000..22eaf31da987 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep_trace.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SLEEP +DEFINE_EVENT(event_ltl_monitor_id, event_sleep, + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + TP_ARGS(task, states, atoms, next)); +DEFINE_EVENT(error_ltl_monitor_id, error_sleep, + TP_PROTO(struct task_struct *task), + TP_ARGS(task)); +#endif /* CONFIG_RV_MON_SLEEP */ diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig new file mode 100644 index 000000000000..7dd54f434ff7 --- /dev/null +++ b/kernel/trace/rv/monitors/snep/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNEP + depends on RV + depends on TRACE_PREEMPT_TOGGLE + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "snep monitor" + help + Monitor to ensure schedule does not enable preempt. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c new file mode 100644 index 000000000000..b80b73795dec --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "snep" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#define RV_MON_TYPE RV_MON_PER_CPU +#include "snep.h" +#include <rv/da_monitor.h> + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event(preempt_disable_snep); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event(preempt_enable_snep); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + da_handle_event(schedule_entry_snep); +} + +static void handle_schedule_exit(void *data, bool is_switch) +{ + da_handle_start_event(schedule_exit_snep); +} + +static int enable_snep(void) +{ + int retval; + + retval = da_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("snep", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("snep", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("snep", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("snep", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_snep(void) +{ + rv_this.enabled = 0; + + rv_detach_trace_probe("snep", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("snep", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("snep", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("snep", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy(); +} + +static struct rv_monitor rv_this = { + .name = "snep", + .description = "schedule does not enable preempt.", + .enable = enable_snep, + .disable = disable_snep, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_snep(void) +{ + return rv_register_monitor(&rv_this, &rv_sched); +} + +static void __exit unregister_snep(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_snep); +module_exit(unregister_snep); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("snep: schedule does not enable preempt."); diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h new file mode 100644 index 000000000000..357520a5b3d1 --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of snep automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME snep + +enum states_snep { + non_scheduling_context_snep, + scheduling_contex_snep, + state_max_snep, +}; + +#define INVALID_STATE state_max_snep + +enum events_snep { + preempt_disable_snep, + preempt_enable_snep, + schedule_entry_snep, + schedule_exit_snep, + event_max_snep, +}; + +struct automaton_snep { + char *state_names[state_max_snep]; + char *event_names[event_max_snep]; + unsigned char function[state_max_snep][event_max_snep]; + unsigned char initial_state; + bool final_states[state_max_snep]; +}; + +static const struct automaton_snep automaton_snep = { + .state_names = { + "non_scheduling_context", + "scheduling_contex", + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "schedule_entry", + "schedule_exit", + }, + .function = { + { + non_scheduling_context_snep, + non_scheduling_context_snep, + scheduling_contex_snep, + INVALID_STATE, + }, + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + non_scheduling_context_snep, + }, + }, + .initial_state = non_scheduling_context_snep, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/snep/snep_trace.h b/kernel/trace/rv/monitors/snep/snep_trace.h new file mode 100644 index 000000000000..01aad49a949a --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNEP +DEFINE_EVENT(event_da_monitor, event_snep, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_snep, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SNEP */ diff --git a/kernel/trace/rv/monitors/snroc/Kconfig b/kernel/trace/rv/monitors/snroc/Kconfig new file mode 100644 index 000000000000..6e4365a2fea3 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNROC + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_ID + bool "snroc monitor" + help + Monitor to ensure sched_set_state happens only in the respective task's context. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c new file mode 100644 index 000000000000..f168b1a4b12c --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "snroc" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#define RV_MON_TYPE RV_MON_PER_TASK +#include "snroc.h" +#include <rv/da_monitor.h> + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + da_handle_event(tsk, sched_set_state_snroc); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_start_event(prev, sched_switch_out_snroc); + da_handle_event(next, sched_switch_in_snroc); +} + +static int enable_snroc(void) +{ + int retval; + + retval = da_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("snroc", sched_switch, handle_sched_switch); + + return 0; +} + +static void disable_snroc(void) +{ + rv_this.enabled = 0; + + rv_detach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("snroc", sched_switch, handle_sched_switch); + + da_monitor_destroy(); +} + +static struct rv_monitor rv_this = { + .name = "snroc", + .description = "set non runnable on its own context.", + .enable = enable_snroc, + .disable = disable_snroc, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_snroc(void) +{ + return rv_register_monitor(&rv_this, &rv_sched); +} + +static void __exit unregister_snroc(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_snroc); +module_exit(unregister_snroc); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("snroc: set non runnable on its own context."); diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h new file mode 100644 index 000000000000..88b7328ad31a --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of snroc automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME snroc + +enum states_snroc { + other_context_snroc, + own_context_snroc, + state_max_snroc, +}; + +#define INVALID_STATE state_max_snroc + +enum events_snroc { + sched_set_state_snroc, + sched_switch_in_snroc, + sched_switch_out_snroc, + event_max_snroc, +}; + +struct automaton_snroc { + char *state_names[state_max_snroc]; + char *event_names[event_max_snroc]; + unsigned char function[state_max_snroc][event_max_snroc]; + unsigned char initial_state; + bool final_states[state_max_snroc]; +}; + +static const struct automaton_snroc automaton_snroc = { + .state_names = { + "other_context", + "own_context", + }, + .event_names = { + "sched_set_state", + "sched_switch_in", + "sched_switch_out", + }, + .function = { + { INVALID_STATE, own_context_snroc, INVALID_STATE }, + { own_context_snroc, INVALID_STATE, other_context_snroc }, + }, + .initial_state = other_context_snroc, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/snroc/snroc_trace.h b/kernel/trace/rv/monitors/snroc/snroc_trace.h new file mode 100644 index 000000000000..50114cef5122 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNROC +DEFINE_EVENT(event_da_monitor_id, event_snroc, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_snroc, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_SNROC */ diff --git a/kernel/trace/rv/monitors/sssw/Kconfig b/kernel/trace/rv/monitors/sssw/Kconfig new file mode 100644 index 000000000000..23b7eeb38bbf --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SSSW + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_ID + bool "sssw monitor" + help + Monitor to ensure sched_set_state to sleepable leads to sleeping and + sleeping tasks require wakeup. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sssw/sssw.c b/kernel/trace/rv/monitors/sssw/sssw.c new file mode 100644 index 000000000000..a91321c890cd --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "sssw" + +#include <trace/events/sched.h> +#include <trace/events/signal.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#define RV_MON_TYPE RV_MON_PER_TASK +#include "sssw.h" +#include <rv/da_monitor.h> + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + if (state == TASK_RUNNING) + da_handle_start_event(tsk, sched_set_state_runnable_sssw); + else + da_handle_event(tsk, sched_set_state_sleepable_sssw); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + if (preempt) + da_handle_event(prev, sched_switch_preempt_sssw); + else if (prev_state == TASK_RUNNING) + da_handle_event(prev, sched_switch_yield_sssw); + else if (prev_state == TASK_RTLOCK_WAIT) + /* special case of sleeping task with racy conditions */ + da_handle_event(prev, sched_switch_blocking_sssw); + else + da_handle_event(prev, sched_switch_suspend_sssw); + da_handle_event(next, sched_switch_in_sssw); +} + +static void handle_sched_wakeup(void *data, struct task_struct *p) +{ + /* + * Wakeup can also lead to signal_wakeup although the system is + * actually runnable. The monitor can safely start with this event. + */ + da_handle_start_event(p, sched_wakeup_sssw); +} + +static void handle_signal_deliver(void *data, int sig, + struct kernel_siginfo *info, + struct k_sigaction *ka) +{ + da_handle_event(current, signal_deliver_sssw); +} + +static int enable_sssw(void) +{ + int retval; + + retval = da_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("sssw", sched_switch, handle_sched_switch); + rv_attach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup); + rv_attach_trace_probe("sssw", signal_deliver, handle_signal_deliver); + + return 0; +} + +static void disable_sssw(void) +{ + rv_this.enabled = 0; + + rv_detach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("sssw", sched_switch, handle_sched_switch); + rv_detach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup); + rv_detach_trace_probe("sssw", signal_deliver, handle_signal_deliver); + + da_monitor_destroy(); +} + +static struct rv_monitor rv_this = { + .name = "sssw", + .description = "set state sleep and wakeup.", + .enable = enable_sssw, + .disable = disable_sssw, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_sssw(void) +{ + return rv_register_monitor(&rv_this, &rv_sched); +} + +static void __exit unregister_sssw(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_sssw); +module_exit(unregister_sssw); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sssw: set state sleep and wakeup."); diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h new file mode 100644 index 000000000000..1a4b806061c3 --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sssw automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME sssw + +enum states_sssw { + runnable_sssw, + signal_wakeup_sssw, + sleepable_sssw, + sleeping_sssw, + state_max_sssw, +}; + +#define INVALID_STATE state_max_sssw + +enum events_sssw { + sched_set_state_runnable_sssw, + sched_set_state_sleepable_sssw, + sched_switch_blocking_sssw, + sched_switch_in_sssw, + sched_switch_preempt_sssw, + sched_switch_suspend_sssw, + sched_switch_yield_sssw, + sched_wakeup_sssw, + signal_deliver_sssw, + event_max_sssw, +}; + +struct automaton_sssw { + char *state_names[state_max_sssw]; + char *event_names[event_max_sssw]; + unsigned char function[state_max_sssw][event_max_sssw]; + unsigned char initial_state; + bool final_states[state_max_sssw]; +}; + +static const struct automaton_sssw automaton_sssw = { + .state_names = { + "runnable", + "signal_wakeup", + "sleepable", + "sleeping", + }, + .event_names = { + "sched_set_state_runnable", + "sched_set_state_sleepable", + "sched_switch_blocking", + "sched_switch_in", + "sched_switch_preempt", + "sched_switch_suspend", + "sched_switch_yield", + "sched_wakeup", + "signal_deliver", + }, + .function = { + { + runnable_sssw, + sleepable_sssw, + sleeping_sssw, + runnable_sssw, + runnable_sssw, + INVALID_STATE, + runnable_sssw, + runnable_sssw, + runnable_sssw, + }, + { + INVALID_STATE, + sleepable_sssw, + INVALID_STATE, + signal_wakeup_sssw, + signal_wakeup_sssw, + INVALID_STATE, + signal_wakeup_sssw, + signal_wakeup_sssw, + runnable_sssw, + }, + { + runnable_sssw, + sleepable_sssw, + sleeping_sssw, + sleepable_sssw, + sleepable_sssw, + sleeping_sssw, + signal_wakeup_sssw, + runnable_sssw, + sleepable_sssw, + }, + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + runnable_sssw, + INVALID_STATE, + }, + }, + .initial_state = runnable_sssw, + .final_states = { 1, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sssw/sssw_trace.h b/kernel/trace/rv/monitors/sssw/sssw_trace.h new file mode 100644 index 000000000000..6c03cfc6960b --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SSSW +DEFINE_EVENT(event_da_monitor_id, event_sssw, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_sssw, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_SSSW */ diff --git a/kernel/trace/rv/monitors/stall/Kconfig b/kernel/trace/rv/monitors/stall/Kconfig new file mode 100644 index 000000000000..6f846b642544 --- /dev/null +++ b/kernel/trace/rv/monitors/stall/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_STALL + depends on RV + select HA_MON_EVENTS_ID + bool "stall monitor" + help + Enable the stall sample monitor that illustrates the usage of hybrid + automata monitors. It can be used to identify tasks stalled for + longer than a threshold. + + For further information, see: + Documentation/trace/rv/monitor_stall.rst diff --git a/kernel/trace/rv/monitors/stall/stall.c b/kernel/trace/rv/monitors/stall/stall.c new file mode 100644 index 000000000000..3c38fb1a0159 --- /dev/null +++ b/kernel/trace/rv/monitors/stall/stall.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "stall" + +#include <trace/events/sched.h> +#include <rv_trace.h> + +#define RV_MON_TYPE RV_MON_PER_TASK +#define HA_TIMER_TYPE HA_TIMER_WHEEL +#include "stall.h" +#include <rv/ha_monitor.h> + +static u64 threshold_jiffies = 1000; +module_param(threshold_jiffies, ullong, 0644); + +static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_stall env, u64 time_ns) +{ + if (env == clk_stall) + return ha_get_clk_jiffy(ha_mon, env); + return ENV_INVALID_VALUE; +} + +static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_stall env, u64 time_ns) +{ + if (env == clk_stall) + ha_reset_clk_jiffy(ha_mon, env); +} + +static inline bool ha_verify_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (curr_state == enqueued_stall) + return ha_check_invariant_jiffy(ha_mon, clk_stall, time_ns); + return true; +} + +static inline bool ha_verify_guards(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + bool res = true; + + if (curr_state == dequeued_stall && event == sched_wakeup_stall) + ha_reset_env(ha_mon, clk_stall, time_ns); + else if (curr_state == running_stall && event == sched_switch_preempt_stall) + ha_reset_env(ha_mon, clk_stall, time_ns); + return res; +} + +static inline void ha_setup_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (next_state == curr_state) + return; + if (next_state == enqueued_stall) + ha_start_timer_jiffy(ha_mon, clk_stall, threshold_jiffies, time_ns); + else if (curr_state == enqueued_stall) + ha_cancel_timer(ha_mon); +} + +static bool ha_verify_constraint(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns); + + return true; +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + if (!preempt && prev_state != TASK_RUNNING) + da_handle_start_event(prev, sched_switch_wait_stall); + else + da_handle_event(prev, sched_switch_preempt_stall); + da_handle_event(next, sched_switch_in_stall); +} + +static void handle_sched_wakeup(void *data, struct task_struct *p) +{ + da_handle_event(p, sched_wakeup_stall); +} + +static int enable_stall(void) +{ + int retval; + + retval = ha_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("stall", sched_switch, handle_sched_switch); + rv_attach_trace_probe("stall", sched_wakeup, handle_sched_wakeup); + + return 0; +} + +static void disable_stall(void) +{ + rv_this.enabled = 0; + + rv_detach_trace_probe("stall", sched_switch, handle_sched_switch); + rv_detach_trace_probe("stall", sched_wakeup, handle_sched_wakeup); + + ha_monitor_destroy(); +} + +static struct rv_monitor rv_this = { + .name = "stall", + .description = "identify tasks stalled for longer than a threshold.", + .enable = enable_stall, + .disable = disable_stall, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_stall(void) +{ + return rv_register_monitor(&rv_this, NULL); +} + +static void __exit unregister_stall(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_stall); +module_exit(unregister_stall); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("stall: identify tasks stalled for longer than a threshold."); diff --git a/kernel/trace/rv/monitors/stall/stall.h b/kernel/trace/rv/monitors/stall/stall.h new file mode 100644 index 000000000000..638520cb1082 --- /dev/null +++ b/kernel/trace/rv/monitors/stall/stall.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of stall automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME stall + +enum states_stall { + dequeued_stall, + enqueued_stall, + running_stall, + state_max_stall, +}; + +#define INVALID_STATE state_max_stall + +enum events_stall { + sched_switch_in_stall, + sched_switch_preempt_stall, + sched_switch_wait_stall, + sched_wakeup_stall, + event_max_stall, +}; + +enum envs_stall { + clk_stall, + env_max_stall, + env_max_stored_stall = env_max_stall, +}; + +_Static_assert(env_max_stored_stall <= MAX_HA_ENV_LEN, "Not enough slots"); + +struct automaton_stall { + char *state_names[state_max_stall]; + char *event_names[event_max_stall]; + char *env_names[env_max_stall]; + unsigned char function[state_max_stall][event_max_stall]; + unsigned char initial_state; + bool final_states[state_max_stall]; +}; + +static const struct automaton_stall automaton_stall = { + .state_names = { + "dequeued", + "enqueued", + "running", + }, + .event_names = { + "sched_switch_in", + "sched_switch_preempt", + "sched_switch_wait", + "sched_wakeup", + }, + .env_names = { + "clk", + }, + .function = { + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + enqueued_stall, + }, + { + running_stall, + INVALID_STATE, + INVALID_STATE, + enqueued_stall, + }, + { + running_stall, + enqueued_stall, + dequeued_stall, + running_stall, + }, + }, + .initial_state = dequeued_stall, + .final_states = { 1, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/stall/stall_trace.h b/kernel/trace/rv/monitors/stall/stall_trace.h new file mode 100644 index 000000000000..6a7cc1b1d040 --- /dev/null +++ b/kernel/trace/rv/monitors/stall/stall_trace.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_STALL +DEFINE_EVENT(event_da_monitor_id, event_stall, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_stall, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); + +DEFINE_EVENT(error_env_da_monitor_id, error_env_stall, + TP_PROTO(int id, char *state, char *event, char *env), + TP_ARGS(id, state, event, env)); +#endif /* CONFIG_RV_MON_STALL */ diff --git a/kernel/trace/rv/monitors/sts/Kconfig b/kernel/trace/rv/monitors/sts/Kconfig new file mode 100644 index 000000000000..7d1ff0f6fc91 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_STS + depends on RV + depends on TRACE_IRQFLAGS + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sts monitor" + help + Monitor to ensure relationships between scheduler and task switches + * the scheduler is called and returns with interrupts disabled + * each call to the scheduler has up to one switch + * switches only happen inside the scheduler + * each call to the scheduler disables interrupts to switch + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sts/sts.c b/kernel/trace/rv/monitors/sts/sts.c new file mode 100644 index 000000000000..ce031cbf202a --- /dev/null +++ b/kernel/trace/rv/monitors/sts/sts.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "sts" + +#include <trace/events/sched.h> +#include <trace/events/irq.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#define RV_MON_TYPE RV_MON_PER_CPU +#include "sts.h" +#include <rv/da_monitor.h> + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/trace/irq_vectors.h> + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event(irq_entry_sts); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("sts", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("sts", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event(irq_disable_sts); +} + +static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event(irq_enable_sts); +} + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event(irq_entry_sts); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_event(sched_switch_sts); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + da_handle_event(schedule_entry_sts); +} + +static void handle_schedule_exit(void *data, bool is_switch) +{ + da_handle_start_event(schedule_exit_sts); +} + +static int enable_sts(void) +{ + int retval; + + retval = da_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("sts", irq_disable, handle_irq_disable); + rv_attach_trace_probe("sts", irq_enable, handle_irq_enable); + rv_attach_trace_probe("sts", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("sts", sched_switch, handle_sched_switch); + rv_attach_trace_probe("sts", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sts", sched_exit_tp, handle_schedule_exit); + attach_vector_irq(); + + return 0; +} + +static void disable_sts(void) +{ + rv_this.enabled = 0; + + rv_detach_trace_probe("sts", irq_disable, handle_irq_disable); + rv_detach_trace_probe("sts", irq_enable, handle_irq_enable); + rv_detach_trace_probe("sts", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("sts", sched_switch, handle_sched_switch); + rv_detach_trace_probe("sts", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sts", sched_exit_tp, handle_schedule_exit); + detach_vector_irq(); + + da_monitor_destroy(); +} + +/* + * This is the monitor register section. + */ +static struct rv_monitor rv_this = { + .name = "sts", + .description = "schedule implies task switch.", + .enable = enable_sts, + .disable = disable_sts, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_sts(void) +{ + return rv_register_monitor(&rv_this, &rv_sched); +} + +static void __exit unregister_sts(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_sts); +module_exit(unregister_sts); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sts: schedule implies task switch."); diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h new file mode 100644 index 000000000000..6f7b2d9d72e6 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/sts.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sts automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME sts + +enum states_sts { + can_sched_sts, + cant_sched_sts, + disable_to_switch_sts, + enable_to_exit_sts, + in_irq_sts, + scheduling_sts, + switching_sts, + state_max_sts, +}; + +#define INVALID_STATE state_max_sts + +enum events_sts { + irq_disable_sts, + irq_enable_sts, + irq_entry_sts, + sched_switch_sts, + schedule_entry_sts, + schedule_exit_sts, + event_max_sts, +}; + +struct automaton_sts { + char *state_names[state_max_sts]; + char *event_names[event_max_sts]; + unsigned char function[state_max_sts][event_max_sts]; + unsigned char initial_state; + bool final_states[state_max_sts]; +}; + +static const struct automaton_sts automaton_sts = { + .state_names = { + "can_sched", + "cant_sched", + "disable_to_switch", + "enable_to_exit", + "in_irq", + "scheduling", + "switching", + }, + .event_names = { + "irq_disable", + "irq_enable", + "irq_entry", + "sched_switch", + "schedule_entry", + "schedule_exit", + }, + .function = { + { + cant_sched_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + scheduling_sts, + INVALID_STATE, + }, + { + INVALID_STATE, + can_sched_sts, + cant_sched_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + }, + { + INVALID_STATE, + enable_to_exit_sts, + in_irq_sts, + switching_sts, + INVALID_STATE, + INVALID_STATE, + }, + { + enable_to_exit_sts, + enable_to_exit_sts, + enable_to_exit_sts, + INVALID_STATE, + INVALID_STATE, + can_sched_sts, + }, + { + INVALID_STATE, + scheduling_sts, + in_irq_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + }, + { + disable_to_switch_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + }, + { + INVALID_STATE, + enable_to_exit_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + }, + }, + .initial_state = can_sched_sts, + .final_states = { 1, 0, 0, 0, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sts/sts_trace.h b/kernel/trace/rv/monitors/sts/sts_trace.h new file mode 100644 index 000000000000..d78beb58d5b3 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/sts_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_STS +DEFINE_EVENT(event_da_monitor, event_sts, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_sts, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_STS */ diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig new file mode 100644 index 000000000000..87a26195792b --- /dev/null +++ b/kernel/trace/rv/monitors/wip/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_WIP + depends on RV + depends on TRACE_PREEMPT_TOGGLE + select DA_MON_EVENTS_IMPLICIT + bool "wip monitor" + help + Enable wip (wakeup in preemptive) sample monitor that illustrates + the usage of per-cpu monitors, and one limitation of the + preempt_disable/enable events. + + For further information, see: + Documentation/trace/rv/monitor_wip.rst diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index b2b49a27e886..22d77ec42463 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -6,39 +6,37 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "wip" -#include <trace/events/rv.h> +#include <rv_trace.h> #include <trace/events/sched.h> #include <trace/events/preemptirq.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "wip.h" - -static struct rv_monitor rv_wip; -DECLARE_DA_MON_PER_CPU(wip, unsigned char); +#include <rv/da_monitor.h> static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_wip(preempt_disable_wip); + da_handle_event(preempt_disable_wip); } static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_start_event_wip(preempt_enable_wip); + da_handle_start_event(preempt_enable_wip); } static void handle_sched_waking(void *data, struct task_struct *task) { - da_handle_event_wip(sched_waking_wip); + da_handle_event(sched_waking_wip); } static int enable_wip(void) { int retval; - retval = da_monitor_init_wip(); + retval = da_monitor_init(); if (retval) return retval; @@ -51,33 +49,32 @@ static int enable_wip(void) static void disable_wip(void) { - rv_wip.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable); rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable); rv_detach_trace_probe("wip", sched_waking, handle_sched_waking); - da_monitor_destroy_wip(); + da_monitor_destroy(); } -static struct rv_monitor rv_wip = { +static struct rv_monitor rv_this = { .name = "wip", .description = "wakeup in preemptive per-cpu testing monitor.", .enable = enable_wip, .disable = disable_wip, - .reset = da_monitor_reset_all_wip, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_wip(void) { - rv_register_monitor(&rv_wip); - return 0; + return rv_register_monitor(&rv_this, NULL); } static void __exit unregister_wip(void) { - rv_unregister_monitor(&rv_wip); + rv_unregister_monitor(&rv_this); } module_init(register_wip); diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index 2e373f2c65ed..b4c3eea94c86 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -1,22 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Automatically generated C representation of wip automaton * For further information about this format, see kernel documentation: * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME wip + enum states_wip { - preemptive_wip = 0, + preemptive_wip, non_preemptive_wip, - state_max_wip + state_max_wip, }; #define INVALID_STATE state_max_wip enum events_wip { - preempt_disable_wip = 0, + preempt_disable_wip, preempt_enable_wip, sched_waking_wip, - event_max_wip + event_max_wip, }; struct automaton_wip { @@ -30,12 +33,12 @@ struct automaton_wip { static const struct automaton_wip automaton_wip = { .state_names = { "preemptive", - "non_preemptive" + "non_preemptive", }, .event_names = { "preempt_disable", "preempt_enable", - "sched_waking" + "sched_waking", }, .function = { { non_preemptive_wip, INVALID_STATE, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/wip/wip_trace.h b/kernel/trace/rv/monitors/wip/wip_trace.h new file mode 100644 index 000000000000..aa2162f47a4c --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_WIP +DEFINE_EVENT(event_da_monitor, event_wip, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_wip, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_WIP */ diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig new file mode 100644 index 000000000000..d3bfc20037db --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_WWNR + depends on RV + select DA_MON_EVENTS_ID + bool "wwnr monitor" + help + Enable wwnr (wakeup while not running) sample monitor, this is a + sample monitor that illustrates the usage of per-task monitor. + The model is borken on purpose: it serves to test reactors. + + For further information, see: + Documentation/trace/rv/monitor_wwnr.rst diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 0e43dd2db685..579e7e217ee0 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -6,40 +6,38 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "wwnr" -#include <trace/events/rv.h> +#include <rv_trace.h> #include <trace/events/sched.h> +#define RV_MON_TYPE RV_MON_PER_TASK #include "wwnr.h" - -static struct rv_monitor rv_wwnr; -DECLARE_DA_MON_PER_TASK(wwnr, unsigned char); +#include <rv/da_monitor.h> static void handle_switch(void *data, bool preempt, struct task_struct *p, struct task_struct *n, unsigned int prev_state) { /* start monitoring only after the first suspension */ if (prev_state == TASK_INTERRUPTIBLE) - da_handle_start_event_wwnr(p, switch_out_wwnr); + da_handle_start_event(p, switch_out_wwnr); else - da_handle_event_wwnr(p, switch_out_wwnr); + da_handle_event(p, switch_out_wwnr); - da_handle_event_wwnr(n, switch_in_wwnr); + da_handle_event(n, switch_in_wwnr); } static void handle_wakeup(void *data, struct task_struct *p) { - da_handle_event_wwnr(p, wakeup_wwnr); + da_handle_event(p, wakeup_wwnr); } static int enable_wwnr(void) { int retval; - retval = da_monitor_init_wwnr(); + retval = da_monitor_init(); if (retval) return retval; @@ -51,32 +49,31 @@ static int enable_wwnr(void) static void disable_wwnr(void) { - rv_wwnr.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("wwnr", sched_switch, handle_switch); rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup); - da_monitor_destroy_wwnr(); + da_monitor_destroy(); } -static struct rv_monitor rv_wwnr = { +static struct rv_monitor rv_this = { .name = "wwnr", .description = "wakeup while not running per-task testing model.", .enable = enable_wwnr, .disable = disable_wwnr, - .reset = da_monitor_reset_all_wwnr, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_wwnr(void) { - rv_register_monitor(&rv_wwnr); - return 0; + return rv_register_monitor(&rv_this, NULL); } static void __exit unregister_wwnr(void) { - rv_unregister_monitor(&rv_wwnr); + rv_unregister_monitor(&rv_this); } module_init(register_wwnr); diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index d0d9c4b8121b..a28006512c9b 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -1,22 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Automatically generated C representation of wwnr automaton * For further information about this format, see kernel documentation: * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME wwnr + enum states_wwnr { - not_running_wwnr = 0, + not_running_wwnr, running_wwnr, - state_max_wwnr + state_max_wwnr, }; #define INVALID_STATE state_max_wwnr enum events_wwnr { - switch_in_wwnr = 0, + switch_in_wwnr, switch_out_wwnr, wakeup_wwnr, - event_max_wwnr + event_max_wwnr, }; struct automaton_wwnr { @@ -30,12 +33,12 @@ struct automaton_wwnr { static const struct automaton_wwnr automaton_wwnr = { .state_names = { "not_running", - "running" + "running", }, .event_names = { "switch_in", "switch_out", - "wakeup" + "wakeup", }, .function = { { running_wwnr, INVALID_STATE, not_running_wwnr }, diff --git a/kernel/trace/rv/monitors/wwnr/wwnr_trace.h b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h new file mode 100644 index 000000000000..fc97ec7476ad --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_WWNR +/* id is the pid of the task */ +DEFINE_EVENT(event_da_monitor_id, event_wwnr, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_wwnr, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_WWNR */ diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index 0186ff4cbd0b..76537b8a4343 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -13,9 +13,9 @@ #include <linux/init.h> #include <linux/rv.h> -static void rv_panic_reaction(char *msg) +__printf(1, 0) static void rv_panic_reaction(const char *msg, va_list args) { - panic(msg); + vpanic(msg, args); } static struct rv_reactor rv_panic = { diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 178759dbf89f..48c934e315b3 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -12,9 +12,9 @@ #include <linux/init.h> #include <linux/rv.h> -static void rv_printk_reaction(char *msg) +__printf(1, 0) static void rv_printk_reaction(const char *msg, va_list args) { - printk_deferred(msg); + vprintk_deferred(msg, args); } static struct rv_reactor rv_printk = { diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 279c70e1bd74..ee4e68102f17 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -143,9 +143,9 @@ #include <linux/init.h> #include <linux/slab.h> -#ifdef CONFIG_DA_MON_EVENTS +#ifdef CONFIG_RV_MON_EVENTS #define CREATE_TRACE_POINTS -#include <trace/events/rv.h> +#include <rv_trace.h> #endif #include "rv.h" @@ -162,10 +162,10 @@ struct dentry *get_monitors_root(void) /* * Interface for the monitor register. */ -static LIST_HEAD(rv_monitors_list); +LIST_HEAD(rv_monitors_list); static int task_monitor_count; -static bool task_monitor_slots[RV_PER_TASK_MONITORS]; +static bool task_monitor_slots[CONFIG_RV_PER_TASK_MONITORS]; int rv_get_task_monitor_slot(void) { @@ -173,12 +173,12 @@ int rv_get_task_monitor_slot(void) lockdep_assert_held(&rv_interface_lock); - if (task_monitor_count == RV_PER_TASK_MONITORS) + if (task_monitor_count == CONFIG_RV_PER_TASK_MONITORS) return -EBUSY; task_monitor_count++; - for (i = 0; i < RV_PER_TASK_MONITORS; i++) { + for (i = 0; i < CONFIG_RV_PER_TASK_MONITORS; i++) { if (task_monitor_slots[i] == false) { task_monitor_slots[i] = true; return i; @@ -194,7 +194,7 @@ void rv_put_task_monitor_slot(int slot) { lockdep_assert_held(&rv_interface_lock); - if (slot < 0 || slot >= RV_PER_TASK_MONITORS) { + if (slot < 0 || slot >= CONFIG_RV_PER_TASK_MONITORS) { WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot); return; } @@ -207,15 +207,44 @@ void rv_put_task_monitor_slot(int slot) } /* + * Monitors with a parent are nested, + * Monitors without a parent could be standalone or containers. + */ +bool rv_is_nested_monitor(struct rv_monitor *mon) +{ + return mon->parent != NULL; +} + +/* + * We set our list to have nested monitors listed after their parent + * if a monitor has a child element its a container. + * Containers can be also identified based on their function pointers: + * as they are not real monitors they do not need function definitions + * for enable()/disable(). Use this condition to find empty containers. + * Keep both conditions in case we have some non-compliant containers. + */ +bool rv_is_container_monitor(struct rv_monitor *mon) +{ + struct rv_monitor *next; + + if (list_is_last(&mon->list, &rv_monitors_list)) + return false; + + next = list_next_entry(mon, list); + + return next->parent == mon || !mon->enable; +} + +/* * This section collects the monitor/ files and folders. */ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; const char *buff; - buff = mdef->monitor->enabled ? "1\n" : "0\n"; + buff = mon->enabled ? "1\n" : "0\n"; return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); } @@ -223,13 +252,14 @@ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf /* * __rv_disable_monitor - disabled an enabled monitor */ -static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) +static int __rv_disable_monitor(struct rv_monitor *mon, bool sync) { lockdep_assert_held(&rv_interface_lock); - if (mdef->monitor->enabled) { - mdef->monitor->enabled = 0; - mdef->monitor->disable(); + if (mon->enabled) { + mon->enabled = 0; + if (mon->disable) + mon->disable(); /* * Wait for the execution of all events to finish. @@ -243,37 +273,90 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) return 0; } +static void rv_disable_single(struct rv_monitor *mon) +{ + __rv_disable_monitor(mon, true); +} + +static int rv_enable_single(struct rv_monitor *mon) +{ + int retval; + + lockdep_assert_held(&rv_interface_lock); + + if (mon->enabled) + return 0; + + retval = mon->enable(); + + if (!retval) + mon->enabled = 1; + + return retval; +} + +static void rv_disable_container(struct rv_monitor *mon) +{ + struct rv_monitor *p = mon; + int enabled = 0; + + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (p->parent != mon) + break; + enabled += __rv_disable_monitor(p, false); + } + if (enabled) + tracepoint_synchronize_unregister(); + mon->enabled = 0; +} + +static int rv_enable_container(struct rv_monitor *mon) +{ + struct rv_monitor *p = mon; + int retval = 0; + + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (retval || p->parent != mon) + break; + retval = rv_enable_single(p); + } + if (retval) + rv_disable_container(mon); + else + mon->enabled = 1; + return retval; +} + /** * rv_disable_monitor - disable a given runtime monitor - * @mdef: Pointer to the monitor definition structure. + * @mon: Pointer to the monitor definition structure. * * Returns 0 on success. */ -int rv_disable_monitor(struct rv_monitor_def *mdef) +int rv_disable_monitor(struct rv_monitor *mon) { - __rv_disable_monitor(mdef, true); + if (rv_is_container_monitor(mon)) + rv_disable_container(mon); + else + rv_disable_single(mon); + return 0; } /** * rv_enable_monitor - enable a given runtime monitor - * @mdef: Pointer to the monitor definition structure. + * @mon: Pointer to the monitor definition structure. * * Returns 0 on success, error otherwise. */ -int rv_enable_monitor(struct rv_monitor_def *mdef) +int rv_enable_monitor(struct rv_monitor *mon) { int retval; - lockdep_assert_held(&rv_interface_lock); - - if (mdef->monitor->enabled) - return 0; - - retval = mdef->monitor->enable(); - - if (!retval) - mdef->monitor->enabled = 1; + if (rv_is_container_monitor(mon)) + retval = rv_enable_container(mon); + else + retval = rv_enable_single(mon); return retval; } @@ -284,7 +367,7 @@ int rv_enable_monitor(struct rv_monitor_def *mdef) static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; int retval; bool val; @@ -292,14 +375,12 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u if (retval) return retval; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); if (val) - retval = rv_enable_monitor(mdef); + retval = rv_enable_monitor(mon); else - retval = rv_disable_monitor(mdef); - - mutex_unlock(&rv_interface_lock); + retval = rv_disable_monitor(mon); return retval ? : count; } @@ -316,12 +397,12 @@ static const struct file_operations interface_enable_fops = { static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; char buff[256]; memset(buff, 0, sizeof(buff)); - snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description); + snprintf(buff, sizeof(buff), "%s\n", mon->description); return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); } @@ -336,38 +417,30 @@ static const struct file_operations interface_desc_fops = { * the monitor dir, where the specific options of the monitor * are exposed. */ -static int create_monitor_dir(struct rv_monitor_def *mdef) +static int create_monitor_dir(struct rv_monitor *mon, struct rv_monitor *parent) { - struct dentry *root = get_monitors_root(); - const char *name = mdef->monitor->name; + struct dentry *root = parent ? parent->root_d : get_monitors_root(); + struct dentry *dir __free(rv_remove) = rv_create_dir(mon->name, root); struct dentry *tmp; int retval; - mdef->root_d = rv_create_dir(name, root); - if (!mdef->root_d) + if (!dir) return -ENOMEM; - tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops); - if (!tmp) { - retval = -ENOMEM; - goto out_remove_root; - } + tmp = rv_create_file("enable", RV_MODE_WRITE, dir, mon, &interface_enable_fops); + if (!tmp) + return -ENOMEM; - tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops); - if (!tmp) { - retval = -ENOMEM; - goto out_remove_root; - } + tmp = rv_create_file("desc", RV_MODE_READ, dir, mon, &interface_desc_fops); + if (!tmp) + return -ENOMEM; - retval = reactor_populate_monitor(mdef); + retval = reactor_populate_monitor(mon, dir); if (retval) - goto out_remove_root; + return retval; + mon->root_d = no_free_ptr(dir); return 0; - -out_remove_root: - rv_remove(mdef->root_d); - return retval; } /* @@ -375,9 +448,12 @@ out_remove_root: */ static int monitors_show(struct seq_file *m, void *p) { - struct rv_monitor_def *mon_def = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); - seq_printf(m, "%s\n", mon_def->monitor->name); + if (mon->parent) + seq_printf(m, "%s:%s\n", mon->parent->name, mon->name); + else + seq_printf(m, "%s\n", mon->name); return 0; } @@ -409,13 +485,13 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) */ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) { - struct rv_monitor_def *m_def = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); (*pos)++; - list_for_each_entry_continue(m_def, &rv_monitors_list, list) { - if (m_def->monitor->enabled) - return m_def; + list_for_each_entry_continue(mon, &rv_monitors_list, list) { + if (mon->enabled) + return &mon->list; } return NULL; @@ -423,7 +499,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) { - struct rv_monitor_def *m_def; + struct list_head *head; loff_t l; mutex_lock(&rv_interface_lock); @@ -431,15 +507,15 @@ static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) if (list_empty(&rv_monitors_list)) return NULL; - m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list); + head = &rv_monitors_list; for (l = 0; l <= *pos; ) { - m_def = enabled_monitors_next(m, m_def, &l); - if (!m_def) + head = enabled_monitors_next(m, head, &l); + if (!head) break; } - return m_def; + return head; } /* @@ -479,13 +555,13 @@ static const struct file_operations available_monitors_ops = { */ static void disable_all_monitors(void) { - struct rv_monitor_def *mdef; + struct rv_monitor *mon; int enabled = 0; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); - list_for_each_entry(mdef, &rv_monitors_list, list) - enabled += __rv_disable_monitor(mdef, false); + list_for_each_entry(mon, &rv_monitors_list, list) + enabled += __rv_disable_monitor(mon, false); if (enabled) { /* @@ -495,8 +571,6 @@ static void disable_all_monitors(void) */ tracepoint_synchronize_unregister(); } - - mutex_unlock(&rv_interface_lock); } static int enabled_monitors_open(struct inode *inode, struct file *file) @@ -511,10 +585,10 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user size_t count, loff_t *ppos) { char buff[MAX_RV_MONITOR_NAME_SIZE + 2]; - struct rv_monitor_def *mdef; + struct rv_monitor *mon; int retval = -EINVAL; bool enable = true; - char *ptr; + char *ptr, *tmp; int len; if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) @@ -537,29 +611,32 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user if (!len) return count; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); retval = -EINVAL; - list_for_each_entry(mdef, &rv_monitors_list, list) { - if (strcmp(ptr, mdef->monitor->name) != 0) + /* we support 1 nesting level, trim the parent */ + tmp = strstr(ptr, ":"); + if (tmp) + ptr = tmp+1; + + list_for_each_entry(mon, &rv_monitors_list, list) { + if (strcmp(ptr, mon->name) != 0) continue; /* * Monitor found! */ if (enable) - retval = rv_enable_monitor(mdef); + retval = rv_enable_monitor(mon); else - retval = rv_disable_monitor(mdef); - - if (!retval) - retval = count; + retval = rv_disable_monitor(mon); - break; + if (retval) + return retval; + return count; } - mutex_unlock(&rv_interface_lock); return retval; } @@ -583,8 +660,6 @@ static bool __read_mostly monitoring_on; */ bool rv_monitoring_on(void) { - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_rmb(); return READ_ONCE(monitoring_on); } @@ -604,25 +679,21 @@ static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf, static void turn_monitoring_off(void) { WRITE_ONCE(monitoring_on, false); - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_wmb(); } static void reset_all_monitors(void) { - struct rv_monitor_def *mdef; + struct rv_monitor *mon; - list_for_each_entry(mdef, &rv_monitors_list, list) { - if (mdef->monitor->enabled) - mdef->monitor->reset(); + list_for_each_entry(mon, &rv_monitors_list, list) { + if (mon->enabled && mon->reset) + mon->reset(); } } static void turn_monitoring_on(void) { WRITE_ONCE(monitoring_on, true); - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_wmb(); } static void turn_monitoring_on_with_reset(void) @@ -652,7 +723,7 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us if (retval) return retval; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); if (val) turn_monitoring_on_with_reset(); @@ -665,8 +736,6 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us */ tracepoint_synchronize_unregister(); - mutex_unlock(&rv_interface_lock); - return count; } @@ -676,58 +745,57 @@ static const struct file_operations monitoring_on_fops = { .read = monitoring_on_read_data, }; -static void destroy_monitor_dir(struct rv_monitor_def *mdef) +static void destroy_monitor_dir(struct rv_monitor *mon) { - reactor_cleanup_monitor(mdef); - rv_remove(mdef->root_d); + rv_remove(mon->root_d); } /** * rv_register_monitor - register a rv monitor. * @monitor: The rv_monitor to be registered. + * @parent: The parent of the monitor to be registered, NULL if not nested. * * Returns 0 if successful, error otherwise. */ -int rv_register_monitor(struct rv_monitor *monitor) +int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) { - struct rv_monitor_def *r; + struct rv_monitor *r; int retval = 0; if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { pr_info("Monitor %s has a name longer than %d\n", monitor->name, MAX_RV_MONITOR_NAME_SIZE); - return -1; + return -EINVAL; } - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); list_for_each_entry(r, &rv_monitors_list, list) { - if (strcmp(monitor->name, r->monitor->name) == 0) { + if (strcmp(monitor->name, r->name) == 0) { pr_info("Monitor %s is already registered\n", monitor->name); - retval = -1; - goto out_unlock; + return -EEXIST; } } - r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); - if (!r) { - retval = -ENOMEM; - goto out_unlock; + if (parent && rv_is_nested_monitor(parent)) { + pr_info("Parent monitor %s is already nested, cannot nest further\n", + parent->name); + return -EINVAL; } - r->monitor = monitor; + monitor->parent = parent; - retval = create_monitor_dir(r); - if (retval) { - kfree(r); - goto out_unlock; - } + retval = create_monitor_dir(monitor, parent); + if (retval) + return retval; - list_add_tail(&r->list, &rv_monitors_list); + /* keep children close to the parent for easier visualisation */ + if (parent) + list_add(&monitor->list, &parent->list); + else + list_add_tail(&monitor->list, &rv_monitors_list); -out_unlock: - mutex_unlock(&rv_interface_lock); - return retval; + return 0; } /** @@ -738,19 +806,12 @@ out_unlock: */ int rv_unregister_monitor(struct rv_monitor *monitor) { - struct rv_monitor_def *ptr, *next; + guard(mutex)(&rv_interface_lock); - mutex_lock(&rv_interface_lock); + rv_disable_monitor(monitor); + list_del(&monitor->list); + destroy_monitor_dir(monitor); - list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) { - if (strcmp(monitor->name, ptr->monitor->name) == 0) { - rv_disable_monitor(ptr); - list_del(&ptr->list); - destroy_monitor_dir(ptr); - } - } - - mutex_unlock(&rv_interface_lock); return 0; } @@ -758,39 +819,36 @@ int __init rv_init_interface(void) { struct dentry *tmp; int retval; + struct dentry *root_dir __free(rv_remove) = rv_create_dir("rv", NULL); - rv_root.root_dir = rv_create_dir("rv", NULL); - if (!rv_root.root_dir) - goto out_err; + if (!root_dir) + return 1; - rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir); + rv_root.monitors_dir = rv_create_dir("monitors", root_dir); if (!rv_root.monitors_dir) - goto out_err; + return 1; - tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL, + tmp = rv_create_file("available_monitors", RV_MODE_READ, root_dir, NULL, &available_monitors_ops); if (!tmp) - goto out_err; + return 1; - tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL, + tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, root_dir, NULL, &enabled_monitors_ops); if (!tmp) - goto out_err; + return 1; - tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL, + tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, root_dir, NULL, &monitoring_on_fops); if (!tmp) - goto out_err; - retval = init_rv_reactors(rv_root.root_dir); + return 1; + retval = init_rv_reactors(root_dir); if (retval) - goto out_err; + return 1; turn_monitoring_on(); - return 0; + rv_root.root_dir = no_free_ptr(root_dir); -out_err: - rv_remove(rv_root.root_dir); - printk(KERN_ERR "RV: Error while creating the RV interface\n"); - return 1; + return 0; } diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index db6cb0913dbd..2c0f51ff9d5c 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -17,50 +17,29 @@ struct rv_interface { #define rv_create_file tracefs_create_file #define rv_remove tracefs_remove +DEFINE_FREE(rv_remove, struct dentry *, if (_T) rv_remove(_T)); + #define MAX_RV_MONITOR_NAME_SIZE 32 #define MAX_RV_REACTOR_NAME_SIZE 32 extern struct mutex rv_interface_lock; - -#ifdef CONFIG_RV_REACTORS -struct rv_reactor_def { - struct list_head list; - struct rv_reactor *reactor; - /* protected by the monitor interface lock */ - int counter; -}; -#endif - -struct rv_monitor_def { - struct list_head list; - struct rv_monitor *monitor; - struct dentry *root_d; -#ifdef CONFIG_RV_REACTORS - struct rv_reactor_def *rdef; - bool reacting; -#endif - bool task_monitor; -}; +extern struct list_head rv_monitors_list; struct dentry *get_monitors_root(void); -int rv_disable_monitor(struct rv_monitor_def *mdef); -int rv_enable_monitor(struct rv_monitor_def *mdef); +int rv_disable_monitor(struct rv_monitor *mon); +int rv_enable_monitor(struct rv_monitor *mon); +bool rv_is_container_monitor(struct rv_monitor *mon); +bool rv_is_nested_monitor(struct rv_monitor *mon); #ifdef CONFIG_RV_REACTORS -int reactor_populate_monitor(struct rv_monitor_def *mdef); -void reactor_cleanup_monitor(struct rv_monitor_def *mdef); +int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root); int init_rv_reactors(struct dentry *root_dir); #else -static inline int reactor_populate_monitor(struct rv_monitor_def *mdef) +static inline int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root) { return 0; } -static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef) -{ - return; -} - static inline int init_rv_reactors(struct dentry *root_dir) { return 0; diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 7b49cbe388d4..460af07f7aba 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -61,6 +61,7 @@ * printk */ +#include <linux/lockdep.h> #include <linux/slab.h> #include "rv.h" @@ -70,12 +71,12 @@ */ static LIST_HEAD(rv_reactors_list); -static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) +static struct rv_reactor *get_reactor_rdef_by_name(char *name) { - struct rv_reactor_def *r; + struct rv_reactor *r; list_for_each_entry(r, &rv_reactors_list, list) { - if (strcmp(name, r->reactor->name) == 0) + if (strcmp(name, r->name) == 0) return r; } return NULL; @@ -86,9 +87,9 @@ static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) */ static int reactors_show(struct seq_file *m, void *p) { - struct rv_reactor_def *rea_def = p; + struct rv_reactor *reactor = container_of(p, struct rv_reactor, list); - seq_printf(m, "%s\n", rea_def->reactor->name); + seq_printf(m, "%s\n", reactor->name); return 0; } @@ -138,13 +139,13 @@ static const struct file_operations available_reactors_ops = { */ static int monitor_reactor_show(struct seq_file *m, void *p) { - struct rv_monitor_def *mdef = m->private; - struct rv_reactor_def *rdef = p; + struct rv_monitor *mon = m->private; + struct rv_reactor *reactor = container_of(p, struct rv_reactor, list); - if (mdef->rdef == rdef) - seq_printf(m, "[%s]\n", rdef->reactor->name); + if (mon->reactor == reactor) + seq_printf(m, "[%s]\n", reactor->name); else - seq_printf(m, "%s\n", rdef->reactor->name); + seq_printf(m, "%s\n", reactor->name); return 0; } @@ -158,29 +159,45 @@ static const struct seq_operations monitor_reactors_seq_ops = { .show = monitor_reactor_show }; -static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef, - bool reacting) +static void monitor_swap_reactors_single(struct rv_monitor *mon, + struct rv_reactor *reactor, + bool nested) { bool monitor_enabled; /* nothing to do */ - if (mdef->rdef == rdef) + if (mon->reactor == reactor) return; - monitor_enabled = mdef->monitor->enabled; + monitor_enabled = mon->enabled; if (monitor_enabled) - rv_disable_monitor(mdef); + rv_disable_monitor(mon); - /* swap reactor's usage */ - mdef->rdef->counter--; - rdef->counter++; + mon->reactor = reactor; + mon->react = reactor->react; - mdef->rdef = rdef; - mdef->reacting = reacting; - mdef->monitor->react = rdef->reactor->react; + /* enable only once if iterating through a container */ + if (monitor_enabled && !nested) + rv_enable_monitor(mon); +} - if (monitor_enabled) - rv_enable_monitor(mdef); +static void monitor_swap_reactors(struct rv_monitor *mon, struct rv_reactor *reactor) +{ + struct rv_monitor *p = mon; + + if (rv_is_container_monitor(mon)) + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (p->parent != mon) + break; + monitor_swap_reactors_single(p, reactor, true); + } + /* + * This call enables and disables the monitor if they were active. + * In case of a container, we already disabled all and will enable all. + * All nested monitors are enabled also if they were off, we may refine + * this logic in the future. + */ + monitor_swap_reactors_single(mon, reactor, false); } static ssize_t @@ -188,11 +205,10 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buff[MAX_RV_REACTOR_NAME_SIZE + 2]; - struct rv_monitor_def *mdef; - struct rv_reactor_def *rdef; + struct rv_monitor *mon; + struct rv_reactor *reactor; struct seq_file *seq_f; int retval = -EINVAL; - bool enable; char *ptr; int len; @@ -215,30 +231,20 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, * See monitor_reactors_open() */ seq_f = file->private_data; - mdef = seq_f->private; - - mutex_lock(&rv_interface_lock); + mon = seq_f->private; - retval = -EINVAL; + guard(mutex)(&rv_interface_lock); - list_for_each_entry(rdef, &rv_reactors_list, list) { - if (strcmp(ptr, rdef->reactor->name) != 0) + list_for_each_entry(reactor, &rv_reactors_list, list) { + if (strcmp(ptr, reactor->name) != 0) continue; - if (rdef == get_reactor_rdef_by_name("nop")) - enable = false; - else - enable = true; - - monitor_swap_reactors(mdef, rdef, enable); + monitor_swap_reactors(mon, reactor); - retval = count; - break; + return count; } - mutex_unlock(&rv_interface_lock); - - return retval; + return -EINVAL; } /* @@ -246,7 +252,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, */ static int monitor_reactors_open(struct inode *inode, struct file *file) { - struct rv_monitor_def *mdef = inode->i_private; + struct rv_monitor *mon = inode->i_private; struct seq_file *seq_f; int ret; @@ -262,7 +268,7 @@ static int monitor_reactors_open(struct inode *inode, struct file *file) /* * Copy the create file "private" data to the seq_file private data. */ - seq_f->private = mdef; + seq_f->private = mon; return 0; }; @@ -277,23 +283,16 @@ static const struct file_operations monitor_reactors_ops = { static int __rv_register_reactor(struct rv_reactor *reactor) { - struct rv_reactor_def *r; + struct rv_reactor *r; list_for_each_entry(r, &rv_reactors_list, list) { - if (strcmp(reactor->name, r->reactor->name) == 0) { + if (strcmp(reactor->name, r->name) == 0) { pr_info("Reactor %s is already registered\n", reactor->name); return -EINVAL; } } - r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL); - if (!r) - return -ENOMEM; - - r->reactor = reactor; - r->counter = 0; - - list_add_tail(&r->list, &rv_reactors_list); + list_add_tail(&reactor->list, &rv_reactors_list); return 0; } @@ -306,18 +305,14 @@ static int __rv_register_reactor(struct rv_reactor *reactor) */ int rv_register_reactor(struct rv_reactor *reactor) { - int retval = 0; - if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) { pr_info("Reactor %s has a name longer than %d\n", reactor->name, MAX_RV_MONITOR_NAME_SIZE); return -EINVAL; } - mutex_lock(&rv_interface_lock); - retval = __rv_register_reactor(reactor); - mutex_unlock(&rv_interface_lock); - return retval; + guard(mutex)(&rv_interface_lock); + return __rv_register_reactor(reactor); } /** @@ -328,30 +323,9 @@ int rv_register_reactor(struct rv_reactor *reactor) */ int rv_unregister_reactor(struct rv_reactor *reactor) { - struct rv_reactor_def *ptr, *next; - int ret = 0; - - mutex_lock(&rv_interface_lock); - - list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) { - if (strcmp(reactor->name, ptr->reactor->name) == 0) { - - if (!ptr->counter) { - list_del(&ptr->list); - } else { - printk(KERN_WARNING - "rv: the rv_reactor %s is in use by %d monitor(s)\n", - ptr->reactor->name, ptr->counter); - printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", - ptr->reactor->name); - ret = -EBUSY; - break; - } - } - } - - mutex_unlock(&rv_interface_lock); - return ret; + guard(mutex)(&rv_interface_lock); + list_del(&reactor->list); + return 0; } /* @@ -364,7 +338,7 @@ static bool __read_mostly reacting_on; * * Returns 1 if on, 0 otherwise. */ -bool rv_reacting_on(void) +static bool rv_reacting_on(void) { /* Ensures that concurrent monitors read consistent reacting_on */ smp_rmb(); @@ -406,7 +380,7 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user if (retval) return retval; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); if (val) turn_reacting_on(); @@ -419,8 +393,6 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user */ tracepoint_synchronize_unregister(); - mutex_unlock(&rv_interface_lock); - return count; } @@ -432,43 +404,31 @@ static const struct file_operations reacting_on_fops = { /** * reactor_populate_monitor - creates per monitor reactors file - * @mdef: monitor's definition. + * @mon: The monitor. + * @root: The directory of the monitor. * * Returns 0 if successful, error otherwise. */ -int reactor_populate_monitor(struct rv_monitor_def *mdef) +int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root) { struct dentry *tmp; - tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops); + tmp = rv_create_file("reactors", RV_MODE_WRITE, root, mon, &monitor_reactors_ops); if (!tmp) return -ENOMEM; /* * Configure as the rv_nop reactor. */ - mdef->rdef = get_reactor_rdef_by_name("nop"); - mdef->rdef->counter++; - mdef->reacting = false; + mon->reactor = get_reactor_rdef_by_name("nop"); return 0; } -/** - * reactor_cleanup_monitor - cleanup a monitor reference - * @mdef: monitor's definition. - */ -void reactor_cleanup_monitor(struct rv_monitor_def *mdef) -{ - lockdep_assert_held(&rv_interface_lock); - mdef->rdef->counter--; - WARN_ON_ONCE(mdef->rdef->counter < 0); -} - /* * Nop reactor register */ -static void rv_nop_reaction(char *msg) +__printf(1, 0) static void rv_nop_reaction(const char *msg, va_list args) { } @@ -480,30 +440,42 @@ static struct rv_reactor rv_nop = { int init_rv_reactors(struct dentry *root_dir) { - struct dentry *available, *reacting; int retval; - available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL, - &available_reactors_ops); - if (!available) - goto out_err; + struct dentry *available __free(rv_remove) = + rv_create_file("available_reactors", RV_MODE_READ, root_dir, + NULL, &available_reactors_ops); + + struct dentry *reacting __free(rv_remove) = + rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops); - reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops); - if (!reacting) - goto rm_available; + if (!reacting || !available) + return -ENOMEM; retval = __rv_register_reactor(&rv_nop); if (retval) - goto rm_reacting; + return retval; turn_reacting_on(); + retain_and_null_ptr(available); + retain_and_null_ptr(reacting); return 0; +} + +void rv_react(struct rv_monitor *monitor, const char *msg, ...) +{ + static DEFINE_WAIT_OVERRIDE_MAP(rv_react_map, LD_WAIT_FREE); + va_list args; + + if (!rv_reacting_on() || !monitor->react) + return; + + va_start(args, msg); + + lock_map_acquire_try(&rv_react_map); + monitor->react(msg, args); + lock_map_release(&rv_react_map); -rm_reacting: - rv_remove(reacting); -rm_available: - rv_remove(available); -out_err: - return -ENOMEM; + va_end(args); } diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h new file mode 100644 index 000000000000..9622c269789c --- /dev/null +++ b/kernel/trace/rv/rv_trace.h @@ -0,0 +1,277 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rv + +#if !defined(_TRACE_RV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RV_H + +#include <linux/rv.h> +#include <linux/tracepoint.h> + +#ifdef CONFIG_DA_MON_EVENTS_IMPLICIT +DECLARE_EVENT_CLASS(event_da_monitor, + + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + + TP_ARGS(state, event, next_state, final_state), + + TP_STRUCT__entry( + __string( state, state ) + __string( event, event ) + __string( next_state, next_state ) + __field( bool, final_state ) + ), + + TP_fast_assign( + __assign_str(state); + __assign_str(event); + __assign_str(next_state); + __entry->final_state = final_state; + ), + + TP_printk("%s x %s -> %s%s", + __get_str(state), + __get_str(event), + __get_str(next_state), + __entry->final_state ? " (final)" : "") +); + +DECLARE_EVENT_CLASS(error_da_monitor, + + TP_PROTO(char *state, char *event), + + TP_ARGS(state, event), + + TP_STRUCT__entry( + __string( state, state ) + __string( event, event ) + ), + + TP_fast_assign( + __assign_str(state); + __assign_str(event); + ), + + TP_printk("event %s not expected in the state %s", + __get_str(event), + __get_str(state)) +); + +#include <monitors/wip/wip_trace.h> +#include <monitors/sco/sco_trace.h> +#include <monitors/scpd/scpd_trace.h> +#include <monitors/snep/snep_trace.h> +#include <monitors/sts/sts_trace.h> +// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here + +#ifdef CONFIG_HA_MON_EVENTS_IMPLICIT +/* For simplicity this class is marked as DA although relevant only for HA */ +DECLARE_EVENT_CLASS(error_env_da_monitor, + + TP_PROTO(char *state, char *event, char *env), + + TP_ARGS(state, event, env), + + TP_STRUCT__entry( + __string( state, state ) + __string( event, event ) + __string( env, env ) + ), + + TP_fast_assign( + __assign_str(state); + __assign_str(event); + __assign_str(env); + ), + + TP_printk("event %s not expected in the state %s with env %s", + __get_str(event), + __get_str(state), + __get_str(env)) +); + +#include <monitors/opid/opid_trace.h> +// Add new monitors based on CONFIG_HA_MON_EVENTS_IMPLICIT here + +#endif + +#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ + +#ifdef CONFIG_DA_MON_EVENTS_ID +DECLARE_EVENT_CLASS(event_da_monitor_id, + + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + + TP_ARGS(id, state, event, next_state, final_state), + + TP_STRUCT__entry( + __field( int, id ) + __string( state, state ) + __string( event, event ) + __string( next_state, next_state ) + __field( bool, final_state ) + ), + + TP_fast_assign( + __assign_str(state); + __assign_str(event); + __assign_str(next_state); + __entry->id = id; + __entry->final_state = final_state; + ), + + TP_printk("%d: %s x %s -> %s%s", + __entry->id, + __get_str(state), + __get_str(event), + __get_str(next_state), + __entry->final_state ? " (final)" : "") +); + +DECLARE_EVENT_CLASS(error_da_monitor_id, + + TP_PROTO(int id, char *state, char *event), + + TP_ARGS(id, state, event), + + TP_STRUCT__entry( + __field( int, id ) + __string( state, state ) + __string( event, event ) + ), + + TP_fast_assign( + __assign_str(state); + __assign_str(event); + __entry->id = id; + ), + + TP_printk("%d: event %s not expected in the state %s", + __entry->id, + __get_str(event), + __get_str(state)) +); + +#include <monitors/wwnr/wwnr_trace.h> +#include <monitors/snroc/snroc_trace.h> +#include <monitors/nrp/nrp_trace.h> +#include <monitors/sssw/sssw_trace.h> +// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here + +#ifdef CONFIG_HA_MON_EVENTS_ID +/* For simplicity this class is marked as DA although relevant only for HA */ +DECLARE_EVENT_CLASS(error_env_da_monitor_id, + + TP_PROTO(int id, char *state, char *event, char *env), + + TP_ARGS(id, state, event, env), + + TP_STRUCT__entry( + __field( int, id ) + __string( state, state ) + __string( event, event ) + __string( env, env ) + ), + + TP_fast_assign( + __assign_str(state); + __assign_str(event); + __assign_str(env); + __entry->id = id; + ), + + TP_printk("%d: event %s not expected in the state %s with env %s", + __entry->id, + __get_str(event), + __get_str(state), + __get_str(env)) +); + +#include <monitors/stall/stall_trace.h> +#include <monitors/nomiss/nomiss_trace.h> +// Add new monitors based on CONFIG_HA_MON_EVENTS_ID here + +#endif + +#endif /* CONFIG_DA_MON_EVENTS_ID */ +#ifdef CONFIG_LTL_MON_EVENTS_ID +DECLARE_EVENT_CLASS(event_ltl_monitor_id, + + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + + TP_ARGS(task, states, atoms, next), + + TP_STRUCT__entry( + __string(comm, task->comm) + __field(pid_t, pid) + __string(states, states) + __string(atoms, atoms) + __string(next, next) + ), + + TP_fast_assign( + __assign_str(comm); + __entry->pid = task->pid; + __assign_str(states); + __assign_str(atoms); + __assign_str(next); + ), + + TP_printk("%s[%d]: (%s) x (%s) -> (%s)", __get_str(comm), __entry->pid, + __get_str(states), __get_str(atoms), __get_str(next)) +); + +DECLARE_EVENT_CLASS(error_ltl_monitor_id, + + TP_PROTO(struct task_struct *task), + + TP_ARGS(task), + + TP_STRUCT__entry( + __string(comm, task->comm) + __field(pid_t, pid) + ), + + TP_fast_assign( + __assign_str(comm); + __entry->pid = task->pid; + ), + + TP_printk("%s[%d]: violation detected", __get_str(comm), __entry->pid) +); +#include <monitors/pagefault/pagefault_trace.h> +#include <monitors/sleep/sleep_trace.h> +// Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here +#endif /* CONFIG_LTL_MON_EVENTS_ID */ + +#ifdef CONFIG_RV_MON_MAINTENANCE_EVENTS +/* Tracepoint useful for monitors development, currenly only used in DA */ +TRACE_EVENT(rv_retries_error, + + TP_PROTO(char *name, char *event), + + TP_ARGS(name, event), + + TP_STRUCT__entry( + __string( name, name ) + __string( event, event ) + ), + + TP_fast_assign( + __assign_str(name); + __assign_str(event); + ), + + TP_printk(__stringify(MAX_DA_RETRY_RACING_EVENTS) + " retries reached for event %s, resetting monitor %s", + __get_str(event), __get_str(name)) +); +#endif /* CONFIG_RV_MON_MAINTENANCE_EVENTS */ +#endif /* _TRACE_RV_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE rv_trace +#include <trace/define_trace.h> diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c new file mode 100644 index 000000000000..f4642f5adda3 --- /dev/null +++ b/kernel/trace/simple_ring_buffer.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort <vdonnefort@google.com> + */ + +#include <linux/atomic.h> +#include <linux/simple_ring_buffer.h> + +#include <asm/barrier.h> +#include <asm/local.h> + +enum simple_rb_link_type { + SIMPLE_RB_LINK_NORMAL = 0, + SIMPLE_RB_LINK_HEAD = 1, + SIMPLE_RB_LINK_HEAD_MOVING +}; + +#define SIMPLE_RB_LINK_MASK ~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING) + +static void simple_bpage_set_head_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + link &= SIMPLE_RB_LINK_MASK; + link |= SIMPLE_RB_LINK_HEAD; + + /* + * Paired with simple_rb_find_head() to order access between the head + * link and overrun. It ensures we always report an up-to-date value + * after swapping the reader page. + */ + smp_store_release(&bpage->link.next, (struct list_head *)link); +} + +static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage, + struct simple_buffer_page *dst, + enum simple_rb_link_type new_type) +{ + unsigned long *link = (unsigned long *)(&bpage->link.next); + unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD; + unsigned long new = (unsigned long)(&dst->link) | new_type; + + return try_cmpxchg(link, &old, new); +} + +static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + WRITE_ONCE(bpage->link.next, (struct list_head *)(link & SIMPLE_RB_LINK_MASK)); +} + +static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link) +{ + unsigned long ptr = (unsigned long)link & SIMPLE_RB_LINK_MASK; + + return container_of((struct list_head *)ptr, struct simple_buffer_page, link); +} + +static struct simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage) +{ + return simple_bpage_from_link(bpage->link.next); +} + +static void simple_bpage_reset(struct simple_buffer_page *bpage) +{ + bpage->write = 0; + bpage->entries = 0; + + local_set(&bpage->page->commit, 0); +} + +static void simple_bpage_init(struct simple_buffer_page *bpage, void *page) +{ + INIT_LIST_HEAD(&bpage->link); + bpage->page = (struct buffer_data_page *)page; + + simple_bpage_reset(bpage); +} + +#define simple_rb_meta_inc(__meta, __inc) \ + WRITE_ONCE((__meta), (__meta + __inc)) + +static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer) +{ + return !!cpu_buffer->bpages; +} + +static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer) +{ + int retry = cpu_buffer->nr_pages * 2; + struct simple_buffer_page *head; + + head = cpu_buffer->head_page; + + while (retry--) { + unsigned long link; + +spin: + /* See smp_store_release in simple_bpage_set_head_link() */ + link = (unsigned long)smp_load_acquire(&head->link.prev->next); + + switch (link & ~SIMPLE_RB_LINK_MASK) { + /* Found the head */ + case SIMPLE_RB_LINK_HEAD: + cpu_buffer->head_page = head; + return 0; + /* The writer caught the head, we can spin, that won't be long */ + case SIMPLE_RB_LINK_HEAD_MOVING: + goto spin; + } + + head = simple_bpage_next_page(head); + } + + return -EBUSY; +} + +/** + * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader + * @cpu_buffer: A simple_rb_per_cpu + * + * This function enables consuming reading. It ensures the current head page will not be overwritten + * and can be safely read. + * + * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded or -EBUSY if we failed to catch the + * head page. + */ +int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *last, *head, *reader; + unsigned long overrun; + int retry = 8; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + reader = cpu_buffer->reader_page; + + do { + /* Run after the writer to find the head */ + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + head = cpu_buffer->head_page; + + /* Connect the reader page around the header page */ + reader->link.next = head->link.next; + reader->link.prev = head->link.prev; + + /* The last page before the head */ + last = simple_bpage_from_link(head->link.prev); + + /* The reader page points to the new header page */ + simple_bpage_set_head_link(reader); + + overrun = cpu_buffer->meta->overrun; + } while (!simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL) && retry--); + + if (!retry) + return -EINVAL; + + cpu_buffer->head_page = simple_bpage_from_link(reader->link.next); + cpu_buffer->head_page->link.prev = &reader->link; + cpu_buffer->reader_page = head; + cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun; + cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id; + cpu_buffer->last_overrun = overrun; + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page); + +static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *tail, *new_tail; + + tail = cpu_buffer->tail_page; + new_tail = simple_bpage_next_page(tail); + + if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) { + /* + * Oh no! we've caught the head. There is none anymore and + * swap_reader will spin until we set the new one. Overrun must + * be written first, to make sure we report the correct number + * of lost events. + */ + simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries); + simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1); + + simple_bpage_set_head_link(new_tail); + simple_bpage_set_normal_link(tail); + } + + simple_bpage_reset(new_tail); + cpu_buffer->tail_page = new_tail; + + simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1); + + return new_tail; +} + +static unsigned long rb_event_size(unsigned long length) +{ + struct ring_buffer_event *event; + + return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]); +} + +static struct ring_buffer_event * +rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + + return (struct ring_buffer_event *)((unsigned long)event + 8); +} + +static struct ring_buffer_event * +simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp) +{ + unsigned long ts_ext_size = 0, event_size = rb_event_size(length); + struct simple_buffer_page *tail = cpu_buffer->tail_page; + struct ring_buffer_event *event; + u32 write, prev_write; + u64 time_delta; + + time_delta = timestamp - cpu_buffer->write_stamp; + + if (test_time_stamp(time_delta)) + ts_ext_size = 8; + + prev_write = tail->write; + write = prev_write + event_size + ts_ext_size; + + if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE))) + tail = simple_rb_move_tail(cpu_buffer); + + if (!tail->entries) { + tail->page->time_stamp = timestamp; + time_delta = 0; + ts_ext_size = 0; + write = event_size; + prev_write = 0; + } + + tail->write = write; + tail->entries++; + + cpu_buffer->write_stamp = timestamp; + + event = (struct ring_buffer_event *)(tail->page->data + prev_write); + if (ts_ext_size) { + event = rb_event_add_ts_extend(event, time_delta); + time_delta = 0; + } + + event->type_len = 0; + event->time_delta = time_delta; + event->array[0] = event_size - RB_EVNT_HDR_SIZE; + + return event; +} + +/** + * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @length: Size of the entry in bytes + * @timestamp: Timestamp of the entry + * + * Returns the address of the entry where to write data or NULL + */ +void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, + u64 timestamp) +{ + struct ring_buffer_event *rb_event; + + if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY) + return NULL; + + rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp); + + return &rb_event->array[1]; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve); + +/** + * simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve() + * @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved + */ +void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->tail_page->page->commit, + cpu_buffer->tail_page->write); + simple_rb_meta_inc(cpu_buffer->meta->entries, 1); + + /* + * Paired with simple_rb_enable_tracing() to ensure data is + * written to the ring-buffer before teardown. + */ + smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_commit); + +static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + u32 prev_status; + + if (enable) + return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY); + + /* Wait for the buffer to be released */ + do { + prev_status = cmpxchg_acquire(&cpu_buffer->status, + SIMPLE_RB_READY, + SIMPLE_RB_UNAVAILABLE); + } while (prev_status == SIMPLE_RB_WRITING); + + return prev_status; +} + +/** + * simple_ring_buffer_reset - Reset @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * + * This will not clear the content of the data, only reset counters and pointers + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded. + */ +int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *bpage; + u32 prev_status; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + prev_status = simple_rb_enable_tracing(cpu_buffer, false); + + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + bpage = cpu_buffer->tail_page = cpu_buffer->head_page; + do { + simple_bpage_reset(bpage); + bpage = simple_bpage_next_page(bpage); + } while (bpage != cpu_buffer->head_page); + + simple_bpage_reset(cpu_buffer->reader_page); + + cpu_buffer->last_overrun = 0; + cpu_buffer->write_stamp = 0; + + cpu_buffer->meta->reader.read = 0; + cpu_buffer->meta->reader.lost_events = 0; + cpu_buffer->meta->entries = 0; + cpu_buffer->meta->overrun = 0; + cpu_buffer->meta->read = 0; + cpu_buffer->meta->pages_lost = 0; + cpu_buffer->meta->pages_touched = 0; + + if (prev_status == SIMPLE_RB_READY) + simple_rb_enable_tracing(cpu_buffer, true); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reset); + +int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, + struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc, + void *(*load_page)(unsigned long va), + void (*unload_page)(void *va)) +{ + struct simple_buffer_page *bpage = bpages; + int ret = 0; + void *page; + int i; + + /* At least 1 reader page and two pages in the ring-buffer */ + if (desc->nr_page_va < 3) + return -EINVAL; + + memset(cpu_buffer, 0, sizeof(*cpu_buffer)); + + cpu_buffer->meta = load_page(desc->meta_va); + if (!cpu_buffer->meta) + return -EINVAL; + + memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); + cpu_buffer->meta->meta_page_size = PAGE_SIZE; + + /* The reader page is not part of the ring initially */ + page = load_page(desc->page_va[0]); + if (!page) { + unload_page(cpu_buffer->meta); + return -EINVAL; + } + + simple_bpage_init(bpage, page); + bpage->id = 0; + + cpu_buffer->nr_pages = 1; + + cpu_buffer->reader_page = bpage; + cpu_buffer->tail_page = bpage + 1; + cpu_buffer->head_page = bpage + 1; + + for (i = 1; i < desc->nr_page_va; i++) { + page = load_page(desc->page_va[i]); + if (!page) { + ret = -EINVAL; + break; + } + + simple_bpage_init(++bpage, page); + + bpage->link.next = &(bpage + 1)->link; + bpage->link.prev = &(bpage - 1)->link; + bpage->id = i; + + cpu_buffer->nr_pages = i + 1; + } + + if (ret) { + for (i--; i >= 0; i--) + unload_page(bpages[i].page); + unload_page(cpu_buffer->meta); + + return ret; + } + + cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; + /* Close the ring */ + bpage->link.next = &cpu_buffer->tail_page->link; + cpu_buffer->tail_page->link.prev = &bpage->link; + + /* The last init'ed page points to the head page */ + simple_bpage_set_head_link(bpage); + + cpu_buffer->bpages = bpages; + + return 0; +} + +static void *__load_page(unsigned long page) +{ + return (void *)page; +} + +static void __unload_page(void *page) { } + +/** + * simple_ring_buffer_init - Init @cpu_buffer based on @desc + * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. + * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va + * @desc: A ring_buffer_desc + * + * Returns 0 on success or -EINVAL if the content of @desc is invalid + */ +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc) +{ + return simple_ring_buffer_init_mm(cpu_buffer, bpages, desc, __load_page, __unload_page); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer, + void (*unload_page)(void *)) +{ + int p; + + if (!simple_rb_loaded(cpu_buffer)) + return; + + simple_rb_enable_tracing(cpu_buffer, false); + + unload_page(cpu_buffer->meta); + for (p = 0; p < cpu_buffer->nr_pages; p++) + unload_page(cpu_buffer->bpages[p].page); + + cpu_buffer->bpages = NULL; +} + +/** + * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion + * @cpu_buffer: A simple_rb_per_cpu that will be deleted. + */ +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +{ + return simple_ring_buffer_unload_mm(cpu_buffer, __unload_page); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_unload); + +/** + * simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @enable: True to enable tracing, False to disable it + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded + */ +int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + simple_rb_enable_tracing(cpu_buffer, enable); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8aebcb01e62..6eb4d3097a4d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -20,12 +20,14 @@ #include <linux/security.h> #include <linux/seq_file.h> #include <linux/irqflags.h> +#include <linux/syscalls.h> #include <linux/debugfs.h> #include <linux/tracefs.h> #include <linux/pagemap.h> #include <linux/hardirq.h> #include <linux/linkage.h> #include <linux/uaccess.h> +#include <linux/cleanup.h> #include <linux/vmalloc.h> #include <linux/ftrace.h> #include <linux/module.h> @@ -45,9 +47,11 @@ #include <linux/trace.h> #include <linux/sched/clock.h> #include <linux/sched/rt.h> -#include <linux/fsnotify.h> #include <linux/irq_work.h> #include <linux/workqueue.h> +#include <linux/sort.h> +#include <linux/io.h> /* vmap_page_range() */ +#include <linux/fs_context.h> #include <asm/setup.h> /* COMMAND_LINE_SIZE */ @@ -62,7 +66,7 @@ * insertions into the ring-buffer such as trace_printk could occurred * at the same time, giving false positive or negative results. */ -static bool __read_mostly tracing_selftest_running; +bool __read_mostly tracing_selftest_running; /* * If boot-time tracing including tracers/events via kernel cmdline @@ -78,7 +82,6 @@ void __init disable_tracing_selftest(const char *reason) } } #else -#define tracing_selftest_running 0 #define tracing_selftest_disabled 0 #endif @@ -86,19 +89,16 @@ void __init disable_tracing_selftest(const char *reason) static struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; static bool tracepoint_printk_stop_on_boot __initdata; +static bool traceoff_after_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); -/* For tracers that don't implement custom flags */ -static struct tracer_opt dummy_tracer_opt[] = { - { } +/* Store tracers and their flags per instance */ +struct tracers { + struct list_head list; + struct tracer *tracer; + struct tracer_flags *flags; }; -static int -dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) -{ - return 0; -} - /* * To prevent the comm cache from being overwritten when no * tracing is active, only save the comm when a trace event @@ -112,17 +112,18 @@ DEFINE_PER_CPU(bool, trace_taskinfo_save); * of the tracer is successful. But that is the only place that sets * this back to zero. */ -static int tracing_disabled = 1; +int tracing_disabled = 1; cpumask_var_t __read_mostly tracing_buffer_mask; +#define MAX_TRACER_SIZE 100 /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops * * If there is an oops (or kernel panic) and the ftrace_dump_on_oops * is set, then ftrace_dump is called. This will output the contents * of the ftrace buffers to the console. This is very useful for - * capturing traces that lead to crashes and outputing it to a + * capturing traces that lead to crashes and outputting it to a * serial console. * * It is default off, but you can enable it with either specifying @@ -131,14 +132,47 @@ cpumask_var_t __read_mostly tracing_buffer_mask; * Set 1 if you want to dump buffers of all CPUs * Set 2 if you want to dump the buffer of the CPU that triggered oops * Set instance name if you want to dump the specific trace instance - * Multiple instance dump is also supported, and instances are seperated + * Multiple instance dump is also supported, and instances are separated * by commas. */ /* Set to string format zero to disable by default */ -char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; +static char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ -int __disable_trace_on_warning; +static int __disable_trace_on_warning; + +int tracepoint_printk_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +static const struct ctl_table trace_sysctl_table[] = { + { + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = MAX_TRACER_SIZE, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = tracepoint_printk_sysctl, + }, +}; + +static int __init init_trace_sysctls(void) +{ + register_sysctl_init("kernel", trace_sysctl_table); + return 0; +} +subsys_initcall(init_trace_sysctls); #ifdef CONFIG_TRACE_EVAL_MAP_FILE /* Map of enums to their values, for "eval_map" file */ @@ -184,14 +218,36 @@ static void ftrace_trace_userstack(struct trace_array *tr, static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; -static bool allocate_snapshot; -static bool snapshot_at_boot; - static char boot_instance_info[COMMAND_LINE_SIZE] __initdata; static int boot_instance_index; -static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; -static int boot_snapshot_index; +/* + * Repeated boot parameters, including Bootconfig array expansions, need + * to stay in the delimiter form that the existing parser consumes. + */ +void __init trace_append_boot_param(char *buf, const char *str, char sep, + int size) +{ + int len, needed, str_len; + + if (!*str) + return; + + len = strlen(buf); + str_len = strlen(str); + needed = len + str_len + 1; + + /* For continuation, account for the separator. */ + if (len) + needed++; + if (needed > size) + return; + + if (len) + buf[len++] = sep; + + strscpy(buf + len, str, size - len); +} static int __init set_cmdline_ftrace(char *str) { @@ -241,38 +297,6 @@ static int __init stop_trace_on_warning(char *str) } __setup("traceoff_on_warning", stop_trace_on_warning); -static int __init boot_alloc_snapshot(char *str) -{ - char *slot = boot_snapshot_info + boot_snapshot_index; - int left = sizeof(boot_snapshot_info) - boot_snapshot_index; - int ret; - - if (str[0] == '=') { - str++; - if (strlen(str) >= left) - return -1; - - ret = snprintf(slot, left, "%s\t", str); - boot_snapshot_index += ret; - } else { - allocate_snapshot = true; - /* We also need the main ring buffer expanded */ - trace_set_ring_buffer_expanded(NULL); - } - return 1; -} -__setup("alloc_snapshot", boot_alloc_snapshot); - - -static int __init boot_snapshot(char *str) -{ - snapshot_at_boot = true; - boot_alloc_snapshot(str); - return 1; -} -__setup("ftrace_boot_snapshot", boot_snapshot); - - static int __init boot_instance(char *str) { char *slot = boot_instance_info + boot_instance_index; @@ -294,7 +318,8 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { - strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + trace_append_boot_param(trace_boot_options_buf, str, ',', + MAX_TRACER_SIZE); return 1; } __setup("trace_options=", set_trace_boot_options); @@ -329,6 +354,13 @@ static int __init set_tracepoint_printk_stop(char *str) } __setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop); +static int __init set_traceoff_after_boot(char *str) +{ + traceoff_after_boot = true; + return 1; +} +__setup("traceoff_after_boot", set_traceoff_after_boot); + unsigned long long ns2usecs(u64 nsec) { nsec += 500; @@ -386,15 +418,13 @@ static void ftrace_exports(struct ring_buffer_event *event, int flag) { struct trace_export *export; - preempt_disable_notrace(); + guard(preempt_notrace)(); export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { trace_process_export(export, event, flag); export = rcu_dereference_raw_check(export->next); } - - preempt_enable_notrace(); } static inline void @@ -451,46 +481,40 @@ int register_ftrace_export(struct trace_export *export) if (WARN_ON_ONCE(!export->write)) return -1; - mutex_lock(&ftrace_export_lock); + guard(mutex)(&ftrace_export_lock); add_ftrace_export(&ftrace_exports_list, export); - mutex_unlock(&ftrace_export_lock); - return 0; } EXPORT_SYMBOL_GPL(register_ftrace_export); int unregister_ftrace_export(struct trace_export *export) { - int ret; - - mutex_lock(&ftrace_export_lock); - - ret = rm_ftrace_export(&ftrace_exports_list, export); - - mutex_unlock(&ftrace_export_lock); - - return ret; + guard(mutex)(&ftrace_export_lock); + return rm_ftrace_export(&ftrace_exports_list, export); } EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags holds trace_options default values */ #define TRACE_DEFAULT_FLAGS \ - (FUNCTION_DEFAULT_FLAGS | \ - TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \ - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ - TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ - TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK) + (FUNCTION_DEFAULT_FLAGS | FPROFILE_DEFAULT_FLAGS | \ + TRACE_ITER(PRINT_PARENT) | TRACE_ITER(PRINTK) | \ + TRACE_ITER(ANNOTATE) | TRACE_ITER(CONTEXT_INFO) | \ + TRACE_ITER(RECORD_CMD) | TRACE_ITER(OVERWRITE) | \ + TRACE_ITER(IRQ_INFO) | TRACE_ITER(MARKERS) | \ + TRACE_ITER(HASH_PTR) | TRACE_ITER(TRACE_PRINTK) | \ + TRACE_ITER(COPY_MARKER)) /* trace_options that are only supported by global_trace */ -#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ - TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) +#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER(PRINTK) | \ + TRACE_ITER(PRINTK_MSGONLY) | TRACE_ITER(RECORD_CMD) | \ + TRACE_ITER(PROF_TEXT_OFFSET) | FPROFILE_DEFAULT_FLAGS) /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ - (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK) + (TRACE_ITER(EVENT_FORK) | TRACE_ITER(FUNC_FORK) | TRACE_ITER(TRACE_PRINTK) | \ + TRACE_ITER(COPY_MARKER)) /* * The global_trace is the descriptor that holds the top-level tracing @@ -500,27 +524,41 @@ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, }; -static struct trace_array *printk_trace = &global_trace; +struct trace_array *printk_trace = &global_trace; -static __always_inline bool printk_binsafe(struct trace_array *tr) -{ - /* - * The binary format of traceprintk can cause a crash if used - * by a buffer from another boot. Force the use of the - * non binary version of trace_printk if the trace_printk - * buffer is a boot mapped ring buffer. - */ - return !(tr->flags & TRACE_ARRAY_FL_BOOT); -} +/* List of trace_arrays interested in the top level trace_marker */ +static LIST_HEAD(marker_copies); static void update_printk_trace(struct trace_array *tr) { if (printk_trace == tr) return; - printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK; + printk_trace->trace_flags &= ~TRACE_ITER(TRACE_PRINTK); printk_trace = tr; - tr->trace_flags |= TRACE_ITER_TRACE_PRINTK; + tr->trace_flags |= TRACE_ITER(TRACE_PRINTK); +} + +/* Returns true if the status of tr changed */ +static bool update_marker_trace(struct trace_array *tr, int enabled) +{ + lockdep_assert_held(&event_mutex); + + if (enabled) { + if (tr->trace_flags & TRACE_ITER(COPY_MARKER)) + return false; + + list_add_rcu(&tr->marker_list, &marker_copies); + tr->trace_flags |= TRACE_ITER(COPY_MARKER); + return true; + } + + if (!(tr->trace_flags & TRACE_ITER(COPY_MARKER))) + return false; + + list_del_rcu(&tr->marker_list); + tr->trace_flags &= ~TRACE_ITER(COPY_MARKER); + return true; } void trace_set_ring_buffer_expanded(struct trace_array *tr) @@ -530,30 +568,83 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr) tr->ring_buffer_expanded = true; } +static void trace_array_autoremove(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, autoremove_work); + + trace_array_destroy(tr); +} + +static struct workqueue_struct *autoremove_wq; + +static void trace_array_kick_autoremove(struct trace_array *tr) +{ + if (autoremove_wq) + queue_work(autoremove_wq, &tr->autoremove_work); +} + +static void trace_array_cancel_autoremove(struct trace_array *tr) +{ + /* + * Since this can be called inside trace_array_autoremove(), + * it has to avoid deadlock of the workqueue. + */ + if (work_pending(&tr->autoremove_work)) + cancel_work_sync(&tr->autoremove_work); +} + +static void trace_array_init_autoremove(struct trace_array *tr) +{ + INIT_WORK(&tr->autoremove_work, trace_array_autoremove); +} + +static void trace_array_start_autoremove(void) +{ + if (autoremove_wq) + return; + + autoremove_wq = alloc_workqueue("tr_autoremove_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!autoremove_wq) + pr_warn("Unable to allocate tr_autoremove_wq. autoremove disabled.\n"); +} + LIST_HEAD(ftrace_trace_arrays); +static int __trace_array_get(struct trace_array *this_tr) +{ + /* When free_on_close is set, this is not available anymore. */ + if (autoremove_wq && this_tr->free_on_close) + return -ENODEV; + + this_tr->ref++; + return 0; +} + int trace_array_get(struct trace_array *this_tr) { struct trace_array *tr; - int ret = -ENODEV; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) { - tr->ref++; - ret = 0; - break; + return __trace_array_get(tr); } } - mutex_unlock(&trace_types_lock); - return ret; + return -ENODEV; } static void __trace_array_put(struct trace_array *this_tr) { WARN_ON(!this_tr->ref); this_tr->ref--; + /* + * When free_on_close is set, prepare removing the array + * when the last reference is released. + */ + if (this_tr->ref == 1 && this_tr->free_on_close) + trace_array_kick_autoremove(this_tr); } /** @@ -570,9 +661,8 @@ void trace_array_put(struct trace_array *this_tr) if (!this_tr) return; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); __trace_array_put(this_tr); - mutex_unlock(&trace_types_lock); } EXPORT_SYMBOL_GPL(trace_array_put); @@ -593,244 +683,6 @@ int tracing_check_open_get_tr(struct trace_array *tr) return 0; } -/** - * trace_find_filtered_pid - check if a pid exists in a filtered_pid list - * @filtered_pids: The list of pids to check - * @search_pid: The PID to find in @filtered_pids - * - * Returns true if @search_pid is found in @filtered_pids, and false otherwise. - */ -bool -trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) -{ - return trace_pid_list_is_set(filtered_pids, search_pid); -} - -/** - * trace_ignore_this_task - should a task be ignored for tracing - * @filtered_pids: The list of pids to check - * @filtered_no_pids: The list of pids not to be traced - * @task: The task that should be ignored if not filtered - * - * Checks if @task should be traced or not from @filtered_pids. - * Returns true if @task should *NOT* be traced. - * Returns false if @task should be traced. - */ -bool -trace_ignore_this_task(struct trace_pid_list *filtered_pids, - struct trace_pid_list *filtered_no_pids, - struct task_struct *task) -{ - /* - * If filtered_no_pids is not empty, and the task's pid is listed - * in filtered_no_pids, then return true. - * Otherwise, if filtered_pids is empty, that means we can - * trace all tasks. If it has content, then only trace pids - * within filtered_pids. - */ - - return (filtered_pids && - !trace_find_filtered_pid(filtered_pids, task->pid)) || - (filtered_no_pids && - trace_find_filtered_pid(filtered_no_pids, task->pid)); -} - -/** - * trace_filter_add_remove_task - Add or remove a task from a pid_list - * @pid_list: The list to modify - * @self: The current task for fork or NULL for exit - * @task: The task to add or remove - * - * If adding a task, if @self is defined, the task is only added if @self - * is also included in @pid_list. This happens on fork and tasks should - * only be added when the parent is listed. If @self is NULL, then the - * @task pid will be removed from the list, which would happen on exit - * of a task. - */ -void trace_filter_add_remove_task(struct trace_pid_list *pid_list, - struct task_struct *self, - struct task_struct *task) -{ - if (!pid_list) - return; - - /* For forks, we only add if the forking task is listed */ - if (self) { - if (!trace_find_filtered_pid(pid_list, self->pid)) - return; - } - - /* "self" is set for forks, and NULL for exits */ - if (self) - trace_pid_list_set(pid_list, task->pid); - else - trace_pid_list_clear(pid_list, task->pid); -} - -/** - * trace_pid_next - Used for seq_file to get to the next pid of a pid_list - * @pid_list: The pid list to show - * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) - * @pos: The position of the file - * - * This is used by the seq_file "next" operation to iterate the pids - * listed in a trace_pid_list structure. - * - * Returns the pid+1 as we want to display pid of zero, but NULL would - * stop the iteration. - */ -void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) -{ - long pid = (unsigned long)v; - unsigned int next; - - (*pos)++; - - /* pid already is +1 of the actual previous bit */ - if (trace_pid_list_next(pid_list, pid, &next) < 0) - return NULL; - - pid = next; - - /* Return pid + 1 to allow zero to be represented */ - return (void *)(pid + 1); -} - -/** - * trace_pid_start - Used for seq_file to start reading pid lists - * @pid_list: The pid list to show - * @pos: The position of the file - * - * This is used by seq_file "start" operation to start the iteration - * of listing pids. - * - * Returns the pid+1 as we want to display pid of zero, but NULL would - * stop the iteration. - */ -void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) -{ - unsigned long pid; - unsigned int first; - loff_t l = 0; - - if (trace_pid_list_first(pid_list, &first) < 0) - return NULL; - - pid = first; - - /* Return pid + 1 so that zero can be the exit value */ - for (pid++; pid && l < *pos; - pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) - ; - return (void *)pid; -} - -/** - * trace_pid_show - show the current pid in seq_file processing - * @m: The seq_file structure to write into - * @v: A void pointer of the pid (+1) value to display - * - * Can be directly used by seq_file operations to display the current - * pid value. - */ -int trace_pid_show(struct seq_file *m, void *v) -{ - unsigned long pid = (unsigned long)v - 1; - - seq_printf(m, "%lu\n", pid); - return 0; -} - -/* 128 should be much more than enough */ -#define PID_BUF_SIZE 127 - -int trace_pid_write(struct trace_pid_list *filtered_pids, - struct trace_pid_list **new_pid_list, - const char __user *ubuf, size_t cnt) -{ - struct trace_pid_list *pid_list; - struct trace_parser parser; - unsigned long val; - int nr_pids = 0; - ssize_t read = 0; - ssize_t ret; - loff_t pos; - pid_t pid; - - if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) - return -ENOMEM; - - /* - * Always recreate a new array. The write is an all or nothing - * operation. Always create a new array when adding new pids by - * the user. If the operation fails, then the current list is - * not modified. - */ - pid_list = trace_pid_list_alloc(); - if (!pid_list) { - trace_parser_put(&parser); - return -ENOMEM; - } - - if (filtered_pids) { - /* copy the current bits to the new max */ - ret = trace_pid_list_first(filtered_pids, &pid); - while (!ret) { - trace_pid_list_set(pid_list, pid); - ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); - nr_pids++; - } - } - - ret = 0; - while (cnt > 0) { - - pos = 0; - - ret = trace_get_user(&parser, ubuf, cnt, &pos); - if (ret < 0) - break; - - read += ret; - ubuf += ret; - cnt -= ret; - - if (!trace_parser_loaded(&parser)) - break; - - ret = -EINVAL; - if (kstrtoul(parser.buffer, 0, &val)) - break; - - pid = (pid_t)val; - - if (trace_pid_list_set(pid_list, pid) < 0) { - ret = -1; - break; - } - nr_pids++; - - trace_parser_clear(&parser); - ret = 0; - } - trace_parser_put(&parser); - - if (ret < 0) { - trace_pid_list_free(pid_list); - return ret; - } - - if (!nr_pids) { - /* Cleared the list of pids */ - trace_pid_list_free(pid_list); - pid_list = NULL; - } - - *new_pid_list = pid_list; - - return read; -} - static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) { u64 ts; @@ -866,7 +718,6 @@ int tracing_is_enabled(void) * return the mirror variable of the state of the ring buffer. * It's a little racy, but we don't really care. */ - smp_rmb(); return !global_trace.buffer_disabled; } @@ -974,56 +825,6 @@ static inline void trace_access_lock_init(void) #endif -#ifdef CONFIG_STACKTRACE -static void __ftrace_trace_stack(struct trace_array *tr, - struct trace_buffer *buffer, - unsigned int trace_ctx, - int skip, struct pt_regs *regs); -static inline void ftrace_trace_stack(struct trace_array *tr, - struct trace_buffer *buffer, - unsigned int trace_ctx, - int skip, struct pt_regs *regs); - -#else -static inline void __ftrace_trace_stack(struct trace_array *tr, - struct trace_buffer *buffer, - unsigned int trace_ctx, - int skip, struct pt_regs *regs) -{ -} -static inline void ftrace_trace_stack(struct trace_array *tr, - struct trace_buffer *buffer, - unsigned long trace_ctx, - int skip, struct pt_regs *regs) -{ -} - -#endif - -static __always_inline void -trace_event_setup(struct ring_buffer_event *event, - int type, unsigned int trace_ctx) -{ - struct trace_entry *ent = ring_buffer_event_data(event); - - tracing_generic_entry_update(ent, type, trace_ctx); -} - -static __always_inline struct ring_buffer_event * -__trace_buffer_lock_reserve(struct trace_buffer *buffer, - int type, - unsigned long len, - unsigned int trace_ctx) -{ - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(buffer, len); - if (event != NULL) - trace_event_setup(event, type, trace_ctx); - - return event; -} - void tracer_tracing_on(struct trace_array *tr) { if (tr->array_buffer.buffer) @@ -1037,8 +838,6 @@ void tracer_tracing_on(struct trace_array *tr) * important to be fast than accurate. */ tr->buffer_disabled = 0; - /* Make the flag seen by readers */ - smp_wmb(); } /** @@ -1053,176 +852,7 @@ void tracing_on(void) } EXPORT_SYMBOL_GPL(tracing_on); - -static __always_inline void -__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) -{ - __this_cpu_write(trace_taskinfo_save, true); - - /* If this is the temp buffer, we need to commit fully */ - if (this_cpu_read(trace_buffered_event) == event) { - /* Length is in event->array[0] */ - ring_buffer_write(buffer, event->array[0], &event->array[1]); - /* Release the temp buffer */ - this_cpu_dec(trace_buffered_event_cnt); - /* ring_buffer_unlock_commit() enables preemption */ - preempt_enable_notrace(); - } else - ring_buffer_unlock_commit(buffer); -} - -int __trace_array_puts(struct trace_array *tr, unsigned long ip, - const char *str, int size) -{ - struct ring_buffer_event *event; - struct trace_buffer *buffer; - struct print_entry *entry; - unsigned int trace_ctx; - int alloc; - - if (!(tr->trace_flags & TRACE_ITER_PRINTK)) - return 0; - - if (unlikely(tracing_selftest_running && tr == &global_trace)) - return 0; - - if (unlikely(tracing_disabled)) - return 0; - - alloc = sizeof(*entry) + size + 2; /* possible \n added */ - - trace_ctx = tracing_gen_ctx(); - buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, - trace_ctx); - if (!event) { - size = 0; - goto out; - } - - entry = ring_buffer_event_data(event); - entry->ip = ip; - - memcpy(&entry->buf, str, size); - - /* Add a newline if necessary */ - if (entry->buf[size - 1] != '\n') { - entry->buf[size] = '\n'; - entry->buf[size + 1] = '\0'; - } else - entry->buf[size] = '\0'; - - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - out: - ring_buffer_nest_end(buffer); - return size; -} -EXPORT_SYMBOL_GPL(__trace_array_puts); - -/** - * __trace_puts - write a constant string into the trace buffer. - * @ip: The address of the caller - * @str: The constant string to write - * @size: The size of the string. - */ -int __trace_puts(unsigned long ip, const char *str, int size) -{ - return __trace_array_puts(printk_trace, ip, str, size); -} -EXPORT_SYMBOL_GPL(__trace_puts); - -/** - * __trace_bputs - write the pointer to a constant string into trace buffer - * @ip: The address of the caller - * @str: The constant string to write to the buffer to - */ -int __trace_bputs(unsigned long ip, const char *str) -{ - struct trace_array *tr = READ_ONCE(printk_trace); - struct ring_buffer_event *event; - struct trace_buffer *buffer; - struct bputs_entry *entry; - unsigned int trace_ctx; - int size = sizeof(struct bputs_entry); - int ret = 0; - - if (!printk_binsafe(tr)) - return __trace_puts(ip, str, strlen(str)); - - if (!(tr->trace_flags & TRACE_ITER_PRINTK)) - return 0; - - if (unlikely(tracing_selftest_running || tracing_disabled)) - return 0; - - trace_ctx = tracing_gen_ctx(); - buffer = tr->array_buffer.buffer; - - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, - trace_ctx); - if (!event) - goto out; - - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->str = str; - - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - - ret = 1; - out: - ring_buffer_nest_end(buffer); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_bputs); - #ifdef CONFIG_TRACER_SNAPSHOT -static void tracing_snapshot_instance_cond(struct trace_array *tr, - void *cond_data) -{ - struct tracer *tracer = tr->current_trace; - unsigned long flags; - - if (in_nmi()) { - trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); - trace_array_puts(tr, "*** snapshot is being ignored ***\n"); - return; - } - - if (!tr->allocated_snapshot) { - trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); - trace_array_puts(tr, "*** stopping trace here! ***\n"); - tracer_tracing_off(tr); - return; - } - - /* Note, snapshot can not be used when the tracer uses it */ - if (tracer->use_max_tr) { - trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); - trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); - return; - } - - if (tr->mapped) { - trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); - trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); - return; - } - - local_irq_save(flags); - update_max_tr(tr, current, smp_processor_id(), cond_data); - local_irq_restore(flags); -} - -void tracing_snapshot_instance(struct trace_array *tr) -{ - tracing_snapshot_instance_cond(tr, NULL); -} - /** * tracing_snapshot - take a snapshot of the current buffer. * @@ -1246,143 +876,6 @@ void tracing_snapshot(void) EXPORT_SYMBOL_GPL(tracing_snapshot); /** - * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. - * @tr: The tracing instance to snapshot - * @cond_data: The data to be tested conditionally, and possibly saved - * - * This is the same as tracing_snapshot() except that the snapshot is - * conditional - the snapshot will only happen if the - * cond_snapshot.update() implementation receiving the cond_data - * returns true, which means that the trace array's cond_snapshot - * update() operation used the cond_data to determine whether the - * snapshot should be taken, and if it was, presumably saved it along - * with the snapshot. - */ -void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) -{ - tracing_snapshot_instance_cond(tr, cond_data); -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond); - -/** - * tracing_cond_snapshot_data - get the user data associated with a snapshot - * @tr: The tracing instance - * - * When the user enables a conditional snapshot using - * tracing_snapshot_cond_enable(), the user-defined cond_data is saved - * with the snapshot. This accessor is used to retrieve it. - * - * Should not be called from cond_snapshot.update(), since it takes - * the tr->max_lock lock, which the code calling - * cond_snapshot.update() has already done. - * - * Returns the cond_data associated with the trace array's snapshot. - */ -void *tracing_cond_snapshot_data(struct trace_array *tr) -{ - void *cond_data = NULL; - - local_irq_disable(); - arch_spin_lock(&tr->max_lock); - - if (tr->cond_snapshot) - cond_data = tr->cond_snapshot->cond_data; - - arch_spin_unlock(&tr->max_lock); - local_irq_enable(); - - return cond_data; -} -EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); - -static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, - struct array_buffer *size_buf, int cpu_id); -static void set_buffer_entries(struct array_buffer *buf, unsigned long val); - -int tracing_alloc_snapshot_instance(struct trace_array *tr) -{ - int order; - int ret; - - if (!tr->allocated_snapshot) { - - /* Make the snapshot buffer have the same order as main buffer */ - order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); - ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); - if (ret < 0) - return ret; - - /* allocate spare buffer */ - ret = resize_buffer_duplicate_size(&tr->max_buffer, - &tr->array_buffer, RING_BUFFER_ALL_CPUS); - if (ret < 0) - return ret; - - tr->allocated_snapshot = true; - } - - return 0; -} - -static void free_snapshot(struct trace_array *tr) -{ - /* - * We don't free the ring buffer. instead, resize it because - * The max_tr ring buffer has some state (e.g. ring->clock) and - * we want preserve it. - */ - ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0); - ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); - set_buffer_entries(&tr->max_buffer, 1); - tracing_reset_online_cpus(&tr->max_buffer); - tr->allocated_snapshot = false; -} - -static int tracing_arm_snapshot_locked(struct trace_array *tr) -{ - int ret; - - lockdep_assert_held(&trace_types_lock); - - spin_lock(&tr->snapshot_trigger_lock); - if (tr->snapshot == UINT_MAX || tr->mapped) { - spin_unlock(&tr->snapshot_trigger_lock); - return -EBUSY; - } - - tr->snapshot++; - spin_unlock(&tr->snapshot_trigger_lock); - - ret = tracing_alloc_snapshot_instance(tr); - if (ret) { - spin_lock(&tr->snapshot_trigger_lock); - tr->snapshot--; - spin_unlock(&tr->snapshot_trigger_lock); - } - - return ret; -} - -int tracing_arm_snapshot(struct trace_array *tr) -{ - int ret; - - mutex_lock(&trace_types_lock); - ret = tracing_arm_snapshot_locked(tr); - mutex_unlock(&trace_types_lock); - - return ret; -} - -void tracing_disarm_snapshot(struct trace_array *tr) -{ - spin_lock(&tr->snapshot_trigger_lock); - if (!WARN_ON(!tr->snapshot)) - tr->snapshot--; - spin_unlock(&tr->snapshot_trigger_lock); -} - -/** * tracing_alloc_snapshot - allocate snapshot buffer. * * This only allocates the snapshot buffer if it isn't already @@ -1402,170 +895,18 @@ int tracing_alloc_snapshot(void) return ret; } -EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); - -/** - * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. - * - * This is similar to tracing_snapshot(), but it will allocate the - * snapshot buffer if it isn't already allocated. Use this only - * where it is safe to sleep, as the allocation may sleep. - * - * This causes a swap between the snapshot buffer and the current live - * tracing buffer. You can use this to take snapshots of the live - * trace when some condition is triggered, but continue to trace. - */ -void tracing_snapshot_alloc(void) -{ - int ret; - - ret = tracing_alloc_snapshot(); - if (ret < 0) - return; - - tracing_snapshot(); -} -EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); - -/** - * tracing_snapshot_cond_enable - enable conditional snapshot for an instance - * @tr: The tracing instance - * @cond_data: User data to associate with the snapshot - * @update: Implementation of the cond_snapshot update function - * - * Check whether the conditional snapshot for the given instance has - * already been enabled, or if the current tracer is already using a - * snapshot; if so, return -EBUSY, else create a cond_snapshot and - * save the cond_data and update function inside. - * - * Returns 0 if successful, error otherwise. - */ -int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, - cond_update_fn_t update) -{ - struct cond_snapshot *cond_snapshot; - int ret = 0; - - cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); - if (!cond_snapshot) - return -ENOMEM; - - cond_snapshot->cond_data = cond_data; - cond_snapshot->update = update; - - mutex_lock(&trace_types_lock); - - if (tr->current_trace->use_max_tr) { - ret = -EBUSY; - goto fail_unlock; - } - - /* - * The cond_snapshot can only change to NULL without the - * trace_types_lock. We don't care if we race with it going - * to NULL, but we want to make sure that it's not set to - * something other than NULL when we get here, which we can - * do safely with only holding the trace_types_lock and not - * having to take the max_lock. - */ - if (tr->cond_snapshot) { - ret = -EBUSY; - goto fail_unlock; - } - - ret = tracing_arm_snapshot_locked(tr); - if (ret) - goto fail_unlock; - - local_irq_disable(); - arch_spin_lock(&tr->max_lock); - tr->cond_snapshot = cond_snapshot; - arch_spin_unlock(&tr->max_lock); - local_irq_enable(); - - mutex_unlock(&trace_types_lock); - - return ret; - - fail_unlock: - mutex_unlock(&trace_types_lock); - kfree(cond_snapshot); - return ret; -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); - -/** - * tracing_snapshot_cond_disable - disable conditional snapshot for an instance - * @tr: The tracing instance - * - * Check whether the conditional snapshot for the given instance is - * enabled; if so, free the cond_snapshot associated with it, - * otherwise return -EINVAL. - * - * Returns 0 if successful, error otherwise. - */ -int tracing_snapshot_cond_disable(struct trace_array *tr) -{ - int ret = 0; - - local_irq_disable(); - arch_spin_lock(&tr->max_lock); - - if (!tr->cond_snapshot) - ret = -EINVAL; - else { - kfree(tr->cond_snapshot); - tr->cond_snapshot = NULL; - } - - arch_spin_unlock(&tr->max_lock); - local_irq_enable(); - - tracing_disarm_snapshot(tr); - - return ret; -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); -void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) -{ - WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond); -int tracing_alloc_snapshot(void) -{ - WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); - return -ENODEV; -} -EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); void tracing_snapshot_alloc(void) { /* Give warning */ tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); -void *tracing_cond_snapshot_data(struct trace_array *tr) -{ - return NULL; -} -EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); -int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) -{ - return -ENODEV; -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); -int tracing_snapshot_cond_disable(struct trace_array *tr) -{ - return false; -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); -#define free_snapshot(tr) do { } while (0) -#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; }) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1581,8 +922,39 @@ void tracer_tracing_off(struct trace_array *tr) * important to be fast than accurate. */ tr->buffer_disabled = 1; - /* Make the flag seen by readers */ - smp_wmb(); +} + +/** + * tracer_tracing_disable() - temporary disable the buffer from write + * @tr: The trace array to disable its buffer for + * + * Expects trace_tracing_enable() to re-enable tracing. + * The difference between this and tracer_tracing_off() is that this + * is a counter and can nest, whereas, tracer_tracing_off() can + * be called multiple times and a single trace_tracing_on() will + * enable it. + */ +void tracer_tracing_disable(struct trace_array *tr) +{ + if (WARN_ON_ONCE(!tr->array_buffer.buffer)) + return; + + ring_buffer_record_disable(tr->array_buffer.buffer); +} + +/** + * tracer_tracing_enable() - counter part of tracer_tracing_disable() + * @tr: The trace array that had tracer_tracincg_disable() called on it + * + * This is called after tracer_tracing_disable() has been called on @tr, + * when it's safe to re-enable tracing. + */ +void tracer_tracing_enable(struct trace_array *tr) +{ + if (WARN_ON_ONCE(!tr->array_buffer.buffer)) + return; + + ring_buffer_record_enable(tr->array_buffer.buffer); } /** @@ -1602,9 +974,18 @@ EXPORT_SYMBOL_GPL(tracing_off); void disable_trace_on_warning(void) { if (__disable_trace_on_warning) { + struct trace_array *tr = READ_ONCE(printk_trace); + trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_, "Disabling tracing due to warning\n"); tracing_off(); + + /* Disable trace_printk() buffer too */ + if (tr != &global_trace) { + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "Disabling tracing due to warning\n"); + tracer_tracing_off(tr); + } } } @@ -1754,7 +1135,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, ret = get_user(ch, ubuf++); if (ret) - goto out; + goto fail; read++; cnt--; @@ -1768,7 +1149,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) - goto out; + goto fail; read++; cnt--; } @@ -1778,8 +1159,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* only spaces were written */ if (isspace(ch) || !ch) { *ppos += read; - ret = read; - goto out; + return read; } } @@ -1789,11 +1169,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, parser->buffer[parser->idx++] = ch; else { ret = -EINVAL; - goto out; + goto fail; } + ret = get_user(ch, ubuf++); if (ret) - goto out; + goto fail; read++; cnt--; } @@ -1809,13 +1190,13 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, parser->buffer[parser->idx] = 0; } else { ret = -EINVAL; - goto out; + goto fail; } *ppos += read; - ret = read; - -out: + return read; +fail: + trace_parser_fail(parser); return ret; } @@ -1838,207 +1219,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) unsigned long __read_mostly tracing_thresh; -#ifdef CONFIG_TRACER_MAX_TRACE -static const struct file_operations tracing_max_lat_fops; - -#ifdef LATENCY_FS_NOTIFY - -static struct workqueue_struct *fsnotify_wq; - -static void latency_fsnotify_workfn(struct work_struct *work) -{ - struct trace_array *tr = container_of(work, struct trace_array, - fsnotify_work); - fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); -} - -static void latency_fsnotify_workfn_irq(struct irq_work *iwork) -{ - struct trace_array *tr = container_of(iwork, struct trace_array, - fsnotify_irqwork); - queue_work(fsnotify_wq, &tr->fsnotify_work); -} - -static void trace_create_maxlat_file(struct trace_array *tr, - struct dentry *d_tracer) -{ - INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); - init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); - tr->d_max_latency = trace_create_file("tracing_max_latency", - TRACE_MODE_WRITE, - d_tracer, tr, - &tracing_max_lat_fops); -} - -__init static int latency_fsnotify_init(void) -{ - fsnotify_wq = alloc_workqueue("tr_max_lat_wq", - WQ_UNBOUND | WQ_HIGHPRI, 0); - if (!fsnotify_wq) { - pr_err("Unable to allocate tr_max_lat_wq\n"); - return -ENOMEM; - } - return 0; -} - -late_initcall_sync(latency_fsnotify_init); - -void latency_fsnotify(struct trace_array *tr) -{ - if (!fsnotify_wq) - return; - /* - * We cannot call queue_work(&tr->fsnotify_work) from here because it's - * possible that we are called from __schedule() or do_idle(), which - * could cause a deadlock. - */ - irq_work_queue(&tr->fsnotify_irqwork); -} - -#else /* !LATENCY_FS_NOTIFY */ - -#define trace_create_maxlat_file(tr, d_tracer) \ - trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ - d_tracer, tr, &tracing_max_lat_fops) - -#endif - -/* - * Copy the new maximum trace into the separate maximum-trace - * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/tracing/tracing_max_latency) - */ -static void -__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - struct array_buffer *trace_buf = &tr->array_buffer; - struct array_buffer *max_buf = &tr->max_buffer; - struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); - struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); - - max_buf->cpu = cpu; - max_buf->time_start = data->preempt_timestamp; - - max_data->saved_latency = tr->max_latency; - max_data->critical_start = data->critical_start; - max_data->critical_end = data->critical_end; - - strscpy(max_data->comm, tsk->comm); - max_data->pid = tsk->pid; - /* - * If tsk == current, then use current_uid(), as that does not use - * RCU. The irq tracer can be called out of RCU scope. - */ - if (tsk == current) - max_data->uid = current_uid(); - else - max_data->uid = task_uid(tsk); - - max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; - max_data->policy = tsk->policy; - max_data->rt_priority = tsk->rt_priority; - - /* record this tasks comm */ - tracing_record_cmdline(tsk); - latency_fsnotify(tr); -} - -/** - * update_max_tr - snapshot all trace buffers from global_trace to max_tr - * @tr: tracer - * @tsk: the task with the latency - * @cpu: The cpu that initiated the trace. - * @cond_data: User data associated with a conditional snapshot - * - * Flip the buffers between the @tr and the max_tr and record information - * about which task was the cause of this latency. - */ -void -update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, - void *cond_data) -{ - if (tr->stop_count) - return; - - WARN_ON_ONCE(!irqs_disabled()); - - if (!tr->allocated_snapshot) { - /* Only the nop tracer should hit this when disabling */ - WARN_ON_ONCE(tr->current_trace != &nop_trace); - return; - } - - arch_spin_lock(&tr->max_lock); - - /* Inherit the recordable setting from array_buffer */ - if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) - ring_buffer_record_on(tr->max_buffer.buffer); - else - ring_buffer_record_off(tr->max_buffer.buffer); - -#ifdef CONFIG_TRACER_SNAPSHOT - if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { - arch_spin_unlock(&tr->max_lock); - return; - } -#endif - swap(tr->array_buffer.buffer, tr->max_buffer.buffer); - - __update_max_tr(tr, tsk, cpu); - - arch_spin_unlock(&tr->max_lock); - - /* Any waiters on the old snapshot buffer need to wake up */ - ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); -} - -/** - * update_max_tr_single - only copy one trace over, and reset the rest - * @tr: tracer - * @tsk: task with the latency - * @cpu: the cpu of the buffer to copy. - * - * Flip the trace of a single CPU buffer between the @tr and the max_tr. - */ -void -update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - int ret; - - if (tr->stop_count) - return; - - WARN_ON_ONCE(!irqs_disabled()); - if (!tr->allocated_snapshot) { - /* Only the nop tracer should hit this when disabling */ - WARN_ON_ONCE(tr->current_trace != &nop_trace); - return; - } - - arch_spin_lock(&tr->max_lock); - - ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu); - - if (ret == -EBUSY) { - /* - * We failed to swap the buffer due to a commit taking - * place on this CPU. We fail to record, but we reset - * the max trace buffer (no one writes directly to it) - * and flag that it failed. - * Another reason is resize is in progress. - */ - trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, - "Failed to swap buffers due to commit or resize in progress\n"); - } - - WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); - - __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&tr->max_lock); -} - -#endif /* CONFIG_TRACER_MAX_TRACE */ - struct pipe_wait { struct trace_iterator *iter; int wait_index; @@ -2070,13 +1250,13 @@ static int wait_on_pipe(struct trace_iterator *iter, int full) ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full, wait_pipe_cond, &pwait); -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT /* * Make sure this is still the snapshot buffer, as if a snapshot were * to happen, this would now be the main buffer. */ if (iter->snapshot) - iter->array_buffer = &iter->tr->max_buffer; + iter->array_buffer = &iter->tr->snapshot_buffer; #endif return ret; } @@ -2107,6 +1287,7 @@ static int save_selftest(struct tracer *type) static int run_tracer_selftest(struct tracer *type) { struct trace_array *tr = &global_trace; + struct tracer_flags *saved_flags = tr->current_trace_flags; struct tracer *saved_tracer = tr->current_trace; int ret; @@ -2137,12 +1318,13 @@ static int run_tracer_selftest(struct tracer *type) tracing_reset_online_cpus(&tr->array_buffer); tr->current_trace = type; + tr->current_trace_flags = type->flags ? : type->default_flags; #ifdef CONFIG_TRACER_MAX_TRACE - if (type->use_max_tr) { + if (tracer_uses_snapshot(type)) { /* If we expanded the buffers, make sure the max is expanded too */ if (tr->ring_buffer_expanded) - ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, + ring_buffer_resize(tr->snapshot_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); tr->allocated_snapshot = true; } @@ -2153,6 +1335,7 @@ static int run_tracer_selftest(struct tracer *type) ret = type->selftest(type, tr); /* the test is responsible for resetting too */ tr->current_trace = saved_tracer; + tr->current_trace_flags = saved_flags; if (ret) { printk(KERN_CONT "FAILED!\n"); /* Add the warning after printing 'FAILED' */ @@ -2163,12 +1346,12 @@ static int run_tracer_selftest(struct tracer *type) tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE - if (type->use_max_tr) { + if (tracer_uses_snapshot(type)) { tr->allocated_snapshot = false; /* Shrink the max buffer again */ if (tr->ring_buffer_expanded) - ring_buffer_resize(tr->max_buffer.buffer, 1, + ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); } #endif @@ -2203,10 +1386,10 @@ static __init int init_trace_selftests(void) selftests_can_run = true; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (list_empty(&postponed_selftests)) - goto out; + return 0; pr_info("Running postponed tracer tests:\n"); @@ -2235,9 +1418,6 @@ static __init int init_trace_selftests(void) } tracing_selftest_running = false; - out: - mutex_unlock(&trace_types_lock); - return 0; } core_initcall(init_trace_selftests); @@ -2248,10 +1428,23 @@ static inline int do_run_tracer_selftest(struct tracer *type) } #endif /* CONFIG_FTRACE_STARTUP_TEST */ -static void add_tracer_options(struct trace_array *tr, struct tracer *t); +static int add_tracer(struct trace_array *tr, struct tracer *t); static void __init apply_trace_boot_options(void); +static void free_tracers(struct trace_array *tr) +{ + struct tracers *t, *n; + + lockdep_assert_held(&trace_types_lock); + + list_for_each_entry_safe(t, n, &tr->tracers, list) { + list_del(&t->list); + kfree(t->flags); + kfree(t); + } +} + /** * register_tracer - register a tracer with the ftrace system. * @type: the plugin for the tracer @@ -2260,6 +1453,7 @@ static void __init apply_trace_boot_options(void); */ int __init register_tracer(struct tracer *type) { + struct trace_array *tr; struct tracer *t; int ret = 0; @@ -2291,44 +1485,38 @@ int __init register_tracer(struct tracer *type) } } - if (!type->set_flag) - type->set_flag = &dummy_set_flag; - if (!type->flags) { - /*allocate a dummy tracer_flags*/ - type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); - if (!type->flags) { - ret = -ENOMEM; - goto out; - } - type->flags->val = 0; - type->flags->opts = dummy_tracer_opt; - } else - if (!type->flags->opts) - type->flags->opts = dummy_tracer_opt; - /* store the tracer for __set_tracer_option */ - type->flags->trace = type; + if (type->flags) + type->flags->trace = type; ret = do_run_tracer_selftest(type); if (ret < 0) goto out; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + ret = add_tracer(tr, type); + if (ret < 0) { + /* The tracer will still exist but without options */ + pr_warn("Failed to create tracer options for %s\n", type->name); + break; + } + } + type->next = trace_types; trace_types = type; - add_tracer_options(&global_trace, type); out: mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) - goto out_unlock; + return ret; if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) - goto out_unlock; + return 0; printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ - tracing_set_tracer(&global_trace, type->name); + WARN_ON(tracing_set_tracer(&global_trace, type->name) < 0); default_bootup_tracer = NULL; apply_trace_boot_options(); @@ -2336,11 +1524,10 @@ int __init register_tracer(struct tracer *type) /* disable other selftests, since this will break it. */ disable_tracing_selftest("running a tracer"); - out_unlock: - return ret; + return 0; } -static void tracing_reset_cpu(struct array_buffer *buf, int cpu) +void tracing_reset_cpu(struct array_buffer *buf, int cpu) { struct trace_buffer *buffer = buf->buffer; @@ -2406,17 +1593,16 @@ void tracing_reset_all_online_cpus_unlocked(void) continue; tr->clear_trace = false; tracing_reset_online_cpus(&tr->array_buffer); -#ifdef CONFIG_TRACER_MAX_TRACE - tracing_reset_online_cpus(&tr->max_buffer); +#ifdef CONFIG_TRACER_SNAPSHOT + tracing_reset_online_cpus(&tr->snapshot_buffer); #endif } } void tracing_reset_all_online_cpus(void) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tracing_reset_all_online_cpus_unlocked(); - mutex_unlock(&trace_types_lock); } int is_tracing_stopped(void) @@ -2427,18 +1613,17 @@ int is_tracing_stopped(void) static void tracing_start_tr(struct trace_array *tr) { struct trace_buffer *buffer; - unsigned long flags; if (tracing_disabled) return; - raw_spin_lock_irqsave(&tr->start_lock, flags); + guard(raw_spinlock_irqsave)(&tr->start_lock); if (--tr->stop_count) { if (WARN_ON_ONCE(tr->stop_count < 0)) { /* Someone screwed up their debugging */ tr->stop_count = 0; } - goto out; + return; } /* Prevent the buffers from switching */ @@ -2448,16 +1633,13 @@ static void tracing_start_tr(struct trace_array *tr) if (buffer) ring_buffer_record_enable(buffer); -#ifdef CONFIG_TRACER_MAX_TRACE - buffer = tr->max_buffer.buffer; +#ifdef CONFIG_TRACER_SNAPSHOT + buffer = tr->snapshot_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #endif arch_spin_unlock(&tr->max_lock); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -2475,11 +1657,10 @@ void tracing_start(void) static void tracing_stop_tr(struct trace_array *tr) { struct trace_buffer *buffer; - unsigned long flags; - raw_spin_lock_irqsave(&tr->start_lock, flags); + guard(raw_spinlock_irqsave)(&tr->start_lock); if (tr->stop_count++) - goto out; + return; /* Prevent the buffers from switching */ arch_spin_lock(&tr->max_lock); @@ -2488,16 +1669,13 @@ static void tracing_stop_tr(struct trace_array *tr) if (buffer) ring_buffer_record_disable(buffer); -#ifdef CONFIG_TRACER_MAX_TRACE - buffer = tr->max_buffer.buffer; +#ifdef CONFIG_TRACER_SNAPSHOT + buffer = tr->snapshot_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #endif arch_spin_unlock(&tr->max_lock); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -2610,19 +1788,17 @@ void trace_buffered_event_enable(void) per_cpu(trace_buffered_event, cpu) = event; - preempt_disable(); - if (cpu == smp_processor_id() && - __this_cpu_read(trace_buffered_event) != - per_cpu(trace_buffered_event, cpu)) - WARN_ON_ONCE(1); - preempt_enable(); + scoped_guard(preempt,) { + if (cpu == smp_processor_id() && + __this_cpu_read(trace_buffered_event) != + per_cpu(trace_buffered_event, cpu)) + WARN_ON_ONCE(1); + } } } static void enable_trace_buffered_event(void *data) { - /* Probably not needed, but do it anyway */ - smp_rmb(); this_cpu_dec(trace_buffered_event_cnt); } @@ -2807,7 +1983,7 @@ int tracepoint_printk_sysctl(const struct ctl_table *table, int write, int save_tracepoint_printk; int ret; - mutex_lock(&tracepoint_printk_mutex); + guard(mutex)(&tracepoint_printk_mutex); save_tracepoint_printk = tracepoint_printk; ret = proc_dointvec(table, write, buffer, lenp, ppos); @@ -2820,16 +1996,13 @@ int tracepoint_printk_sysctl(const struct ctl_table *table, int write, tracepoint_printk = 0; if (save_tracepoint_printk == tracepoint_printk) - goto out; + return ret; if (tracepoint_printk) static_key_enable(&tracepoint_printk_key.key); else static_key_disable(&tracepoint_printk_key.key); - out: - mutex_unlock(&tracepoint_printk_mutex); - return ret; } @@ -2897,13 +2070,16 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, void trace_function(struct trace_array *tr, unsigned long ip, unsigned long - parent_ip, unsigned int trace_ctx) + parent_ip, unsigned int trace_ctx, struct ftrace_regs *fregs) { struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; + int size = sizeof(*entry); - event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), + size += FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long); + + event = __trace_buffer_lock_reserve(buffer, TRACE_FN, size, trace_ctx); if (!event) return; @@ -2911,6 +2087,13 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long entry->ip = ip; entry->parent_ip = parent_ip; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + if (static_branch_unlikely(&trace_function_exports_enabled)) ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); @@ -2935,16 +2118,21 @@ struct ftrace_stacks { static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); -static void __ftrace_trace_stack(struct trace_array *tr, - struct trace_buffer *buffer, - unsigned int trace_ctx, - int skip, struct pt_regs *regs) +void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs) { struct ring_buffer_event *event; unsigned int size, nr_entries; struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; + int bit; + + bit = trace_test_and_set_recursion(_THIS_IP_, _RET_IP_, TRACE_EVENT_START); + if (bit < 0) + return; /* * Add one, for this function and the call to save_stack_trace() @@ -2955,7 +2143,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, skip++; #endif - preempt_disable_notrace(); + guard(preempt_notrace)(); stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; @@ -3013,19 +2201,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, /* Again, don't let gcc optimize things here */ barrier(); __this_cpu_dec(ftrace_stack_reserve); - preempt_enable_notrace(); - -} - -static inline void ftrace_trace_stack(struct trace_array *tr, - struct trace_buffer *buffer, - unsigned int trace_ctx, - int skip, struct pt_regs *regs) -{ - if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) - return; - - __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); + trace_clear_recursion(bit); } void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, @@ -3083,7 +2259,7 @@ ftrace_trace_userstack(struct trace_array *tr, struct ring_buffer_event *event; struct userstack_entry *entry; - if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER(USERSTACKTRACE))) return; /* @@ -3097,9 +2273,9 @@ ftrace_trace_userstack(struct trace_array *tr, * prevent recursion, since the user stack tracing may * trigger other kernel events. */ - preempt_disable(); + guard(preempt)(); if (__this_cpu_read(user_stack_count)) - goto out; + return; __this_cpu_inc(user_stack_count); @@ -3117,8 +2293,6 @@ ftrace_trace_userstack(struct trace_array *tr, out_drop_count: __this_cpu_dec(user_stack_count); - out: - preempt_enable(); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ static void ftrace_trace_userstack(struct trace_array *tr, @@ -3164,334 +2338,6 @@ void trace_last_func_repeats(struct trace_array *tr, __buffer_unlock_commit(buffer, event); } -/* created for use with alloc_percpu */ -struct trace_buffer_struct { - int nesting; - char buffer[4][TRACE_BUF_SIZE]; -}; - -static struct trace_buffer_struct __percpu *trace_percpu_buffer; - -/* - * This allows for lockless recording. If we're nested too deeply, then - * this returns NULL. - */ -static char *get_trace_buf(void) -{ - struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - - if (!trace_percpu_buffer || buffer->nesting >= 4) - return NULL; - - buffer->nesting++; - - /* Interrupts must see nesting incremented before we use the buffer */ - barrier(); - return &buffer->buffer[buffer->nesting - 1][0]; -} - -static void put_trace_buf(void) -{ - /* Don't let the decrement of nesting leak before this */ - barrier(); - this_cpu_dec(trace_percpu_buffer->nesting); -} - -static int alloc_percpu_trace_buffer(void) -{ - struct trace_buffer_struct __percpu *buffers; - - if (trace_percpu_buffer) - return 0; - - buffers = alloc_percpu(struct trace_buffer_struct); - if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) - return -ENOMEM; - - trace_percpu_buffer = buffers; - return 0; -} - -static int buffers_allocated; - -void trace_printk_init_buffers(void) -{ - if (buffers_allocated) - return; - - if (alloc_percpu_trace_buffer()) - return; - - /* trace_printk() is for debug use only. Don't use it in production. */ - - pr_warn("\n"); - pr_warn("**********************************************************\n"); - pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warn("** **\n"); - pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); - pr_warn("** **\n"); - pr_warn("** This means that this is a DEBUG kernel and it is **\n"); - pr_warn("** unsafe for production use. **\n"); - pr_warn("** **\n"); - pr_warn("** If you see this message and you are not debugging **\n"); - pr_warn("** the kernel, report this immediately to your vendor! **\n"); - pr_warn("** **\n"); - pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warn("**********************************************************\n"); - - /* Expand the buffers to set size */ - tracing_update_buffers(&global_trace); - - buffers_allocated = 1; - - /* - * trace_printk_init_buffers() can be called by modules. - * If that happens, then we need to start cmdline recording - * directly here. If the global_trace.buffer is already - * allocated here, then this was called by module code. - */ - if (global_trace.array_buffer.buffer) - tracing_start_cmdline_record(); -} -EXPORT_SYMBOL_GPL(trace_printk_init_buffers); - -void trace_printk_start_comm(void) -{ - /* Start tracing comms if trace printk is set */ - if (!buffers_allocated) - return; - tracing_start_cmdline_record(); -} - -static void trace_printk_start_stop_comm(int enabled) -{ - if (!buffers_allocated) - return; - - if (enabled) - tracing_start_cmdline_record(); - else - tracing_stop_cmdline_record(); -} - -/** - * trace_vbprintk - write binary msg to tracing buffer - * @ip: The address of the caller - * @fmt: The string format to write to the buffer - * @args: Arguments for @fmt - */ -int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) -{ - struct ring_buffer_event *event; - struct trace_buffer *buffer; - struct trace_array *tr = READ_ONCE(printk_trace); - struct bprint_entry *entry; - unsigned int trace_ctx; - char *tbuffer; - int len = 0, size; - - if (!printk_binsafe(tr)) - return trace_vprintk(ip, fmt, args); - - if (unlikely(tracing_selftest_running || tracing_disabled)) - return 0; - - /* Don't pollute graph traces with trace_vprintk internals */ - pause_graph_tracing(); - - trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); - - tbuffer = get_trace_buf(); - if (!tbuffer) { - len = 0; - goto out_nobuffer; - } - - len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); - - if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) - goto out_put; - - size = sizeof(*entry) + sizeof(u32) * len; - buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, tbuffer, sizeof(u32) * len); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); - -out: - ring_buffer_nest_end(buffer); -out_put: - put_trace_buf(); - -out_nobuffer: - preempt_enable_notrace(); - unpause_graph_tracing(); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vbprintk); - -__printf(3, 0) -static int -__trace_array_vprintk(struct trace_buffer *buffer, - unsigned long ip, const char *fmt, va_list args) -{ - struct ring_buffer_event *event; - int len = 0, size; - struct print_entry *entry; - unsigned int trace_ctx; - char *tbuffer; - - if (tracing_disabled) - return 0; - - /* Don't pollute graph traces with trace_vprintk internals */ - pause_graph_tracing(); - - trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); - - - tbuffer = get_trace_buf(); - if (!tbuffer) { - len = 0; - goto out_nobuffer; - } - - len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); - - size = sizeof(*entry) + len + 1; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - - memcpy(&entry->buf, tbuffer, len + 1); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); - -out: - ring_buffer_nest_end(buffer); - put_trace_buf(); - -out_nobuffer: - preempt_enable_notrace(); - unpause_graph_tracing(); - - return len; -} - -__printf(3, 0) -int trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args) -{ - if (tracing_selftest_running && tr == &global_trace) - return 0; - - return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); -} - -/** - * trace_array_printk - Print a message to a specific instance - * @tr: The instance trace_array descriptor - * @ip: The instruction pointer that this is called from. - * @fmt: The format to print (printf format) - * - * If a subsystem sets up its own instance, they have the right to - * printk strings into their tracing instance buffer using this - * function. Note, this function will not write into the top level - * buffer (use trace_printk() for that), as writing into the top level - * buffer should only have events that can be individually disabled. - * trace_printk() is only used for debugging a kernel, and should not - * be ever incorporated in normal use. - * - * trace_array_printk() can be used, as it will not add noise to the - * top level tracing buffer. - * - * Note, trace_array_init_printk() must be called on @tr before this - * can be used. - */ -__printf(3, 0) -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!tr) - return -ENOENT; - - /* This is only allowed for created instances */ - if (tr == &global_trace) - return 0; - - if (!(tr->trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_array_vprintk(tr, ip, fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(trace_array_printk); - -/** - * trace_array_init_printk - Initialize buffers for trace_array_printk() - * @tr: The trace array to initialize the buffers for - * - * As trace_array_printk() only writes into instances, they are OK to - * have in the kernel (unlike trace_printk()). This needs to be called - * before trace_array_printk() can be used on a trace_array. - */ -int trace_array_init_printk(struct trace_array *tr) -{ - if (!tr) - return -ENOENT; - - /* This is only allowed for created instances */ - if (tr == &global_trace) - return -EINVAL; - - return alloc_percpu_trace_buffer(); -} -EXPORT_SYMBOL_GPL(trace_array_init_printk); - -__printf(3, 4) -int trace_array_printk_buf(struct trace_buffer *buffer, - unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = __trace_array_vprintk(buffer, ip, fmt, ap); - va_end(ap); - return ret; -} - -__printf(2, 0) -int trace_vprintk(unsigned long ip, const char *fmt, va_list args) -{ - return trace_array_vprintk(printk_trace, ip, fmt, args); -} -EXPORT_SYMBOL_GPL(trace_vprintk); - static void trace_iterator_increment(struct trace_iterator *iter) { struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); @@ -3747,7 +2593,7 @@ const char *trace_event_format(struct trace_iterator *iter, const char *fmt) if (WARN_ON_ONCE(!fmt)) return fmt; - if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR) + if (!iter->tr || iter->tr->trace_flags & TRACE_ITER(HASH_PTR)) return fmt; p = fmt; @@ -3928,10 +2774,8 @@ static void *s_start(struct seq_file *m, loff_t *pos) } mutex_unlock(&trace_types_lock); -#ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->trace->use_max_tr) + if (iter->snapshot && tracer_uses_snapshot(iter->trace)) return ERR_PTR(-EBUSY); -#endif if (*pos != iter->pos) { iter->ent = NULL; @@ -3970,10 +2814,8 @@ static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; -#ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->trace->use_max_tr) + if (iter->snapshot && tracer_uses_snapshot(iter->trace)) return; -#endif trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); @@ -4069,7 +2911,7 @@ static void print_event_info(struct array_buffer *buf, struct seq_file *m) static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { - bool tgid = flags & TRACE_ITER_RECORD_TGID; + bool tgid = flags & TRACE_ITER(RECORD_TGID); print_event_info(buf, m); @@ -4080,7 +2922,7 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { - bool tgid = flags & TRACE_ITER_RECORD_TGID; + bool tgid = flags & TRACE_ITER(RECORD_TGID); static const char space[] = " "; int prec = tgid ? 12 : 2; @@ -4119,11 +2961,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) entries, total, buf->cpu, - preempt_model_none() ? "server" : - preempt_model_voluntary() ? "desktop" : - preempt_model_full() ? "preempt" : - preempt_model_rt() ? "preempt_rt" : - "unknown", + preempt_model_str(), /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP @@ -4157,7 +2995,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_array *tr = iter->tr; - if (!(tr->trace_flags & TRACE_ITER_ANNOTATE)) + if (!(tr->trace_flags & TRACE_ITER(ANNOTATE))) return; if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) @@ -4179,6 +3017,22 @@ static void test_cpu_buff_start(struct trace_iterator *iter) iter->cpu); } +#ifdef CONFIG_FTRACE_SYSCALLS +static bool is_syscall_event(struct trace_event *event) +{ + return (event->funcs == &enter_syscall_print_funcs) || + (event->funcs == &exit_syscall_print_funcs); + +} +#define syscall_buf_size CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT +#else +static inline bool is_syscall_event(struct trace_event *event) +{ + return false; +} +#define syscall_buf_size 0 +#endif /* CONFIG_FTRACE_SYSCALLS */ + static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; @@ -4193,7 +3047,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { if (iter->iter_flags & TRACE_FILE_LAT_FMT) trace_print_lat_context(iter); else @@ -4204,17 +3058,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) return TRACE_TYPE_PARTIAL_LINE; if (event) { - if (tr->trace_flags & TRACE_ITER_FIELDS) + if (tr->trace_flags & TRACE_ITER(FIELDS)) return print_event_fields(iter, event); /* * For TRACE_EVENT() events, the print_fmt is not * safe to use if the array has delta offsets * Force printing via the fields. */ - if ((tr->text_delta || tr->data_delta) && - event->type > __TRACE_LAST_TYPE) - return print_event_fields(iter, event); - + if ((tr->text_delta)) { + /* ftrace and system call events are still OK */ + if ((event->type > __TRACE_LAST_TYPE) && + !is_syscall_event(event)) + return print_event_fields(iter, event); + } return event->funcs->trace(iter, sym_flags, event); } @@ -4232,7 +3088,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); @@ -4258,7 +3114,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { SEQ_PUT_HEX_FIELD(s, entry->pid); SEQ_PUT_HEX_FIELD(s, iter->cpu); SEQ_PUT_HEX_FIELD(s, iter->ts); @@ -4287,7 +3143,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) entry = iter->ent; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { SEQ_PUT_FIELD(s, entry->pid); SEQ_PUT_FIELD(s, iter->cpu); SEQ_PUT_FIELD(s, iter->ts); @@ -4358,27 +3214,27 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) } if (iter->ent->type == TRACE_BPUTS && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_bputs_msg_only(iter); if (iter->ent->type == TRACE_BPRINT && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_bprintk_msg_only(iter); if (iter->ent->type == TRACE_PRINT && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_printk_msg_only(iter); - if (trace_flags & TRACE_ITER_BIN) + if (trace_flags & TRACE_ITER(BIN)) return print_bin_fmt(iter); - if (trace_flags & TRACE_ITER_HEX) + if (trace_flags & TRACE_ITER(HEX)) return print_hex_fmt(iter); - if (trace_flags & TRACE_ITER_RAW) + if (trace_flags & TRACE_ITER(RAW)) return print_raw_fmt(iter); return print_trace_fmt(iter); @@ -4396,7 +3252,7 @@ void trace_latency_header(struct seq_file *m) if (iter->iter_flags & TRACE_FILE_LAT_FMT) print_trace_header(m, iter); - if (!(tr->trace_flags & TRACE_ITER_VERBOSE)) + if (!(tr->trace_flags & TRACE_ITER(VERBOSE))) print_lat_help_header(m); } @@ -4406,7 +3262,7 @@ void trace_default_header(struct seq_file *m) struct trace_array *tr = iter->tr; unsigned long trace_flags = tr->trace_flags; - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(trace_flags & TRACE_ITER(CONTEXT_INFO))) return; if (iter->iter_flags & TRACE_FILE_LAT_FMT) { @@ -4414,11 +3270,11 @@ void trace_default_header(struct seq_file *m) if (trace_empty(iter)) return; print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) + if (!(trace_flags & TRACE_ITER(VERBOSE))) print_lat_help_header(m); } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) { - if (trace_flags & TRACE_ITER_IRQ_INFO) + if (!(trace_flags & TRACE_ITER(VERBOSE))) { + if (trace_flags & TRACE_ITER(IRQ_INFO)) print_func_help_header_irq(iter->array_buffer, m, trace_flags); else @@ -4436,50 +3292,6 @@ static void test_ftrace_alive(struct seq_file *m) "# MAY BE MISSING FUNCTION EVENTS\n"); } -#ifdef CONFIG_TRACER_MAX_TRACE -static void show_snapshot_main_help(struct seq_file *m) -{ - seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" - "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" - "# Takes a snapshot of the main buffer.\n" - "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" - "# (Doesn't have to be '2' works with any number that\n" - "# is not a '0' or '1')\n"); -} - -static void show_snapshot_percpu_help(struct seq_file *m) -{ - seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); -#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" - "# Takes a snapshot of the main buffer for this cpu.\n"); -#else - seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" - "# Must use main snapshot file to allocate.\n"); -#endif - seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" - "# (Doesn't have to be '2' works with any number that\n" - "# is not a '0' or '1')\n"); -} - -static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) -{ - if (iter->tr->allocated_snapshot) - seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); - else - seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); - - seq_puts(m, "# Snapshot commands:\n"); - if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - show_snapshot_main_help(m); - else - show_snapshot_percpu_help(m); -} -#else -/* Should never be called */ -static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } -#endif - static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -4528,17 +3340,6 @@ static int s_show(struct seq_file *m, void *v) return 0; } -/* - * Should be used after trace_array_get(), trace_types_lock - * ensures that i_cdev was already initialized. - */ -static inline int tracing_get_cpu(struct inode *inode) -{ - if (inode->i_cdev) /* See trace_create_cpu_file() */ - return (long)inode->i_cdev - 1; - return RING_BUFFER_ALL_CPUS; -} - static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, @@ -4565,7 +3366,7 @@ static void free_trace_iter_content(struct trace_iterator *iter) free_cpumask_var(iter->started); } -static struct trace_iterator * +struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { struct trace_array *tr = inode->i_private; @@ -4579,8 +3380,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), - GFP_KERNEL); + iter->buffer_iter = kzalloc_objs(*iter->buffer_iter, nr_cpu_ids); if (!iter->buffer_iter) goto release; @@ -4614,10 +3414,10 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->tr = tr; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT /* Currently only the top directory has a snapshot */ if (tr->current_trace->print_max || snapshot) - iter->array_buffer = &tr->max_buffer; + iter->array_buffer = &tr->snapshot_buffer; else #endif iter->array_buffer = &tr->array_buffer; @@ -4642,27 +3442,23 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) * If pause-on-trace is enabled, then stop the trace while * dumping, unless this is the "snapshot" file */ - if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE)) + if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE))) { + iter->iter_flags |= TRACE_FILE_PAUSE; tracing_stop_tr(tr); + } if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->array_buffer->buffer, - cpu, GFP_KERNEL); - } - ring_buffer_read_prepare_sync(); - for_each_tracing_cpu(cpu) { - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->array_buffer->buffer, - cpu, GFP_KERNEL); - ring_buffer_read_prepare_sync(); - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } @@ -4690,11 +3486,6 @@ int tracing_open_generic(struct inode *inode, struct file *filp) return 0; } -bool tracing_is_disabled(void) -{ - return (tracing_disabled) ? true: false; -} - /* * Open and update trace_array ref count. * Must have the current trace_array passed to it. @@ -4708,6 +3499,11 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) if (ret) return ret; + if ((filp->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) { + trace_array_put(tr); + return -EACCES; + } + filp->private_data = inode->i_private; return 0; @@ -4726,22 +3522,16 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp) if (ret) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); /* Fail if the file is marked for removal */ if (file->flags & EVENT_FILE_FL_FREED) { trace_array_put(file->tr); - ret = -ENODEV; + return -ENODEV; } else { event_file_get(file); } - mutex_unlock(&event_mutex); - if (ret) - return ret; - - filp->private_data = inode->i_private; - return 0; } @@ -4761,13 +3551,7 @@ int tracing_single_release_file_tr(struct inode *inode, struct file *filp) return single_release(inode, filp); } -static int tracing_mark_open(struct inode *inode, struct file *filp) -{ - stream_open(inode, filp); - return tracing_open_generic_tr(inode, filp); -} - -static int tracing_release(struct inode *inode, struct file *file) +int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct seq_file *m = file->private_data; @@ -4791,7 +3575,7 @@ static int tracing_release(struct inode *inode, struct file *file) if (iter->trace && iter->trace->close) iter->trace->close(iter); - if (!iter->snapshot && tr->stop_count) + if (iter->iter_flags & TRACE_FILE_PAUSE) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); @@ -4822,6 +3606,8 @@ static int tracing_single_release_tr(struct inode *inode, struct file *file) return single_release(inode, file); } +static bool update_last_data_if_empty(struct trace_array *tr); + static int tracing_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -4839,20 +3625,22 @@ static int tracing_open(struct inode *inode, struct file *file) #ifdef CONFIG_TRACER_MAX_TRACE if (tr->current_trace->print_max) - trace_buf = &tr->max_buffer; + trace_buf = &tr->snapshot_buffer; #endif if (cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(trace_buf); else tracing_reset_cpu(trace_buf, cpu); + + update_last_data_if_empty(tr); } if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); - else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + else if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) iter->iter_flags |= TRACE_FILE_LAT_FMT; } @@ -4870,11 +3658,9 @@ static int tracing_open(struct inode *inode, struct file *file) static bool trace_ok_for_array(struct tracer *t, struct trace_array *tr) { -#ifdef CONFIG_TRACER_SNAPSHOT /* arrays with mapped buffer range do not have snapshots */ - if (tr->range_addr_start && t->use_max_tr) + if (tr->range_addr_start && tracer_uses_snapshot(t)) return false; -#endif return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; } @@ -5016,7 +3802,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; - char *mask_str; + char *mask_str __free(kfree) = NULL; int len; len = snprintf(NULL, 0, "%*pb\n", @@ -5027,16 +3813,10 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); - if (len >= count) { - count = -EINVAL; - goto out_err; - } - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); - -out_err: - kfree(mask_str); + if (len >= count) + return -EINVAL; - return count; + return simple_read_from_buffer(ubuf, count, ppos, mask_str, len); } int tracing_set_cpumask(struct trace_array *tr, @@ -5056,18 +3836,16 @@ int tracing_set_cpumask(struct trace_array *tr, */ if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); -#ifdef CONFIG_TRACER_MAX_TRACE - ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_SNAPSHOT + ring_buffer_record_disable_cpu(tr->snapshot_buffer.buffer, cpu); #endif } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); -#ifdef CONFIG_TRACER_MAX_TRACE - ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_SNAPSHOT + ring_buffer_record_enable_cpu(tr->snapshot_buffer.buffer, cpu); #endif } } @@ -5123,27 +3901,32 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; struct trace_array *tr = m->private; + struct tracer_flags *flags; u32 tracer_flags; int i; - mutex_lock(&trace_types_lock); - tracer_flags = tr->current_trace->flags->val; - trace_opts = tr->current_trace->flags->opts; + guard(mutex)(&trace_types_lock); for (i = 0; trace_options[i]; i++) { - if (tr->trace_flags & (1 << i)) + if (tr->trace_flags & (1ULL << i)) seq_printf(m, "%s\n", trace_options[i]); else seq_printf(m, "no%s\n", trace_options[i]); } + flags = tr->current_trace_flags; + if (!flags || !flags->opts) + return 0; + + tracer_flags = flags->val; + trace_opts = flags->opts; + for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) seq_printf(m, "%s\n", trace_opts[i].name); else seq_printf(m, "no%s\n", trace_opts[i].name); } - mutex_unlock(&trace_types_lock); return 0; } @@ -5153,9 +3936,10 @@ static int __set_tracer_option(struct trace_array *tr, struct tracer_opt *opts, int neg) { struct tracer *trace = tracer_flags->trace; - int ret; + int ret = 0; - ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); + if (trace->set_flag) + ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); if (ret) return ret; @@ -5169,36 +3953,41 @@ static int __set_tracer_option(struct trace_array *tr, /* Try to assign a tracer specific option */ static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { - struct tracer *trace = tr->current_trace; - struct tracer_flags *tracer_flags = trace->flags; + struct tracer_flags *tracer_flags = tr->current_trace_flags; struct tracer_opt *opts = NULL; int i; + if (!tracer_flags || !tracer_flags->opts) + return 0; + for (i = 0; tracer_flags->opts[i].name; i++) { opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) - return __set_tracer_option(tr, trace->flags, opts, neg); + return __set_tracer_option(tr, tracer_flags, opts, neg); } return -EINVAL; } /* Some tracers require overwrite to stay enabled */ -int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set) { - if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + if (tracer->enabled && (mask & TRACE_ITER(OVERWRITE)) && !set) return -1; return 0; } -int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) +int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled) { - if ((mask == TRACE_ITER_RECORD_TGID) || - (mask == TRACE_ITER_RECORD_CMD) || - (mask == TRACE_ITER_TRACE_PRINTK)) + switch (mask) { + case TRACE_ITER(RECORD_TGID): + case TRACE_ITER(RECORD_CMD): + case TRACE_ITER(TRACE_PRINTK): + case TRACE_ITER(COPY_MARKER): lockdep_assert_held(&event_mutex); + } /* do nothing if flag is already set */ if (!!(tr->trace_flags & mask) == !!enabled) @@ -5209,7 +3998,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; - if (mask == TRACE_ITER_TRACE_PRINTK) { + switch (mask) { + case TRACE_ITER(TRACE_PRINTK): if (enabled) { update_printk_trace(tr); } else { @@ -5221,11 +4011,17 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) return -EINVAL; /* * An instance must always have it set. - * by default, that's the global_trace instane. + * by default, that's the global_trace instance. */ if (printk_trace == tr) update_printk_trace(&global_trace); } + break; + + case TRACE_ITER(COPY_MARKER): + update_marker_trace(tr, enabled); + /* update_marker_trace updates the tr->trace_flags */ + return 0; } if (enabled) @@ -5233,35 +4029,46 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) else tr->trace_flags &= ~mask; - if (mask == TRACE_ITER_RECORD_CMD) + switch (mask) { + case TRACE_ITER(RECORD_CMD): trace_event_enable_cmd_record(enabled); + break; - if (mask == TRACE_ITER_RECORD_TGID) { + case TRACE_ITER(RECORD_TGID): if (trace_alloc_tgid_map() < 0) { - tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; + tr->trace_flags &= ~TRACE_ITER(RECORD_TGID); return -ENOMEM; } trace_event_enable_tgid_record(enabled); - } + break; - if (mask == TRACE_ITER_EVENT_FORK) + case TRACE_ITER(EVENT_FORK): trace_event_follow_fork(tr, enabled); + break; - if (mask == TRACE_ITER_FUNC_FORK) + case TRACE_ITER(FUNC_FORK): ftrace_pid_follow_fork(tr, enabled); + break; - if (mask == TRACE_ITER_OVERWRITE) { + case TRACE_ITER(OVERWRITE): ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled); -#ifdef CONFIG_TRACER_MAX_TRACE - ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); +#ifdef CONFIG_TRACER_SNAPSHOT + ring_buffer_change_overwrite(tr->snapshot_buffer.buffer, enabled); #endif - } + break; - if (mask == TRACE_ITER_PRINTK) { + case TRACE_ITER(PRINTK): trace_printk_start_stop_comm(enabled); trace_printk_control(enabled); + break; + +#if defined(CONFIG_FUNCTION_PROFILER) && defined(CONFIG_FUNCTION_GRAPH_TRACER) + case TRACE_GRAPH_GRAPH_TIME: + ftrace_graph_graph_time_control(enabled); + break; +#endif } return 0; @@ -5291,7 +4098,7 @@ int trace_set_options(struct trace_array *tr, char *option) if (ret < 0) ret = set_tracer_option(tr, cmp, neg); else - ret = set_tracer_flag(tr, 1 << ret, !neg); + ret = set_tracer_flag(tr, 1ULL << ret, !neg); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); @@ -5536,6 +4343,8 @@ static const char readme_msg[] = "\t efield: For event probes ('e' types), the field is on of the fields\n" "\t of the <attached-group>/<attached-event>.\n" #endif + " set_event\t\t- Enables events by name written into it\n" + "\t\t\t Can enable module events via: :mod:<module>\n" " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" " events/<system>/\t- Directory containing all trace events for <system>:\n" @@ -5802,13 +4611,13 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, * where the head holds the module and length of array, and the * tail holds a pointer to the next list. */ - map_array = kmalloc_array(len + 2, sizeof(*map_array), GFP_KERNEL); + map_array = kmalloc_objs(*map_array, len + 2); if (!map_array) { pr_warn("Unable to allocate trace eval mapping\n"); return; } - mutex_lock(&trace_eval_mutex); + guard(mutex)(&trace_eval_mutex); if (!trace_eval_maps) trace_eval_maps = map_array; @@ -5832,8 +4641,6 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, map_array++; } memset(map_array, 0, sizeof(*map_array)); - - mutex_unlock(&trace_eval_mutex); } static void trace_create_eval_file(struct dentry *d_tracer) @@ -5848,17 +4655,27 @@ static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ -static void trace_insert_eval_map(struct module *mod, - struct trace_eval_map **start, int len) +static void +trace_event_update_with_eval_map(struct module *mod, + struct trace_eval_map **start, + int len) { struct trace_eval_map **map; - if (len <= 0) - return; + /* Always run sanitizer only if btf_type_tag attr exists. */ + if (len <= 0) { + if (!(IS_ENABLED(CONFIG_DEBUG_INFO_BTF) && + IS_ENABLED(CONFIG_PAHOLE_HAS_BTF_TAG) && + __has_attribute(btf_type_tag))) + return; + } map = start; - trace_event_eval_update(map, len); + trace_event_update_all(map, len); + + if (len <= 0) + return; trace_insert_eval_map_file(mod, start, len); } @@ -5871,9 +4688,9 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, char buf[MAX_TRACER_SIZE+2]; int r; - mutex_lock(&trace_types_lock); - r = sprintf(buf, "%s\n", tr->current_trace->name); - mutex_unlock(&trace_types_lock); + scoped_guard(mutex, &trace_types_lock) { + r = sprintf(buf, "%s\n", tr->current_trace->name); + } return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -5881,10 +4698,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, int tracer_init(struct tracer *t, struct trace_array *tr) { tracing_reset_online_cpus(&tr->array_buffer); + update_last_data_if_empty(tr); return t->init(tr); } -static void set_buffer_entries(struct array_buffer *buf, unsigned long val) +void trace_set_buffer_entries(struct array_buffer *buf, unsigned long val) { int cpu; @@ -5895,40 +4713,12 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val) static void update_buffer_entries(struct array_buffer *buf, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { - set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); + trace_set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); } else { per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu); } } -#ifdef CONFIG_TRACER_MAX_TRACE -/* resize @tr's buffer to the size of @size_tr's entries */ -static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, - struct array_buffer *size_buf, int cpu_id) -{ - int cpu, ret = 0; - - if (cpu_id == RING_BUFFER_ALL_CPUS) { - for_each_tracing_cpu(cpu) { - ret = ring_buffer_resize(trace_buf->buffer, - per_cpu_ptr(size_buf->data, cpu)->entries, cpu); - if (ret < 0) - break; - per_cpu_ptr(trace_buf->data, cpu)->entries = - per_cpu_ptr(size_buf->data, cpu)->entries; - } - } else { - ret = ring_buffer_resize(trace_buf->buffer, - per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); - if (ret == 0) - per_cpu_ptr(trace_buf->data, cpu_id)->entries = - per_cpu_ptr(size_buf->data, cpu_id)->entries; - } - - return ret; -} -#endif /* CONFIG_TRACER_MAX_TRACE */ - static int __tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu) { @@ -5952,11 +4742,11 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, if (ret < 0) goto out_start; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT if (!tr->allocated_snapshot) goto out; - ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); + ret = ring_buffer_resize(tr->snapshot_buffer.buffer, size, cpu); if (ret < 0) { int r = resize_buffer_duplicate_size(&tr->array_buffer, &tr->array_buffer, cpu); @@ -5981,10 +4771,10 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, goto out_start; } - update_buffer_entries(&tr->max_buffer, cpu); + update_buffer_entries(&tr->snapshot_buffer, cpu); out: -#endif /* CONFIG_TRACER_MAX_TRACE */ +#endif /* CONFIG_TRACER_SNAPSHOT */ update_buffer_entries(&tr->array_buffer, cpu); out_start: @@ -5995,33 +4785,152 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id) { - int ret; - - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (cpu_id != RING_BUFFER_ALL_CPUS) { /* make sure, this cpu is enabled in the mask */ - if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { - ret = -EINVAL; - goto out; - } + if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) + return -EINVAL; } - ret = __tracing_resize_ring_buffer(tr, size, cpu_id); - if (ret < 0) - ret = -ENOMEM; + return __tracing_resize_ring_buffer(tr, size, cpu_id); +} -out: - mutex_unlock(&trace_types_lock); +struct trace_mod_entry { + unsigned long mod_addr; + char mod_name[MODULE_NAME_LEN]; +}; - return ret; +struct trace_scratch { + unsigned int clock_id; + unsigned long text_addr; + unsigned long nr_entries; + struct trace_mod_entry entries[]; +}; + +static DEFINE_MUTEX(scratch_mutex); + +static int cmp_mod_entry(const void *key, const void *pivot) +{ + unsigned long addr = (unsigned long)key; + const struct trace_mod_entry *ent = pivot; + + if (addr < ent[0].mod_addr) + return -1; + + return addr >= ent[1].mod_addr; } +/** + * trace_adjust_address() - Adjust prev boot address to current address. + * @tr: Persistent ring buffer's trace_array. + * @addr: Address in @tr which is adjusted. + */ +unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + unsigned long raddr; + int idx = 0, nr_entries; + + /* If we don't have last boot delta, return the address */ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return addr; + + /* tr->module_delta must be protected by rcu. */ + guard(rcu)(); + tscratch = tr->scratch; + /* if there is no tscrach, module_delta must be NULL. */ + module_delta = READ_ONCE(tr->module_delta); + if (!module_delta || !tscratch->nr_entries || + tscratch->entries[0].mod_addr > addr) { + raddr = addr + tr->text_delta; + return __is_kernel(raddr) || is_kernel_core_data(raddr) || + is_kernel_rodata(raddr) ? raddr : addr; + } + + /* Note that entries must be sorted. */ + nr_entries = tscratch->nr_entries; + if (nr_entries == 1 || + tscratch->entries[nr_entries - 1].mod_addr < addr) + idx = nr_entries - 1; + else { + entry = __inline_bsearch((void *)addr, + tscratch->entries, + nr_entries - 1, + sizeof(tscratch->entries[0]), + cmp_mod_entry); + if (entry) + idx = entry - tscratch->entries; + } + + return addr + module_delta->delta[idx]; +} + +#ifdef CONFIG_MODULES +static int save_mod(struct module *mod, void *data) +{ + struct trace_array *tr = data; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + unsigned int size; + + tscratch = tr->scratch; + if (!tscratch) + return -1; + size = tr->scratch_size; + + if (struct_size(tscratch, entries, tscratch->nr_entries + 1) > size) + return -1; + + entry = &tscratch->entries[tscratch->nr_entries]; + + tscratch->nr_entries++; + + entry->mod_addr = (unsigned long)mod->mem[MOD_TEXT].base; + strscpy(entry->mod_name, mod->name); + + return 0; +} +#else +static int save_mod(struct module *mod, void *data) +{ + return 0; +} +#endif + static void update_last_data(struct trace_array *tr) { - if (!tr->text_delta && !tr->data_delta) + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + + if (!(tr->flags & TRACE_ARRAY_FL_BOOT)) + return; + + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) return; + /* Only if the buffer has previous boot data clear and update it. */ + tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; + + /* If this is a backup instance, mark it for autoremove. */ + if (tr->flags & TRACE_ARRAY_FL_VMALLOC) + tr->free_on_close = true; + + /* Reset the module list and reload them */ + if (tr->scratch) { + struct trace_scratch *tscratch = tr->scratch; + + tscratch->clock_id = tr->clock_id; + memset(tscratch->entries, 0, + flex_array_size(tscratch, entries, tscratch->nr_entries)); + tscratch->nr_entries = 0; + + guard(mutex)(&scratch_mutex); + module_for_each_mod(save_mod, tr); + } + /* * Need to clear all CPU buffers as there cannot be events * from the previous boot mixed with events with this boot @@ -6032,7 +4941,17 @@ static void update_last_data(struct trace_array *tr) /* Using current data now */ tr->text_delta = 0; - tr->data_delta = 0; + + if (!tr->scratch) + return; + + tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + WRITE_ONCE(tr->module_delta, NULL); + kfree_rcu(module_delta, rcu); + + /* Set the persistent ring buffer meta data to this address */ + tscratch->text_addr = (unsigned long)_text; } /** @@ -6050,23 +4969,19 @@ int tracing_update_buffers(struct trace_array *tr) { int ret = 0; - mutex_lock(&trace_types_lock); + if (!tr) + tr = &global_trace; + + guard(mutex)(&trace_types_lock); update_last_data(tr); if (!tr->ring_buffer_expanded) ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); - mutex_unlock(&trace_types_lock); - return ret; } -struct trace_option_dentry; - -static void -create_trace_option_files(struct trace_array *tr, struct tracer *tracer); - /* * Used to clear out the tracer before deletion of an instance. * Must have trace_types_lock held. @@ -6082,32 +4997,19 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace->reset(tr); tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; } static bool tracer_options_updated; -static void add_tracer_options(struct trace_array *tr, struct tracer *t) -{ - /* Only enable if the directory has been created already. */ - if (!tr->dir) - return; - - /* Only create trace option files after update_tracer_options finish */ - if (!tracer_options_updated) - return; - - create_trace_option_files(tr, t); -} - int tracing_set_tracer(struct trace_array *tr, const char *buf) { - struct tracer *t; -#ifdef CONFIG_TRACER_MAX_TRACE + struct tracer *trace = NULL; + struct tracers *t; bool had_max_tr; -#endif - int ret = 0; + int ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); update_last_data(tr); @@ -6115,51 +5017,47 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) - goto out; + return ret; ret = 0; } - for (t = trace_types; t; t = t->next) { - if (strcmp(t->name, buf) == 0) + list_for_each_entry(t, &tr->tracers, list) { + if (strcmp(t->tracer->name, buf) == 0) { + trace = t->tracer; break; + } } - if (!t) { - ret = -EINVAL; - goto out; - } - if (t == tr->current_trace) - goto out; + if (!trace) + return -EINVAL; + + if (trace == tr->current_trace) + return 0; #ifdef CONFIG_TRACER_SNAPSHOT - if (t->use_max_tr) { + if (tracer_uses_snapshot(trace)) { local_irq_disable(); arch_spin_lock(&tr->max_lock); - if (tr->cond_snapshot) - ret = -EBUSY; + ret = tr->cond_snapshot ? -EBUSY : 0; arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) - goto out; + return ret; } #endif /* Some tracers won't work on kernel command line */ - if (system_state < SYSTEM_RUNNING && t->noboot) { + if (system_state < SYSTEM_RUNNING && trace->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", - t->name); - goto out; + trace->name); + return -EINVAL; } /* Some tracers are only allowed for the top level buffer */ - if (!trace_ok_for_array(t, tr)) { - ret = -EINVAL; - goto out; - } + if (!trace_ok_for_array(trace, tr)) + return -EINVAL; /* If trace pipe files are being read, we can't change the tracer */ - if (tr->trace_ref) { - ret = -EBUSY; - goto out; - } + if (tr->trace_ref) + return -EBUSY; trace_branch_disable(); @@ -6168,13 +5066,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); -#ifdef CONFIG_TRACER_MAX_TRACE - had_max_tr = tr->current_trace->use_max_tr; + had_max_tr = tracer_uses_snapshot(tr->current_trace); /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; - if (had_max_tr && !t->use_max_tr) { + if (had_max_tr && !tracer_uses_snapshot(trace)) { /* * We need to make sure that the update_max_tr sees that * current_trace changed to nop_trace to keep it from @@ -6187,33 +5085,29 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) tracing_disarm_snapshot(tr); } - if (!had_max_tr && t->use_max_tr) { + if (!had_max_tr && tracer_uses_snapshot(trace)) { ret = tracing_arm_snapshot_locked(tr); if (ret) - goto out; + return ret; } -#else - tr->current_trace = &nop_trace; -#endif - if (t->init) { - ret = tracer_init(t, tr); + tr->current_trace_flags = t->flags ? : t->tracer->flags; + + if (trace->init) { + ret = tracer_init(trace, tr); if (ret) { -#ifdef CONFIG_TRACER_MAX_TRACE - if (t->use_max_tr) + if (tracer_uses_snapshot(trace)) tracing_disarm_snapshot(tr); -#endif - goto out; + tr->current_trace_flags = nop_trace.flags; + return ret; } } - tr->current_trace = t; + tr->current_trace = trace; tr->current_trace->enabled++; trace_branch_enable(tr); - out: - mutex_unlock(&trace_types_lock); - return ret; + return 0; } static ssize_t @@ -6247,9 +5141,8 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, return ret; } -static ssize_t -tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, - size_t cnt, loff_t *ppos) +ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, + size_t cnt, loff_t *ppos) { char buf[64]; int r; @@ -6261,9 +5154,8 @@ tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, - size_t cnt, loff_t *ppos) +ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, + size_t cnt, loff_t *ppos) { unsigned long val; int ret; @@ -6291,46 +5183,20 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf, struct trace_array *tr = filp->private_data; int ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); if (ret < 0) - goto out; + return ret; if (tr->current_trace->update_thresh) { ret = tr->current_trace->update_thresh(tr); if (ret < 0) - goto out; + return ret; } - ret = cnt; -out: - mutex_unlock(&trace_types_lock); - - return ret; -} - -#ifdef CONFIG_TRACER_MAX_TRACE - -static ssize_t -tracing_max_lat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - - return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); -} - -static ssize_t -tracing_max_lat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - - return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); + return cnt; } -#endif - static int open_pipe_on_cpu(struct trace_array *tr, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { @@ -6367,14 +5233,14 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (ret) return ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); cpu = tracing_get_cpu(inode); ret = open_pipe_on_cpu(tr, cpu); if (ret) goto fail_pipe_on_cpu; /* create a buffer to store the information to pass to userspace */ - iter = kzalloc(sizeof(*iter), GFP_KERNEL); + iter = kzalloc_obj(*iter); if (!iter) { ret = -ENOMEM; goto fail_alloc_iter; @@ -6391,7 +5257,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ @@ -6411,7 +5277,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) tr->trace_ref++; - mutex_unlock(&trace_types_lock); return ret; fail: @@ -6420,7 +5285,6 @@ fail_alloc_iter: close_pipe_on_cpu(tr, cpu); fail_pipe_on_cpu: __trace_array_put(tr); - mutex_unlock(&trace_types_lock); return ret; } @@ -6429,14 +5293,13 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) struct trace_iterator *iter = file->private_data; struct trace_array *tr = inode->i_private; - mutex_lock(&trace_types_lock); + scoped_guard(mutex, &trace_types_lock) { + tr->trace_ref--; - tr->trace_ref--; - - if (iter->trace->pipe_close) - iter->trace->pipe_close(iter); - close_pipe_on_cpu(tr, iter->cpu_file); - mutex_unlock(&trace_types_lock); + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); + close_pipe_on_cpu(tr, iter->cpu_file); + } free_trace_iter_content(iter); kfree(iter); @@ -6455,7 +5318,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl if (trace_buffer_iter(iter, iter->cpu_file)) return EPOLLIN | EPOLLRDNORM; - if (tr->trace_flags & TRACE_ITER_BLOCK) + if (tr->trace_flags & TRACE_ITER(BLOCK)) /* * Always select as readable when in blocking mode */ @@ -6510,6 +5373,22 @@ static int tracing_wait_pipe(struct file *filp) return 1; } +static bool update_last_data_if_empty(struct trace_array *tr) +{ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return false; + + if (!ring_buffer_empty(tr->array_buffer.buffer)) + return false; + + /* + * If the buffer contains the last boot data and all per-cpu + * buffers are empty, reset it from the kernel side. + */ + update_last_data(tr); + return true; +} + /* * Consumer reader. */ @@ -6525,31 +5404,32 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, * This is just a matter of traces coherency, the ring buffer itself * is protected. */ - mutex_lock(&iter->mutex); + guard(mutex)(&iter->mutex); /* return any leftover data */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (sret != -EBUSY) - goto out; + return sret; trace_seq_init(&iter->seq); if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) - goto out; + return sret; } waitagain: + if (update_last_data_if_empty(iter->tr)) + return 0; + sret = tracing_wait_pipe(filp); if (sret <= 0) - goto out; + return sret; /* stop when tracing is finished */ - if (trace_empty(iter)) { - sret = 0; - goto out; - } + if (trace_empty(iter)) + return 0; if (cnt >= TRACE_SEQ_BUFFER_SIZE) cnt = TRACE_SEQ_BUFFER_SIZE - 1; @@ -6613,9 +5493,6 @@ waitagain: if (sret == -EBUSY) goto waitagain; -out: - mutex_unlock(&iter->mutex); - return sret; } @@ -6728,13 +5605,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), - trace_seq_used(&iter->seq)); + min((size_t)trace_seq_used(&iter->seq), + (size_t)PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; - spd.partial[i].len = trace_seq_used(&iter->seq); + spd.partial[i].len = ret; trace_seq_init(&iter->seq); } @@ -6759,6 +5637,43 @@ out_err: } static ssize_t +tracing_syscall_buf_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + char buf[64]; + int r; + + r = snprintf(buf, 64, "%d\n", tr->syscall_buf_sz); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_syscall_buf_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val > SYSCALL_FAULT_USER_MAX) + val = SYSCALL_FAULT_USER_MAX; + + tr->syscall_buf_sz = val; + + *ppos += cnt; + + return cnt; +} + +static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -6858,19 +5773,102 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +#define LAST_BOOT_HEADER ((void *)1) + +static void *l_next(struct seq_file *m, void *v, loff_t *pos) { - struct trace_array *tr = filp->private_data; - struct seq_buf seq; - char buf[64]; + struct trace_array *tr = m->private; + struct trace_scratch *tscratch = tr->scratch; + unsigned int index = *pos; - seq_buf_init(&seq, buf, 64); + (*pos)++; + + if (*pos == 1) + return LAST_BOOT_HEADER; + + /* Only show offsets of the last boot data */ + if (!tscratch || !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return NULL; - seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta); - seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta); + /* *pos 0 is for the header, 1 is for the first module */ + index--; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq)); + if (index >= tscratch->nr_entries) + return NULL; + + return &tscratch->entries[index]; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&scratch_mutex); + + return l_next(m, NULL, pos); +} + +static void l_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&scratch_mutex); +} + +static void show_last_boot_header(struct seq_file *m, struct trace_array *tr) +{ + struct trace_scratch *tscratch = tr->scratch; + + /* + * Do not leak KASLR address. This only shows the KASLR address of + * the last boot. When the ring buffer is started, the LAST_BOOT + * flag gets cleared, and this should only report "current". + * Otherwise it shows the KASLR address from the previous boot which + * should not be the same as the current boot. + */ + if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + seq_printf(m, "%lx\t[kernel]\n", tscratch->text_addr); + else + seq_puts(m, "# Current\n"); +} + +static int l_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + struct trace_mod_entry *entry = v; + + if (v == LAST_BOOT_HEADER) { + show_last_boot_header(m, tr); + return 0; + } + + seq_printf(m, "%lx\t%s\n", entry->mod_addr, entry->mod_name); + return 0; +} + +static const struct seq_operations last_boot_seq_ops = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show, +}; + +static int tracing_last_boot_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = seq_open(file, &last_boot_seq_ops); + if (ret) { + trace_array_put(tr); + return ret; + } + + m = file->private_data; + m->private = tr; + + return 0; } static int tracing_buffer_meta_open(struct inode *inode, struct file *filp) @@ -6909,7 +5907,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; /* disable tracing ? */ - if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) + if (tr->trace_flags & TRACE_ITER(STOP_ON_FREE)) tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); @@ -6921,11 +5919,9 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) #define TRACE_MARKER_MAX_SIZE 4096 -static ssize_t -tracing_mark_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +static ssize_t write_marker_to_buffer(struct trace_array *tr, const char *buf, + size_t cnt, unsigned long ip) { - struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; @@ -6933,32 +5929,11 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, int meta_size; ssize_t written; size_t size; - int len; - -/* Used in tracing_mark_raw_write() as well */ -#define FAULTED_STR "<faulted>" -#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ - - if (tracing_disabled) - return -EINVAL; - - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) - return -EINVAL; - - if ((ssize_t)cnt < 0) - return -EINVAL; - - if (cnt > TRACE_MARKER_MAX_SIZE) - cnt = TRACE_MARKER_MAX_SIZE; meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ again: size = cnt + meta_size; - /* If less than "<faulted>", then make sure we can still add that */ - if (cnt < FAULTED_SIZE) - size += FAULTED_SIZE - cnt; - buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); @@ -6968,9 +5943,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, * make it smaller and try again. */ if (size > ring_buffer_max_event_size(buffer)) { - /* cnt < FAULTED size should never be bigger than max */ - if (WARN_ON_ONCE(cnt < FAULTED_SIZE)) - return -EBADF; cnt = ring_buffer_max_event_size(buffer) - meta_size; /* The above should only happen once */ if (WARN_ON_ONCE(cnt + meta_size == size)) @@ -6983,15 +5955,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } entry = ring_buffer_event_data(event); - entry->ip = _THIS_IP_; - - len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); - if (len) { - memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); - cnt = FAULTED_SIZE; - written = -EFAULT; - } else - written = cnt; + entry->ip = ip; + memcpy(&entry->buf, buf, cnt); + written = cnt; if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ @@ -7015,33 +5981,368 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return written; } +struct trace_user_buf { + char *buf; +}; + +static DEFINE_MUTEX(trace_user_buffer_mutex); +static struct trace_user_buf_info *trace_user_buffer; + +/** + * trace_user_fault_destroy - free up allocated memory of a trace user buffer + * @tinfo: The descriptor to free up + * + * Frees any data allocated in the trace info dsecriptor. + */ +void trace_user_fault_destroy(struct trace_user_buf_info *tinfo) +{ + char *buf; + int cpu; + + if (!tinfo || !tinfo->tbuf) + return; + + for_each_possible_cpu(cpu) { + buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf; + kfree(buf); + } + free_percpu(tinfo->tbuf); +} + +static int user_fault_buffer_enable(struct trace_user_buf_info *tinfo, size_t size) +{ + char *buf; + int cpu; + + lockdep_assert_held(&trace_user_buffer_mutex); + + tinfo->tbuf = alloc_percpu(struct trace_user_buf); + if (!tinfo->tbuf) + return -ENOMEM; + + tinfo->ref = 1; + tinfo->size = size; + + /* Clear each buffer in case of error */ + for_each_possible_cpu(cpu) { + per_cpu_ptr(tinfo->tbuf, cpu)->buf = NULL; + } + + for_each_possible_cpu(cpu) { + buf = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!buf) + return -ENOMEM; + per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf; + } + + return 0; +} + +/* For internal use. Free and reinitialize */ +static void user_buffer_free(struct trace_user_buf_info **tinfo) +{ + lockdep_assert_held(&trace_user_buffer_mutex); + + trace_user_fault_destroy(*tinfo); + kfree(*tinfo); + *tinfo = NULL; +} + +/* For internal use. Initialize and allocate */ +static int user_buffer_init(struct trace_user_buf_info **tinfo, size_t size) +{ + bool alloc = false; + int ret; + + lockdep_assert_held(&trace_user_buffer_mutex); + + if (!*tinfo) { + alloc = true; + *tinfo = kzalloc_obj(**tinfo); + if (!*tinfo) + return -ENOMEM; + } + + ret = user_fault_buffer_enable(*tinfo, size); + if (ret < 0 && alloc) + user_buffer_free(tinfo); + + return ret; +} + +/* For internal use, derefrence and free if necessary */ +static void user_buffer_put(struct trace_user_buf_info **tinfo) +{ + guard(mutex)(&trace_user_buffer_mutex); + + if (WARN_ON_ONCE(!*tinfo || !(*tinfo)->ref)) + return; + + if (--(*tinfo)->ref) + return; + + user_buffer_free(tinfo); +} + +/** + * trace_user_fault_init - Allocated or reference a per CPU buffer + * @tinfo: A pointer to the trace buffer descriptor + * @size: The size to allocate each per CPU buffer + * + * Create a per CPU buffer that can be used to copy from user space + * in a task context. When calling trace_user_fault_read(), preemption + * must be disabled, and it will enable preemption and copy user + * space data to the buffer. If any schedule switches occur, it will + * retry until it succeeds without a schedule switch knowing the buffer + * is still valid. + * + * Returns 0 on success, negative on failure. + */ +int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size) +{ + int ret; + + if (!tinfo) + return -EINVAL; + + guard(mutex)(&trace_user_buffer_mutex); + + ret = user_buffer_init(&tinfo, size); + if (ret < 0) + trace_user_fault_destroy(tinfo); + + return ret; +} + +/** + * trace_user_fault_get - up the ref count for the user buffer + * @tinfo: A pointer to a pointer to the trace buffer descriptor + * + * Ups the ref count of the trace buffer. + * + * Returns the new ref count. + */ +int trace_user_fault_get(struct trace_user_buf_info *tinfo) +{ + if (!tinfo) + return -1; + + guard(mutex)(&trace_user_buffer_mutex); + + tinfo->ref++; + return tinfo->ref; +} + +/** + * trace_user_fault_put - dereference a per cpu trace buffer + * @tinfo: The @tinfo that was passed to trace_user_fault_get() + * + * Decrement the ref count of @tinfo. + * + * Returns the new refcount (negative on error). + */ +int trace_user_fault_put(struct trace_user_buf_info *tinfo) +{ + guard(mutex)(&trace_user_buffer_mutex); + + if (WARN_ON_ONCE(!tinfo || !tinfo->ref)) + return -1; + + --tinfo->ref; + return tinfo->ref; +} + +/** + * trace_user_fault_read - Read user space into a per CPU buffer + * @tinfo: The @tinfo allocated by trace_user_fault_get() + * @ptr: The user space pointer to read + * @size: The size of user space to read. + * @copy_func: Optional function to use to copy from user space + * @data: Data to pass to copy_func if it was supplied + * + * Preemption must be disabled when this is called, and must not + * be enabled while using the returned buffer. + * This does the copying from user space into a per CPU buffer. + * + * The @size must not be greater than the size passed in to + * trace_user_fault_init(). + * + * If @copy_func is NULL, trace_user_fault_read() will use copy_from_user(), + * otherwise it will call @copy_func. It will call @copy_func with: + * + * buffer: the per CPU buffer of the @tinfo. + * ptr: The pointer @ptr to user space to read + * size: The @size of the ptr to read + * data: The @data parameter + * + * It is expected that @copy_func will return 0 on success and non zero + * if there was a fault. + * + * Returns a pointer to the buffer with the content read from @ptr. + * Preemption must remain disabled while the caller accesses the + * buffer returned by this function. + * Returns NULL if there was a fault, or the size passed in is + * greater than the size passed to trace_user_fault_init(). + */ +char *trace_user_fault_read(struct trace_user_buf_info *tinfo, + const char __user *ptr, size_t size, + trace_user_buf_copy copy_func, void *data) +{ + int cpu = smp_processor_id(); + char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf; + unsigned int cnt; + int trys = 0; + int ret; + + lockdep_assert_preemption_disabled(); + + /* + * It's up to the caller to not try to copy more than it said + * it would. + */ + if (size > tinfo->size) + return NULL; + + /* + * This acts similar to a seqcount. The per CPU context switches are + * recorded, migration is disabled and preemption is enabled. The + * read of the user space memory is copied into the per CPU buffer. + * Preemption is disabled again, and if the per CPU context switches count + * is still the same, it means the buffer has not been corrupted. + * If the count is different, it is assumed the buffer is corrupted + * and reading must be tried again. + */ + + do { + /* + * It is possible that something is trying to migrate this + * task. What happens then, is when preemption is enabled, + * the migration thread will preempt this task, try to + * migrate it, fail, then let it run again. That will + * cause this to loop again and never succeed. + * On failures, enabled and disable preemption with + * migration enabled, to allow the migration thread to + * migrate this task. + */ + if (trys) { + preempt_enable_notrace(); + preempt_disable_notrace(); + cpu = smp_processor_id(); + buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf; + } + + /* + * If for some reason, copy_from_user() always causes a context + * switch, this would then cause an infinite loop. + * If this task is preempted by another user space task, it + * will cause this task to try again. But just in case something + * changes where the copying from user space causes another task + * to run, prevent this from going into an infinite loop. + * 100 tries should be plenty. + */ + if (WARN_ONCE(trys++ > 100, "Error: Too many tries to read user space")) + return NULL; + + /* Read the current CPU context switch counter */ + cnt = nr_context_switches_cpu(cpu); + + /* + * Preemption is going to be enabled, but this task must + * remain on this CPU. + */ + migrate_disable(); + + /* + * Now preemption is being enabled and another task can come in + * and use the same buffer and corrupt our data. + */ + preempt_enable_notrace(); + + /* Make sure preemption is enabled here */ + lockdep_assert_preemption_enabled(); + + if (copy_func) { + ret = copy_func(buffer, ptr, size, data); + } else { + ret = __copy_from_user(buffer, ptr, size); + } + + preempt_disable_notrace(); + migrate_enable(); + + /* if it faulted, no need to test if the buffer was corrupted */ + if (ret) + return NULL; + + /* + * Preemption is disabled again, now check the per CPU context + * switch counter. If it doesn't match, then another user space + * process may have schedule in and corrupted our buffer. In that + * case the copying must be retried. + */ + } while (nr_context_switches_cpu(cpu) != cnt); + + return buffer; +} + static ssize_t -tracing_mark_raw_write(struct file *filp, const char __user *ubuf, +tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct trace_array *tr = filp->private_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - struct raw_data_entry *entry; - ssize_t written; - int size; - int len; - -#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + ssize_t written = -ENODEV; + unsigned long ip; + char *buf; - if (tracing_disabled) + if (unlikely(tracing_disabled)) return -EINVAL; - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + if (!(tr->trace_flags & TRACE_ITER(MARKERS))) return -EINVAL; - /* The marker must at least have a tag id */ - if (cnt < sizeof(unsigned int)) + if ((ssize_t)cnt < 0) return -EINVAL; - size = sizeof(*entry) + cnt; - if (cnt < FAULT_SIZE_ID) - size += FAULT_SIZE_ID - cnt; + if (cnt > TRACE_MARKER_MAX_SIZE) + cnt = TRACE_MARKER_MAX_SIZE; + + /* Must have preemption disabled while having access to the buffer */ + guard(preempt_notrace)(); + + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL); + if (!buf) + return -EFAULT; + + /* The selftests expect this function to be the IP address */ + ip = _THIS_IP_; + + /* The global trace_marker can go to multiple instances */ + if (tr == &global_trace) { + guard(rcu)(); + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { + written = write_marker_to_buffer(tr, buf, cnt, ip); + if (written < 0) + break; + } + } else { + written = write_marker_to_buffer(tr, buf, cnt, ip); + } + + return written; +} + +static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, + const char *buf, size_t cnt) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct raw_data_entry *entry; + ssize_t written; + size_t size; + + /* cnt includes both the entry->id and the data behind it. */ + size = struct_offset(entry, id) + cnt; buffer = tr->array_buffer.buffer; @@ -7055,20 +6356,88 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, return -EBADF; entry = ring_buffer_event_data(event); - - len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); - if (len) { - entry->id = -1; - memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); - written = -EFAULT; - } else - written = cnt; + unsafe_memcpy(&entry->id, buf, cnt, + "id and content already reserved on ring buffer" + "'buf' includes the 'id' and the data." + "'entry' was allocated with cnt from 'id'."); + written = cnt; __buffer_unlock_commit(buffer, event); return written; } +static ssize_t +tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + ssize_t written = -ENODEV; + char *buf; + + if (unlikely(tracing_disabled)) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER(MARKERS))) + return -EINVAL; + + /* The marker must at least have a tag id */ + if (cnt < sizeof(unsigned int)) + return -EINVAL; + + /* raw write is all or nothing */ + if (cnt > TRACE_MARKER_MAX_SIZE) + return -EINVAL; + + /* Must have preemption disabled while having access to the buffer */ + guard(preempt_notrace)(); + + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL); + if (!buf) + return -EFAULT; + + /* The global trace_marker_raw can go to multiple instances */ + if (tr == &global_trace) { + guard(rcu)(); + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { + written = write_raw_marker_to_buffer(tr, buf, cnt); + if (written < 0) + break; + } + } else { + written = write_raw_marker_to_buffer(tr, buf, cnt); + } + + return written; +} + +static int tracing_mark_open(struct inode *inode, struct file *filp) +{ + int ret; + + scoped_guard(mutex, &trace_user_buffer_mutex) { + if (!trace_user_buffer) { + ret = user_buffer_init(&trace_user_buffer, TRACE_MARKER_MAX_SIZE); + if (ret < 0) + return ret; + } else { + trace_user_buffer->ref++; + } + } + + stream_open(inode, filp); + ret = tracing_open_generic_tr(inode, filp); + if (ret < 0) + user_buffer_put(&trace_user_buffer); + return ret; +} + +static int tracing_mark_release(struct inode *inode, struct file *file) +{ + user_buffer_put(&trace_user_buffer); + return tracing_release_generic_tr(inode, file); +} + static int tracing_clock_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; @@ -7095,7 +6464,7 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) if (i == ARRAY_SIZE(trace_clocks)) return -EINVAL; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tr->clock_id = i; @@ -7107,13 +6476,18 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) */ tracing_reset_online_cpus(&tr->array_buffer); -#ifdef CONFIG_TRACER_MAX_TRACE - if (tr->max_buffer.buffer) - ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); - tracing_reset_online_cpus(&tr->max_buffer); +#ifdef CONFIG_TRACER_SNAPSHOT + if (tr->snapshot_buffer.buffer) + ring_buffer_set_clock(tr->snapshot_buffer.buffer, trace_clocks[i].func); + tracing_reset_online_cpus(&tr->snapshot_buffer); #endif + update_last_data_if_empty(tr); - mutex_unlock(&trace_types_lock); + if (tr->scratch && !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) { + struct trace_scratch *tscratch = tr->scratch; + + tscratch->clock_id = i; + } return 0; } @@ -7155,6 +6529,11 @@ static int tracing_clock_open(struct inode *inode, struct file *file) if (ret) return ret; + if ((file->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) { + trace_array_put(tr); + return -EACCES; + } + ret = single_open(file, tracing_clock_show, inode->i_private); if (ret < 0) trace_array_put(tr); @@ -7166,15 +6545,13 @@ static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer)) seq_puts(m, "delta [absolute]\n"); else seq_puts(m, "[delta] absolute\n"); - mutex_unlock(&trace_types_lock); - return 0; } @@ -7202,227 +6579,6 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve return ring_buffer_event_time_stamp(buffer, rbe); } -/* - * Set or disable using the per CPU trace_buffer_event when possible. - */ -int tracing_set_filter_buffering(struct trace_array *tr, bool set) -{ - int ret = 0; - - mutex_lock(&trace_types_lock); - - if (set && tr->no_filter_buffering_ref++) - goto out; - - if (!set) { - if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) { - ret = -EINVAL; - goto out; - } - - --tr->no_filter_buffering_ref; - } - out: - mutex_unlock(&trace_types_lock); - - return ret; -} - -struct ftrace_buffer_info { - struct trace_iterator iter; - void *spare; - unsigned int spare_cpu; - unsigned int spare_size; - unsigned int read; -}; - -#ifdef CONFIG_TRACER_SNAPSHOT -static int tracing_snapshot_open(struct inode *inode, struct file *file) -{ - struct trace_array *tr = inode->i_private; - struct trace_iterator *iter; - struct seq_file *m; - int ret; - - ret = tracing_check_open_get_tr(tr); - if (ret) - return ret; - - if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file, true); - if (IS_ERR(iter)) - ret = PTR_ERR(iter); - } else { - /* Writes still need the seq_file to hold the private data */ - ret = -ENOMEM; - m = kzalloc(sizeof(*m), GFP_KERNEL); - if (!m) - goto out; - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) { - kfree(m); - goto out; - } - ret = 0; - - iter->tr = tr; - iter->array_buffer = &tr->max_buffer; - iter->cpu_file = tracing_get_cpu(inode); - m->private = iter; - file->private_data = m; - } -out: - if (ret < 0) - trace_array_put(tr); - - return ret; -} - -static void tracing_swap_cpu_buffer(void *tr) -{ - update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); -} - -static ssize_t -tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct seq_file *m = filp->private_data; - struct trace_iterator *iter = m->private; - struct trace_array *tr = iter->tr; - unsigned long val; - int ret; - - ret = tracing_update_buffers(tr); - if (ret < 0) - return ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - mutex_lock(&trace_types_lock); - - if (tr->current_trace->use_max_tr) { - ret = -EBUSY; - goto out; - } - - local_irq_disable(); - arch_spin_lock(&tr->max_lock); - if (tr->cond_snapshot) - ret = -EBUSY; - arch_spin_unlock(&tr->max_lock); - local_irq_enable(); - if (ret) - goto out; - - switch (val) { - case 0: - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { - ret = -EINVAL; - break; - } - if (tr->allocated_snapshot) - free_snapshot(tr); - break; - case 1: -/* Only allow per-cpu swap if the ring buffer supports it */ -#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { - ret = -EINVAL; - break; - } -#endif - if (tr->allocated_snapshot) - ret = resize_buffer_duplicate_size(&tr->max_buffer, - &tr->array_buffer, iter->cpu_file); - - ret = tracing_arm_snapshot_locked(tr); - if (ret) - break; - - /* Now, we're going to swap */ - if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { - local_irq_disable(); - update_max_tr(tr, current, smp_processor_id(), NULL); - local_irq_enable(); - } else { - smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, - (void *)tr, 1); - } - tracing_disarm_snapshot(tr); - break; - default: - if (tr->allocated_snapshot) { - if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->max_buffer); - else - tracing_reset_cpu(&tr->max_buffer, iter->cpu_file); - } - break; - } - - if (ret >= 0) { - *ppos += cnt; - ret = cnt; - } -out: - mutex_unlock(&trace_types_lock); - return ret; -} - -static int tracing_snapshot_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = file->private_data; - int ret; - - ret = tracing_release(inode, file); - - if (file->f_mode & FMODE_READ) - return ret; - - /* If write only, the seq_file is just a stub */ - if (m) - kfree(m->private); - kfree(m); - - return 0; -} - -static int tracing_buffers_open(struct inode *inode, struct file *filp); -static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos); -static int tracing_buffers_release(struct inode *inode, struct file *file); -static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, unsigned int flags); - -static int snapshot_raw_open(struct inode *inode, struct file *filp) -{ - struct ftrace_buffer_info *info; - int ret; - - /* The following checks for tracefs lockdown */ - ret = tracing_buffers_open(inode, filp); - if (ret < 0) - return ret; - - info = filp->private_data; - - if (info->iter.trace->use_max_tr) { - tracing_buffers_release(inode, filp); - return -EBUSY; - } - - info->iter.snapshot = true; - info->iter.array_buffer = &info->iter.tr->max_buffer; - - return ret; -} - -#endif /* CONFIG_TRACER_SNAPSHOT */ - - static const struct file_operations tracing_thresh_fops = { .open = tracing_open_generic, .read = tracing_thresh_read, @@ -7430,16 +6586,6 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; -#ifdef CONFIG_TRACER_MAX_TRACE -static const struct file_operations tracing_max_lat_fops = { - .open = tracing_open_generic_tr, - .read = tracing_max_lat_read, - .write = tracing_max_lat_write, - .llseek = generic_file_llseek, - .release = tracing_release_generic_tr, -}; -#endif - static const struct file_operations set_tracer_fops = { .open = tracing_open_generic_tr, .read = tracing_set_trace_read, @@ -7464,6 +6610,14 @@ static const struct file_operations tracing_entries_fops = { .release = tracing_release_generic_tr, }; +static const struct file_operations tracing_syscall_buf_fops = { + .open = tracing_open_generic_tr, + .read = tracing_syscall_buf_read, + .write = tracing_syscall_buf_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + static const struct file_operations tracing_buffer_meta_fops = { .open = tracing_buffer_meta_open, .read = seq_read, @@ -7487,13 +6641,13 @@ static const struct file_operations tracing_free_buffer_fops = { static const struct file_operations tracing_mark_fops = { .open = tracing_mark_open, .write = tracing_mark_write, - .release = tracing_release_generic_tr, + .release = tracing_mark_release, }; static const struct file_operations tracing_mark_raw_fops = { .open = tracing_mark_open, .write = tracing_mark_raw_write, - .release = tracing_release_generic_tr, + .release = tracing_mark_release, }; static const struct file_operations trace_clock_fops = { @@ -7512,30 +6666,12 @@ static const struct file_operations trace_time_stamp_mode_fops = { }; static const struct file_operations last_boot_fops = { - .open = tracing_open_generic_tr, - .read = tracing_last_boot_read, - .llseek = generic_file_llseek, - .release = tracing_release_generic_tr, -}; - -#ifdef CONFIG_TRACER_SNAPSHOT -static const struct file_operations snapshot_fops = { - .open = tracing_snapshot_open, + .open = tracing_last_boot_open, .read = seq_read, - .write = tracing_snapshot_write, - .llseek = tracing_lseek, - .release = tracing_snapshot_release, -}; - -static const struct file_operations snapshot_raw_fops = { - .open = snapshot_raw_open, - .read = tracing_buffers_read, - .release = tracing_buffers_release, - .splice_read = tracing_buffers_splice_read, + .llseek = seq_lseek, + .release = tracing_seq_release, }; -#endif /* CONFIG_TRACER_SNAPSHOT */ - /* * trace_min_max_write - Write a u64 value to a trace_min_max_param struct * @filp: The active open file structure @@ -7646,7 +6782,7 @@ static struct tracing_log_err *alloc_tracing_log_err(int len) { struct tracing_log_err *err; - err = kzalloc(sizeof(*err), GFP_KERNEL); + err = kzalloc_obj(*err); if (!err) return ERR_PTR(-ENOMEM); @@ -7754,12 +6890,11 @@ void tracing_log_err(struct trace_array *tr, len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1; - mutex_lock(&tracing_err_log_lock); + guard(mutex)(&tracing_err_log_lock); + err = get_tracing_log_err(tr, len); - if (PTR_ERR(err) == -ENOMEM) { - mutex_unlock(&tracing_err_log_lock); + if (PTR_ERR(err) == -ENOMEM) return; - } snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd); @@ -7770,21 +6905,20 @@ void tracing_log_err(struct trace_array *tr, err->info.ts = local_clock(); list_add_tail(&err->list, &tr->err_log); - mutex_unlock(&tracing_err_log_lock); } static void clear_tracing_err_log(struct trace_array *tr) { struct tracing_log_err *err, *next; - mutex_lock(&tracing_err_log_lock); + guard(mutex)(&tracing_err_log_lock); + list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); free_tracing_log_err(err); } tr->n_err_log_entries = 0; - mutex_unlock(&tracing_err_log_lock); } static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) @@ -7897,7 +7031,7 @@ static const struct file_operations tracing_err_log_fops = { .release = tracing_err_log_release, }; -static int tracing_buffers_open(struct inode *inode, struct file *filp) +int tracing_buffers_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct ftrace_buffer_info *info; @@ -7907,7 +7041,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) if (ret) return ret; - info = kvzalloc(sizeof(*info), GFP_KERNEL); + info = kvzalloc_obj(*info); if (!info) { trace_array_put(tr); return -ENOMEM; @@ -7945,9 +7079,8 @@ tracing_buffers_poll(struct file *filp, poll_table *poll_table) return trace_poll(iter, filp, poll_table); } -static ssize_t -tracing_buffers_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) +ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; @@ -7959,10 +7092,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (!count) return 0; -#ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->tr->current_trace->use_max_tr) + if (iter->snapshot && tracer_uses_snapshot(iter->tr->current_trace)) return -EBUSY; -#endif page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); @@ -8003,6 +7134,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) { if (trace_empty(iter) && !iter->closed) { + if (update_last_data_if_empty(iter->tr)) + return 0; + if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; @@ -8047,12 +7181,12 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id) return 0; } -static int tracing_buffers_release(struct inode *inode, struct file *file) +int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); iter->tr->trace_ref--; @@ -8063,8 +7197,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) info->spare_cpu, info->spare); kvfree(info); - mutex_unlock(&trace_types_lock); - return 0; } @@ -8123,10 +7255,9 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) spd->partial[i].private = 0; } -static ssize_t -tracing_buffers_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; @@ -8145,10 +7276,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int entries, i; ssize_t ret = 0; -#ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->tr->current_trace->use_max_tr) + if (iter->snapshot && tracer_uses_snapshot(iter->tr->current_trace)) return -EBUSY; -#endif page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); if (*ppos & (page_size - 1)) @@ -8171,7 +7300,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, struct page *page; int r; - ref = kzalloc(sizeof(*ref), GFP_KERNEL); + ref = kzalloc_obj(*ref); if (!ref) { ret = -ENOMEM; break; @@ -8272,54 +7401,27 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned * An ioctl call with cmd 0 to the ring buffer file will wake up all * waiters */ - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); /* Make sure the waiters see the new wait_index */ (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); - mutex_unlock(&trace_types_lock); return 0; } -#ifdef CONFIG_TRACER_MAX_TRACE -static int get_snapshot_map(struct trace_array *tr) +/* + * This is called when a VMA is duplicated (e.g., on fork()) to increment + * the user_mapped counter without remapping pages. + */ +static void tracing_buffers_mmap_open(struct vm_area_struct *vma) { - int err = 0; - - /* - * Called with mmap_lock held. lockdep would be unhappy if we would now - * take trace_types_lock. Instead use the specific - * snapshot_trigger_lock. - */ - spin_lock(&tr->snapshot_trigger_lock); - - if (tr->snapshot || tr->mapped == UINT_MAX) - err = -EBUSY; - else - tr->mapped++; - - spin_unlock(&tr->snapshot_trigger_lock); - - /* Wait for update_max_tr() to observe iter->tr->mapped */ - if (tr->mapped == 1) - synchronize_rcu(); - - return err; + struct ftrace_buffer_info *info = vma->vm_file->private_data; + struct trace_iterator *iter = &info->iter; + ring_buffer_map_dup(iter->array_buffer->buffer, iter->cpu_file); } -static void put_snapshot_map(struct trace_array *tr) -{ - spin_lock(&tr->snapshot_trigger_lock); - if (!WARN_ON(!tr->mapped)) - tr->mapped--; - spin_unlock(&tr->snapshot_trigger_lock); -} -#else -static inline int get_snapshot_map(struct trace_array *tr) { return 0; } -static inline void put_snapshot_map(struct trace_array *tr) { } -#endif static void tracing_buffers_mmap_close(struct vm_area_struct *vma) { @@ -8330,8 +7432,19 @@ static void tracing_buffers_mmap_close(struct vm_area_struct *vma) put_snapshot_map(iter->tr); } +static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Trace buffer mappings require the complete buffer including + * the meta page. Partial mappings are not supported. + */ + return -EINVAL; +} + static const struct vm_operations_struct tracing_buffers_vmops = { + .open = tracing_buffers_mmap_open, .close = tracing_buffers_mmap_close, + .may_split = tracing_buffers_may_split, }; static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) @@ -8340,6 +7453,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) struct trace_iterator *iter = &info->iter; int ret = 0; + /* A memmap'ed and backup buffers are not supported for user space mmap */ + if (iter->tr->flags & (TRACE_ARRAY_FL_MEMMAP | TRACE_ARRAY_FL_VMALLOC)) + return -ENODEV; + ret = get_snapshot_map(iter->tr); if (ret) return ret; @@ -8377,7 +7494,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, unsigned long long t; unsigned long usec_rem; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = kmalloc_obj(*s); if (!s) return -ENOMEM; @@ -8474,179 +7591,15 @@ static const struct file_operations tracing_dyn_info_fops = { }; #endif /* CONFIG_DYNAMIC_FTRACE */ -#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) -static void -ftrace_snapshot(unsigned long ip, unsigned long parent_ip, - struct trace_array *tr, struct ftrace_probe_ops *ops, - void *data) -{ - tracing_snapshot_instance(tr); -} - -static void -ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, - struct trace_array *tr, struct ftrace_probe_ops *ops, - void *data) -{ - struct ftrace_func_mapper *mapper = data; - long *count = NULL; - - if (mapper) - count = (long *)ftrace_func_mapper_find_ip(mapper, ip); - - if (count) { - - if (*count <= 0) - return; - - (*count)--; - } - - tracing_snapshot_instance(tr); -} - -static int -ftrace_snapshot_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data) -{ - struct ftrace_func_mapper *mapper = data; - long *count = NULL; - - seq_printf(m, "%ps:", (void *)ip); - - seq_puts(m, "snapshot"); - - if (mapper) - count = (long *)ftrace_func_mapper_find_ip(mapper, ip); - - if (count) - seq_printf(m, ":count=%ld\n", *count); - else - seq_puts(m, ":unlimited\n"); - - return 0; -} - -static int -ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void *init_data, void **data) -{ - struct ftrace_func_mapper *mapper = *data; - - if (!mapper) { - mapper = allocate_ftrace_func_mapper(); - if (!mapper) - return -ENOMEM; - *data = mapper; - } - - return ftrace_func_mapper_add_ip(mapper, ip, init_data); -} - -static void -ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void *data) -{ - struct ftrace_func_mapper *mapper = data; - - if (!ip) { - if (!mapper) - return; - free_ftrace_func_mapper(mapper, NULL); - return; - } - - ftrace_func_mapper_remove_ip(mapper, ip); -} - -static struct ftrace_probe_ops snapshot_probe_ops = { - .func = ftrace_snapshot, - .print = ftrace_snapshot_print, -}; - -static struct ftrace_probe_ops snapshot_count_probe_ops = { - .func = ftrace_count_snapshot, - .print = ftrace_snapshot_print, - .init = ftrace_snapshot_init, - .free = ftrace_snapshot_free, -}; - -static int -ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, - char *glob, char *cmd, char *param, int enable) -{ - struct ftrace_probe_ops *ops; - void *count = (void *)-1; - char *number; - int ret; - - if (!tr) - return -ENODEV; - - /* hash funcs only work with set_ftrace_filter */ - if (!enable) - return -EINVAL; - - ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; - - if (glob[0] == '!') { - ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); - if (!ret) - tracing_disarm_snapshot(tr); - - return ret; - } - - if (!param) - goto out_reg; - - number = strsep(¶m, ":"); - - if (!strlen(number)) - goto out_reg; - - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, (unsigned long *)&count); - if (ret) - return ret; - - out_reg: - ret = tracing_arm_snapshot(tr); - if (ret < 0) - goto out; - - ret = register_ftrace_function_probe(glob, tr, ops, count); - if (ret < 0) - tracing_disarm_snapshot(tr); - out: - return ret < 0 ? ret : 0; -} - -static struct ftrace_func_command ftrace_snapshot_cmd = { - .name = "snapshot", - .func = ftrace_trace_snapshot_callback, -}; - -static __init int register_snapshot_cmd(void) -{ - return register_ftrace_command(&ftrace_snapshot_cmd); -} -#else -static inline __init int register_snapshot_cmd(void) { return 0; } -#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ - static struct dentry *tracing_get_dentry(struct trace_array *tr) { - if (WARN_ON(!tr->dir)) - return ERR_PTR(-ENODEV); - /* Top directory uses NULL as the parent */ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return NULL; + if (WARN_ON(!tr->dir)) + return ERR_PTR(-ENODEV); + /* All sub buffers have a descriptor */ return tr->dir; } @@ -8670,7 +7623,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) return tr->percpu_dir; } -static struct dentry * +struct dentry * trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, void *data, long cpu, const struct file_operations *fops) { @@ -8712,7 +7665,7 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_stats_fops); - trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu, + trace_create_cpu_file("buffer_size_kb", TRACE_MODE_WRITE, d_cpu, tr, cpu, &tracing_entries_fops); if (tr->range_addr_start) @@ -8765,10 +7718,9 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; if (!!(topt->flags->val & topt->opt->bit) != val) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); - mutex_unlock(&trace_types_lock); if (ret) return ret; } @@ -8851,7 +7803,7 @@ trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, get_tr_index(tr_index, &tr, &index); - if (tr->trace_flags & (1 << index)) + if (tr->trace_flags & (1ULL << index)) buf = "1\n"; else buf = "0\n"; @@ -8880,7 +7832,7 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - ret = set_tracer_flag(tr, 1 << index, val); + ret = set_tracer_flag(tr, 1ULL << index, val); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); @@ -8953,54 +7905,34 @@ create_trace_option_file(struct trace_array *tr, topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE, t_options, topt, &trace_options_fops); - } -static void -create_trace_option_files(struct trace_array *tr, struct tracer *tracer) +static int +create_trace_option_files(struct trace_array *tr, struct tracer *tracer, + struct tracer_flags *flags) { struct trace_option_dentry *topts; struct trace_options *tr_topts; - struct tracer_flags *flags; struct tracer_opt *opts; int cnt; - int i; - - if (!tracer) - return; - - flags = tracer->flags; if (!flags || !flags->opts) - return; - - /* - * If this is an instance, only create flags for tracers - * the instance may have. - */ - if (!trace_ok_for_array(tracer, tr)) - return; - - for (i = 0; i < tr->nr_topts; i++) { - /* Make sure there's no duplicate flags. */ - if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) - return; - } + return 0; opts = flags->opts; for (cnt = 0; opts[cnt].name; cnt++) ; - topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); + topts = kzalloc_objs(*topts, cnt + 1); if (!topts) - return; + return 0; tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), GFP_KERNEL); if (!tr_topts) { kfree(topts); - return; + return -ENOMEM; } tr->topts = tr_topts; @@ -9015,6 +7947,97 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) "Failed to create trace option: %s", opts[cnt].name); } + return 0; +} + +static int get_global_flags_val(struct tracer *tracer) +{ + struct tracers *t; + + list_for_each_entry(t, &global_trace.tracers, list) { + if (t->tracer != tracer) + continue; + if (!t->flags) + return -1; + return t->flags->val; + } + return -1; +} + +static int add_tracer_options(struct trace_array *tr, struct tracers *t) +{ + struct tracer *tracer = t->tracer; + struct tracer_flags *flags = t->flags ?: tracer->flags; + + if (!flags) + return 0; + + /* Only add tracer options after update_tracer_options finish */ + if (!tracer_options_updated) + return 0; + + return create_trace_option_files(tr, tracer, flags); +} + +static int add_tracer(struct trace_array *tr, struct tracer *tracer) +{ + struct tracer_flags *flags; + struct tracers *t; + int ret; + + /* Only enable if the directory has been created already. */ + if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL)) + return 0; + + /* + * If this is an instance, only create flags for tracers + * the instance may have. + */ + if (!trace_ok_for_array(tracer, tr)) + return 0; + + t = kmalloc_obj(*t); + if (!t) + return -ENOMEM; + + t->tracer = tracer; + t->flags = NULL; + list_add(&t->list, &tr->tracers); + + flags = tracer->flags; + if (!flags) { + if (!tracer->default_flags) + return 0; + + /* + * If the tracer defines default flags, it means the flags are + * per trace instance. + */ + flags = kmalloc_obj(*flags); + if (!flags) + return -ENOMEM; + + *flags = *tracer->default_flags; + flags->trace = tracer; + + t->flags = flags; + + /* If this is an instance, inherit the global_trace flags */ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) { + int val = get_global_flags_val(tracer); + if (!WARN_ON_ONCE(val < 0)) + flags->val = val; + } + } + + ret = add_tracer_options(tr, t); + if (ret < 0) { + list_del(&t->list); + kfree(t->flags); + kfree(t); + } + + return ret; } static struct dentry * @@ -9044,8 +8067,9 @@ static void create_trace_options_dir(struct trace_array *tr) for (i = 0; trace_options[i]; i++) { if (top_level || - !((1 << i) & TOP_LEVEL_TRACE_FLAGS)) + !((1ULL << i) & TOP_LEVEL_TRACE_FLAGS)) { create_trace_option_core_file(tr, trace_options[i], i); + } } } @@ -9077,7 +8101,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return ret; if (buffer) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (!!val == tracer_tracing_is_on(tr)) { val = 0; /* do nothing */ } else if (val) { @@ -9091,7 +8115,6 @@ rb_simple_write(struct file *filp, const char __user *ubuf, /* Wake up any waiters */ ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); } - mutex_unlock(&trace_types_lock); } (*ppos)++; @@ -9203,12 +8226,12 @@ buffer_subbuf_size_write(struct file *filp, const char __user *ubuf, if (ret) goto out; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT if (!tr->allocated_snapshot) goto out_max; - ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); + ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order); if (ret) { /* Put back the old order */ cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order); @@ -9253,22 +8276,133 @@ static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); -static int -allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) +#ifdef CONFIG_MODULES +static int make_mod_delta(struct module *mod, void *data) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + struct trace_array *tr = data; + int i; + + tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + for (i = 0; i < tscratch->nr_entries; i++) { + entry = &tscratch->entries[i]; + if (strcmp(mod->name, entry->mod_name)) + continue; + if (mod->state == MODULE_STATE_GOING) + module_delta->delta[i] = 0; + else + module_delta->delta[i] = (unsigned long)mod->mem[MOD_TEXT].base + - entry->mod_addr; + break; + } + return 0; +} +#else +static int make_mod_delta(struct module *mod, void *data) +{ + return 0; +} +#endif + +static int mod_addr_comp(const void *a, const void *b, const void *data) +{ + const struct trace_mod_entry *e1 = a; + const struct trace_mod_entry *e2 = b; + + return e1->mod_addr > e2->mod_addr ? 1 : -1; +} + +static void setup_trace_scratch(struct trace_array *tr, + struct trace_scratch *tscratch, unsigned int size) +{ + struct trace_module_delta *module_delta; + struct trace_mod_entry *entry; + int i, nr_entries; + + if (!tscratch) + return; + + tr->scratch = tscratch; + tr->scratch_size = size; + + if (tscratch->text_addr) + tr->text_delta = (unsigned long)_text - tscratch->text_addr; + + if (struct_size(tscratch, entries, tscratch->nr_entries) > size) + goto reset; + + /* Check if each module name is a valid string */ + for (i = 0; i < tscratch->nr_entries; i++) { + int n; + + entry = &tscratch->entries[i]; + + for (n = 0; n < MODULE_NAME_LEN; n++) { + if (entry->mod_name[n] == '\0') + break; + if (!isprint(entry->mod_name[n])) + goto reset; + } + if (n == MODULE_NAME_LEN) + goto reset; + } + + /* Sort the entries so that we can find appropriate module from address. */ + nr_entries = tscratch->nr_entries; + sort_r(tscratch->entries, nr_entries, sizeof(struct trace_mod_entry), + mod_addr_comp, NULL, NULL); + + if (IS_ENABLED(CONFIG_MODULES)) { + module_delta = kzalloc_flex(*module_delta, delta, nr_entries); + if (!module_delta) { + pr_info("module_delta allocation failed. Not able to decode module address."); + goto reset; + } + init_rcu_head(&module_delta->rcu); + } else + module_delta = NULL; + WRITE_ONCE(tr->module_delta, module_delta); + + /* Scan modules to make text delta for modules. */ + module_for_each_mod(make_mod_delta, tr); + + /* Set trace_clock as the same of the previous boot. */ + if (tscratch->clock_id != tr->clock_id) { + if (tscratch->clock_id >= ARRAY_SIZE(trace_clocks) || + tracing_set_clock(tr, trace_clocks[tscratch->clock_id].name) < 0) { + pr_info("the previous trace_clock info is not valid."); + goto reset; + } + } + return; + reset: + /* Invalid trace modules */ + memset(tscratch, 0, size); +} + +int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; + struct trace_scratch *tscratch; + unsigned int scratch_size = 0; - rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + rb_flags = tr->trace_flags & TRACE_ITER(OVERWRITE) ? RB_FL_OVERWRITE : 0; buf->tr = tr; if (tr->range_addr_start && tr->range_addr_size) { + /* Add scratch buffer to handle 128 modules */ buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, - tr->range_addr_size); + tr->range_addr_size, + struct_size(tscratch, entries, 128)); + + tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size); + setup_trace_scratch(tr, tscratch, scratch_size); - ring_buffer_last_boot_delta(buf->buffer, - &tr->text_delta, &tr->data_delta); /* * This is basically the same as a mapped buffer, * with the same restrictions. @@ -9288,8 +8422,8 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size } /* Allocate the first page for all buffers */ - set_buffer_entries(&tr->array_buffer, - ring_buffer_size(tr->array_buffer.buffer, 0)); + trace_set_buffer_entries(&tr->array_buffer, + ring_buffer_size(tr->array_buffer.buffer, 0)); return 0; } @@ -9304,7 +8438,7 @@ static void free_trace_buffer(struct array_buffer *buf) } } -static int allocate_trace_buffers(struct trace_array *tr, int size) +static int allocate_trace_buffers(struct trace_array *tr, unsigned long size) { int ret; @@ -9312,23 +8446,11 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) if (ret) return ret; -#ifdef CONFIG_TRACER_MAX_TRACE - /* Fix mapped buffer trace arrays do not have snapshot buffers */ - if (tr->range_addr_start) - return 0; - - ret = allocate_trace_buffer(tr, &tr->max_buffer, - allocate_snapshot ? size : 1); - if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { + ret = trace_allocate_snapshot(tr, size); + if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) free_trace_buffer(&tr->array_buffer); - return -ENOMEM; - } - tr->allocated_snapshot = allocate_snapshot; - - allocate_snapshot = false; -#endif - return 0; + return ret; } static void free_trace_buffers(struct trace_array *tr) @@ -9337,9 +8459,10 @@ static void free_trace_buffers(struct trace_array *tr) return; free_trace_buffer(&tr->array_buffer); + kfree(tr->module_delta); -#ifdef CONFIG_TRACER_MAX_TRACE - free_trace_buffer(&tr->max_buffer); +#ifdef CONFIG_TRACER_SNAPSHOT + free_trace_buffer(&tr->snapshot_buffer); #endif } @@ -9352,20 +8475,39 @@ static void init_trace_flags_index(struct trace_array *tr) tr->trace_flags_index[i] = i; } -static void __update_tracer_options(struct trace_array *tr) +static int __update_tracer(struct trace_array *tr) { struct tracer *t; + int ret = 0; - for (t = trace_types; t; t = t->next) - add_tracer_options(tr, t); + for (t = trace_types; t && !ret; t = t->next) + ret = add_tracer(tr, t); + + return ret; } -static void update_tracer_options(struct trace_array *tr) +static __init int __update_tracer_options(struct trace_array *tr) { - mutex_lock(&trace_types_lock); + struct tracers *t; + int ret = 0; + + list_for_each_entry(t, &tr->tracers, list) { + ret = add_tracer_options(tr, t); + if (ret < 0) + break; + } + + return ret; +} + +static __init void update_tracer_options(void) +{ + struct trace_array *tr; + + guard(mutex)(&trace_types_lock); tracer_options_updated = true; - __update_tracer_options(tr); - mutex_unlock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) + __update_tracer_options(tr); } /* Must have trace_types_lock held */ @@ -9387,11 +8529,10 @@ struct trace_array *trace_array_find_get(const char *instance) { struct trace_array *tr; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tr = trace_array_find(instance); - if (tr) - tr->ref++; - mutex_unlock(&trace_types_lock); + if (tr && __trace_array_get(tr) < 0) + tr = NULL; return tr; } @@ -9411,9 +8552,13 @@ static int trace_array_create_dir(struct trace_array *tr) } init_tracer_tracefs(tr, tr->dir); - __update_tracer_options(tr); - - return ret; + ret = __update_tracer(tr); + if (ret) { + event_trace_del_tracer(tr); + tracefs_remove(tr->dir); + return ret; + } + return 0; } static struct trace_array * @@ -9425,7 +8570,7 @@ trace_array_create_systems(const char *name, const char *systems, int ret; ret = -ENOMEM; - tr = kzalloc(sizeof(*tr), GFP_KERNEL); + tr = kzalloc_obj(*tr); if (!tr) return ERR_PTR(ret); @@ -9455,16 +8600,25 @@ trace_array_create_systems(const char *name, const char *systems, raw_spin_lock_init(&tr->start_lock); + tr->syscall_buf_sz = global_trace.syscall_buf_sz; + tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT spin_lock_init(&tr->snapshot_trigger_lock); #endif tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); INIT_LIST_HEAD(&tr->err_log); + INIT_LIST_HEAD(&tr->tracers); + INIT_LIST_HEAD(&tr->marker_list); + +#ifdef CONFIG_MODULES + INIT_LIST_HEAD(&tr->mod_events); +#endif if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; @@ -9475,6 +8629,8 @@ trace_array_create_systems(const char *name, const char *systems, if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; + trace_array_init_autoremove(tr); + ftrace_init_trace_array(tr); init_trace_flags_index(tr); @@ -9498,6 +8654,7 @@ trace_array_create_systems(const char *name, const char *systems, free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree_const(tr->system_names); + kfree(tr->range_name); kfree(tr->name); kfree(tr); @@ -9514,47 +8671,49 @@ static int instance_mkdir(const char *name) struct trace_array *tr; int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); ret = -EEXIST; if (trace_array_find(name)) - goto out_unlock; + return -EEXIST; tr = trace_array_create(name); ret = PTR_ERR_OR_ZERO(tr); -out_unlock: - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); return ret; } -static u64 map_pages(u64 start, u64 size) +#ifdef CONFIG_MMU +static u64 map_pages(unsigned long start, unsigned long size) { - struct page **pages; - phys_addr_t page_start; - unsigned int page_count; - unsigned int i; - void *vaddr; - - page_count = DIV_ROUND_UP(size, PAGE_SIZE); + unsigned long vmap_start, vmap_end; + struct vm_struct *area; + int ret; - page_start = start; - pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); - if (!pages) + area = get_vm_area(size, VM_IOREMAP); + if (!area) return 0; - for (i = 0; i < page_count; i++) { - phys_addr_t addr = page_start + i * PAGE_SIZE; - pages[i] = pfn_to_page(addr >> PAGE_SHIFT); + vmap_start = (unsigned long) area->addr; + vmap_end = vmap_start + size; + + ret = vmap_page_range(vmap_start, vmap_end, + start, pgprot_nx(PAGE_KERNEL)); + if (ret < 0) { + free_vm_area(area); + return 0; } - vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL); - kfree(pages); - return (u64)(unsigned long)vaddr; + return (u64)vmap_start; +} +#else +static inline u64 map_pages(unsigned long start, unsigned long size) +{ + return 0; } +#endif /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. @@ -9577,24 +8736,25 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system { struct trace_array *tr; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) - goto out_unlock; + if (tr->name && strcmp(tr->name, name) == 0) { + /* if this fails, @tr is going to be removed. */ + if (__trace_array_get(tr) < 0) + tr = NULL; + return tr; + } } tr = trace_array_create_systems(name, systems, 0, 0); if (IS_ERR(tr)) tr = NULL; -out_unlock: - if (tr) + else tr->ref++; - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); return tr; } EXPORT_SYMBOL_GPL(trace_array_get_by_name); @@ -9609,15 +8769,20 @@ static int __remove_instance(struct trace_array *tr) list_del(&tr->list); + if (printk_trace == tr) + update_printk_trace(&global_trace); + + /* Must be done before disabling all the flags */ + if (update_marker_trace(tr, 0)) + synchronize_rcu(); + /* Disable all the flags that were enabled coming in */ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { - if ((1 << i) & ZEROED_TRACE_FLAGS) - set_tracer_flag(tr, 1 << i, 0); + if ((1ULL << i) & ZEROED_TRACE_FLAGS) + set_tracer_flag(tr, 1ULL << i, 0); } - if (printk_trace == tr) - update_printk_trace(&global_trace); - + trace_array_cancel_autoremove(tr); tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); @@ -9627,6 +8792,14 @@ static int __remove_instance(struct trace_array *tr) free_percpu(tr->last_func_repeats); free_trace_buffers(tr); clear_tracing_err_log(tr); + free_tracers(tr); + + if (tr->range_name) { + reserve_mem_release_by_name(tr->range_name); + kfree(tr->range_name); + } + if (tr->flags & TRACE_ARRAY_FL_VMALLOC) + vfree((void *)tr->range_addr_start); for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); @@ -9645,48 +8818,36 @@ static int __remove_instance(struct trace_array *tr) int trace_array_destroy(struct trace_array *this_tr) { struct trace_array *tr; - int ret; if (!this_tr) return -EINVAL; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); - ret = -ENODEV; /* Making sure trace array exists before destroying it. */ list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr == this_tr) { - ret = __remove_instance(tr); - break; - } + if (tr == this_tr) + return __remove_instance(tr); } - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); - - return ret; + return -ENODEV; } EXPORT_SYMBOL_GPL(trace_array_destroy); static int instance_rmdir(const char *name) { struct trace_array *tr; - int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); - ret = -ENODEV; tr = trace_array_find(name); - if (tr) - ret = __remove_instance(tr); - - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); + if (!tr) + return -ENODEV; - return ret; + return __remove_instance(tr); } static __init void create_trace_instances(struct dentry *d_tracer) @@ -9699,35 +8860,37 @@ static __init void create_trace_instances(struct dentry *d_tracer) if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) return; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->name) continue; if (MEM_FAIL(trace_array_create_dir(tr) < 0, "Failed to create instance directory\n")) - break; + return; } - - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); } static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { + umode_t writable_mode = TRACE_MODE_WRITE; int cpu; + if (trace_array_is_readonly(tr)) + writable_mode = TRACE_MODE_READ; + trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, - tr, &show_traces_fops); + tr, &show_traces_fops); - trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer, - tr, &set_tracer_fops); + trace_create_file("current_tracer", writable_mode, d_tracer, + tr, &set_tracer_fops); - trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer, + trace_create_file("tracing_cpumask", writable_mode, d_tracer, tr, &tracing_cpumask_fops); + /* Options are used for changing print-format even for readonly instance. */ trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer, tr, &tracing_iter_fops); @@ -9737,12 +8900,36 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer, tr, &tracing_pipe_fops); - trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer, + trace_create_file("buffer_size_kb", writable_mode, d_tracer, tr, &tracing_entries_fops); trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer, tr, &tracing_total_entries_fops); + trace_create_file("trace_clock", writable_mode, d_tracer, tr, + &trace_clock_fops); + + trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, + &trace_time_stamp_mode_fops); + + tr->buffer_percent = 50; + + trace_create_file("buffer_subbuf_size_kb", writable_mode, d_tracer, + tr, &buffer_subbuf_size_fops); + + create_trace_options_dir(tr); + + if (tr->range_addr_start) + trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer, + tr, &last_boot_fops); + + for_each_tracing_cpu(cpu) + tracing_init_tracefs_percpu(tr, cpu); + + /* Read-only instance has above files only. */ + if (trace_array_is_readonly(tr)) + return; + trace_create_file("free_buffer", 0200, d_tracer, tr, &tracing_free_buffer_fops); @@ -9754,55 +8941,39 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); - trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr, - &trace_clock_fops); - - trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, - tr, &rb_simple_fops); - - trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, - &trace_time_stamp_mode_fops); - - tr->buffer_percent = 50; - trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, - tr, &buffer_percent_fops); + tr, &buffer_percent_fops); - trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, - tr, &buffer_subbuf_size_fops); + trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_syscall_buf_fops); - create_trace_options_dir(tr); + trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, + tr, &rb_simple_fops); -#ifdef CONFIG_TRACER_MAX_TRACE trace_create_maxlat_file(tr, d_tracer); -#endif if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); - if (tr->range_addr_start) { - trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer, - tr, &last_boot_fops); #ifdef CONFIG_TRACER_SNAPSHOT - } else { + if (!tr->range_addr_start) trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, tr, &snapshot_fops); #endif - } trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, tr, &tracing_err_log_fops); - for_each_tracing_cpu(cpu) - tracing_init_tracefs_percpu(tr, cpu); - ftrace_init_tracefs(tr, d_tracer); } +#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; + struct fs_context *fc; + int ret; /* * To maintain backward compatibility for tools that mount @@ -9812,14 +8983,24 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_submount(mntpt, type, "tracefs", NULL); + + fc = fs_context_for_submount(type, mntpt); put_filesystem(type); - if (IS_ERR(mnt)) - return NULL; - mntget(mnt); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n"); + + ret = vfs_parse_fs_string(fc, "source", "tracefs"); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + put_fs_context(fc); return mnt; } +#endif /** * tracing_init_dentry - initialize top level trace array @@ -9844,6 +9025,7 @@ int tracing_init_dentry(void) if (WARN_ON(!tracefs_initialized())) return -ENODEV; +#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED /* * As there may still be users that expect the tracing * files to exist in debugfs/tracing, we must automount @@ -9852,6 +9034,7 @@ int tracing_init_dentry(void) */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); +#endif return 0; } @@ -9859,7 +9042,7 @@ int tracing_init_dentry(void) extern struct trace_eval_map *__start_ftrace_eval_maps[]; extern struct trace_eval_map *__stop_ftrace_eval_maps[]; -static struct workqueue_struct *eval_map_wq __initdata; +struct workqueue_struct *trace_init_wq __initdata; static struct work_struct eval_map_work __initdata; static struct work_struct tracerfs_init_work __initdata; @@ -9868,22 +9051,22 @@ static void __init eval_map_work_func(struct work_struct *work) int len; len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; - trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); + trace_event_update_with_eval_map(NULL, __start_ftrace_eval_maps, len); } static int __init trace_eval_init(void) { INIT_WORK(&eval_map_work, eval_map_work_func); - eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0); - if (!eval_map_wq) { - pr_err("Unable to allocate eval_map_wq\n"); + trace_init_wq = alloc_workqueue("trace_init_wq", WQ_UNBOUND, 0); + if (!trace_init_wq) { + pr_err("Unable to allocate trace_init_wq\n"); /* Do work here */ eval_map_work_func(&eval_map_work); return -ENOMEM; } - queue_work(eval_map_wq, &eval_map_work); + queue_work(trace_init_wq, &eval_map_work); return 0; } @@ -9892,8 +9075,8 @@ subsys_initcall(trace_eval_init); static int __init trace_eval_sync(void) { /* Make sure the eval map updates are finished */ - if (eval_map_wq) - destroy_workqueue(eval_map_wq); + if (trace_init_wq) + destroy_workqueue(trace_init_wq); return 0; } @@ -9901,11 +9084,26 @@ late_initcall_sync(trace_eval_sync); #ifdef CONFIG_MODULES -static void trace_module_add_evals(struct module *mod) + +bool module_exists(const char *module) { - if (!mod->num_trace_evals) - return; + /* All modules have the symbol __this_module */ + static const char this_mod[] = "__this_module"; + char modname[MODULE_NAME_LEN + sizeof(this_mod) + 2]; + unsigned long val; + int n; + n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); + + if (n > sizeof(modname) - 1) + return false; + + val = module_kallsyms_lookup_name(modname); + return val != 0; +} + +static void trace_module_add_evals(struct module *mod) +{ /* * Modules with bad taint do not have events created, do * not bother with enums either. @@ -9913,7 +9111,8 @@ static void trace_module_add_evals(struct module *mod) if (trace_module_has_bad_taint(mod)) return; - trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); + /* Even if no trace_evals, this need to sanitize field types. */ + trace_event_update_with_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_EVAL_MAP_FILE @@ -9925,7 +9124,7 @@ static void trace_module_remove_evals(struct module *mod) if (!mod->num_trace_evals) return; - mutex_lock(&trace_eval_mutex); + guard(mutex)(&trace_eval_mutex); map = trace_eval_maps; @@ -9937,17 +9136,33 @@ static void trace_module_remove_evals(struct module *mod) map = map->tail.next; } if (!map) - goto out; + return; *last = trace_eval_jmp_to_tail(map)->tail.next; kfree(map); - out: - mutex_unlock(&trace_eval_mutex); } #else static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ +static void trace_module_record(struct module *mod, bool add) +{ + struct trace_array *tr; + unsigned long flags; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + flags = tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT); + /* Update any persistent trace array that has already been started */ + if (flags == TRACE_ARRAY_FL_BOOT && add) { + guard(mutex)(&scratch_mutex); + save_mod(mod, tr); + } else if (flags & TRACE_ARRAY_FL_LAST_BOOT) { + /* Update delta if the module loaded in previous boot */ + make_mod_delta(mod, tr); + } + } +} + static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -9956,9 +9171,11 @@ static int trace_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: trace_module_add_evals(mod); + trace_module_record(mod, true); break; case MODULE_STATE_GOING: trace_module_remove_evals(mod); + trace_module_record(mod, false); break; } @@ -10007,7 +9224,7 @@ static __init void tracer_init_tracefs_work_func(struct work_struct *work) create_trace_instances(NULL); - update_tracer_options(&global_trace); + update_tracer_options(); } static __init int tracer_init_tracefs(void) @@ -10020,14 +9237,15 @@ static __init int tracer_init_tracefs(void) if (ret) return 0; - if (eval_map_wq) { + if (trace_init_wq) { INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func); - queue_work(eval_map_wq, &tracerfs_init_work); + queue_work(trace_init_wq, &tracerfs_init_work); } else { tracer_init_tracefs_work_func(NULL); } - rv_init_interface(); + if (rv_init_interface()) + pr_err("RV: Error while creating the RV interface\n"); return 0; } @@ -10140,7 +9358,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m static struct trace_iterator iter; unsigned int old_userobj; unsigned long flags; - int cnt = 0, cpu; + int cnt = 0; /* * Always turn off tracing when we dump. @@ -10157,14 +9375,13 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m /* Simulate the iterator */ trace_init_iter(&iter, tr); - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + /* While dumping, do not allow the buffer to be enable */ + tracer_tracing_disable(tr); - old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; + old_userobj = tr->trace_flags & TRACE_ITER(SYM_USEROBJ); /* don't look at user memory in panic mode */ - tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER(SYM_USEROBJ); if (dump_mode == DUMP_ORIG) iter.cpu_file = raw_smp_processor_id(); @@ -10205,10 +9422,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m ret = print_trace_line(&iter); if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); + + trace_printk_seq(&iter.seq); } touch_nmi_watchdog(); - - trace_printk_seq(&iter.seq); } if (!cnt) @@ -10218,9 +9435,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m tr->trace_flags |= old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_enable(tr); local_irq_restore(flags); } @@ -10302,7 +9517,8 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(const char *)) { - char *kbuf, *buf, *tmp; + char *kbuf __free(kfree) = NULL; + char *buf, *tmp; int ret = 0; size_t done = 0; size_t size; @@ -10317,10 +9533,9 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, if (size >= WRITE_BUFSIZE) size = WRITE_BUFSIZE - 1; - if (copy_from_user(kbuf, buffer + done, size)) { - ret = -EFAULT; - goto out; - } + if (copy_from_user(kbuf, buffer + done, size)) + return -EFAULT; + kbuf[size] = '\0'; buf = kbuf; do { @@ -10336,8 +9551,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ pr_warn("Line length is too long: Should be less than %d\n", WRITE_BUFSIZE - 2); - ret = -EINVAL; - goto out; + return -EINVAL; } } done += size; @@ -10350,63 +9564,54 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, ret = createfn(buf); if (ret) - goto out; + return ret; buf += size; } while (done < count); } - ret = done; - -out: - kfree(kbuf); - - return ret; + return done; } -#ifdef CONFIG_TRACER_MAX_TRACE -__init static bool tr_needs_alloc_snapshot(const char *name) +__init static int backup_instance_area(const char *backup, + unsigned long *addr, phys_addr_t *size) { - char *test; - int len = strlen(name); - bool ret; + struct trace_array *backup_tr; + void *allocated_vaddr = NULL; - if (!boot_snapshot_index) - return false; + backup_tr = trace_array_get_by_name(backup, NULL); + if (!backup_tr) { + pr_warn("Tracing: Instance %s is not found.\n", backup); + return -ENOENT; + } - if (strncmp(name, boot_snapshot_info, len) == 0 && - boot_snapshot_info[len] == '\t') - return true; + if (!(backup_tr->flags & TRACE_ARRAY_FL_BOOT)) { + pr_warn("Tracing: Instance %s is not boot mapped.\n", backup); + trace_array_put(backup_tr); + return -EINVAL; + } - test = kmalloc(strlen(name) + 3, GFP_KERNEL); - if (!test) - return false; + *size = backup_tr->range_addr_size; - sprintf(test, "\t%s\t", name); - ret = strstr(boot_snapshot_info, test) == NULL; - kfree(test); - return ret; -} + allocated_vaddr = vzalloc(*size); + if (!allocated_vaddr) { + pr_warn("Tracing: Failed to allocate memory for copying instance %s (size 0x%lx)\n", + backup, (unsigned long)*size); + trace_array_put(backup_tr); + return -ENOMEM; + } -__init static void do_allocate_snapshot(const char *name) -{ - if (!tr_needs_alloc_snapshot(name)) - return; + memcpy(allocated_vaddr, + (void *)backup_tr->range_addr_start, (size_t)*size); + *addr = (unsigned long)allocated_vaddr; - /* - * When allocate_snapshot is set, the next call to - * allocate_trace_buffers() (called by trace_array_get_by_name()) - * will allocate the snapshot buffer. That will alse clear - * this flag. - */ - allocate_snapshot = true; + trace_array_put(backup_tr); + return 0; } -#else -static inline void do_allocate_snapshot(const char *name) { } -#endif __init static void enable_instances(void) { struct trace_array *tr; + bool memmap_area = false; char *curr_str; char *name; char *str; @@ -10424,11 +9629,16 @@ __init static void enable_instances(void) bool traceoff = false; char *flag_delim; char *addr_delim; + char *rname __free(kfree) = NULL; + char *backup; tok = strsep(&curr_str, ","); - flag_delim = strchr(tok, '^'); - addr_delim = strchr(tok, '@'); + name = strsep(&tok, "="); + backup = tok; + + flag_delim = strchr(name, '^'); + addr_delim = strchr(name, '@'); if (addr_delim) *addr_delim++ = '\0'; @@ -10436,7 +9646,10 @@ __init static void enable_instances(void) if (flag_delim) *flag_delim++ = '\0'; - name = tok; + if (backup) { + if (backup_instance_area(backup, &addr, &size) < 0) + continue; + } if (flag_delim) { char *flag; @@ -10474,16 +9687,31 @@ __init static void enable_instances(void) name); continue; } + memmap_area = true; } else if (tok) { if (!reserve_mem_find_by_name(tok, &start, &size)) { start = 0; pr_warn("Failed to map boot instance %s to %s\n", name, tok); continue; } + rname = kstrdup(tok, GFP_KERNEL); } if (start) { - addr = map_pages(start, size); + /* Start and size must be page aligned */ + if (start & ~PAGE_MASK) { + pr_warn("Tracing: mapping start addr %pa is not page aligned\n", &start); + continue; + } + if (size & ~PAGE_MASK) { + pr_warn("Tracing: mapping size %pa is not page aligned\n", &size); + continue; + } + + if (memmap_area) + addr = map_pages(start, size); + else + addr = (unsigned long)phys_to_virt(start); if (addr) { pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n", name, &start, (unsigned long)size); @@ -10493,8 +9721,7 @@ __init static void enable_instances(void) } } else { /* Only non mapped buffers have snapshot buffers */ - if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) - do_allocate_snapshot(name); + do_allocate_snapshot(name); } tr = trace_array_create_systems(name, NULL, addr, size); @@ -10510,24 +9737,57 @@ __init static void enable_instances(void) update_printk_trace(tr); /* - * If start is set, then this is a mapped buffer, and - * cannot be deleted by user space, so keep the reference - * to it. + * memmap'd buffers can not be freed. */ - if (start) { - tr->flags |= TRACE_ARRAY_FL_BOOT; + if (memmap_area) { + tr->flags |= TRACE_ARRAY_FL_MEMMAP; tr->ref++; } + /* + * Backup buffers can be freed but need vfree(). + */ + if (backup) { + tr->flags |= TRACE_ARRAY_FL_VMALLOC | TRACE_ARRAY_FL_RDONLY; + trace_array_start_autoremove(); + } + + if (start || backup) { + tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; + tr->range_name = no_free_ptr(rname); + } + + /* + * Save the events to start and enabled them after all boot instances + * have been created. + */ + tr->boot_events = curr_str; + } + + /* Enable the events after all boot instances have been created */ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + + if (!tr->boot_events || !(*tr->boot_events)) { + tr->boot_events = NULL; + continue; + } + + curr_str = tr->boot_events; + + /* Clear the instance if this is a persistent buffer */ + if (tr->flags & TRACE_ARRAY_FL_LAST_BOOT) + update_last_data(tr); + while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); } + tr->boot_events = NULL; } } __init static int tracer_alloc_buffers(void) { - int ring_buf_size; + unsigned long ring_buf_size; int ret = -ENOMEM; @@ -10543,7 +9803,7 @@ __init static int tracer_alloc_buffers(void) BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) - goto out; + return -ENOMEM; if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; @@ -10608,19 +9868,21 @@ __init static int tracer_alloc_buffers(void) * just a bootstrap of current_trace anyway. */ global_trace.current_trace = &nop_trace; + global_trace.current_trace_flags = nop_trace.flags; global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT spin_lock_init(&global_trace.snapshot_trigger_lock); #endif ftrace_init_global_array_ops(&global_trace); - init_trace_flags_index(&global_trace); +#ifdef CONFIG_MODULES + INIT_LIST_HEAD(&global_trace.mod_events); +#endif - register_tracer(&nop_trace); + init_trace_flags_index(&global_trace); - /* Function tracing may start here (via kernel command line) */ - init_function_trace(); + INIT_LIST_HEAD(&global_trace.tracers); /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -10632,12 +9894,20 @@ __init static int tracer_alloc_buffers(void) global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + global_trace.syscall_buf_sz = syscall_buf_size; + INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); INIT_LIST_HEAD(&global_trace.err_log); + list_add(&global_trace.marker_list, &marker_copies); list_add(&global_trace.list, &ftrace_trace_arrays); + register_tracer(&nop_trace); + + /* Function tracing may start here (via kernel command line) */ + init_function_trace(); + apply_trace_boot_options(); register_snapshot_cmd(); @@ -10656,33 +9926,21 @@ out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); -out: return ret; } -void __init ftrace_boot_snapshot(void) +#ifdef CONFIG_FUNCTION_TRACER +/* Used to set module cached ftrace filtering at boot up */ +struct trace_array *trace_get_global_array(void) { -#ifdef CONFIG_TRACER_MAX_TRACE - struct trace_array *tr; - - if (!snapshot_at_boot) - return; - - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (!tr->allocated_snapshot) - continue; - - tracing_snapshot_instance(tr); - trace_array_puts(tr, "** Boot snapshot taken **\n"); - } -#endif + return &global_trace; } +#endif void __init early_trace_init(void) { if (tracepoint_printk) { - tracepoint_print_iter = - kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); + tracepoint_print_iter = kzalloc_obj(*tracepoint_print_iter); if (MEM_FAIL(!tracepoint_print_iter, "Failed to allocate trace iterator\n")) tracepoint_printk = 0; @@ -10748,6 +10006,9 @@ __init static int late_trace_init(void) tracepoint_printk = 0; } + if (traceoff_after_boot) + tracing_off(); + tracing_set_default_clock(); clear_boot_tracer(); return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9691b47b5f3d..80fe152af1dd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -21,6 +21,8 @@ #include <linux/workqueue.h> #include <linux/ctype.h> #include <linux/once_lite.h> +#include <linux/ftrace_regs.h> +#include <linux/llist.h> #include "pid_list.h" @@ -66,14 +68,17 @@ enum trace_type { #undef __field_fn #define __field_fn(type, item) type item; +#undef __field_packed +#define __field_packed(type, item) type item; + #undef __field_struct #define __field_struct(type, item) __field(type, item) #undef __field_desc #define __field_desc(type, container, item) -#undef __field_packed -#define __field_packed(type, container, item) +#undef __field_desc_packed +#define __field_desc_packed(type, container, item) #undef __array #define __array(type, item, size) type item[size]; @@ -126,10 +131,12 @@ enum trace_type { #define FAULT_STRING "(fault)" -#define HIST_STACKTRACE_DEPTH 16 +#define HIST_STACKTRACE_DEPTH 31 #define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) #define HIST_STACKTRACE_SKIP 5 +#define SYSCALL_FAULT_USER_MAX 165 + /* * syscalls are special, and need special handling, this is why * they are not included in trace_entries.h @@ -182,8 +189,7 @@ struct trace_array; * the trace, etc.) */ struct trace_array_cpu { - atomic_t disabled; - void *buffer_page; /* ring buffer spare */ + local_t disabled; unsigned long entries; unsigned long saved_latency; @@ -216,7 +222,7 @@ struct array_buffer { int cpu; }; -#define TRACE_FLAGS_MAX_SIZE 32 +#define TRACE_FLAGS_MAX_SIZE 64 struct trace_options { struct tracer *tracer; @@ -258,6 +264,7 @@ static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_li typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); +#ifdef CONFIG_TRACER_SNAPSHOT /** * struct cond_snapshot - conditional snapshot data and callback * @@ -300,6 +307,7 @@ struct cond_snapshot { void *cond_data; cond_update_fn_t update; }; +#endif /* CONFIG_TRACER_SNAPSHOT */ /* * struct trace_func_repeats - used to keep track of the consecutive @@ -312,6 +320,11 @@ struct trace_func_repeats { u64 ts_last_call; }; +struct trace_module_delta { + struct rcu_head rcu; + long delta[]; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -321,35 +334,44 @@ struct trace_array { struct list_head list; char *name; struct array_buffer array_buffer; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT /* - * The max_buffer is used to snapshot the trace when a maximum + * The snapshot_buffer is used to snapshot the trace when a maximum * latency is reached, or when the user initiates a snapshot. * Some tracers will use this to store a maximum trace while * it continues examining live traces. * - * The buffers for the max_buffer are set up the same as the array_buffer - * When a snapshot is taken, the buffer of the max_buffer is swapped - * with the buffer of the array_buffer and the buffers are reset for - * the array_buffer so the tracing can continue. + * The buffers for the snapshot_buffer are set up the same as the + * array_buffer. When a snapshot is taken, the buffer of the + * snapshot_buffer is swapped with the buffer of the array_buffer + * and the buffers are reset for the array_buffer so the tracing can + * continue. */ - struct array_buffer max_buffer; + struct array_buffer snapshot_buffer; bool allocated_snapshot; spinlock_t snapshot_trigger_lock; unsigned int snapshot; +#ifdef CONFIG_TRACER_MAX_TRACE unsigned long max_latency; -#ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; +#ifdef CONFIG_FSNOTIFY struct work_struct fsnotify_work; struct irq_work fsnotify_irqwork; -#endif -#endif +#endif /* CONFIG_FSNOTIFY */ +#endif /* CONFIG_TRACER_MAX_TRACE */ +#endif /* CONFIG_TRACER_SNAPSHOT */ + /* The below is for memory mapped ring buffer */ unsigned int mapped; unsigned long range_addr_start; unsigned long range_addr_size; + char *range_name; long text_delta; - long data_delta; + struct trace_module_delta *module_delta; + void *scratch; /* pointer in persistent memory */ + int scratch_size; + + int buffer_disabled; struct trace_pid_list __rcu *filtered_pids; struct trace_pid_list __rcu *filtered_no_pids; @@ -364,15 +386,14 @@ struct trace_array { * * It is also used in other places outside the update_max_tr * so it needs to be defined outside of the - * CONFIG_TRACER_MAX_TRACE. + * CONFIG_TRACER_SNAPSHOT. */ arch_spinlock_t max_lock; - int buffer_disabled; #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; - struct trace_event_file __rcu *enter_syscall_files[NR_syscalls]; - struct trace_event_file __rcu *exit_syscall_files[NR_syscalls]; + struct trace_event_file *enter_syscall_files[NR_syscalls]; + struct trace_event_file *exit_syscall_files[NR_syscalls]; #endif int stop_count; int clock_id; @@ -381,11 +402,15 @@ struct trace_array { int buffer_percent; unsigned int n_err_log_entries; struct tracer *current_trace; - unsigned int trace_flags; + struct tracer_flags *current_trace_flags; + u64 trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; - const char *system_names; + union { + const char *system_names; + char *boot_events; + }; struct list_head err_log; struct dentry *dir; struct dentry *options; @@ -394,12 +419,17 @@ struct trace_array { struct trace_options *topts; struct list_head systems; struct list_head events; + struct list_head marker_list; + struct list_head tracers; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ /* one per_cpu trace_pipe can be opened by only one user */ cpumask_var_t pipe_cpumask; int ref; int trace_ref; +#ifdef CONFIG_MODULES + struct list_head mod_events; +#endif #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; @@ -417,6 +447,7 @@ struct trace_array { int function_enabled; #endif int no_filter_buffering_ref; + unsigned int syscall_buf_sz; struct list_head hist_vars; #ifdef CONFIG_TRACER_SNAPSHOT struct cond_snapshot *cond_snapshot; @@ -427,13 +458,33 @@ struct trace_array { * we do not waste memory on systems that are not using tracing. */ bool ring_buffer_expanded; + /* + * If the ring buffer is a read only backup instance, it will be + * removed after dumping all data via pipe, because no readable data. + */ + bool free_on_close; + struct work_struct autoremove_work; }; enum { - TRACE_ARRAY_FL_GLOBAL = BIT(0), - TRACE_ARRAY_FL_BOOT = BIT(1), + TRACE_ARRAY_FL_GLOBAL = BIT(0), + TRACE_ARRAY_FL_BOOT = BIT(1), + TRACE_ARRAY_FL_LAST_BOOT = BIT(2), + TRACE_ARRAY_FL_MOD_INIT = BIT(3), + TRACE_ARRAY_FL_MEMMAP = BIT(4), + TRACE_ARRAY_FL_VMALLOC = BIT(5), + TRACE_ARRAY_FL_RDONLY = BIT(6), }; +#ifdef CONFIG_MODULES +bool module_exists(const char *module); +#else +static inline bool module_exists(const char *module) +{ + return false; +} +#endif + extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; @@ -444,11 +495,20 @@ extern struct trace_array *trace_array_find(const char *instance); extern struct trace_array *trace_array_find_get(const char *instance); extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe); -extern int tracing_set_filter_buffering(struct trace_array *tr, bool set); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); +extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr); + +extern struct trace_array *printk_trace; + +static inline bool trace_array_is_readonly(struct trace_array *tr) +{ + /* backup instance is read only. */ + return tr->flags & TRACE_ARRAY_FL_RDONLY; +} + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. @@ -604,9 +664,10 @@ struct tracer { u32 old_flags, u32 bit, int set); /* Return 0 if OK with change, else return non-zero */ int (*flag_changed)(struct trace_array *tr, - u32 mask, int set); + u64 mask, int set); struct tracer *next; struct tracer_flags *flags; + struct tracer_flags *default_flags; int enabled; bool print_max; bool allow_instances; @@ -623,6 +684,8 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL; } +extern int tracing_disabled; + int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void tracing_reset_online_cpus(struct array_buffer *buf); @@ -630,19 +693,78 @@ void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); +int tracing_release(struct inode *inode, struct file *file); int tracing_release_generic_tr(struct inode *inode, struct file *file); int tracing_open_file_tr(struct inode *inode, struct file *filp); int tracing_release_file_tr(struct inode *inode, struct file *filp); int tracing_single_release_file_tr(struct inode *inode, struct file *filp); -bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); +void tracer_tracing_disable(struct trace_array *tr); +void tracer_tracing_enable(struct trace_array *tr); +int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); +struct dentry *trace_create_cpu_file(const char *name, + umode_t mode, + struct dentry *parent, + void *data, + long cpu, + const struct file_operations *fops); + +struct trace_iterator *__tracing_open(struct inode *inode, struct file *file, + bool snapshot); +int tracing_buffers_open(struct inode *inode, struct file *filp); +ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos); +int tracing_buffers_release(struct inode *inode, struct file *file); +ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, + size_t cnt, loff_t *ppos); +ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, + size_t cnt, loff_t *ppos); + +void trace_set_buffer_entries(struct array_buffer *buf, unsigned long val); + +/* + * Should be used after trace_array_get(), trace_types_lock + * ensures that i_cdev was already initialized. + */ +static inline int tracing_get_cpu(struct inode *inode) +{ + if (inode->i_cdev) /* See trace_create_cpu_file() */ + return (long)inode->i_cdev - 1; + return RING_BUFFER_ALL_CPUS; +} +void tracing_reset_cpu(struct array_buffer *buf, int cpu); + +struct ftrace_buffer_info { + struct trace_iterator iter; + void *spare; + unsigned int spare_cpu; + unsigned int spare_size; + unsigned int read; +}; + +/** + * tracer_tracing_is_on_cpu - show real state of ring buffer enabled on for a cpu + * @tr : the trace array to know if ring buffer is enabled + * @cpu: The cpu buffer to check if enabled + * + * Shows real state of the per CPU buffer if it is enabled or not. + */ +static inline bool tracer_tracing_is_on_cpu(struct trace_array *tr, int cpu) +{ + if (tr->array_buffer.buffer) + return ring_buffer_record_is_on_cpu(tr->array_buffer.buffer, cpu); + return false; +} int tracing_init_dentry(void); @@ -684,7 +806,8 @@ unsigned long trace_total_entries(struct trace_array *tr); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned int trace_ctx); + unsigned int trace_ctx, + struct ftrace_regs *regs); void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, @@ -693,8 +816,10 @@ void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); -void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops); -int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops); +void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); +int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); @@ -714,11 +839,10 @@ extern cpumask_var_t __read_mostly tracing_buffer_mask; extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; +extern struct workqueue_struct *trace_init_wq __initdata; /* PID filtering */ -extern int pid_max; - bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid); bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, @@ -734,16 +858,16 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, struct trace_pid_list **new_pid_list, const char __user *ubuf, size_t cnt); -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, void *cond_data); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -#ifdef CONFIG_FSNOTIFY -#define LATENCY_FS_NOTIFY +#if defined(CONFIG_TRACER_MAX_TRACE) && defined(CONFIG_FSNOTIFY) +# define LATENCY_FS_NOTIFY #endif -#endif /* CONFIG_TRACER_MAX_TRACE */ +#endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef LATENCY_FS_NOTIFY void latency_fsnotify(struct trace_array *tr); @@ -760,6 +884,22 @@ static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, } #endif /* CONFIG_STACKTRACE */ +#ifdef CONFIG_TRACER_MAX_TRACE +static inline bool tracer_uses_snapshot(struct tracer *tracer) +{ + return tracer->use_max_tr; +} +void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer); +#else +static inline bool tracer_uses_snapshot(struct tracer *tracer) +{ + return false; +} +static inline void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) { } +#endif + void trace_last_func_repeats(struct trace_array *tr, struct trace_func_repeats *last_info, unsigned int trace_ctx); @@ -770,6 +910,8 @@ extern void trace_find_cmdline(int pid, char comm[]); extern int trace_find_tgid(int pid); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); +extern int trace_events_enabled(struct trace_array *tr, const char *system); + #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; extern unsigned long ftrace_number_of_pages; @@ -785,6 +927,8 @@ extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); +void __init trace_append_boot_param(char *buf, const char *str, + char sep, int size); extern void trace_set_ring_buffer_expanded(struct trace_array *tr); extern bool tracing_selftest_disabled; @@ -807,6 +951,7 @@ extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); +extern bool __read_mostly tracing_selftest_running; /* * Tracer data references selftest functions that only occur * on boot up. These can be __init functions. Thus, when selftests @@ -819,17 +964,20 @@ static inline void __init disable_tracing_selftest(const char *reason) } /* Tracers are seldom changed. Optimize when selftests are disabled. */ #define __tracer_data __read_mostly +#define tracing_selftest_running 0 #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); extern unsigned long long ns2usecs(u64 nsec); -extern int -trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_vprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args); + +__printf(2, 0) +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +__printf(2, 0) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +__printf(3, 0) +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); @@ -884,11 +1032,10 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_RETVAL 0x800 #define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 #define TRACE_GRAPH_PRINT_RETADDR 0x2000 +#define TRACE_GRAPH_ARGS 0x4000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) -extern void ftrace_graph_sleep_time_control(bool enable); - #ifdef CONFIG_FUNCTION_PROFILER extern void ftrace_graph_graph_time_control(bool enable); #else @@ -908,10 +1055,13 @@ extern int __trace_graph_entry(struct trace_array *tr, extern int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx, - unsigned long retaddr); + unsigned long retaddr, + struct ftrace_regs *fregs); extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, - unsigned int trace_ctx); + unsigned int trace_ctx, + u64 calltime, u64 rettime); + extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops); extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops); extern void free_fgraph_ops(struct trace_array *tr); @@ -1057,7 +1207,8 @@ static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftra #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; -extern bool fgraph_sleep_time; +extern int fgraph_no_sleep_time; +extern bool fprofile_no_sleep_time; static inline bool ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace) @@ -1102,11 +1253,6 @@ struct ftrace_func_command { char *params, int enable); }; extern bool ftrace_filter_param __initdata; -static inline int ftrace_trace_task(struct trace_array *tr) -{ - return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) != - FTRACE_PID_IGNORE; -} extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent); @@ -1114,6 +1260,7 @@ void ftrace_destroy_function_files(struct trace_array *tr); int ftrace_allocate_ftrace_ops(struct trace_array *tr); void ftrace_free_ftrace_ops(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); +struct trace_array *trace_get_global_array(void); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); @@ -1123,10 +1270,6 @@ void ftrace_clear_pids(struct trace_array *tr); int init_function_trace(void); void ftrace_pid_follow_fork(struct trace_array *tr, bool enable); #else -static inline int ftrace_trace_task(struct trace_array *tr) -{ - return 1; -} static inline int ftrace_is_dead(void) { return 0; } static inline int ftrace_create_function_files(struct trace_array *tr, @@ -1239,6 +1382,7 @@ bool ftrace_event_is_function(struct trace_event_call *call); */ struct trace_parser { bool cont; + bool fail; char *buffer; unsigned idx; unsigned size; @@ -1246,7 +1390,7 @@ struct trace_parser { static inline bool trace_parser_loaded(struct trace_parser *parser) { - return (parser->idx != 0); + return !parser->fail && parser->idx != 0; } static inline bool trace_parser_cont(struct trace_parser *parser) @@ -1260,6 +1404,11 @@ static inline void trace_parser_clear(struct trace_parser *parser) parser->idx = 0; } +static inline void trace_parser_fail(struct trace_parser *parser) +{ + parser->fail = true; +} + extern int trace_parser_get_init(struct trace_parser *parser, int size); extern void trace_parser_put(struct trace_parser *parser); extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, @@ -1286,11 +1435,11 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, # define FUNCTION_FLAGS \ C(FUNCTION, "function-trace"), \ C(FUNC_FORK, "function-fork"), -# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION +# define FUNCTION_DEFAULT_FLAGS TRACE_ITER(FUNCTION) #else # define FUNCTION_FLAGS # define FUNCTION_DEFAULT_FLAGS 0UL -# define TRACE_ITER_FUNC_FORK 0UL +# define TRACE_ITER_FUNC_FORK_BIT -1 #endif #ifdef CONFIG_STACKTRACE @@ -1300,6 +1449,24 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, # define STACK_FLAGS #endif +#ifdef CONFIG_FUNCTION_PROFILER +# define PROFILER_FLAGS \ + C(PROF_TEXT_OFFSET, "prof-text-offset"), +# ifdef CONFIG_FUNCTION_GRAPH_TRACER +# define FPROFILE_FLAGS \ + C(GRAPH_TIME, "graph-time"), +# define FPROFILE_DEFAULT_FLAGS TRACE_ITER(GRAPH_TIME) +# else +# define FPROFILE_FLAGS +# define FPROFILE_DEFAULT_FLAGS 0UL +# endif +#else +# define PROFILER_FLAGS +# define FPROFILE_FLAGS +# define FPROFILE_DEFAULT_FLAGS 0UL +# define TRACE_ITER_PROF_TEXT_OFFSET_BIT -1 +#endif + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. @@ -1332,12 +1499,16 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ C(TRACE_PRINTK, "trace_printk_dest"), \ + C(COPY_MARKER, "copy_trace_marker"), \ C(PAUSE_ON_TRACE, "pause-on-trace"), \ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ + C(BITMASK_LIST, "bitmask-list"), \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ - BRANCH_FLAGS + BRANCH_FLAGS \ + PROFILER_FLAGS \ + FPROFILE_FLAGS /* * By defining C, we can make TRACE_FLAGS a list of bit names @@ -1353,20 +1524,17 @@ enum trace_iterator_bits { }; /* - * By redefining C, we can make TRACE_FLAGS a list of masks that - * use the bits as defined above. + * And use TRACE_ITER(flag) to define the bit masks. */ -#undef C -#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT) - -enum trace_iterator_flags { TRACE_FLAGS }; +#define TRACE_ITER(flag) \ + (TRACE_ITER_##flag##_BIT < 0 ? 0 : 1ULL << (TRACE_ITER_##flag##_BIT)) /* * TRACE_ITER_SYM_MASK masks the options in trace_flags that * control the output of kernel symbols. */ #define TRACE_ITER_SYM_MASK \ - (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + (TRACE_ITER(PRINT_PARENT)|TRACE_ITER(SYM_OFFSET)|TRACE_ITER(SYM_ADDR)) extern struct tracer nop_trace; @@ -1375,7 +1543,7 @@ extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); static inline int trace_branch_enable(struct trace_array *tr) { - if (tr->trace_flags & TRACE_ITER_BRANCH) + if (tr->trace_flags & TRACE_ITER(BRANCH)) return enable_branch_tracing(tr); return 0; } @@ -1471,6 +1639,64 @@ void trace_buffered_event_enable(void); void early_enable_events(struct trace_array *tr, char *buf, bool disable_first); +struct trace_user_buf; +struct trace_user_buf_info { + struct trace_user_buf __percpu *tbuf; + size_t size; + int ref; +}; + +typedef int (*trace_user_buf_copy)(char *dst, const char __user *src, + size_t size, void *data); +int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size); +int trace_user_fault_get(struct trace_user_buf_info *tinfo); +int trace_user_fault_put(struct trace_user_buf_info *tinfo); +void trace_user_fault_destroy(struct trace_user_buf_info *tinfo); +char *trace_user_fault_read(struct trace_user_buf_info *tinfo, + const char __user *ptr, size_t size, + trace_user_buf_copy copy_func, void *data); + +static __always_inline void +trace_event_setup(struct ring_buffer_event *event, + int type, unsigned int trace_ctx) +{ + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, type, trace_ctx); +} + +static __always_inline struct ring_buffer_event * +__trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, + unsigned int trace_ctx) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(buffer, len); + if (event != NULL) + trace_event_setup(event, type, trace_ctx); + + return event; +} + +static __always_inline void +__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_taskinfo_save, true); + + /* If this is the temp buffer, we need to commit fully */ + if (this_cpu_read(trace_buffered_event) == event) { + /* Length is in event->array[0] */ + ring_buffer_write(buffer, event->array[0], &event->array[1]); + /* Release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + /* ring_buffer_unlock_commit() enables preemption */ + preempt_enable_notrace(); + } else + ring_buffer_unlock_commit(buffer); +} + static inline void __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) @@ -1643,11 +1869,6 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr, const char *system, const char *event); -static inline void *event_file_data(struct file *filp) -{ - return READ_ONCE(file_inode(filp)->i_private); -} - extern struct mutex event_mutex; extern struct list_head ftrace_events; @@ -1668,12 +1889,22 @@ static inline struct trace_event_file *event_file_file(struct file *filp) struct trace_event_file *file; lockdep_assert_held(&event_mutex); - file = READ_ONCE(file_inode(filp)->i_private); + file = file_inode(filp)->i_private; if (!file || file->flags & EVENT_FILE_FL_FREED) return NULL; return file; } +static inline void *event_file_data(struct file *filp) +{ + struct trace_event_file *file; + + lockdep_assert_held(&event_mutex); + file = file_inode(filp)->i_private; + WARN_ON(!file || file->flags & EVENT_FILE_FL_FREED); + return file; +} + extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; extern const struct file_operations event_hist_debug_fops; @@ -1692,13 +1923,13 @@ extern void clear_event_triggers(struct trace_array *tr); enum { EVENT_TRIGGER_FL_PROBE = BIT(0), + EVENT_TRIGGER_FL_COUNT = BIT(1), }; struct event_trigger_data { unsigned long count; int ref; int flags; - struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; char *filter_str; @@ -1709,6 +1940,7 @@ struct event_trigger_data { char *name; struct list_head named_list; struct event_trigger_data *named_data; + struct llist_node llist; }; /* Avoid typos */ @@ -1723,6 +1955,10 @@ struct enable_trigger_data { bool hist; }; +bool event_trigger_count(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event); + extern int event_enable_trigger_print(struct seq_file *m, struct event_trigger_data *data); extern void event_enable_trigger_free(struct event_trigger_data *data); @@ -1736,6 +1972,9 @@ extern int event_enable_register_trigger(char *glob, extern void event_enable_unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file); +extern struct event_trigger_data * +trigger_data_alloc(struct event_command *cmd_ops, char *cmd, char *param, + void *private_data); extern void trigger_data_free(struct event_trigger_data *data); extern int event_trigger_init(struct event_trigger_data *data); extern int trace_event_trigger_enable_disable(struct trace_event_file *file, @@ -1762,11 +2001,6 @@ extern bool event_trigger_check_remove(const char *glob); extern bool event_trigger_empty_param(const char *param); extern int event_trigger_separate_filter(char *param_and_filter, char **param, char **filter, bool param_required); -extern struct event_trigger_data * -event_trigger_alloc(struct event_command *cmd_ops, - char *cmd, - char *param, - void *private_data); extern int event_trigger_parse_num(char *trigger, struct event_trigger_data *trigger_data); extern int event_trigger_set_filter(struct event_command *cmd_ops, @@ -1788,64 +2022,6 @@ extern void event_file_get(struct trace_event_file *file); extern void event_file_put(struct trace_event_file *file); /** - * struct event_trigger_ops - callbacks for trace event triggers - * - * The methods in this structure provide per-event trigger hooks for - * various trigger operations. - * - * The @init and @free methods are used during trigger setup and - * teardown, typically called from an event_command's @parse() - * function implementation. - * - * The @print method is used to print the trigger spec. - * - * The @trigger method is the function that actually implements the - * trigger and is called in the context of the triggering event - * whenever that event occurs. - * - * All the methods below, except for @init() and @free(), must be - * implemented. - * - * @trigger: The trigger 'probe' function called when the triggering - * event occurs. The data passed into this callback is the data - * that was supplied to the event_command @reg() function that - * registered the trigger (see struct event_command) along with - * the trace record, rec. - * - * @init: An optional initialization function called for the trigger - * when the trigger is registered (via the event_command reg() - * function). This can be used to perform per-trigger - * initialization such as incrementing a per-trigger reference - * count, for instance. This is usually implemented by the - * generic utility function @event_trigger_init() (see - * trace_event_triggers.c). - * - * @free: An optional de-initialization function called for the - * trigger when the trigger is unregistered (via the - * event_command @reg() function). This can be used to perform - * per-trigger de-initialization such as decrementing a - * per-trigger reference count and freeing corresponding trigger - * data, for instance. This is usually implemented by the - * generic utility function @event_trigger_free() (see - * trace_event_triggers.c). - * - * @print: The callback function invoked to have the trigger print - * itself. This is usually implemented by a wrapper function - * that calls the generic utility function @event_trigger_print() - * (see trace_event_triggers.c). - */ -struct event_trigger_ops { - void (*trigger)(struct event_trigger_data *data, - struct trace_buffer *buffer, - void *rec, - struct ring_buffer_event *rbe); - int (*init)(struct event_trigger_data *data); - void (*free)(struct event_trigger_data *data); - int (*print)(struct seq_file *m, - struct event_trigger_data *data); -}; - -/** * struct event_command - callbacks and data members for event commands * * Event commands are invoked by users by writing the command name @@ -1894,7 +2070,7 @@ struct event_trigger_ops { * * @reg: Adds the trigger to the list of triggers associated with the * event, and enables the event trigger itself, after - * initializing it (via the event_trigger_ops @init() function). + * initializing it (via the event_command @init() function). * This is also where commands can use the @trigger_type value to * make the decision as to whether or not multiple instances of * the trigger should be allowed. This is usually implemented by @@ -1903,7 +2079,7 @@ struct event_trigger_ops { * * @unreg: Removes the trigger from the list of triggers associated * with the event, and disables the event trigger itself, after - * initializing it (via the event_trigger_ops @free() function). + * initializing it (via the event_command @free() function). * This is usually implemented by the generic utility function * @unregister_trigger() (see trace_event_triggers.c). * @@ -1917,12 +2093,41 @@ struct event_trigger_ops { * ignored. This is usually implemented by the generic utility * function @set_trigger_filter() (see trace_event_triggers.c). * - * @get_trigger_ops: The callback function invoked to retrieve the - * event_trigger_ops implementation associated with the command. - * This callback function allows a single event_command to - * support multiple trigger implementations via different sets of - * event_trigger_ops, depending on the value of the @param - * string. + * All the methods below, except for @init() and @free(), must be + * implemented. + * + * @trigger: The trigger 'probe' function called when the triggering + * event occurs. The data passed into this callback is the data + * that was supplied to the event_command @reg() function that + * registered the trigger (see struct event_command) along with + * the trace record, rec. + * + * @count_func: If defined and a numeric parameter is passed to the + * trigger, then this function will be called before @trigger + * is called. If this function returns false, then @trigger is not + * executed. + * + * @init: An optional initialization function called for the trigger + * when the trigger is registered (via the event_command reg() + * function). This can be used to perform per-trigger + * initialization such as incrementing a per-trigger reference + * count, for instance. This is usually implemented by the + * generic utility function @event_trigger_init() (see + * trace_event_triggers.c). + * + * @free: An optional de-initialization function called for the + * trigger when the trigger is unregistered (via the + * event_command @reg() function). This can be used to perform + * per-trigger de-initialization such as decrementing a + * per-trigger reference count and freeing corresponding trigger + * data, for instance. This is usually implemented by the + * generic utility function @event_trigger_free() (see + * trace_event_triggers.c). + * + * @print: The callback function invoked to have the trigger print + * itself. This is usually implemented by a wrapper function + * that calls the generic utility function @event_trigger_print() + * (see trace_event_triggers.c). */ struct event_command { struct list_head list; @@ -1943,7 +2148,18 @@ struct event_command { int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); - struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); + void (*trigger)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); + bool (*count_func)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); + int (*init)(struct event_trigger_data *data); + void (*free)(struct event_trigger_data *data); + int (*print)(struct seq_file *m, + struct event_trigger_data *data); }; /** @@ -1964,7 +2180,7 @@ struct event_command { * either committed or discarded. At that point, if any commands * have deferred their triggers, those commands are finally * invoked following the close of the current event. In other - * words, if the event_trigger_ops @func() probe implementation + * words, if the event_command @func() probe implementation * itself logs to the trace buffer, this flag should be set, * otherwise it can be left unspecified. * @@ -1991,12 +2207,6 @@ static inline bool event_command_needs_rec(struct event_command *cmd_ops) extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); -extern int tracing_alloc_snapshot(void); -extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data); -extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); - -extern int tracing_snapshot_cond_disable(struct trace_array *tr); -extern void *tracing_cond_snapshot_data(struct trace_array *tr); extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; @@ -2006,8 +2216,9 @@ extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); void trace_printk_start_comm(void); -int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); -int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); +void trace_printk_start_stop_comm(int enabled); +int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set); +int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled); /* Used from boot time tracer */ extern int trace_set_options(struct trace_array *tr, char *option); @@ -2038,7 +2249,7 @@ extern void tracing_log_err(struct trace_array *tr, * about performance). The internal_trace_puts() is for such * a purpose. */ -#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str) #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ @@ -2073,29 +2284,81 @@ static inline const char *get_syscall_name(int syscall) #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); -void trace_event_eval_update(struct trace_eval_map **map, int len); +void trace_event_update_all(struct trace_eval_map **map, int len); /* Used from boot time tracer */ extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); extern int trigger_process_regex(struct trace_event_file *file, char *buff); #else static inline void __init trace_event_init(void) { } -static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } +static inline void trace_event_update_all(struct trace_eval_map **map, int len) { } #endif #ifdef CONFIG_TRACER_SNAPSHOT +extern const struct file_operations snapshot_fops; +extern const struct file_operations snapshot_raw_fops; + +/* Used when creating instances */ +int trace_allocate_snapshot(struct trace_array *tr, int size); + +int tracing_alloc_snapshot(void); +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data); +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); +int tracing_snapshot_cond_disable(struct trace_array *tr); +void *tracing_cond_snapshot_data(struct trace_array *tr); void tracing_snapshot_instance(struct trace_array *tr); int tracing_alloc_snapshot_instance(struct trace_array *tr); +int tracing_arm_snapshot_locked(struct trace_array *tr); int tracing_arm_snapshot(struct trace_array *tr); void tracing_disarm_snapshot(struct trace_array *tr); -#else +void free_snapshot(struct trace_array *tr); +void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter); +int get_snapshot_map(struct trace_array *tr); +void put_snapshot_map(struct trace_array *tr); +int resize_buffer_duplicate_size(struct array_buffer *trace_buf, + struct array_buffer *size_buf, int cpu_id); +__init void do_allocate_snapshot(const char *name); +# ifdef CONFIG_DYNAMIC_FTRACE +__init int register_snapshot_cmd(void); +# else +static inline int register_snapshot_cmd(void) { return 0; } +# endif +#else /* !CONFIG_TRACER_SNAPSHOT */ +static inline int trace_allocate_snapshot(struct trace_array *tr, int size) { return 0; } static inline void tracing_snapshot_instance(struct trace_array *tr) { } static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) { return 0; } +static inline int tracing_arm_snapshot_locked(struct trace_array *tr) { return -EBUSY; } static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; } static inline void tracing_disarm_snapshot(struct trace_array *tr) { } -#endif +static inline void free_snapshot(struct trace_array *tr) {} +static inline void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); +} +static inline void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + return NULL; +} +static inline int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) +{ + return -ENODEV; +} +static inline int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + return false; +} +static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + /* Should never be called */ + WARN_ONCE(1, "Snapshot print function called without snapshot configured"); +} +static inline int get_snapshot_map(struct trace_array *tr) { return 0; } +static inline void put_snapshot_map(struct trace_array *tr) { } +static inline void do_allocate_snapshot(const char *name) { } +static inline int register_snapshot_cmd(void) { return 0; } +#endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef CONFIG_PREEMPT_TRACER void tracer_preempt_on(unsigned long a0, unsigned long a1); @@ -2152,10 +2415,41 @@ static inline bool is_good_system_name(const char *name) static inline void sanitize_event_name(char *name) { while (*name++ != '\0') - if (*name == ':' || *name == '.') + if (*name == ':' || *name == '.' || *name == '*') *name = '_'; } +#ifdef CONFIG_STACKTRACE +void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs); + +static __always_inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs) +{ + if (!(tr->trace_flags & TRACE_ITER(STACKTRACE))) + return; + + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); +} +#else +static inline void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs) +{ +} +static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned long trace_ctx, + int skip, struct pt_regs *regs) +{ +} +#endif + /* * This is a generic way to read and write a u64 value from a file in tracefs. * @@ -2190,4 +2484,25 @@ static inline int rv_init_interface(void) */ #define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX) +/* + * This is used to get the address of the args array based on + * the type of the entry. + */ +#define FGRAPH_ENTRY_ARGS(e) \ + ({ \ + unsigned long *_args; \ + struct ftrace_graph_ent_entry *_e = e; \ + \ + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && \ + e->ent.type == TRACE_GRAPH_RETADDR_ENT) { \ + struct fgraph_retaddr_ent_entry *_re; \ + \ + _re = (typeof(_re))_e; \ + _args = _re->args; \ + } else { \ + _args = _e->args; \ + } \ + _args; \ + }) + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index dbe29b4c6a7a..2ca2541c8a58 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -61,7 +61,8 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) v = memparse(p, NULL); if (v < PAGE_SIZE) pr_err("Buffer size is too small: %s\n", p); - if (tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0) + if (trace_array_is_readonly(tr) || + tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0) pr_err("Failed to resize trace buffer to %s\n", p); } @@ -597,7 +598,7 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node) p = xbc_node_find_value(node, "tracer", NULL); if (p && *p != '\0') { - if (tracing_set_tracer(tr, p) < 0) + if (trace_array_is_readonly(tr) || tracing_set_tracer(tr, p) < 0) pr_err("Failed to set given tracer: %s\n", p); } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6d08a5523ce0..d1564db95a8f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,7 +32,6 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { struct trace_array *tr = branch_tracer; struct trace_buffer *buffer; - struct trace_array_cpu *data; struct ring_buffer_event *event; struct trace_branch *entry; unsigned long flags; @@ -54,8 +53,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) raw_local_irq_save(flags); current->trace_recursion |= TRACE_BRANCH_BIT; - data = this_cpu_ptr(tr->array_buffer.data); - if (atomic_read(&data->disabled)) + if (!tracer_tracing_is_on_cpu(tr, raw_smp_processor_id())) goto out; trace_ctx = tracing_gen_ctx_flags(flags); @@ -375,10 +373,10 @@ __init static int init_annotated_branch_stats(void) int ret; ret = register_stat_tracer(&annotated_branch_stats); - if (!ret) { + if (ret) { printk(KERN_WARNING "Warning: could not register " "annotated branches stats\n"); - return 1; + return ret; } return 0; } @@ -440,10 +438,10 @@ __init static int all_annotated_branch_stats(void) int ret; ret = register_stat_tracer(&all_branch_stats); - if (!ret) { + if (ret) { printk(KERN_WARNING "Warning: could not register " "all branches stats\n"); - return 1; + return ret; } return 0; } diff --git a/kernel/trace/trace_btf.c b/kernel/trace/trace_btf.c index 5bbdbcbbde3c..00172f301f25 100644 --- a/kernel/trace/trace_btf.c +++ b/kernel/trace/trace_btf.c @@ -78,7 +78,7 @@ const struct btf_member *btf_find_struct_member(struct btf *btf, const char *name; int i, top = 0; - anon_stack = kcalloc(BTF_ANON_STACK_MAX, sizeof(*anon_stack), GFP_KERNEL); + anon_stack = kzalloc_objs(*anon_stack, BTF_ANON_STACK_MAX); if (!anon_stack) return ERR_PTR(-ENOMEM); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 4376887e0d8a..c4dfbc293bae 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -16,7 +16,7 @@ #include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" -static DEFINE_MUTEX(dyn_event_ops_mutex); +DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) @@ -74,24 +74,19 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type struct dyn_event *pos, *n; char *system = NULL, *event, *p; int argc, ret = -ENOENT; - char **argv; + char **argv __free(argv_free) = argv_split(GFP_KERNEL, raw_command, &argc); - argv = argv_split(GFP_KERNEL, raw_command, &argc); if (!argv) return -ENOMEM; if (argv[0][0] == '-') { - if (argv[0][1] != ':') { - ret = -EINVAL; - goto out; - } + if (argv[0][1] != ':') + return -EINVAL; event = &argv[0][2]; } else { event = strchr(argv[0], ':'); - if (!event) { - ret = -EINVAL; - goto out; - } + if (!event) + return -EINVAL; event++; } @@ -101,10 +96,8 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type event = p + 1; *p = '\0'; } - if (!system && event[0] == '\0') { - ret = -EINVAL; - goto out; - } + if (!system && event[0] == '\0') + return -EINVAL; mutex_lock(&event_mutex); for_each_dyn_event_safe(pos, n) { @@ -120,8 +113,20 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type } tracing_reset_all_online_cpus(); mutex_unlock(&event_mutex); -out: - argv_free(argv); + return ret; +} + +/* + * Locked version of event creation. The event creation must be protected by + * dyn_event_ops_mutex because of protecting trace_probe_log. + */ +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type) +{ + int ret; + + mutex_lock(&dyn_event_ops_mutex); + ret = type->create(raw_command); + mutex_unlock(&dyn_event_ops_mutex); return ret; } @@ -139,9 +144,16 @@ static int create_dyn_event(const char *raw_command) if (!ret || ret != -ECANCELED) break; } - mutex_unlock(&dyn_event_ops_mutex); - if (ret == -ECANCELED) + if (ret == -ECANCELED) { + static const char *err_msg[] = {"No matching dynamic event type"}; + + /* Wrong dynamic event. Leave an error message. */ + tracing_log_err(NULL, "dynevent", raw_command, err_msg, + 0, 0); ret = -EINVAL; + } + + mutex_unlock(&dyn_event_ops_mutex); return ret; } @@ -225,6 +237,10 @@ static int dyn_event_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = tracing_check_open_get_tr(NULL); if (ret) return ret; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 936477a111d3..beee3f8d7544 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); void dyn_event_seq_stop(struct seq_file *m, void *v); int dyn_events_release_all(struct dyn_event_operations *type); int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type); /* * for_each_dyn_event - iterate over the dyn_event list diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 82fd174ebbe0..54417468fdeb 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -61,8 +61,9 @@ FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, F_STRUCT( - __field_fn( unsigned long, ip ) - __field_fn( unsigned long, parent_ip ) + __field_fn( unsigned long, ip ) + __field_fn( unsigned long, parent_ip ) + __dynamic_array( unsigned long, args ) ), F_printk(" %ps <-- %ps", @@ -72,17 +73,18 @@ FTRACE_ENTRY_REG(function, ftrace_entry, ); /* Function call entry */ -FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, TRACE_GRAPH_ENT, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) - __field_packed( unsigned long, graph_ent, func ) - __field_packed( int, graph_ent, depth ) + __field_desc_packed(unsigned long, graph_ent, func ) + __field_desc_packed(unsigned long, graph_ent, depth ) + __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) + F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth) ); #ifdef CONFIG_FUNCTION_GRAPH_RETADDR @@ -93,13 +95,14 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry, TRACE_GRAPH_RETADDR_ENT, F_STRUCT( - __field_struct( struct fgraph_retaddr_ent, graph_ent ) - __field_packed( unsigned long, graph_ent, func ) - __field_packed( int, graph_ent, depth ) - __field_packed( unsigned long, graph_ent, retaddr ) + __field_struct( struct fgraph_retaddr_ent, graph_rent ) + __field_desc_packed( unsigned long, graph_rent.ent, func ) + __field_desc_packed( unsigned long, graph_rent.ent, depth ) + __field_desc_packed( unsigned long, graph_rent, retaddr ) + __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth, + F_printk("--> %ps (%lu) <- %ps", (void *)__entry->func, __entry->depth, (void *)__entry->retaddr) ); @@ -120,15 +123,15 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) - __field_packed( unsigned long, ret, func ) - __field_packed( unsigned long, ret, retval ) - __field_packed( int, ret, depth ) - __field_packed( unsigned int, ret, overrun ) - __field_packed( unsigned long long, ret, calltime) - __field_packed( unsigned long long, ret, rettime ) + __field_desc_packed( unsigned long, ret, func ) + __field_desc_packed( unsigned long, ret, retval ) + __field_desc_packed( unsigned int, ret, depth ) + __field_desc_packed( unsigned int, ret, overrun ) + __field_packed(unsigned long long, calltime) + __field_packed(unsigned long long, rettime ) ), - F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", + F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u retval: %lx", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth, __entry->retval) @@ -143,14 +146,14 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) - __field_packed( unsigned long, ret, func ) - __field_packed( int, ret, depth ) - __field_packed( unsigned int, ret, overrun ) - __field_packed( unsigned long long, ret, calltime) - __field_packed( unsigned long long, ret, rettime ) + __field_desc_packed( unsigned long, ret, func ) + __field_desc_packed( unsigned int, ret, depth ) + __field_desc_packed( unsigned int, ret, overrun ) + __field_packed(unsigned long long, calltime ) + __field_packed(unsigned long long, rettime ) ), - F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", + F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index be8be0c1aaf0..b66d6196338d 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -9,14 +9,15 @@ * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com> * */ +#include <linux/cleanup.h> +#include <linux/ftrace.h> #include <linux/module.h> #include <linux/mutex.h> -#include <linux/ftrace.h> #include "trace_dynevent.h" #include "trace_probe.h" -#include "trace_probe_tmpl.h" #include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" #define EPROBE_EVENT_SYSTEM "eprobes" @@ -60,6 +61,9 @@ static void trace_event_probe_cleanup(struct trace_eprobe *ep) kfree(ep); } +DEFINE_FREE(trace_event_probe_cleanup, struct trace_eprobe *, + if (!IS_ERR_OR_NULL(_T)) trace_event_probe_cleanup(_T)) + static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev) { return container_of(ev, struct trace_eprobe, devent); @@ -196,10 +200,10 @@ static struct trace_eprobe *alloc_event_probe(const char *group, struct trace_event_call *event, int nargs) { - struct trace_eprobe *ep; + struct trace_eprobe *ep __free(trace_event_probe_cleanup) = NULL; const char *event_name; const char *sys_name; - int ret = -ENOMEM; + int ret; if (!event) return ERR_PTR(-ENODEV); @@ -207,28 +211,25 @@ static struct trace_eprobe *alloc_event_probe(const char *group, sys_name = event->class->system; event_name = trace_event_name(event); - ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL); + ep = kzalloc_flex(*ep, tp.args, nargs); if (!ep) { trace_event_put_ref(event); - goto error; + return ERR_PTR(-ENOMEM); } ep->event = event; ep->event_name = kstrdup(event_name, GFP_KERNEL); if (!ep->event_name) - goto error; + return ERR_PTR(-ENOMEM); ep->event_system = kstrdup(sys_name, GFP_KERNEL); if (!ep->event_system) - goto error; + return ERR_PTR(-ENOMEM); ret = trace_probe_init(&ep->tp, this_event, group, false, nargs); if (ret < 0) - goto error; + return ERR_PTR(ret); dyn_event_init(&ep->devent, &eprobe_dyn_event_ops); - return ep; -error: - trace_event_probe_cleanup(ep); - return ERR_PTR(ret); + return_ptr(ep); } static int eprobe_event_define_fields(struct trace_event_call *event_call) @@ -343,10 +344,15 @@ get_event_field(struct fetch_insn *code, void *rec) val = *(unsigned int *)addr; break; default: - if (field->is_signed) - val = *(long *)addr; - else - val = *(unsigned long *)addr; + if (field->size == sizeof(long)) { + if (field->is_signed) + val = *(long *)addr; + else + val = *(unsigned long *)addr; + break; + } + /* This is an array, point to the addr itself */ + val = (unsigned long)addr; break; } return val; @@ -478,13 +484,6 @@ static void eprobe_trigger_func(struct event_trigger_data *data, __eprobe_trace_func(edata, rec); } -static struct event_trigger_ops eprobe_trigger_ops = { - .trigger = eprobe_trigger_func, - .print = eprobe_trigger_print, - .init = eprobe_trigger_init, - .free = eprobe_trigger_free, -}; - static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, @@ -507,12 +506,6 @@ static void eprobe_trigger_unreg_func(char *glob, } -static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, - char *param) -{ - return &eprobe_trigger_ops; -} - static struct event_command event_trigger_cmd = { .name = "eprobe", .trigger_type = ETT_EVENT_EPROBE, @@ -521,8 +514,11 @@ static struct event_command event_trigger_cmd = { .reg = eprobe_trigger_reg_func, .unreg = eprobe_trigger_unreg_func, .unreg_all = NULL, - .get_trigger_ops = eprobe_trigger_get_ops, .set_filter = NULL, + .trigger = eprobe_trigger_func, + .print = eprobe_trigger_print, + .init = eprobe_trigger_init, + .free = eprobe_trigger_free, }; static struct event_trigger_data * @@ -533,8 +529,8 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) struct eprobe_data *edata; int ret; - edata = kzalloc(sizeof(*edata), GFP_KERNEL); - trigger = kzalloc(sizeof(*trigger), GFP_KERNEL); + edata = kzalloc_obj(*edata); + trigger = kzalloc_obj(*trigger); if (!trigger || !edata) { ret = -ENOMEM; goto error; @@ -542,7 +538,6 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) trigger->flags = EVENT_TRIGGER_FL_PROBE; trigger->count = -1; - trigger->ops = &eprobe_trigger_ops; /* * EVENT PROBE triggers are not registered as commands with @@ -795,23 +790,6 @@ find_and_get_event(const char *system, const char *event_name) return NULL; } -static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) -{ - struct traceprobe_parse_context ctx = { - .event = ep->event, - .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT, - }; - int ret; - - ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx); - /* Handle symbols "@" */ - if (!ret) - ret = traceprobe_update_arg(&ep->tp.args[i]); - - traceprobe_finish_parse(&ctx); - return ret; -} - static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[]) { struct event_filter *dummy = NULL; @@ -848,13 +826,10 @@ static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const ch ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str, true, &dummy); free_event_filter(dummy); - if (ret) - goto error; - - return 0; -error: - kfree(ep->filter_str); - ep->filter_str = NULL; + if (ret) { + kfree(ep->filter_str); + ep->filter_str = NULL; + } return ret; } @@ -866,40 +841,52 @@ static int __trace_eprobe_create(int argc, const char *argv[]) * Fetch args (no space): * <name>=$<field>[:TYPE] */ + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; + struct trace_eprobe *ep __free(trace_event_probe_cleanup) = NULL; + const char *trlog __free(trace_probe_log_clear) = NULL; const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; const char *sys_event = NULL, *sys_name = NULL; struct trace_event_call *event_call; - struct trace_eprobe *ep = NULL; - char buf1[MAX_EVENT_NAME_LEN]; - char buf2[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; + char *buf1 __free(kfree) = NULL; + char *buf2 __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; int ret = 0, filter_idx = 0; int i, filter_cnt; if (argc < 2 || argv[0][0] != 'e') return -ECANCELED; - trace_probe_log_init("event_probe", argc, argv); + trlog = trace_probe_log_init("event_probe", argc, argv); event = strchr(&argv[0][1], ':'); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; event++; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto parse_error; + return -EINVAL; } trace_probe_log_set_index(1); sys_event = argv[1]; + + buf2 = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!buf2) + return -ENOMEM; + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); if (ret || !sys_event || !sys_name) { trace_probe_log_err(0, NO_EVENT_INFO); - goto parse_error; + return -EINVAL; } if (!event) { - strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN); + buf1 = kstrdup(sys_event, GFP_KERNEL); + if (!buf1) + return -ENOMEM; event = buf1; } @@ -913,14 +900,15 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } if (argc - 2 > MAX_TRACE_ARGS) { - ret = -E2BIG; - goto error; + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); + return -E2BIG; } - mutex_lock(&event_mutex); - event_call = find_and_get_event(sys_name, sys_event); - ep = alloc_event_probe(group, event, event_call, argc - 2); - mutex_unlock(&event_mutex); + scoped_guard(mutex, &event_mutex) { + event_call = find_and_get_event(sys_name, sys_event); + ep = alloc_event_probe(group, event, event_call, argc - 2); + } if (IS_ERR(ep)) { ret = PTR_ERR(ep); @@ -928,52 +916,57 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_ATTACH_EVENT); /* This must return -ENOMEM or missing event, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV); - ep = NULL; - goto error; + return ret; } if (filter_idx) { trace_probe_log_set_index(filter_idx); ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx); if (ret) - goto parse_error; + return -EINVAL; } else ep->filter_str = NULL; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->event = ep->event; + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT; + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); - ret = trace_eprobe_tp_update_arg(ep, argv, i); + + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx); + /* Handle symbols "@" */ + if (!ret) + ret = traceprobe_update_arg(&ep->tp.args[i]); if (ret) - goto error; + return ret; } ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT); if (ret < 0) - goto error; + return ret; + init_trace_eprobe_call(ep); - mutex_lock(&event_mutex); - ret = trace_probe_register_event_call(&ep->tp); - if (ret) { - if (ret == -EEXIST) { - trace_probe_log_set_index(0); - trace_probe_log_err(0, EVENT_EXIST); + scoped_guard(mutex, &event_mutex) { + ret = trace_probe_register_event_call(&ep->tp); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } + return ret; } - mutex_unlock(&event_mutex); - goto error; - } - ret = dyn_event_add(&ep->devent, &ep->tp.event->call); - if (ret < 0) { - trace_probe_unregister_event_call(&ep->tp); - mutex_unlock(&event_mutex); - goto error; + ret = dyn_event_add(&ep->devent, &ep->tp.event->call); + if (ret < 0) { + trace_probe_unregister_event_call(&ep->tp); + return ret; + } + /* To avoid freeing registered eprobe event, clear ep. */ + ep = NULL; } - mutex_unlock(&event_mutex); - return ret; -parse_error: - ret = -EINVAL; -error: - trace_event_probe_cleanup(ep); return ret; } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 3ff9caa4a71b..a6bb7577e8c5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -49,7 +49,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event)) { - ret = perf_allow_tracepoint(&p_event->attr); + ret = perf_allow_tracepoint(); if (ret) return ret; @@ -86,7 +86,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, * ...otherwise raw tracepoint data can be a severe data leak, * only allow root to have these. */ - ret = perf_allow_tracepoint(&p_event->attr); + ret = perf_allow_tracepoint(); if (ret) return ret; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 770e7ed91716..c46e623e7e0d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -360,7 +360,7 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca /* Anything else, this isn't a function */ break; } - /* A function could be wrapped in parethesis, try the next one */ + /* A function could be wrapped in parenthesis, try the next one */ s = r + 1; } while (s < e); @@ -400,6 +400,20 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca return true; } +static void handle_dereference_arg(const char *arg_str, u64 string_flags, int len, + u64 *dereference_flags, int arg, + struct trace_event_call *call) +{ + if (string_flags & (1ULL << arg)) { + if (process_string(arg_str, len, call)) + *dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(arg_str, len, call)) + *dereference_flags &= ~(1ULL << arg); + else + pr_warn("TRACE EVENT ERROR: Bad dereference argument: '%.*s'\n", + len, arg_str); +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -470,6 +484,7 @@ static void test_event_printk(struct trace_event_call *call) case '%': continue; case 'p': + do_pointer: /* Find dereferencing fields */ switch (fmt[i + 1]) { case 'B': case 'R': case 'r': @@ -498,6 +513,12 @@ static void test_event_printk(struct trace_event_call *call) continue; if (fmt[i + j] == '*') { star = true; + /* Handle %*pbl case */ + if (!j && fmt[i + 1] == 'p') { + arg++; + i++; + goto do_pointer; + } continue; } if ((fmt[i + j] == 's')) { @@ -546,7 +567,7 @@ static void test_event_printk(struct trace_event_call *call) * If start_arg is zero, then this is the start of the * first argument. The processing of the argument happens * when the end of the argument is found, as it needs to - * handle paranthesis and such. + * handle parenthesis and such. */ if (!start_arg) { start_arg = i; @@ -556,11 +577,9 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (string_flags & (1ULL << arg)) { - if (process_string(fmt + start_arg, e - start_arg, call)) - dereference_flags &= ~(1ULL << arg); - } else if (process_pointer(fmt + start_arg, e - start_arg, call)) - dereference_flags &= ~(1ULL << arg); + handle_dereference_arg(fmt + start_arg, string_flags, + e - start_arg, + &dereference_flags, arg, call); } start_arg = i; @@ -571,11 +590,9 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (string_flags & (1ULL << arg)) { - if (process_string(fmt + start_arg, i - start_arg, call)) - dereference_flags &= ~(1ULL << arg); - } else if (process_pointer(fmt + start_arg, i - start_arg, call)) - dereference_flags &= ~(1ULL << arg); + handle_dereference_arg(fmt + start_arg, string_flags, + i - start_arg, + &dereference_flags, arg, call); } /* @@ -615,7 +632,6 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) { struct trace_array *tr = trace_file->tr; - struct trace_array_cpu *data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; @@ -625,12 +641,30 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) if (!pid_list && !no_pid_list) return false; - data = this_cpu_ptr(tr->array_buffer.data); - - return data->ignore_pid; + /* + * This is recorded at every sched_switch for this task. + * Thus, even if the task migrates the ignore value will be the same. + */ + return this_cpu_read(tr->array_buffer.data->ignore_pid) != 0; } EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); +/** + * trace_event_buffer_reserve - reserve space on the ring buffer for an event + * @fbuffer: information about how to save the event + * @trace_file: the instance file descriptor for the event + * @len: The length of the event + * + * The @fbuffer has information about the ring buffer and data will + * be added to it to be used by the call to trace_event_buffer_commit(). + * The @trace_file is the desrciptor with information about the status + * of the given event for a specific trace_array instance. + * The @len is the length of data to save for the event. + * + * Returns a pointer to the data on the ring buffer or NULL if the + * event was not reserved (event was filtered, too big, or the buffer + * simply was disabled for write). + */ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len) @@ -682,6 +716,8 @@ int trace_event_reg(struct trace_event_call *call, #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: + if (!call->class->perf_probe) + return -ENODEV; return tracepoint_probe_register(call->tp, call->class->perf_probe, call); @@ -750,6 +786,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; + bool soft_mode = atomic_read(&file->sm_ref) != 0; int ret = 0; int disable; @@ -764,19 +801,19 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * is set we do not want the event to be enabled before we * clear the bit. * - * When soft_disable is not set but the SOFT_MODE flag is, + * When soft_disable is not set but the soft_mode is, * we do nothing. Do not disable the tracepoint, otherwise - * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. + * "soft enable"s (clearing the SOFT_DISABLED bit) won't work. */ if (soft_disable) { if (atomic_dec_return(&file->sm_ref) > 0) break; disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; - clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + soft_mode = false; /* Disable use of trace_buffered_event */ trace_buffered_event_disable(); } else - disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); + disable = !soft_mode; if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) { clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); @@ -790,10 +827,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } - call->class->reg(call, TRACE_REG_UNREGISTER, file); + ret = call->class->reg(call, TRACE_REG_UNREGISTER, file); + + WARN_ON_ONCE(ret); } - /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ - if (file->flags & EVENT_FILE_FL_SOFT_MODE) + /* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */ + if (soft_mode) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); @@ -803,16 +842,15 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * When soft_disable is set and enable is set, we want to * register the tracepoint for the event, but leave the event * as is. That means, if the event was already enabled, we do - * nothing (but set SOFT_MODE). If the event is disabled, we - * set SOFT_DISABLED before enabling the event tracepoint, so - * it still seems to be disabled. + * nothing. If the event is disabled, we set SOFT_DISABLED + * before enabling the event tracepoint, so it still seems + * to be disabled. */ if (!soft_disable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else { if (atomic_inc_return(&file->sm_ref) > 1) break; - set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); /* Enable use of trace_buffered_event */ trace_buffered_event_enable(); } @@ -820,17 +858,17 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (!(file->flags & EVENT_FILE_FL_ENABLED)) { bool cmd = false, tgid = false; - /* Keep the event disabled, when going to SOFT_MODE. */ + /* Keep the event disabled, when going to soft mode. */ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); - if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { + if (tr->trace_flags & TRACE_ITER(RECORD_CMD)) { cmd = true; tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } - if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) { tgid = true; tracing_start_tgid_record(); set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); @@ -869,6 +907,120 @@ static int ftrace_event_enable_disable(struct trace_event_file *file, return __ftrace_event_enable_disable(file, enable, 0); } +#ifdef CONFIG_MODULES +struct event_mod_load { + struct list_head list; + char *module; + char *match; + char *system; + char *event; +}; + +static void free_event_mod(struct event_mod_load *event_mod) +{ + list_del(&event_mod->list); + kfree(event_mod->module); + kfree(event_mod->match); + kfree(event_mod->system); + kfree(event_mod->event); + kfree(event_mod); +} + +static void clear_mod_events(struct trace_array *tr) +{ + struct event_mod_load *event_mod, *n; + + list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { + free_event_mod(event_mod); + } +} + +static int remove_cache_mod(struct trace_array *tr, const char *mod, + const char *match, const char *system, const char *event) +{ + struct event_mod_load *event_mod, *n; + int ret = -EINVAL; + + list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { + if (strcmp(event_mod->module, mod) != 0) + continue; + + if (match && strcmp(event_mod->match, match) != 0) + continue; + + if (system && + (!event_mod->system || strcmp(event_mod->system, system) != 0)) + continue; + + if (event && + (!event_mod->event || strcmp(event_mod->event, event) != 0)) + continue; + + free_event_mod(event_mod); + ret = 0; + } + + return ret; +} + +static int cache_mod(struct trace_array *tr, const char *mod, int set, + const char *match, const char *system, const char *event) +{ + struct event_mod_load *event_mod; + + /* If the module exists, then this just failed to find an event */ + if (module_exists(mod)) + return -EINVAL; + + /* See if this is to remove a cached filter */ + if (!set) + return remove_cache_mod(tr, mod, match, system, event); + + event_mod = kzalloc_obj(*event_mod); + if (!event_mod) + return -ENOMEM; + + INIT_LIST_HEAD(&event_mod->list); + event_mod->module = kstrdup(mod, GFP_KERNEL); + if (!event_mod->module) + goto out_free; + + if (match) { + event_mod->match = kstrdup(match, GFP_KERNEL); + if (!event_mod->match) + goto out_free; + } + + if (system) { + event_mod->system = kstrdup(system, GFP_KERNEL); + if (!event_mod->system) + goto out_free; + } + + if (event) { + event_mod->event = kstrdup(event, GFP_KERNEL); + if (!event_mod->event) + goto out_free; + } + + list_add(&event_mod->list, &tr->mod_events); + + return 0; + + out_free: + free_event_mod(event_mod); + + return -ENOMEM; +} +#else /* CONFIG_MODULES */ +static inline void clear_mod_events(struct trace_array *tr) { } +static int cache_mod(struct trace_array *tr, const char *mod, int set, + const char *match, const char *system, const char *event) +{ + return -EINVAL; +} +#endif + static void ftrace_clear_events(struct trace_array *tr) { struct trace_event_file *file; @@ -877,6 +1029,7 @@ static void ftrace_clear_events(struct trace_array *tr) list_for_each_entry(file, &tr->events, list) { ftrace_event_enable_disable(file, 0); } + clear_mod_events(tr); mutex_unlock(&event_mutex); } @@ -886,6 +1039,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) struct trace_pid_list *pid_list; struct trace_array *tr = data; + guard(preempt)(); pid_list = rcu_dereference_raw(tr->filtered_pids); trace_filter_add_remove_task(pid_list, NULL, task); @@ -901,6 +1055,7 @@ event_filter_pid_sched_process_fork(void *data, struct trace_pid_list *pid_list; struct trace_array *tr = data; + guard(preempt)(); pid_list = rcu_dereference_sched(tr->filtered_pids); trace_filter_add_remove_task(pid_list, self, task); @@ -1158,6 +1313,9 @@ static void remove_event_file_dir(struct trace_event_file *file) free_event_filter(file->filter); file->flags |= EVENT_FILE_FL_FREED; event_file_put(file); + + /* Wake up hist poll waiters to notice the EVENT_FILE_FL_FREED flag. */ + hist_poll_wakeup(); } /* @@ -1165,17 +1323,36 @@ static void remove_event_file_dir(struct trace_event_file *file) */ static int __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, - const char *sub, const char *event, int set) + const char *sub, const char *event, int set, + const char *mod) { struct trace_event_file *file; struct trace_event_call *call; + char *module __free(kfree) = NULL; const char *name; int ret = -EINVAL; int eret = 0; + if (mod) { + char *p; + + module = kstrdup(mod, GFP_KERNEL); + if (!module) + return -ENOMEM; + + /* Replace all '-' with '_' as that's what modules do */ + for (p = strchr(module, '-'); p; p = strchr(p + 1, '-')) + *p = '_'; + } + list_for_each_entry(file, &tr->events, list) { call = file->event_call; + + /* If a module is specified, skip events that are not that module */ + if (module && (!call->module || strcmp(module_name(call->module), module))) + continue; + name = trace_event_name(call); if (!name || !call->class || !call->class->reg) @@ -1208,16 +1385,27 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, ret = eret; } + /* + * If this is a module setting and nothing was found, + * check if the module was loaded. If it wasn't cache it. + */ + if (module && ret == -EINVAL && !eret) + ret = cache_mod(tr, module, set, match, sub, event); + return ret; } static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, - const char *sub, const char *event, int set) + const char *sub, const char *event, int set, + const char *mod) { int ret; + if (trace_array_is_readonly(tr)) + return -EACCES; + mutex_lock(&event_mutex); - ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); + ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod); mutex_unlock(&event_mutex); return ret; @@ -1225,11 +1413,20 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { - char *event = NULL, *sub = NULL, *match; + char *event = NULL, *sub = NULL, *match, *mod; int ret; if (!tr) return -ENOENT; + + /* Modules events can be appended with :mod:<module> */ + mod = strstr(buf, ":mod:"); + if (mod) { + *mod = '\0'; + /* move to the module name */ + mod += 5; + } + /* * The buf format can be <subsystem>:<event-name> * *:<event-name> means any event by that name. @@ -1252,9 +1449,13 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) sub = NULL; if (!strlen(event) || strcmp(event, "*") == 0) event = NULL; + } else if (mod) { + /* Allow wildcard for no length or star */ + if (!strlen(match) || strcmp(match, "*") == 0) + match = NULL; } - ret = __ftrace_set_clr_event(tr, match, sub, event, set); + ret = __ftrace_set_clr_event(tr, match, sub, event, set, mod); /* Put back the colon to allow this to be called again */ if (buf) @@ -1282,7 +1483,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) if (!tr) return -ENODEV; - return __ftrace_set_clr_event(tr, NULL, system, event, set); + return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -1308,7 +1509,7 @@ int trace_array_set_clr_event(struct trace_array *tr, const char *system, return -ENOENT; set = (enable == true) ? 1 : 0; - return __ftrace_set_clr_event(tr, NULL, system, event, set); + return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL); } EXPORT_SYMBOL_GPL(trace_array_set_clr_event); @@ -1395,37 +1596,77 @@ static void *t_start(struct seq_file *m, loff_t *pos) return file; } +enum set_event_iter_type { + SET_EVENT_FILE, + SET_EVENT_MOD, +}; + +struct set_event_iter { + enum set_event_iter_type type; + union { + struct trace_event_file *file; + struct event_mod_load *event_mod; + }; +}; + static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct trace_event_file *file = v; + struct set_event_iter *iter = v; + struct trace_event_file *file; struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(file, &tr->events, list) { - if (file->flags & EVENT_FILE_FL_ENABLED) - return file; + if (iter->type == SET_EVENT_FILE) { + file = iter->file; + list_for_each_entry_continue(file, &tr->events, list) { + if (file->flags & EVENT_FILE_FL_ENABLED) { + iter->file = file; + return iter; + } + } +#ifdef CONFIG_MODULES + iter->type = SET_EVENT_MOD; + iter->event_mod = list_entry(&tr->mod_events, struct event_mod_load, list); +#endif } +#ifdef CONFIG_MODULES + list_for_each_entry_continue(iter->event_mod, &tr->mod_events, list) + return iter; +#endif + + /* + * The iter is allocated in s_start() and passed via the 'v' + * parameter. To stop the iterator, NULL must be returned. But + * the return value is what the 'v' parameter in s_stop() receives + * and frees. Free iter here as it will no longer be used. + */ + kfree(iter); return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { - struct trace_event_file *file; struct trace_array *tr = m->private; + struct set_event_iter *iter; loff_t l; + iter = kzalloc_obj(*iter); mutex_lock(&event_mutex); + if (!iter) + return NULL; + + iter->type = SET_EVENT_FILE; + iter->file = list_entry(&tr->events, struct trace_event_file, list); - file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { - file = s_next(m, file, &l); - if (!file) + iter = s_next(m, iter, &l); + if (!iter) break; } - return file; + return iter; } static int t_show(struct seq_file *m, void *v) @@ -1445,6 +1686,121 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +static int get_call_len(struct trace_event_call *call) +{ + int len; + + /* Get the length of "<system>:<event>" */ + len = strlen(call->class->system) + 1; + len += strlen(trace_event_name(call)); + + /* Set the index to 32 bytes to separate event from data */ + return len >= 32 ? 1 : 32 - len; +} + +/** + * t_show_filters - seq_file callback to display active event filters + * @m: The seq_file interface for formatted output + * @v: The current trace_event_file being iterated + * + * Identifies and prints active filters for the current event file in the + * iteration. If a filter is applied to the current event and, if so, + * prints the system name, event name, and the filter string. + */ +static int t_show_filters(struct seq_file *m, void *v) +{ + struct trace_event_file *file = v; + struct trace_event_call *call = file->event_call; + struct event_filter *filter; + int len; + + guard(rcu)(); + filter = rcu_dereference(file->filter); + if (!filter || !filter->filter_string) + return 0; + + len = get_call_len(call); + + seq_printf(m, "%s:%s%*s%s\n", call->class->system, + trace_event_name(call), len, "", filter->filter_string); + + return 0; +} + +/** + * t_show_triggers - seq_file callback to display active event triggers + * @m: The seq_file interface for formatted output + * @v: The current trace_event_file being iterated + * + * Iterates through the trigger list of the current event file and prints + * each active trigger's configuration using its associated print + * operation. + */ +static int t_show_triggers(struct seq_file *m, void *v) +{ + struct trace_event_file *file = v; + struct trace_event_call *call = file->event_call; + struct event_trigger_data *data; + int len; + + /* + * The event_mutex is held by t_start(), protecting the + * file->triggers list traversal. + */ + if (list_empty(&file->triggers)) + return 0; + + len = get_call_len(call); + + list_for_each_entry_rcu(data, &file->triggers, list) { + seq_printf(m, "%s:%s%*s", call->class->system, + trace_event_name(call), len, ""); + + data->cmd_ops->print(m, data); + } + + return 0; +} + +#ifdef CONFIG_MODULES +static int s_show(struct seq_file *m, void *v) +{ + struct set_event_iter *iter = v; + const char *system; + const char *event; + + if (iter->type == SET_EVENT_FILE) + return t_show(m, iter->file); + + /* When match is set, system and event are not */ + if (iter->event_mod->match) { + seq_printf(m, "%s:mod:%s\n", iter->event_mod->match, + iter->event_mod->module); + return 0; + } + + system = iter->event_mod->system ? : "*"; + event = iter->event_mod->event ? : "*"; + + seq_printf(m, "%s:%s:mod:%s\n", system, event, iter->event_mod->module); + + return 0; +} +#else /* CONFIG_MODULES */ +static int s_show(struct seq_file *m, void *v) +{ + struct set_event_iter *iter = v; + + return t_show(m, iter->file); +} +#endif + +static void s_stop(struct seq_file *m, void *v) +{ + kfree(v); + t_stop(m, NULL); +} + static void * __next(struct seq_file *m, void *v, loff_t *pos, int type) { @@ -1537,8 +1893,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, !(flags & EVENT_FILE_FL_SOFT_DISABLED)) strcpy(buf, "1"); - if (flags & EVENT_FILE_FL_SOFT_DISABLED || - flags & EVENT_FILE_FL_SOFT_MODE) + if (atomic_read(&file->sm_ref) != 0) strcat(buf, "*"); strcat(buf, "\n"); @@ -1558,21 +1913,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; + guard(mutex)(&event_mutex); + switch (val) { case 0: case 1: - ret = -ENODEV; - mutex_lock(&event_mutex); file = event_file_file(filp); - if (likely(file)) { - ret = tracing_update_buffers(file->tr); - if (ret < 0) { - mutex_unlock(&event_mutex); - return ret; - } - ret = ftrace_event_enable_disable(file, val); - } - mutex_unlock(&event_mutex); + if (!file) + return -ENODEV; + ret = tracing_update_buffers(file->tr); + if (ret < 0) + return ret; + ret = ftrace_event_enable_disable(file, val); + if (ret < 0) + return ret; break; default: @@ -1581,31 +1935,31 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, *ppos += cnt; - return ret ? ret : cnt; + return cnt; } -static ssize_t -system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +/* + * Returns: + * 0 : no events exist? + * 1 : all events are disabled + * 2 : all events are enabled + * 3 : some events are enabled and some are enabled + */ +int trace_events_enabled(struct trace_array *tr, const char *system) { - const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct trace_subsystem_dir *dir = filp->private_data; - struct event_subsystem *system = dir->subsystem; struct trace_event_call *call; struct trace_event_file *file; - struct trace_array *tr = dir->tr; - char buf[2]; int set = 0; - int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); + list_for_each_entry(file, &tr->events, list) { call = file->event_call; if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || !trace_event_name(call) || !call->class || !call->class->reg) continue; - if (system && strcmp(call->class->system, system->name) != 0) + if (system && strcmp(call->class->system, system) != 0) continue; /* @@ -1621,7 +1975,23 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (set == 3) break; } - mutex_unlock(&event_mutex); + + return set; +} + +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char set_to_char[4] = { '?', '0', '1', 'X' }; + struct trace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; + struct trace_array *tr = dir->tr; + char buf[2]; + int set; + int ret; + + set = trace_events_enabled(tr, system ? system->name : NULL); buf[0] = set_to_char[set]; buf[1] = '\n'; @@ -1659,7 +2029,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (system) name = system->name; - ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); + ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val, NULL); if (ret) goto out; @@ -1817,12 +2187,12 @@ static int trace_format_open(struct inode *inode, struct file *file) static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int id = (long)event_file_data(filp); + /* id is directly in i_private and available for inode's lifetime. */ + int id = (long)file_inode(filp)->i_private; char buf[32]; int len; - if (unlikely(!id)) - return -ENODEV; + WARN_ON(!id); len = sprintf(buf, "%d\n", id); @@ -1841,7 +2211,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, if (*ppos) return 0; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = kmalloc_obj(*s); if (!s) return -ENOMEM; @@ -1880,12 +2250,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); file = event_file_file(filp); - if (file) { - if (file->flags & EVENT_FILE_FL_FREED) - err = -ENODEV; - else - err = apply_event_filter(file, buf); - } + if (file) + err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); kfree(buf); @@ -1906,7 +2272,7 @@ static int subsystem_open(struct inode *inode, struct file *filp) struct event_subsystem *system = NULL; int ret; - if (tracing_is_disabled()) + if (unlikely(tracing_disabled)) return -ENODEV; /* Make sure the system still exists */ @@ -1955,7 +2321,7 @@ static int system_tr_open(struct inode *inode, struct file *filp) int ret; /* Make a temporary dir that has no system but points to tr */ - dir = kzalloc(sizeof(*dir), GFP_KERNEL); + dir = kzalloc_obj(*dir); if (!dir) return -ENOMEM; @@ -2001,7 +2367,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, if (*ppos) return 0; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = kmalloc_obj(*s); if (!s) return -ENOMEM; @@ -2051,7 +2417,7 @@ show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t * if (*ppos) return 0; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = kmalloc_obj(*s); if (!s) return -ENOMEM; @@ -2075,7 +2441,7 @@ show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t if (*ppos) return 0; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = kmalloc_obj(*s); if (!s) return -ENOMEM; @@ -2157,7 +2523,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); if (type == TRACE_PIDS) { filtered_pids = rcu_dereference_protected(tr->filtered_pids, @@ -2173,7 +2539,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) - goto out; + return ret; if (type == TRACE_PIDS) rcu_assign_pointer(tr->filtered_pids, pid_list); @@ -2198,11 +2564,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, */ on_each_cpu(ignore_task_cpu, tr, 1); - out: - mutex_unlock(&event_mutex); - - if (ret > 0) - *ppos += ret; + *ppos += ret; return ret; } @@ -2223,6 +2585,8 @@ ftrace_event_npid_write(struct file *filp, const char __user *ubuf, static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_show_filters_open(struct inode *inode, struct file *file); +static int ftrace_event_show_triggers_open(struct inode *inode, struct file *file); static int ftrace_event_set_pid_open(struct inode *inode, struct file *file); static int ftrace_event_set_npid_open(struct inode *inode, struct file *file); static int ftrace_event_release(struct inode *inode, struct file *file); @@ -2237,7 +2601,21 @@ static const struct seq_operations show_event_seq_ops = { static const struct seq_operations show_set_event_seq_ops = { .start = s_start, .next = s_next, - .show = t_show, + .show = s_show, + .stop = s_stop, +}; + +static const struct seq_operations show_show_event_filters_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show_filters, + .stop = t_stop, +}; + +static const struct seq_operations show_show_event_triggers_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show_triggers, .stop = t_stop, }; @@ -2270,6 +2648,20 @@ static const struct file_operations ftrace_set_event_fops = { .release = ftrace_event_release, }; +static const struct file_operations ftrace_show_event_filters_fops = { + .open = ftrace_event_show_filters_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ftrace_show_event_triggers_fops = { + .open = ftrace_event_show_triggers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static const struct file_operations ftrace_set_event_pid_fops = { .open = ftrace_event_set_pid_open, .read = seq_read, @@ -2414,6 +2806,34 @@ ftrace_event_set_open(struct inode *inode, struct file *file) return ret; } +/** + * ftrace_event_show_filters_open - open interface for set_event_filters + * @inode: The inode of the file + * @file: The file being opened + * + * Connects the set_event_filters file to the sequence operations + * required to iterate over and display active event filters. + */ +static int +ftrace_event_show_filters_open(struct inode *inode, struct file *file) +{ + return ftrace_event_open(inode, file, &show_show_event_filters_seq_ops); +} + +/** + * ftrace_event_show_triggers_open - open interface for show_event_triggers + * @inode: The inode of the file + * @file: The file being opened + * + * Connects the show_event_triggers file to the sequence operations + * required to iterate over and display active event triggers. + */ +static int +ftrace_event_show_triggers_open(struct inode *inode, struct file *file) +{ + return ftrace_event_open(inode, file, &show_show_event_triggers_seq_ops); +} + static int ftrace_event_set_pid_open(struct inode *inode, struct file *file) { @@ -2462,7 +2882,7 @@ create_new_subsystem(const char *name) struct event_subsystem *system; /* need to create new entry */ - system = kmalloc(sizeof(*system), GFP_KERNEL); + system = kmalloc_obj(*system); if (!system) return NULL; @@ -2473,7 +2893,7 @@ create_new_subsystem(const char *name) if (!system->name) goto out_free; - system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + system->filter = kzalloc_obj(struct event_filter); if (!system->filter) goto out_free; @@ -2541,7 +2961,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } } - dir = kmalloc(sizeof(*dir), GFP_KERNEL); + dir = kmalloc_obj(*dir); if (!dir) goto out_fail; @@ -2552,8 +2972,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - /* ftrace only has directories no files */ - if (strcmp(name, "ftrace") == 0) + /* ftrace only has directories no files, readonly instance too. */ + if (strcmp(name, "ftrace") == 0 || trace_array_is_readonly(tr)) nr_entries = 0; else nr_entries = ARRAY_SIZE(system_entries); @@ -2718,28 +3138,30 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) int ret; static struct eventfs_entry event_entries[] = { { - .name = "enable", + .name = "format", .callback = event_callback, - .release = event_release, }, +#ifdef CONFIG_PERF_EVENTS { - .name = "filter", + .name = "id", .callback = event_callback, }, +#endif +#define NR_RO_EVENT_ENTRIES (1 + IS_ENABLED(CONFIG_PERF_EVENTS)) +/* Readonly files must be above this line and counted by NR_RO_EVENT_ENTRIES. */ { - .name = "trigger", + .name = "enable", .callback = event_callback, + .release = event_release, }, { - .name = "format", + .name = "filter", .callback = event_callback, }, -#ifdef CONFIG_PERF_EVENTS { - .name = "id", + .name = "trigger", .callback = event_callback, }, -#endif #ifdef CONFIG_HIST_TRIGGERS { .name = "hist", @@ -2772,7 +3194,10 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) if (!e_events) return -ENOMEM; - nr_entries = ARRAY_SIZE(event_entries); + if (trace_array_is_readonly(tr)) + nr_entries = NR_RO_EVENT_ENTRIES; + else + nr_entries = ARRAY_SIZE(event_entries); name = trace_event_name(call); ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file); @@ -2870,7 +3295,10 @@ __register_event(struct trace_event_call *call, struct module *mod) if (ret < 0) return ret; + down_write(&trace_event_sem); list_add(&call->list, &ftrace_events); + up_write(&trace_event_sem); + if (call->flags & TRACE_EVENT_FL_DYNAMIC) atomic_set(&call->refcnt, 0); else @@ -2981,7 +3409,7 @@ static void add_str_to_module(struct module *module, char *str) { struct module_string *modstr; - modstr = kmalloc(sizeof(*modstr), GFP_KERNEL); + modstr = kmalloc_obj(*modstr); /* * If we failed to allocate memory here, then we'll just @@ -2998,43 +3426,120 @@ static void add_str_to_module(struct module *module, char *str) list_add(&modstr->next, &module_strings); } +#define ATTRIBUTE_STR "__attribute__(" +#define ATTRIBUTE_STR_LEN (sizeof(ATTRIBUTE_STR) - 1) + +/* Remove all __attribute__() from @type. Return allocated string or @type. */ +static char *sanitize_field_type(const char *type) +{ + char *attr, *tmp, *next, *ret = (char *)type; + int depth; + + next = (char *)type; + while ((attr = strstr(next, ATTRIBUTE_STR))) { + /* Retry if "__attribute__(" is a part of another word. */ + if (attr != next && !isspace(attr[-1])) { + next = attr + ATTRIBUTE_STR_LEN; + continue; + } + + if (ret == type) { + ret = kstrdup(type, GFP_KERNEL); + if (WARN_ON_ONCE(!ret)) + return NULL; + attr = ret + (attr - type); + } + + /* the ATTRIBUTE_STR already has the first '(' */ + depth = 1; + next = attr + ATTRIBUTE_STR_LEN; + do { + tmp = strpbrk(next, "()"); + /* There is unbalanced parentheses */ + if (WARN_ON_ONCE(!tmp)) { + kfree(ret); + return (char *)type; + } + + if (*tmp == '(') + depth++; + else + depth--; + next = tmp + 1; + } while (depth > 0); + next = skip_spaces(next); + strcpy(attr, next); + next = attr; + } + return ret; +} + +static char *find_replacable_eval(const char *type, const char *eval_string, + int len) +{ + char *ptr; + + if (!eval_string) + return NULL; + + ptr = strchr(type, '['); + if (!ptr) + return NULL; + ptr++; + + if (!isalpha(*ptr) && *ptr != '_') + return NULL; + + if (strncmp(eval_string, ptr, len) != 0) + return NULL; + + return ptr; +} + static void update_event_fields(struct trace_event_call *call, struct trace_eval_map *map) { struct ftrace_event_field *field; + const char *eval_string = NULL; struct list_head *head; + int len = 0; char *ptr; char *str; - int len = strlen(map->eval_string); /* Dynamic events should never have field maps */ - if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC)) + if (call->flags & TRACE_EVENT_FL_DYNAMIC) return; + if (map) { + eval_string = map->eval_string; + len = strlen(map->eval_string); + } + head = trace_get_fields(call); list_for_each_entry(field, head, link) { - ptr = strchr(field->type, '['); - if (!ptr) - continue; - ptr++; - - if (!isalpha(*ptr) && *ptr != '_') - continue; + str = sanitize_field_type(field->type); + if (!str) + return; - if (strncmp(map->eval_string, ptr, len) != 0) - continue; + ptr = find_replacable_eval(str, eval_string, len); + if (ptr) { + if (str == field->type) { + str = kstrdup(field->type, GFP_KERNEL); + if (WARN_ON_ONCE(!str)) + return; + ptr = str + (ptr - field->type); + } - str = kstrdup(field->type, GFP_KERNEL); - if (WARN_ON_ONCE(!str)) - return; - ptr = str + (ptr - field->type); - ptr = eval_replace(ptr, map, len); - /* enum/sizeof string smaller than value */ - if (WARN_ON_ONCE(!ptr)) { - kfree(str); - continue; + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ + if (WARN_ON_ONCE(!ptr)) { + kfree(str); + continue; + } } + if (str == field->type) + continue; /* * If the event is part of a module, then we need to free the string * when the module is removed. Otherwise, it will stay allocated @@ -3044,14 +3549,18 @@ static void update_event_fields(struct trace_event_call *call, add_str_to_module(call->module, str); field->type = str; + if (field->filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(field->type); } } -void trace_event_eval_update(struct trace_eval_map **map, int len) +/* Update all events for replacing eval and sanitizing */ +void trace_event_update_all(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; bool first = false; + bool updated; int last_i; int i; @@ -3064,6 +3573,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) last_system = call->class->system; } + updated = false; /* * Since calls are grouped by systems, the likelihood that the * next call in the iteration belongs to the same system as the @@ -3083,8 +3593,12 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) } update_event_printk(call, map[i]); update_event_fields(call, map[i]); + updated = true; } } + /* If not updated yet, update field for sanitizing. */ + if (!updated) + update_event_fields(call, NULL); cond_resched(); } up_write(&trace_event_sem); @@ -3111,6 +3625,20 @@ static bool event_in_systems(struct trace_event_call *call, return !*p || isspace(*p) || *p == ','; } +#ifdef CONFIG_HIST_TRIGGERS +/* + * Wake up waiter on the hist_poll_wq from irq_work because the hist trigger + * may happen in any context. + */ +static void hist_poll_event_irq_work(struct irq_work *work) +{ + wake_up_all(&hist_poll_wq); +} + +DEFINE_IRQ_WORK(hist_poll_work, hist_poll_event_irq_work); +DECLARE_WAIT_QUEUE_HEAD(hist_poll_wq); +#endif + static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -3155,20 +3683,27 @@ static struct boot_triggers { } bootup_triggers[MAX_BOOT_TRIGGERS]; static char bootup_trigger_buf[COMMAND_LINE_SIZE]; +static int boot_trigger_buf_len; static int nr_boot_triggers; static __init int setup_trace_triggers(char *str) { char *trigger; char *buf; + int len = boot_trigger_buf_len; int i; - strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); + if (len >= COMMAND_LINE_SIZE) + return 1; + + strscpy(bootup_trigger_buf + len, str, COMMAND_LINE_SIZE - len); trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event triggers"); - buf = bootup_trigger_buf; - for (i = 0; i < MAX_BOOT_TRIGGERS; i++) { + buf = bootup_trigger_buf + len; + boot_trigger_buf_len += strlen(buf) + 1; + + for (i = nr_boot_triggers; i < MAX_BOOT_TRIGGERS; i++) { trigger = strsep(&buf, ","); if (!trigger) break; @@ -3269,13 +3804,13 @@ int trace_add_event_call(struct trace_event_call *call) int ret; lockdep_assert_held(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = __register_event(call, NULL); - if (ret >= 0) - __add_event_to_tracers(call); + if (ret < 0) + return ret; - mutex_unlock(&trace_types_lock); + __add_event_to_tracers(call); return ret; } EXPORT_SYMBOL_GPL(trace_add_event_call); @@ -3304,7 +3839,7 @@ static int probe_remove_event_call(struct trace_event_call *call) continue; /* * We can't rely on ftrace_event_enable_disable(enable => 0) - * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress + * we are going to do, soft mode can suppress * TRACE_REG_UNREGISTER. */ if (file->flags & EVENT_FILE_FL_ENABLED) @@ -3355,6 +3890,28 @@ EXPORT_SYMBOL_GPL(trace_remove_event_call); event++) #ifdef CONFIG_MODULES +static void update_mod_cache(struct trace_array *tr, struct module *mod) +{ + struct event_mod_load *event_mod, *n; + + list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { + if (strcmp(event_mod->module, mod->name) != 0) + continue; + + __ftrace_set_clr_event_nolock(tr, event_mod->match, + event_mod->system, + event_mod->event, 1, mod->name); + free_event_mod(event_mod); + } +} + +static void update_cache_events(struct module *mod) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) + update_mod_cache(tr, mod); +} static void trace_module_add_events(struct module *mod) { @@ -3377,6 +3934,8 @@ static void trace_module_add_events(struct module *mod) __register_event(*call, mod); __add_event_to_tracers(*call); } + + update_cache_events(mod); } static void trace_module_remove_events(struct module *mod) @@ -3391,7 +3950,7 @@ static void trace_module_remove_events(struct module *mod) if (call->module == mod) __trace_remove_event_call(call); } - /* Check for any strings allocade for this module */ + /* Check for any strings allocated for this module */ list_for_each_entry_safe(modstr, m, &module_strings, next) { if (modstr->module != mod) continue; @@ -3446,6 +4005,8 @@ __trace_add_event_dirs(struct trace_array *tr) struct trace_event_call *call; int ret; + lockdep_assert_held(&trace_event_sem); + list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) @@ -3529,30 +4090,21 @@ struct trace_event_file *trace_get_event_file(const char *instance, return ERR_PTR(ret); } - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); file = find_event_file(tr, system, event); if (!file) { trace_array_put(tr); - ret = -EINVAL; - goto out; + return ERR_PTR(-EINVAL); } /* Don't let event modules unload while in use */ ret = trace_event_try_get_ref(file->event_call); if (!ret) { trace_array_put(tr); - ret = -EBUSY; - goto out; + return ERR_PTR(-EBUSY); } - ret = 0; - out: - mutex_unlock(&event_mutex); - - if (ret) - file = ERR_PTR(ret); - return file; } EXPORT_SYMBOL_GPL(trace_get_event_file); @@ -3577,11 +4129,6 @@ void trace_put_event_file(struct trace_event_file *file) EXPORT_SYMBOL_GPL(trace_put_event_file); #ifdef CONFIG_DYNAMIC_FTRACE - -/* Avoid typos */ -#define ENABLE_EVENT_STR "enable_event" -#define DISABLE_EVENT_STR "disable_event" - struct event_probe_data { struct trace_event_file *file; unsigned long count; @@ -3702,7 +4249,7 @@ static int free_probe_data(void *data) edata->ref--; if (!edata->ref) { - /* Remove the SOFT_MODE flag */ + /* Remove soft mode */ __ftrace_event_enable_disable(edata->file, 0, 1); trace_event_put_ref(edata->file->event_call); kfree(edata); @@ -3770,6 +4317,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; + unsigned long count = -1; const char *system; const char *event; char *number; @@ -3789,12 +4337,11 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, event = strsep(¶m, ":"); - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); - ret = -EINVAL; file = find_event_file(tr, system, event); if (!file) - goto out; + return -EINVAL; enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; @@ -3803,74 +4350,62 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, else ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; - if (glob[0] == '!') { - ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); - goto out; - } - - ret = -ENOMEM; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out; - - data->enable = enable; - data->count = -1; - data->file = file; + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, tr, ops); - if (!param) - goto out_reg; + if (param) { + number = strsep(¶m, ":"); - number = strsep(¶m, ":"); + if (!strlen(number)) + return -EINVAL; - ret = -EINVAL; - if (!strlen(number)) - goto out_free; - - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, &data->count); - if (ret) - goto out_free; + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &count); + if (ret) + return ret; + } - out_reg: /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(file->event_call); - if (!ret) { - ret = -EBUSY; - goto out_free; - } + if (!ret) + return -EBUSY; ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; + ret = -ENOMEM; + data = kzalloc_obj(*data); + if (!data) + goto out_put; + + data->enable = enable; + data->count = count; + data->file = file; + ret = register_ftrace_function_probe(glob, tr, ops, data); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. * Consider no functions a failure too. */ - if (!ret) { - ret = -ENOENT; - goto out_disable; - } else if (ret < 0) - goto out_disable; + /* Just return zero, not the number of enabled functions */ - ret = 0; - out: - mutex_unlock(&event_mutex); - return ret; + if (ret > 0) + return 0; + + kfree(data); + + if (!ret) + ret = -ENOENT; - out_disable: __ftrace_event_enable_disable(file, 0, 1); out_put: trace_event_put_ref(file->event_call); - out_free: - kfree(data); - goto out; + return ret; } static struct ftrace_func_command event_enable_cmd = { @@ -3969,7 +4504,11 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { - strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + if (bootup_event_buf[0] != '\0') + strlcat(bootup_event_buf, ",", COMMAND_LINE_SIZE); + + strlcat(bootup_event_buf, str, COMMAND_LINE_SIZE); + trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event tracing"); @@ -4008,25 +4547,44 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) int nr_entries; static struct eventfs_entry events_entries[] = { { - .name = "enable", + .name = "header_page", .callback = events_callback, }, { - .name = "header_page", + .name = "header_event", .callback = events_callback, }, +#define NR_RO_TOP_ENTRIES 2 +/* Readonly files must be above this line and counted by NR_RO_TOP_ENTRIES. */ { - .name = "header_event", + .name = "enable", .callback = events_callback, }, }; - entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_fops); - if (!entry) - return -ENOMEM; + if (!trace_array_is_readonly(tr)) { + entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_fops); + if (!entry) + return -ENOMEM; - nr_entries = ARRAY_SIZE(events_entries); + /* There are not as crucial, just warn if they are not created */ + trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr, + &ftrace_show_event_filters_fops); + + trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr, + &ftrace_show_event_triggers_fops); + + trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_pid_fops); + + trace_create_file("set_event_notrace_pid", + TRACE_MODE_WRITE, parent, tr, + &ftrace_set_event_notrace_pid_fops); + nr_entries = ARRAY_SIZE(events_entries); + } else { + nr_entries = NR_RO_TOP_ENTRIES; + } e_events = eventfs_create_events_dir("events", parent, events_entries, nr_entries, tr); @@ -4035,15 +4593,6 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; } - /* There are not as crucial, just warn if they are not created */ - - trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_pid_fops); - - trace_create_file("set_event_notrace_pid", - TRACE_MODE_WRITE, parent, tr, - &ftrace_set_event_notrace_pid_fops); - tr->event_dir = e_events; return 0; @@ -4093,20 +4642,17 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) - goto out_unlock; + return ret; down_write(&trace_event_sem); __trace_early_add_event_dirs(tr); up_write(&trace_event_sem); - out_unlock: - mutex_unlock(&event_mutex); - - return ret; + return 0; } /* Must be called with event_mutex held */ @@ -4121,7 +4667,7 @@ int event_trace_del_tracer(struct trace_array *tr) __ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); /* Disable any running events */ - __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0, NULL); /* Make sure no more events are being executed */ tracepoint_synchronize_unregister(); @@ -4143,26 +4689,22 @@ static __init int event_trace_memsetup(void) return 0; } -__init void -early_enable_events(struct trace_array *tr, char *buf, bool disable_first) +/* + * Helper function to enable or disable a comma-separated list of events + * from the bootup buffer. + */ +static __init void __early_set_events(struct trace_array *tr, char *buf, bool enable) { char *token; - int ret; - - while (true) { - token = strsep(&buf, ","); - - if (!token) - break; + while ((token = strsep(&buf, ","))) { if (*token) { - /* Restarting syscalls requires that we stop them first */ - if (disable_first) + if (enable) { + if (ftrace_set_clr_event(tr, token, 1)) + pr_warn("Failed to enable trace event: %s\n", token); + } else { ftrace_set_clr_event(tr, token, 0); - - ret = ftrace_set_clr_event(tr, token, 1); - if (ret) - pr_warn("Failed to enable trace event: %s\n", token); + } } /* Put back the comma to allow this to be called again */ @@ -4171,6 +4713,32 @@ early_enable_events(struct trace_array *tr, char *buf, bool disable_first) } } +/** + * early_enable_events - enable events from the bootup buffer + * @tr: The trace array to enable the events in + * @buf: The buffer containing the comma separated list of events + * @disable_first: If true, disable all events in @buf before enabling them + * + * This function enables events from the bootup buffer. If @disable_first + * is true, it will first disable all events in the buffer before enabling + * them. + * + * For syscall events, which rely on a global refcount to register the + * SYSCALL_WORK_SYSCALL_TRACEPOINT flag (especially for pid 1), we must + * ensure the refcount hits zero before re-enabling them. A simple + * "disable then enable" per-event is not enough if multiple syscalls are + * used, as the refcount will stay above zero. Thus, we need a two-phase + * approach: disable all, then enable all. + */ +__init void +early_enable_events(struct trace_array *tr, char *buf, bool disable_first) +{ + if (disable_first) + __early_set_events(tr, buf, false); + + __early_set_events(tr, buf, true); +} + static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); @@ -4405,7 +4973,7 @@ static __init void event_trace_self_tests(void) pr_info("Testing event system %s: ", system->name); - ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling system %s\n", system->name); @@ -4414,7 +4982,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); - ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling system %s\n", system->name); @@ -4429,7 +4997,7 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); - ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling all events\n"); return; @@ -4438,7 +5006,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); /* reset sysname */ - ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling all events\n"); return; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 78051de581e7..609325f57942 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -142,7 +142,7 @@ static bool is_not(const char *str) } /** - * struct prog_entry - a singe entry in the filter program + * struct prog_entry - a single entry in the filter program * @target: Index to jump to on a branch (actually one minus the index) * @when_to_branch: The value of the result of the predicate to do a branch * @pred: The predicate to execute. @@ -485,10 +485,10 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, nr_preds += 2; /* For TRUE and FALSE */ - op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); + op_stack = kmalloc_objs(*op_stack, nr_parens); if (!op_stack) return ERR_PTR(-ENOMEM); - prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL); + prog_stack = kzalloc_objs(*prog_stack, nr_preds); if (!prog_stack) { parse_error(pe, -ENOMEM, 0); goto out_free; @@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str) kstr = ubuf->buffer; /* For safety, do not trust the string pointer */ - if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0) return NULL; return kstr; } @@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str) /* user space address? */ ustr = (char __user *)str; - if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0) return NULL; return kstr; @@ -1213,7 +1213,7 @@ static void append_filter_err(struct trace_array *tr, if (WARN_ON(!filter->filter_string)) return; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = kmalloc_obj(*s); if (!s) return; trace_seq_init(s); @@ -1250,7 +1250,9 @@ static void append_filter_err(struct trace_array *tr, static inline struct event_filter *event_filter(struct trace_event_file *file) { - return file->filter; + return rcu_dereference_protected(file->filter, + lockdep_is_held(&event_mutex)); + } /* caller must hold event_mutex */ @@ -1320,7 +1322,7 @@ void free_event_filter(struct event_filter *filter) static inline void __remove_filter(struct trace_event_file *file) { filter_disable(file); - remove_filter_string(file->filter); + remove_filter_string(event_filter(file)); } static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, @@ -1335,22 +1337,149 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, } } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + +struct filter_head { + struct list_head list; + union { + struct rcu_head rcu; + struct rcu_work rwork; + }; +}; + +static void free_filter_list(struct filter_head *filter_list) +{ + struct filter_list *filter_item, *tmp; + + list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + kfree(filter_list); +} + +static void free_filter_list_work(struct work_struct *work) +{ + struct filter_head *filter_list; + + filter_list = container_of(to_rcu_work(work), struct filter_head, rwork); + free_filter_list(filter_list); +} + +static void free_filter_list_tasks(struct rcu_head *rhp) +{ + struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); + + INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work); + queue_rcu_work(system_dfl_wq, &filter_list->rwork); +} + +/* + * The tracepoint_synchronize_unregister() is a double rcu call. + * It calls synchronize_rcu_tasks_trace() followed by synchronize_rcu(). + * Instead of waiting for it, simply call these via the call_rcu*() + * variants. + */ +static void delay_free_filter(struct filter_head *head) +{ + call_rcu_tasks_trace(&head->rcu, free_filter_list_tasks); +} + +static void try_delay_free_filter(struct event_filter *filter) +{ + struct filter_head *head; + struct filter_list *item; + + head = kmalloc_obj(*head); + if (!head) + goto free_now; + + INIT_LIST_HEAD(&head->list); + + item = kmalloc_obj(*item); + if (!item) { + kfree(head); + goto free_now; + } + + item->filter = filter; + list_add_tail(&item->list, &head->list); + delay_free_filter(head); + return; + + free_now: + /* Make sure the filter is not being used */ + tracepoint_synchronize_unregister(); + __free_filter(filter); +} + static inline void __free_subsystem_filter(struct trace_event_file *file) { - __free_filter(file->filter); + __free_filter(event_filter(file)); file->filter = NULL; } +static inline void event_set_filter(struct trace_event_file *file, + struct event_filter *filter) +{ + rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct trace_event_file *file) +{ + RCU_INIT_POINTER(file->filter, NULL); +} + static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, - struct trace_array *tr) + struct trace_array *tr, + struct event_filter *filter) { struct trace_event_file *file; + struct filter_head *head; + struct filter_list *item; + + head = kmalloc_obj(*head); + if (!head) + goto free_now; + + INIT_LIST_HEAD(&head->list); list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; + item = kmalloc_obj(*item); + if (!item) + goto free_now; + item->filter = event_filter(file); + list_add_tail(&item->list, &head->list); + event_clear_filter(file); + } + + item = kmalloc_obj(*item); + if (!item) + goto free_now; + + item->filter = filter; + list_add_tail(&item->list, &head->list); + + delay_free_filter(head); + return; + free_now: + tracepoint_synchronize_unregister(); + + if (head) + free_filter_list(head); + + list_for_each_entry(file, &tr->events, list) { + if (file->system != dir || !file->filter) + continue; __free_subsystem_filter(file); } + __free_filter(filter); } int filter_assign_type(const char *type) @@ -1579,7 +1708,7 @@ static int parse_pred(const char *str, void *data, s = i; - pred = kzalloc(sizeof(*pred), GFP_KERNEL); + pred = kzalloc_obj(*pred); if (!pred) return -ENOMEM; @@ -1690,7 +1819,7 @@ static int parse_pred(const char *str, void *data, goto err_free; } - pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL); + pred->regex = kzalloc_obj(*pred->regex); if (!pred->regex) goto err_mem; pred->regex->len = len; @@ -1855,7 +1984,7 @@ static int parse_pred(const char *str, void *data, goto err_free; } - pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL); + pred->regex = kzalloc_obj(*pred->regex); if (!pred->regex) goto err_mem; pred->regex->len = len; @@ -2120,22 +2249,6 @@ static inline void event_set_filtered_flag(struct trace_event_file *file) trace_buffered_event_enable(); } -static inline void event_set_filter(struct trace_event_file *file, - struct event_filter *filter) -{ - rcu_assign_pointer(file->filter, filter); -} - -static inline void event_clear_filter(struct trace_event_file *file) -{ - RCU_INIT_POINTER(file->filter, NULL); -} - -struct filter_list { - struct list_head list; - struct event_filter *filter; -}; - static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_error *pe, @@ -2144,17 +2257,22 @@ static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_event_file *file; struct filter_list *filter_item; struct event_filter *filter = NULL; - struct filter_list *tmp; - LIST_HEAD(filter_list); + struct filter_head *filter_list; bool fail = true; int err; + filter_list = kmalloc_obj(*filter_list); + if (!filter_list) + return -ENOMEM; + + INIT_LIST_HEAD(&filter_list->list); + list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; - filter = kzalloc(sizeof(*filter), GFP_KERNEL); + filter = kzalloc_obj(*filter); if (!filter) goto fail_mem; @@ -2171,11 +2289,11 @@ static int process_system_preds(struct trace_subsystem_dir *dir, event_set_filtered_flag(file); - filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + filter_item = kzalloc_obj(*filter_item); if (!filter_item) goto fail_mem; - list_add_tail(&filter_item->list, &filter_list); + list_add_tail(&filter_item->list, &filter_list->list); /* * Regardless of if this returned an error, we still * replace the filter for the call. @@ -2195,31 +2313,22 @@ static int process_system_preds(struct trace_subsystem_dir *dir, * Do a synchronize_rcu() and to ensure all calls are * done with them before we free them. */ - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + delay_free_filter(filter_list); return 0; fail: /* No call succeeded */ - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - list_del(&filter_item->list); - kfree(filter_item); - } + free_filter_list(filter_list); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: __free_filter(filter); + /* If any call succeeded, we still need to sync */ if (!fail) - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + delay_free_filter(filter_list); + else + free_filter_list(filter_list); + return -ENOMEM; } @@ -2234,14 +2343,14 @@ static int create_filter_start(char *filter_string, bool set_str, if (WARN_ON_ONCE(*pse || *filterp)) return -EINVAL; - filter = kzalloc(sizeof(*filter), GFP_KERNEL); + filter = kzalloc_obj(*filter); if (filter && set_str) { filter->filter_string = kstrdup(filter_string, GFP_KERNEL); if (!filter->filter_string) err = -ENOMEM; } - pe = kzalloc(sizeof(*pe), GFP_KERNEL); + pe = kzalloc_obj(*pe); if (!filter || !pe || err) { kfree(pe); @@ -2361,9 +2470,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_clear_filter(file); - /* Make sure the filter is not being used */ - tracepoint_synchronize_unregister(); - __free_filter(filter); + try_delay_free_filter(filter); return 0; } @@ -2387,11 +2494,8 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_set_filter(file, filter); - if (tmp) { - /* Make sure the call is done with the filter */ - tracepoint_synchronize_unregister(); - __free_filter(tmp); - } + if (tmp) + try_delay_free_filter(tmp); } return err; @@ -2405,13 +2509,11 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, struct event_filter *filter = NULL; int err = 0; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); /* Make sure the system still has events */ - if (!dir->nr_events) { - err = -ENODEV; - goto out_unlock; - } + if (!dir->nr_events) + return -ENODEV; if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(dir, tr); @@ -2419,10 +2521,8 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ - tracepoint_synchronize_unregister(); - filter_free_subsystem_filters(dir, tr); - __free_filter(filter); - goto out_unlock; + filter_free_subsystem_filters(dir, tr, filter); + return 0; } err = create_system_filter(dir, filter_string, &filter); @@ -2434,8 +2534,6 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, __free_filter(system->filter); system->filter = filter; } -out_unlock: - mutex_unlock(&event_mutex); return err; } @@ -2612,17 +2710,15 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, struct event_filter *filter = NULL; struct trace_event_call *call; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); call = event->tp_event; - err = -EINVAL; if (!call) - goto out_unlock; + return -EINVAL; - err = -EEXIST; if (event->filter) - goto out_unlock; + return -EEXIST; err = create_filter(NULL, call, filter_str, false, &filter); if (err) @@ -2637,9 +2733,6 @@ free_filter: if (err || ftrace_event_is_function(call)) __free_filter(filter); -out_unlock: - mutex_unlock(&event_mutex); - return err; } @@ -2819,6 +2912,10 @@ static __init int ftrace_test_event_filter(void) if (i == DATA_CNT) printk(KERN_CONT "OK\n"); + /* Need to call ftrace_test_filter to prevent a warning */ + if (!trace_ftrace_test_filter_enabled()) + trace_ftrace_test_filter(1, 2, 3, 4, 5, 6, 7, 8); + return 0; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9c058aa8baf3..eb2c2bc8bc3d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -105,37 +105,44 @@ enum field_op_id { FIELD_OP_MULT, }; +#define FIELD_FUNCS \ + C(NOP, "nop"), \ + C(VAR_REF, "var_ref"), \ + C(COUNTER, "counter"), \ + C(CONST, "const"), \ + C(LOG2, "log2"), \ + C(BUCKET, "bucket"), \ + C(TIMESTAMP, "timestamp"), \ + C(CPU, "cpu"), \ + C(COMM, "comm"), \ + C(STRING, "string"), \ + C(DYNSTRING, "dynstring"), \ + C(RELDYNSTRING, "reldynstring"), \ + C(PSTRING, "pstring"), \ + C(S64, "s64"), \ + C(U64, "u64"), \ + C(S32, "s32"), \ + C(U32, "u32"), \ + C(S16, "s16"), \ + C(U16, "u16"), \ + C(S8, "s8"), \ + C(U8, "u8"), \ + C(UMINUS, "uminus"), \ + C(MINUS, "minus"), \ + C(PLUS, "plus"), \ + C(DIV, "div"), \ + C(MULT, "mult"), \ + C(DIV_POWER2, "div_power2"), \ + C(DIV_NOT_POWER2, "div_not_power2"), \ + C(DIV_MULT_SHIFT, "div_mult_shift"), \ + C(EXECNAME, "execname"), \ + C(STACK, "stack"), + +#undef C +#define C(a, b) HIST_FIELD_FN_##a + enum hist_field_fn { - HIST_FIELD_FN_NOP, - HIST_FIELD_FN_VAR_REF, - HIST_FIELD_FN_COUNTER, - HIST_FIELD_FN_CONST, - HIST_FIELD_FN_LOG2, - HIST_FIELD_FN_BUCKET, - HIST_FIELD_FN_TIMESTAMP, - HIST_FIELD_FN_CPU, - HIST_FIELD_FN_STRING, - HIST_FIELD_FN_DYNSTRING, - HIST_FIELD_FN_RELDYNSTRING, - HIST_FIELD_FN_PSTRING, - HIST_FIELD_FN_S64, - HIST_FIELD_FN_U64, - HIST_FIELD_FN_S32, - HIST_FIELD_FN_U32, - HIST_FIELD_FN_S16, - HIST_FIELD_FN_U16, - HIST_FIELD_FN_S8, - HIST_FIELD_FN_U8, - HIST_FIELD_FN_UMINUS, - HIST_FIELD_FN_MINUS, - HIST_FIELD_FN_PLUS, - HIST_FIELD_FN_DIV, - HIST_FIELD_FN_MULT, - HIST_FIELD_FN_DIV_POWER2, - HIST_FIELD_FN_DIV_NOT_POWER2, - HIST_FIELD_FN_DIV_MULT_SHIFT, - HIST_FIELD_FN_EXECNAME, - HIST_FIELD_FN_STACK, + FIELD_FUNCS }; /* @@ -506,6 +513,7 @@ enum hist_field_flags { HIST_FIELD_FL_CONST = 1 << 18, HIST_FIELD_FL_PERCENT = 1 << 19, HIST_FIELD_FL_GRAPH = 1 << 20, + HIST_FIELD_FL_COMM = 1 << 21, }; struct var_defs { @@ -724,7 +732,7 @@ static struct track_data *track_data_alloc(unsigned int key_len, struct action_data *action_data, struct hist_trigger_data *hist_data) { - struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL); + struct track_data *data = kzalloc_obj(*data); struct hist_elt_data *elt_data; if (!data) @@ -740,7 +748,7 @@ static struct track_data *track_data_alloc(unsigned int key_len, data->action_data = action_data; data->hist_data = hist_data; - elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + elt_data = kzalloc_obj(*elt_data); if (!elt_data) { track_data_free(data); return ERR_PTR(-ENOMEM); @@ -885,6 +893,15 @@ static u64 hist_field_cpu(struct hist_field *hist_field, return cpu; } +static u64 hist_field_comm(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + return (u64)(unsigned long)current->comm; +} + /** * check_field_for_var_ref - Check if a VAR_REF field references a variable * @hist_field: The VAR_REF field to check @@ -1069,7 +1086,7 @@ static int save_hist_vars(struct hist_trigger_data *hist_data) if (tracing_check_open_get_tr(tr)) return -ENODEV; - var_data = kzalloc(sizeof(*var_data), GFP_KERNEL); + var_data = kzalloc_obj(*var_data); if (!var_data) { trace_array_put(tr); return -ENOMEM; @@ -1338,17 +1355,22 @@ static const char *hist_field_name(struct hist_field *field, field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_CPU) field_name = "common_cpu"; + else if (field->flags & HIST_FIELD_FL_COMM) + field_name = "common_comm"; else if (field->flags & HIST_FIELD_FL_EXPR || field->flags & HIST_FIELD_FL_VAR_REF) { if (field->system) { static char full_name[MAX_FILTER_STR_VAL]; + static char *fmt; + int len; - strcat(full_name, field->system); - strcat(full_name, "."); - strcat(full_name, field->event_name); - strcat(full_name, "."); - strcat(full_name, field->name); - field_name = full_name; + fmt = field->flags & HIST_FIELD_FL_VAR_REF ? "%s.%s.$%s" : "%s.%s.%s"; + + len = snprintf(full_name, sizeof(full_name), fmt, + field->system, field->event_name, + field->name); + if (len < sizeof(full_name)) + field_name = full_name; } else field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) @@ -1529,7 +1551,7 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) struct hist_trigger_attrs *attrs; int ret = 0; - attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); + attrs = kzalloc_obj(*attrs); if (!attrs) return ERR_PTR(-ENOMEM); @@ -1627,7 +1649,7 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) struct hist_field *hist_field; unsigned int i, n_str; - elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + elt_data = kzalloc_obj(*elt_data); if (!elt_data) return -ENOMEM; @@ -1721,9 +1743,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) static void expr_field_str(struct hist_field *field, char *expr) { - if (field->flags & HIST_FIELD_FL_VAR_REF) - strcat(expr, "$"); - else if (field->flags & HIST_FIELD_FL_CONST) { + if (field->flags & HIST_FIELD_FL_VAR_REF) { + if (!field->system) + strcat(expr, "$"); + } else if (field->flags & HIST_FIELD_FL_CONST) { char str[HIST_CONST_DIGITS_MAX]; snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant); @@ -1943,7 +1966,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (field && is_function_field(field)) return NULL; - hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL); + hist_field = kzalloc_obj(struct hist_field); if (!hist_field) return NULL; @@ -2015,6 +2038,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } + if (flags & HIST_FIELD_FL_COMM) { + hist_field->fn_num = HIST_FIELD_FN_COMM; + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = "char[]"; + goto out; + } + if (WARN_ON_ONCE(!field)) goto out; @@ -2037,6 +2067,15 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING; else hist_field->fn_num = HIST_FIELD_FN_PSTRING; + } else if (field->filter_type == FILTER_STACKTRACE) { + flags |= HIST_FIELD_FL_STACKTRACE; + + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = kstrdup_const(field->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + + hist_field->fn_num = HIST_FIELD_FN_STACK; } else { hist_field->size = field->size; hist_field->is_signed = field->is_signed; @@ -2359,9 +2398,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->attrs->ts_in_usecs = true; } else if (strcmp(field_name, "common_stacktrace") == 0) { *flags |= HIST_FIELD_FL_STACKTRACE; - } else if (strcmp(field_name, "common_cpu") == 0) + } else if (strcmp(field_name, "common_cpu") == 0) { *flags |= HIST_FIELD_FL_CPU; - else if (strcmp(field_name, "hitcount") == 0) + } else if (strcmp(field_name, "common_comm") == 0) { + *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING; + } else if (strcmp(field_name, "hitcount") == 0) *flags |= HIST_FIELD_FL_HITCOUNT; else { field = trace_find_event_field(file->event_call, field_name); @@ -2377,6 +2418,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_CPU; } else if (field && field->filter_type == FILTER_STACKTRACE) { *flags |= HIST_FIELD_FL_STACKTRACE; + } else if (field && field->filter_type == FILTER_COMM) { + *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); @@ -3010,7 +3053,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (!IS_ERR_OR_NULL(event_var)) return event_var; - var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL); + var_hist = kzalloc_obj(*var_hist); if (!var_hist) return ERR_PTR(-ENOMEM); @@ -3124,7 +3167,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, u64 var_val; /* Make sure stacktrace can fit in the string variable length */ - BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) >= STR_VAR_LEN_MAX); + BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) > STR_VAR_LEN_MAX); for (i = 0, j = field_var_str_start; i < n_field_vars; i++) { struct field_var *field_var = field_vars[i]; @@ -3192,7 +3235,7 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data, goto out; } - var = kzalloc(sizeof(struct hist_field), GFP_KERNEL); + var = kzalloc_obj(struct hist_field); if (!var) { var = ERR_PTR(-ENOMEM); goto out; @@ -3248,14 +3291,16 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); - kfree(val); + destroy_hist_field(val, 0); ret = PTR_ERR(var); goto err; } - field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL); + field_var = kzalloc_obj(struct field_var); if (!field_var) { - kfree(val); + destroy_hist_field(val, 0); + kfree_const(var->type); + kfree(var->var.name); kfree(var); ret = -ENOMEM; goto err; @@ -3790,7 +3835,7 @@ static struct action_data *track_data_parse(struct hist_trigger_data *hist_data, int ret = -EINVAL; char *var_str; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc_obj(*data); if (!data) return ERR_PTR(-ENOMEM); @@ -4157,7 +4202,7 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) struct action_data *data; int ret = -EINVAL; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc_obj(*data); if (!data) return ERR_PTR(-ENOMEM); @@ -4327,6 +4372,8 @@ static u64 hist_fn_call(struct hist_field *hist_field, return hist_field_timestamp(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_CPU: return hist_field_cpu(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_COMM: + return hist_field_comm(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_STRING: return hist_field_string(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_DYNSTRING: @@ -5093,7 +5140,7 @@ create_hist_data(unsigned int map_bits, struct hist_trigger_data *hist_data; int ret = 0; - hist_data = kzalloc(sizeof(*hist_data), GFP_KERNEL); + hist_data = kzalloc_obj(*hist_data); if (!hist_data) return ERR_PTR(-ENOMEM); @@ -5212,22 +5259,25 @@ static inline void add_to_key(char *compound_key, void *key, size_t size = key_field->size; if (key_field->flags & HIST_FIELD_FL_STRING) { - struct ftrace_event_field *field; - field = key_field->field; - if (field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_RDYN_STRING) - size = *(u32 *)(rec + field->offset) >> 16; - else if (field->filter_type == FILTER_STATIC_STRING) - size = field->size; + if (key_field->flags & HIST_FIELD_FL_COMM) { + size = strlen((char *)key); + } else { + struct ftrace_event_field *field; + + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; + else if (field->filter_type == FILTER_STATIC_STRING) + size = field->size; + } /* ensure NULL-termination */ if (size > key_field->size - 1) size = key_field->size - 1; - - strncpy(compound_key + key_field->offset, (char *)key, size); - } else - memcpy(compound_key + key_field->offset, key, size); + } + memcpy(compound_key + key_field->offset, key, size); } static void @@ -5246,17 +5296,94 @@ hist_trigger_actions(struct hist_trigger_data *hist_data, } } +/* + * The hist_pad structure is used to save information to create + * a histogram from the histogram trigger. It's too big to store + * on the stack, so when the histogram trigger is initialized + * a percpu array of 4 hist_pad structures is allocated. + * This will cover every context from normal, softirq, irq and NMI + * in the very unlikely event that a trigger happens at each of + * these contexts and interrupts a currently active trigger. + */ +struct hist_pad { + unsigned long entries[HIST_STACKTRACE_DEPTH]; + u64 var_ref_vals[TRACING_MAP_VARS_MAX]; + char compound_key[HIST_KEY_SIZE_MAX]; +}; + +static struct hist_pad __percpu *hist_pads; +static DEFINE_PER_CPU(int, hist_pad_cnt); +static refcount_t hist_pad_ref; + +/* One hist_pad for every context (normal, softirq, irq, NMI) */ +#define MAX_HIST_CNT 4 + +static int alloc_hist_pad(void) +{ + lockdep_assert_held(&event_mutex); + + if (refcount_read(&hist_pad_ref)) { + refcount_inc(&hist_pad_ref); + return 0; + } + + hist_pads = __alloc_percpu(sizeof(struct hist_pad) * MAX_HIST_CNT, + __alignof__(struct hist_pad)); + if (!hist_pads) + return -ENOMEM; + + refcount_set(&hist_pad_ref, 1); + return 0; +} + +static void free_hist_pad(void) +{ + lockdep_assert_held(&event_mutex); + + if (!refcount_dec_and_test(&hist_pad_ref)) + return; + + free_percpu(hist_pads); + hist_pads = NULL; +} + +static struct hist_pad *get_hist_pad(void) +{ + struct hist_pad *hist_pad; + int cnt; + + if (WARN_ON_ONCE(!hist_pads)) + return NULL; + + preempt_disable(); + + hist_pad = per_cpu_ptr(hist_pads, smp_processor_id()); + + if (this_cpu_read(hist_pad_cnt) == MAX_HIST_CNT) { + preempt_enable(); + return NULL; + } + + cnt = this_cpu_inc_return(hist_pad_cnt) - 1; + + return &hist_pad[cnt]; +} + +static void put_hist_pad(void) +{ + this_cpu_dec(hist_pad_cnt); + preempt_enable(); +} + static void event_hist_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; bool use_compound_key = (hist_data->n_keys > 1); - unsigned long entries[HIST_STACKTRACE_DEPTH]; - u64 var_ref_vals[TRACING_MAP_VARS_MAX]; - char compound_key[HIST_KEY_SIZE_MAX]; struct tracing_map_elt *elt = NULL; struct hist_field *key_field; + struct hist_pad *hist_pad; u64 field_contents; void *key = NULL; unsigned int i; @@ -5264,12 +5391,18 @@ static void event_hist_trigger(struct event_trigger_data *data, if (unlikely(!rbe)) return; - memset(compound_key, 0, hist_data->key_size); + hist_pad = get_hist_pad(); + if (!hist_pad) + return; + + memset(hist_pad->compound_key, 0, hist_data->key_size); for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + unsigned long *entries = hist_pad->entries; + memset(entries, 0, HIST_STACKTRACE_SIZE); if (key_field->field) { unsigned long *stack, n_entries; @@ -5293,24 +5426,31 @@ static void event_hist_trigger(struct event_trigger_data *data, } if (use_compound_key) - add_to_key(compound_key, key, key_field, rec); + add_to_key(hist_pad->compound_key, key, key_field, rec); } if (use_compound_key) - key = compound_key; + key = hist_pad->compound_key; if (hist_data->n_var_refs && - !resolve_var_refs(hist_data, key, var_ref_vals, false)) - return; + !resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, false)) + goto out; elt = tracing_map_insert(hist_data->map, key); if (!elt) - return; + goto out; - hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals); + hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, hist_pad->var_ref_vals); + + if (resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, true)) { + hist_trigger_actions(hist_data, elt, buffer, rec, rbe, + key, hist_pad->var_ref_vals); + } - if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); + hist_poll_wakeup(); + + out: + put_hist_pad(); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5538,8 +5678,7 @@ static int print_entries(struct seq_file *m, (HIST_FIELD_FL_PERCENT | HIST_FIELD_FL_GRAPH))) continue; if (!stats) { - stats = kcalloc(hist_data->n_vals, sizeof(*stats), - GFP_KERNEL); + stats = kzalloc_objs(*stats, hist_data->n_vals); if (!stats) { n_entries = -ENOMEM; goto out; @@ -5575,7 +5714,7 @@ static void hist_trigger_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data); + data->cmd_ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -5590,52 +5729,144 @@ static void hist_trigger_show(struct seq_file *m, n_entries, (u64)atomic64_read(&hist_data->map->drops)); } +struct hist_file_data { + struct file *file; + u64 last_read; + u64 last_act; +}; + +static u64 get_hist_hit_count(struct trace_event_file *event_file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *data; + u64 ret = 0; + + list_for_each_entry(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = data->private_data; + ret += atomic64_read(&hist_data->map->hits); + } + } + return ret; +} + static int hist_show(struct seq_file *m, void *v) { + struct hist_file_data *hist_file = m->private; struct event_trigger_data *data; struct trace_event_file *event_file; - int n = 0, ret = 0; + int n = 0; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); - event_file = event_file_file(m->private); - if (unlikely(!event_file)) { - ret = -ENODEV; - goto out_unlock; - } + event_file = event_file_file(hist_file->file); + if (unlikely(!event_file)) + return -ENODEV; list_for_each_entry(data, &event_file->triggers, list) { if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_show(m, data, n++); } + hist_file->last_read = get_hist_hit_count(event_file); + /* + * Update last_act too so that poll()/POLLPRI can wait for the next + * event after any syscall on hist file. + */ + hist_file->last_act = hist_file->last_read; + + return 0; +} - out_unlock: - mutex_unlock(&event_mutex); +static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wait) +{ + struct trace_event_file *event_file; + struct seq_file *m = file->private_data; + struct hist_file_data *hist_file = m->private; + __poll_t ret = 0; + u64 cnt; + + guard(mutex)(&event_mutex); + + event_file = event_file_file(file); + if (!event_file) + return EPOLLERR; + + hist_poll_wait(file, wait); + + cnt = get_hist_hit_count(event_file); + if (hist_file->last_read != cnt) + ret |= EPOLLIN | EPOLLRDNORM; + if (hist_file->last_act != cnt) { + hist_file->last_act = cnt; + ret |= EPOLLPRI; + } return ret; } +static int event_hist_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct hist_file_data *hist_file = m->private; + + kfree(hist_file); + return tracing_single_release_file_tr(inode, file); +} + static int event_hist_open(struct inode *inode, struct file *file) { + struct trace_event_file *event_file; + struct hist_file_data *hist_file; int ret; ret = tracing_open_file_tr(inode, file); if (ret) return ret; - /* Clear private_data to avoid warning in single_open() */ - file->private_data = NULL; - return single_open(file, hist_show, file); + guard(mutex)(&event_mutex); + + event_file = event_file_file(file); + if (!event_file) { + ret = -ENODEV; + goto err; + } + + hist_file = kzalloc_obj(*hist_file); + if (!hist_file) { + ret = -ENOMEM; + goto err; + } + + hist_file->file = file; + hist_file->last_act = get_hist_hit_count(event_file); + + ret = single_open(file, hist_show, hist_file); + if (ret) { + kfree(hist_file); + goto err; + } + + return 0; +err: + tracing_release_file_tr(inode, file); + return ret; } const struct file_operations event_hist_fops = { .open = event_hist_open, .read = seq_read, .llseek = seq_lseek, - .release = tracing_single_release_file_tr, + .release = event_hist_release, + .poll = event_hist_poll, }; #ifdef CONFIG_HIST_TRIGGERS_DEBUG + +#undef C +#define C(a, b) b + +static const char * const field_funcs[] = { FIELD_FUNCS }; + static void hist_field_debug_show_flags(struct seq_file *m, unsigned long flags) { @@ -5700,6 +5931,7 @@ static int hist_field_debug_show(struct seq_file *m, seq_printf(m, " type: %s\n", field->type); seq_printf(m, " size: %u\n", field->size); seq_printf(m, " is_signed: %u\n", field->is_signed); + seq_printf(m, " function: hist_field_%s()\n", field_funcs[field->fn_num]); return 0; } @@ -5809,7 +6041,7 @@ static void hist_trigger_debug_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data); + data->cmd_ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -5873,25 +6105,19 @@ static int hist_debug_show(struct seq_file *m, void *v) { struct event_trigger_data *data; struct trace_event_file *event_file; - int n = 0, ret = 0; + int n = 0; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); event_file = event_file_file(m->private); - if (unlikely(!event_file)) { - ret = -ENODEV; - goto out_unlock; - } + if (unlikely(!event_file)) + return -ENODEV; list_for_each_entry(data, &event_file->triggers, list) { if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_debug_show(m, data, n++); } - - out_unlock: - mutex_unlock(&event_mutex); - - return ret; + return 0; } static int event_hist_debug_open(struct inode *inode, struct file *file) @@ -5902,9 +6128,10 @@ static int event_hist_debug_open(struct inode *inode, struct file *file) if (ret) return ret; - /* Clear private_data to avoid warning in single_open() */ - file->private_data = NULL; - return single_open(file, hist_debug_show, file); + ret = single_open(file, hist_debug_show, file); + if (ret) + tracing_release_file_tr(inode, file); + return ret; } const struct file_operations event_hist_debug_fops = { @@ -5924,12 +6151,15 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_CPU) seq_puts(m, "common_cpu"); + if (hist_field->flags & HIST_FIELD_FL_COMM) + seq_puts(m, "common_comm"); else if (hist_field->flags & HIST_FIELD_FL_CONST) seq_printf(m, "%llu", hist_field->constant); else if (field_name) { if (hist_field->flags & HIST_FIELD_FL_VAR_REF || hist_field->flags & HIST_FIELD_FL_ALIAS) - seq_putc(m, '$'); + if (!hist_field->system) + seq_putc(m, '$'); seq_printf(m, "%s", field_name); } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) seq_puts(m, "common_timestamp"); @@ -6070,6 +6300,9 @@ static int event_hist_trigger_init(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; + if (alloc_hist_pad() < 0) + return -ENOMEM; + if (!data->ref && hist_data->attrs->name) save_named_trigger(hist_data->attrs->name, data); @@ -6114,24 +6347,24 @@ static void event_hist_trigger_free(struct event_trigger_data *data) destroy_hist_data(hist_data); } + free_hist_pad(); } -static struct event_trigger_ops event_hist_trigger_ops = { - .trigger = event_hist_trigger, - .print = event_hist_trigger_print, - .init = event_hist_trigger_init, - .free = event_hist_trigger_free, -}; - static int event_hist_trigger_named_init(struct event_trigger_data *data) { + int ret; + data->ref++; save_named_trigger(data->named_data->name, data); - event_hist_trigger_init(data->named_data); + ret = event_hist_trigger_init(data->named_data); + if (ret < 0) { + kfree(data->cmd_ops); + data->cmd_ops = &trigger_hist_cmd; + } - return 0; + return ret; } static void event_hist_trigger_named_free(struct event_trigger_data *data) @@ -6143,24 +6376,14 @@ static void event_hist_trigger_named_free(struct event_trigger_data *data) data->ref--; if (!data->ref) { + struct event_command *cmd_ops = data->cmd_ops; + del_named_trigger(data); trigger_data_free(data); + kfree(cmd_ops); } } -static struct event_trigger_ops event_hist_trigger_named_ops = { - .trigger = event_hist_trigger, - .print = event_hist_trigger_print, - .init = event_hist_trigger_named_init, - .free = event_hist_trigger_named_free, -}; - -static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, - char *param) -{ - return &event_hist_trigger_ops; -} - static void hist_clear(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -6308,6 +6531,26 @@ static bool existing_hist_update_only(char *glob, return updated; } +/* + * Set or disable using the per CPU trace_buffer_event when possible. + */ +static int tracing_set_filter_buffering(struct trace_array *tr, bool set) +{ + guard(mutex)(&trace_types_lock); + + if (set && tr->no_filter_buffering_ref++) + return 0; + + if (!set) { + if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) + return -EINVAL; + + --tr->no_filter_buffering_ref; + } + + return 0; +} + static int hist_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) @@ -6354,13 +6597,24 @@ static int hist_register_trigger(char *glob, data->paused = true; if (named_data) { + struct event_command *cmd_ops; + data->private_data = named_data->private_data; set_named_trigger_data(data, named_data); - data->ops = &event_hist_trigger_named_ops; + /* Copy the command ops and update some of the functions */ + cmd_ops = kmalloc_obj(*cmd_ops); + if (!cmd_ops) { + ret = -ENOMEM; + goto out; + } + *cmd_ops = *data->cmd_ops; + cmd_ops->init = event_hist_trigger_named_init; + cmd_ops->free = event_hist_trigger_named_free; + data->cmd_ops = cmd_ops; } - if (data->ops->init) { - ret = data->ops->init(data); + if (data->cmd_ops->init) { + ret = data->cmd_ops->init(data); if (ret < 0) goto out; } @@ -6474,8 +6728,8 @@ static void hist_unregister_trigger(char *glob, } } - if (test && test->ops->free) - test->ops->free(test); + if (test && test->cmd_ops->free) + test->cmd_ops->free(test); if (hist_data->enable_timestamps) { if (!hist_data->remove || test) @@ -6527,8 +6781,8 @@ static void hist_unreg_all(struct trace_event_file *file) update_cond_flag(file); if (hist_data->enable_timestamps) tracing_set_filter_buffering(file->tr, false); - if (test->ops->free) - test->ops->free(test); + if (test->cmd_ops->free) + test->cmd_ops->free(test); } } } @@ -6618,7 +6872,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, return PTR_ERR(hist_data); } - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, hist_data); if (!trigger_data) { ret = -ENOMEM; goto out_free; @@ -6649,27 +6903,27 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (existing_hist_update_only(glob, trigger_data, file)) goto out_free; - ret = event_trigger_register(cmd_ops, file, glob, trigger_data); - if (ret < 0) - goto out_free; + if (!get_named_trigger_data(trigger_data)) { - if (get_named_trigger_data(trigger_data)) - goto enable; + ret = create_actions(hist_data); + if (ret) + goto out_free; - ret = create_actions(hist_data); - if (ret) - goto out_unreg; + if (has_hist_vars(hist_data) || hist_data->n_var_refs) { + ret = save_hist_vars(hist_data); + if (ret) + goto out_free; + } - if (has_hist_vars(hist_data) || hist_data->n_var_refs) { - ret = save_hist_vars(hist_data); + ret = tracing_map_init(hist_data->map); if (ret) - goto out_unreg; + goto out_free; } - ret = tracing_map_init(hist_data->map); - if (ret) - goto out_unreg; -enable: + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret < 0) + goto out_free; + ret = hist_trigger_enable(trigger_data, file); if (ret) goto out_unreg; @@ -6686,11 +6940,9 @@ enable: out_unreg: event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); out_free: - event_trigger_reset_filter(cmd_ops, trigger_data); - remove_hist_vars(hist_data); - kfree(trigger_data); + trigger_data_free(trigger_data); destroy_hist_data(hist_data); goto out; @@ -6704,8 +6956,11 @@ static struct event_command trigger_hist_cmd = { .reg = hist_register_trigger, .unreg = hist_unregister_trigger, .unreg_all = hist_unreg_all, - .get_trigger_ops = event_hist_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_hist_trigger_init, + .free = event_hist_trigger_free, }; __init int register_trigger_hist_cmd(void) @@ -6737,66 +6992,6 @@ hist_enable_trigger(struct event_trigger_data *data, } } -static void -hist_enable_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) -{ - if (!data->count) - return; - - if (data->count != -1) - (data->count)--; - - hist_enable_trigger(data, buffer, rec, event); -} - -static struct event_trigger_ops hist_enable_trigger_ops = { - .trigger = hist_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static struct event_trigger_ops hist_enable_count_trigger_ops = { - .trigger = hist_enable_count_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static struct event_trigger_ops hist_disable_trigger_ops = { - .trigger = hist_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static struct event_trigger_ops hist_disable_count_trigger_ops = { - .trigger = hist_enable_count_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static struct event_trigger_ops * -hist_enable_get_trigger_ops(char *cmd, char *param) -{ - struct event_trigger_ops *ops; - bool enable; - - enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); - - if (enable) - ops = param ? &hist_enable_count_trigger_ops : - &hist_enable_trigger_ops; - else - ops = param ? &hist_disable_count_trigger_ops : - &hist_disable_trigger_ops; - - return ops; -} - static void hist_enable_unreg_all(struct trace_event_file *file) { struct event_trigger_data *test, *n; @@ -6806,8 +7001,8 @@ static void hist_enable_unreg_all(struct trace_event_file *file) list_del_rcu(&test->list); update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); - if (test->ops->free) - test->ops->free(test); + if (test->cmd_ops->free) + test->cmd_ops->free(test); } } } @@ -6819,8 +7014,12 @@ static struct event_command trigger_hist_enable_cmd = { .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, - .get_trigger_ops = hist_enable_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = hist_enable_trigger, + .count_func = event_trigger_count, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static struct event_command trigger_hist_disable_cmd = { @@ -6830,8 +7029,12 @@ static struct event_command trigger_hist_disable_cmd = { .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, - .get_trigger_ops = hist_enable_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = hist_enable_trigger, + .count_func = event_trigger_count, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static __init void unregister_trigger_hist_enable_disable_cmds(void) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index c82b401a294d..39ac4eba0702 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -49,16 +49,11 @@ static char *last_cmd; static int errpos(const char *str) { - int ret = 0; - - mutex_lock(&lastcmd_mutex); + guard(mutex)(&lastcmd_mutex); if (!str || !last_cmd) - goto out; + return 0; - ret = err_pos(last_cmd, str); - out: - mutex_unlock(&lastcmd_mutex); - return ret; + return err_pos(last_cmd, str); } static void last_cmd_set(const char *str) @@ -74,14 +69,12 @@ static void last_cmd_set(const char *str) static void synth_err(u8 err_type, u16 err_pos) { - mutex_lock(&lastcmd_mutex); + guard(mutex)(&lastcmd_mutex); if (!last_cmd) - goto out; + return; tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, err_type, err_pos); - out: - mutex_unlock(&lastcmd_mutex); } static int create_synth_event(const char *raw_command); @@ -137,7 +130,9 @@ static int synth_event_define_fields(struct trace_event_call *call) struct synth_event *event = call->data; unsigned int i, size, n_u64; char *name, *type; + int filter_type; bool is_signed; + bool is_stack; int ret = 0; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { @@ -145,8 +140,12 @@ static int synth_event_define_fields(struct trace_event_call *call) is_signed = event->fields[i]->is_signed; type = event->fields[i]->type; name = event->fields[i]->name; + is_stack = event->fields[i]->is_stack; + + filter_type = is_stack ? FILTER_STACKTRACE : FILTER_OTHER; + ret = trace_define_field(call, type, name, offset, size, - is_signed, FILTER_OTHER); + is_signed, filter_type); if (ret) break; @@ -214,7 +213,7 @@ static int synth_field_string_size(char *type) if (len == 0) return 0; /* variable-length string */ - strncpy(buf, start, len); + memcpy(buf, start, len); buf[len] = '\0'; err = kstrtouint(buf, 0, &size); @@ -312,7 +311,7 @@ static const char *synth_field_fmt(char *type) else if (strcmp(type, "gfp_t") == 0) fmt = "%x"; else if (synth_field_is_string(type)) - fmt = "%.*s"; + fmt = "%s"; else if (synth_field_is_stack(type)) fmt = "%s"; @@ -366,7 +365,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, fmt = synth_field_fmt(se->fields[i]->type); /* parameter types */ - if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + if (tr && tr->trace_flags & TRACE_ITER(VERBOSE)) trace_seq_printf(s, "%s ", fmt); snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); @@ -377,13 +376,11 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, union trace_synth_field *data = &entry->fields[n_u64]; trace_seq_printf(s, print_fmt, se->fields[i]->name, - STR_VAR_LEN_MAX, (char *)entry + data->as_dynamic.offset, i == se->n_fields - 1 ? "" : " "); n_u64++; } else { trace_seq_printf(s, print_fmt, se->fields[i]->name, - STR_VAR_LEN_MAX, (char *)&entry->fields[n_u64].as_u64, i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); @@ -398,7 +395,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, n_u64++; } else { struct trace_print_flags __flags[] = { - __def_gfpflag_names, {-1, NULL} }; + __def_gfpflag_names }; char *space = (i == se->n_fields - 1 ? "" : " "); print_synth_event_num_val(s, print_fmt, @@ -411,7 +408,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, trace_seq_puts(s, " ("); trace_print_flags_seq(s, "|", entry->fields[n_u64].as_u64, - __flags); + __flags, ARRAY_SIZE(__flags)); trace_seq_putc(s, ')'); } n_u64++; @@ -502,9 +499,9 @@ static unsigned int trace_stack(struct synth_trace_event *entry, return len; } -static notrace void trace_event_raw_event_synth(void *__data, - u64 *var_ref_vals, - unsigned int *var_ref_idx) +static void trace_event_raw_event_synth(void *__data, + u64 *var_ref_vals, + unsigned int *var_ref_idx) { unsigned int i, n_u64, val_idx, len, data_size = 0; struct trace_event_file *trace_file = __data; @@ -544,12 +541,12 @@ static notrace void trace_event_raw_event_synth(void *__data, * is being performed within another event. */ buffer = trace_file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + fields_size); if (!entry) - goto out; + return; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { val_idx = var_ref_idx[i]; @@ -592,8 +589,6 @@ static notrace void trace_event_raw_event_synth(void *__data, } trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); } static void free_synth_event_print_fmt(struct trace_event_call *call) @@ -619,7 +614,7 @@ static int __set_synth_event_print_fmt(struct synth_event *event, fmt = synth_field_fmt(event->fields[i]->type); pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", event->fields[i]->name, fmt, - i == event->n_fields - 1 ? "" : ", "); + i == event->n_fields - 1 ? "" : " "); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); @@ -716,7 +711,7 @@ static struct synth_field *parse_synth_field(int argc, char **argv, *field_version = check_field_version(prefix, field_type, field_name); - field = kzalloc(sizeof(*field), GFP_KERNEL); + field = kzalloc_obj(*field); if (!field) return ERR_PTR(-ENOMEM); @@ -824,7 +819,7 @@ static struct tracepoint *alloc_synth_tracepoint(char *name) { struct tracepoint *tp; - tp = kzalloc(sizeof(*tp), GFP_KERNEL); + tp = kzalloc_obj(*tp); if (!tp) return ERR_PTR(-ENOMEM); @@ -859,6 +854,38 @@ static struct trace_event_fields synth_event_fields_array[] = { {} }; +static int synth_event_reg(struct trace_event_call *call, + enum trace_reg type, void *data) +{ + struct synth_event *event = container_of(call, struct synth_event, call); + + switch (type) { +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: +#endif + case TRACE_REG_REGISTER: + if (!try_module_get(event->mod)) + return -EBUSY; + break; + default: + break; + } + + int ret = trace_event_reg(call, type, data); + + switch (type) { +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_UNREGISTER: +#endif + case TRACE_REG_UNREGISTER: + module_put(event->mod); + break; + default: + break; + } + return ret; +} + static int register_synth_event(struct synth_event *event) { struct trace_event_call *call = &event->call; @@ -888,7 +915,7 @@ static int register_synth_event(struct synth_event *event) goto out; } call->flags = TRACE_EVENT_FL_TRACEPOINT; - call->class->reg = trace_event_reg; + call->class->reg = synth_event_reg; call->class->probe = trace_event_raw_event_synth; call->data = event; call->tp = event->tp; @@ -946,7 +973,7 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields, unsigned int i, j, n_dynamic_fields = 0; struct synth_event *event; - event = kzalloc(sizeof(*event), GFP_KERNEL); + event = kzalloc_obj(*event); if (!event) { event = ERR_PTR(-ENOMEM); goto out; @@ -959,7 +986,7 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields, goto out; } - event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); + event->fields = kzalloc_objs(*event->fields, n_fields); if (!event->fields) { free_synth_event(event); event = ERR_PTR(-ENOMEM); @@ -971,9 +998,8 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields, n_dynamic_fields++; if (n_dynamic_fields) { - event->dynamic_fields = kcalloc(n_dynamic_fields, - sizeof(*event->dynamic_fields), - GFP_KERNEL); + event->dynamic_fields = kzalloc_objs(*event->dynamic_fields, + n_dynamic_fields); if (!event->dynamic_fields) { free_synth_event(event); event = ERR_PTR(-ENOMEM); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a5e3d6acf1e1..655db2e82513 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -6,6 +6,7 @@ */ #include <linux/security.h> +#include <linux/kthread.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> @@ -17,15 +18,133 @@ static LIST_HEAD(trigger_commands); static DEFINE_MUTEX(trigger_cmd_mutex); +static struct task_struct *trigger_kthread; +static struct llist_head trigger_data_free_list; +static DEFINE_MUTEX(trigger_data_kthread_mutex); + +static int trigger_kthread_fn(void *ignore); + +static void trigger_create_kthread_locked(void) +{ + lockdep_assert_held(&trigger_data_kthread_mutex); + + if (!trigger_kthread) { + struct task_struct *kthread; + + kthread = kthread_create(trigger_kthread_fn, NULL, + "trigger_data_free"); + if (!IS_ERR(kthread)) + WRITE_ONCE(trigger_kthread, kthread); + } +} + +static void trigger_data_free_queued_locked(void) +{ + struct event_trigger_data *data, *tmp; + struct llist_node *llnodes; + + lockdep_assert_held(&trigger_data_kthread_mutex); + + llnodes = llist_del_all(&trigger_data_free_list); + if (!llnodes) + return; + + tracepoint_synchronize_unregister(); + + llist_for_each_entry_safe(data, tmp, llnodes, llist) + kfree(data); +} + +/* Bulk garbage collection of event_trigger_data elements */ +static int trigger_kthread_fn(void *ignore) +{ + struct event_trigger_data *data, *tmp; + struct llist_node *llnodes; + + /* Once this task starts, it lives forever */ + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (llist_empty(&trigger_data_free_list)) + schedule(); + + __set_current_state(TASK_RUNNING); + + llnodes = llist_del_all(&trigger_data_free_list); + + /* make sure current triggers exit before free */ + tracepoint_synchronize_unregister(); + + llist_for_each_entry_safe(data, tmp, llnodes, llist) + kfree(data); + } + + return 0; +} + void trigger_data_free(struct event_trigger_data *data) { + if (!data) + return; + if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); - /* make sure current triggers exit before free */ - tracepoint_synchronize_unregister(); + /* + * Boot-time trigger registration can fail before kthread creation + * works. Keep the deferred-free semantics during boot and let late + * init start the kthread to drain the list. + */ + if (system_state == SYSTEM_BOOTING && !trigger_kthread) { + llist_add(&data->llist, &trigger_data_free_list); + return; + } + + if (unlikely(!trigger_kthread)) { + guard(mutex)(&trigger_data_kthread_mutex); - kfree(data); + trigger_create_kthread_locked(); + /* Check again after taking mutex */ + if (!trigger_kthread) { + llist_add(&data->llist, &trigger_data_free_list); + /* Drain the queued frees synchronously if creation failed. */ + trigger_data_free_queued_locked(); + return; + } + } + + llist_add(&data->llist, &trigger_data_free_list); + wake_up_process(trigger_kthread); +} + +static int __init trigger_data_free_init(void) +{ + guard(mutex)(&trigger_data_kthread_mutex); + + if (llist_empty(&trigger_data_free_list)) + return 0; + + trigger_create_kthread_locked(); + if (trigger_kthread) + wake_up_process(trigger_kthread); + else + trigger_data_free_queued_locked(); + + return 0; +} +late_initcall(trigger_data_free_init); + +static inline void data_ops_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + const struct event_command *cmd_ops = data->cmd_ops; + + if (data->flags & EVENT_TRIGGER_FL_COUNT) { + if (!cmd_ops->count_func(data, buffer, rec, event)) + return; + } + + cmd_ops->trigger(data, buffer, rec, event); } /** @@ -70,7 +189,7 @@ event_triggers_call(struct trace_event_file *file, if (data->paused) continue; if (!rec) { - data->ops->trigger(data, buffer, rec, event); + data_ops_trigger(data, buffer, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -80,7 +199,7 @@ event_triggers_call(struct trace_event_file *file, tt |= data->cmd_ops->trigger_type; continue; } - data->ops->trigger(data, buffer, rec, event); + data_ops_trigger(data, buffer, rec, event); } return tt; } @@ -122,7 +241,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->trigger(data, NULL, NULL, NULL); + data_ops_trigger(data, NULL, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -191,7 +310,7 @@ static int trigger_show(struct seq_file *m, void *v) } data = list_entry(v, struct event_trigger_data, list); - data->ops->print(m, data); + data->cmd_ops->print(m, data); return 0; } @@ -211,12 +330,10 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) if (ret) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); - if (unlikely(!event_file_file(file))) { - mutex_unlock(&event_mutex); + if (unlikely(!event_file_file(file))) return -ENODEV; - } if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { @@ -239,8 +356,6 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) } } - mutex_unlock(&event_mutex); - return ret; } @@ -248,9 +363,9 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) { char *command, *next; struct event_command *p; - int ret = -EINVAL; - next = buff = skip_spaces(buff); + next = buff = strim(buff); + command = strsep(&next, ": \t"); if (next) { next = skip_spaces(next); @@ -259,17 +374,14 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) } command = (command[0] != '!') ? command : command + 1; - mutex_lock(&trigger_cmd_mutex); + guard(mutex)(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { - if (strcmp(p->name, command) == 0) { - ret = p->parse(p, file, buff, command, next); - goto out_unlock; - } + if (strcmp(p->name, command) == 0) + return p->parse(p, file, buff, command, next); } - out_unlock: - mutex_unlock(&trigger_cmd_mutex); - return ret; + return -EINVAL; } static ssize_t event_trigger_regex_write(struct file *file, @@ -278,7 +390,7 @@ static ssize_t event_trigger_regex_write(struct file *file, { struct trace_event_file *event_file; ssize_t ret; - char *buf; + char *buf __free(kfree) = NULL; if (!cnt) return 0; @@ -290,37 +402,25 @@ static ssize_t event_trigger_regex_write(struct file *file, if (IS_ERR(buf)) return PTR_ERR(buf); - strim(buf); + guard(mutex)(&event_mutex); - mutex_lock(&event_mutex); event_file = event_file_file(file); - if (unlikely(!event_file)) { - mutex_unlock(&event_mutex); - kfree(buf); + if (unlikely(!event_file)) return -ENODEV; - } - ret = trigger_process_regex(event_file, buf); - mutex_unlock(&event_mutex); - kfree(buf); + ret = trigger_process_regex(event_file, buf); if (ret < 0) - goto out; + return ret; *ppos += cnt; - ret = cnt; - out: - return ret; + return cnt; } static int event_trigger_regex_release(struct inode *inode, struct file *file) { - mutex_lock(&event_mutex); - if (file->f_mode & FMODE_READ) seq_release(inode, file); - mutex_unlock(&event_mutex); - return 0; } @@ -359,20 +459,16 @@ const struct file_operations event_trigger_fops = { __init int register_event_command(struct event_command *cmd) { struct event_command *p; - int ret = 0; - mutex_lock(&trigger_cmd_mutex); + guard(mutex)(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { - if (strcmp(cmd->name, p->name) == 0) { - ret = -EBUSY; - goto out_unlock; - } + if (strcmp(cmd->name, p->name) == 0) + return -EBUSY; } list_add(&cmd->list, &trigger_commands); - out_unlock: - mutex_unlock(&trigger_cmd_mutex); - return ret; + return 0; } /* @@ -382,24 +478,51 @@ __init int register_event_command(struct event_command *cmd) __init int unregister_event_command(struct event_command *cmd) { struct event_command *p, *n; - int ret = -ENODEV; - mutex_lock(&trigger_cmd_mutex); + guard(mutex)(&trigger_cmd_mutex); + list_for_each_entry_safe(p, n, &trigger_commands, list) { if (strcmp(cmd->name, p->name) == 0) { - ret = 0; list_del_init(&p->list); - goto out_unlock; + return 0; } } - out_unlock: - mutex_unlock(&trigger_cmd_mutex); - return ret; + return -ENODEV; +} + +/** + * event_trigger_count - Optional count function for event triggers + * @data: Trigger-specific data + * @buffer: The ring buffer that the event is being written to + * @rec: The trace entry for the event, NULL for unconditional invocation + * @event: The event meta data in the ring buffer + * + * For triggers that can take a count parameter that doesn't do anything + * special, they can use this function to assign to their .count_func + * field. + * + * This simply does a count down of the @data->count field. + * + * If the @data->count is greater than zero, it will decrement it. + * + * Returns false if @data->count is zero, otherwise true. + */ +bool event_trigger_count(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + if (!data->count) + return false; + + if (data->count != -1) + (data->count)--; + + return true; } /** - * event_trigger_print - Generic event_trigger_ops @print implementation + * event_trigger_print - Generic event_command @print implementation * @name: The name of the event trigger * @m: The seq_file being printed to * @data: Trigger-specific data @@ -434,7 +557,7 @@ event_trigger_print(const char *name, struct seq_file *m, } /** - * event_trigger_init - Generic event_trigger_ops @init implementation + * event_trigger_init - Generic event_command @init implementation * @data: Trigger-specific data * * Common implementation of event trigger initialization. @@ -451,7 +574,7 @@ int event_trigger_init(struct event_trigger_data *data) } /** - * event_trigger_free - Generic event_trigger_ops @free implementation + * event_trigger_free - Generic event_command @free implementation * @data: Trigger-specific data * * Common implementation of event trigger de-initialization. @@ -513,8 +636,8 @@ clear_event_triggers(struct trace_array *tr) list_for_each_entry_safe(data, n, &file->triggers, list) { trace_event_trigger_enable_disable(file, 0); list_del_rcu(&data->list); - if (data->ops->free) - data->ops->free(data); + if (data->cmd_ops->free) + data->cmd_ops->free(data); } } } @@ -573,16 +696,14 @@ static int register_trigger(char *glob, lockdep_assert_held(&event_mutex); list_for_each_entry(test, &file->triggers, list) { - if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { - ret = -EEXIST; - goto out; - } + if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) + return -EEXIST; } - if (data->ops->init) { - ret = data->ops->init(data); + if (data->cmd_ops->init) { + ret = data->cmd_ops->init(data); if (ret < 0) - goto out; + return ret; } list_add_rcu(&data->list, &file->triggers); @@ -593,7 +714,6 @@ static int register_trigger(char *glob, list_del_rcu(&data->list); update_cond_flag(file); } -out: return ret; } @@ -619,8 +739,8 @@ static bool try_unregister_trigger(char *glob, } if (data) { - if (data->ops->free) - data->ops->free(data); + if (data->cmd_ops->free) + data->cmd_ops->free(data); return true; } @@ -668,7 +788,7 @@ static void unregister_trigger(char *glob, * param - text following cmd and ':' and stripped of filter * filter - the optional filter text following (and including) 'if' * - * To illustrate the use of these componenents, here are some concrete + * To illustrate the use of these components, here are some concrete * examples. For the following triggers: * * echo 'traceon:5 if pid == 0' > trigger @@ -791,7 +911,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, if (!param_and_filter) { if (param_required) ret = -EINVAL; - goto out; + return ret; } /* @@ -802,7 +922,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, */ if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) { *filter = param_and_filter; - goto out; + return ret; } /* @@ -820,44 +940,45 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, if (!**filter) *filter = NULL; } -out: return ret; } /** - * event_trigger_alloc - allocate and init event_trigger_data for a trigger + * trigger_data_alloc - allocate and init event_trigger_data for a trigger * @cmd_ops: The event_command operations for the trigger * @cmd: The cmd string * @param: The param string * @private_data: User data to associate with the event trigger * * Allocate an event_trigger_data instance and initialize it. The - * @cmd_ops are used along with the @cmd and @param to get the - * trigger_ops to assign to the event_trigger_data. @private_data can - * also be passed in and associated with the event_trigger_data. + * @cmd_ops defines how the trigger will operate. If @param is set, + * and @cmd_ops->trigger_ops->count_func is non NULL, then the + * data->count is set to @param and before the trigger is executed, the + * @cmd_ops->trigger_ops->count_func() is called. If that function returns + * false, the @cmd_ops->trigger_ops->trigger() function will not be called. + * @private_data can also be passed in and associated with the + * event_trigger_data. * - * Use event_trigger_free() to free an event_trigger_data object. + * Use trigger_data_free() to free an event_trigger_data object. * * Return: The trigger_data object success, NULL otherwise */ -struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, - char *cmd, - char *param, - void *private_data) +struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data) { struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; - - trigger_ops = cmd_ops->get_trigger_ops(cmd, param); - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + trigger_data = kzalloc_obj(*trigger_data); if (!trigger_data) return NULL; trigger_data->count = -1; - trigger_data->ops = trigger_ops; trigger_data->cmd_ops = cmd_ops; trigger_data->private_data = private_data; + if (param && cmd_ops->count_func) + trigger_data->flags |= EVENT_TRIGGER_FL_COUNT; INIT_LIST_HEAD(&trigger_data->list); INIT_LIST_HEAD(&trigger_data->named_list); @@ -1010,15 +1131,14 @@ event_trigger_parse(struct event_command *cmd_ops, return ret; ret = -ENOMEM; - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, file); if (!trigger_data) - goto out; + return ret; if (remove) { event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); - kfree(trigger_data); - ret = 0; - goto out; + trigger_data_free(trigger_data); + return 0; } ret = event_trigger_parse_num(param, trigger_data); @@ -1038,13 +1158,12 @@ event_trigger_parse(struct event_command *cmd_ops, /* Down the counter of trigger_data or free it if not used anymore */ event_trigger_free(trigger_data); - out: return ret; out_free: event_trigger_reset_filter(cmd_ops, trigger_data); - kfree(trigger_data); - goto out; + trigger_data_free(trigger_data); + return ret; } /** @@ -1078,10 +1197,10 @@ int set_trigger_filter(char *filter_str, s = strsep(&filter_str, " \t"); if (!strlen(s) || strcmp(s, "if") != 0) - goto out; + return ret; if (!filter_str) - goto out; + return ret; /* The filter is for the 'trigger' event, not the triggered event */ ret = create_event_filter(file->tr, file->event_call, @@ -1125,7 +1244,6 @@ int set_trigger_filter(char *filter_str, ret = -ENOMEM; } } - out: return ret; } @@ -1285,45 +1403,35 @@ traceon_trigger(struct event_trigger_data *data, { struct trace_event_file *file = data->private_data; - if (file) { - if (tracer_tracing_is_on(file->tr)) - return; - - tracer_tracing_on(file->tr); + if (WARN_ON_ONCE(!file)) return; - } - if (tracing_is_on()) + if (tracer_tracing_is_on(file->tr)) return; - tracing_on(); + tracer_tracing_on(file->tr); } -static void -traceon_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) +static bool +traceon_count_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) { struct trace_event_file *file = data->private_data; - if (file) { - if (tracer_tracing_is_on(file->tr)) - return; - } else { - if (tracing_is_on()) - return; - } + if (WARN_ON_ONCE(!file)) + return false; + + if (tracer_tracing_is_on(file->tr)) + return false; if (!data->count) - return; + return false; if (data->count != -1) (data->count)--; - if (file) - tracer_tracing_on(file->tr); - else - tracing_on(); + return true; } static void @@ -1333,45 +1441,35 @@ traceoff_trigger(struct event_trigger_data *data, { struct trace_event_file *file = data->private_data; - if (file) { - if (!tracer_tracing_is_on(file->tr)) - return; - - tracer_tracing_off(file->tr); + if (WARN_ON_ONCE(!file)) return; - } - if (!tracing_is_on()) + if (!tracer_tracing_is_on(file->tr)) return; - tracing_off(); + tracer_tracing_off(file->tr); } -static void -traceoff_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) +static bool +traceoff_count_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) { struct trace_event_file *file = data->private_data; - if (file) { - if (!tracer_tracing_is_on(file->tr)) - return; - } else { - if (!tracing_is_on()) - return; - } + if (WARN_ON_ONCE(!file)) + return false; + + if (!tracer_tracing_is_on(file->tr)) + return false; if (!data->count) - return; + return false; if (data->count != -1) (data->count)--; - if (file) - tracer_tracing_off(file->tr); - else - tracing_off(); + return true; } static int @@ -1388,58 +1486,18 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops traceon_trigger_ops = { - .trigger = traceon_trigger, - .print = traceon_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static struct event_trigger_ops traceon_count_trigger_ops = { - .trigger = traceon_count_trigger, - .print = traceon_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static struct event_trigger_ops traceoff_trigger_ops = { - .trigger = traceoff_trigger, - .print = traceoff_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static struct event_trigger_ops traceoff_count_trigger_ops = { - .trigger = traceoff_count_trigger, - .print = traceoff_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static struct event_trigger_ops * -onoff_get_trigger_ops(char *cmd, char *param) -{ - struct event_trigger_ops *ops; - - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = param ? &traceon_count_trigger_ops : - &traceon_trigger_ops; - else - ops = param ? &traceoff_count_trigger_ops : - &traceoff_trigger_ops; - - return ops; -} - static struct event_command trigger_traceon_cmd = { .name = "traceon", .trigger_type = ETT_TRACE_ONOFF, .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, - .get_trigger_ops = onoff_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = traceon_trigger, + .count_func = traceon_count_func, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; static struct event_command trigger_traceoff_cmd = { @@ -1449,8 +1507,12 @@ static struct event_command trigger_traceoff_cmd = { .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, - .get_trigger_ops = onoff_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = traceoff_trigger, + .count_func = traceoff_count_func, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; #ifdef CONFIG_TRACER_SNAPSHOT @@ -1461,24 +1523,10 @@ snapshot_trigger(struct event_trigger_data *data, { struct trace_event_file *file = data->private_data; - if (file) - tracing_snapshot_instance(file->tr); - else - tracing_snapshot(); -} - -static void -snapshot_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) -{ - if (!data->count) + if (WARN_ON_ONCE(!file)) return; - if (data->count != -1) - (data->count)--; - - snapshot_trigger(data, buffer, rec, event); + tracing_snapshot_instance(file->tr); } static int @@ -1512,34 +1560,18 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops snapshot_trigger_ops = { - .trigger = snapshot_trigger, - .print = snapshot_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static struct event_trigger_ops snapshot_count_trigger_ops = { - .trigger = snapshot_count_trigger, - .print = snapshot_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static struct event_trigger_ops * -snapshot_get_trigger_ops(char *cmd, char *param) -{ - return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops; -} - static struct event_command trigger_snapshot_cmd = { .name = "snapshot", .trigger_type = ETT_SNAPSHOT, .parse = event_trigger_parse, .reg = register_snapshot_trigger, .unreg = unregister_snapshot_trigger, - .get_trigger_ops = snapshot_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = snapshot_trigger, + .count_func = event_trigger_count, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; static __init int register_trigger_snapshot_cmd(void) @@ -1580,24 +1612,10 @@ stacktrace_trigger(struct event_trigger_data *data, { struct trace_event_file *file = data->private_data; - if (file) - __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); - else - trace_dump_stack(STACK_SKIP); -} - -static void -stacktrace_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) -{ - if (!data->count) + if (WARN_ON_ONCE(!file)) return; - if (data->count != -1) - (data->count)--; - - stacktrace_trigger(data, buffer, rec, event); + __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP); } static int @@ -1607,26 +1625,6 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops stacktrace_trigger_ops = { - .trigger = stacktrace_trigger, - .print = stacktrace_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static struct event_trigger_ops stacktrace_count_trigger_ops = { - .trigger = stacktrace_count_trigger, - .print = stacktrace_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static struct event_trigger_ops * -stacktrace_get_trigger_ops(char *cmd, char *param) -{ - return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops; -} - static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, @@ -1634,8 +1632,12 @@ static struct event_command trigger_stacktrace_cmd = { .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, - .get_trigger_ops = stacktrace_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = stacktrace_trigger, + .count_func = event_trigger_count, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; static __init int register_trigger_stacktrace_cmd(void) @@ -1670,24 +1672,24 @@ event_enable_trigger(struct event_trigger_data *data, set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); } -static void -event_enable_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) +static bool +event_enable_count_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; if (!data->count) - return; + return false; /* Skip if the event is in a state we want to switch to */ if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) - return; + return false; if (data->count != -1) (data->count)--; - event_enable_trigger(data, buffer, rec, event); + return true; } int event_enable_trigger_print(struct seq_file *m, @@ -1732,34 +1734,6 @@ void event_enable_trigger_free(struct event_trigger_data *data) } } -static struct event_trigger_ops event_enable_trigger_ops = { - .trigger = event_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static struct event_trigger_ops event_enable_count_trigger_ops = { - .trigger = event_enable_count_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static struct event_trigger_ops event_disable_trigger_ops = { - .trigger = event_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static struct event_trigger_ops event_disable_count_trigger_ops = { - .trigger = event_enable_count_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param_and_filter) @@ -1793,7 +1767,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, ret = -EINVAL; event_enable_file = find_event_file(tr, system, event); if (!event_enable_file) - goto out; + return ret; #ifdef CONFIG_HIST_TRIGGERS hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) || @@ -1806,18 +1780,18 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, #endif ret = -ENOMEM; - enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); + enable_data = kzalloc_obj(*enable_data); if (!enable_data) - goto out; + return ret; enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, enable_data); if (!trigger_data) { kfree(enable_data); - goto out; + return ret; } if (remove) { @@ -1825,7 +1799,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, kfree(trigger_data); kfree(enable_data); ret = 0; - goto out; + return ret; } /* Up the trigger_data count to make sure nothing frees it on failure */ @@ -1855,7 +1829,6 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, goto out_disable; event_trigger_free(trigger_data); - out: return ret; out_disable: trace_event_enable_disable(event_enable_file, 0, 1); @@ -1866,7 +1839,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, event_trigger_free(trigger_data); kfree(enable_data); - goto out; + return ret; } int event_enable_register_trigger(char *glob, @@ -1886,15 +1859,14 @@ int event_enable_register_trigger(char *glob, (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) && (test_enable_data->file == enable_data->file)) { - ret = -EEXIST; - goto out; + return -EEXIST; } } - if (data->ops->init) { - ret = data->ops->init(data); + if (data->cmd_ops->init) { + ret = data->cmd_ops->init(data); if (ret < 0) - goto out; + return ret; } list_add_rcu(&data->list, &file->triggers); @@ -1905,7 +1877,6 @@ int event_enable_register_trigger(char *glob, list_del_rcu(&data->list); update_cond_flag(file); } -out: return ret; } @@ -1933,30 +1904,8 @@ void event_enable_unregister_trigger(char *glob, } } - if (data && data->ops->free) - data->ops->free(data); -} - -static struct event_trigger_ops * -event_enable_get_trigger_ops(char *cmd, char *param) -{ - struct event_trigger_ops *ops; - bool enable; - -#ifdef CONFIG_HIST_TRIGGERS - enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || - (strcmp(cmd, ENABLE_HIST_STR) == 0)); -#else - enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; -#endif - if (enable) - ops = param ? &event_enable_count_trigger_ops : - &event_enable_trigger_ops; - else - ops = param ? &event_disable_count_trigger_ops : - &event_disable_trigger_ops; - - return ops; + if (data && data->cmd_ops->free) + data->cmd_ops->free(data); } static struct event_command trigger_enable_cmd = { @@ -1965,8 +1914,12 @@ static struct event_command trigger_enable_cmd = { .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, - .get_trigger_ops = event_enable_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = event_enable_trigger, + .count_func = event_enable_count_func, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static struct event_command trigger_disable_cmd = { @@ -1975,8 +1928,12 @@ static struct event_command trigger_disable_cmd = { .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, - .get_trigger_ops = event_enable_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = event_enable_trigger, + .count_func = event_enable_count_func, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static __init void unregister_trigger_enable_disable_cmds(void) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 17bcad8f79de..c4ba484f7b38 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -370,7 +370,7 @@ static struct user_event_group *user_event_group_create(void) { struct user_event_group *group; - group = kzalloc(sizeof(*group), GFP_KERNEL); + group = kzalloc_obj(*group); if (!group) return NULL; @@ -455,7 +455,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) if (ret && ret != -ENOENT) { struct user_event *user = enabler->event; - pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n", + pr_warn("user_events: Fault for mm: 0x%p @ 0x%llx event: %s\n", mm->mm, (unsigned long long)uaddr, EVENT_NAME(user)); } @@ -496,7 +496,7 @@ static bool user_event_enabler_queue_fault(struct user_event_mm *mm, { struct user_event_enabler_fault *fault; - fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN); + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT); if (!fault) return false; @@ -637,7 +637,7 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, if (unlikely(test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(orig)))) return true; - enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT | __GFP_ACCOUNT); + enabler = kzalloc_obj(*enabler, GFP_NOWAIT | __GFP_ACCOUNT); if (!enabler) return false; @@ -706,7 +706,7 @@ static struct user_event_mm *user_event_mm_alloc(struct task_struct *t) { struct user_event_mm *user_mm; - user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT); + user_mm = kzalloc_obj(*user_mm, GFP_KERNEL_ACCOUNT); if (!user_mm) return NULL; @@ -835,7 +835,7 @@ void user_event_mm_remove(struct task_struct *t) * so we use a work queue after call_rcu() to run within. */ INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put); - queue_rcu_work(system_wq, &mm->put_rwork); + queue_rcu_work(system_percpu_wq, &mm->put_rwork); } void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) @@ -892,7 +892,7 @@ static struct user_event_enabler if (!user_mm) return NULL; - enabler = kzalloc(sizeof(*enabler), GFP_KERNEL_ACCOUNT); + enabler = kzalloc_obj(*enabler, GFP_KERNEL_ACCOUNT); if (!enabler) goto out; @@ -1041,7 +1041,7 @@ static int user_field_array_size(const char *type) static int user_field_size(const char *type) { - /* long is not allowed from a user, since it's ambigious in size */ + /* long is not allowed from a user, since it's ambiguous in size */ if (strcmp(type, "s64") == 0) return sizeof(s64); if (strcmp(type, "u64") == 0) @@ -1079,7 +1079,7 @@ static int user_field_size(const char *type) if (str_has_prefix(type, "__rel_loc ")) return sizeof(u32); - /* Uknown basic type, error */ + /* Unknown basic type, error */ return -EINVAL; } @@ -1113,7 +1113,7 @@ static int user_event_add_field(struct user_event *user, const char *type, struct ftrace_event_field *field; int validator_flags = 0; - field = kmalloc(sizeof(*field), GFP_KERNEL_ACCOUNT); + field = kmalloc_obj(*field, GFP_KERNEL_ACCOUNT); if (!field) return -ENOMEM; @@ -1132,7 +1132,7 @@ add_validator: if (strstr(type, "char") != NULL) validator_flags |= VALIDATOR_ENSURE_NULL; - validator = kmalloc(sizeof(*validator), GFP_KERNEL_ACCOUNT); + validator = kmalloc_obj(*validator, GFP_KERNEL_ACCOUNT); if (!validator) { kfree(field); @@ -1449,12 +1449,7 @@ static struct trace_event_functions user_event_funcs = { static int user_event_set_call_visible(struct user_event *user, bool visible) { - int ret; - const struct cred *old_cred; - struct cred *cred; - - cred = prepare_creds(); - + CLASS(prepare_creds, cred)(); if (!cred) return -ENOMEM; @@ -1469,17 +1464,12 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) */ cred->fsuid = GLOBAL_ROOT_UID; - old_cred = override_creds(cred); - - if (visible) - ret = trace_add_event_call(&user->call); - else - ret = trace_remove_event_call(&user->call); + scoped_with_creds(cred) { + if (visible) + return trace_add_event_call(&user->call); - revert_creds(old_cred); - put_cred(cred); - - return ret; + return trace_remove_event_call(&user->call); + } } static int destroy_user_event(struct user_event *user) @@ -2115,7 +2105,7 @@ static int user_event_parse(struct user_event_group *group, char *name, return 0; } - user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); + user = kzalloc_obj(*user, GFP_KERNEL_ACCOUNT); if (!user) return -ENOMEM; @@ -2325,7 +2315,7 @@ static int user_events_open(struct inode *node, struct file *file) if (!group) return -ENOENT; - info = kzalloc(sizeof(*info), GFP_KERNEL_ACCOUNT); + info = kzalloc_obj(*info, GFP_KERNEL_ACCOUNT); if (!info) return -ENOMEM; @@ -2475,7 +2465,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, /* * Prevent users from using the same address and bit multiple times * within the same mm address space. This can cause unexpected behavior - * for user processes that is far easier to debug if this is explictly + * for user processes that is far easier to debug if this is explicitly * an error upon registering. */ if (current_user_event_enabler_exists((unsigned long)reg.enable_addr, @@ -2793,11 +2783,8 @@ static int user_seq_show(struct seq_file *m, void *p) seq_printf(m, "%s", EVENT_TP_NAME(user)); - if (status != 0) - seq_puts(m, " #"); - if (status != 0) { - seq_puts(m, " Used by"); + seq_puts(m, " # Used by"); if (status & EVENT_STATUS_FTRACE) seq_puts(m, " ftrace"); if (status & EVENT_STATUS_PERF) @@ -2899,7 +2886,7 @@ static int set_max_user_events_sysctl(const struct ctl_table *table, int write, return ret; } -static struct ctl_table user_event_sysctls[] = { +static const struct ctl_table user_event_sysctls[] = { { .procname = "user_events_max", .data = &max_user_events, diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 1698fc22afa0..32a42ef31855 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -42,11 +42,14 @@ static int ftrace_event_register(struct trace_event_call *call, #undef __field_fn #define __field_fn(type, item) type item; +#undef __field_packed +#define __field_packed(type, item) type item; + #undef __field_desc #define __field_desc(type, container, item) type item; -#undef __field_packed -#define __field_packed(type, container, item) type item; +#undef __field_desc_packed +#define __field_desc_packed(type, container, item) type item; #undef __array #define __array(type, item, size) type item[size]; @@ -104,11 +107,14 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __field_fn #define __field_fn(_type, _item) __field_ext(_type, _item, FILTER_TRACE_FN) +#undef __field_packed +#define __field_packed(_type, _item) __field_ext_packed(_type, _item, FILTER_OTHER) + #undef __field_desc #define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER) -#undef __field_packed -#define __field_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER) +#undef __field_desc_packed +#define __field_desc_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER) #undef __array #define __array(_type, _item, _len) { \ @@ -146,11 +152,14 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \ #undef __field_fn #define __field_fn(type, item) +#undef __field_packed +#define __field_packed(type, item) + #undef __field_desc #define __field_desc(type, container, item) -#undef __field_packed -#define __field_packed(type, container, item) +#undef __field_desc_packed +#define __field_desc_packed(type, container, item) #undef __array #define __array(type, item, len) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index c62d1629cffe..9f5f08c0e7c2 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -4,15 +4,18 @@ * Copyright (C) 2022 Google LLC. */ #define pr_fmt(fmt) "trace_fprobe: " fmt -#include <asm/ptrace.h> #include <linux/fprobe.h> +#include <linux/list.h> #include <linux/module.h> +#include <linux/mutex.h> #include <linux/rculist.h> #include <linux/security.h> #include <linux/tracepoint.h> #include <linux/uaccess.h> +#include <asm/ptrace.h> + #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_kernel.h" @@ -21,7 +24,6 @@ #define FPROBE_EVENT_SYSTEM "fprobes" #define TRACEPOINT_EVENT_SYSTEM "tracepoints" #define RETHOOK_MAXACTIVE_MAX 4096 -#define TRACEPOINT_STUB ERR_PTR(-ENOENT) static int trace_fprobe_create(const char *raw_command); static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); @@ -38,15 +40,157 @@ static struct dyn_event_operations trace_fprobe_ops = { .match = trace_fprobe_match, }; +/* List of tracepoint_user */ +static LIST_HEAD(tracepoint_user_list); +static DEFINE_MUTEX(tracepoint_user_mutex); + +/* While living tracepoint_user, @tpoint can be NULL and @refcount != 0. */ +struct tracepoint_user { + struct list_head list; + const char *name; + struct tracepoint *tpoint; + unsigned int refcount; +}; + +/* NOTE: you must lock tracepoint_user_mutex. */ +#define for_each_tracepoint_user(tuser) \ + list_for_each_entry(tuser, &tracepoint_user_list, list) + +static int tracepoint_user_register(struct tracepoint_user *tuser) +{ + struct tracepoint *tpoint = tuser->tpoint; + + if (!tpoint) + return 0; + + return tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); +} + +static void tracepoint_user_unregister(struct tracepoint_user *tuser) +{ + if (!tuser->tpoint) + return; + + WARN_ON_ONCE(tracepoint_probe_unregister(tuser->tpoint, tuser->tpoint->probestub, NULL)); + tuser->tpoint = NULL; +} + +static unsigned long tracepoint_user_ip(struct tracepoint_user *tuser) +{ + if (!tuser->tpoint) + return 0UL; + + return (unsigned long)tuser->tpoint->probestub; +} + +static void __tracepoint_user_free(struct tracepoint_user *tuser) +{ + if (!tuser) + return; + kfree(tuser->name); + kfree(tuser); +} + +DEFINE_FREE(tuser_free, struct tracepoint_user *, __tracepoint_user_free(_T)) + +static struct tracepoint_user *__tracepoint_user_init(const char *name, struct tracepoint *tpoint) +{ + struct tracepoint_user *tuser __free(tuser_free) = NULL; + int ret; + + tuser = kzalloc_obj(*tuser); + if (!tuser) + return NULL; + tuser->name = kstrdup(name, GFP_KERNEL); + if (!tuser->name) + return NULL; + + /* Register tracepoint if it is loaded. */ + if (tpoint) { + tuser->tpoint = tpoint; + ret = tracepoint_user_register(tuser); + if (ret) + return ERR_PTR(ret); + } + + tuser->refcount = 1; + INIT_LIST_HEAD(&tuser->list); + list_add(&tuser->list, &tracepoint_user_list); + + return_ptr(tuser); +} + +static struct tracepoint *find_tracepoint(const char *tp_name, + struct module **tp_mod); + +/* + * Get tracepoint_user if exist, or allocate new one and register it. + * If tracepoint is on a module, get its refcounter too. + * This returns errno or NULL (not loaded yet) or tracepoint_user. + */ +static struct tracepoint_user *tracepoint_user_find_get(const char *name, struct module **pmod) +{ + struct module *mod __free(module_put) = NULL; + struct tracepoint_user *tuser; + struct tracepoint *tpoint; + + if (!name || !pmod) + return ERR_PTR(-EINVAL); + + /* Get and lock the module which has tracepoint. */ + tpoint = find_tracepoint(name, &mod); + + guard(mutex)(&tracepoint_user_mutex); + /* Search existing tracepoint_user */ + for_each_tracepoint_user(tuser) { + if (!strcmp(tuser->name, name)) { + tuser->refcount++; + *pmod = no_free_ptr(mod); + return tuser; + } + } + + /* The corresponding tracepoint_user is not found. */ + tuser = __tracepoint_user_init(name, tpoint); + if (!IS_ERR_OR_NULL(tuser)) + *pmod = no_free_ptr(mod); + + return tuser; +} + +static void tracepoint_user_put(struct tracepoint_user *tuser) +{ + scoped_guard(mutex, &tracepoint_user_mutex) { + if (--tuser->refcount > 0) + return; + + list_del(&tuser->list); + tracepoint_user_unregister(tuser); + } + + __tracepoint_user_free(tuser); +} + +DEFINE_FREE(tuser_put, struct tracepoint_user *, + if (!IS_ERR_OR_NULL(_T)) + tracepoint_user_put(_T)) + /* * Fprobe event core functions */ + +/* + * @tprobe is true for tracepoint probe. + * @tuser can be NULL if the trace_fprobe is disabled or the tracepoint is not + * loaded with a module. If @tuser != NULL, this trace_fprobe is enabled. + */ struct trace_fprobe { struct dyn_event devent; struct fprobe fp; const char *symbol; - struct tracepoint *tpoint; - struct module *mod; + bool tprobe; + struct tracepoint_user *tuser; struct trace_probe tp; }; @@ -76,7 +220,7 @@ static bool trace_fprobe_is_return(struct trace_fprobe *tf) static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf) { - return tf->tpoint != NULL; + return tf->tprobe; } static const char *trace_fprobe_symbol(struct trace_fprobe *tf) @@ -134,7 +278,7 @@ static int process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, void *dest, void *base) { - struct pt_regs *regs = rec; + struct ftrace_regs *fregs = rec; unsigned long val; int ret; @@ -142,17 +286,17 @@ retry: /* 1st stage: get value from context */ switch (code->op) { case FETCH_OP_STACK: - val = regs_get_kernel_stack_nth(regs, code->param); + val = ftrace_regs_get_kernel_stack_nth(fregs, code->param); break; case FETCH_OP_STACKP: - val = kernel_stack_pointer(regs); + val = ftrace_regs_get_stack_pointer(fregs); break; case FETCH_OP_RETVAL: - val = regs_return_value(regs); + val = ftrace_regs_get_return_value(fregs); break; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API case FETCH_OP_ARG: - val = regs_get_kernel_argument(regs, code->param); + val = ftrace_regs_get_argument(fregs, code->param); break; case FETCH_OP_EDATA: val = *(unsigned long *)((unsigned long)edata + code->offset); @@ -175,7 +319,7 @@ NOKPROBE_SYMBOL(process_fetch_insn) /* function entry handler */ static nokprobe_inline void __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - struct pt_regs *regs, + struct ftrace_regs *fregs, struct trace_event_file *trace_file) { struct fentry_trace_entry_head *entry; @@ -189,41 +333,71 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tf->tp, regs, NULL); + dsize = __get_data_size(&tf->tp, fregs, NULL); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tf->tp.size + dsize); if (!entry) return; - fbuffer.regs = regs; + fbuffer.regs = ftrace_get_regs(fregs); entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->ip = entry_ip; - store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } static void fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - struct pt_regs *regs) + struct ftrace_regs *fregs) { struct event_file_link *link; trace_probe_for_each_link_rcu(link, &tf->tp) - __fentry_trace_func(tf, entry_ip, regs, link->file); + __fentry_trace_func(tf, entry_ip, fregs, link->file); } NOKPROBE_SYMBOL(fentry_trace_func); +static nokprobe_inline +void store_fprobe_entry_data(void *edata, struct trace_probe *tp, struct ftrace_regs *fregs) +{ + struct probe_entry_arg *earg = tp->entry_arg; + unsigned long val = 0; + int i; + + if (!earg) + return; + + for (i = 0; i < earg->size; i++) { + struct fetch_insn *code = &earg->code[i]; + + switch (code->op) { + case FETCH_OP_ARG: + val = ftrace_regs_get_argument(fregs, code->param); + break; + case FETCH_OP_ST_EDATA: + *(unsigned long *)((unsigned long)edata + code->offset) = val; + break; + case FETCH_OP_END: + goto end; + default: + break; + } + } +end: + return; +} + /* function exit handler */ static int trace_fprobe_entry_handler(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); if (tf->tp.entry_arg) - store_trace_entry_data(entry_data, &tf->tp, regs); + store_fprobe_entry_data(entry_data, &tf->tp, fregs); return 0; } @@ -231,7 +405,7 @@ NOKPROBE_SYMBOL(trace_fprobe_entry_handler) static nokprobe_inline void __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data, struct trace_event_file *trace_file) { struct fexit_trace_entry_head *entry; @@ -245,60 +419,63 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tf->tp, regs, entry_data); + dsize = __get_data_size(&tf->tp, fregs, entry_data); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tf->tp.size + dsize); if (!entry) return; - fbuffer.regs = regs; + fbuffer.regs = ftrace_get_regs(fregs); entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = entry_ip; entry->ret_ip = ret_ip; - store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } static void fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, void *entry_data) + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct event_file_link *link; trace_probe_for_each_link_rcu(link, &tf->tp) - __fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data, link->file); + __fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data, link->file); } NOKPROBE_SYMBOL(fexit_trace_func); #ifdef CONFIG_PERF_EVENTS static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, - struct pt_regs *regs) + struct ftrace_regs *fregs) { struct trace_event_call *call = trace_probe_event_call(&tf->tp); struct fentry_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; + struct pt_regs *regs; int rctx; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return 0; - dsize = __get_data_size(&tf->tp, regs, NULL); + dsize = __get_data_size(&tf->tp, fregs, NULL); __size = sizeof(*entry) + tf->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_alloc(size, NULL, &rctx); + entry = perf_trace_buf_alloc(size, ®s, &rctx); if (!entry) return 0; + regs = ftrace_fill_perf_regs(fregs, regs); + entry->ip = entry_ip; memset(&entry[1], 0, dsize); - store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); return 0; @@ -307,31 +484,34 @@ NOKPROBE_SYMBOL(fentry_perf_func); static void fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_event_call *call = trace_probe_event_call(&tf->tp); struct fexit_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; + struct pt_regs *regs; int rctx; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; - dsize = __get_data_size(&tf->tp, regs, entry_data); + dsize = __get_data_size(&tf->tp, fregs, entry_data); __size = sizeof(*entry) + tf->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_alloc(size, NULL, &rctx); + entry = perf_trace_buf_alloc(size, ®s, &rctx); if (!entry) return; + regs = ftrace_fill_perf_regs(fregs, regs); + entry->func = entry_ip; entry->ret_ip = ret_ip; - store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); } @@ -339,33 +519,36 @@ NOKPROBE_SYMBOL(fexit_perf_func); #endif /* CONFIG_PERF_EVENTS */ static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + unsigned int flags = trace_probe_load_flag(&tf->tp); int ret = 0; - if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) - fentry_trace_func(tf, entry_ip, regs); + if (flags & TP_FLAG_TRACE) + fentry_trace_func(tf, entry_ip, fregs); + #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) - ret = fentry_perf_func(tf, entry_ip, regs); + if (flags & TP_FLAG_PROFILE) + ret = fentry_perf_func(tf, entry_ip, fregs); #endif return ret; } NOKPROBE_SYMBOL(fentry_dispatcher); static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + unsigned int flags = trace_probe_load_flag(&tf->tp); - if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) - fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data); + if (flags & TP_FLAG_TRACE) + fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) - fexit_perf_func(tf, entry_ip, ret_ip, regs, entry_data); + if (flags & TP_FLAG_PROFILE) + fexit_perf_func(tf, entry_ip, ret_ip, fregs, entry_data); #endif } NOKPROBE_SYMBOL(fexit_dispatcher); @@ -374,51 +557,49 @@ static void free_trace_fprobe(struct trace_fprobe *tf) { if (tf) { trace_probe_cleanup(&tf->tp); + if (tf->tuser) + tracepoint_user_put(tf->tuser); kfree(tf->symbol); kfree(tf); } } +/* Since alloc_trace_fprobe() can return error, check the pointer is ERR too. */ +DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) free_trace_fprobe(_T)) + /* * Allocate new trace_probe and initialize it (including fprobe). */ static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *event, const char *symbol, - struct tracepoint *tpoint, - struct module *mod, - int maxactive, - int nargs, bool is_return) + int nargs, bool is_return, + bool is_tracepoint) { - struct trace_fprobe *tf; + struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; int ret = -ENOMEM; - tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL); + tf = kzalloc_flex(*tf, tp.args, nargs); if (!tf) return ERR_PTR(ret); tf->symbol = kstrdup(symbol, GFP_KERNEL); if (!tf->symbol) - goto error; + return ERR_PTR(-ENOMEM); if (is_return) tf->fp.exit_handler = fexit_dispatcher; else tf->fp.entry_handler = fentry_dispatcher; - tf->tpoint = tpoint; - tf->mod = mod; - tf->fp.nr_maxactive = maxactive; + tf->tprobe = is_tracepoint; ret = trace_probe_init(&tf->tp, event, group, false, nargs); if (ret < 0) - goto error; + return ERR_PTR(ret); dyn_event_init(&tf->devent, &trace_fprobe_ops); - return tf; -error: - free_trace_fprobe(tf); - return ERR_PTR(ret); + return_ptr(tf); } static struct trace_fprobe *find_trace_fprobe(const char *event, @@ -434,98 +615,6 @@ static struct trace_fprobe *find_trace_fprobe(const char *event, return NULL; } -static inline int __enable_trace_fprobe(struct trace_fprobe *tf) -{ - if (trace_fprobe_is_registered(tf)) - enable_fprobe(&tf->fp); - - return 0; -} - -static void __disable_trace_fprobe(struct trace_probe *tp) -{ - struct trace_fprobe *tf; - - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - if (!trace_fprobe_is_registered(tf)) - continue; - disable_fprobe(&tf->fp); - } -} - -/* - * Enable trace_probe - * if the file is NULL, enable "perf" handler, or enable "trace" handler. - */ -static int enable_trace_fprobe(struct trace_event_call *call, - struct trace_event_file *file) -{ - struct trace_probe *tp; - struct trace_fprobe *tf; - bool enabled; - int ret = 0; - - tp = trace_probe_primary_from_call(call); - if (WARN_ON_ONCE(!tp)) - return -ENODEV; - enabled = trace_probe_is_enabled(tp); - - /* This also changes "enabled" state */ - if (file) { - ret = trace_probe_add_file(tp, file); - if (ret) - return ret; - } else - trace_probe_set_flag(tp, TP_FLAG_PROFILE); - - if (!enabled) { - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - /* TODO: check the fprobe is gone */ - __enable_trace_fprobe(tf); - } - } - - return 0; -} - -/* - * Disable trace_probe - * if the file is NULL, disable "perf" handler, or disable "trace" handler. - */ -static int disable_trace_fprobe(struct trace_event_call *call, - struct trace_event_file *file) -{ - struct trace_probe *tp; - - tp = trace_probe_primary_from_call(call); - if (WARN_ON_ONCE(!tp)) - return -ENODEV; - - if (file) { - if (!trace_probe_get_file_link(tp, file)) - return -ENOENT; - if (!trace_probe_has_single_file(tp)) - goto out; - trace_probe_clear_flag(tp, TP_FLAG_TRACE); - } else - trace_probe_clear_flag(tp, TP_FLAG_PROFILE); - - if (!trace_probe_is_enabled(tp)) - __disable_trace_fprobe(tp); - - out: - if (file) - /* - * Synchronization is done in below function. For perf event, - * file == NULL and perf_trace_event_unreg() calls - * tracepoint_synchronize_unregister() to ensure synchronize - * event. We don't need to care about it. - */ - trace_probe_remove_file(tp, file); - - return 0; -} - /* Event entry printers */ static enum print_line_t print_fentry_event(struct trace_iterator *iter, int flags, @@ -543,7 +632,7 @@ print_fentry_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ip, flags)) goto out; trace_seq_putc(s, ')'); @@ -573,12 +662,12 @@ print_fexit_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ret_ip, flags)) goto out; trace_seq_puts(s, " <- "); - if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_no_offset(s, field->func, flags)) goto out; trace_seq_putc(s, ')'); @@ -677,20 +766,52 @@ static int unregister_fprobe_event(struct trace_fprobe *tf) static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf) { - struct tracepoint *tpoint = tf->tpoint; - unsigned long ip = (unsigned long)tpoint->probestub; + struct tracepoint_user *tuser __free(tuser_put) = NULL; + struct module *mod __free(module_put) = NULL; + unsigned long ip; + int ret; + + if (WARN_ON_ONCE(tf->tuser)) + return -EINVAL; + + /* If the tracepoint is in a module, it must be locked in this function. */ + tuser = tracepoint_user_find_get(tf->symbol, &mod); + /* This tracepoint is not loaded yet */ + if (IS_ERR(tuser)) + return PTR_ERR(tuser); + if (!tuser) + return -ENOMEM; + + /* Register fprobe only if the tracepoint is loaded. */ + if (tuser->tpoint) { + ip = tracepoint_user_ip(tuser); + if (WARN_ON_ONCE(!ip)) + return -ENOENT; + + ret = register_fprobe_ips(&tf->fp, &ip, 1); + if (ret < 0) + return ret; + } + + tf->tuser = no_free_ptr(tuser); + return 0; +} + +/* Returns an error if the target function is not available, or 0 */ +static int trace_fprobe_verify_target(struct trace_fprobe *tf) +{ int ret; + /* Tracepoint should have a stub function. */ + if (trace_fprobe_is_tracepoint(tf)) + return 0; + /* - * Here, we do 2 steps to enable fprobe on a tracepoint. - * At first, put __probestub_##TP function on the tracepoint - * and put a fprobe on the stub function. + * Note: since we don't lock the module, even if this succeeded, + * register_fprobe() later can fail. */ - ret = tracepoint_probe_register_prio_may_exist(tpoint, - tpoint->probestub, NULL, 0); - if (ret < 0) - return ret; - return register_fprobe_ips(&tf->fp, &ip, 1); + ret = fprobe_count_ips_from_filter(tf->symbol, NULL); + return (ret < 0) ? ret : 0; } /* Internal register function - just handle fprobe and flags */ @@ -712,20 +833,10 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) return ret; } - /* Set/clear disabled flag according to tp->flag */ - if (trace_probe_is_enabled(&tf->tp)) - tf->fp.flags &= ~FPROBE_FL_DISABLED; - else - tf->fp.flags |= FPROBE_FL_DISABLED; - - if (trace_fprobe_is_tracepoint(tf)) { - - /* This tracepoint is not loaded yet */ - if (tf->tpoint == TRACEPOINT_STUB) - return 0; + tf->fp.flags &= ~FPROBE_FL_DISABLED; + if (trace_fprobe_is_tracepoint(tf)) return __regsiter_tracepoint_fprobe(tf); - } /* TODO: handle filter, nofilter or symbol list */ return register_fprobe(&tf->fp, tf->symbol, NULL); @@ -734,15 +845,11 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) /* Internal unregister function - just handle fprobe and flags */ static void __unregister_trace_fprobe(struct trace_fprobe *tf) { - if (trace_fprobe_is_registered(tf)) { + if (trace_fprobe_is_registered(tf)) unregister_fprobe(&tf->fp); - memset(&tf->fp, 0, sizeof(tf->fp)); - if (trace_fprobe_is_tracepoint(tf)) { - tracepoint_probe_unregister(tf->tpoint, - tf->tpoint->probestub, NULL); - tf->tpoint = NULL; - tf->mod = NULL; - } + if (tf->tuser) { + tracepoint_user_put(tf->tuser); + tf->tuser = NULL; } } @@ -802,7 +909,7 @@ static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig, return false; } -static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) +static int append_trace_fprobe_event(struct trace_fprobe *tf, struct trace_fprobe *to) { int ret; @@ -830,7 +937,7 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) if (ret) return ret; - ret = __register_trace_fprobe(tf); + ret = trace_fprobe_verify_target(tf); if (ret) trace_probe_unlink(&tf->tp); else @@ -839,20 +946,18 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) return ret; } -/* Register a trace_probe and probe_event */ -static int register_trace_fprobe(struct trace_fprobe *tf) +/* Register a trace_probe and probe_event, and check the fprobe is available. */ +static int register_trace_fprobe_event(struct trace_fprobe *tf) { struct trace_fprobe *old_tf; int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), trace_probe_group_name(&tf->tp)); - if (old_tf) { - ret = append_trace_fprobe(tf, old_tf); - goto end; - } + if (old_tf) + return append_trace_fprobe_event(tf, old_tf); /* Register new event */ ret = register_fprobe_event(tf); @@ -862,18 +967,16 @@ static int register_trace_fprobe(struct trace_fprobe *tf) trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); - goto end; + return ret; } - /* Register fprobe */ - ret = __register_trace_fprobe(tf); + /* Verify fprobe is sane. */ + ret = trace_fprobe_verify_target(tf); if (ret < 0) unregister_fprobe_event(tf); else dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); -end: - mutex_unlock(&event_mutex); return ret; } @@ -888,14 +991,15 @@ static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mo struct __find_tracepoint_cb_data *data = priv; if (!data->tpoint && !strcmp(data->tp_name, tp->name)) { - data->tpoint = tp; - if (!data->mod) { + /* If module is not specified, try getting module refcount. */ + if (!data->mod && mod) { + /* If failed to get refcount, ignore this tracepoint. */ + if (!try_module_get(mod)) + return; + data->mod = mod; - if (!try_module_get(data->mod)) { - data->tpoint = NULL; - data->mod = NULL; - } } + data->tpoint = tp; } } @@ -908,11 +1012,9 @@ static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) } /* - * Find a tracepoint from kernel and module. If the tracepoint is in a module, - * this increments the module refcount to prevent unloading until the - * trace_fprobe is registered to the list. After registering the trace_fprobe - * on the trace_fprobe list, the module refcount is decremented because - * tracepoint_probe_module_cb will handle it. + * Find a tracepoint from kernel and module. If the tracepoint is on the module, + * the module's refcount is incremented and returned as *@tp_mod. Thus, if it is + * not NULL, caller must call module_put(*tp_mod) after used the tracepoint. */ static struct tracepoint *find_tracepoint(const char *tp_name, struct module **tp_mod) @@ -933,15 +1035,10 @@ static struct tracepoint *find_tracepoint(const char *tp_name, } #ifdef CONFIG_MODULES -static void reenable_trace_fprobe(struct trace_fprobe *tf) -{ - struct trace_probe *tp = &tf->tp; - - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - __enable_trace_fprobe(tf); - } -} - +/* + * Find a tracepoint from specified module. In this case, this does not get the + * module's refcount. The caller must ensure the module is not freed. + */ static struct tracepoint *find_tracepoint_in_module(struct module *mod, const char *tp_name) { @@ -954,33 +1051,94 @@ static struct tracepoint *find_tracepoint_in_module(struct module *mod, return data.tpoint; } +/* These are CONFIG_MODULES=y specific functions. */ +static bool tracepoint_user_within_module(struct tracepoint_user *tuser, + struct module *mod) +{ + return within_module(tracepoint_user_ip(tuser), mod); +} + +static int tracepoint_user_register_again(struct tracepoint_user *tuser, + struct tracepoint *tpoint) +{ + tuser->tpoint = tpoint; + return tracepoint_user_register(tuser); +} + +static void tracepoint_user_unregister_clear(struct tracepoint_user *tuser) +{ + tracepoint_user_unregister(tuser); + tuser->tpoint = NULL; +} + +/* module callback for tracepoint_user */ static int __tracepoint_probe_module_cb(struct notifier_block *self, unsigned long val, void *data) { struct tp_module *tp_mod = data; + struct tracepoint_user *tuser; struct tracepoint *tpoint; + + if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) + return NOTIFY_DONE; + + mutex_lock(&tracepoint_user_mutex); + for_each_tracepoint_user(tuser) { + if (val == MODULE_STATE_COMING) { + /* This is not a tracepoint in this module. Skip it. */ + tpoint = find_tracepoint_in_module(tp_mod->mod, tuser->name); + if (!tpoint) + continue; + WARN_ON_ONCE(tracepoint_user_register_again(tuser, tpoint)); + } else if (val == MODULE_STATE_GOING && + tracepoint_user_within_module(tuser, tp_mod->mod)) { + /* Unregister all tracepoint_user in this module. */ + tracepoint_user_unregister_clear(tuser); + } + } + mutex_unlock(&tracepoint_user_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block tracepoint_module_nb = { + .notifier_call = __tracepoint_probe_module_cb, +}; + +/* module callback for tprobe events */ +static int __tprobe_event_module_cb(struct notifier_block *self, + unsigned long val, void *data) +{ struct trace_fprobe *tf; struct dyn_event *pos; + struct module *mod = data; if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) return NOTIFY_DONE; mutex_lock(&event_mutex); for_each_trace_fprobe(tf, pos) { - if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) { - tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol); - if (tpoint) { - tf->tpoint = tpoint; - tf->mod = tp_mod->mod; - if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) && - trace_probe_is_enabled(&tf->tp)) - reenable_trace_fprobe(tf); - } - } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) { - tracepoint_probe_unregister(tf->tpoint, - tf->tpoint->probestub, NULL); - tf->tpoint = NULL; - tf->mod = NULL; + /* Skip fprobe and disabled tprobe events. */ + if (!trace_fprobe_is_tracepoint(tf) || !tf->tuser) + continue; + + /* Before this notification, tracepoint notifier has already done. */ + if (val == MODULE_STATE_COMING && + tracepoint_user_within_module(tf->tuser, mod)) { + unsigned long ip = tracepoint_user_ip(tf->tuser); + + WARN_ON_ONCE(register_fprobe_ips(&tf->fp, &ip, 1)); + } else if (val == MODULE_STATE_GOING && + /* + * tracepoint_user_within_module() does not work here because + * tracepoint_user is already unregistered and cleared tpoint. + * Instead, checking whether the fprobe is registered but + * tpoint is cleared(unregistered). Such unbalance probes + * must be adjusted anyway. + */ + trace_fprobe_is_registered(tf) && + !tf->tuser->tpoint) { + unregister_fprobe(&tf->fp); } } mutex_unlock(&event_mutex); @@ -988,8 +1146,11 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, return NOTIFY_DONE; } -static struct notifier_block tracepoint_module_nb = { - .notifier_call = __tracepoint_probe_module_cb, +/* NOTE: this must be called after tracepoint callback */ +static struct notifier_block tprobe_event_module_nb = { + .notifier_call = __tprobe_event_module_cb, + /* Make sure this is later than tracepoint module notifier. */ + .priority = -10, }; #endif /* CONFIG_MODULES */ @@ -1018,6 +1179,19 @@ static int parse_symbol_and_return(int argc, const char *argv[], if (*is_return) return 0; + if (is_tracepoint) { + tmp = *symbol; + while (*tmp && (isalnum(*tmp) || *tmp == '_')) + tmp++; + if (*tmp) { + /* find a wrong character. */ + trace_probe_log_err(tmp - *symbol, BAD_TP_NAME); + kfree(*symbol); + *symbol = NULL; + return -EINVAL; + } + } + /* If there is $retval, this should be a return fprobe. */ for (i = 2; i < argc; i++) { tmp = strstr(argv[i], "$retval"); @@ -1025,6 +1199,8 @@ static int parse_symbol_and_return(int argc, const char *argv[], if (is_tracepoint) { trace_probe_log_set_index(i); trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE); + kfree(*symbol); + *symbol = NULL; return -EINVAL; } *is_return = true; @@ -1034,7 +1210,8 @@ static int parse_symbol_and_return(int argc, const char *argv[], return 0; } -static int __trace_fprobe_create(int argc, const char *argv[]) +static int trace_fprobe_create_internal(int argc, const char *argv[], + struct traceprobe_parse_context *ctx) { /* * Argument syntax: @@ -1060,24 +1237,19 @@ static int __trace_fprobe_create(int argc, const char *argv[]) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_fprobe *tf = NULL; - int i, len, new_argc = 0, ret = 0; - bool is_return = false; - char *symbol = NULL; + struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; - const char **new_argv = NULL; - int maxactive = 0; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - char sbuf[KSYM_NAME_LEN]; - char abuf[MAX_BTF_ARGS_LEN]; - char *dbuf = NULL; + struct module *mod __free(module_put) = NULL; + const char **new_argv __free(kfree) = NULL; + char *symbol __free(kfree) = NULL; + char *ebuf __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; + char *sbuf __free(kfree) = NULL; + char *abuf __free(kfree) = NULL; + char *dbuf __free(kfree) = NULL; + int i, new_argc = 0, ret = 0; bool is_tracepoint = false; - struct module *tp_mod = NULL; - struct tracepoint *tpoint = NULL; - struct traceprobe_parse_context ctx = { - .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, - }; + bool is_return = false; if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) return -ECANCELED; @@ -1087,35 +1259,13 @@ static int __trace_fprobe_create(int argc, const char *argv[]) group = TRACEPOINT_EVENT_SYSTEM; } - trace_probe_log_init("trace_fprobe", argc, argv); - - event = strchr(&argv[0][1], ':'); - if (event) - event++; - - if (isdigit(argv[0][1])) { - if (event) - len = event - &argv[0][1] - 1; - else - len = strlen(&argv[0][1]); - if (len > MAX_EVENT_NAME_LEN - 1) { - trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; - } - memcpy(buf, &argv[0][1], len); - buf[len] = '\0'; - ret = kstrtouint(buf, 0, &maxactive); - if (ret || !maxactive) { + if (argv[0][1] != '\0') { + if (argv[0][1] != ':') { + trace_probe_log_set_index(0); trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; - } - /* fprobe rethook instances are iterated over via a list. The - * maximum should stay reasonable. - */ - if (maxactive > RETHOOK_MAXACTIVE_MAX) { - trace_probe_log_err(1, MAXACT_TOO_BIG); - goto parse_error; + return -EINVAL; } + event = &argv[0][2]; } trace_probe_log_set_index(1); @@ -1123,109 +1273,114 @@ static int __trace_fprobe_create(int argc, const char *argv[]) /* a symbol(or tracepoint) must be specified */ ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint); if (ret < 0) - goto parse_error; - - if (!is_return && maxactive) { - trace_probe_log_set_index(0); - trace_probe_log_err(1, BAD_MAXACT_TYPE); - goto parse_error; - } + return -EINVAL; trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto parse_error; + return -EINVAL; } if (!event) { + ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!ebuf) + return -ENOMEM; /* Make a new event name */ if (is_tracepoint) - snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s%s", isdigit(*symbol) ? "_" : "", symbol); else - snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, is_return ? "exit" : "entry"); - sanitize_event_name(buf); - event = buf; + sanitize_event_name(ebuf); + event = ebuf; } if (is_return) - ctx.flags |= TPARG_FL_RETURN; + ctx->flags |= TPARG_FL_RETURN; else - ctx.flags |= TPARG_FL_FENTRY; + ctx->flags |= TPARG_FL_FENTRY; + ctx->funcname = NULL; if (is_tracepoint) { - ctx.flags |= TPARG_FL_TPOINT; - tpoint = find_tracepoint(symbol, &tp_mod); + /* Get tracepoint and lock its module until the end of the registration. */ + struct tracepoint *tpoint; + + ctx->flags |= TPARG_FL_TPOINT; + mod = NULL; + tpoint = find_tracepoint(symbol, &mod); if (tpoint) { - ctx.funcname = kallsyms_lookup( - (unsigned long)tpoint->probestub, - NULL, NULL, NULL, sbuf); - } else if (IS_ENABLED(CONFIG_MODULES)) { - /* This *may* be loaded afterwards */ - tpoint = TRACEPOINT_STUB; - ctx.funcname = symbol; - } else { - trace_probe_log_set_index(1); - trace_probe_log_err(0, NO_TRACEPOINT); - goto parse_error; + sbuf = kmalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!sbuf) + return -ENOMEM; + ctx->funcname = kallsyms_lookup((unsigned long)tpoint->probestub, + NULL, NULL, NULL, sbuf); } - } else - ctx.funcname = symbol; + } + if (!ctx->funcname) + ctx->funcname = symbol; + abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL); + if (!abuf) + return -ENOMEM; argc -= 2; argv += 2; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, - abuf, MAX_BTF_ARGS_LEN, &ctx); - if (IS_ERR(new_argv)) { - ret = PTR_ERR(new_argv); - new_argv = NULL; - goto out; - } + abuf, MAX_BTF_ARGS_LEN, ctx); + if (IS_ERR(new_argv)) + return PTR_ERR(new_argv); if (new_argv) { argc = new_argc; argv = new_argv; } if (argc > MAX_TRACE_ARGS) { - ret = -E2BIG; - goto out; + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); + return -E2BIG; } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) - goto out; + return ret; /* setup a probe */ - tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod, - maxactive, argc, is_return); + tf = alloc_trace_fprobe(group, event, symbol, argc, is_return, is_tracepoint); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto out; /* We know tf is not allocated */ + return ret; } /* parse arguments */ for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); - ctx.offset = 0; - ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); + ctx->offset = 0; + ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], ctx); if (ret) - goto error; /* This can be -ENOMEM */ + return ret; /* This can be -ENOMEM */ } if (is_return && tf->tp.entry_arg) { tf->fp.entry_handler = trace_fprobe_entry_handler; tf->fp.entry_data_size = traceprobe_get_entry_data_size(&tf->tp); + if (ALIGN(tf->fp.entry_data_size, sizeof(long)) > MAX_FPROBE_DATA_SIZE) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_EARGS); + return -E2BIG; + } } ret = traceprobe_set_print_fmt(&tf->tp, is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL); if (ret < 0) - goto error; + return ret; - ret = register_trace_fprobe(tf); + ret = register_trace_fprobe_event(tf); if (ret) { trace_probe_log_set_index(1); if (ret == -EILSEQ) @@ -1234,29 +1389,35 @@ static int __trace_fprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_PROBE_ADDR); else if (ret != -ENOMEM && ret != -EEXIST) trace_probe_log_err(0, FAIL_REG_PROBE); - goto error; + return -EINVAL; } -out: - if (tp_mod) - module_put(tp_mod); - traceprobe_finish_parse(&ctx); + /* 'tf' is successfully registered. To avoid freeing, assign NULL. */ + tf = NULL; + + return 0; +} + +static int trace_fprobe_create_cb(int argc, const char *argv[]) +{ + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; + int ret; + + ctx = kzalloc_obj(*ctx); + if (!ctx) + return -ENOMEM; + + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE; + + trace_probe_log_init("trace_fprobe", argc, argv); + ret = trace_fprobe_create_internal(argc, argv, ctx); trace_probe_log_clear(); - kfree(new_argv); - kfree(symbol); - kfree(dbuf); return ret; - -parse_error: - ret = -EINVAL; -error: - free_trace_fprobe(tf); - goto out; } static int trace_fprobe_create(const char *raw_command) { - return trace_probe_create(raw_command, __trace_fprobe_create); + return trace_probe_create(raw_command, trace_fprobe_create_cb); } static int trace_fprobe_release(struct dyn_event *ev) @@ -1278,8 +1439,6 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) seq_putc(m, 't'); else seq_putc(m, 'f'); - if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive) - seq_printf(m, "%d", tf->fp.nr_maxactive); seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp), trace_probe_name(&tf->tp)); @@ -1294,6 +1453,88 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) } /* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int enable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_fprobe *tf; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (!enabled) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + ret = __register_trace_fprobe(tf); + if (ret < 0) + return ret; + } + } + + return 0; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int disable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_fprobe *tf; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + unregister_fprobe(&tf->fp); + if (tf->tuser) { + tracepoint_user_put(tf->tuser); + tf->tuser = NULL; + } + } + } + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. + */ + trace_probe_remove_file(tp, file); + + return 0; +} + +/* * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. */ static int fprobe_register(struct trace_event_call *event, @@ -1338,6 +1579,9 @@ static __init int init_fprobe_trace_early(void) ret = register_tracepoint_module_notifier(&tracepoint_module_nb); if (ret) return ret; + ret = register_module_notifier(&tprobe_event_module_nb); + if (ret) + return ret; #endif return 0; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d358c9935164..f283391a4dc8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -25,6 +25,9 @@ static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); static void +function_args_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); static void @@ -42,9 +45,10 @@ enum { TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ TRACE_FUNC_OPT_STACK = 0x1, TRACE_FUNC_OPT_NO_REPEATS = 0x2, + TRACE_FUNC_OPT_ARGS = 0x4, /* Update this to next highest bit. */ - TRACE_FUNC_OPT_HIGHEST_BIT = 0x4 + TRACE_FUNC_OPT_HIGHEST_BIT = 0x8 }; #define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1) @@ -57,7 +61,7 @@ int ftrace_allocate_ftrace_ops(struct trace_array *tr) if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return 0; - ops = kzalloc(sizeof(*ops), GFP_KERNEL); + ops = kzalloc_obj(*ops); if (!ops) return -ENOMEM; @@ -114,6 +118,8 @@ static ftrace_func_t select_trace_function(u32 flags_val) switch (flags_val & TRACE_FUNC_OPT_MASK) { case TRACE_FUNC_NO_OPTS: return function_trace_call; + case TRACE_FUNC_OPT_ARGS: + return function_args_trace_call; case TRACE_FUNC_OPT_STACK: return function_stack_trace_call; case TRACE_FUNC_OPT_NO_REPEATS: @@ -148,11 +154,11 @@ static int function_trace_init(struct trace_array *tr) if (!tr->ops) return -ENOMEM; - func = select_trace_function(func_flags.val); + func = select_trace_function(tr->current_trace_flags->val); if (!func) return -EINVAL; - if (!handle_func_repeats(tr, func_flags.val)) + if (!handle_func_repeats(tr, tr->current_trace_flags->val)) return -ENOMEM; ftrace_init_array_ops(tr, func); @@ -203,7 +209,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; int bit; @@ -216,11 +221,31 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); + trace_ctx = tracing_gen_ctx_dec(); + + trace_function(tr, ip, parent_ip, trace_ctx, NULL); + + ftrace_test_recursion_unlock(bit); +} + +static void +function_args_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct trace_array *tr = op->private; + unsigned int trace_ctx; + int bit; + + if (unlikely(!tr->function_enabled)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + trace_ctx = tracing_gen_ctx(); - data = this_cpu_ptr(tr->array_buffer.data); - if (!atomic_read(&data->disabled)) - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); ftrace_test_recursion_unlock(bit); } @@ -266,11 +291,11 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); + disabled = local_inc_return(&data->disabled); if (likely(disabled == 1)) { trace_ctx = tracing_gen_ctx_flags(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); #ifdef CONFIG_UNWINDER_FRAME_POINTER if (ftrace_pids_enabled(op)) skip++; @@ -278,7 +303,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, __trace_stack(tr, trace_ctx, skip); } - atomic_dec(&data->disabled); + local_dec(&data->disabled); local_irq_restore(flags); } @@ -319,9 +344,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, { struct trace_func_repeats *last_info; struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; - unsigned long flags; int bit; if (unlikely(!tr->function_enabled)) @@ -332,8 +355,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, return; parent_ip = function_get_true_parent_ip(parent_ip, fregs); - data = this_cpu_ptr(tr->array_buffer.data); - if (atomic_read(&data->disabled)) + if (!tracer_tracing_is_on(tr)) goto out; /* @@ -347,11 +369,10 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (is_repeat_check(tr, last_info, ip, parent_ip)) goto out; - local_save_flags(flags); - trace_ctx = tracing_gen_ctx_flags(flags); + trace_ctx = tracing_gen_ctx_dec(); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); out: ftrace_test_recursion_unlock(bit); @@ -381,7 +402,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); + disabled = local_inc_return(&data->disabled); if (likely(disabled == 1)) { last_info = per_cpu_ptr(tr->last_func_repeats, cpu); @@ -391,12 +412,12 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_flags(flags); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); __trace_stack(tr, trace_ctx, STACK_SKIP); } out: - atomic_dec(&data->disabled); + local_dec(&data->disabled); local_irq_restore(flags); } @@ -405,6 +426,9 @@ static struct tracer_opt func_opts[] = { { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) }, +#ifdef CONFIG_FUNCTION_TRACE_ARGS + { TRACER_OPT(func-args, TRACE_FUNC_OPT_ARGS) }, +#endif { } /* Always set a last empty entry */ }; @@ -435,14 +459,14 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) u32 new_flags; /* Do nothing if already set. */ - if (!!set == !!(func_flags.val & bit)) + if (!!set == !!(tr->current_trace_flags->val & bit)) return 0; /* We can change this flag only when not running. */ if (tr->current_trace != &function_trace) return 0; - new_flags = (func_flags.val & ~bit) | (set ? bit : 0); + new_flags = (tr->current_trace_flags->val & ~bit) | (set ? bit : 0); func = select_trace_function(new_flags); if (!func) return -EINVAL; @@ -467,7 +491,7 @@ static struct tracer function_trace __tracer_data = .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, - .flags = &func_flags, + .default_flags = &func_flags, .set_flag = func_set_flag, .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST @@ -599,11 +623,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static __always_inline void trace_stack(struct trace_array *tr) { - unsigned int trace_ctx; - - trace_ctx = tracing_gen_ctx(); - - __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); + __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP); } static void diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 5504b5e4e7b4..0d2d3a2ea7dd 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -16,9 +16,12 @@ #include "trace.h" #include "trace_output.h" -/* When set, irq functions will be ignored */ +/* When set, irq functions might be ignored */ static int ftrace_graph_skip_irqs; +/* Do not record function time when task is sleeping */ +int fgraph_no_sleep_time; + struct fgraph_cpu_data { pid_t last_pid; int depth; @@ -27,14 +30,26 @@ struct fgraph_cpu_data { unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; +struct fgraph_ent_args { + struct ftrace_graph_ent_entry ent; + /* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */ + unsigned long args[FTRACE_REGS_MAX_ARGS]; +}; + +struct fgraph_retaddr_ent_args { + struct fgraph_retaddr_ent_entry ent; + /* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */ + unsigned long args[FTRACE_REGS_MAX_ARGS]; +}; + struct fgraph_data { struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. */ union { - struct ftrace_graph_ent_entry ent; - struct fgraph_retaddr_ent_entry rent; - } ent; + struct fgraph_ent_args ent; + struct fgraph_retaddr_ent_args rent; + }; struct ftrace_graph_ret_entry ret; int failed; int cpu; @@ -71,14 +86,13 @@ static struct tracer_opt trace_opts[] = { /* Display function return address ? */ { TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) }, #endif +#ifdef CONFIG_FUNCTION_TRACE_ARGS + /* Display function arguments ? */ + { TRACER_OPT(funcgraph-args, TRACE_GRAPH_ARGS) }, +#endif /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, -#ifdef CONFIG_FUNCTION_PROFILER - /* Include time within nested functions */ - { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) }, -#endif - { } /* Empty entry */ }; @@ -86,13 +100,13 @@ static struct tracer_flags tracer_flags = { /* Don't display overruns, proc, or tail by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS | - TRACE_GRAPH_SLEEP_TIME | TRACE_GRAPH_GRAPH_TIME, + TRACE_GRAPH_SLEEP_TIME, .opts = trace_opts }; -static bool tracer_flags_is_set(u32 flags) +static bool tracer_flags_is_set(struct trace_array *tr, u32 flags) { - return (tracer_flags.val & flags) == flags; + return (tr->current_trace_flags->val & flags) == flags; } /* @@ -110,43 +124,73 @@ static void print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); -int __trace_graph_entry(struct trace_array *tr, - struct ftrace_graph_ent *trace, - unsigned int trace_ctx) +static int __graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, + unsigned int trace_ctx, struct ftrace_regs *fregs) { struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ent_entry *entry; + int size; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, - sizeof(*entry), trace_ctx); + /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */ + size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long)); + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, size, trace_ctx); if (!event) return 0; - entry = ring_buffer_event_data(event); - entry->graph_ent = *trace; + + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + trace_buffer_unlock_commit_nostack(buffer, event); return 1; } +int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx) +{ + return __graph_entry(tr, trace, trace_ctx, NULL); +} + #ifdef CONFIG_FUNCTION_GRAPH_RETADDR int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx, - unsigned long retaddr) + unsigned long retaddr, + struct ftrace_regs *fregs) { struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct fgraph_retaddr_ent_entry *entry; + int size; + + /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */ + size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long)); event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT, - sizeof(*entry), trace_ctx); + size, trace_ctx); if (!event) return 0; entry = ring_buffer_event_data(event); - entry->graph_ent.func = trace->func; - entry->graph_ent.depth = trace->depth; - entry->graph_ent.retaddr = retaddr; + entry->graph_rent.ent = *trace; + entry->graph_rent.retaddr = retaddr; + +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + trace_buffer_unlock_commit_nostack(buffer, event); return 1; @@ -155,17 +199,21 @@ int __trace_graph_retaddr_entry(struct trace_array *tr, int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx, - unsigned long retaddr) + unsigned long retaddr, + struct ftrace_regs *fregs) { return 1; } #endif -static inline int ftrace_graph_ignore_irqs(void) +static inline int ftrace_graph_ignore_irqs(struct trace_array *tr) { if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) return 0; + if (tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) + return 0; + return in_hardirq(); } @@ -174,18 +222,15 @@ struct fgraph_times { unsigned long long sleeptime; /* may be optional! */ }; -int trace_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) +static int graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; - struct trace_array_cpu *data; struct fgraph_times *ftimes; - unsigned long flags; unsigned int trace_ctx; - long disabled; - int ret; - int cpu; + int ret = 0; if (*task_var & TRACE_GRAPH_NOTRACE) return 0; @@ -198,7 +243,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, * returning from the function. */ if (ftrace_graph_notrace_addr(trace->func)) { - *task_var |= TRACE_GRAPH_NOTRACE_BIT; + *task_var |= TRACE_GRAPH_NOTRACE; /* * Need to return 1 to have the return called * that will clear the NOTRACE bit. @@ -206,22 +251,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, return 1; } - if (!ftrace_trace_task(tr)) - return 0; - if (ftrace_graph_ignore_func(gops, trace)) return 0; - if (ftrace_graph_ignore_irqs()) + if (ftrace_graph_ignore_irqs(tr)) return 0; - if (fgraph_sleep_time) { - /* Only need to record the calltime */ - ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime)); - } else { + if (fgraph_no_sleep_time && + !tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) { ftimes = fgraph_reserve_data(gops->idx, sizeof(*ftimes)); if (ftimes) ftimes->sleeptime = current->ftrace_sleeptime; + } else { + /* Only need to record the calltime */ + ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime)); } if (!ftimes) return 0; @@ -235,29 +278,33 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, if (tracing_thresh) return 1; - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - trace_ctx = tracing_gen_ctx_flags(flags); - if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && - tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR))) { - unsigned long retaddr = ftrace_graph_top_ret_addr(current); - - ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); - } else - ret = __trace_graph_entry(tr, trace, trace_ctx); + trace_ctx = tracing_gen_ctx(); + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && + tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_RETADDR)) { + unsigned long retaddr = ftrace_graph_top_ret_addr(current); + ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, + retaddr, fregs); } else { - ret = 0; + ret = __graph_entry(tr, trace, trace_ctx, fregs); } - atomic_dec(&data->disabled); - local_irq_restore(flags); - return ret; } +int trace_graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + return graph_entry(trace, gops, NULL); +} + +static int trace_graph_entry_args(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + return graph_entry(trace, gops, fregs); +} + static void __trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned int trace_ctx) @@ -270,12 +317,10 @@ __trace_graph_function(struct trace_array *tr, struct ftrace_graph_ret ret = { .func = ip, .depth = 0, - .calltime = time, - .rettime = time, }; __trace_graph_entry(tr, &ent, trace_ctx); - __trace_graph_return(tr, &ret, trace_ctx); + __trace_graph_return(tr, &ret, trace_ctx, time, time); } void @@ -287,8 +332,9 @@ trace_graph_function(struct trace_array *tr, } void __trace_graph_return(struct trace_array *tr, - struct ftrace_graph_ret *trace, - unsigned int trace_ctx) + struct ftrace_graph_ret *trace, + unsigned int trace_ctx, + u64 calltime, u64 rettime) { struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -300,31 +346,36 @@ void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; + entry->calltime = calltime; + entry->rettime = rettime; trace_buffer_unlock_commit_nostack(buffer, event); } -static void handle_nosleeptime(struct ftrace_graph_ret *trace, +static void handle_nosleeptime(struct trace_array *tr, + struct ftrace_graph_ret *trace, struct fgraph_times *ftimes, int size) { - if (fgraph_sleep_time || size < sizeof(*ftimes)) + if (size < sizeof(*ftimes)) + return; + + if (!fgraph_no_sleep_time || tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) return; ftimes->calltime += current->ftrace_sleeptime - ftimes->sleeptime; } void trace_graph_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, struct ftrace_regs *fregs) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; - struct trace_array_cpu *data; struct fgraph_times *ftimes; - unsigned long flags; unsigned int trace_ctx; - long disabled; + u64 calltime, rettime; int size; - int cpu; + + rettime = trace_clock_local(); ftrace_graph_addr_finish(gops, trace); @@ -337,32 +388,31 @@ void trace_graph_return(struct ftrace_graph_ret *trace, if (!ftimes) return; - handle_nosleeptime(trace, ftimes, size); + handle_nosleeptime(tr, trace, ftimes, size); - trace->calltime = ftimes->calltime; + calltime = ftimes->calltime; - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - trace_ctx = tracing_gen_ctx_flags(flags); - __trace_graph_return(tr, trace, trace_ctx); - } - atomic_dec(&data->disabled); - local_irq_restore(flags); + trace_ctx = tracing_gen_ctx(); + __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); } static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { + unsigned long *task_var = fgraph_get_task_var(gops); struct fgraph_times *ftimes; + struct trace_array *tr; + unsigned int trace_ctx; + u64 calltime, rettime; int size; + rettime = trace_clock_local(); + ftrace_graph_addr_finish(gops, trace); - if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) { - trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT); + if (*task_var & TRACE_GRAPH_NOTRACE) { + *task_var &= ~TRACE_GRAPH_NOTRACE; return; } @@ -370,15 +420,16 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, if (!ftimes) return; - handle_nosleeptime(trace, ftimes, size); + tr = gops->private; + handle_nosleeptime(tr, trace, ftimes, size); - trace->calltime = ftimes->calltime; + calltime = ftimes->calltime; - if (tracing_thresh && - (trace->rettime - ftimes->calltime < tracing_thresh)) + if (tracing_thresh && (rettime - calltime < tracing_thresh)) return; - else - trace_graph_return(trace, gops); + + trace_ctx = tracing_gen_ctx(); + __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); } static struct fgraph_ops funcgraph_ops = { @@ -390,7 +441,7 @@ int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops) { struct fgraph_ops *gops; - gops = kzalloc(sizeof(*gops), GFP_KERNEL); + gops = kzalloc_obj(*gops); if (!gops) return -ENOMEM; @@ -421,14 +472,23 @@ static int graph_trace_init(struct trace_array *tr) { int ret; - tr->gops->entryfunc = trace_graph_entry; + if (tracer_flags_is_set(tr, TRACE_GRAPH_ARGS)) + tr->gops->entryfunc = trace_graph_entry_args; + else + tr->gops->entryfunc = trace_graph_entry; if (tracing_thresh) tr->gops->retfunc = trace_graph_thresh_return; else tr->gops->retfunc = trace_graph_return; - /* Make gops functions are visible before we start tracing */ + if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) + ftrace_graph_skip_irqs++; + + if (!tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) + fgraph_no_sleep_time++; + + /* Make gops functions visible before we start tracing */ smp_mb(); ret = register_ftrace_graph(tr->gops); @@ -439,8 +499,42 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +static struct tracer graph_trace; + +static int ftrace_graph_trace_args(struct trace_array *tr, int set) +{ + trace_func_graph_ent_t entry; + + if (set) + entry = trace_graph_entry_args; + else + entry = trace_graph_entry; + + /* See if there's any changes */ + if (tr->gops->entryfunc == entry) + return 0; + + unregister_ftrace_graph(tr->gops); + + tr->gops->entryfunc = entry; + + /* Make gops functions visible before we start tracing */ + smp_mb(); + return register_ftrace_graph(tr->gops); +} + static void graph_trace_reset(struct trace_array *tr) { + if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) + ftrace_graph_skip_irqs--; + if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0)) + ftrace_graph_skip_irqs = 0; + + if (!tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) + fgraph_no_sleep_time--; + if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) + fgraph_no_sleep_time = 0; + tracing_stop_cmdline_record(); unregister_ftrace_graph(tr->gops); } @@ -469,7 +563,7 @@ static void print_graph_proc(struct trace_seq *s, pid_t pid) { char comm[TASK_COMM_LEN]; /* sign + log10(MAX_INT) + '\0' */ - char pid_str[11]; + char pid_str[12]; int spaces = 0; int len; int i; @@ -583,10 +677,9 @@ get_return_for_leaf(struct trace_iterator *iter, * Save current and next entries for later reference * if the output fails. */ - if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) - data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr; - else - data->ent.ent = *curr; + int size = min_t(int, sizeof(data->rent), iter->ent_size); + + memcpy(&data->rent, curr, size); /* * If the next event is not a return type, then * we only care about what type it is. Otherwise we can @@ -649,7 +742,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { /* Absolute time */ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); @@ -669,7 +762,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* Latency format */ - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) print_graph_lat_fmt(s, ent); } @@ -723,7 +816,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags) { if (!(flags & TRACE_GRAPH_PRINT_DURATION) || - !(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + !(tr->trace_flags & TRACE_ITER(CONTEXT_INFO))) return; /* No real adata, just filling the column with spaces */ @@ -764,7 +857,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e trace_seq_puts(s, " /*"); trace_seq_puts(s, " <-"); - seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET); + seq_print_ip_sym_offset(s, entry->graph_rent.retaddr, trace_flags); if (comment) trace_seq_puts(s, " */"); @@ -778,7 +871,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ret *graph_ret, void *func, - u32 opt_flags, u32 trace_flags) + u32 opt_flags, u32 trace_flags, int args_size) { unsigned long err_code = 0; unsigned long retval = 0; @@ -812,11 +905,16 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT) print_retaddr = false; - trace_seq_printf(s, "%ps();", func); + trace_seq_printf(s, "%ps", func); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { + print_function_args(s, FGRAPH_ENTRY_ARGS(entry), (unsigned long)func); + trace_seq_putc(s, ';'); + } else + trace_seq_puts(s, "();"); + if (print_retval || print_retaddr) trace_seq_puts(s, " /*"); - else - trace_seq_putc(s, '\n'); } else { print_retaddr = false; trace_seq_printf(s, "} /* %ps", func); @@ -834,12 +932,13 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr } if (!entry || print_retval || print_retaddr) - trace_seq_puts(s, " */\n"); + trace_seq_puts(s, " */"); } #else -#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0) +#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags, args_size) \ + do {} while (0) #endif @@ -855,15 +954,16 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; - unsigned long func; + unsigned long ret_func; + int args_size; int cpu = iter->cpu; int i; + args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args); + graph_ret = &ret_entry->ret; call = &entry->graph_ent; - duration = graph_ret->rettime - graph_ret->calltime; - - func = call->func + iter->tr->text_delta; + duration = ret_entry->rettime - ret_entry->calltime; if (data) { struct fgraph_cpu_data *cpu_data; @@ -890,16 +990,25 @@ print_graph_entry_leaf(struct trace_iterator *iter, for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) trace_seq_putc(s, ' '); + ret_func = graph_ret->func + iter->tr->text_delta; + /* * Write out the function return value or return address */ if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) { print_graph_retval(s, entry, graph_ret, (void *)graph_ret->func + iter->tr->text_delta, - flags, tr->trace_flags); + flags, tr->trace_flags, args_size); } else { - trace_seq_printf(s, "%ps();\n", (void *)func); + trace_seq_printf(s, "%ps", (void *)ret_func); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { + print_function_args(s, FGRAPH_ENTRY_ARGS(entry), ret_func); + trace_seq_putc(s, ';'); + } else + trace_seq_puts(s, "();"); } + trace_seq_putc(s, '\n'); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -916,6 +1025,7 @@ print_graph_entry_nested(struct trace_iterator *iter, struct fgraph_data *data = iter->private; struct trace_array *tr = iter->tr; unsigned long func; + int args_size; int i; if (data) { @@ -940,7 +1050,17 @@ print_graph_entry_nested(struct trace_iterator *iter, func = call->func + iter->tr->text_delta; - trace_seq_printf(s, "%ps() {", (void *)func); + trace_seq_printf(s, "%ps", (void *)func); + + args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) + print_function_args(s, FGRAPH_ENTRY_ARGS(entry), func); + else + trace_seq_puts(s, "()"); + + trace_seq_puts(s, " {"); + if (flags & __TRACE_GRAPH_PRINT_RETADDR && entry->ent.type == TRACE_GRAPH_RETADDR_ENT) print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry, @@ -973,7 +1093,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, /* Interrupt */ print_graph_irq(iter, addr, type, cpu, ent->pid, flags); - if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER(CONTEXT_INFO))) return; /* Absolute time */ @@ -995,7 +1115,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, } /* Latency format */ - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) print_graph_lat_fmt(s, ent); return; @@ -1110,21 +1230,41 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter, u32 flags) { struct fgraph_data *data = iter->private; - struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ent *call; struct ftrace_graph_ret_entry *leaf_ret; static enum print_line_t ret; int cpu = iter->cpu; + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * This function is shared by ftrace_graph_ent_entry and + * fgraph_retaddr_ent_entry, the size of the latter one + * is larger, but it is very small and can be safely saved + * at the stack. + */ + struct ftrace_graph_ent_entry *entry; + struct fgraph_retaddr_ent_entry *rentry; + u8 save_buf[sizeof(*rentry) + FTRACE_REGS_MAX_ARGS * sizeof(long)]; + + /* The ent_size is expected to be as big as the entry */ + if (iter->ent_size > sizeof(save_buf)) + iter->ent_size = sizeof(save_buf); + + entry = (void *)save_buf; + memcpy(entry, field, iter->ent_size); + + call = &entry->graph_ent; if (check_irq_entry(iter, flags, call->func, call->depth)) return TRACE_TYPE_HANDLED; print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags); - leaf_ret = get_return_for_leaf(iter, field); + leaf_ret = get_return_for_leaf(iter, entry); if (leaf_ret) - ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); + ret = print_graph_entry_leaf(iter, entry, leaf_ret, s, flags); else - ret = print_graph_entry_nested(iter, field, s, cpu, flags); + ret = print_graph_entry_nested(iter, entry, s, cpu, flags); if (data) { /* @@ -1142,11 +1282,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, } static enum print_line_t -print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, +print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter, u32 flags) { - unsigned long long duration = trace->rettime - trace->calltime; + struct ftrace_graph_ret *trace = &retentry->ret; + u64 calltime = retentry->calltime; + u64 rettime = retentry->rettime; + unsigned long long duration = rettime - calltime; struct fgraph_data *data = iter->private; struct trace_array *tr = iter->tr; unsigned long func; @@ -1195,7 +1338,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * funcgraph-retval option is enabled. */ if (flags & __TRACE_GRAPH_PRINT_RETVAL) { - print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags); + print_graph_retval(s, NULL, trace, (void *)func, flags, + tr->trace_flags, 0); } else { /* * If the return function does not have a matching entry, @@ -1205,10 +1349,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * that if the funcgraph-tail option is enabled. */ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) - trace_seq_puts(s, "}\n"); + trace_seq_puts(s, "}"); else - trace_seq_printf(s, "} /* %ps */\n", (void *)func); + trace_seq_printf(s, "} /* %ps */", (void *)func); } + trace_seq_putc(s, '\n'); /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) @@ -1323,31 +1468,28 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) switch (entry->type) { case TRACE_GRAPH_ENT: { - /* - * print_graph_entry() may consume the current event, - * thus @field may become invalid, so we need to save it. - * sizeof(struct ftrace_graph_ent_entry) is very small, - * it can be safely saved at the stack. - */ - struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); - saved = *field; - return print_graph_entry(&saved, s, iter, flags); + return print_graph_entry(field, s, iter, flags); } #ifdef CONFIG_FUNCTION_GRAPH_RETADDR case TRACE_GRAPH_RETADDR_ENT: { - struct fgraph_retaddr_ent_entry saved; + /* + * ftrace_graph_ent_entry and fgraph_retaddr_ent_entry have + * similar functions and memory layouts. The only difference + * is that the latter one has an extra retaddr member, so + * they can share most of the logic. + */ struct fgraph_retaddr_ent_entry *rfield; trace_assign_type(rfield, entry); - saved = *rfield; - return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags); + return print_graph_entry((struct ftrace_graph_ent_entry *)rfield, + s, iter, flags); } #endif case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry, iter, flags); + return print_graph_return(field, s, entry, iter, flags); } case TRACE_STACK: case TRACE_FN: @@ -1364,7 +1506,8 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return print_graph_function_flags(iter, tracer_flags.val); + struct trace_array *tr = iter->tr; + return print_graph_function_flags(iter, tr->current_trace_flags->val); } static enum print_line_t @@ -1400,7 +1543,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) static void __print_graph_headers_flags(struct trace_array *tr, struct seq_file *s, u32 flags) { - int lat = tr->trace_flags & TRACE_ITER_LATENCY_FMT; + int lat = tr->trace_flags & TRACE_ITER(LATENCY_FMT); if (lat) print_lat_header(s, flags); @@ -1440,7 +1583,10 @@ static void __print_graph_headers_flags(struct trace_array *tr, static void print_graph_headers(struct seq_file *s) { - print_graph_headers_flags(s, tracer_flags.val); + struct trace_iterator *iter = s->private; + struct trace_array *tr = iter->tr; + + print_graph_headers_flags(s, tr->current_trace_flags->val); } void print_graph_headers_flags(struct seq_file *s, u32 flags) @@ -1448,10 +1594,10 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) struct trace_iterator *iter = s->private; struct trace_array *tr = iter->tr; - if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER(CONTEXT_INFO))) return; - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) { + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; @@ -1474,7 +1620,7 @@ void graph_trace_open(struct trace_iterator *iter) /* We can be called in atomic context via ftrace_dump() */ gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL; - data = kzalloc(sizeof(*data), gfpflags); + data = kzalloc_obj(*data, gfpflags); if (!data) goto out_err; @@ -1511,20 +1657,63 @@ void graph_trace_close(struct trace_iterator *iter) if (data) { free_percpu(data->cpu_data); kfree(data); + iter->private = NULL; } } static int func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { - if (bit == TRACE_GRAPH_PRINT_IRQS) - ftrace_graph_skip_irqs = !set; +/* + * The function profiler gets updated even if function graph + * isn't the current tracer. Handle it separately. + */ +#ifdef CONFIG_FUNCTION_PROFILER + if (bit == TRACE_GRAPH_SLEEP_TIME && (tr->flags & TRACE_ARRAY_FL_GLOBAL) && + !!set == fprofile_no_sleep_time) { + if (set) { + fgraph_no_sleep_time--; + if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) + fgraph_no_sleep_time = 0; + fprofile_no_sleep_time = false; + } else { + fgraph_no_sleep_time++; + fprofile_no_sleep_time = true; + } + } +#endif + + /* Do nothing if the current tracer is not this tracer */ + if (tr->current_trace != &graph_trace) + return 0; - if (bit == TRACE_GRAPH_SLEEP_TIME) - ftrace_graph_sleep_time_control(set); + /* Do nothing if already set. */ + if (!!set == !!(tr->current_trace_flags->val & bit)) + return 0; - if (bit == TRACE_GRAPH_GRAPH_TIME) - ftrace_graph_graph_time_control(set); + switch (bit) { + case TRACE_GRAPH_SLEEP_TIME: + if (set) { + fgraph_no_sleep_time--; + if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) + fgraph_no_sleep_time = 0; + } else { + fgraph_no_sleep_time++; + } + break; + + case TRACE_GRAPH_PRINT_IRQS: + if (set) + ftrace_graph_skip_irqs--; + else + ftrace_graph_skip_irqs++; + if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0)) + ftrace_graph_skip_irqs = 0; + break; + + case TRACE_GRAPH_ARGS: + return ftrace_graph_trace_args(tr, set); + } return 0; } @@ -1561,7 +1750,7 @@ static struct tracer graph_trace __tracer_data = { .reset = graph_trace_reset, .print_line = print_graph_function, .print_header = print_graph_headers, - .flags = &tracer_flags, + .default_flags = &tracer_flags, .set_flag = func_graph_set_flag, .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b65353ec2837..3fe274b84f1c 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -102,9 +102,9 @@ struct hwlat_sample { /* keep the global state somewhere. */ static struct hwlat_data { - struct mutex lock; /* protect changes */ + struct mutex lock; /* protect changes */ - u64 count; /* total since reset */ + atomic64_t count; /* total since reset */ u64 sample_window; /* total sampling window (on+off) */ u64 sample_width; /* active sampling portion of window */ @@ -193,8 +193,7 @@ void trace_hwlat_callback(bool enter) * get_sample - sample the CPU TSC and look for likely hardware latencies * * Used to repeatedly capture the CPU TSC (or similar), looking for potential - * hardware-induced latency. Called with interrupts disabled and with - * hwlat_data.lock held. + * hardware-induced latency. Called with interrupts disabled. */ static int get_sample(void) { @@ -204,6 +203,7 @@ static int get_sample(void) time_type start, t1, t2, last_t2; s64 diff, outer_diff, total, last_total = 0; u64 sample = 0; + u64 sample_width = READ_ONCE(hwlat_data.sample_width); u64 thresh = tracing_thresh; u64 outer_sample = 0; int ret = -1; @@ -267,7 +267,7 @@ static int get_sample(void) if (diff > sample) sample = diff; /* only want highest value */ - } while (total <= hwlat_data.sample_width); + } while (total <= sample_width); barrier(); /* finish the above in the view for NMIs */ trace_hwlat_callback_enabled = false; @@ -285,8 +285,7 @@ static int get_sample(void) if (kdata->nmi_total_ts) do_div(kdata->nmi_total_ts, NSEC_PER_USEC); - hwlat_data.count++; - s.seqnum = hwlat_data.count; + s.seqnum = atomic64_inc_return(&hwlat_data.count); s.duration = sample; s.outer_duration = outer_sample; s.nmi_total_ts = kdata->nmi_total_ts; @@ -325,12 +324,9 @@ static void move_to_next_cpu(void) cpus_read_lock(); cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); - next_cpu = cpumask_next(raw_smp_processor_id(), current_mask); + next_cpu = cpumask_next_wrap(raw_smp_processor_id(), current_mask); cpus_read_unlock(); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(current_mask); - if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ goto change_mode; @@ -835,7 +831,7 @@ static int hwlat_tracer_init(struct trace_array *tr) hwlat_trace = tr; - hwlat_data.count = 0; + atomic64_set(&hwlat_data.count, 0); tr->max_latency = 0; save_tracing_thresh = tracing_thresh; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index fce064e20570..17673905907c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -63,7 +63,7 @@ irq_trace(void) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int irqsoff_display_graph(struct trace_array *tr, int set); -# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER(DISPLAY_GRAPH)) #else static inline int irqsoff_display_graph(struct trace_array *tr, int set) { @@ -123,12 +123,12 @@ static int func_prolog_dec(struct trace_array *tr, return 0; *data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&(*data)->disabled); + disabled = local_inc_return(&(*data)->disabled); if (likely(disabled == 1)) return 1; - atomic_dec(&(*data)->disabled); + local_dec(&(*data)->disabled); return 0; } @@ -150,9 +150,9 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_flags(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); - atomic_dec(&data->disabled); + local_dec(&data->disabled); } #endif /* CONFIG_FUNCTION_TRACER */ @@ -176,13 +176,15 @@ static int irqsoff_display_graph(struct trace_array *tr, int set) } static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; unsigned int trace_ctx; - int ret; + u64 *calltime; + int ret = 0; if (ftrace_graph_ignore_func(gops, trace)) return 0; @@ -199,29 +201,41 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, if (!func_prolog_dec(tr, &data, &flags)) return 0; - trace_ctx = tracing_gen_ctx_flags(flags); - ret = __trace_graph_entry(tr, trace, trace_ctx); - atomic_dec(&data->disabled); + calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime)); + if (calltime) { + *calltime = trace_clock_local(); + trace_ctx = tracing_gen_ctx_flags(flags); + ret = __trace_graph_entry(tr, trace, trace_ctx); + } + local_dec(&data->disabled); return ret; } static void irqsoff_graph_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; unsigned int trace_ctx; + u64 *calltime; + u64 rettime; + int size; ftrace_graph_addr_finish(gops, trace); if (!func_prolog_dec(tr, &data, &flags)) return; - trace_ctx = tracing_gen_ctx_flags(flags); - __trace_graph_return(tr, trace, trace_ctx); - atomic_dec(&data->disabled); + rettime = trace_clock_local(); + calltime = fgraph_retrieve_data(gops->idx, &size); + if (calltime) { + trace_ctx = tracing_gen_ctx_flags(flags); + __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); + } + local_dec(&data->disabled); } static struct fgraph_ops fgraph_ops = { @@ -233,8 +247,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - else - iter->private = NULL; } static void irqsoff_trace_close(struct trace_iterator *iter) @@ -278,11 +290,17 @@ __trace_function(struct trace_array *tr, if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); } #else -#define __trace_function trace_function +static inline void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned int trace_ctx) +{ + return trace_function(tr, ip, parent_ip, trace_ctx, NULL); +} static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { @@ -376,6 +394,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) int cpu; struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; + long disabled; if (!tracer_enabled || !tracing_is_enabled()) return; @@ -387,20 +406,22 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) data = per_cpu_ptr(tr->array_buffer.data, cpu); - if (unlikely(!data) || atomic_read(&data->disabled)) + if (unlikely(!data) || local_read(&data->disabled)) return; - atomic_inc(&data->disabled); + disabled = local_inc_return(&data->disabled); - data->critical_sequence = max_sequence; - data->preempt_timestamp = ftrace_now(cpu); - data->critical_start = parent_ip ? : ip; + if (disabled == 1) { + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + data->critical_start = parent_ip ? : ip; - __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); + __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); - per_cpu(tracing_cpu, cpu) = 1; + per_cpu(tracing_cpu, cpu) = 1; + } - atomic_dec(&data->disabled); + local_dec(&data->disabled); } static nokprobe_inline void @@ -410,6 +431,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + long disabled; cpu = raw_smp_processor_id(); /* Always clear the tracing cpu on stopping the trace */ @@ -424,16 +446,19 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) data = per_cpu_ptr(tr->array_buffer.data, cpu); if (unlikely(!data) || - !data->critical_start || atomic_read(&data->disabled)) + !data->critical_start || local_read(&data->disabled)) return; - atomic_inc(&data->disabled); + disabled = local_inc_return(&data->disabled); - trace_ctx = tracing_gen_ctx(); - __trace_function(tr, ip, parent_ip, trace_ctx); - check_critical_timing(tr, data, parent_ip ? : ip, cpu); - data->critical_start = 0; - atomic_dec(&data->disabled); + if (disabled == 1) { + trace_ctx = tracing_gen_ctx(); + __trace_function(tr, ip, parent_ip, trace_ctx); + check_critical_timing(tr, data, parent_ip ? : ip, cpu); + data->critical_start = 0; + } + + local_dec(&data->disabled); } /* start and stop critical timings used to for stoppage (in idle) */ @@ -460,8 +485,8 @@ static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { int ret; - /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) + /* 'set' is set if TRACE_ITER(FUNCTION) is about to be set */ + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER(FUNCTION)))) return 0; if (graph) @@ -490,7 +515,7 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph) static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) { - if (!(mask & TRACE_ITER_FUNCTION)) + if (!(mask & TRACE_ITER(FUNCTION))) return 0; if (set) @@ -511,7 +536,7 @@ static inline int irqsoff_function_set(struct trace_array *tr, u32 mask, int set } #endif /* CONFIG_FUNCTION_TRACER */ -static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) +static int irqsoff_flag_changed(struct trace_array *tr, u64 mask, int set) { struct tracer *tracer = tr->current_trace; @@ -519,7 +544,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) return 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (mask & TRACE_ITER_DISPLAY_GRAPH) + if (mask & TRACE_ITER(DISPLAY_GRAPH)) return irqsoff_display_graph(tr, set); #endif @@ -557,10 +582,10 @@ static int __irqsoff_tracer_init(struct trace_array *tr) save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), 1); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), 1); /* without pause, we will produce garbage if another latency occurs */ - set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, 1); + set_tracer_flag(tr, TRACE_ITER(PAUSE_ON_TRACE), 1); tr->max_latency = 0; irqsoff_trace = tr; @@ -580,15 +605,15 @@ static int __irqsoff_tracer_init(struct trace_array *tr) static void __irqsoff_tracer_reset(struct trace_array *tr) { - int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; - int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; - int pause_flag = save_flags & TRACE_ITER_PAUSE_ON_TRACE; + int lat_flag = save_flags & TRACE_ITER(LATENCY_FMT); + int overwrite_flag = save_flags & TRACE_ITER(OVERWRITE); + int pause_flag = save_flags & TRACE_ITER(PAUSE_ON_TRACE); stop_irqsoff_tracer(tr, is_graph(tr)); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); - set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, pause_flag); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), lat_flag); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), overwrite_flag); + set_tracer_flag(tr, TRACE_ITER(PAUSE_ON_TRACE), pause_flag); ftrace_reset_array_ops(tr); irqsoff_busy = false; diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 1e72d20b3c2f..b30795f34079 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -31,7 +31,7 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) old_userobj = tr->trace_flags; /* don't look at user memory in panic mode */ - tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER(SYM_USEROBJ); kdb_printf("Dumping ftrace buffer:\n"); if (skip_entries) @@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.array_buffer->buffer, - cpu, GFP_ATOMIC); - ring_buffer_read_start(iter.buffer_iter[cpu]); + ring_buffer_read_start(iter.array_buffer->buffer, + cpu, GFP_ATOMIC); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.array_buffer->buffer, + ring_buffer_read_start(iter.array_buffer->buffer, cpu_file, GFP_ATOMIC); - ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } @@ -98,7 +96,6 @@ static int kdb_ftdump(int argc, const char **argv) long cpu_file; int err; int cnt; - int cpu; if (argc > 2) return KDB_ARGCOUNT; @@ -120,9 +117,7 @@ static int kdb_ftdump(int argc, const char **argv) trace_init_global_iter(&iter); iter.buffer_iter = buffer_iter; - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_disable(iter.tr); /* A negative skip_entries means skip all but the last entries */ if (skip_entries < 0) { @@ -135,9 +130,7 @@ static int kdb_ftdump(int argc, const char **argv) ftrace_dump_buf(skip_entries, cpu_file); - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_enable(iter.tr); kdb_trap_printk--; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 935a886af40c..a8420e6abb56 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -8,19 +8,20 @@ #define pr_fmt(fmt) "trace_kprobe: " fmt #include <linux/bpf-cgroup.h> -#include <linux/security.h> +#include <linux/cleanup.h> +#include <linux/error-injection.h> #include <linux/module.h> -#include <linux/uaccess.h> #include <linux/rculist.h> -#include <linux/error-injection.h> +#include <linux/security.h> +#include <linux/uaccess.h> #include <asm/setup.h> /* for COMMAND_LINE_SIZE */ #include "trace_dynevent.h" #include "trace_kprobe_selftest.h" #include "trace_probe.h" -#include "trace_probe_tmpl.h" #include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 @@ -30,7 +31,8 @@ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; static int __init set_kprobe_boot_events(char *str) { - strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + trace_append_boot_param(kprobe_boot_events_buf, str, ';', + COMMAND_LINE_SIZE); disable_tracing_selftest("running kprobe events"); return 1; @@ -81,6 +83,7 @@ static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev) #define for_each_trace_kprobe(pos, dpos) \ for_each_dyn_event(dpos) \ if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos))) +#define trace_kprobe_list_empty() list_empty(&dyn_event_list) static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -123,9 +126,8 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) if (!p) return true; *p = '\0'; - rcu_read_lock_sched(); - ret = !!find_module(tk->symbol); - rcu_read_unlock_sched(); + scoped_guard(rcu) + ret = !!find_module(tk->symbol); *p = ':'; return ret; @@ -257,6 +259,9 @@ static void free_trace_kprobe(struct trace_kprobe *tk) } } +DEFINE_FREE(free_trace_kprobe, struct trace_kprobe *, + if (!IS_ERR_OR_NULL(_T)) free_trace_kprobe(_T)) + /* * Allocate new trace_probe and initialize it (including kprobes). */ @@ -268,21 +273,21 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, int maxactive, int nargs, bool is_return) { - struct trace_kprobe *tk; + struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; int ret = -ENOMEM; - tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL); + tk = kzalloc_flex(*tk, tp.args, nargs); if (!tk) return ERR_PTR(ret); tk->nhit = alloc_percpu(unsigned long); if (!tk->nhit) - goto error; + return ERR_PTR(ret); if (symbol) { tk->symbol = kstrdup(symbol, GFP_KERNEL); if (!tk->symbol) - goto error; + return ERR_PTR(ret); tk->rp.kp.symbol_name = tk->symbol; tk->rp.kp.offset = offs; } else @@ -299,13 +304,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, ret = trace_probe_init(&tk->tp, event, group, false, nargs); if (ret < 0) - goto error; + return ERR_PTR(ret); dyn_event_init(&tk->devent, &trace_kprobe_ops); - return tk; -error: - free_trace_kprobe(tk); - return ERR_PTR(ret); + return_ptr(tk); } static struct trace_kprobe *find_trace_kprobe(const char *event, @@ -634,7 +636,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) struct trace_kprobe *old_tk; int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); old_tk = find_trace_kprobe(trace_probe_name(&tk->tp), trace_probe_group_name(&tk->tp)); @@ -642,11 +644,9 @@ static int register_trace_kprobe(struct trace_kprobe *tk) if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) { trace_probe_log_set_index(0); trace_probe_log_err(0, DIFF_PROBE_TYPE); - ret = -EEXIST; - } else { - ret = append_trace_kprobe(tk, old_tk); + return -EEXIST; } - goto end; + return append_trace_kprobe(tk, old_tk); } /* Register new event */ @@ -657,7 +657,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); - goto end; + return ret; } /* Register k*probe */ @@ -672,8 +672,6 @@ static int register_trace_kprobe(struct trace_kprobe *tk) else dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp)); -end: - mutex_unlock(&event_mutex); return ret; } @@ -706,7 +704,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; /* Update probes on coming module */ - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); for_each_trace_kprobe(tk, pos) { if (trace_kprobe_within_module(tk, mod)) { /* Don't need to check busy - this should have gone. */ @@ -718,7 +716,6 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, module_name(mod), ret); } } - mutex_unlock(&event_mutex); return NOTIFY_DONE; } @@ -769,6 +766,14 @@ static unsigned int number_of_same_symbols(const char *mod, const char *func_nam if (!mod) kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); + /* + * If the symbol is found in vmlinux, use vmlinux resolution only. + * This prevents module symbols from shadowing vmlinux symbols + * and causing -EADDRNOTAVAIL for unqualified kprobe targets. + */ + if (!mod && ctx.count > 0) + return ctx.count; + module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx); return ctx.count; @@ -800,12 +805,10 @@ static struct module *try_module_get_by_name(const char *name) { struct module *mod; - rcu_read_lock_sched(); + guard(rcu)(); mod = find_module(name); if (mod && !try_module_get(mod)) mod = NULL; - rcu_read_unlock_sched(); - return mod; } #else @@ -840,7 +843,8 @@ out: static int trace_kprobe_entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs); -static int __trace_kprobe_create(int argc, const char *argv[]) +static int trace_kprobe_create_internal(int argc, const char *argv[], + struct traceprobe_parse_context *ctx) { /* * Argument syntax: @@ -866,21 +870,21 @@ static int __trace_kprobe_create(int argc, const char *argv[]) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_kprobe *tk = NULL; - int i, len, new_argc = 0, ret = 0; - bool is_return = false; - char *symbol = NULL, *tmp = NULL; - const char **new_argv = NULL; + struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + const char **new_argv __free(kfree) = NULL; + int i, len, new_argc = 0, ret = 0; + char *symbol __free(kfree) = NULL; + char *ebuf __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; + char *abuf __free(kfree) = NULL; + char *dbuf __free(kfree) = NULL; enum probe_print_type ptype; + bool is_return = false; int maxactive = 0; - long offset = 0; void *addr = NULL; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - char abuf[MAX_BTF_ARGS_LEN]; - char *dbuf = NULL; - struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; + char *tmp = NULL; + long offset = 0; switch (argv[0][0]) { case 'r': @@ -894,16 +898,16 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (argc < 2) return -ECANCELED; - trace_probe_log_init("trace_kprobe", argc, argv); - event = strchr(&argv[0][1], ':'); if (event) event++; if (isdigit(argv[0][1])) { + char *buf __free(kfree) = NULL; + if (!is_return) { trace_probe_log_err(1, BAD_MAXACT_TYPE); - goto parse_error; + return -EINVAL; } if (event) len = event - &argv[0][1] - 1; @@ -911,21 +915,23 @@ static int __trace_kprobe_create(int argc, const char *argv[]) len = strlen(&argv[0][1]); if (len > MAX_EVENT_NAME_LEN - 1) { trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; + return -EINVAL; } - memcpy(buf, &argv[0][1], len); + buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; + return -EINVAL; } /* kretprobes instances are iterated over via a list. The * maximum should stay reasonable. */ if (maxactive > KRETPROBE_MAXACTIVE_MAX) { trace_probe_log_err(1, MAXACT_TOO_BIG); - goto parse_error; + return -EINVAL; } } @@ -934,10 +940,9 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { trace_probe_log_set_index(1); /* Check whether uprobe event specified */ - if (strchr(argv[1], '/') && strchr(argv[1], ':')) { - ret = -ECANCELED; - goto error; - } + if (strchr(argv[1], '/') && strchr(argv[1], ':')) + return -ECANCELED; + /* a symbol specified */ symbol = kstrdup(argv[1], GFP_KERNEL); if (!symbol) @@ -950,7 +955,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) is_return = true; } else { trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); - goto parse_error; + return -EINVAL; } } @@ -958,7 +963,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { trace_probe_log_err(0, BAD_PROBE_ADDR); - goto parse_error; + return -EINVAL; } ret = validate_probe_symbol(symbol); if (ret) { @@ -966,61 +971,71 @@ static int __trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, NON_UNIQ_SYMBOL); else trace_probe_log_err(0, BAD_PROBE_ADDR); - goto parse_error; + return -EINVAL; } if (is_return) - ctx.flags |= TPARG_FL_RETURN; + ctx->flags |= TPARG_FL_RETURN; ret = kprobe_on_func_entry(NULL, symbol, offset); if (ret == 0 && !is_return) - ctx.flags |= TPARG_FL_FENTRY; + ctx->flags |= TPARG_FL_FENTRY; /* Defer the ENOENT case until register kprobe */ if (ret == -EINVAL && is_return) { trace_probe_log_err(0, BAD_RETPROBE); - goto parse_error; + return -EINVAL; } } trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto parse_error; + return ret; } if (!event) { /* Make a new event name */ + ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!ebuf) + return -ENOMEM; if (symbol) - snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", is_return ? 'r' : 'p', symbol, offset); else - snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 'r' : 'p', addr); - sanitize_event_name(buf); - event = buf; + sanitize_event_name(ebuf); + event = ebuf; } + abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL); + if (!abuf) + return -ENOMEM; argc -= 2; argv += 2; - ctx.funcname = symbol; + ctx->funcname = symbol; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, - abuf, MAX_BTF_ARGS_LEN, &ctx); + abuf, MAX_BTF_ARGS_LEN, ctx); if (IS_ERR(new_argv)) { ret = PTR_ERR(new_argv); new_argv = NULL; - goto out; + return ret; } if (new_argv) { argc = new_argc; argv = new_argv; } if (argc > MAX_TRACE_ARGS) { - ret = -E2BIG; - goto out; + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); + return -E2BIG; } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) - goto out; + return ret; /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, @@ -1029,16 +1044,16 @@ static int __trace_kprobe_create(int argc, const char *argv[]) ret = PTR_ERR(tk); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto out; /* We know tk is not allocated */ + return ret; /* We know tk is not allocated */ } /* parse arguments */ for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); - ctx.offset = 0; - ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); + ctx->offset = 0; + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], ctx); if (ret) - goto error; /* This can be -ENOMEM */ + return ret; /* This can be -ENOMEM */ } /* entry handler for kretprobe */ if (is_return && tk->tp.entry_arg) { @@ -1049,7 +1064,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; ret = traceprobe_set_print_fmt(&tk->tp, ptype); if (ret < 0) - goto error; + return ret; ret = register_trace_kprobe(tk); if (ret) { @@ -1060,27 +1075,38 @@ static int __trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_PROBE_ADDR); else if (ret != -ENOMEM && ret != -EEXIST) trace_probe_log_err(0, FAIL_REG_PROBE); - goto error; + return ret; } + /* + * Here, 'tk' has been registered to the list successfully, + * so we don't need to free it. + */ + tk = NULL; + + return 0; +} + +static int trace_kprobe_create_cb(int argc, const char *argv[]) +{ + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; + int ret; + + ctx = kzalloc_obj(*ctx); + if (!ctx) + return -ENOMEM; + ctx->flags = TPARG_FL_KERNEL; + + trace_probe_log_init("trace_kprobe", argc, argv); + + ret = trace_kprobe_create_internal(argc, argv, ctx); -out: - traceprobe_finish_parse(&ctx); trace_probe_log_clear(); - kfree(new_argv); - kfree(symbol); - kfree(dbuf); return ret; - -parse_error: - ret = -EINVAL; -error: - free_trace_kprobe(tk); - goto out; } static int trace_kprobe_create(const char *raw_command) { - return trace_probe_create(raw_command, __trace_kprobe_create); + return trace_probe_create(raw_command, trace_kprobe_create_cb); } static int create_or_delete_trace_kprobe(const char *raw_command) @@ -1090,7 +1116,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_kprobe_ops); - ret = trace_kprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_kprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } @@ -1568,7 +1594,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ip, flags)) goto out; trace_seq_putc(s, ')'); @@ -1598,12 +1624,12 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ret_ip, flags)) goto out; trace_seq_puts(s, " <- "); - if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_no_offset(s, field->func, flags)) goto out; trace_seq_putc(s, ')'); @@ -1799,14 +1825,15 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + unsigned int flags = trace_probe_load_flag(&tk->tp); int ret = 0; raw_cpu_inc(*tk->nhit); - if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) + if (flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) ret = kprobe_perf_func(tk, regs); #endif return ret; @@ -1818,6 +1845,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct kretprobe *rp = get_kretprobe(ri); struct trace_kprobe *tk; + unsigned int flags; /* * There is a small chance that get_kretprobe(ri) returns NULL when @@ -1830,10 +1858,11 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) tk = container_of(rp, struct trace_kprobe, rp); raw_cpu_inc(*tk->nhit); - if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) + flags = trace_probe_load_flag(&tk->tp); + if (flags & TP_FLAG_TRACE) kretprobe_trace_func(tk, ri, regs); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) kretprobe_perf_func(tk, ri, regs); #endif return 0; /* We don't tweak kernel, so just return 0 */ @@ -1896,7 +1925,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, bool is_return) { enum probe_print_type ptype; - struct trace_kprobe *tk; + struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; int ret; char *event; @@ -1927,19 +1956,14 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, ptype = trace_kprobe_is_return(tk) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; - if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) { - ret = -ENOMEM; - goto error; - } + if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) + return ERR_PTR(-ENOMEM); ret = __register_trace_kprobe(tk); if (ret < 0) - goto error; + return ERR_PTR(ret); - return trace_probe_event_call(&tk->tp); -error: - free_trace_kprobe(tk); - return ERR_PTR(ret); + return trace_probe_event_call(&(no_free_ptr(tk)->tp)); } void destroy_local_trace_kprobe(struct trace_event_call *event_call) @@ -1968,13 +1992,15 @@ static __init void enable_boot_kprobe_events(void) struct trace_kprobe *tk; struct dyn_event *pos; - mutex_lock(&event_mutex); + if (trace_kprobe_list_empty()) + return; + + guard(mutex)(&event_mutex); for_each_trace_kprobe(tk, pos) { list_for_each_entry(file, &tr->events, list) if (file->event_call == trace_probe_event_call(&tk->tp)) trace_event_enable_disable(file, 1, 0); } - mutex_unlock(&event_mutex); } static __init void setup_boot_kprobe_events(void) @@ -2035,6 +2061,10 @@ static __init int init_kprobe_trace(void) trace_create_file("kprobe_profile", TRACE_MODE_READ, NULL, NULL, &kprobe_profile_ops); + /* If no 'kprobe_event=' cmd is provided, return directly. */ + if (kprobe_boot_events_buf[0] == '\0') + return 0; + setup_boot_kprobe_events(); return 0; @@ -2066,7 +2096,7 @@ static __init int kprobe_trace_self_tests_init(void) struct trace_kprobe *tk; struct trace_event_file *file; - if (tracing_is_disabled()) + if (unlikely(tracing_disabled)) return -ENODEV; if (tracing_selftest_disabled) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index ba5858866b2f..226cf66e0d68 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -101,7 +101,7 @@ static void mmio_pipe_open(struct trace_iterator *iter) trace_seq_puts(s, "VERSION 20070824\n"); - hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); + hiter = kzalloc_obj(*hiter); if (!hiter) return; @@ -291,7 +291,6 @@ __init static int init_mmio_trace(void) device_initcall(init_mmio_trace); static void __trace_mmiotrace_rw(struct trace_array *tr, - struct trace_array_cpu *data, struct mmiotrace_rw *rw) { struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -315,12 +314,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); - __trace_mmiotrace_rw(tr, data, rw); + __trace_mmiotrace_rw(tr, rw); } static void __trace_mmiotrace_map(struct trace_array *tr, - struct trace_array_cpu *data, struct mmiotrace_map *map) { struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -344,12 +341,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, void mmio_trace_mapping(struct mmiotrace_map *map) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data; - - preempt_disable(); - data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); - __trace_mmiotrace_map(tr, data, map); - preempt_enable(); + __trace_mmiotrace_map(tr, map); } int mmio_trace_printk(const char *fmt, va_list args) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index b9f96c77527d..75678053b21c 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -24,6 +24,7 @@ #include <linux/sched/clock.h> #include <uapi/linux/sched/types.h> #include <linux/sched.h> +#include <linux/string.h> #include "trace.h" #ifdef CONFIG_X86_LOCAL_APIC @@ -57,6 +58,7 @@ enum osnoise_options_index { OSN_PANIC_ON_STOP, OSN_PREEMPT_DISABLE, OSN_IRQ_DISABLE, + OSN_TIMERLAT_ALIGN, OSN_MAX }; @@ -65,7 +67,8 @@ static const char * const osnoise_options_str[OSN_MAX] = { "OSNOISE_WORKLOAD", "PANIC_ON_STOP", "OSNOISE_PREEMPT_DISABLE", - "OSNOISE_IRQ_DISABLE" }; + "OSNOISE_IRQ_DISABLE", + "TIMERLAT_ALIGN" }; #define OSN_DEFAULT_OPTIONS 0x2 static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; @@ -121,7 +124,7 @@ static int osnoise_register_instance(struct trace_array *tr) */ lockdep_assert_held(&trace_types_lock); - inst = kmalloc(sizeof(*inst), GFP_KERNEL); + inst = kmalloc_obj(*inst); if (!inst) return -ENOMEM; @@ -250,6 +253,11 @@ struct timerlat_variables { static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); /* + * timerlat wake-up offset for next thread with TIMERLAT_ALIGN set. + */ +static atomic64_t align_next; + +/* * this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU */ static inline struct timerlat_variables *this_cpu_tmr_var(void) @@ -267,16 +275,23 @@ static inline void tlat_var_reset(void) /* Synchronize with the timerlat interfaces */ mutex_lock(&interface_lock); + /* * So far, all the values are initialized as 0, so * zeroing the structure is perfect. */ - for_each_cpu(cpu, cpu_online_mask) { + for_each_online_cpu(cpu) { tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); if (tlat_var->kthread) hrtimer_cancel(&tlat_var->timer); memset(tlat_var, 0, sizeof(*tlat_var)); } + /* + * Reset also align_next, to be filled by a new offset by the first timerlat + * thread that wakes up, if TIMERLAT_ALIGN is set. + */ + atomic64_set(&align_next, 0); + mutex_unlock(&interface_lock); } #else /* CONFIG_TIMERLAT_TRACER */ @@ -295,7 +310,7 @@ static inline void osn_var_reset(void) * So far, all the values are initialized as 0, so * zeroing the structure is perfect. */ - for_each_cpu(cpu, cpu_online_mask) { + for_each_online_cpu(cpu) { osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); memset(osn_var, 0, sizeof(*osn_var)); } @@ -316,33 +331,6 @@ static inline void osn_var_reset_all(void) bool trace_osnoise_callback_enabled; /* - * osnoise sample structure definition. Used to store the statistics of a - * sample run. - */ -struct osnoise_sample { - u64 runtime; /* runtime */ - u64 noise; /* noise */ - u64 max_sample; /* max single noise sample */ - int hw_count; /* # HW (incl. hypervisor) interference */ - int nmi_count; /* # NMIs during this sample */ - int irq_count; /* # IRQs during this sample */ - int softirq_count; /* # softirqs during this sample */ - int thread_count; /* # threads during this sample */ -}; - -#ifdef CONFIG_TIMERLAT_TRACER -/* - * timerlat sample structure definition. Used to store the statistics of - * a sample run. - */ -struct timerlat_sample { - u64 timer_latency; /* timer_latency */ - unsigned int seqnum; /* unique sequence */ - int context; /* timer context */ -}; -#endif - -/* * Tracer data. */ static struct osnoise_data { @@ -352,10 +340,11 @@ static struct osnoise_data { u64 stop_tracing_total; /* stop trace in the final operation (report/thread) */ #ifdef CONFIG_TIMERLAT_TRACER u64 timerlat_period; /* timerlat period */ + u64 timerlat_align_us; /* timerlat alignment */ u64 print_stack; /* print IRQ stack if total > */ int timerlat_tracer; /* timerlat tracer */ #endif - bool tainted; /* infor users and developers about a problem */ + bool tainted; /* info users and developers about a problem */ } osnoise_data = { .sample_period = DEFAULT_SAMPLE_PERIOD, .sample_runtime = DEFAULT_SAMPLE_RUNTIME, @@ -364,6 +353,7 @@ static struct osnoise_data { #ifdef CONFIG_TIMERLAT_TRACER .print_stack = 0, .timerlat_period = DEFAULT_TIMERLAT_PERIOD, + .timerlat_align_us = 0, .timerlat_tracer = 0, #endif }; @@ -497,7 +487,7 @@ static void print_osnoise_headers(struct seq_file *s) * Record an osnoise_sample into the tracer buffer. */ static void -__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) +__record_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) { struct ring_buffer_event *event; struct osnoise_entry *entry; @@ -520,17 +510,19 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe } /* - * Record an osnoise_sample on all osnoise instances. + * Record an osnoise_sample on all osnoise instances and fire trace event. */ -static void trace_osnoise_sample(struct osnoise_sample *sample) +static void record_osnoise_sample(struct osnoise_sample *sample) { struct osnoise_instance *inst; struct trace_buffer *buffer; + trace_osnoise_sample(sample); + rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { buffer = inst->tr->array_buffer.buffer; - __trace_osnoise_sample(sample, buffer); + __record_osnoise_sample(sample, buffer); } rcu_read_unlock(); } @@ -574,7 +566,7 @@ static void print_timerlat_headers(struct seq_file *s) #endif /* CONFIG_PREEMPT_RT */ static void -__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) +__record_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) { struct ring_buffer_event *event; struct timerlat_entry *entry; @@ -594,15 +586,17 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf /* * Record an timerlat_sample into the tracer buffer. */ -static void trace_timerlat_sample(struct timerlat_sample *sample) +static void record_timerlat_sample(struct timerlat_sample *sample) { struct osnoise_instance *inst; struct trace_buffer *buffer; + trace_timerlat_sample(sample); + rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { buffer = inst->tr->array_buffer.buffer; - __trace_timerlat_sample(sample, buffer); + __record_timerlat_sample(sample, buffer); } rcu_read_unlock(); } @@ -660,8 +654,8 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u entry = ring_buffer_event_data(event); - memcpy(&entry->caller, fstack->calls, size); entry->size = fstack->nr_entries; + memcpy(&entry->caller, fstack->calls, size); trace_buffer_unlock_commit_nostack(buffer, event); } @@ -760,7 +754,7 @@ cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration) /* * get_int_safe_duration - Get the duration of a window * - * The irq, softirq and thread varaibles need to have its duration without + * The irq, softirq and thread variables need to have its duration without * the interference from higher priority interrupts. Instead of keeping a * variable to discount the interrupt interference from these variables, the * starting time of these variables are pushed forward with the interrupt's @@ -1229,6 +1223,8 @@ static void trace_sched_migrate_callback(void *data, struct task_struct *p, int } } +static bool monitor_enabled; + static int register_migration_monitor(void) { int ret = 0; @@ -1237,16 +1233,25 @@ static int register_migration_monitor(void) * Timerlat thread migration check is only required when running timerlat in user-space. * Thus, enable callback only if timerlat is set with no workload. */ - if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) { + if (WARN_ON_ONCE(monitor_enabled)) + return 0; + ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + if (!ret) + monitor_enabled = true; + } return ret; } static void unregister_migration_monitor(void) { - if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) - unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + if (!monitor_enabled) + return; + + unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + monitor_enabled = false; } #else static int register_migration_monitor(void) @@ -1471,7 +1476,7 @@ static int run_osnoise(void) stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC; /* - * Start timestemp + * Start timestamp */ start = time_get(); @@ -1531,27 +1536,25 @@ static int run_osnoise(void) /* * In some cases, notably when running on a nohz_full CPU with - * a stopped tick PREEMPT_RCU has no way to account for QSs. - * This will eventually cause unwarranted noise as PREEMPT_RCU - * will force preemption as the means of ending the current - * grace period. We avoid this problem by calling - * rcu_momentary_eqs(), which performs a zero duration - * EQS allowing PREEMPT_RCU to end the current grace period. - * This call shouldn't be wrapped inside an RCU critical - * section. + * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to + * account for QSs. This will eventually cause unwarranted + * noise as RCU forces preemption as the means of ending the + * current grace period. We avoid this by calling + * rcu_momentary_eqs(), which performs a zero duration EQS + * allowing RCU to end the current grace period. This call + * shouldn't be wrapped inside an RCU critical section. * - * Note that in non PREEMPT_RCU kernels QSs are handled through - * cond_resched() + * Normally QSs for other cases are handled through cond_resched(). + * For simplicity, however, we call rcu_momentary_eqs() for all + * configurations here. */ - if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { - if (!disable_irq) - local_irq_disable(); + if (!disable_irq) + local_irq_disable(); - rcu_momentary_eqs(); + rcu_momentary_eqs(); - if (!disable_irq) - local_irq_enable(); - } + if (!disable_irq) + local_irq_enable(); /* * For the non-preemptive kernel config: let threads runs, if @@ -1597,7 +1600,7 @@ static int run_osnoise(void) /* Save interference stats info */ diff_osn_sample_stats(osn_var, &s); - trace_osnoise_sample(&s); + record_osnoise_sample(&s); notify_new_max_latency(max_noise); @@ -1792,7 +1795,7 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) s.timer_latency = diff; s.context = IRQ_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); if (osnoise_data.stop_tracing) { if (time_to_us(diff) >= osnoise_data.stop_tracing) { @@ -1843,6 +1846,26 @@ static int wait_next_period(struct timerlat_variables *tlat) tlat->abs_period = (u64) ktime_to_ns(next_abs_period); /* + * Align thread in the first cycle on each CPU to the set alignment + * if TIMERLAT_ALIGN is set. + * + * This is done by using an atomic64_t to store the next absolute period. + * The first thread that wakes up will set the atomic64_t to its + * absolute period, and the other threads will increment it by + * the alignment value. + */ + if (test_bit(OSN_TIMERLAT_ALIGN, &osnoise_options) && !tlat->count + && atomic64_cmpxchg_relaxed(&align_next, 0, tlat->abs_period)) { + /* + * A thread has already set align_next, use it and increment it + * to be used by the next thread that wakes up after this one. + */ + tlat->abs_period = atomic64_add_return_relaxed( + osnoise_data.timerlat_align_us * 1000, &align_next); + next_abs_period = ns_to_ktime(tlat->abs_period); + } + + /* * If the new abs_period is in the past, skip the activation. */ while (ktime_compare(now, next_abs_period) > 0) { @@ -1890,12 +1913,11 @@ static int timerlat_main(void *data) tlat->count = 0; tlat->tracing_thread = false; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); tlat->kthread = current; osn_var->pid = current->pid; /* - * Anotate the arrival time. + * Annotate the arrival time. */ tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); @@ -1912,7 +1934,7 @@ static int timerlat_main(void *data) s.timer_latency = diff; s.context = THREAD_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); notify_new_max_latency(diff); @@ -1992,7 +2014,7 @@ static void stop_per_cpu_kthreads(void) } /* - * start_kthread - Start a workload tread + * start_kthread - Start a workload thread */ static int start_kthread(unsigned int cpu) { @@ -2021,7 +2043,6 @@ static int start_kthread(unsigned int cpu) if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); - stop_per_cpu_kthreads(); return -ENOMEM; } @@ -2083,26 +2104,21 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) { unsigned int cpu = smp_processor_id(); - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (!osnoise_has_registered_instances()) - goto out_unlock_trace; + return; - mutex_lock(&interface_lock); - cpus_read_lock(); + guard(cpus_read_lock)(); + guard(mutex)(&interface_lock); if (!cpu_online(cpu)) - goto out_unlock; + return; + if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) - goto out_unlock; + return; start_kthread(cpu); - -out_unlock: - cpus_read_unlock(); - mutex_unlock(&interface_lock); -out_unlock_trace: - mutex_unlock(&trace_types_lock); } static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn); @@ -2257,11 +2273,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, if (running) stop_per_cpu_kthreads(); - mutex_lock(&interface_lock); /* * avoid CPU hotplug operations that might read options. */ cpus_read_lock(); + mutex_lock(&interface_lock); retval = cnt; @@ -2277,8 +2293,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, clear_bit(option, &osnoise_options); } - cpus_read_unlock(); mutex_unlock(&interface_lock); + cpus_read_unlock(); if (running) start_per_cpu_kthreads(); @@ -2300,31 +2316,22 @@ static ssize_t osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - char *mask_str; + char *mask_str __free(kfree) = NULL; int len; - mutex_lock(&interface_lock); + guard(mutex)(&interface_lock); len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1; mask_str = kmalloc(len, GFP_KERNEL); - if (!mask_str) { - count = -ENOMEM; - goto out_unlock; - } + if (!mask_str) + return -ENOMEM; len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)); - if (len >= count) { - count = -EINVAL; - goto out_free; - } + if (len >= count) + return -EINVAL; count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); -out_free: - kfree(mask_str); -out_unlock: - mutex_unlock(&interface_lock); - return count; } @@ -2332,7 +2339,7 @@ out_unlock: * osnoise_cpus_write - Write function for "cpus" entry * @filp: The active open file structure * @ubuf: The user buffer that contains the value to write - * @cnt: The maximum number of bytes to write to "file" + * @count: The maximum number of bytes to write to "file" * @ppos: The current position in @file * * This function provides a write implementation for the "cpus" @@ -2350,13 +2357,14 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, { cpumask_var_t osnoise_cpumask_new; int running, err; - char buf[256]; + char *buf __free(kfree) = NULL; - if (count >= 256) - return -EINVAL; + if (count < 1) + return 0; - if (copy_from_user(buf, ubuf, count)) - return -EFAULT; + buf = memdup_user_nul(ubuf, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL)) return -ENOMEM; @@ -2373,16 +2381,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, if (running) stop_per_cpu_kthreads(); - mutex_lock(&interface_lock); /* * osnoise_cpumask is read by CPU hotplug operations. */ cpus_read_lock(); + mutex_lock(&interface_lock); cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); - cpus_read_unlock(); mutex_unlock(&interface_lock); + cpus_read_unlock(); if (running) start_per_cpu_kthreads(); @@ -2459,8 +2467,7 @@ static int timerlat_fd_open(struct inode *inode, struct file *file) tlat = this_cpu_tmr_var(); tlat->count = 0; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); migrate_enable(); return 0; @@ -2532,7 +2539,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, s.timer_latency = diff; s.context = THREAD_URET; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); notify_new_max_latency(diff); @@ -2567,7 +2574,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, s.timer_latency = diff; s.context = THREAD_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); if (osnoise_data.stop_tracing_total) { if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { @@ -2679,6 +2686,17 @@ static struct trace_min_max_param timerlat_period = { .min = &timerlat_min_period, }; +/* + * osnoise/timerlat_align_us: align the first wakeup of all timerlat + * threads to a common boundary (in us). 0 means disabled. + */ +static struct trace_min_max_param timerlat_align_us = { + .lock = &interface_lock, + .val = &osnoise_data.timerlat_align_us, + .max = NULL, + .min = NULL, +}; + static const struct file_operations timerlat_fd_fops = { .open = timerlat_fd_open, .read = timerlat_fd_read, @@ -2734,7 +2752,7 @@ static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir) * Why not using tracing instance per_cpu/ dir? * * Because osnoise/timerlat have a single workload, having - * multiple files like these are wast of memory. + * multiple files like these are waste of memory. */ per_cpu = tracefs_create_dir("per_cpu", top_dir); if (!per_cpu) @@ -2775,6 +2793,11 @@ static int init_timerlat_tracefs(struct dentry *top_dir) if (!tmp) return -ENOMEM; + tmp = tracefs_create_file("timerlat_align_us", TRACE_MODE_WRITE, top_dir, + &timerlat_align_us, &trace_min_max_fops); + if (!tmp) + return -ENOMEM; + retval = osnoise_create_cpu_timerlat_fd(top_dir); if (retval) return retval; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 03d56f711ad1..a5ad76175d10 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -5,6 +5,7 @@ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> * */ +#include "trace.h" #include <linux/module.h> #include <linux/mutex.h> #include <linux/ftrace.h> @@ -12,15 +13,19 @@ #include <linux/sched/clock.h> #include <linux/sched/mm.h> #include <linux/idr.h> +#include <linux/btf.h> +#include <linux/bpf.h> +#include <linux/hashtable.h> #include "trace_output.h" +#include "trace_btf.h" -/* must be a power of 2 */ -#define EVENT_HASHSIZE 128 +/* 2^7 = 128 */ +#define EVENT_HASH_BITS 7 DECLARE_RWSEM(trace_event_sem); -static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; +static DEFINE_HASHTABLE(event_hash, EVENT_HASH_BITS); enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { @@ -64,14 +69,15 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) const char * trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, - const struct trace_print_flags *flag_array) + const struct trace_print_flags *flag_array, + size_t flag_array_size) { unsigned long mask; const char *str; const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; - for (i = 0; flag_array[i].name && flags; i++) { + for (i = 0; i < flag_array_size && flags; i++) { mask = flag_array[i].mask; if ((flags & mask) != mask) @@ -101,12 +107,13 @@ EXPORT_SYMBOL(trace_print_flags_seq); const char * trace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array) + const struct trace_print_flags *symbol_array, + size_t symbol_array_size) { int i; const char *ret = trace_seq_buffer_ptr(p); - for (i = 0; symbol_array[i].name; i++) { + for (i = 0; i < symbol_array_size; i++) { if (val != symbol_array[i].mask) continue; @@ -128,14 +135,15 @@ EXPORT_SYMBOL(trace_print_symbols_seq); const char * trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, - const struct trace_print_flags_u64 *flag_array) + const struct trace_print_flags_u64 *flag_array, + size_t flag_array_size) { unsigned long long mask; const char *str; const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; - for (i = 0; flag_array[i].name && flags; i++) { + for (i = 0; i < flag_array_size && flags; i++) { mask = flag_array[i].mask; if ((flags & mask) != mask) @@ -165,12 +173,13 @@ EXPORT_SYMBOL(trace_print_flags_seq_u64); const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, - const struct trace_print_flags_u64 *symbol_array) + const struct trace_print_flags_u64 *symbol_array, + size_t symbol_array_size) { int i; const char *ret = trace_seq_buffer_ptr(p); - for (i = 0; symbol_array[i].name; i++) { + for (i = 0; i < symbol_array_size; i++) { if (val != symbol_array[i].mask) continue; @@ -189,13 +198,37 @@ trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, EXPORT_SYMBOL(trace_print_symbols_seq_u64); #endif +/** + * trace_print_bitmask_seq - print a bitmask to a sequence buffer + * @iter: The trace iterator for the current event instance + * @bitmask_ptr: The pointer to the bitmask data + * @bitmask_size: The size of the bitmask in bytes + * + * Prints a bitmask into a sequence buffer as either a hex string or a + * human-readable range list, depending on the instance's "bitmask-list" + * trace option. The bitmask is formatted into the iterator's temporary + * scratchpad rather than the primary sequence buffer. This avoids + * duplication and pointer-collision issues when the returned string is + * processed by a "%s" specifier in a TP_printk() macro. + * + * Returns a pointer to the formatted string within the temporary buffer. + */ const char * -trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, +trace_print_bitmask_seq(struct trace_iterator *iter, void *bitmask_ptr, unsigned int bitmask_size) { - const char *ret = trace_seq_buffer_ptr(p); + struct trace_seq *p = &iter->tmp_seq; + const struct trace_array *tr = iter->tr; + const char *ret; + + trace_seq_init(p); + ret = trace_seq_buffer_ptr(p); + + if (tr->trace_flags & TRACE_ITER(BITMASK_LIST)) + trace_seq_bitmask_list(p, bitmask_ptr, bitmask_size * 8); + else + trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); - trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); trace_seq_putc(p, 0); return ret; @@ -415,7 +448,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, } mmap_read_unlock(mm); } - if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + if (ret && ((sym_flags & TRACE_ITER(SYM_ADDR)) || !file)) trace_seq_printf(s, " <" IP_FMT ">", ip); return !trace_seq_has_overflowed(s); } @@ -428,9 +461,9 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); + trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER(SYM_OFFSET)); - if (sym_flags & TRACE_ITER_SYM_ADDR) + if (sym_flags & TRACE_ITER(SYM_ADDR)) trace_seq_printf(s, " <" IP_FMT ">", ip); out: @@ -564,7 +597,7 @@ static int lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { struct trace_array *tr = iter->tr; - unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE; + unsigned long verbose = tr->trace_flags & TRACE_ITER(VERBOSE); unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; unsigned long long abs_ts = iter->ts - iter->array_buffer->time_start; unsigned long long rel_ts = next_ts - iter->ts; @@ -631,7 +664,7 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid); - if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) { unsigned int tgid = trace_find_tgid(entry->pid); if (!tgid) @@ -642,7 +675,7 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "[%03d] ", iter->cpu); - if (tr->trace_flags & TRACE_ITER_IRQ_INFO) + if (tr->trace_flags & TRACE_ITER(IRQ_INFO)) trace_print_lat_fmt(s, entry); trace_print_time(s, iter, iter->ts); @@ -656,7 +689,7 @@ int trace_print_lat_context(struct trace_iterator *iter) struct trace_entry *entry, *next_entry; struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE); + unsigned long verbose = (tr->trace_flags & TRACE_ITER(VERBOSE)); u64 next_ts; next_entry = trace_find_next_entry(iter, NULL, &next_ts); @@ -684,6 +717,104 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } +#ifdef CONFIG_FUNCTION_TRACE_ARGS +void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func) +{ + const struct btf_param *param; + const struct btf_type *t; + const struct btf_enum *enums; + const char *param_name; + char name[KSYM_NAME_LEN]; + unsigned long arg; + struct btf *btf; + s32 tid, nr = 0; + int a, p, x, i; + u16 encode; + + trace_seq_printf(s, "("); + + if (!args) + goto out; + if (lookup_symbol_name(func, name)) + goto out; + + /* TODO: Pass module name here too */ + t = btf_find_func_proto(name, &btf); + if (IS_ERR_OR_NULL(t)) + goto out; + + param = btf_get_func_param(t, &nr); + if (!param) + goto out_put; + + for (a = 0, p = 0; p < nr; a++, p++) { + if (p) + trace_seq_puts(s, ", "); + + /* This only prints what the arch allows (6 args by default) */ + if (a == FTRACE_REGS_MAX_ARGS) { + trace_seq_puts(s, "..."); + break; + } + + arg = args[a]; + + param_name = btf_name_by_offset(btf, param[p].name_off); + if (param_name) + trace_seq_printf(s, "%s=", param_name); + t = btf_type_skip_modifiers(btf, param[p].type, &tid); + + switch (t ? BTF_INFO_KIND(t->info) : BTF_KIND_UNKN) { + case BTF_KIND_UNKN: + trace_seq_putc(s, '?'); + /* Still print unknown type values */ + fallthrough; + case BTF_KIND_PTR: + trace_seq_printf(s, "0x%lx", arg); + break; + case BTF_KIND_INT: + encode = btf_int_encoding(t); + /* Print unsigned ints as hex */ + if (encode & BTF_INT_SIGNED) + trace_seq_printf(s, "%ld", arg); + else + trace_seq_printf(s, "0x%lx", arg); + break; + case BTF_KIND_ENUM: + trace_seq_printf(s, "%ld", arg); + enums = btf_enum(t); + for (i = 0; i < btf_vlen(t); i++) { + if (arg == enums[i].val) { + trace_seq_printf(s, " [%s]", + btf_name_by_offset(btf, + enums[i].name_off)); + break; + } + } + break; + default: + /* This does not handle complex arguments */ + trace_seq_printf(s, "(%s)[0x%lx", btf_type_str(t), arg); + for (x = sizeof(long); x < t->size; x += sizeof(long)) { + trace_seq_putc(s, ':'); + if (++a == FTRACE_REGS_MAX_ARGS) { + trace_seq_puts(s, "...]"); + goto out_put; + } + trace_seq_printf(s, "0x%lx", args[a]); + } + trace_seq_putc(s, ']'); + break; + } + } +out_put: + btf_put(btf); +out: + trace_seq_printf(s, ")"); +} +#endif + /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -694,11 +825,8 @@ int trace_print_lat_context(struct trace_iterator *iter) struct trace_event *ftrace_find_event(int type) { struct trace_event *event; - unsigned key; - key = type & (EVENT_HASHSIZE - 1); - - hlist_for_each_entry(event, &event_hash[key], node) { + hash_for_each_possible(event_hash, event, node, type) { if (event->type == type) return event; } @@ -753,7 +881,6 @@ void trace_event_read_unlock(void) */ int register_trace_event(struct trace_event *event) { - unsigned key; int ret = 0; down_write(&trace_event_sem); @@ -786,9 +913,7 @@ int register_trace_event(struct trace_event *event) if (event->funcs->binary == NULL) event->funcs->binary = trace_nop_print; - key = event->type & (EVENT_HASHSIZE - 1); - - hlist_add_head(&event->node, &event_hash[key]); + hash_add(event_hash, &event->node, event->type); ret = event->type; out: @@ -803,7 +928,7 @@ EXPORT_SYMBOL_GPL(register_trace_event); */ int __unregister_trace_event(struct trace_event *event) { - hlist_del(&event->node); + hash_del(&event->node); free_trace_event_type(event->type); return 0; } @@ -857,10 +982,15 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c struct list_head *head) { struct ftrace_event_field *field; + struct trace_array *tr = iter->tr; + unsigned long long laddr; + unsigned long addr; int offset; int len; int ret; + int i; void *pos; + char *str; list_for_each_entry_reverse(field, head, link) { trace_seq_printf(&iter->seq, " %s=", field->name); @@ -887,14 +1017,35 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c trace_seq_puts(&iter->seq, "<OVERFLOW>"); break; } - pos = (void *)iter->ent + offset; - trace_seq_printf(&iter->seq, "%.*s", len, (char *)pos); + str = (char *)iter->ent + offset; + /* Check if there's any non printable strings */ + for (i = 0; i < len; i++) { + if (str[i] && !(isascii(str[i]) && isprint(str[i]))) + break; + } + if (i < len) { + for (i = 0; i < len; i++) { + if (isascii(str[i]) && isprint(str[i])) + trace_seq_putc(&iter->seq, str[i]); + else + trace_seq_putc(&iter->seq, '.'); + } + trace_seq_puts(&iter->seq, " ("); + for (i = 0; i < len; i++) { + if (i) + trace_seq_putc(&iter->seq, ':'); + trace_seq_printf(&iter->seq, "%02x", str[i]); + } + trace_seq_putc(&iter->seq, ')'); + } else { + trace_seq_printf(&iter->seq, "%.*s", len, str); + } break; case FILTER_PTR_STRING: if (!iter->fmt_size) trace_iter_expand_format(iter); - pos = *(void **)pos; - ret = strncpy_from_kernel_nofault(iter->fmt, pos, + addr = trace_adjust_address(tr, *(unsigned long *)pos); + ret = strncpy_from_kernel_nofault(iter->fmt, (void *)addr, iter->fmt_size); if (ret < 0) trace_seq_printf(&iter->seq, "(0x%px)", pos); @@ -903,8 +1054,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c pos, iter->fmt); break; case FILTER_TRACE_FN: - pos = *(void **)pos; - trace_seq_printf(&iter->seq, "%pS", pos); + addr = trace_adjust_address(tr, *(unsigned long *)pos); + trace_seq_printf(&iter->seq, "%pS", (void *)addr); break; case FILTER_CPU: case FILTER_OTHER: @@ -934,14 +1085,36 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c break; } - trace_seq_printf(&iter->seq, "0x%x (%d)", - *(unsigned int *)pos, - *(unsigned int *)pos); + addr = *(unsigned int *)pos; + + /* Some fields reference offset from _stext. */ + if (!strcmp(field->name, "caller_offs") || + !strcmp(field->name, "parent_offs")) { + unsigned long ip; + + ip = addr + (unsigned long)_stext; + ip = trace_adjust_address(tr, ip); + trace_seq_printf(&iter->seq, "%pS ", (void *)ip); + } + + if (sizeof(long) == 4) { + addr = trace_adjust_address(tr, addr); + trace_seq_printf(&iter->seq, "%pS (%d)", + (void *)addr, (int)addr); + } else { + trace_seq_printf(&iter->seq, "0x%x (%d)", + (unsigned int)addr, (int)addr); + } break; case 8: - trace_seq_printf(&iter->seq, "0x%llx (%lld)", - *(unsigned long long *)pos, - *(unsigned long long *)pos); + laddr = *(unsigned long long *)pos; + if (sizeof(long) == 8) { + laddr = trace_adjust_address(tr, (unsigned long)laddr); + trace_seq_printf(&iter->seq, "%pS (%lld)", + (void *)(long)laddr, laddr); + } else { + trace_seq_printf(&iter->seq, "0x%llx (%lld)", laddr, laddr); + } break; default: trace_seq_puts(&iter->seq, "<INVALID-SIZE>"); @@ -961,11 +1134,12 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, struct trace_event_call *call; struct list_head *head; + lockdep_assert_held_read(&trace_event_sem); + /* ftrace defined events have separate call structures */ if (event->type <= __TRACE_LAST_TYPE) { bool found = false; - down_read(&trace_event_sem); list_for_each_entry(call, &ftrace_events, list) { if (call->event.type == event->type) { found = true; @@ -975,7 +1149,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, if (call->event.type > __TRACE_LAST_TYPE) break; } - up_read(&trace_event_sem); if (!found) { trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type); goto out; @@ -1005,14 +1178,17 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, } static void print_fn_trace(struct trace_seq *s, unsigned long ip, - unsigned long parent_ip, long delta, int flags) + unsigned long parent_ip, unsigned long *args, + struct trace_array *tr, int flags) { - ip += delta; - parent_ip += delta; + ip = trace_adjust_address(tr, ip); + parent_ip = trace_adjust_address(tr, parent_ip); seq_print_ip_sym(s, ip, flags); + if (args) + print_function_args(s, args, ip); - if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { + if ((flags & TRACE_ITER(PRINT_PARENT)) && parent_ip) { trace_seq_puts(s, " <-"); seq_print_ip_sym(s, parent_ip, flags); } @@ -1024,10 +1200,18 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; + unsigned long *args; + int args_size; trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); + args_size = iter->ent_size - offsetof(struct ftrace_entry, args); + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) + args = field->args; + else + args = NULL; + + print_fn_trace(s, field->ip, field->parent_ip, args, iter->tr, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1248,7 +1432,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; unsigned long *p; unsigned long *end; - long delta = iter->tr->text_delta; trace_assign_type(field, iter->ent); end = (unsigned long *)((long)iter->ent + iter->ent_size); @@ -1265,7 +1448,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n"); continue; } - seq_print_ip_sym(s, (*p) + delta, flags); + seq_print_ip_sym(s, trace_adjust_address(iter->tr, *p), flags); trace_seq_putc(s, '\n'); } @@ -1295,7 +1478,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "<user stack trace>\n"); - if (tr->trace_flags & TRACE_ITER_SYM_USEROBJ) { + if (tr->trace_flags & TRACE_ITER(SYM_USEROBJ)) { struct task_struct *task; /* * we do the lookup on the thread group leader, @@ -1345,12 +1528,12 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld count:%d", + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ptSp count:%d", field->seqnum, field->duration, field->outer_duration, - (long long)field->timestamp.tv_sec, - field->timestamp.tv_nsec, field->count); + &field->timestamp, + field->count); if (field->nmi_count) { /* @@ -1614,7 +1797,7 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - ip = field->ip + iter->tr->text_delta; + ip = trace_adjust_address(iter->tr, field->ip); seq_print_ip_sym(s, ip, flags); trace_seq_printf(s, ": %s", field->buf); @@ -1700,7 +1883,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); + print_fn_trace(s, field->ip, field->parent_ip, NULL, iter->tr, flags); trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); trace_print_time(s, iter, iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index dca40f1f1da4..99b676733d46 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -16,6 +16,17 @@ extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); +static inline int seq_print_ip_sym_offset(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags) +{ + return seq_print_ip_sym(s, ip, sym_flags | TRACE_ITER(SYM_OFFSET)); +} +static inline int seq_print_ip_sym_no_offset(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags) +{ + return seq_print_ip_sym(s, ip, sym_flags & ~TRACE_ITER(SYM_OFFSET)); +} + extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset); extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); @@ -41,5 +52,14 @@ extern struct rw_semaphore trace_event_sem; #define SEQ_PUT_HEX_FIELD(s, x) \ trace_seq_putmem_hex(s, &(x), sizeof(x)) +#ifdef CONFIG_FUNCTION_TRACE_ARGS +void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func); +#else +static inline void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func) { + trace_seq_puts(s, "()"); +} +#endif #endif diff --git a/kernel/trace/trace_pid.c b/kernel/trace/trace_pid.c new file mode 100644 index 000000000000..7127c8de4174 --- /dev/null +++ b/kernel/trace/trace_pid.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "trace.h" + +/** + * trace_find_filtered_pid - check if a pid exists in a filtered_pid list + * @filtered_pids: The list of pids to check + * @search_pid: The PID to find in @filtered_pids + * + * Returns true if @search_pid is found in @filtered_pids, and false otherwise. + */ +bool +trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) +{ + return trace_pid_list_is_set(filtered_pids, search_pid); +} + +/** + * trace_ignore_this_task - should a task be ignored for tracing + * @filtered_pids: The list of pids to check + * @filtered_no_pids: The list of pids not to be traced + * @task: The task that should be ignored if not filtered + * + * Checks if @task should be traced or not from @filtered_pids. + * Returns true if @task should *NOT* be traced. + * Returns false if @task should be traced. + */ +bool +trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct trace_pid_list *filtered_no_pids, + struct task_struct *task) +{ + /* + * If filtered_no_pids is not empty, and the task's pid is listed + * in filtered_no_pids, then return true. + * Otherwise, if filtered_pids is empty, that means we can + * trace all tasks. If it has content, then only trace pids + * within filtered_pids. + */ + + return (filtered_pids && + !trace_find_filtered_pid(filtered_pids, task->pid)) || + (filtered_no_pids && + trace_find_filtered_pid(filtered_no_pids, task->pid)); +} + +/** + * trace_filter_add_remove_task - Add or remove a task from a pid_list + * @pid_list: The list to modify + * @self: The current task for fork or NULL for exit + * @task: The task to add or remove + * + * If adding a task, if @self is defined, the task is only added if @self + * is also included in @pid_list. This happens on fork and tasks should + * only be added when the parent is listed. If @self is NULL, then the + * @task pid will be removed from the list, which would happen on exit + * of a task. + */ +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) +{ + if (!pid_list) + return; + + /* For forks, we only add if the forking task is listed */ + if (self) { + if (!trace_find_filtered_pid(pid_list, self->pid)) + return; + } + + /* "self" is set for forks, and NULL for exits */ + if (self) + trace_pid_list_set(pid_list, task->pid); + else + trace_pid_list_clear(pid_list, task->pid); +} + +/** + * trace_pid_next - Used for seq_file to get to the next pid of a pid_list + * @pid_list: The pid list to show + * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) + * @pos: The position of the file + * + * This is used by the seq_file "next" operation to iterate the pids + * listed in a trace_pid_list structure. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) +{ + long pid = (unsigned long)v; + unsigned int next; + + (*pos)++; + + /* pid already is +1 of the actual previous bit */ + if (trace_pid_list_next(pid_list, pid, &next) < 0) + return NULL; + + pid = next; + + /* Return pid + 1 to allow zero to be represented */ + return (void *)(pid + 1); +} + +/** + * trace_pid_start - Used for seq_file to start reading pid lists + * @pid_list: The pid list to show + * @pos: The position of the file + * + * This is used by seq_file "start" operation to start the iteration + * of listing pids. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) +{ + unsigned long pid; + unsigned int first; + loff_t l = 0; + + if (trace_pid_list_first(pid_list, &first) < 0) + return NULL; + + pid = first; + + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; + pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) + ; + return (void *)pid; +} + +/** + * trace_pid_show - show the current pid in seq_file processing + * @m: The seq_file structure to write into + * @v: A void pointer of the pid (+1) value to display + * + * Can be directly used by seq_file operations to display the current + * pid value. + */ +int trace_pid_show(struct seq_file *m, void *v) +{ + unsigned long pid = (unsigned long)v - 1; + + seq_printf(m, "%lu\n", pid); + return 0; +} + +/* 128 should be much more than enough */ +#define PID_BUF_SIZE 127 + +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt) +{ + struct trace_pid_list *pid_list; + struct trace_parser parser; + unsigned long val; + int nr_pids = 0; + ssize_t read = 0; + ssize_t ret; + loff_t pos; + pid_t pid; + + if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) + return -ENOMEM; + + /* + * Always recreate a new array. The write is an all or nothing + * operation. Always create a new array when adding new pids by + * the user. If the operation fails, then the current list is + * not modified. + */ + pid_list = trace_pid_list_alloc(); + if (!pid_list) { + trace_parser_put(&parser); + return -ENOMEM; + } + + if (filtered_pids) { + /* copy the current bits to the new max */ + ret = trace_pid_list_first(filtered_pids, &pid); + while (!ret) { + ret = trace_pid_list_set(pid_list, pid); + if (ret < 0) + goto out; + + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); + nr_pids++; + } + } + + ret = 0; + while (cnt > 0) { + + pos = 0; + + ret = trace_get_user(&parser, ubuf, cnt, &pos); + if (ret < 0) + break; + + read += ret; + ubuf += ret; + cnt -= ret; + + if (!trace_parser_loaded(&parser)) + break; + + ret = -EINVAL; + if (kstrtoul(parser.buffer, 0, &val)) + break; + + pid = (pid_t)val; + + if (trace_pid_list_set(pid_list, pid) < 0) { + ret = -1; + break; + } + nr_pids++; + + trace_parser_clear(&parser); + ret = 0; + } + out: + trace_parser_put(&parser); + + if (ret < 0) { + trace_pid_list_free(pid_list); + return ret; + } + + if (!nr_pids) { + /* Cleared the list of pids */ + trace_pid_list_free(pid_list); + pid_list = NULL; + } + + *new_pid_list = pid_list; + + return read; +} + diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 29f6e95439b6..3ea17af60169 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -69,7 +69,7 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) } fmt = NULL; - tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); + tb_fmt = kmalloc_obj(*tb_fmt); if (tb_fmt) { fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); if (fmt) { @@ -197,6 +197,7 @@ struct notifier_block module_trace_bprintk_format_nb = { .notifier_call = module_trace_bprintk_format_notify, }; +__printf(2, 3) int __trace_bprintk(unsigned long ip, const char *fmt, ...) { int ret; @@ -376,6 +377,436 @@ static const struct file_operations ftrace_formats_fops = { .release = seq_release, }; +static __always_inline bool printk_binsafe(struct trace_array *tr) +{ + /* + * The binary format of traceprintk can cause a crash if used + * by a buffer from another boot. Force the use of the + * non binary version of trace_printk if the trace_printk + * buffer is a boot mapped ring buffer. + */ + return !(tr->flags & TRACE_ARRAY_FL_BOOT); +} + +int __trace_array_puts(struct trace_array *tr, unsigned long ip, + const char *str, int size) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct print_entry *entry; + unsigned int trace_ctx; + int alloc; + + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) + return 0; + + if (unlikely(tracing_selftest_running && + (tr->flags & TRACE_ARRAY_FL_GLOBAL))) + return 0; + + if (unlikely(tracing_disabled)) + return 0; + + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + + trace_ctx = tracing_gen_ctx(); + buffer = tr->array_buffer.buffer; + guard(ring_buffer_nest)(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + trace_ctx); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, str, size); + + /* Add a newline if necessary */ + if (entry->buf[size - 1] != '\n') { + entry->buf[size] = '\n'; + entry->buf[size + 1] = '\0'; + } else + entry->buf[size] = '\0'; + + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); + return size; +} +EXPORT_SYMBOL_GPL(__trace_array_puts); + +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + */ +int __trace_puts(unsigned long ip, const char *str) +{ + return __trace_array_puts(printk_trace, ip, str, strlen(str)); +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip: The address of the caller + * @str: The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ + struct trace_array *tr = READ_ONCE(printk_trace); + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct bputs_entry *entry; + unsigned int trace_ctx; + int size = sizeof(struct bputs_entry); + + if (!printk_binsafe(tr)) + return __trace_puts(ip, str); + + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) + return 0; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + trace_ctx = tracing_gen_ctx(); + buffer = tr->array_buffer.buffer; + + guard(ring_buffer_nest)(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + trace_ctx); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->str = str; + + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); + + return 1; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + +/* created for use with alloc_percpu */ +struct trace_buffer_struct { + int nesting; + char buffer[4][TRACE_BUF_SIZE]; +}; + +static struct trace_buffer_struct __percpu *trace_percpu_buffer; + +/* + * This allows for lockless recording. If we're nested too deeply, then + * this returns NULL. + */ +static char *get_trace_buf(void) +{ + struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); + + if (!trace_percpu_buffer || buffer->nesting >= 4) + return NULL; + + buffer->nesting++; + + /* Interrupts must see nesting incremented before we use the buffer */ + barrier(); + return &buffer->buffer[buffer->nesting - 1][0]; +} + +static void put_trace_buf(void) +{ + /* Don't let the decrement of nesting leak before this */ + barrier(); + this_cpu_dec(trace_percpu_buffer->nesting); +} + +static int alloc_percpu_trace_buffer(void) +{ + struct trace_buffer_struct __percpu *buffers; + + if (trace_percpu_buffer) + return 0; + + buffers = alloc_percpu(struct trace_buffer_struct); + if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) + return -ENOMEM; + + trace_percpu_buffer = buffers; + return 0; +} + +static int buffers_allocated; + +void trace_printk_init_buffers(void) +{ + if (buffers_allocated) + return; + + if (alloc_percpu_trace_buffer()) + return; + + /* trace_printk() is for debug use only. Don't use it in production. */ + + pr_warn("\n"); + pr_warn("**********************************************************\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("** **\n"); + pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warn("** **\n"); + pr_warn("** This means that this is a DEBUG kernel and it is **\n"); + pr_warn("** unsafe for production use. **\n"); + pr_warn("** **\n"); + pr_warn("** If you see this message and you are not debugging **\n"); + pr_warn("** the kernel, report this immediately to your vendor! **\n"); + pr_warn("** **\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("**********************************************************\n"); + + /* Expand the buffers to set size */ + if (tracing_update_buffers(NULL) < 0) + pr_err("Failed to expand tracing buffers for trace_printk() calls\n"); + else + buffers_allocated = 1; + + /* + * trace_printk_init_buffers() can be called by modules. + * If that happens, then we need to start cmdline recording + * directly here. + */ + if (system_state == SYSTEM_RUNNING) + tracing_start_cmdline_record(); +} +EXPORT_SYMBOL_GPL(trace_printk_init_buffers); + +void trace_printk_start_comm(void) +{ + /* Start tracing comms if trace printk is set */ + if (!buffers_allocated) + return; + tracing_start_cmdline_record(); +} + +void trace_printk_start_stop_comm(int enabled) +{ + if (!buffers_allocated) + return; + + if (enabled) + tracing_start_cmdline_record(); + else + tracing_stop_cmdline_record(); +} + +/** + * trace_vbprintk - write binary msg to tracing buffer + * @ip: The address of the caller + * @fmt: The string format to write to the buffer + * @args: Arguments for @fmt + */ +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct trace_array *tr = READ_ONCE(printk_trace); + struct bprint_entry *entry; + unsigned int trace_ctx; + char *tbuffer; + int len = 0, size; + + if (!printk_binsafe(tr)) + return trace_vprintk(ip, fmt, args); + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + trace_ctx = tracing_gen_ctx(); + guard(preempt_notrace)(); + + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; + goto out_nobuffer; + } + + len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); + + if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) + goto out_put; + + size = sizeof(*entry) + sizeof(u32) * len; + buffer = tr->array_buffer.buffer; + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + trace_ctx); + if (!event) + goto out_put; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; + + memcpy(entry->buf, tbuffer, sizeof(u32) * len); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + } +out_put: + put_trace_buf(); + +out_nobuffer: + unpause_graph_tracing(); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vbprintk); + +static __printf(3, 0) +int __trace_array_vprintk(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) +{ + struct ring_buffer_event *event; + int len = 0, size; + struct print_entry *entry; + unsigned int trace_ctx; + char *tbuffer; + + if (unlikely(tracing_disabled)) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + trace_ctx = tracing_gen_ctx(); + guard(preempt_notrace)(); + + + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; + goto out_nobuffer; + } + + len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); + + size = sizeof(*entry) + len + 1; + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, tbuffer, len + 1); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); + } +out: + put_trace_buf(); + +out_nobuffer: + unpause_graph_tracing(); + + return len; +} + +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) +{ + if (tracing_selftest_running && (tr->flags & TRACE_ARRAY_FL_GLOBAL)) + return 0; + + return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); +} + +/** + * trace_array_printk - Print a message to a specific instance + * @tr: The instance trace_array descriptor + * @ip: The instruction pointer that this is called from. + * @fmt: The format to print (printf format) + * + * If a subsystem sets up its own instance, they have the right to + * printk strings into their tracing instance buffer using this + * function. Note, this function will not write into the top level + * buffer (use trace_printk() for that), as writing into the top level + * buffer should only have events that can be individually disabled. + * trace_printk() is only used for debugging a kernel, and should not + * be ever incorporated in normal use. + * + * trace_array_printk() can be used, as it will not add noise to the + * top level tracing buffer. + * + * Note, trace_array_init_printk() must be called on @tr before this + * can be used. + */ +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!tr) + return -ENOENT; + + /* This is only allowed for created instances */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return 0; + + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(trace_array_printk); + +/** + * trace_array_init_printk - Initialize buffers for trace_array_printk() + * @tr: The trace array to initialize the buffers for + * + * As trace_array_printk() only writes into instances, they are OK to + * have in the kernel (unlike trace_printk()). This needs to be called + * before trace_array_printk() can be used on a trace_array. + */ +int trace_array_init_printk(struct trace_array *tr) +{ + if (!tr) + return -ENOENT; + + /* This is only allowed for created instances */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return -EINVAL; + + return alloc_percpu_trace_buffer(); +} +EXPORT_SYMBOL_GPL(trace_array_init_printk); + +int trace_array_printk_buf(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(printk_trace->trace_flags & TRACE_ITER(PRINTK))) + return 0; + + va_start(ap, fmt); + ret = __trace_array_vprintk(buffer, ip, fmt, ap); + va_end(ap); + return ret; +} + +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +{ + return trace_array_vprintk(printk_trace, ip, fmt, args); +} +EXPORT_SYMBOL_GPL(trace_vprintk); + static __init int init_trace_printk_function_export(void) { int ret; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 16a5e368e7b7..44c22d4e7881 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,8 +13,8 @@ #include <linux/bpf.h> #include <linux/fs.h> -#include "trace_btf.h" +#include "trace_btf.h" #include "trace_probe.h" #undef C @@ -154,22 +154,30 @@ fail: } static struct trace_probe_log trace_probe_log; +extern struct mutex dyn_event_ops_mutex; -void trace_probe_log_init(const char *subsystem, int argc, const char **argv) +const char *trace_probe_log_init(const char *subsystem, int argc, const char **argv) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.subsystem = subsystem; trace_probe_log.argc = argc; trace_probe_log.argv = argv; trace_probe_log.index = 0; + return subsystem; } void trace_probe_log_clear(void) { + lockdep_assert_held(&dyn_event_ops_mutex); + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); } void trace_probe_log_set_index(int index) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.index = index; } @@ -178,6 +186,8 @@ void __trace_probe_log_err(int offset, int err_type) char *command, *p; int i, len = 0, pos = 0; + lockdep_assert_held(&dyn_event_ops_mutex); + if (!trace_probe_log.argv) return; @@ -205,7 +215,7 @@ void __trace_probe_log_err(int offset, int err_type) p = command; for (i = 0; i < trace_probe_log.argc; i++) { len = strlen(trace_probe_log.argv[i]); - strcpy(p, trace_probe_log.argv[i]); + memcpy(p, trace_probe_log.argv[i], len); p[len] = ' '; p += len + 1; } @@ -238,7 +248,25 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) return 0; } -/* @buf must has MAX_EVENT_NAME_LEN size */ +/** + * traceprobe_parse_event_name() - Parse a string into group and event names + * @pevent: A pointer to the string to be parsed. + * @pgroup: A pointer to the group name. + * @buf: A buffer to store the parsed group name. + * @offset: The offset of the string in the original user command, for logging. + * + * This parses a string with the format `[GROUP/][EVENT]` or `[GROUP.][EVENT]` + * (either GROUP or EVENT or both must be specified). + * Since the parsed group name is stored in @buf, the caller must ensure @buf + * is at least MAX_EVENT_NAME_LEN bytes. + * + * Return: 0 on success, or -EINVAL on failure. + * + * If success, *@pevent is updated to point to the event name part of the + * original string, or NULL if there is no event name. + * Also, *@pgroup is updated to point to the parsed group which is stored + * in @buf, or NULL if there is no group name. + */ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf, int offset) { @@ -489,7 +517,7 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx) } } -/* Return 1 if the field separater is arrow operator ('->') */ +/* Return 1 if the field separator is arrow operator ('->') */ static int split_next_field(char *varname, char **next_field, struct traceprobe_parse_context *ctx) { @@ -648,7 +676,7 @@ static int parse_btf_arg(char *varname, ret = query_btf_context(ctx); if (ret < 0 || ctx->nr_params == 0) { trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); - return PTR_ERR(params); + return -ENOENT; } } params = ctx->params; @@ -770,19 +798,51 @@ static int check_prepare_btf_string_fetch(char *typename, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +static void store_entry_arg_at(struct fetch_insn *code, int argnum, int offset) +{ + code[0].op = FETCH_OP_ARG; + code[0].param = argnum; + code[1].op = FETCH_OP_ST_EDATA; + code[1].offset = offset; +} + +static int get_entry_arg_max_offset(struct probe_entry_arg *earg) +{ + int i, max_offset = 0; + + /* + * earg->code[] array has an operation sequence which is run in + * the entry handler. + * The sequence stopped by FETCH_OP_END and each data stored in + * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA + * stores the data at the data buffer + its offset, and all data are + * "unsigned long" size. The offset must be increased when a data is + * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the + * code array. + */ + for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i++) { + if (earg->code[i].op == FETCH_OP_ST_EDATA) + if (earg->code[i].offset > max_offset) + max_offset = earg->code[i].offset; + } + return max_offset; +} + +/* + * Add the entry code to store the 'argnum'th parameter and return the offset + * in the entry data buffer where the data will be stored. + */ static int __store_entry_arg(struct trace_probe *tp, int argnum) { struct probe_entry_arg *earg = tp->entry_arg; - bool match = false; - int i, offset; + int i, offset, last_offset = 0; if (!earg) { - earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL); + earg = kzalloc_obj(*tp->entry_arg); if (!earg) return -ENOMEM; earg->size = 2 * tp->nr_args + 1; - earg->code = kcalloc(earg->size, sizeof(struct fetch_insn), - GFP_KERNEL); + earg->code = kzalloc_objs(struct fetch_insn, earg->size); if (!earg->code) { kfree(earg); return -ENOMEM; @@ -791,54 +851,59 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum) for (i = 0; i < earg->size; i++) earg->code[i].op = FETCH_OP_END; tp->entry_arg = earg; + store_entry_arg_at(earg->code, argnum, 0); + return 0; } - offset = 0; - for (i = 0; i < earg->size - 1; i++) { - switch (earg->code[i].op) { - case FETCH_OP_END: - earg->code[i].op = FETCH_OP_ARG; - earg->code[i].param = argnum; - earg->code[i + 1].op = FETCH_OP_ST_EDATA; - earg->code[i + 1].offset = offset; - return offset; - case FETCH_OP_ARG: - match = (earg->code[i].param == argnum); - break; - case FETCH_OP_ST_EDATA: - offset = earg->code[i].offset; - if (match) - return offset; - offset += sizeof(unsigned long); - break; - default: - break; - } + /* + * NOTE: if anyone change the following rule, please rewrite this. + * The entry code array is filled with the pair of + * + * [FETCH_OP_ARG(argnum)] + * [FETCH_OP_ST_EDATA(offset of entry data buffer)] + * + * and the rest of entries are filled with [FETCH_OP_END]. + * The offset should be incremented, thus the last pair should + * have the largest offset. + */ + + /* Search the offset for the sprcified argnum. */ + for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i += 2) { + if (WARN_ON_ONCE(earg->code[i].op != FETCH_OP_ARG)) + return -EINVAL; + + if (earg->code[i].param != argnum) + continue; + + if (WARN_ON_ONCE(earg->code[i + 1].op != FETCH_OP_ST_EDATA)) + return -EINVAL; + + return earg->code[i + 1].offset; } - return -ENOSPC; + /* Not found, append new entry if possible. */ + if (i >= earg->size - 1) + return -ENOSPC; + + /* The last entry must have the largest offset. */ + if (i != 0) { + if (WARN_ON_ONCE(earg->code[i - 1].op != FETCH_OP_ST_EDATA)) + return -EINVAL; + last_offset = earg->code[i - 1].offset; + } + + offset = last_offset + sizeof(unsigned long); + store_entry_arg_at(&earg->code[i], argnum, offset); + return offset; } int traceprobe_get_entry_data_size(struct trace_probe *tp) { struct probe_entry_arg *earg = tp->entry_arg; - int i, size = 0; if (!earg) return 0; - for (i = 0; i < earg->size; i++) { - switch (earg->code[i].op) { - case FETCH_OP_END: - goto out; - case FETCH_OP_ST_EDATA: - size = earg->code[i].offset + sizeof(unsigned long); - break; - default: - break; - } - } -out: - return size; + return get_entry_arg_max_offset(earg) + sizeof(unsigned long); } void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs) @@ -897,8 +962,6 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t, code->op = FETCH_OP_COMM; return 0; } - /* backward compatibility */ - ctx->offset = 0; goto inval; } @@ -1003,7 +1066,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs) { size_t len = strlen(str); - if (str[len - 1] != '"') { + if (!len || str[len - 1] != '"') { trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE); return -EINVAL; } @@ -1409,7 +1472,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, struct traceprobe_parse_context *ctx) { struct fetch_insn *code, *tmp = NULL; - char *type, *arg; + char *type, *arg __free(kfree) = NULL; int ret, len; len = strlen(argv); @@ -1426,22 +1489,16 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, return -ENOMEM; parg->comm = kstrdup(arg, GFP_KERNEL); - if (!parg->comm) { - ret = -ENOMEM; - goto out; - } + if (!parg->comm) + return -ENOMEM; type = parse_probe_arg_type(arg, parg, ctx); - if (IS_ERR(type)) { - ret = PTR_ERR(type); - goto out; - } + if (IS_ERR(type)) + return PTR_ERR(type); - code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); - if (!code) { - ret = -ENOMEM; - goto out; - } + code = tmp = kzalloc_objs(*code, FETCH_INSN_MAX); + if (!code) + return -ENOMEM; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ctx->last_type = NULL; @@ -1464,6 +1521,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, parg->offset = *size; *size += parg->type->size * (parg->count ?: 1); + if (*size > MAX_PROBE_EVENT_SIZE) { + ret = -E2BIG; + trace_probe_log_err(ctx->offset, EVENT_TOO_BIG); + goto fail; + } + if (parg->count) { len = strlen(parg->type->fmttype) + 6; parg->fmt = kmalloc(len, GFP_KERNEL); @@ -1483,7 +1546,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (code->op == FETCH_OP_END) break; /* Shrink down the code buffer */ - parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); + parg->code = kzalloc_objs(*code, code - tmp + 1); if (!parg->code) ret = -ENOMEM; else @@ -1497,8 +1560,6 @@ fail: kfree(code->data); } kfree(tmp); -out: - kfree(arg); return ret; } @@ -1668,7 +1729,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], { const struct btf_param *params = NULL; int i, j, n, used, ret, args_idx = -1; - const char **new_argv = NULL; + const char **new_argv __free(kfree) = NULL; ret = argv_has_var_arg(argc, argv, &args_idx, ctx); if (ret < 0) @@ -1707,7 +1768,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], ret = sprint_nth_btf_arg(n, "", buf + used, bufsize - used, ctx); if (ret < 0) - goto error; + return ERR_PTR(ret); new_argv[j++] = buf + used; used += ret + 1; @@ -1721,25 +1782,20 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], n = simple_strtoul(argv[i] + 4, &type, 10); if (type && !(*type == ':' || *type == '\0')) { trace_probe_log_err(0, BAD_VAR); - ret = -ENOENT; - goto error; + return ERR_PTR(-ENOENT); } /* Note: $argN starts from $arg1 */ ret = sprint_nth_btf_arg(n - 1, type, buf + used, bufsize - used, ctx); if (ret < 0) - goto error; + return ERR_PTR(ret); new_argv[j++] = buf + used; used += ret + 1; } else new_argv[j++] = argv[i]; } - return new_argv; - -error: - kfree(new_argv); - return ERR_PTR(ret); + return_ptr(new_argv); } /* @buf: *buf must be equal to NULL. Caller must to free *buf */ @@ -1747,14 +1803,14 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) { int i, used, ret; const int bufsize = MAX_DENTRY_ARGS_LEN; - char *tmpbuf = NULL; + char *tmpbuf __free(kfree) = NULL; if (*buf) return -EINVAL; used = 0; for (i = 0; i < argc; i++) { - char *tmp; + char *tmp __free(kfree) = NULL; char *equal; size_t arg_len; @@ -1769,7 +1825,7 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) tmp = kstrdup(argv[i], GFP_KERNEL); if (!tmp) - goto nomem; + return -ENOMEM; equal = strchr(tmp, '='); if (equal) @@ -1790,18 +1846,14 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) offsetof(struct file, f_path.dentry), equal ? equal + 1 : tmp); - kfree(tmp); if (ret >= bufsize - used) - goto nomem; + return -ENOMEM; argv[i] = tmpbuf + used; used += ret + 1; } - *buf = tmpbuf; + *buf = no_free_ptr(tmpbuf); return 0; -nomem: - kfree(tmpbuf); - return -ENOMEM; } void traceprobe_finish_parse(struct traceprobe_parse_context *ctx) @@ -2100,7 +2152,7 @@ int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file) { struct event_file_link *link; - link = kmalloc(sizeof(*link), GFP_KERNEL); + link = kmalloc_obj(*link); if (!link) return -ENOMEM; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5803e6a41570..262d8707a3df 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -10,20 +10,22 @@ * Author: Srikar Dronamraju */ +#include <linux/bitops.h> +#include <linux/btf.h> +#include <linux/cleanup.h> +#include <linux/kprobes.h> +#include <linux/limits.h> +#include <linux/perf_event.h> +#include <linux/ptrace.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/smp.h> -#include <linux/tracefs.h> -#include <linux/types.h> #include <linux/string.h> -#include <linux/ptrace.h> -#include <linux/perf_event.h> -#include <linux/kprobes.h> #include <linux/stringify.h> -#include <linux/limits.h> +#include <linux/tracefs.h> +#include <linux/types.h> #include <linux/uaccess.h> -#include <linux/bitops.h> -#include <linux/btf.h> + #include <asm/bitsperlong.h> #include "trace.h" @@ -36,7 +38,7 @@ #define MAX_BTF_ARGS_LEN 128 #define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX -#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) +#define MAX_PROBE_EVENT_SIZE 3072 /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -270,16 +272,21 @@ struct event_file_link { struct list_head list; }; +static inline unsigned int trace_probe_load_flag(struct trace_probe *tp) +{ + return smp_load_acquire(&tp->event->flags); +} + static inline bool trace_probe_test_flag(struct trace_probe *tp, unsigned int flag) { - return !!(tp->event->flags & flag); + return !!(trace_probe_load_flag(tp) & flag); } static inline void trace_probe_set_flag(struct trace_probe *tp, unsigned int flag) { - tp->event->flags |= flag; + smp_store_release(&tp->event->flags, tp->event->flags | flag); } static inline void trace_probe_clear_flag(struct trace_probe *tp, @@ -439,6 +446,14 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg); * this MUST be called for clean up the context and return a resource. */ void traceprobe_finish_parse(struct traceprobe_parse_context *ctx); +static inline void traceprobe_free_parse_ctx(struct traceprobe_parse_context *ctx) +{ + traceprobe_finish_parse(ctx); + kfree(ctx); +} + +DEFINE_FREE(traceprobe_parse_context, struct traceprobe_parse_context *, + if (_T) traceprobe_free_parse_ctx(_T)) extern int traceprobe_split_symbol_offset(char *symbol, long *offset); int traceprobe_parse_event_name(const char **pevent, const char **pgroup, @@ -481,6 +496,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ C(NO_TRACEPOINT, "Tracepoint is not found"), \ + C(BAD_TP_NAME, "Invalid character in tracepoint name"),\ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ @@ -544,7 +560,10 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_BTF_FIELD, "This field is not found."), \ C(BAD_BTF_TID, "Failed to get BTF type info."),\ C(BAD_TYPE4STR, "This type does not fit for string."),\ - C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"), + C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ + C(TOO_MANY_ARGS, "Too many arguments are specified"), \ + C(TOO_MANY_EARGS, "Too many entry arguments specified"), \ + C(EVENT_TOO_BIG, "Event too big (too many fields?)"), #undef C #define C(a, b) TP_ERR_##a @@ -561,11 +580,13 @@ struct trace_probe_log { int index; }; -void trace_probe_log_init(const char *subsystem, int argc, const char **argv); +const char *trace_probe_log_init(const char *subsystem, int argc, const char **argv); void trace_probe_log_set_index(int index); void trace_probe_log_clear(void); void __trace_probe_log_err(int offset, int err); +DEFINE_FREE(trace_probe_log_clear, const char *, if (_T) trace_probe_log_clear()) + #define trace_probe_log_err(offs, err) \ __trace_probe_log_err(offs, TP_ERR_##err) diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 2caf0d2afb32..f39b37fcdb3b 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -232,7 +232,7 @@ array: /* Sum up total data length for dynamic arrays (strings) */ static nokprobe_inline int -__get_data_size(struct trace_probe *tp, struct pt_regs *regs, void *edata) +__get_data_size(struct trace_probe *tp, void *regs, void *edata) { struct probe_arg *arg; int i, len, ret = 0; diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c index a520b11afb0d..784fe1fbb866 100644 --- a/kernel/trace/trace_recursion_record.c +++ b/kernel/trace/trace_recursion_record.c @@ -129,7 +129,7 @@ static void *recursed_function_seq_start(struct seq_file *m, loff_t *pos) ret = &recursed_functions[*pos]; } - tseq = kzalloc(sizeof(*tseq), GFP_KERNEL); + tseq = kzalloc_obj(*tseq); if (!tseq) return ERR_PTR(-ENOMEM); diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c new file mode 100644 index 000000000000..d6c3f94d67cd --- /dev/null +++ b/kernel/trace/trace_remote.c @@ -0,0 +1,1384 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort <vdonnefort@google.com> + */ + +#include <linux/kstrtox.h> +#include <linux/lockdep.h> +#include <linux/mutex.h> +#include <linux/tracefs.h> +#include <linux/trace_remote.h> +#include <linux/trace_seq.h> +#include <linux/types.h> + +#include "trace.h" + +#define TRACEFS_DIR "remotes" +#define TRACEFS_MODE_WRITE 0640 +#define TRACEFS_MODE_READ 0440 + +enum tri_type { + TRI_CONSUMING, + TRI_NONCONSUMING, +}; + +struct trace_remote_iterator { + struct trace_remote *remote; + struct trace_seq seq; + struct delayed_work poll_work; + unsigned long lost_events; + u64 ts; + struct ring_buffer_iter *rb_iter; + struct ring_buffer_iter **rb_iters; + struct remote_event_hdr *evt; + int cpu; + int evt_cpu; + loff_t pos; + enum tri_type type; +}; + +struct trace_remote { + struct trace_remote_callbacks *cbs; + void *priv; + struct trace_buffer *trace_buffer; + struct trace_buffer_desc *trace_buffer_desc; + struct dentry *dentry; + struct eventfs_inode *eventfs; + struct remote_event *events; + unsigned long nr_events; + unsigned long trace_buffer_size; + struct ring_buffer_remote rb_remote; + struct mutex lock; + struct rw_semaphore reader_lock; + struct rw_semaphore *pcpu_reader_locks; + unsigned int nr_readers; + unsigned int poll_ms; + bool tracing_on; +}; + +static bool trace_remote_loaded(struct trace_remote *remote) +{ + return !!remote->trace_buffer; +} + +static int trace_remote_load(struct trace_remote *remote) +{ + struct ring_buffer_remote *rb_remote = &remote->rb_remote; + struct trace_buffer_desc *desc; + + lockdep_assert_held(&remote->lock); + + if (trace_remote_loaded(remote)) + return 0; + + desc = remote->cbs->load_trace_buffer(remote->trace_buffer_size, remote->priv); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + rb_remote->desc = desc; + rb_remote->swap_reader_page = remote->cbs->swap_reader_page; + rb_remote->priv = remote->priv; + rb_remote->reset = remote->cbs->reset; + remote->trace_buffer = ring_buffer_alloc_remote(rb_remote); + if (!remote->trace_buffer) { + remote->cbs->unload_trace_buffer(desc, remote->priv); + return -ENOMEM; + } + + remote->trace_buffer_desc = desc; + + return 0; +} + +static void trace_remote_try_unload(struct trace_remote *remote) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + /* The buffer is being read or writable */ + if (remote->nr_readers || remote->tracing_on) + return; + + /* The buffer has readable data */ + if (!ring_buffer_empty(remote->trace_buffer)) + return; + + ring_buffer_free(remote->trace_buffer); + remote->trace_buffer = NULL; + remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv); +} + +static int trace_remote_enable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (remote->tracing_on) + return 0; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + ret = remote->cbs->enable_tracing(true, remote->priv); + if (ret) { + trace_remote_try_unload(remote); + return ret; + } + + remote->tracing_on = true; + + return 0; +} + +static int trace_remote_disable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (!remote->tracing_on) + return 0; + + ret = remote->cbs->enable_tracing(false, remote->priv); + if (ret) + return ret; + + ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS); + remote->tracing_on = false; + trace_remote_try_unload(remote); + + return 0; +} + +static void trace_remote_reset(struct trace_remote *remote, int cpu) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + if (cpu == RING_BUFFER_ALL_CPUS) + ring_buffer_reset(remote->trace_buffer); + else + ring_buffer_reset_cpu(remote->trace_buffer, cpu); + + trace_remote_try_unload(remote); +} + +static ssize_t +tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = val ? trace_remote_enable_tracing(remote) : trace_remote_disable_tracing(remote); + if (ret) + return ret; + + return cnt; +} +static int tracing_on_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%d\n", remote->tracing_on); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on); + +static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* KiB to Bytes */ + if (!val || check_shl_overflow(val, 10, &val)) + return -EINVAL; + + guard(mutex)(&remote->lock); + + if (trace_remote_loaded(remote)) + return -EBUSY; + + remote->trace_buffer_size = val; + + return cnt; +} + +static int buffer_size_kb_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%lu (%s)\n", remote->trace_buffer_size >> 10, + trace_remote_loaded(remote) ? "loaded" : "unloaded"); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(buffer_size_kb); + +static int trace_remote_get(struct trace_remote *remote, int cpu) +{ + int ret; + + if (remote->nr_readers == UINT_MAX) + return -EBUSY; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + if (cpu != RING_BUFFER_ALL_CPUS && !remote->pcpu_reader_locks) { + int lock_cpu; + + remote->pcpu_reader_locks = kcalloc(nr_cpu_ids, sizeof(*remote->pcpu_reader_locks), + GFP_KERNEL); + if (!remote->pcpu_reader_locks) { + trace_remote_try_unload(remote); + return -ENOMEM; + } + + for_each_possible_cpu(lock_cpu) + init_rwsem(&remote->pcpu_reader_locks[lock_cpu]); + } + + remote->nr_readers++; + + return 0; +} + +static void trace_remote_put(struct trace_remote *remote) +{ + if (WARN_ON(!remote->nr_readers)) + return; + + remote->nr_readers--; + if (remote->nr_readers) + return; + + kfree(remote->pcpu_reader_locks); + remote->pcpu_reader_locks = NULL; + + trace_remote_try_unload(remote); +} + +static bool trace_remote_has_cpu(struct trace_remote *remote, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) + return true; + + return ring_buffer_poll_remote(remote->trace_buffer, cpu) == 0; +} + +static void __poll_remote(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct trace_remote_iterator *iter; + + iter = container_of(dwork, struct trace_remote_iterator, poll_work); + ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu); + schedule_delayed_work((struct delayed_work *)work, + msecs_to_jiffies(iter->remote->poll_ms)); +} + +static void __free_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu) +{ + if (cpu != RING_BUFFER_ALL_CPUS) { + ring_buffer_read_finish(iter->rb_iter); + return; + } + + for_each_possible_cpu(cpu) { + if (iter->rb_iters[cpu]) + ring_buffer_read_finish(iter->rb_iters[cpu]); + } + + kfree(iter->rb_iters); +} + +static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu) +{ + if (cpu != RING_BUFFER_ALL_CPUS) { + iter->rb_iter = ring_buffer_read_start(iter->remote->trace_buffer, cpu, GFP_KERNEL); + + return iter->rb_iter ? 0 : -ENOMEM; + } + + iter->rb_iters = kcalloc(nr_cpu_ids, sizeof(*iter->rb_iters), GFP_KERNEL); + if (!iter->rb_iters) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + iter->rb_iters[cpu] = ring_buffer_read_start(iter->remote->trace_buffer, cpu, + GFP_KERNEL); + if (!iter->rb_iters[cpu]) { + /* This CPU isn't part of trace_buffer. Skip it */ + if (!trace_remote_has_cpu(iter->remote, cpu)) + continue; + + __free_ring_buffer_iter(iter, RING_BUFFER_ALL_CPUS); + return -ENOMEM; + } + } + + return 0; +} + +static struct trace_remote_iterator +*trace_remote_iter(struct trace_remote *remote, int cpu, enum tri_type type) +{ + struct trace_remote_iterator *iter = NULL; + int ret; + + lockdep_assert_held(&remote->lock); + + if (type == TRI_NONCONSUMING && !trace_remote_loaded(remote)) + return NULL; + + ret = trace_remote_get(remote, cpu); + if (ret) + return ERR_PTR(ret); + + if (!trace_remote_has_cpu(remote, cpu)) { + ret = -ENODEV; + goto err; + } + + iter = kzalloc_obj(*iter); + if (iter) { + iter->remote = remote; + iter->cpu = cpu; + iter->type = type; + trace_seq_init(&iter->seq); + + switch (type) { + case TRI_CONSUMING: + ring_buffer_poll_remote(remote->trace_buffer, cpu); + INIT_DELAYED_WORK(&iter->poll_work, __poll_remote); + schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms)); + break; + case TRI_NONCONSUMING: + ret = __alloc_ring_buffer_iter(iter, cpu); + break; + } + + if (ret) + goto err; + + return iter; + } + ret = -ENOMEM; + +err: + kfree(iter); + trace_remote_put(remote); + + return ERR_PTR(ret); +} + +static void trace_remote_iter_free(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote; + + if (!iter) + return; + + remote = iter->remote; + + lockdep_assert_held(&remote->lock); + + switch (iter->type) { + case TRI_CONSUMING: + cancel_delayed_work_sync(&iter->poll_work); + break; + case TRI_NONCONSUMING: + __free_ring_buffer_iter(iter, iter->cpu); + break; + } + + kfree(iter); + trace_remote_put(remote); +} + +static void trace_remote_iter_read_start(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote = iter->remote; + int cpu = iter->cpu; + + /* Acquire global reader lock */ + if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING) + down_write(&remote->reader_lock); + else + down_read(&remote->reader_lock); + + if (cpu == RING_BUFFER_ALL_CPUS) + return; + + /* + * No need for the remote lock here, iter holds a reference on + * remote->nr_readers + */ + + /* Get the per-CPU one */ + if (WARN_ON_ONCE(!remote->pcpu_reader_locks)) + return; + + if (iter->type == TRI_CONSUMING) + down_write(&remote->pcpu_reader_locks[cpu]); + else + down_read(&remote->pcpu_reader_locks[cpu]); +} + +static void trace_remote_iter_read_finished(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote = iter->remote; + int cpu = iter->cpu; + + /* Release per-CPU reader lock */ + if (cpu != RING_BUFFER_ALL_CPUS) { + /* + * No need for the remote lock here, iter holds a reference on + * remote->nr_readers + */ + if (iter->type == TRI_CONSUMING) + up_write(&remote->pcpu_reader_locks[cpu]); + else + up_read(&remote->pcpu_reader_locks[cpu]); + } + + /* Release global reader lock */ + if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING) + up_write(&remote->reader_lock); + else + up_read(&remote->reader_lock); +} + +static struct ring_buffer_iter *__get_rb_iter(struct trace_remote_iterator *iter, int cpu) +{ + return iter->cpu != RING_BUFFER_ALL_CPUS ? iter->rb_iter : iter->rb_iters[cpu]; +} + +static struct ring_buffer_event * +__peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) +{ + struct ring_buffer_event *rb_evt; + struct ring_buffer_iter *rb_iter; + + switch (iter->type) { + case TRI_CONSUMING: + return ring_buffer_peek(iter->remote->trace_buffer, cpu, ts, lost_events); + case TRI_NONCONSUMING: + rb_iter = __get_rb_iter(iter, cpu); + if (!rb_iter) + return NULL; + + rb_evt = ring_buffer_iter_peek(rb_iter, ts); + if (!rb_evt) + return NULL; + + *lost_events = ring_buffer_iter_dropped(rb_iter); + + return rb_evt; + } + + return NULL; +} + +static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) +{ + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + struct ring_buffer_event *rb_evt; + int cpu = iter->cpu; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + return false; + + rb_evt = __peek_event(iter, cpu, &iter->ts, &iter->lost_events); + if (!rb_evt) + return false; + + iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); + return true; + } + + iter->ts = U64_MAX; + for_each_possible_cpu(cpu) { + unsigned long lost_events; + u64 ts; + + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + continue; + + rb_evt = __peek_event(iter, cpu, &ts, &lost_events); + if (!rb_evt) + continue; + + if (ts >= iter->ts) + continue; + + iter->ts = ts; + iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); + iter->lost_events = lost_events; + } + + return iter->ts != U64_MAX; +} + +static void trace_remote_iter_move(struct trace_remote_iterator *iter) +{ + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + + switch (iter->type) { + case TRI_CONSUMING: + ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL); + break; + case TRI_NONCONSUMING: + ring_buffer_iter_advance(__get_rb_iter(iter, iter->evt_cpu)); + break; + } +} + +static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id); + +static int trace_remote_iter_print_event(struct trace_remote_iterator *iter) +{ + struct remote_event *evt; + unsigned long usecs_rem; + u64 ts = iter->ts; + + if (iter->lost_events) + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->evt_cpu, iter->lost_events); + + do_div(ts, 1000); + usecs_rem = do_div(ts, USEC_PER_SEC); + + trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu, + ts, usecs_rem); + + evt = trace_remote_find_event(iter->remote, iter->evt->id); + if (!evt) + trace_seq_printf(&iter->seq, "UNKNOWN id=%d\n", iter->evt->id); + else + evt->print(iter->evt, &iter->seq); + + return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0; +} + +static int trace_pipe_open(struct inode *inode, struct file *filp) +{ + struct trace_remote *remote = inode->i_private; + struct trace_remote_iterator *iter; + int cpu = tracing_get_cpu(inode); + + guard(mutex)(&remote->lock); + + iter = trace_remote_iter(remote, cpu, TRI_CONSUMING); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + filp->private_data = iter; + + return IS_ERR(iter) ? PTR_ERR(iter) : 0; +} + +static int trace_pipe_release(struct inode *inode, struct file *filp) +{ + struct trace_remote_iterator *iter = filp->private_data; + struct trace_remote *remote = iter->remote; + + guard(mutex)(&remote->lock); + + trace_remote_iter_free(iter); + + return 0; +} + +static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_remote_iterator *iter = filp->private_data; + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + int ret; + +copy_to_user: + ret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (ret != -EBUSY) + return ret; + + trace_seq_init(&iter->seq); + + ret = ring_buffer_wait(trace_buffer, iter->cpu, 0, NULL, NULL); + if (ret < 0) + return ret; + + trace_remote_iter_read_start(iter); + + while (trace_remote_iter_read_event(iter)) { + int prev_len = iter->seq.seq.len; + + if (trace_remote_iter_print_event(iter)) { + iter->seq.seq.len = prev_len; + break; + } + + trace_remote_iter_move(iter); + } + + trace_remote_iter_read_finished(iter); + + goto copy_to_user; +} + +static const struct file_operations trace_pipe_fops = { + .open = trace_pipe_open, + .read = trace_pipe_read, + .release = trace_pipe_release, +}; + +static void *trace_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_remote_iterator *iter = m->private; + + ++*pos; + + if (!iter || !trace_remote_iter_read_event(iter)) + return NULL; + + trace_remote_iter_move(iter); + iter->pos++; + + return iter; +} + +static void *trace_start(struct seq_file *m, loff_t *pos) +{ + struct trace_remote_iterator *iter = m->private; + loff_t i; + + if (!iter) + return NULL; + + trace_remote_iter_read_start(iter); + + if (!*pos) { + iter->pos = -1; + return trace_next(m, NULL, &i); + } + + i = iter->pos; + while (i < *pos) { + iter = trace_next(m, NULL, &i); + if (!iter) + return NULL; + } + + return iter; +} + +static int trace_show(struct seq_file *m, void *v) +{ + struct trace_remote_iterator *iter = v; + + trace_seq_init(&iter->seq); + + if (trace_remote_iter_print_event(iter)) { + seq_printf(m, "[EVENT %d PRINT TOO BIG]\n", iter->evt->id); + return 0; + } + + return trace_print_seq(m, &iter->seq); +} + +static void trace_stop(struct seq_file *m, void *v) +{ + struct trace_remote_iterator *iter = m->private; + + if (iter) + trace_remote_iter_read_finished(iter); +} + +static const struct seq_operations trace_sops = { + .start = trace_start, + .next = trace_next, + .show = trace_show, + .stop = trace_stop, +}; + +static int trace_open(struct inode *inode, struct file *filp) +{ + struct trace_remote *remote = inode->i_private; + struct trace_remote_iterator *iter = NULL; + int cpu = tracing_get_cpu(inode); + int ret; + + if (!(filp->f_mode & FMODE_READ)) + return 0; + + guard(mutex)(&remote->lock); + + iter = trace_remote_iter(remote, cpu, TRI_NONCONSUMING); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + ret = seq_open(filp, &trace_sops); + if (ret) { + trace_remote_iter_free(iter); + return ret; + } + + ((struct seq_file *)filp->private_data)->private = (void *)iter; + + return 0; +} + +static int trace_release(struct inode *inode, struct file *filp) +{ + struct trace_remote_iterator *iter; + + if (!(filp->f_mode & FMODE_READ)) + return 0; + + iter = ((struct seq_file *)filp->private_data)->private; + seq_release(inode, filp); + + if (!iter) + return 0; + + guard(mutex)(&iter->remote->lock); + + trace_remote_iter_free(iter); + + return 0; +} + +static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_remote *remote = inode->i_private; + int cpu = tracing_get_cpu(inode); + + guard(mutex)(&remote->lock); + + trace_remote_reset(remote, cpu); + + return cnt; +} + +static const struct file_operations trace_fops = { + .open = trace_open, + .write = trace_write, + .read = seq_read, + .read_iter = seq_read_iter, + .release = trace_release, +}; + +static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote) +{ + struct dentry *remote_d, *percpu_d, *d; + static struct dentry *root; + static DEFINE_MUTEX(lock); + bool root_inited = false; + int cpu; + + guard(mutex)(&lock); + + if (!root) { + root = tracefs_create_dir(TRACEFS_DIR, NULL); + if (!root) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n"); + return -ENOMEM; + } + root_inited = true; + } + + remote_d = tracefs_create_dir(name, root); + if (!remote_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/\n", name); + goto err; + } + + d = trace_create_file("tracing_on", TRACEFS_MODE_WRITE, remote_d, remote, &tracing_on_fops); + if (!d) + goto err; + + d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote, + &buffer_size_kb_fops); + if (!d) + goto err; + + d = trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, &trace_pipe_fops); + if (!d) + goto err; + + d = trace_create_file("trace", TRACEFS_MODE_WRITE, remote_d, remote, &trace_fops); + if (!d) + goto err; + + percpu_d = tracefs_create_dir("per_cpu", remote_d); + if (!percpu_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name); + goto err; + } + + for_each_possible_cpu(cpu) { + struct dentry *cpu_d; + char cpu_name[16]; + + snprintf(cpu_name, sizeof(cpu_name), "cpu%d", cpu); + cpu_d = tracefs_create_dir(cpu_name, percpu_d); + if (!cpu_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/percpu/cpu%d\n", + name, cpu); + goto err; + } + + d = trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu, + &trace_pipe_fops); + if (!d) + goto err; + + d = trace_create_cpu_file("trace", TRACEFS_MODE_WRITE, cpu_d, remote, cpu, + &trace_fops); + if (!d) + goto err; + } + + remote->dentry = remote_d; + + return 0; + +err: + if (root_inited) { + tracefs_remove(root); + root = NULL; + } else { + tracefs_remove(remote_d); + } + + return -ENOMEM; +} + +static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events); + +/** + * trace_remote_register() - Register a Tracefs remote + * @name: Name of the remote, used for the Tracefs remotes/ directory. + * @cbs: Set of callbacks used to control the remote. + * @priv: Private data, passed to each callback from @cbs. + * @events: Array of events. &remote_event.name and &remote_event.id must be + * filled by the caller. + * @nr_events: Number of events in the @events array. + * + * A trace remote is an entity, outside of the kernel (most likely firmware or + * hypervisor) capable of writing events into a Tracefs compatible ring-buffer. + * The kernel would then act as a reader. + * + * The registered remote will be found under the Tracefs directory + * remotes/<name>. + * + * Return: 0 on success, negative error code on failure. + */ +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv, + struct remote_event *events, size_t nr_events) +{ + struct trace_remote *remote; + int ret; + + remote = kzalloc_obj(*remote); + if (!remote) + return -ENOMEM; + + remote->cbs = cbs; + remote->priv = priv; + remote->trace_buffer_size = 7 << 10; + remote->poll_ms = 100; + mutex_init(&remote->lock); + init_rwsem(&remote->reader_lock); + + if (trace_remote_init_tracefs(name, remote)) { + kfree(remote); + return -ENOMEM; + } + + ret = trace_remote_register_events(name, remote, events, nr_events); + if (ret) { + pr_err("Failed to register events for trace remote '%s' (%d)\n", + name, ret); + return ret; + } + + ret = cbs->init ? cbs->init(remote->dentry, priv) : 0; + if (ret) + pr_err("Init failed for trace remote '%s' (%d)\n", name, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_remote_register); + +/** + * trace_remote_free_buffer() - Free trace buffer allocated with trace_remote_alloc_buffer() + * @desc: Descriptor of the per-CPU ring-buffers, originally filled by + * trace_remote_alloc_buffer() + * + * Most likely called from &trace_remote_callbacks.unload_trace_buffer. + */ +void trace_remote_free_buffer(struct trace_buffer_desc *desc) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) { + unsigned int id; + + free_page(rb_desc->meta_va); + + for (id = 0; id < rb_desc->nr_page_va; id++) + free_page(rb_desc->page_va[id]); + } +} +EXPORT_SYMBOL_GPL(trace_remote_free_buffer); + +/** + * trace_remote_alloc_buffer() - Dynamically allocate a trace buffer + * @desc: Uninitialized trace_buffer_desc + * @desc_size: Size of the trace_buffer_desc. Must be at least equal to + * trace_buffer_desc_size() + * @buffer_size: Size in bytes of each per-CPU ring-buffer + * @cpumask: CPUs to allocate a ring-buffer for + * + * Helper to dynamically allocate a set of pages (enough to cover @buffer_size) + * for each CPU from @cpumask and fill @desc. Most likely called from + * &trace_remote_callbacks.load_trace_buffer. + * + * Return: 0 on success, negative error code on failure. + */ +int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, + const struct cpumask *cpumask) +{ + unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1; + void *desc_end = desc + desc_size; + struct ring_buffer_desc *rb_desc; + int cpu, ret = -ENOMEM; + + if (desc_size < struct_size(desc, __data, 0)) + return -EINVAL; + + desc->nr_cpus = 0; + desc->struct_len = struct_size(desc, __data, 0); + + rb_desc = (struct ring_buffer_desc *)&desc->__data[0]; + + for_each_cpu(cpu, cpumask) { + unsigned int id; + + if ((void *)rb_desc + struct_size(rb_desc, page_va, nr_pages) > desc_end) { + ret = -EINVAL; + goto err; + } + + rb_desc->cpu = cpu; + rb_desc->nr_page_va = 0; + rb_desc->meta_va = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->meta_va) + goto err; + + for (id = 0; id < nr_pages; id++) { + rb_desc->page_va[id] = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->page_va[id]) + goto err; + + rb_desc->nr_page_va++; + } + desc->nr_cpus++; + desc->struct_len += offsetof(struct ring_buffer_desc, page_va); + desc->struct_len += struct_size(rb_desc, page_va, rb_desc->nr_page_va); + rb_desc = __next_ring_buffer_desc(rb_desc); + } + + return 0; + +err: + trace_remote_free_buffer(desc); + return ret; +} +EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer); + +static int +trace_remote_enable_event(struct trace_remote *remote, struct remote_event *evt, bool enable) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (evt->enabled == enable) + return 0; + + ret = remote->cbs->enable_event(evt->id, enable, remote->priv); + if (ret) + return ret; + + evt->enabled = enable; + + return 0; +} + +static int remote_event_enable_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->enabled); + + return 0; +} + +static ssize_t remote_event_enable_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct remote_event *evt = seq->private; + struct trace_remote *remote = evt->remote; + u8 enable; + int ret; + + ret = kstrtou8_from_user(ubuf, count, 10, &enable); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = trace_remote_enable_event(remote, evt, enable); + if (ret) + return ret; + + return count; +} +DEFINE_SHOW_STORE_ATTRIBUTE(remote_event_enable); + +static int remote_event_id_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->id); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_id); + +static int remote_event_format_show(struct seq_file *s, void *unused) +{ + size_t offset = sizeof(struct remote_event_hdr); + struct remote_event *evt = s->private; + struct trace_event_fields *field; + + seq_printf(s, "name: %s\n", evt->name); + seq_printf(s, "ID: %d\n", evt->id); + seq_puts(s, + "format:\n\tfield:unsigned short common_type;\toffset:0;\tsize:2;\tsigned:0;\n\n"); + + field = &evt->fields[0]; + while (field->name) { + seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, offset, field->size, + field->is_signed); + offset += field->size; + field++; + } + + if (field != &evt->fields[0]) + seq_puts(s, "\n"); + + seq_printf(s, "print fmt: %s\n", evt->print_fmt); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_format); + +static int remote_event_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (!strcmp(name, "enable")) { + *mode = TRACEFS_MODE_WRITE; + *fops = &remote_event_enable_fops; + return 1; + } + + if (!strcmp(name, "id")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_id_fops; + return 1; + } + + if (!strcmp(name, "format")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_format_fops; + return 1; + } + + return 0; +} + +static ssize_t remote_events_dir_enable_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_remote *remote = file_inode(filp)->i_private; + int i, ret; + u8 enable; + + ret = kstrtou8_from_user(ubuf, count, 10, &enable); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + for (i = 0; i < remote->nr_events; i++) { + struct remote_event *evt = &remote->events[i]; + + trace_remote_enable_event(remote, evt, enable); + } + + return count; +} + +static ssize_t remote_events_dir_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_remote *remote = file_inode(filp)->i_private; + const char enabled_char[] = {'0', '1', 'X'}; + char enabled_str[] = " \n"; + int i, enabled = -1; + + guard(mutex)(&remote->lock); + + for (i = 0; i < remote->nr_events; i++) { + struct remote_event *evt = &remote->events[i]; + + if (enabled == -1) { + enabled = evt->enabled; + } else if (enabled != evt->enabled) { + enabled = 2; + break; + } + } + + enabled_str[0] = enabled_char[enabled == -1 ? 0 : enabled]; + + return simple_read_from_buffer(ubuf, cnt, ppos, enabled_str, 2); +} + +static const struct file_operations remote_events_dir_enable_fops = { + .write = remote_events_dir_enable_write, + .read = remote_events_dir_enable_read, +}; + +static ssize_t +remote_events_dir_header_page_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_seq *s; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + ring_buffer_print_page_header(NULL, s); + ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); + kfree(s); + + return ret; +} + +static const struct file_operations remote_events_dir_header_page_fops = { + .read = remote_events_dir_header_page_read, +}; + +static ssize_t +remote_events_dir_header_event_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_seq *s; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + ring_buffer_print_entry_header(s); + ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); + kfree(s); + + return ret; +} + +static const struct file_operations remote_events_dir_header_event_fops = { + .read = remote_events_dir_header_event_read, +}; + +static int remote_events_dir_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (!strcmp(name, "enable")) { + *mode = TRACEFS_MODE_WRITE; + *fops = &remote_events_dir_enable_fops; + return 1; + } + + if (!strcmp(name, "header_page")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_events_dir_header_page_fops; + return 1; + } + + if (!strcmp(name, "header_event")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_events_dir_header_event_fops; + return 1; + } + + return 0; +} + +static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote, + struct remote_event *evt) +{ + struct eventfs_inode *eventfs = remote->eventfs; + static struct eventfs_entry dir_entries[] = { + { + .name = "enable", + .callback = remote_events_dir_callback, + }, { + .name = "header_page", + .callback = remote_events_dir_callback, + }, { + .name = "header_event", + .callback = remote_events_dir_callback, + } + }; + static struct eventfs_entry entries[] = { + { + .name = "enable", + .callback = remote_event_callback, + }, { + .name = "id", + .callback = remote_event_callback, + }, { + .name = "format", + .callback = remote_event_callback, + } + }; + bool eventfs_create = false; + + if (!eventfs) { + eventfs = eventfs_create_events_dir("events", remote->dentry, dir_entries, + ARRAY_SIZE(dir_entries), remote); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + /* + * Create similar hierarchy as local events even if a single system is supported at + * the moment + */ + eventfs = eventfs_create_dir(remote_name, eventfs, NULL, 0, NULL); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + remote->eventfs = eventfs; + eventfs_create = true; + } + + eventfs = eventfs_create_dir(evt->name, eventfs, entries, ARRAY_SIZE(entries), evt); + if (IS_ERR(eventfs)) { + if (eventfs_create) { + eventfs_remove_events_dir(remote->eventfs); + remote->eventfs = NULL; + } + return PTR_ERR(eventfs); + } + + return 0; +} + +static int trace_remote_attach_events(struct trace_remote *remote, struct remote_event *events, + size_t nr_events) +{ + int i; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + if (evt->remote) + return -EEXIST; + + evt->remote = remote; + + /* We need events to be sorted for efficient lookup */ + if (i && evt->id <= events[i - 1].id) + return -EINVAL; + } + + remote->events = events; + remote->nr_events = nr_events; + + return 0; +} + +static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events) +{ + int i, ret; + + ret = trace_remote_attach_events(remote, events, nr_events); + if (ret) + return ret; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + ret = trace_remote_init_eventfs(remote_name, remote, evt); + if (ret) + pr_warn("Failed to init eventfs for event '%s' (%d)", + evt->name, ret); + } + + return 0; +} + +static int __cmp_events(const void *key, const void *data) +{ + const struct remote_event *evt = data; + int id = (int)((long)key); + + return id - (int)evt->id; +} + +static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id) +{ + return bsearch((const void *)(unsigned long)id, remote->events, remote->nr_events, + sizeof(*remote->events), __cmp_events); +} diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 573b5d8e8a28..e9f0ff962660 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -224,7 +224,6 @@ static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) /* Place map_cmdline_to_pid array right after saved_cmdlines */ s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN]; - s->cmdline_idx = 0; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, @@ -248,6 +247,8 @@ int trace_save_cmdline(struct task_struct *tsk) if (!tsk->pid) return 1; + BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT)); + tpid = tsk->pid & (PID_MAX_DEFAULT - 1); /* @@ -442,9 +443,8 @@ int trace_alloc_tgid_map(void) if (tgid_map) return 0; - tgid_map_max = pid_max; - map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), - GFP_KERNEL); + tgid_map_max = init_pid_ns.pid_max; + map = kvzalloc_objs(*tgid_map, tgid_map_max + 1); if (!map) return -ENOMEM; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index d6c7f18daa15..8faa73d3bba1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -41,7 +41,7 @@ static void stop_func_tracer(struct trace_array *tr, int graph); static int save_flags; #ifdef CONFIG_FUNCTION_GRAPH_TRACER -# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER(DISPLAY_GRAPH)) #else # define is_graph(tr) false #endif @@ -83,14 +83,14 @@ func_prolog_preempt_disable(struct trace_array *tr, goto out_enable; *data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&(*data)->disabled); + disabled = local_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; return 1; out: - atomic_dec(&(*data)->disabled); + local_dec(&(*data)->disabled); out_enable: preempt_enable_notrace(); @@ -113,11 +113,13 @@ static int wakeup_display_graph(struct trace_array *tr, int set) } static int wakeup_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + u64 *calltime; int ret = 0; if (ftrace_graph_ignore_func(gops, trace)) @@ -135,28 +137,40 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace, if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return 0; - ret = __trace_graph_entry(tr, trace, trace_ctx); - atomic_dec(&data->disabled); + calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime)); + if (calltime) { + *calltime = trace_clock_local(); + ret = __trace_graph_entry(tr, trace, trace_ctx); + } + local_dec(&data->disabled); preempt_enable_notrace(); return ret; } static void wakeup_graph_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + u64 *calltime; + u64 rettime; + int size; ftrace_graph_addr_finish(gops, trace); if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return; - __trace_graph_return(tr, trace, trace_ctx); - atomic_dec(&data->disabled); + rettime = trace_clock_local(); + + calltime = fgraph_retrieve_data(gops->idx, &size); + if (calltime) + __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); + local_dec(&data->disabled); preempt_enable_notrace(); return; } @@ -170,8 +184,6 @@ static void wakeup_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - else - iter->private = NULL; } static void wakeup_trace_close(struct trace_iterator *iter) @@ -224,10 +236,10 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, return; local_irq_save(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); local_irq_restore(flags); - atomic_dec(&data->disabled); + local_dec(&data->disabled); preempt_enable_notrace(); } @@ -235,8 +247,8 @@ static int register_wakeup_function(struct trace_array *tr, int graph, int set) { int ret; - /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) + /* 'set' is set if TRACE_ITER(FUNCTION) is about to be set */ + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER(FUNCTION)))) return 0; if (graph) @@ -265,7 +277,7 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph) static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) { - if (!(mask & TRACE_ITER_FUNCTION)) + if (!(mask & TRACE_ITER(FUNCTION))) return 0; if (set) @@ -309,10 +321,10 @@ __trace_function(struct trace_array *tr, if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); } -static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) +static int wakeup_flag_changed(struct trace_array *tr, u64 mask, int set) { struct tracer *tracer = tr->current_trace; @@ -320,7 +332,7 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) return 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (mask & TRACE_ITER_DISPLAY_GRAPH) + if (mask & TRACE_ITER(DISPLAY_GRAPH)) return wakeup_display_graph(tr, set); #endif @@ -455,7 +467,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (likely(disabled != 1)) goto out; @@ -492,7 +504,7 @@ out_unlock: arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -547,7 +559,7 @@ probe_wakeup(void *ignore, struct task_struct *p) (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (unlikely(disabled != 1)) goto out; @@ -594,7 +606,7 @@ probe_wakeup(void *ignore, struct task_struct *p) out_locked: arch_spin_unlock(&wakeup_lock); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); } static void start_wakeup_tracer(struct trace_array *tr) @@ -669,8 +681,8 @@ static int __wakeup_tracer_init(struct trace_array *tr) save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), 1); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), 1); tr->max_latency = 0; wakeup_trace = tr; @@ -713,15 +725,15 @@ static int wakeup_dl_tracer_init(struct trace_array *tr) static void wakeup_tracer_reset(struct trace_array *tr) { - int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; - int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + int lat_flag = save_flags & TRACE_ITER(LATENCY_FMT); + int overwrite_flag = save_flags & TRACE_ITER(OVERWRITE); stop_wakeup_tracer(tr); /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), lat_flag); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), overwrite_flag); ftrace_reset_array_ops(tr); wakeup_busy = false; } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 38b5754790c9..929c84075315 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -248,7 +248,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) goto out; /* Add a dynamic probe */ - dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); + dyn_ops = kzalloc_obj(*dyn_ops); if (!dyn_ops) { printk("MEMORY ERROR "); goto out; @@ -774,7 +774,8 @@ struct fgraph_fixture { }; static __init int store_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops); const char *type = fixture->store_type_name; @@ -807,7 +808,8 @@ static __init int store_entry(struct ftrace_graph_ent *trace, } static __init void store_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops); const char *type = fixture->store_type_name; @@ -1025,7 +1027,8 @@ static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { /* This is harmlessly racy, we want to approximately detect a hang */ if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { @@ -1039,7 +1042,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace, return 0; } - return trace_graph_entry(trace, gops); + return trace_graph_entry(trace, gops, fregs); } static struct fgraph_ops fgraph_ops __initdata = { @@ -1222,7 +1225,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) /* check both trace buffers */ ret = trace_test_buffer(&tr->array_buffer, NULL); if (!ret) - ret = trace_test_buffer(&tr->max_buffer, &count); + ret = trace_test_buffer(&tr->snapshot_buffer, &count); trace->reset(tr); tracing_start(); @@ -1284,7 +1287,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) /* check both trace buffers */ ret = trace_test_buffer(&tr->array_buffer, NULL); if (!ret) - ret = trace_test_buffer(&tr->max_buffer, &count); + ret = trace_test_buffer(&tr->snapshot_buffer, &count); trace->reset(tr); tracing_start(); @@ -1352,7 +1355,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * if (ret) goto out; - ret = trace_test_buffer(&tr->max_buffer, &count); + ret = trace_test_buffer(&tr->snapshot_buffer, &count); if (ret) goto out; @@ -1382,7 +1385,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * if (ret) goto out; - ret = trace_test_buffer(&tr->max_buffer, &count); + ret = trace_test_buffer(&tr->snapshot_buffer, &count); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -1510,7 +1513,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* check both trace buffers */ ret = trace_test_buffer(&tr->array_buffer, NULL); if (!ret) - ret = trace_test_buffer(&tr->max_buffer, &count); + ret = trace_test_buffer(&tr->snapshot_buffer, &count); trace->reset(tr); diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index c158d65a8a88..85f6f10d107f 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -15,7 +15,7 @@ * * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into - * the buffer but it wont update the pointers). This allows users to + * the buffer but it won't update the pointers). This allows users to * try to write something into the trace_seq buffer and if it fails * they can flush it and try again. * @@ -106,7 +106,7 @@ EXPORT_SYMBOL_GPL(trace_seq_printf); * Writes a ASCII representation of a bitmask string into @s. */ void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, - int nmaskbits) + int nmaskbits) { unsigned int save_len = s->seq.len; @@ -125,6 +125,33 @@ void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, EXPORT_SYMBOL_GPL(trace_seq_bitmask); /** + * trace_seq_bitmask_list - write a bitmask array in its list representation + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * Writes a list representation (e.g., 0-3,5-7) of a bitmask string into @s. + */ +void trace_seq_bitmask_list(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + seq_buf_printf(&s->seq, "%*pbl", nmaskbits, maskp); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask_list); + +/** * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor * @fmt: printf format string diff --git a/kernel/trace/trace_snapshot.c b/kernel/trace/trace_snapshot.c new file mode 100644 index 000000000000..07b43c9863a2 --- /dev/null +++ b/kernel/trace/trace_snapshot.c @@ -0,0 +1,1066 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fsnotify.h> + +#include <asm/setup.h> /* COMMAND_LINE_SIZE */ + +#include "trace.h" + +/* Used if snapshot allocated at boot */ +static bool allocate_snapshot; +static bool snapshot_at_boot; + +static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; +static int boot_snapshot_index; + +static int __init boot_alloc_snapshot(char *str) +{ + char *slot = boot_snapshot_info + boot_snapshot_index; + int left = sizeof(boot_snapshot_info) - boot_snapshot_index; + int ret; + + if (str[0] == '=') { + str++; + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_snapshot_index += ret; + } else { + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + trace_set_ring_buffer_expanded(NULL); + } + return 1; +} +__setup("alloc_snapshot", boot_alloc_snapshot); + + +static int __init boot_snapshot(char *str) +{ + snapshot_at_boot = true; + boot_alloc_snapshot(str); + return 1; +} +__setup("ftrace_boot_snapshot", boot_snapshot); +static void tracing_snapshot_instance_cond(struct trace_array *tr, + void *cond_data) +{ + unsigned long flags; + + if (in_nmi()) { + trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); + trace_array_puts(tr, "*** snapshot is being ignored ***\n"); + return; + } + + if (!tr->allocated_snapshot) { + trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); + trace_array_puts(tr, "*** stopping trace here! ***\n"); + tracer_tracing_off(tr); + return; + } + + if (tr->mapped) { + trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); + return; + } + + /* Note, snapshot can not be used when the tracer uses it */ + if (tracer_uses_snapshot(tr->current_trace)) { + trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); + return; + } + + local_irq_save(flags); + update_max_tr(tr, current, smp_processor_id(), cond_data); + local_irq_restore(flags); +} + +void tracing_snapshot_instance(struct trace_array *tr) +{ + tracing_snapshot_instance_cond(tr, NULL); +} + +/** + * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. + * @tr: The tracing instance to snapshot + * @cond_data: The data to be tested conditionally, and possibly saved + * + * This is the same as tracing_snapshot() except that the snapshot is + * conditional - the snapshot will only happen if the + * cond_snapshot.update() implementation receiving the cond_data + * returns true, which means that the trace array's cond_snapshot + * update() operation used the cond_data to determine whether the + * snapshot should be taken, and if it was, presumably saved it along + * with the snapshot. + */ +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + tracing_snapshot_instance_cond(tr, cond_data); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); + +/** + * tracing_cond_snapshot_data - get the user data associated with a snapshot + * @tr: The tracing instance + * + * When the user enables a conditional snapshot using + * tracing_snapshot_cond_enable(), the user-defined cond_data is saved + * with the snapshot. This accessor is used to retrieve it. + * + * Should not be called from cond_snapshot.update(), since it takes + * the tr->max_lock lock, which the code calling + * cond_snapshot.update() has already done. + * + * Returns the cond_data associated with the trace array's snapshot. + */ +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + void *cond_data = NULL; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + + if (tr->cond_snapshot) + cond_data = tr->cond_snapshot->cond_data; + + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + return cond_data; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); + +/* resize @tr's buffer to the size of @size_tr's entries */ +int resize_buffer_duplicate_size(struct array_buffer *trace_buf, + struct array_buffer *size_buf, int cpu_id) +{ + int cpu, ret = 0; + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu)->entries, cpu); + if (ret < 0) + break; + per_cpu_ptr(trace_buf->data, cpu)->entries = + per_cpu_ptr(size_buf->data, cpu)->entries; + } + } else { + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); + if (ret == 0) + per_cpu_ptr(trace_buf->data, cpu_id)->entries = + per_cpu_ptr(size_buf->data, cpu_id)->entries; + } + + return ret; +} + +int tracing_alloc_snapshot_instance(struct trace_array *tr) +{ + int order; + int ret; + + if (!tr->allocated_snapshot) { + + /* Make the snapshot buffer have the same order as main buffer */ + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order); + if (ret < 0) + return ret; + + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&tr->snapshot_buffer, + &tr->array_buffer, RING_BUFFER_ALL_CPUS); + if (ret < 0) + return ret; + + tr->allocated_snapshot = true; + } + + return 0; +} + +void free_snapshot(struct trace_array *tr) +{ + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, 0); + ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); + trace_set_buffer_entries(&tr->snapshot_buffer, 1); + tracing_reset_online_cpus(&tr->snapshot_buffer); + tr->allocated_snapshot = false; +} + +int tracing_arm_snapshot_locked(struct trace_array *tr) +{ + int ret; + + lockdep_assert_held(&trace_types_lock); + + spin_lock(&tr->snapshot_trigger_lock); + if (tr->snapshot == UINT_MAX || tr->mapped) { + spin_unlock(&tr->snapshot_trigger_lock); + return -EBUSY; + } + + tr->snapshot++; + spin_unlock(&tr->snapshot_trigger_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) { + spin_lock(&tr->snapshot_trigger_lock); + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); + } + + return ret; +} + +int tracing_arm_snapshot(struct trace_array *tr) +{ + guard(mutex)(&trace_types_lock); + return tracing_arm_snapshot_locked(tr); +} + +void tracing_disarm_snapshot(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->snapshot)) + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); +} + +/** + * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to tracing_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + */ +void tracing_snapshot_alloc(void) +{ + int ret; + + ret = tracing_alloc_snapshot(); + if (ret < 0) + return; + + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); + +/** + * tracing_snapshot_cond_enable - enable conditional snapshot for an instance + * @tr: The tracing instance + * @cond_data: User data to associate with the snapshot + * @update: Implementation of the cond_snapshot update function + * + * Check whether the conditional snapshot for the given instance has + * already been enabled, or if the current tracer is already using a + * snapshot; if so, return -EBUSY, else create a cond_snapshot and + * save the cond_data and update function inside. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, + cond_update_fn_t update) +{ + struct cond_snapshot *cond_snapshot __free(kfree) = + kzalloc_obj(*cond_snapshot); + int ret; + + if (!cond_snapshot) + return -ENOMEM; + + cond_snapshot->cond_data = cond_data; + cond_snapshot->update = update; + + guard(mutex)(&trace_types_lock); + + if (tracer_uses_snapshot(tr->current_trace)) + return -EBUSY; + + /* + * The cond_snapshot can only change to NULL without the + * trace_types_lock. We don't care if we race with it going + * to NULL, but we want to make sure that it's not set to + * something other than NULL when we get here, which we can + * do safely with only holding the trace_types_lock and not + * having to take the max_lock. + */ + if (tr->cond_snapshot) + return -EBUSY; + + ret = tracing_arm_snapshot_locked(tr); + if (ret) + return ret; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + tr->cond_snapshot = no_free_ptr(cond_snapshot); + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + return 0; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); + +/** + * tracing_snapshot_cond_disable - disable conditional snapshot for an instance + * @tr: The tracing instance + * + * Check whether the conditional snapshot for the given instance is + * enabled; if so, free the cond_snapshot associated with it, + * otherwise return -EINVAL. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + int ret = 0; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + + if (!tr->cond_snapshot) + ret = -EINVAL; + else { + kfree(tr->cond_snapshot); + tr->cond_snapshot = NULL; + } + + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + tracing_disarm_snapshot(tr); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); + +#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef LATENCY_FS_NOTIFY +static struct workqueue_struct *fsnotify_wq; + +static void latency_fsnotify_workfn(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, + fsnotify_work); + fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); +} + +static void latency_fsnotify_workfn_irq(struct irq_work *iwork) +{ + struct trace_array *tr = container_of(iwork, struct trace_array, + fsnotify_irqwork); + queue_work(fsnotify_wq, &tr->fsnotify_work); +} + +__init static int latency_fsnotify_init(void) +{ + fsnotify_wq = alloc_workqueue("tr_max_lat_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!fsnotify_wq) { + pr_err("Unable to allocate tr_max_lat_wq\n"); + return -ENOMEM; + } + return 0; +} + +late_initcall_sync(latency_fsnotify_init); + +void latency_fsnotify(struct trace_array *tr) +{ + if (!fsnotify_wq) + return; + /* + * We cannot call queue_work(&tr->fsnotify_work) from here because it's + * possible that we are called from __schedule() or do_idle(), which + * could cause a deadlock. + */ + irq_work_queue(&tr->fsnotify_irqwork); +} +#endif /* LATENCY_FS_NOTIFY */ + +static const struct file_operations tracing_max_lat_fops; + +void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) +{ +#ifdef LATENCY_FS_NOTIFY + INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); + init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); +#endif + tr->d_max_latency = trace_create_file("tracing_max_latency", + TRACE_MODE_WRITE, + d_tracer, tr, + &tracing_max_lat_fops); +} + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /sys/kernel/tracing/tracing_max_latency) + */ +static void +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct array_buffer *trace_buf = &tr->array_buffer; + struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); + struct array_buffer *max_buf = &tr->snapshot_buffer; + struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); + + max_buf->cpu = cpu; + max_buf->time_start = data->preempt_timestamp; + + max_data->saved_latency = tr->max_latency; + max_data->critical_start = data->critical_start; + max_data->critical_end = data->critical_end; + + strscpy(max_data->comm, tsk->comm); + max_data->pid = tsk->pid; + /* + * If tsk == current, then use current_uid(), as that does not use + * RCU. The irq tracer can be called out of RCU scope. + */ + if (tsk == current) + max_data->uid = current_uid(); + else + max_data->uid = task_uid(tsk); + + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + max_data->policy = tsk->policy; + max_data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(tsk); + latency_fsnotify(tr); +} +#else +static inline void __update_max_tr(struct trace_array *tr, + struct task_struct *tsk, int cpu) { } +#endif /* CONFIG_TRACER_MAX_TRACE */ + +/** + * update_max_tr - snapshot all trace buffers from global_trace to max_tr + * @tr: tracer + * @tsk: the task with the latency + * @cpu: The cpu that initiated the trace. + * @cond_data: User data associated with a conditional snapshot + * + * Flip the buffers between the @tr and the max_tr and record information + * about which task was the cause of this latency. + */ +void +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data) +{ + if (tr->stop_count) + return; + + WARN_ON_ONCE(!irqs_disabled()); + + if (!tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); + return; + } + + arch_spin_lock(&tr->max_lock); + + /* Inherit the recordable setting from array_buffer */ + if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) + ring_buffer_record_on(tr->snapshot_buffer.buffer); + else + ring_buffer_record_off(tr->snapshot_buffer.buffer); + + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { + arch_spin_unlock(&tr->max_lock); + return; + } + + swap(tr->array_buffer.buffer, tr->snapshot_buffer.buffer); + + __update_max_tr(tr, tsk, cpu); + + arch_spin_unlock(&tr->max_lock); + + /* Any waiters on the old snapshot buffer need to wake up */ + ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); +} + +/** + * update_max_tr_single - only copy one trace over, and reset the rest + * @tr: tracer + * @tsk: task with the latency + * @cpu: the cpu of the buffer to copy. + * + * Flip the trace of a single CPU buffer between the @tr and the max_tr. + */ +void +update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + int ret; + + if (tr->stop_count) + return; + + WARN_ON_ONCE(!irqs_disabled()); + if (!tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); + return; + } + + arch_spin_lock(&tr->max_lock); + + ret = ring_buffer_swap_cpu(tr->snapshot_buffer.buffer, tr->array_buffer.buffer, cpu); + + if (ret == -EBUSY) { + /* + * We failed to swap the buffer due to a commit taking + * place on this CPU. We fail to record, but we reset + * the max trace buffer (no one writes directly to it) + * and flag that it failed. + * Another reason is resize is in progress. + */ + trace_array_printk_buf(tr->snapshot_buffer.buffer, _THIS_IP_, + "Failed to swap buffers due to commit or resize in progress\n"); + } + + WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); + + __update_max_tr(tr, tsk, cpu); + arch_spin_unlock(&tr->max_lock); +} + +static void show_snapshot_main_help(struct seq_file *m) +{ + seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" + "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer.\n" + "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); +} + +static void show_snapshot_percpu_help(struct seq_file *m) +{ + seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer for this cpu.\n"); +#else + seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" + "# Must use main snapshot file to allocate.\n"); +#endif + seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); +} + +void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + if (iter->tr->allocated_snapshot) + seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); + else + seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); + + seq_puts(m, "# Snapshot commands:\n"); + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + show_snapshot_main_help(m); + else + show_snapshot_percpu_help(m); +} + +static int tracing_snapshot_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct trace_iterator *iter; + struct seq_file *m; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file, true); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + } else { + /* Writes still need the seq_file to hold the private data */ + ret = -ENOMEM; + m = kzalloc_obj(*m); + if (!m) + goto out; + iter = kzalloc_obj(*iter); + if (!iter) { + kfree(m); + goto out; + } + ret = 0; + + iter->tr = tr; + iter->array_buffer = &tr->snapshot_buffer; + iter->cpu_file = tracing_get_cpu(inode); + m->private = iter; + file->private_data = m; + } +out: + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +static void tracing_swap_cpu_buffer(void *tr) +{ + update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); +} + +static ssize_t +tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + unsigned long val; + int ret; + + ret = tracing_update_buffers(tr); + if (ret < 0) + return ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&trace_types_lock); + + if (tracer_uses_snapshot(tr->current_trace)) + return -EBUSY; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + if (ret) + return ret; + + switch (val) { + case 0: + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; + if (tr->allocated_snapshot) + free_snapshot(tr); + break; + case 1: +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; +#endif + if (tr->allocated_snapshot) + ret = resize_buffer_duplicate_size(&tr->snapshot_buffer, + &tr->array_buffer, iter->cpu_file); + + ret = tracing_arm_snapshot_locked(tr); + if (ret) + return ret; + + /* Now, we're going to swap */ + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { + local_irq_disable(); + update_max_tr(tr, current, smp_processor_id(), NULL); + local_irq_enable(); + } else { + smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, + (void *)tr, 1); + } + tracing_disarm_snapshot(tr); + break; + default: + if (tr->allocated_snapshot) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->snapshot_buffer); + else + tracing_reset_cpu(&tr->snapshot_buffer, iter->cpu_file); + } + break; + } + + if (ret >= 0) { + *ppos += cnt; + ret = cnt; + } + + return ret; +} + +static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + int ret; + + ret = tracing_release(inode, file); + + if (file->f_mode & FMODE_READ) + return ret; + + /* If write only, the seq_file is just a stub */ + if (m) + kfree(m->private); + kfree(m); + + return 0; +} + +static int snapshot_raw_open(struct inode *inode, struct file *filp) +{ + struct ftrace_buffer_info *info; + int ret; + + /* The following checks for tracefs lockdown */ + ret = tracing_buffers_open(inode, filp); + if (ret < 0) + return ret; + + info = filp->private_data; + + if (tracer_uses_snapshot(info->iter.trace)) { + tracing_buffers_release(inode, filp); + return -EBUSY; + } + + info->iter.snapshot = true; + info->iter.array_buffer = &info->iter.tr->snapshot_buffer; + + return ret; +} + +const struct file_operations snapshot_fops = { + .open = tracing_snapshot_open, + .read = seq_read, + .write = tracing_snapshot_write, + .llseek = tracing_lseek, + .release = tracing_snapshot_release, +}; + +const struct file_operations snapshot_raw_fops = { + .open = snapshot_raw_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, +}; + +#ifdef CONFIG_TRACER_MAX_TRACE +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); +} + +static const struct file_operations tracing_max_lat_fops = { + .open = tracing_open_generic_tr, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; +#endif /* CONFIG_TRACER_MAX_TRACE */ + +int get_snapshot_map(struct trace_array *tr) +{ + int err = 0; + + /* + * Called with mmap_lock held. lockdep would be unhappy if we would now + * take trace_types_lock. Instead use the specific + * snapshot_trigger_lock. + */ + spin_lock(&tr->snapshot_trigger_lock); + + if (tr->snapshot || tr->mapped == UINT_MAX) + err = -EBUSY; + else + tr->mapped++; + + spin_unlock(&tr->snapshot_trigger_lock); + + /* Wait for update_max_tr() to observe iter->tr->mapped */ + if (tr->mapped == 1) + synchronize_rcu(); + + return err; + +} + +void put_snapshot_map(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->mapped)) + tr->mapped--; + spin_unlock(&tr->snapshot_trigger_lock); +} + +#ifdef CONFIG_DYNAMIC_FTRACE +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + tracing_snapshot_instance(tr); +} + +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + long *count = NULL; + + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) { + + if (*count <= 0) + return; + + (*count)--; + } + + tracing_snapshot_instance(tr); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + struct ftrace_func_mapper *mapper = data; + long *count = NULL; + + seq_printf(m, "%ps:", (void *)ip); + + seq_puts(m, "snapshot"); + + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); + else + seq_puts(m, ":unlimited\n"); + + return 0; +} + +static int +ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); +} + +static void +ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, NULL); + return; + } + + ftrace_func_mapper_remove_ip(mapper, ip); +} + +static struct ftrace_probe_ops snapshot_probe_ops = { + .func = ftrace_snapshot, + .print = ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { + .func = ftrace_count_snapshot, + .print = ftrace_snapshot_print, + .init = ftrace_snapshot_init, + .free = ftrace_snapshot_free, +}; + +static int +ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + void *count = (void *)-1; + char *number; + int ret; + + if (!tr) + return -ENODEV; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; + + if (glob[0] == '!') { + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); + if (!ret) + tracing_disarm_snapshot(tr); + + return ret; + } + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = tracing_arm_snapshot(tr); + if (ret < 0) + return ret; + + ret = register_ftrace_function_probe(glob, tr, ops, count); + if (ret < 0) + tracing_disarm_snapshot(tr); + + return ret < 0 ? ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { + .name = "snapshot", + .func = ftrace_trace_snapshot_callback, +}; + +__init int register_snapshot_cmd(void) +{ + return register_ftrace_command(&ftrace_snapshot_cmd); +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +int trace_allocate_snapshot(struct trace_array *tr, int size) +{ + int ret; + + /* Fix mapped buffer trace arrays do not have snapshot buffers */ + if (tr->range_addr_start) + return 0; + + /* allocate_snapshot can only be true during system boot */ + ret = allocate_trace_buffer(tr, &tr->snapshot_buffer, + allocate_snapshot ? size : 1); + if (ret < 0) + return -ENOMEM; + + tr->allocated_snapshot = allocate_snapshot; + + allocate_snapshot = false; + return 0; +} + +__init static bool tr_needs_alloc_snapshot(const char *name) +{ + char *test; + int len = strlen(name); + bool ret; + + if (!boot_snapshot_index) + return false; + + if (strncmp(name, boot_snapshot_info, len) == 0 && + boot_snapshot_info[len] == '\t') + return true; + + test = kmalloc(strlen(name) + 3, GFP_KERNEL); + if (!test) + return false; + + sprintf(test, "\t%s\t", name); + ret = strstr(boot_snapshot_info, test) == NULL; + kfree(test); + return ret; +} + +__init void do_allocate_snapshot(const char *name) +{ + if (!tr_needs_alloc_snapshot(name)) + return; + + /* + * When allocate_snapshot is set, the next call to + * allocate_trace_buffers() (called by trace_array_get_by_name()) + * will allocate the snapshot buffer. That will also clear + * this flag. + */ + allocate_snapshot = true; +} + +void __init ftrace_boot_snapshot(void) +{ + struct trace_array *tr; + + if (!snapshot_at_boot) + return; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->allocated_snapshot) + continue; + + tracing_snapshot_instance(tr); + trace_array_puts(tr, "** Boot snapshot taken **\n"); + } +} diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 7f9572a37333..0aa2514a6593 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -32,7 +32,7 @@ static arch_spinlock_t stack_trace_max_lock = DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); -int stack_tracer_enabled; +static int stack_tracer_enabled; static void print_max_stack(void) { @@ -520,20 +520,18 @@ stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer, int was_enabled; int ret; - mutex_lock(&stack_sysctl_mutex); + guard(mutex)(&stack_sysctl_mutex); was_enabled = !!stack_tracer_enabled; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (was_enabled == !!stack_tracer_enabled)) - goto out; + return ret; if (stack_tracer_enabled) register_ftrace_function(&trace_ops); else unregister_ftrace_function(&trace_ops); - out: - mutex_unlock(&stack_sysctl_mutex); return ret; } @@ -544,7 +542,7 @@ static __init int enable_stacktrace(char *str) int len; if ((len = str_has_prefix(str, "_filter="))) - strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); + strscpy(stack_trace_filter_buf, str + len); stack_tracer_enabled = 1; return 1; @@ -580,3 +578,23 @@ static __init int stack_trace_init(void) } device_initcall(stack_trace_init); + + +static const struct ctl_table trace_stack_sysctl_table[] = { + { + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = stack_trace_sysctl, + }, +}; + +static int __init init_trace_stack_sysctls(void) +{ + register_sysctl_init("kernel", trace_stack_sysctl_table); + return 0; +} +subsys_initcall(init_trace_stack_sysctls); + + diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index bb247beec447..856ece13b7dc 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -77,7 +77,7 @@ static int insert_stat(struct rb_root *root, void *stat, cmp_func_t cmp) struct rb_node **new = &(root->rb_node), *parent = NULL; struct stat_node *data; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc_obj(*data); if (!data) return -ENOMEM; data->stat = stat; @@ -128,7 +128,7 @@ static int stat_seq_init(struct stat_session *session) int ret = 0; int i; - mutex_lock(&session->stat_mutex); + guard(mutex)(&session->stat_mutex); __reset_stat_session(session); if (!ts->stat_cmp) @@ -136,11 +136,11 @@ static int stat_seq_init(struct stat_session *session) stat = ts->stat_start(ts); if (!stat) - goto exit; + return 0; ret = insert_stat(root, stat, ts->stat_cmp); if (ret) - goto exit; + return ret; /* * Iterate over the tracer stat entries and store them in an rbtree. @@ -157,13 +157,10 @@ static int stat_seq_init(struct stat_session *session) goto exit_free_rbtree; } -exit: - mutex_unlock(&session->stat_mutex); return ret; exit_free_rbtree: __reset_stat_session(session); - mutex_unlock(&session->stat_mutex); return ret; } @@ -308,7 +305,7 @@ static int init_stat_file(struct stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { struct stat_session *session, *node; - int ret = -EINVAL; + int ret; if (!trace) return -EINVAL; @@ -316,18 +313,18 @@ int register_stat_tracer(struct tracer_stat *trace) if (!trace->stat_start || !trace->stat_next || !trace->stat_show) return -EINVAL; + guard(mutex)(&all_stat_sessions_mutex); + /* Already registered? */ - mutex_lock(&all_stat_sessions_mutex); list_for_each_entry(node, &all_stat_sessions, session_list) { if (node->ts == trace) - goto out; + return -EINVAL; } - ret = -ENOMEM; /* Init the session */ - session = kzalloc(sizeof(*session), GFP_KERNEL); + session = kzalloc_obj(*session); if (!session) - goto out; + return -ENOMEM; session->ts = trace; INIT_LIST_HEAD(&session->session_list); @@ -336,16 +333,13 @@ int register_stat_tracer(struct tracer_stat *trace) ret = init_stat_file(session); if (ret) { destroy_session(session); - goto out; + return ret; } - ret = 0; /* Register */ list_add_tail(&session->session_list, &all_stat_sessions); - out: - mutex_unlock(&all_stat_sessions_mutex); - return ret; + return 0; } void unregister_stat_tracer(struct tracer_stat *trace) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 46aab0ab9350..8ad72e17d8eb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <trace/syscall.h> #include <trace/events/syscalls.h> +#include <linux/kernel_stat.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/kernel.h> @@ -123,6 +124,118 @@ const char *get_syscall_name(int syscall) return entry->name; } +/* Added to user strings or arrays when max limit is reached */ +#define EXTRA "..." + +static void get_dynamic_len_ptr(struct syscall_trace_enter *trace, + struct syscall_metadata *entry, + int *offset_p, int *len_p, unsigned char **ptr_p) +{ + unsigned char *ptr; + int offset = *offset_p; + int val; + + /* This arg points to a user space string */ + ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset; + val = *(int *)ptr; + + /* The value is a dynamic string (len << 16 | offset) */ + ptr = (void *)trace + (val & 0xffff); + *len_p = val >> 16; + offset += 4; + + *ptr_p = ptr; + *offset_p = offset; +} + +static enum print_line_t +sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry, + struct trace_seq *s, struct trace_event *event) +{ + unsigned char *ptr; + int offset = 0; + int bits, len; + bool done = false; + static const struct trace_print_flags __flags[] = + { + { O_TMPFILE, "O_TMPFILE" }, + { O_WRONLY, "O_WRONLY" }, + { O_RDWR, "O_RDWR" }, + { O_CREAT, "O_CREAT" }, + { O_EXCL, "O_EXCL" }, + { O_NOCTTY, "O_NOCTTY" }, + { O_TRUNC, "O_TRUNC" }, + { O_APPEND, "O_APPEND" }, + { O_NONBLOCK, "O_NONBLOCK" }, + { O_DSYNC, "O_DSYNC" }, + { O_DIRECT, "O_DIRECT" }, + { O_LARGEFILE, "O_LARGEFILE" }, + { O_DIRECTORY, "O_DIRECTORY" }, + { O_NOFOLLOW, "O_NOFOLLOW" }, + { O_NOATIME, "O_NOATIME" }, + { O_CLOEXEC, "O_CLOEXEC" }, + }; + + trace_seq_printf(s, "%s(", entry->name); + + for (int i = 0; !done && i < entry->nb_args; i++) { + + if (trace_seq_has_overflowed(s)) + goto end; + + if (i) + trace_seq_puts(s, ", "); + + switch (i) { + case 2: + bits = trace->args[2]; + + trace_seq_puts(s, "flags: "); + + /* No need to show mode when not creating the file */ + if (!(bits & (O_CREAT|O_TMPFILE))) + done = true; + + if (!(bits & O_ACCMODE)) { + if (!bits) { + trace_seq_puts(s, "O_RDONLY"); + continue; + } + trace_seq_puts(s, "O_RDONLY|"); + } + + trace_print_flags_seq(s, "|", bits, __flags, ARRAY_SIZE(__flags)); + /* + * trace_print_flags_seq() adds a '\0' to the + * buffer, but this needs to append more to the seq. + */ + if (!trace_seq_has_overflowed(s)) + trace_seq_pop(s); + + continue; + case 3: + trace_seq_printf(s, "%s: 0%03o", entry->args[i], + (unsigned int)trace->args[i]); + continue; + } + + trace_seq_printf(s, "%s: %lu", entry->args[i], + trace->args[i]); + + if (!(BIT(i) & entry->user_mask)) + continue; + + get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr); + trace_seq_printf(s, " \"%.*s\"", len, ptr); + } + + trace_seq_putc(s, ')'); +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -132,7 +245,9 @@ print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_entry *ent = iter->ent; struct syscall_trace_enter *trace; struct syscall_metadata *entry; - int i, syscall; + int i, syscall, val, len; + unsigned char *ptr; + int offset = 0; trace = (typeof(trace))ent; syscall = trace->nr; @@ -146,21 +261,80 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; } + switch (entry->syscall_nr) { + case __NR_openat: + if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE))) + return sys_enter_openat_print(trace, entry, s, event); + break; + default: + break; + } + trace_seq_printf(s, "%s(", entry->name); for (i = 0; i < entry->nb_args; i++) { + bool printable = false; + char *str; if (trace_seq_has_overflowed(s)) goto end; + if (i) + trace_seq_puts(s, ", "); + /* parameter types */ - if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + if (tr && tr->trace_flags & TRACE_ITER(VERBOSE)) trace_seq_printf(s, "%s ", entry->types[i]); /* parameter values */ - trace_seq_printf(s, "%s: %lx%s", entry->args[i], - trace->args[i], - i == entry->nb_args - 1 ? "" : ", "); + if (trace->args[i] < 10) + trace_seq_printf(s, "%s: %lu", entry->args[i], + trace->args[i]); + else + trace_seq_printf(s, "%s: 0x%lx", entry->args[i], + trace->args[i]); + + if (!(BIT(i) & entry->user_mask)) + continue; + + get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr); + + if (entry->user_arg_size < 0 || entry->user_arg_is_str) { + trace_seq_printf(s, " \"%.*s\"", len, ptr); + continue; + } + + val = trace->args[entry->user_arg_size]; + + str = ptr; + trace_seq_puts(s, " ("); + for (int x = 0; x < len; x++, ptr++) { + if (isascii(*ptr) && isprint(*ptr)) + printable = true; + if (x) + trace_seq_putc(s, ':'); + trace_seq_printf(s, "%02x", *ptr); + } + if (len < val) + trace_seq_printf(s, ", %s", EXTRA); + + trace_seq_putc(s, ')'); + + /* If nothing is printable, don't bother printing anything */ + if (!printable) + continue; + + trace_seq_puts(s, " \""); + for (int x = 0; x < len; x++) { + if (isascii(str[x]) && isprint(str[x])) + trace_seq_putc(s, str[x]); + else + trace_seq_putc(s, '.'); + } + if (len < val) + trace_seq_printf(s, "\"%s", EXTRA); + else + trace_seq_putc(s, '"'); } trace_seq_putc(s, ')'); @@ -206,26 +380,107 @@ print_syscall_exit(struct trace_iterator *iter, int flags, .size = sizeof(_type), .align = __alignof__(_type), \ .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER } +/* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + +static int __init +sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len) +{ + int pos = 0; + + pos += snprintf(buf + pos, LEN_OR_ZERO, + "\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\","); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " ((unsigned long)(REC->dfd)),"); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " ((unsigned long)(REC->filename)),"); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " __get_str(__filename_val),"); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", "); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " REC->flags ? __print_flags(REC->flags, \"|\", "); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_RDWR\" }, ", O_RDWR); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_CREAT\" }, ", O_CREAT); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_EXCL\" }, ", O_EXCL); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_APPEND\" }, ", O_APPEND); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC); + + pos += snprintf(buf + pos, LEN_OR_ZERO, + " ((unsigned long)(REC->mode))"); + return pos; +} + static int __init __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { + bool is_string = entry->user_arg_is_str; int i; int pos = 0; - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) + switch (entry->syscall_nr) { + case __NR_openat: + return sys_enter_openat_print_fmt(entry, buf, len); + default: + break; + } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", - entry->args[i], sizeof(unsigned long), - i == entry->nb_args - 1 ? "" : ", "); + if (i) + pos += snprintf(buf + pos, LEN_OR_ZERO, ", "); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx", + entry->args[i], sizeof(unsigned long)); + + if (!(BIT(i) & entry->user_mask)) + continue; + + /* Add the format for the user space string or array */ + if (entry->user_arg_size < 0 || is_string) + pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\""); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)"); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { pos += snprintf(buf + pos, LEN_OR_ZERO, ", ((unsigned long)(REC->%s))", entry->args[i]); + if (!(BIT(i) & entry->user_mask)) + continue; + /* The user space data for arg has name __<arg>_val */ + if (entry->user_arg_size < 0 || is_string) { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)", + entry->args[i]); + } else { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)", + entry->args[i]); + } } #undef LEN_OR_ZERO @@ -271,8 +526,11 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; + unsigned long mask; + char *arg; int offset = offsetof(typeof(trace), args); int ret = 0; + int len; int i; for (i = 0; i < meta->nb_args; i++) { @@ -285,9 +543,320 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) offset += sizeof(unsigned long); } + if (ret || !meta->user_mask) + return ret; + + mask = meta->user_mask; + + while (mask) { + int idx = ffs(mask) - 1; + mask &= ~BIT(idx); + + /* + * User space data is faulted into a temporary buffer and then + * added as a dynamic string or array to the end of the event. + * The user space data name for the arg pointer is + * "__<arg>_val". + */ + len = strlen(meta->args[idx]) + sizeof("___val"); + arg = kmalloc(len, GFP_KERNEL); + if (WARN_ON_ONCE(!arg)) { + meta->user_mask = 0; + return -ENOMEM; + } + + snprintf(arg, len, "__%s_val", meta->args[idx]); + + ret = trace_define_field(call, "__data_loc char[]", + arg, offset, sizeof(int), 0, + FILTER_OTHER); + if (ret) { + kfree(arg); + break; + } + offset += 4; + } return ret; } +/* + * Create a per CPU temporary buffer to copy user space pointers into. + * + * SYSCALL_FAULT_USER_MAX is the amount to copy from user space. + * (defined in kernel/trace/trace.h) + + * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the + * nul terminating byte and possibly appended EXTRA (4 bytes). + * + * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use + * to copy memory from user space addresses into that will hold + * 3 args as only 3 args are allowed to be copied from system calls. + */ +#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4) +#define SYSCALL_FAULT_MAX_CNT 3 +#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT) + +/* Use the tracing per CPU buffer infrastructure to copy from user space */ +struct syscall_user_buffer { + struct trace_user_buf_info buf; + struct rcu_head rcu; +}; + +static struct syscall_user_buffer *syscall_buffer; + +static int syscall_fault_buffer_enable(void) +{ + struct syscall_user_buffer *sbuf; + int ret; + + lockdep_assert_held(&syscall_trace_lock); + + if (syscall_buffer) { + trace_user_fault_get(&syscall_buffer->buf); + return 0; + } + + sbuf = kmalloc_obj(*sbuf); + if (!sbuf) + return -ENOMEM; + + ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ); + if (ret < 0) { + kfree(sbuf); + return ret; + } + + WRITE_ONCE(syscall_buffer, sbuf); + + return 0; +} + +static void rcu_free_syscall_buffer(struct rcu_head *rcu) +{ + struct syscall_user_buffer *sbuf = + container_of(rcu, struct syscall_user_buffer, rcu); + + trace_user_fault_destroy(&sbuf->buf); + kfree(sbuf); +} + + +static void syscall_fault_buffer_disable(void) +{ + struct syscall_user_buffer *sbuf = syscall_buffer; + + lockdep_assert_held(&syscall_trace_lock); + + if (trace_user_fault_put(&sbuf->buf)) + return; + + WRITE_ONCE(syscall_buffer, NULL); + call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer); +} + +struct syscall_args { + char *ptr_array[SYSCALL_FAULT_MAX_CNT]; + int read[SYSCALL_FAULT_MAX_CNT]; + int uargs; +}; + +static int syscall_copy_user(char *buf, const char __user *ptr, + size_t size, void *data) +{ + struct syscall_args *args = data; + int ret; + + for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + ptr = (char __user *)args->ptr_array[i]; + ret = strncpy_from_user(buf, ptr, size); + args->read[i] = ret; + } + return 0; +} + +static int syscall_copy_user_array(char *buf, const char __user *ptr, + size_t size, void *data) +{ + struct syscall_args *args = data; + int ret; + + for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + ptr = (char __user *)args->ptr_array[i]; + ret = __copy_from_user(buf, ptr, size); + args->read[i] = ret ? -1 : size; + } + return 0; +} + +static char *sys_fault_user(unsigned int buf_size, + struct syscall_metadata *sys_data, + struct syscall_user_buffer *sbuf, + unsigned long *args, + unsigned int data_size[SYSCALL_FAULT_MAX_CNT]) +{ + trace_user_buf_copy syscall_copy = syscall_copy_user; + unsigned long mask = sys_data->user_mask; + unsigned long size = SYSCALL_FAULT_ARG_SZ - 1; + struct syscall_args sargs; + bool array = false; + char *buffer; + char *buf; + int ret; + int i = 0; + + /* The extra is appended to the user data in the buffer */ + BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >= + SYSCALL_FAULT_ARG_SZ); + + /* + * If this system call event has a size argument, use + * it to define how much of user space memory to read, + * and read it as an array and not a string. + */ + if (sys_data->user_arg_size >= 0) { + array = true; + size = args[sys_data->user_arg_size]; + if (size > SYSCALL_FAULT_ARG_SZ - 1) + size = SYSCALL_FAULT_ARG_SZ - 1; + syscall_copy = syscall_copy_user_array; + } + + while (mask) { + int idx = ffs(mask) - 1; + mask &= ~BIT(idx); + + if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT)) + break; + + /* Get the pointer to user space memory to read */ + sargs.ptr_array[i++] = (char *)args[idx]; + } + + sargs.uargs = i; + + /* Clear the values that are not used */ + for (; i < SYSCALL_FAULT_MAX_CNT; i++) { + data_size[i] = -1; /* Denotes no pointer */ + } + + /* A zero size means do not even try */ + if (!buf_size) + return NULL; + + buffer = trace_user_fault_read(&sbuf->buf, NULL, size, + syscall_copy, &sargs); + if (!buffer) + return NULL; + + buf = buffer; + for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + + ret = sargs.read[i]; + if (ret < 0) + continue; + buf[ret] = '\0'; + + /* For strings, replace any non-printable characters with '.' */ + if (!array) { + for (int x = 0; x < ret; x++) { + if (!isprint(buf[x])) + buf[x] = '.'; + } + + size = min(buf_size, SYSCALL_FAULT_USER_MAX); + + /* + * If the text was truncated due to our max limit, + * add "..." to the string. + */ + if (ret > size) { + strscpy(buf + size, EXTRA, sizeof(EXTRA)); + ret = size + sizeof(EXTRA); + } else { + buf[ret++] = '\0'; + } + } else { + ret = min((unsigned int)ret, buf_size); + } + data_size[i] = ret; + } + + return buffer; +} + +static int +syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args, + char **buffer, int *size, int *user_sizes, int *uargs, + int buf_size) +{ + struct syscall_user_buffer *sbuf; + int i; + + /* If the syscall_buffer is NULL, tracing is being shutdown */ + sbuf = READ_ONCE(syscall_buffer); + if (!sbuf) + return -1; + + *buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes); + /* + * user_size is the amount of data to append. + * Need to add 4 for the meta field that points to + * the user memory at the end of the event and also + * stores its size. + */ + for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) { + if (user_sizes[i] < 0) + break; + *size += user_sizes[i] + 4; + } + /* Save the number of user read arguments of this syscall */ + *uargs = i; + return 0; +} + +static void syscall_put_data(struct syscall_metadata *sys_data, + struct syscall_trace_enter *entry, + char *buffer, int size, int *user_sizes, int uargs) +{ + char *buf = buffer; + void *ptr; + int val; + + /* + * Set the pointer to point to the meta data of the event + * that has information about the stored user space memory. + */ + ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args; + + /* + * The meta data will store the offset of the user data from + * the beginning of the event. That is after the static arguments + * and the meta data fields. + */ + val = (ptr - (void *)entry) + 4 * uargs; + + for (int i = 0; i < uargs; i++) { + + if (i) + val += user_sizes[i - 1]; + + /* Store the offset and the size into the meta data */ + *(int *)ptr = val | (user_sizes[i] << 16); + + /* Skip the meta data */ + ptr += 4; + } + + for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + /* Nothing to do if the user space was empty or faulted */ + if (!user_sizes[i]) + continue; + + memcpy(ptr, buf, user_sizes[i]); + ptr += user_sizes[i]; + } +} + static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; @@ -296,22 +865,24 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct trace_event_buffer fbuffer; unsigned long args[6]; + char *user_ptr; + int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; int syscall_nr; - int size; + int size = 0; + int uargs = 0; + bool mayfault; /* * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; - /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ - trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); + trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]); if (!trace_file) return; @@ -322,7 +893,20 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (!sys_data) return; - size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + /* Check if this syscall event faults in user space memory */ + mayfault = sys_data->user_mask != 0; + + guard(preempt_notrace)(); + + syscall_get_arguments(current, regs, args); + + if (mayfault) { + if (syscall_get_data(sys_data, args, &user_ptr, + &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0) + return; + } + + size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); if (!entry) @@ -330,9 +914,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; - syscall_get_arguments(current, regs, args); + memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); + if (mayfault) + syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs); + trace_event_buffer_commit(&fbuffer); } @@ -356,8 +943,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; - /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ - trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); + trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]); if (!trace_file) return; @@ -382,39 +968,50 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) static int reg_event_syscall_enter(struct trace_event_file *file, struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; struct trace_array *tr = file->tr; int ret = 0; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; - mutex_lock(&syscall_trace_lock); - if (!tr->sys_refcount_enter) + guard(mutex)(&syscall_trace_lock); + if (sys_data->user_mask) { + ret = syscall_fault_buffer_enable(); + if (ret < 0) + return ret; + } + if (!tr->sys_refcount_enter) { ret = register_trace_sys_enter(ftrace_syscall_enter, tr); - if (!ret) { - rcu_assign_pointer(tr->enter_syscall_files[num], file); - tr->sys_refcount_enter++; + if (ret < 0) { + if (sys_data->user_mask) + syscall_fault_buffer_disable(); + return ret; + } } - mutex_unlock(&syscall_trace_lock); - return ret; + WRITE_ONCE(tr->enter_syscall_files[num], file); + tr->sys_refcount_enter++; + return 0; } static void unreg_event_syscall_enter(struct trace_event_file *file, struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; struct trace_array *tr = file->tr; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); tr->sys_refcount_enter--; - RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL); + WRITE_ONCE(tr->enter_syscall_files[num], NULL); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); - mutex_unlock(&syscall_trace_lock); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); } static int reg_event_syscall_exit(struct trace_event_file *file, @@ -431,7 +1028,7 @@ static int reg_event_syscall_exit(struct trace_event_file *file, if (!tr->sys_refcount_exit) ret = register_trace_sys_exit(ftrace_syscall_exit, tr); if (!ret) { - rcu_assign_pointer(tr->exit_syscall_files[num], file); + WRITE_ONCE(tr->exit_syscall_files[num], file); tr->sys_refcount_exit++; } mutex_unlock(&syscall_trace_lock); @@ -449,12 +1046,221 @@ static void unreg_event_syscall_exit(struct trace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_exit--; - RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL); + WRITE_ONCE(tr->exit_syscall_files[num], NULL); if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); } +/* + * For system calls that reference user space memory that can + * be recorded into the event, set the system call meta data's user_mask + * to the "args" index that points to the user space memory to retrieve. + */ +static void check_faultable_syscall(struct trace_event_call *call, int nr) +{ + struct syscall_metadata *sys_data = call->data; + unsigned long mask; + + /* Only work on entry */ + if (sys_data->enter_event != call) + return; + + sys_data->user_arg_size = -1; + + switch (nr) { + /* user arg 1 with size arg at 2 */ + case __NR_write: +#ifdef __NR_mq_timedsend + case __NR_mq_timedsend: +#endif + case __NR_pwrite64: + sys_data->user_mask = BIT(1); + sys_data->user_arg_size = 2; + break; + /* user arg 0 with size arg at 1 as string */ + case __NR_setdomainname: + case __NR_sethostname: + sys_data->user_mask = BIT(0); + sys_data->user_arg_size = 1; + sys_data->user_arg_is_str = 1; + break; +#ifdef __NR_kexec_file_load + /* user arg 4 with size arg at 3 as string */ + case __NR_kexec_file_load: + sys_data->user_mask = BIT(4); + sys_data->user_arg_size = 3; + sys_data->user_arg_is_str = 1; + break; +#endif + /* user arg at position 0 */ +#ifdef __NR_access + case __NR_access: +#endif + case __NR_acct: + case __NR_chdir: +#ifdef __NR_chown + case __NR_chown: +#endif +#ifdef __NR_chmod + case __NR_chmod: +#endif + case __NR_chroot: +#ifdef __NR_creat + case __NR_creat: +#endif + case __NR_delete_module: + case __NR_execve: + case __NR_fsopen: +#ifdef __NR_lchown + case __NR_lchown: +#endif +#ifdef __NR_open + case __NR_open: +#endif + case __NR_memfd_create: +#ifdef __NR_mkdir + case __NR_mkdir: +#endif +#ifdef __NR_mknod + case __NR_mknod: +#endif + case __NR_mq_open: + case __NR_mq_unlink: +#ifdef __NR_readlink + case __NR_readlink: +#endif +#ifdef __NR_rmdir + case __NR_rmdir: +#endif + case __NR_shmdt: +#ifdef __NR_statfs + case __NR_statfs: +#endif + case __NR_swapon: + case __NR_swapoff: +#ifdef __NR_truncate + case __NR_truncate: +#endif +#ifdef __NR_unlink + case __NR_unlink: +#endif + case __NR_umount2: +#ifdef __NR_utime + case __NR_utime: +#endif +#ifdef __NR_utimes + case __NR_utimes: +#endif + sys_data->user_mask = BIT(0); + break; + /* user arg at position 1 */ + case __NR_execveat: + case __NR_faccessat: + case __NR_faccessat2: + case __NR_finit_module: + case __NR_fchmodat: + case __NR_fchmodat2: + case __NR_fchownat: + case __NR_fgetxattr: + case __NR_flistxattr: + case __NR_fsetxattr: + case __NR_fspick: + case __NR_fremovexattr: +#ifdef __NR_futimesat + case __NR_futimesat: +#endif + case __NR_inotify_add_watch: + case __NR_mkdirat: + case __NR_mknodat: + case __NR_mount_setattr: + case __NR_name_to_handle_at: +#ifdef __NR_newfstatat + case __NR_newfstatat: +#endif + case __NR_openat: + case __NR_openat2: + case __NR_open_tree: + case __NR_open_tree_attr: + case __NR_readlinkat: + case __NR_quotactl: + case __NR_syslog: + case __NR_statx: + case __NR_unlinkat: +#ifdef __NR_utimensat + case __NR_utimensat: +#endif + sys_data->user_mask = BIT(1); + break; + /* user arg at position 2 */ + case __NR_init_module: + case __NR_fsconfig: + sys_data->user_mask = BIT(2); + break; + /* user arg at position 4 */ + case __NR_fanotify_mark: + sys_data->user_mask = BIT(4); + break; + /* 2 user args, 0 and 1 */ + case __NR_add_key: + case __NR_getxattr: + case __NR_lgetxattr: + case __NR_lremovexattr: +#ifdef __NR_link + case __NR_link: +#endif + case __NR_listxattr: + case __NR_llistxattr: + case __NR_lsetxattr: + case __NR_pivot_root: + case __NR_removexattr: +#ifdef __NR_rename + case __NR_rename: +#endif + case __NR_request_key: + case __NR_setxattr: +#ifdef __NR_symlink + case __NR_symlink: +#endif + sys_data->user_mask = BIT(0) | BIT(1); + break; + /* 2 user args, 0 and 2 */ + case __NR_symlinkat: + sys_data->user_mask = BIT(0) | BIT(2); + break; + /* 2 user args, 1 and 3 */ + case __NR_getxattrat: + case __NR_linkat: + case __NR_listxattrat: + case __NR_move_mount: +#ifdef __NR_renameat + case __NR_renameat: +#endif + case __NR_renameat2: + case __NR_removexattrat: + case __NR_setxattrat: + sys_data->user_mask = BIT(1) | BIT(3); + break; + case __NR_mount: /* Just dev_name and dir_name, TODO add type */ + sys_data->user_mask = BIT(0) | BIT(1) | BIT(2); + break; + default: + sys_data->user_mask = 0; + return; + } + + if (sys_data->user_arg_size < 0) + return; + + /* + * The user_arg_size can only be used when the system call + * is reading only a single address from user space. + */ + mask = sys_data->user_mask; + if (WARN_ON(mask & (mask - 1))) + sys_data->user_arg_size = -1; +} + static int __init init_syscall_trace(struct trace_event_call *call) { int id; @@ -467,6 +1273,8 @@ static int __init init_syscall_trace(struct trace_event_call *call) return -ENOSYS; } + check_faultable_syscall(call, num); + if (set_syscall_print_fmt(call) < 0) return -ENOMEM; @@ -528,9 +1336,8 @@ void __init init_ftrace_syscalls(void) void *ret; if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { - syscalls_metadata = kcalloc(NR_syscalls, - sizeof(*syscalls_metadata), - GFP_KERNEL); + syscalls_metadata = kzalloc_objs(*syscalls_metadata, + NR_syscalls); if (!syscalls_metadata) { WARN_ON(1); return; @@ -594,9 +1401,14 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; + bool mayfault; + char *user_ptr; + int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; + int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT; int syscall_nr; int rctx; - int size; + int size = 0; + int uargs = 0; /* * Syscall probe called with preemption enabled, but the ring @@ -615,13 +1427,24 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; + syscall_get_arguments(current, regs, args); + + /* Check if this syscall event faults in user space memory */ + mayfault = sys_data->user_mask != 0; + + if (mayfault) { + if (syscall_get_data(sys_data, args, &user_ptr, + &size, user_sizes, &uargs, buf_size) < 0) + return; + } + head = this_cpu_ptr(sys_data->enter_event->perf_events); valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); if (!valid_prog_array && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ - size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); + size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -630,9 +1453,11 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; rec->nr = syscall_nr; - syscall_get_arguments(current, regs, args); memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); + if (mayfault) + syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs); + if ((valid_prog_array && !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || hlist_empty(head)) { @@ -647,36 +1472,46 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) static int perf_sysenter_enable(struct trace_event_call *call) { - int ret = 0; + struct syscall_metadata *sys_data = call->data; int num; + int ret; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; - mutex_lock(&syscall_trace_lock); - if (!sys_perf_refcount_enter) + guard(mutex)(&syscall_trace_lock); + if (sys_data->user_mask) { + ret = syscall_fault_buffer_enable(); + if (ret < 0) + return ret; + } + if (!sys_perf_refcount_enter) { ret = register_trace_sys_enter(perf_syscall_enter, NULL); - if (ret) { - pr_info("event trace: Could not activate syscall entry trace point"); - } else { - set_bit(num, enabled_perf_enter_syscalls); - sys_perf_refcount_enter++; + if (ret) { + pr_info("event trace: Could not activate syscall entry trace point"); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); + return ret; + } } - mutex_unlock(&syscall_trace_lock); - return ret; + set_bit(num, enabled_perf_enter_syscalls); + sys_perf_refcount_enter++; + return 0; } static void perf_sysenter_disable(struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); sys_perf_refcount_enter--; clear_bit(num, enabled_perf_enter_syscalls); if (!sys_perf_refcount_enter) unregister_trace_sys_enter(perf_syscall_enter, NULL); - mutex_unlock(&syscall_trace_lock); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); } static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, @@ -753,22 +1588,21 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) static int perf_sysexit_enable(struct trace_event_call *call) { - int ret = 0; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - mutex_lock(&syscall_trace_lock); - if (!sys_perf_refcount_exit) - ret = register_trace_sys_exit(perf_syscall_exit, NULL); - if (ret) { - pr_info("event trace: Could not activate syscall exit trace point"); - } else { - set_bit(num, enabled_perf_exit_syscalls); - sys_perf_refcount_exit++; + guard(mutex)(&syscall_trace_lock); + if (!sys_perf_refcount_exit) { + int ret = register_trace_sys_exit(perf_syscall_exit, NULL); + if (ret) { + pr_info("event trace: Could not activate syscall exit trace point"); + return ret; + } } - mutex_unlock(&syscall_trace_lock); - return ret; + set_bit(num, enabled_perf_exit_syscalls); + sys_perf_refcount_exit++; + return 0; } static void perf_sysexit_disable(struct trace_event_call *call) @@ -777,12 +1611,11 @@ static void perf_sysexit_disable(struct trace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); sys_perf_refcount_exit--; clear_bit(num, enabled_perf_exit_syscalls); if (!sys_perf_refcount_exit) unregister_trace_sys_exit(perf_syscall_exit, NULL); - mutex_unlock(&syscall_trace_lock); } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4875e7f5de3d..2cabf8a23ec5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -8,17 +8,19 @@ #define pr_fmt(fmt) "trace_uprobe: " fmt #include <linux/bpf-cgroup.h> -#include <linux/security.h> +#include <linux/cleanup.h> #include <linux/ctype.h> +#include <linux/filter.h> #include <linux/module.h> -#include <linux/uaccess.h> -#include <linux/uprobes.h> #include <linux/namei.h> -#include <linux/string.h> -#include <linux/rculist.h> -#include <linux/filter.h> #include <linux/percpu.h> +#include <linux/rculist.h> +#include <linux/security.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/uprobes.h> +#include "trace.h" #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" @@ -336,7 +338,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) struct trace_uprobe *tu; int ret; - tu = kzalloc(struct_size(tu, tp.args, nargs), GFP_KERNEL); + tu = kzalloc_flex(*tu, tp.args, nargs); if (!tu) return ERR_PTR(-ENOMEM); @@ -498,11 +500,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu) struct trace_uprobe *old_tu; int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); ret = validate_ref_ctr_offset(tu); if (ret) - goto end; + return ret; /* register as an event */ old_tu = find_probe_event(trace_probe_name(&tu->tp), @@ -511,11 +513,9 @@ static int register_trace_uprobe(struct trace_uprobe *tu) if (is_ret_probe(tu) != is_ret_probe(old_tu)) { trace_probe_log_set_index(0); trace_probe_log_err(0, DIFF_PROBE_TYPE); - ret = -EEXIST; - } else { - ret = append_trace_uprobe(tu, old_tu); + return -EEXIST; } - goto end; + return append_trace_uprobe(tu, old_tu); } ret = register_uprobe_event(tu); @@ -525,31 +525,33 @@ static int register_trace_uprobe(struct trace_uprobe *tu) trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); - goto end; + return ret; } dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); -end: - mutex_unlock(&event_mutex); - return ret; } +DEFINE_FREE(free_trace_uprobe, struct trace_uprobe *, if (_T) free_trace_uprobe(_T)) + /* * Argument syntax: * - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static int __trace_uprobe_create(int argc, const char **argv) { - struct trace_uprobe *tu; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; + struct trace_uprobe *tu __free(free_trace_uprobe) = NULL; + const char *trlog __free(trace_probe_log_clear) = NULL; const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; - char *arg, *filename, *rctr, *rctr_end, *tmp; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - enum probe_print_type ptype; - struct path path; + struct path path __free(path_put) = {}; unsigned long offset, ref_ctr_offset; + char *filename __free(kfree) = NULL; + char *arg, *rctr, *rctr_end, *tmp; + char *gbuf __free(kfree) = NULL; + char *buf __free(kfree) = NULL; + enum probe_print_type ptype; bool is_return = false; int i, ret; @@ -567,8 +569,14 @@ static int __trace_uprobe_create(int argc, const char **argv) if (argc < 2) return -ECANCELED; - if (argc - 2 > MAX_TRACE_ARGS) + + trlog = trace_probe_log_init("trace_uprobe", argc, argv); + + if (argc - 2 > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } if (argv[0][1] == ':') event = &argv[0][2]; @@ -582,26 +590,20 @@ static int __trace_uprobe_create(int argc, const char **argv) /* Find the last occurrence, in case the path contains ':' too. */ arg = strrchr(filename, ':'); - if (!arg || !isdigit(arg[1])) { - kfree(filename); + if (!arg || !isdigit(arg[1])) return -ECANCELED; - } - trace_probe_log_init("trace_uprobe", argc, argv); trace_probe_log_set_index(1); /* filename is the 2nd argument */ *arg++ = '\0'; ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) { trace_probe_log_err(0, FILE_NOT_FOUND); - kfree(filename); - trace_probe_log_clear(); return ret; } if (!d_is_reg(path.dentry)) { trace_probe_log_err(0, NO_REGULAR_FILE); - ret = -EINVAL; - goto fail_address_parse; + return -EINVAL; } /* Parse reference counter offset if specified. */ @@ -609,16 +611,14 @@ static int __trace_uprobe_create(int argc, const char **argv) if (rctr) { rctr_end = strchr(rctr, ')'); if (!rctr_end) { - ret = -EINVAL; rctr_end = rctr + strlen(rctr); trace_probe_log_err(rctr_end - filename, REFCNT_OPEN_BRACE); - goto fail_address_parse; + return -EINVAL; } else if (rctr_end[1] != '\0') { - ret = -EINVAL; trace_probe_log_err(rctr_end + 1 - filename, BAD_REFCNT_SUFFIX); - goto fail_address_parse; + return -EINVAL; } *rctr++ = '\0'; @@ -626,7 +626,7 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = kstrtoul(rctr, 0, &ref_ctr_offset); if (ret) { trace_probe_log_err(rctr - filename, BAD_REFCNT); - goto fail_address_parse; + return ret; } } @@ -638,8 +638,7 @@ static int __trace_uprobe_create(int argc, const char **argv) is_return = true; } else { trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX); - ret = -EINVAL; - goto fail_address_parse; + return -EINVAL; } } @@ -647,16 +646,20 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = kstrtoul(arg, 0, &offset); if (ret) { trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS); - goto fail_address_parse; + return ret; } /* setup a probe */ trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto fail_address_parse; + return ret; } if (!event) { @@ -664,15 +667,16 @@ static int __trace_uprobe_create(int argc, const char **argv) char *ptr; tail = kstrdup(kbasename(filename), GFP_KERNEL); - if (!tail) { - ret = -ENOMEM; - goto fail_address_parse; - } + if (!tail) + return -ENOMEM; ptr = strpbrk(tail, ".-_"); if (ptr) *ptr = '\0'; + buf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); event = buf; kfree(tail); @@ -686,45 +690,36 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = PTR_ERR(tu); /* This must return -ENOMEM otherwise there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto fail_address_parse; + return ret; } tu->offset = offset; tu->ref_ctr_offset = ref_ctr_offset; tu->path = path; - tu->filename = filename; + /* Clear @path so that it will not freed by path_put() */ + memset(&path, 0, sizeof(path)); + tu->filename = no_free_ptr(filename); + + ctx = kzalloc_obj(*ctx); + if (!ctx) + return -ENOMEM; + ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER; /* parse arguments */ for (i = 0; i < argc; i++) { - struct traceprobe_parse_context ctx = { - .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, - }; - trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx); - traceprobe_finish_parse(&ctx); + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx); if (ret) - goto error; + return ret; } ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; ret = traceprobe_set_print_fmt(&tu->tp, ptype); if (ret < 0) - goto error; + return ret; ret = register_trace_uprobe(tu); if (!ret) - goto out; - -error: - free_trace_uprobe(tu); -out: - trace_probe_log_clear(); - return ret; - -fail_address_parse: - trace_probe_log_clear(); - path_put(&path); - kfree(filename); + tu = NULL; return ret; } @@ -741,7 +736,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_uprobe_ops); - ret = trace_uprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_uprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } @@ -1489,7 +1484,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, : BPF_FD_TYPE_UPROBE; *filename = tu->filename; *probe_offset = tu->offset; - *probe_addr = 0; + *probe_addr = tu->ref_ctr_offset; return 0; } #endif /* CONFIG_PERF_EVENTS */ @@ -1534,6 +1529,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, struct trace_uprobe *tu; struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb = NULL; + unsigned int flags; int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); @@ -1548,11 +1544,12 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + flags = trace_probe_load_flag(&tu->tp); + if (flags & TP_FLAG_TRACE) ret |= uprobe_trace_func(tu, regs, &ucb); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) ret |= uprobe_perf_func(tu, regs, &ucb); #endif uprobe_buffer_put(ucb); @@ -1566,6 +1563,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, struct trace_uprobe *tu; struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb = NULL; + unsigned int flags; tu = container_of(con, struct trace_uprobe, consumer); @@ -1577,11 +1575,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + flags = trace_probe_load_flag(&tu->tp); + if (flags & TP_FLAG_TRACE) uretprobe_trace_func(tu, func, regs, &ucb); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) uretprobe_perf_func(tu, func, regs, &ucb); #endif uprobe_buffer_put(ucb); diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 1921ade45be3..0dd7927df22a 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -324,7 +324,7 @@ static struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, struct tracing_map_array *a; unsigned int i; - a = kzalloc(sizeof(*a), GFP_KERNEL); + a = kzalloc_obj(*a); if (!a) return NULL; @@ -386,13 +386,11 @@ static void tracing_map_elt_init_fields(struct tracing_map_elt *elt) } } -static void tracing_map_elt_free(struct tracing_map_elt *elt) +static void __tracing_map_elt_free(struct tracing_map_elt *elt) { if (!elt) return; - if (elt->map->ops && elt->map->ops->elt_free) - elt->map->ops->elt_free(elt); kfree(elt->fields); kfree(elt->vars); kfree(elt->var_set); @@ -400,12 +398,23 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt) kfree(elt); } +static void tracing_map_elt_free(struct tracing_map_elt *elt) +{ + if (!elt) + return; + + /* Only objects initialized with alloc_elt() should be passed to free_elt().*/ + if (elt->map->ops && elt->map->ops->elt_free) + elt->map->ops->elt_free(elt); + __tracing_map_elt_free(elt); +} + static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) { struct tracing_map_elt *elt; int err = 0; - elt = kzalloc(sizeof(*elt), GFP_KERNEL); + elt = kzalloc_obj(*elt); if (!elt) return ERR_PTR(-ENOMEM); @@ -417,19 +426,19 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) goto free; } - elt->fields = kcalloc(map->n_fields, sizeof(*elt->fields), GFP_KERNEL); + elt->fields = kzalloc_objs(*elt->fields, map->n_fields); if (!elt->fields) { err = -ENOMEM; goto free; } - elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL); + elt->vars = kzalloc_objs(*elt->vars, map->n_vars); if (!elt->vars) { err = -ENOMEM; goto free; } - elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL); + elt->var_set = kzalloc_objs(*elt->var_set, map->n_vars); if (!elt->var_set) { err = -ENOMEM; goto free; @@ -444,7 +453,7 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) } return elt; free: - tracing_map_elt_free(elt); + __tracing_map_elt_free(elt); return ERR_PTR(err); } @@ -777,7 +786,7 @@ struct tracing_map *tracing_map_create(unsigned int map_bits, map_bits > TRACING_MAP_BITS_MAX) return ERR_PTR(-EINVAL); - map = kzalloc(sizeof(*map), GFP_KERNEL); + map = kzalloc_obj(*map); if (!map) return ERR_PTR(-ENOMEM); @@ -949,7 +958,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt) { struct tracing_map_sort_entry *sort_entry; - sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL); + sort_entry = kzalloc_obj(*sort_entry); if (!sort_entry) return NULL; @@ -1076,7 +1085,7 @@ int tracing_map_sort_entries(struct tracing_map *map, struct tracing_map_sort_entry *sort_entry, **entries; int i, n_entries, ret; - entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts)); + entries = vmalloc_array(map->max_elts, sizeof(sort_entry)); if (!entries) return -ENOMEM; diff --git a/kernel/trace/undefsyms_base.c b/kernel/trace/undefsyms_base.c new file mode 100644 index 000000000000..e65baf58e6ff --- /dev/null +++ b/kernel/trace/undefsyms_base.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * simple_ring_buffer is used by the pKVM hypervisor which does not have access + * to all kernel symbols. Whatever is undefined when compiling this file is + * compiler and tooling-generated symbols that can safely be ignored for + * simple_ring_buffer. + */ + +#include <linux/atomic.h> +#include <linux/string.h> +#include <asm/page.h> + +void undefsyms_base(void *p, int n); + +static char page[PAGE_SIZE] __aligned(PAGE_SIZE); + +void undefsyms_base(void *p, int n) +{ + char buffer[256] = { 0 }; + + u32 u = 0; + memset((char * volatile)page, 8, PAGE_SIZE); + memset((char * volatile)buffer, 8, sizeof(buffer)); + memcpy((void * volatile)p, buffer, sizeof(buffer)); + cmpxchg((u32 * volatile)&u, 0, 8); + WARN_ON(n == 0xdeadbeef); +} |
