summaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig145
-rw-r--r--kernel/trace/Makefile54
-rw-r--r--kernel/trace/blktrace.c675
-rw-r--r--kernel/trace/bpf_trace.c864
-rw-r--r--kernel/trace/fgraph.c134
-rw-r--r--kernel/trace/fprobe.c1101
-rw-r--r--kernel/trace/ftrace.c1327
-rw-r--r--kernel/trace/pid_list.c50
-rw-r--r--kernel/trace/pid_list.h1
-rw-r--r--kernel/trace/power-traces.c1
-rw-r--r--kernel/trace/preemptirq_delay_test.c13
-rw-r--r--kernel/trace/remote_test.c261
-rw-r--r--kernel/trace/remote_test_events.h10
-rw-r--r--kernel/trace/rethook.c2
-rw-r--r--kernel/trace/ring_buffer.c1340
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/rv/Kconfig81
-rw-r--r--kernel/trace/rv/Makefile18
-rw-r--r--kernel/trace/rv/monitors/deadline/Kconfig10
-rw-r--r--kernel/trace/rv/monitors/deadline/deadline.c44
-rw-r--r--kernel/trace/rv/monitors/deadline/deadline.h203
-rw-r--r--kernel/trace/rv/monitors/nomiss/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/nomiss/nomiss.c293
-rw-r--r--kernel/trace/rv/monitors/nomiss/nomiss.h123
-rw-r--r--kernel/trace/rv/monitors/nomiss/nomiss_trace.h19
-rw-r--r--kernel/trace/rv/monitors/nrp/Kconfig16
-rw-r--r--kernel/trace/rv/monitors/nrp/nrp.c136
-rw-r--r--kernel/trace/rv/monitors/nrp/nrp.h77
-rw-r--r--kernel/trace/rv/monitors/nrp/nrp_trace.h15
-rw-r--r--kernel/trace/rv/monitors/opid/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/opid/opid.c117
-rw-r--r--kernel/trace/rv/monitors/opid/opid.h58
-rw-r--r--kernel/trace/rv/monitors/opid/opid_trace.h19
-rw-r--r--kernel/trace/rv/monitors/pagefault/Kconfig21
-rw-r--r--kernel/trace/rv/monitors/pagefault/pagefault.c88
-rw-r--r--kernel/trace/rv/monitors/pagefault/pagefault.h64
-rw-r--r--kernel/trace/rv/monitors/pagefault/pagefault_trace.h14
-rw-r--r--kernel/trace/rv/monitors/rtapp/Kconfig11
-rw-r--r--kernel/trace/rv/monitors/rtapp/rtapp.c31
-rw-r--r--kernel/trace/rv/monitors/rtapp/rtapp.h3
-rw-r--r--kernel/trace/rv/monitors/sched/Kconfig12
-rw-r--r--kernel/trace/rv/monitors/sched/sched.c35
-rw-r--r--kernel/trace/rv/monitors/sched/sched.h3
-rw-r--r--kernel/trace/rv/monitors/sco/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/sco/sco.c85
-rw-r--r--kernel/trace/rv/monitors/sco/sco.h49
-rw-r--r--kernel/trace/rv/monitors/sco/sco_trace.h15
-rw-r--r--kernel/trace/rv/monitors/scpd/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd.c93
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd.h51
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd_trace.h15
-rw-r--r--kernel/trace/rv/monitors/sleep/Kconfig22
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.c249
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.h263
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep_trace.h14
-rw-r--r--kernel/trace/rv/monitors/snep/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/snep/snep.c93
-rw-r--r--kernel/trace/rv/monitors/snep/snep.h61
-rw-r--r--kernel/trace/rv/monitors/snep/snep_trace.h15
-rw-r--r--kernel/trace/rv/monitors/snroc/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc.c82
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc.h49
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc_trace.h15
-rw-r--r--kernel/trace/rv/monitors/sssw/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/sssw/sssw.c114
-rw-r--r--kernel/trace/rv/monitors/sssw/sssw.h107
-rw-r--r--kernel/trace/rv/monitors/sssw/sssw_trace.h15
-rw-r--r--kernel/trace/rv/monitors/stall/Kconfig13
-rw-r--r--kernel/trace/rv/monitors/stall/stall.c150
-rw-r--r--kernel/trace/rv/monitors/stall/stall.h81
-rw-r--r--kernel/trace/rv/monitors/stall/stall_trace.h19
-rw-r--r--kernel/trace/rv/monitors/sts/Kconfig19
-rw-r--r--kernel/trace/rv/monitors/sts/sts.c154
-rw-r--r--kernel/trace/rv/monitors/sts/sts.h119
-rw-r--r--kernel/trace/rv/monitors/sts/sts_trace.h15
-rw-r--r--kernel/trace/rv/monitors/wip/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/wip/wip.c29
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h15
-rw-r--r--kernel/trace/rv/monitors/wip/wip_trace.h15
-rw-r--r--kernel/trace/rv/monitors/wwnr/Kconfig13
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.c31
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h15
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr_trace.h16
-rw-r--r--kernel/trace/rv/reactor_panic.c4
-rw-r--r--kernel/trace/rv/reactor_printk.c4
-rw-r--r--kernel/trace/rv/rv.c360
-rw-r--r--kernel/trace/rv/rv.h39
-rw-r--r--kernel/trace/rv/rv_reactors.c216
-rw-r--r--kernel/trace/rv/rv_trace.h277
-rw-r--r--kernel/trace/simple_ring_buffer.c517
-rw-r--r--kernel/trace/trace.c4687
-rw-r--r--kernel/trace/trace.h643
-rw-r--r--kernel/trace/trace_boot.c5
-rw-r--r--kernel/trace/trace_branch.c12
-rw-r--r--kernel/trace/trace_btf.c2
-rw-r--r--kernel/trace/trace_dynevent.c54
-rw-r--r--kernel/trace/trace_dynevent.h1
-rw-r--r--kernel/trace/trace_entries.h51
-rw-r--r--kernel/trace/trace_eprobe.c189
-rw-r--r--kernel/trace/trace_event_perf.c4
-rw-r--r--kernel/trace/trace_events.c1072
-rw-r--r--kernel/trace/trace_events_filter.c255
-rw-r--r--kernel/trace/trace_events_hist.c651
-rw-r--r--kernel/trace/trace_events_synth.c98
-rw-r--r--kernel/trace/trace_events_trigger.c645
-rw-r--r--kernel/trace/trace_events_user.c57
-rw-r--r--kernel/trace/trace_export.c21
-rw-r--r--kernel/trace/trace_fprobe.c922
-rw-r--r--kernel/trace/trace_functions.c78
-rw-r--r--kernel/trace/trace_functions_graph.c497
-rw-r--r--kernel/trace/trace_hwlat.c20
-rw-r--r--kernel/trace/trace_irqsoff.c119
-rw-r--r--kernel/trace/trace_kdb.c19
-rw-r--r--kernel/trace/trace_kprobe.c248
-rw-r--r--kernel/trace/trace_mmiotrace.c14
-rw-r--r--kernel/trace/trace_osnoise.c247
-rw-r--r--kernel/trace/trace_output.c297
-rw-r--r--kernel/trace/trace_output.h20
-rw-r--r--kernel/trace/trace_pid.c246
-rw-r--r--kernel/trace/trace_printk.c433
-rw-r--r--kernel/trace/trace_probe.c226
-rw-r--r--kernel/trace/trace_probe.h47
-rw-r--r--kernel/trace/trace_probe_tmpl.h2
-rw-r--r--kernel/trace/trace_recursion_record.c2
-rw-r--r--kernel/trace/trace_remote.c1384
-rw-r--r--kernel/trace/trace_sched_switch.c8
-rw-r--r--kernel/trace/trace_sched_wakeup.c70
-rw-r--r--kernel/trace/trace_selftest.c23
-rw-r--r--kernel/trace/trace_seq.c31
-rw-r--r--kernel/trace/trace_snapshot.c1066
-rw-r--r--kernel/trace/trace_stack.c30
-rw-r--r--kernel/trace/trace_stat.c30
-rw-r--r--kernel/trace/trace_syscalls.c963
-rw-r--r--kernel/trace/trace_uprobe.c145
-rw-r--r--kernel/trace/tracing_map.c33
-rw-r--r--kernel/trace/undefsyms_base.c28
136 files changed, 19593 insertions, 7237 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 74c2b1d43bb9..e130da35808f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -31,9 +31,14 @@ config HAVE_FUNCTION_GRAPH_TRACER
help
See Documentation/trace/ftrace-design.rst
-config HAVE_FUNCTION_GRAPH_RETVAL
+config HAVE_FUNCTION_GRAPH_FREGS
bool
+config HAVE_FTRACE_GRAPH_FUNC
+ bool
+ help
+ True if ftrace_graph_func() is defined.
+
config HAVE_DYNAMIC_FTRACE
bool
help
@@ -45,9 +50,18 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS
config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
bool
+config HAVE_SINGLE_FTRACE_DIRECT_OPS
+ bool
+
config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
bool
+config HAVE_EXTRA_IPI_TRACEPOINTS
+ bool
+ help
+ For architectures that use ipi_raise, ipi_entry and ipi_exit
+ tracepoints.
+
config HAVE_DYNAMIC_FTRACE_WITH_ARGS
bool
help
@@ -57,16 +71,23 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS
This allows for use of ftrace_regs_get_argument() and
ftrace_regs_get_stack_pointer().
+config HAVE_FTRACE_REGS_HAVING_PT_REGS
+ bool
+ help
+ If this is set, ftrace_regs has pt_regs, thus it can convert to
+ pt_regs without allocating memory.
+
config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
bool
help
If the architecture generates __patchable_function_entries sections
but does not want them included in the ftrace locations.
-config HAVE_FTRACE_MCOUNT_RECORD
+config HAVE_DYNAMIC_FTRACE_WITH_JMP
bool
help
- See Documentation/trace/ftrace-design.rst
+ If the architecture supports to replace the __fentry__ with a
+ "jmp" instruction.
config HAVE_SYSCALL_TRACEPOINTS
bool
@@ -115,6 +136,7 @@ config BUILDTIME_MCOUNT_SORT
config TRACER_MAX_TRACE
bool
+ select TRACER_SNAPSHOT
config TRACE_CLOCK
bool
@@ -188,6 +210,19 @@ menuconfig FTRACE
if FTRACE
+config TRACEFS_AUTOMOUNT_DEPRECATED
+ bool "Automount tracefs on debugfs [DEPRECATED]"
+ depends on TRACING
+ default y
+ help
+ The tracing interface was moved from /sys/kernel/debug/tracing
+ to /sys/kernel/tracing in 2015, but the tracing file system
+ was still automounted in /sys/kernel/debug for backward
+ compatibility with tooling.
+
+ The new interface has been around for more than 10 years and
+ the old debug mount will soon be removed.
+
config BOOTTIME_TRACING
bool "Boot-time Tracing support"
depends on TRACING
@@ -232,7 +267,7 @@ config FUNCTION_GRAPH_TRACER
config FUNCTION_GRAPH_RETVAL
bool "Kernel Function Graph Return Value"
- depends on HAVE_FUNCTION_GRAPH_RETVAL
+ depends on HAVE_FUNCTION_GRAPH_FREGS
depends on FUNCTION_GRAPH_TRACER
default n
help
@@ -252,8 +287,19 @@ config FUNCTION_GRAPH_RETADDR
the function is called. This feature is off by default, and you can
enable it via the trace option funcgraph-retaddr.
+config FUNCTION_TRACE_ARGS
+ bool
+ depends on PROBE_EVENTS_BTF_ARGS
+ default y
+ help
+ If supported with function argument access API and BTF, then
+ the function tracer and function graph tracer will support printing
+ of function arguments. This feature is off by default, and can be
+ enabled via the trace option func-args (for the function tracer) and
+ funcgraph-args (for the function graph tracer)
+
config DYNAMIC_FTRACE
- bool "enable/disable function tracing dynamically"
+ bool
depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
default y
@@ -294,12 +340,31 @@ config DYNAMIC_FTRACE_WITH_ARGS
depends on DYNAMIC_FTRACE
depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+config DYNAMIC_FTRACE_WITH_JMP
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ depends on HAVE_DYNAMIC_FTRACE_WITH_JMP
+
+config FUNCTION_SELF_TRACING
+ bool "Function trace tracing code"
+ depends on FUNCTION_TRACER
+ help
+ Normally all the tracing code is set to notrace, where the function
+ tracer will ignore all the tracing functions. Sometimes it is useful
+ for debugging to trace some of the tracing infratructure itself.
+ Enable this to allow some of the tracing infrastructure to be traced
+ by the function tracer. Note, this will likely add noise to function
+ tracing if events and other tracing features are enabled along with
+ function tracing.
+
+ If unsure, say N.
+
config FPROBE
bool "Kernel Function Probe (fprobe)"
- depends on FUNCTION_TRACER
- depends on DYNAMIC_FTRACE_WITH_REGS
- depends on HAVE_RETHOOK
- select RETHOOK
+ depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC
+ depends on DYNAMIC_FTRACE_WITH_ARGS
+ select FUNCTION_GRAPH_TRACER
default n
help
This option enables kernel function probe (fprobe) based on ftrace.
@@ -361,7 +426,6 @@ config IRQSOFF_TRACER
select GENERIC_TRACER
select TRACER_MAX_TRACE
select RING_BUFFER_ALLOW_SWAP
- select TRACER_SNAPSHOT
select TRACER_SNAPSHOT_PER_CPU_SWAP
help
This option measures the time spent in irqs-off critical
@@ -384,7 +448,6 @@ config PREEMPT_TRACER
select GENERIC_TRACER
select TRACER_MAX_TRACE
select RING_BUFFER_ALLOW_SWAP
- select TRACER_SNAPSHOT
select TRACER_SNAPSHOT_PER_CPU_SWAP
select TRACE_PREEMPT_TOGGLE
help
@@ -406,7 +469,6 @@ config SCHED_TRACER
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select TRACER_MAX_TRACE
- select TRACER_SNAPSHOT
help
This tracer tracks the latency of the highest priority task
to be scheduled in, starting from the point it has woken up.
@@ -540,9 +602,22 @@ config FTRACE_SYSCALLS
help
Basic tracer to catch the syscall entry and exit events.
+config TRACE_SYSCALL_BUF_SIZE_DEFAULT
+ int "System call user read max size"
+ range 0 165
+ default 63
+ depends on FTRACE_SYSCALLS
+ help
+ Some system call trace events will record the data from a user
+ space address that one of the parameters point to. The amount of
+ data per event is limited. That limit is set by this config and
+ this config also affects how much user space data perf can read.
+
+ For a tracing instance, this size may be changed by writing into
+ its syscall_user_buf_size file.
+
config TRACER_SNAPSHOT
bool "Create a snapshot trace buffer"
- select TRACER_MAX_TRACE
help
Allow tracing users to take snapshot of the current buffer using the
ftrace interface, e.g.:
@@ -550,6 +625,9 @@ config TRACER_SNAPSHOT
echo 1 > /sys/kernel/tracing/snapshot
cat snapshot
+ Note, the latency tracers select this option. To disable it,
+ all the latency tracers need to be disabled.
+
config TRACER_SNAPSHOT_PER_CPU_SWAP
bool "Allow snapshot to swap per CPU"
depends on TRACER_SNAPSHOT
@@ -758,6 +836,20 @@ config UPROBE_EVENTS
This option is required if you plan to use perf-probe subcommand
of perf tools on user space applications.
+config EPROBE_EVENTS
+ bool "Enable event-based dynamic events"
+ depends on TRACING
+ depends on HAVE_REGS_AND_STACK_ACCESS_API
+ select PROBE_EVENTS
+ select DYNAMIC_EVENTS
+ default y
+ help
+ Eprobes are dynamic events that can be placed on other existing
+ events. It can be used to limit what fields are recorded in
+ an event or even dereference a field of an event. It can
+ convert the type of an event field. For example, turn an
+ address into a string.
+
config BPF_EVENTS
depends on BPF_SYSCALL
depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS
@@ -782,27 +874,22 @@ config BPF_KPROBE_OVERRIDE
Allows BPF to override the execution of a probed function and
set a different return value. This is used for error injection.
-config FTRACE_MCOUNT_RECORD
- def_bool y
- depends on DYNAMIC_FTRACE
- depends on HAVE_FTRACE_MCOUNT_RECORD
-
config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
bool
- depends on FTRACE_MCOUNT_RECORD
+ depends on DYNAMIC_FTRACE
config FTRACE_MCOUNT_USE_CC
def_bool y
depends on $(cc-option,-mrecord-mcount)
depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
- depends on FTRACE_MCOUNT_RECORD
+ depends on DYNAMIC_FTRACE
config FTRACE_MCOUNT_USE_OBJTOOL
def_bool y
depends on HAVE_OBJTOOL_MCOUNT
depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
depends on !FTRACE_MCOUNT_USE_CC
- depends on FTRACE_MCOUNT_RECORD
+ depends on DYNAMIC_FTRACE
select OBJTOOL
config FTRACE_MCOUNT_USE_RECORDMCOUNT
@@ -810,7 +897,7 @@ config FTRACE_MCOUNT_USE_RECORDMCOUNT
depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
depends on !FTRACE_MCOUNT_USE_CC
depends on !FTRACE_MCOUNT_USE_OBJTOOL
- depends on FTRACE_MCOUNT_RECORD
+ depends on DYNAMIC_FTRACE
config TRACING_MAP
bool
@@ -1194,4 +1281,18 @@ config HIST_TRIGGERS_DEBUG
source "kernel/trace/rv/Kconfig"
+config TRACE_REMOTE
+ bool
+
+config SIMPLE_RING_BUFFER
+ bool
+
+config TRACE_REMOTE_TEST
+ tristate "Test module for remote tracing"
+ select TRACE_REMOTE
+ select SIMPLE_RING_BUFFER
+ help
+ This trace remote includes a ring-buffer writer implementation using
+ "simple_ring_buffer". This is solely intending for testing.
+
endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 057cd975d014..8d3d96e847d8 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -16,6 +16,23 @@ obj-y += trace_selftest_dynamic.o
endif
endif
+# Allow some files to be function traced
+ifdef CONFIG_FUNCTION_SELF_TRACING
+CFLAGS_trace_output.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_seq.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_stat.o = $(CC_FLAGS_FTRACE)
+CFLAGS_tracing_map.o = $(CC_FLAGS_FTRACE)
+CFLAGS_synth_event_gen_test.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_syscalls.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_filter.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_trigger.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_synth.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_hist.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_user.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_dynevent.o = $(CC_FLAGS_FTRACE)
+endif
+
ifdef CONFIG_FTRACE_STARTUP_TEST
CFLAGS_trace_kprobe_selftest.o = $(CC_FLAGS_FTRACE)
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe_selftest.o
@@ -51,6 +68,8 @@ obj-$(CONFIG_TRACING) += trace_output.o
obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
+obj-$(CONFIG_TRACING) += trace_pid.o
+obj-$(CONFIG_TRACER_SNAPSHOT) += trace_snapshot.o
obj-$(CONFIG_TRACING) += pid_list.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o
@@ -82,7 +101,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
-obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
+obj-$(CONFIG_EPROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
@@ -110,4 +129,37 @@ obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
obj-$(CONFIG_RV) += rv/
+obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o
+obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o
+obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
+
+# simple_ring_buffer is used by the pKVM hypervisor which does not have access
+# to all kernel symbols. Fail the build if forbidden symbols are found.
+
+# Basic compiler and tooling-generated symbols that can safely be left
+# undefined. Ensure KASAN is enabled to avoid logic that may disable
+# FORTIFY_SOURCE when KASAN is not enabled. undefsyms_base.o does not
+# automatically get KASAN flags because it is not linked into vmlinux.
+targets += undefsyms_base.o
+KASAN_SANITIZE_undefsyms_base.o := y
+
+UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \
+ __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \
+ $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
+
+quiet_cmd_check_undefined = NM $<
+ cmd_check_undefined = \
+ undefsyms=$$($(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST)) || true); \
+ if [ -n "$$undefsyms" ]; then \
+ echo "Unexpected symbols in $<:" >&2; \
+ echo "$$undefsyms" >&2; \
+ false; \
+ fi; \
+ touch $@
+
+$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE
+ $(call if_changed,check_undefined)
+
+always-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o.checked
+
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8fd292d34d89..8cd2520b4c99 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -63,13 +63,116 @@ static int blk_probes_ref;
static void blk_register_tracepoints(void);
static void blk_unregister_tracepoints(void);
+static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu,
+ sector_t sector, int bytes, u64 what,
+ dev_t dev, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data, int pdu_len)
+
+{
+ /*
+ * These two are not needed in ftrace as they are in the
+ * generic trace_entry, filled by tracing_generic_entry_update,
+ * but for the trace_event->bin() synthesizer benefit we do it
+ * here too.
+ */
+ t->cpu = cpu;
+ t->pid = pid;
+
+ t->sector = sector;
+ t->bytes = bytes;
+ t->action = lower_32_bits(what);
+ t->device = dev;
+ t->error = error;
+ t->pdu_len = pdu_len + cgid_len;
+
+ if (cgid_len)
+ memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
+ if (pdu_len)
+ memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
+}
+
+static void record_blktrace_event2(struct blk_io_trace2 *t2, pid_t pid, int cpu,
+ sector_t sector, int bytes, u64 what,
+ dev_t dev, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data,
+ int pdu_len)
+{
+ t2->pid = pid;
+ t2->cpu = cpu;
+
+ t2->sector = sector;
+ t2->bytes = bytes;
+ t2->action = what;
+ t2->device = dev;
+ t2->error = error;
+ t2->pdu_len = pdu_len + cgid_len;
+
+ if (cgid_len)
+ memcpy((void *)t2 + sizeof(*t2), &cgid, cgid_len);
+ if (pdu_len)
+ memcpy((void *)t2 + sizeof(*t2) + cgid_len, pdu_data, pdu_len);
+}
+
+static void relay_blktrace_event1(struct blk_trace *bt, unsigned long sequence,
+ pid_t pid, int cpu, sector_t sector, int bytes,
+ u64 what, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data, int pdu_len)
+{
+ struct blk_io_trace *t;
+ size_t trace_len = sizeof(*t) + pdu_len + cgid_len;
+
+ t = relay_reserve(bt->rchan, trace_len);
+ if (!t)
+ return;
+
+ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+ t->sequence = sequence;
+ t->time = ktime_to_ns(ktime_get());
+
+ record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, error,
+ cgid, cgid_len, pdu_data, pdu_len);
+}
+
+static void relay_blktrace_event2(struct blk_trace *bt, unsigned long sequence,
+ pid_t pid, int cpu, sector_t sector,
+ int bytes, u64 what, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data, int pdu_len)
+{
+ struct blk_io_trace2 *t;
+ size_t trace_len = sizeof(struct blk_io_trace2) + pdu_len + cgid_len;
+
+ t = relay_reserve(bt->rchan, trace_len);
+ if (!t)
+ return;
+
+ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE2_VERSION;
+ t->sequence = sequence;
+ t->time = ktime_to_ns(ktime_get());
+
+ record_blktrace_event2(t, pid, cpu, sector, bytes, what, bt->dev, error,
+ cgid, cgid_len, pdu_data, pdu_len);
+}
+
+static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence,
+ pid_t pid, int cpu, sector_t sector, int bytes,
+ u64 what, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data, int pdu_len)
+{
+ if (bt->version == 2)
+ return relay_blktrace_event2(bt, sequence, pid, cpu, sector,
+ bytes, what, error, cgid, cgid_len,
+ pdu_data, pdu_len);
+ return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes,
+ what, error, cgid, cgid_len, pdu_data,
+ pdu_len);
+}
+
/*
* Send out a notify message.
*/
-static void trace_note(struct blk_trace *bt, pid_t pid, int action,
+static void trace_note(struct blk_trace *bt, pid_t pid, u64 action,
const void *data, size_t len, u64 cgid)
{
- struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
struct trace_buffer *buffer = NULL;
unsigned int trace_ctx = 0;
@@ -77,38 +180,30 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
+ action = lower_32_bits(action | (cgid ? __BLK_TN_CGROUP : 0));
if (blk_tracer) {
+ struct blk_io_trace2 *t;
+ size_t trace_len = sizeof(*t) + cgid_len + len;
+
buffer = blk_tr->array_buffer.buffer;
trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + len + cgid_len,
- trace_ctx);
+ trace_len, trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
- goto record_it;
+ record_blktrace_event2(t, pid, cpu, 0, 0,
+ action, bt->dev, 0, cgid, cgid_len,
+ (void *)data, len);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
+ return;
}
if (!bt->rchan)
return;
- t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
- if (t) {
- t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
- t->time = ktime_to_ns(ktime_get());
-record_it:
- t->device = bt->dev;
- t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
- t->pid = pid;
- t->cpu = cpu;
- t->pdu_len = len + cgid_len;
- if (cgid_len)
- memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
- memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
-
- if (blk_tracer)
- trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
- }
+ relay_blktrace_event(bt, 0, pid, cpu, 0, 0, action, 0, cgid,
+ cgid_len, (void *)data, len);
}
/*
@@ -182,7 +277,7 @@ void __blk_trace_note_message(struct blk_trace *bt,
}
EXPORT_SYMBOL_GPL(__blk_trace_note_message);
-static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+static int act_log_check(struct blk_trace *bt, u64 what, sector_t sector,
pid_t pid)
{
if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
@@ -213,13 +308,12 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- const blk_opf_t opf, u32 what, int error,
+ const blk_opf_t opf, u64 what, int error,
int pdu_len, void *pdu_data, u64 cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
struct trace_buffer *buffer = NULL;
- struct blk_io_trace *t;
unsigned long flags = 0;
unsigned long *sequence;
unsigned int trace_ctx = 0;
@@ -228,6 +322,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
const enum req_op op = opf & REQ_OP_MASK;
+ size_t trace_len;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
@@ -238,10 +333,47 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
what |= MASK_TC_BIT(opf, META);
what |= MASK_TC_BIT(opf, PREFLUSH);
what |= MASK_TC_BIT(opf, FUA);
- if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
+
+ switch (op) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
what |= BLK_TC_ACT(BLK_TC_DISCARD);
- if (op == REQ_OP_FLUSH)
+ break;
+ case REQ_OP_FLUSH:
what |= BLK_TC_ACT(BLK_TC_FLUSH);
+ break;
+ case REQ_OP_ZONE_APPEND:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_APPEND);
+ break;
+ case REQ_OP_ZONE_RESET:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_RESET);
+ break;
+ case REQ_OP_ZONE_RESET_ALL:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_RESET_ALL);
+ break;
+ case REQ_OP_ZONE_FINISH:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_FINISH);
+ break;
+ case REQ_OP_ZONE_OPEN:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_OPEN);
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE);
+ break;
+ case REQ_OP_WRITE_ZEROES:
+ what |= BLK_TC_ACT(BLK_TC_WRITE_ZEROES);
+ break;
+ default:
+ break;
+ }
+
+ /* Drop trace events for zone operations with blktrace v1 */
+ if (bt->version == 1 && (what >> BLK_TC_SHIFT) > BLK_TC_END_V1) {
+ pr_debug_ratelimited("blktrace v1 cannot trace zone operation 0x%llx\n",
+ (unsigned long long)what);
+ return;
+ }
+
if (cgid)
what |= __BLK_TA_CGROUP;
@@ -251,17 +383,71 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
cpu = raw_smp_processor_id();
if (blk_tracer) {
- tracing_record_cmdline(current);
-
buffer = blk_tr->array_buffer.buffer;
trace_ctx = tracing_gen_ctx_flags(0);
+ switch (bt->version) {
+ case 1:
+ trace_len = sizeof(struct blk_io_trace);
+ break;
+ case 2:
+ default:
+ /*
+ * ftrace always uses v2 (blk_io_trace2) format.
+ *
+ * For sysfs-enabled tracing path (enabled via
+ * /sys/block/DEV/trace/enable), blk_trace_setup_queue()
+ * never initializes bt->version, leaving it 0 from
+ * kzalloc(). We must handle version==0 safely here.
+ *
+ * Fall through to default to ensure we never hit the
+ * old bug where default set trace_len=0, causing
+ * buffer underflow and memory corruption.
+ *
+ * Always use v2 format for ftrace and normalize
+ * bt->version to 2 when uninitialized.
+ */
+ trace_len = sizeof(struct blk_io_trace2);
+ if (bt->version == 0)
+ bt->version = 2;
+ break;
+ }
+ trace_len += pdu_len + cgid_len;
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + pdu_len + cgid_len,
- trace_ctx);
+ trace_len, trace_ctx);
if (!event)
return;
- t = ring_buffer_event_data(event);
- goto record_it;
+
+ tracing_record_cmdline(current);
+ switch (bt->version) {
+ case 1:
+ record_blktrace_event(ring_buffer_event_data(event),
+ pid, cpu, sector, bytes,
+ what, bt->dev, error, cgid, cgid_len,
+ pdu_data, pdu_len);
+ break;
+ case 2:
+ default:
+ /*
+ * Use v2 recording function (record_blktrace_event2)
+ * which writes blk_io_trace2 structure with correct
+ * field layout:
+ * - 32-bit pid at offset 28
+ * - 64-bit action at offset 32
+ *
+ * Fall through to default handles version==0 case
+ * (from sysfs path), ensuring we always use correct
+ * v2 recording function to match the v2 buffer
+ * allocated above.
+ */
+ record_blktrace_event2(ring_buffer_event_data(event),
+ pid, cpu, sector, bytes,
+ what, bt->dev, error, cgid, cgid_len,
+ pdu_data, pdu_len);
+ break;
+ }
+
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
+ return;
}
if (unlikely(tsk->btrace_seq != blktrace_seq))
@@ -273,41 +459,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
* from coming in and stepping on our toes.
*/
local_irq_save(flags);
- t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
- if (t) {
- sequence = per_cpu_ptr(bt->sequence, cpu);
-
- t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
- t->sequence = ++(*sequence);
- t->time = ktime_to_ns(ktime_get());
-record_it:
- /*
- * These two are not needed in ftrace as they are in the
- * generic trace_entry, filled by tracing_generic_entry_update,
- * but for the trace_event->bin() synthesizer benefit we do it
- * here too.
- */
- t->cpu = cpu;
- t->pid = pid;
-
- t->sector = sector;
- t->bytes = bytes;
- t->action = what;
- t->device = bt->dev;
- t->error = error;
- t->pdu_len = pdu_len + cgid_len;
-
- if (cgid_len)
- memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
- if (pdu_len)
- memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
-
- if (blk_tracer) {
- trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
- return;
- }
- }
-
+ sequence = per_cpu_ptr(bt->sequence, cpu);
+ (*sequence)++;
+ relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes,
+ what, error, cgid, cgid_len, pdu_data, pdu_len);
local_irq_restore(flags);
}
@@ -403,9 +558,9 @@ int blk_trace_remove(struct request_queue *q)
{
int ret;
- mutex_lock(&q->debugfs_mutex);
+ blk_debugfs_lock_nomemsave(q);
ret = __blk_trace_remove(q);
- mutex_unlock(&q->debugfs_mutex);
+ blk_debugfs_unlock_nomemrestore(q);
return ret;
}
@@ -415,9 +570,10 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
size_t count, loff_t *ppos)
{
struct blk_trace *bt = filp->private_data;
+ size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL);
char buf[16];
- snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+ snprintf(buf, sizeof(buf), "%zu\n", dropped);
return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
}
@@ -456,23 +612,6 @@ static const struct file_operations blk_msg_fops = {
.llseek = noop_llseek,
};
-/*
- * Keep track of how many times we encountered a full subbuffer, to aid
- * the user space app in telling how many lost events there were.
- */
-static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
- void *prev_subbuf, size_t prev_padding)
-{
- struct blk_trace *bt;
-
- if (!relay_buf_full(buf))
- return 1;
-
- bt = buf->chan->private_data;
- atomic_inc(&bt->dropped);
- return 0;
-}
-
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
debugfs_remove(dentry);
@@ -491,7 +630,6 @@ static struct dentry *blk_create_buf_file_callback(const char *filename,
}
static const struct rchan_callbacks blk_relay_callbacks = {
- .subbuf_start = blk_subbuf_start_callback,
.create_buf_file = blk_create_buf_file_callback,
.remove_buf_file = blk_remove_buf_file_callback,
};
@@ -511,9 +649,10 @@ static void blk_trace_setup_lba(struct blk_trace *bt,
/*
* Setup everything required to start tracing
*/
-static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- struct blk_user_trace_setup *buts)
+static struct blk_trace *blk_trace_setup_prepare(struct request_queue *q,
+ char *name, dev_t dev,
+ u32 buf_size, u32 buf_nr,
+ struct block_device *bdev)
{
struct blk_trace *bt = NULL;
struct dentry *dir = NULL;
@@ -521,31 +660,19 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
lockdep_assert_held(&q->debugfs_mutex);
- if (!buts->buf_size || !buts->buf_nr)
- return -EINVAL;
-
- strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE);
-
- /*
- * some device names have larger paths - convert the slashes
- * to underscores for this to work as expected
- */
- strreplace(buts->name, '/', '_');
-
/*
* bdev can be NULL, as with scsi-generic, this is a helpful as
* we can be.
*/
if (rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex))) {
- pr_warn("Concurrent blktraces are not allowed on %s\n",
- buts->name);
- return -EBUSY;
+ pr_warn("Concurrent blktraces are not allowed on %s\n", name);
+ return ERR_PTR(-EBUSY);
}
- bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+ bt = kzalloc_obj(*bt);
if (!bt)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
ret = -ENOMEM;
bt->sequence = alloc_percpu(unsigned long);
@@ -565,7 +692,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (bdev && !bdev_is_partition(bdev))
dir = q->debugfs_dir;
else
- bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
+ bt->dir = dir = debugfs_create_dir(name, blk_debugfs_root);
/*
* As blktrace relies on debugfs for its interface the debugfs directory
@@ -573,31 +700,52 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
* files or directories.
*/
if (IS_ERR_OR_NULL(dir)) {
- pr_warn("debugfs_dir not present for %s so skipping\n",
- buts->name);
+ pr_warn("debugfs_dir not present for %s so skipping\n", name);
ret = -ENOENT;
goto err;
}
bt->dev = dev;
- atomic_set(&bt->dropped, 0);
INIT_LIST_HEAD(&bt->running_list);
ret = -EIO;
debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
- bt->rchan = relay_open("trace", dir, buts->buf_size,
- buts->buf_nr, &blk_relay_callbacks, bt);
+ bt->rchan = relay_open("trace", dir, buf_size, buf_nr,
+ &blk_relay_callbacks, bt);
if (!bt->rchan)
goto err;
+ blk_trace_setup_lba(bt, bdev);
+
+ return bt;
+
+err:
+ blk_trace_free(q, bt);
+
+ return ERR_PTR(ret);
+}
+
+static void blk_trace_setup_finalize(struct request_queue *q,
+ char *name, int version,
+ struct blk_trace *bt,
+ struct blk_user_trace_setup2 *buts)
+
+{
+ strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE2);
+
+ /*
+ * some device names have larger paths - convert the slashes
+ * to underscores for this to work as expected
+ */
+ strreplace(buts->name, '/', '_');
+
+ bt->version = version;
bt->act_mask = buts->act_mask;
if (!bt->act_mask)
bt->act_mask = (u16) -1;
- blk_trace_setup_lba(bt, bdev);
-
/* overwrite with user settings */
if (buts->start_lba)
bt->start_lba = buts->start_lba;
@@ -609,62 +757,103 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
-
- ret = 0;
-err:
- if (ret)
- blk_trace_free(q, bt);
- return ret;
}
-static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev, char __user *arg)
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ char __user *arg)
{
+ struct blk_user_trace_setup2 buts2;
struct blk_user_trace_setup buts;
+ struct blk_trace *bt;
+ unsigned int memflags;
int ret;
ret = copy_from_user(&buts, arg, sizeof(buts));
if (ret)
return -EFAULT;
- ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
- if (ret)
- return ret;
+ if (!buts.buf_size || !buts.buf_nr)
+ return -EINVAL;
+
+ buts2 = (struct blk_user_trace_setup2) {
+ .act_mask = buts.act_mask,
+ .buf_size = buts.buf_size,
+ .buf_nr = buts.buf_nr,
+ .start_lba = buts.start_lba,
+ .end_lba = buts.end_lba,
+ .pid = buts.pid,
+ };
+
+ memflags = blk_debugfs_lock(q);
+ bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr,
+ bdev);
+ if (IS_ERR(bt)) {
+ blk_debugfs_unlock(q, memflags);
+ return PTR_ERR(bt);
+ }
+ blk_trace_setup_finalize(q, name, 1, bt, &buts2);
+ strscpy(buts.name, buts2.name, BLKTRACE_BDEV_SIZE);
+ blk_debugfs_unlock(q, memflags);
if (copy_to_user(arg, &buts, sizeof(buts))) {
- __blk_trace_remove(q);
+ blk_trace_remove(q);
return -EFAULT;
}
return 0;
}
+EXPORT_SYMBOL_GPL(blk_trace_setup);
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- char __user *arg)
+static int blk_trace_setup2(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev, char __user *arg)
{
- int ret;
+ struct blk_user_trace_setup2 buts2;
+ struct blk_trace *bt;
+ unsigned int memflags;
+
+ if (copy_from_user(&buts2, arg, sizeof(buts2)))
+ return -EFAULT;
- mutex_lock(&q->debugfs_mutex);
- ret = __blk_trace_setup(q, name, dev, bdev, arg);
- mutex_unlock(&q->debugfs_mutex);
+ if (!buts2.buf_size || !buts2.buf_nr)
+ return -EINVAL;
- return ret;
+ if (buts2.flags != 0)
+ return -EINVAL;
+
+ memflags = blk_debugfs_lock(q);
+ bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr,
+ bdev);
+ if (IS_ERR(bt)) {
+ blk_debugfs_unlock(q, memflags);
+ return PTR_ERR(bt);
+ }
+ blk_trace_setup_finalize(q, name, 2, bt, &buts2);
+ blk_debugfs_unlock(q, memflags);
+
+ if (copy_to_user(arg, &buts2, sizeof(buts2))) {
+ blk_trace_remove(q);
+ return -EFAULT;
+ }
+ return 0;
}
-EXPORT_SYMBOL_GPL(blk_trace_setup);
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
static int compat_blk_trace_setup(struct request_queue *q, char *name,
dev_t dev, struct block_device *bdev,
char __user *arg)
{
- struct blk_user_trace_setup buts;
+ struct blk_user_trace_setup2 buts2;
struct compat_blk_user_trace_setup cbuts;
- int ret;
+ struct blk_trace *bt;
+ unsigned int memflags;
if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
return -EFAULT;
- buts = (struct blk_user_trace_setup) {
+ if (!cbuts.buf_size || !cbuts.buf_nr)
+ return -EINVAL;
+
+ buts2 = (struct blk_user_trace_setup2) {
.act_mask = cbuts.act_mask,
.buf_size = cbuts.buf_size,
.buf_nr = cbuts.buf_nr,
@@ -673,12 +862,18 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
.pid = cbuts.pid,
};
- ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
- if (ret)
- return ret;
+ memflags = blk_debugfs_lock(q);
+ bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr,
+ bdev);
+ if (IS_ERR(bt)) {
+ blk_debugfs_unlock(q, memflags);
+ return PTR_ERR(bt);
+ }
+ blk_trace_setup_finalize(q, name, 1, bt, &buts2);
+ blk_debugfs_unlock(q, memflags);
- if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
- __blk_trace_remove(q);
+ if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) {
+ blk_trace_remove(q);
return -EFAULT;
}
@@ -705,9 +900,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
- mutex_lock(&q->debugfs_mutex);
+ blk_debugfs_lock_nomemsave(q);
ret = __blk_trace_startstop(q, start);
- mutex_unlock(&q->debugfs_mutex);
+ blk_debugfs_unlock_nomemrestore(q);
return ret;
}
@@ -732,12 +927,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
int ret, start = 0;
char b[BDEVNAME_SIZE];
- mutex_lock(&q->debugfs_mutex);
-
switch (cmd) {
+ case BLKTRACESETUP2:
+ snprintf(b, sizeof(b), "%pg", bdev);
+ ret = blk_trace_setup2(q, b, bdev->bd_dev, bdev, arg);
+ break;
case BLKTRACESETUP:
snprintf(b, sizeof(b), "%pg", bdev);
- ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
case BLKTRACESETUP32:
@@ -749,17 +946,15 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
start = 1;
fallthrough;
case BLKTRACESTOP:
- ret = __blk_trace_startstop(q, start);
+ ret = blk_trace_startstop(q, start);
break;
case BLKTRACETEARDOWN:
- ret = __blk_trace_remove(q);
+ ret = blk_trace_remove(q);
break;
default:
ret = -ENOTTY;
break;
}
-
- mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -824,7 +1019,7 @@ blk_trace_request_get_cgid(struct request *rq)
*
**/
static void blk_add_trace_rq(struct request *rq, blk_status_t error,
- unsigned int nr_bytes, u32 what, u64 cgid)
+ unsigned int nr_bytes, u64 what, u64 cgid)
{
struct blk_trace *bt;
@@ -876,6 +1071,22 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
blk_trace_request_get_cgid(rq));
}
+static void blk_add_trace_zone_update_request(void *ignore, struct request *rq)
+{
+ struct blk_trace *bt;
+
+ rcu_read_lock();
+ bt = rcu_dereference(rq->q->blk_trace);
+ if (likely(!bt) || bt->version < 2) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ZONE_APPEND,
+ blk_trace_request_get_cgid(rq));
+}
+
/**
* blk_add_trace_bio - Add a trace for a bio oriented action
* @q: queue the io is for
@@ -888,7 +1099,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what, int error)
+ u64 what, int error)
{
struct blk_trace *bt;
@@ -905,11 +1116,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
rcu_read_unlock();
}
-static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
-{
- blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0);
-}
-
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio)
{
@@ -959,7 +1165,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(depth);
- u32 what;
+ u64 what;
if (explicit)
what = BLK_TA_UNPLUG_IO;
@@ -971,6 +1177,37 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
rcu_read_unlock();
}
+static void blk_add_trace_zone_plug(void *ignore, struct request_queue *q,
+ unsigned int zno, sector_t sector,
+ unsigned int sectors)
+{
+ struct blk_trace *bt;
+
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (bt && bt->version >= 2)
+ __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0,
+ BLK_TA_ZONE_PLUG, 0, 0, NULL, 0);
+ rcu_read_unlock();
+
+ return;
+}
+
+static void blk_add_trace_zone_unplug(void *ignore, struct request_queue *q,
+ unsigned int zno, sector_t sector,
+ unsigned int sectors)
+{
+ struct blk_trace *bt;
+
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (bt && bt->version >= 2)
+ __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0,
+ BLK_TA_ZONE_UNPLUG, 0, 0, NULL, 0);
+ rcu_read_unlock();
+ return;
+}
+
static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
@@ -1101,8 +1338,6 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
- WARN_ON(ret);
ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
WARN_ON(ret);
ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
@@ -1113,6 +1348,15 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
WARN_ON(ret);
+ ret = register_trace_blk_zone_append_update_request_bio(
+ blk_add_trace_zone_update_request, NULL);
+ WARN_ON(ret);
+ ret = register_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug,
+ NULL);
+ WARN_ON(ret);
+ ret = register_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug,
+ NULL);
+ WARN_ON(ret);
ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
@@ -1132,12 +1376,15 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_split(blk_add_trace_split, NULL);
unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
unregister_trace_block_plug(blk_add_trace_plug, NULL);
+ unregister_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, NULL);
+ unregister_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, NULL);
+ unregister_trace_blk_zone_append_update_request_bio(
+ blk_add_trace_zone_update_request, NULL);
unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
- unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
@@ -1151,7 +1398,7 @@ static void blk_unregister_tracepoints(void)
* struct blk_io_tracer formatting routines
*/
-static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
+static void fill_rwbs(char *rwbs, const struct blk_io_trace2 *t)
{
int i = 0;
int tc = t->action >> BLK_TC_SHIFT;
@@ -1166,7 +1413,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
if (tc & BLK_TC_DISCARD)
rwbs[i++] = 'D';
- else if (tc & BLK_TC_WRITE)
+ else if (tc & BLK_TC_WRITE_ZEROES) {
+ rwbs[i++] = 'W';
+ rwbs[i++] = 'Z';
+ } else if (tc & BLK_TC_WRITE)
rwbs[i++] = 'W';
else if (t->bytes)
rwbs[i++] = 'R';
@@ -1186,9 +1436,9 @@ out:
}
static inline
-const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
+const struct blk_io_trace2 *te_blk_io_trace(const struct trace_entry *ent)
{
- return (const struct blk_io_trace *)ent;
+ return (const struct blk_io_trace2 *)ent;
}
static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
@@ -1247,7 +1497,7 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
unsigned long long ts = iter->ts;
unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
unsigned secs = (unsigned long)ts;
- const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
+ const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
@@ -1261,7 +1511,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act,
bool has_cg)
{
char rwbs[RWBS_LEN];
- const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
+ const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
if (has_cg) {
@@ -1474,7 +1724,6 @@ static const struct {
[__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
[__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic },
[__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split },
- [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic },
[__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
};
@@ -1483,7 +1732,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
{
struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
- const struct blk_io_trace *t;
+ const struct blk_io_trace2 *t;
u16 what;
bool long_act;
blk_log_action_t *log_action;
@@ -1491,7 +1740,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
t = te_blk_io_trace(iter->ent);
what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
- long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
+ long_act = !!(tr->trace_flags & TRACE_ITER(VERBOSE));
log_action = classic ? &blk_log_action_classic : &blk_log_action;
has_cg = t->action & __BLK_TA_CGROUP;
@@ -1520,8 +1769,8 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
- struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
- const int offset = offsetof(struct blk_io_trace, sector);
+ struct blk_io_trace2 *t = (struct blk_io_trace2 *)iter->ent;
+ const int offset = offsetof(struct blk_io_trace2, sector);
struct blk_io_trace old = {
.magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
.time = iter->ts,
@@ -1556,9 +1805,9 @@ blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
/* don't output context-info for blk_classic output */
if (bit == TRACE_BLK_OPT_CLASSIC) {
if (set)
- tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+ tr->trace_flags &= ~TRACE_ITER(CONTEXT_INFO);
else
- tr->trace_flags |= TRACE_ITER_CONTEXT_INFO;
+ tr->trace_flags |= TRACE_ITER(CONTEXT_INFO);
}
return 0;
}
@@ -1585,7 +1834,9 @@ static struct trace_event trace_blk_event = {
.funcs = &trace_blk_event_funcs,
};
-static int __init init_blk_tracer(void)
+static struct work_struct blktrace_works __initdata;
+
+static int __init __init_blk_tracer(void)
{
if (!register_trace_event(&trace_blk_event)) {
pr_warn("Warning: could not register block events\n");
@@ -1598,9 +1849,32 @@ static int __init init_blk_tracer(void)
return 1;
}
+ BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) %
+ __alignof__(long));
+ BUILD_BUG_ON(__alignof__(struct blk_io_trace2) % __alignof__(long));
+
return 0;
}
+static void __init blktrace_works_func(struct work_struct *work)
+{
+ __init_blk_tracer();
+}
+
+static int __init init_blk_tracer(void)
+{
+ int ret = 0;
+
+ if (trace_init_wq) {
+ INIT_WORK(&blktrace_works, blktrace_works_func);
+ queue_work(trace_init_wq, &blktrace_works);
+ } else {
+ ret = __init_blk_tracer();
+ }
+
+ return ret;
+}
+
device_initcall(init_blk_tracer);
static int blk_trace_remove_queue(struct request_queue *q)
@@ -1629,7 +1903,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
struct blk_trace *bt = NULL;
int ret = -ENOMEM;
- bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+ bt = kzalloc_obj(*bt);
if (!bt)
return -ENOMEM;
@@ -1706,6 +1980,7 @@ static const struct {
{ BLK_TC_DISCARD, "discard" },
{ BLK_TC_DRV_DATA, "drv_data" },
{ BLK_TC_FUA, "fua" },
+ { BLK_TC_WRITE_ZEROES, "write-zeroes" },
};
static int blk_trace_str2mask(const char *str)
@@ -1768,7 +2043,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct blk_trace *bt;
ssize_t ret = -ENXIO;
- mutex_lock(&q->debugfs_mutex);
+ blk_debugfs_lock_nomemsave(q);
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex));
@@ -1789,7 +2064,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
ret = sprintf(buf, "%llu\n", bt->end_lba);
out_unlock_bdev:
- mutex_unlock(&q->debugfs_mutex);
+ blk_debugfs_unlock_nomemrestore(q);
return ret;
}
@@ -1800,6 +2075,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev_get_queue(bdev);
struct blk_trace *bt;
+ unsigned int memflags;
u64 value;
ssize_t ret = -EINVAL;
@@ -1819,7 +2095,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
goto out;
}
- mutex_lock(&q->debugfs_mutex);
+ memflags = blk_debugfs_lock(q);
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex));
@@ -1854,7 +2130,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
out_unlock_bdev:
- mutex_unlock(&q->debugfs_mutex);
+ blk_debugfs_unlock(q, memflags);
out:
return ret ? ret : count;
}
@@ -1896,6 +2172,33 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
case REQ_OP_READ:
rwbs[i++] = 'R';
break;
+ case REQ_OP_ZONE_APPEND:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'A';
+ break;
+ case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_RESET_ALL:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'R';
+ if ((opf & REQ_OP_MASK) == REQ_OP_ZONE_RESET_ALL)
+ rwbs[i++] = 'A';
+ break;
+ case REQ_OP_ZONE_FINISH:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'F';
+ break;
+ case REQ_OP_ZONE_OPEN:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'O';
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'C';
+ break;
+ case REQ_OP_WRITE_ZEROES:
+ rwbs[i++] = 'W';
+ rwbs[i++] = 'Z';
+ break;
default:
rwbs[i++] = 'N';
}
@@ -1908,6 +2211,10 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
rwbs[i++] = 'S';
if (opf & REQ_META)
rwbs[i++] = 'M';
+ if (opf & REQ_ATOMIC)
+ rwbs[i++] = 'U';
+
+ WARN_ON_ONCE(i >= RWBS_LEN);
rwbs[i] = '\0';
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1b8db5aee9d3..a02bd258677e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -22,7 +22,6 @@
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <linux/key.h>
-#include <linux/verification.h>
#include <linux/namei.h>
#include <net/bpf_sk_storage.h>
@@ -357,17 +356,6 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
.arg3_type = ARG_CONST_SIZE,
};
-static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
-{
- if (!capable(CAP_SYS_ADMIN))
- return NULL;
-
- pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
- current->comm, task_pid_nr(current));
-
- return &bpf_probe_write_user_proto;
-}
-
#define MAX_TRACE_PRINTK_VARARGS 3
#define BPF_TRACE_PRINTK_SIZE 1024
@@ -403,7 +391,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
.arg2_type = ARG_CONST_SIZE,
};
-static void __set_printk_clr_event(void)
+static void __set_printk_clr_event(struct work_struct *work)
{
/*
* This program might be calling bpf_trace_printk,
@@ -416,10 +404,11 @@ static void __set_printk_clr_event(void)
if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
pr_warn_ratelimited("could not enable bpf_trace_printk events");
}
+static DECLARE_WORK(set_printk_work, __set_printk_clr_event);
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
- __set_printk_clr_event();
+ schedule_work(&set_printk_work);
return &bpf_trace_printk_proto;
}
@@ -462,7 +451,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = {
const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
{
- __set_printk_clr_event();
+ schedule_work(&set_printk_work);
return &bpf_trace_vprintk_proto;
}
@@ -582,7 +571,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
return value;
}
-static const struct bpf_func_proto bpf_perf_event_read_proto = {
+const struct bpf_func_proto bpf_perf_event_read_proto = {
.func = bpf_perf_event_read,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -617,9 +606,15 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
.arg4_type = ARG_CONST_SIZE,
};
+const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void)
+{
+ return &bpf_perf_event_read_value_proto;
+}
+
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
- u64 flags, struct perf_sample_data *sd)
+ u64 flags, struct perf_raw_record *raw,
+ struct perf_sample_data *sd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
@@ -644,6 +639,8 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
+ perf_sample_save_raw_data(sd, event, raw);
+
return perf_event_output(event, sd, regs);
}
@@ -687,9 +684,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
}
perf_sample_data_init(sd, 0, 0);
- perf_sample_save_raw_data(sd, &raw);
- err = __bpf_perf_event_output(regs, map, flags, sd);
+ err = __bpf_perf_event_output(regs, map, flags, &raw, sd);
out:
this_cpu_dec(bpf_trace_nest_level);
preempt_enable();
@@ -748,9 +744,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
- perf_sample_save_raw_data(sd, &raw);
- ret = __bpf_perf_event_output(regs, map, flags, sd);
+ ret = __bpf_perf_event_output(regs, map, flags, &raw, sd);
out:
this_cpu_dec(bpf_event_output_nest_level);
preempt_enable();
@@ -785,8 +780,7 @@ BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
return (unsigned long) task_pt_regs(task);
}
-BTF_ID_LIST(bpf_task_pt_regs_ids)
-BTF_ID(struct, pt_regs)
+BTF_ID_LIST_SINGLE(bpf_task_pt_regs_ids, struct, pt_regs)
const struct bpf_func_proto bpf_task_pt_regs_proto = {
.func = bpf_task_pt_regs,
@@ -836,7 +830,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc
info.si_code = SI_KERNEL;
info.si_pid = 0;
info.si_uid = 0;
- info.si_value.sival_ptr = (void *)(unsigned long)value;
+ info.si_value.sival_ptr = (void __user __force *)(unsigned long)value;
siginfo = &info;
}
@@ -853,7 +847,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc
if (unlikely(is_global_init(task)))
return -EPERM;
- if (irqs_disabled()) {
+ if (preempt_count() != 0 || irqs_disabled()) {
/* Do an early check on signal validity. Otherwise,
* the error is lost in deferred irq_work.
*/
@@ -886,7 +880,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig)
return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0);
}
-static const struct bpf_func_proto bpf_send_signal_proto = {
+const struct bpf_func_proto bpf_send_signal_proto = {
.func = bpf_send_signal,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -898,14 +892,14 @@ BPF_CALL_1(bpf_send_signal_thread, u32, sig)
return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0);
}
-static const struct bpf_func_proto bpf_send_signal_thread_proto = {
+const struct bpf_func_proto bpf_send_signal_thread_proto = {
.func = bpf_send_signal_thread,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz)
+BPF_CALL_3(bpf_d_path, const struct path *, path, char *, buf, u32, sz)
{
struct path copy;
long len;
@@ -971,7 +965,7 @@ static const struct bpf_func_proto bpf_d_path_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_d_path_btf_ids[0],
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.allowed = bpf_d_path_allowed,
};
@@ -1028,7 +1022,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
.func = bpf_snprintf_btf,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_WRITE,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE,
@@ -1048,27 +1042,14 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
.arg1_type = ARG_PTR_TO_CTX,
};
-#ifdef CONFIG_X86_KERNEL_IBT
-static unsigned long get_entry_ip(unsigned long fentry_ip)
+static inline unsigned long get_entry_ip(unsigned long fentry_ip)
{
- u32 instr;
-
- /* We want to be extra safe in case entry ip is on the page edge,
- * but otherwise we need to avoid get_kernel_nofault()'s overhead.
- */
- if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) {
- if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE)))
- return fentry_ip;
- } else {
- instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE);
- }
- if (is_endbr(instr))
+#ifdef CONFIG_X86_KERNEL_IBT
+ if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE)))
fentry_ip -= ENDBR_INSN_SIZE;
+#endif
return fentry_ip;
}
-#else
-#define get_entry_ip(fentry_ip) fentry_ip
-#endif
BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
{
@@ -1202,7 +1183,7 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
return entry_cnt * br_entry_size;
}
-static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
+const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
.func = bpf_get_branch_snapshot,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -1213,7 +1194,7 @@ static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value)
{
/* This helper call is inlined by verifier. */
- u64 nr_args = ((u64 *)ctx)[-1];
+ u64 nr_args = ((u64 *)ctx)[-1] & 0xFF;
if ((u64) n >= nr_args)
return -EINVAL;
@@ -1233,7 +1214,7 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = {
BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
{
/* This helper call is inlined by verifier. */
- u64 nr_args = ((u64 *)ctx)[-1];
+ u64 nr_args = ((u64 *)ctx)[-1] & 0xFF;
*value = ((u64 *)ctx)[nr_args];
return 0;
@@ -1250,7 +1231,7 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = {
BPF_CALL_1(get_func_arg_cnt, void *, ctx)
{
/* This helper call is inlined by verifier. */
- return ((u64 *)ctx)[-1];
+ return ((u64 *)ctx)[-1] & 0xFF;
}
static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
@@ -1259,245 +1240,14 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-#ifdef CONFIG_KEYS
-__bpf_kfunc_start_defs();
-
-/**
- * bpf_lookup_user_key - lookup a key by its serial
- * @serial: key handle serial number
- * @flags: lookup-specific flags
- *
- * Search a key with a given *serial* and the provided *flags*.
- * If found, increment the reference count of the key by one, and
- * return it in the bpf_key structure.
- *
- * The bpf_key structure must be passed to bpf_key_put() when done
- * with it, so that the key reference count is decremented and the
- * bpf_key structure is freed.
- *
- * Permission checks are deferred to the time the key is used by
- * one of the available key-specific kfuncs.
- *
- * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
- * special keyring (e.g. session keyring), if it doesn't yet exist.
- * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
- * for the key construction, and to retrieve uninstantiated keys (keys
- * without data attached to them).
- *
- * Return: a bpf_key pointer with a valid key pointer if the key is found, a
- * NULL pointer otherwise.
- */
-__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
-{
- key_ref_t key_ref;
- struct bpf_key *bkey;
-
- if (flags & ~KEY_LOOKUP_ALL)
- return NULL;
-
- /*
- * Permission check is deferred until the key is used, as the
- * intent of the caller is unknown here.
- */
- key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
- if (IS_ERR(key_ref))
- return NULL;
-
- bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
- if (!bkey) {
- key_put(key_ref_to_ptr(key_ref));
- return NULL;
- }
-
- bkey->key = key_ref_to_ptr(key_ref);
- bkey->has_ref = true;
-
- return bkey;
-}
-
-/**
- * bpf_lookup_system_key - lookup a key by a system-defined ID
- * @id: key ID
- *
- * Obtain a bpf_key structure with a key pointer set to the passed key ID.
- * The key pointer is marked as invalid, to prevent bpf_key_put() from
- * attempting to decrement the key reference count on that pointer. The key
- * pointer set in such way is currently understood only by
- * verify_pkcs7_signature().
- *
- * Set *id* to one of the values defined in include/linux/verification.h:
- * 0 for the primary keyring (immutable keyring of system keys);
- * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
- * (where keys can be added only if they are vouched for by existing keys
- * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
- * keyring (primarily used by the integrity subsystem to verify a kexec'ed
- * kerned image and, possibly, the initramfs signature).
- *
- * Return: a bpf_key pointer with an invalid key pointer set from the
- * pre-determined ID on success, a NULL pointer otherwise
- */
-__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
-{
- struct bpf_key *bkey;
-
- if (system_keyring_id_check(id) < 0)
- return NULL;
-
- bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
- if (!bkey)
- return NULL;
-
- bkey->key = (struct key *)(unsigned long)id;
- bkey->has_ref = false;
-
- return bkey;
-}
-
-/**
- * bpf_key_put - decrement key reference count if key is valid and free bpf_key
- * @bkey: bpf_key structure
- *
- * Decrement the reference count of the key inside *bkey*, if the pointer
- * is valid, and free *bkey*.
- */
-__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
-{
- if (bkey->has_ref)
- key_put(bkey->key);
-
- kfree(bkey);
-}
-
-#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
-/**
- * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
- * @data_p: data to verify
- * @sig_p: signature of the data
- * @trusted_keyring: keyring with keys trusted for signature verification
- *
- * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
- * with keys in a keyring referenced by *trusted_keyring*.
- *
- * Return: 0 on success, a negative value on error.
- */
-__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
- struct bpf_dynptr *sig_p,
- struct bpf_key *trusted_keyring)
-{
- struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
- struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
- const void *data, *sig;
- u32 data_len, sig_len;
- int ret;
-
- if (trusted_keyring->has_ref) {
- /*
- * Do the permission check deferred in bpf_lookup_user_key().
- * See bpf_lookup_user_key() for more details.
- *
- * A call to key_task_permission() here would be redundant, as
- * it is already done by keyring_search() called by
- * find_asymmetric_key().
- */
- ret = key_validate(trusted_keyring->key);
- if (ret < 0)
- return ret;
- }
-
- data_len = __bpf_dynptr_size(data_ptr);
- data = __bpf_dynptr_data(data_ptr, data_len);
- sig_len = __bpf_dynptr_size(sig_ptr);
- sig = __bpf_dynptr_data(sig_ptr, sig_len);
-
- return verify_pkcs7_signature(data, data_len, sig, sig_len,
- trusted_keyring->key,
- VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
- NULL);
-}
-#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
-
-__bpf_kfunc_end_defs();
-
-BTF_KFUNCS_START(key_sig_kfunc_set)
-BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
-BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
-#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
-BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
-#endif
-BTF_KFUNCS_END(key_sig_kfunc_set)
-
-static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
- .owner = THIS_MODULE,
- .set = &key_sig_kfunc_set,
-};
-
-static int __init bpf_key_sig_kfuncs_init(void)
-{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
- &bpf_key_sig_kfunc_set);
-}
-
-late_initcall(bpf_key_sig_kfuncs_init);
-#endif /* CONFIG_KEYS */
-
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_map_push_elem:
- return &bpf_map_push_elem_proto;
- case BPF_FUNC_map_pop_elem:
- return &bpf_map_pop_elem_proto;
- case BPF_FUNC_map_peek_elem:
- return &bpf_map_peek_elem_proto;
- case BPF_FUNC_map_lookup_percpu_elem:
- return &bpf_map_lookup_percpu_elem_proto;
- case BPF_FUNC_ktime_get_ns:
- return &bpf_ktime_get_ns_proto;
- case BPF_FUNC_ktime_get_boot_ns:
- return &bpf_ktime_get_boot_ns_proto;
- case BPF_FUNC_tail_call:
- return &bpf_tail_call_proto;
- case BPF_FUNC_get_current_task:
- return &bpf_get_current_task_proto;
- case BPF_FUNC_get_current_task_btf:
- return &bpf_get_current_task_btf_proto;
- case BPF_FUNC_task_pt_regs:
- return &bpf_task_pt_regs_proto;
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_current_comm:
- return &bpf_get_current_comm_proto;
- case BPF_FUNC_trace_printk:
- return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
- case BPF_FUNC_get_numa_node_id:
- return &bpf_get_numa_node_id_proto;
- case BPF_FUNC_perf_event_read:
- return &bpf_perf_event_read_proto;
- case BPF_FUNC_get_prandom_u32:
- return &bpf_get_prandom_u32_proto;
- case BPF_FUNC_probe_write_user:
- return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
- NULL : bpf_get_probe_write_proto();
- case BPF_FUNC_probe_read_user:
- return &bpf_probe_read_user_proto;
- case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
- NULL : &bpf_probe_read_kernel_proto;
- case BPF_FUNC_probe_read_user_str:
- return &bpf_probe_read_user_str_proto;
- case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
- NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read:
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
@@ -1506,67 +1256,25 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_str_proto;
#endif
-#ifdef CONFIG_CGROUPS
- case BPF_FUNC_cgrp_storage_get:
- return &bpf_cgrp_storage_get_proto;
- case BPF_FUNC_cgrp_storage_delete:
- return &bpf_cgrp_storage_delete_proto;
- case BPF_FUNC_current_task_under_cgroup:
- return &bpf_current_task_under_cgroup_proto;
-#endif
- case BPF_FUNC_send_signal:
- return &bpf_send_signal_proto;
- case BPF_FUNC_send_signal_thread:
- return &bpf_send_signal_thread_proto;
- case BPF_FUNC_perf_event_read_value:
- return &bpf_perf_event_read_value_proto;
- case BPF_FUNC_ringbuf_output:
- return &bpf_ringbuf_output_proto;
- case BPF_FUNC_ringbuf_reserve:
- return &bpf_ringbuf_reserve_proto;
- case BPF_FUNC_ringbuf_submit:
- return &bpf_ringbuf_submit_proto;
- case BPF_FUNC_ringbuf_discard:
- return &bpf_ringbuf_discard_proto;
- case BPF_FUNC_ringbuf_query:
- return &bpf_ringbuf_query_proto;
- case BPF_FUNC_jiffies64:
- return &bpf_jiffies64_proto;
- case BPF_FUNC_get_task_stack:
- return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
- : &bpf_get_task_stack_proto;
- case BPF_FUNC_copy_from_user:
- return &bpf_copy_from_user_proto;
- case BPF_FUNC_copy_from_user_task:
- return &bpf_copy_from_user_task_proto;
- case BPF_FUNC_snprintf_btf:
- return &bpf_snprintf_btf_proto;
- case BPF_FUNC_per_cpu_ptr:
- return &bpf_per_cpu_ptr_proto;
- case BPF_FUNC_this_cpu_ptr:
- return &bpf_this_cpu_ptr_proto;
- case BPF_FUNC_task_storage_get:
- if (bpf_prog_check_recur(prog))
- return &bpf_task_storage_get_recur_proto;
- return &bpf_task_storage_get_proto;
- case BPF_FUNC_task_storage_delete:
- if (bpf_prog_check_recur(prog))
- return &bpf_task_storage_delete_recur_proto;
- return &bpf_task_storage_delete_proto;
- case BPF_FUNC_for_each_map_elem:
- return &bpf_for_each_map_elem_proto;
- case BPF_FUNC_snprintf:
- return &bpf_snprintf_proto;
case BPF_FUNC_get_func_ip:
return &bpf_get_func_ip_proto_tracing;
- case BPF_FUNC_get_branch_snapshot:
- return &bpf_get_branch_snapshot_proto;
- case BPF_FUNC_find_vma:
- return &bpf_find_vma_proto;
- case BPF_FUNC_trace_vprintk:
- return bpf_get_trace_vprintk_proto();
default:
- return bpf_base_func_proto(func_id, prog);
+ break;
+ }
+
+ func_proto = bpf_base_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ if (!bpf_token_capable(prog->aux->token, CAP_SYS_ADMIN))
+ return NULL;
+
+ switch (func_id) {
+ case BPF_FUNC_probe_write_user:
+ return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
+ NULL : &bpf_probe_write_user_proto;
+ default:
+ return NULL;
}
}
@@ -1578,7 +1286,8 @@ static bool is_kprobe_multi(const struct bpf_prog *prog)
static inline bool is_kprobe_session(const struct bpf_prog *prog)
{
- return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+ return prog->type == BPF_PROG_TYPE_KPROBE &&
+ prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
}
static inline bool is_uprobe_multi(const struct bpf_prog *prog)
@@ -1589,7 +1298,14 @@ static inline bool is_uprobe_multi(const struct bpf_prog *prog)
static inline bool is_uprobe_session(const struct bpf_prog *prog)
{
- return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+ return prog->type == BPF_PROG_TYPE_KPROBE &&
+ prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+}
+
+static inline bool is_trace_fsession(const struct bpf_prog *prog)
+{
+ return prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->expected_attach_type == BPF_TRACE_FSESSION;
}
static const struct bpf_func_proto *
@@ -1630,8 +1346,6 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
{
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
- if (type != BPF_READ)
- return false;
if (off % size != 0)
return false;
/*
@@ -1641,6 +1355,9 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
if (off + size > sizeof(struct pt_regs))
return false;
+ if (type == BPF_WRITE)
+ prog->aux->kprobe_write_ctx = true;
+
return true;
}
@@ -1817,7 +1534,7 @@ static const struct bpf_func_proto bpf_read_branch_records_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg2_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -1861,7 +1578,7 @@ static struct pt_regs *get_bpf_raw_tp_regs(void)
struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
- if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
+ if (nest_level > ARRAY_SIZE(tp_regs->regs)) {
this_cpu_dec(bpf_raw_tp_nest_level);
return ERR_PTR(-EBUSY);
}
@@ -1952,7 +1669,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -2025,11 +1742,17 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_d_path:
return &bpf_d_path_proto;
case BPF_FUNC_get_func_arg:
- return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL;
+ if (bpf_prog_has_trampoline(prog) ||
+ prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ return &bpf_get_func_arg_proto;
+ return NULL;
case BPF_FUNC_get_func_ret:
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
case BPF_FUNC_get_func_arg_cnt:
- return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
+ if (bpf_prog_has_trampoline(prog) ||
+ prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ return &bpf_get_func_arg_cnt_proto;
+ return NULL;
case BPF_FUNC_get_attach_cookie:
if (prog->type == BPF_PROG_TYPE_TRACING &&
prog->expected_attach_type == BPF_TRACE_RAW_TP)
@@ -2242,6 +1965,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
{
struct bpf_prog_array *old_array;
struct bpf_prog_array *new_array;
+ struct bpf_prog *prog = NULL;
int ret;
mutex_lock(&bpf_event_mutex);
@@ -2262,18 +1986,22 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
}
put:
- /*
- * It could be that the bpf_prog is not sleepable (and will be freed
- * via normal RCU), but is called from a point that supports sleepable
- * programs and uses tasks-trace-RCU.
- */
- synchronize_rcu_tasks_trace();
-
- bpf_prog_put(event->prog);
+ prog = event->prog;
event->prog = NULL;
unlock:
mutex_unlock(&bpf_event_mutex);
+
+ if (prog) {
+ /*
+ * It could be that the bpf_prog is not sleepable (and will be freed
+ * via normal RCU), but is called from a point that supports sleepable
+ * programs and uses tasks-trace-RCU.
+ */
+ synchronize_rcu_tasks_trace();
+
+ bpf_prog_put(prog);
+ }
}
int perf_event_query_prog_array(struct perf_event *event, void __user *info)
@@ -2336,10 +2064,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
struct module *mod;
- preempt_disable();
+ guard(rcu)();
mod = __module_address((unsigned long)btp);
module_put(mod);
- preempt_enable();
}
static __always_inline
@@ -2349,8 +2076,8 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
struct bpf_run_ctx *old_run_ctx;
struct bpf_trace_run_ctx run_ctx;
- cant_sleep();
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ rcu_read_lock_dont_migrate();
+ if (unlikely(!bpf_prog_get_recursion_context(prog))) {
bpf_prog_inc_misses_counter(prog);
goto out;
}
@@ -2358,13 +2085,12 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
run_ctx.bpf_cookie = link->cookie;
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
- rcu_read_lock();
(void) bpf_prog_run(prog, args);
- rcu_read_unlock();
bpf_reset_run_ctx(old_run_ctx);
out:
- this_cpu_dec(*(prog->active));
+ bpf_prog_put_recursion_context(prog);
+ rcu_read_unlock_migrate();
}
#define UNPACK(...) __VA_ARGS__
@@ -2517,7 +2243,7 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
switch (op) {
case MODULE_STATE_COMING:
- btm = kzalloc(sizeof(*btm), GFP_KERNEL);
+ btm = kzalloc_obj(*btm);
if (btm) {
btm->module = module;
list_add(&btm->list, &bpf_trace_modules);
@@ -2570,7 +2296,6 @@ struct bpf_kprobe_multi_link {
u32 cnt;
u32 mods_cnt;
struct module **mods;
- u32 flags;
};
struct bpf_kprobe_multi_run_ctx {
@@ -2584,6 +2309,20 @@ struct user_syms {
char *buf;
};
+#ifndef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS
+static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs);
+#define bpf_kprobe_multi_pt_regs_ptr() this_cpu_ptr(&bpf_kprobe_multi_pt_regs)
+#else
+#define bpf_kprobe_multi_pt_regs_ptr() (NULL)
+#endif
+
+static unsigned long ftrace_get_entry_ip(unsigned long fentry_ip)
+{
+ unsigned long ip = ftrace_get_symaddr(fentry_ip);
+
+ return ip ? : fentry_ip;
+}
+
static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt)
{
unsigned long __user usymbol;
@@ -2645,7 +2384,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link)
struct bpf_kprobe_multi_link *kmulti_link;
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
- unregister_fprobe(&kmulti_link->fp);
+ /* Don't wait for RCU GP here. */
+ unregister_fprobe_async(&kmulti_link->fp);
kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt);
}
@@ -2676,7 +2416,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
info->kprobe_multi.count = kmulti_link->cnt;
- info->kprobe_multi.flags = kmulti_link->flags;
+ info->kprobe_multi.flags = kmulti_link->link.flags;
info->kprobe_multi.missed = kmulti_link->fp.nmissed;
if (!uaddrs)
@@ -2710,10 +2450,39 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
return err;
}
+#ifdef CONFIG_PROC_FS
+static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_kprobe_multi_link *kmulti_link;
+ bool has_cookies;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ has_cookies = !!kmulti_link->cookies;
+
+ seq_printf(seq,
+ "kprobe_cnt:\t%u\n"
+ "missed:\t%lu\n",
+ kmulti_link->cnt,
+ kmulti_link->fp.nmissed);
+
+ seq_printf(seq, "%s\t %s\n", "cookie", "func");
+ for (int i = 0; i < kmulti_link->cnt; i++) {
+ seq_printf(seq,
+ "%llu\t %pS\n",
+ has_cookies ? kmulti_link->cookies[i] : 0,
+ (void *)kmulti_link->addrs[i]);
+ }
+}
+#endif
+
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
.release = bpf_kprobe_multi_link_release,
.dealloc_deferred = bpf_kprobe_multi_link_dealloc,
.fill_link_info = bpf_kprobe_multi_link_fill_link_info,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_kprobe_multi_show_fdinfo,
+#endif
};
static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv)
@@ -2776,9 +2545,9 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
return run_ctx->entry_ip;
}
-static int
+static __always_inline int
kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
- unsigned long entry_ip, struct pt_regs *regs,
+ unsigned long entry_ip, struct ftrace_regs *fregs,
bool is_return, void *data)
{
struct bpf_kprobe_multi_run_ctx run_ctx = {
@@ -2790,21 +2559,29 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
.entry_ip = entry_ip,
};
struct bpf_run_ctx *old_run_ctx;
+ struct pt_regs *regs;
int err;
+ /*
+ * graph tracer framework ensures we won't migrate, so there is no need
+ * to use migrate_disable for bpf_prog_run again. The check here just for
+ * __this_cpu_inc_return.
+ */
+ cant_sleep();
+
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
bpf_prog_inc_misses_counter(link->link.prog);
- err = 0;
+ err = 1;
goto out;
}
- migrate_disable();
rcu_read_lock();
+ regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr());
old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
+ ftrace_partial_regs_update(fregs, bpf_kprobe_multi_pt_regs_ptr());
rcu_read_unlock();
- migrate_enable();
out:
__this_cpu_dec(bpf_prog_active);
@@ -2813,26 +2590,28 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
static int
kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *data)
{
struct bpf_kprobe_multi_link *link;
int err;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false, data);
+ err = kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip),
+ fregs, false, data);
return is_kprobe_session(link->link.prog) ? err : 0;
}
static void
kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *data)
{
struct bpf_kprobe_multi_link *link;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true, data);
+ kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip),
+ fregs, true, data);
}
static int symbols_cmp_r(const void *a, const void *b, const void *priv)
@@ -2905,18 +2684,21 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3
u32 i, err = 0;
for (i = 0; i < addrs_cnt; i++) {
+ bool skip_add = false;
struct module *mod;
- preempt_disable();
- mod = __module_address(addrs[i]);
- /* Either no module or we it's already stored */
- if (!mod || has_module(&arr, mod)) {
- preempt_enable();
- continue;
+ scoped_guard(rcu) {
+ mod = __module_address(addrs[i]);
+ /* Either no module or it's already stored */
+ if (!mod || has_module(&arr, mod)) {
+ skip_add = true;
+ break; /* scoped_guard */
+ }
+ if (!try_module_get(mod))
+ err = -EINVAL;
}
- if (!try_module_get(mod))
- err = -EINVAL;
- preempt_enable();
+ if (skip_add)
+ continue;
if (err)
break;
err = add_module(&arr, mod);
@@ -2965,9 +2747,20 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
+ if (attr->link_create.flags)
+ return -EINVAL;
+
if (!is_kprobe_multi(prog))
return -EINVAL;
+ /* kprobe_multi is not allowed to be sleepable. */
+ if (prog->sleepable)
+ return -EINVAL;
+
+ /* Writing to context is not allowed for kprobes. */
+ if (prog->aux->kprobe_write_ctx)
+ return -EINVAL;
+
flags = attr->link_create.kprobe_multi.flags;
if (flags & ~BPF_F_KPROBE_MULTI_RETURN)
return -EINVAL;
@@ -3033,14 +2826,14 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
goto error;
}
- link = kzalloc(sizeof(*link), GFP_KERNEL);
+ link = kzalloc_obj(*link);
if (!link) {
err = -ENOMEM;
goto error;
}
bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI,
- &bpf_kprobe_multi_link_lops, prog);
+ &bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type);
err = bpf_link_prime(&link->link, &link_primer);
if (err)
@@ -3056,7 +2849,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
link->addrs = addrs;
link->cookies = cookies;
link->cnt = cnt;
- link->flags = flags;
+ link->link.flags = flags;
if (cookies) {
/*
@@ -3125,7 +2918,6 @@ struct bpf_uprobe_multi_link {
struct path path;
struct bpf_link link;
u32 cnt;
- u32 flags;
struct bpf_uprobe *uprobes;
struct task_struct *task;
};
@@ -3189,7 +2981,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
info->uprobe_multi.count = umulti_link->cnt;
- info->uprobe_multi.flags = umulti_link->flags;
+ info->uprobe_multi.flags = umulti_link->link.flags;
info->uprobe_multi.pid = umulti_link->task ?
task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
@@ -3234,10 +3026,54 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
return err;
}
+#ifdef CONFIG_PROC_FS
+static void bpf_uprobe_multi_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_uprobe_multi_link *umulti_link;
+ char *p, *buf;
+ pid_t pid;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ return;
+
+ p = d_path(&umulti_link->path, buf, PATH_MAX);
+ if (IS_ERR(p)) {
+ kfree(buf);
+ return;
+ }
+
+ pid = umulti_link->task ?
+ task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
+ seq_printf(seq,
+ "uprobe_cnt:\t%u\n"
+ "pid:\t%u\n"
+ "path:\t%s\n",
+ umulti_link->cnt, pid, p);
+
+ seq_printf(seq, "%s\t %s\t %s\n", "cookie", "offset", "ref_ctr_offset");
+ for (int i = 0; i < umulti_link->cnt; i++) {
+ seq_printf(seq,
+ "%llu\t %#llx\t %#lx\n",
+ umulti_link->uprobes[i].cookie,
+ umulti_link->uprobes[i].offset,
+ umulti_link->uprobes[i].ref_ctr_offset);
+ }
+
+ kfree(buf);
+}
+#endif
+
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
.release = bpf_uprobe_multi_link_release,
.dealloc_deferred = bpf_uprobe_multi_link_dealloc,
.fill_link_info = bpf_uprobe_multi_link_fill_link_info,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_uprobe_multi_show_fdinfo,
+#endif
};
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
@@ -3354,6 +3190,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
+ if (attr->link_create.flags)
+ return -EINVAL;
+
if (!is_uprobe_multi(prog))
return -EINVAL;
@@ -3395,7 +3234,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
}
if (pid) {
+ rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_TGID);
+ rcu_read_unlock();
if (!task) {
err = -ESRCH;
goto error_path_put;
@@ -3404,8 +3245,8 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
err = -ENOMEM;
- link = kzalloc(sizeof(*link), GFP_KERNEL);
- uprobes = kvcalloc(cnt, sizeof(*uprobes), GFP_KERNEL);
+ link = kzalloc_obj(*link);
+ uprobes = kvzalloc_objs(*uprobes, cnt);
if (!uprobes || !link)
goto error_free;
@@ -3444,10 +3285,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
link->uprobes = uprobes;
link->path = path;
link->task = task;
- link->flags = flags;
+ link->link.flags = flags;
bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
- &bpf_uprobe_multi_link_lops, prog);
+ &bpf_uprobe_multi_link_lops, prog, attr->link_create.attach_type);
for (i = 0; i < cnt; i++) {
uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry),
@@ -3496,7 +3337,7 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
__bpf_kfunc_start_defs();
-__bpf_kfunc bool bpf_session_is_return(void)
+__bpf_kfunc bool bpf_session_is_return(void *ctx)
{
struct bpf_session_run_ctx *session_ctx;
@@ -3504,7 +3345,7 @@ __bpf_kfunc bool bpf_session_is_return(void)
return session_ctx->is_return;
}
-__bpf_kfunc __u64 *bpf_session_cookie(void)
+__bpf_kfunc __u64 *bpf_session_cookie(void *ctx)
{
struct bpf_session_run_ctx *session_ctx;
@@ -3514,34 +3355,179 @@ __bpf_kfunc __u64 *bpf_session_cookie(void)
__bpf_kfunc_end_defs();
-BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids)
+BTF_KFUNCS_START(session_kfunc_set_ids)
BTF_ID_FLAGS(func, bpf_session_is_return)
BTF_ID_FLAGS(func, bpf_session_cookie)
-BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids)
+BTF_KFUNCS_END(session_kfunc_set_ids)
-static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id)
+static int bpf_session_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
- if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id))
+ if (!btf_id_set8_contains(&session_kfunc_set_ids, kfunc_id))
return 0;
- if (!is_kprobe_session(prog) && !is_uprobe_session(prog))
+ if (!is_kprobe_session(prog) && !is_uprobe_session(prog) && !is_trace_fsession(prog))
return -EACCES;
return 0;
}
-static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = {
+static const struct btf_kfunc_id_set bpf_session_kfunc_set = {
.owner = THIS_MODULE,
- .set = &kprobe_multi_kfunc_set_ids,
- .filter = bpf_kprobe_multi_filter,
+ .set = &session_kfunc_set_ids,
+ .filter = bpf_session_filter,
};
-static int __init bpf_kprobe_multi_kfuncs_init(void)
+static int __init bpf_trace_kfuncs_init(void)
{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set);
+ int err = 0;
+
+ err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_session_kfunc_set);
+ err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_session_kfunc_set);
+
+ return err;
}
-late_initcall(bpf_kprobe_multi_kfuncs_init);
+late_initcall(bpf_trace_kfuncs_init);
+
+typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk);
+
+/*
+ * The __always_inline is to make sure the compiler doesn't
+ * generate indirect calls into callbacks, which is expensive,
+ * on some kernel configurations. This allows compiler to put
+ * direct calls into all the specific callback implementations
+ * (copy_user_data_sleepable, copy_user_data_nofault, and so on)
+ */
+static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size,
+ const void *unsafe_src,
+ copy_fn_t str_copy_fn,
+ struct task_struct *tsk)
+{
+ struct bpf_dynptr_kern *dst;
+ u64 chunk_sz, off;
+ void *dst_slice;
+ int cnt, err;
+ char buf[256];
+
+ dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
+ if (likely(dst_slice))
+ return str_copy_fn(dst_slice, unsafe_src, size, tsk);
+
+ dst = (struct bpf_dynptr_kern *)dptr;
+ if (bpf_dynptr_check_off_len(dst, doff, size))
+ return -E2BIG;
+
+ for (off = 0; off < size; off += chunk_sz - 1) {
+ chunk_sz = min_t(u64, sizeof(buf), size - off);
+ /* Expect str_copy_fn to return count of copied bytes, including
+ * zero terminator. Next iteration increment off by chunk_sz - 1 to
+ * overwrite NUL.
+ */
+ cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
+ if (cnt < 0)
+ return cnt;
+ err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0);
+ if (err)
+ return err;
+ if (cnt < chunk_sz || chunk_sz == 1) /* we are done */
+ return off + cnt;
+ }
+ return off;
+}
+
+static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff,
+ u64 size, const void *unsafe_src,
+ copy_fn_t copy_fn, struct task_struct *tsk)
+{
+ struct bpf_dynptr_kern *dst;
+ void *dst_slice;
+ char buf[256];
+ u64 off, chunk_sz;
+ int err;
+
+ dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
+ if (likely(dst_slice))
+ return copy_fn(dst_slice, unsafe_src, size, tsk);
+
+ dst = (struct bpf_dynptr_kern *)dptr;
+ if (bpf_dynptr_check_off_len(dst, doff, size))
+ return -E2BIG;
+
+ for (off = 0; off < size; off += chunk_sz) {
+ chunk_sz = min_t(u64, sizeof(buf), size - off);
+ err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
+ if (err)
+ return err;
+ err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size);
+}
+
+static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ int ret;
+
+ if (!tsk) { /* Read from the current task */
+ ret = copy_from_user(dst, (const void __user *)unsafe_src, size);
+ if (ret)
+ return -EFAULT;
+ return 0;
+ }
+
+ ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0);
+ if (ret != size)
+ return -EFAULT;
+ return 0;
+}
+
+static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return copy_from_kernel_nofault(dst, unsafe_src, size);
+}
+
+static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size);
+}
+
+static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ int ret;
+
+ if (unlikely(size == 0))
+ return 0;
+
+ if (tsk) {
+ ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0);
+ } else {
+ ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1);
+ /* strncpy_from_user does not guarantee NUL termination */
+ if (ret >= 0)
+ ((char *)dst)[ret] = '\0';
+ }
+
+ if (ret < 0)
+ return ret;
+ return ret + 1;
+}
+
+static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return strncpy_from_kernel_nofault(dst, unsafe_src, size);
+}
__bpf_kfunc_start_defs();
@@ -3554,4 +3540,62 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
return bpf_send_signal_common(sig, type, task, value);
}
+__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
+ copy_user_data_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign,
+ copy_kernel_data_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
+ copy_user_str_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign,
+ copy_kernel_str_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
+ copy_user_data_sleepable, NULL);
+}
+
+__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
+ copy_user_str_sleepable, NULL);
+}
+
+__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign,
+ struct task_struct *tsk)
+{
+ return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
+ copy_user_data_sleepable, tsk);
+}
+
+__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign,
+ struct task_struct *tsk)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
+ copy_user_str_sleepable, tsk);
+}
+
__bpf_kfunc_end_defs();
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 30e3ddc8a8a8..40d373d65f9b 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -163,7 +163,7 @@ enum {
#define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset]))
/*
- * Each fgraph_ops has a reservered unsigned long at the end (top) of the
+ * Each fgraph_ops has a reserved unsigned long at the end (top) of the
* ret_stack to store task specific state.
*/
#define SHADOW_STACK_TASK_VARS(ret_stack) \
@@ -292,13 +292,15 @@ static inline unsigned long make_data_type_val(int idx, int size, int offset)
}
/* ftrace_graph_entry set to this to tell some archs to run function graph */
-static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops)
+static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops,
+ struct ftrace_regs *fregs)
{
return 0;
}
/* ftrace_graph_return set to this to tell some archs to run function graph */
-static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops)
+static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops,
+ struct ftrace_regs *fregs)
{
}
@@ -496,9 +498,6 @@ found:
return get_data_type_data(current, offset);
}
-/* Both enabled by default (can be cleared by function_graph tracer flags */
-bool fgraph_sleep_time = true;
-
#ifdef CONFIG_DYNAMIC_FTRACE
/*
* archs can override this function if they must do something
@@ -520,13 +519,15 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
#endif
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
return 0;
}
static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
}
@@ -538,7 +539,11 @@ static struct fgraph_ops fgraph_stub = {
static struct fgraph_ops *fgraph_direct_gops = &fgraph_stub;
DEFINE_STATIC_CALL(fgraph_func, ftrace_graph_entry_stub);
DEFINE_STATIC_CALL(fgraph_retfunc, ftrace_graph_ret_stub);
+#if FGRAPH_NO_DIRECT
+static DEFINE_STATIC_KEY_FALSE(fgraph_do_direct);
+#else
static DEFINE_STATIC_KEY_TRUE(fgraph_do_direct);
+#endif
/**
* ftrace_graph_stop - set to permanently disable function graph tracing
@@ -644,14 +649,20 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
#endif
/* If the caller does not use ftrace, call this function. */
-int function_graph_enter(unsigned long ret, unsigned long func,
- unsigned long frame_pointer, unsigned long *retp)
+int function_graph_enter_regs(unsigned long ret, unsigned long func,
+ unsigned long frame_pointer, unsigned long *retp,
+ struct ftrace_regs *fregs)
{
struct ftrace_graph_ent trace;
unsigned long bitmap = 0;
int offset;
+ int bit;
int i;
+ bit = ftrace_test_recursion_trylock(func, ret);
+ if (bit < 0)
+ return -EBUSY;
+
trace.func = func;
trace.depth = ++current->curr_ret_depth;
@@ -663,7 +674,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
if (static_branch_likely(&fgraph_do_direct)) {
int save_curr_ret_stack = current->curr_ret_stack;
- if (static_call(fgraph_func)(&trace, fgraph_direct_gops))
+ if (static_call(fgraph_func)(&trace, fgraph_direct_gops, fregs))
bitmap |= BIT(fgraph_direct_gops->idx);
else
/* Clear out any saved storage */
@@ -681,7 +692,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
save_curr_ret_stack = current->curr_ret_stack;
if (ftrace_ops_test(&gops->ops, func, NULL) &&
- gops->entryfunc(&trace, gops))
+ gops->entryfunc(&trace, gops, fregs))
bitmap |= BIT(i);
else
/* Clear out any saved storage */
@@ -697,12 +708,13 @@ int function_graph_enter(unsigned long ret, unsigned long func,
* flag, set that bit always.
*/
set_bitmap(current, offset, bitmap | BIT(0));
-
+ ftrace_test_recursion_unlock(bit);
return 0;
out_ret:
current->curr_ret_stack -= FGRAPH_FRAME_OFFSET + 1;
out:
current->curr_ret_depth--;
+ ftrace_test_recursion_unlock(bit);
return -EBUSY;
}
@@ -792,21 +804,19 @@ static struct notifier_block ftrace_suspend_notifier = {
.notifier_call = ftrace_suspend_notifier_call,
};
-/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */
-struct fgraph_ret_regs;
-
/*
* Send the trace to the ring-buffer.
* @return the original return address.
*/
-static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs,
- unsigned long frame_pointer)
+static inline unsigned long
+__ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointer)
{
struct ftrace_ret_stack *ret_stack;
struct ftrace_graph_ret trace;
unsigned long bitmap;
unsigned long ret;
int offset;
+ int bit;
int i;
ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset);
@@ -818,17 +828,28 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
return (unsigned long)panic;
}
- trace.rettime = trace_clock_local();
+ if (fregs)
+ ftrace_regs_set_instruction_pointer(fregs, ret);
+
+ bit = ftrace_test_recursion_trylock(trace.func, ret);
+ /*
+ * This can fail because ftrace_test_recursion_trylock() allows one nest
+ * call. If we are already in a nested call, then we don't probe this and
+ * just return the original return address.
+ */
+ if (unlikely(bit < 0))
+ goto out;
+
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
- trace.retval = fgraph_ret_regs_return_value(ret_regs);
+ trace.retval = ftrace_regs_get_return_value(fregs);
#endif
bitmap = get_bitmap_bits(current, offset);
#ifdef CONFIG_HAVE_STATIC_CALL
- if (static_branch_likely(&fgraph_do_direct)) {
+ if (!FGRAPH_NO_DIRECT && static_branch_likely(&fgraph_do_direct)) {
if (test_bit(fgraph_direct_gops->idx, &bitmap))
- static_call(fgraph_retfunc)(&trace, fgraph_direct_gops);
+ static_call(fgraph_retfunc)(&trace, fgraph_direct_gops, fregs);
} else
#endif
{
@@ -838,10 +859,12 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
if (gops == &fgraph_stub)
continue;
- gops->retfunc(&trace, gops);
+ gops->retfunc(&trace, gops, fregs);
}
}
+ ftrace_test_recursion_unlock(bit);
+out:
/*
* The ftrace_graph_return() may still access the current
* ret_stack structure, we need to make sure the update of
@@ -855,14 +878,14 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
}
/*
- * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can
- * leave only ftrace_return_to_handler(ret_regs).
+ * After all architectures have selected HAVE_FUNCTION_GRAPH_FREGS, we can
+ * leave only ftrace_return_to_handler(fregs).
*/
-#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL
-unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs)
+#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS
+unsigned long ftrace_return_to_handler(struct ftrace_regs *fregs)
{
- return __ftrace_return_to_handler(ret_regs,
- fgraph_ret_regs_frame_pointer(ret_regs));
+ return __ftrace_return_to_handler(fregs,
+ ftrace_regs_get_frame_pointer(fregs));
}
#else
unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
@@ -997,20 +1020,17 @@ void fgraph_init_ops(struct ftrace_ops *dst_ops,
mutex_init(&dst_ops->local_hash.regex_lock);
INIT_LIST_HEAD(&dst_ops->subop_list);
dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED;
+ dst_ops->private = src_ops->private;
}
#endif
}
-void ftrace_graph_sleep_time_control(bool enable)
-{
- fgraph_sleep_time = enable;
-}
-
/*
* Simply points to ftrace_stub, but with the proper protocol.
* Defined by the linker script in linux/vmlinux.lds.h
*/
-void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops);
+void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
/* The callbacks that hook a function */
trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph;
@@ -1075,7 +1095,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
* Does the user want to count the time a function was asleep.
* If so, do not update the time stamps.
*/
- if (fgraph_sleep_time)
+ if (!fgraph_no_sleep_time)
return;
timestamp = trace_clock_local();
@@ -1174,7 +1194,8 @@ void ftrace_graph_exit_task(struct task_struct *t)
#ifdef CONFIG_DYNAMIC_FTRACE
static int fgraph_pid_func(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = gops->ops.private;
int pid;
@@ -1188,7 +1209,7 @@ static int fgraph_pid_func(struct ftrace_graph_ent *trace,
return 0;
}
- return gops->saved_func(trace, gops);
+ return gops->saved_func(trace, gops, fregs);
}
void fgraph_update_pid_func(void)
@@ -1268,6 +1289,9 @@ static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *go
trace_func_graph_ret_t retfunc = NULL;
int i;
+ if (FGRAPH_NO_DIRECT)
+ return;
+
if (gops) {
func = gops->entryfunc;
retfunc = gops->retfunc;
@@ -1286,11 +1310,14 @@ static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *go
static_call_update(fgraph_func, func);
static_call_update(fgraph_retfunc, retfunc);
if (enable_branch)
- static_branch_disable(&fgraph_do_direct);
+ static_branch_enable(&fgraph_do_direct);
}
static void ftrace_graph_disable_direct(bool disable_branch)
{
+ if (FGRAPH_NO_DIRECT)
+ return;
+
if (disable_branch)
static_branch_disable(&fgraph_do_direct);
static_call_update(fgraph_func, ftrace_graph_entry_stub);
@@ -1313,6 +1340,10 @@ int register_ftrace_graph(struct fgraph_ops *gops)
int ret = 0;
int i = -1;
+ if (WARN_ONCE(gops->ops.flags & FTRACE_OPS_FL_GRAPH,
+ "function graph ops registered again"))
+ return -EBUSY;
+
guard(mutex)(&ftrace_lock);
if (!fgraph_stack_cachep) {
@@ -1348,6 +1379,13 @@ int register_ftrace_graph(struct fgraph_ops *gops)
ftrace_graph_active++;
+ /* Always save the function, and reset at unregistering */
+ gops->saved_func = gops->entryfunc;
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (ftrace_pids_enabled(&gops->ops))
+ gops->entryfunc = fgraph_pid_func;
+#endif
+
if (ftrace_graph_active == 2)
ftrace_graph_disable_direct(true);
@@ -1367,8 +1405,8 @@ int register_ftrace_graph(struct fgraph_ops *gops)
} else {
init_task_vars(gops->idx);
}
- /* Always save the function, and reset at unregistering */
- gops->saved_func = gops->entryfunc;
+
+ gops->ops.flags |= FTRACE_OPS_FL_GRAPH;
ret = ftrace_startup_subops(&graph_ops, &gops->ops, command);
if (!ret)
@@ -1379,6 +1417,8 @@ error:
ftrace_graph_active--;
gops->saved_func = NULL;
fgraph_lru_release_index(i);
+ if (!ftrace_graph_active)
+ unregister_pm_notifier(&ftrace_suspend_notifier);
}
return ret;
}
@@ -1387,17 +1427,21 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
{
int command = 0;
+ if (WARN_ONCE(!(gops->ops.flags & FTRACE_OPS_FL_GRAPH),
+ "function graph ops unregistered without registering"))
+ return;
+
guard(mutex)(&ftrace_lock);
if (unlikely(!ftrace_graph_active))
- return;
+ goto out;
if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE ||
fgraph_array[gops->idx] != gops))
- return;
+ goto out;
if (fgraph_lru_release_index(gops->idx) < 0)
- return;
+ goto out;
fgraph_array[gops->idx] = &fgraph_stub;
@@ -1420,4 +1464,6 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
}
gops->saved_func = NULL;
+ out:
+ gops->ops.flags &= ~FTRACE_OPS_FL_GRAPH;
}
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 9ff018245840..f378613ad120 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -4,102 +4,235 @@
*/
#define pr_fmt(fmt) "fprobe: " fmt
+#include <linux/cleanup.h>
#include <linux/err.h>
#include <linux/fprobe.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
-#include <linux/rethook.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rhashtable.h>
#include <linux/slab.h>
#include <linux/sort.h>
+#include <asm/fprobe.h>
+
#include "trace.h"
-struct fprobe_rethook_node {
- struct rethook_node node;
- unsigned long entry_ip;
- unsigned long entry_parent_ip;
- char data[];
+#define FPROBE_IP_HASH_BITS 8
+#define FPROBE_IP_TABLE_SIZE (1 << FPROBE_IP_HASH_BITS)
+
+#define FPROBE_HASH_BITS 6
+#define FPROBE_TABLE_SIZE (1 << FPROBE_HASH_BITS)
+
+#define SIZE_IN_LONG(x) ((x + sizeof(long) - 1) >> (sizeof(long) == 8 ? 3 : 2))
+
+/*
+ * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still
+ * exists. The key is the address of fprobe instance.
+ * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe
+ * instance related to the function address. The key is the ftrace IP
+ * address.
+ *
+ * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp
+ * are set NULL and delete those from both hash tables (by hlist_del_rcu).
+ * After an RCU grace period, the fprobe_hlist itself will be released.
+ *
+ * fprobe_table and fprobe_ip_table can be accessed from either
+ * - Normal hlist traversal and RCU add/del under 'fprobe_mutex' is held.
+ * - RCU hlist traversal under disabling preempt
+ */
+static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE];
+static struct rhltable fprobe_ip_table;
+static DEFINE_MUTEX(fprobe_mutex);
+static struct fgraph_ops fprobe_graph_ops;
+
+static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed)
+{
+ return hash_ptr(*(unsigned long **)data, 32);
+}
+
+static int fprobe_node_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ unsigned long key = *(unsigned long *)arg->key;
+ const struct fprobe_hlist_node *n = ptr;
+
+ return n->addr != key;
+}
+
+static u32 fprobe_node_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+ const struct fprobe_hlist_node *n = data;
+
+ return hash_ptr((void *)n->addr, 32);
+}
+
+static const struct rhashtable_params fprobe_rht_params = {
+ .head_offset = offsetof(struct fprobe_hlist_node, hlist),
+ .key_offset = offsetof(struct fprobe_hlist_node, addr),
+ .key_len = sizeof_field(struct fprobe_hlist_node, addr),
+ .hashfn = fprobe_node_hashfn,
+ .obj_hashfn = fprobe_node_obj_hashfn,
+ .obj_cmpfn = fprobe_node_cmp,
+ .automatic_shrinking = true,
};
-static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
+/* Node insertion and deletion requires the fprobe_mutex */
+static int __insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
{
- struct fprobe_rethook_node *fpr;
- struct rethook_node *rh = NULL;
- struct fprobe *fp;
- void *entry_data = NULL;
- int ret = 0;
+ int ret;
- fp = container_of(ops, struct fprobe, ops);
+ lockdep_assert_held(&fprobe_mutex);
- if (fp->exit_handler) {
- rh = rethook_try_get(fp->rethook);
- if (!rh) {
- fp->nmissed++;
- return;
- }
- fpr = container_of(rh, struct fprobe_rethook_node, node);
- fpr->entry_ip = ip;
- fpr->entry_parent_ip = parent_ip;
- if (fp->entry_data_size)
- entry_data = fpr->data;
- }
+ ret = rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params);
+ /* Set the fprobe pointer if insertion was successful. */
+ if (!ret)
+ WRITE_ONCE(node->fp, fp);
+ return ret;
+}
- if (fp->entry_handler)
- ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data);
+static void __delete_fprobe_node(struct fprobe_hlist_node *node)
+{
+ lockdep_assert_held(&fprobe_mutex);
- /* If entry_handler returns !0, nmissed is not counted. */
- if (rh) {
- if (ret)
- rethook_recycle(rh);
- else
- rethook_hook(rh, ftrace_get_regs(fregs), true);
+ /* Avoid double deleting and non-inserted nodes */
+ if (READ_ONCE(node->fp) != NULL) {
+ WRITE_ONCE(node->fp, NULL);
+ rhltable_remove(&fprobe_ip_table, &node->hlist,
+ fprobe_rht_params);
}
}
-static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
+/* Check existence of the fprobe */
+static bool fprobe_registered(struct fprobe *fp)
{
- struct fprobe *fp;
- int bit;
+ struct hlist_head *head;
+ struct fprobe_hlist *fph;
+
+ head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
+ hlist_for_each_entry_rcu(fph, head, hlist,
+ lockdep_is_held(&fprobe_mutex)) {
+ if (fph->fp == fp)
+ return true;
+ }
+ return false;
+}
+NOKPROBE_SYMBOL(fprobe_registered);
- fp = container_of(ops, struct fprobe, ops);
- if (fprobe_disabled(fp))
- return;
+static int add_fprobe_hash(struct fprobe *fp)
+{
+ struct fprobe_hlist *fph = fp->hlist_array;
+ struct hlist_head *head;
- /* recursion detection has to go before any traceable function and
- * all functions before this point should be marked as notrace
- */
- bit = ftrace_test_recursion_trylock(ip, parent_ip);
- if (bit < 0) {
- fp->nmissed++;
- return;
- }
- __fprobe_handler(ip, parent_ip, ops, fregs);
- ftrace_test_recursion_unlock(bit);
+ lockdep_assert_held(&fprobe_mutex);
+
+ if (WARN_ON_ONCE(!fph))
+ return -EINVAL;
+
+ head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
+ hlist_add_head_rcu(&fp->hlist_array->hlist, head);
+ return 0;
+}
+
+static int del_fprobe_hash(struct fprobe *fp)
+{
+ struct fprobe_hlist *fph = fp->hlist_array;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ if (WARN_ON_ONCE(!fph))
+ return -EINVAL;
+
+ if (!fprobe_registered(fp))
+ return -ENOENT;
+
+ fph->fp = NULL;
+ hlist_del_rcu(&fph->hlist);
+ return 0;
+}
+
+#ifdef ARCH_DEFINE_ENCODE_FPROBE_HEADER
+
+/* The arch should encode fprobe_header info into one unsigned long */
+#define FPROBE_HEADER_SIZE_IN_LONG 1
+
+static inline bool write_fprobe_header(unsigned long *stack,
+ struct fprobe *fp, unsigned int size_words)
+{
+ if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD ||
+ !arch_fprobe_header_encodable(fp)))
+ return false;
+ *stack = arch_encode_fprobe_header(fp, size_words);
+ return true;
}
-NOKPROBE_SYMBOL(fprobe_handler);
-static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
+static inline void read_fprobe_header(unsigned long *stack,
+ struct fprobe **fp, unsigned int *size_words)
{
+ *fp = arch_decode_fprobe_header_fp(*stack);
+ *size_words = arch_decode_fprobe_header_size(*stack);
+}
+
+#else
+
+/* Generic fprobe_header */
+struct __fprobe_header {
struct fprobe *fp;
- int bit;
+ unsigned long size_words;
+} __packed;
- fp = container_of(ops, struct fprobe, ops);
- if (fprobe_disabled(fp))
- return;
+#define FPROBE_HEADER_SIZE_IN_LONG SIZE_IN_LONG(sizeof(struct __fprobe_header))
- /* recursion detection has to go before any traceable function and
- * all functions called before this point should be marked as notrace
- */
- bit = ftrace_test_recursion_trylock(ip, parent_ip);
- if (bit < 0) {
- fp->nmissed++;
- return;
- }
+static inline bool write_fprobe_header(unsigned long *stack,
+ struct fprobe *fp, unsigned int size_words)
+{
+ struct __fprobe_header *fph = (struct __fprobe_header *)stack;
+
+ if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD))
+ return false;
+
+ fph->fp = fp;
+ fph->size_words = size_words;
+ return true;
+}
+static inline void read_fprobe_header(unsigned long *stack,
+ struct fprobe **fp, unsigned int *size_words)
+{
+ struct __fprobe_header *fph = (struct __fprobe_header *)stack;
+
+ *fp = fph->fp;
+ *size_words = fph->size_words;
+}
+
+#endif
+
+/*
+ * fprobe shadow stack management:
+ * Since fprobe shares a single fgraph_ops, it needs to share the stack entry
+ * among the probes on the same function exit. Note that a new probe can be
+ * registered before a target function is returning, we can not use the hash
+ * table to find the corresponding probes. Thus the probe address is stored on
+ * the shadow stack with its entry data size.
+ *
+ */
+static inline int __fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct fprobe *fp, struct ftrace_regs *fregs,
+ void *data)
+{
+ if (!fp->entry_handler)
+ return 0;
+
+ return fp->entry_handler(fp, ip, parent_ip, fregs, data);
+}
+
+static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct fprobe *fp, struct ftrace_regs *fregs,
+ void *data)
+{
+ int ret;
/*
* This user handler is shared with other kprobes and is not expected to be
* called recursively. So if any other kprobe handler is running, this will
@@ -108,44 +241,541 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
*/
if (unlikely(kprobe_running())) {
fp->nmissed++;
- goto recursion_unlock;
+ return 0;
}
kprobe_busy_begin();
- __fprobe_handler(ip, parent_ip, ops, fregs);
+ ret = __fprobe_handler(ip, parent_ip, fp, fregs, data);
kprobe_busy_end();
+ return ret;
+}
-recursion_unlock:
- ftrace_test_recursion_unlock(bit);
+static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
+static void fprobe_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
+
+static struct fgraph_ops fprobe_graph_ops = {
+ .entryfunc = fprobe_fgraph_entry,
+ .retfunc = fprobe_return,
+};
+/* Number of fgraph fprobe nodes */
+static int nr_fgraph_fprobes;
+/* Is fprobe_graph_ops registered? */
+static bool fprobe_graph_registered;
+
+/* Add @addrs to the ftrace filter and register fgraph if needed. */
+static int fprobe_graph_add_ips(unsigned long *addrs, int num)
+{
+ int ret;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0);
+ if (ret)
+ return ret;
+
+ if (!fprobe_graph_registered) {
+ ret = register_ftrace_graph(&fprobe_graph_ops);
+ if (WARN_ON_ONCE(ret)) {
+ ftrace_free_filter(&fprobe_graph_ops.ops);
+ return ret;
+ }
+ fprobe_graph_registered = true;
+ }
+ return 0;
+}
+
+static void __fprobe_graph_unregister(void)
+{
+ if (fprobe_graph_registered) {
+ unregister_ftrace_graph(&fprobe_graph_ops);
+ ftrace_free_filter(&fprobe_graph_ops.ops);
+ fprobe_graph_registered = false;
+ }
+}
+
+/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */
+static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
+{
+ lockdep_assert_held(&fprobe_mutex);
+
+ if (!nr_fgraph_fprobes)
+ __fprobe_graph_unregister();
+ else if (num)
+ ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
}
-static void fprobe_exit_handler(struct rethook_node *rh, void *data,
- unsigned long ret_ip, struct pt_regs *regs)
+#if defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) || defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS)
+
+/* ftrace_ops callback, this processes fprobes which have only entry_handler. */
+static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
- struct fprobe *fp = (struct fprobe *)data;
- struct fprobe_rethook_node *fpr;
+ struct fprobe_hlist_node *node;
+ struct rhlist_head *head, *pos;
+ struct fprobe *fp;
int bit;
- if (!fp || fprobe_disabled(fp))
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
return;
- fpr = container_of(rh, struct fprobe_rethook_node, node);
+ /*
+ * ftrace_test_recursion_trylock() disables preemption, but
+ * rhltable_lookup() checks whether rcu_read_lcok is held.
+ * So we take rcu_read_lock() here.
+ */
+ rcu_read_lock();
+ head = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params);
+
+ rhl_for_each_entry_rcu(node, pos, head, hlist) {
+ if (node->addr != ip)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (unlikely(!fp || fprobe_disabled(fp) || fp->exit_handler))
+ continue;
+
+ if (fprobe_shared_with_kprobes(fp))
+ __fprobe_kprobe_handler(ip, parent_ip, fp, fregs, NULL);
+ else
+ __fprobe_handler(ip, parent_ip, fp, fregs, NULL);
+ }
+ rcu_read_unlock();
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(fprobe_ftrace_entry);
+
+static struct ftrace_ops fprobe_ftrace_ops = {
+ .func = fprobe_ftrace_entry,
+ .flags = FTRACE_OPS_FL_SAVE_ARGS,
+};
+/* Number of ftrace fprobe nodes */
+static int nr_ftrace_fprobes;
+/* Is fprobe_ftrace_ops registered? */
+static bool fprobe_ftrace_registered;
+
+static int fprobe_ftrace_add_ips(unsigned long *addrs, int num)
+{
+ int ret;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ ret = ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 0, 0);
+ if (ret)
+ return ret;
+
+ if (!fprobe_ftrace_registered) {
+ ret = register_ftrace_function(&fprobe_ftrace_ops);
+ if (ret) {
+ ftrace_free_filter(&fprobe_ftrace_ops);
+ return ret;
+ }
+ fprobe_ftrace_registered = true;
+ }
+ return 0;
+}
+
+static void __fprobe_ftrace_unregister(void)
+{
+ if (fprobe_ftrace_registered) {
+ unregister_ftrace_function(&fprobe_ftrace_ops);
+ ftrace_free_filter(&fprobe_ftrace_ops);
+ fprobe_ftrace_registered = false;
+ }
+}
+
+static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num)
+{
+ lockdep_assert_held(&fprobe_mutex);
+
+ if (!nr_ftrace_fprobes)
+ __fprobe_ftrace_unregister();
+ else if (num)
+ ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0);
+}
+
+static bool fprobe_is_ftrace(struct fprobe *fp)
+{
+ return !fp->exit_handler;
+}
+
+/* Node insertion and deletion requires the fprobe_mutex */
+static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
+{
+ int ret;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ ret = __insert_fprobe_node(node, fp);
+ if (!ret) {
+ if (fprobe_is_ftrace(fp))
+ nr_ftrace_fprobes++;
+ else
+ nr_fgraph_fprobes++;
+ }
+
+ return ret;
+}
+
+static void delete_fprobe_node(struct fprobe_hlist_node *node)
+{
+ struct fprobe *fp;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ fp = READ_ONCE(node->fp);
+ if (fp) {
+ if (fprobe_is_ftrace(fp))
+ nr_ftrace_fprobes--;
+ else
+ nr_fgraph_fprobes--;
+ }
+ __delete_fprobe_node(node);
+}
+
+static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace)
+{
+ struct rhlist_head *head, *pos;
+ struct fprobe_hlist_node *node;
+ struct fprobe *fp;
+
+ guard(rcu)();
+ head = rhltable_lookup(&fprobe_ip_table, &ip,
+ fprobe_rht_params);
+ if (!head)
+ return false;
+ /* We have to check the same type on the list. */
+ rhl_for_each_entry_rcu(node, pos, head, hlist) {
+ if (node->addr != ip)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (likely(fp)) {
+ if ((!ftrace && fp->exit_handler) ||
+ (ftrace && !fp->exit_handler))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+#ifdef CONFIG_MODULES
+static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
+{
+ if (!nr_fgraph_fprobes)
+ __fprobe_graph_unregister();
+ else if (cnt)
+ ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
+
+ if (!nr_ftrace_fprobes)
+ __fprobe_ftrace_unregister();
+ else if (cnt)
+ ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, 1, 0);
+}
+#endif
+#else
+static int fprobe_ftrace_add_ips(unsigned long *addrs, int num)
+{
+ return -ENOENT;
+}
+
+static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num)
+{
+}
+
+static bool fprobe_is_ftrace(struct fprobe *fp)
+{
+ return false;
+}
+
+/* Node insertion and deletion requires the fprobe_mutex */
+static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
+{
+ int ret;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ ret = __insert_fprobe_node(node, fp);
+ if (!ret)
+ nr_fgraph_fprobes++;
+
+ return ret;
+}
+
+static void delete_fprobe_node(struct fprobe_hlist_node *node)
+{
+ struct fprobe *fp;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ fp = READ_ONCE(node->fp);
+ if (fp)
+ nr_fgraph_fprobes--;
+ __delete_fprobe_node(node);
+}
+
+static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace __maybe_unused)
+{
+ struct rhlist_head *head, *pos;
+ struct fprobe_hlist_node *node;
+ struct fprobe *fp;
+
+ guard(rcu)();
+ head = rhltable_lookup(&fprobe_ip_table, &ip,
+ fprobe_rht_params);
+ if (!head)
+ return false;
+ /* We only need to check fp is there. */
+ rhl_for_each_entry_rcu(node, pos, head, hlist) {
+ if (node->addr != ip)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (likely(fp))
+ return true;
+ }
+
+ return false;
+}
+
+#ifdef CONFIG_MODULES
+static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
+{
+ if (!nr_fgraph_fprobes)
+ __fprobe_graph_unregister();
+ else if (cnt)
+ ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
+}
+#endif
+#endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */
+
+/* fgraph_ops callback, this processes fprobes which have exit_handler. */
+static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ unsigned long *fgraph_data = NULL;
+ unsigned long func = trace->func;
+ struct fprobe_hlist_node *node;
+ struct rhlist_head *head, *pos;
+ unsigned long ret_ip;
+ int reserved_words;
+ struct fprobe *fp;
+ int used, ret;
+
+ if (WARN_ON_ONCE(!fregs))
+ return 0;
+
+ guard(rcu)();
+ head = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params);
+ reserved_words = 0;
+ rhl_for_each_entry_rcu(node, pos, head, hlist) {
+ if (node->addr != func)
+ continue;
+ fp = READ_ONCE(node->fp);
+ if (!fp || !fp->exit_handler)
+ continue;
+ /*
+ * Since fprobe can be enabled until the next loop, we ignore the
+ * fprobe's disabled flag in this loop.
+ */
+ reserved_words +=
+ FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size);
+ }
+ if (reserved_words) {
+ fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
+ if (unlikely(!fgraph_data)) {
+ rhl_for_each_entry_rcu(node, pos, head, hlist) {
+ if (node->addr != func)
+ continue;
+ fp = READ_ONCE(node->fp);
+ if (fp && !fprobe_disabled(fp) && !fprobe_is_ftrace(fp))
+ fp->nmissed++;
+ }
+ return 0;
+ }
+ }
/*
- * we need to assure no calls to traceable functions in-between the
- * end of fprobe_handler and the beginning of fprobe_exit_handler.
+ * TODO: recursion detection has been done in the fgraph. Thus we need
+ * to add a callback to increment missed counter.
*/
- bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
- if (bit < 0) {
- fp->nmissed++;
+ ret_ip = ftrace_regs_get_return_address(fregs);
+ used = 0;
+ rhl_for_each_entry_rcu(node, pos, head, hlist) {
+ int data_size;
+ void *data;
+
+ if (node->addr != func)
+ continue;
+ fp = READ_ONCE(node->fp);
+ if (unlikely(!fp || fprobe_disabled(fp) || fprobe_is_ftrace(fp)))
+ continue;
+
+ data_size = fp->entry_data_size;
+ if (data_size && fp->exit_handler)
+ data = fgraph_data + used + FPROBE_HEADER_SIZE_IN_LONG;
+ else
+ data = NULL;
+
+ if (fprobe_shared_with_kprobes(fp))
+ ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data);
+ else
+ ret = __fprobe_handler(func, ret_ip, fp, fregs, data);
+
+ /* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */
+ if (!ret && fp->exit_handler) {
+ int size_words = SIZE_IN_LONG(data_size);
+
+ if (write_fprobe_header(&fgraph_data[used], fp, size_words))
+ used += FPROBE_HEADER_SIZE_IN_LONG + size_words;
+ }
+ }
+
+ /* If any exit_handler is set, data must be used. */
+ return used != 0;
+}
+NOKPROBE_SYMBOL(fprobe_fgraph_entry);
+
+static void fprobe_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ unsigned long *fgraph_data = NULL;
+ unsigned long ret_ip;
+ struct fprobe *fp;
+ int size, curr;
+ int size_words;
+
+ fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size);
+ if (WARN_ON_ONCE(!fgraph_data))
return;
+ size_words = SIZE_IN_LONG(size);
+ ret_ip = ftrace_regs_get_instruction_pointer(fregs);
+
+ preempt_disable_notrace();
+
+ curr = 0;
+ while (size_words > curr) {
+ read_fprobe_header(&fgraph_data[curr], &fp, &size);
+ if (!fp)
+ break;
+ curr += FPROBE_HEADER_SIZE_IN_LONG;
+ if (fprobe_registered(fp) && !fprobe_disabled(fp)) {
+ if (WARN_ON_ONCE(curr + size > size_words))
+ break;
+ fp->exit_handler(fp, trace->func, ret_ip, fregs,
+ size ? fgraph_data + curr : NULL);
+ }
+ curr += size;
}
+ preempt_enable_notrace();
+}
+NOKPROBE_SYMBOL(fprobe_return);
- fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs,
- fp->entry_data_size ? (void *)fpr->data : NULL);
- ftrace_test_recursion_unlock(bit);
+#ifdef CONFIG_MODULES
+
+#define FPROBE_IPS_BATCH_INIT 128
+/* instruction pointer address list */
+struct fprobe_addr_list {
+ int index;
+ int size;
+ unsigned long *addrs;
+};
+
+static int fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
+ struct fprobe_addr_list *alist)
+{
+ lockdep_assert_in_rcu_read_lock();
+
+ if (!within_module(node->addr, mod))
+ return 0;
+
+ delete_fprobe_node(node);
+ /* If no address list is available, we can't track this address. */
+ if (!alist->addrs)
+ return 0;
+ /*
+ * Don't care the type here, because all fprobes on the same
+ * address must be removed eventually.
+ */
+ if (!rhltable_lookup(&fprobe_ip_table, &node->addr, fprobe_rht_params)) {
+ alist->addrs[alist->index++] = node->addr;
+ if (alist->index == alist->size)
+ return -ENOSPC;
+ }
+
+ return 0;
}
-NOKPROBE_SYMBOL(fprobe_exit_handler);
+
+/* Handle module unloading to manage fprobe_ip_table. */
+static int fprobe_module_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT};
+ struct fprobe_hlist_node *node;
+ struct rhashtable_iter iter;
+ struct module *mod = data;
+ bool retry;
+
+ if (val != MODULE_STATE_GOING)
+ return NOTIFY_DONE;
+
+ alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL);
+ /*
+ * If failed to alloc memory, ftrace_ops will not be able to remove ips from
+ * hash, but we can still remove nodes from fprobe_ip_table, so we can avoid
+ * the potential wrong callback. So just print a warning here and try to
+ * continue without address list.
+ */
+ WARN_ONCE(!alist.addrs,
+ "Failed to allocate memory for fprobe_addr_list, ftrace_ops will not be updated");
+
+ mutex_lock(&fprobe_mutex);
+again:
+ retry = false;
+ alist.index = 0;
+ rhltable_walk_enter(&fprobe_ip_table, &iter);
+ do {
+ rhashtable_walk_start(&iter);
+
+ while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node))
+ if (fprobe_remove_node_in_module(mod, node, &alist) < 0) {
+ retry = true;
+ break;
+ }
+
+ rhashtable_walk_stop(&iter);
+ } while (node == ERR_PTR(-EAGAIN) && !retry);
+ rhashtable_walk_exit(&iter);
+ /* Remove any ips from hash table(s) */
+ fprobe_remove_ips(alist.addrs, alist.index);
+ /*
+ * If we break rhashtable walk loop except for -EAGAIN, we need
+ * to restart looping from start for safety. Anyway, this is
+ * not a hotpath.
+ */
+ if (retry)
+ goto again;
+
+ mutex_unlock(&fprobe_mutex);
+
+ kfree(alist.addrs);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fprobe_module_nb = {
+ .notifier_call = fprobe_module_callback,
+ .priority = 0,
+};
+
+static int __init init_fprobe_module(void)
+{
+ return register_module_notifier(&fprobe_module_nb);
+}
+early_initcall(init_fprobe_module);
+#endif
static int symbols_cmp(const void *a, const void *b)
{
@@ -175,51 +805,117 @@ static unsigned long *get_ftrace_locations(const char **syms, int num)
return ERR_PTR(-ENOENT);
}
-static void fprobe_init(struct fprobe *fp)
+struct filter_match_data {
+ const char *filter;
+ const char *notfilter;
+ size_t index;
+ size_t size;
+ unsigned long *addrs;
+ struct module **mods;
+};
+
+static int filter_match_callback(void *data, const char *name, unsigned long addr)
{
- fp->nmissed = 0;
- if (fprobe_shared_with_kprobes(fp))
- fp->ops.func = fprobe_kprobe_handler;
- else
- fp->ops.func = fprobe_handler;
- fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
+ struct filter_match_data *match = data;
+
+ if (!glob_match(match->filter, name) ||
+ (match->notfilter && glob_match(match->notfilter, name)))
+ return 0;
+
+ if (!ftrace_location(addr))
+ return 0;
+
+ if (match->addrs) {
+ struct module *mod = __module_text_address(addr);
+
+ if (mod && !try_module_get(mod))
+ return 0;
+
+ match->mods[match->index] = mod;
+ match->addrs[match->index] = addr;
+ }
+ match->index++;
+ return match->index == match->size;
}
-static int fprobe_init_rethook(struct fprobe *fp, int num)
+/*
+ * Make IP list from the filter/no-filter glob patterns.
+ * Return the number of matched symbols, or errno.
+ * If @addrs == NULL, this just counts the number of matched symbols. If @addrs
+ * is passed with an array, we need to pass the an @mods array of the same size
+ * to increment the module refcount for each symbol.
+ * This means we also need to call `module_put` for each element of @mods after
+ * using the @addrs.
+ */
+static int get_ips_from_filter(const char *filter, const char *notfilter,
+ unsigned long *addrs, struct module **mods,
+ size_t size)
{
- int size;
+ struct filter_match_data match = { .filter = filter, .notfilter = notfilter,
+ .index = 0, .size = size, .addrs = addrs, .mods = mods};
+ int ret;
- if (!fp->exit_handler) {
- fp->rethook = NULL;
- return 0;
+ if (addrs && !mods)
+ return -EINVAL;
+
+ ret = kallsyms_on_each_symbol(filter_match_callback, &match);
+ if (ret < 0)
+ return ret;
+ if (IS_ENABLED(CONFIG_MODULES)) {
+ ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match);
+ if (ret < 0)
+ return ret;
}
- /* Initialize rethook if needed */
- if (fp->nr_maxactive)
- num = fp->nr_maxactive;
- else
- num *= num_possible_cpus() * 2;
- if (num <= 0)
+ return match.index ?: -ENOENT;
+}
+
+static void fprobe_fail_cleanup(struct fprobe *fp)
+{
+ kfree(fp->hlist_array);
+ fp->hlist_array = NULL;
+}
+
+/* Initialize the fprobe data structure. */
+static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
+{
+ struct fprobe_hlist *hlist_array;
+ unsigned long addr;
+ int size, i;
+
+ if (!fp || !addrs || num <= 0)
return -EINVAL;
- size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size;
+ size = ALIGN(fp->entry_data_size, sizeof(long));
+ if (size > MAX_FPROBE_DATA_SIZE)
+ return -E2BIG;
+ fp->entry_data_size = size;
- /* Initialize rethook */
- fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num);
- if (IS_ERR(fp->rethook))
- return PTR_ERR(fp->rethook);
+ hlist_array = kzalloc_flex(*hlist_array, array, num);
+ if (!hlist_array)
+ return -ENOMEM;
+ fp->nmissed = 0;
+
+ hlist_array->size = num;
+ fp->hlist_array = hlist_array;
+ hlist_array->fp = fp;
+ for (i = 0; i < num; i++) {
+ addr = ftrace_location(addrs[i]);
+ if (!addr) {
+ fprobe_fail_cleanup(fp);
+ return -ENOENT;
+ }
+ hlist_array->array[i].addr = addr;
+ }
return 0;
}
-static void fprobe_fail_cleanup(struct fprobe *fp)
+#define FPROBE_IPS_MAX INT_MAX
+
+int fprobe_count_ips_from_filter(const char *filter, const char *notfilter)
{
- if (!IS_ERR_OR_NULL(fp->rethook)) {
- /* Don't need to cleanup rethook->handler because this is not used. */
- rethook_free(fp->rethook);
- fp->rethook = NULL;
- }
- ftrace_free_filter(&fp->ops);
+ return get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX);
}
/**
@@ -235,54 +931,45 @@ static void fprobe_fail_cleanup(struct fprobe *fp)
*/
int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
{
- struct ftrace_hash *hash;
- unsigned char *str;
- int ret, len;
+ unsigned long *addrs __free(kfree) = NULL;
+ struct module **mods __free(kfree) = NULL;
+ int ret, num;
if (!fp || !filter)
return -EINVAL;
- fprobe_init(fp);
+ num = get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX);
+ if (num < 0)
+ return num;
- len = strlen(filter);
- str = kstrdup(filter, GFP_KERNEL);
- ret = ftrace_set_filter(&fp->ops, str, len, 0);
- kfree(str);
- if (ret)
- return ret;
+ addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
- if (notfilter) {
- len = strlen(notfilter);
- str = kstrdup(notfilter, GFP_KERNEL);
- ret = ftrace_set_notrace(&fp->ops, str, len, 0);
- kfree(str);
- if (ret)
- goto out;
- }
+ mods = kzalloc_objs(*mods, num);
+ if (!mods)
+ return -ENOMEM;
- /* TODO:
- * correctly calculate the total number of filtered symbols
- * from both filter and notfilter.
- */
- hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
- if (WARN_ON_ONCE(!hash))
- goto out;
+ ret = get_ips_from_filter(filter, notfilter, addrs, mods, num);
+ if (ret < 0)
+ return ret;
- ret = fprobe_init_rethook(fp, (int)hash->count);
- if (!ret)
- ret = register_ftrace_function(&fp->ops);
+ ret = register_fprobe_ips(fp, addrs, ret);
-out:
- if (ret)
- fprobe_fail_cleanup(fp);
+ for (int i = 0; i < num; i++) {
+ if (mods[i])
+ module_put(mods[i]);
+ }
return ret;
}
EXPORT_SYMBOL_GPL(register_fprobe);
+static int unregister_fprobe_nolock(struct fprobe *fp);
+
/**
* register_fprobe_ips() - Register fprobe to ftrace by address.
* @fp: A fprobe data structure to be registered.
- * @addrs: An array of target ftrace location addresses.
+ * @addrs: An array of target function address.
* @num: The number of entries of @addrs.
*
* Register @fp to ftrace for enabling the probe on the address given by @addrs.
@@ -294,23 +981,37 @@ EXPORT_SYMBOL_GPL(register_fprobe);
*/
int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
{
- int ret;
-
- if (!fp || !addrs || num <= 0)
- return -EINVAL;
+ struct fprobe_hlist *hlist_array;
+ int ret, i;
- fprobe_init(fp);
+ guard(mutex)(&fprobe_mutex);
+ if (fprobe_registered(fp))
+ return -EEXIST;
- ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+ ret = fprobe_init(fp, addrs, num);
if (ret)
return ret;
- ret = fprobe_init_rethook(fp, num);
- if (!ret)
- ret = register_ftrace_function(&fp->ops);
-
- if (ret)
+ if (fprobe_is_ftrace(fp))
+ ret = fprobe_ftrace_add_ips(addrs, num);
+ else
+ ret = fprobe_graph_add_ips(addrs, num);
+ if (ret) {
fprobe_fail_cleanup(fp);
+ return ret;
+ }
+
+ hlist_array = fp->hlist_array;
+ ret = add_fprobe_hash(fp);
+ for (i = 0; i < hlist_array->size && !ret; i++)
+ ret = insert_fprobe_node(&hlist_array->array[i], fp);
+
+ if (ret) {
+ unregister_fprobe_nolock(fp);
+ /* In error case, wait for clean up safely. */
+ synchronize_rcu();
+ }
+
return ret;
}
EXPORT_SYMBOL_GPL(register_fprobe_ips);
@@ -348,39 +1049,89 @@ EXPORT_SYMBOL_GPL(register_fprobe_syms);
bool fprobe_is_registered(struct fprobe *fp)
{
- if (!fp || (fp->ops.saved_func != fprobe_handler &&
- fp->ops.saved_func != fprobe_kprobe_handler))
+ if (!fp || !fp->hlist_array)
return false;
return true;
}
+static int unregister_fprobe_nolock(struct fprobe *fp)
+{
+ struct fprobe_hlist *hlist_array = fp->hlist_array;
+ unsigned long *addrs = NULL;
+ int i, count;
+
+ addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
+ /*
+ * This will remove fprobe_hash_node from the hash table even if
+ * memory allocation fails. However, ftrace_ops will not be updated.
+ * Anyway, when the last fprobe is unregistered, ftrace_ops is also
+ * unregistered.
+ */
+ if (!addrs)
+ pr_warn("Failed to allocate working array. ftrace_ops may not sync.\n");
+
+ /* Remove non-synonim ips from table and hash */
+ count = 0;
+ for (i = 0; i < hlist_array->size; i++) {
+ delete_fprobe_node(&hlist_array->array[i]);
+ if (addrs && !fprobe_exists_on_hash(hlist_array->array[i].addr,
+ fprobe_is_ftrace(fp)))
+ addrs[count++] = hlist_array->array[i].addr;
+ }
+ del_fprobe_hash(fp);
+
+ if (fprobe_is_ftrace(fp))
+ fprobe_ftrace_remove_ips(addrs, count);
+ else
+ fprobe_graph_remove_ips(addrs, count);
+
+ kfree_rcu(hlist_array, rcu);
+ fp->hlist_array = NULL;
+ kfree(addrs);
+
+ return 0;
+}
+
/**
- * unregister_fprobe() - Unregister fprobe from ftrace
+ * unregister_fprobe_async() - Unregister fprobe without RCU GP wait
* @fp: A fprobe data structure to be unregistered.
*
* Unregister fprobe (and remove ftrace hooks from the function entries).
+ * This function will NOT wait until the fprobe is no longer used.
*
* Return 0 if @fp is unregistered successfully, -errno if not.
*/
-int unregister_fprobe(struct fprobe *fp)
+int unregister_fprobe_async(struct fprobe *fp)
{
- int ret;
-
- if (!fprobe_is_registered(fp))
+ guard(mutex)(&fprobe_mutex);
+ if (!fp || !fprobe_registered(fp))
return -EINVAL;
- if (!IS_ERR_OR_NULL(fp->rethook))
- rethook_stop(fp->rethook);
-
- ret = unregister_ftrace_function(&fp->ops);
- if (ret < 0)
- return ret;
-
- if (!IS_ERR_OR_NULL(fp->rethook))
- rethook_free(fp->rethook);
+ return unregister_fprobe_nolock(fp);
+}
- ftrace_free_filter(&fp->ops);
+/**
+ * unregister_fprobe() - Unregister fprobe with RCU GP wait
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ * This function will block until the fprobe is no longer used.
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+ int ret = unregister_fprobe_async(fp);
+ if (!ret)
+ synchronize_rcu();
return ret;
}
EXPORT_SYMBOL_GPL(unregister_fprobe);
+
+static int __init fprobe_initcall(void)
+{
+ rhltable_init(&fprobe_ip_table, &fprobe_rht_params);
+ return 0;
+}
+core_initcall(fprobe_initcall);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2e113f8b13a2..b2611de3f594 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -68,7 +68,6 @@
})
/* hash bits for specific function selection */
-#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -188,7 +187,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
op->saved_func(ip, parent_ip, op, fregs);
}
-static void ftrace_sync_ipi(void *data)
+void ftrace_sync_ipi(void *data)
{
/* Probably not needed, but do it anyway */
smp_rmb();
@@ -534,51 +533,69 @@ static int function_stat_headers(struct seq_file *m)
static int function_stat_show(struct seq_file *m, void *v)
{
+ struct trace_array *tr = trace_get_global_array();
struct ftrace_profile *rec = v;
+ const char *refsymbol = NULL;
char str[KSYM_SYMBOL_LEN];
- int ret = 0;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static struct trace_seq s;
unsigned long long avg;
unsigned long long stddev;
+ unsigned long long stddev_denom;
#endif
- mutex_lock(&ftrace_profile_lock);
+ guard(mutex)(&ftrace_profile_lock);
/* we raced with function_profile_reset() */
- if (unlikely(rec->counter == 0)) {
- ret = -EBUSY;
- goto out;
- }
+ if (unlikely(rec->counter == 0))
+ return -EBUSY;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
avg = div64_ul(rec->time, rec->counter);
if (tracing_thresh && (avg < tracing_thresh))
- goto out;
+ return 0;
#endif
- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+ if (tr->trace_flags & TRACE_ITER(PROF_TEXT_OFFSET)) {
+ unsigned long offset;
+
+ if (core_kernel_text(rec->ip)) {
+ refsymbol = "_text";
+ offset = rec->ip - (unsigned long)_text;
+ } else {
+ struct module *mod;
+
+ guard(rcu)();
+ mod = __module_text_address(rec->ip);
+ if (mod) {
+ refsymbol = mod->name;
+ /* Calculate offset from module's text entry address. */
+ offset = rec->ip - (unsigned long)mod->mem[MOD_TEXT].base;
+ }
+ }
+ if (refsymbol)
+ snprintf(str, sizeof(str), " %s+%#lx", refsymbol, offset);
+ }
+ if (!refsymbol)
+ kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+
seq_printf(m, " %-30.30s %10lu", str, rec->counter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
seq_puts(m, " ");
- /* Sample standard deviation (s^2) */
- if (rec->counter <= 1)
- stddev = 0;
- else {
- /*
- * Apply Welford's method:
- * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
- */
+ /*
+ * Variance formula:
+ * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
+ * Maybe Welford's method is better here?
+ * Divide only by 1000 for ns^2 -> us^2 conversion.
+ * trace_print_graph_duration will divide by 1000 again.
+ */
+ stddev = 0;
+ stddev_denom = rec->counter * (rec->counter - 1) * 1000;
+ if (stddev_denom) {
stddev = rec->counter * rec->time_squared -
rec->time * rec->time;
-
- /*
- * Divide only 1000 for ns^2 -> us^2 conversion.
- * trace_print_graph_duration will divide 1000 again.
- */
- stddev = div64_ul(stddev,
- rec->counter * (rec->counter - 1) * 1000);
+ stddev = div64_ul(stddev, stddev_denom);
}
trace_seq_init(&s);
@@ -590,10 +607,8 @@ static int function_stat_show(struct seq_file *m, void *v)
trace_print_seq(m, &s);
#endif
seq_putc(m, '\n');
-out:
- mutex_unlock(&ftrace_profile_lock);
- return ret;
+ return 0;
}
static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
@@ -687,7 +702,7 @@ static int ftrace_profile_init_cpu(int cpu)
*/
size = FTRACE_PROFILE_HASH_SIZE;
- stat->hash = kcalloc(size, sizeof(struct hlist_head), GFP_KERNEL);
+ stat->hash = kzalloc_objs(struct hlist_head, size);
if (!stat->hash)
return -ENOMEM;
@@ -789,27 +804,24 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
{
struct ftrace_profile_stat *stat;
struct ftrace_profile *rec;
- unsigned long flags;
if (!ftrace_profile_enabled)
return;
- local_irq_save(flags);
+ guard(preempt_notrace)();
stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
- goto out;
+ return;
rec = ftrace_find_profiled_func(stat, ip);
if (!rec) {
rec = ftrace_profile_alloc(stat, ip);
if (!rec)
- goto out;
+ return;
}
rec->counter++;
- out:
- local_irq_restore(flags);
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -827,7 +839,8 @@ struct profile_fgraph_data {
};
static int profile_graph_entry(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct profile_fgraph_data *profile_data;
@@ -848,31 +861,34 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace,
return 1;
}
+bool fprofile_no_sleep_time;
+
static void profile_graph_return(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct profile_fgraph_data *profile_data;
struct ftrace_profile_stat *stat;
unsigned long long calltime;
unsigned long long rettime = trace_clock_local();
struct ftrace_profile *rec;
- unsigned long flags;
int size;
- local_irq_save(flags);
+ guard(preempt_notrace)();
+
stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
- goto out;
+ return;
profile_data = fgraph_retrieve_data(gops->idx, &size);
/* If the calltime was zero'd ignore it */
if (!profile_data || !profile_data->calltime)
- goto out;
+ return;
calltime = rettime - profile_data->calltime;
- if (!fgraph_sleep_time) {
+ if (fprofile_no_sleep_time) {
if (current->ftrace_sleeptime)
calltime -= current->ftrace_sleeptime - profile_data->sleeptime;
}
@@ -896,9 +912,6 @@ static void profile_graph_return(struct ftrace_graph_ret *trace,
rec->time += calltime;
rec->time_squared += calltime * calltime;
}
-
- out:
- local_irq_restore(flags);
}
static struct fgraph_ops fprofiler_ops = {
@@ -946,20 +959,16 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
val = !!val;
- mutex_lock(&ftrace_profile_lock);
+ guard(mutex)(&ftrace_profile_lock);
if (ftrace_profile_enabled ^ val) {
if (val) {
ret = ftrace_profile_init();
- if (ret < 0) {
- cnt = ret;
- goto out;
- }
+ if (ret < 0)
+ return ret;
ret = register_ftrace_profiler();
- if (ret < 0) {
- cnt = ret;
- goto out;
- }
+ if (ret < 0)
+ return ret;
ftrace_profile_enabled = 1;
} else {
ftrace_profile_enabled = 0;
@@ -970,8 +979,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
unregister_ftrace_profiler();
}
}
- out:
- mutex_unlock(&ftrace_profile_lock);
*ppos += cnt;
@@ -1060,10 +1067,6 @@ static struct ftrace_ops *removed_ops;
*/
static bool update_all_ops;
-#ifndef CONFIG_FTRACE_MCOUNT_RECORD
-# error Dynamic ftrace depends on MCOUNT_RECORD
-#endif
-
struct ftrace_func_probe {
struct ftrace_probe_ops *probe_ops;
struct ftrace_ops ops;
@@ -1144,7 +1147,7 @@ struct ftrace_page {
};
#define ENTRY_SIZE sizeof(struct dyn_ftrace)
-#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
+#define ENTRIES_PER_PAGE_GROUP(order) ((PAGE_SIZE << (order)) / ENTRY_SIZE)
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
@@ -1207,21 +1210,28 @@ static void __add_hash_entry(struct ftrace_hash *hash,
hash->count++;
}
-static struct ftrace_func_entry *
-add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+struct ftrace_func_entry *
+add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct)
{
struct ftrace_func_entry *entry;
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ entry = kmalloc_obj(*entry);
if (!entry)
return NULL;
entry->ip = ip;
+ entry->direct = direct;
__add_hash_entry(hash, entry);
return entry;
}
+static struct ftrace_func_entry *
+add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+{
+ return add_ftrace_hash_entry_direct(hash, ip, 0);
+}
+
static void
free_hash_entry(struct ftrace_hash *hash,
struct ftrace_func_entry *entry)
@@ -1280,7 +1290,7 @@ static void clear_ftrace_mod_list(struct list_head *head)
mutex_unlock(&ftrace_lock);
}
-static void free_ftrace_hash(struct ftrace_hash *hash)
+void free_ftrace_hash(struct ftrace_hash *hash)
{
if (!hash || hash == EMPTY_HASH)
return;
@@ -1311,22 +1321,26 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
void ftrace_free_filter(struct ftrace_ops *ops)
{
ftrace_ops_init(ops);
+ if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return;
free_ftrace_hash(ops->func_hash->filter_hash);
free_ftrace_hash(ops->func_hash->notrace_hash);
+ ops->func_hash->filter_hash = EMPTY_HASH;
+ ops->func_hash->notrace_hash = EMPTY_HASH;
}
EXPORT_SYMBOL_GPL(ftrace_free_filter);
-static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
+struct ftrace_hash *alloc_ftrace_hash(int size_bits)
{
struct ftrace_hash *hash;
int size;
- hash = kzalloc(sizeof(*hash), GFP_KERNEL);
+ hash = kzalloc_obj(*hash);
if (!hash)
return NULL;
size = 1 << size_bits;
- hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL);
+ hash->buckets = kzalloc_objs(*hash->buckets, size);
if (!hash->buckets) {
kfree(hash);
@@ -1346,7 +1360,7 @@ static int ftrace_add_mod(struct trace_array *tr,
struct ftrace_mod_load *ftrace_mod;
struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace;
- ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL);
+ ftrace_mod = kzalloc_obj(*ftrace_mod);
if (!ftrace_mod)
return -ENOMEM;
@@ -1390,7 +1404,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- if (add_hash_entry(new_hash, entry->ip) == NULL)
+ if (add_ftrace_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL)
goto free_hash;
}
}
@@ -1671,14 +1685,12 @@ unsigned long ftrace_location(unsigned long ip)
loc = ftrace_location_range(ip, ip);
if (!loc) {
if (!kallsyms_lookup_size_offset(ip, &size, &offset))
- goto out;
+ return 0;
/* map sym+0 to __fentry__ */
if (!offset)
loc = ftrace_location_range(ip, ip + size - 1);
}
-
-out:
return loc;
}
@@ -1991,7 +2003,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops)
*/
static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
struct ftrace_hash *old_hash,
- struct ftrace_hash *new_hash)
+ struct ftrace_hash *new_hash,
+ bool update_target)
{
struct ftrace_page *pg;
struct dyn_ftrace *rec, *end = NULL;
@@ -2026,10 +2039,13 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
if (rec->flags & FTRACE_FL_DISABLED)
continue;
- /* We need to update only differences of filter_hash */
+ /*
+ * Unless we are updating the target of a direct function,
+ * we only need to update differences of filter_hash
+ */
in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
- if (in_old == in_new)
+ if (!update_target && (in_old == in_new))
continue;
if (in_new) {
@@ -2040,7 +2056,16 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
if (is_ipmodify)
goto rollback;
- FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT);
+ /*
+ * If this is called by __modify_ftrace_direct()
+ * then it is only changing where the direct
+ * pointer is jumping to, and the record already
+ * points to a direct trampoline. If it isn't,
+ * then it is a bug to update ipmodify on a direct
+ * caller.
+ */
+ FTRACE_WARN_ON(!update_target &&
+ (rec->flags & FTRACE_FL_DIRECT));
/*
* Another ops with IPMODIFY is already
@@ -2050,7 +2075,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
*/
if (!ops->ops_func)
return -EBUSY;
- ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
+ ret = ops->ops_func(ops, rec->ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
if (ret)
return ret;
} else if (is_ipmodify) {
@@ -2073,7 +2098,7 @@ rollback:
continue;
if (rec == end)
- goto err_out;
+ return -EBUSY;
in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
@@ -2086,7 +2111,6 @@ rollback:
rec->flags |= FTRACE_FL_IPMODIFY;
} while_for_each_ftrace_rec();
-err_out:
return -EBUSY;
}
@@ -2097,7 +2121,7 @@ static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
if (ftrace_hash_empty(hash))
hash = NULL;
- return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
+ return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash, false);
}
/* Disabling always succeeds */
@@ -2108,7 +2132,7 @@ static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
if (ftrace_hash_empty(hash))
hash = NULL;
- __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
+ __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH, false);
}
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
@@ -2122,7 +2146,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
if (ftrace_hash_empty(new_hash))
new_hash = NULL;
- return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
+ return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash, false);
}
static void print_ip_ins(const char *fmt, const unsigned char *p)
@@ -2607,8 +2631,13 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
- unsigned long addr = READ_ONCE(ops->direct_call);
+ unsigned long addr;
+#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS
+ addr = ftrace_find_rec_direct(ip);
+#else
+ addr = READ_ONCE(ops->direct_call);
+#endif
if (!addr)
return;
@@ -3238,15 +3267,22 @@ static struct ftrace_hash *copy_hash(struct ftrace_hash *src)
* The filter_hash updates uses just the append_hash() function
* and the notrace_hash does not.
*/
-static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash)
+static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash,
+ int size_bits)
{
struct ftrace_func_entry *entry;
int size;
int i;
- /* An empty hash does everything */
- if (ftrace_hash_empty(*hash))
- return 0;
+ if (*hash) {
+ /* An empty hash does everything */
+ if (ftrace_hash_empty(*hash))
+ return 0;
+ } else {
+ *hash = alloc_ftrace_hash(size_bits);
+ if (!*hash)
+ return -ENOMEM;
+ }
/* If new_hash has everything make hash have everything */
if (ftrace_hash_empty(new_hash)) {
@@ -3268,6 +3304,31 @@ static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash)
}
/*
+ * Remove functions from @hash that are in @notrace_hash
+ */
+static void remove_hash(struct ftrace_hash *hash, struct ftrace_hash *notrace_hash)
+{
+ struct ftrace_func_entry *entry;
+ struct hlist_node *tmp;
+ int size;
+ int i;
+
+ /* If the notrace hash is empty, there's nothing to do */
+ if (ftrace_hash_empty(notrace_hash))
+ return;
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) {
+ if (!__ftrace_lookup_ip(notrace_hash, entry->ip))
+ continue;
+ remove_hash_entry(hash, entry);
+ kfree(entry);
+ }
+ }
+}
+
+/*
* Add to @hash only those that are in both @new_hash1 and @new_hash2
*
* The notrace_hash updates uses just the intersect_hash() function
@@ -3307,64 +3368,6 @@ static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_has
return 0;
}
-/* Return a new hash that has a union of all @ops->filter_hash entries */
-static struct ftrace_hash *append_hashes(struct ftrace_ops *ops)
-{
- struct ftrace_hash *new_hash;
- struct ftrace_ops *subops;
- int ret;
-
- new_hash = alloc_ftrace_hash(ops->func_hash->filter_hash->size_bits);
- if (!new_hash)
- return NULL;
-
- list_for_each_entry(subops, &ops->subop_list, list) {
- ret = append_hash(&new_hash, subops->func_hash->filter_hash);
- if (ret < 0) {
- free_ftrace_hash(new_hash);
- return NULL;
- }
- /* Nothing more to do if new_hash is empty */
- if (ftrace_hash_empty(new_hash))
- break;
- }
- return new_hash;
-}
-
-/* Make @ops trace evenything except what all its subops do not trace */
-static struct ftrace_hash *intersect_hashes(struct ftrace_ops *ops)
-{
- struct ftrace_hash *new_hash = NULL;
- struct ftrace_ops *subops;
- int size_bits;
- int ret;
-
- list_for_each_entry(subops, &ops->subop_list, list) {
- struct ftrace_hash *next_hash;
-
- if (!new_hash) {
- size_bits = subops->func_hash->notrace_hash->size_bits;
- new_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->notrace_hash);
- if (!new_hash)
- return NULL;
- continue;
- }
- size_bits = new_hash->size_bits;
- next_hash = new_hash;
- new_hash = alloc_ftrace_hash(size_bits);
- ret = intersect_hash(&new_hash, next_hash, subops->func_hash->notrace_hash);
- free_ftrace_hash(next_hash);
- if (ret < 0) {
- free_ftrace_hash(new_hash);
- return NULL;
- }
- /* Nothing more to do if new_hash is empty */
- if (ftrace_hash_empty(new_hash))
- break;
- }
- return new_hash;
-}
-
static bool ops_equal(struct ftrace_hash *A, struct ftrace_hash *B)
{
struct ftrace_func_entry *entry;
@@ -3436,6 +3439,95 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_
return 0;
}
+static int add_first_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash,
+ struct ftrace_ops_hash *func_hash)
+{
+ /* If the filter hash is not empty, simply remove the nohash from it */
+ if (!ftrace_hash_empty(func_hash->filter_hash)) {
+ *filter_hash = copy_hash(func_hash->filter_hash);
+ if (!*filter_hash)
+ return -ENOMEM;
+ remove_hash(*filter_hash, func_hash->notrace_hash);
+ *notrace_hash = EMPTY_HASH;
+
+ } else {
+ *notrace_hash = copy_hash(func_hash->notrace_hash);
+ if (!*notrace_hash)
+ return -ENOMEM;
+ *filter_hash = EMPTY_HASH;
+ }
+ return 0;
+}
+
+static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash,
+ struct ftrace_ops_hash *ops_hash, struct ftrace_ops_hash *subops_hash)
+{
+ int size_bits;
+ int ret;
+
+ /* If the subops trace all functions so must the main ops */
+ if (ftrace_hash_empty(ops_hash->filter_hash) ||
+ ftrace_hash_empty(subops_hash->filter_hash)) {
+ *filter_hash = EMPTY_HASH;
+ } else {
+ /*
+ * The main ops filter hash is not empty, so its
+ * notrace_hash had better be, as the notrace hash
+ * is only used for empty main filter hashes.
+ */
+ WARN_ON_ONCE(!ftrace_hash_empty(ops_hash->notrace_hash));
+
+ size_bits = max(ops_hash->filter_hash->size_bits,
+ subops_hash->filter_hash->size_bits);
+
+ /* Copy the subops hash */
+ *filter_hash = alloc_and_copy_ftrace_hash(size_bits, subops_hash->filter_hash);
+ if (!*filter_hash)
+ return -ENOMEM;
+ /* Remove any notrace functions from the copy */
+ remove_hash(*filter_hash, subops_hash->notrace_hash);
+
+ ret = append_hash(filter_hash, ops_hash->filter_hash,
+ size_bits);
+ if (ret < 0) {
+ free_ftrace_hash(*filter_hash);
+ *filter_hash = EMPTY_HASH;
+ return ret;
+ }
+ }
+
+ /*
+ * Only process notrace hashes if the main filter hash is empty
+ * (tracing all functions), otherwise the filter hash will just
+ * remove the notrace hash functions, and the notrace hash is
+ * not needed.
+ */
+ if (ftrace_hash_empty(*filter_hash)) {
+ /*
+ * Intersect the notrace functions. That is, if two
+ * subops are not tracing a set of functions, the
+ * main ops will only not trace the functions that are
+ * in both subops, but has to trace the functions that
+ * are only notrace in one of the subops, for the other
+ * subops to be able to trace them.
+ */
+ size_bits = max(ops_hash->notrace_hash->size_bits,
+ subops_hash->notrace_hash->size_bits);
+ *notrace_hash = alloc_ftrace_hash(size_bits);
+ if (!*notrace_hash)
+ return -ENOMEM;
+
+ ret = intersect_hash(notrace_hash, ops_hash->notrace_hash,
+ subops_hash->notrace_hash);
+ if (ret < 0) {
+ free_ftrace_hash(*notrace_hash);
+ *notrace_hash = EMPTY_HASH;
+ return ret;
+ }
+ }
+ return 0;
+}
+
/**
* ftrace_startup_subops - enable tracing for subops of an ops
* @ops: Manager ops (used to pick all the functions of its subops)
@@ -3448,11 +3540,10 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_
*/
int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
{
- struct ftrace_hash *filter_hash;
- struct ftrace_hash *notrace_hash;
+ struct ftrace_hash *filter_hash = EMPTY_HASH;
+ struct ftrace_hash *notrace_hash = EMPTY_HASH;
struct ftrace_hash *save_filter_hash;
struct ftrace_hash *save_notrace_hash;
- int size_bits;
int ret;
if (unlikely(ftrace_disabled))
@@ -3476,14 +3567,14 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int
/* For the first subops to ops just enable it normally */
if (list_empty(&ops->subop_list)) {
- /* Just use the subops hashes */
- filter_hash = copy_hash(subops->func_hash->filter_hash);
- notrace_hash = copy_hash(subops->func_hash->notrace_hash);
- if (!filter_hash || !notrace_hash) {
- free_ftrace_hash(filter_hash);
- free_ftrace_hash(notrace_hash);
- return -ENOMEM;
- }
+
+ /* The ops was empty, should have empty hashes */
+ WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->filter_hash));
+ WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->notrace_hash));
+
+ ret = add_first_hash(&filter_hash, &notrace_hash, subops->func_hash);
+ if (ret < 0)
+ return ret;
save_filter_hash = ops->func_hash->filter_hash;
save_notrace_hash = ops->func_hash->notrace_hash;
@@ -3509,47 +3600,16 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int
/*
* Here there's already something attached. Here are the rules:
- * o If either filter_hash is empty then the final stays empty
- * o Otherwise, the final is a superset of both hashes
- * o If either notrace_hash is empty then the final stays empty
- * o Otherwise, the final is an intersection between the hashes
+ * If the new subops and main ops filter hashes are not empty:
+ * o Make a copy of the subops filter hash
+ * o Remove all functions in the nohash from it.
+ * o Add in the main hash filter functions
+ * o Remove any of these functions from the main notrace hash
*/
- if (ftrace_hash_empty(ops->func_hash->filter_hash) ||
- ftrace_hash_empty(subops->func_hash->filter_hash)) {
- filter_hash = EMPTY_HASH;
- } else {
- size_bits = max(ops->func_hash->filter_hash->size_bits,
- subops->func_hash->filter_hash->size_bits);
- filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash);
- if (!filter_hash)
- return -ENOMEM;
- ret = append_hash(&filter_hash, subops->func_hash->filter_hash);
- if (ret < 0) {
- free_ftrace_hash(filter_hash);
- return ret;
- }
- }
- if (ftrace_hash_empty(ops->func_hash->notrace_hash) ||
- ftrace_hash_empty(subops->func_hash->notrace_hash)) {
- notrace_hash = EMPTY_HASH;
- } else {
- size_bits = max(ops->func_hash->filter_hash->size_bits,
- subops->func_hash->filter_hash->size_bits);
- notrace_hash = alloc_ftrace_hash(size_bits);
- if (!notrace_hash) {
- free_ftrace_hash(filter_hash);
- return -ENOMEM;
- }
-
- ret = intersect_hash(&notrace_hash, ops->func_hash->filter_hash,
- subops->func_hash->filter_hash);
- if (ret < 0) {
- free_ftrace_hash(filter_hash);
- free_ftrace_hash(notrace_hash);
- return ret;
- }
- }
+ ret = add_next_hash(&filter_hash, &notrace_hash, ops->func_hash, subops->func_hash);
+ if (ret < 0)
+ return ret;
list_add(&subops->list, &ops->subop_list);
@@ -3565,6 +3625,45 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int
return ret;
}
+static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash,
+ struct ftrace_ops *ops)
+{
+ struct ftrace_ops_hash temp_hash;
+ struct ftrace_ops *subops;
+ bool first = true;
+ int ret;
+
+ temp_hash.filter_hash = EMPTY_HASH;
+ temp_hash.notrace_hash = EMPTY_HASH;
+
+ list_for_each_entry(subops, &ops->subop_list, list) {
+ *filter_hash = EMPTY_HASH;
+ *notrace_hash = EMPTY_HASH;
+
+ if (first) {
+ ret = add_first_hash(filter_hash, notrace_hash, subops->func_hash);
+ if (ret < 0)
+ return ret;
+ first = false;
+ } else {
+ ret = add_next_hash(filter_hash, notrace_hash,
+ &temp_hash, subops->func_hash);
+ if (ret < 0) {
+ free_ftrace_hash(temp_hash.filter_hash);
+ free_ftrace_hash(temp_hash.notrace_hash);
+ return ret;
+ }
+ }
+
+ free_ftrace_hash(temp_hash.filter_hash);
+ free_ftrace_hash(temp_hash.notrace_hash);
+
+ temp_hash.filter_hash = *filter_hash;
+ temp_hash.notrace_hash = *notrace_hash;
+ }
+ return 0;
+}
+
/**
* ftrace_shutdown_subops - Remove a subops from a manager ops
* @ops: A manager ops to remove @subops from
@@ -3579,8 +3678,8 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int
*/
int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
{
- struct ftrace_hash *filter_hash;
- struct ftrace_hash *notrace_hash;
+ struct ftrace_hash *filter_hash = EMPTY_HASH;
+ struct ftrace_hash *notrace_hash = EMPTY_HASH;
int ret;
if (unlikely(ftrace_disabled))
@@ -3613,14 +3712,9 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in
}
/* Rebuild the hashes without subops */
- filter_hash = append_hashes(ops);
- notrace_hash = intersect_hashes(ops);
- if (!filter_hash || !notrace_hash) {
- free_ftrace_hash(filter_hash);
- free_ftrace_hash(notrace_hash);
- list_add(&subops->list, &ops->subop_list);
- return -ENOMEM;
- }
+ ret = rebuild_hashes(&filter_hash, &notrace_hash, ops);
+ if (ret < 0)
+ return ret;
ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
if (ret < 0) {
@@ -3636,11 +3730,11 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in
static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops,
struct ftrace_hash **orig_subhash,
- struct ftrace_hash *hash,
- int enable)
+ struct ftrace_hash *hash)
{
struct ftrace_ops *ops = subops->managed;
- struct ftrace_hash **orig_hash;
+ struct ftrace_hash *notrace_hash;
+ struct ftrace_hash *filter_hash;
struct ftrace_hash *save_hash;
struct ftrace_hash *new_hash;
int ret;
@@ -3657,24 +3751,18 @@ static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops,
return -ENOMEM;
}
- /* Create a new_hash to hold the ops new functions */
- if (enable) {
- orig_hash = &ops->func_hash->filter_hash;
- new_hash = append_hashes(ops);
- } else {
- orig_hash = &ops->func_hash->notrace_hash;
- new_hash = intersect_hashes(ops);
+ ret = rebuild_hashes(&filter_hash, &notrace_hash, ops);
+ if (!ret) {
+ ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
}
- /* Move the hash over to the new hash */
- ret = __ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable);
-
- free_ftrace_hash(new_hash);
-
if (ret) {
/* Put back the original hash */
- free_ftrace_hash_rcu(*orig_subhash);
+ new_hash = *orig_subhash;
*orig_subhash = save_hash;
+ free_ftrace_hash_rcu(new_hash);
} else {
free_ftrace_hash_rcu(save_hash);
}
@@ -3757,7 +3845,8 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
return 0;
}
-static int ftrace_allocate_records(struct ftrace_page *pg, int count)
+static int ftrace_allocate_records(struct ftrace_page *pg, int count,
+ unsigned long *num_pages)
{
int order;
int pages;
@@ -3767,7 +3856,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
return -EINVAL;
/* We want to fill as much as possible, with no empty pages */
- pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE);
+ pages = DIV_ROUND_UP(count * ENTRY_SIZE, PAGE_SIZE);
order = fls(pages) - 1;
again:
@@ -3782,9 +3871,10 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
}
ftrace_number_of_pages += 1 << order;
+ *num_pages += 1 << order;
ftrace_number_of_groups++;
- cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
+ cnt = ENTRIES_PER_PAGE_GROUP(order);
pg->order = order;
if (cnt > count)
@@ -3810,16 +3900,18 @@ static void ftrace_free_pages(struct ftrace_page *pages)
}
static struct ftrace_page *
-ftrace_allocate_pages(unsigned long num_to_init)
+ftrace_allocate_pages(unsigned long num_to_init, unsigned long *num_pages)
{
struct ftrace_page *start_pg;
struct ftrace_page *pg;
int cnt;
+ *num_pages = 0;
+
if (!num_to_init)
return NULL;
- start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
+ start_pg = pg = kzalloc_obj(*pg);
if (!pg)
return NULL;
@@ -3829,7 +3921,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
* waste as little space as possible.
*/
for (;;) {
- cnt = ftrace_allocate_records(pg, num_to_init);
+ cnt = ftrace_allocate_records(pg, num_to_init, num_pages);
if (cnt < 0)
goto free_pages;
@@ -3837,7 +3929,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
if (!num_to_init)
break;
- pg->next = kzalloc(sizeof(*pg), GFP_KERNEL);
+ pg->next = kzalloc_obj(*pg);
if (!pg->next)
goto free_pages;
@@ -4331,6 +4423,42 @@ static inline int print_rec(struct seq_file *m, unsigned long ip)
}
#endif
+static void print_subops(struct seq_file *m, struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *subops;
+ bool first = true;
+
+ list_for_each_entry(subops, &ops->subop_list, list) {
+ if (!((subops->flags & FTRACE_OPS_FL_ENABLED) &&
+ hash_contains_ip(rec->ip, subops->func_hash)))
+ continue;
+ if (first) {
+ seq_printf(m, "\tsubops:");
+ first = false;
+ }
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ if (subops->flags & FTRACE_OPS_FL_GRAPH) {
+ struct fgraph_ops *gops;
+
+ gops = container_of(subops, struct fgraph_ops, ops);
+ seq_printf(m, " {ent:%pS ret:%pS}",
+ (void *)gops->entryfunc,
+ (void *)gops->retfunc);
+ continue;
+ }
+#endif
+ if (subops->trampoline) {
+ seq_printf(m, " {%pS (%pS)}",
+ (void *)subops->trampoline,
+ (void *)subops->func);
+ add_trampoline_func(m, subops, rec);
+ } else {
+ seq_printf(m, " {%pS}",
+ (void *)subops->func);
+ }
+ }
+}
+
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_iterator *iter = m->private;
@@ -4383,6 +4511,7 @@ static int t_show(struct seq_file *m, void *v)
(void *)ops->trampoline,
(void *)ops->func);
add_trampoline_func(m, ops, rec);
+ print_subops(m, ops, rec);
ops = ftrace_find_tramp_ops_next(rec, ops);
} while (ops);
} else
@@ -4395,6 +4524,7 @@ static int t_show(struct seq_file *m, void *v)
if (ops) {
seq_printf(m, "\tops: %pS (%pS)",
ops, ops->func);
+ print_subops(m, ops, rec);
} else {
seq_puts(m, "\tops: ERROR!");
}
@@ -4403,8 +4533,11 @@ static int t_show(struct seq_file *m, void *v)
unsigned long direct;
direct = ftrace_find_rec_direct(rec->ip);
- if (direct)
- seq_printf(m, "\n\tdirect-->%pS", (void *)direct);
+ if (direct) {
+ seq_printf(m, "\n\tdirect%s-->%pS",
+ ftrace_is_jmp(direct) ? "(jmp)" : "",
+ (void *)ftrace_jmp_get(direct));
+ }
}
}
@@ -4553,7 +4686,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
if (tracing_check_open_get_tr(tr))
return -ENODEV;
- iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ iter = kzalloc_obj(*iter);
if (!iter)
goto out;
@@ -4585,13 +4718,17 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
} else {
iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
}
+ } else {
+ if (hash)
+ iter->hash = alloc_and_copy_ftrace_hash(hash->size_bits, hash);
+ else
+ iter->hash = EMPTY_HASH;
+ }
- if (!iter->hash) {
- trace_parser_put(&iter->parser);
- goto out_unlock;
- }
- } else
- iter->hash = hash;
+ if (!iter->hash) {
+ trace_parser_put(&iter->parser);
+ goto out_unlock;
+ }
ret = 0;
@@ -4898,7 +5035,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
int enable)
{
if (ops->flags & FTRACE_OPS_FL_SUBOP)
- return ftrace_hash_move_and_update_subops(ops, orig_hash, hash, enable);
+ return ftrace_hash_move_and_update_subops(ops, orig_hash, hash);
/*
* If this ops is not enabled, it could be sharing its filters
@@ -4917,7 +5054,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
list_for_each_entry(subops, &op->subop_list, list) {
if ((subops->flags & FTRACE_OPS_FL_ENABLED) &&
subops->func_hash == ops->func_hash) {
- return ftrace_hash_move_and_update_subops(subops, orig_hash, hash, enable);
+ return ftrace_hash_move_and_update_subops(subops, orig_hash, hash);
}
}
} while_for_each_ftrace_op(op);
@@ -4926,23 +5063,6 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
return __ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable);
}
-static bool module_exists(const char *module)
-{
- /* All modules have the symbol __this_module */
- static const char this_mod[] = "__this_module";
- char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
- unsigned long val;
- int n;
-
- n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod);
-
- if (n > sizeof(modname) - 1)
- return false;
-
- val = module_kallsyms_lookup_name(modname);
- return val != 0;
-}
-
static int cache_mod(struct trace_array *tr,
const char *func, char *module, int enable)
{
@@ -4982,10 +5102,6 @@ static int cache_mod(struct trace_array *tr,
return ftrace_add_mod(tr, func, module, enable);
}
-static int
-ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
- int reset, int enable);
-
#ifdef CONFIG_MODULES
static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
char *mod, bool enable)
@@ -5149,8 +5265,12 @@ struct ftrace_func_map {
void *data;
};
+/*
+ * Note, ftrace_func_mapper is freed by free_ftrace_hash(&mapper->hash).
+ * The hash field must be the first field.
+ */
struct ftrace_func_mapper {
- struct ftrace_hash hash;
+ struct ftrace_hash hash; /* Must be first! */
};
/**
@@ -5214,7 +5334,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
if (entry)
return -EBUSY;
- map = kmalloc(sizeof(*map), GFP_KERNEL);
+ map = kmalloc_obj(*map);
if (!map)
return -ENOMEM;
@@ -5285,6 +5405,7 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
}
}
}
+ /* This also frees the mapper itself */
free_ftrace_hash(&mapper->hash);
}
@@ -5353,7 +5474,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
}
}
if (!probe) {
- probe = kzalloc(sizeof(*probe), GFP_KERNEL);
+ probe = kzalloc_obj(*probe);
if (!probe) {
mutex_unlock(&ftrace_lock);
return -ENOMEM;
@@ -5615,20 +5736,15 @@ static DEFINE_MUTEX(ftrace_cmd_mutex);
__init int register_ftrace_command(struct ftrace_func_command *cmd)
{
struct ftrace_func_command *p;
- int ret = 0;
- mutex_lock(&ftrace_cmd_mutex);
+ guard(mutex)(&ftrace_cmd_mutex);
list_for_each_entry(p, &ftrace_commands, list) {
- if (strcmp(cmd->name, p->name) == 0) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ if (strcmp(cmd->name, p->name) == 0)
+ return -EBUSY;
}
list_add(&cmd->list, &ftrace_commands);
- out_unlock:
- mutex_unlock(&ftrace_cmd_mutex);
- return ret;
+ return 0;
}
/*
@@ -5638,20 +5754,17 @@ __init int register_ftrace_command(struct ftrace_func_command *cmd)
__init int unregister_ftrace_command(struct ftrace_func_command *cmd)
{
struct ftrace_func_command *p, *n;
- int ret = -ENODEV;
- mutex_lock(&ftrace_cmd_mutex);
+ guard(mutex)(&ftrace_cmd_mutex);
+
list_for_each_entry_safe(p, n, &ftrace_commands, list) {
if (strcmp(cmd->name, p->name) == 0) {
- ret = 0;
list_del_init(&p->list);
- goto out_unlock;
+ return 0;
}
}
- out_unlock:
- mutex_unlock(&ftrace_cmd_mutex);
- return ret;
+ return -ENODEV;
}
static int ftrace_process_regex(struct ftrace_iterator *iter,
@@ -5661,7 +5774,7 @@ static int ftrace_process_regex(struct ftrace_iterator *iter,
struct trace_array *tr = iter->ops->private;
char *func, *command, *next = buff;
struct ftrace_func_command *p;
- int ret = -EINVAL;
+ int ret;
func = strsep(&next, ":");
@@ -5678,17 +5791,14 @@ static int ftrace_process_regex(struct ftrace_iterator *iter,
command = strsep(&next, ":");
- mutex_lock(&ftrace_cmd_mutex);
+ guard(mutex)(&ftrace_cmd_mutex);
+
list_for_each_entry(p, &ftrace_commands, list) {
- if (strcmp(p->name, command) == 0) {
- ret = p->func(tr, hash, func, command, next, enable);
- goto out_unlock;
- }
+ if (strcmp(p->name, command) == 0)
+ return p->func(tr, hash, func, command, next, enable);
}
- out_unlock:
- mutex_unlock(&ftrace_cmd_mutex);
- return ret;
+ return -EINVAL;
}
static ssize_t
@@ -5722,12 +5832,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
parser->idx, enable);
trace_parser_clear(parser);
if (ret < 0)
- goto out;
+ return ret;
}
- ret = read;
- out:
- return ret;
+ return read;
}
ssize_t
@@ -5759,6 +5867,9 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return -ENOENT;
free_hash_entry(hash, entry);
return 0;
+ } else if (__ftrace_lookup_ip(hash, ip) != NULL) {
+ /* Already exists */
+ return 0;
}
entry = add_hash_entry(hash, ip);
@@ -5788,7 +5899,7 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips,
static int
ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
unsigned long *ips, unsigned int cnt,
- int remove, int reset, int enable)
+ int remove, int reset, int enable, char *mod)
{
struct ftrace_hash **orig_hash;
struct ftrace_hash *hash;
@@ -5814,7 +5925,15 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
goto out_regex_unlock;
}
- if (buf && !ftrace_match_records(hash, buf, len)) {
+ if (buf && !match_records(hash, buf, len, mod)) {
+ /* If this was for a module and nothing was enabled, flag it */
+ if (mod)
+ (*orig_hash)->flags |= FTRACE_HASH_FL_MOD;
+
+ /*
+ * Even if it is a mod, return error to let caller know
+ * nothing was added
+ */
ret = -EINVAL;
goto out_regex_unlock;
}
@@ -5839,7 +5958,7 @@ static int
ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
int remove, int reset, int enable)
{
- return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable);
+ return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable, NULL);
}
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
@@ -5876,7 +5995,8 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
del = __ftrace_lookup_ip(direct_functions, entry->ip);
- if (del && del->direct == addr) {
+ if (del && ftrace_jmp_get(del->direct) ==
+ ftrace_jmp_get(addr)) {
remove_hash_entry(direct_functions, del);
kfree(del);
}
@@ -5891,6 +6011,17 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp)
free_ftrace_hash(fhp);
}
+static void reset_direct(struct ftrace_ops *ops, unsigned long addr)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ remove_direct_functions_hash(hash, addr);
+
+ /* cleanup for possible another register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
+}
+
/**
* register_ftrace_direct - Call a custom trampoline directly
* for multiple functions registered in @ops
@@ -5945,9 +6076,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
/* Make a copy hash to place the new and the old entries in */
size = hash->count + direct_functions->count;
- if (size > 32)
- size = 32;
- new_hash = alloc_ftrace_hash(fls(size));
+ size = fls(size);
+ if (size > FTRACE_HASH_MAX_BITS)
+ size = FTRACE_HASH_MAX_BITS;
+ new_hash = alloc_ftrace_hash(size);
if (!new_hash)
goto out_unlock;
@@ -5980,11 +6112,13 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
new_hash = NULL;
ops->func = call_direct_funcs;
- ops->flags = MULTI_FLAGS;
+ ops->flags |= MULTI_FLAGS;
ops->trampoline = FTRACE_REGS_ADDR;
ops->direct_call = addr;
err = register_ftrace_function_nolock(ops);
+ if (err)
+ reset_direct(ops, addr);
out_unlock:
mutex_unlock(&direct_mutex);
@@ -6017,7 +6151,6 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct);
int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
bool free_filters)
{
- struct ftrace_hash *hash = ops->func_hash->filter_hash;
int err;
if (check_direct_multi(ops))
@@ -6027,13 +6160,9 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
mutex_lock(&direct_mutex);
err = unregister_ftrace_function(ops);
- remove_direct_functions_hash(hash, addr);
+ reset_direct(ops, addr);
mutex_unlock(&direct_mutex);
- /* cleanup for possible another register call */
- ops->func = NULL;
- ops->trampoline = 0;
-
if (free_filters)
ftrace_free_filter(ops);
return err;
@@ -6043,7 +6172,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
static int
__modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
- struct ftrace_hash *hash;
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
struct ftrace_func_entry *entry, *iter;
static struct ftrace_ops tmp_ops = {
.func = ftrace_stub,
@@ -6064,12 +6193,20 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
return err;
/*
+ * Call __ftrace_hash_update_ipmodify() here, so that we can call
+ * ops->ops_func for the ops. This is needed because the above
+ * register_ftrace_function_nolock() worked on tmp_ops.
+ */
+ err = __ftrace_hash_update_ipmodify(ops, hash, hash, true);
+ if (err)
+ goto out;
+
+ /*
* Now the ftrace_ops_list_func() is called to do the direct callers.
* We can safely change the direct functions attached to each entry.
*/
mutex_lock(&ftrace_lock);
- hash = ops->func_hash->filter_hash;
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(iter, &hash->buckets[i], hlist) {
@@ -6084,6 +6221,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
mutex_unlock(&ftrace_lock);
+out:
/* Removing the tmp_ops will add the updated direct callers to the functions */
unregister_ftrace_function(&tmp_ops);
@@ -6149,6 +6287,370 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
return err;
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
+
+static unsigned long hash_count(struct ftrace_hash *hash)
+{
+ return hash ? hash->count : 0;
+}
+
+/**
+ * hash_add - adds two struct ftrace_hash and returns the result
+ * @a: struct ftrace_hash object
+ * @b: struct ftrace_hash object
+ *
+ * Returns struct ftrace_hash object on success, NULL on error.
+ */
+static struct ftrace_hash *hash_add(struct ftrace_hash *a, struct ftrace_hash *b)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *add;
+ int size;
+
+ size = hash_count(a) + hash_count(b);
+ if (size > 32)
+ size = 32;
+
+ add = alloc_and_copy_ftrace_hash(fls(size), a);
+ if (!add)
+ return NULL;
+
+ size = 1 << b->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &b->buckets[i], hlist) {
+ if (add_ftrace_hash_entry_direct(add, entry->ip, entry->direct) == NULL) {
+ free_ftrace_hash(add);
+ return NULL;
+ }
+ }
+ }
+ return add;
+}
+
+/**
+ * update_ftrace_direct_add - Updates @ops by adding direct
+ * callers provided in @hash
+ * @ops: The address of the struct ftrace_ops object
+ * @hash: The address of the struct ftrace_hash object
+ *
+ * This is used to add custom direct callers (ip -> addr) to @ops,
+ * specified in @hash. The @ops will be either registered or updated.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @hash is empty
+ */
+int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash)
+{
+ struct ftrace_hash *old_direct_functions = NULL;
+ struct ftrace_hash *new_direct_functions;
+ struct ftrace_hash *old_filter_hash;
+ struct ftrace_hash *new_filter_hash = NULL;
+ struct ftrace_func_entry *entry;
+ int err = -EINVAL;
+ int size;
+ bool reg;
+
+ if (!hash_count(hash))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+
+ /* Make sure requested entries are not already registered. */
+ size = 1 << hash->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ if (__ftrace_lookup_ip(direct_functions, entry->ip))
+ goto out_unlock;
+ }
+ }
+
+ old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
+
+ /* If there's nothing in filter_hash we need to register the ops. */
+ reg = hash_count(old_filter_hash) == 0;
+ if (reg) {
+ if (ops->func || ops->trampoline)
+ goto out_unlock;
+ if (ops->flags & FTRACE_OPS_FL_ENABLED)
+ goto out_unlock;
+ }
+
+ err = -ENOMEM;
+ new_filter_hash = hash_add(old_filter_hash, hash);
+ if (!new_filter_hash)
+ goto out_unlock;
+
+ new_direct_functions = hash_add(direct_functions, hash);
+ if (!new_direct_functions)
+ goto out_unlock;
+
+ old_direct_functions = direct_functions;
+ rcu_assign_pointer(direct_functions, new_direct_functions);
+
+ if (reg) {
+ ops->func = call_direct_funcs;
+ ops->flags |= MULTI_FLAGS;
+ ops->trampoline = FTRACE_REGS_ADDR;
+ ops->local_hash.filter_hash = new_filter_hash;
+
+ err = register_ftrace_function_nolock(ops);
+ if (err) {
+ /* restore old filter on error */
+ ops->local_hash.filter_hash = old_filter_hash;
+
+ /* cleanup for possible another register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
+ } else {
+ new_filter_hash = old_filter_hash;
+ }
+ } else {
+ guard(mutex)(&ftrace_lock);
+ err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH);
+ /*
+ * new_filter_hash is dup-ed, so we need to release it anyway,
+ * old_filter_hash either stays on error or is already released
+ */
+ }
+
+ if (err) {
+ /* reset direct_functions and free the new one */
+ rcu_assign_pointer(direct_functions, old_direct_functions);
+ old_direct_functions = new_direct_functions;
+ }
+
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ if (old_direct_functions && old_direct_functions != EMPTY_HASH)
+ call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb);
+ free_ftrace_hash(new_filter_hash);
+
+ return err;
+}
+
+/**
+ * hash_sub - substracts @b from @a and returns the result
+ * @a: struct ftrace_hash object
+ * @b: struct ftrace_hash object
+ *
+ * Returns struct ftrace_hash object on success, NULL on error.
+ */
+static struct ftrace_hash *hash_sub(struct ftrace_hash *a, struct ftrace_hash *b)
+{
+ struct ftrace_func_entry *entry, *del;
+ struct ftrace_hash *sub;
+ int size;
+
+ sub = alloc_and_copy_ftrace_hash(a->size_bits, a);
+ if (!sub)
+ return NULL;
+
+ size = 1 << b->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &b->buckets[i], hlist) {
+ del = __ftrace_lookup_ip(sub, entry->ip);
+ if (WARN_ON_ONCE(!del)) {
+ free_ftrace_hash(sub);
+ return NULL;
+ }
+ remove_hash_entry(sub, del);
+ kfree(del);
+ }
+ }
+ return sub;
+}
+
+/**
+ * update_ftrace_direct_del - Updates @ops by removing its direct
+ * callers provided in @hash
+ * @ops: The address of the struct ftrace_ops object
+ * @hash: The address of the struct ftrace_hash object
+ *
+ * This is used to delete custom direct callers (ip -> addr) in
+ * @ops specified via @hash. The @ops will be either unregistered
+ * updated.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @hash is empty
+ * -EINVAL - The @ops is not registered
+ */
+int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash)
+{
+ struct ftrace_hash *old_direct_functions = NULL;
+ struct ftrace_hash *new_direct_functions;
+ struct ftrace_hash *new_filter_hash = NULL;
+ struct ftrace_hash *old_filter_hash;
+ struct ftrace_func_entry *entry;
+ struct ftrace_func_entry *del;
+ unsigned long size;
+ int err = -EINVAL;
+
+ if (!hash_count(hash))
+ return -EINVAL;
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+ if (direct_functions == EMPTY_HASH)
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+
+ old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
+
+ if (!hash_count(old_filter_hash))
+ goto out_unlock;
+
+ /* Make sure requested entries are already registered. */
+ size = 1 << hash->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ del = __ftrace_lookup_ip(direct_functions, entry->ip);
+ if (!del || del->direct != entry->direct)
+ goto out_unlock;
+ }
+ }
+
+ err = -ENOMEM;
+ new_filter_hash = hash_sub(old_filter_hash, hash);
+ if (!new_filter_hash)
+ goto out_unlock;
+
+ new_direct_functions = hash_sub(direct_functions, hash);
+ if (!new_direct_functions)
+ goto out_unlock;
+
+ /* If there's nothing left, we need to unregister the ops. */
+ if (ftrace_hash_empty(new_filter_hash)) {
+ err = unregister_ftrace_function(ops);
+ if (!err) {
+ /* cleanup for possible another register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
+ ftrace_free_filter(ops);
+ ops->func_hash->filter_hash = NULL;
+ }
+ } else {
+ guard(mutex)(&ftrace_lock);
+ err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH);
+ /*
+ * new_filter_hash is dup-ed, so we need to release it anyway,
+ * old_filter_hash either stays on error or is already released
+ */
+ }
+
+ if (err) {
+ /* free the new_direct_functions */
+ old_direct_functions = new_direct_functions;
+ } else {
+ old_direct_functions = direct_functions;
+ rcu_assign_pointer(direct_functions, new_direct_functions);
+ }
+
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ if (old_direct_functions && old_direct_functions != EMPTY_HASH)
+ call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb);
+ free_ftrace_hash(new_filter_hash);
+
+ return err;
+}
+
+/**
+ * update_ftrace_direct_mod - Updates @ops by modifing its direct
+ * callers provided in @hash
+ * @ops: The address of the struct ftrace_ops object
+ * @hash: The address of the struct ftrace_hash object
+ * @do_direct_lock: If true lock the direct_mutex
+ *
+ * This is used to modify custom direct callers (ip -> addr) in
+ * @ops specified via @hash.
+ *
+ * This can be called from within ftrace ops_func callback with
+ * direct_mutex already locked, in which case @do_direct_lock
+ * needs to be false.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @hash is empty
+ * -EINVAL - The @ops is not registered
+ */
+int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock)
+{
+ struct ftrace_func_entry *entry, *tmp;
+ static struct ftrace_ops tmp_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_STUB,
+ };
+ struct ftrace_hash *orig_hash;
+ unsigned long size, i;
+ int err = -EINVAL;
+
+ if (!hash_count(hash))
+ return -EINVAL;
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+ if (direct_functions == EMPTY_HASH)
+ return -EINVAL;
+
+ /*
+ * We can be called from within ops_func callback with direct_mutex
+ * already taken.
+ */
+ if (do_direct_lock)
+ mutex_lock(&direct_mutex);
+
+ orig_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
+ if (!orig_hash)
+ goto unlock;
+
+ /* Enable the tmp_ops to have the same functions as the hash object. */
+ ftrace_ops_init(&tmp_ops);
+ tmp_ops.func_hash->filter_hash = hash;
+
+ err = register_ftrace_function_nolock(&tmp_ops);
+ if (err)
+ goto unlock;
+
+ /*
+ * Call __ftrace_hash_update_ipmodify() here, so that we can call
+ * ops->ops_func for the ops. This is needed because the above
+ * register_ftrace_function_nolock() worked on tmp_ops.
+ */
+ err = __ftrace_hash_update_ipmodify(ops, orig_hash, orig_hash, true);
+ if (err)
+ goto out;
+
+ /*
+ * Now the ftrace_ops_list_func() is called to do the direct callers.
+ * We can safely change the direct functions attached to each entry.
+ */
+ mutex_lock(&ftrace_lock);
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ tmp = __ftrace_lookup_ip(direct_functions, entry->ip);
+ if (!tmp)
+ continue;
+ tmp->direct = entry->direct;
+ }
+ }
+
+ mutex_unlock(&ftrace_lock);
+
+out:
+ /* Removing the tmp_ops will add the updated direct callers to the functions */
+ unregister_ftrace_function(&tmp_ops);
+
+unlock:
+ if (do_direct_lock)
+ mutex_unlock(&direct_mutex);
+ return err;
+}
+
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
@@ -6217,7 +6719,38 @@ static int
ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int reset, int enable)
{
- return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable);
+ char *mod = NULL, *func, *command, *next = buf;
+ char *tmp __free(kfree) = NULL;
+ struct trace_array *tr = ops->private;
+ int ret;
+
+ func = strsep(&next, ":");
+
+ /* This can also handle :mod: parsing */
+ if (next) {
+ if (!tr)
+ return -EINVAL;
+
+ command = strsep(&next, ":");
+ if (strcmp(command, "mod") != 0)
+ return -EINVAL;
+
+ mod = next;
+ len = command - func;
+ /* Save the original func as ftrace_set_hash() can modify it */
+ tmp = kstrdup(func, GFP_KERNEL);
+ }
+
+ ret = ftrace_set_hash(ops, func, len, NULL, 0, 0, reset, enable, mod);
+
+ if (tr && mod && ret < 0) {
+ /* Did tmp fail to allocate? */
+ if (!tmp)
+ return -ENOMEM;
+ ret = cache_mod(tr, tmp, mod, enable);
+ }
+
+ return ret;
}
/**
@@ -6308,7 +6841,8 @@ bool ftrace_filter_param __initdata;
static int __init set_ftrace_notrace(char *str)
{
ftrace_filter_param = true;
- strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+ trace_append_boot_param(ftrace_notrace_buf, str, ',',
+ FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_notrace=", set_ftrace_notrace);
@@ -6316,7 +6850,8 @@ __setup("ftrace_notrace=", set_ftrace_notrace);
static int __init set_ftrace_filter(char *str)
{
ftrace_filter_param = true;
- strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+ trace_append_boot_param(ftrace_filter_buf, str, ',',
+ FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_filter=", set_ftrace_filter);
@@ -6328,14 +6863,16 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
static int __init set_graph_function(char *str)
{
- strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+ trace_append_boot_param(ftrace_graph_buf, str, ',',
+ FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_graph_filter=", set_graph_function);
static int __init set_graph_notrace_function(char *str)
{
- strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
+ trace_append_boot_param(ftrace_graph_notrace_buf, str, ',',
+ FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_graph_notrace=", set_graph_notrace_function);
@@ -6381,6 +6918,14 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
ftrace_ops_init(ops);
+ /* The trace_array is needed for caching module function filters */
+ if (!ops->private) {
+ struct trace_array *tr = trace_get_global_array();
+
+ ops->private = tr;
+ ftrace_init_trace_array(tr);
+ }
+
while (buf) {
func = strsep(&buf, ",");
ftrace_set_regex(ops, func, strlen(func), 0, enable);
@@ -6445,9 +6990,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
iter->hash, filter_hash);
mutex_unlock(&ftrace_lock);
- } else {
- /* For read only, the hash is the ops hash */
- iter->hash = NULL;
}
mutex_unlock(&iter->ops->func_hash->regex_lock);
@@ -6687,7 +7229,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
if (unlikely(ftrace_disabled))
return -ENODEV;
- fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
+ fgd = kmalloc_obj(*fgd);
if (fgd == NULL)
return -ENOMEM;
@@ -6715,7 +7257,7 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file)
if (unlikely(ftrace_disabled))
return -ENODEV;
- fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
+ fgd = kmalloc_obj(*fgd);
if (fgd == NULL)
return -ENOMEM;
@@ -6847,6 +7389,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
}
}
}
+ cond_resched();
} while_for_each_ftrace_rec();
return fail ? -EINVAL : 0;
@@ -7010,6 +7553,7 @@ static int ftrace_process_locs(struct module *mod,
unsigned long *p;
unsigned long addr;
unsigned long flags = 0; /* Shut up gcc */
+ unsigned long pages;
int ret = -ENOMEM;
count = end - start;
@@ -7029,7 +7573,7 @@ static int ftrace_process_locs(struct module *mod,
test_is_sorted(start, count);
}
- start_pg = ftrace_allocate_pages(count);
+ start_pg = ftrace_allocate_pages(count, &pages);
if (!start_pg)
return -ENOMEM;
@@ -7061,7 +7605,9 @@ static int ftrace_process_locs(struct module *mod,
pg = start_pg;
while (p < end) {
unsigned long end_offset;
- addr = ftrace_call_adjust(*p++);
+
+ addr = *p++;
+
/*
* Some architecture linkers will pad between
* the different mcount_loc sections of different
@@ -7073,6 +7619,19 @@ static int ftrace_process_locs(struct module *mod,
continue;
}
+ /*
+ * If this is core kernel, make sure the address is in core
+ * or inittext, as weak functions get zeroed and KASLR can
+ * move them to something other than zero. It just will not
+ * move it to an area where kernel text is.
+ */
+ if (!mod && !(is_kernel_text(addr) || is_kernel_inittext(addr))) {
+ skipped++;
+ continue;
+ }
+
+ addr = ftrace_call_adjust(addr);
+
end_offset = (pg->index+1) * sizeof(pg->records[0]);
if (end_offset > PAGE_SIZE << pg->order) {
/* We should have allocated enough */
@@ -7112,11 +7671,41 @@ static int ftrace_process_locs(struct module *mod,
/* We should have used all pages unless we skipped some */
if (pg_unuse) {
- WARN_ON(!skipped);
+ unsigned long pg_remaining, remaining = 0;
+ long skip;
+
+ /* Count the number of entries unused and compare it to skipped. */
+ pg_remaining = ENTRIES_PER_PAGE_GROUP(pg->order) - pg->index;
+
+ if (!WARN(skipped < pg_remaining, "Extra allocated pages for ftrace")) {
+
+ skip = skipped - pg_remaining;
+
+ for (pg = pg_unuse; pg && skip > 0; pg = pg->next) {
+ remaining += 1 << pg->order;
+ skip -= ENTRIES_PER_PAGE_GROUP(pg->order);
+ }
+
+ pages -= remaining;
+
+ /*
+ * Check to see if the number of pages remaining would
+ * just fit the number of entries skipped.
+ */
+ WARN(pg || skip > 0, "Extra allocated pages for ftrace: %lu with %lu skipped",
+ remaining, skipped);
+ }
/* Need to synchronize with ftrace_location_range() */
synchronize_rcu();
ftrace_free_pages(pg_unuse);
}
+
+ if (!mod) {
+ count -= skipped;
+ pr_info("ftrace: allocating %ld entries in %ld pages\n",
+ count, pages);
+ }
+
return ret;
}
@@ -7287,9 +7876,10 @@ void ftrace_release_mod(struct module *mod)
mutex_lock(&ftrace_lock);
- if (ftrace_disabled)
- goto out_unlock;
-
+ /*
+ * To avoid the UAF problem after the module is unloaded, the
+ * 'mod_map' resource needs to be released unconditionally.
+ */
list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) {
if (mod_map->mod == mod) {
list_del_rcu(&mod_map->list);
@@ -7298,6 +7888,9 @@ void ftrace_release_mod(struct module *mod)
}
}
+ if (ftrace_disabled)
+ goto out_unlock;
+
/*
* Each module has its own ftrace_pages, remove
* them from the list.
@@ -7383,6 +7976,8 @@ void ftrace_module_enable(struct module *mod)
if (!within_module(rec->ip, mod))
break;
+ cond_resched();
+
/* Weak functions should still be ignored */
if (!test_for_valid_rec(rec)) {
/* Clear all other flags. Should not be enabled anyway */
@@ -7452,7 +8047,7 @@ static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
if (!ret)
return;
- mod_func = kmalloc(sizeof(*mod_func), GFP_KERNEL);
+ mod_func = kmalloc_obj(*mod_func);
if (!mod_func)
return;
@@ -7476,7 +8071,10 @@ allocate_ftrace_mod_map(struct module *mod,
{
struct ftrace_mod_map *mod_map;
- mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL);
+ if (ftrace_disabled)
+ return NULL;
+
+ mod_map = kmalloc_obj(*mod_map);
if (!mod_map)
return NULL;
@@ -7521,7 +8119,8 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
int
ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
- unsigned long *off, char **modname, char *sym)
+ unsigned long *off, char **modname,
+ const unsigned char **modbuildid, char *sym)
{
struct ftrace_mod_map *mod_map;
int ret = 0;
@@ -7533,6 +8132,8 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
if (ret) {
if (modname)
*modname = mod_map->mod->name;
+ if (modbuildid)
+ *modbuildid = module_buildid(mod_map->mod);
break;
}
}
@@ -7646,7 +8247,7 @@ static void add_to_clear_hash_list(struct list_head *clear_list,
{
struct ftrace_init_func *func;
- func = kmalloc(sizeof(*func), GFP_KERNEL);
+ func = kmalloc_obj(*func);
if (!func) {
MEM_FAIL(1, "alloc failure, ftrace filter could be stale\n");
return;
@@ -7762,9 +8363,6 @@ void __init ftrace_init(void)
goto failed;
}
- pr_info("ftrace: allocating %ld entries in %ld pages\n",
- count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
-
ret = ftrace_process_locs(NULL,
__start_mcount_loc,
__stop_mcount_loc);
@@ -7814,9 +8412,14 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
void ftrace_init_trace_array(struct trace_array *tr)
{
+ if (tr->flags & TRACE_ARRAY_FL_MOD_INIT)
+ return;
+
INIT_LIST_HEAD(&tr->func_probes);
INIT_LIST_HEAD(&tr->mod_trace);
INIT_LIST_HEAD(&tr->mod_notrace);
+
+ tr->flags |= TRACE_ARRAY_FL_MOD_INIT;
}
#else
@@ -7845,7 +8448,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
__init void ftrace_init_global_array_ops(struct trace_array *tr)
{
tr->ops = &global_ops;
- tr->ops->private = tr;
+ if (!global_ops.private)
+ global_ops.private = tr;
ftrace_init_trace_array(tr);
init_array_fgraph_ops(tr, tr->ops);
}
@@ -8013,6 +8617,7 @@ ftrace_pid_follow_sched_process_fork(void *data,
struct trace_pid_list *pid_list;
struct trace_array *tr = data;
+ guard(preempt)();
pid_list = rcu_dereference_sched(tr->function_pids);
trace_filter_add_remove_task(pid_list, self, task);
@@ -8026,6 +8631,7 @@ ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task)
struct trace_pid_list *pid_list;
struct trace_array *tr = data;
+ guard(preempt)();
pid_list = rcu_dereference_sched(tr->function_pids);
trace_filter_add_remove_task(pid_list, NULL, task);
@@ -8287,7 +8893,7 @@ pid_write(struct file *filp, const char __user *ubuf,
if (!cnt)
return 0;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
switch (type) {
case TRACE_PIDS:
@@ -8303,14 +8909,13 @@ pid_write(struct file *filp, const char __user *ubuf,
lockdep_is_held(&ftrace_lock));
break;
default:
- ret = -EINVAL;
WARN_ON_ONCE(1);
- goto out;
+ return -EINVAL;
}
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
if (ret < 0)
- goto out;
+ return ret;
switch (type) {
case TRACE_PIDS:
@@ -8339,11 +8944,8 @@ pid_write(struct file *filp, const char __user *ubuf,
ftrace_update_pid_func();
ftrace_startup_all(0);
- out:
- mutex_unlock(&ftrace_lock);
- if (ret > 0)
- *ppos += ret;
+ *ppos += ret;
return ret;
}
@@ -8478,7 +9080,7 @@ static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
if (!op->ops_func)
return -EBUSY;
- ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
+ ret = op->ops_func(op, ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
if (ret)
return ret;
}
@@ -8525,7 +9127,7 @@ static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
/* The cleanup is optional, ignore any errors */
if (found_op && op->ops_func)
- op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
+ op->ops_func(op, ip, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
}
}
mutex_unlock(&direct_mutex);
@@ -8669,6 +9271,15 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr)
* @addrs array, which needs to be big enough to store at least @cnt
* addresses.
*
+ * For a single symbol (cnt == 1), uses kallsyms_lookup_name() which
+ * performs an O(log N) binary search via the sorted kallsyms index.
+ * This avoids the full O(N) linear scan over all kernel symbols that
+ * the multi-symbol path requires.
+ *
+ * For multiple symbols, uses a single-pass linear scan via
+ * kallsyms_on_each_symbol() with binary search into the sorted input
+ * array.
+ *
* Returns: 0 if all provided symbols are found, -ESRCH otherwise.
*/
int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs)
@@ -8676,6 +9287,19 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a
struct kallsyms_data args;
int found_all;
+ /* Fast path: single symbol uses O(log N) binary search */
+ if (cnt == 1) {
+ addrs[0] = kallsyms_lookup_name(sorted_syms[0]);
+ if (addrs[0] && ftrace_location(addrs[0]))
+ return 0;
+ /*
+ * Binary lookup can fail for duplicate symbol names
+ * where the first match is not ftrace-instrumented.
+ * Retry with linear scan.
+ */
+ }
+
+ /* Batch path: single-pass O(N) linear scan */
memset(addrs, 0, sizeof(*addrs) * cnt);
args.addrs = addrs;
args.syms = sorted_syms;
@@ -8746,17 +9370,17 @@ static int
ftrace_enable_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret = -ENODEV;
+ int ret;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
if (unlikely(ftrace_disabled))
- goto out;
+ return -ENODEV;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
- goto out;
+ return ret;
if (ftrace_enabled) {
@@ -8770,8 +9394,7 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write,
} else {
if (is_permanent_ops_registered()) {
ftrace_enabled = true;
- ret = -EBUSY;
- goto out;
+ return -EBUSY;
}
/* stopping ftrace calls (just send to ftrace_stub) */
@@ -8781,12 +9404,10 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write,
}
last_ftrace_enabled = !!ftrace_enabled;
- out:
- mutex_unlock(&ftrace_lock);
- return ret;
+ return 0;
}
-static struct ctl_table ftrace_sysctls[] = {
+static const struct ctl_table ftrace_sysctls[] = {
{
.procname = "ftrace_enabled",
.data = &ftrace_enabled,
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 4966e6bbdf6f..30e3fdae6a17 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -3,6 +3,7 @@
* Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org>
*/
#include <linux/spinlock.h>
+#include <linux/seqlock.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include "trace.h"
@@ -81,13 +82,9 @@ static inline bool upper_empty(union upper_chunk *chunk)
{
/*
* If chunk->data has no lower chunks, it will be the same
- * as a zeroed bitmask. Use find_first_bit() to test it
- * and if it doesn't find any bits set, then the array
- * is empty.
+ * as a zeroed bitmask.
*/
- int bit = find_first_bit((unsigned long *)chunk->data,
- sizeof(chunk->data) * 8);
- return bit >= sizeof(chunk->data) * 8;
+ return bitmap_empty((unsigned long *)chunk->data, BITS_PER_TYPE(chunk->data));
}
static inline int pid_split(unsigned int pid, unsigned int *upper1,
@@ -130,7 +127,7 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
{
union upper_chunk *upper_chunk;
union lower_chunk *lower_chunk;
- unsigned long flags;
+ unsigned int seq;
unsigned int upper1;
unsigned int upper2;
unsigned int lower;
@@ -142,14 +139,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
if (pid_split(pid, &upper1, &upper2, &lower) < 0)
return false;
- raw_spin_lock_irqsave(&pid_list->lock, flags);
- upper_chunk = pid_list->upper[upper1];
- if (upper_chunk) {
- lower_chunk = upper_chunk->data[upper2];
- if (lower_chunk)
- ret = test_bit(lower, lower_chunk->data);
- }
- raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ do {
+ seq = read_seqcount_begin(&pid_list->seqcount);
+ ret = false;
+ upper_chunk = pid_list->upper[upper1];
+ if (upper_chunk) {
+ lower_chunk = upper_chunk->data[upper2];
+ if (lower_chunk)
+ ret = test_bit(lower, lower_chunk->data);
+ }
+ } while (read_seqcount_retry(&pid_list->seqcount, seq));
return ret;
}
@@ -182,6 +181,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
return -EINVAL;
raw_spin_lock_irqsave(&pid_list->lock, flags);
+ write_seqcount_begin(&pid_list->seqcount);
upper_chunk = pid_list->upper[upper1];
if (!upper_chunk) {
upper_chunk = get_upper_chunk(pid_list);
@@ -203,6 +203,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
set_bit(lower, lower_chunk->data);
ret = 0;
out:
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock_irqrestore(&pid_list->lock, flags);
return ret;
}
@@ -234,6 +235,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
return -EINVAL;
raw_spin_lock_irqsave(&pid_list->lock, flags);
+ write_seqcount_begin(&pid_list->seqcount);
upper_chunk = pid_list->upper[upper1];
if (!upper_chunk)
goto out;
@@ -254,6 +256,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
}
}
out:
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock_irqrestore(&pid_list->lock, flags);
return 0;
}
@@ -344,8 +347,10 @@ static void pid_list_refill_irq(struct irq_work *iwork)
again:
raw_spin_lock(&pid_list->lock);
+ write_seqcount_begin(&pid_list->seqcount);
upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock(&pid_list->lock);
if (upper_count <= 0 && lower_count <= 0)
@@ -354,7 +359,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
while (upper_count-- > 0) {
union upper_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
+ chunk = kzalloc_obj(*chunk, GFP_NOWAIT);
if (!chunk)
break;
*upper_next = chunk;
@@ -365,7 +370,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
while (lower_count-- > 0) {
union lower_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
+ chunk = kzalloc_obj(*chunk, GFP_NOWAIT);
if (!chunk)
break;
*lower_next = chunk;
@@ -374,6 +379,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
}
raw_spin_lock(&pid_list->lock);
+ write_seqcount_begin(&pid_list->seqcount);
if (upper) {
*upper_next = pid_list->upper_list;
pid_list->upper_list = upper;
@@ -384,6 +390,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
pid_list->lower_list = lower;
pid_list->free_lower_chunks += lcnt;
}
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock(&pid_list->lock);
/*
@@ -414,20 +421,21 @@ struct trace_pid_list *trace_pid_list_alloc(void)
int i;
/* According to linux/thread.h, pids can be no bigger that 30 bits */
- WARN_ON_ONCE(pid_max > (1 << 30));
+ WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
- pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
+ pid_list = kzalloc_obj(*pid_list);
if (!pid_list)
return NULL;
init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
raw_spin_lock_init(&pid_list->lock);
+ seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock);
for (i = 0; i < CHUNK_ALLOC; i++) {
union upper_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ chunk = kzalloc_obj(*chunk);
if (!chunk)
break;
chunk->next = pid_list->upper_list;
@@ -438,7 +446,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
for (i = 0; i < CHUNK_ALLOC; i++) {
union lower_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ chunk = kzalloc_obj(*chunk);
if (!chunk)
break;
chunk->next = pid_list->lower_list;
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
index 62e73f1ac85f..0b45fb0eadb9 100644
--- a/kernel/trace/pid_list.h
+++ b/kernel/trace/pid_list.h
@@ -76,6 +76,7 @@ union upper_chunk {
};
struct trace_pid_list {
+ seqcount_raw_spinlock_t seqcount;
raw_spinlock_t lock;
struct irq_work refill_irqwork;
union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 21bb161c2316..f2fe33573e54 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -17,5 +17,4 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency);
-EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle);
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 314ffc143039..acb0c971a408 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -117,12 +117,15 @@ static int preemptirq_delay_run(void *data)
{
int i;
int s = MIN(burst_size, NR_TEST_FUNCS);
- struct cpumask cpu_mask;
+ cpumask_var_t cpu_mask;
+
+ if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
if (cpu_affinity > -1) {
- cpumask_clear(&cpu_mask);
- cpumask_set_cpu(cpu_affinity, &cpu_mask);
- if (set_cpus_allowed_ptr(current, &cpu_mask))
+ cpumask_clear(cpu_mask);
+ cpumask_set_cpu(cpu_affinity, cpu_mask);
+ if (set_cpus_allowed_ptr(current, cpu_mask))
pr_err("cpu_affinity:%d, failed\n", cpu_affinity);
}
@@ -139,6 +142,8 @@ static int preemptirq_delay_run(void *data)
__set_current_state(TASK_RUNNING);
+ free_cpumask_var(cpu_mask);
+
return 0;
}
diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c
new file mode 100644
index 000000000000..a3e2c9b606eb
--- /dev/null
+++ b/kernel/trace/remote_test.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 - Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/module.h>
+#include <linux/simple_ring_buffer.h>
+#include <linux/trace_remote.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+
+#define REMOTE_EVENT_INCLUDE_FILE kernel/trace/remote_test_events.h
+#include <trace/define_remote_events.h>
+
+static DEFINE_PER_CPU(struct simple_rb_per_cpu *, simple_rbs);
+static struct trace_buffer_desc *remote_test_buffer_desc;
+
+/*
+ * The trace_remote lock already serializes accesses from the trace_remote_callbacks.
+ * However write_event can still race with load/unload.
+ */
+static DEFINE_MUTEX(simple_rbs_lock);
+
+static int remote_test_load_simple_rb(int cpu, struct ring_buffer_desc *rb_desc)
+{
+ struct simple_rb_per_cpu *cpu_buffer;
+ struct simple_buffer_page *bpages;
+ int ret = -ENOMEM;
+
+ cpu_buffer = kmalloc_obj(*cpu_buffer);
+ if (!cpu_buffer)
+ return ret;
+
+ bpages = kmalloc_objs(*bpages, rb_desc->nr_page_va);
+ if (!bpages)
+ goto err_free_cpu_buffer;
+
+ ret = simple_ring_buffer_init(cpu_buffer, bpages, rb_desc);
+ if (ret)
+ goto err_free_bpages;
+
+ scoped_guard(mutex, &simple_rbs_lock) {
+ WARN_ON(*per_cpu_ptr(&simple_rbs, cpu));
+ *per_cpu_ptr(&simple_rbs, cpu) = cpu_buffer;
+ }
+
+ return 0;
+
+err_free_bpages:
+ kfree(bpages);
+
+err_free_cpu_buffer:
+ kfree(cpu_buffer);
+
+ return ret;
+}
+
+static void remote_test_unload_simple_rb(int cpu)
+{
+ struct simple_rb_per_cpu *cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu);
+ struct simple_buffer_page *bpages;
+
+ if (!cpu_buffer)
+ return;
+
+ guard(mutex)(&simple_rbs_lock);
+
+ bpages = cpu_buffer->bpages;
+ simple_ring_buffer_unload(cpu_buffer);
+ kfree(bpages);
+ kfree(cpu_buffer);
+ *per_cpu_ptr(&simple_rbs, cpu) = NULL;
+}
+
+static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unused)
+{
+ struct ring_buffer_desc *rb_desc;
+ struct trace_buffer_desc *desc;
+ size_t desc_size;
+ int cpu, ret;
+
+ if (WARN_ON(remote_test_buffer_desc))
+ return ERR_PTR(-EINVAL);
+
+ desc_size = trace_buffer_desc_size(size, num_possible_cpus());
+ if (desc_size == SIZE_MAX) {
+ ret = -E2BIG;
+ goto err;
+ }
+
+ desc = kmalloc(desc_size, GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = trace_remote_alloc_buffer(desc, desc_size, size, cpu_possible_mask);
+ if (ret)
+ goto err_free_desc;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, desc) {
+ ret = remote_test_load_simple_rb(rb_desc->cpu, rb_desc);
+ if (ret)
+ goto err_unload;
+ }
+
+ remote_test_buffer_desc = desc;
+
+ return remote_test_buffer_desc;
+
+err_unload:
+ for_each_ring_buffer_desc(rb_desc, cpu, desc)
+ remote_test_unload_simple_rb(rb_desc->cpu);
+ trace_remote_free_buffer(desc);
+
+err_free_desc:
+ kfree(desc);
+
+err:
+ return ERR_PTR(ret);
+}
+
+static void remote_test_unload(struct trace_buffer_desc *desc, void *unused)
+{
+ struct ring_buffer_desc *rb_desc;
+ int cpu;
+
+ if (WARN_ON(desc != remote_test_buffer_desc))
+ return;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, desc)
+ remote_test_unload_simple_rb(rb_desc->cpu);
+
+ remote_test_buffer_desc = NULL;
+ trace_remote_free_buffer(desc);
+ kfree(desc);
+}
+
+static int remote_test_enable_tracing(bool enable, void *unused)
+{
+ struct ring_buffer_desc *rb_desc;
+ int cpu;
+
+ if (!remote_test_buffer_desc)
+ return -ENODEV;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc)
+ WARN_ON(simple_ring_buffer_enable_tracing(*per_cpu_ptr(&simple_rbs, rb_desc->cpu),
+ enable));
+ return 0;
+}
+
+static int remote_test_swap_reader_page(unsigned int cpu, void *unused)
+{
+ struct simple_rb_per_cpu *cpu_buffer;
+
+ if (cpu >= NR_CPUS)
+ return -EINVAL;
+
+ cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu);
+ if (!cpu_buffer)
+ return -EINVAL;
+
+ return simple_ring_buffer_swap_reader_page(cpu_buffer);
+}
+
+static int remote_test_reset(unsigned int cpu, void *unused)
+{
+ struct simple_rb_per_cpu *cpu_buffer;
+
+ if (cpu >= NR_CPUS)
+ return -EINVAL;
+
+ cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu);
+ if (!cpu_buffer)
+ return -EINVAL;
+
+ return simple_ring_buffer_reset(cpu_buffer);
+}
+
+static int remote_test_enable_event(unsigned short id, bool enable, void *unused)
+{
+ if (id != REMOTE_TEST_EVENT_ID)
+ return -EINVAL;
+
+ /*
+ * Let's just use the struct remote_event enabled field that is turned on and off by
+ * trace_remote. This is a bit racy but good enough for a simple test module.
+ */
+ return 0;
+}
+
+static ssize_t
+write_event_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *pos)
+{
+ struct remote_event_format_selftest *evt_test;
+ struct simple_rb_per_cpu *cpu_buffer;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&simple_rbs_lock);
+
+ if (!remote_event_selftest.enabled)
+ return -ENODEV;
+
+ guard(preempt)();
+
+ cpu_buffer = *this_cpu_ptr(&simple_rbs);
+ if (!cpu_buffer)
+ return -ENODEV;
+
+ evt_test = simple_ring_buffer_reserve(cpu_buffer,
+ sizeof(struct remote_event_format_selftest),
+ trace_clock_global());
+ if (!evt_test)
+ return -ENODEV;
+
+ evt_test->hdr.id = REMOTE_TEST_EVENT_ID;
+ evt_test->id = val;
+
+ simple_ring_buffer_commit(cpu_buffer);
+
+ return cnt;
+}
+
+static const struct file_operations write_event_fops = {
+ .write = write_event_write,
+};
+
+static int remote_test_init_tracefs(struct dentry *d, void *unused)
+{
+ return tracefs_create_file("write_event", 0200, d, NULL, &write_event_fops) ?
+ 0 : -ENOMEM;
+}
+
+static struct trace_remote_callbacks trace_remote_callbacks = {
+ .init = remote_test_init_tracefs,
+ .load_trace_buffer = remote_test_load,
+ .unload_trace_buffer = remote_test_unload,
+ .enable_tracing = remote_test_enable_tracing,
+ .swap_reader_page = remote_test_swap_reader_page,
+ .reset = remote_test_reset,
+ .enable_event = remote_test_enable_event,
+};
+
+static int __init remote_test_init(void)
+{
+ return trace_remote_register("test", &trace_remote_callbacks, NULL,
+ &remote_event_selftest, 1);
+}
+
+module_init(remote_test_init);
+
+MODULE_DESCRIPTION("Test module for the trace remote interface");
+MODULE_AUTHOR("Vincent Donnefort");
+MODULE_LICENSE("GPL");
diff --git a/kernel/trace/remote_test_events.h b/kernel/trace/remote_test_events.h
new file mode 100644
index 000000000000..26b93b3406fc
--- /dev/null
+++ b/kernel/trace/remote_test_events.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define REMOTE_TEST_EVENT_ID 1
+
+REMOTE_EVENT(selftest, REMOTE_TEST_EVENT_ID,
+ RE_STRUCT(
+ re_field(u64, id)
+ ),
+ RE_PRINTK("id=%llu", __entry->id)
+);
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 30d224946881..5a8bdf88999a 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -108,7 +108,7 @@ struct rethook *rethook_alloc(void *data, rethook_handler_t handler,
if (!handler || num <= 0 || size < sizeof(struct rethook_node))
return ERR_PTR(-EINVAL);
- rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+ rh = kzalloc_obj(struct rethook);
if (!rh)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 60210fb5b211..7b07d2004cc6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,7 +4,10 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/ring_buffer_types.h>
+#include <linux/sched/isolation.h>
#include <linux/trace_recursion.h>
+#include <linux/panic_notifier.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
@@ -29,8 +32,10 @@
#include <linux/oom.h>
#include <linux/mm.h>
+#include <asm/ring_buffer.h>
#include <asm/local64.h>
#include <asm/local.h>
+#include <asm/setup.h>
#include "trace.h"
@@ -48,9 +53,12 @@ static void update_pages_handler(struct work_struct *work);
struct ring_buffer_meta {
int magic;
- int struct_size;
- unsigned long text_addr;
- unsigned long data_addr;
+ int struct_sizes;
+ unsigned long total_size;
+ unsigned long buffers_offset;
+};
+
+struct ring_buffer_cpu_meta {
unsigned long first_buffer;
unsigned long head_buffer;
unsigned long commit_buffer;
@@ -152,23 +160,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
/* Used for individual buffers (after the counter) */
#define RB_BUFFER_OFF (1 << 20)
-#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
-
-#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
-#define RB_ALIGNMENT 4U
-#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
-#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-
-#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
-# define RB_FORCE_8BYTE_ALIGNMENT 0
-# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
-#else
-# define RB_FORCE_8BYTE_ALIGNMENT 1
-# define RB_ARCH_ALIGNMENT 8U
-#endif
-
-#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
-
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -311,10 +302,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define for_each_online_buffer_cpu(buffer, cpu) \
for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
-#define TS_SHIFT 27
-#define TS_MASK ((1ULL << TS_SHIFT) - 1)
-#define TS_DELTA_TEST (~TS_MASK)
-
static u64 rb_event_time_stamp(struct ring_buffer_event *event)
{
u64 ts;
@@ -333,12 +320,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event)
#define RB_MISSED_MASK (3 << 30)
-struct buffer_data_page {
- u64 time_stamp; /* page time stamp */
- local_t commit; /* write committed index */
- unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
-};
-
struct buffer_data_read_page {
unsigned order; /* order of the page */
struct buffer_data_page *data; /* actual data, stored in this page */
@@ -398,11 +379,38 @@ static void free_buffer_page(struct buffer_page *bpage)
}
/*
- * We need to fit the time_stamp delta into 27 bits.
+ * For best performance, allocate cpu buffer data cache line sized
+ * and per CPU.
*/
-static inline bool test_time_stamp(u64 delta)
+#define alloc_cpu_buffer(cpu) (struct ring_buffer_per_cpu *) \
+ kzalloc_node(ALIGN(sizeof(struct ring_buffer_per_cpu), \
+ cache_line_size()), GFP_KERNEL, cpu_to_node(cpu));
+
+#define alloc_cpu_page(cpu) (struct buffer_page *) \
+ kzalloc_node(ALIGN(sizeof(struct buffer_page), \
+ cache_line_size()), GFP_KERNEL, cpu_to_node(cpu));
+
+static struct buffer_data_page *alloc_cpu_data(int cpu, int order)
{
- return !!(delta & TS_DELTA_TEST);
+ struct buffer_data_page *dpage;
+ struct page *page;
+ gfp_t mflags;
+
+ /*
+ * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
+ * gracefully without invoking oom-killer and the system is not
+ * destabilized.
+ */
+ mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_COMP | __GFP_ZERO;
+
+ page = alloc_pages_node(cpu_to_node(cpu), mflags, order);
+ if (!page)
+ return NULL;
+
+ dpage = page_address(page);
+ rb_init_page(dpage);
+
+ return dpage;
}
struct rb_irq_work {
@@ -515,9 +523,11 @@ struct ring_buffer_per_cpu {
unsigned int mapped;
unsigned int user_mapped; /* user space mapping */
struct mutex mapping_lock;
- unsigned long *subbuf_ids; /* ID to subbuf VA */
+ struct buffer_page **subbuf_ids; /* ID to subbuf VA */
struct trace_buffer_meta *meta_page;
- struct ring_buffer_meta *ring_meta;
+ struct ring_buffer_cpu_meta *ring_meta;
+
+ struct ring_buffer_remote *remote;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
@@ -541,6 +551,8 @@ struct trace_buffer {
struct ring_buffer_per_cpu **buffers;
+ struct ring_buffer_remote *remote;
+
struct hlist_node node;
u64 (*clock)(void);
@@ -549,9 +561,9 @@ struct trace_buffer {
unsigned long range_addr_start;
unsigned long range_addr_end;
+ struct notifier_block flush_nb;
- long last_text_delta;
- long last_data_delta;
+ struct ring_buffer_meta *meta;
unsigned int subbuf_size;
unsigned int subbuf_order;
@@ -588,16 +600,17 @@ int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq
(unsigned int)sizeof(field.commit),
(unsigned int)is_signed_type(long));
- trace_seq_printf(s, "\tfield: int overwrite;\t"
+ trace_seq_printf(s, "\tfield: char overwrite;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), commit),
1,
- (unsigned int)is_signed_type(long));
+ (unsigned int)is_signed_type(char));
trace_seq_printf(s, "\tfield: char data;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), data),
- (unsigned int)buffer->subbuf_size,
+ (unsigned int)(buffer ? buffer->subbuf_size :
+ PAGE_SIZE - BUF_PAGE_HDR_SIZE),
(unsigned int)is_signed_type(char));
return !trace_seq_has_overflowed(s);
@@ -1271,7 +1284,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
rb_set_list_to_head(head->list.prev);
if (cpu_buffer->ring_meta) {
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
meta->head_buffer = (unsigned long)head->page;
}
}
@@ -1355,6 +1368,13 @@ static inline void rb_inc_page(struct buffer_page **bpage)
*bpage = list_entry(p, struct buffer_page, list);
}
+static inline void rb_dec_page(struct buffer_page **bpage)
+{
+ struct list_head *p = rb_list_head((*bpage)->list.prev);
+
+ *bpage = list_entry(p, struct buffer_page, list);
+}
+
static struct buffer_page *
rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -1569,7 +1589,7 @@ out_locked:
static unsigned long
rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
{
- addr += sizeof(struct ring_buffer_meta) +
+ addr += sizeof(struct ring_buffer_cpu_meta) +
sizeof(int) * nr_subbufs;
return ALIGN(addr, subbuf_size);
}
@@ -1580,19 +1600,22 @@ rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
{
int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
- unsigned long ptr = buffer->range_addr_start;
- struct ring_buffer_meta *meta;
+ struct ring_buffer_cpu_meta *meta;
+ struct ring_buffer_meta *bmeta;
+ unsigned long ptr;
int nr_subbufs;
- if (!ptr)
+ bmeta = buffer->meta;
+ if (!bmeta)
return NULL;
+ ptr = (unsigned long)bmeta + bmeta->buffers_offset;
+ meta = (struct ring_buffer_cpu_meta *)ptr;
+
/* When nr_pages passed in is zero, the first meta has already been initialized */
if (!nr_pages) {
- meta = (struct ring_buffer_meta *)ptr;
nr_subbufs = meta->nr_subbufs;
} else {
- meta = NULL;
/* Include the reader page */
nr_subbufs = nr_pages + 1;
}
@@ -1624,7 +1647,7 @@ static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
}
/* Return the start of subbufs given the meta pointer */
-static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
+static void *rb_subbufs_from_meta(struct ring_buffer_cpu_meta *meta)
{
int subbuf_size = meta->subbuf_size;
unsigned long ptr;
@@ -1640,7 +1663,7 @@ static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
*/
static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
{
- struct ring_buffer_meta *meta;
+ struct ring_buffer_cpu_meta *meta;
unsigned long ptr;
int subbuf_size;
@@ -1666,13 +1689,77 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
}
/*
+ * See if the existing memory contains a valid meta section.
+ * if so, use that, otherwise initialize it.
+ */
+static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size)
+{
+ unsigned long ptr = buffer->range_addr_start;
+ struct ring_buffer_meta *bmeta;
+ unsigned long total_size;
+ int struct_sizes;
+
+ bmeta = (struct ring_buffer_meta *)ptr;
+ buffer->meta = bmeta;
+
+ total_size = buffer->range_addr_end - buffer->range_addr_start;
+
+ struct_sizes = sizeof(struct ring_buffer_cpu_meta);
+ struct_sizes |= sizeof(*bmeta) << 16;
+
+ /* The first buffer will start word size after the meta page */
+ ptr += sizeof(*bmeta);
+ ptr = ALIGN(ptr, sizeof(long));
+ ptr += scratch_size;
+
+ if (bmeta->magic != RING_BUFFER_META_MAGIC) {
+ pr_info("Ring buffer boot meta mismatch of magic\n");
+ goto init;
+ }
+
+ if (bmeta->struct_sizes != struct_sizes) {
+ pr_info("Ring buffer boot meta mismatch of struct size\n");
+ goto init;
+ }
+
+ if (bmeta->total_size != total_size) {
+ pr_info("Ring buffer boot meta mismatch of total size\n");
+ goto init;
+ }
+
+ if (bmeta->buffers_offset > bmeta->total_size) {
+ pr_info("Ring buffer boot meta mismatch of offset outside of total size\n");
+ goto init;
+ }
+
+ if (bmeta->buffers_offset != (void *)ptr - (void *)bmeta) {
+ pr_info("Ring buffer boot meta mismatch of first buffer offset\n");
+ goto init;
+ }
+
+ return true;
+
+ init:
+ bmeta->magic = RING_BUFFER_META_MAGIC;
+ bmeta->struct_sizes = struct_sizes;
+ bmeta->total_size = total_size;
+ bmeta->buffers_offset = (void *)ptr - (void *)bmeta;
+
+ /* Zero out the scratch pad */
+ memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta));
+
+ return false;
+}
+
+/*
* See if the existing memory contains valid ring buffer data.
* As the previous kernel must be the same as this kernel, all
* the calculations (size of buffers and number of buffers)
* must be the same.
*/
-static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
- struct trace_buffer *buffer, int nr_pages)
+static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
+ struct trace_buffer *buffer, int nr_pages,
+ unsigned long *subbuf_mask)
{
int subbuf_size = PAGE_SIZE;
struct buffer_data_page *subbuf;
@@ -1680,19 +1767,8 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
unsigned long buffers_end;
int i;
- /* Check the meta magic and meta struct size */
- if (meta->magic != RING_BUFFER_META_MAGIC ||
- meta->struct_size != sizeof(*meta)) {
- pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu);
+ if (!subbuf_mask)
return false;
- }
-
- /* The subbuffer's size and number of subbuffers must match */
- if (meta->subbuf_size != subbuf_size ||
- meta->nr_subbufs != nr_pages + 1) {
- pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu);
- return false;
- }
buffers_start = meta->first_buffer;
buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
@@ -1712,6 +1788,8 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
subbuf = rb_subbufs_from_meta(meta);
+ bitmap_clear(subbuf_mask, 0, meta->nr_subbufs);
+
/* Is the meta buffers and the subbufs themselves have correct data? */
for (i = 0; i < meta->nr_subbufs; i++) {
if (meta->buffers[i] < 0 ||
@@ -1725,13 +1803,19 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
return false;
}
+ if (test_bit(meta->buffers[i], subbuf_mask)) {
+ pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu);
+ return false;
+ }
+
+ set_bit(meta->buffers[i], subbuf_mask);
subbuf = (void *)subbuf + subbuf_size;
}
return true;
}
-static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf);
static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu,
unsigned long long *timestamp, u64 *delta_ptr)
@@ -1739,6 +1823,7 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
struct ring_buffer_event *event;
u64 ts, delta;
int events = 0;
+ int len;
int e;
*delta_ptr = 0;
@@ -1746,9 +1831,12 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
ts = dpage->time_stamp;
- for (e = 0; e < tail; e += rb_event_length(event)) {
+ for (e = 0; e < tail; e += len) {
event = (struct ring_buffer_event *)(dpage->data + e);
+ len = rb_event_length(event);
+ if (len <= 0 || len > tail - e)
+ return -1;
switch (event->type_len) {
@@ -1798,38 +1886,133 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
/* If the meta data has been validated, now validate the events */
static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
- struct buffer_page *head_page;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
+ struct buffer_page *head_page, *orig_head, *orig_reader;
unsigned long entry_bytes = 0;
unsigned long entries = 0;
int ret;
+ u64 ts;
int i;
if (!meta || !meta->head_buffer)
return;
+ orig_head = head_page = cpu_buffer->head_page;
+ orig_reader = cpu_buffer->reader_page;
+
/* Do the reader page first */
- ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+ ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu);
if (ret < 0) {
pr_info("Ring buffer reader page is invalid\n");
goto invalid;
}
entries += ret;
- entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
- local_set(&cpu_buffer->reader_page->entries, ret);
+ entry_bytes += local_read(&orig_reader->page->commit);
+ local_set(&orig_reader->entries, ret);
- head_page = cpu_buffer->head_page;
+ ts = head_page->page->time_stamp;
+
+ /*
+ * Try to rewind the head so that we can read the pages which already
+ * read in the previous boot.
+ */
+ if (head_page == cpu_buffer->tail_page)
+ goto skip_rewind;
+
+ rb_dec_page(&head_page);
+ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) {
+
+ /* Rewind until tail (writer) page. */
+ if (head_page == cpu_buffer->tail_page)
+ break;
+
+ /* Ensure the page has older data than head. */
+ if (ts < head_page->page->time_stamp)
+ break;
+
+ ts = head_page->page->time_stamp;
+ /* Ensure the page has correct timestamp and some data. */
+ if (!ts || rb_page_commit(head_page) == 0)
+ break;
+
+ /* Stop rewind if the page is invalid. */
+ ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+ if (ret < 0)
+ break;
+
+ /* Recover the number of entries and update stats. */
+ local_set(&head_page->entries, ret);
+ if (ret)
+ local_inc(&cpu_buffer->pages_touched);
+ entries += ret;
+ entry_bytes += rb_page_commit(head_page);
+ }
+ if (i)
+ pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
- /* If both the head and commit are on the reader_page then we are done. */
- if (head_page == cpu_buffer->reader_page &&
- head_page == cpu_buffer->commit_page)
+ /* The last rewound page must be skipped. */
+ if (head_page != orig_head)
+ rb_inc_page(&head_page);
+
+ /*
+ * If the ring buffer was rewound, then inject the reader page
+ * into the location just before the original head page.
+ */
+ if (head_page != orig_head) {
+ struct buffer_page *bpage = orig_head;
+
+ rb_dec_page(&bpage);
+ /*
+ * Insert the reader_page before the original head page.
+ * Since the list encode RB_PAGE flags, general list
+ * operations should be avoided.
+ */
+ cpu_buffer->reader_page->list.next = &orig_head->list;
+ cpu_buffer->reader_page->list.prev = orig_head->list.prev;
+ orig_head->list.prev = &cpu_buffer->reader_page->list;
+ bpage->list.next = &cpu_buffer->reader_page->list;
+
+ /* Make the head_page the reader page */
+ cpu_buffer->reader_page = head_page;
+ bpage = head_page;
+ rb_inc_page(&head_page);
+ head_page->list.prev = bpage->list.prev;
+ rb_dec_page(&bpage);
+ bpage->list.next = &head_page->list;
+ rb_set_list_to_head(&bpage->list);
+ cpu_buffer->pages = &head_page->list;
+
+ cpu_buffer->head_page = head_page;
+ meta->head_buffer = (unsigned long)head_page->page;
+
+ /* Reset all the indexes */
+ bpage = cpu_buffer->reader_page;
+ meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
+ bpage->id = 0;
+
+ for (i = 1, bpage = head_page; i < meta->nr_subbufs;
+ i++, rb_inc_page(&bpage)) {
+ meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
+ bpage->id = i;
+ }
+
+ /* We'll restart verifying from orig_head */
+ head_page = orig_head;
+ }
+
+ skip_rewind:
+ /* If the commit_buffer is the reader page, update the commit page */
+ if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
+ cpu_buffer->commit_page = cpu_buffer->reader_page;
+ /* Nothing more to do, the only page is the reader page */
goto done;
+ }
/* Iterate until finding the commit page */
for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
- /* Reader page has already been done */
- if (head_page == cpu_buffer->reader_page)
+ /* The original reader page has already been checked/counted. */
+ if (head_page == orig_reader)
continue;
ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
@@ -1838,9 +2021,14 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->cpu);
goto invalid;
}
+
+ /* If the buffer has content, update pages_touched */
+ if (ret)
+ local_inc(&cpu_buffer->pages_touched);
+
entries += ret;
entry_bytes += local_read(&head_page->page->commit);
- local_set(&cpu_buffer->head_page->entries, ret);
+ local_set(&head_page->entries, ret);
if (head_page == cpu_buffer->commit_page)
break;
@@ -1874,40 +2062,35 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
}
}
-/* Used to calculate data delta */
-static char rb_data_ptr[] = "";
-
-#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr)
-#define THIS_DATA_PTR ((unsigned long)rb_data_ptr)
-
-static void rb_meta_init_text_addr(struct ring_buffer_meta *meta)
-{
- meta->text_addr = THIS_TEXT_PTR;
- meta->data_addr = THIS_DATA_PTR;
-}
-
-static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
+static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size)
{
- struct ring_buffer_meta *meta;
+ struct ring_buffer_cpu_meta *meta;
+ unsigned long *subbuf_mask;
unsigned long delta;
void *subbuf;
+ bool valid = false;
int cpu;
int i;
+ /* Create a mask to test the subbuf array */
+ subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL);
+ /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */
+
+ if (rb_meta_init(buffer, scratch_size))
+ valid = true;
+
for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
void *next_meta;
meta = rb_range_meta(buffer, nr_pages, cpu);
- if (rb_meta_valid(meta, cpu, buffer, nr_pages)) {
+ if (valid && rb_cpu_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) {
/* Make the mappings match the current address */
subbuf = rb_subbufs_from_meta(meta);
delta = (unsigned long)subbuf - meta->first_buffer;
meta->first_buffer += delta;
meta->head_buffer += delta;
meta->commit_buffer += delta;
- buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr;
- buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr;
continue;
}
@@ -1918,16 +2101,12 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
memset(meta, 0, next_meta - (void *)meta);
- meta->magic = RING_BUFFER_META_MAGIC;
- meta->struct_size = sizeof(*meta);
-
meta->nr_subbufs = nr_pages + 1;
meta->subbuf_size = PAGE_SIZE;
subbuf = rb_subbufs_from_meta(meta);
meta->first_buffer = (unsigned long)subbuf;
- rb_meta_init_text_addr(meta);
/*
* The buffers[] array holds the order of the sub-buffers
@@ -1943,12 +2122,13 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
subbuf += meta->subbuf_size;
}
}
+ bitmap_free(subbuf_mask);
}
static void *rbm_start(struct seq_file *m, loff_t *pos)
{
struct ring_buffer_per_cpu *cpu_buffer = m->private;
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
unsigned long val;
if (!meta)
@@ -1973,7 +2153,7 @@ static void *rbm_next(struct seq_file *m, void *v, loff_t *pos)
static int rbm_show(struct seq_file *m, void *v)
{
struct ring_buffer_per_cpu *cpu_buffer = m->private;
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
unsigned long val = (unsigned long)v;
if (val == 1) {
@@ -2022,7 +2202,7 @@ int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, in
static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage)
{
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
if (meta->head_buffer == (unsigned long)bpage->page)
cpu_buffer->head_page = bpage;
@@ -2033,14 +2213,48 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
}
}
+static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu)
+{
+ struct ring_buffer_desc *desc, *end;
+ size_t len;
+ int i;
+
+ if (!trace_desc)
+ return NULL;
+
+ if (cpu >= trace_desc->nr_cpus)
+ return NULL;
+
+ end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len);
+ desc = __first_ring_buffer_desc(trace_desc);
+ len = struct_size(desc, page_va, desc->nr_page_va);
+ desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu));
+
+ if (desc < end && desc->cpu == cpu)
+ return desc;
+
+ /* Missing CPUs, need to linear search */
+ for_each_ring_buffer_desc(desc, i, trace_desc) {
+ if (desc->cpu == cpu)
+ return desc;
+ }
+
+ return NULL;
+}
+
+static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, unsigned int page_id)
+{
+ return page_id >= desc->nr_page_va ? NULL : (void *)desc->page_va[page_id];
+}
+
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
{
struct trace_buffer *buffer = cpu_buffer->buffer;
- struct ring_buffer_meta *meta = NULL;
+ struct ring_buffer_cpu_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
- gfp_t mflags;
+ struct ring_buffer_desc *desc = NULL;
long i;
/*
@@ -2055,13 +2269,6 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
return -ENOMEM;
/*
- * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
- * gracefully without invoking oom-killer and the system is not
- * destabilized.
- */
- mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
-
- /*
* If a user thread allocates too much, and si_mem_available()
* reports there's enough memory, even though there is not.
* Make sure the OOM killer kills this thread. This can happen
@@ -2076,11 +2283,15 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
if (buffer->range_addr_start)
meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
+ if (buffer->remote) {
+ desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu);
+ if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1)))
+ return -EINVAL;
+ }
+
for (i = 0; i < nr_pages; i++) {
- struct page *page;
- bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- mflags, cpu_to_node(cpu_buffer->cpu));
+ bpage = alloc_cpu_page(cpu_buffer->cpu);
if (!bpage)
goto free_pages;
@@ -2102,14 +2313,21 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
bpage->id = i + 1;
+ } else if (desc) {
+ void *p = ring_buffer_desc_page(desc, i + 1);
+
+ if (WARN_ON(!p))
+ goto free_pages;
+
+ bpage->page = p;
+ bpage->range = 1; /* bpage->page can't be freed */
+ bpage->id = i + 1;
+ cpu_buffer->subbuf_ids[i + 1] = bpage;
} else {
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
- mflags | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
+ int order = cpu_buffer->buffer->subbuf_order;
+ bpage->page = alloc_cpu_data(cpu_buffer->cpu, order);
+ if (!bpage->page)
goto free_pages;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
}
bpage->order = cpu_buffer->buffer->subbuf_order;
@@ -2160,14 +2378,12 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
- struct ring_buffer_per_cpu *cpu_buffer;
- struct ring_buffer_meta *meta;
+ struct ring_buffer_per_cpu *cpu_buffer __free(kfree) =
+ alloc_cpu_buffer(cpu);
+ struct ring_buffer_cpu_meta *meta;
struct buffer_page *bpage;
- struct page *page;
int ret;
- cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
- GFP_KERNEL, cpu_to_node(cpu));
if (!cpu_buffer)
return NULL;
@@ -2183,10 +2399,9 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
mutex_init(&cpu_buffer->mapping_lock);
- bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- GFP_KERNEL, cpu_to_node(cpu));
+ bpage = alloc_cpu_page(cpu);
if (!bpage)
- goto fail_free_buffer;
+ return NULL;
rb_check_bpage(cpu_buffer, bpage);
@@ -2205,14 +2420,35 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
if (cpu_buffer->ring_meta->head_buffer)
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
+ } else if (buffer->remote) {
+ struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu);
+
+ if (!desc)
+ goto fail_free_reader;
+
+ cpu_buffer->remote = buffer->remote;
+ cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va;
+ cpu_buffer->nr_pages = nr_pages;
+ cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1,
+ sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL);
+ if (!cpu_buffer->subbuf_ids)
+ goto fail_free_reader;
+
+ /* Remote buffers are read-only and immutable */
+ atomic_inc(&cpu_buffer->record_disabled);
+ atomic_inc(&cpu_buffer->resize_disabled);
+
+ bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id);
+ if (!bpage->page)
+ goto fail_free_reader;
+
+ bpage->range = 1;
+ cpu_buffer->subbuf_ids[0] = bpage;
} else {
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
+ int order = cpu_buffer->buffer->subbuf_order;
+ bpage->page = alloc_cpu_data(cpu, order);
+ if (!bpage->page)
goto fail_free_reader;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
}
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -2252,13 +2488,11 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
rb_head_page_activate(cpu_buffer);
}
- return cpu_buffer;
+ return_ptr(cpu_buffer);
fail_free_reader:
free_buffer_page(cpu_buffer->reader_page);
- fail_free_buffer:
- kfree(cpu_buffer);
return NULL;
}
@@ -2269,6 +2503,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
irq_work_sync(&cpu_buffer->irq_work.work);
+ if (cpu_buffer->remote)
+ kfree(cpu_buffer->subbuf_ids);
+
free_buffer_page(cpu_buffer->reader_page);
if (head) {
@@ -2287,12 +2524,24 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
+/* Stop recording on a persistent buffer and flush cache if needed. */
+static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
+{
+ struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb);
+
+ ring_buffer_record_off(buffer);
+ arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);
+ return NOTIFY_DONE;
+}
+
static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long end,
- struct lock_class_key *key)
+ unsigned long scratch_size,
+ struct lock_class_key *key,
+ struct ring_buffer_remote *remote)
{
- struct trace_buffer *buffer;
+ struct trace_buffer *buffer __free(kfree) = NULL;
long nr_pages;
int subbuf_size;
int bsize;
@@ -2306,7 +2555,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
return NULL;
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
- goto fail_free_buffer;
+ return NULL;
buffer->subbuf_order = order;
subbuf_size = (PAGE_SIZE << order);
@@ -2330,12 +2579,27 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;
+ cpu = raw_smp_processor_id();
+
/* If start/end are specified, then that overrides size */
if (start && end) {
+ unsigned long buffers_start;
unsigned long ptr;
int n;
- size = end - start;
+ /* Make sure that start is word aligned */
+ start = ALIGN(start, sizeof(long));
+
+ /* scratch_size needs to be aligned too */
+ scratch_size = ALIGN(scratch_size, sizeof(long));
+
+ /* Subtract the buffer meta data and word aligned */
+ buffers_start = start + sizeof(struct ring_buffer_cpu_meta);
+ buffers_start = ALIGN(buffers_start, sizeof(long));
+ buffers_start += scratch_size;
+
+ /* Calculate the size for the per CPU data */
+ size = end - buffers_start;
size = size / nr_cpu_ids;
/*
@@ -2345,7 +2609,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
* needed, plus account for the integer array index that
* will be appended to the meta data.
*/
- nr_pages = (size - sizeof(struct ring_buffer_meta)) /
+ nr_pages = (size - sizeof(struct ring_buffer_cpu_meta)) /
(subbuf_size + sizeof(int));
/* Need at least two pages plus the reader page */
if (nr_pages < 3)
@@ -2353,8 +2617,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
again:
/* Make sure that the size fits aligned */
- for (n = 0, ptr = start; n < nr_cpu_ids; n++) {
- ptr += sizeof(struct ring_buffer_meta) +
+ for (n = 0, ptr = buffers_start; n < nr_cpu_ids; n++) {
+ ptr += sizeof(struct ring_buffer_cpu_meta) +
sizeof(int) * nr_pages;
ptr = ALIGN(ptr, subbuf_size);
ptr += subbuf_size * nr_pages;
@@ -2371,7 +2635,16 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
buffer->range_addr_start = start;
buffer->range_addr_end = end;
- rb_range_meta_init(buffer, nr_pages);
+ rb_range_meta_init(buffer, nr_pages, scratch_size);
+ } else if (remote) {
+ struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu);
+
+ buffer->remote = remote;
+ /* The writer is remote. This ring-buffer is read-only */
+ atomic_inc(&buffer->record_disabled);
+ nr_pages = desc->nr_page_va - 1;
+ if (nr_pages < 2)
+ goto fail_free_buffers;
} else {
/* need at least two pages */
@@ -2380,7 +2653,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
nr_pages = 2;
}
- cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu])
@@ -2392,7 +2664,13 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
mutex_init(&buffer->mutex);
- return buffer;
+ /* Persistent ring buffer needs to flush cache before reboot. */
+ if (start && end) {
+ buffer->flush_nb.notifier_call = rb_flush_buffer_cb;
+ atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb);
+ }
+
+ return_ptr(buffer);
fail_free_buffers:
for_each_buffer_cpu(buffer, cpu) {
@@ -2404,8 +2682,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
fail_free_cpumask:
free_cpumask_var(buffer->cpumask);
- fail_free_buffer:
- kfree(buffer);
return NULL;
}
@@ -2424,7 +2700,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
struct lock_class_key *key)
{
/* Default buffer page size - one system page */
- return alloc_buffer(size, flags, 0, 0, 0,key);
+ return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL);
}
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
@@ -2436,6 +2712,7 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
* @order: sub-buffer order
* @start: start of allocated range
* @range_size: size of allocated range
+ * @scratch_size: size of scratch area (for preallocated memory buffers)
* @key: ring buffer reader_lock_key.
*
* Currently the only flag that is available is the RB_FL_OVERWRITE
@@ -2446,32 +2723,40 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long range_size,
+ unsigned long scratch_size,
struct lock_class_key *key)
{
- return alloc_buffer(size, flags, order, start, start + range_size, key);
+ return alloc_buffer(size, flags, order, start, start + range_size,
+ scratch_size, key, NULL);
}
/**
- * ring_buffer_last_boot_delta - return the delta offset from last boot
- * @buffer: The buffer to return the delta from
- * @text: Return text delta
- * @data: Return data delta
- *
- * Returns: The true if the delta is non zero
+ * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote
+ * @remote: Contains a description of the ring-buffer pages and remote callbacks.
+ * @key: ring buffer reader_lock_key.
*/
-bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
- long *data)
+struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote,
+ struct lock_class_key *key)
{
- if (!buffer)
- return false;
+ return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote);
+}
- if (!buffer->last_text_delta)
- return false;
+void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size)
+{
+ struct ring_buffer_meta *meta;
+ void *ptr;
- *text = buffer->last_text_delta;
- *data = buffer->last_data_delta;
+ if (!buffer || !buffer->meta)
+ return NULL;
- return true;
+ meta = buffer->meta;
+
+ ptr = (void *)ALIGN((unsigned long)meta + sizeof(*meta), sizeof(long));
+
+ if (size)
+ *size = (void *)meta + meta->buffers_offset - ptr;
+
+ return ptr;
}
/**
@@ -2483,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer)
{
int cpu;
+ if (buffer->range_addr_start && buffer->range_addr_end)
+ atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb);
+
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
irq_work_sync(&buffer->irq_work.work);
@@ -2771,6 +3059,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
if (nr_pages < 2)
nr_pages = 2;
+ /*
+ * Keep CPUs from coming online while resizing to synchronize
+ * with new per CPU buffers being created.
+ */
+ guard(cpus_read_lock)();
+
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
atomic_inc(&buffer->resizing);
@@ -2815,7 +3109,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cond_resched();
}
- cpus_read_lock();
/*
* Fire off all the required work handlers
* We can't schedule on offline CPUs, but it's not necessary
@@ -2855,7 +3148,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cpu_buffer->nr_pages_to_update = 0;
}
- cpus_read_unlock();
} else {
cpu_buffer = buffer->buffers[cpu_id];
@@ -2883,8 +3175,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
goto out_err;
}
- cpus_read_lock();
-
/* Can't run something on an offline CPU. */
if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
@@ -2903,7 +3193,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
cpu_buffer->nr_pages_to_update = 0;
- cpus_read_unlock();
}
out:
@@ -2948,6 +3237,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
list) {
list_del_init(&bpage->list);
free_buffer_page(bpage);
+
+ cond_resched();
}
}
out_err_unlock:
@@ -3082,7 +3373,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
}
/* Return the index into the sub-buffers for a given sub-buffer */
-static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf)
+static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf)
{
void *subbuf_array;
@@ -3094,7 +3385,7 @@ static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf)
static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *next_page)
{
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
unsigned long old_head = (unsigned long)next_page->page;
unsigned long new_head;
@@ -3111,7 +3402,7 @@ static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer,
static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *reader)
{
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
void *old_reader = cpu_buffer->reader_page->page;
void *new_reader = reader->page;
int id;
@@ -3740,7 +4031,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
if (cpu_buffer->ring_meta) {
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page;
}
/* add barrier to keep gcc from optimizing too much */
@@ -3822,19 +4113,36 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer)
rb_end_commit(cpu_buffer);
}
+static bool
+rb_irq_work_queue(struct rb_irq_work *irq_work)
+{
+ int cpu;
+
+ /* irq_work_queue_on() is not NMI-safe */
+ if (unlikely(in_nmi()))
+ return irq_work_queue(&irq_work->work);
+
+ /*
+ * If CPU isolation is not active, cpu is always the current
+ * CPU, and the following is equivallent to irq_work_queue().
+ */
+ cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
+ return irq_work_queue_on(&irq_work->work, cpu);
+}
+
static __always_inline void
rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&buffer->irq_work.work);
+ rb_irq_work_queue(&buffer->irq_work);
}
if (cpu_buffer->irq_work.waiters_pending) {
cpu_buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
+ rb_irq_work_queue(&cpu_buffer->irq_work);
}
if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched))
@@ -3854,7 +4162,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->irq_work.wakeup_full = true;
cpu_buffer->irq_work.full_waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
+ rb_irq_work_queue(&cpu_buffer->irq_work);
}
#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION
@@ -4043,7 +4351,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
static const char *show_irq_str(int bits)
{
- const char *type[] = {
+ static const char * type[] = {
".", // 0
"s", // 1
"h", // 2
@@ -4221,18 +4529,20 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
if (ret < 0) {
if (delta < ts) {
- buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
- cpu_buffer->cpu, ts, delta);
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n",
+ cpu_buffer->cpu, ts, delta,
+ cpu_buffer->buffer->clock);
goto out;
}
}
if ((full && ts > info->ts) ||
(!full && ts + info->delta != info->ts)) {
- buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
+ buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\ntrace clock:%pS",
cpu_buffer->cpu,
ts + info->delta, info->ts, info->delta,
info->before, info->after,
- full ? " (full)" : "", show_interrupt_level());
+ full ? " (full)" : "", show_interrupt_level(),
+ cpu_buffer->buffer->clock);
}
out:
atomic_dec(this_cpu_ptr(&checking));
@@ -4398,8 +4708,13 @@ rb_reserve_next_event(struct trace_buffer *buffer,
int nr_loops = 0;
int add_ts_default;
- /* ring buffer does cmpxchg, make sure it is safe in NMI context */
- if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+ /*
+ * ring buffer does cmpxchg as well as atomic64 operations
+ * (which some archs use locking for atomic64), make sure this
+ * is safe in NMI context
+ */
+ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+ IS_ENABLED(CONFIG_GENERIC_ATOMIC64)) &&
(unlikely(in_nmi()))) {
return NULL;
}
@@ -4601,10 +4916,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
rb_decrement_entry(cpu_buffer, event);
- if (rb_try_to_discard(cpu_buffer, event))
- goto out;
-
- out:
+ rb_try_to_discard(cpu_buffer, event);
rb_end_commit(cpu_buffer);
trace_recursive_unlock(cpu_buffer);
@@ -4637,26 +4949,26 @@ int ring_buffer_write(struct trace_buffer *buffer,
int ret = -EBUSY;
int cpu;
- preempt_disable_notrace();
+ guard(preempt_notrace)();
if (atomic_read(&buffer->record_disabled))
- goto out;
+ return -EBUSY;
cpu = raw_smp_processor_id();
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- goto out;
+ return -EBUSY;
cpu_buffer = buffer->buffers[cpu];
if (atomic_read(&cpu_buffer->record_disabled))
- goto out;
+ return -EBUSY;
if (length > buffer->max_data_size)
- goto out;
+ return -EBUSY;
if (unlikely(trace_recursive_lock(cpu_buffer)))
- goto out;
+ return -EBUSY;
event = rb_reserve_next_event(buffer, cpu_buffer, length);
if (!event)
@@ -4674,48 +4986,26 @@ int ring_buffer_write(struct trace_buffer *buffer,
out_unlock:
trace_recursive_unlock(cpu_buffer);
-
- out:
- preempt_enable_notrace();
-
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_write);
-static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+/*
+ * The total entries in the ring buffer is the running counter
+ * of entries entered into the ring buffer, minus the sum of
+ * the entries read from the ring buffer and the number of
+ * entries that were overwritten.
+ */
+static inline unsigned long
+rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct buffer_page *reader = cpu_buffer->reader_page;
- struct buffer_page *head = rb_set_head_page(cpu_buffer);
- struct buffer_page *commit = cpu_buffer->commit_page;
-
- /* In case of error, head will be NULL */
- if (unlikely(!head))
- return true;
-
- /* Reader should exhaust content in reader page */
- if (reader->read != rb_page_size(reader))
- return false;
-
- /*
- * If writers are committing on the reader page, knowing all
- * committed content has been read, the ring buffer is empty.
- */
- if (commit == reader)
- return true;
-
- /*
- * If writers are committing on a page other than reader page
- * and head page, there should always be content to read.
- */
- if (commit != head)
- return false;
+ return local_read(&cpu_buffer->entries) -
+ (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
+}
- /*
- * Writers are committing on the head page, we just need
- * to care about there're committed data, and the reader will
- * swap reader page with head page when it is to read data.
- */
- return rb_page_commit(commit) == 0;
+static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return !rb_num_of_entries(cpu_buffer);
}
/**
@@ -4820,6 +5110,24 @@ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer)
}
/**
+ * ring_buffer_record_is_on_cpu - return true if the ring buffer can write
+ * @buffer: The ring buffer to see if write is enabled
+ * @cpu: The CPU to test if the ring buffer can write too
+ *
+ * Returns true if the ring buffer is in a state that it accepts writes
+ * for a particular CPU.
+ */
+bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ return ring_buffer_record_is_set_on(buffer) &&
+ !atomic_read(&cpu_buffer->record_disabled);
+}
+
+/**
* ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
* @buffer: The ring buffer to stop writes to.
* @cpu: The CPU buffer to stop
@@ -4861,19 +5169,6 @@ void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
-/*
- * The total entries in the ring buffer is the running counter
- * of entries entered into the ring buffer, minus the sum of
- * the entries read from the ring buffer and the number of
- * entries that were overwritten.
- */
-static inline unsigned long
-rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
-{
- return local_read(&cpu_buffer->entries) -
- (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
-}
-
/**
* ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
* @buffer: The ring buffer
@@ -5075,14 +5370,66 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer)
}
EXPORT_SYMBOL_GPL(ring_buffer_overruns);
+static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries));
+ local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun));
+ local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched));
+ local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost));
+
+ return rb_num_of_entries(cpu_buffer);
+}
+
+static void rb_update_remote_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *next, *orig;
+ int retry = 3;
+
+ orig = next = cpu_buffer->head_page;
+ rb_inc_page(&next);
+
+ /* Run after the writer */
+ while (cpu_buffer->head_page->page->time_stamp > next->page->time_stamp) {
+ rb_inc_page(&next);
+
+ rb_list_head_clear(cpu_buffer->head_page->list.prev);
+ rb_inc_page(&cpu_buffer->head_page);
+ rb_set_list_to_head(cpu_buffer->head_page->list.prev);
+
+ if (cpu_buffer->head_page == orig) {
+ if (WARN_ON_ONCE(!(--retry)))
+ return;
+ }
+ }
+
+ orig = cpu_buffer->commit_page = cpu_buffer->head_page;
+ retry = 3;
+
+ while (cpu_buffer->commit_page->page->time_stamp < next->page->time_stamp) {
+ rb_inc_page(&next);
+ rb_inc_page(&cpu_buffer->commit_page);
+
+ if (cpu_buffer->commit_page == orig) {
+ if (WARN_ON_ONCE(!(--retry)))
+ return;
+ }
+ }
+}
+
static void rb_iter_reset(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ if (cpu_buffer->remote) {
+ rb_read_remote_meta_page(cpu_buffer);
+ rb_update_remote_head(cpu_buffer);
+ }
+
/* Iterator usage is expected to have record disabled */
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
iter->next_event = iter->head;
+ iter->missed_events = 0;
iter->cache_reader_page = iter->head_page;
iter->cache_read = cpu_buffer->read;
@@ -5229,7 +5576,65 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
}
static struct buffer_page *
-rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *new_reader, *prev_reader, *prev_head, *new_head, *last;
+
+ if (!rb_read_remote_meta_page(cpu_buffer))
+ return NULL;
+
+ /* More to read on the reader page */
+ if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) {
+ if (!cpu_buffer->reader_page->read)
+ cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
+ return cpu_buffer->reader_page;
+ }
+
+ prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+
+ WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu,
+ cpu_buffer->remote->priv));
+ /* nr_pages doesn't include the reader page */
+ if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages))
+ return NULL;
+
+ new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+
+ WARN_ON_ONCE(prev_reader == new_reader);
+
+ prev_head = new_reader; /* New reader was also the previous head */
+ new_head = prev_head;
+ rb_inc_page(&new_head);
+ last = prev_head;
+ rb_dec_page(&last);
+
+ /* Clear the old HEAD flag */
+ rb_list_head_clear(cpu_buffer->head_page->list.prev);
+
+ prev_reader->list.next = prev_head->list.next;
+ prev_reader->list.prev = prev_head->list.prev;
+
+ /* Swap prev_reader with new_reader */
+ last->list.next = &prev_reader->list;
+ new_head->list.prev = &prev_reader->list;
+
+ new_reader->list.prev = &new_reader->list;
+ new_reader->list.next = &new_head->list;
+
+ /* Reactivate the HEAD flag */
+ rb_set_list_to_head(&last->list);
+
+ cpu_buffer->head_page = new_head;
+ cpu_buffer->reader_page = new_reader;
+ cpu_buffer->pages = &new_head->list;
+ cpu_buffer->read_stamp = new_reader->page->time_stamp;
+ cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events;
+
+ return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL;
+}
+
+static struct buffer_page *
+__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
@@ -5278,7 +5683,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
*/
local_set(&cpu_buffer->reader_page->write, 0);
local_set(&cpu_buffer->reader_page->entries, 0);
- local_set(&cpu_buffer->reader_page->page->commit, 0);
cpu_buffer->reader_page->real_end = 0;
spin:
@@ -5321,7 +5725,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
* moving it. The page before the header page has the
* flag bit '1' set if it is pointing to the page we want.
* but if the writer is in the process of moving it
- * than it will be '2' or already moved '0'.
+ * then it will be '2' or already moved '0'.
*/
ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
@@ -5400,6 +5804,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
return reader;
}
+static struct buffer_page *
+rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->remote ? __rb_get_reader_page_from_remote(cpu_buffer) :
+ __rb_get_reader_page(cpu_buffer);
+}
+
static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
{
struct ring_buffer_event *event;
@@ -5698,10 +6109,7 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
*/
bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter)
{
- bool ret = iter->missed_events != 0;
-
- iter->missed_events = 0;
- return ret;
+ return iter->missed_events != 0;
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped);
@@ -5782,24 +6190,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
EXPORT_SYMBOL_GPL(ring_buffer_consume);
/**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
* @buffer: The ring buffer to read from
* @cpu: The cpu buffer to iterate over
* @flags: gfp flags to use for memory allocation
*
- * This performs the initial preparations necessary to iterate
- * through the buffer. Memory is allocated, buffer resizing
- * is disabled, and the iterator pointer is returned to the caller.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
*
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
- *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
*/
struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
@@ -5807,7 +6211,7 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
- iter = kzalloc(sizeof(*iter), flags);
+ iter = kzalloc_obj(*iter, flags);
if (!iter)
return NULL;
@@ -5825,51 +6229,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
atomic_inc(&cpu_buffer->resize_disabled);
- return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized. Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
- synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
- struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long flags;
-
- if (!iter)
- return;
-
- cpu_buffer = iter->cpu_buffer;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
arch_spin_lock(&cpu_buffer->lock);
rb_iter_reset(iter);
arch_spin_unlock(&cpu_buffer->lock);
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ return iter;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_start);
@@ -5906,7 +6271,7 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
unsigned long flags;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-
+ iter->missed_events = 0;
rb_advance_iter(iter);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -5950,6 +6315,39 @@ static void rb_clear_buffer_page(struct buffer_page *page)
page->read = 0;
}
+/*
+ * When the buffer is memory mapped to user space, each sub buffer
+ * has a unique id that is used by the meta data to tell the user
+ * where the current reader page is.
+ *
+ * For a normal allocated ring buffer, the id is saved in the buffer page
+ * id field, and updated via this function.
+ *
+ * But for a fixed memory mapped buffer, the id is already assigned for
+ * fixed memory ordering in the memory layout and can not be used. Instead
+ * the index of where the page lies in the memory layout is used.
+ *
+ * For the normal pages, set the buffer page id with the passed in @id
+ * value and return that.
+ *
+ * For fixed memory mapped pages, get the page index in the memory layout
+ * and return that as the id.
+ */
+static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage, int id)
+{
+ /*
+ * For boot buffers, the id is the index,
+ * otherwise, set the buffer page with this id
+ */
+ if (cpu_buffer->ring_meta)
+ id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page);
+ else
+ bpage->id = id;
+
+ return id;
+}
+
static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
@@ -5958,15 +6356,19 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
return;
meta->reader.read = cpu_buffer->reader_page->read;
- meta->reader.id = cpu_buffer->reader_page->id;
+ meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page,
+ cpu_buffer->reader_page->id);
+
meta->reader.lost_events = cpu_buffer->lost_events;
meta->entries = local_read(&cpu_buffer->entries);
meta->overrun = local_read(&cpu_buffer->overrun);
meta->read = cpu_buffer->read;
+ meta->pages_lost = local_read(&cpu_buffer->pages_lost);
+ meta->pages_touched = local_read(&cpu_buffer->pages_touched);
/* Some archs do not have data cache coherency between kernel and user-space */
- flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page));
+ flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE);
}
static void
@@ -5974,6 +6376,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *page;
+ if (cpu_buffer->remote) {
+ if (!cpu_buffer->remote->reset)
+ return;
+
+ cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv);
+ rb_read_remote_meta_page(cpu_buffer);
+
+ /* Read related values, not covered by the meta-page */
+ local_set(&cpu_buffer->pages_read, 0);
+ cpu_buffer->read = 0;
+ cpu_buffer->read_bytes = 0;
+ cpu_buffer->last_overrun = 0;
+ cpu_buffer->reader_page->read = 0;
+
+ return;
+ }
+
rb_head_page_deactivate(cpu_buffer);
cpu_buffer->head_page
@@ -6019,7 +6438,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
if (cpu_buffer->mapped) {
rb_update_meta_page(cpu_buffer);
if (cpu_buffer->ring_meta) {
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
meta->commit_buffer = meta->head_buffer;
}
}
@@ -6028,21 +6447,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
/* Must have disabled the cpu buffer then done a synchronize_rcu */
static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
- goto out;
+ return;
arch_spin_lock(&cpu_buffer->lock);
rb_reset_cpu(cpu_buffer);
arch_spin_unlock(&cpu_buffer->lock);
-
- out:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
/**
@@ -6053,7 +6467,6 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- struct ring_buffer_meta *meta;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
@@ -6072,11 +6485,6 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
atomic_dec(&cpu_buffer->record_disabled);
atomic_dec(&cpu_buffer->resize_disabled);
- /* Make sure persistent meta now uses this buffer's addresses */
- meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
- if (meta)
- rb_meta_init_text_addr(meta);
-
mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
@@ -6091,7 +6499,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct ring_buffer_meta *meta;
int cpu;
/* prevent another thread from changing buffer sizes */
@@ -6119,11 +6526,6 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
reset_disabled_cpu_buffer(cpu_buffer);
- /* Make sure persistent meta now uses this buffer's addresses */
- meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
- if (meta)
- rb_meta_init_text_addr(meta);
-
atomic_dec(&cpu_buffer->record_disabled);
atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
}
@@ -6221,6 +6623,46 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
+int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ if (rb_read_remote_meta_page(cpu_buffer))
+ rb_wakeups(buffer, cpu_buffer);
+
+ return 0;
+ }
+
+ guard(cpus_read_lock)();
+
+ /*
+ * Make sure all the ring buffers are up to date before we start reading
+ * them.
+ */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ rb_read_remote_meta_page(cpu_buffer);
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (rb_num_of_entries(cpu_buffer))
+ rb_wakeups(buffer, cpu_buffer);
+ }
+
+ return 0;
+}
+
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/**
* ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
@@ -6242,37 +6684,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
!cpumask_test_cpu(cpu, buffer_b->cpumask))
- goto out;
+ return -EINVAL;
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
/* It's up to the callers to not try to swap mapped buffers */
- if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) {
- ret = -EBUSY;
- goto out;
- }
+ if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped))
+ return -EBUSY;
/* At least make sure the two buffers are somewhat the same */
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
- goto out;
+ return -EINVAL;
if (buffer_a->subbuf_order != buffer_b->subbuf_order)
- goto out;
-
- ret = -EAGAIN;
+ return -EINVAL;
if (atomic_read(&buffer_a->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&buffer_b->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&cpu_buffer_a->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&cpu_buffer_b->record_disabled))
- goto out;
+ return -EAGAIN;
/*
* We can't do a synchronize_rcu here because this
@@ -6309,7 +6747,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
out_dec:
atomic_dec(&cpu_buffer_a->record_disabled);
atomic_dec(&cpu_buffer_b->record_disabled);
-out:
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
@@ -6337,12 +6774,11 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_read_page *bpage = NULL;
unsigned long flags;
- struct page *page;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
- bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
+ bpage = kzalloc_obj(*bpage);
if (!bpage)
return ERR_PTR(-ENOMEM);
@@ -6359,22 +6795,16 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
- if (bpage->data)
- goto out;
-
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page) {
- kfree(bpage);
- return ERR_PTR(-ENOMEM);
+ if (bpage->data) {
+ rb_init_page(bpage->data);
+ } else {
+ bpage->data = alloc_cpu_data(cpu, cpu_buffer->buffer->subbuf_order);
+ if (!bpage->data) {
+ kfree(bpage);
+ return ERR_PTR(-ENOMEM);
+ }
}
- bpage->data = page_address(page);
-
- out:
- rb_init_page(bpage->data);
-
return bpage;
}
EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
@@ -6468,38 +6898,38 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
struct buffer_data_page *bpage;
struct buffer_page *reader;
unsigned long missed_events;
- unsigned long flags;
unsigned int commit;
unsigned int read;
u64 save_timestamp;
- int ret = -1;
+ bool force_memcpy;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- goto out;
+ return -1;
/*
* If len is not big enough to hold the page header, then
* we can not copy anything.
*/
if (len <= BUF_PAGE_HDR_SIZE)
- goto out;
+ return -1;
len -= BUF_PAGE_HDR_SIZE;
if (!data_page || !data_page->data)
- goto out;
+ return -1;
+
if (data_page->order != buffer->subbuf_order)
- goto out;
+ return -1;
bpage = data_page->data;
if (!bpage)
- goto out;
+ return -1;
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
reader = rb_get_reader_page(cpu_buffer);
if (!reader)
- goto out_unlock;
+ return -1;
event = rb_reader_event(cpu_buffer);
@@ -6509,6 +6939,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
+ force_memcpy = cpu_buffer->mapped || cpu_buffer->remote;
+
/*
* If this page has been partially read or
* if len is not big enough to read the rest of the page or
@@ -6518,7 +6950,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
*/
if (read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page ||
- cpu_buffer->mapped) {
+ force_memcpy) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
@@ -6533,7 +6965,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (full &&
(!read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page))
- goto out_unlock;
+ return -1;
if (len > (commit - read))
len = (commit - read);
@@ -6542,7 +6974,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
size = rb_event_ts_length(event);
if (len < size)
- goto out_unlock;
+ return -1;
/* save the current timestamp, since the user will need it */
save_timestamp = cpu_buffer->read_stamp;
@@ -6600,7 +7032,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (reader->real_end)
local_set(&bpage->commit, reader->real_end);
}
- ret = read;
cpu_buffer->lost_events = 0;
@@ -6627,11 +7058,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (commit < buffer->subbuf_size)
memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
- out_unlock:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
- out:
- return ret;
+ return read;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
@@ -6724,7 +7151,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
old_size = buffer->subbuf_size;
/* prevent another thread from changing buffer sizes */
- mutex_lock(&buffer->mutex);
+ guard(mutex)(&buffer->mutex);
atomic_inc(&buffer->record_disabled);
/* Make sure all commits have finished */
@@ -6829,7 +7256,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
}
atomic_dec(&buffer->record_disabled);
- mutex_unlock(&buffer->mutex);
return 0;
@@ -6838,7 +7264,6 @@ error:
buffer->subbuf_size = old_size;
atomic_dec(&buffer->record_disabled);
- mutex_unlock(&buffer->mutex);
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
@@ -6881,29 +7306,35 @@ static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
}
static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long *subbuf_ids)
+ struct buffer_page **subbuf_ids)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
struct buffer_page *first_subbuf, *subbuf;
+ int cnt = 0;
int id = 0;
- subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
- cpu_buffer->reader_page->id = id++;
+ id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id);
+ subbuf_ids[id++] = cpu_buffer->reader_page;
+ cnt++;
first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
do {
+ id = rb_page_id(cpu_buffer, subbuf, id);
+
if (WARN_ON(id >= nr_subbufs))
break;
- subbuf_ids[id] = (unsigned long)subbuf->page;
- subbuf->id = id;
+ subbuf_ids[id] = subbuf;
rb_inc_page(&subbuf);
id++;
+ cnt++;
} while (subbuf != first_subbuf);
- /* install subbuf ID to kern VA translation */
+ WARN_ON(cnt != nr_subbufs);
+
+ /* install subbuf ID to bpage translation */
cpu_buffer->subbuf_ids = subbuf_ids;
meta->meta_struct_len = sizeof(*meta);
@@ -6994,7 +7425,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
{
unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff;
unsigned int subbuf_pages, subbuf_order;
- struct page **pages;
+ struct page **pages __free(kfree) = NULL;
int p = 0, s = 0;
int err;
@@ -7031,7 +7462,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
nr_pages = nr_vma_pages;
- pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+ pages = kzalloc_objs(*pages, nr_pages);
if (!pages)
return -ENOMEM;
@@ -7059,13 +7490,15 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
}
while (p < nr_pages) {
- struct page *page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+ struct buffer_page *subbuf;
+ struct page *page;
int off = 0;
- if (WARN_ON_ONCE(s >= nr_subbufs)) {
- err = -EINVAL;
- goto out;
- }
+ if (WARN_ON_ONCE(s >= nr_subbufs))
+ return -EINVAL;
+
+ subbuf = cpu_buffer->subbuf_ids[s];
+ page = virt_to_page((void *)subbuf->page);
for (; off < (1 << (subbuf_order)); off++, page++) {
if (p >= nr_pages)
@@ -7078,9 +7511,6 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
-out:
- kfree(pages);
-
return err;
}
#else
@@ -7095,37 +7525,36 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
struct vm_area_struct *vma)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long flags, *subbuf_ids;
- int err = 0;
+ struct buffer_page **subbuf_ids;
+ unsigned long flags;
+ int err;
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote)
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
- mutex_lock(&cpu_buffer->mapping_lock);
+ guard(mutex)(&cpu_buffer->mapping_lock);
if (cpu_buffer->user_mapped) {
err = __rb_map_vma(cpu_buffer, vma);
if (!err)
err = __rb_inc_dec_mapped(cpu_buffer, true);
- mutex_unlock(&cpu_buffer->mapping_lock);
return err;
}
/* prevent another thread from changing buffer/sub-buffer sizes */
- mutex_lock(&buffer->mutex);
+ guard(mutex)(&buffer->mutex);
err = rb_alloc_meta_page(cpu_buffer);
if (err)
- goto unlock;
+ return err;
- /* subbuf_ids include the reader while nr_pages does not */
+ /* subbuf_ids includes the reader while nr_pages does not */
subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
if (!subbuf_ids) {
rb_free_meta_page(cpu_buffer);
- err = -ENOMEM;
- goto unlock;
+ return -ENOMEM;
}
atomic_inc(&cpu_buffer->resize_disabled);
@@ -7150,37 +7579,53 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
kfree(cpu_buffer->subbuf_ids);
cpu_buffer->subbuf_ids = NULL;
rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
}
-unlock:
- mutex_unlock(&buffer->mutex);
- mutex_unlock(&cpu_buffer->mapping_lock);
-
return err;
}
+/*
+ * This is called when a VMA is duplicated (e.g., on fork()) to increment
+ * the user_mapped counter without remapping pages.
+ */
+void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (WARN_ON(!cpumask_test_cpu(cpu, buffer->cpumask)))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(mutex)(&cpu_buffer->mapping_lock);
+
+ if (cpu_buffer->user_mapped)
+ __rb_inc_dec_mapped(cpu_buffer, true);
+ else
+ WARN(1, "Unexpected buffer stat, it should be mapped");
+}
+
int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
- int err = 0;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
- mutex_lock(&cpu_buffer->mapping_lock);
+ guard(mutex)(&cpu_buffer->mapping_lock);
if (!cpu_buffer->user_mapped) {
- err = -ENODEV;
- goto out;
+ return -ENODEV;
} else if (cpu_buffer->user_mapped > 1) {
__rb_inc_dec_mapped(cpu_buffer, false);
- goto out;
+ return 0;
}
- mutex_lock(&buffer->mutex);
+ guard(mutex)(&buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
/* This is the last user space mapping */
@@ -7195,12 +7640,7 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
rb_free_meta_page(cpu_buffer);
atomic_dec(&cpu_buffer->resize_disabled);
- mutex_unlock(&buffer->mutex);
-
-out:
- mutex_unlock(&cpu_buffer->mapping_lock);
-
- return err;
+ return 0;
}
int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
@@ -7234,6 +7674,10 @@ consume:
goto out;
}
+ /* Did the reader catch up with the writer? */
+ if (cpu_buffer->reader_page == cpu_buffer->commit_page)
+ goto out;
+
reader = rb_get_reader_page(cpu_buffer);
if (WARN_ON(!reader))
goto out;
@@ -7241,8 +7685,8 @@ consume:
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
- if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
- if (missed_events) {
+ if (missed_events) {
+ if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
struct buffer_data_page *bpage = reader->page;
unsigned int commit;
/*
@@ -7263,13 +7707,23 @@ consume:
local_add(RB_MISSED_STORED, &bpage->commit);
}
local_add(RB_MISSED_EVENTS, &bpage->commit);
+ } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page,
+ "Reader on commit with %ld missed events",
+ missed_events)) {
+ /*
+ * There shouldn't be any missed events if the tail_page
+ * is on the reader page. But if the tail page is not on the
+ * reader page and the commit_page is, that would mean that
+ * there's a commit_overrun (an interrupt preempted an
+ * addition of an event and then filled the buffer
+ * with new events). In this case it's not an
+ * error, but it should still be reported.
+ *
+ * TODO: Add missed events to the page for user space to know.
+ */
+ pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n",
+ cpu, missed_events, cpu_buffer->reader_page->page->time_stamp);
}
- } else {
- /*
- * There really shouldn't be any missed events if the commit
- * is on the reader page.
- */
- WARN_ON_ONCE(missed_events);
}
cpu_buffer->lost_events = 0;
@@ -7278,7 +7732,8 @@ consume:
out:
/* Some archs do not have data cache coherency between kernel and user-space */
- flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page));
+ flush_kernel_vmap_range(cpu_buffer->reader_page->page,
+ buffer->subbuf_size + BUF_PAGE_HDR_SIZE);
rb_update_meta_page(cpu_buffer);
@@ -7288,6 +7743,12 @@ out:
return 0;
}
+static void rb_cpu_sync(void *data)
+{
+ /* Not really needed, but documents what is happening */
+ smp_rmb();
+}
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
@@ -7326,7 +7787,18 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
cpu);
return -ENOMEM;
}
- smp_wmb();
+
+ /*
+ * Ensure trace_buffer readers observe the newly allocated
+ * ring_buffer_per_cpu before they check the cpumask. Instead of using a
+ * read barrier for all readers, send an IPI.
+ */
+ if (unlikely(system_state == SYSTEM_RUNNING)) {
+ on_each_cpu(rb_cpu_sync, NULL, 1);
+ /* Not really needed, but documents what is happening */
+ smp_wmb();
+ }
+
cpumask_set_cpu(cpu, buffer->cpumask);
return 0;
}
@@ -7411,9 +7883,9 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested)
/* Ignore dropped events before test starts. */
if (started) {
if (nested)
- data->bytes_dropped += len;
- else
data->bytes_dropped_nested += len;
+ else
+ data->bytes_dropped += len;
}
return len;
}
@@ -7535,7 +8007,7 @@ static __init int test_ringbuffer(void)
/*
* Show buffer is enabled before setting rb_test_started.
* Yes there's a small race window where events could be
- * dropped and the thread wont catch it. But when a ring
+ * dropped and the thread won't catch it. But when a ring
* buffer gets enabled, there will always be some kind of
* delay before other CPUs see it. Thus, we don't care about
* those dropped events. We care about events dropped after
@@ -7545,7 +8017,7 @@ static __init int test_ringbuffer(void)
rb_test_started = true;
set_current_state(TASK_INTERRUPTIBLE);
- /* Just run for 10 seconds */;
+ /* Just run for 10 seconds */
schedule_timeout(10 * HZ);
kthread_stop(rb_hammer);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index cdc3aea12c93..593e3b59e42e 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -433,7 +433,7 @@ static int __init ring_buffer_benchmark_init(void)
{
int ret;
- /* make a one meg buffer in overwite mode */
+ /* make a one meg buffer in overwrite mode */
buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
if (!buffer)
return -ENOMEM;
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 831779607e84..3884b14df375 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -1,19 +1,44 @@
# SPDX-License-Identifier: GPL-2.0-only
#
-config DA_MON_EVENTS
+config RV_MON_EVENTS
+ bool
+
+config RV_MON_MAINTENANCE_EVENTS
bool
config DA_MON_EVENTS_IMPLICIT
- select DA_MON_EVENTS
+ select RV_MON_EVENTS
+ select RV_MON_MAINTENANCE_EVENTS
bool
config DA_MON_EVENTS_ID
- select DA_MON_EVENTS
+ select RV_MON_EVENTS
+ select RV_MON_MAINTENANCE_EVENTS
+ bool
+
+config LTL_MON_EVENTS_ID
+ select RV_MON_EVENTS
+ bool
+
+config RV_LTL_MONITOR
+ bool
+
+config RV_HA_MONITOR
+ bool
+
+config HA_MON_EVENTS_IMPLICIT
+ select DA_MON_EVENTS_IMPLICIT
+ select RV_HA_MONITOR
+ bool
+
+config HA_MON_EVENTS_ID
+ select DA_MON_EVENTS_ID
+ select RV_HA_MONITOR
bool
menuconfig RV
bool "Runtime Verification"
- depends on TRACING
+ select TRACING
help
Enable the kernel runtime verification infrastructure. RV is a
lightweight (yet rigorous) method that complements classical
@@ -25,30 +50,40 @@ menuconfig RV
For further information, see:
Documentation/trace/rv/runtime-verification.rst
-config RV_MON_WIP
+config RV_PER_TASK_MONITORS
+ int "Maximum number of per-task monitor"
depends on RV
- depends on PREEMPT_TRACER
- select DA_MON_EVENTS_IMPLICIT
- bool "wip monitor"
+ range 1 8
+ default 2
help
- Enable wip (wakeup in preemptive) sample monitor that illustrates
- the usage of per-cpu monitors, and one limitation of the
- preempt_disable/enable events.
+ This option configures the maximum number of per-task RV monitors that can run
+ simultaneously.
- For further information, see:
- Documentation/trace/rv/monitor_wip.rst
+source "kernel/trace/rv/monitors/wip/Kconfig"
+source "kernel/trace/rv/monitors/wwnr/Kconfig"
-config RV_MON_WWNR
- depends on RV
- select DA_MON_EVENTS_ID
- bool "wwnr monitor"
- help
- Enable wwnr (wakeup while not running) sample monitor, this is a
- sample monitor that illustrates the usage of per-task monitor.
- The model is borken on purpose: it serves to test reactors.
+source "kernel/trace/rv/monitors/sched/Kconfig"
+source "kernel/trace/rv/monitors/sco/Kconfig"
+source "kernel/trace/rv/monitors/snroc/Kconfig"
+source "kernel/trace/rv/monitors/scpd/Kconfig"
+source "kernel/trace/rv/monitors/snep/Kconfig"
+source "kernel/trace/rv/monitors/sts/Kconfig"
+source "kernel/trace/rv/monitors/nrp/Kconfig"
+source "kernel/trace/rv/monitors/sssw/Kconfig"
+source "kernel/trace/rv/monitors/opid/Kconfig"
+# Add new sched monitors here
- For further information, see:
- Documentation/trace/rv/monitor_wwnr.rst
+source "kernel/trace/rv/monitors/rtapp/Kconfig"
+source "kernel/trace/rv/monitors/pagefault/Kconfig"
+source "kernel/trace/rv/monitors/sleep/Kconfig"
+# Add new rtapp monitors here
+
+source "kernel/trace/rv/monitors/stall/Kconfig"
+source "kernel/trace/rv/monitors/deadline/Kconfig"
+source "kernel/trace/rv/monitors/nomiss/Kconfig"
+# Add new deadline monitors here
+
+# Add new monitors here
config RV_REACTORS
bool "Runtime verification reactors"
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 963d14875b45..94498da35b37 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -1,8 +1,26 @@
# SPDX-License-Identifier: GPL-2.0
+ccflags-y += -I $(src) # needed for trace events
+
obj-$(CONFIG_RV) += rv.o
obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
+obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o
+obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o
+obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o
+obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o
+obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o
+obj-$(CONFIG_RV_MON_RTAPP) += monitors/rtapp/rtapp.o
+obj-$(CONFIG_RV_MON_PAGEFAULT) += monitors/pagefault/pagefault.o
+obj-$(CONFIG_RV_MON_SLEEP) += monitors/sleep/sleep.o
+obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o
+obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o
+obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o
+obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o
+obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o
+obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o
+obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o
+# Add new monitors here
obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o
diff --git a/kernel/trace/rv/monitors/deadline/Kconfig b/kernel/trace/rv/monitors/deadline/Kconfig
new file mode 100644
index 000000000000..38804a6ad91d
--- /dev/null
+++ b/kernel/trace/rv/monitors/deadline/Kconfig
@@ -0,0 +1,10 @@
+config RV_MON_DEADLINE
+ depends on RV
+ bool "deadline monitor"
+ help
+ Collection of monitors to check the deadline scheduler and server
+ behave according to specifications. Enable this to enable all
+ scheduler specification supported by the current kernel.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_deadline.rst
diff --git a/kernel/trace/rv/monitors/deadline/deadline.c b/kernel/trace/rv/monitors/deadline/deadline.c
new file mode 100644
index 000000000000..d566d4542ebf
--- /dev/null
+++ b/kernel/trace/rv/monitors/deadline/deadline.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <linux/kallsyms.h>
+
+#define MODULE_NAME "deadline"
+
+#include "deadline.h"
+
+struct rv_monitor rv_deadline = {
+ .name = "deadline",
+ .description = "container for several deadline scheduler specifications.",
+ .enable = NULL,
+ .disable = NULL,
+ .reset = NULL,
+ .enabled = 0,
+};
+
+/* Used by other monitors */
+struct sched_class *rv_ext_sched_class;
+
+static int __init register_deadline(void)
+{
+ if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT)) {
+ rv_ext_sched_class = (void *)kallsyms_lookup_name("ext_sched_class");
+ if (!rv_ext_sched_class)
+ pr_warn("rv: Missing ext_sched_class, monitors may not work.\n");
+ }
+ return rv_register_monitor(&rv_deadline, NULL);
+}
+
+static void __exit unregister_deadline(void)
+{
+ rv_unregister_monitor(&rv_deadline);
+}
+
+module_init(register_deadline);
+module_exit(unregister_deadline);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("deadline: container for several deadline scheduler specifications.");
diff --git a/kernel/trace/rv/monitors/deadline/deadline.h b/kernel/trace/rv/monitors/deadline/deadline.h
new file mode 100644
index 000000000000..78fca873d61e
--- /dev/null
+++ b/kernel/trace/rv/monitors/deadline/deadline.h
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/sched/deadline.h>
+#include <asm/syscall.h>
+#include <uapi/linux/sched/types.h>
+#include <trace/events/sched.h>
+
+/*
+ * Dummy values if not available
+ */
+#ifndef __NR_sched_setscheduler
+#define __NR_sched_setscheduler -__COUNTER__
+#endif
+#ifndef __NR_sched_setattr
+#define __NR_sched_setattr -__COUNTER__
+#endif
+
+extern struct rv_monitor rv_deadline;
+/* Initialised when registering the deadline container */
+extern struct sched_class *rv_ext_sched_class;
+
+/*
+ * If both have dummy values, the syscalls are not supported and we don't even
+ * need to register the handler.
+ */
+static inline bool should_skip_syscall_handle(void)
+{
+ return __NR_sched_setattr < 0 && __NR_sched_setscheduler < 0;
+}
+
+/*
+ * is_supported_type - return true if @type is supported by the deadline monitors
+ */
+static inline bool is_supported_type(u8 type)
+{
+ return type == DL_TASK || type == DL_SERVER_FAIR || type == DL_SERVER_EXT;
+}
+
+/*
+ * is_server_type - return true if @type is a supported server
+ */
+static inline bool is_server_type(u8 type)
+{
+ return is_supported_type(type) && type != DL_TASK;
+}
+
+/*
+ * Use negative numbers for the server.
+ * Currently only one fair server per CPU, may change in the future.
+ */
+#define fair_server_id(cpu) (-cpu)
+#define ext_server_id(cpu) (-cpu - num_possible_cpus())
+#define NO_SERVER_ID (-2 * num_possible_cpus())
+/*
+ * Get a unique id used for dl entities
+ *
+ * The cpu is not required for tasks as the pid is used there, if this function
+ * is called on a dl_se that for sure corresponds to a task, DL_TASK can be
+ * used in place of cpu.
+ * We need the cpu for servers as it is provided in the tracepoint and we
+ * cannot easily retrieve it from the dl_se (requires the struct rq definition).
+ */
+static inline int get_entity_id(struct sched_dl_entity *dl_se, int cpu, u8 type)
+{
+ if (dl_server(dl_se) && type != DL_TASK) {
+ if (type == DL_SERVER_FAIR)
+ return fair_server_id(cpu);
+ if (type == DL_SERVER_EXT)
+ return ext_server_id(cpu);
+ return NO_SERVER_ID;
+ }
+ return dl_task_of(dl_se)->pid;
+}
+
+static inline bool task_is_scx_enabled(struct task_struct *tsk)
+{
+ return IS_ENABLED(CONFIG_SCHED_CLASS_EXT) &&
+ tsk->sched_class == rv_ext_sched_class;
+}
+
+/* Expand id and target as arguments for da functions */
+#define EXPAND_ID(dl_se, cpu, type) get_entity_id(dl_se, cpu, type), dl_se
+#define EXPAND_ID_TASK(tsk) get_entity_id(&tsk->dl, task_cpu(tsk), DL_TASK), &tsk->dl
+
+static inline u8 get_server_type(struct task_struct *tsk)
+{
+ if (tsk->policy == SCHED_NORMAL || tsk->policy == SCHED_EXT ||
+ tsk->policy == SCHED_BATCH || tsk->policy == SCHED_IDLE)
+ return task_is_scx_enabled(tsk) ? DL_SERVER_EXT : DL_SERVER_FAIR;
+ return DL_OTHER;
+}
+
+static inline int extract_params(struct pt_regs *regs, long id, pid_t *pid_out)
+{
+ size_t size = offsetofend(struct sched_attr, sched_flags);
+ struct sched_attr __user *uattr;
+ struct sched_attr attr;
+ int new_policy = -1, ret;
+ unsigned long args[6];
+
+ switch (id) {
+ case __NR_sched_setscheduler:
+ syscall_get_arguments(current, regs, args);
+ *pid_out = args[0];
+ new_policy = args[1];
+ break;
+ case __NR_sched_setattr:
+ syscall_get_arguments(current, regs, args);
+ *pid_out = args[0];
+ uattr = (struct sched_attr __user *)args[1];
+ /*
+ * Just copy up to sched_flags, we are not interested after that
+ */
+ ret = copy_struct_from_user(&attr, size, uattr, size);
+ if (ret)
+ return ret;
+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
+ return -EINVAL;
+ new_policy = attr.sched_policy;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return new_policy & ~SCHED_RESET_ON_FORK;
+}
+
+/* Helper functions requiring DA/HA utilities */
+#ifdef RV_MON_TYPE
+
+/*
+ * get_fair_server - get the fair server associated to a task
+ *
+ * If the task is a boosted task, the server is available in the task_struct,
+ * otherwise grab the dl entity saved for the CPU where the task is enqueued.
+ * This function assumes the task is enqueued somewhere.
+ */
+static inline struct sched_dl_entity *get_server(struct task_struct *tsk, u8 type)
+{
+ if (tsk->dl_server && get_server_type(tsk) == type)
+ return tsk->dl_server;
+ if (type == DL_SERVER_FAIR)
+ return da_get_target_by_id(fair_server_id(task_cpu(tsk)));
+ if (type == DL_SERVER_EXT)
+ return da_get_target_by_id(ext_server_id(task_cpu(tsk)));
+ return NULL;
+}
+
+/*
+ * Initialise monitors for all tasks and pre-allocate the storage for servers.
+ * This is necessary since we don't have access to the servers here and
+ * allocation can cause deadlocks from their tracepoints. We can only fill
+ * pre-initialised storage from there.
+ */
+static inline int init_storage(bool skip_tasks)
+{
+ struct task_struct *g, *p;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (!da_create_empty_storage(fair_server_id(cpu)))
+ goto fail;
+ if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT) &&
+ !da_create_empty_storage(ext_server_id(cpu)))
+ goto fail;
+ }
+
+ if (skip_tasks)
+ return 0;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ if (p->policy == SCHED_DEADLINE) {
+ if (!da_create_storage(EXPAND_ID_TASK(p), NULL)) {
+ read_unlock(&tasklist_lock);
+ goto fail;
+ }
+ }
+ }
+ read_unlock(&tasklist_lock);
+ return 0;
+
+fail:
+ da_monitor_destroy();
+ return -ENOMEM;
+}
+
+static void __maybe_unused handle_newtask(void *data, struct task_struct *task, u64 flags)
+{
+ /* Might be superfluous as tasks are not started with this policy.. */
+ if (task->policy == SCHED_DEADLINE)
+ da_create_storage(EXPAND_ID_TASK(task), NULL);
+}
+
+static void __maybe_unused handle_exit(void *data, struct task_struct *p, bool group_dead)
+{
+ if (p->policy == SCHED_DEADLINE)
+ da_destroy_storage(get_entity_id(&p->dl, DL_TASK, DL_TASK));
+}
+
+#endif
diff --git a/kernel/trace/rv/monitors/nomiss/Kconfig b/kernel/trace/rv/monitors/nomiss/Kconfig
new file mode 100644
index 000000000000..e1886c3a0dd9
--- /dev/null
+++ b/kernel/trace/rv/monitors/nomiss/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_NOMISS
+ depends on RV
+ depends on HAVE_SYSCALL_TRACEPOINTS
+ depends on RV_MON_DEADLINE
+ default y
+ select HA_MON_EVENTS_ID
+ bool "nomiss monitor"
+ help
+ Monitor to ensure dl entities run to completion before their deadiline.
+ This monitor is part of the deadline monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_deadline.rst
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.c b/kernel/trace/rv/monitors/nomiss/nomiss.c
new file mode 100644
index 000000000000..8ead8783c29f
--- /dev/null
+++ b/kernel/trace/rv/monitors/nomiss/nomiss.c
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "nomiss"
+
+#include <uapi/linux/sched/types.h>
+#include <trace/events/syscalls.h>
+#include <trace/events/sched.h>
+#include <trace/events/task.h>
+#include <rv_trace.h>
+
+#define RV_MON_TYPE RV_MON_PER_OBJ
+#define HA_TIMER_TYPE HA_TIMER_WHEEL
+/* The start condition is on sched_switch, it's dangerous to allocate there */
+#define DA_SKIP_AUTO_ALLOC
+typedef struct sched_dl_entity *monitor_target;
+#include "nomiss.h"
+#include <rv/ha_monitor.h>
+#include <monitors/deadline/deadline.h>
+
+/*
+ * User configurable deadline threshold. If the total utilisation of deadline
+ * tasks is larger than 1, they are only guaranteed bounded tardiness. See
+ * Documentation/scheduler/sched-deadline.rst for more details.
+ * The minimum tardiness without sched_feat(HRTICK_DL) is 1 tick to accommodate
+ * for throttle enforced on the next tick.
+ */
+static u64 deadline_thresh = TICK_NSEC;
+module_param(deadline_thresh, ullong, 0644);
+#define DEADLINE_NS(ha_mon) (ha_get_target(ha_mon)->dl_deadline + deadline_thresh)
+
+static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_nomiss env, u64 time_ns)
+{
+ if (env == clk_nomiss)
+ return ha_get_clk_ns(ha_mon, env, time_ns);
+ else if (env == is_constr_dl_nomiss)
+ return !dl_is_implicit(ha_get_target(ha_mon));
+ else if (env == is_defer_nomiss)
+ return ha_get_target(ha_mon)->dl_defer;
+ return ENV_INVALID_VALUE;
+}
+
+static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_nomiss env, u64 time_ns)
+{
+ if (env == clk_nomiss)
+ ha_reset_clk_ns(ha_mon, env, time_ns);
+}
+
+static inline bool ha_verify_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (curr_state == ready_nomiss)
+ return ha_check_invariant_ns(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == running_nomiss)
+ return ha_check_invariant_ns(ha_mon, clk_nomiss, time_ns);
+ return true;
+}
+
+static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (curr_state == next_state)
+ return;
+ if (curr_state == ready_nomiss)
+ ha_inv_to_guard(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns);
+ else if (curr_state == running_nomiss)
+ ha_inv_to_guard(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns);
+}
+
+static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ bool res = true;
+
+ if (curr_state == ready_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == ready_nomiss && event == dl_throttle_nomiss)
+ res = ha_get_env(ha_mon, is_defer_nomiss, time_ns) == 1ull;
+ else if (curr_state == idle_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == running_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == sleeping_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == sleeping_nomiss && event == dl_throttle_nomiss)
+ res = ha_get_env(ha_mon, is_constr_dl_nomiss, time_ns) == 1ull ||
+ ha_get_env(ha_mon, is_defer_nomiss, time_ns) == 1ull;
+ else if (curr_state == throttled_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ return res;
+}
+
+static inline void ha_setup_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (next_state == curr_state && event != dl_replenish_nomiss)
+ return;
+ if (next_state == ready_nomiss)
+ ha_start_timer_ns(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns);
+ else if (next_state == running_nomiss)
+ ha_start_timer_ns(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns);
+ else if (curr_state == ready_nomiss)
+ ha_cancel_timer(ha_mon);
+ else if (curr_state == running_nomiss)
+ ha_cancel_timer(ha_mon);
+}
+
+static bool ha_verify_constraint(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ ha_convert_inv_guard(ha_mon, curr_state, event, next_state, time_ns);
+
+ if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns);
+
+ return true;
+}
+
+static void handle_dl_replenish(void *data, struct sched_dl_entity *dl_se,
+ int cpu, u8 type)
+{
+ if (is_supported_type(type))
+ da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_replenish_nomiss);
+}
+
+static void handle_dl_throttle(void *data, struct sched_dl_entity *dl_se,
+ int cpu, u8 type)
+{
+ if (is_supported_type(type))
+ da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_throttle_nomiss);
+}
+
+static void handle_dl_server_stop(void *data, struct sched_dl_entity *dl_se,
+ int cpu, u8 type)
+{
+ /*
+ * This isn't the standard use of da_handle_start_run_event since this
+ * event cannot only occur from the initial state.
+ * It is fine to use here because it always brings to a known state and
+ * the fact we "pretend" the transition starts from the initial state
+ * has no side effect.
+ */
+ if (is_supported_type(type))
+ da_handle_start_run_event(EXPAND_ID(dl_se, cpu, type), dl_server_stop_nomiss);
+}
+
+static inline void handle_server_switch(struct task_struct *next, int cpu, u8 type)
+{
+ struct sched_dl_entity *dl_se = get_server(next, type);
+
+ if (dl_se && is_idle_task(next))
+ da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_server_idle_nomiss);
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ int cpu = task_cpu(next);
+
+ if (prev_state != TASK_RUNNING && !preempt && prev->policy == SCHED_DEADLINE)
+ da_handle_event(EXPAND_ID_TASK(prev), sched_switch_suspend_nomiss);
+ if (next->policy == SCHED_DEADLINE)
+ da_handle_start_run_event(EXPAND_ID_TASK(next), sched_switch_in_nomiss);
+
+ /*
+ * The server is available in next only if the next task is boosted,
+ * otherwise we need to retrieve it.
+ * Here the server continues in the state running/armed until actually
+ * stopped, this works since we continue expecting a throttle.
+ */
+ if (next->dl_server)
+ da_handle_start_event(EXPAND_ID(next->dl_server, cpu,
+ get_server_type(next)),
+ sched_switch_in_nomiss);
+ else {
+ handle_server_switch(next, cpu, DL_SERVER_FAIR);
+ if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT))
+ handle_server_switch(next, cpu, DL_SERVER_EXT);
+ }
+}
+
+static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
+{
+ struct task_struct *p;
+ int new_policy = -1;
+ pid_t pid = 0;
+
+ new_policy = extract_params(regs, id, &pid);
+ if (new_policy < 0)
+ return;
+ guard(rcu)();
+ p = pid ? find_task_by_vpid(pid) : current;
+ if (unlikely(!p) || new_policy == p->policy)
+ return;
+
+ if (p->policy == SCHED_DEADLINE)
+ da_reset(EXPAND_ID_TASK(p));
+ else if (new_policy == SCHED_DEADLINE)
+ da_create_or_get(EXPAND_ID_TASK(p));
+}
+
+static void handle_sched_wakeup(void *data, struct task_struct *tsk)
+{
+ if (tsk->policy == SCHED_DEADLINE)
+ da_handle_event(EXPAND_ID_TASK(tsk), sched_wakeup_nomiss);
+}
+
+static int enable_nomiss(void)
+{
+ int retval;
+
+ retval = ha_monitor_init();
+ if (retval)
+ return retval;
+
+ retval = init_storage(false);
+ if (retval)
+ return retval;
+ rv_attach_trace_probe("nomiss", sched_dl_replenish_tp, handle_dl_replenish);
+ rv_attach_trace_probe("nomiss", sched_dl_throttle_tp, handle_dl_throttle);
+ rv_attach_trace_probe("nomiss", sched_dl_server_stop_tp, handle_dl_server_stop);
+ rv_attach_trace_probe("nomiss", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup);
+ if (!should_skip_syscall_handle())
+ rv_attach_trace_probe("nomiss", sys_enter, handle_sys_enter);
+ rv_attach_trace_probe("nomiss", task_newtask, handle_newtask);
+ rv_attach_trace_probe("nomiss", sched_process_exit, handle_exit);
+
+ return 0;
+}
+
+static void disable_nomiss(void)
+{
+ rv_this.enabled = 0;
+
+ /* Those are RCU writers, detach earlier hoping to close a bit faster */
+ rv_detach_trace_probe("nomiss", task_newtask, handle_newtask);
+ rv_detach_trace_probe("nomiss", sched_process_exit, handle_exit);
+ if (!should_skip_syscall_handle())
+ rv_detach_trace_probe("nomiss", sys_enter, handle_sys_enter);
+
+ rv_detach_trace_probe("nomiss", sched_dl_replenish_tp, handle_dl_replenish);
+ rv_detach_trace_probe("nomiss", sched_dl_throttle_tp, handle_dl_throttle);
+ rv_detach_trace_probe("nomiss", sched_dl_server_stop_tp, handle_dl_server_stop);
+ rv_detach_trace_probe("nomiss", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup);
+
+ ha_monitor_destroy();
+}
+
+static struct rv_monitor rv_this = {
+ .name = "nomiss",
+ .description = "dl entities run to completion before their deadline.",
+ .enable = enable_nomiss,
+ .disable = disable_nomiss,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_nomiss(void)
+{
+ return rv_register_monitor(&rv_this, &rv_deadline);
+}
+
+static void __exit unregister_nomiss(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_nomiss);
+module_exit(unregister_nomiss);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("nomiss: dl entities run to completion before their deadline.");
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.h b/kernel/trace/rv/monitors/nomiss/nomiss.h
new file mode 100644
index 000000000000..3d1b436194d7
--- /dev/null
+++ b/kernel/trace/rv/monitors/nomiss/nomiss.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of nomiss automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME nomiss
+
+enum states_nomiss {
+ ready_nomiss,
+ idle_nomiss,
+ running_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ state_max_nomiss,
+};
+
+#define INVALID_STATE state_max_nomiss
+
+enum events_nomiss {
+ dl_replenish_nomiss,
+ dl_server_idle_nomiss,
+ dl_server_stop_nomiss,
+ dl_throttle_nomiss,
+ sched_switch_in_nomiss,
+ sched_switch_suspend_nomiss,
+ sched_wakeup_nomiss,
+ event_max_nomiss,
+};
+
+enum envs_nomiss {
+ clk_nomiss,
+ is_constr_dl_nomiss,
+ is_defer_nomiss,
+ env_max_nomiss,
+ env_max_stored_nomiss = is_constr_dl_nomiss,
+};
+
+_Static_assert(env_max_stored_nomiss <= MAX_HA_ENV_LEN, "Not enough slots");
+#define HA_CLK_NS
+
+struct automaton_nomiss {
+ char *state_names[state_max_nomiss];
+ char *event_names[event_max_nomiss];
+ char *env_names[env_max_nomiss];
+ unsigned char function[state_max_nomiss][event_max_nomiss];
+ unsigned char initial_state;
+ bool final_states[state_max_nomiss];
+};
+
+static const struct automaton_nomiss automaton_nomiss = {
+ .state_names = {
+ "ready",
+ "idle",
+ "running",
+ "sleeping",
+ "throttled",
+ },
+ .event_names = {
+ "dl_replenish",
+ "dl_server_idle",
+ "dl_server_stop",
+ "dl_throttle",
+ "sched_switch_in",
+ "sched_switch_suspend",
+ "sched_wakeup",
+ },
+ .env_names = {
+ "clk",
+ "is_constr_dl",
+ "is_defer",
+ },
+ .function = {
+ {
+ ready_nomiss,
+ idle_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ running_nomiss,
+ INVALID_STATE,
+ ready_nomiss,
+ },
+ {
+ ready_nomiss,
+ idle_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ running_nomiss,
+ INVALID_STATE,
+ INVALID_STATE,
+ },
+ {
+ running_nomiss,
+ idle_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ running_nomiss,
+ sleeping_nomiss,
+ running_nomiss,
+ },
+ {
+ ready_nomiss,
+ sleeping_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ running_nomiss,
+ INVALID_STATE,
+ ready_nomiss,
+ },
+ {
+ ready_nomiss,
+ throttled_nomiss,
+ INVALID_STATE,
+ throttled_nomiss,
+ INVALID_STATE,
+ throttled_nomiss,
+ throttled_nomiss,
+ },
+ },
+ .initial_state = ready_nomiss,
+ .final_states = { 1, 0, 0, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss_trace.h b/kernel/trace/rv/monitors/nomiss/nomiss_trace.h
new file mode 100644
index 000000000000..42e7efaca4e7
--- /dev/null
+++ b/kernel/trace/rv/monitors/nomiss/nomiss_trace.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_NOMISS
+DEFINE_EVENT(event_da_monitor_id, event_nomiss,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_nomiss,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+
+DEFINE_EVENT(error_env_da_monitor_id, error_env_nomiss,
+ TP_PROTO(int id, char *state, char *event, char *env),
+ TP_ARGS(id, state, event, env));
+#endif /* CONFIG_RV_MON_NOMISS */
diff --git a/kernel/trace/rv/monitors/nrp/Kconfig b/kernel/trace/rv/monitors/nrp/Kconfig
new file mode 100644
index 000000000000..f5ec08f65535
--- /dev/null
+++ b/kernel/trace/rv/monitors/nrp/Kconfig
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_NRP
+ depends on RV
+ depends on RV_MON_SCHED
+ default y if !ARM64
+ select DA_MON_EVENTS_ID
+ bool "nrp monitor"
+ help
+ Monitor to ensure preemption requires need resched.
+ This monitor is part of the sched monitors collection.
+
+ This monitor is unstable on arm64, say N unless you are testing it.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/nrp/nrp.c b/kernel/trace/rv/monitors/nrp/nrp.c
new file mode 100644
index 000000000000..4b5646a70094
--- /dev/null
+++ b/kernel/trace/rv/monitors/nrp/nrp.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "nrp"
+
+#include <trace/events/irq.h>
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#define RV_MON_TYPE RV_MON_PER_TASK
+#include "nrp.h"
+#include <rv/da_monitor.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/trace/irq_vectors.h>
+
+static void handle_vector_irq_entry(void *data, int vector)
+{
+ da_handle_event(current, irq_entry_nrp);
+}
+
+static void attach_vector_irq(void)
+{
+ rv_attach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_attach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_attach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+static void detach_vector_irq(void)
+{
+ rv_detach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_detach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_detach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+#else
+/* We assume irq_entry tracepoints are sufficient on other architectures */
+static void attach_vector_irq(void) { }
+static void detach_vector_irq(void) { }
+#endif
+
+static void handle_irq_entry(void *data, int irq, struct irqaction *action)
+{
+ da_handle_event(current, irq_entry_nrp);
+}
+
+static void handle_sched_need_resched(void *data, struct task_struct *tsk,
+ int cpu, int tif)
+{
+ /*
+ * Although need_resched leads to both the rescheduling and preempt_irq
+ * states, it is safer to start the monitor always in preempt_irq,
+ * which may not mirror the system state but makes the monitor simpler,
+ */
+ if (tif == TIF_NEED_RESCHED)
+ da_handle_start_event(tsk, sched_need_resched_nrp);
+}
+
+static void handle_schedule_entry(void *data, bool preempt)
+{
+ if (preempt)
+ da_handle_event(current, schedule_entry_preempt_nrp);
+ else
+ da_handle_event(current, schedule_entry_nrp);
+}
+
+static int enable_nrp(void)
+{
+ int retval;
+
+ retval = da_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("nrp", irq_handler_entry, handle_irq_entry);
+ rv_attach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched);
+ rv_attach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry);
+ attach_vector_irq();
+
+ return 0;
+}
+
+static void disable_nrp(void)
+{
+ rv_this.enabled = 0;
+
+ rv_detach_trace_probe("nrp", irq_handler_entry, handle_irq_entry);
+ rv_detach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched);
+ rv_detach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry);
+ detach_vector_irq();
+
+ da_monitor_destroy();
+}
+
+static struct rv_monitor rv_this = {
+ .name = "nrp",
+ .description = "need resched preempts.",
+ .enable = enable_nrp,
+ .disable = disable_nrp,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_nrp(void)
+{
+ return rv_register_monitor(&rv_this, &rv_sched);
+}
+
+static void __exit unregister_nrp(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_nrp);
+module_exit(unregister_nrp);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("nrp: need resched preempts.");
diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h
new file mode 100644
index 000000000000..3270d4c0139f
--- /dev/null
+++ b/kernel/trace/rv/monitors/nrp/nrp.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of nrp automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME nrp
+
+enum states_nrp {
+ preempt_irq_nrp,
+ any_thread_running_nrp,
+ nested_preempt_nrp,
+ rescheduling_nrp,
+ state_max_nrp,
+};
+
+#define INVALID_STATE state_max_nrp
+
+enum events_nrp {
+ irq_entry_nrp,
+ sched_need_resched_nrp,
+ schedule_entry_nrp,
+ schedule_entry_preempt_nrp,
+ event_max_nrp,
+};
+
+struct automaton_nrp {
+ char *state_names[state_max_nrp];
+ char *event_names[event_max_nrp];
+ unsigned char function[state_max_nrp][event_max_nrp];
+ unsigned char initial_state;
+ bool final_states[state_max_nrp];
+};
+
+static const struct automaton_nrp automaton_nrp = {
+ .state_names = {
+ "preempt_irq",
+ "any_thread_running",
+ "nested_preempt",
+ "rescheduling",
+ },
+ .event_names = {
+ "irq_entry",
+ "sched_need_resched",
+ "schedule_entry",
+ "schedule_entry_preempt",
+ },
+ .function = {
+ {
+ preempt_irq_nrp,
+ preempt_irq_nrp,
+ nested_preempt_nrp,
+ nested_preempt_nrp,
+ },
+ {
+ any_thread_running_nrp,
+ rescheduling_nrp,
+ any_thread_running_nrp,
+ INVALID_STATE,
+ },
+ {
+ nested_preempt_nrp,
+ preempt_irq_nrp,
+ any_thread_running_nrp,
+ any_thread_running_nrp,
+ },
+ {
+ preempt_irq_nrp,
+ rescheduling_nrp,
+ any_thread_running_nrp,
+ any_thread_running_nrp,
+ },
+ },
+ .initial_state = preempt_irq_nrp,
+ .final_states = { 0, 1, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/nrp/nrp_trace.h b/kernel/trace/rv/monitors/nrp/nrp_trace.h
new file mode 100644
index 000000000000..2e13497de3b6
--- /dev/null
+++ b/kernel/trace/rv/monitors/nrp/nrp_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_NRP
+DEFINE_EVENT(event_da_monitor_id, event_nrp,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_nrp,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_NRP */
diff --git a/kernel/trace/rv/monitors/opid/Kconfig b/kernel/trace/rv/monitors/opid/Kconfig
new file mode 100644
index 000000000000..6d02e239b684
--- /dev/null
+++ b/kernel/trace/rv/monitors/opid/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_OPID
+ depends on RV
+ depends on RV_MON_SCHED
+ default y
+ select HA_MON_EVENTS_IMPLICIT
+ bool "opid monitor"
+ help
+ Monitor to ensure operations like wakeup and need resched occur with
+ interrupts and preemption disabled.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c
new file mode 100644
index 000000000000..3b6a85e815b8
--- /dev/null
+++ b/kernel/trace/rv/monitors/opid/opid.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "opid"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#define RV_MON_TYPE RV_MON_PER_CPU
+#include "opid.h"
+#include <rv/ha_monitor.h>
+
+static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_opid env, u64 time_ns)
+{
+ if (env == irq_off_opid)
+ return irqs_disabled();
+ else if (env == preempt_off_opid) {
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ return (preempt_count() & PREEMPT_MASK) > 0;
+ return true;
+ }
+ return ENV_INVALID_VALUE;
+}
+
+static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ bool res = true;
+
+ if (curr_state == any_opid && event == sched_need_resched_opid)
+ res = ha_get_env(ha_mon, irq_off_opid, time_ns) == 1ull;
+ else if (curr_state == any_opid && event == sched_waking_opid)
+ res = ha_get_env(ha_mon, irq_off_opid, time_ns) == 1ull &&
+ ha_get_env(ha_mon, preempt_off_opid, time_ns) == 1ull;
+ return res;
+}
+
+static bool ha_verify_constraint(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ return true;
+}
+
+static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif)
+{
+ da_handle_start_run_event(sched_need_resched_opid);
+}
+
+static void handle_sched_waking(void *data, struct task_struct *p)
+{
+ da_handle_start_run_event(sched_waking_opid);
+}
+
+static int enable_opid(void)
+{
+ int retval;
+
+ retval = ha_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
+ rv_attach_trace_probe("opid", sched_waking, handle_sched_waking);
+
+ return 0;
+}
+
+static void disable_opid(void)
+{
+ rv_this.enabled = 0;
+
+ rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
+ rv_detach_trace_probe("opid", sched_waking, handle_sched_waking);
+
+ ha_monitor_destroy();
+}
+
+/*
+ * This is the monitor register section.
+ */
+static struct rv_monitor rv_this = {
+ .name = "opid",
+ .description = "operations with preemption and irq disabled.",
+ .enable = enable_opid,
+ .disable = disable_opid,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_opid(void)
+{
+ return rv_register_monitor(&rv_this, &rv_sched);
+}
+
+static void __exit unregister_opid(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_opid);
+module_exit(unregister_opid);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("opid: operations with preemption and irq disabled.");
diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h
new file mode 100644
index 000000000000..fb0aa4c28aa6
--- /dev/null
+++ b/kernel/trace/rv/monitors/opid/opid.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of opid automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME opid
+
+enum states_opid {
+ any_opid,
+ state_max_opid,
+};
+
+#define INVALID_STATE state_max_opid
+
+enum events_opid {
+ sched_need_resched_opid,
+ sched_waking_opid,
+ event_max_opid,
+};
+
+enum envs_opid {
+ irq_off_opid,
+ preempt_off_opid,
+ env_max_opid,
+ env_max_stored_opid = irq_off_opid,
+};
+
+_Static_assert(env_max_stored_opid <= MAX_HA_ENV_LEN, "Not enough slots");
+
+struct automaton_opid {
+ char *state_names[state_max_opid];
+ char *event_names[event_max_opid];
+ char *env_names[env_max_opid];
+ unsigned char function[state_max_opid][event_max_opid];
+ unsigned char initial_state;
+ bool final_states[state_max_opid];
+};
+
+static const struct automaton_opid automaton_opid = {
+ .state_names = {
+ "any",
+ },
+ .event_names = {
+ "sched_need_resched",
+ "sched_waking",
+ },
+ .env_names = {
+ "irq_off",
+ "preempt_off",
+ },
+ .function = {
+ { any_opid, any_opid },
+ },
+ .initial_state = any_opid,
+ .final_states = { 1 },
+};
diff --git a/kernel/trace/rv/monitors/opid/opid_trace.h b/kernel/trace/rv/monitors/opid/opid_trace.h
new file mode 100644
index 000000000000..b04005b64208
--- /dev/null
+++ b/kernel/trace/rv/monitors/opid/opid_trace.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_OPID
+DEFINE_EVENT(event_da_monitor, event_opid,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_opid,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+
+DEFINE_EVENT(error_env_da_monitor, error_env_opid,
+ TP_PROTO(char *state, char *event, char *env),
+ TP_ARGS(state, event, env));
+#endif /* CONFIG_RV_MON_OPID */
diff --git a/kernel/trace/rv/monitors/pagefault/Kconfig b/kernel/trace/rv/monitors/pagefault/Kconfig
new file mode 100644
index 000000000000..0e013f00c33b
--- /dev/null
+++ b/kernel/trace/rv/monitors/pagefault/Kconfig
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_PAGEFAULT
+ depends on RV
+ select RV_LTL_MONITOR
+ depends on RV_MON_RTAPP
+ depends on X86 || RISCV
+ depends on MMU
+ default y
+ select LTL_MON_EVENTS_ID
+ bool "pagefault monitor"
+ help
+ Monitor that real-time tasks do not raise page faults, causing
+ undesirable latency.
+
+ If you are developing a real-time system and not entirely sure whether
+ the applications are designed correctly for real-time, you want to say
+ Y here.
+
+ This monitor does not affect execution speed while it is not running,
+ therefore it is safe to enable this in production kernel.
diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.c b/kernel/trace/rv/monitors/pagefault/pagefault.c
new file mode 100644
index 000000000000..9fe6123b2200
--- /dev/null
+++ b/kernel/trace/rv/monitors/pagefault/pagefault.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rv.h>
+#include <linux/sched/deadline.h>
+#include <linux/sched/rt.h>
+#include <linux/tracepoint.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "pagefault"
+
+#include <rv_trace.h>
+#include <trace/events/exceptions.h>
+#include <monitors/rtapp/rtapp.h>
+
+#include "pagefault.h"
+#include <rv/ltl_monitor.h>
+
+static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon)
+{
+ /*
+ * This includes "actual" real-time tasks and also PI-boosted
+ * tasks. A task being PI-boosted means it is blocking an "actual"
+ * real-task, therefore it should also obey the monitor's rule,
+ * otherwise the "actual" real-task may be delayed.
+ */
+ ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task));
+}
+
+static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation)
+{
+ if (task_creation)
+ ltl_atom_set(mon, LTL_PAGEFAULT, false);
+}
+
+static void handle_page_fault(void *data, unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
+{
+ ltl_atom_pulse(current, LTL_PAGEFAULT, true);
+}
+
+static int enable_pagefault(void)
+{
+ int retval;
+
+ retval = ltl_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault);
+ rv_attach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault);
+
+ return 0;
+}
+
+static void disable_pagefault(void)
+{
+ rv_detach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault);
+ rv_detach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault);
+
+ ltl_monitor_destroy();
+}
+
+static struct rv_monitor rv_pagefault = {
+ .name = "pagefault",
+ .description = "Monitor that RT tasks do not raise page faults",
+ .enable = enable_pagefault,
+ .disable = disable_pagefault,
+};
+
+static int __init register_pagefault(void)
+{
+ return rv_register_monitor(&rv_pagefault, &rv_rtapp);
+}
+
+static void __exit unregister_pagefault(void)
+{
+ rv_unregister_monitor(&rv_pagefault);
+}
+
+module_init(register_pagefault);
+module_exit(unregister_pagefault);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
+MODULE_DESCRIPTION("pagefault: Monitor that RT tasks do not raise page faults");
diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.h b/kernel/trace/rv/monitors/pagefault/pagefault.h
new file mode 100644
index 000000000000..c580ec194009
--- /dev/null
+++ b/kernel/trace/rv/monitors/pagefault/pagefault.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * C implementation of Buchi automaton, automatically generated by
+ * tools/verification/rvgen from the linear temporal logic specification.
+ * For further information, see kernel documentation:
+ * Documentation/trace/rv/linear_temporal_logic.rst
+ */
+
+#include <linux/rv.h>
+
+#define MONITOR_NAME pagefault
+
+enum ltl_atom {
+ LTL_PAGEFAULT,
+ LTL_RT,
+ LTL_NUM_ATOM
+};
+static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM);
+
+static const char *ltl_atom_str(enum ltl_atom atom)
+{
+ static const char *const names[] = {
+ "pa",
+ "rt",
+ };
+
+ return names[atom];
+}
+
+enum ltl_buchi_state {
+ S0,
+ RV_NUM_BA_STATES
+};
+static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
+
+static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
+{
+ bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms);
+ bool val3 = !pagefault;
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool val1 = !rt;
+ bool val4 = val1 || val3;
+
+ if (val4)
+ __set_bit(S0, mon->states);
+}
+
+static void
+ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next)
+{
+ bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms);
+ bool val3 = !pagefault;
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool val1 = !rt;
+ bool val4 = val1 || val3;
+
+ switch (state) {
+ case S0:
+ if (val4)
+ __set_bit(S0, next);
+ break;
+ }
+}
diff --git a/kernel/trace/rv/monitors/pagefault/pagefault_trace.h b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h
new file mode 100644
index 000000000000..fe1f82597b1a
--- /dev/null
+++ b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_PAGEFAULT
+DEFINE_EVENT(event_ltl_monitor_id, event_pagefault,
+ TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
+ TP_ARGS(task, states, atoms, next));
+DEFINE_EVENT(error_ltl_monitor_id, error_pagefault,
+ TP_PROTO(struct task_struct *task),
+ TP_ARGS(task));
+#endif /* CONFIG_RV_MON_PAGEFAULT */
diff --git a/kernel/trace/rv/monitors/rtapp/Kconfig b/kernel/trace/rv/monitors/rtapp/Kconfig
new file mode 100644
index 000000000000..1ce9370a9ba8
--- /dev/null
+++ b/kernel/trace/rv/monitors/rtapp/Kconfig
@@ -0,0 +1,11 @@
+config RV_MON_RTAPP
+ depends on RV
+ depends on RV_PER_TASK_MONITORS >= 2
+ bool "rtapp monitor"
+ help
+ Collection of monitors to check for common problems with real-time
+ application that may cause unexpected latency.
+
+ If you are developing a real-time system and not entirely sure whether
+ the applications are designed correctly for real-time, you want to say
+ Y here.
diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c
new file mode 100644
index 000000000000..17f271231c99
--- /dev/null
+++ b/kernel/trace/rv/monitors/rtapp/rtapp.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+
+#define MODULE_NAME "rtapp"
+
+#include "rtapp.h"
+
+struct rv_monitor rv_rtapp = {
+ .name = "rtapp",
+ .description = "Collection of monitors for detecting problems with real-time applications",
+};
+
+static int __init register_rtapp(void)
+{
+ return rv_register_monitor(&rv_rtapp, NULL);
+}
+
+static void __exit unregister_rtapp(void)
+{
+ rv_unregister_monitor(&rv_rtapp);
+}
+
+module_init(register_rtapp);
+module_exit(unregister_rtapp);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
+MODULE_DESCRIPTION("Collection of monitors for detecting problems with real-time applications");
diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.h b/kernel/trace/rv/monitors/rtapp/rtapp.h
new file mode 100644
index 000000000000..4c200d67c7f6
--- /dev/null
+++ b/kernel/trace/rv/monitors/rtapp/rtapp.h
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+extern struct rv_monitor rv_rtapp;
diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig
new file mode 100644
index 000000000000..aa16456da864
--- /dev/null
+++ b/kernel/trace/rv/monitors/sched/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SCHED
+ depends on RV
+ depends on RV_PER_TASK_MONITORS >= 3
+ bool "sched monitor"
+ help
+ Collection of monitors to check the scheduler behaves according to specifications.
+ Enable this to enable all scheduler specification supported by the current kernel.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c
new file mode 100644
index 000000000000..dd9d96fc6e21
--- /dev/null
+++ b/kernel/trace/rv/monitors/sched/sched.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+
+#define MODULE_NAME "sched"
+
+#include "sched.h"
+
+struct rv_monitor rv_sched = {
+ .name = "sched",
+ .description = "container for several scheduler monitor specifications.",
+ .enable = NULL,
+ .disable = NULL,
+ .reset = NULL,
+ .enabled = 0,
+};
+
+static int __init register_sched(void)
+{
+ return rv_register_monitor(&rv_sched, NULL);
+}
+
+static void __exit unregister_sched(void)
+{
+ rv_unregister_monitor(&rv_sched);
+}
+
+module_init(register_sched);
+module_exit(unregister_sched);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sched: container for several scheduler monitor specifications.");
diff --git a/kernel/trace/rv/monitors/sched/sched.h b/kernel/trace/rv/monitors/sched/sched.h
new file mode 100644
index 000000000000..ba148dd8d48b
--- /dev/null
+++ b/kernel/trace/rv/monitors/sched/sched.h
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+extern struct rv_monitor rv_sched;
diff --git a/kernel/trace/rv/monitors/sco/Kconfig b/kernel/trace/rv/monitors/sco/Kconfig
new file mode 100644
index 000000000000..097c96cccdd7
--- /dev/null
+++ b/kernel/trace/rv/monitors/sco/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SCO
+ depends on RV
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "sco monitor"
+ help
+ Monitor to ensure sched_set_state happens only in thread context.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c
new file mode 100644
index 000000000000..5a3bd5e16e62
--- /dev/null
+++ b/kernel/trace/rv/monitors/sco/sco.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "sco"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#define RV_MON_TYPE RV_MON_PER_CPU
+#include "sco.h"
+#include <rv/da_monitor.h>
+
+static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
+{
+ da_handle_start_event(sched_set_state_sco);
+}
+
+static void handle_schedule_entry(void *data, bool preempt)
+{
+ da_handle_event(schedule_entry_sco);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch)
+{
+ da_handle_start_event(schedule_exit_sco);
+}
+
+static int enable_sco(void)
+{
+ int retval;
+
+ retval = da_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state);
+ rv_attach_trace_probe("sco", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("sco", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_sco(void)
+{
+ rv_this.enabled = 0;
+
+ rv_detach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state);
+ rv_detach_trace_probe("sco", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("sco", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy();
+}
+
+static struct rv_monitor rv_this = {
+ .name = "sco",
+ .description = "scheduling context operations.",
+ .enable = enable_sco,
+ .disable = disable_sco,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_sco(void)
+{
+ return rv_register_monitor(&rv_this, &rv_sched);
+}
+
+static void __exit unregister_sco(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_sco);
+module_exit(unregister_sco);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sco: scheduling context operations.");
diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h
new file mode 100644
index 000000000000..bac3beb51e72
--- /dev/null
+++ b/kernel/trace/rv/monitors/sco/sco.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of sco automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME sco
+
+enum states_sco {
+ thread_context_sco,
+ scheduling_context_sco,
+ state_max_sco,
+};
+
+#define INVALID_STATE state_max_sco
+
+enum events_sco {
+ sched_set_state_sco,
+ schedule_entry_sco,
+ schedule_exit_sco,
+ event_max_sco,
+};
+
+struct automaton_sco {
+ char *state_names[state_max_sco];
+ char *event_names[event_max_sco];
+ unsigned char function[state_max_sco][event_max_sco];
+ unsigned char initial_state;
+ bool final_states[state_max_sco];
+};
+
+static const struct automaton_sco automaton_sco = {
+ .state_names = {
+ "thread_context",
+ "scheduling_context",
+ },
+ .event_names = {
+ "sched_set_state",
+ "schedule_entry",
+ "schedule_exit",
+ },
+ .function = {
+ { thread_context_sco, scheduling_context_sco, INVALID_STATE },
+ { INVALID_STATE, INVALID_STATE, thread_context_sco },
+ },
+ .initial_state = thread_context_sco,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/sco/sco_trace.h b/kernel/trace/rv/monitors/sco/sco_trace.h
new file mode 100644
index 000000000000..b711cd9024ec
--- /dev/null
+++ b/kernel/trace/rv/monitors/sco/sco_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SCO
+DEFINE_EVENT(event_da_monitor, event_sco,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_sco,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_SCO */
diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig
new file mode 100644
index 000000000000..682d0416188b
--- /dev/null
+++ b/kernel/trace/rv/monitors/scpd/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SCPD
+ depends on RV
+ depends on TRACE_PREEMPT_TOGGLE
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "scpd monitor"
+ help
+ Monitor to ensure schedule is called with preemption disabled.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c
new file mode 100644
index 000000000000..83b48627dc9f
--- /dev/null
+++ b/kernel/trace/rv/monitors/scpd/scpd.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "scpd"
+
+#include <trace/events/sched.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#define RV_MON_TYPE RV_MON_PER_CPU
+#include "scpd.h"
+#include <rv/da_monitor.h>
+
+static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event(preempt_disable_scpd);
+}
+
+static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event(preempt_enable_scpd);
+}
+
+static void handle_schedule_entry(void *data, bool preempt)
+{
+ da_handle_event(schedule_entry_scpd);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch)
+{
+ da_handle_event(schedule_exit_scpd);
+}
+
+static int enable_scpd(void)
+{
+ int retval;
+
+ retval = da_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("scpd", preempt_disable, handle_preempt_disable);
+ rv_attach_trace_probe("scpd", preempt_enable, handle_preempt_enable);
+ rv_attach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_scpd(void)
+{
+ rv_this.enabled = 0;
+
+ rv_detach_trace_probe("scpd", preempt_disable, handle_preempt_disable);
+ rv_detach_trace_probe("scpd", preempt_enable, handle_preempt_enable);
+ rv_detach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy();
+}
+
+static struct rv_monitor rv_this = {
+ .name = "scpd",
+ .description = "schedule called with preemption disabled.",
+ .enable = enable_scpd,
+ .disable = disable_scpd,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_scpd(void)
+{
+ return rv_register_monitor(&rv_this, &rv_sched);
+}
+
+static void __exit unregister_scpd(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_scpd);
+module_exit(unregister_scpd);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("scpd: schedule called with preemption disabled.");
diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h
new file mode 100644
index 000000000000..d6329da2671b
--- /dev/null
+++ b/kernel/trace/rv/monitors/scpd/scpd.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of scpd automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME scpd
+
+enum states_scpd {
+ cant_sched_scpd,
+ can_sched_scpd,
+ state_max_scpd,
+};
+
+#define INVALID_STATE state_max_scpd
+
+enum events_scpd {
+ preempt_disable_scpd,
+ preempt_enable_scpd,
+ schedule_entry_scpd,
+ schedule_exit_scpd,
+ event_max_scpd,
+};
+
+struct automaton_scpd {
+ char *state_names[state_max_scpd];
+ char *event_names[event_max_scpd];
+ unsigned char function[state_max_scpd][event_max_scpd];
+ unsigned char initial_state;
+ bool final_states[state_max_scpd];
+};
+
+static const struct automaton_scpd automaton_scpd = {
+ .state_names = {
+ "cant_sched",
+ "can_sched",
+ },
+ .event_names = {
+ "preempt_disable",
+ "preempt_enable",
+ "schedule_entry",
+ "schedule_exit",
+ },
+ .function = {
+ { can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE },
+ { INVALID_STATE, cant_sched_scpd, can_sched_scpd, can_sched_scpd },
+ },
+ .initial_state = cant_sched_scpd,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/scpd/scpd_trace.h b/kernel/trace/rv/monitors/scpd/scpd_trace.h
new file mode 100644
index 000000000000..6b0f4aa4732e
--- /dev/null
+++ b/kernel/trace/rv/monitors/scpd/scpd_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SCPD
+DEFINE_EVENT(event_da_monitor, event_scpd,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_scpd,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_SCPD */
diff --git a/kernel/trace/rv/monitors/sleep/Kconfig b/kernel/trace/rv/monitors/sleep/Kconfig
new file mode 100644
index 000000000000..6b7a122e7b47
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/Kconfig
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SLEEP
+ depends on RV
+ select RV_LTL_MONITOR
+ depends on HAVE_SYSCALL_TRACEPOINTS
+ depends on RV_MON_RTAPP
+ select TRACE_IRQFLAGS
+ default y
+ select LTL_MON_EVENTS_ID
+ bool "sleep monitor"
+ help
+ Monitor that real-time tasks do not sleep in a manner that may
+ cause undesirable latency.
+
+ If you are developing a real-time system and not entirely sure whether
+ the applications are designed correctly for real-time, you want to say
+ Y here.
+
+ Enabling this monitor may have performance impact (due to select
+ TRACE_IRQFLAGS). Therefore, you probably should say N for
+ production kernel.
diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
new file mode 100644
index 000000000000..8dfe5ec13e19
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rv.h>
+#include <linux/sched/deadline.h>
+#include <linux/sched/rt.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "sleep"
+
+#include <trace/events/syscalls.h>
+#include <trace/events/sched.h>
+#include <trace/events/lock.h>
+#include <uapi/linux/futex.h>
+#include <rv_trace.h>
+#include <monitors/rtapp/rtapp.h>
+
+#include "sleep.h"
+#include <rv/ltl_monitor.h>
+
+static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon)
+{
+ /*
+ * This includes "actual" real-time tasks and also PI-boosted
+ * tasks. A task being PI-boosted means it is blocking an "actual"
+ * real-task, therefore it should also obey the monitor's rule,
+ * otherwise the "actual" real-task may be delayed.
+ */
+ ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task));
+}
+
+static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation)
+{
+ ltl_atom_set(mon, LTL_SLEEP, false);
+ ltl_atom_set(mon, LTL_WAKE, false);
+ ltl_atom_set(mon, LTL_ABORT_SLEEP, false);
+ ltl_atom_set(mon, LTL_WOKEN_BY_HARDIRQ, false);
+ ltl_atom_set(mon, LTL_WOKEN_BY_NMI, false);
+ ltl_atom_set(mon, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, false);
+
+ if (task_creation) {
+ ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
+ ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
+ ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
+ ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
+ ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
+ ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false);
+ }
+
+ if (task->flags & PF_KTHREAD) {
+ ltl_atom_set(mon, LTL_KERNEL_THREAD, true);
+
+ /* kernel tasks do not do syscall */
+ ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
+ ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
+ ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
+ ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
+
+ if (strstarts(task->comm, "migration/"))
+ ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true);
+ else
+ ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false);
+
+ if (strstarts(task->comm, "rcu"))
+ ltl_atom_set(mon, LTL_TASK_IS_RCU, true);
+ else
+ ltl_atom_set(mon, LTL_TASK_IS_RCU, false);
+ } else {
+ ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false);
+ ltl_atom_set(mon, LTL_KERNEL_THREAD, false);
+ ltl_atom_set(mon, LTL_TASK_IS_RCU, false);
+ ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false);
+ }
+
+}
+
+static void handle_sched_set_state(void *data, struct task_struct *task, int state)
+{
+ if (state & TASK_INTERRUPTIBLE)
+ ltl_atom_pulse(task, LTL_SLEEP, true);
+ else if (state == TASK_RUNNING)
+ ltl_atom_pulse(task, LTL_ABORT_SLEEP, true);
+}
+
+static void handle_sched_wakeup(void *data, struct task_struct *task)
+{
+ ltl_atom_pulse(task, LTL_WAKE, true);
+}
+
+static void handle_sched_waking(void *data, struct task_struct *task)
+{
+ if (this_cpu_read(hardirq_context)) {
+ ltl_atom_pulse(task, LTL_WOKEN_BY_HARDIRQ, true);
+ } else if (in_task()) {
+ if (current->prio <= task->prio)
+ ltl_atom_pulse(task, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, true);
+ } else if (in_nmi()) {
+ ltl_atom_pulse(task, LTL_WOKEN_BY_NMI, true);
+ }
+}
+
+static void handle_contention_begin(void *data, void *lock, unsigned int flags)
+{
+ if (flags & LCB_F_RT)
+ ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, true);
+}
+
+static void handle_contention_end(void *data, void *lock, int ret)
+{
+ ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, false);
+}
+
+static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
+{
+ struct ltl_monitor *mon;
+ unsigned long args[6];
+ int op, cmd;
+
+ mon = ltl_get_monitor(current);
+
+ switch (id) {
+#ifdef __NR_clock_nanosleep
+ case __NR_clock_nanosleep:
+#endif
+#ifdef __NR_clock_nanosleep_time64
+ case __NR_clock_nanosleep_time64:
+#endif
+ syscall_get_arguments(current, regs, args);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, args[0] == CLOCK_MONOTONIC);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, args[0] == CLOCK_TAI);
+ ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, args[1] == TIMER_ABSTIME);
+ ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true);
+ break;
+
+#ifdef __NR_futex
+ case __NR_futex:
+#endif
+#ifdef __NR_futex_time64
+ case __NR_futex_time64:
+#endif
+ syscall_get_arguments(current, regs, args);
+ op = args[1];
+ cmd = op & FUTEX_CMD_MASK;
+
+ switch (cmd) {
+ case FUTEX_LOCK_PI:
+ case FUTEX_LOCK_PI2:
+ ltl_atom_update(current, LTL_FUTEX_LOCK_PI, true);
+ break;
+ case FUTEX_WAIT:
+ case FUTEX_WAIT_BITSET:
+ case FUTEX_WAIT_REQUEUE_PI:
+ ltl_atom_update(current, LTL_FUTEX_WAIT, true);
+ break;
+ }
+ break;
+#ifdef __NR_epoll_wait
+ case __NR_epoll_wait:
+ ltl_atom_update(current, LTL_EPOLL_WAIT, true);
+ break;
+#endif
+ }
+}
+
+static void handle_sys_exit(void *data, struct pt_regs *regs, long ret)
+{
+ struct ltl_monitor *mon = ltl_get_monitor(current);
+
+ ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
+ ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
+ ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
+ ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
+}
+
+static void handle_kthread_stop(void *data, struct task_struct *task)
+{
+ /* FIXME: this could race with other tracepoint handlers */
+ ltl_atom_update(task, LTL_KTHREAD_SHOULD_STOP, true);
+}
+
+static int enable_sleep(void)
+{
+ int retval;
+
+ retval = ltl_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking);
+ rv_attach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup);
+ rv_attach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state);
+ rv_attach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin);
+ rv_attach_trace_probe("rtapp_sleep", contention_end, handle_contention_end);
+ rv_attach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop);
+ rv_attach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter);
+ rv_attach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit);
+ return 0;
+}
+
+static void disable_sleep(void)
+{
+ rv_detach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking);
+ rv_detach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup);
+ rv_detach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state);
+ rv_detach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin);
+ rv_detach_trace_probe("rtapp_sleep", contention_end, handle_contention_end);
+ rv_detach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop);
+ rv_detach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter);
+ rv_detach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit);
+
+ ltl_monitor_destroy();
+}
+
+static struct rv_monitor rv_sleep = {
+ .name = "sleep",
+ .description = "Monitor that RT tasks do not undesirably sleep",
+ .enable = enable_sleep,
+ .disable = disable_sleep,
+};
+
+static int __init register_sleep(void)
+{
+ return rv_register_monitor(&rv_sleep, &rv_rtapp);
+}
+
+static void __exit unregister_sleep(void)
+{
+ rv_unregister_monitor(&rv_sleep);
+}
+
+module_init(register_sleep);
+module_exit(unregister_sleep);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
+MODULE_DESCRIPTION("sleep: Monitor that RT tasks do not undesirably sleep");
diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h
new file mode 100644
index 000000000000..95dc2727c059
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/sleep.h
@@ -0,0 +1,263 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * C implementation of Buchi automaton, automatically generated by
+ * tools/verification/rvgen from the linear temporal logic specification.
+ * For further information, see kernel documentation:
+ * Documentation/trace/rv/linear_temporal_logic.rst
+ */
+
+#include <linux/rv.h>
+
+#define MONITOR_NAME sleep
+
+enum ltl_atom {
+ LTL_ABORT_SLEEP,
+ LTL_BLOCK_ON_RT_MUTEX,
+ LTL_CLOCK_NANOSLEEP,
+ LTL_EPOLL_WAIT,
+ LTL_FUTEX_LOCK_PI,
+ LTL_FUTEX_WAIT,
+ LTL_KERNEL_THREAD,
+ LTL_KTHREAD_SHOULD_STOP,
+ LTL_NANOSLEEP_CLOCK_MONOTONIC,
+ LTL_NANOSLEEP_CLOCK_TAI,
+ LTL_NANOSLEEP_TIMER_ABSTIME,
+ LTL_RT,
+ LTL_SLEEP,
+ LTL_TASK_IS_MIGRATION,
+ LTL_TASK_IS_RCU,
+ LTL_WAKE,
+ LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
+ LTL_WOKEN_BY_HARDIRQ,
+ LTL_WOKEN_BY_NMI,
+ LTL_NUM_ATOM
+};
+static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM);
+
+static const char *ltl_atom_str(enum ltl_atom atom)
+{
+ static const char *const names[] = {
+ "ab_sl",
+ "bl_on_rt_mu",
+ "cl_na",
+ "ep_wa",
+ "fu_lo_pi",
+ "fu_wa",
+ "ker_th",
+ "kth_sh_st",
+ "na_cl_mo",
+ "na_cl_ta",
+ "na_ti_ab",
+ "rt",
+ "sl",
+ "ta_mi",
+ "ta_rc",
+ "wak",
+ "wo_eq_hi_pr",
+ "wo_ha",
+ "wo_nm",
+ };
+
+ return names[atom];
+}
+
+enum ltl_buchi_state {
+ S0,
+ S1,
+ S2,
+ S3,
+ S4,
+ S5,
+ S6,
+ S7,
+ RV_NUM_BA_STATES
+};
+static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
+
+static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
+{
+ bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
+ bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
+ bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
+ mon->atoms);
+ bool wake = test_bit(LTL_WAKE, mon->atoms);
+ bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+ bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+ bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
+ bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
+ bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
+ bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+ bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+ bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
+ bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+ bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
+ bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
+ bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+ bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
+ bool val42 = task_is_rcu || task_is_migration;
+ bool val43 = futex_lock_pi || val42;
+ bool val5 = block_on_rt_mutex || val43;
+ bool val34 = abort_sleep || kthread_should_stop;
+ bool val35 = woken_by_nmi || val34;
+ bool val36 = woken_by_hardirq || val35;
+ bool val14 = woken_by_equal_or_higher_prio || val36;
+ bool val13 = !wake;
+ bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
+ bool val27 = nanosleep_timer_abstime && val26;
+ bool val18 = clock_nanosleep && val27;
+ bool val20 = val18 || epoll_wait;
+ bool val9 = futex_wait || val20;
+ bool val11 = val9 || kernel_thread;
+ bool val2 = !sleep;
+ bool val1 = !rt;
+ bool val3 = val1 || val2;
+
+ if (val3)
+ __set_bit(S0, mon->states);
+ if (val11 && val13)
+ __set_bit(S1, mon->states);
+ if (val11 && val14)
+ __set_bit(S4, mon->states);
+ if (val5)
+ __set_bit(S5, mon->states);
+}
+
+static void
+ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next)
+{
+ bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
+ bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
+ bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
+ mon->atoms);
+ bool wake = test_bit(LTL_WAKE, mon->atoms);
+ bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+ bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+ bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
+ bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
+ bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
+ bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+ bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+ bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
+ bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+ bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
+ bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
+ bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+ bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
+ bool val42 = task_is_rcu || task_is_migration;
+ bool val43 = futex_lock_pi || val42;
+ bool val5 = block_on_rt_mutex || val43;
+ bool val34 = abort_sleep || kthread_should_stop;
+ bool val35 = woken_by_nmi || val34;
+ bool val36 = woken_by_hardirq || val35;
+ bool val14 = woken_by_equal_or_higher_prio || val36;
+ bool val13 = !wake;
+ bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
+ bool val27 = nanosleep_timer_abstime && val26;
+ bool val18 = clock_nanosleep && val27;
+ bool val20 = val18 || epoll_wait;
+ bool val9 = futex_wait || val20;
+ bool val11 = val9 || kernel_thread;
+ bool val2 = !sleep;
+ bool val1 = !rt;
+ bool val3 = val1 || val2;
+
+ switch (state) {
+ case S0:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ case S1:
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val13 && val3)
+ __set_bit(S2, next);
+ if (val14 && val3)
+ __set_bit(S3, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val13 && val5)
+ __set_bit(S6, next);
+ if (val14 && val5)
+ __set_bit(S7, next);
+ break;
+ case S2:
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val13 && val3)
+ __set_bit(S2, next);
+ if (val14 && val3)
+ __set_bit(S3, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val13 && val5)
+ __set_bit(S6, next);
+ if (val14 && val5)
+ __set_bit(S7, next);
+ break;
+ case S3:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ case S4:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ case S5:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ case S6:
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val13 && val3)
+ __set_bit(S2, next);
+ if (val14 && val3)
+ __set_bit(S3, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val13 && val5)
+ __set_bit(S6, next);
+ if (val14 && val5)
+ __set_bit(S7, next);
+ break;
+ case S7:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ }
+}
diff --git a/kernel/trace/rv/monitors/sleep/sleep_trace.h b/kernel/trace/rv/monitors/sleep/sleep_trace.h
new file mode 100644
index 000000000000..22eaf31da987
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/sleep_trace.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SLEEP
+DEFINE_EVENT(event_ltl_monitor_id, event_sleep,
+ TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
+ TP_ARGS(task, states, atoms, next));
+DEFINE_EVENT(error_ltl_monitor_id, error_sleep,
+ TP_PROTO(struct task_struct *task),
+ TP_ARGS(task));
+#endif /* CONFIG_RV_MON_SLEEP */
diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig
new file mode 100644
index 000000000000..7dd54f434ff7
--- /dev/null
+++ b/kernel/trace/rv/monitors/snep/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SNEP
+ depends on RV
+ depends on TRACE_PREEMPT_TOGGLE
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "snep monitor"
+ help
+ Monitor to ensure schedule does not enable preempt.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c
new file mode 100644
index 000000000000..b80b73795dec
--- /dev/null
+++ b/kernel/trace/rv/monitors/snep/snep.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "snep"
+
+#include <trace/events/sched.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#define RV_MON_TYPE RV_MON_PER_CPU
+#include "snep.h"
+#include <rv/da_monitor.h>
+
+static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event(preempt_disable_snep);
+}
+
+static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event(preempt_enable_snep);
+}
+
+static void handle_schedule_entry(void *data, bool preempt)
+{
+ da_handle_event(schedule_entry_snep);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch)
+{
+ da_handle_start_event(schedule_exit_snep);
+}
+
+static int enable_snep(void)
+{
+ int retval;
+
+ retval = da_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("snep", preempt_disable, handle_preempt_disable);
+ rv_attach_trace_probe("snep", preempt_enable, handle_preempt_enable);
+ rv_attach_trace_probe("snep", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("snep", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_snep(void)
+{
+ rv_this.enabled = 0;
+
+ rv_detach_trace_probe("snep", preempt_disable, handle_preempt_disable);
+ rv_detach_trace_probe("snep", preempt_enable, handle_preempt_enable);
+ rv_detach_trace_probe("snep", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("snep", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy();
+}
+
+static struct rv_monitor rv_this = {
+ .name = "snep",
+ .description = "schedule does not enable preempt.",
+ .enable = enable_snep,
+ .disable = disable_snep,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_snep(void)
+{
+ return rv_register_monitor(&rv_this, &rv_sched);
+}
+
+static void __exit unregister_snep(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_snep);
+module_exit(unregister_snep);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("snep: schedule does not enable preempt.");
diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h
new file mode 100644
index 000000000000..357520a5b3d1
--- /dev/null
+++ b/kernel/trace/rv/monitors/snep/snep.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of snep automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME snep
+
+enum states_snep {
+ non_scheduling_context_snep,
+ scheduling_contex_snep,
+ state_max_snep,
+};
+
+#define INVALID_STATE state_max_snep
+
+enum events_snep {
+ preempt_disable_snep,
+ preempt_enable_snep,
+ schedule_entry_snep,
+ schedule_exit_snep,
+ event_max_snep,
+};
+
+struct automaton_snep {
+ char *state_names[state_max_snep];
+ char *event_names[event_max_snep];
+ unsigned char function[state_max_snep][event_max_snep];
+ unsigned char initial_state;
+ bool final_states[state_max_snep];
+};
+
+static const struct automaton_snep automaton_snep = {
+ .state_names = {
+ "non_scheduling_context",
+ "scheduling_contex",
+ },
+ .event_names = {
+ "preempt_disable",
+ "preempt_enable",
+ "schedule_entry",
+ "schedule_exit",
+ },
+ .function = {
+ {
+ non_scheduling_context_snep,
+ non_scheduling_context_snep,
+ scheduling_contex_snep,
+ INVALID_STATE,
+ },
+ {
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ non_scheduling_context_snep,
+ },
+ },
+ .initial_state = non_scheduling_context_snep,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/snep/snep_trace.h b/kernel/trace/rv/monitors/snep/snep_trace.h
new file mode 100644
index 000000000000..01aad49a949a
--- /dev/null
+++ b/kernel/trace/rv/monitors/snep/snep_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SNEP
+DEFINE_EVENT(event_da_monitor, event_snep,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_snep,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_SNEP */
diff --git a/kernel/trace/rv/monitors/snroc/Kconfig b/kernel/trace/rv/monitors/snroc/Kconfig
new file mode 100644
index 000000000000..6e4365a2fea3
--- /dev/null
+++ b/kernel/trace/rv/monitors/snroc/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SNROC
+ depends on RV
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_ID
+ bool "snroc monitor"
+ help
+ Monitor to ensure sched_set_state happens only in the respective task's context.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c
new file mode 100644
index 000000000000..f168b1a4b12c
--- /dev/null
+++ b/kernel/trace/rv/monitors/snroc/snroc.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "snroc"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#define RV_MON_TYPE RV_MON_PER_TASK
+#include "snroc.h"
+#include <rv/da_monitor.h>
+
+static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
+{
+ da_handle_event(tsk, sched_set_state_snroc);
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ da_handle_start_event(prev, sched_switch_out_snroc);
+ da_handle_event(next, sched_switch_in_snroc);
+}
+
+static int enable_snroc(void)
+{
+ int retval;
+
+ retval = da_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state);
+ rv_attach_trace_probe("snroc", sched_switch, handle_sched_switch);
+
+ return 0;
+}
+
+static void disable_snroc(void)
+{
+ rv_this.enabled = 0;
+
+ rv_detach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state);
+ rv_detach_trace_probe("snroc", sched_switch, handle_sched_switch);
+
+ da_monitor_destroy();
+}
+
+static struct rv_monitor rv_this = {
+ .name = "snroc",
+ .description = "set non runnable on its own context.",
+ .enable = enable_snroc,
+ .disable = disable_snroc,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_snroc(void)
+{
+ return rv_register_monitor(&rv_this, &rv_sched);
+}
+
+static void __exit unregister_snroc(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_snroc);
+module_exit(unregister_snroc);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("snroc: set non runnable on its own context.");
diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h
new file mode 100644
index 000000000000..88b7328ad31a
--- /dev/null
+++ b/kernel/trace/rv/monitors/snroc/snroc.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of snroc automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME snroc
+
+enum states_snroc {
+ other_context_snroc,
+ own_context_snroc,
+ state_max_snroc,
+};
+
+#define INVALID_STATE state_max_snroc
+
+enum events_snroc {
+ sched_set_state_snroc,
+ sched_switch_in_snroc,
+ sched_switch_out_snroc,
+ event_max_snroc,
+};
+
+struct automaton_snroc {
+ char *state_names[state_max_snroc];
+ char *event_names[event_max_snroc];
+ unsigned char function[state_max_snroc][event_max_snroc];
+ unsigned char initial_state;
+ bool final_states[state_max_snroc];
+};
+
+static const struct automaton_snroc automaton_snroc = {
+ .state_names = {
+ "other_context",
+ "own_context",
+ },
+ .event_names = {
+ "sched_set_state",
+ "sched_switch_in",
+ "sched_switch_out",
+ },
+ .function = {
+ { INVALID_STATE, own_context_snroc, INVALID_STATE },
+ { own_context_snroc, INVALID_STATE, other_context_snroc },
+ },
+ .initial_state = other_context_snroc,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/snroc/snroc_trace.h b/kernel/trace/rv/monitors/snroc/snroc_trace.h
new file mode 100644
index 000000000000..50114cef5122
--- /dev/null
+++ b/kernel/trace/rv/monitors/snroc/snroc_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SNROC
+DEFINE_EVENT(event_da_monitor_id, event_snroc,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_snroc,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_SNROC */
diff --git a/kernel/trace/rv/monitors/sssw/Kconfig b/kernel/trace/rv/monitors/sssw/Kconfig
new file mode 100644
index 000000000000..23b7eeb38bbf
--- /dev/null
+++ b/kernel/trace/rv/monitors/sssw/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SSSW
+ depends on RV
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_ID
+ bool "sssw monitor"
+ help
+ Monitor to ensure sched_set_state to sleepable leads to sleeping and
+ sleeping tasks require wakeup.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/sssw/sssw.c b/kernel/trace/rv/monitors/sssw/sssw.c
new file mode 100644
index 000000000000..a91321c890cd
--- /dev/null
+++ b/kernel/trace/rv/monitors/sssw/sssw.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "sssw"
+
+#include <trace/events/sched.h>
+#include <trace/events/signal.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#define RV_MON_TYPE RV_MON_PER_TASK
+#include "sssw.h"
+#include <rv/da_monitor.h>
+
+static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
+{
+ if (state == TASK_RUNNING)
+ da_handle_start_event(tsk, sched_set_state_runnable_sssw);
+ else
+ da_handle_event(tsk, sched_set_state_sleepable_sssw);
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ if (preempt)
+ da_handle_event(prev, sched_switch_preempt_sssw);
+ else if (prev_state == TASK_RUNNING)
+ da_handle_event(prev, sched_switch_yield_sssw);
+ else if (prev_state == TASK_RTLOCK_WAIT)
+ /* special case of sleeping task with racy conditions */
+ da_handle_event(prev, sched_switch_blocking_sssw);
+ else
+ da_handle_event(prev, sched_switch_suspend_sssw);
+ da_handle_event(next, sched_switch_in_sssw);
+}
+
+static void handle_sched_wakeup(void *data, struct task_struct *p)
+{
+ /*
+ * Wakeup can also lead to signal_wakeup although the system is
+ * actually runnable. The monitor can safely start with this event.
+ */
+ da_handle_start_event(p, sched_wakeup_sssw);
+}
+
+static void handle_signal_deliver(void *data, int sig,
+ struct kernel_siginfo *info,
+ struct k_sigaction *ka)
+{
+ da_handle_event(current, signal_deliver_sssw);
+}
+
+static int enable_sssw(void)
+{
+ int retval;
+
+ retval = da_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state);
+ rv_attach_trace_probe("sssw", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup);
+ rv_attach_trace_probe("sssw", signal_deliver, handle_signal_deliver);
+
+ return 0;
+}
+
+static void disable_sssw(void)
+{
+ rv_this.enabled = 0;
+
+ rv_detach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state);
+ rv_detach_trace_probe("sssw", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup);
+ rv_detach_trace_probe("sssw", signal_deliver, handle_signal_deliver);
+
+ da_monitor_destroy();
+}
+
+static struct rv_monitor rv_this = {
+ .name = "sssw",
+ .description = "set state sleep and wakeup.",
+ .enable = enable_sssw,
+ .disable = disable_sssw,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_sssw(void)
+{
+ return rv_register_monitor(&rv_this, &rv_sched);
+}
+
+static void __exit unregister_sssw(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_sssw);
+module_exit(unregister_sssw);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sssw: set state sleep and wakeup.");
diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h
new file mode 100644
index 000000000000..1a4b806061c3
--- /dev/null
+++ b/kernel/trace/rv/monitors/sssw/sssw.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of sssw automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME sssw
+
+enum states_sssw {
+ runnable_sssw,
+ signal_wakeup_sssw,
+ sleepable_sssw,
+ sleeping_sssw,
+ state_max_sssw,
+};
+
+#define INVALID_STATE state_max_sssw
+
+enum events_sssw {
+ sched_set_state_runnable_sssw,
+ sched_set_state_sleepable_sssw,
+ sched_switch_blocking_sssw,
+ sched_switch_in_sssw,
+ sched_switch_preempt_sssw,
+ sched_switch_suspend_sssw,
+ sched_switch_yield_sssw,
+ sched_wakeup_sssw,
+ signal_deliver_sssw,
+ event_max_sssw,
+};
+
+struct automaton_sssw {
+ char *state_names[state_max_sssw];
+ char *event_names[event_max_sssw];
+ unsigned char function[state_max_sssw][event_max_sssw];
+ unsigned char initial_state;
+ bool final_states[state_max_sssw];
+};
+
+static const struct automaton_sssw automaton_sssw = {
+ .state_names = {
+ "runnable",
+ "signal_wakeup",
+ "sleepable",
+ "sleeping",
+ },
+ .event_names = {
+ "sched_set_state_runnable",
+ "sched_set_state_sleepable",
+ "sched_switch_blocking",
+ "sched_switch_in",
+ "sched_switch_preempt",
+ "sched_switch_suspend",
+ "sched_switch_yield",
+ "sched_wakeup",
+ "signal_deliver",
+ },
+ .function = {
+ {
+ runnable_sssw,
+ sleepable_sssw,
+ sleeping_sssw,
+ runnable_sssw,
+ runnable_sssw,
+ INVALID_STATE,
+ runnable_sssw,
+ runnable_sssw,
+ runnable_sssw,
+ },
+ {
+ INVALID_STATE,
+ sleepable_sssw,
+ INVALID_STATE,
+ signal_wakeup_sssw,
+ signal_wakeup_sssw,
+ INVALID_STATE,
+ signal_wakeup_sssw,
+ signal_wakeup_sssw,
+ runnable_sssw,
+ },
+ {
+ runnable_sssw,
+ sleepable_sssw,
+ sleeping_sssw,
+ sleepable_sssw,
+ sleepable_sssw,
+ sleeping_sssw,
+ signal_wakeup_sssw,
+ runnable_sssw,
+ sleepable_sssw,
+ },
+ {
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ runnable_sssw,
+ INVALID_STATE,
+ },
+ },
+ .initial_state = runnable_sssw,
+ .final_states = { 1, 0, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/sssw/sssw_trace.h b/kernel/trace/rv/monitors/sssw/sssw_trace.h
new file mode 100644
index 000000000000..6c03cfc6960b
--- /dev/null
+++ b/kernel/trace/rv/monitors/sssw/sssw_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SSSW
+DEFINE_EVENT(event_da_monitor_id, event_sssw,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_sssw,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_SSSW */
diff --git a/kernel/trace/rv/monitors/stall/Kconfig b/kernel/trace/rv/monitors/stall/Kconfig
new file mode 100644
index 000000000000..6f846b642544
--- /dev/null
+++ b/kernel/trace/rv/monitors/stall/Kconfig
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_STALL
+ depends on RV
+ select HA_MON_EVENTS_ID
+ bool "stall monitor"
+ help
+ Enable the stall sample monitor that illustrates the usage of hybrid
+ automata monitors. It can be used to identify tasks stalled for
+ longer than a threshold.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_stall.rst
diff --git a/kernel/trace/rv/monitors/stall/stall.c b/kernel/trace/rv/monitors/stall/stall.c
new file mode 100644
index 000000000000..3c38fb1a0159
--- /dev/null
+++ b/kernel/trace/rv/monitors/stall/stall.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "stall"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+
+#define RV_MON_TYPE RV_MON_PER_TASK
+#define HA_TIMER_TYPE HA_TIMER_WHEEL
+#include "stall.h"
+#include <rv/ha_monitor.h>
+
+static u64 threshold_jiffies = 1000;
+module_param(threshold_jiffies, ullong, 0644);
+
+static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_stall env, u64 time_ns)
+{
+ if (env == clk_stall)
+ return ha_get_clk_jiffy(ha_mon, env);
+ return ENV_INVALID_VALUE;
+}
+
+static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_stall env, u64 time_ns)
+{
+ if (env == clk_stall)
+ ha_reset_clk_jiffy(ha_mon, env);
+}
+
+static inline bool ha_verify_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (curr_state == enqueued_stall)
+ return ha_check_invariant_jiffy(ha_mon, clk_stall, time_ns);
+ return true;
+}
+
+static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ bool res = true;
+
+ if (curr_state == dequeued_stall && event == sched_wakeup_stall)
+ ha_reset_env(ha_mon, clk_stall, time_ns);
+ else if (curr_state == running_stall && event == sched_switch_preempt_stall)
+ ha_reset_env(ha_mon, clk_stall, time_ns);
+ return res;
+}
+
+static inline void ha_setup_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (next_state == curr_state)
+ return;
+ if (next_state == enqueued_stall)
+ ha_start_timer_jiffy(ha_mon, clk_stall, threshold_jiffies, time_ns);
+ else if (curr_state == enqueued_stall)
+ ha_cancel_timer(ha_mon);
+}
+
+static bool ha_verify_constraint(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns);
+
+ return true;
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ if (!preempt && prev_state != TASK_RUNNING)
+ da_handle_start_event(prev, sched_switch_wait_stall);
+ else
+ da_handle_event(prev, sched_switch_preempt_stall);
+ da_handle_event(next, sched_switch_in_stall);
+}
+
+static void handle_sched_wakeup(void *data, struct task_struct *p)
+{
+ da_handle_event(p, sched_wakeup_stall);
+}
+
+static int enable_stall(void)
+{
+ int retval;
+
+ retval = ha_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("stall", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("stall", sched_wakeup, handle_sched_wakeup);
+
+ return 0;
+}
+
+static void disable_stall(void)
+{
+ rv_this.enabled = 0;
+
+ rv_detach_trace_probe("stall", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("stall", sched_wakeup, handle_sched_wakeup);
+
+ ha_monitor_destroy();
+}
+
+static struct rv_monitor rv_this = {
+ .name = "stall",
+ .description = "identify tasks stalled for longer than a threshold.",
+ .enable = enable_stall,
+ .disable = disable_stall,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_stall(void)
+{
+ return rv_register_monitor(&rv_this, NULL);
+}
+
+static void __exit unregister_stall(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_stall);
+module_exit(unregister_stall);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("stall: identify tasks stalled for longer than a threshold.");
diff --git a/kernel/trace/rv/monitors/stall/stall.h b/kernel/trace/rv/monitors/stall/stall.h
new file mode 100644
index 000000000000..638520cb1082
--- /dev/null
+++ b/kernel/trace/rv/monitors/stall/stall.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of stall automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME stall
+
+enum states_stall {
+ dequeued_stall,
+ enqueued_stall,
+ running_stall,
+ state_max_stall,
+};
+
+#define INVALID_STATE state_max_stall
+
+enum events_stall {
+ sched_switch_in_stall,
+ sched_switch_preempt_stall,
+ sched_switch_wait_stall,
+ sched_wakeup_stall,
+ event_max_stall,
+};
+
+enum envs_stall {
+ clk_stall,
+ env_max_stall,
+ env_max_stored_stall = env_max_stall,
+};
+
+_Static_assert(env_max_stored_stall <= MAX_HA_ENV_LEN, "Not enough slots");
+
+struct automaton_stall {
+ char *state_names[state_max_stall];
+ char *event_names[event_max_stall];
+ char *env_names[env_max_stall];
+ unsigned char function[state_max_stall][event_max_stall];
+ unsigned char initial_state;
+ bool final_states[state_max_stall];
+};
+
+static const struct automaton_stall automaton_stall = {
+ .state_names = {
+ "dequeued",
+ "enqueued",
+ "running",
+ },
+ .event_names = {
+ "sched_switch_in",
+ "sched_switch_preempt",
+ "sched_switch_wait",
+ "sched_wakeup",
+ },
+ .env_names = {
+ "clk",
+ },
+ .function = {
+ {
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ enqueued_stall,
+ },
+ {
+ running_stall,
+ INVALID_STATE,
+ INVALID_STATE,
+ enqueued_stall,
+ },
+ {
+ running_stall,
+ enqueued_stall,
+ dequeued_stall,
+ running_stall,
+ },
+ },
+ .initial_state = dequeued_stall,
+ .final_states = { 1, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/stall/stall_trace.h b/kernel/trace/rv/monitors/stall/stall_trace.h
new file mode 100644
index 000000000000..6a7cc1b1d040
--- /dev/null
+++ b/kernel/trace/rv/monitors/stall/stall_trace.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_STALL
+DEFINE_EVENT(event_da_monitor_id, event_stall,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_stall,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+
+DEFINE_EVENT(error_env_da_monitor_id, error_env_stall,
+ TP_PROTO(int id, char *state, char *event, char *env),
+ TP_ARGS(id, state, event, env));
+#endif /* CONFIG_RV_MON_STALL */
diff --git a/kernel/trace/rv/monitors/sts/Kconfig b/kernel/trace/rv/monitors/sts/Kconfig
new file mode 100644
index 000000000000..7d1ff0f6fc91
--- /dev/null
+++ b/kernel/trace/rv/monitors/sts/Kconfig
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_STS
+ depends on RV
+ depends on TRACE_IRQFLAGS
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "sts monitor"
+ help
+ Monitor to ensure relationships between scheduler and task switches
+ * the scheduler is called and returns with interrupts disabled
+ * each call to the scheduler has up to one switch
+ * switches only happen inside the scheduler
+ * each call to the scheduler disables interrupts to switch
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/sts/sts.c b/kernel/trace/rv/monitors/sts/sts.c
new file mode 100644
index 000000000000..ce031cbf202a
--- /dev/null
+++ b/kernel/trace/rv/monitors/sts/sts.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "sts"
+
+#include <trace/events/sched.h>
+#include <trace/events/irq.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#define RV_MON_TYPE RV_MON_PER_CPU
+#include "sts.h"
+#include <rv/da_monitor.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/trace/irq_vectors.h>
+
+static void handle_vector_irq_entry(void *data, int vector)
+{
+ da_handle_event(irq_entry_sts);
+}
+
+static void attach_vector_irq(void)
+{
+ rv_attach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_attach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_attach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("sts", call_function_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+static void detach_vector_irq(void)
+{
+ rv_detach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_detach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_detach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("sts", call_function_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+#else
+/* We assume irq_entry tracepoints are sufficient on other architectures */
+static void attach_vector_irq(void) { }
+static void detach_vector_irq(void) { }
+#endif
+
+static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event(irq_disable_sts);
+}
+
+static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event(irq_enable_sts);
+}
+
+static void handle_irq_entry(void *data, int irq, struct irqaction *action)
+{
+ da_handle_event(irq_entry_sts);
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ da_handle_event(sched_switch_sts);
+}
+
+static void handle_schedule_entry(void *data, bool preempt)
+{
+ da_handle_event(schedule_entry_sts);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch)
+{
+ da_handle_start_event(schedule_exit_sts);
+}
+
+static int enable_sts(void)
+{
+ int retval;
+
+ retval = da_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("sts", irq_disable, handle_irq_disable);
+ rv_attach_trace_probe("sts", irq_enable, handle_irq_enable);
+ rv_attach_trace_probe("sts", irq_handler_entry, handle_irq_entry);
+ rv_attach_trace_probe("sts", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("sts", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("sts", sched_exit_tp, handle_schedule_exit);
+ attach_vector_irq();
+
+ return 0;
+}
+
+static void disable_sts(void)
+{
+ rv_this.enabled = 0;
+
+ rv_detach_trace_probe("sts", irq_disable, handle_irq_disable);
+ rv_detach_trace_probe("sts", irq_enable, handle_irq_enable);
+ rv_detach_trace_probe("sts", irq_handler_entry, handle_irq_entry);
+ rv_detach_trace_probe("sts", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("sts", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("sts", sched_exit_tp, handle_schedule_exit);
+ detach_vector_irq();
+
+ da_monitor_destroy();
+}
+
+/*
+ * This is the monitor register section.
+ */
+static struct rv_monitor rv_this = {
+ .name = "sts",
+ .description = "schedule implies task switch.",
+ .enable = enable_sts,
+ .disable = disable_sts,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_sts(void)
+{
+ return rv_register_monitor(&rv_this, &rv_sched);
+}
+
+static void __exit unregister_sts(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_sts);
+module_exit(unregister_sts);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sts: schedule implies task switch.");
diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h
new file mode 100644
index 000000000000..6f7b2d9d72e6
--- /dev/null
+++ b/kernel/trace/rv/monitors/sts/sts.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of sts automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME sts
+
+enum states_sts {
+ can_sched_sts,
+ cant_sched_sts,
+ disable_to_switch_sts,
+ enable_to_exit_sts,
+ in_irq_sts,
+ scheduling_sts,
+ switching_sts,
+ state_max_sts,
+};
+
+#define INVALID_STATE state_max_sts
+
+enum events_sts {
+ irq_disable_sts,
+ irq_enable_sts,
+ irq_entry_sts,
+ sched_switch_sts,
+ schedule_entry_sts,
+ schedule_exit_sts,
+ event_max_sts,
+};
+
+struct automaton_sts {
+ char *state_names[state_max_sts];
+ char *event_names[event_max_sts];
+ unsigned char function[state_max_sts][event_max_sts];
+ unsigned char initial_state;
+ bool final_states[state_max_sts];
+};
+
+static const struct automaton_sts automaton_sts = {
+ .state_names = {
+ "can_sched",
+ "cant_sched",
+ "disable_to_switch",
+ "enable_to_exit",
+ "in_irq",
+ "scheduling",
+ "switching",
+ },
+ .event_names = {
+ "irq_disable",
+ "irq_enable",
+ "irq_entry",
+ "sched_switch",
+ "schedule_entry",
+ "schedule_exit",
+ },
+ .function = {
+ {
+ cant_sched_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ scheduling_sts,
+ INVALID_STATE,
+ },
+ {
+ INVALID_STATE,
+ can_sched_sts,
+ cant_sched_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ },
+ {
+ INVALID_STATE,
+ enable_to_exit_sts,
+ in_irq_sts,
+ switching_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ },
+ {
+ enable_to_exit_sts,
+ enable_to_exit_sts,
+ enable_to_exit_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ can_sched_sts,
+ },
+ {
+ INVALID_STATE,
+ scheduling_sts,
+ in_irq_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ },
+ {
+ disable_to_switch_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ },
+ {
+ INVALID_STATE,
+ enable_to_exit_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ },
+ },
+ .initial_state = can_sched_sts,
+ .final_states = { 1, 0, 0, 0, 0, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/sts/sts_trace.h b/kernel/trace/rv/monitors/sts/sts_trace.h
new file mode 100644
index 000000000000..d78beb58d5b3
--- /dev/null
+++ b/kernel/trace/rv/monitors/sts/sts_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_STS
+DEFINE_EVENT(event_da_monitor, event_sts,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_sts,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_STS */
diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig
new file mode 100644
index 000000000000..87a26195792b
--- /dev/null
+++ b/kernel/trace/rv/monitors/wip/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_WIP
+ depends on RV
+ depends on TRACE_PREEMPT_TOGGLE
+ select DA_MON_EVENTS_IMPLICIT
+ bool "wip monitor"
+ help
+ Enable wip (wakeup in preemptive) sample monitor that illustrates
+ the usage of per-cpu monitors, and one limitation of the
+ preempt_disable/enable events.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wip.rst
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index b2b49a27e886..22d77ec42463 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -6,39 +6,37 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "wip"
-#include <trace/events/rv.h>
+#include <rv_trace.h>
#include <trace/events/sched.h>
#include <trace/events/preemptirq.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "wip.h"
-
-static struct rv_monitor rv_wip;
-DECLARE_DA_MON_PER_CPU(wip, unsigned char);
+#include <rv/da_monitor.h>
static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_wip(preempt_disable_wip);
+ da_handle_event(preempt_disable_wip);
}
static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_start_event_wip(preempt_enable_wip);
+ da_handle_start_event(preempt_enable_wip);
}
static void handle_sched_waking(void *data, struct task_struct *task)
{
- da_handle_event_wip(sched_waking_wip);
+ da_handle_event(sched_waking_wip);
}
static int enable_wip(void)
{
int retval;
- retval = da_monitor_init_wip();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -51,33 +49,32 @@ static int enable_wip(void)
static void disable_wip(void)
{
- rv_wip.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable);
rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable);
rv_detach_trace_probe("wip", sched_waking, handle_sched_waking);
- da_monitor_destroy_wip();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_wip = {
+static struct rv_monitor rv_this = {
.name = "wip",
.description = "wakeup in preemptive per-cpu testing monitor.",
.enable = enable_wip,
.disable = disable_wip,
- .reset = da_monitor_reset_all_wip,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_wip(void)
{
- rv_register_monitor(&rv_wip);
- return 0;
+ return rv_register_monitor(&rv_this, NULL);
}
static void __exit unregister_wip(void)
{
- rv_unregister_monitor(&rv_wip);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_wip);
diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h
index 2e373f2c65ed..b4c3eea94c86 100644
--- a/kernel/trace/rv/monitors/wip/wip.h
+++ b/kernel/trace/rv/monitors/wip/wip.h
@@ -1,22 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Automatically generated C representation of wip automaton
* For further information about this format, see kernel documentation:
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME wip
+
enum states_wip {
- preemptive_wip = 0,
+ preemptive_wip,
non_preemptive_wip,
- state_max_wip
+ state_max_wip,
};
#define INVALID_STATE state_max_wip
enum events_wip {
- preempt_disable_wip = 0,
+ preempt_disable_wip,
preempt_enable_wip,
sched_waking_wip,
- event_max_wip
+ event_max_wip,
};
struct automaton_wip {
@@ -30,12 +33,12 @@ struct automaton_wip {
static const struct automaton_wip automaton_wip = {
.state_names = {
"preemptive",
- "non_preemptive"
+ "non_preemptive",
},
.event_names = {
"preempt_disable",
"preempt_enable",
- "sched_waking"
+ "sched_waking",
},
.function = {
{ non_preemptive_wip, INVALID_STATE, INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/wip/wip_trace.h b/kernel/trace/rv/monitors/wip/wip_trace.h
new file mode 100644
index 000000000000..aa2162f47a4c
--- /dev/null
+++ b/kernel/trace/rv/monitors/wip/wip_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_WIP
+DEFINE_EVENT(event_da_monitor, event_wip,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_wip,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_WIP */
diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig
new file mode 100644
index 000000000000..d3bfc20037db
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/Kconfig
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_WWNR
+ depends on RV
+ select DA_MON_EVENTS_ID
+ bool "wwnr monitor"
+ help
+ Enable wwnr (wakeup while not running) sample monitor, this is a
+ sample monitor that illustrates the usage of per-task monitor.
+ The model is borken on purpose: it serves to test reactors.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wwnr.rst
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
index 0e43dd2db685..579e7e217ee0 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.c
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -6,40 +6,38 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "wwnr"
-#include <trace/events/rv.h>
+#include <rv_trace.h>
#include <trace/events/sched.h>
+#define RV_MON_TYPE RV_MON_PER_TASK
#include "wwnr.h"
-
-static struct rv_monitor rv_wwnr;
-DECLARE_DA_MON_PER_TASK(wwnr, unsigned char);
+#include <rv/da_monitor.h>
static void handle_switch(void *data, bool preempt, struct task_struct *p,
struct task_struct *n, unsigned int prev_state)
{
/* start monitoring only after the first suspension */
if (prev_state == TASK_INTERRUPTIBLE)
- da_handle_start_event_wwnr(p, switch_out_wwnr);
+ da_handle_start_event(p, switch_out_wwnr);
else
- da_handle_event_wwnr(p, switch_out_wwnr);
+ da_handle_event(p, switch_out_wwnr);
- da_handle_event_wwnr(n, switch_in_wwnr);
+ da_handle_event(n, switch_in_wwnr);
}
static void handle_wakeup(void *data, struct task_struct *p)
{
- da_handle_event_wwnr(p, wakeup_wwnr);
+ da_handle_event(p, wakeup_wwnr);
}
static int enable_wwnr(void)
{
int retval;
- retval = da_monitor_init_wwnr();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -51,32 +49,31 @@ static int enable_wwnr(void)
static void disable_wwnr(void)
{
- rv_wwnr.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("wwnr", sched_switch, handle_switch);
rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup);
- da_monitor_destroy_wwnr();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_wwnr = {
+static struct rv_monitor rv_this = {
.name = "wwnr",
.description = "wakeup while not running per-task testing model.",
.enable = enable_wwnr,
.disable = disable_wwnr,
- .reset = da_monitor_reset_all_wwnr,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_wwnr(void)
{
- rv_register_monitor(&rv_wwnr);
- return 0;
+ return rv_register_monitor(&rv_this, NULL);
}
static void __exit unregister_wwnr(void)
{
- rv_unregister_monitor(&rv_wwnr);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_wwnr);
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
index d0d9c4b8121b..a28006512c9b 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.h
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -1,22 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Automatically generated C representation of wwnr automaton
* For further information about this format, see kernel documentation:
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME wwnr
+
enum states_wwnr {
- not_running_wwnr = 0,
+ not_running_wwnr,
running_wwnr,
- state_max_wwnr
+ state_max_wwnr,
};
#define INVALID_STATE state_max_wwnr
enum events_wwnr {
- switch_in_wwnr = 0,
+ switch_in_wwnr,
switch_out_wwnr,
wakeup_wwnr,
- event_max_wwnr
+ event_max_wwnr,
};
struct automaton_wwnr {
@@ -30,12 +33,12 @@ struct automaton_wwnr {
static const struct automaton_wwnr automaton_wwnr = {
.state_names = {
"not_running",
- "running"
+ "running",
},
.event_names = {
"switch_in",
"switch_out",
- "wakeup"
+ "wakeup",
},
.function = {
{ running_wwnr, INVALID_STATE, not_running_wwnr },
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr_trace.h b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h
new file mode 100644
index 000000000000..fc97ec7476ad
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_WWNR
+/* id is the pid of the task */
+DEFINE_EVENT(event_da_monitor_id, event_wwnr,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_wwnr,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_WWNR */
diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c
index 0186ff4cbd0b..76537b8a4343 100644
--- a/kernel/trace/rv/reactor_panic.c
+++ b/kernel/trace/rv/reactor_panic.c
@@ -13,9 +13,9 @@
#include <linux/init.h>
#include <linux/rv.h>
-static void rv_panic_reaction(char *msg)
+__printf(1, 0) static void rv_panic_reaction(const char *msg, va_list args)
{
- panic(msg);
+ vpanic(msg, args);
}
static struct rv_reactor rv_panic = {
diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c
index 178759dbf89f..48c934e315b3 100644
--- a/kernel/trace/rv/reactor_printk.c
+++ b/kernel/trace/rv/reactor_printk.c
@@ -12,9 +12,9 @@
#include <linux/init.h>
#include <linux/rv.h>
-static void rv_printk_reaction(char *msg)
+__printf(1, 0) static void rv_printk_reaction(const char *msg, va_list args)
{
- printk_deferred(msg);
+ vprintk_deferred(msg, args);
}
static struct rv_reactor rv_printk = {
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 279c70e1bd74..ee4e68102f17 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -143,9 +143,9 @@
#include <linux/init.h>
#include <linux/slab.h>
-#ifdef CONFIG_DA_MON_EVENTS
+#ifdef CONFIG_RV_MON_EVENTS
#define CREATE_TRACE_POINTS
-#include <trace/events/rv.h>
+#include <rv_trace.h>
#endif
#include "rv.h"
@@ -162,10 +162,10 @@ struct dentry *get_monitors_root(void)
/*
* Interface for the monitor register.
*/
-static LIST_HEAD(rv_monitors_list);
+LIST_HEAD(rv_monitors_list);
static int task_monitor_count;
-static bool task_monitor_slots[RV_PER_TASK_MONITORS];
+static bool task_monitor_slots[CONFIG_RV_PER_TASK_MONITORS];
int rv_get_task_monitor_slot(void)
{
@@ -173,12 +173,12 @@ int rv_get_task_monitor_slot(void)
lockdep_assert_held(&rv_interface_lock);
- if (task_monitor_count == RV_PER_TASK_MONITORS)
+ if (task_monitor_count == CONFIG_RV_PER_TASK_MONITORS)
return -EBUSY;
task_monitor_count++;
- for (i = 0; i < RV_PER_TASK_MONITORS; i++) {
+ for (i = 0; i < CONFIG_RV_PER_TASK_MONITORS; i++) {
if (task_monitor_slots[i] == false) {
task_monitor_slots[i] = true;
return i;
@@ -194,7 +194,7 @@ void rv_put_task_monitor_slot(int slot)
{
lockdep_assert_held(&rv_interface_lock);
- if (slot < 0 || slot >= RV_PER_TASK_MONITORS) {
+ if (slot < 0 || slot >= CONFIG_RV_PER_TASK_MONITORS) {
WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot);
return;
}
@@ -207,15 +207,44 @@ void rv_put_task_monitor_slot(int slot)
}
/*
+ * Monitors with a parent are nested,
+ * Monitors without a parent could be standalone or containers.
+ */
+bool rv_is_nested_monitor(struct rv_monitor *mon)
+{
+ return mon->parent != NULL;
+}
+
+/*
+ * We set our list to have nested monitors listed after their parent
+ * if a monitor has a child element its a container.
+ * Containers can be also identified based on their function pointers:
+ * as they are not real monitors they do not need function definitions
+ * for enable()/disable(). Use this condition to find empty containers.
+ * Keep both conditions in case we have some non-compliant containers.
+ */
+bool rv_is_container_monitor(struct rv_monitor *mon)
+{
+ struct rv_monitor *next;
+
+ if (list_is_last(&mon->list, &rv_monitors_list))
+ return false;
+
+ next = list_next_entry(mon, list);
+
+ return next->parent == mon || !mon->enable;
+}
+
+/*
* This section collects the monitor/ files and folders.
*/
static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count,
loff_t *ppos)
{
- struct rv_monitor_def *mdef = filp->private_data;
+ struct rv_monitor *mon = filp->private_data;
const char *buff;
- buff = mdef->monitor->enabled ? "1\n" : "0\n";
+ buff = mon->enabled ? "1\n" : "0\n";
return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1);
}
@@ -223,13 +252,14 @@ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf
/*
* __rv_disable_monitor - disabled an enabled monitor
*/
-static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
+static int __rv_disable_monitor(struct rv_monitor *mon, bool sync)
{
lockdep_assert_held(&rv_interface_lock);
- if (mdef->monitor->enabled) {
- mdef->monitor->enabled = 0;
- mdef->monitor->disable();
+ if (mon->enabled) {
+ mon->enabled = 0;
+ if (mon->disable)
+ mon->disable();
/*
* Wait for the execution of all events to finish.
@@ -243,37 +273,90 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
return 0;
}
+static void rv_disable_single(struct rv_monitor *mon)
+{
+ __rv_disable_monitor(mon, true);
+}
+
+static int rv_enable_single(struct rv_monitor *mon)
+{
+ int retval;
+
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (mon->enabled)
+ return 0;
+
+ retval = mon->enable();
+
+ if (!retval)
+ mon->enabled = 1;
+
+ return retval;
+}
+
+static void rv_disable_container(struct rv_monitor *mon)
+{
+ struct rv_monitor *p = mon;
+ int enabled = 0;
+
+ list_for_each_entry_continue(p, &rv_monitors_list, list) {
+ if (p->parent != mon)
+ break;
+ enabled += __rv_disable_monitor(p, false);
+ }
+ if (enabled)
+ tracepoint_synchronize_unregister();
+ mon->enabled = 0;
+}
+
+static int rv_enable_container(struct rv_monitor *mon)
+{
+ struct rv_monitor *p = mon;
+ int retval = 0;
+
+ list_for_each_entry_continue(p, &rv_monitors_list, list) {
+ if (retval || p->parent != mon)
+ break;
+ retval = rv_enable_single(p);
+ }
+ if (retval)
+ rv_disable_container(mon);
+ else
+ mon->enabled = 1;
+ return retval;
+}
+
/**
* rv_disable_monitor - disable a given runtime monitor
- * @mdef: Pointer to the monitor definition structure.
+ * @mon: Pointer to the monitor definition structure.
*
* Returns 0 on success.
*/
-int rv_disable_monitor(struct rv_monitor_def *mdef)
+int rv_disable_monitor(struct rv_monitor *mon)
{
- __rv_disable_monitor(mdef, true);
+ if (rv_is_container_monitor(mon))
+ rv_disable_container(mon);
+ else
+ rv_disable_single(mon);
+
return 0;
}
/**
* rv_enable_monitor - enable a given runtime monitor
- * @mdef: Pointer to the monitor definition structure.
+ * @mon: Pointer to the monitor definition structure.
*
* Returns 0 on success, error otherwise.
*/
-int rv_enable_monitor(struct rv_monitor_def *mdef)
+int rv_enable_monitor(struct rv_monitor *mon)
{
int retval;
- lockdep_assert_held(&rv_interface_lock);
-
- if (mdef->monitor->enabled)
- return 0;
-
- retval = mdef->monitor->enable();
-
- if (!retval)
- mdef->monitor->enabled = 1;
+ if (rv_is_container_monitor(mon))
+ retval = rv_enable_container(mon);
+ else
+ retval = rv_enable_single(mon);
return retval;
}
@@ -284,7 +367,7 @@ int rv_enable_monitor(struct rv_monitor_def *mdef)
static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf,
size_t count, loff_t *ppos)
{
- struct rv_monitor_def *mdef = filp->private_data;
+ struct rv_monitor *mon = filp->private_data;
int retval;
bool val;
@@ -292,14 +375,12 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u
if (retval)
return retval;
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
if (val)
- retval = rv_enable_monitor(mdef);
+ retval = rv_enable_monitor(mon);
else
- retval = rv_disable_monitor(mdef);
-
- mutex_unlock(&rv_interface_lock);
+ retval = rv_disable_monitor(mon);
return retval ? : count;
}
@@ -316,12 +397,12 @@ static const struct file_operations interface_enable_fops = {
static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count,
loff_t *ppos)
{
- struct rv_monitor_def *mdef = filp->private_data;
+ struct rv_monitor *mon = filp->private_data;
char buff[256];
memset(buff, 0, sizeof(buff));
- snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description);
+ snprintf(buff, sizeof(buff), "%s\n", mon->description);
return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1);
}
@@ -336,38 +417,30 @@ static const struct file_operations interface_desc_fops = {
* the monitor dir, where the specific options of the monitor
* are exposed.
*/
-static int create_monitor_dir(struct rv_monitor_def *mdef)
+static int create_monitor_dir(struct rv_monitor *mon, struct rv_monitor *parent)
{
- struct dentry *root = get_monitors_root();
- const char *name = mdef->monitor->name;
+ struct dentry *root = parent ? parent->root_d : get_monitors_root();
+ struct dentry *dir __free(rv_remove) = rv_create_dir(mon->name, root);
struct dentry *tmp;
int retval;
- mdef->root_d = rv_create_dir(name, root);
- if (!mdef->root_d)
+ if (!dir)
return -ENOMEM;
- tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops);
- if (!tmp) {
- retval = -ENOMEM;
- goto out_remove_root;
- }
+ tmp = rv_create_file("enable", RV_MODE_WRITE, dir, mon, &interface_enable_fops);
+ if (!tmp)
+ return -ENOMEM;
- tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops);
- if (!tmp) {
- retval = -ENOMEM;
- goto out_remove_root;
- }
+ tmp = rv_create_file("desc", RV_MODE_READ, dir, mon, &interface_desc_fops);
+ if (!tmp)
+ return -ENOMEM;
- retval = reactor_populate_monitor(mdef);
+ retval = reactor_populate_monitor(mon, dir);
if (retval)
- goto out_remove_root;
+ return retval;
+ mon->root_d = no_free_ptr(dir);
return 0;
-
-out_remove_root:
- rv_remove(mdef->root_d);
- return retval;
}
/*
@@ -375,9 +448,12 @@ out_remove_root:
*/
static int monitors_show(struct seq_file *m, void *p)
{
- struct rv_monitor_def *mon_def = p;
+ struct rv_monitor *mon = container_of(p, struct rv_monitor, list);
- seq_printf(m, "%s\n", mon_def->monitor->name);
+ if (mon->parent)
+ seq_printf(m, "%s:%s\n", mon->parent->name, mon->name);
+ else
+ seq_printf(m, "%s\n", mon->name);
return 0;
}
@@ -409,13 +485,13 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos)
*/
static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct rv_monitor_def *m_def = p;
+ struct rv_monitor *mon = container_of(p, struct rv_monitor, list);
(*pos)++;
- list_for_each_entry_continue(m_def, &rv_monitors_list, list) {
- if (m_def->monitor->enabled)
- return m_def;
+ list_for_each_entry_continue(mon, &rv_monitors_list, list) {
+ if (mon->enabled)
+ return &mon->list;
}
return NULL;
@@ -423,7 +499,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
{
- struct rv_monitor_def *m_def;
+ struct list_head *head;
loff_t l;
mutex_lock(&rv_interface_lock);
@@ -431,15 +507,15 @@ static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
if (list_empty(&rv_monitors_list))
return NULL;
- m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list);
+ head = &rv_monitors_list;
for (l = 0; l <= *pos; ) {
- m_def = enabled_monitors_next(m, m_def, &l);
- if (!m_def)
+ head = enabled_monitors_next(m, head, &l);
+ if (!head)
break;
}
- return m_def;
+ return head;
}
/*
@@ -479,13 +555,13 @@ static const struct file_operations available_monitors_ops = {
*/
static void disable_all_monitors(void)
{
- struct rv_monitor_def *mdef;
+ struct rv_monitor *mon;
int enabled = 0;
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
- list_for_each_entry(mdef, &rv_monitors_list, list)
- enabled += __rv_disable_monitor(mdef, false);
+ list_for_each_entry(mon, &rv_monitors_list, list)
+ enabled += __rv_disable_monitor(mon, false);
if (enabled) {
/*
@@ -495,8 +571,6 @@ static void disable_all_monitors(void)
*/
tracepoint_synchronize_unregister();
}
-
- mutex_unlock(&rv_interface_lock);
}
static int enabled_monitors_open(struct inode *inode, struct file *file)
@@ -511,10 +585,10 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
size_t count, loff_t *ppos)
{
char buff[MAX_RV_MONITOR_NAME_SIZE + 2];
- struct rv_monitor_def *mdef;
+ struct rv_monitor *mon;
int retval = -EINVAL;
bool enable = true;
- char *ptr;
+ char *ptr, *tmp;
int len;
if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1)
@@ -537,29 +611,32 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
if (!len)
return count;
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
retval = -EINVAL;
- list_for_each_entry(mdef, &rv_monitors_list, list) {
- if (strcmp(ptr, mdef->monitor->name) != 0)
+ /* we support 1 nesting level, trim the parent */
+ tmp = strstr(ptr, ":");
+ if (tmp)
+ ptr = tmp+1;
+
+ list_for_each_entry(mon, &rv_monitors_list, list) {
+ if (strcmp(ptr, mon->name) != 0)
continue;
/*
* Monitor found!
*/
if (enable)
- retval = rv_enable_monitor(mdef);
+ retval = rv_enable_monitor(mon);
else
- retval = rv_disable_monitor(mdef);
-
- if (!retval)
- retval = count;
+ retval = rv_disable_monitor(mon);
- break;
+ if (retval)
+ return retval;
+ return count;
}
- mutex_unlock(&rv_interface_lock);
return retval;
}
@@ -583,8 +660,6 @@ static bool __read_mostly monitoring_on;
*/
bool rv_monitoring_on(void)
{
- /* Ensures that concurrent monitors read consistent monitoring_on */
- smp_rmb();
return READ_ONCE(monitoring_on);
}
@@ -604,25 +679,21 @@ static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf,
static void turn_monitoring_off(void)
{
WRITE_ONCE(monitoring_on, false);
- /* Ensures that concurrent monitors read consistent monitoring_on */
- smp_wmb();
}
static void reset_all_monitors(void)
{
- struct rv_monitor_def *mdef;
+ struct rv_monitor *mon;
- list_for_each_entry(mdef, &rv_monitors_list, list) {
- if (mdef->monitor->enabled)
- mdef->monitor->reset();
+ list_for_each_entry(mon, &rv_monitors_list, list) {
+ if (mon->enabled && mon->reset)
+ mon->reset();
}
}
static void turn_monitoring_on(void)
{
WRITE_ONCE(monitoring_on, true);
- /* Ensures that concurrent monitors read consistent monitoring_on */
- smp_wmb();
}
static void turn_monitoring_on_with_reset(void)
@@ -652,7 +723,7 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us
if (retval)
return retval;
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
if (val)
turn_monitoring_on_with_reset();
@@ -665,8 +736,6 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us
*/
tracepoint_synchronize_unregister();
- mutex_unlock(&rv_interface_lock);
-
return count;
}
@@ -676,58 +745,57 @@ static const struct file_operations monitoring_on_fops = {
.read = monitoring_on_read_data,
};
-static void destroy_monitor_dir(struct rv_monitor_def *mdef)
+static void destroy_monitor_dir(struct rv_monitor *mon)
{
- reactor_cleanup_monitor(mdef);
- rv_remove(mdef->root_d);
+ rv_remove(mon->root_d);
}
/**
* rv_register_monitor - register a rv monitor.
* @monitor: The rv_monitor to be registered.
+ * @parent: The parent of the monitor to be registered, NULL if not nested.
*
* Returns 0 if successful, error otherwise.
*/
-int rv_register_monitor(struct rv_monitor *monitor)
+int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
{
- struct rv_monitor_def *r;
+ struct rv_monitor *r;
int retval = 0;
if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) {
pr_info("Monitor %s has a name longer than %d\n", monitor->name,
MAX_RV_MONITOR_NAME_SIZE);
- return -1;
+ return -EINVAL;
}
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
list_for_each_entry(r, &rv_monitors_list, list) {
- if (strcmp(monitor->name, r->monitor->name) == 0) {
+ if (strcmp(monitor->name, r->name) == 0) {
pr_info("Monitor %s is already registered\n", monitor->name);
- retval = -1;
- goto out_unlock;
+ return -EEXIST;
}
}
- r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL);
- if (!r) {
- retval = -ENOMEM;
- goto out_unlock;
+ if (parent && rv_is_nested_monitor(parent)) {
+ pr_info("Parent monitor %s is already nested, cannot nest further\n",
+ parent->name);
+ return -EINVAL;
}
- r->monitor = monitor;
+ monitor->parent = parent;
- retval = create_monitor_dir(r);
- if (retval) {
- kfree(r);
- goto out_unlock;
- }
+ retval = create_monitor_dir(monitor, parent);
+ if (retval)
+ return retval;
- list_add_tail(&r->list, &rv_monitors_list);
+ /* keep children close to the parent for easier visualisation */
+ if (parent)
+ list_add(&monitor->list, &parent->list);
+ else
+ list_add_tail(&monitor->list, &rv_monitors_list);
-out_unlock:
- mutex_unlock(&rv_interface_lock);
- return retval;
+ return 0;
}
/**
@@ -738,19 +806,12 @@ out_unlock:
*/
int rv_unregister_monitor(struct rv_monitor *monitor)
{
- struct rv_monitor_def *ptr, *next;
+ guard(mutex)(&rv_interface_lock);
- mutex_lock(&rv_interface_lock);
+ rv_disable_monitor(monitor);
+ list_del(&monitor->list);
+ destroy_monitor_dir(monitor);
- list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) {
- if (strcmp(monitor->name, ptr->monitor->name) == 0) {
- rv_disable_monitor(ptr);
- list_del(&ptr->list);
- destroy_monitor_dir(ptr);
- }
- }
-
- mutex_unlock(&rv_interface_lock);
return 0;
}
@@ -758,39 +819,36 @@ int __init rv_init_interface(void)
{
struct dentry *tmp;
int retval;
+ struct dentry *root_dir __free(rv_remove) = rv_create_dir("rv", NULL);
- rv_root.root_dir = rv_create_dir("rv", NULL);
- if (!rv_root.root_dir)
- goto out_err;
+ if (!root_dir)
+ return 1;
- rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir);
+ rv_root.monitors_dir = rv_create_dir("monitors", root_dir);
if (!rv_root.monitors_dir)
- goto out_err;
+ return 1;
- tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL,
+ tmp = rv_create_file("available_monitors", RV_MODE_READ, root_dir, NULL,
&available_monitors_ops);
if (!tmp)
- goto out_err;
+ return 1;
- tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL,
+ tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, root_dir, NULL,
&enabled_monitors_ops);
if (!tmp)
- goto out_err;
+ return 1;
- tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL,
+ tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, root_dir, NULL,
&monitoring_on_fops);
if (!tmp)
- goto out_err;
- retval = init_rv_reactors(rv_root.root_dir);
+ return 1;
+ retval = init_rv_reactors(root_dir);
if (retval)
- goto out_err;
+ return 1;
turn_monitoring_on();
- return 0;
+ rv_root.root_dir = no_free_ptr(root_dir);
-out_err:
- rv_remove(rv_root.root_dir);
- printk(KERN_ERR "RV: Error while creating the RV interface\n");
- return 1;
+ return 0;
}
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index db6cb0913dbd..2c0f51ff9d5c 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -17,50 +17,29 @@ struct rv_interface {
#define rv_create_file tracefs_create_file
#define rv_remove tracefs_remove
+DEFINE_FREE(rv_remove, struct dentry *, if (_T) rv_remove(_T));
+
#define MAX_RV_MONITOR_NAME_SIZE 32
#define MAX_RV_REACTOR_NAME_SIZE 32
extern struct mutex rv_interface_lock;
-
-#ifdef CONFIG_RV_REACTORS
-struct rv_reactor_def {
- struct list_head list;
- struct rv_reactor *reactor;
- /* protected by the monitor interface lock */
- int counter;
-};
-#endif
-
-struct rv_monitor_def {
- struct list_head list;
- struct rv_monitor *monitor;
- struct dentry *root_d;
-#ifdef CONFIG_RV_REACTORS
- struct rv_reactor_def *rdef;
- bool reacting;
-#endif
- bool task_monitor;
-};
+extern struct list_head rv_monitors_list;
struct dentry *get_monitors_root(void);
-int rv_disable_monitor(struct rv_monitor_def *mdef);
-int rv_enable_monitor(struct rv_monitor_def *mdef);
+int rv_disable_monitor(struct rv_monitor *mon);
+int rv_enable_monitor(struct rv_monitor *mon);
+bool rv_is_container_monitor(struct rv_monitor *mon);
+bool rv_is_nested_monitor(struct rv_monitor *mon);
#ifdef CONFIG_RV_REACTORS
-int reactor_populate_monitor(struct rv_monitor_def *mdef);
-void reactor_cleanup_monitor(struct rv_monitor_def *mdef);
+int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root);
int init_rv_reactors(struct dentry *root_dir);
#else
-static inline int reactor_populate_monitor(struct rv_monitor_def *mdef)
+static inline int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root)
{
return 0;
}
-static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
-{
- return;
-}
-
static inline int init_rv_reactors(struct dentry *root_dir)
{
return 0;
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index 7b49cbe388d4..460af07f7aba 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -61,6 +61,7 @@
* printk
*/
+#include <linux/lockdep.h>
#include <linux/slab.h>
#include "rv.h"
@@ -70,12 +71,12 @@
*/
static LIST_HEAD(rv_reactors_list);
-static struct rv_reactor_def *get_reactor_rdef_by_name(char *name)
+static struct rv_reactor *get_reactor_rdef_by_name(char *name)
{
- struct rv_reactor_def *r;
+ struct rv_reactor *r;
list_for_each_entry(r, &rv_reactors_list, list) {
- if (strcmp(name, r->reactor->name) == 0)
+ if (strcmp(name, r->name) == 0)
return r;
}
return NULL;
@@ -86,9 +87,9 @@ static struct rv_reactor_def *get_reactor_rdef_by_name(char *name)
*/
static int reactors_show(struct seq_file *m, void *p)
{
- struct rv_reactor_def *rea_def = p;
+ struct rv_reactor *reactor = container_of(p, struct rv_reactor, list);
- seq_printf(m, "%s\n", rea_def->reactor->name);
+ seq_printf(m, "%s\n", reactor->name);
return 0;
}
@@ -138,13 +139,13 @@ static const struct file_operations available_reactors_ops = {
*/
static int monitor_reactor_show(struct seq_file *m, void *p)
{
- struct rv_monitor_def *mdef = m->private;
- struct rv_reactor_def *rdef = p;
+ struct rv_monitor *mon = m->private;
+ struct rv_reactor *reactor = container_of(p, struct rv_reactor, list);
- if (mdef->rdef == rdef)
- seq_printf(m, "[%s]\n", rdef->reactor->name);
+ if (mon->reactor == reactor)
+ seq_printf(m, "[%s]\n", reactor->name);
else
- seq_printf(m, "%s\n", rdef->reactor->name);
+ seq_printf(m, "%s\n", reactor->name);
return 0;
}
@@ -158,29 +159,45 @@ static const struct seq_operations monitor_reactors_seq_ops = {
.show = monitor_reactor_show
};
-static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef,
- bool reacting)
+static void monitor_swap_reactors_single(struct rv_monitor *mon,
+ struct rv_reactor *reactor,
+ bool nested)
{
bool monitor_enabled;
/* nothing to do */
- if (mdef->rdef == rdef)
+ if (mon->reactor == reactor)
return;
- monitor_enabled = mdef->monitor->enabled;
+ monitor_enabled = mon->enabled;
if (monitor_enabled)
- rv_disable_monitor(mdef);
+ rv_disable_monitor(mon);
- /* swap reactor's usage */
- mdef->rdef->counter--;
- rdef->counter++;
+ mon->reactor = reactor;
+ mon->react = reactor->react;
- mdef->rdef = rdef;
- mdef->reacting = reacting;
- mdef->monitor->react = rdef->reactor->react;
+ /* enable only once if iterating through a container */
+ if (monitor_enabled && !nested)
+ rv_enable_monitor(mon);
+}
- if (monitor_enabled)
- rv_enable_monitor(mdef);
+static void monitor_swap_reactors(struct rv_monitor *mon, struct rv_reactor *reactor)
+{
+ struct rv_monitor *p = mon;
+
+ if (rv_is_container_monitor(mon))
+ list_for_each_entry_continue(p, &rv_monitors_list, list) {
+ if (p->parent != mon)
+ break;
+ monitor_swap_reactors_single(p, reactor, true);
+ }
+ /*
+ * This call enables and disables the monitor if they were active.
+ * In case of a container, we already disabled all and will enable all.
+ * All nested monitors are enabled also if they were off, we may refine
+ * this logic in the future.
+ */
+ monitor_swap_reactors_single(mon, reactor, false);
}
static ssize_t
@@ -188,11 +205,10 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
{
char buff[MAX_RV_REACTOR_NAME_SIZE + 2];
- struct rv_monitor_def *mdef;
- struct rv_reactor_def *rdef;
+ struct rv_monitor *mon;
+ struct rv_reactor *reactor;
struct seq_file *seq_f;
int retval = -EINVAL;
- bool enable;
char *ptr;
int len;
@@ -215,30 +231,20 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
* See monitor_reactors_open()
*/
seq_f = file->private_data;
- mdef = seq_f->private;
-
- mutex_lock(&rv_interface_lock);
+ mon = seq_f->private;
- retval = -EINVAL;
+ guard(mutex)(&rv_interface_lock);
- list_for_each_entry(rdef, &rv_reactors_list, list) {
- if (strcmp(ptr, rdef->reactor->name) != 0)
+ list_for_each_entry(reactor, &rv_reactors_list, list) {
+ if (strcmp(ptr, reactor->name) != 0)
continue;
- if (rdef == get_reactor_rdef_by_name("nop"))
- enable = false;
- else
- enable = true;
-
- monitor_swap_reactors(mdef, rdef, enable);
+ monitor_swap_reactors(mon, reactor);
- retval = count;
- break;
+ return count;
}
- mutex_unlock(&rv_interface_lock);
-
- return retval;
+ return -EINVAL;
}
/*
@@ -246,7 +252,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
*/
static int monitor_reactors_open(struct inode *inode, struct file *file)
{
- struct rv_monitor_def *mdef = inode->i_private;
+ struct rv_monitor *mon = inode->i_private;
struct seq_file *seq_f;
int ret;
@@ -262,7 +268,7 @@ static int monitor_reactors_open(struct inode *inode, struct file *file)
/*
* Copy the create file "private" data to the seq_file private data.
*/
- seq_f->private = mdef;
+ seq_f->private = mon;
return 0;
};
@@ -277,23 +283,16 @@ static const struct file_operations monitor_reactors_ops = {
static int __rv_register_reactor(struct rv_reactor *reactor)
{
- struct rv_reactor_def *r;
+ struct rv_reactor *r;
list_for_each_entry(r, &rv_reactors_list, list) {
- if (strcmp(reactor->name, r->reactor->name) == 0) {
+ if (strcmp(reactor->name, r->name) == 0) {
pr_info("Reactor %s is already registered\n", reactor->name);
return -EINVAL;
}
}
- r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- r->reactor = reactor;
- r->counter = 0;
-
- list_add_tail(&r->list, &rv_reactors_list);
+ list_add_tail(&reactor->list, &rv_reactors_list);
return 0;
}
@@ -306,18 +305,14 @@ static int __rv_register_reactor(struct rv_reactor *reactor)
*/
int rv_register_reactor(struct rv_reactor *reactor)
{
- int retval = 0;
-
if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) {
pr_info("Reactor %s has a name longer than %d\n",
reactor->name, MAX_RV_MONITOR_NAME_SIZE);
return -EINVAL;
}
- mutex_lock(&rv_interface_lock);
- retval = __rv_register_reactor(reactor);
- mutex_unlock(&rv_interface_lock);
- return retval;
+ guard(mutex)(&rv_interface_lock);
+ return __rv_register_reactor(reactor);
}
/**
@@ -328,30 +323,9 @@ int rv_register_reactor(struct rv_reactor *reactor)
*/
int rv_unregister_reactor(struct rv_reactor *reactor)
{
- struct rv_reactor_def *ptr, *next;
- int ret = 0;
-
- mutex_lock(&rv_interface_lock);
-
- list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) {
- if (strcmp(reactor->name, ptr->reactor->name) == 0) {
-
- if (!ptr->counter) {
- list_del(&ptr->list);
- } else {
- printk(KERN_WARNING
- "rv: the rv_reactor %s is in use by %d monitor(s)\n",
- ptr->reactor->name, ptr->counter);
- printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n",
- ptr->reactor->name);
- ret = -EBUSY;
- break;
- }
- }
- }
-
- mutex_unlock(&rv_interface_lock);
- return ret;
+ guard(mutex)(&rv_interface_lock);
+ list_del(&reactor->list);
+ return 0;
}
/*
@@ -364,7 +338,7 @@ static bool __read_mostly reacting_on;
*
* Returns 1 if on, 0 otherwise.
*/
-bool rv_reacting_on(void)
+static bool rv_reacting_on(void)
{
/* Ensures that concurrent monitors read consistent reacting_on */
smp_rmb();
@@ -406,7 +380,7 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user
if (retval)
return retval;
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
if (val)
turn_reacting_on();
@@ -419,8 +393,6 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user
*/
tracepoint_synchronize_unregister();
- mutex_unlock(&rv_interface_lock);
-
return count;
}
@@ -432,43 +404,31 @@ static const struct file_operations reacting_on_fops = {
/**
* reactor_populate_monitor - creates per monitor reactors file
- * @mdef: monitor's definition.
+ * @mon: The monitor.
+ * @root: The directory of the monitor.
*
* Returns 0 if successful, error otherwise.
*/
-int reactor_populate_monitor(struct rv_monitor_def *mdef)
+int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root)
{
struct dentry *tmp;
- tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops);
+ tmp = rv_create_file("reactors", RV_MODE_WRITE, root, mon, &monitor_reactors_ops);
if (!tmp)
return -ENOMEM;
/*
* Configure as the rv_nop reactor.
*/
- mdef->rdef = get_reactor_rdef_by_name("nop");
- mdef->rdef->counter++;
- mdef->reacting = false;
+ mon->reactor = get_reactor_rdef_by_name("nop");
return 0;
}
-/**
- * reactor_cleanup_monitor - cleanup a monitor reference
- * @mdef: monitor's definition.
- */
-void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
-{
- lockdep_assert_held(&rv_interface_lock);
- mdef->rdef->counter--;
- WARN_ON_ONCE(mdef->rdef->counter < 0);
-}
-
/*
* Nop reactor register
*/
-static void rv_nop_reaction(char *msg)
+__printf(1, 0) static void rv_nop_reaction(const char *msg, va_list args)
{
}
@@ -480,30 +440,42 @@ static struct rv_reactor rv_nop = {
int init_rv_reactors(struct dentry *root_dir)
{
- struct dentry *available, *reacting;
int retval;
- available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL,
- &available_reactors_ops);
- if (!available)
- goto out_err;
+ struct dentry *available __free(rv_remove) =
+ rv_create_file("available_reactors", RV_MODE_READ, root_dir,
+ NULL, &available_reactors_ops);
+
+ struct dentry *reacting __free(rv_remove) =
+ rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops);
- reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops);
- if (!reacting)
- goto rm_available;
+ if (!reacting || !available)
+ return -ENOMEM;
retval = __rv_register_reactor(&rv_nop);
if (retval)
- goto rm_reacting;
+ return retval;
turn_reacting_on();
+ retain_and_null_ptr(available);
+ retain_and_null_ptr(reacting);
return 0;
+}
+
+void rv_react(struct rv_monitor *monitor, const char *msg, ...)
+{
+ static DEFINE_WAIT_OVERRIDE_MAP(rv_react_map, LD_WAIT_FREE);
+ va_list args;
+
+ if (!rv_reacting_on() || !monitor->react)
+ return;
+
+ va_start(args, msg);
+
+ lock_map_acquire_try(&rv_react_map);
+ monitor->react(msg, args);
+ lock_map_release(&rv_react_map);
-rm_reacting:
- rv_remove(reacting);
-rm_available:
- rv_remove(available);
-out_err:
- return -ENOMEM;
+ va_end(args);
}
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
new file mode 100644
index 000000000000..9622c269789c
--- /dev/null
+++ b/kernel/trace/rv/rv_trace.h
@@ -0,0 +1,277 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rv
+
+#if !defined(_TRACE_RV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RV_H
+
+#include <linux/rv.h>
+#include <linux/tracepoint.h>
+
+#ifdef CONFIG_DA_MON_EVENTS_IMPLICIT
+DECLARE_EVENT_CLASS(event_da_monitor,
+
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+
+ TP_ARGS(state, event, next_state, final_state),
+
+ TP_STRUCT__entry(
+ __string( state, state )
+ __string( event, event )
+ __string( next_state, next_state )
+ __field( bool, final_state )
+ ),
+
+ TP_fast_assign(
+ __assign_str(state);
+ __assign_str(event);
+ __assign_str(next_state);
+ __entry->final_state = final_state;
+ ),
+
+ TP_printk("%s x %s -> %s%s",
+ __get_str(state),
+ __get_str(event),
+ __get_str(next_state),
+ __entry->final_state ? " (final)" : "")
+);
+
+DECLARE_EVENT_CLASS(error_da_monitor,
+
+ TP_PROTO(char *state, char *event),
+
+ TP_ARGS(state, event),
+
+ TP_STRUCT__entry(
+ __string( state, state )
+ __string( event, event )
+ ),
+
+ TP_fast_assign(
+ __assign_str(state);
+ __assign_str(event);
+ ),
+
+ TP_printk("event %s not expected in the state %s",
+ __get_str(event),
+ __get_str(state))
+);
+
+#include <monitors/wip/wip_trace.h>
+#include <monitors/sco/sco_trace.h>
+#include <monitors/scpd/scpd_trace.h>
+#include <monitors/snep/snep_trace.h>
+#include <monitors/sts/sts_trace.h>
+// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here
+
+#ifdef CONFIG_HA_MON_EVENTS_IMPLICIT
+/* For simplicity this class is marked as DA although relevant only for HA */
+DECLARE_EVENT_CLASS(error_env_da_monitor,
+
+ TP_PROTO(char *state, char *event, char *env),
+
+ TP_ARGS(state, event, env),
+
+ TP_STRUCT__entry(
+ __string( state, state )
+ __string( event, event )
+ __string( env, env )
+ ),
+
+ TP_fast_assign(
+ __assign_str(state);
+ __assign_str(event);
+ __assign_str(env);
+ ),
+
+ TP_printk("event %s not expected in the state %s with env %s",
+ __get_str(event),
+ __get_str(state),
+ __get_str(env))
+);
+
+#include <monitors/opid/opid_trace.h>
+// Add new monitors based on CONFIG_HA_MON_EVENTS_IMPLICIT here
+
+#endif
+
+#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */
+
+#ifdef CONFIG_DA_MON_EVENTS_ID
+DECLARE_EVENT_CLASS(event_da_monitor_id,
+
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+
+ TP_ARGS(id, state, event, next_state, final_state),
+
+ TP_STRUCT__entry(
+ __field( int, id )
+ __string( state, state )
+ __string( event, event )
+ __string( next_state, next_state )
+ __field( bool, final_state )
+ ),
+
+ TP_fast_assign(
+ __assign_str(state);
+ __assign_str(event);
+ __assign_str(next_state);
+ __entry->id = id;
+ __entry->final_state = final_state;
+ ),
+
+ TP_printk("%d: %s x %s -> %s%s",
+ __entry->id,
+ __get_str(state),
+ __get_str(event),
+ __get_str(next_state),
+ __entry->final_state ? " (final)" : "")
+);
+
+DECLARE_EVENT_CLASS(error_da_monitor_id,
+
+ TP_PROTO(int id, char *state, char *event),
+
+ TP_ARGS(id, state, event),
+
+ TP_STRUCT__entry(
+ __field( int, id )
+ __string( state, state )
+ __string( event, event )
+ ),
+
+ TP_fast_assign(
+ __assign_str(state);
+ __assign_str(event);
+ __entry->id = id;
+ ),
+
+ TP_printk("%d: event %s not expected in the state %s",
+ __entry->id,
+ __get_str(event),
+ __get_str(state))
+);
+
+#include <monitors/wwnr/wwnr_trace.h>
+#include <monitors/snroc/snroc_trace.h>
+#include <monitors/nrp/nrp_trace.h>
+#include <monitors/sssw/sssw_trace.h>
+// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here
+
+#ifdef CONFIG_HA_MON_EVENTS_ID
+/* For simplicity this class is marked as DA although relevant only for HA */
+DECLARE_EVENT_CLASS(error_env_da_monitor_id,
+
+ TP_PROTO(int id, char *state, char *event, char *env),
+
+ TP_ARGS(id, state, event, env),
+
+ TP_STRUCT__entry(
+ __field( int, id )
+ __string( state, state )
+ __string( event, event )
+ __string( env, env )
+ ),
+
+ TP_fast_assign(
+ __assign_str(state);
+ __assign_str(event);
+ __assign_str(env);
+ __entry->id = id;
+ ),
+
+ TP_printk("%d: event %s not expected in the state %s with env %s",
+ __entry->id,
+ __get_str(event),
+ __get_str(state),
+ __get_str(env))
+);
+
+#include <monitors/stall/stall_trace.h>
+#include <monitors/nomiss/nomiss_trace.h>
+// Add new monitors based on CONFIG_HA_MON_EVENTS_ID here
+
+#endif
+
+#endif /* CONFIG_DA_MON_EVENTS_ID */
+#ifdef CONFIG_LTL_MON_EVENTS_ID
+DECLARE_EVENT_CLASS(event_ltl_monitor_id,
+
+ TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
+
+ TP_ARGS(task, states, atoms, next),
+
+ TP_STRUCT__entry(
+ __string(comm, task->comm)
+ __field(pid_t, pid)
+ __string(states, states)
+ __string(atoms, atoms)
+ __string(next, next)
+ ),
+
+ TP_fast_assign(
+ __assign_str(comm);
+ __entry->pid = task->pid;
+ __assign_str(states);
+ __assign_str(atoms);
+ __assign_str(next);
+ ),
+
+ TP_printk("%s[%d]: (%s) x (%s) -> (%s)", __get_str(comm), __entry->pid,
+ __get_str(states), __get_str(atoms), __get_str(next))
+);
+
+DECLARE_EVENT_CLASS(error_ltl_monitor_id,
+
+ TP_PROTO(struct task_struct *task),
+
+ TP_ARGS(task),
+
+ TP_STRUCT__entry(
+ __string(comm, task->comm)
+ __field(pid_t, pid)
+ ),
+
+ TP_fast_assign(
+ __assign_str(comm);
+ __entry->pid = task->pid;
+ ),
+
+ TP_printk("%s[%d]: violation detected", __get_str(comm), __entry->pid)
+);
+#include <monitors/pagefault/pagefault_trace.h>
+#include <monitors/sleep/sleep_trace.h>
+// Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here
+#endif /* CONFIG_LTL_MON_EVENTS_ID */
+
+#ifdef CONFIG_RV_MON_MAINTENANCE_EVENTS
+/* Tracepoint useful for monitors development, currenly only used in DA */
+TRACE_EVENT(rv_retries_error,
+
+ TP_PROTO(char *name, char *event),
+
+ TP_ARGS(name, event),
+
+ TP_STRUCT__entry(
+ __string( name, name )
+ __string( event, event )
+ ),
+
+ TP_fast_assign(
+ __assign_str(name);
+ __assign_str(event);
+ ),
+
+ TP_printk(__stringify(MAX_DA_RETRY_RACING_EVENTS)
+ " retries reached for event %s, resetting monitor %s",
+ __get_str(event), __get_str(name))
+);
+#endif /* CONFIG_RV_MON_MAINTENANCE_EVENTS */
+#endif /* _TRACE_RV_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE rv_trace
+#include <trace/define_trace.h>
diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c
new file mode 100644
index 000000000000..f4642f5adda3
--- /dev/null
+++ b/kernel/trace/simple_ring_buffer.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 - Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/atomic.h>
+#include <linux/simple_ring_buffer.h>
+
+#include <asm/barrier.h>
+#include <asm/local.h>
+
+enum simple_rb_link_type {
+ SIMPLE_RB_LINK_NORMAL = 0,
+ SIMPLE_RB_LINK_HEAD = 1,
+ SIMPLE_RB_LINK_HEAD_MOVING
+};
+
+#define SIMPLE_RB_LINK_MASK ~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING)
+
+static void simple_bpage_set_head_link(struct simple_buffer_page *bpage)
+{
+ unsigned long link = (unsigned long)bpage->link.next;
+
+ link &= SIMPLE_RB_LINK_MASK;
+ link |= SIMPLE_RB_LINK_HEAD;
+
+ /*
+ * Paired with simple_rb_find_head() to order access between the head
+ * link and overrun. It ensures we always report an up-to-date value
+ * after swapping the reader page.
+ */
+ smp_store_release(&bpage->link.next, (struct list_head *)link);
+}
+
+static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage,
+ struct simple_buffer_page *dst,
+ enum simple_rb_link_type new_type)
+{
+ unsigned long *link = (unsigned long *)(&bpage->link.next);
+ unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD;
+ unsigned long new = (unsigned long)(&dst->link) | new_type;
+
+ return try_cmpxchg(link, &old, new);
+}
+
+static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage)
+{
+ unsigned long link = (unsigned long)bpage->link.next;
+
+ WRITE_ONCE(bpage->link.next, (struct list_head *)(link & SIMPLE_RB_LINK_MASK));
+}
+
+static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link)
+{
+ unsigned long ptr = (unsigned long)link & SIMPLE_RB_LINK_MASK;
+
+ return container_of((struct list_head *)ptr, struct simple_buffer_page, link);
+}
+
+static struct simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage)
+{
+ return simple_bpage_from_link(bpage->link.next);
+}
+
+static void simple_bpage_reset(struct simple_buffer_page *bpage)
+{
+ bpage->write = 0;
+ bpage->entries = 0;
+
+ local_set(&bpage->page->commit, 0);
+}
+
+static void simple_bpage_init(struct simple_buffer_page *bpage, void *page)
+{
+ INIT_LIST_HEAD(&bpage->link);
+ bpage->page = (struct buffer_data_page *)page;
+
+ simple_bpage_reset(bpage);
+}
+
+#define simple_rb_meta_inc(__meta, __inc) \
+ WRITE_ONCE((__meta), (__meta + __inc))
+
+static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer)
+{
+ return !!cpu_buffer->bpages;
+}
+
+static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer)
+{
+ int retry = cpu_buffer->nr_pages * 2;
+ struct simple_buffer_page *head;
+
+ head = cpu_buffer->head_page;
+
+ while (retry--) {
+ unsigned long link;
+
+spin:
+ /* See smp_store_release in simple_bpage_set_head_link() */
+ link = (unsigned long)smp_load_acquire(&head->link.prev->next);
+
+ switch (link & ~SIMPLE_RB_LINK_MASK) {
+ /* Found the head */
+ case SIMPLE_RB_LINK_HEAD:
+ cpu_buffer->head_page = head;
+ return 0;
+ /* The writer caught the head, we can spin, that won't be long */
+ case SIMPLE_RB_LINK_HEAD_MOVING:
+ goto spin;
+ }
+
+ head = simple_bpage_next_page(head);
+ }
+
+ return -EBUSY;
+}
+
+/**
+ * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader
+ * @cpu_buffer: A simple_rb_per_cpu
+ *
+ * This function enables consuming reading. It ensures the current head page will not be overwritten
+ * and can be safely read.
+ *
+ * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded or -EBUSY if we failed to catch the
+ * head page.
+ */
+int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer)
+{
+ struct simple_buffer_page *last, *head, *reader;
+ unsigned long overrun;
+ int retry = 8;
+ int ret;
+
+ if (!simple_rb_loaded(cpu_buffer))
+ return -ENODEV;
+
+ reader = cpu_buffer->reader_page;
+
+ do {
+ /* Run after the writer to find the head */
+ ret = simple_rb_find_head(cpu_buffer);
+ if (ret)
+ return ret;
+
+ head = cpu_buffer->head_page;
+
+ /* Connect the reader page around the header page */
+ reader->link.next = head->link.next;
+ reader->link.prev = head->link.prev;
+
+ /* The last page before the head */
+ last = simple_bpage_from_link(head->link.prev);
+
+ /* The reader page points to the new header page */
+ simple_bpage_set_head_link(reader);
+
+ overrun = cpu_buffer->meta->overrun;
+ } while (!simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL) && retry--);
+
+ if (!retry)
+ return -EINVAL;
+
+ cpu_buffer->head_page = simple_bpage_from_link(reader->link.next);
+ cpu_buffer->head_page->link.prev = &reader->link;
+ cpu_buffer->reader_page = head;
+ cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun;
+ cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id;
+ cpu_buffer->last_overrun = overrun;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page);
+
+static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer)
+{
+ struct simple_buffer_page *tail, *new_tail;
+
+ tail = cpu_buffer->tail_page;
+ new_tail = simple_bpage_next_page(tail);
+
+ if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) {
+ /*
+ * Oh no! we've caught the head. There is none anymore and
+ * swap_reader will spin until we set the new one. Overrun must
+ * be written first, to make sure we report the correct number
+ * of lost events.
+ */
+ simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries);
+ simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1);
+
+ simple_bpage_set_head_link(new_tail);
+ simple_bpage_set_normal_link(tail);
+ }
+
+ simple_bpage_reset(new_tail);
+ cpu_buffer->tail_page = new_tail;
+
+ simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1);
+
+ return new_tail;
+}
+
+static unsigned long rb_event_size(unsigned long length)
+{
+ struct ring_buffer_event *event;
+
+ return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]);
+}
+
+static struct ring_buffer_event *
+rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta)
+{
+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+
+ return (struct ring_buffer_event *)((unsigned long)event + 8);
+}
+
+static struct ring_buffer_event *
+simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp)
+{
+ unsigned long ts_ext_size = 0, event_size = rb_event_size(length);
+ struct simple_buffer_page *tail = cpu_buffer->tail_page;
+ struct ring_buffer_event *event;
+ u32 write, prev_write;
+ u64 time_delta;
+
+ time_delta = timestamp - cpu_buffer->write_stamp;
+
+ if (test_time_stamp(time_delta))
+ ts_ext_size = 8;
+
+ prev_write = tail->write;
+ write = prev_write + event_size + ts_ext_size;
+
+ if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE)))
+ tail = simple_rb_move_tail(cpu_buffer);
+
+ if (!tail->entries) {
+ tail->page->time_stamp = timestamp;
+ time_delta = 0;
+ ts_ext_size = 0;
+ write = event_size;
+ prev_write = 0;
+ }
+
+ tail->write = write;
+ tail->entries++;
+
+ cpu_buffer->write_stamp = timestamp;
+
+ event = (struct ring_buffer_event *)(tail->page->data + prev_write);
+ if (ts_ext_size) {
+ event = rb_event_add_ts_extend(event, time_delta);
+ time_delta = 0;
+ }
+
+ event->type_len = 0;
+ event->time_delta = time_delta;
+ event->array[0] = event_size - RB_EVNT_HDR_SIZE;
+
+ return event;
+}
+
+/**
+ * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer
+ * @cpu_buffer: A simple_rb_per_cpu
+ * @length: Size of the entry in bytes
+ * @timestamp: Timestamp of the entry
+ *
+ * Returns the address of the entry where to write data or NULL
+ */
+void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length,
+ u64 timestamp)
+{
+ struct ring_buffer_event *rb_event;
+
+ if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY)
+ return NULL;
+
+ rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp);
+
+ return &rb_event->array[1];
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve);
+
+/**
+ * simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve()
+ * @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved
+ */
+void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer)
+{
+ local_set(&cpu_buffer->tail_page->page->commit,
+ cpu_buffer->tail_page->write);
+ simple_rb_meta_inc(cpu_buffer->meta->entries, 1);
+
+ /*
+ * Paired with simple_rb_enable_tracing() to ensure data is
+ * written to the ring-buffer before teardown.
+ */
+ smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY);
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_commit);
+
+static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable)
+{
+ u32 prev_status;
+
+ if (enable)
+ return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY);
+
+ /* Wait for the buffer to be released */
+ do {
+ prev_status = cmpxchg_acquire(&cpu_buffer->status,
+ SIMPLE_RB_READY,
+ SIMPLE_RB_UNAVAILABLE);
+ } while (prev_status == SIMPLE_RB_WRITING);
+
+ return prev_status;
+}
+
+/**
+ * simple_ring_buffer_reset - Reset @cpu_buffer
+ * @cpu_buffer: A simple_rb_per_cpu
+ *
+ * This will not clear the content of the data, only reset counters and pointers
+ *
+ * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded.
+ */
+int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer)
+{
+ struct simple_buffer_page *bpage;
+ u32 prev_status;
+ int ret;
+
+ if (!simple_rb_loaded(cpu_buffer))
+ return -ENODEV;
+
+ prev_status = simple_rb_enable_tracing(cpu_buffer, false);
+
+ ret = simple_rb_find_head(cpu_buffer);
+ if (ret)
+ return ret;
+
+ bpage = cpu_buffer->tail_page = cpu_buffer->head_page;
+ do {
+ simple_bpage_reset(bpage);
+ bpage = simple_bpage_next_page(bpage);
+ } while (bpage != cpu_buffer->head_page);
+
+ simple_bpage_reset(cpu_buffer->reader_page);
+
+ cpu_buffer->last_overrun = 0;
+ cpu_buffer->write_stamp = 0;
+
+ cpu_buffer->meta->reader.read = 0;
+ cpu_buffer->meta->reader.lost_events = 0;
+ cpu_buffer->meta->entries = 0;
+ cpu_buffer->meta->overrun = 0;
+ cpu_buffer->meta->read = 0;
+ cpu_buffer->meta->pages_lost = 0;
+ cpu_buffer->meta->pages_touched = 0;
+
+ if (prev_status == SIMPLE_RB_READY)
+ simple_rb_enable_tracing(cpu_buffer, true);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_reset);
+
+int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
+ struct simple_buffer_page *bpages,
+ const struct ring_buffer_desc *desc,
+ void *(*load_page)(unsigned long va),
+ void (*unload_page)(void *va))
+{
+ struct simple_buffer_page *bpage = bpages;
+ int ret = 0;
+ void *page;
+ int i;
+
+ /* At least 1 reader page and two pages in the ring-buffer */
+ if (desc->nr_page_va < 3)
+ return -EINVAL;
+
+ memset(cpu_buffer, 0, sizeof(*cpu_buffer));
+
+ cpu_buffer->meta = load_page(desc->meta_va);
+ if (!cpu_buffer->meta)
+ return -EINVAL;
+
+ memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta));
+ cpu_buffer->meta->meta_page_size = PAGE_SIZE;
+
+ /* The reader page is not part of the ring initially */
+ page = load_page(desc->page_va[0]);
+ if (!page) {
+ unload_page(cpu_buffer->meta);
+ return -EINVAL;
+ }
+
+ simple_bpage_init(bpage, page);
+ bpage->id = 0;
+
+ cpu_buffer->nr_pages = 1;
+
+ cpu_buffer->reader_page = bpage;
+ cpu_buffer->tail_page = bpage + 1;
+ cpu_buffer->head_page = bpage + 1;
+
+ for (i = 1; i < desc->nr_page_va; i++) {
+ page = load_page(desc->page_va[i]);
+ if (!page) {
+ ret = -EINVAL;
+ break;
+ }
+
+ simple_bpage_init(++bpage, page);
+
+ bpage->link.next = &(bpage + 1)->link;
+ bpage->link.prev = &(bpage - 1)->link;
+ bpage->id = i;
+
+ cpu_buffer->nr_pages = i + 1;
+ }
+
+ if (ret) {
+ for (i--; i >= 0; i--)
+ unload_page(bpages[i].page);
+ unload_page(cpu_buffer->meta);
+
+ return ret;
+ }
+
+ cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages;
+ /* Close the ring */
+ bpage->link.next = &cpu_buffer->tail_page->link;
+ cpu_buffer->tail_page->link.prev = &bpage->link;
+
+ /* The last init'ed page points to the head page */
+ simple_bpage_set_head_link(bpage);
+
+ cpu_buffer->bpages = bpages;
+
+ return 0;
+}
+
+static void *__load_page(unsigned long page)
+{
+ return (void *)page;
+}
+
+static void __unload_page(void *page) { }
+
+/**
+ * simple_ring_buffer_init - Init @cpu_buffer based on @desc
+ * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller.
+ * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va
+ * @desc: A ring_buffer_desc
+ *
+ * Returns 0 on success or -EINVAL if the content of @desc is invalid
+ */
+int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages,
+ const struct ring_buffer_desc *desc)
+{
+ return simple_ring_buffer_init_mm(cpu_buffer, bpages, desc, __load_page, __unload_page);
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_init);
+
+void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer,
+ void (*unload_page)(void *))
+{
+ int p;
+
+ if (!simple_rb_loaded(cpu_buffer))
+ return;
+
+ simple_rb_enable_tracing(cpu_buffer, false);
+
+ unload_page(cpu_buffer->meta);
+ for (p = 0; p < cpu_buffer->nr_pages; p++)
+ unload_page(cpu_buffer->bpages[p].page);
+
+ cpu_buffer->bpages = NULL;
+}
+
+/**
+ * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion
+ * @cpu_buffer: A simple_rb_per_cpu that will be deleted.
+ */
+void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer)
+{
+ return simple_ring_buffer_unload_mm(cpu_buffer, __unload_page);
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_unload);
+
+/**
+ * simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer
+ * @cpu_buffer: A simple_rb_per_cpu
+ * @enable: True to enable tracing, False to disable it
+ *
+ * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded
+ */
+int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable)
+{
+ if (!simple_rb_loaded(cpu_buffer))
+ return -ENODEV;
+
+ simple_rb_enable_tracing(cpu_buffer, enable);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8aebcb01e62..6eb4d3097a4d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -20,12 +20,14 @@
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/irqflags.h>
+#include <linux/syscalls.h>
#include <linux/debugfs.h>
#include <linux/tracefs.h>
#include <linux/pagemap.h>
#include <linux/hardirq.h>
#include <linux/linkage.h>
#include <linux/uaccess.h>
+#include <linux/cleanup.h>
#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/module.h>
@@ -45,9 +47,11 @@
#include <linux/trace.h>
#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
-#include <linux/fsnotify.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>
+#include <linux/sort.h>
+#include <linux/io.h> /* vmap_page_range() */
+#include <linux/fs_context.h>
#include <asm/setup.h> /* COMMAND_LINE_SIZE */
@@ -62,7 +66,7 @@
* insertions into the ring-buffer such as trace_printk could occurred
* at the same time, giving false positive or negative results.
*/
-static bool __read_mostly tracing_selftest_running;
+bool __read_mostly tracing_selftest_running;
/*
* If boot-time tracing including tracers/events via kernel cmdline
@@ -78,7 +82,6 @@ void __init disable_tracing_selftest(const char *reason)
}
}
#else
-#define tracing_selftest_running 0
#define tracing_selftest_disabled 0
#endif
@@ -86,19 +89,16 @@ void __init disable_tracing_selftest(const char *reason)
static struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
static bool tracepoint_printk_stop_on_boot __initdata;
+static bool traceoff_after_boot __initdata;
static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
-/* For tracers that don't implement custom flags */
-static struct tracer_opt dummy_tracer_opt[] = {
- { }
+/* Store tracers and their flags per instance */
+struct tracers {
+ struct list_head list;
+ struct tracer *tracer;
+ struct tracer_flags *flags;
};
-static int
-dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
-{
- return 0;
-}
-
/*
* To prevent the comm cache from being overwritten when no
* tracing is active, only save the comm when a trace event
@@ -112,17 +112,18 @@ DEFINE_PER_CPU(bool, trace_taskinfo_save);
* of the tracer is successful. But that is the only place that sets
* this back to zero.
*/
-static int tracing_disabled = 1;
+int tracing_disabled = 1;
cpumask_var_t __read_mostly tracing_buffer_mask;
+#define MAX_TRACER_SIZE 100
/*
* ftrace_dump_on_oops - variable to dump ftrace buffer on oops
*
* If there is an oops (or kernel panic) and the ftrace_dump_on_oops
* is set, then ftrace_dump is called. This will output the contents
* of the ftrace buffers to the console. This is very useful for
- * capturing traces that lead to crashes and outputing it to a
+ * capturing traces that lead to crashes and outputting it to a
* serial console.
*
* It is default off, but you can enable it with either specifying
@@ -131,14 +132,47 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
* Set 1 if you want to dump buffers of all CPUs
* Set 2 if you want to dump the buffer of the CPU that triggered oops
* Set instance name if you want to dump the specific trace instance
- * Multiple instance dump is also supported, and instances are seperated
+ * Multiple instance dump is also supported, and instances are separated
* by commas.
*/
/* Set to string format zero to disable by default */
-char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0";
+static char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0";
/* When set, tracing will stop when a WARN*() is hit */
-int __disable_trace_on_warning;
+static int __disable_trace_on_warning;
+
+int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+static const struct ctl_table trace_sysctl_table[] = {
+ {
+ .procname = "ftrace_dump_on_oops",
+ .data = &ftrace_dump_on_oops,
+ .maxlen = MAX_TRACER_SIZE,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "traceoff_on_warning",
+ .data = &__disable_trace_on_warning,
+ .maxlen = sizeof(__disable_trace_on_warning),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tracepoint_printk",
+ .data = &tracepoint_printk,
+ .maxlen = sizeof(tracepoint_printk),
+ .mode = 0644,
+ .proc_handler = tracepoint_printk_sysctl,
+ },
+};
+
+static int __init init_trace_sysctls(void)
+{
+ register_sysctl_init("kernel", trace_sysctl_table);
+ return 0;
+}
+subsys_initcall(init_trace_sysctls);
#ifdef CONFIG_TRACE_EVAL_MAP_FILE
/* Map of enums to their values, for "eval_map" file */
@@ -184,14 +218,36 @@ static void ftrace_trace_userstack(struct trace_array *tr,
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
-static bool allocate_snapshot;
-static bool snapshot_at_boot;
-
static char boot_instance_info[COMMAND_LINE_SIZE] __initdata;
static int boot_instance_index;
-static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata;
-static int boot_snapshot_index;
+/*
+ * Repeated boot parameters, including Bootconfig array expansions, need
+ * to stay in the delimiter form that the existing parser consumes.
+ */
+void __init trace_append_boot_param(char *buf, const char *str, char sep,
+ int size)
+{
+ int len, needed, str_len;
+
+ if (!*str)
+ return;
+
+ len = strlen(buf);
+ str_len = strlen(str);
+ needed = len + str_len + 1;
+
+ /* For continuation, account for the separator. */
+ if (len)
+ needed++;
+ if (needed > size)
+ return;
+
+ if (len)
+ buf[len++] = sep;
+
+ strscpy(buf + len, str, size - len);
+}
static int __init set_cmdline_ftrace(char *str)
{
@@ -241,38 +297,6 @@ static int __init stop_trace_on_warning(char *str)
}
__setup("traceoff_on_warning", stop_trace_on_warning);
-static int __init boot_alloc_snapshot(char *str)
-{
- char *slot = boot_snapshot_info + boot_snapshot_index;
- int left = sizeof(boot_snapshot_info) - boot_snapshot_index;
- int ret;
-
- if (str[0] == '=') {
- str++;
- if (strlen(str) >= left)
- return -1;
-
- ret = snprintf(slot, left, "%s\t", str);
- boot_snapshot_index += ret;
- } else {
- allocate_snapshot = true;
- /* We also need the main ring buffer expanded */
- trace_set_ring_buffer_expanded(NULL);
- }
- return 1;
-}
-__setup("alloc_snapshot", boot_alloc_snapshot);
-
-
-static int __init boot_snapshot(char *str)
-{
- snapshot_at_boot = true;
- boot_alloc_snapshot(str);
- return 1;
-}
-__setup("ftrace_boot_snapshot", boot_snapshot);
-
-
static int __init boot_instance(char *str)
{
char *slot = boot_instance_info + boot_instance_index;
@@ -294,7 +318,8 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
{
- strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+ trace_append_boot_param(trace_boot_options_buf, str, ',',
+ MAX_TRACER_SIZE);
return 1;
}
__setup("trace_options=", set_trace_boot_options);
@@ -329,6 +354,13 @@ static int __init set_tracepoint_printk_stop(char *str)
}
__setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop);
+static int __init set_traceoff_after_boot(char *str)
+{
+ traceoff_after_boot = true;
+ return 1;
+}
+__setup("traceoff_after_boot", set_traceoff_after_boot);
+
unsigned long long ns2usecs(u64 nsec)
{
nsec += 500;
@@ -386,15 +418,13 @@ static void ftrace_exports(struct ring_buffer_event *event, int flag)
{
struct trace_export *export;
- preempt_disable_notrace();
+ guard(preempt_notrace)();
export = rcu_dereference_raw_check(ftrace_exports_list);
while (export) {
trace_process_export(export, event, flag);
export = rcu_dereference_raw_check(export->next);
}
-
- preempt_enable_notrace();
}
static inline void
@@ -451,46 +481,40 @@ int register_ftrace_export(struct trace_export *export)
if (WARN_ON_ONCE(!export->write))
return -1;
- mutex_lock(&ftrace_export_lock);
+ guard(mutex)(&ftrace_export_lock);
add_ftrace_export(&ftrace_exports_list, export);
- mutex_unlock(&ftrace_export_lock);
-
return 0;
}
EXPORT_SYMBOL_GPL(register_ftrace_export);
int unregister_ftrace_export(struct trace_export *export)
{
- int ret;
-
- mutex_lock(&ftrace_export_lock);
-
- ret = rm_ftrace_export(&ftrace_exports_list, export);
-
- mutex_unlock(&ftrace_export_lock);
-
- return ret;
+ guard(mutex)(&ftrace_export_lock);
+ return rm_ftrace_export(&ftrace_exports_list, export);
}
EXPORT_SYMBOL_GPL(unregister_ftrace_export);
/* trace_flags holds trace_options default values */
#define TRACE_DEFAULT_FLAGS \
- (FUNCTION_DEFAULT_FLAGS | \
- TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \
- TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
- TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
- TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \
- TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK)
+ (FUNCTION_DEFAULT_FLAGS | FPROFILE_DEFAULT_FLAGS | \
+ TRACE_ITER(PRINT_PARENT) | TRACE_ITER(PRINTK) | \
+ TRACE_ITER(ANNOTATE) | TRACE_ITER(CONTEXT_INFO) | \
+ TRACE_ITER(RECORD_CMD) | TRACE_ITER(OVERWRITE) | \
+ TRACE_ITER(IRQ_INFO) | TRACE_ITER(MARKERS) | \
+ TRACE_ITER(HASH_PTR) | TRACE_ITER(TRACE_PRINTK) | \
+ TRACE_ITER(COPY_MARKER))
/* trace_options that are only supported by global_trace */
-#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
- TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD)
+#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER(PRINTK) | \
+ TRACE_ITER(PRINTK_MSGONLY) | TRACE_ITER(RECORD_CMD) | \
+ TRACE_ITER(PROF_TEXT_OFFSET) | FPROFILE_DEFAULT_FLAGS)
/* trace_flags that are default zero for instances */
#define ZEROED_TRACE_FLAGS \
- (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK)
+ (TRACE_ITER(EVENT_FORK) | TRACE_ITER(FUNC_FORK) | TRACE_ITER(TRACE_PRINTK) | \
+ TRACE_ITER(COPY_MARKER))
/*
* The global_trace is the descriptor that holds the top-level tracing
@@ -500,27 +524,41 @@ static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
};
-static struct trace_array *printk_trace = &global_trace;
+struct trace_array *printk_trace = &global_trace;
-static __always_inline bool printk_binsafe(struct trace_array *tr)
-{
- /*
- * The binary format of traceprintk can cause a crash if used
- * by a buffer from another boot. Force the use of the
- * non binary version of trace_printk if the trace_printk
- * buffer is a boot mapped ring buffer.
- */
- return !(tr->flags & TRACE_ARRAY_FL_BOOT);
-}
+/* List of trace_arrays interested in the top level trace_marker */
+static LIST_HEAD(marker_copies);
static void update_printk_trace(struct trace_array *tr)
{
if (printk_trace == tr)
return;
- printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK;
+ printk_trace->trace_flags &= ~TRACE_ITER(TRACE_PRINTK);
printk_trace = tr;
- tr->trace_flags |= TRACE_ITER_TRACE_PRINTK;
+ tr->trace_flags |= TRACE_ITER(TRACE_PRINTK);
+}
+
+/* Returns true if the status of tr changed */
+static bool update_marker_trace(struct trace_array *tr, int enabled)
+{
+ lockdep_assert_held(&event_mutex);
+
+ if (enabled) {
+ if (tr->trace_flags & TRACE_ITER(COPY_MARKER))
+ return false;
+
+ list_add_rcu(&tr->marker_list, &marker_copies);
+ tr->trace_flags |= TRACE_ITER(COPY_MARKER);
+ return true;
+ }
+
+ if (!(tr->trace_flags & TRACE_ITER(COPY_MARKER)))
+ return false;
+
+ list_del_rcu(&tr->marker_list);
+ tr->trace_flags &= ~TRACE_ITER(COPY_MARKER);
+ return true;
}
void trace_set_ring_buffer_expanded(struct trace_array *tr)
@@ -530,30 +568,83 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr)
tr->ring_buffer_expanded = true;
}
+static void trace_array_autoremove(struct work_struct *work)
+{
+ struct trace_array *tr = container_of(work, struct trace_array, autoremove_work);
+
+ trace_array_destroy(tr);
+}
+
+static struct workqueue_struct *autoremove_wq;
+
+static void trace_array_kick_autoremove(struct trace_array *tr)
+{
+ if (autoremove_wq)
+ queue_work(autoremove_wq, &tr->autoremove_work);
+}
+
+static void trace_array_cancel_autoremove(struct trace_array *tr)
+{
+ /*
+ * Since this can be called inside trace_array_autoremove(),
+ * it has to avoid deadlock of the workqueue.
+ */
+ if (work_pending(&tr->autoremove_work))
+ cancel_work_sync(&tr->autoremove_work);
+}
+
+static void trace_array_init_autoremove(struct trace_array *tr)
+{
+ INIT_WORK(&tr->autoremove_work, trace_array_autoremove);
+}
+
+static void trace_array_start_autoremove(void)
+{
+ if (autoremove_wq)
+ return;
+
+ autoremove_wq = alloc_workqueue("tr_autoremove_wq",
+ WQ_UNBOUND | WQ_HIGHPRI, 0);
+ if (!autoremove_wq)
+ pr_warn("Unable to allocate tr_autoremove_wq. autoremove disabled.\n");
+}
+
LIST_HEAD(ftrace_trace_arrays);
+static int __trace_array_get(struct trace_array *this_tr)
+{
+ /* When free_on_close is set, this is not available anymore. */
+ if (autoremove_wq && this_tr->free_on_close)
+ return -ENODEV;
+
+ this_tr->ref++;
+ return 0;
+}
+
int trace_array_get(struct trace_array *this_tr)
{
struct trace_array *tr;
- int ret = -ENODEV;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (tr == this_tr) {
- tr->ref++;
- ret = 0;
- break;
+ return __trace_array_get(tr);
}
}
- mutex_unlock(&trace_types_lock);
- return ret;
+ return -ENODEV;
}
static void __trace_array_put(struct trace_array *this_tr)
{
WARN_ON(!this_tr->ref);
this_tr->ref--;
+ /*
+ * When free_on_close is set, prepare removing the array
+ * when the last reference is released.
+ */
+ if (this_tr->ref == 1 && this_tr->free_on_close)
+ trace_array_kick_autoremove(this_tr);
}
/**
@@ -570,9 +661,8 @@ void trace_array_put(struct trace_array *this_tr)
if (!this_tr)
return;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
__trace_array_put(this_tr);
- mutex_unlock(&trace_types_lock);
}
EXPORT_SYMBOL_GPL(trace_array_put);
@@ -593,244 +683,6 @@ int tracing_check_open_get_tr(struct trace_array *tr)
return 0;
}
-/**
- * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
- * @filtered_pids: The list of pids to check
- * @search_pid: The PID to find in @filtered_pids
- *
- * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
- */
-bool
-trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
-{
- return trace_pid_list_is_set(filtered_pids, search_pid);
-}
-
-/**
- * trace_ignore_this_task - should a task be ignored for tracing
- * @filtered_pids: The list of pids to check
- * @filtered_no_pids: The list of pids not to be traced
- * @task: The task that should be ignored if not filtered
- *
- * Checks if @task should be traced or not from @filtered_pids.
- * Returns true if @task should *NOT* be traced.
- * Returns false if @task should be traced.
- */
-bool
-trace_ignore_this_task(struct trace_pid_list *filtered_pids,
- struct trace_pid_list *filtered_no_pids,
- struct task_struct *task)
-{
- /*
- * If filtered_no_pids is not empty, and the task's pid is listed
- * in filtered_no_pids, then return true.
- * Otherwise, if filtered_pids is empty, that means we can
- * trace all tasks. If it has content, then only trace pids
- * within filtered_pids.
- */
-
- return (filtered_pids &&
- !trace_find_filtered_pid(filtered_pids, task->pid)) ||
- (filtered_no_pids &&
- trace_find_filtered_pid(filtered_no_pids, task->pid));
-}
-
-/**
- * trace_filter_add_remove_task - Add or remove a task from a pid_list
- * @pid_list: The list to modify
- * @self: The current task for fork or NULL for exit
- * @task: The task to add or remove
- *
- * If adding a task, if @self is defined, the task is only added if @self
- * is also included in @pid_list. This happens on fork and tasks should
- * only be added when the parent is listed. If @self is NULL, then the
- * @task pid will be removed from the list, which would happen on exit
- * of a task.
- */
-void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
- struct task_struct *self,
- struct task_struct *task)
-{
- if (!pid_list)
- return;
-
- /* For forks, we only add if the forking task is listed */
- if (self) {
- if (!trace_find_filtered_pid(pid_list, self->pid))
- return;
- }
-
- /* "self" is set for forks, and NULL for exits */
- if (self)
- trace_pid_list_set(pid_list, task->pid);
- else
- trace_pid_list_clear(pid_list, task->pid);
-}
-
-/**
- * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
- * @pid_list: The pid list to show
- * @v: The last pid that was shown (+1 the actual pid to let zero be displayed)
- * @pos: The position of the file
- *
- * This is used by the seq_file "next" operation to iterate the pids
- * listed in a trace_pid_list structure.
- *
- * Returns the pid+1 as we want to display pid of zero, but NULL would
- * stop the iteration.
- */
-void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
-{
- long pid = (unsigned long)v;
- unsigned int next;
-
- (*pos)++;
-
- /* pid already is +1 of the actual previous bit */
- if (trace_pid_list_next(pid_list, pid, &next) < 0)
- return NULL;
-
- pid = next;
-
- /* Return pid + 1 to allow zero to be represented */
- return (void *)(pid + 1);
-}
-
-/**
- * trace_pid_start - Used for seq_file to start reading pid lists
- * @pid_list: The pid list to show
- * @pos: The position of the file
- *
- * This is used by seq_file "start" operation to start the iteration
- * of listing pids.
- *
- * Returns the pid+1 as we want to display pid of zero, but NULL would
- * stop the iteration.
- */
-void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
-{
- unsigned long pid;
- unsigned int first;
- loff_t l = 0;
-
- if (trace_pid_list_first(pid_list, &first) < 0)
- return NULL;
-
- pid = first;
-
- /* Return pid + 1 so that zero can be the exit value */
- for (pid++; pid && l < *pos;
- pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
- ;
- return (void *)pid;
-}
-
-/**
- * trace_pid_show - show the current pid in seq_file processing
- * @m: The seq_file structure to write into
- * @v: A void pointer of the pid (+1) value to display
- *
- * Can be directly used by seq_file operations to display the current
- * pid value.
- */
-int trace_pid_show(struct seq_file *m, void *v)
-{
- unsigned long pid = (unsigned long)v - 1;
-
- seq_printf(m, "%lu\n", pid);
- return 0;
-}
-
-/* 128 should be much more than enough */
-#define PID_BUF_SIZE 127
-
-int trace_pid_write(struct trace_pid_list *filtered_pids,
- struct trace_pid_list **new_pid_list,
- const char __user *ubuf, size_t cnt)
-{
- struct trace_pid_list *pid_list;
- struct trace_parser parser;
- unsigned long val;
- int nr_pids = 0;
- ssize_t read = 0;
- ssize_t ret;
- loff_t pos;
- pid_t pid;
-
- if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
- return -ENOMEM;
-
- /*
- * Always recreate a new array. The write is an all or nothing
- * operation. Always create a new array when adding new pids by
- * the user. If the operation fails, then the current list is
- * not modified.
- */
- pid_list = trace_pid_list_alloc();
- if (!pid_list) {
- trace_parser_put(&parser);
- return -ENOMEM;
- }
-
- if (filtered_pids) {
- /* copy the current bits to the new max */
- ret = trace_pid_list_first(filtered_pids, &pid);
- while (!ret) {
- trace_pid_list_set(pid_list, pid);
- ret = trace_pid_list_next(filtered_pids, pid + 1, &pid);
- nr_pids++;
- }
- }
-
- ret = 0;
- while (cnt > 0) {
-
- pos = 0;
-
- ret = trace_get_user(&parser, ubuf, cnt, &pos);
- if (ret < 0)
- break;
-
- read += ret;
- ubuf += ret;
- cnt -= ret;
-
- if (!trace_parser_loaded(&parser))
- break;
-
- ret = -EINVAL;
- if (kstrtoul(parser.buffer, 0, &val))
- break;
-
- pid = (pid_t)val;
-
- if (trace_pid_list_set(pid_list, pid) < 0) {
- ret = -1;
- break;
- }
- nr_pids++;
-
- trace_parser_clear(&parser);
- ret = 0;
- }
- trace_parser_put(&parser);
-
- if (ret < 0) {
- trace_pid_list_free(pid_list);
- return ret;
- }
-
- if (!nr_pids) {
- /* Cleared the list of pids */
- trace_pid_list_free(pid_list);
- pid_list = NULL;
- }
-
- *new_pid_list = pid_list;
-
- return read;
-}
-
static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu)
{
u64 ts;
@@ -866,7 +718,6 @@ int tracing_is_enabled(void)
* return the mirror variable of the state of the ring buffer.
* It's a little racy, but we don't really care.
*/
- smp_rmb();
return !global_trace.buffer_disabled;
}
@@ -974,56 +825,6 @@ static inline void trace_access_lock_init(void)
#endif
-#ifdef CONFIG_STACKTRACE
-static void __ftrace_trace_stack(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned int trace_ctx,
- int skip, struct pt_regs *regs);
-static inline void ftrace_trace_stack(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned int trace_ctx,
- int skip, struct pt_regs *regs);
-
-#else
-static inline void __ftrace_trace_stack(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned int trace_ctx,
- int skip, struct pt_regs *regs)
-{
-}
-static inline void ftrace_trace_stack(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned long trace_ctx,
- int skip, struct pt_regs *regs)
-{
-}
-
-#endif
-
-static __always_inline void
-trace_event_setup(struct ring_buffer_event *event,
- int type, unsigned int trace_ctx)
-{
- struct trace_entry *ent = ring_buffer_event_data(event);
-
- tracing_generic_entry_update(ent, type, trace_ctx);
-}
-
-static __always_inline struct ring_buffer_event *
-__trace_buffer_lock_reserve(struct trace_buffer *buffer,
- int type,
- unsigned long len,
- unsigned int trace_ctx)
-{
- struct ring_buffer_event *event;
-
- event = ring_buffer_lock_reserve(buffer, len);
- if (event != NULL)
- trace_event_setup(event, type, trace_ctx);
-
- return event;
-}
-
void tracer_tracing_on(struct trace_array *tr)
{
if (tr->array_buffer.buffer)
@@ -1037,8 +838,6 @@ void tracer_tracing_on(struct trace_array *tr)
* important to be fast than accurate.
*/
tr->buffer_disabled = 0;
- /* Make the flag seen by readers */
- smp_wmb();
}
/**
@@ -1053,176 +852,7 @@ void tracing_on(void)
}
EXPORT_SYMBOL_GPL(tracing_on);
-
-static __always_inline void
-__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event)
-{
- __this_cpu_write(trace_taskinfo_save, true);
-
- /* If this is the temp buffer, we need to commit fully */
- if (this_cpu_read(trace_buffered_event) == event) {
- /* Length is in event->array[0] */
- ring_buffer_write(buffer, event->array[0], &event->array[1]);
- /* Release the temp buffer */
- this_cpu_dec(trace_buffered_event_cnt);
- /* ring_buffer_unlock_commit() enables preemption */
- preempt_enable_notrace();
- } else
- ring_buffer_unlock_commit(buffer);
-}
-
-int __trace_array_puts(struct trace_array *tr, unsigned long ip,
- const char *str, int size)
-{
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- struct print_entry *entry;
- unsigned int trace_ctx;
- int alloc;
-
- if (!(tr->trace_flags & TRACE_ITER_PRINTK))
- return 0;
-
- if (unlikely(tracing_selftest_running && tr == &global_trace))
- return 0;
-
- if (unlikely(tracing_disabled))
- return 0;
-
- alloc = sizeof(*entry) + size + 2; /* possible \n added */
-
- trace_ctx = tracing_gen_ctx();
- buffer = tr->array_buffer.buffer;
- ring_buffer_nest_start(buffer);
- event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
- trace_ctx);
- if (!event) {
- size = 0;
- goto out;
- }
-
- entry = ring_buffer_event_data(event);
- entry->ip = ip;
-
- memcpy(&entry->buf, str, size);
-
- /* Add a newline if necessary */
- if (entry->buf[size - 1] != '\n') {
- entry->buf[size] = '\n';
- entry->buf[size + 1] = '\0';
- } else
- entry->buf[size] = '\0';
-
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
- out:
- ring_buffer_nest_end(buffer);
- return size;
-}
-EXPORT_SYMBOL_GPL(__trace_array_puts);
-
-/**
- * __trace_puts - write a constant string into the trace buffer.
- * @ip: The address of the caller
- * @str: The constant string to write
- * @size: The size of the string.
- */
-int __trace_puts(unsigned long ip, const char *str, int size)
-{
- return __trace_array_puts(printk_trace, ip, str, size);
-}
-EXPORT_SYMBOL_GPL(__trace_puts);
-
-/**
- * __trace_bputs - write the pointer to a constant string into trace buffer
- * @ip: The address of the caller
- * @str: The constant string to write to the buffer to
- */
-int __trace_bputs(unsigned long ip, const char *str)
-{
- struct trace_array *tr = READ_ONCE(printk_trace);
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- struct bputs_entry *entry;
- unsigned int trace_ctx;
- int size = sizeof(struct bputs_entry);
- int ret = 0;
-
- if (!printk_binsafe(tr))
- return __trace_puts(ip, str, strlen(str));
-
- if (!(tr->trace_flags & TRACE_ITER_PRINTK))
- return 0;
-
- if (unlikely(tracing_selftest_running || tracing_disabled))
- return 0;
-
- trace_ctx = tracing_gen_ctx();
- buffer = tr->array_buffer.buffer;
-
- ring_buffer_nest_start(buffer);
- event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
- trace_ctx);
- if (!event)
- goto out;
-
- entry = ring_buffer_event_data(event);
- entry->ip = ip;
- entry->str = str;
-
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
-
- ret = 1;
- out:
- ring_buffer_nest_end(buffer);
- return ret;
-}
-EXPORT_SYMBOL_GPL(__trace_bputs);
-
#ifdef CONFIG_TRACER_SNAPSHOT
-static void tracing_snapshot_instance_cond(struct trace_array *tr,
- void *cond_data)
-{
- struct tracer *tracer = tr->current_trace;
- unsigned long flags;
-
- if (in_nmi()) {
- trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
- trace_array_puts(tr, "*** snapshot is being ignored ***\n");
- return;
- }
-
- if (!tr->allocated_snapshot) {
- trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
- trace_array_puts(tr, "*** stopping trace here! ***\n");
- tracer_tracing_off(tr);
- return;
- }
-
- /* Note, snapshot can not be used when the tracer uses it */
- if (tracer->use_max_tr) {
- trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
- trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
- return;
- }
-
- if (tr->mapped) {
- trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
- trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
- return;
- }
-
- local_irq_save(flags);
- update_max_tr(tr, current, smp_processor_id(), cond_data);
- local_irq_restore(flags);
-}
-
-void tracing_snapshot_instance(struct trace_array *tr)
-{
- tracing_snapshot_instance_cond(tr, NULL);
-}
-
/**
* tracing_snapshot - take a snapshot of the current buffer.
*
@@ -1246,143 +876,6 @@ void tracing_snapshot(void)
EXPORT_SYMBOL_GPL(tracing_snapshot);
/**
- * tracing_snapshot_cond - conditionally take a snapshot of the current buffer.
- * @tr: The tracing instance to snapshot
- * @cond_data: The data to be tested conditionally, and possibly saved
- *
- * This is the same as tracing_snapshot() except that the snapshot is
- * conditional - the snapshot will only happen if the
- * cond_snapshot.update() implementation receiving the cond_data
- * returns true, which means that the trace array's cond_snapshot
- * update() operation used the cond_data to determine whether the
- * snapshot should be taken, and if it was, presumably saved it along
- * with the snapshot.
- */
-void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
-{
- tracing_snapshot_instance_cond(tr, cond_data);
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
-
-/**
- * tracing_cond_snapshot_data - get the user data associated with a snapshot
- * @tr: The tracing instance
- *
- * When the user enables a conditional snapshot using
- * tracing_snapshot_cond_enable(), the user-defined cond_data is saved
- * with the snapshot. This accessor is used to retrieve it.
- *
- * Should not be called from cond_snapshot.update(), since it takes
- * the tr->max_lock lock, which the code calling
- * cond_snapshot.update() has already done.
- *
- * Returns the cond_data associated with the trace array's snapshot.
- */
-void *tracing_cond_snapshot_data(struct trace_array *tr)
-{
- void *cond_data = NULL;
-
- local_irq_disable();
- arch_spin_lock(&tr->max_lock);
-
- if (tr->cond_snapshot)
- cond_data = tr->cond_snapshot->cond_data;
-
- arch_spin_unlock(&tr->max_lock);
- local_irq_enable();
-
- return cond_data;
-}
-EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
-
-static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
- struct array_buffer *size_buf, int cpu_id);
-static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
-
-int tracing_alloc_snapshot_instance(struct trace_array *tr)
-{
- int order;
- int ret;
-
- if (!tr->allocated_snapshot) {
-
- /* Make the snapshot buffer have the same order as main buffer */
- order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
- ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
- if (ret < 0)
- return ret;
-
- /* allocate spare buffer */
- ret = resize_buffer_duplicate_size(&tr->max_buffer,
- &tr->array_buffer, RING_BUFFER_ALL_CPUS);
- if (ret < 0)
- return ret;
-
- tr->allocated_snapshot = true;
- }
-
- return 0;
-}
-
-static void free_snapshot(struct trace_array *tr)
-{
- /*
- * We don't free the ring buffer. instead, resize it because
- * The max_tr ring buffer has some state (e.g. ring->clock) and
- * we want preserve it.
- */
- ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0);
- ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
- set_buffer_entries(&tr->max_buffer, 1);
- tracing_reset_online_cpus(&tr->max_buffer);
- tr->allocated_snapshot = false;
-}
-
-static int tracing_arm_snapshot_locked(struct trace_array *tr)
-{
- int ret;
-
- lockdep_assert_held(&trace_types_lock);
-
- spin_lock(&tr->snapshot_trigger_lock);
- if (tr->snapshot == UINT_MAX || tr->mapped) {
- spin_unlock(&tr->snapshot_trigger_lock);
- return -EBUSY;
- }
-
- tr->snapshot++;
- spin_unlock(&tr->snapshot_trigger_lock);
-
- ret = tracing_alloc_snapshot_instance(tr);
- if (ret) {
- spin_lock(&tr->snapshot_trigger_lock);
- tr->snapshot--;
- spin_unlock(&tr->snapshot_trigger_lock);
- }
-
- return ret;
-}
-
-int tracing_arm_snapshot(struct trace_array *tr)
-{
- int ret;
-
- mutex_lock(&trace_types_lock);
- ret = tracing_arm_snapshot_locked(tr);
- mutex_unlock(&trace_types_lock);
-
- return ret;
-}
-
-void tracing_disarm_snapshot(struct trace_array *tr)
-{
- spin_lock(&tr->snapshot_trigger_lock);
- if (!WARN_ON(!tr->snapshot))
- tr->snapshot--;
- spin_unlock(&tr->snapshot_trigger_lock);
-}
-
-/**
* tracing_alloc_snapshot - allocate snapshot buffer.
*
* This only allocates the snapshot buffer if it isn't already
@@ -1402,170 +895,18 @@ int tracing_alloc_snapshot(void)
return ret;
}
-EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
-
-/**
- * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
- *
- * This is similar to tracing_snapshot(), but it will allocate the
- * snapshot buffer if it isn't already allocated. Use this only
- * where it is safe to sleep, as the allocation may sleep.
- *
- * This causes a swap between the snapshot buffer and the current live
- * tracing buffer. You can use this to take snapshots of the live
- * trace when some condition is triggered, but continue to trace.
- */
-void tracing_snapshot_alloc(void)
-{
- int ret;
-
- ret = tracing_alloc_snapshot();
- if (ret < 0)
- return;
-
- tracing_snapshot();
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
-
-/**
- * tracing_snapshot_cond_enable - enable conditional snapshot for an instance
- * @tr: The tracing instance
- * @cond_data: User data to associate with the snapshot
- * @update: Implementation of the cond_snapshot update function
- *
- * Check whether the conditional snapshot for the given instance has
- * already been enabled, or if the current tracer is already using a
- * snapshot; if so, return -EBUSY, else create a cond_snapshot and
- * save the cond_data and update function inside.
- *
- * Returns 0 if successful, error otherwise.
- */
-int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
- cond_update_fn_t update)
-{
- struct cond_snapshot *cond_snapshot;
- int ret = 0;
-
- cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL);
- if (!cond_snapshot)
- return -ENOMEM;
-
- cond_snapshot->cond_data = cond_data;
- cond_snapshot->update = update;
-
- mutex_lock(&trace_types_lock);
-
- if (tr->current_trace->use_max_tr) {
- ret = -EBUSY;
- goto fail_unlock;
- }
-
- /*
- * The cond_snapshot can only change to NULL without the
- * trace_types_lock. We don't care if we race with it going
- * to NULL, but we want to make sure that it's not set to
- * something other than NULL when we get here, which we can
- * do safely with only holding the trace_types_lock and not
- * having to take the max_lock.
- */
- if (tr->cond_snapshot) {
- ret = -EBUSY;
- goto fail_unlock;
- }
-
- ret = tracing_arm_snapshot_locked(tr);
- if (ret)
- goto fail_unlock;
-
- local_irq_disable();
- arch_spin_lock(&tr->max_lock);
- tr->cond_snapshot = cond_snapshot;
- arch_spin_unlock(&tr->max_lock);
- local_irq_enable();
-
- mutex_unlock(&trace_types_lock);
-
- return ret;
-
- fail_unlock:
- mutex_unlock(&trace_types_lock);
- kfree(cond_snapshot);
- return ret;
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
-
-/**
- * tracing_snapshot_cond_disable - disable conditional snapshot for an instance
- * @tr: The tracing instance
- *
- * Check whether the conditional snapshot for the given instance is
- * enabled; if so, free the cond_snapshot associated with it,
- * otherwise return -EINVAL.
- *
- * Returns 0 if successful, error otherwise.
- */
-int tracing_snapshot_cond_disable(struct trace_array *tr)
-{
- int ret = 0;
-
- local_irq_disable();
- arch_spin_lock(&tr->max_lock);
-
- if (!tr->cond_snapshot)
- ret = -EINVAL;
- else {
- kfree(tr->cond_snapshot);
- tr->cond_snapshot = NULL;
- }
-
- arch_spin_unlock(&tr->max_lock);
- local_irq_enable();
-
- tracing_disarm_snapshot(tr);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#else
void tracing_snapshot(void)
{
WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
}
EXPORT_SYMBOL_GPL(tracing_snapshot);
-void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
-{
- WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used");
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
-int tracing_alloc_snapshot(void)
-{
- WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used");
- return -ENODEV;
-}
-EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
void tracing_snapshot_alloc(void)
{
/* Give warning */
tracing_snapshot();
}
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
-void *tracing_cond_snapshot_data(struct trace_array *tr)
-{
- return NULL;
-}
-EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
-int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update)
-{
- return -ENODEV;
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
-int tracing_snapshot_cond_disable(struct trace_array *tr)
-{
- return false;
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
-#define free_snapshot(tr) do { } while (0)
-#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; })
#endif /* CONFIG_TRACER_SNAPSHOT */
void tracer_tracing_off(struct trace_array *tr)
@@ -1581,8 +922,39 @@ void tracer_tracing_off(struct trace_array *tr)
* important to be fast than accurate.
*/
tr->buffer_disabled = 1;
- /* Make the flag seen by readers */
- smp_wmb();
+}
+
+/**
+ * tracer_tracing_disable() - temporary disable the buffer from write
+ * @tr: The trace array to disable its buffer for
+ *
+ * Expects trace_tracing_enable() to re-enable tracing.
+ * The difference between this and tracer_tracing_off() is that this
+ * is a counter and can nest, whereas, tracer_tracing_off() can
+ * be called multiple times and a single trace_tracing_on() will
+ * enable it.
+ */
+void tracer_tracing_disable(struct trace_array *tr)
+{
+ if (WARN_ON_ONCE(!tr->array_buffer.buffer))
+ return;
+
+ ring_buffer_record_disable(tr->array_buffer.buffer);
+}
+
+/**
+ * tracer_tracing_enable() - counter part of tracer_tracing_disable()
+ * @tr: The trace array that had tracer_tracincg_disable() called on it
+ *
+ * This is called after tracer_tracing_disable() has been called on @tr,
+ * when it's safe to re-enable tracing.
+ */
+void tracer_tracing_enable(struct trace_array *tr)
+{
+ if (WARN_ON_ONCE(!tr->array_buffer.buffer))
+ return;
+
+ ring_buffer_record_enable(tr->array_buffer.buffer);
}
/**
@@ -1602,9 +974,18 @@ EXPORT_SYMBOL_GPL(tracing_off);
void disable_trace_on_warning(void)
{
if (__disable_trace_on_warning) {
+ struct trace_array *tr = READ_ONCE(printk_trace);
+
trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_,
"Disabling tracing due to warning\n");
tracing_off();
+
+ /* Disable trace_printk() buffer too */
+ if (tr != &global_trace) {
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
+ "Disabling tracing due to warning\n");
+ tracer_tracing_off(tr);
+ }
}
}
@@ -1754,7 +1135,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
ret = get_user(ch, ubuf++);
if (ret)
- goto out;
+ goto fail;
read++;
cnt--;
@@ -1768,7 +1149,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
while (cnt && isspace(ch)) {
ret = get_user(ch, ubuf++);
if (ret)
- goto out;
+ goto fail;
read++;
cnt--;
}
@@ -1778,8 +1159,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
/* only spaces were written */
if (isspace(ch) || !ch) {
*ppos += read;
- ret = read;
- goto out;
+ return read;
}
}
@@ -1789,11 +1169,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
parser->buffer[parser->idx++] = ch;
else {
ret = -EINVAL;
- goto out;
+ goto fail;
}
+
ret = get_user(ch, ubuf++);
if (ret)
- goto out;
+ goto fail;
read++;
cnt--;
}
@@ -1809,13 +1190,13 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
parser->buffer[parser->idx] = 0;
} else {
ret = -EINVAL;
- goto out;
+ goto fail;
}
*ppos += read;
- ret = read;
-
-out:
+ return read;
+fail:
+ trace_parser_fail(parser);
return ret;
}
@@ -1838,207 +1219,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
unsigned long __read_mostly tracing_thresh;
-#ifdef CONFIG_TRACER_MAX_TRACE
-static const struct file_operations tracing_max_lat_fops;
-
-#ifdef LATENCY_FS_NOTIFY
-
-static struct workqueue_struct *fsnotify_wq;
-
-static void latency_fsnotify_workfn(struct work_struct *work)
-{
- struct trace_array *tr = container_of(work, struct trace_array,
- fsnotify_work);
- fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
-}
-
-static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
-{
- struct trace_array *tr = container_of(iwork, struct trace_array,
- fsnotify_irqwork);
- queue_work(fsnotify_wq, &tr->fsnotify_work);
-}
-
-static void trace_create_maxlat_file(struct trace_array *tr,
- struct dentry *d_tracer)
-{
- INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
- init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
- tr->d_max_latency = trace_create_file("tracing_max_latency",
- TRACE_MODE_WRITE,
- d_tracer, tr,
- &tracing_max_lat_fops);
-}
-
-__init static int latency_fsnotify_init(void)
-{
- fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
- WQ_UNBOUND | WQ_HIGHPRI, 0);
- if (!fsnotify_wq) {
- pr_err("Unable to allocate tr_max_lat_wq\n");
- return -ENOMEM;
- }
- return 0;
-}
-
-late_initcall_sync(latency_fsnotify_init);
-
-void latency_fsnotify(struct trace_array *tr)
-{
- if (!fsnotify_wq)
- return;
- /*
- * We cannot call queue_work(&tr->fsnotify_work) from here because it's
- * possible that we are called from __schedule() or do_idle(), which
- * could cause a deadlock.
- */
- irq_work_queue(&tr->fsnotify_irqwork);
-}
-
-#else /* !LATENCY_FS_NOTIFY */
-
-#define trace_create_maxlat_file(tr, d_tracer) \
- trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \
- d_tracer, tr, &tracing_max_lat_fops)
-
-#endif
-
-/*
- * Copy the new maximum trace into the separate maximum-trace
- * structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
- */
-static void
-__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
-{
- struct array_buffer *trace_buf = &tr->array_buffer;
- struct array_buffer *max_buf = &tr->max_buffer;
- struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
- struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
-
- max_buf->cpu = cpu;
- max_buf->time_start = data->preempt_timestamp;
-
- max_data->saved_latency = tr->max_latency;
- max_data->critical_start = data->critical_start;
- max_data->critical_end = data->critical_end;
-
- strscpy(max_data->comm, tsk->comm);
- max_data->pid = tsk->pid;
- /*
- * If tsk == current, then use current_uid(), as that does not use
- * RCU. The irq tracer can be called out of RCU scope.
- */
- if (tsk == current)
- max_data->uid = current_uid();
- else
- max_data->uid = task_uid(tsk);
-
- max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
- max_data->policy = tsk->policy;
- max_data->rt_priority = tsk->rt_priority;
-
- /* record this tasks comm */
- tracing_record_cmdline(tsk);
- latency_fsnotify(tr);
-}
-
-/**
- * update_max_tr - snapshot all trace buffers from global_trace to max_tr
- * @tr: tracer
- * @tsk: the task with the latency
- * @cpu: The cpu that initiated the trace.
- * @cond_data: User data associated with a conditional snapshot
- *
- * Flip the buffers between the @tr and the max_tr and record information
- * about which task was the cause of this latency.
- */
-void
-update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
- void *cond_data)
-{
- if (tr->stop_count)
- return;
-
- WARN_ON_ONCE(!irqs_disabled());
-
- if (!tr->allocated_snapshot) {
- /* Only the nop tracer should hit this when disabling */
- WARN_ON_ONCE(tr->current_trace != &nop_trace);
- return;
- }
-
- arch_spin_lock(&tr->max_lock);
-
- /* Inherit the recordable setting from array_buffer */
- if (ring_buffer_record_is_set_on(tr->array_buffer.buffer))
- ring_buffer_record_on(tr->max_buffer.buffer);
- else
- ring_buffer_record_off(tr->max_buffer.buffer);
-
-#ifdef CONFIG_TRACER_SNAPSHOT
- if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) {
- arch_spin_unlock(&tr->max_lock);
- return;
- }
-#endif
- swap(tr->array_buffer.buffer, tr->max_buffer.buffer);
-
- __update_max_tr(tr, tsk, cpu);
-
- arch_spin_unlock(&tr->max_lock);
-
- /* Any waiters on the old snapshot buffer need to wake up */
- ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
-}
-
-/**
- * update_max_tr_single - only copy one trace over, and reset the rest
- * @tr: tracer
- * @tsk: task with the latency
- * @cpu: the cpu of the buffer to copy.
- *
- * Flip the trace of a single CPU buffer between the @tr and the max_tr.
- */
-void
-update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
-{
- int ret;
-
- if (tr->stop_count)
- return;
-
- WARN_ON_ONCE(!irqs_disabled());
- if (!tr->allocated_snapshot) {
- /* Only the nop tracer should hit this when disabling */
- WARN_ON_ONCE(tr->current_trace != &nop_trace);
- return;
- }
-
- arch_spin_lock(&tr->max_lock);
-
- ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu);
-
- if (ret == -EBUSY) {
- /*
- * We failed to swap the buffer due to a commit taking
- * place on this CPU. We fail to record, but we reset
- * the max trace buffer (no one writes directly to it)
- * and flag that it failed.
- * Another reason is resize is in progress.
- */
- trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
- "Failed to swap buffers due to commit or resize in progress\n");
- }
-
- WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
-
- __update_max_tr(tr, tsk, cpu);
- arch_spin_unlock(&tr->max_lock);
-}
-
-#endif /* CONFIG_TRACER_MAX_TRACE */
-
struct pipe_wait {
struct trace_iterator *iter;
int wait_index;
@@ -2070,13 +1250,13 @@ static int wait_on_pipe(struct trace_iterator *iter, int full)
ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full,
wait_pipe_cond, &pwait);
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
/*
* Make sure this is still the snapshot buffer, as if a snapshot were
* to happen, this would now be the main buffer.
*/
if (iter->snapshot)
- iter->array_buffer = &iter->tr->max_buffer;
+ iter->array_buffer = &iter->tr->snapshot_buffer;
#endif
return ret;
}
@@ -2107,6 +1287,7 @@ static int save_selftest(struct tracer *type)
static int run_tracer_selftest(struct tracer *type)
{
struct trace_array *tr = &global_trace;
+ struct tracer_flags *saved_flags = tr->current_trace_flags;
struct tracer *saved_tracer = tr->current_trace;
int ret;
@@ -2137,12 +1318,13 @@ static int run_tracer_selftest(struct tracer *type)
tracing_reset_online_cpus(&tr->array_buffer);
tr->current_trace = type;
+ tr->current_trace_flags = type->flags ? : type->default_flags;
#ifdef CONFIG_TRACER_MAX_TRACE
- if (type->use_max_tr) {
+ if (tracer_uses_snapshot(type)) {
/* If we expanded the buffers, make sure the max is expanded too */
if (tr->ring_buffer_expanded)
- ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
+ ring_buffer_resize(tr->snapshot_buffer.buffer, trace_buf_size,
RING_BUFFER_ALL_CPUS);
tr->allocated_snapshot = true;
}
@@ -2153,6 +1335,7 @@ static int run_tracer_selftest(struct tracer *type)
ret = type->selftest(type, tr);
/* the test is responsible for resetting too */
tr->current_trace = saved_tracer;
+ tr->current_trace_flags = saved_flags;
if (ret) {
printk(KERN_CONT "FAILED!\n");
/* Add the warning after printing 'FAILED' */
@@ -2163,12 +1346,12 @@ static int run_tracer_selftest(struct tracer *type)
tracing_reset_online_cpus(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- if (type->use_max_tr) {
+ if (tracer_uses_snapshot(type)) {
tr->allocated_snapshot = false;
/* Shrink the max buffer again */
if (tr->ring_buffer_expanded)
- ring_buffer_resize(tr->max_buffer.buffer, 1,
+ ring_buffer_resize(tr->snapshot_buffer.buffer, 1,
RING_BUFFER_ALL_CPUS);
}
#endif
@@ -2203,10 +1386,10 @@ static __init int init_trace_selftests(void)
selftests_can_run = true;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (list_empty(&postponed_selftests))
- goto out;
+ return 0;
pr_info("Running postponed tracer tests:\n");
@@ -2235,9 +1418,6 @@ static __init int init_trace_selftests(void)
}
tracing_selftest_running = false;
- out:
- mutex_unlock(&trace_types_lock);
-
return 0;
}
core_initcall(init_trace_selftests);
@@ -2248,10 +1428,23 @@ static inline int do_run_tracer_selftest(struct tracer *type)
}
#endif /* CONFIG_FTRACE_STARTUP_TEST */
-static void add_tracer_options(struct trace_array *tr, struct tracer *t);
+static int add_tracer(struct trace_array *tr, struct tracer *t);
static void __init apply_trace_boot_options(void);
+static void free_tracers(struct trace_array *tr)
+{
+ struct tracers *t, *n;
+
+ lockdep_assert_held(&trace_types_lock);
+
+ list_for_each_entry_safe(t, n, &tr->tracers, list) {
+ list_del(&t->list);
+ kfree(t->flags);
+ kfree(t);
+ }
+}
+
/**
* register_tracer - register a tracer with the ftrace system.
* @type: the plugin for the tracer
@@ -2260,6 +1453,7 @@ static void __init apply_trace_boot_options(void);
*/
int __init register_tracer(struct tracer *type)
{
+ struct trace_array *tr;
struct tracer *t;
int ret = 0;
@@ -2291,44 +1485,38 @@ int __init register_tracer(struct tracer *type)
}
}
- if (!type->set_flag)
- type->set_flag = &dummy_set_flag;
- if (!type->flags) {
- /*allocate a dummy tracer_flags*/
- type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL);
- if (!type->flags) {
- ret = -ENOMEM;
- goto out;
- }
- type->flags->val = 0;
- type->flags->opts = dummy_tracer_opt;
- } else
- if (!type->flags->opts)
- type->flags->opts = dummy_tracer_opt;
-
/* store the tracer for __set_tracer_option */
- type->flags->trace = type;
+ if (type->flags)
+ type->flags->trace = type;
ret = do_run_tracer_selftest(type);
if (ret < 0)
goto out;
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ ret = add_tracer(tr, type);
+ if (ret < 0) {
+ /* The tracer will still exist but without options */
+ pr_warn("Failed to create tracer options for %s\n", type->name);
+ break;
+ }
+ }
+
type->next = trace_types;
trace_types = type;
- add_tracer_options(&global_trace, type);
out:
mutex_unlock(&trace_types_lock);
if (ret || !default_bootup_tracer)
- goto out_unlock;
+ return ret;
if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
- goto out_unlock;
+ return 0;
printk(KERN_INFO "Starting tracer '%s'\n", type->name);
/* Do we want this tracer to start on bootup? */
- tracing_set_tracer(&global_trace, type->name);
+ WARN_ON(tracing_set_tracer(&global_trace, type->name) < 0);
default_bootup_tracer = NULL;
apply_trace_boot_options();
@@ -2336,11 +1524,10 @@ int __init register_tracer(struct tracer *type)
/* disable other selftests, since this will break it. */
disable_tracing_selftest("running a tracer");
- out_unlock:
- return ret;
+ return 0;
}
-static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
+void tracing_reset_cpu(struct array_buffer *buf, int cpu)
{
struct trace_buffer *buffer = buf->buffer;
@@ -2406,17 +1593,16 @@ void tracing_reset_all_online_cpus_unlocked(void)
continue;
tr->clear_trace = false;
tracing_reset_online_cpus(&tr->array_buffer);
-#ifdef CONFIG_TRACER_MAX_TRACE
- tracing_reset_online_cpus(&tr->max_buffer);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ tracing_reset_online_cpus(&tr->snapshot_buffer);
#endif
}
}
void tracing_reset_all_online_cpus(void)
{
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
tracing_reset_all_online_cpus_unlocked();
- mutex_unlock(&trace_types_lock);
}
int is_tracing_stopped(void)
@@ -2427,18 +1613,17 @@ int is_tracing_stopped(void)
static void tracing_start_tr(struct trace_array *tr)
{
struct trace_buffer *buffer;
- unsigned long flags;
if (tracing_disabled)
return;
- raw_spin_lock_irqsave(&tr->start_lock, flags);
+ guard(raw_spinlock_irqsave)(&tr->start_lock);
if (--tr->stop_count) {
if (WARN_ON_ONCE(tr->stop_count < 0)) {
/* Someone screwed up their debugging */
tr->stop_count = 0;
}
- goto out;
+ return;
}
/* Prevent the buffers from switching */
@@ -2448,16 +1633,13 @@ static void tracing_start_tr(struct trace_array *tr)
if (buffer)
ring_buffer_record_enable(buffer);
-#ifdef CONFIG_TRACER_MAX_TRACE
- buffer = tr->max_buffer.buffer;
+#ifdef CONFIG_TRACER_SNAPSHOT
+ buffer = tr->snapshot_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
#endif
arch_spin_unlock(&tr->max_lock);
-
- out:
- raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
/**
@@ -2475,11 +1657,10 @@ void tracing_start(void)
static void tracing_stop_tr(struct trace_array *tr)
{
struct trace_buffer *buffer;
- unsigned long flags;
- raw_spin_lock_irqsave(&tr->start_lock, flags);
+ guard(raw_spinlock_irqsave)(&tr->start_lock);
if (tr->stop_count++)
- goto out;
+ return;
/* Prevent the buffers from switching */
arch_spin_lock(&tr->max_lock);
@@ -2488,16 +1669,13 @@ static void tracing_stop_tr(struct trace_array *tr)
if (buffer)
ring_buffer_record_disable(buffer);
-#ifdef CONFIG_TRACER_MAX_TRACE
- buffer = tr->max_buffer.buffer;
+#ifdef CONFIG_TRACER_SNAPSHOT
+ buffer = tr->snapshot_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
#endif
arch_spin_unlock(&tr->max_lock);
-
- out:
- raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
/**
@@ -2610,19 +1788,17 @@ void trace_buffered_event_enable(void)
per_cpu(trace_buffered_event, cpu) = event;
- preempt_disable();
- if (cpu == smp_processor_id() &&
- __this_cpu_read(trace_buffered_event) !=
- per_cpu(trace_buffered_event, cpu))
- WARN_ON_ONCE(1);
- preempt_enable();
+ scoped_guard(preempt,) {
+ if (cpu == smp_processor_id() &&
+ __this_cpu_read(trace_buffered_event) !=
+ per_cpu(trace_buffered_event, cpu))
+ WARN_ON_ONCE(1);
+ }
}
}
static void enable_trace_buffered_event(void *data)
{
- /* Probably not needed, but do it anyway */
- smp_rmb();
this_cpu_dec(trace_buffered_event_cnt);
}
@@ -2807,7 +1983,7 @@ int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
int save_tracepoint_printk;
int ret;
- mutex_lock(&tracepoint_printk_mutex);
+ guard(mutex)(&tracepoint_printk_mutex);
save_tracepoint_printk = tracepoint_printk;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
@@ -2820,16 +1996,13 @@ int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
tracepoint_printk = 0;
if (save_tracepoint_printk == tracepoint_printk)
- goto out;
+ return ret;
if (tracepoint_printk)
static_key_enable(&tracepoint_printk_key.key);
else
static_key_disable(&tracepoint_printk_key.key);
- out:
- mutex_unlock(&tracepoint_printk_mutex);
-
return ret;
}
@@ -2897,13 +2070,16 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
void
trace_function(struct trace_array *tr, unsigned long ip, unsigned long
- parent_ip, unsigned int trace_ctx)
+ parent_ip, unsigned int trace_ctx, struct ftrace_regs *fregs)
{
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
+ int size = sizeof(*entry);
- event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
+ size += FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long);
+
+ event = __trace_buffer_lock_reserve(buffer, TRACE_FN, size,
trace_ctx);
if (!event)
return;
@@ -2911,6 +2087,13 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long
entry->ip = ip;
entry->parent_ip = parent_ip;
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ if (fregs) {
+ for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++)
+ entry->args[i] = ftrace_regs_get_argument(fregs, i);
+ }
+#endif
+
if (static_branch_unlikely(&trace_function_exports_enabled))
ftrace_exports(event, TRACE_EXPORT_FUNCTION);
__buffer_unlock_commit(buffer, event);
@@ -2935,16 +2118,21 @@ struct ftrace_stacks {
static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
-static void __ftrace_trace_stack(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned int trace_ctx,
- int skip, struct pt_regs *regs)
+void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
struct ring_buffer_event *event;
unsigned int size, nr_entries;
struct ftrace_stack *fstack;
struct stack_entry *entry;
int stackidx;
+ int bit;
+
+ bit = trace_test_and_set_recursion(_THIS_IP_, _RET_IP_, TRACE_EVENT_START);
+ if (bit < 0)
+ return;
/*
* Add one, for this function and the call to save_stack_trace()
@@ -2955,7 +2143,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
skip++;
#endif
- preempt_disable_notrace();
+ guard(preempt_notrace)();
stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
@@ -3013,19 +2201,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
/* Again, don't let gcc optimize things here */
barrier();
__this_cpu_dec(ftrace_stack_reserve);
- preempt_enable_notrace();
-
-}
-
-static inline void ftrace_trace_stack(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned int trace_ctx,
- int skip, struct pt_regs *regs)
-{
- if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
- return;
-
- __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs);
+ trace_clear_recursion(bit);
}
void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
@@ -3083,7 +2259,7 @@ ftrace_trace_userstack(struct trace_array *tr,
struct ring_buffer_event *event;
struct userstack_entry *entry;
- if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE))
+ if (!(tr->trace_flags & TRACE_ITER(USERSTACKTRACE)))
return;
/*
@@ -3097,9 +2273,9 @@ ftrace_trace_userstack(struct trace_array *tr,
* prevent recursion, since the user stack tracing may
* trigger other kernel events.
*/
- preempt_disable();
+ guard(preempt)();
if (__this_cpu_read(user_stack_count))
- goto out;
+ return;
__this_cpu_inc(user_stack_count);
@@ -3117,8 +2293,6 @@ ftrace_trace_userstack(struct trace_array *tr,
out_drop_count:
__this_cpu_dec(user_stack_count);
- out:
- preempt_enable();
}
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
static void ftrace_trace_userstack(struct trace_array *tr,
@@ -3164,334 +2338,6 @@ void trace_last_func_repeats(struct trace_array *tr,
__buffer_unlock_commit(buffer, event);
}
-/* created for use with alloc_percpu */
-struct trace_buffer_struct {
- int nesting;
- char buffer[4][TRACE_BUF_SIZE];
-};
-
-static struct trace_buffer_struct __percpu *trace_percpu_buffer;
-
-/*
- * This allows for lockless recording. If we're nested too deeply, then
- * this returns NULL.
- */
-static char *get_trace_buf(void)
-{
- struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
-
- if (!trace_percpu_buffer || buffer->nesting >= 4)
- return NULL;
-
- buffer->nesting++;
-
- /* Interrupts must see nesting incremented before we use the buffer */
- barrier();
- return &buffer->buffer[buffer->nesting - 1][0];
-}
-
-static void put_trace_buf(void)
-{
- /* Don't let the decrement of nesting leak before this */
- barrier();
- this_cpu_dec(trace_percpu_buffer->nesting);
-}
-
-static int alloc_percpu_trace_buffer(void)
-{
- struct trace_buffer_struct __percpu *buffers;
-
- if (trace_percpu_buffer)
- return 0;
-
- buffers = alloc_percpu(struct trace_buffer_struct);
- if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
- return -ENOMEM;
-
- trace_percpu_buffer = buffers;
- return 0;
-}
-
-static int buffers_allocated;
-
-void trace_printk_init_buffers(void)
-{
- if (buffers_allocated)
- return;
-
- if (alloc_percpu_trace_buffer())
- return;
-
- /* trace_printk() is for debug use only. Don't use it in production. */
-
- pr_warn("\n");
- pr_warn("**********************************************************\n");
- pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
- pr_warn("** **\n");
- pr_warn("** trace_printk() being used. Allocating extra memory. **\n");
- pr_warn("** **\n");
- pr_warn("** This means that this is a DEBUG kernel and it is **\n");
- pr_warn("** unsafe for production use. **\n");
- pr_warn("** **\n");
- pr_warn("** If you see this message and you are not debugging **\n");
- pr_warn("** the kernel, report this immediately to your vendor! **\n");
- pr_warn("** **\n");
- pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
- pr_warn("**********************************************************\n");
-
- /* Expand the buffers to set size */
- tracing_update_buffers(&global_trace);
-
- buffers_allocated = 1;
-
- /*
- * trace_printk_init_buffers() can be called by modules.
- * If that happens, then we need to start cmdline recording
- * directly here. If the global_trace.buffer is already
- * allocated here, then this was called by module code.
- */
- if (global_trace.array_buffer.buffer)
- tracing_start_cmdline_record();
-}
-EXPORT_SYMBOL_GPL(trace_printk_init_buffers);
-
-void trace_printk_start_comm(void)
-{
- /* Start tracing comms if trace printk is set */
- if (!buffers_allocated)
- return;
- tracing_start_cmdline_record();
-}
-
-static void trace_printk_start_stop_comm(int enabled)
-{
- if (!buffers_allocated)
- return;
-
- if (enabled)
- tracing_start_cmdline_record();
- else
- tracing_stop_cmdline_record();
-}
-
-/**
- * trace_vbprintk - write binary msg to tracing buffer
- * @ip: The address of the caller
- * @fmt: The string format to write to the buffer
- * @args: Arguments for @fmt
- */
-int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
-{
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- struct trace_array *tr = READ_ONCE(printk_trace);
- struct bprint_entry *entry;
- unsigned int trace_ctx;
- char *tbuffer;
- int len = 0, size;
-
- if (!printk_binsafe(tr))
- return trace_vprintk(ip, fmt, args);
-
- if (unlikely(tracing_selftest_running || tracing_disabled))
- return 0;
-
- /* Don't pollute graph traces with trace_vprintk internals */
- pause_graph_tracing();
-
- trace_ctx = tracing_gen_ctx();
- preempt_disable_notrace();
-
- tbuffer = get_trace_buf();
- if (!tbuffer) {
- len = 0;
- goto out_nobuffer;
- }
-
- len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
-
- if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
- goto out_put;
-
- size = sizeof(*entry) + sizeof(u32) * len;
- buffer = tr->array_buffer.buffer;
- ring_buffer_nest_start(buffer);
- event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
- trace_ctx);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->ip = ip;
- entry->fmt = fmt;
-
- memcpy(entry->buf, tbuffer, sizeof(u32) * len);
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
-
-out:
- ring_buffer_nest_end(buffer);
-out_put:
- put_trace_buf();
-
-out_nobuffer:
- preempt_enable_notrace();
- unpause_graph_tracing();
-
- return len;
-}
-EXPORT_SYMBOL_GPL(trace_vbprintk);
-
-__printf(3, 0)
-static int
-__trace_array_vprintk(struct trace_buffer *buffer,
- unsigned long ip, const char *fmt, va_list args)
-{
- struct ring_buffer_event *event;
- int len = 0, size;
- struct print_entry *entry;
- unsigned int trace_ctx;
- char *tbuffer;
-
- if (tracing_disabled)
- return 0;
-
- /* Don't pollute graph traces with trace_vprintk internals */
- pause_graph_tracing();
-
- trace_ctx = tracing_gen_ctx();
- preempt_disable_notrace();
-
-
- tbuffer = get_trace_buf();
- if (!tbuffer) {
- len = 0;
- goto out_nobuffer;
- }
-
- len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
-
- size = sizeof(*entry) + len + 1;
- ring_buffer_nest_start(buffer);
- event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- trace_ctx);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->ip = ip;
-
- memcpy(&entry->buf, tbuffer, len + 1);
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
-
-out:
- ring_buffer_nest_end(buffer);
- put_trace_buf();
-
-out_nobuffer:
- preempt_enable_notrace();
- unpause_graph_tracing();
-
- return len;
-}
-
-__printf(3, 0)
-int trace_array_vprintk(struct trace_array *tr,
- unsigned long ip, const char *fmt, va_list args)
-{
- if (tracing_selftest_running && tr == &global_trace)
- return 0;
-
- return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
-}
-
-/**
- * trace_array_printk - Print a message to a specific instance
- * @tr: The instance trace_array descriptor
- * @ip: The instruction pointer that this is called from.
- * @fmt: The format to print (printf format)
- *
- * If a subsystem sets up its own instance, they have the right to
- * printk strings into their tracing instance buffer using this
- * function. Note, this function will not write into the top level
- * buffer (use trace_printk() for that), as writing into the top level
- * buffer should only have events that can be individually disabled.
- * trace_printk() is only used for debugging a kernel, and should not
- * be ever incorporated in normal use.
- *
- * trace_array_printk() can be used, as it will not add noise to the
- * top level tracing buffer.
- *
- * Note, trace_array_init_printk() must be called on @tr before this
- * can be used.
- */
-__printf(3, 0)
-int trace_array_printk(struct trace_array *tr,
- unsigned long ip, const char *fmt, ...)
-{
- int ret;
- va_list ap;
-
- if (!tr)
- return -ENOENT;
-
- /* This is only allowed for created instances */
- if (tr == &global_trace)
- return 0;
-
- if (!(tr->trace_flags & TRACE_ITER_PRINTK))
- return 0;
-
- va_start(ap, fmt);
- ret = trace_array_vprintk(tr, ip, fmt, ap);
- va_end(ap);
- return ret;
-}
-EXPORT_SYMBOL_GPL(trace_array_printk);
-
-/**
- * trace_array_init_printk - Initialize buffers for trace_array_printk()
- * @tr: The trace array to initialize the buffers for
- *
- * As trace_array_printk() only writes into instances, they are OK to
- * have in the kernel (unlike trace_printk()). This needs to be called
- * before trace_array_printk() can be used on a trace_array.
- */
-int trace_array_init_printk(struct trace_array *tr)
-{
- if (!tr)
- return -ENOENT;
-
- /* This is only allowed for created instances */
- if (tr == &global_trace)
- return -EINVAL;
-
- return alloc_percpu_trace_buffer();
-}
-EXPORT_SYMBOL_GPL(trace_array_init_printk);
-
-__printf(3, 4)
-int trace_array_printk_buf(struct trace_buffer *buffer,
- unsigned long ip, const char *fmt, ...)
-{
- int ret;
- va_list ap;
-
- if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK))
- return 0;
-
- va_start(ap, fmt);
- ret = __trace_array_vprintk(buffer, ip, fmt, ap);
- va_end(ap);
- return ret;
-}
-
-__printf(2, 0)
-int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
-{
- return trace_array_vprintk(printk_trace, ip, fmt, args);
-}
-EXPORT_SYMBOL_GPL(trace_vprintk);
-
static void trace_iterator_increment(struct trace_iterator *iter)
{
struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);
@@ -3747,7 +2593,7 @@ const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
if (WARN_ON_ONCE(!fmt))
return fmt;
- if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR)
+ if (!iter->tr || iter->tr->trace_flags & TRACE_ITER(HASH_PTR))
return fmt;
p = fmt;
@@ -3928,10 +2774,8 @@ static void *s_start(struct seq_file *m, loff_t *pos)
}
mutex_unlock(&trace_types_lock);
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (iter->snapshot && iter->trace->use_max_tr)
+ if (iter->snapshot && tracer_uses_snapshot(iter->trace))
return ERR_PTR(-EBUSY);
-#endif
if (*pos != iter->pos) {
iter->ent = NULL;
@@ -3970,10 +2814,8 @@ static void s_stop(struct seq_file *m, void *p)
{
struct trace_iterator *iter = m->private;
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (iter->snapshot && iter->trace->use_max_tr)
+ if (iter->snapshot && tracer_uses_snapshot(iter->trace))
return;
-#endif
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
@@ -4069,7 +2911,7 @@ static void print_event_info(struct array_buffer *buf, struct seq_file *m)
static void print_func_help_header(struct array_buffer *buf, struct seq_file *m,
unsigned int flags)
{
- bool tgid = flags & TRACE_ITER_RECORD_TGID;
+ bool tgid = flags & TRACE_ITER(RECORD_TGID);
print_event_info(buf, m);
@@ -4080,7 +2922,7 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m,
static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m,
unsigned int flags)
{
- bool tgid = flags & TRACE_ITER_RECORD_TGID;
+ bool tgid = flags & TRACE_ITER(RECORD_TGID);
static const char space[] = " ";
int prec = tgid ? 12 : 2;
@@ -4119,11 +2961,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
entries,
total,
buf->cpu,
- preempt_model_none() ? "server" :
- preempt_model_voluntary() ? "desktop" :
- preempt_model_full() ? "preempt" :
- preempt_model_rt() ? "preempt_rt" :
- "unknown",
+ preempt_model_str(),
/* These are reserved for later use */
0, 0, 0, 0);
#ifdef CONFIG_SMP
@@ -4157,7 +2995,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_array *tr = iter->tr;
- if (!(tr->trace_flags & TRACE_ITER_ANNOTATE))
+ if (!(tr->trace_flags & TRACE_ITER(ANNOTATE)))
return;
if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
@@ -4179,6 +3017,22 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
iter->cpu);
}
+#ifdef CONFIG_FTRACE_SYSCALLS
+static bool is_syscall_event(struct trace_event *event)
+{
+ return (event->funcs == &enter_syscall_print_funcs) ||
+ (event->funcs == &exit_syscall_print_funcs);
+
+}
+#define syscall_buf_size CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT
+#else
+static inline bool is_syscall_event(struct trace_event *event)
+{
+ return false;
+}
+#define syscall_buf_size 0
+#endif /* CONFIG_FTRACE_SYSCALLS */
+
static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
{
struct trace_array *tr = iter->tr;
@@ -4193,7 +3047,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
- if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) {
if (iter->iter_flags & TRACE_FILE_LAT_FMT)
trace_print_lat_context(iter);
else
@@ -4204,17 +3058,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
return TRACE_TYPE_PARTIAL_LINE;
if (event) {
- if (tr->trace_flags & TRACE_ITER_FIELDS)
+ if (tr->trace_flags & TRACE_ITER(FIELDS))
return print_event_fields(iter, event);
/*
* For TRACE_EVENT() events, the print_fmt is not
* safe to use if the array has delta offsets
* Force printing via the fields.
*/
- if ((tr->text_delta || tr->data_delta) &&
- event->type > __TRACE_LAST_TYPE)
- return print_event_fields(iter, event);
-
+ if ((tr->text_delta)) {
+ /* ftrace and system call events are still OK */
+ if ((event->type > __TRACE_LAST_TYPE) &&
+ !is_syscall_event(event))
+ return print_event_fields(iter, event);
+ }
return event->funcs->trace(iter, sym_flags, event);
}
@@ -4232,7 +3088,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
entry = iter->ent;
- if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO)
+ if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO))
trace_seq_printf(s, "%d %d %llu ",
entry->pid, iter->cpu, iter->ts);
@@ -4258,7 +3114,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
entry = iter->ent;
- if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) {
SEQ_PUT_HEX_FIELD(s, entry->pid);
SEQ_PUT_HEX_FIELD(s, iter->cpu);
SEQ_PUT_HEX_FIELD(s, iter->ts);
@@ -4287,7 +3143,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
entry = iter->ent;
- if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) {
SEQ_PUT_FIELD(s, entry->pid);
SEQ_PUT_FIELD(s, iter->cpu);
SEQ_PUT_FIELD(s, iter->ts);
@@ -4358,27 +3214,27 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
}
if (iter->ent->type == TRACE_BPUTS &&
- trace_flags & TRACE_ITER_PRINTK &&
- trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ trace_flags & TRACE_ITER(PRINTK) &&
+ trace_flags & TRACE_ITER(PRINTK_MSGONLY))
return trace_print_bputs_msg_only(iter);
if (iter->ent->type == TRACE_BPRINT &&
- trace_flags & TRACE_ITER_PRINTK &&
- trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ trace_flags & TRACE_ITER(PRINTK) &&
+ trace_flags & TRACE_ITER(PRINTK_MSGONLY))
return trace_print_bprintk_msg_only(iter);
if (iter->ent->type == TRACE_PRINT &&
- trace_flags & TRACE_ITER_PRINTK &&
- trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ trace_flags & TRACE_ITER(PRINTK) &&
+ trace_flags & TRACE_ITER(PRINTK_MSGONLY))
return trace_print_printk_msg_only(iter);
- if (trace_flags & TRACE_ITER_BIN)
+ if (trace_flags & TRACE_ITER(BIN))
return print_bin_fmt(iter);
- if (trace_flags & TRACE_ITER_HEX)
+ if (trace_flags & TRACE_ITER(HEX))
return print_hex_fmt(iter);
- if (trace_flags & TRACE_ITER_RAW)
+ if (trace_flags & TRACE_ITER(RAW))
return print_raw_fmt(iter);
return print_trace_fmt(iter);
@@ -4396,7 +3252,7 @@ void trace_latency_header(struct seq_file *m)
if (iter->iter_flags & TRACE_FILE_LAT_FMT)
print_trace_header(m, iter);
- if (!(tr->trace_flags & TRACE_ITER_VERBOSE))
+ if (!(tr->trace_flags & TRACE_ITER(VERBOSE)))
print_lat_help_header(m);
}
@@ -4406,7 +3262,7 @@ void trace_default_header(struct seq_file *m)
struct trace_array *tr = iter->tr;
unsigned long trace_flags = tr->trace_flags;
- if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ if (!(trace_flags & TRACE_ITER(CONTEXT_INFO)))
return;
if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
@@ -4414,11 +3270,11 @@ void trace_default_header(struct seq_file *m)
if (trace_empty(iter))
return;
print_trace_header(m, iter);
- if (!(trace_flags & TRACE_ITER_VERBOSE))
+ if (!(trace_flags & TRACE_ITER(VERBOSE)))
print_lat_help_header(m);
} else {
- if (!(trace_flags & TRACE_ITER_VERBOSE)) {
- if (trace_flags & TRACE_ITER_IRQ_INFO)
+ if (!(trace_flags & TRACE_ITER(VERBOSE))) {
+ if (trace_flags & TRACE_ITER(IRQ_INFO))
print_func_help_header_irq(iter->array_buffer,
m, trace_flags);
else
@@ -4436,50 +3292,6 @@ static void test_ftrace_alive(struct seq_file *m)
"# MAY BE MISSING FUNCTION EVENTS\n");
}
-#ifdef CONFIG_TRACER_MAX_TRACE
-static void show_snapshot_main_help(struct seq_file *m)
-{
- seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
- "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
- "# Takes a snapshot of the main buffer.\n"
- "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
- "# (Doesn't have to be '2' works with any number that\n"
- "# is not a '0' or '1')\n");
-}
-
-static void show_snapshot_percpu_help(struct seq_file *m)
-{
- seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
-#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
- seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
- "# Takes a snapshot of the main buffer for this cpu.\n");
-#else
- seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
- "# Must use main snapshot file to allocate.\n");
-#endif
- seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
- "# (Doesn't have to be '2' works with any number that\n"
- "# is not a '0' or '1')\n");
-}
-
-static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
-{
- if (iter->tr->allocated_snapshot)
- seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
- else
- seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
-
- seq_puts(m, "# Snapshot commands:\n");
- if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
- show_snapshot_main_help(m);
- else
- show_snapshot_percpu_help(m);
-}
-#else
-/* Should never be called */
-static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
-#endif
-
static int s_show(struct seq_file *m, void *v)
{
struct trace_iterator *iter = v;
@@ -4528,17 +3340,6 @@ static int s_show(struct seq_file *m, void *v)
return 0;
}
-/*
- * Should be used after trace_array_get(), trace_types_lock
- * ensures that i_cdev was already initialized.
- */
-static inline int tracing_get_cpu(struct inode *inode)
-{
- if (inode->i_cdev) /* See trace_create_cpu_file() */
- return (long)inode->i_cdev - 1;
- return RING_BUFFER_ALL_CPUS;
-}
-
static const struct seq_operations tracer_seq_ops = {
.start = s_start,
.next = s_next,
@@ -4565,7 +3366,7 @@ static void free_trace_iter_content(struct trace_iterator *iter)
free_cpumask_var(iter->started);
}
-static struct trace_iterator *
+struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
struct trace_array *tr = inode->i_private;
@@ -4579,8 +3380,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (!iter)
return ERR_PTR(-ENOMEM);
- iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter),
- GFP_KERNEL);
+ iter->buffer_iter = kzalloc_objs(*iter->buffer_iter, nr_cpu_ids);
if (!iter->buffer_iter)
goto release;
@@ -4614,10 +3414,10 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
iter->tr = tr;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
/* Currently only the top directory has a snapshot */
if (tr->current_trace->print_max || snapshot)
- iter->array_buffer = &tr->max_buffer;
+ iter->array_buffer = &tr->snapshot_buffer;
else
#endif
iter->array_buffer = &tr->array_buffer;
@@ -4642,27 +3442,23 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
* If pause-on-trace is enabled, then stop the trace while
* dumping, unless this is the "snapshot" file
*/
- if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE))
+ if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE))) {
+ iter->iter_flags |= TRACE_FILE_PAUSE;
tracing_stop_tr(tr);
+ }
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->array_buffer->buffer,
- cpu, GFP_KERNEL);
- }
- ring_buffer_read_prepare_sync();
- for_each_tracing_cpu(cpu) {
- ring_buffer_read_start(iter->buffer_iter[cpu]);
+ ring_buffer_read_start(iter->array_buffer->buffer,
+ cpu, GFP_KERNEL);
tracing_iter_reset(iter, cpu);
}
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->array_buffer->buffer,
- cpu, GFP_KERNEL);
- ring_buffer_read_prepare_sync();
- ring_buffer_read_start(iter->buffer_iter[cpu]);
+ ring_buffer_read_start(iter->array_buffer->buffer,
+ cpu, GFP_KERNEL);
tracing_iter_reset(iter, cpu);
}
@@ -4690,11 +3486,6 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
return 0;
}
-bool tracing_is_disabled(void)
-{
- return (tracing_disabled) ? true: false;
-}
-
/*
* Open and update trace_array ref count.
* Must have the current trace_array passed to it.
@@ -4708,6 +3499,11 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp)
if (ret)
return ret;
+ if ((filp->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) {
+ trace_array_put(tr);
+ return -EACCES;
+ }
+
filp->private_data = inode->i_private;
return 0;
@@ -4726,22 +3522,16 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp)
if (ret)
return ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
/* Fail if the file is marked for removal */
if (file->flags & EVENT_FILE_FL_FREED) {
trace_array_put(file->tr);
- ret = -ENODEV;
+ return -ENODEV;
} else {
event_file_get(file);
}
- mutex_unlock(&event_mutex);
- if (ret)
- return ret;
-
- filp->private_data = inode->i_private;
-
return 0;
}
@@ -4761,13 +3551,7 @@ int tracing_single_release_file_tr(struct inode *inode, struct file *filp)
return single_release(inode, filp);
}
-static int tracing_mark_open(struct inode *inode, struct file *filp)
-{
- stream_open(inode, filp);
- return tracing_open_generic_tr(inode, filp);
-}
-
-static int tracing_release(struct inode *inode, struct file *file)
+int tracing_release(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
struct seq_file *m = file->private_data;
@@ -4791,7 +3575,7 @@ static int tracing_release(struct inode *inode, struct file *file)
if (iter->trace && iter->trace->close)
iter->trace->close(iter);
- if (!iter->snapshot && tr->stop_count)
+ if (iter->iter_flags & TRACE_FILE_PAUSE)
/* reenable tracing if it was previously enabled */
tracing_start_tr(tr);
@@ -4822,6 +3606,8 @@ static int tracing_single_release_tr(struct inode *inode, struct file *file)
return single_release(inode, file);
}
+static bool update_last_data_if_empty(struct trace_array *tr);
+
static int tracing_open(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -4839,20 +3625,22 @@ static int tracing_open(struct inode *inode, struct file *file)
#ifdef CONFIG_TRACER_MAX_TRACE
if (tr->current_trace->print_max)
- trace_buf = &tr->max_buffer;
+ trace_buf = &tr->snapshot_buffer;
#endif
if (cpu == RING_BUFFER_ALL_CPUS)
tracing_reset_online_cpus(trace_buf);
else
tracing_reset_cpu(trace_buf, cpu);
+
+ update_last_data_if_empty(tr);
}
if (file->f_mode & FMODE_READ) {
iter = __tracing_open(inode, file, false);
if (IS_ERR(iter))
ret = PTR_ERR(iter);
- else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
+ else if (tr->trace_flags & TRACE_ITER(LATENCY_FMT))
iter->iter_flags |= TRACE_FILE_LAT_FMT;
}
@@ -4870,11 +3658,9 @@ static int tracing_open(struct inode *inode, struct file *file)
static bool
trace_ok_for_array(struct tracer *t, struct trace_array *tr)
{
-#ifdef CONFIG_TRACER_SNAPSHOT
/* arrays with mapped buffer range do not have snapshots */
- if (tr->range_addr_start && t->use_max_tr)
+ if (tr->range_addr_start && tracer_uses_snapshot(t))
return false;
-#endif
return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
}
@@ -5016,7 +3802,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct trace_array *tr = file_inode(filp)->i_private;
- char *mask_str;
+ char *mask_str __free(kfree) = NULL;
int len;
len = snprintf(NULL, 0, "%*pb\n",
@@ -5027,16 +3813,10 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
len = snprintf(mask_str, len, "%*pb\n",
cpumask_pr_args(tr->tracing_cpumask));
- if (len >= count) {
- count = -EINVAL;
- goto out_err;
- }
- count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
-
-out_err:
- kfree(mask_str);
+ if (len >= count)
+ return -EINVAL;
- return count;
+ return simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
}
int tracing_set_cpumask(struct trace_array *tr,
@@ -5056,18 +3836,16 @@ int tracing_set_cpumask(struct trace_array *tr,
*/
if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
-#ifdef CONFIG_TRACER_MAX_TRACE
- ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ ring_buffer_record_disable_cpu(tr->snapshot_buffer.buffer, cpu);
#endif
}
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
-#ifdef CONFIG_TRACER_MAX_TRACE
- ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ ring_buffer_record_enable_cpu(tr->snapshot_buffer.buffer, cpu);
#endif
}
}
@@ -5123,27 +3901,32 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
{
struct tracer_opt *trace_opts;
struct trace_array *tr = m->private;
+ struct tracer_flags *flags;
u32 tracer_flags;
int i;
- mutex_lock(&trace_types_lock);
- tracer_flags = tr->current_trace->flags->val;
- trace_opts = tr->current_trace->flags->opts;
+ guard(mutex)(&trace_types_lock);
for (i = 0; trace_options[i]; i++) {
- if (tr->trace_flags & (1 << i))
+ if (tr->trace_flags & (1ULL << i))
seq_printf(m, "%s\n", trace_options[i]);
else
seq_printf(m, "no%s\n", trace_options[i]);
}
+ flags = tr->current_trace_flags;
+ if (!flags || !flags->opts)
+ return 0;
+
+ tracer_flags = flags->val;
+ trace_opts = flags->opts;
+
for (i = 0; trace_opts[i].name; i++) {
if (tracer_flags & trace_opts[i].bit)
seq_printf(m, "%s\n", trace_opts[i].name);
else
seq_printf(m, "no%s\n", trace_opts[i].name);
}
- mutex_unlock(&trace_types_lock);
return 0;
}
@@ -5153,9 +3936,10 @@ static int __set_tracer_option(struct trace_array *tr,
struct tracer_opt *opts, int neg)
{
struct tracer *trace = tracer_flags->trace;
- int ret;
+ int ret = 0;
- ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
+ if (trace->set_flag)
+ ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
if (ret)
return ret;
@@ -5169,36 +3953,41 @@ static int __set_tracer_option(struct trace_array *tr,
/* Try to assign a tracer specific option */
static int set_tracer_option(struct trace_array *tr, char *cmp, int neg)
{
- struct tracer *trace = tr->current_trace;
- struct tracer_flags *tracer_flags = trace->flags;
+ struct tracer_flags *tracer_flags = tr->current_trace_flags;
struct tracer_opt *opts = NULL;
int i;
+ if (!tracer_flags || !tracer_flags->opts)
+ return 0;
+
for (i = 0; tracer_flags->opts[i].name; i++) {
opts = &tracer_flags->opts[i];
if (strcmp(cmp, opts->name) == 0)
- return __set_tracer_option(tr, trace->flags, opts, neg);
+ return __set_tracer_option(tr, tracer_flags, opts, neg);
}
return -EINVAL;
}
/* Some tracers require overwrite to stay enabled */
-int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
+int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set)
{
- if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
+ if (tracer->enabled && (mask & TRACE_ITER(OVERWRITE)) && !set)
return -1;
return 0;
}
-int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
+int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled)
{
- if ((mask == TRACE_ITER_RECORD_TGID) ||
- (mask == TRACE_ITER_RECORD_CMD) ||
- (mask == TRACE_ITER_TRACE_PRINTK))
+ switch (mask) {
+ case TRACE_ITER(RECORD_TGID):
+ case TRACE_ITER(RECORD_CMD):
+ case TRACE_ITER(TRACE_PRINTK):
+ case TRACE_ITER(COPY_MARKER):
lockdep_assert_held(&event_mutex);
+ }
/* do nothing if flag is already set */
if (!!(tr->trace_flags & mask) == !!enabled)
@@ -5209,7 +3998,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (tr->current_trace->flag_changed(tr, mask, !!enabled))
return -EINVAL;
- if (mask == TRACE_ITER_TRACE_PRINTK) {
+ switch (mask) {
+ case TRACE_ITER(TRACE_PRINTK):
if (enabled) {
update_printk_trace(tr);
} else {
@@ -5221,11 +4011,17 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
return -EINVAL;
/*
* An instance must always have it set.
- * by default, that's the global_trace instane.
+ * by default, that's the global_trace instance.
*/
if (printk_trace == tr)
update_printk_trace(&global_trace);
}
+ break;
+
+ case TRACE_ITER(COPY_MARKER):
+ update_marker_trace(tr, enabled);
+ /* update_marker_trace updates the tr->trace_flags */
+ return 0;
}
if (enabled)
@@ -5233,35 +4029,46 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
else
tr->trace_flags &= ~mask;
- if (mask == TRACE_ITER_RECORD_CMD)
+ switch (mask) {
+ case TRACE_ITER(RECORD_CMD):
trace_event_enable_cmd_record(enabled);
+ break;
- if (mask == TRACE_ITER_RECORD_TGID) {
+ case TRACE_ITER(RECORD_TGID):
if (trace_alloc_tgid_map() < 0) {
- tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
+ tr->trace_flags &= ~TRACE_ITER(RECORD_TGID);
return -ENOMEM;
}
trace_event_enable_tgid_record(enabled);
- }
+ break;
- if (mask == TRACE_ITER_EVENT_FORK)
+ case TRACE_ITER(EVENT_FORK):
trace_event_follow_fork(tr, enabled);
+ break;
- if (mask == TRACE_ITER_FUNC_FORK)
+ case TRACE_ITER(FUNC_FORK):
ftrace_pid_follow_fork(tr, enabled);
+ break;
- if (mask == TRACE_ITER_OVERWRITE) {
+ case TRACE_ITER(OVERWRITE):
ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled);
-#ifdef CONFIG_TRACER_MAX_TRACE
- ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ ring_buffer_change_overwrite(tr->snapshot_buffer.buffer, enabled);
#endif
- }
+ break;
- if (mask == TRACE_ITER_PRINTK) {
+ case TRACE_ITER(PRINTK):
trace_printk_start_stop_comm(enabled);
trace_printk_control(enabled);
+ break;
+
+#if defined(CONFIG_FUNCTION_PROFILER) && defined(CONFIG_FUNCTION_GRAPH_TRACER)
+ case TRACE_GRAPH_GRAPH_TIME:
+ ftrace_graph_graph_time_control(enabled);
+ break;
+#endif
}
return 0;
@@ -5291,7 +4098,7 @@ int trace_set_options(struct trace_array *tr, char *option)
if (ret < 0)
ret = set_tracer_option(tr, cmp, neg);
else
- ret = set_tracer_flag(tr, 1 << ret, !neg);
+ ret = set_tracer_flag(tr, 1ULL << ret, !neg);
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
@@ -5536,6 +4343,8 @@ static const char readme_msg[] =
"\t efield: For event probes ('e' types), the field is on of the fields\n"
"\t of the <attached-group>/<attached-event>.\n"
#endif
+ " set_event\t\t- Enables events by name written into it\n"
+ "\t\t\t Can enable module events via: :mod:<module>\n"
" events/\t\t- Directory containing all trace event subsystems:\n"
" enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
" events/<system>/\t- Directory containing all trace events for <system>:\n"
@@ -5802,13 +4611,13 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
* where the head holds the module and length of array, and the
* tail holds a pointer to the next list.
*/
- map_array = kmalloc_array(len + 2, sizeof(*map_array), GFP_KERNEL);
+ map_array = kmalloc_objs(*map_array, len + 2);
if (!map_array) {
pr_warn("Unable to allocate trace eval mapping\n");
return;
}
- mutex_lock(&trace_eval_mutex);
+ guard(mutex)(&trace_eval_mutex);
if (!trace_eval_maps)
trace_eval_maps = map_array;
@@ -5832,8 +4641,6 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
map_array++;
}
memset(map_array, 0, sizeof(*map_array));
-
- mutex_unlock(&trace_eval_mutex);
}
static void trace_create_eval_file(struct dentry *d_tracer)
@@ -5848,17 +4655,27 @@ static inline void trace_insert_eval_map_file(struct module *mod,
struct trace_eval_map **start, int len) { }
#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */
-static void trace_insert_eval_map(struct module *mod,
- struct trace_eval_map **start, int len)
+static void
+trace_event_update_with_eval_map(struct module *mod,
+ struct trace_eval_map **start,
+ int len)
{
struct trace_eval_map **map;
- if (len <= 0)
- return;
+ /* Always run sanitizer only if btf_type_tag attr exists. */
+ if (len <= 0) {
+ if (!(IS_ENABLED(CONFIG_DEBUG_INFO_BTF) &&
+ IS_ENABLED(CONFIG_PAHOLE_HAS_BTF_TAG) &&
+ __has_attribute(btf_type_tag)))
+ return;
+ }
map = start;
- trace_event_eval_update(map, len);
+ trace_event_update_all(map, len);
+
+ if (len <= 0)
+ return;
trace_insert_eval_map_file(mod, start, len);
}
@@ -5871,9 +4688,9 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
char buf[MAX_TRACER_SIZE+2];
int r;
- mutex_lock(&trace_types_lock);
- r = sprintf(buf, "%s\n", tr->current_trace->name);
- mutex_unlock(&trace_types_lock);
+ scoped_guard(mutex, &trace_types_lock) {
+ r = sprintf(buf, "%s\n", tr->current_trace->name);
+ }
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -5881,10 +4698,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
int tracer_init(struct tracer *t, struct trace_array *tr)
{
tracing_reset_online_cpus(&tr->array_buffer);
+ update_last_data_if_empty(tr);
return t->init(tr);
}
-static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
+void trace_set_buffer_entries(struct array_buffer *buf, unsigned long val)
{
int cpu;
@@ -5895,40 +4713,12 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
static void update_buffer_entries(struct array_buffer *buf, int cpu)
{
if (cpu == RING_BUFFER_ALL_CPUS) {
- set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0));
+ trace_set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0));
} else {
per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu);
}
}
-#ifdef CONFIG_TRACER_MAX_TRACE
-/* resize @tr's buffer to the size of @size_tr's entries */
-static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
- struct array_buffer *size_buf, int cpu_id)
-{
- int cpu, ret = 0;
-
- if (cpu_id == RING_BUFFER_ALL_CPUS) {
- for_each_tracing_cpu(cpu) {
- ret = ring_buffer_resize(trace_buf->buffer,
- per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
- if (ret < 0)
- break;
- per_cpu_ptr(trace_buf->data, cpu)->entries =
- per_cpu_ptr(size_buf->data, cpu)->entries;
- }
- } else {
- ret = ring_buffer_resize(trace_buf->buffer,
- per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
- if (ret == 0)
- per_cpu_ptr(trace_buf->data, cpu_id)->entries =
- per_cpu_ptr(size_buf->data, cpu_id)->entries;
- }
-
- return ret;
-}
-#endif /* CONFIG_TRACER_MAX_TRACE */
-
static int __tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu)
{
@@ -5952,11 +4742,11 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
if (ret < 0)
goto out_start;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
if (!tr->allocated_snapshot)
goto out;
- ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
+ ret = ring_buffer_resize(tr->snapshot_buffer.buffer, size, cpu);
if (ret < 0) {
int r = resize_buffer_duplicate_size(&tr->array_buffer,
&tr->array_buffer, cpu);
@@ -5981,10 +4771,10 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
goto out_start;
}
- update_buffer_entries(&tr->max_buffer, cpu);
+ update_buffer_entries(&tr->snapshot_buffer, cpu);
out:
-#endif /* CONFIG_TRACER_MAX_TRACE */
+#endif /* CONFIG_TRACER_SNAPSHOT */
update_buffer_entries(&tr->array_buffer, cpu);
out_start:
@@ -5995,33 +4785,152 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu_id)
{
- int ret;
-
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (cpu_id != RING_BUFFER_ALL_CPUS) {
/* make sure, this cpu is enabled in the mask */
- if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
- ret = -EINVAL;
- goto out;
- }
+ if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask))
+ return -EINVAL;
}
- ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
- if (ret < 0)
- ret = -ENOMEM;
+ return __tracing_resize_ring_buffer(tr, size, cpu_id);
+}
-out:
- mutex_unlock(&trace_types_lock);
+struct trace_mod_entry {
+ unsigned long mod_addr;
+ char mod_name[MODULE_NAME_LEN];
+};
- return ret;
+struct trace_scratch {
+ unsigned int clock_id;
+ unsigned long text_addr;
+ unsigned long nr_entries;
+ struct trace_mod_entry entries[];
+};
+
+static DEFINE_MUTEX(scratch_mutex);
+
+static int cmp_mod_entry(const void *key, const void *pivot)
+{
+ unsigned long addr = (unsigned long)key;
+ const struct trace_mod_entry *ent = pivot;
+
+ if (addr < ent[0].mod_addr)
+ return -1;
+
+ return addr >= ent[1].mod_addr;
}
+/**
+ * trace_adjust_address() - Adjust prev boot address to current address.
+ * @tr: Persistent ring buffer's trace_array.
+ * @addr: Address in @tr which is adjusted.
+ */
+unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr)
+{
+ struct trace_module_delta *module_delta;
+ struct trace_scratch *tscratch;
+ struct trace_mod_entry *entry;
+ unsigned long raddr;
+ int idx = 0, nr_entries;
+
+ /* If we don't have last boot delta, return the address */
+ if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ return addr;
+
+ /* tr->module_delta must be protected by rcu. */
+ guard(rcu)();
+ tscratch = tr->scratch;
+ /* if there is no tscrach, module_delta must be NULL. */
+ module_delta = READ_ONCE(tr->module_delta);
+ if (!module_delta || !tscratch->nr_entries ||
+ tscratch->entries[0].mod_addr > addr) {
+ raddr = addr + tr->text_delta;
+ return __is_kernel(raddr) || is_kernel_core_data(raddr) ||
+ is_kernel_rodata(raddr) ? raddr : addr;
+ }
+
+ /* Note that entries must be sorted. */
+ nr_entries = tscratch->nr_entries;
+ if (nr_entries == 1 ||
+ tscratch->entries[nr_entries - 1].mod_addr < addr)
+ idx = nr_entries - 1;
+ else {
+ entry = __inline_bsearch((void *)addr,
+ tscratch->entries,
+ nr_entries - 1,
+ sizeof(tscratch->entries[0]),
+ cmp_mod_entry);
+ if (entry)
+ idx = entry - tscratch->entries;
+ }
+
+ return addr + module_delta->delta[idx];
+}
+
+#ifdef CONFIG_MODULES
+static int save_mod(struct module *mod, void *data)
+{
+ struct trace_array *tr = data;
+ struct trace_scratch *tscratch;
+ struct trace_mod_entry *entry;
+ unsigned int size;
+
+ tscratch = tr->scratch;
+ if (!tscratch)
+ return -1;
+ size = tr->scratch_size;
+
+ if (struct_size(tscratch, entries, tscratch->nr_entries + 1) > size)
+ return -1;
+
+ entry = &tscratch->entries[tscratch->nr_entries];
+
+ tscratch->nr_entries++;
+
+ entry->mod_addr = (unsigned long)mod->mem[MOD_TEXT].base;
+ strscpy(entry->mod_name, mod->name);
+
+ return 0;
+}
+#else
+static int save_mod(struct module *mod, void *data)
+{
+ return 0;
+}
+#endif
+
static void update_last_data(struct trace_array *tr)
{
- if (!tr->text_delta && !tr->data_delta)
+ struct trace_module_delta *module_delta;
+ struct trace_scratch *tscratch;
+
+ if (!(tr->flags & TRACE_ARRAY_FL_BOOT))
+ return;
+
+ if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
return;
+ /* Only if the buffer has previous boot data clear and update it. */
+ tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT;
+
+ /* If this is a backup instance, mark it for autoremove. */
+ if (tr->flags & TRACE_ARRAY_FL_VMALLOC)
+ tr->free_on_close = true;
+
+ /* Reset the module list and reload them */
+ if (tr->scratch) {
+ struct trace_scratch *tscratch = tr->scratch;
+
+ tscratch->clock_id = tr->clock_id;
+ memset(tscratch->entries, 0,
+ flex_array_size(tscratch, entries, tscratch->nr_entries));
+ tscratch->nr_entries = 0;
+
+ guard(mutex)(&scratch_mutex);
+ module_for_each_mod(save_mod, tr);
+ }
+
/*
* Need to clear all CPU buffers as there cannot be events
* from the previous boot mixed with events with this boot
@@ -6032,7 +4941,17 @@ static void update_last_data(struct trace_array *tr)
/* Using current data now */
tr->text_delta = 0;
- tr->data_delta = 0;
+
+ if (!tr->scratch)
+ return;
+
+ tscratch = tr->scratch;
+ module_delta = READ_ONCE(tr->module_delta);
+ WRITE_ONCE(tr->module_delta, NULL);
+ kfree_rcu(module_delta, rcu);
+
+ /* Set the persistent ring buffer meta data to this address */
+ tscratch->text_addr = (unsigned long)_text;
}
/**
@@ -6050,23 +4969,19 @@ int tracing_update_buffers(struct trace_array *tr)
{
int ret = 0;
- mutex_lock(&trace_types_lock);
+ if (!tr)
+ tr = &global_trace;
+
+ guard(mutex)(&trace_types_lock);
update_last_data(tr);
if (!tr->ring_buffer_expanded)
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
- mutex_unlock(&trace_types_lock);
-
return ret;
}
-struct trace_option_dentry;
-
-static void
-create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
-
/*
* Used to clear out the tracer before deletion of an instance.
* Must have trace_types_lock held.
@@ -6082,32 +4997,19 @@ static void tracing_set_nop(struct trace_array *tr)
tr->current_trace->reset(tr);
tr->current_trace = &nop_trace;
+ tr->current_trace_flags = nop_trace.flags;
}
static bool tracer_options_updated;
-static void add_tracer_options(struct trace_array *tr, struct tracer *t)
-{
- /* Only enable if the directory has been created already. */
- if (!tr->dir)
- return;
-
- /* Only create trace option files after update_tracer_options finish */
- if (!tracer_options_updated)
- return;
-
- create_trace_option_files(tr, t);
-}
-
int tracing_set_tracer(struct trace_array *tr, const char *buf)
{
- struct tracer *t;
-#ifdef CONFIG_TRACER_MAX_TRACE
+ struct tracer *trace = NULL;
+ struct tracers *t;
bool had_max_tr;
-#endif
- int ret = 0;
+ int ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
update_last_data(tr);
@@ -6115,51 +5017,47 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
if (ret < 0)
- goto out;
+ return ret;
ret = 0;
}
- for (t = trace_types; t; t = t->next) {
- if (strcmp(t->name, buf) == 0)
+ list_for_each_entry(t, &tr->tracers, list) {
+ if (strcmp(t->tracer->name, buf) == 0) {
+ trace = t->tracer;
break;
+ }
}
- if (!t) {
- ret = -EINVAL;
- goto out;
- }
- if (t == tr->current_trace)
- goto out;
+ if (!trace)
+ return -EINVAL;
+
+ if (trace == tr->current_trace)
+ return 0;
#ifdef CONFIG_TRACER_SNAPSHOT
- if (t->use_max_tr) {
+ if (tracer_uses_snapshot(trace)) {
local_irq_disable();
arch_spin_lock(&tr->max_lock);
- if (tr->cond_snapshot)
- ret = -EBUSY;
+ ret = tr->cond_snapshot ? -EBUSY : 0;
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
if (ret)
- goto out;
+ return ret;
}
#endif
/* Some tracers won't work on kernel command line */
- if (system_state < SYSTEM_RUNNING && t->noboot) {
+ if (system_state < SYSTEM_RUNNING && trace->noboot) {
pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
- t->name);
- goto out;
+ trace->name);
+ return -EINVAL;
}
/* Some tracers are only allowed for the top level buffer */
- if (!trace_ok_for_array(t, tr)) {
- ret = -EINVAL;
- goto out;
- }
+ if (!trace_ok_for_array(trace, tr))
+ return -EINVAL;
/* If trace pipe files are being read, we can't change the tracer */
- if (tr->trace_ref) {
- ret = -EBUSY;
- goto out;
- }
+ if (tr->trace_ref)
+ return -EBUSY;
trace_branch_disable();
@@ -6168,13 +5066,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (tr->current_trace->reset)
tr->current_trace->reset(tr);
-#ifdef CONFIG_TRACER_MAX_TRACE
- had_max_tr = tr->current_trace->use_max_tr;
+ had_max_tr = tracer_uses_snapshot(tr->current_trace);
/* Current trace needs to be nop_trace before synchronize_rcu */
tr->current_trace = &nop_trace;
+ tr->current_trace_flags = nop_trace.flags;
- if (had_max_tr && !t->use_max_tr) {
+ if (had_max_tr && !tracer_uses_snapshot(trace)) {
/*
* We need to make sure that the update_max_tr sees that
* current_trace changed to nop_trace to keep it from
@@ -6187,33 +5085,29 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
tracing_disarm_snapshot(tr);
}
- if (!had_max_tr && t->use_max_tr) {
+ if (!had_max_tr && tracer_uses_snapshot(trace)) {
ret = tracing_arm_snapshot_locked(tr);
if (ret)
- goto out;
+ return ret;
}
-#else
- tr->current_trace = &nop_trace;
-#endif
- if (t->init) {
- ret = tracer_init(t, tr);
+ tr->current_trace_flags = t->flags ? : t->tracer->flags;
+
+ if (trace->init) {
+ ret = tracer_init(trace, tr);
if (ret) {
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (t->use_max_tr)
+ if (tracer_uses_snapshot(trace))
tracing_disarm_snapshot(tr);
-#endif
- goto out;
+ tr->current_trace_flags = nop_trace.flags;
+ return ret;
}
}
- tr->current_trace = t;
+ tr->current_trace = trace;
tr->current_trace->enabled++;
trace_branch_enable(tr);
- out:
- mutex_unlock(&trace_types_lock);
- return ret;
+ return 0;
}
static ssize_t
@@ -6247,9 +5141,8 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
return ret;
}
-static ssize_t
-tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
char buf[64];
int r;
@@ -6261,9 +5154,8 @@ tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static ssize_t
-tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
unsigned long val;
int ret;
@@ -6291,46 +5183,20 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf,
struct trace_array *tr = filp->private_data;
int ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
if (ret < 0)
- goto out;
+ return ret;
if (tr->current_trace->update_thresh) {
ret = tr->current_trace->update_thresh(tr);
if (ret < 0)
- goto out;
+ return ret;
}
- ret = cnt;
-out:
- mutex_unlock(&trace_types_lock);
-
- return ret;
-}
-
-#ifdef CONFIG_TRACER_MAX_TRACE
-
-static ssize_t
-tracing_max_lat_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- struct trace_array *tr = filp->private_data;
-
- return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos);
-}
-
-static ssize_t
-tracing_max_lat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- struct trace_array *tr = filp->private_data;
-
- return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos);
+ return cnt;
}
-#endif
-
static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
{
if (cpu == RING_BUFFER_ALL_CPUS) {
@@ -6367,14 +5233,14 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
if (ret)
return ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
cpu = tracing_get_cpu(inode);
ret = open_pipe_on_cpu(tr, cpu);
if (ret)
goto fail_pipe_on_cpu;
/* create a buffer to store the information to pass to userspace */
- iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ iter = kzalloc_obj(*iter);
if (!iter) {
ret = -ENOMEM;
goto fail_alloc_iter;
@@ -6391,7 +5257,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
/* trace pipe does not show start of buffer */
cpumask_setall(iter->started);
- if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
+ if (tr->trace_flags & TRACE_ITER(LATENCY_FMT))
iter->iter_flags |= TRACE_FILE_LAT_FMT;
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -6411,7 +5277,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
tr->trace_ref++;
- mutex_unlock(&trace_types_lock);
return ret;
fail:
@@ -6420,7 +5285,6 @@ fail_alloc_iter:
close_pipe_on_cpu(tr, cpu);
fail_pipe_on_cpu:
__trace_array_put(tr);
- mutex_unlock(&trace_types_lock);
return ret;
}
@@ -6429,14 +5293,13 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
struct trace_iterator *iter = file->private_data;
struct trace_array *tr = inode->i_private;
- mutex_lock(&trace_types_lock);
+ scoped_guard(mutex, &trace_types_lock) {
+ tr->trace_ref--;
- tr->trace_ref--;
-
- if (iter->trace->pipe_close)
- iter->trace->pipe_close(iter);
- close_pipe_on_cpu(tr, iter->cpu_file);
- mutex_unlock(&trace_types_lock);
+ if (iter->trace->pipe_close)
+ iter->trace->pipe_close(iter);
+ close_pipe_on_cpu(tr, iter->cpu_file);
+ }
free_trace_iter_content(iter);
kfree(iter);
@@ -6455,7 +5318,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
if (trace_buffer_iter(iter, iter->cpu_file))
return EPOLLIN | EPOLLRDNORM;
- if (tr->trace_flags & TRACE_ITER_BLOCK)
+ if (tr->trace_flags & TRACE_ITER(BLOCK))
/*
* Always select as readable when in blocking mode
*/
@@ -6510,6 +5373,22 @@ static int tracing_wait_pipe(struct file *filp)
return 1;
}
+static bool update_last_data_if_empty(struct trace_array *tr)
+{
+ if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ return false;
+
+ if (!ring_buffer_empty(tr->array_buffer.buffer))
+ return false;
+
+ /*
+ * If the buffer contains the last boot data and all per-cpu
+ * buffers are empty, reset it from the kernel side.
+ */
+ update_last_data(tr);
+ return true;
+}
+
/*
* Consumer reader.
*/
@@ -6525,31 +5404,32 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
* This is just a matter of traces coherency, the ring buffer itself
* is protected.
*/
- mutex_lock(&iter->mutex);
+ guard(mutex)(&iter->mutex);
/* return any leftover data */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
if (sret != -EBUSY)
- goto out;
+ return sret;
trace_seq_init(&iter->seq);
if (iter->trace->read) {
sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
if (sret)
- goto out;
+ return sret;
}
waitagain:
+ if (update_last_data_if_empty(iter->tr))
+ return 0;
+
sret = tracing_wait_pipe(filp);
if (sret <= 0)
- goto out;
+ return sret;
/* stop when tracing is finished */
- if (trace_empty(iter)) {
- sret = 0;
- goto out;
- }
+ if (trace_empty(iter))
+ return 0;
if (cnt >= TRACE_SEQ_BUFFER_SIZE)
cnt = TRACE_SEQ_BUFFER_SIZE - 1;
@@ -6613,9 +5493,6 @@ waitagain:
if (sret == -EBUSY)
goto waitagain;
-out:
- mutex_unlock(&iter->mutex);
-
return sret;
}
@@ -6728,13 +5605,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
page_address(spd.pages[i]),
- trace_seq_used(&iter->seq));
+ min((size_t)trace_seq_used(&iter->seq),
+ (size_t)PAGE_SIZE));
if (ret < 0) {
__free_page(spd.pages[i]);
break;
}
spd.partial[i].offset = 0;
- spd.partial[i].len = trace_seq_used(&iter->seq);
+ spd.partial[i].len = ret;
trace_seq_init(&iter->seq);
}
@@ -6759,6 +5637,43 @@ out_err:
}
static ssize_t
+tracing_syscall_buf_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
+ char buf[64];
+ int r;
+
+ r = snprintf(buf, 64, "%d\n", tr->syscall_buf_sz);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_syscall_buf_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ if (val > SYSCALL_FAULT_USER_MAX)
+ val = SYSCALL_FAULT_USER_MAX;
+
+ tr->syscall_buf_sz = val;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
tracing_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -6858,19 +5773,102 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static ssize_t
-tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+#define LAST_BOOT_HEADER ((void *)1)
+
+static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct trace_array *tr = filp->private_data;
- struct seq_buf seq;
- char buf[64];
+ struct trace_array *tr = m->private;
+ struct trace_scratch *tscratch = tr->scratch;
+ unsigned int index = *pos;
- seq_buf_init(&seq, buf, 64);
+ (*pos)++;
+
+ if (*pos == 1)
+ return LAST_BOOT_HEADER;
+
+ /* Only show offsets of the last boot data */
+ if (!tscratch || !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ return NULL;
- seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta);
- seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta);
+ /* *pos 0 is for the header, 1 is for the first module */
+ index--;
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq));
+ if (index >= tscratch->nr_entries)
+ return NULL;
+
+ return &tscratch->entries[index];
+}
+
+static void *l_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&scratch_mutex);
+
+ return l_next(m, NULL, pos);
+}
+
+static void l_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&scratch_mutex);
+}
+
+static void show_last_boot_header(struct seq_file *m, struct trace_array *tr)
+{
+ struct trace_scratch *tscratch = tr->scratch;
+
+ /*
+ * Do not leak KASLR address. This only shows the KASLR address of
+ * the last boot. When the ring buffer is started, the LAST_BOOT
+ * flag gets cleared, and this should only report "current".
+ * Otherwise it shows the KASLR address from the previous boot which
+ * should not be the same as the current boot.
+ */
+ if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ seq_printf(m, "%lx\t[kernel]\n", tscratch->text_addr);
+ else
+ seq_puts(m, "# Current\n");
+}
+
+static int l_show(struct seq_file *m, void *v)
+{
+ struct trace_array *tr = m->private;
+ struct trace_mod_entry *entry = v;
+
+ if (v == LAST_BOOT_HEADER) {
+ show_last_boot_header(m, tr);
+ return 0;
+ }
+
+ seq_printf(m, "%lx\t%s\n", entry->mod_addr, entry->mod_name);
+ return 0;
+}
+
+static const struct seq_operations last_boot_seq_ops = {
+ .start = l_start,
+ .next = l_next,
+ .stop = l_stop,
+ .show = l_show,
+};
+
+static int tracing_last_boot_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ struct seq_file *m;
+ int ret;
+
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
+
+ ret = seq_open(file, &last_boot_seq_ops);
+ if (ret) {
+ trace_array_put(tr);
+ return ret;
+ }
+
+ m = file->private_data;
+ m->private = tr;
+
+ return 0;
}
static int tracing_buffer_meta_open(struct inode *inode, struct file *filp)
@@ -6909,7 +5907,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
struct trace_array *tr = inode->i_private;
/* disable tracing ? */
- if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE)
+ if (tr->trace_flags & TRACE_ITER(STOP_ON_FREE))
tracer_tracing_off(tr);
/* resize the ring buffer to 0 */
tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
@@ -6921,11 +5919,9 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
#define TRACE_MARKER_MAX_SIZE 4096
-static ssize_t
-tracing_mark_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *fpos)
+static ssize_t write_marker_to_buffer(struct trace_array *tr, const char *buf,
+ size_t cnt, unsigned long ip)
{
- struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
enum event_trigger_type tt = ETT_NONE;
struct trace_buffer *buffer;
@@ -6933,32 +5929,11 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
int meta_size;
ssize_t written;
size_t size;
- int len;
-
-/* Used in tracing_mark_raw_write() as well */
-#define FAULTED_STR "<faulted>"
-#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */
-
- if (tracing_disabled)
- return -EINVAL;
-
- if (!(tr->trace_flags & TRACE_ITER_MARKERS))
- return -EINVAL;
-
- if ((ssize_t)cnt < 0)
- return -EINVAL;
-
- if (cnt > TRACE_MARKER_MAX_SIZE)
- cnt = TRACE_MARKER_MAX_SIZE;
meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
again:
size = cnt + meta_size;
- /* If less than "<faulted>", then make sure we can still add that */
- if (cnt < FAULTED_SIZE)
- size += FAULTED_SIZE - cnt;
-
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
tracing_gen_ctx());
@@ -6968,9 +5943,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
* make it smaller and try again.
*/
if (size > ring_buffer_max_event_size(buffer)) {
- /* cnt < FAULTED size should never be bigger than max */
- if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
- return -EBADF;
cnt = ring_buffer_max_event_size(buffer) - meta_size;
/* The above should only happen once */
if (WARN_ON_ONCE(cnt + meta_size == size))
@@ -6983,15 +5955,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
}
entry = ring_buffer_event_data(event);
- entry->ip = _THIS_IP_;
-
- len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
- if (len) {
- memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
- cnt = FAULTED_SIZE;
- written = -EFAULT;
- } else
- written = cnt;
+ entry->ip = ip;
+ memcpy(&entry->buf, buf, cnt);
+ written = cnt;
if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
/* do not add \n before testing triggers, but add \0 */
@@ -7015,33 +5981,368 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
return written;
}
+struct trace_user_buf {
+ char *buf;
+};
+
+static DEFINE_MUTEX(trace_user_buffer_mutex);
+static struct trace_user_buf_info *trace_user_buffer;
+
+/**
+ * trace_user_fault_destroy - free up allocated memory of a trace user buffer
+ * @tinfo: The descriptor to free up
+ *
+ * Frees any data allocated in the trace info dsecriptor.
+ */
+void trace_user_fault_destroy(struct trace_user_buf_info *tinfo)
+{
+ char *buf;
+ int cpu;
+
+ if (!tinfo || !tinfo->tbuf)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+ kfree(buf);
+ }
+ free_percpu(tinfo->tbuf);
+}
+
+static int user_fault_buffer_enable(struct trace_user_buf_info *tinfo, size_t size)
+{
+ char *buf;
+ int cpu;
+
+ lockdep_assert_held(&trace_user_buffer_mutex);
+
+ tinfo->tbuf = alloc_percpu(struct trace_user_buf);
+ if (!tinfo->tbuf)
+ return -ENOMEM;
+
+ tinfo->ref = 1;
+ tinfo->size = size;
+
+ /* Clear each buffer in case of error */
+ for_each_possible_cpu(cpu) {
+ per_cpu_ptr(tinfo->tbuf, cpu)->buf = NULL;
+ }
+
+ for_each_possible_cpu(cpu) {
+ buf = kmalloc_node(size, GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!buf)
+ return -ENOMEM;
+ per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf;
+ }
+
+ return 0;
+}
+
+/* For internal use. Free and reinitialize */
+static void user_buffer_free(struct trace_user_buf_info **tinfo)
+{
+ lockdep_assert_held(&trace_user_buffer_mutex);
+
+ trace_user_fault_destroy(*tinfo);
+ kfree(*tinfo);
+ *tinfo = NULL;
+}
+
+/* For internal use. Initialize and allocate */
+static int user_buffer_init(struct trace_user_buf_info **tinfo, size_t size)
+{
+ bool alloc = false;
+ int ret;
+
+ lockdep_assert_held(&trace_user_buffer_mutex);
+
+ if (!*tinfo) {
+ alloc = true;
+ *tinfo = kzalloc_obj(**tinfo);
+ if (!*tinfo)
+ return -ENOMEM;
+ }
+
+ ret = user_fault_buffer_enable(*tinfo, size);
+ if (ret < 0 && alloc)
+ user_buffer_free(tinfo);
+
+ return ret;
+}
+
+/* For internal use, derefrence and free if necessary */
+static void user_buffer_put(struct trace_user_buf_info **tinfo)
+{
+ guard(mutex)(&trace_user_buffer_mutex);
+
+ if (WARN_ON_ONCE(!*tinfo || !(*tinfo)->ref))
+ return;
+
+ if (--(*tinfo)->ref)
+ return;
+
+ user_buffer_free(tinfo);
+}
+
+/**
+ * trace_user_fault_init - Allocated or reference a per CPU buffer
+ * @tinfo: A pointer to the trace buffer descriptor
+ * @size: The size to allocate each per CPU buffer
+ *
+ * Create a per CPU buffer that can be used to copy from user space
+ * in a task context. When calling trace_user_fault_read(), preemption
+ * must be disabled, and it will enable preemption and copy user
+ * space data to the buffer. If any schedule switches occur, it will
+ * retry until it succeeds without a schedule switch knowing the buffer
+ * is still valid.
+ *
+ * Returns 0 on success, negative on failure.
+ */
+int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size)
+{
+ int ret;
+
+ if (!tinfo)
+ return -EINVAL;
+
+ guard(mutex)(&trace_user_buffer_mutex);
+
+ ret = user_buffer_init(&tinfo, size);
+ if (ret < 0)
+ trace_user_fault_destroy(tinfo);
+
+ return ret;
+}
+
+/**
+ * trace_user_fault_get - up the ref count for the user buffer
+ * @tinfo: A pointer to a pointer to the trace buffer descriptor
+ *
+ * Ups the ref count of the trace buffer.
+ *
+ * Returns the new ref count.
+ */
+int trace_user_fault_get(struct trace_user_buf_info *tinfo)
+{
+ if (!tinfo)
+ return -1;
+
+ guard(mutex)(&trace_user_buffer_mutex);
+
+ tinfo->ref++;
+ return tinfo->ref;
+}
+
+/**
+ * trace_user_fault_put - dereference a per cpu trace buffer
+ * @tinfo: The @tinfo that was passed to trace_user_fault_get()
+ *
+ * Decrement the ref count of @tinfo.
+ *
+ * Returns the new refcount (negative on error).
+ */
+int trace_user_fault_put(struct trace_user_buf_info *tinfo)
+{
+ guard(mutex)(&trace_user_buffer_mutex);
+
+ if (WARN_ON_ONCE(!tinfo || !tinfo->ref))
+ return -1;
+
+ --tinfo->ref;
+ return tinfo->ref;
+}
+
+/**
+ * trace_user_fault_read - Read user space into a per CPU buffer
+ * @tinfo: The @tinfo allocated by trace_user_fault_get()
+ * @ptr: The user space pointer to read
+ * @size: The size of user space to read.
+ * @copy_func: Optional function to use to copy from user space
+ * @data: Data to pass to copy_func if it was supplied
+ *
+ * Preemption must be disabled when this is called, and must not
+ * be enabled while using the returned buffer.
+ * This does the copying from user space into a per CPU buffer.
+ *
+ * The @size must not be greater than the size passed in to
+ * trace_user_fault_init().
+ *
+ * If @copy_func is NULL, trace_user_fault_read() will use copy_from_user(),
+ * otherwise it will call @copy_func. It will call @copy_func with:
+ *
+ * buffer: the per CPU buffer of the @tinfo.
+ * ptr: The pointer @ptr to user space to read
+ * size: The @size of the ptr to read
+ * data: The @data parameter
+ *
+ * It is expected that @copy_func will return 0 on success and non zero
+ * if there was a fault.
+ *
+ * Returns a pointer to the buffer with the content read from @ptr.
+ * Preemption must remain disabled while the caller accesses the
+ * buffer returned by this function.
+ * Returns NULL if there was a fault, or the size passed in is
+ * greater than the size passed to trace_user_fault_init().
+ */
+char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
+ const char __user *ptr, size_t size,
+ trace_user_buf_copy copy_func, void *data)
+{
+ int cpu = smp_processor_id();
+ char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+ unsigned int cnt;
+ int trys = 0;
+ int ret;
+
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * It's up to the caller to not try to copy more than it said
+ * it would.
+ */
+ if (size > tinfo->size)
+ return NULL;
+
+ /*
+ * This acts similar to a seqcount. The per CPU context switches are
+ * recorded, migration is disabled and preemption is enabled. The
+ * read of the user space memory is copied into the per CPU buffer.
+ * Preemption is disabled again, and if the per CPU context switches count
+ * is still the same, it means the buffer has not been corrupted.
+ * If the count is different, it is assumed the buffer is corrupted
+ * and reading must be tried again.
+ */
+
+ do {
+ /*
+ * It is possible that something is trying to migrate this
+ * task. What happens then, is when preemption is enabled,
+ * the migration thread will preempt this task, try to
+ * migrate it, fail, then let it run again. That will
+ * cause this to loop again and never succeed.
+ * On failures, enabled and disable preemption with
+ * migration enabled, to allow the migration thread to
+ * migrate this task.
+ */
+ if (trys) {
+ preempt_enable_notrace();
+ preempt_disable_notrace();
+ cpu = smp_processor_id();
+ buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+ }
+
+ /*
+ * If for some reason, copy_from_user() always causes a context
+ * switch, this would then cause an infinite loop.
+ * If this task is preempted by another user space task, it
+ * will cause this task to try again. But just in case something
+ * changes where the copying from user space causes another task
+ * to run, prevent this from going into an infinite loop.
+ * 100 tries should be plenty.
+ */
+ if (WARN_ONCE(trys++ > 100, "Error: Too many tries to read user space"))
+ return NULL;
+
+ /* Read the current CPU context switch counter */
+ cnt = nr_context_switches_cpu(cpu);
+
+ /*
+ * Preemption is going to be enabled, but this task must
+ * remain on this CPU.
+ */
+ migrate_disable();
+
+ /*
+ * Now preemption is being enabled and another task can come in
+ * and use the same buffer and corrupt our data.
+ */
+ preempt_enable_notrace();
+
+ /* Make sure preemption is enabled here */
+ lockdep_assert_preemption_enabled();
+
+ if (copy_func) {
+ ret = copy_func(buffer, ptr, size, data);
+ } else {
+ ret = __copy_from_user(buffer, ptr, size);
+ }
+
+ preempt_disable_notrace();
+ migrate_enable();
+
+ /* if it faulted, no need to test if the buffer was corrupted */
+ if (ret)
+ return NULL;
+
+ /*
+ * Preemption is disabled again, now check the per CPU context
+ * switch counter. If it doesn't match, then another user space
+ * process may have schedule in and corrupted our buffer. In that
+ * case the copying must be retried.
+ */
+ } while (nr_context_switches_cpu(cpu) != cnt);
+
+ return buffer;
+}
+
static ssize_t
-tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
+tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
struct trace_array *tr = filp->private_data;
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- struct raw_data_entry *entry;
- ssize_t written;
- int size;
- int len;
-
-#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+ ssize_t written = -ENODEV;
+ unsigned long ip;
+ char *buf;
- if (tracing_disabled)
+ if (unlikely(tracing_disabled))
return -EINVAL;
- if (!(tr->trace_flags & TRACE_ITER_MARKERS))
+ if (!(tr->trace_flags & TRACE_ITER(MARKERS)))
return -EINVAL;
- /* The marker must at least have a tag id */
- if (cnt < sizeof(unsigned int))
+ if ((ssize_t)cnt < 0)
return -EINVAL;
- size = sizeof(*entry) + cnt;
- if (cnt < FAULT_SIZE_ID)
- size += FAULT_SIZE_ID - cnt;
+ if (cnt > TRACE_MARKER_MAX_SIZE)
+ cnt = TRACE_MARKER_MAX_SIZE;
+
+ /* Must have preemption disabled while having access to the buffer */
+ guard(preempt_notrace)();
+
+ buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL);
+ if (!buf)
+ return -EFAULT;
+
+ /* The selftests expect this function to be the IP address */
+ ip = _THIS_IP_;
+
+ /* The global trace_marker can go to multiple instances */
+ if (tr == &global_trace) {
+ guard(rcu)();
+ list_for_each_entry_rcu(tr, &marker_copies, marker_list) {
+ written = write_marker_to_buffer(tr, buf, cnt, ip);
+ if (written < 0)
+ break;
+ }
+ } else {
+ written = write_marker_to_buffer(tr, buf, cnt, ip);
+ }
+
+ return written;
+}
+
+static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
+ const char *buf, size_t cnt)
+{
+ struct ring_buffer_event *event;
+ struct trace_buffer *buffer;
+ struct raw_data_entry *entry;
+ ssize_t written;
+ size_t size;
+
+ /* cnt includes both the entry->id and the data behind it. */
+ size = struct_offset(entry, id) + cnt;
buffer = tr->array_buffer.buffer;
@@ -7055,20 +6356,88 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
return -EBADF;
entry = ring_buffer_event_data(event);
-
- len = __copy_from_user_inatomic(&entry->id, ubuf, cnt);
- if (len) {
- entry->id = -1;
- memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
- written = -EFAULT;
- } else
- written = cnt;
+ unsafe_memcpy(&entry->id, buf, cnt,
+ "id and content already reserved on ring buffer"
+ "'buf' includes the 'id' and the data."
+ "'entry' was allocated with cnt from 'id'.");
+ written = cnt;
__buffer_unlock_commit(buffer, event);
return written;
}
+static ssize_t
+tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *fpos)
+{
+ struct trace_array *tr = filp->private_data;
+ ssize_t written = -ENODEV;
+ char *buf;
+
+ if (unlikely(tracing_disabled))
+ return -EINVAL;
+
+ if (!(tr->trace_flags & TRACE_ITER(MARKERS)))
+ return -EINVAL;
+
+ /* The marker must at least have a tag id */
+ if (cnt < sizeof(unsigned int))
+ return -EINVAL;
+
+ /* raw write is all or nothing */
+ if (cnt > TRACE_MARKER_MAX_SIZE)
+ return -EINVAL;
+
+ /* Must have preemption disabled while having access to the buffer */
+ guard(preempt_notrace)();
+
+ buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL);
+ if (!buf)
+ return -EFAULT;
+
+ /* The global trace_marker_raw can go to multiple instances */
+ if (tr == &global_trace) {
+ guard(rcu)();
+ list_for_each_entry_rcu(tr, &marker_copies, marker_list) {
+ written = write_raw_marker_to_buffer(tr, buf, cnt);
+ if (written < 0)
+ break;
+ }
+ } else {
+ written = write_raw_marker_to_buffer(tr, buf, cnt);
+ }
+
+ return written;
+}
+
+static int tracing_mark_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+
+ scoped_guard(mutex, &trace_user_buffer_mutex) {
+ if (!trace_user_buffer) {
+ ret = user_buffer_init(&trace_user_buffer, TRACE_MARKER_MAX_SIZE);
+ if (ret < 0)
+ return ret;
+ } else {
+ trace_user_buffer->ref++;
+ }
+ }
+
+ stream_open(inode, filp);
+ ret = tracing_open_generic_tr(inode, filp);
+ if (ret < 0)
+ user_buffer_put(&trace_user_buffer);
+ return ret;
+}
+
+static int tracing_mark_release(struct inode *inode, struct file *file)
+{
+ user_buffer_put(&trace_user_buffer);
+ return tracing_release_generic_tr(inode, file);
+}
+
static int tracing_clock_show(struct seq_file *m, void *v)
{
struct trace_array *tr = m->private;
@@ -7095,7 +6464,7 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr)
if (i == ARRAY_SIZE(trace_clocks))
return -EINVAL;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
tr->clock_id = i;
@@ -7107,13 +6476,18 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr)
*/
tracing_reset_online_cpus(&tr->array_buffer);
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (tr->max_buffer.buffer)
- ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
- tracing_reset_online_cpus(&tr->max_buffer);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ if (tr->snapshot_buffer.buffer)
+ ring_buffer_set_clock(tr->snapshot_buffer.buffer, trace_clocks[i].func);
+ tracing_reset_online_cpus(&tr->snapshot_buffer);
#endif
+ update_last_data_if_empty(tr);
- mutex_unlock(&trace_types_lock);
+ if (tr->scratch && !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) {
+ struct trace_scratch *tscratch = tr->scratch;
+
+ tscratch->clock_id = i;
+ }
return 0;
}
@@ -7155,6 +6529,11 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
if (ret)
return ret;
+ if ((file->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) {
+ trace_array_put(tr);
+ return -EACCES;
+ }
+
ret = single_open(file, tracing_clock_show, inode->i_private);
if (ret < 0)
trace_array_put(tr);
@@ -7166,15 +6545,13 @@ static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
{
struct trace_array *tr = m->private;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer))
seq_puts(m, "delta [absolute]\n");
else
seq_puts(m, "[delta] absolute\n");
- mutex_unlock(&trace_types_lock);
-
return 0;
}
@@ -7202,227 +6579,6 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve
return ring_buffer_event_time_stamp(buffer, rbe);
}
-/*
- * Set or disable using the per CPU trace_buffer_event when possible.
- */
-int tracing_set_filter_buffering(struct trace_array *tr, bool set)
-{
- int ret = 0;
-
- mutex_lock(&trace_types_lock);
-
- if (set && tr->no_filter_buffering_ref++)
- goto out;
-
- if (!set) {
- if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) {
- ret = -EINVAL;
- goto out;
- }
-
- --tr->no_filter_buffering_ref;
- }
- out:
- mutex_unlock(&trace_types_lock);
-
- return ret;
-}
-
-struct ftrace_buffer_info {
- struct trace_iterator iter;
- void *spare;
- unsigned int spare_cpu;
- unsigned int spare_size;
- unsigned int read;
-};
-
-#ifdef CONFIG_TRACER_SNAPSHOT
-static int tracing_snapshot_open(struct inode *inode, struct file *file)
-{
- struct trace_array *tr = inode->i_private;
- struct trace_iterator *iter;
- struct seq_file *m;
- int ret;
-
- ret = tracing_check_open_get_tr(tr);
- if (ret)
- return ret;
-
- if (file->f_mode & FMODE_READ) {
- iter = __tracing_open(inode, file, true);
- if (IS_ERR(iter))
- ret = PTR_ERR(iter);
- } else {
- /* Writes still need the seq_file to hold the private data */
- ret = -ENOMEM;
- m = kzalloc(sizeof(*m), GFP_KERNEL);
- if (!m)
- goto out;
- iter = kzalloc(sizeof(*iter), GFP_KERNEL);
- if (!iter) {
- kfree(m);
- goto out;
- }
- ret = 0;
-
- iter->tr = tr;
- iter->array_buffer = &tr->max_buffer;
- iter->cpu_file = tracing_get_cpu(inode);
- m->private = iter;
- file->private_data = m;
- }
-out:
- if (ret < 0)
- trace_array_put(tr);
-
- return ret;
-}
-
-static void tracing_swap_cpu_buffer(void *tr)
-{
- update_max_tr_single((struct trace_array *)tr, current, smp_processor_id());
-}
-
-static ssize_t
-tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
- loff_t *ppos)
-{
- struct seq_file *m = filp->private_data;
- struct trace_iterator *iter = m->private;
- struct trace_array *tr = iter->tr;
- unsigned long val;
- int ret;
-
- ret = tracing_update_buffers(tr);
- if (ret < 0)
- return ret;
-
- ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
- if (ret)
- return ret;
-
- mutex_lock(&trace_types_lock);
-
- if (tr->current_trace->use_max_tr) {
- ret = -EBUSY;
- goto out;
- }
-
- local_irq_disable();
- arch_spin_lock(&tr->max_lock);
- if (tr->cond_snapshot)
- ret = -EBUSY;
- arch_spin_unlock(&tr->max_lock);
- local_irq_enable();
- if (ret)
- goto out;
-
- switch (val) {
- case 0:
- if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
- ret = -EINVAL;
- break;
- }
- if (tr->allocated_snapshot)
- free_snapshot(tr);
- break;
- case 1:
-/* Only allow per-cpu swap if the ring buffer supports it */
-#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
- if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
- ret = -EINVAL;
- break;
- }
-#endif
- if (tr->allocated_snapshot)
- ret = resize_buffer_duplicate_size(&tr->max_buffer,
- &tr->array_buffer, iter->cpu_file);
-
- ret = tracing_arm_snapshot_locked(tr);
- if (ret)
- break;
-
- /* Now, we're going to swap */
- if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
- local_irq_disable();
- update_max_tr(tr, current, smp_processor_id(), NULL);
- local_irq_enable();
- } else {
- smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
- (void *)tr, 1);
- }
- tracing_disarm_snapshot(tr);
- break;
- default:
- if (tr->allocated_snapshot) {
- if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
- tracing_reset_online_cpus(&tr->max_buffer);
- else
- tracing_reset_cpu(&tr->max_buffer, iter->cpu_file);
- }
- break;
- }
-
- if (ret >= 0) {
- *ppos += cnt;
- ret = cnt;
- }
-out:
- mutex_unlock(&trace_types_lock);
- return ret;
-}
-
-static int tracing_snapshot_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = file->private_data;
- int ret;
-
- ret = tracing_release(inode, file);
-
- if (file->f_mode & FMODE_READ)
- return ret;
-
- /* If write only, the seq_file is just a stub */
- if (m)
- kfree(m->private);
- kfree(m);
-
- return 0;
-}
-
-static int tracing_buffers_open(struct inode *inode, struct file *filp);
-static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
- size_t count, loff_t *ppos);
-static int tracing_buffers_release(struct inode *inode, struct file *file);
-static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len, unsigned int flags);
-
-static int snapshot_raw_open(struct inode *inode, struct file *filp)
-{
- struct ftrace_buffer_info *info;
- int ret;
-
- /* The following checks for tracefs lockdown */
- ret = tracing_buffers_open(inode, filp);
- if (ret < 0)
- return ret;
-
- info = filp->private_data;
-
- if (info->iter.trace->use_max_tr) {
- tracing_buffers_release(inode, filp);
- return -EBUSY;
- }
-
- info->iter.snapshot = true;
- info->iter.array_buffer = &info->iter.tr->max_buffer;
-
- return ret;
-}
-
-#endif /* CONFIG_TRACER_SNAPSHOT */
-
-
static const struct file_operations tracing_thresh_fops = {
.open = tracing_open_generic,
.read = tracing_thresh_read,
@@ -7430,16 +6586,6 @@ static const struct file_operations tracing_thresh_fops = {
.llseek = generic_file_llseek,
};
-#ifdef CONFIG_TRACER_MAX_TRACE
-static const struct file_operations tracing_max_lat_fops = {
- .open = tracing_open_generic_tr,
- .read = tracing_max_lat_read,
- .write = tracing_max_lat_write,
- .llseek = generic_file_llseek,
- .release = tracing_release_generic_tr,
-};
-#endif
-
static const struct file_operations set_tracer_fops = {
.open = tracing_open_generic_tr,
.read = tracing_set_trace_read,
@@ -7464,6 +6610,14 @@ static const struct file_operations tracing_entries_fops = {
.release = tracing_release_generic_tr,
};
+static const struct file_operations tracing_syscall_buf_fops = {
+ .open = tracing_open_generic_tr,
+ .read = tracing_syscall_buf_read,
+ .write = tracing_syscall_buf_write,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+
static const struct file_operations tracing_buffer_meta_fops = {
.open = tracing_buffer_meta_open,
.read = seq_read,
@@ -7487,13 +6641,13 @@ static const struct file_operations tracing_free_buffer_fops = {
static const struct file_operations tracing_mark_fops = {
.open = tracing_mark_open,
.write = tracing_mark_write,
- .release = tracing_release_generic_tr,
+ .release = tracing_mark_release,
};
static const struct file_operations tracing_mark_raw_fops = {
.open = tracing_mark_open,
.write = tracing_mark_raw_write,
- .release = tracing_release_generic_tr,
+ .release = tracing_mark_release,
};
static const struct file_operations trace_clock_fops = {
@@ -7512,30 +6666,12 @@ static const struct file_operations trace_time_stamp_mode_fops = {
};
static const struct file_operations last_boot_fops = {
- .open = tracing_open_generic_tr,
- .read = tracing_last_boot_read,
- .llseek = generic_file_llseek,
- .release = tracing_release_generic_tr,
-};
-
-#ifdef CONFIG_TRACER_SNAPSHOT
-static const struct file_operations snapshot_fops = {
- .open = tracing_snapshot_open,
+ .open = tracing_last_boot_open,
.read = seq_read,
- .write = tracing_snapshot_write,
- .llseek = tracing_lseek,
- .release = tracing_snapshot_release,
-};
-
-static const struct file_operations snapshot_raw_fops = {
- .open = snapshot_raw_open,
- .read = tracing_buffers_read,
- .release = tracing_buffers_release,
- .splice_read = tracing_buffers_splice_read,
+ .llseek = seq_lseek,
+ .release = tracing_seq_release,
};
-#endif /* CONFIG_TRACER_SNAPSHOT */
-
/*
* trace_min_max_write - Write a u64 value to a trace_min_max_param struct
* @filp: The active open file structure
@@ -7646,7 +6782,7 @@ static struct tracing_log_err *alloc_tracing_log_err(int len)
{
struct tracing_log_err *err;
- err = kzalloc(sizeof(*err), GFP_KERNEL);
+ err = kzalloc_obj(*err);
if (!err)
return ERR_PTR(-ENOMEM);
@@ -7754,12 +6890,11 @@ void tracing_log_err(struct trace_array *tr,
len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;
- mutex_lock(&tracing_err_log_lock);
+ guard(mutex)(&tracing_err_log_lock);
+
err = get_tracing_log_err(tr, len);
- if (PTR_ERR(err) == -ENOMEM) {
- mutex_unlock(&tracing_err_log_lock);
+ if (PTR_ERR(err) == -ENOMEM)
return;
- }
snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);
@@ -7770,21 +6905,20 @@ void tracing_log_err(struct trace_array *tr,
err->info.ts = local_clock();
list_add_tail(&err->list, &tr->err_log);
- mutex_unlock(&tracing_err_log_lock);
}
static void clear_tracing_err_log(struct trace_array *tr)
{
struct tracing_log_err *err, *next;
- mutex_lock(&tracing_err_log_lock);
+ guard(mutex)(&tracing_err_log_lock);
+
list_for_each_entry_safe(err, next, &tr->err_log, list) {
list_del(&err->list);
free_tracing_log_err(err);
}
tr->n_err_log_entries = 0;
- mutex_unlock(&tracing_err_log_lock);
}
static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos)
@@ -7897,7 +7031,7 @@ static const struct file_operations tracing_err_log_fops = {
.release = tracing_err_log_release,
};
-static int tracing_buffers_open(struct inode *inode, struct file *filp)
+int tracing_buffers_open(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct ftrace_buffer_info *info;
@@ -7907,7 +7041,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
if (ret)
return ret;
- info = kvzalloc(sizeof(*info), GFP_KERNEL);
+ info = kvzalloc_obj(*info);
if (!info) {
trace_array_put(tr);
return -ENOMEM;
@@ -7945,9 +7079,8 @@ tracing_buffers_poll(struct file *filp, poll_table *poll_table)
return trace_poll(iter, filp, poll_table);
}
-static ssize_t
-tracing_buffers_read(struct file *filp, char __user *ubuf,
- size_t count, loff_t *ppos)
+ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
@@ -7959,10 +7092,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (!count)
return 0;
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (iter->snapshot && iter->tr->current_trace->use_max_tr)
+ if (iter->snapshot && tracer_uses_snapshot(iter->tr->current_trace))
return -EBUSY;
-#endif
page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
@@ -8003,6 +7134,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (ret < 0) {
if (trace_empty(iter) && !iter->closed) {
+ if (update_last_data_if_empty(iter->tr))
+ return 0;
+
if ((filp->f_flags & O_NONBLOCK))
return -EAGAIN;
@@ -8047,12 +7181,12 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id)
return 0;
}
-static int tracing_buffers_release(struct inode *inode, struct file *file)
+int tracing_buffers_release(struct inode *inode, struct file *file)
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
iter->tr->trace_ref--;
@@ -8063,8 +7197,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
info->spare_cpu, info->spare);
kvfree(info);
- mutex_unlock(&trace_types_lock);
-
return 0;
}
@@ -8123,10 +7255,9 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
spd->partial[i].private = 0;
}
-static ssize_t
-tracing_buffers_splice_read(struct file *file, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
+ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
@@ -8145,10 +7276,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
int entries, i;
ssize_t ret = 0;
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (iter->snapshot && iter->tr->current_trace->use_max_tr)
+ if (iter->snapshot && tracer_uses_snapshot(iter->tr->current_trace))
return -EBUSY;
-#endif
page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
if (*ppos & (page_size - 1))
@@ -8171,7 +7300,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
struct page *page;
int r;
- ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+ ref = kzalloc_obj(*ref);
if (!ref) {
ret = -ENOMEM;
break;
@@ -8272,54 +7401,27 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
* An ioctl call with cmd 0 to the ring buffer file will wake up all
* waiters
*/
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
/* Make sure the waiters see the new wait_index */
(void)atomic_fetch_inc_release(&iter->wait_index);
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
- mutex_unlock(&trace_types_lock);
return 0;
}
-#ifdef CONFIG_TRACER_MAX_TRACE
-static int get_snapshot_map(struct trace_array *tr)
+/*
+ * This is called when a VMA is duplicated (e.g., on fork()) to increment
+ * the user_mapped counter without remapping pages.
+ */
+static void tracing_buffers_mmap_open(struct vm_area_struct *vma)
{
- int err = 0;
-
- /*
- * Called with mmap_lock held. lockdep would be unhappy if we would now
- * take trace_types_lock. Instead use the specific
- * snapshot_trigger_lock.
- */
- spin_lock(&tr->snapshot_trigger_lock);
-
- if (tr->snapshot || tr->mapped == UINT_MAX)
- err = -EBUSY;
- else
- tr->mapped++;
-
- spin_unlock(&tr->snapshot_trigger_lock);
-
- /* Wait for update_max_tr() to observe iter->tr->mapped */
- if (tr->mapped == 1)
- synchronize_rcu();
-
- return err;
+ struct ftrace_buffer_info *info = vma->vm_file->private_data;
+ struct trace_iterator *iter = &info->iter;
+ ring_buffer_map_dup(iter->array_buffer->buffer, iter->cpu_file);
}
-static void put_snapshot_map(struct trace_array *tr)
-{
- spin_lock(&tr->snapshot_trigger_lock);
- if (!WARN_ON(!tr->mapped))
- tr->mapped--;
- spin_unlock(&tr->snapshot_trigger_lock);
-}
-#else
-static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
-static inline void put_snapshot_map(struct trace_array *tr) { }
-#endif
static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
{
@@ -8330,8 +7432,19 @@ static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
put_snapshot_map(iter->tr);
}
+static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ /*
+ * Trace buffer mappings require the complete buffer including
+ * the meta page. Partial mappings are not supported.
+ */
+ return -EINVAL;
+}
+
static const struct vm_operations_struct tracing_buffers_vmops = {
+ .open = tracing_buffers_mmap_open,
.close = tracing_buffers_mmap_close,
+ .may_split = tracing_buffers_may_split,
};
static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
@@ -8340,6 +7453,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
struct trace_iterator *iter = &info->iter;
int ret = 0;
+ /* A memmap'ed and backup buffers are not supported for user space mmap */
+ if (iter->tr->flags & (TRACE_ARRAY_FL_MEMMAP | TRACE_ARRAY_FL_VMALLOC))
+ return -ENODEV;
+
ret = get_snapshot_map(iter->tr);
if (ret)
return ret;
@@ -8377,7 +7494,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
unsigned long long t;
unsigned long usec_rem;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kmalloc_obj(*s);
if (!s)
return -ENOMEM;
@@ -8474,179 +7591,15 @@ static const struct file_operations tracing_dyn_info_fops = {
};
#endif /* CONFIG_DYNAMIC_FTRACE */
-#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
-static void
-ftrace_snapshot(unsigned long ip, unsigned long parent_ip,
- struct trace_array *tr, struct ftrace_probe_ops *ops,
- void *data)
-{
- tracing_snapshot_instance(tr);
-}
-
-static void
-ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip,
- struct trace_array *tr, struct ftrace_probe_ops *ops,
- void *data)
-{
- struct ftrace_func_mapper *mapper = data;
- long *count = NULL;
-
- if (mapper)
- count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
-
- if (count) {
-
- if (*count <= 0)
- return;
-
- (*count)--;
- }
-
- tracing_snapshot_instance(tr);
-}
-
-static int
-ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
- struct ftrace_probe_ops *ops, void *data)
-{
- struct ftrace_func_mapper *mapper = data;
- long *count = NULL;
-
- seq_printf(m, "%ps:", (void *)ip);
-
- seq_puts(m, "snapshot");
-
- if (mapper)
- count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
-
- if (count)
- seq_printf(m, ":count=%ld\n", *count);
- else
- seq_puts(m, ":unlimited\n");
-
- return 0;
-}
-
-static int
-ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
- unsigned long ip, void *init_data, void **data)
-{
- struct ftrace_func_mapper *mapper = *data;
-
- if (!mapper) {
- mapper = allocate_ftrace_func_mapper();
- if (!mapper)
- return -ENOMEM;
- *data = mapper;
- }
-
- return ftrace_func_mapper_add_ip(mapper, ip, init_data);
-}
-
-static void
-ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
- unsigned long ip, void *data)
-{
- struct ftrace_func_mapper *mapper = data;
-
- if (!ip) {
- if (!mapper)
- return;
- free_ftrace_func_mapper(mapper, NULL);
- return;
- }
-
- ftrace_func_mapper_remove_ip(mapper, ip);
-}
-
-static struct ftrace_probe_ops snapshot_probe_ops = {
- .func = ftrace_snapshot,
- .print = ftrace_snapshot_print,
-};
-
-static struct ftrace_probe_ops snapshot_count_probe_ops = {
- .func = ftrace_count_snapshot,
- .print = ftrace_snapshot_print,
- .init = ftrace_snapshot_init,
- .free = ftrace_snapshot_free,
-};
-
-static int
-ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
- char *glob, char *cmd, char *param, int enable)
-{
- struct ftrace_probe_ops *ops;
- void *count = (void *)-1;
- char *number;
- int ret;
-
- if (!tr)
- return -ENODEV;
-
- /* hash funcs only work with set_ftrace_filter */
- if (!enable)
- return -EINVAL;
-
- ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
-
- if (glob[0] == '!') {
- ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
- if (!ret)
- tracing_disarm_snapshot(tr);
-
- return ret;
- }
-
- if (!param)
- goto out_reg;
-
- number = strsep(&param, ":");
-
- if (!strlen(number))
- goto out_reg;
-
- /*
- * We use the callback data field (which is a pointer)
- * as our counter.
- */
- ret = kstrtoul(number, 0, (unsigned long *)&count);
- if (ret)
- return ret;
-
- out_reg:
- ret = tracing_arm_snapshot(tr);
- if (ret < 0)
- goto out;
-
- ret = register_ftrace_function_probe(glob, tr, ops, count);
- if (ret < 0)
- tracing_disarm_snapshot(tr);
- out:
- return ret < 0 ? ret : 0;
-}
-
-static struct ftrace_func_command ftrace_snapshot_cmd = {
- .name = "snapshot",
- .func = ftrace_trace_snapshot_callback,
-};
-
-static __init int register_snapshot_cmd(void)
-{
- return register_ftrace_command(&ftrace_snapshot_cmd);
-}
-#else
-static inline __init int register_snapshot_cmd(void) { return 0; }
-#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
-
static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
- if (WARN_ON(!tr->dir))
- return ERR_PTR(-ENODEV);
-
/* Top directory uses NULL as the parent */
if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
return NULL;
+ if (WARN_ON(!tr->dir))
+ return ERR_PTR(-ENODEV);
+
/* All sub buffers have a descriptor */
return tr->dir;
}
@@ -8670,7 +7623,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
return tr->percpu_dir;
}
-static struct dentry *
+struct dentry *
trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
void *data, long cpu, const struct file_operations *fops)
{
@@ -8712,7 +7665,7 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_stats_fops);
- trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu,
+ trace_create_cpu_file("buffer_size_kb", TRACE_MODE_WRITE, d_cpu,
tr, cpu, &tracing_entries_fops);
if (tr->range_addr_start)
@@ -8765,10 +7718,9 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
return -EINVAL;
if (!!(topt->flags->val & topt->opt->bit) != val) {
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
ret = __set_tracer_option(topt->tr, topt->flags,
topt->opt, !val);
- mutex_unlock(&trace_types_lock);
if (ret)
return ret;
}
@@ -8851,7 +7803,7 @@ trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
get_tr_index(tr_index, &tr, &index);
- if (tr->trace_flags & (1 << index))
+ if (tr->trace_flags & (1ULL << index))
buf = "1\n";
else
buf = "0\n";
@@ -8880,7 +7832,7 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
- ret = set_tracer_flag(tr, 1 << index, val);
+ ret = set_tracer_flag(tr, 1ULL << index, val);
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
@@ -8953,54 +7905,34 @@ create_trace_option_file(struct trace_array *tr,
topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE,
t_options, topt, &trace_options_fops);
-
}
-static void
-create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
+static int
+create_trace_option_files(struct trace_array *tr, struct tracer *tracer,
+ struct tracer_flags *flags)
{
struct trace_option_dentry *topts;
struct trace_options *tr_topts;
- struct tracer_flags *flags;
struct tracer_opt *opts;
int cnt;
- int i;
-
- if (!tracer)
- return;
-
- flags = tracer->flags;
if (!flags || !flags->opts)
- return;
-
- /*
- * If this is an instance, only create flags for tracers
- * the instance may have.
- */
- if (!trace_ok_for_array(tracer, tr))
- return;
-
- for (i = 0; i < tr->nr_topts; i++) {
- /* Make sure there's no duplicate flags. */
- if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags))
- return;
- }
+ return 0;
opts = flags->opts;
for (cnt = 0; opts[cnt].name; cnt++)
;
- topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
+ topts = kzalloc_objs(*topts, cnt + 1);
if (!topts)
- return;
+ return 0;
tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1),
GFP_KERNEL);
if (!tr_topts) {
kfree(topts);
- return;
+ return -ENOMEM;
}
tr->topts = tr_topts;
@@ -9015,6 +7947,97 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
"Failed to create trace option: %s",
opts[cnt].name);
}
+ return 0;
+}
+
+static int get_global_flags_val(struct tracer *tracer)
+{
+ struct tracers *t;
+
+ list_for_each_entry(t, &global_trace.tracers, list) {
+ if (t->tracer != tracer)
+ continue;
+ if (!t->flags)
+ return -1;
+ return t->flags->val;
+ }
+ return -1;
+}
+
+static int add_tracer_options(struct trace_array *tr, struct tracers *t)
+{
+ struct tracer *tracer = t->tracer;
+ struct tracer_flags *flags = t->flags ?: tracer->flags;
+
+ if (!flags)
+ return 0;
+
+ /* Only add tracer options after update_tracer_options finish */
+ if (!tracer_options_updated)
+ return 0;
+
+ return create_trace_option_files(tr, tracer, flags);
+}
+
+static int add_tracer(struct trace_array *tr, struct tracer *tracer)
+{
+ struct tracer_flags *flags;
+ struct tracers *t;
+ int ret;
+
+ /* Only enable if the directory has been created already. */
+ if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL))
+ return 0;
+
+ /*
+ * If this is an instance, only create flags for tracers
+ * the instance may have.
+ */
+ if (!trace_ok_for_array(tracer, tr))
+ return 0;
+
+ t = kmalloc_obj(*t);
+ if (!t)
+ return -ENOMEM;
+
+ t->tracer = tracer;
+ t->flags = NULL;
+ list_add(&t->list, &tr->tracers);
+
+ flags = tracer->flags;
+ if (!flags) {
+ if (!tracer->default_flags)
+ return 0;
+
+ /*
+ * If the tracer defines default flags, it means the flags are
+ * per trace instance.
+ */
+ flags = kmalloc_obj(*flags);
+ if (!flags)
+ return -ENOMEM;
+
+ *flags = *tracer->default_flags;
+ flags->trace = tracer;
+
+ t->flags = flags;
+
+ /* If this is an instance, inherit the global_trace flags */
+ if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) {
+ int val = get_global_flags_val(tracer);
+ if (!WARN_ON_ONCE(val < 0))
+ flags->val = val;
+ }
+ }
+
+ ret = add_tracer_options(tr, t);
+ if (ret < 0) {
+ list_del(&t->list);
+ kfree(t->flags);
+ kfree(t);
+ }
+
+ return ret;
}
static struct dentry *
@@ -9044,8 +8067,9 @@ static void create_trace_options_dir(struct trace_array *tr)
for (i = 0; trace_options[i]; i++) {
if (top_level ||
- !((1 << i) & TOP_LEVEL_TRACE_FLAGS))
+ !((1ULL << i) & TOP_LEVEL_TRACE_FLAGS)) {
create_trace_option_core_file(tr, trace_options[i], i);
+ }
}
}
@@ -9077,7 +8101,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
return ret;
if (buffer) {
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (!!val == tracer_tracing_is_on(tr)) {
val = 0; /* do nothing */
} else if (val) {
@@ -9091,7 +8115,6 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
/* Wake up any waiters */
ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS);
}
- mutex_unlock(&trace_types_lock);
}
(*ppos)++;
@@ -9203,12 +8226,12 @@ buffer_subbuf_size_write(struct file *filp, const char __user *ubuf,
if (ret)
goto out;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
if (!tr->allocated_snapshot)
goto out_max;
- ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+ ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order);
if (ret) {
/* Put back the old order */
cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
@@ -9253,22 +8276,133 @@ static struct dentry *trace_instance_dir;
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
-static int
-allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size)
+#ifdef CONFIG_MODULES
+static int make_mod_delta(struct module *mod, void *data)
+{
+ struct trace_module_delta *module_delta;
+ struct trace_scratch *tscratch;
+ struct trace_mod_entry *entry;
+ struct trace_array *tr = data;
+ int i;
+
+ tscratch = tr->scratch;
+ module_delta = READ_ONCE(tr->module_delta);
+ for (i = 0; i < tscratch->nr_entries; i++) {
+ entry = &tscratch->entries[i];
+ if (strcmp(mod->name, entry->mod_name))
+ continue;
+ if (mod->state == MODULE_STATE_GOING)
+ module_delta->delta[i] = 0;
+ else
+ module_delta->delta[i] = (unsigned long)mod->mem[MOD_TEXT].base
+ - entry->mod_addr;
+ break;
+ }
+ return 0;
+}
+#else
+static int make_mod_delta(struct module *mod, void *data)
+{
+ return 0;
+}
+#endif
+
+static int mod_addr_comp(const void *a, const void *b, const void *data)
+{
+ const struct trace_mod_entry *e1 = a;
+ const struct trace_mod_entry *e2 = b;
+
+ return e1->mod_addr > e2->mod_addr ? 1 : -1;
+}
+
+static void setup_trace_scratch(struct trace_array *tr,
+ struct trace_scratch *tscratch, unsigned int size)
+{
+ struct trace_module_delta *module_delta;
+ struct trace_mod_entry *entry;
+ int i, nr_entries;
+
+ if (!tscratch)
+ return;
+
+ tr->scratch = tscratch;
+ tr->scratch_size = size;
+
+ if (tscratch->text_addr)
+ tr->text_delta = (unsigned long)_text - tscratch->text_addr;
+
+ if (struct_size(tscratch, entries, tscratch->nr_entries) > size)
+ goto reset;
+
+ /* Check if each module name is a valid string */
+ for (i = 0; i < tscratch->nr_entries; i++) {
+ int n;
+
+ entry = &tscratch->entries[i];
+
+ for (n = 0; n < MODULE_NAME_LEN; n++) {
+ if (entry->mod_name[n] == '\0')
+ break;
+ if (!isprint(entry->mod_name[n]))
+ goto reset;
+ }
+ if (n == MODULE_NAME_LEN)
+ goto reset;
+ }
+
+ /* Sort the entries so that we can find appropriate module from address. */
+ nr_entries = tscratch->nr_entries;
+ sort_r(tscratch->entries, nr_entries, sizeof(struct trace_mod_entry),
+ mod_addr_comp, NULL, NULL);
+
+ if (IS_ENABLED(CONFIG_MODULES)) {
+ module_delta = kzalloc_flex(*module_delta, delta, nr_entries);
+ if (!module_delta) {
+ pr_info("module_delta allocation failed. Not able to decode module address.");
+ goto reset;
+ }
+ init_rcu_head(&module_delta->rcu);
+ } else
+ module_delta = NULL;
+ WRITE_ONCE(tr->module_delta, module_delta);
+
+ /* Scan modules to make text delta for modules. */
+ module_for_each_mod(make_mod_delta, tr);
+
+ /* Set trace_clock as the same of the previous boot. */
+ if (tscratch->clock_id != tr->clock_id) {
+ if (tscratch->clock_id >= ARRAY_SIZE(trace_clocks) ||
+ tracing_set_clock(tr, trace_clocks[tscratch->clock_id].name) < 0) {
+ pr_info("the previous trace_clock info is not valid.");
+ goto reset;
+ }
+ }
+ return;
+ reset:
+ /* Invalid trace modules */
+ memset(tscratch, 0, size);
+}
+
+int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size)
{
enum ring_buffer_flags rb_flags;
+ struct trace_scratch *tscratch;
+ unsigned int scratch_size = 0;
- rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+ rb_flags = tr->trace_flags & TRACE_ITER(OVERWRITE) ? RB_FL_OVERWRITE : 0;
buf->tr = tr;
if (tr->range_addr_start && tr->range_addr_size) {
+ /* Add scratch buffer to handle 128 modules */
buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
tr->range_addr_start,
- tr->range_addr_size);
+ tr->range_addr_size,
+ struct_size(tscratch, entries, 128));
+
+ tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size);
+ setup_trace_scratch(tr, tscratch, scratch_size);
- ring_buffer_last_boot_delta(buf->buffer,
- &tr->text_delta, &tr->data_delta);
/*
* This is basically the same as a mapped buffer,
* with the same restrictions.
@@ -9288,8 +8422,8 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size
}
/* Allocate the first page for all buffers */
- set_buffer_entries(&tr->array_buffer,
- ring_buffer_size(tr->array_buffer.buffer, 0));
+ trace_set_buffer_entries(&tr->array_buffer,
+ ring_buffer_size(tr->array_buffer.buffer, 0));
return 0;
}
@@ -9304,7 +8438,7 @@ static void free_trace_buffer(struct array_buffer *buf)
}
}
-static int allocate_trace_buffers(struct trace_array *tr, int size)
+static int allocate_trace_buffers(struct trace_array *tr, unsigned long size)
{
int ret;
@@ -9312,23 +8446,11 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
if (ret)
return ret;
-#ifdef CONFIG_TRACER_MAX_TRACE
- /* Fix mapped buffer trace arrays do not have snapshot buffers */
- if (tr->range_addr_start)
- return 0;
-
- ret = allocate_trace_buffer(tr, &tr->max_buffer,
- allocate_snapshot ? size : 1);
- if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
+ ret = trace_allocate_snapshot(tr, size);
+ if (MEM_FAIL(ret, "Failed to allocate trace buffer\n"))
free_trace_buffer(&tr->array_buffer);
- return -ENOMEM;
- }
- tr->allocated_snapshot = allocate_snapshot;
-
- allocate_snapshot = false;
-#endif
- return 0;
+ return ret;
}
static void free_trace_buffers(struct trace_array *tr)
@@ -9337,9 +8459,10 @@ static void free_trace_buffers(struct trace_array *tr)
return;
free_trace_buffer(&tr->array_buffer);
+ kfree(tr->module_delta);
-#ifdef CONFIG_TRACER_MAX_TRACE
- free_trace_buffer(&tr->max_buffer);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ free_trace_buffer(&tr->snapshot_buffer);
#endif
}
@@ -9352,20 +8475,39 @@ static void init_trace_flags_index(struct trace_array *tr)
tr->trace_flags_index[i] = i;
}
-static void __update_tracer_options(struct trace_array *tr)
+static int __update_tracer(struct trace_array *tr)
{
struct tracer *t;
+ int ret = 0;
- for (t = trace_types; t; t = t->next)
- add_tracer_options(tr, t);
+ for (t = trace_types; t && !ret; t = t->next)
+ ret = add_tracer(tr, t);
+
+ return ret;
}
-static void update_tracer_options(struct trace_array *tr)
+static __init int __update_tracer_options(struct trace_array *tr)
{
- mutex_lock(&trace_types_lock);
+ struct tracers *t;
+ int ret = 0;
+
+ list_for_each_entry(t, &tr->tracers, list) {
+ ret = add_tracer_options(tr, t);
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
+}
+
+static __init void update_tracer_options(void)
+{
+ struct trace_array *tr;
+
+ guard(mutex)(&trace_types_lock);
tracer_options_updated = true;
- __update_tracer_options(tr);
- mutex_unlock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list)
+ __update_tracer_options(tr);
}
/* Must have trace_types_lock held */
@@ -9387,11 +8529,10 @@ struct trace_array *trace_array_find_get(const char *instance)
{
struct trace_array *tr;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
tr = trace_array_find(instance);
- if (tr)
- tr->ref++;
- mutex_unlock(&trace_types_lock);
+ if (tr && __trace_array_get(tr) < 0)
+ tr = NULL;
return tr;
}
@@ -9411,9 +8552,13 @@ static int trace_array_create_dir(struct trace_array *tr)
}
init_tracer_tracefs(tr, tr->dir);
- __update_tracer_options(tr);
-
- return ret;
+ ret = __update_tracer(tr);
+ if (ret) {
+ event_trace_del_tracer(tr);
+ tracefs_remove(tr->dir);
+ return ret;
+ }
+ return 0;
}
static struct trace_array *
@@ -9425,7 +8570,7 @@ trace_array_create_systems(const char *name, const char *systems,
int ret;
ret = -ENOMEM;
- tr = kzalloc(sizeof(*tr), GFP_KERNEL);
+ tr = kzalloc_obj(*tr);
if (!tr)
return ERR_PTR(ret);
@@ -9455,16 +8600,25 @@ trace_array_create_systems(const char *name, const char *systems,
raw_spin_lock_init(&tr->start_lock);
+ tr->syscall_buf_sz = global_trace.syscall_buf_sz;
+
tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
spin_lock_init(&tr->snapshot_trigger_lock);
#endif
tr->current_trace = &nop_trace;
+ tr->current_trace_flags = nop_trace.flags;
INIT_LIST_HEAD(&tr->systems);
INIT_LIST_HEAD(&tr->events);
INIT_LIST_HEAD(&tr->hist_vars);
INIT_LIST_HEAD(&tr->err_log);
+ INIT_LIST_HEAD(&tr->tracers);
+ INIT_LIST_HEAD(&tr->marker_list);
+
+#ifdef CONFIG_MODULES
+ INIT_LIST_HEAD(&tr->mod_events);
+#endif
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
@@ -9475,6 +8629,8 @@ trace_array_create_systems(const char *name, const char *systems,
if (ftrace_allocate_ftrace_ops(tr) < 0)
goto out_free_tr;
+ trace_array_init_autoremove(tr);
+
ftrace_init_trace_array(tr);
init_trace_flags_index(tr);
@@ -9498,6 +8654,7 @@ trace_array_create_systems(const char *name, const char *systems,
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree_const(tr->system_names);
+ kfree(tr->range_name);
kfree(tr->name);
kfree(tr);
@@ -9514,47 +8671,49 @@ static int instance_mkdir(const char *name)
struct trace_array *tr;
int ret;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
ret = -EEXIST;
if (trace_array_find(name))
- goto out_unlock;
+ return -EEXIST;
tr = trace_array_create(name);
ret = PTR_ERR_OR_ZERO(tr);
-out_unlock:
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
return ret;
}
-static u64 map_pages(u64 start, u64 size)
+#ifdef CONFIG_MMU
+static u64 map_pages(unsigned long start, unsigned long size)
{
- struct page **pages;
- phys_addr_t page_start;
- unsigned int page_count;
- unsigned int i;
- void *vaddr;
-
- page_count = DIV_ROUND_UP(size, PAGE_SIZE);
+ unsigned long vmap_start, vmap_end;
+ struct vm_struct *area;
+ int ret;
- page_start = start;
- pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
+ area = get_vm_area(size, VM_IOREMAP);
+ if (!area)
return 0;
- for (i = 0; i < page_count; i++) {
- phys_addr_t addr = page_start + i * PAGE_SIZE;
- pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
+ vmap_start = (unsigned long) area->addr;
+ vmap_end = vmap_start + size;
+
+ ret = vmap_page_range(vmap_start, vmap_end,
+ start, pgprot_nx(PAGE_KERNEL));
+ if (ret < 0) {
+ free_vm_area(area);
+ return 0;
}
- vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL);
- kfree(pages);
- return (u64)(unsigned long)vaddr;
+ return (u64)vmap_start;
+}
+#else
+static inline u64 map_pages(unsigned long start, unsigned long size)
+{
+ return 0;
}
+#endif
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
@@ -9577,24 +8736,25 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
{
struct trace_array *tr;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr->name && strcmp(tr->name, name) == 0)
- goto out_unlock;
+ if (tr->name && strcmp(tr->name, name) == 0) {
+ /* if this fails, @tr is going to be removed. */
+ if (__trace_array_get(tr) < 0)
+ tr = NULL;
+ return tr;
+ }
}
tr = trace_array_create_systems(name, systems, 0, 0);
if (IS_ERR(tr))
tr = NULL;
-out_unlock:
- if (tr)
+ else
tr->ref++;
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
return tr;
}
EXPORT_SYMBOL_GPL(trace_array_get_by_name);
@@ -9609,15 +8769,20 @@ static int __remove_instance(struct trace_array *tr)
list_del(&tr->list);
+ if (printk_trace == tr)
+ update_printk_trace(&global_trace);
+
+ /* Must be done before disabling all the flags */
+ if (update_marker_trace(tr, 0))
+ synchronize_rcu();
+
/* Disable all the flags that were enabled coming in */
for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
- if ((1 << i) & ZEROED_TRACE_FLAGS)
- set_tracer_flag(tr, 1 << i, 0);
+ if ((1ULL << i) & ZEROED_TRACE_FLAGS)
+ set_tracer_flag(tr, 1ULL << i, 0);
}
- if (printk_trace == tr)
- update_printk_trace(&global_trace);
-
+ trace_array_cancel_autoremove(tr);
tracing_set_nop(tr);
clear_ftrace_function_probes(tr);
event_trace_del_tracer(tr);
@@ -9627,6 +8792,14 @@ static int __remove_instance(struct trace_array *tr)
free_percpu(tr->last_func_repeats);
free_trace_buffers(tr);
clear_tracing_err_log(tr);
+ free_tracers(tr);
+
+ if (tr->range_name) {
+ reserve_mem_release_by_name(tr->range_name);
+ kfree(tr->range_name);
+ }
+ if (tr->flags & TRACE_ARRAY_FL_VMALLOC)
+ vfree((void *)tr->range_addr_start);
for (i = 0; i < tr->nr_topts; i++) {
kfree(tr->topts[i].topts);
@@ -9645,48 +8818,36 @@ static int __remove_instance(struct trace_array *tr)
int trace_array_destroy(struct trace_array *this_tr)
{
struct trace_array *tr;
- int ret;
if (!this_tr)
return -EINVAL;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
- ret = -ENODEV;
/* Making sure trace array exists before destroying it. */
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr == this_tr) {
- ret = __remove_instance(tr);
- break;
- }
+ if (tr == this_tr)
+ return __remove_instance(tr);
}
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
-
- return ret;
+ return -ENODEV;
}
EXPORT_SYMBOL_GPL(trace_array_destroy);
static int instance_rmdir(const char *name)
{
struct trace_array *tr;
- int ret;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
- ret = -ENODEV;
tr = trace_array_find(name);
- if (tr)
- ret = __remove_instance(tr);
-
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
+ if (!tr)
+ return -ENODEV;
- return ret;
+ return __remove_instance(tr);
}
static __init void create_trace_instances(struct dentry *d_tracer)
@@ -9699,35 +8860,37 @@ static __init void create_trace_instances(struct dentry *d_tracer)
if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n"))
return;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (!tr->name)
continue;
if (MEM_FAIL(trace_array_create_dir(tr) < 0,
"Failed to create instance directory\n"))
- break;
+ return;
}
-
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
}
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
+ umode_t writable_mode = TRACE_MODE_WRITE;
int cpu;
+ if (trace_array_is_readonly(tr))
+ writable_mode = TRACE_MODE_READ;
+
trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
- tr, &show_traces_fops);
+ tr, &show_traces_fops);
- trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer,
- tr, &set_tracer_fops);
+ trace_create_file("current_tracer", writable_mode, d_tracer,
+ tr, &set_tracer_fops);
- trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer,
+ trace_create_file("tracing_cpumask", writable_mode, d_tracer,
tr, &tracing_cpumask_fops);
+ /* Options are used for changing print-format even for readonly instance. */
trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_iter_fops);
@@ -9737,12 +8900,36 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer,
tr, &tracing_pipe_fops);
- trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer,
+ trace_create_file("buffer_size_kb", writable_mode, d_tracer,
tr, &tracing_entries_fops);
trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer,
tr, &tracing_total_entries_fops);
+ trace_create_file("trace_clock", writable_mode, d_tracer, tr,
+ &trace_clock_fops);
+
+ trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
+ &trace_time_stamp_mode_fops);
+
+ tr->buffer_percent = 50;
+
+ trace_create_file("buffer_subbuf_size_kb", writable_mode, d_tracer,
+ tr, &buffer_subbuf_size_fops);
+
+ create_trace_options_dir(tr);
+
+ if (tr->range_addr_start)
+ trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
+ tr, &last_boot_fops);
+
+ for_each_tracing_cpu(cpu)
+ tracing_init_tracefs_percpu(tr, cpu);
+
+ /* Read-only instance has above files only. */
+ if (trace_array_is_readonly(tr))
+ return;
+
trace_create_file("free_buffer", 0200, d_tracer,
tr, &tracing_free_buffer_fops);
@@ -9754,55 +8941,39 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker_raw", 0220, d_tracer,
tr, &tracing_mark_raw_fops);
- trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr,
- &trace_clock_fops);
-
- trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
- tr, &rb_simple_fops);
-
- trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
- &trace_time_stamp_mode_fops);
-
- tr->buffer_percent = 50;
-
trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
- tr, &buffer_percent_fops);
+ tr, &buffer_percent_fops);
- trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
- tr, &buffer_subbuf_size_fops);
+ trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer,
+ tr, &tracing_syscall_buf_fops);
- create_trace_options_dir(tr);
+ trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
+ tr, &rb_simple_fops);
-#ifdef CONFIG_TRACER_MAX_TRACE
trace_create_maxlat_file(tr, d_tracer);
-#endif
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
- if (tr->range_addr_start) {
- trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
- tr, &last_boot_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- } else {
+ if (!tr->range_addr_start)
trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
tr, &snapshot_fops);
#endif
- }
trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_err_log_fops);
- for_each_tracing_cpu(cpu)
- tracing_init_tracefs_percpu(tr, cpu);
-
ftrace_init_tracefs(tr, d_tracer);
}
+#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED
static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
struct vfsmount *mnt;
struct file_system_type *type;
+ struct fs_context *fc;
+ int ret;
/*
* To maintain backward compatibility for tools that mount
@@ -9812,14 +8983,24 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
type = get_fs_type("tracefs");
if (!type)
return NULL;
- mnt = vfs_submount(mntpt, type, "tracefs", NULL);
+
+ fc = fs_context_for_submount(type, mntpt);
put_filesystem(type);
- if (IS_ERR(mnt))
- return NULL;
- mntget(mnt);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n");
+
+ ret = vfs_parse_fs_string(fc, "source", "tracefs");
+ if (!ret)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(ret);
+ put_fs_context(fc);
return mnt;
}
+#endif
/**
* tracing_init_dentry - initialize top level trace array
@@ -9844,6 +9025,7 @@ int tracing_init_dentry(void)
if (WARN_ON(!tracefs_initialized()))
return -ENODEV;
+#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED
/*
* As there may still be users that expect the tracing
* files to exist in debugfs/tracing, we must automount
@@ -9852,6 +9034,7 @@ int tracing_init_dentry(void)
*/
tr->dir = debugfs_create_automount("tracing", NULL,
trace_automount, NULL);
+#endif
return 0;
}
@@ -9859,7 +9042,7 @@ int tracing_init_dentry(void)
extern struct trace_eval_map *__start_ftrace_eval_maps[];
extern struct trace_eval_map *__stop_ftrace_eval_maps[];
-static struct workqueue_struct *eval_map_wq __initdata;
+struct workqueue_struct *trace_init_wq __initdata;
static struct work_struct eval_map_work __initdata;
static struct work_struct tracerfs_init_work __initdata;
@@ -9868,22 +9051,22 @@ static void __init eval_map_work_func(struct work_struct *work)
int len;
len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;
- trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
+ trace_event_update_with_eval_map(NULL, __start_ftrace_eval_maps, len);
}
static int __init trace_eval_init(void)
{
INIT_WORK(&eval_map_work, eval_map_work_func);
- eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0);
- if (!eval_map_wq) {
- pr_err("Unable to allocate eval_map_wq\n");
+ trace_init_wq = alloc_workqueue("trace_init_wq", WQ_UNBOUND, 0);
+ if (!trace_init_wq) {
+ pr_err("Unable to allocate trace_init_wq\n");
/* Do work here */
eval_map_work_func(&eval_map_work);
return -ENOMEM;
}
- queue_work(eval_map_wq, &eval_map_work);
+ queue_work(trace_init_wq, &eval_map_work);
return 0;
}
@@ -9892,8 +9075,8 @@ subsys_initcall(trace_eval_init);
static int __init trace_eval_sync(void)
{
/* Make sure the eval map updates are finished */
- if (eval_map_wq)
- destroy_workqueue(eval_map_wq);
+ if (trace_init_wq)
+ destroy_workqueue(trace_init_wq);
return 0;
}
@@ -9901,11 +9084,26 @@ late_initcall_sync(trace_eval_sync);
#ifdef CONFIG_MODULES
-static void trace_module_add_evals(struct module *mod)
+
+bool module_exists(const char *module)
{
- if (!mod->num_trace_evals)
- return;
+ /* All modules have the symbol __this_module */
+ static const char this_mod[] = "__this_module";
+ char modname[MODULE_NAME_LEN + sizeof(this_mod) + 2];
+ unsigned long val;
+ int n;
+ n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod);
+
+ if (n > sizeof(modname) - 1)
+ return false;
+
+ val = module_kallsyms_lookup_name(modname);
+ return val != 0;
+}
+
+static void trace_module_add_evals(struct module *mod)
+{
/*
* Modules with bad taint do not have events created, do
* not bother with enums either.
@@ -9913,7 +9111,8 @@ static void trace_module_add_evals(struct module *mod)
if (trace_module_has_bad_taint(mod))
return;
- trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
+ /* Even if no trace_evals, this need to sanitize field types. */
+ trace_event_update_with_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
}
#ifdef CONFIG_TRACE_EVAL_MAP_FILE
@@ -9925,7 +9124,7 @@ static void trace_module_remove_evals(struct module *mod)
if (!mod->num_trace_evals)
return;
- mutex_lock(&trace_eval_mutex);
+ guard(mutex)(&trace_eval_mutex);
map = trace_eval_maps;
@@ -9937,17 +9136,33 @@ static void trace_module_remove_evals(struct module *mod)
map = map->tail.next;
}
if (!map)
- goto out;
+ return;
*last = trace_eval_jmp_to_tail(map)->tail.next;
kfree(map);
- out:
- mutex_unlock(&trace_eval_mutex);
}
#else
static inline void trace_module_remove_evals(struct module *mod) { }
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
+static void trace_module_record(struct module *mod, bool add)
+{
+ struct trace_array *tr;
+ unsigned long flags;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ flags = tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT);
+ /* Update any persistent trace array that has already been started */
+ if (flags == TRACE_ARRAY_FL_BOOT && add) {
+ guard(mutex)(&scratch_mutex);
+ save_mod(mod, tr);
+ } else if (flags & TRACE_ARRAY_FL_LAST_BOOT) {
+ /* Update delta if the module loaded in previous boot */
+ make_mod_delta(mod, tr);
+ }
+ }
+}
+
static int trace_module_notify(struct notifier_block *self,
unsigned long val, void *data)
{
@@ -9956,9 +9171,11 @@ static int trace_module_notify(struct notifier_block *self,
switch (val) {
case MODULE_STATE_COMING:
trace_module_add_evals(mod);
+ trace_module_record(mod, true);
break;
case MODULE_STATE_GOING:
trace_module_remove_evals(mod);
+ trace_module_record(mod, false);
break;
}
@@ -10007,7 +9224,7 @@ static __init void tracer_init_tracefs_work_func(struct work_struct *work)
create_trace_instances(NULL);
- update_tracer_options(&global_trace);
+ update_tracer_options();
}
static __init int tracer_init_tracefs(void)
@@ -10020,14 +9237,15 @@ static __init int tracer_init_tracefs(void)
if (ret)
return 0;
- if (eval_map_wq) {
+ if (trace_init_wq) {
INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func);
- queue_work(eval_map_wq, &tracerfs_init_work);
+ queue_work(trace_init_wq, &tracerfs_init_work);
} else {
tracer_init_tracefs_work_func(NULL);
}
- rv_init_interface();
+ if (rv_init_interface())
+ pr_err("RV: Error while creating the RV interface\n");
return 0;
}
@@ -10140,7 +9358,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
static struct trace_iterator iter;
unsigned int old_userobj;
unsigned long flags;
- int cnt = 0, cpu;
+ int cnt = 0;
/*
* Always turn off tracing when we dump.
@@ -10157,14 +9375,13 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
/* Simulate the iterator */
trace_init_iter(&iter, tr);
- for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
- }
+ /* While dumping, do not allow the buffer to be enable */
+ tracer_tracing_disable(tr);
- old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ;
+ old_userobj = tr->trace_flags & TRACE_ITER(SYM_USEROBJ);
/* don't look at user memory in panic mode */
- tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+ tr->trace_flags &= ~TRACE_ITER(SYM_USEROBJ);
if (dump_mode == DUMP_ORIG)
iter.cpu_file = raw_smp_processor_id();
@@ -10205,10 +9422,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
ret = print_trace_line(&iter);
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(&iter);
+
+ trace_printk_seq(&iter.seq);
}
touch_nmi_watchdog();
-
- trace_printk_seq(&iter.seq);
}
if (!cnt)
@@ -10218,9 +9435,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
tr->trace_flags |= old_userobj;
- for_each_tracing_cpu(cpu) {
- atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
- }
+ tracer_tracing_enable(tr);
local_irq_restore(flags);
}
@@ -10302,7 +9517,8 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos,
int (*createfn)(const char *))
{
- char *kbuf, *buf, *tmp;
+ char *kbuf __free(kfree) = NULL;
+ char *buf, *tmp;
int ret = 0;
size_t done = 0;
size_t size;
@@ -10317,10 +9533,9 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
if (size >= WRITE_BUFSIZE)
size = WRITE_BUFSIZE - 1;
- if (copy_from_user(kbuf, buffer + done, size)) {
- ret = -EFAULT;
- goto out;
- }
+ if (copy_from_user(kbuf, buffer + done, size))
+ return -EFAULT;
+
kbuf[size] = '\0';
buf = kbuf;
do {
@@ -10336,8 +9551,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
/* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
pr_warn("Line length is too long: Should be less than %d\n",
WRITE_BUFSIZE - 2);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
}
done += size;
@@ -10350,63 +9564,54 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
ret = createfn(buf);
if (ret)
- goto out;
+ return ret;
buf += size;
} while (done < count);
}
- ret = done;
-
-out:
- kfree(kbuf);
-
- return ret;
+ return done;
}
-#ifdef CONFIG_TRACER_MAX_TRACE
-__init static bool tr_needs_alloc_snapshot(const char *name)
+__init static int backup_instance_area(const char *backup,
+ unsigned long *addr, phys_addr_t *size)
{
- char *test;
- int len = strlen(name);
- bool ret;
+ struct trace_array *backup_tr;
+ void *allocated_vaddr = NULL;
- if (!boot_snapshot_index)
- return false;
+ backup_tr = trace_array_get_by_name(backup, NULL);
+ if (!backup_tr) {
+ pr_warn("Tracing: Instance %s is not found.\n", backup);
+ return -ENOENT;
+ }
- if (strncmp(name, boot_snapshot_info, len) == 0 &&
- boot_snapshot_info[len] == '\t')
- return true;
+ if (!(backup_tr->flags & TRACE_ARRAY_FL_BOOT)) {
+ pr_warn("Tracing: Instance %s is not boot mapped.\n", backup);
+ trace_array_put(backup_tr);
+ return -EINVAL;
+ }
- test = kmalloc(strlen(name) + 3, GFP_KERNEL);
- if (!test)
- return false;
+ *size = backup_tr->range_addr_size;
- sprintf(test, "\t%s\t", name);
- ret = strstr(boot_snapshot_info, test) == NULL;
- kfree(test);
- return ret;
-}
+ allocated_vaddr = vzalloc(*size);
+ if (!allocated_vaddr) {
+ pr_warn("Tracing: Failed to allocate memory for copying instance %s (size 0x%lx)\n",
+ backup, (unsigned long)*size);
+ trace_array_put(backup_tr);
+ return -ENOMEM;
+ }
-__init static void do_allocate_snapshot(const char *name)
-{
- if (!tr_needs_alloc_snapshot(name))
- return;
+ memcpy(allocated_vaddr,
+ (void *)backup_tr->range_addr_start, (size_t)*size);
+ *addr = (unsigned long)allocated_vaddr;
- /*
- * When allocate_snapshot is set, the next call to
- * allocate_trace_buffers() (called by trace_array_get_by_name())
- * will allocate the snapshot buffer. That will alse clear
- * this flag.
- */
- allocate_snapshot = true;
+ trace_array_put(backup_tr);
+ return 0;
}
-#else
-static inline void do_allocate_snapshot(const char *name) { }
-#endif
__init static void enable_instances(void)
{
struct trace_array *tr;
+ bool memmap_area = false;
char *curr_str;
char *name;
char *str;
@@ -10424,11 +9629,16 @@ __init static void enable_instances(void)
bool traceoff = false;
char *flag_delim;
char *addr_delim;
+ char *rname __free(kfree) = NULL;
+ char *backup;
tok = strsep(&curr_str, ",");
- flag_delim = strchr(tok, '^');
- addr_delim = strchr(tok, '@');
+ name = strsep(&tok, "=");
+ backup = tok;
+
+ flag_delim = strchr(name, '^');
+ addr_delim = strchr(name, '@');
if (addr_delim)
*addr_delim++ = '\0';
@@ -10436,7 +9646,10 @@ __init static void enable_instances(void)
if (flag_delim)
*flag_delim++ = '\0';
- name = tok;
+ if (backup) {
+ if (backup_instance_area(backup, &addr, &size) < 0)
+ continue;
+ }
if (flag_delim) {
char *flag;
@@ -10474,16 +9687,31 @@ __init static void enable_instances(void)
name);
continue;
}
+ memmap_area = true;
} else if (tok) {
if (!reserve_mem_find_by_name(tok, &start, &size)) {
start = 0;
pr_warn("Failed to map boot instance %s to %s\n", name, tok);
continue;
}
+ rname = kstrdup(tok, GFP_KERNEL);
}
if (start) {
- addr = map_pages(start, size);
+ /* Start and size must be page aligned */
+ if (start & ~PAGE_MASK) {
+ pr_warn("Tracing: mapping start addr %pa is not page aligned\n", &start);
+ continue;
+ }
+ if (size & ~PAGE_MASK) {
+ pr_warn("Tracing: mapping size %pa is not page aligned\n", &size);
+ continue;
+ }
+
+ if (memmap_area)
+ addr = map_pages(start, size);
+ else
+ addr = (unsigned long)phys_to_virt(start);
if (addr) {
pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n",
name, &start, (unsigned long)size);
@@ -10493,8 +9721,7 @@ __init static void enable_instances(void)
}
} else {
/* Only non mapped buffers have snapshot buffers */
- if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
- do_allocate_snapshot(name);
+ do_allocate_snapshot(name);
}
tr = trace_array_create_systems(name, NULL, addr, size);
@@ -10510,24 +9737,57 @@ __init static void enable_instances(void)
update_printk_trace(tr);
/*
- * If start is set, then this is a mapped buffer, and
- * cannot be deleted by user space, so keep the reference
- * to it.
+ * memmap'd buffers can not be freed.
*/
- if (start) {
- tr->flags |= TRACE_ARRAY_FL_BOOT;
+ if (memmap_area) {
+ tr->flags |= TRACE_ARRAY_FL_MEMMAP;
tr->ref++;
}
+ /*
+ * Backup buffers can be freed but need vfree().
+ */
+ if (backup) {
+ tr->flags |= TRACE_ARRAY_FL_VMALLOC | TRACE_ARRAY_FL_RDONLY;
+ trace_array_start_autoremove();
+ }
+
+ if (start || backup) {
+ tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
+ tr->range_name = no_free_ptr(rname);
+ }
+
+ /*
+ * Save the events to start and enabled them after all boot instances
+ * have been created.
+ */
+ tr->boot_events = curr_str;
+ }
+
+ /* Enable the events after all boot instances have been created */
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+
+ if (!tr->boot_events || !(*tr->boot_events)) {
+ tr->boot_events = NULL;
+ continue;
+ }
+
+ curr_str = tr->boot_events;
+
+ /* Clear the instance if this is a persistent buffer */
+ if (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)
+ update_last_data(tr);
+
while ((tok = strsep(&curr_str, ","))) {
early_enable_events(tr, tok, true);
}
+ tr->boot_events = NULL;
}
}
__init static int tracer_alloc_buffers(void)
{
- int ring_buf_size;
+ unsigned long ring_buf_size;
int ret = -ENOMEM;
@@ -10543,7 +9803,7 @@ __init static int tracer_alloc_buffers(void)
BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE);
if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
- goto out;
+ return -ENOMEM;
if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL))
goto out_free_buffer_mask;
@@ -10608,19 +9868,21 @@ __init static int tracer_alloc_buffers(void)
* just a bootstrap of current_trace anyway.
*/
global_trace.current_trace = &nop_trace;
+ global_trace.current_trace_flags = nop_trace.flags;
global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
spin_lock_init(&global_trace.snapshot_trigger_lock);
#endif
ftrace_init_global_array_ops(&global_trace);
- init_trace_flags_index(&global_trace);
+#ifdef CONFIG_MODULES
+ INIT_LIST_HEAD(&global_trace.mod_events);
+#endif
- register_tracer(&nop_trace);
+ init_trace_flags_index(&global_trace);
- /* Function tracing may start here (via kernel command line) */
- init_function_trace();
+ INIT_LIST_HEAD(&global_trace.tracers);
/* All seems OK, enable tracing */
tracing_disabled = 0;
@@ -10632,12 +9894,20 @@ __init static int tracer_alloc_buffers(void)
global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
+ global_trace.syscall_buf_sz = syscall_buf_size;
+
INIT_LIST_HEAD(&global_trace.systems);
INIT_LIST_HEAD(&global_trace.events);
INIT_LIST_HEAD(&global_trace.hist_vars);
INIT_LIST_HEAD(&global_trace.err_log);
+ list_add(&global_trace.marker_list, &marker_copies);
list_add(&global_trace.list, &ftrace_trace_arrays);
+ register_tracer(&nop_trace);
+
+ /* Function tracing may start here (via kernel command line) */
+ init_function_trace();
+
apply_trace_boot_options();
register_snapshot_cmd();
@@ -10656,33 +9926,21 @@ out_free_cpumask:
free_cpumask_var(global_trace.tracing_cpumask);
out_free_buffer_mask:
free_cpumask_var(tracing_buffer_mask);
-out:
return ret;
}
-void __init ftrace_boot_snapshot(void)
+#ifdef CONFIG_FUNCTION_TRACER
+/* Used to set module cached ftrace filtering at boot up */
+struct trace_array *trace_get_global_array(void)
{
-#ifdef CONFIG_TRACER_MAX_TRACE
- struct trace_array *tr;
-
- if (!snapshot_at_boot)
- return;
-
- list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (!tr->allocated_snapshot)
- continue;
-
- tracing_snapshot_instance(tr);
- trace_array_puts(tr, "** Boot snapshot taken **\n");
- }
-#endif
+ return &global_trace;
}
+#endif
void __init early_trace_init(void)
{
if (tracepoint_printk) {
- tracepoint_print_iter =
- kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
+ tracepoint_print_iter = kzalloc_obj(*tracepoint_print_iter);
if (MEM_FAIL(!tracepoint_print_iter,
"Failed to allocate trace iterator\n"))
tracepoint_printk = 0;
@@ -10748,6 +10006,9 @@ __init static int late_trace_init(void)
tracepoint_printk = 0;
}
+ if (traceoff_after_boot)
+ tracing_off();
+
tracing_set_default_clock();
clear_boot_tracer();
return 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9691b47b5f3d..80fe152af1dd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -21,6 +21,8 @@
#include <linux/workqueue.h>
#include <linux/ctype.h>
#include <linux/once_lite.h>
+#include <linux/ftrace_regs.h>
+#include <linux/llist.h>
#include "pid_list.h"
@@ -66,14 +68,17 @@ enum trace_type {
#undef __field_fn
#define __field_fn(type, item) type item;
+#undef __field_packed
+#define __field_packed(type, item) type item;
+
#undef __field_struct
#define __field_struct(type, item) __field(type, item)
#undef __field_desc
#define __field_desc(type, container, item)
-#undef __field_packed
-#define __field_packed(type, container, item)
+#undef __field_desc_packed
+#define __field_desc_packed(type, container, item)
#undef __array
#define __array(type, item, size) type item[size];
@@ -126,10 +131,12 @@ enum trace_type {
#define FAULT_STRING "(fault)"
-#define HIST_STACKTRACE_DEPTH 16
+#define HIST_STACKTRACE_DEPTH 31
#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
#define HIST_STACKTRACE_SKIP 5
+#define SYSCALL_FAULT_USER_MAX 165
+
/*
* syscalls are special, and need special handling, this is why
* they are not included in trace_entries.h
@@ -182,8 +189,7 @@ struct trace_array;
* the trace, etc.)
*/
struct trace_array_cpu {
- atomic_t disabled;
- void *buffer_page; /* ring buffer spare */
+ local_t disabled;
unsigned long entries;
unsigned long saved_latency;
@@ -216,7 +222,7 @@ struct array_buffer {
int cpu;
};
-#define TRACE_FLAGS_MAX_SIZE 32
+#define TRACE_FLAGS_MAX_SIZE 64
struct trace_options {
struct tracer *tracer;
@@ -258,6 +264,7 @@ static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_li
typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data);
+#ifdef CONFIG_TRACER_SNAPSHOT
/**
* struct cond_snapshot - conditional snapshot data and callback
*
@@ -300,6 +307,7 @@ struct cond_snapshot {
void *cond_data;
cond_update_fn_t update;
};
+#endif /* CONFIG_TRACER_SNAPSHOT */
/*
* struct trace_func_repeats - used to keep track of the consecutive
@@ -312,6 +320,11 @@ struct trace_func_repeats {
u64 ts_last_call;
};
+struct trace_module_delta {
+ struct rcu_head rcu;
+ long delta[];
+};
+
/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
@@ -321,35 +334,44 @@ struct trace_array {
struct list_head list;
char *name;
struct array_buffer array_buffer;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
/*
- * The max_buffer is used to snapshot the trace when a maximum
+ * The snapshot_buffer is used to snapshot the trace when a maximum
* latency is reached, or when the user initiates a snapshot.
* Some tracers will use this to store a maximum trace while
* it continues examining live traces.
*
- * The buffers for the max_buffer are set up the same as the array_buffer
- * When a snapshot is taken, the buffer of the max_buffer is swapped
- * with the buffer of the array_buffer and the buffers are reset for
- * the array_buffer so the tracing can continue.
+ * The buffers for the snapshot_buffer are set up the same as the
+ * array_buffer. When a snapshot is taken, the buffer of the
+ * snapshot_buffer is swapped with the buffer of the array_buffer
+ * and the buffers are reset for the array_buffer so the tracing can
+ * continue.
*/
- struct array_buffer max_buffer;
+ struct array_buffer snapshot_buffer;
bool allocated_snapshot;
spinlock_t snapshot_trigger_lock;
unsigned int snapshot;
+#ifdef CONFIG_TRACER_MAX_TRACE
unsigned long max_latency;
-#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
+#ifdef CONFIG_FSNOTIFY
struct work_struct fsnotify_work;
struct irq_work fsnotify_irqwork;
-#endif
-#endif
+#endif /* CONFIG_FSNOTIFY */
+#endif /* CONFIG_TRACER_MAX_TRACE */
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
/* The below is for memory mapped ring buffer */
unsigned int mapped;
unsigned long range_addr_start;
unsigned long range_addr_size;
+ char *range_name;
long text_delta;
- long data_delta;
+ struct trace_module_delta *module_delta;
+ void *scratch; /* pointer in persistent memory */
+ int scratch_size;
+
+ int buffer_disabled;
struct trace_pid_list __rcu *filtered_pids;
struct trace_pid_list __rcu *filtered_no_pids;
@@ -364,15 +386,14 @@ struct trace_array {
*
* It is also used in other places outside the update_max_tr
* so it needs to be defined outside of the
- * CONFIG_TRACER_MAX_TRACE.
+ * CONFIG_TRACER_SNAPSHOT.
*/
arch_spinlock_t max_lock;
- int buffer_disabled;
#ifdef CONFIG_FTRACE_SYSCALLS
int sys_refcount_enter;
int sys_refcount_exit;
- struct trace_event_file __rcu *enter_syscall_files[NR_syscalls];
- struct trace_event_file __rcu *exit_syscall_files[NR_syscalls];
+ struct trace_event_file *enter_syscall_files[NR_syscalls];
+ struct trace_event_file *exit_syscall_files[NR_syscalls];
#endif
int stop_count;
int clock_id;
@@ -381,11 +402,15 @@ struct trace_array {
int buffer_percent;
unsigned int n_err_log_entries;
struct tracer *current_trace;
- unsigned int trace_flags;
+ struct tracer_flags *current_trace_flags;
+ u64 trace_flags;
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
- const char *system_names;
+ union {
+ const char *system_names;
+ char *boot_events;
+ };
struct list_head err_log;
struct dentry *dir;
struct dentry *options;
@@ -394,12 +419,17 @@ struct trace_array {
struct trace_options *topts;
struct list_head systems;
struct list_head events;
+ struct list_head marker_list;
+ struct list_head tracers;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
/* one per_cpu trace_pipe can be opened by only one user */
cpumask_var_t pipe_cpumask;
int ref;
int trace_ref;
+#ifdef CONFIG_MODULES
+ struct list_head mod_events;
+#endif
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
@@ -417,6 +447,7 @@ struct trace_array {
int function_enabled;
#endif
int no_filter_buffering_ref;
+ unsigned int syscall_buf_sz;
struct list_head hist_vars;
#ifdef CONFIG_TRACER_SNAPSHOT
struct cond_snapshot *cond_snapshot;
@@ -427,13 +458,33 @@ struct trace_array {
* we do not waste memory on systems that are not using tracing.
*/
bool ring_buffer_expanded;
+ /*
+ * If the ring buffer is a read only backup instance, it will be
+ * removed after dumping all data via pipe, because no readable data.
+ */
+ bool free_on_close;
+ struct work_struct autoremove_work;
};
enum {
- TRACE_ARRAY_FL_GLOBAL = BIT(0),
- TRACE_ARRAY_FL_BOOT = BIT(1),
+ TRACE_ARRAY_FL_GLOBAL = BIT(0),
+ TRACE_ARRAY_FL_BOOT = BIT(1),
+ TRACE_ARRAY_FL_LAST_BOOT = BIT(2),
+ TRACE_ARRAY_FL_MOD_INIT = BIT(3),
+ TRACE_ARRAY_FL_MEMMAP = BIT(4),
+ TRACE_ARRAY_FL_VMALLOC = BIT(5),
+ TRACE_ARRAY_FL_RDONLY = BIT(6),
};
+#ifdef CONFIG_MODULES
+bool module_exists(const char *module);
+#else
+static inline bool module_exists(const char *module)
+{
+ return false;
+}
+#endif
+
extern struct list_head ftrace_trace_arrays;
extern struct mutex trace_types_lock;
@@ -444,11 +495,20 @@ extern struct trace_array *trace_array_find(const char *instance);
extern struct trace_array *trace_array_find_get(const char *instance);
extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe);
-extern int tracing_set_filter_buffering(struct trace_array *tr, bool set);
extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
extern bool trace_clock_in_ns(struct trace_array *tr);
+extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr);
+
+extern struct trace_array *printk_trace;
+
+static inline bool trace_array_is_readonly(struct trace_array *tr)
+{
+ /* backup instance is read only. */
+ return tr->flags & TRACE_ARRAY_FL_RDONLY;
+}
+
/*
* The global tracer (top) should be the first trace array added,
* but we check the flag anyway.
@@ -604,9 +664,10 @@ struct tracer {
u32 old_flags, u32 bit, int set);
/* Return 0 if OK with change, else return non-zero */
int (*flag_changed)(struct trace_array *tr,
- u32 mask, int set);
+ u64 mask, int set);
struct tracer *next;
struct tracer_flags *flags;
+ struct tracer_flags *default_flags;
int enabled;
bool print_max;
bool allow_instances;
@@ -623,6 +684,8 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL;
}
+extern int tracing_disabled;
+
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
void tracing_reset_online_cpus(struct array_buffer *buf);
@@ -630,19 +693,78 @@ void tracing_reset_all_online_cpus(void);
void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
+int tracing_release(struct inode *inode, struct file *file);
int tracing_release_generic_tr(struct inode *inode, struct file *file);
int tracing_open_file_tr(struct inode *inode, struct file *filp);
int tracing_release_file_tr(struct inode *inode, struct file *filp);
int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
-bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
void tracer_tracing_off(struct trace_array *tr);
+void tracer_tracing_disable(struct trace_array *tr);
+void tracer_tracing_enable(struct trace_array *tr);
+int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size);
struct dentry *trace_create_file(const char *name,
umode_t mode,
struct dentry *parent,
void *data,
const struct file_operations *fops);
+struct dentry *trace_create_cpu_file(const char *name,
+ umode_t mode,
+ struct dentry *parent,
+ void *data,
+ long cpu,
+ const struct file_operations *fops);
+
+struct trace_iterator *__tracing_open(struct inode *inode, struct file *file,
+ bool snapshot);
+int tracing_buffers_open(struct inode *inode, struct file *filp);
+ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos);
+int tracing_buffers_release(struct inode *inode, struct file *file);
+ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len, unsigned int flags);
+
+ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
+ size_t cnt, loff_t *ppos);
+ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
+ size_t cnt, loff_t *ppos);
+
+void trace_set_buffer_entries(struct array_buffer *buf, unsigned long val);
+
+/*
+ * Should be used after trace_array_get(), trace_types_lock
+ * ensures that i_cdev was already initialized.
+ */
+static inline int tracing_get_cpu(struct inode *inode)
+{
+ if (inode->i_cdev) /* See trace_create_cpu_file() */
+ return (long)inode->i_cdev - 1;
+ return RING_BUFFER_ALL_CPUS;
+}
+void tracing_reset_cpu(struct array_buffer *buf, int cpu);
+
+struct ftrace_buffer_info {
+ struct trace_iterator iter;
+ void *spare;
+ unsigned int spare_cpu;
+ unsigned int spare_size;
+ unsigned int read;
+};
+
+/**
+ * tracer_tracing_is_on_cpu - show real state of ring buffer enabled on for a cpu
+ * @tr : the trace array to know if ring buffer is enabled
+ * @cpu: The cpu buffer to check if enabled
+ *
+ * Shows real state of the per CPU buffer if it is enabled or not.
+ */
+static inline bool tracer_tracing_is_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (tr->array_buffer.buffer)
+ return ring_buffer_record_is_on_cpu(tr->array_buffer.buffer, cpu);
+ return false;
+}
int tracing_init_dentry(void);
@@ -684,7 +806,8 @@ unsigned long trace_total_entries(struct trace_array *tr);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
- unsigned int trace_ctx);
+ unsigned int trace_ctx,
+ struct ftrace_regs *regs);
void trace_graph_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
@@ -693,8 +816,10 @@ void trace_latency_header(struct seq_file *m);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
-void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops);
-int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops);
+void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
+int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@@ -714,11 +839,10 @@ extern cpumask_var_t __read_mostly tracing_buffer_mask;
extern unsigned long nsecs_to_usecs(unsigned long nsecs);
extern unsigned long tracing_thresh;
+extern struct workqueue_struct *trace_init_wq __initdata;
/* PID filtering */
-extern int pid_max;
-
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
@@ -734,16 +858,16 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
struct trace_pid_list **new_pid_list,
const char __user *ubuf, size_t cnt);
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
void *cond_data);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
-#ifdef CONFIG_FSNOTIFY
-#define LATENCY_FS_NOTIFY
+#if defined(CONFIG_TRACER_MAX_TRACE) && defined(CONFIG_FSNOTIFY)
+# define LATENCY_FS_NOTIFY
#endif
-#endif /* CONFIG_TRACER_MAX_TRACE */
+#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef LATENCY_FS_NOTIFY
void latency_fsnotify(struct trace_array *tr);
@@ -760,6 +884,22 @@ static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
}
#endif /* CONFIG_STACKTRACE */
+#ifdef CONFIG_TRACER_MAX_TRACE
+static inline bool tracer_uses_snapshot(struct tracer *tracer)
+{
+ return tracer->use_max_tr;
+}
+void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer);
+#else
+static inline bool tracer_uses_snapshot(struct tracer *tracer)
+{
+ return false;
+}
+static inline void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer) { }
+#endif
+
void trace_last_func_repeats(struct trace_array *tr,
struct trace_func_repeats *last_info,
unsigned int trace_ctx);
@@ -770,6 +910,8 @@ extern void trace_find_cmdline(int pid, char comm[]);
extern int trace_find_tgid(int pid);
extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
+extern int trace_events_enabled(struct trace_array *tr, const char *system);
+
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
extern unsigned long ftrace_number_of_pages;
@@ -785,6 +927,8 @@ extern int DYN_FTRACE_TEST_NAME(void);
#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
extern int DYN_FTRACE_TEST_NAME2(void);
+void __init trace_append_boot_param(char *buf, const char *str,
+ char sep, int size);
extern void trace_set_ring_buffer_expanded(struct trace_array *tr);
extern bool tracing_selftest_disabled;
@@ -807,6 +951,7 @@ extern int trace_selftest_startup_nop(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
+extern bool __read_mostly tracing_selftest_running;
/*
* Tracer data references selftest functions that only occur
* on boot up. These can be __init functions. Thus, when selftests
@@ -819,17 +964,20 @@ static inline void __init disable_tracing_selftest(const char *reason)
}
/* Tracers are seldom changed. Optimize when selftests are disabled. */
#define __tracer_data __read_mostly
+#define tracing_selftest_running 0
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
extern unsigned long long ns2usecs(u64 nsec);
-extern int
-trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
-extern int
-trace_vprintk(unsigned long ip, const char *fmt, va_list args);
-extern int
-trace_array_vprintk(struct trace_array *tr,
- unsigned long ip, const char *fmt, va_list args);
+
+__printf(2, 0)
+int trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
+__printf(2, 0)
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
+__printf(3, 0)
+int trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args);
+__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...);
void trace_printk_seq(struct trace_seq *s);
@@ -884,11 +1032,10 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
#define TRACE_GRAPH_PRINT_RETVAL 0x800
#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000
#define TRACE_GRAPH_PRINT_RETADDR 0x2000
+#define TRACE_GRAPH_ARGS 0x4000
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
-extern void ftrace_graph_sleep_time_control(bool enable);
-
#ifdef CONFIG_FUNCTION_PROFILER
extern void ftrace_graph_graph_time_control(bool enable);
#else
@@ -908,10 +1055,13 @@ extern int __trace_graph_entry(struct trace_array *tr,
extern int __trace_graph_retaddr_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned int trace_ctx,
- unsigned long retaddr);
+ unsigned long retaddr,
+ struct ftrace_regs *fregs);
extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
- unsigned int trace_ctx);
+ unsigned int trace_ctx,
+ u64 calltime, u64 rettime);
+
extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
extern void free_fgraph_ops(struct trace_array *tr);
@@ -1057,7 +1207,8 @@ static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftra
#endif /* CONFIG_DYNAMIC_FTRACE */
extern unsigned int fgraph_max_depth;
-extern bool fgraph_sleep_time;
+extern int fgraph_no_sleep_time;
+extern bool fprofile_no_sleep_time;
static inline bool
ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace)
@@ -1102,11 +1253,6 @@ struct ftrace_func_command {
char *params, int enable);
};
extern bool ftrace_filter_param __initdata;
-static inline int ftrace_trace_task(struct trace_array *tr)
-{
- return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) !=
- FTRACE_PID_IGNORE;
-}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
struct dentry *parent);
@@ -1114,6 +1260,7 @@ void ftrace_destroy_function_files(struct trace_array *tr);
int ftrace_allocate_ftrace_ops(struct trace_array *tr);
void ftrace_free_ftrace_ops(struct trace_array *tr);
void ftrace_init_global_array_ops(struct trace_array *tr);
+struct trace_array *trace_get_global_array(void);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
@@ -1123,10 +1270,6 @@ void ftrace_clear_pids(struct trace_array *tr);
int init_function_trace(void);
void ftrace_pid_follow_fork(struct trace_array *tr, bool enable);
#else
-static inline int ftrace_trace_task(struct trace_array *tr)
-{
- return 1;
-}
static inline int ftrace_is_dead(void) { return 0; }
static inline int
ftrace_create_function_files(struct trace_array *tr,
@@ -1239,6 +1382,7 @@ bool ftrace_event_is_function(struct trace_event_call *call);
*/
struct trace_parser {
bool cont;
+ bool fail;
char *buffer;
unsigned idx;
unsigned size;
@@ -1246,7 +1390,7 @@ struct trace_parser {
static inline bool trace_parser_loaded(struct trace_parser *parser)
{
- return (parser->idx != 0);
+ return !parser->fail && parser->idx != 0;
}
static inline bool trace_parser_cont(struct trace_parser *parser)
@@ -1260,6 +1404,11 @@ static inline void trace_parser_clear(struct trace_parser *parser)
parser->idx = 0;
}
+static inline void trace_parser_fail(struct trace_parser *parser)
+{
+ parser->fail = true;
+}
+
extern int trace_parser_get_init(struct trace_parser *parser, int size);
extern void trace_parser_put(struct trace_parser *parser);
extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
@@ -1286,11 +1435,11 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
# define FUNCTION_FLAGS \
C(FUNCTION, "function-trace"), \
C(FUNC_FORK, "function-fork"),
-# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION
+# define FUNCTION_DEFAULT_FLAGS TRACE_ITER(FUNCTION)
#else
# define FUNCTION_FLAGS
# define FUNCTION_DEFAULT_FLAGS 0UL
-# define TRACE_ITER_FUNC_FORK 0UL
+# define TRACE_ITER_FUNC_FORK_BIT -1
#endif
#ifdef CONFIG_STACKTRACE
@@ -1300,6 +1449,24 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
# define STACK_FLAGS
#endif
+#ifdef CONFIG_FUNCTION_PROFILER
+# define PROFILER_FLAGS \
+ C(PROF_TEXT_OFFSET, "prof-text-offset"),
+# ifdef CONFIG_FUNCTION_GRAPH_TRACER
+# define FPROFILE_FLAGS \
+ C(GRAPH_TIME, "graph-time"),
+# define FPROFILE_DEFAULT_FLAGS TRACE_ITER(GRAPH_TIME)
+# else
+# define FPROFILE_FLAGS
+# define FPROFILE_DEFAULT_FLAGS 0UL
+# endif
+#else
+# define PROFILER_FLAGS
+# define FPROFILE_FLAGS
+# define FPROFILE_DEFAULT_FLAGS 0UL
+# define TRACE_ITER_PROF_TEXT_OFFSET_BIT -1
+#endif
+
/*
* trace_iterator_flags is an enumeration that defines bit
* positions into trace_flags that controls the output.
@@ -1332,12 +1499,16 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(MARKERS, "markers"), \
C(EVENT_FORK, "event-fork"), \
C(TRACE_PRINTK, "trace_printk_dest"), \
+ C(COPY_MARKER, "copy_trace_marker"), \
C(PAUSE_ON_TRACE, "pause-on-trace"), \
C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
+ C(BITMASK_LIST, "bitmask-list"), \
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
- BRANCH_FLAGS
+ BRANCH_FLAGS \
+ PROFILER_FLAGS \
+ FPROFILE_FLAGS
/*
* By defining C, we can make TRACE_FLAGS a list of bit names
@@ -1353,20 +1524,17 @@ enum trace_iterator_bits {
};
/*
- * By redefining C, we can make TRACE_FLAGS a list of masks that
- * use the bits as defined above.
+ * And use TRACE_ITER(flag) to define the bit masks.
*/
-#undef C
-#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT)
-
-enum trace_iterator_flags { TRACE_FLAGS };
+#define TRACE_ITER(flag) \
+ (TRACE_ITER_##flag##_BIT < 0 ? 0 : 1ULL << (TRACE_ITER_##flag##_BIT))
/*
* TRACE_ITER_SYM_MASK masks the options in trace_flags that
* control the output of kernel symbols.
*/
#define TRACE_ITER_SYM_MASK \
- (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+ (TRACE_ITER(PRINT_PARENT)|TRACE_ITER(SYM_OFFSET)|TRACE_ITER(SYM_ADDR))
extern struct tracer nop_trace;
@@ -1375,7 +1543,7 @@ extern int enable_branch_tracing(struct trace_array *tr);
extern void disable_branch_tracing(void);
static inline int trace_branch_enable(struct trace_array *tr)
{
- if (tr->trace_flags & TRACE_ITER_BRANCH)
+ if (tr->trace_flags & TRACE_ITER(BRANCH))
return enable_branch_tracing(tr);
return 0;
}
@@ -1471,6 +1639,64 @@ void trace_buffered_event_enable(void);
void early_enable_events(struct trace_array *tr, char *buf, bool disable_first);
+struct trace_user_buf;
+struct trace_user_buf_info {
+ struct trace_user_buf __percpu *tbuf;
+ size_t size;
+ int ref;
+};
+
+typedef int (*trace_user_buf_copy)(char *dst, const char __user *src,
+ size_t size, void *data);
+int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size);
+int trace_user_fault_get(struct trace_user_buf_info *tinfo);
+int trace_user_fault_put(struct trace_user_buf_info *tinfo);
+void trace_user_fault_destroy(struct trace_user_buf_info *tinfo);
+char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
+ const char __user *ptr, size_t size,
+ trace_user_buf_copy copy_func, void *data);
+
+static __always_inline void
+trace_event_setup(struct ring_buffer_event *event,
+ int type, unsigned int trace_ctx)
+{
+ struct trace_entry *ent = ring_buffer_event_data(event);
+
+ tracing_generic_entry_update(ent, type, trace_ctx);
+}
+
+static __always_inline struct ring_buffer_event *
+__trace_buffer_lock_reserve(struct trace_buffer *buffer,
+ int type,
+ unsigned long len,
+ unsigned int trace_ctx)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_lock_reserve(buffer, len);
+ if (event != NULL)
+ trace_event_setup(event, type, trace_ctx);
+
+ return event;
+}
+
+static __always_inline void
+__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event)
+{
+ __this_cpu_write(trace_taskinfo_save, true);
+
+ /* If this is the temp buffer, we need to commit fully */
+ if (this_cpu_read(trace_buffered_event) == event) {
+ /* Length is in event->array[0] */
+ ring_buffer_write(buffer, event->array[0], &event->array[1]);
+ /* Release the temp buffer */
+ this_cpu_dec(trace_buffered_event_cnt);
+ /* ring_buffer_unlock_commit() enables preemption */
+ preempt_enable_notrace();
+ } else
+ ring_buffer_unlock_commit(buffer);
+}
+
static inline void
__trace_event_discard_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
@@ -1643,11 +1869,6 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr,
const char *system,
const char *event);
-static inline void *event_file_data(struct file *filp)
-{
- return READ_ONCE(file_inode(filp)->i_private);
-}
-
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
@@ -1668,12 +1889,22 @@ static inline struct trace_event_file *event_file_file(struct file *filp)
struct trace_event_file *file;
lockdep_assert_held(&event_mutex);
- file = READ_ONCE(file_inode(filp)->i_private);
+ file = file_inode(filp)->i_private;
if (!file || file->flags & EVENT_FILE_FL_FREED)
return NULL;
return file;
}
+static inline void *event_file_data(struct file *filp)
+{
+ struct trace_event_file *file;
+
+ lockdep_assert_held(&event_mutex);
+ file = file_inode(filp)->i_private;
+ WARN_ON(!file || file->flags & EVENT_FILE_FL_FREED);
+ return file;
+}
+
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
extern const struct file_operations event_hist_debug_fops;
@@ -1692,13 +1923,13 @@ extern void clear_event_triggers(struct trace_array *tr);
enum {
EVENT_TRIGGER_FL_PROBE = BIT(0),
+ EVENT_TRIGGER_FL_COUNT = BIT(1),
};
struct event_trigger_data {
unsigned long count;
int ref;
int flags;
- struct event_trigger_ops *ops;
struct event_command *cmd_ops;
struct event_filter __rcu *filter;
char *filter_str;
@@ -1709,6 +1940,7 @@ struct event_trigger_data {
char *name;
struct list_head named_list;
struct event_trigger_data *named_data;
+ struct llist_node llist;
};
/* Avoid typos */
@@ -1723,6 +1955,10 @@ struct enable_trigger_data {
bool hist;
};
+bool event_trigger_count(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *event);
+
extern int event_enable_trigger_print(struct seq_file *m,
struct event_trigger_data *data);
extern void event_enable_trigger_free(struct event_trigger_data *data);
@@ -1736,6 +1972,9 @@ extern int event_enable_register_trigger(char *glob,
extern void event_enable_unregister_trigger(char *glob,
struct event_trigger_data *test,
struct trace_event_file *file);
+extern struct event_trigger_data *
+trigger_data_alloc(struct event_command *cmd_ops, char *cmd, char *param,
+ void *private_data);
extern void trigger_data_free(struct event_trigger_data *data);
extern int event_trigger_init(struct event_trigger_data *data);
extern int trace_event_trigger_enable_disable(struct trace_event_file *file,
@@ -1762,11 +2001,6 @@ extern bool event_trigger_check_remove(const char *glob);
extern bool event_trigger_empty_param(const char *param);
extern int event_trigger_separate_filter(char *param_and_filter, char **param,
char **filter, bool param_required);
-extern struct event_trigger_data *
-event_trigger_alloc(struct event_command *cmd_ops,
- char *cmd,
- char *param,
- void *private_data);
extern int event_trigger_parse_num(char *trigger,
struct event_trigger_data *trigger_data);
extern int event_trigger_set_filter(struct event_command *cmd_ops,
@@ -1788,64 +2022,6 @@ extern void event_file_get(struct trace_event_file *file);
extern void event_file_put(struct trace_event_file *file);
/**
- * struct event_trigger_ops - callbacks for trace event triggers
- *
- * The methods in this structure provide per-event trigger hooks for
- * various trigger operations.
- *
- * The @init and @free methods are used during trigger setup and
- * teardown, typically called from an event_command's @parse()
- * function implementation.
- *
- * The @print method is used to print the trigger spec.
- *
- * The @trigger method is the function that actually implements the
- * trigger and is called in the context of the triggering event
- * whenever that event occurs.
- *
- * All the methods below, except for @init() and @free(), must be
- * implemented.
- *
- * @trigger: The trigger 'probe' function called when the triggering
- * event occurs. The data passed into this callback is the data
- * that was supplied to the event_command @reg() function that
- * registered the trigger (see struct event_command) along with
- * the trace record, rec.
- *
- * @init: An optional initialization function called for the trigger
- * when the trigger is registered (via the event_command reg()
- * function). This can be used to perform per-trigger
- * initialization such as incrementing a per-trigger reference
- * count, for instance. This is usually implemented by the
- * generic utility function @event_trigger_init() (see
- * trace_event_triggers.c).
- *
- * @free: An optional de-initialization function called for the
- * trigger when the trigger is unregistered (via the
- * event_command @reg() function). This can be used to perform
- * per-trigger de-initialization such as decrementing a
- * per-trigger reference count and freeing corresponding trigger
- * data, for instance. This is usually implemented by the
- * generic utility function @event_trigger_free() (see
- * trace_event_triggers.c).
- *
- * @print: The callback function invoked to have the trigger print
- * itself. This is usually implemented by a wrapper function
- * that calls the generic utility function @event_trigger_print()
- * (see trace_event_triggers.c).
- */
-struct event_trigger_ops {
- void (*trigger)(struct event_trigger_data *data,
- struct trace_buffer *buffer,
- void *rec,
- struct ring_buffer_event *rbe);
- int (*init)(struct event_trigger_data *data);
- void (*free)(struct event_trigger_data *data);
- int (*print)(struct seq_file *m,
- struct event_trigger_data *data);
-};
-
-/**
* struct event_command - callbacks and data members for event commands
*
* Event commands are invoked by users by writing the command name
@@ -1894,7 +2070,7 @@ struct event_trigger_ops {
*
* @reg: Adds the trigger to the list of triggers associated with the
* event, and enables the event trigger itself, after
- * initializing it (via the event_trigger_ops @init() function).
+ * initializing it (via the event_command @init() function).
* This is also where commands can use the @trigger_type value to
* make the decision as to whether or not multiple instances of
* the trigger should be allowed. This is usually implemented by
@@ -1903,7 +2079,7 @@ struct event_trigger_ops {
*
* @unreg: Removes the trigger from the list of triggers associated
* with the event, and disables the event trigger itself, after
- * initializing it (via the event_trigger_ops @free() function).
+ * initializing it (via the event_command @free() function).
* This is usually implemented by the generic utility function
* @unregister_trigger() (see trace_event_triggers.c).
*
@@ -1917,12 +2093,41 @@ struct event_trigger_ops {
* ignored. This is usually implemented by the generic utility
* function @set_trigger_filter() (see trace_event_triggers.c).
*
- * @get_trigger_ops: The callback function invoked to retrieve the
- * event_trigger_ops implementation associated with the command.
- * This callback function allows a single event_command to
- * support multiple trigger implementations via different sets of
- * event_trigger_ops, depending on the value of the @param
- * string.
+ * All the methods below, except for @init() and @free(), must be
+ * implemented.
+ *
+ * @trigger: The trigger 'probe' function called when the triggering
+ * event occurs. The data passed into this callback is the data
+ * that was supplied to the event_command @reg() function that
+ * registered the trigger (see struct event_command) along with
+ * the trace record, rec.
+ *
+ * @count_func: If defined and a numeric parameter is passed to the
+ * trigger, then this function will be called before @trigger
+ * is called. If this function returns false, then @trigger is not
+ * executed.
+ *
+ * @init: An optional initialization function called for the trigger
+ * when the trigger is registered (via the event_command reg()
+ * function). This can be used to perform per-trigger
+ * initialization such as incrementing a per-trigger reference
+ * count, for instance. This is usually implemented by the
+ * generic utility function @event_trigger_init() (see
+ * trace_event_triggers.c).
+ *
+ * @free: An optional de-initialization function called for the
+ * trigger when the trigger is unregistered (via the
+ * event_command @reg() function). This can be used to perform
+ * per-trigger de-initialization such as decrementing a
+ * per-trigger reference count and freeing corresponding trigger
+ * data, for instance. This is usually implemented by the
+ * generic utility function @event_trigger_free() (see
+ * trace_event_triggers.c).
+ *
+ * @print: The callback function invoked to have the trigger print
+ * itself. This is usually implemented by a wrapper function
+ * that calls the generic utility function @event_trigger_print()
+ * (see trace_event_triggers.c).
*/
struct event_command {
struct list_head list;
@@ -1943,7 +2148,18 @@ struct event_command {
int (*set_filter)(char *filter_str,
struct event_trigger_data *data,
struct trace_event_file *file);
- struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
+ void (*trigger)(struct event_trigger_data *data,
+ struct trace_buffer *buffer,
+ void *rec,
+ struct ring_buffer_event *rbe);
+ bool (*count_func)(struct event_trigger_data *data,
+ struct trace_buffer *buffer,
+ void *rec,
+ struct ring_buffer_event *rbe);
+ int (*init)(struct event_trigger_data *data);
+ void (*free)(struct event_trigger_data *data);
+ int (*print)(struct seq_file *m,
+ struct event_trigger_data *data);
};
/**
@@ -1964,7 +2180,7 @@ struct event_command {
* either committed or discarded. At that point, if any commands
* have deferred their triggers, those commands are finally
* invoked following the close of the current event. In other
- * words, if the event_trigger_ops @func() probe implementation
+ * words, if the event_command @func() probe implementation
* itself logs to the trace buffer, this flag should be set,
* otherwise it can be left unspecified.
*
@@ -1991,12 +2207,6 @@ static inline bool event_command_needs_rec(struct event_command *cmd_ops)
extern int trace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable);
-extern int tracing_alloc_snapshot(void);
-extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data);
-extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update);
-
-extern int tracing_snapshot_cond_disable(struct trace_array *tr);
-extern void *tracing_cond_snapshot_data(struct trace_array *tr);
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
@@ -2006,8 +2216,9 @@ extern const char *__stop___tracepoint_str[];
void trace_printk_control(bool enabled);
void trace_printk_start_comm(void);
-int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
-int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
+void trace_printk_start_stop_comm(int enabled);
+int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set);
+int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled);
/* Used from boot time tracer */
extern int trace_set_options(struct trace_array *tr, char *option);
@@ -2038,7 +2249,7 @@ extern void tracing_log_err(struct trace_array *tr,
* about performance). The internal_trace_puts() is for such
* a purpose.
*/
-#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
+#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str)
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
@@ -2073,29 +2284,81 @@ static inline const char *get_syscall_name(int syscall)
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
-void trace_event_eval_update(struct trace_eval_map **map, int len);
+void trace_event_update_all(struct trace_eval_map **map, int len);
/* Used from boot time tracer */
extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
extern int trigger_process_regex(struct trace_event_file *file, char *buff);
#else
static inline void __init trace_event_init(void) { }
-static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
+static inline void trace_event_update_all(struct trace_eval_map **map, int len) { }
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
+extern const struct file_operations snapshot_fops;
+extern const struct file_operations snapshot_raw_fops;
+
+/* Used when creating instances */
+int trace_allocate_snapshot(struct trace_array *tr, int size);
+
+int tracing_alloc_snapshot(void);
+void tracing_snapshot_cond(struct trace_array *tr, void *cond_data);
+int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update);
+int tracing_snapshot_cond_disable(struct trace_array *tr);
+void *tracing_cond_snapshot_data(struct trace_array *tr);
void tracing_snapshot_instance(struct trace_array *tr);
int tracing_alloc_snapshot_instance(struct trace_array *tr);
+int tracing_arm_snapshot_locked(struct trace_array *tr);
int tracing_arm_snapshot(struct trace_array *tr);
void tracing_disarm_snapshot(struct trace_array *tr);
-#else
+void free_snapshot(struct trace_array *tr);
+void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter);
+int get_snapshot_map(struct trace_array *tr);
+void put_snapshot_map(struct trace_array *tr);
+int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
+ struct array_buffer *size_buf, int cpu_id);
+__init void do_allocate_snapshot(const char *name);
+# ifdef CONFIG_DYNAMIC_FTRACE
+__init int register_snapshot_cmd(void);
+# else
+static inline int register_snapshot_cmd(void) { return 0; }
+# endif
+#else /* !CONFIG_TRACER_SNAPSHOT */
+static inline int trace_allocate_snapshot(struct trace_array *tr, int size) { return 0; }
static inline void tracing_snapshot_instance(struct trace_array *tr) { }
static inline int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
return 0;
}
+static inline int tracing_arm_snapshot_locked(struct trace_array *tr) { return -EBUSY; }
static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; }
static inline void tracing_disarm_snapshot(struct trace_array *tr) { }
-#endif
+static inline void free_snapshot(struct trace_array *tr) {}
+static inline void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
+{
+ WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used");
+}
+static inline void *tracing_cond_snapshot_data(struct trace_array *tr)
+{
+ return NULL;
+}
+static inline int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update)
+{
+ return -ENODEV;
+}
+static inline int tracing_snapshot_cond_disable(struct trace_array *tr)
+{
+ return false;
+}
+static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
+{
+ /* Should never be called */
+ WARN_ONCE(1, "Snapshot print function called without snapshot configured");
+}
+static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
+static inline void put_snapshot_map(struct trace_array *tr) { }
+static inline void do_allocate_snapshot(const char *name) { }
+static inline int register_snapshot_cmd(void) { return 0; }
+#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef CONFIG_PREEMPT_TRACER
void tracer_preempt_on(unsigned long a0, unsigned long a1);
@@ -2152,10 +2415,41 @@ static inline bool is_good_system_name(const char *name)
static inline void sanitize_event_name(char *name)
{
while (*name++ != '\0')
- if (*name == ':' || *name == '.')
+ if (*name == ':' || *name == '.' || *name == '*')
*name = '_';
}
+#ifdef CONFIG_STACKTRACE
+void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs);
+
+static __always_inline void ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
+{
+ if (!(tr->trace_flags & TRACE_ITER(STACKTRACE)))
+ return;
+
+ __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs);
+}
+#else
+static inline void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
+{
+}
+static inline void ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned long trace_ctx,
+ int skip, struct pt_regs *regs)
+{
+}
+#endif
+
/*
* This is a generic way to read and write a u64 value from a file in tracefs.
*
@@ -2190,4 +2484,25 @@ static inline int rv_init_interface(void)
*/
#define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX)
+/*
+ * This is used to get the address of the args array based on
+ * the type of the entry.
+ */
+#define FGRAPH_ENTRY_ARGS(e) \
+ ({ \
+ unsigned long *_args; \
+ struct ftrace_graph_ent_entry *_e = e; \
+ \
+ if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && \
+ e->ent.type == TRACE_GRAPH_RETADDR_ENT) { \
+ struct fgraph_retaddr_ent_entry *_re; \
+ \
+ _re = (typeof(_re))_e; \
+ _args = _re->args; \
+ } else { \
+ _args = _e->args; \
+ } \
+ _args; \
+ })
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index dbe29b4c6a7a..2ca2541c8a58 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -61,7 +61,8 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node)
v = memparse(p, NULL);
if (v < PAGE_SIZE)
pr_err("Buffer size is too small: %s\n", p);
- if (tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0)
+ if (trace_array_is_readonly(tr) ||
+ tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0)
pr_err("Failed to resize trace buffer to %s\n", p);
}
@@ -597,7 +598,7 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node)
p = xbc_node_find_value(node, "tracer", NULL);
if (p && *p != '\0') {
- if (tracing_set_tracer(tr, p) < 0)
+ if (trace_array_is_readonly(tr) || tracing_set_tracer(tr, p) < 0)
pr_err("Failed to set given tracer: %s\n", p);
}
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6d08a5523ce0..d1564db95a8f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -32,7 +32,6 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
struct trace_array *tr = branch_tracer;
struct trace_buffer *buffer;
- struct trace_array_cpu *data;
struct ring_buffer_event *event;
struct trace_branch *entry;
unsigned long flags;
@@ -54,8 +53,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
raw_local_irq_save(flags);
current->trace_recursion |= TRACE_BRANCH_BIT;
- data = this_cpu_ptr(tr->array_buffer.data);
- if (atomic_read(&data->disabled))
+ if (!tracer_tracing_is_on_cpu(tr, raw_smp_processor_id()))
goto out;
trace_ctx = tracing_gen_ctx_flags(flags);
@@ -375,10 +373,10 @@ __init static int init_annotated_branch_stats(void)
int ret;
ret = register_stat_tracer(&annotated_branch_stats);
- if (!ret) {
+ if (ret) {
printk(KERN_WARNING "Warning: could not register "
"annotated branches stats\n");
- return 1;
+ return ret;
}
return 0;
}
@@ -440,10 +438,10 @@ __init static int all_annotated_branch_stats(void)
int ret;
ret = register_stat_tracer(&all_branch_stats);
- if (!ret) {
+ if (ret) {
printk(KERN_WARNING "Warning: could not register "
"all branches stats\n");
- return 1;
+ return ret;
}
return 0;
}
diff --git a/kernel/trace/trace_btf.c b/kernel/trace/trace_btf.c
index 5bbdbcbbde3c..00172f301f25 100644
--- a/kernel/trace/trace_btf.c
+++ b/kernel/trace/trace_btf.c
@@ -78,7 +78,7 @@ const struct btf_member *btf_find_struct_member(struct btf *btf,
const char *name;
int i, top = 0;
- anon_stack = kcalloc(BTF_ANON_STACK_MAX, sizeof(*anon_stack), GFP_KERNEL);
+ anon_stack = kzalloc_objs(*anon_stack, BTF_ANON_STACK_MAX);
if (!anon_stack)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 4376887e0d8a..c4dfbc293bae 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -16,7 +16,7 @@
#include "trace_output.h" /* for trace_event_sem */
#include "trace_dynevent.h"
-static DEFINE_MUTEX(dyn_event_ops_mutex);
+DEFINE_MUTEX(dyn_event_ops_mutex);
static LIST_HEAD(dyn_event_ops_list);
bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call)
@@ -74,24 +74,19 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
struct dyn_event *pos, *n;
char *system = NULL, *event, *p;
int argc, ret = -ENOENT;
- char **argv;
+ char **argv __free(argv_free) = argv_split(GFP_KERNEL, raw_command, &argc);
- argv = argv_split(GFP_KERNEL, raw_command, &argc);
if (!argv)
return -ENOMEM;
if (argv[0][0] == '-') {
- if (argv[0][1] != ':') {
- ret = -EINVAL;
- goto out;
- }
+ if (argv[0][1] != ':')
+ return -EINVAL;
event = &argv[0][2];
} else {
event = strchr(argv[0], ':');
- if (!event) {
- ret = -EINVAL;
- goto out;
- }
+ if (!event)
+ return -EINVAL;
event++;
}
@@ -101,10 +96,8 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
event = p + 1;
*p = '\0';
}
- if (!system && event[0] == '\0') {
- ret = -EINVAL;
- goto out;
- }
+ if (!system && event[0] == '\0')
+ return -EINVAL;
mutex_lock(&event_mutex);
for_each_dyn_event_safe(pos, n) {
@@ -120,8 +113,20 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
}
tracing_reset_all_online_cpus();
mutex_unlock(&event_mutex);
-out:
- argv_free(argv);
+ return ret;
+}
+
+/*
+ * Locked version of event creation. The event creation must be protected by
+ * dyn_event_ops_mutex because of protecting trace_probe_log.
+ */
+int dyn_event_create(const char *raw_command, struct dyn_event_operations *type)
+{
+ int ret;
+
+ mutex_lock(&dyn_event_ops_mutex);
+ ret = type->create(raw_command);
+ mutex_unlock(&dyn_event_ops_mutex);
return ret;
}
@@ -139,9 +144,16 @@ static int create_dyn_event(const char *raw_command)
if (!ret || ret != -ECANCELED)
break;
}
- mutex_unlock(&dyn_event_ops_mutex);
- if (ret == -ECANCELED)
+ if (ret == -ECANCELED) {
+ static const char *err_msg[] = {"No matching dynamic event type"};
+
+ /* Wrong dynamic event. Leave an error message. */
+ tracing_log_err(NULL, "dynevent", raw_command, err_msg,
+ 0, 0);
ret = -EINVAL;
+ }
+
+ mutex_unlock(&dyn_event_ops_mutex);
return ret;
}
@@ -225,6 +237,10 @@ static int dyn_event_open(struct inode *inode, struct file *file)
{
int ret;
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
ret = tracing_check_open_get_tr(NULL);
if (ret)
return ret;
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index 936477a111d3..beee3f8d7544 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
void dyn_event_seq_stop(struct seq_file *m, void *v);
int dyn_events_release_all(struct dyn_event_operations *type);
int dyn_event_release(const char *raw_command, struct dyn_event_operations *type);
+int dyn_event_create(const char *raw_command, struct dyn_event_operations *type);
/*
* for_each_dyn_event - iterate over the dyn_event list
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 82fd174ebbe0..54417468fdeb 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -61,8 +61,9 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
TRACE_FN,
F_STRUCT(
- __field_fn( unsigned long, ip )
- __field_fn( unsigned long, parent_ip )
+ __field_fn( unsigned long, ip )
+ __field_fn( unsigned long, parent_ip )
+ __dynamic_array( unsigned long, args )
),
F_printk(" %ps <-- %ps",
@@ -72,17 +73,18 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
);
/* Function call entry */
-FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
+FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
TRACE_GRAPH_ENT,
F_STRUCT(
__field_struct( struct ftrace_graph_ent, graph_ent )
- __field_packed( unsigned long, graph_ent, func )
- __field_packed( int, graph_ent, depth )
+ __field_desc_packed(unsigned long, graph_ent, func )
+ __field_desc_packed(unsigned long, graph_ent, depth )
+ __dynamic_array(unsigned long, args )
),
- F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
+ F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth)
);
#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
@@ -93,13 +95,14 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry,
TRACE_GRAPH_RETADDR_ENT,
F_STRUCT(
- __field_struct( struct fgraph_retaddr_ent, graph_ent )
- __field_packed( unsigned long, graph_ent, func )
- __field_packed( int, graph_ent, depth )
- __field_packed( unsigned long, graph_ent, retaddr )
+ __field_struct( struct fgraph_retaddr_ent, graph_rent )
+ __field_desc_packed( unsigned long, graph_rent.ent, func )
+ __field_desc_packed( unsigned long, graph_rent.ent, depth )
+ __field_desc_packed( unsigned long, graph_rent, retaddr )
+ __dynamic_array(unsigned long, args )
),
- F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth,
+ F_printk("--> %ps (%lu) <- %ps", (void *)__entry->func, __entry->depth,
(void *)__entry->retaddr)
);
@@ -120,15 +123,15 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
- __field_packed( unsigned long, ret, func )
- __field_packed( unsigned long, ret, retval )
- __field_packed( int, ret, depth )
- __field_packed( unsigned int, ret, overrun )
- __field_packed( unsigned long long, ret, calltime)
- __field_packed( unsigned long long, ret, rettime )
+ __field_desc_packed( unsigned long, ret, func )
+ __field_desc_packed( unsigned long, ret, retval )
+ __field_desc_packed( unsigned int, ret, depth )
+ __field_desc_packed( unsigned int, ret, overrun )
+ __field_packed(unsigned long long, calltime)
+ __field_packed(unsigned long long, rettime )
),
- F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx",
+ F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u retval: %lx",
(void *)__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
__entry->depth, __entry->retval)
@@ -143,14 +146,14 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
- __field_packed( unsigned long, ret, func )
- __field_packed( int, ret, depth )
- __field_packed( unsigned int, ret, overrun )
- __field_packed( unsigned long long, ret, calltime)
- __field_packed( unsigned long long, ret, rettime )
+ __field_desc_packed( unsigned long, ret, func )
+ __field_desc_packed( unsigned int, ret, depth )
+ __field_desc_packed( unsigned int, ret, overrun )
+ __field_packed(unsigned long long, calltime )
+ __field_packed(unsigned long long, rettime )
),
- F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
+ F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u",
(void *)__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
__entry->depth)
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index be8be0c1aaf0..b66d6196338d 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -9,14 +9,15 @@
* Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com>
*
*/
+#include <linux/cleanup.h>
+#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/mutex.h>
-#include <linux/ftrace.h>
#include "trace_dynevent.h"
#include "trace_probe.h"
-#include "trace_probe_tmpl.h"
#include "trace_probe_kernel.h"
+#include "trace_probe_tmpl.h"
#define EPROBE_EVENT_SYSTEM "eprobes"
@@ -60,6 +61,9 @@ static void trace_event_probe_cleanup(struct trace_eprobe *ep)
kfree(ep);
}
+DEFINE_FREE(trace_event_probe_cleanup, struct trace_eprobe *,
+ if (!IS_ERR_OR_NULL(_T)) trace_event_probe_cleanup(_T))
+
static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev)
{
return container_of(ev, struct trace_eprobe, devent);
@@ -196,10 +200,10 @@ static struct trace_eprobe *alloc_event_probe(const char *group,
struct trace_event_call *event,
int nargs)
{
- struct trace_eprobe *ep;
+ struct trace_eprobe *ep __free(trace_event_probe_cleanup) = NULL;
const char *event_name;
const char *sys_name;
- int ret = -ENOMEM;
+ int ret;
if (!event)
return ERR_PTR(-ENODEV);
@@ -207,28 +211,25 @@ static struct trace_eprobe *alloc_event_probe(const char *group,
sys_name = event->class->system;
event_name = trace_event_name(event);
- ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL);
+ ep = kzalloc_flex(*ep, tp.args, nargs);
if (!ep) {
trace_event_put_ref(event);
- goto error;
+ return ERR_PTR(-ENOMEM);
}
ep->event = event;
ep->event_name = kstrdup(event_name, GFP_KERNEL);
if (!ep->event_name)
- goto error;
+ return ERR_PTR(-ENOMEM);
ep->event_system = kstrdup(sys_name, GFP_KERNEL);
if (!ep->event_system)
- goto error;
+ return ERR_PTR(-ENOMEM);
ret = trace_probe_init(&ep->tp, this_event, group, false, nargs);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
dyn_event_init(&ep->devent, &eprobe_dyn_event_ops);
- return ep;
-error:
- trace_event_probe_cleanup(ep);
- return ERR_PTR(ret);
+ return_ptr(ep);
}
static int eprobe_event_define_fields(struct trace_event_call *event_call)
@@ -343,10 +344,15 @@ get_event_field(struct fetch_insn *code, void *rec)
val = *(unsigned int *)addr;
break;
default:
- if (field->is_signed)
- val = *(long *)addr;
- else
- val = *(unsigned long *)addr;
+ if (field->size == sizeof(long)) {
+ if (field->is_signed)
+ val = *(long *)addr;
+ else
+ val = *(unsigned long *)addr;
+ break;
+ }
+ /* This is an array, point to the addr itself */
+ val = (unsigned long)addr;
break;
}
return val;
@@ -478,13 +484,6 @@ static void eprobe_trigger_func(struct event_trigger_data *data,
__eprobe_trace_func(edata, rec);
}
-static struct event_trigger_ops eprobe_trigger_ops = {
- .trigger = eprobe_trigger_func,
- .print = eprobe_trigger_print,
- .init = eprobe_trigger_init,
- .free = eprobe_trigger_free,
-};
-
static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
char *glob, char *cmd,
@@ -507,12 +506,6 @@ static void eprobe_trigger_unreg_func(char *glob,
}
-static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd,
- char *param)
-{
- return &eprobe_trigger_ops;
-}
-
static struct event_command event_trigger_cmd = {
.name = "eprobe",
.trigger_type = ETT_EVENT_EPROBE,
@@ -521,8 +514,11 @@ static struct event_command event_trigger_cmd = {
.reg = eprobe_trigger_reg_func,
.unreg = eprobe_trigger_unreg_func,
.unreg_all = NULL,
- .get_trigger_ops = eprobe_trigger_get_ops,
.set_filter = NULL,
+ .trigger = eprobe_trigger_func,
+ .print = eprobe_trigger_print,
+ .init = eprobe_trigger_init,
+ .free = eprobe_trigger_free,
};
static struct event_trigger_data *
@@ -533,8 +529,8 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
struct eprobe_data *edata;
int ret;
- edata = kzalloc(sizeof(*edata), GFP_KERNEL);
- trigger = kzalloc(sizeof(*trigger), GFP_KERNEL);
+ edata = kzalloc_obj(*edata);
+ trigger = kzalloc_obj(*trigger);
if (!trigger || !edata) {
ret = -ENOMEM;
goto error;
@@ -542,7 +538,6 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
trigger->flags = EVENT_TRIGGER_FL_PROBE;
trigger->count = -1;
- trigger->ops = &eprobe_trigger_ops;
/*
* EVENT PROBE triggers are not registered as commands with
@@ -795,23 +790,6 @@ find_and_get_event(const char *system, const char *event_name)
return NULL;
}
-static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i)
-{
- struct traceprobe_parse_context ctx = {
- .event = ep->event,
- .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT,
- };
- int ret;
-
- ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx);
- /* Handle symbols "@" */
- if (!ret)
- ret = traceprobe_update_arg(&ep->tp.args[i]);
-
- traceprobe_finish_parse(&ctx);
- return ret;
-}
-
static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[])
{
struct event_filter *dummy = NULL;
@@ -848,13 +826,10 @@ static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const ch
ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str,
true, &dummy);
free_event_filter(dummy);
- if (ret)
- goto error;
-
- return 0;
-error:
- kfree(ep->filter_str);
- ep->filter_str = NULL;
+ if (ret) {
+ kfree(ep->filter_str);
+ ep->filter_str = NULL;
+ }
return ret;
}
@@ -866,40 +841,52 @@ static int __trace_eprobe_create(int argc, const char *argv[])
* Fetch args (no space):
* <name>=$<field>[:TYPE]
*/
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
+ struct trace_eprobe *ep __free(trace_event_probe_cleanup) = NULL;
+ const char *trlog __free(trace_probe_log_clear) = NULL;
const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
const char *sys_event = NULL, *sys_name = NULL;
struct trace_event_call *event_call;
- struct trace_eprobe *ep = NULL;
- char buf1[MAX_EVENT_NAME_LEN];
- char buf2[MAX_EVENT_NAME_LEN];
- char gbuf[MAX_EVENT_NAME_LEN];
+ char *buf1 __free(kfree) = NULL;
+ char *buf2 __free(kfree) = NULL;
+ char *gbuf __free(kfree) = NULL;
int ret = 0, filter_idx = 0;
int i, filter_cnt;
if (argc < 2 || argv[0][0] != 'e')
return -ECANCELED;
- trace_probe_log_init("event_probe", argc, argv);
+ trlog = trace_probe_log_init("event_probe", argc, argv);
event = strchr(&argv[0][1], ':');
if (event) {
+ gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!gbuf)
+ return -ENOMEM;
event++;
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
- goto parse_error;
+ return -EINVAL;
}
trace_probe_log_set_index(1);
sys_event = argv[1];
+
+ buf2 = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!buf2)
+ return -ENOMEM;
+
ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0);
if (ret || !sys_event || !sys_name) {
trace_probe_log_err(0, NO_EVENT_INFO);
- goto parse_error;
+ return -EINVAL;
}
if (!event) {
- strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN);
+ buf1 = kstrdup(sys_event, GFP_KERNEL);
+ if (!buf1)
+ return -ENOMEM;
event = buf1;
}
@@ -913,14 +900,15 @@ static int __trace_eprobe_create(int argc, const char *argv[])
}
if (argc - 2 > MAX_TRACE_ARGS) {
- ret = -E2BIG;
- goto error;
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_ARGS);
+ return -E2BIG;
}
- mutex_lock(&event_mutex);
- event_call = find_and_get_event(sys_name, sys_event);
- ep = alloc_event_probe(group, event, event_call, argc - 2);
- mutex_unlock(&event_mutex);
+ scoped_guard(mutex, &event_mutex) {
+ event_call = find_and_get_event(sys_name, sys_event);
+ ep = alloc_event_probe(group, event, event_call, argc - 2);
+ }
if (IS_ERR(ep)) {
ret = PTR_ERR(ep);
@@ -928,52 +916,57 @@ static int __trace_eprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_ATTACH_EVENT);
/* This must return -ENOMEM or missing event, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV);
- ep = NULL;
- goto error;
+ return ret;
}
if (filter_idx) {
trace_probe_log_set_index(filter_idx);
ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx);
if (ret)
- goto parse_error;
+ return -EINVAL;
} else
ep->filter_str = NULL;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->event = ep->event;
+ ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT;
+
argc -= 2; argv += 2;
/* parse arguments */
for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
- ret = trace_eprobe_tp_update_arg(ep, argv, i);
+
+ ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx);
+ /* Handle symbols "@" */
+ if (!ret)
+ ret = traceprobe_update_arg(&ep->tp.args[i]);
if (ret)
- goto error;
+ return ret;
}
ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT);
if (ret < 0)
- goto error;
+ return ret;
+
init_trace_eprobe_call(ep);
- mutex_lock(&event_mutex);
- ret = trace_probe_register_event_call(&ep->tp);
- if (ret) {
- if (ret == -EEXIST) {
- trace_probe_log_set_index(0);
- trace_probe_log_err(0, EVENT_EXIST);
+ scoped_guard(mutex, &event_mutex) {
+ ret = trace_probe_register_event_call(&ep->tp);
+ if (ret) {
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ }
+ return ret;
}
- mutex_unlock(&event_mutex);
- goto error;
- }
- ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
- if (ret < 0) {
- trace_probe_unregister_event_call(&ep->tp);
- mutex_unlock(&event_mutex);
- goto error;
+ ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
+ if (ret < 0) {
+ trace_probe_unregister_event_call(&ep->tp);
+ return ret;
+ }
+ /* To avoid freeing registered eprobe event, clear ep. */
+ ep = NULL;
}
- mutex_unlock(&event_mutex);
- return ret;
-parse_error:
- ret = -EINVAL;
-error:
- trace_event_probe_cleanup(ep);
return ret;
}
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 3ff9caa4a71b..a6bb7577e8c5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -49,7 +49,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event)) {
- ret = perf_allow_tracepoint(&p_event->attr);
+ ret = perf_allow_tracepoint();
if (ret)
return ret;
@@ -86,7 +86,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
* ...otherwise raw tracepoint data can be a severe data leak,
* only allow root to have these.
*/
- ret = perf_allow_tracepoint(&p_event->attr);
+ ret = perf_allow_tracepoint();
if (ret)
return ret;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 770e7ed91716..c46e623e7e0d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -360,7 +360,7 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca
/* Anything else, this isn't a function */
break;
}
- /* A function could be wrapped in parethesis, try the next one */
+ /* A function could be wrapped in parenthesis, try the next one */
s = r + 1;
} while (s < e);
@@ -400,6 +400,20 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca
return true;
}
+static void handle_dereference_arg(const char *arg_str, u64 string_flags, int len,
+ u64 *dereference_flags, int arg,
+ struct trace_event_call *call)
+{
+ if (string_flags & (1ULL << arg)) {
+ if (process_string(arg_str, len, call))
+ *dereference_flags &= ~(1ULL << arg);
+ } else if (process_pointer(arg_str, len, call))
+ *dereference_flags &= ~(1ULL << arg);
+ else
+ pr_warn("TRACE EVENT ERROR: Bad dereference argument: '%.*s'\n",
+ len, arg_str);
+}
+
/*
* Examine the print fmt of the event looking for unsafe dereference
* pointers using %p* that could be recorded in the trace event and
@@ -470,6 +484,7 @@ static void test_event_printk(struct trace_event_call *call)
case '%':
continue;
case 'p':
+ do_pointer:
/* Find dereferencing fields */
switch (fmt[i + 1]) {
case 'B': case 'R': case 'r':
@@ -498,6 +513,12 @@ static void test_event_printk(struct trace_event_call *call)
continue;
if (fmt[i + j] == '*') {
star = true;
+ /* Handle %*pbl case */
+ if (!j && fmt[i + 1] == 'p') {
+ arg++;
+ i++;
+ goto do_pointer;
+ }
continue;
}
if ((fmt[i + j] == 's')) {
@@ -546,7 +567,7 @@ static void test_event_printk(struct trace_event_call *call)
* If start_arg is zero, then this is the start of the
* first argument. The processing of the argument happens
* when the end of the argument is found, as it needs to
- * handle paranthesis and such.
+ * handle parenthesis and such.
*/
if (!start_arg) {
start_arg = i;
@@ -556,11 +577,9 @@ static void test_event_printk(struct trace_event_call *call)
}
if (dereference_flags & (1ULL << arg)) {
- if (string_flags & (1ULL << arg)) {
- if (process_string(fmt + start_arg, e - start_arg, call))
- dereference_flags &= ~(1ULL << arg);
- } else if (process_pointer(fmt + start_arg, e - start_arg, call))
- dereference_flags &= ~(1ULL << arg);
+ handle_dereference_arg(fmt + start_arg, string_flags,
+ e - start_arg,
+ &dereference_flags, arg, call);
}
start_arg = i;
@@ -571,11 +590,9 @@ static void test_event_printk(struct trace_event_call *call)
}
if (dereference_flags & (1ULL << arg)) {
- if (string_flags & (1ULL << arg)) {
- if (process_string(fmt + start_arg, i - start_arg, call))
- dereference_flags &= ~(1ULL << arg);
- } else if (process_pointer(fmt + start_arg, i - start_arg, call))
- dereference_flags &= ~(1ULL << arg);
+ handle_dereference_arg(fmt + start_arg, string_flags,
+ i - start_arg,
+ &dereference_flags, arg, call);
}
/*
@@ -615,7 +632,6 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);
bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
{
struct trace_array *tr = trace_file->tr;
- struct trace_array_cpu *data;
struct trace_pid_list *no_pid_list;
struct trace_pid_list *pid_list;
@@ -625,12 +641,30 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
if (!pid_list && !no_pid_list)
return false;
- data = this_cpu_ptr(tr->array_buffer.data);
-
- return data->ignore_pid;
+ /*
+ * This is recorded at every sched_switch for this task.
+ * Thus, even if the task migrates the ignore value will be the same.
+ */
+ return this_cpu_read(tr->array_buffer.data->ignore_pid) != 0;
}
EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid);
+/**
+ * trace_event_buffer_reserve - reserve space on the ring buffer for an event
+ * @fbuffer: information about how to save the event
+ * @trace_file: the instance file descriptor for the event
+ * @len: The length of the event
+ *
+ * The @fbuffer has information about the ring buffer and data will
+ * be added to it to be used by the call to trace_event_buffer_commit().
+ * The @trace_file is the desrciptor with information about the status
+ * of the given event for a specific trace_array instance.
+ * The @len is the length of data to save for the event.
+ *
+ * Returns a pointer to the data on the ring buffer or NULL if the
+ * event was not reserved (event was filtered, too big, or the buffer
+ * simply was disabled for write).
+ */
void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
struct trace_event_file *trace_file,
unsigned long len)
@@ -682,6 +716,8 @@ int trace_event_reg(struct trace_event_call *call,
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
+ if (!call->class->perf_probe)
+ return -ENODEV;
return tracepoint_probe_register(call->tp,
call->class->perf_probe,
call);
@@ -750,6 +786,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
{
struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
+ bool soft_mode = atomic_read(&file->sm_ref) != 0;
int ret = 0;
int disable;
@@ -764,19 +801,19 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
* is set we do not want the event to be enabled before we
* clear the bit.
*
- * When soft_disable is not set but the SOFT_MODE flag is,
+ * When soft_disable is not set but the soft_mode is,
* we do nothing. Do not disable the tracepoint, otherwise
- * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
+ * "soft enable"s (clearing the SOFT_DISABLED bit) won't work.
*/
if (soft_disable) {
if (atomic_dec_return(&file->sm_ref) > 0)
break;
disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
- clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ soft_mode = false;
/* Disable use of trace_buffered_event */
trace_buffered_event_disable();
} else
- disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
+ disable = !soft_mode;
if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) {
clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
@@ -790,10 +827,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
}
- call->class->reg(call, TRACE_REG_UNREGISTER, file);
+ ret = call->class->reg(call, TRACE_REG_UNREGISTER, file);
+
+ WARN_ON_ONCE(ret);
}
- /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
- if (file->flags & EVENT_FILE_FL_SOFT_MODE)
+ /* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */
+ if (soft_mode)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
else
clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
@@ -803,16 +842,15 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
* When soft_disable is set and enable is set, we want to
* register the tracepoint for the event, but leave the event
* as is. That means, if the event was already enabled, we do
- * nothing (but set SOFT_MODE). If the event is disabled, we
- * set SOFT_DISABLED before enabling the event tracepoint, so
- * it still seems to be disabled.
+ * nothing. If the event is disabled, we set SOFT_DISABLED
+ * before enabling the event tracepoint, so it still seems
+ * to be disabled.
*/
if (!soft_disable)
clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
else {
if (atomic_inc_return(&file->sm_ref) > 1)
break;
- set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
/* Enable use of trace_buffered_event */
trace_buffered_event_enable();
}
@@ -820,17 +858,17 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
bool cmd = false, tgid = false;
- /* Keep the event disabled, when going to SOFT_MODE. */
+ /* Keep the event disabled, when going to soft mode. */
if (soft_disable)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
- if (tr->trace_flags & TRACE_ITER_RECORD_CMD) {
+ if (tr->trace_flags & TRACE_ITER(RECORD_CMD)) {
cmd = true;
tracing_start_cmdline_record();
set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
- if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
+ if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) {
tgid = true;
tracing_start_tgid_record();
set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
@@ -869,6 +907,120 @@ static int ftrace_event_enable_disable(struct trace_event_file *file,
return __ftrace_event_enable_disable(file, enable, 0);
}
+#ifdef CONFIG_MODULES
+struct event_mod_load {
+ struct list_head list;
+ char *module;
+ char *match;
+ char *system;
+ char *event;
+};
+
+static void free_event_mod(struct event_mod_load *event_mod)
+{
+ list_del(&event_mod->list);
+ kfree(event_mod->module);
+ kfree(event_mod->match);
+ kfree(event_mod->system);
+ kfree(event_mod->event);
+ kfree(event_mod);
+}
+
+static void clear_mod_events(struct trace_array *tr)
+{
+ struct event_mod_load *event_mod, *n;
+
+ list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) {
+ free_event_mod(event_mod);
+ }
+}
+
+static int remove_cache_mod(struct trace_array *tr, const char *mod,
+ const char *match, const char *system, const char *event)
+{
+ struct event_mod_load *event_mod, *n;
+ int ret = -EINVAL;
+
+ list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) {
+ if (strcmp(event_mod->module, mod) != 0)
+ continue;
+
+ if (match && strcmp(event_mod->match, match) != 0)
+ continue;
+
+ if (system &&
+ (!event_mod->system || strcmp(event_mod->system, system) != 0))
+ continue;
+
+ if (event &&
+ (!event_mod->event || strcmp(event_mod->event, event) != 0))
+ continue;
+
+ free_event_mod(event_mod);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int cache_mod(struct trace_array *tr, const char *mod, int set,
+ const char *match, const char *system, const char *event)
+{
+ struct event_mod_load *event_mod;
+
+ /* If the module exists, then this just failed to find an event */
+ if (module_exists(mod))
+ return -EINVAL;
+
+ /* See if this is to remove a cached filter */
+ if (!set)
+ return remove_cache_mod(tr, mod, match, system, event);
+
+ event_mod = kzalloc_obj(*event_mod);
+ if (!event_mod)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&event_mod->list);
+ event_mod->module = kstrdup(mod, GFP_KERNEL);
+ if (!event_mod->module)
+ goto out_free;
+
+ if (match) {
+ event_mod->match = kstrdup(match, GFP_KERNEL);
+ if (!event_mod->match)
+ goto out_free;
+ }
+
+ if (system) {
+ event_mod->system = kstrdup(system, GFP_KERNEL);
+ if (!event_mod->system)
+ goto out_free;
+ }
+
+ if (event) {
+ event_mod->event = kstrdup(event, GFP_KERNEL);
+ if (!event_mod->event)
+ goto out_free;
+ }
+
+ list_add(&event_mod->list, &tr->mod_events);
+
+ return 0;
+
+ out_free:
+ free_event_mod(event_mod);
+
+ return -ENOMEM;
+}
+#else /* CONFIG_MODULES */
+static inline void clear_mod_events(struct trace_array *tr) { }
+static int cache_mod(struct trace_array *tr, const char *mod, int set,
+ const char *match, const char *system, const char *event)
+{
+ return -EINVAL;
+}
+#endif
+
static void ftrace_clear_events(struct trace_array *tr)
{
struct trace_event_file *file;
@@ -877,6 +1029,7 @@ static void ftrace_clear_events(struct trace_array *tr)
list_for_each_entry(file, &tr->events, list) {
ftrace_event_enable_disable(file, 0);
}
+ clear_mod_events(tr);
mutex_unlock(&event_mutex);
}
@@ -886,6 +1039,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
struct trace_pid_list *pid_list;
struct trace_array *tr = data;
+ guard(preempt)();
pid_list = rcu_dereference_raw(tr->filtered_pids);
trace_filter_add_remove_task(pid_list, NULL, task);
@@ -901,6 +1055,7 @@ event_filter_pid_sched_process_fork(void *data,
struct trace_pid_list *pid_list;
struct trace_array *tr = data;
+ guard(preempt)();
pid_list = rcu_dereference_sched(tr->filtered_pids);
trace_filter_add_remove_task(pid_list, self, task);
@@ -1158,6 +1313,9 @@ static void remove_event_file_dir(struct trace_event_file *file)
free_event_filter(file->filter);
file->flags |= EVENT_FILE_FL_FREED;
event_file_put(file);
+
+ /* Wake up hist poll waiters to notice the EVENT_FILE_FL_FREED flag. */
+ hist_poll_wakeup();
}
/*
@@ -1165,17 +1323,36 @@ static void remove_event_file_dir(struct trace_event_file *file)
*/
static int
__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
- const char *sub, const char *event, int set)
+ const char *sub, const char *event, int set,
+ const char *mod)
{
struct trace_event_file *file;
struct trace_event_call *call;
+ char *module __free(kfree) = NULL;
const char *name;
int ret = -EINVAL;
int eret = 0;
+ if (mod) {
+ char *p;
+
+ module = kstrdup(mod, GFP_KERNEL);
+ if (!module)
+ return -ENOMEM;
+
+ /* Replace all '-' with '_' as that's what modules do */
+ for (p = strchr(module, '-'); p; p = strchr(p + 1, '-'))
+ *p = '_';
+ }
+
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
+
+ /* If a module is specified, skip events that are not that module */
+ if (module && (!call->module || strcmp(module_name(call->module), module)))
+ continue;
+
name = trace_event_name(call);
if (!name || !call->class || !call->class->reg)
@@ -1208,16 +1385,27 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
ret = eret;
}
+ /*
+ * If this is a module setting and nothing was found,
+ * check if the module was loaded. If it wasn't cache it.
+ */
+ if (module && ret == -EINVAL && !eret)
+ ret = cache_mod(tr, module, set, match, sub, event);
+
return ret;
}
static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
- const char *sub, const char *event, int set)
+ const char *sub, const char *event, int set,
+ const char *mod)
{
int ret;
+ if (trace_array_is_readonly(tr))
+ return -EACCES;
+
mutex_lock(&event_mutex);
- ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);
+ ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod);
mutex_unlock(&event_mutex);
return ret;
@@ -1225,11 +1413,20 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
- char *event = NULL, *sub = NULL, *match;
+ char *event = NULL, *sub = NULL, *match, *mod;
int ret;
if (!tr)
return -ENOENT;
+
+ /* Modules events can be appended with :mod:<module> */
+ mod = strstr(buf, ":mod:");
+ if (mod) {
+ *mod = '\0';
+ /* move to the module name */
+ mod += 5;
+ }
+
/*
* The buf format can be <subsystem>:<event-name>
* *:<event-name> means any event by that name.
@@ -1252,9 +1449,13 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
sub = NULL;
if (!strlen(event) || strcmp(event, "*") == 0)
event = NULL;
+ } else if (mod) {
+ /* Allow wildcard for no length or star */
+ if (!strlen(match) || strcmp(match, "*") == 0)
+ match = NULL;
}
- ret = __ftrace_set_clr_event(tr, match, sub, event, set);
+ ret = __ftrace_set_clr_event(tr, match, sub, event, set, mod);
/* Put back the colon to allow this to be called again */
if (buf)
@@ -1282,7 +1483,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
if (!tr)
return -ENODEV;
- return __ftrace_set_clr_event(tr, NULL, system, event, set);
+ return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL);
}
EXPORT_SYMBOL_GPL(trace_set_clr_event);
@@ -1308,7 +1509,7 @@ int trace_array_set_clr_event(struct trace_array *tr, const char *system,
return -ENOENT;
set = (enable == true) ? 1 : 0;
- return __ftrace_set_clr_event(tr, NULL, system, event, set);
+ return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL);
}
EXPORT_SYMBOL_GPL(trace_array_set_clr_event);
@@ -1395,37 +1596,77 @@ static void *t_start(struct seq_file *m, loff_t *pos)
return file;
}
+enum set_event_iter_type {
+ SET_EVENT_FILE,
+ SET_EVENT_MOD,
+};
+
+struct set_event_iter {
+ enum set_event_iter_type type;
+ union {
+ struct trace_event_file *file;
+ struct event_mod_load *event_mod;
+ };
+};
+
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct trace_event_file *file = v;
+ struct set_event_iter *iter = v;
+ struct trace_event_file *file;
struct trace_array *tr = m->private;
(*pos)++;
- list_for_each_entry_continue(file, &tr->events, list) {
- if (file->flags & EVENT_FILE_FL_ENABLED)
- return file;
+ if (iter->type == SET_EVENT_FILE) {
+ file = iter->file;
+ list_for_each_entry_continue(file, &tr->events, list) {
+ if (file->flags & EVENT_FILE_FL_ENABLED) {
+ iter->file = file;
+ return iter;
+ }
+ }
+#ifdef CONFIG_MODULES
+ iter->type = SET_EVENT_MOD;
+ iter->event_mod = list_entry(&tr->mod_events, struct event_mod_load, list);
+#endif
}
+#ifdef CONFIG_MODULES
+ list_for_each_entry_continue(iter->event_mod, &tr->mod_events, list)
+ return iter;
+#endif
+
+ /*
+ * The iter is allocated in s_start() and passed via the 'v'
+ * parameter. To stop the iterator, NULL must be returned. But
+ * the return value is what the 'v' parameter in s_stop() receives
+ * and frees. Free iter here as it will no longer be used.
+ */
+ kfree(iter);
return NULL;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
- struct trace_event_file *file;
struct trace_array *tr = m->private;
+ struct set_event_iter *iter;
loff_t l;
+ iter = kzalloc_obj(*iter);
mutex_lock(&event_mutex);
+ if (!iter)
+ return NULL;
+
+ iter->type = SET_EVENT_FILE;
+ iter->file = list_entry(&tr->events, struct trace_event_file, list);
- file = list_entry(&tr->events, struct trace_event_file, list);
for (l = 0; l <= *pos; ) {
- file = s_next(m, file, &l);
- if (!file)
+ iter = s_next(m, iter, &l);
+ if (!iter)
break;
}
- return file;
+ return iter;
}
static int t_show(struct seq_file *m, void *v)
@@ -1445,6 +1686,121 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
+static int get_call_len(struct trace_event_call *call)
+{
+ int len;
+
+ /* Get the length of "<system>:<event>" */
+ len = strlen(call->class->system) + 1;
+ len += strlen(trace_event_name(call));
+
+ /* Set the index to 32 bytes to separate event from data */
+ return len >= 32 ? 1 : 32 - len;
+}
+
+/**
+ * t_show_filters - seq_file callback to display active event filters
+ * @m: The seq_file interface for formatted output
+ * @v: The current trace_event_file being iterated
+ *
+ * Identifies and prints active filters for the current event file in the
+ * iteration. If a filter is applied to the current event and, if so,
+ * prints the system name, event name, and the filter string.
+ */
+static int t_show_filters(struct seq_file *m, void *v)
+{
+ struct trace_event_file *file = v;
+ struct trace_event_call *call = file->event_call;
+ struct event_filter *filter;
+ int len;
+
+ guard(rcu)();
+ filter = rcu_dereference(file->filter);
+ if (!filter || !filter->filter_string)
+ return 0;
+
+ len = get_call_len(call);
+
+ seq_printf(m, "%s:%s%*s%s\n", call->class->system,
+ trace_event_name(call), len, "", filter->filter_string);
+
+ return 0;
+}
+
+/**
+ * t_show_triggers - seq_file callback to display active event triggers
+ * @m: The seq_file interface for formatted output
+ * @v: The current trace_event_file being iterated
+ *
+ * Iterates through the trigger list of the current event file and prints
+ * each active trigger's configuration using its associated print
+ * operation.
+ */
+static int t_show_triggers(struct seq_file *m, void *v)
+{
+ struct trace_event_file *file = v;
+ struct trace_event_call *call = file->event_call;
+ struct event_trigger_data *data;
+ int len;
+
+ /*
+ * The event_mutex is held by t_start(), protecting the
+ * file->triggers list traversal.
+ */
+ if (list_empty(&file->triggers))
+ return 0;
+
+ len = get_call_len(call);
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ seq_printf(m, "%s:%s%*s", call->class->system,
+ trace_event_name(call), len, "");
+
+ data->cmd_ops->print(m, data);
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_MODULES
+static int s_show(struct seq_file *m, void *v)
+{
+ struct set_event_iter *iter = v;
+ const char *system;
+ const char *event;
+
+ if (iter->type == SET_EVENT_FILE)
+ return t_show(m, iter->file);
+
+ /* When match is set, system and event are not */
+ if (iter->event_mod->match) {
+ seq_printf(m, "%s:mod:%s\n", iter->event_mod->match,
+ iter->event_mod->module);
+ return 0;
+ }
+
+ system = iter->event_mod->system ? : "*";
+ event = iter->event_mod->event ? : "*";
+
+ seq_printf(m, "%s:%s:mod:%s\n", system, event, iter->event_mod->module);
+
+ return 0;
+}
+#else /* CONFIG_MODULES */
+static int s_show(struct seq_file *m, void *v)
+{
+ struct set_event_iter *iter = v;
+
+ return t_show(m, iter->file);
+}
+#endif
+
+static void s_stop(struct seq_file *m, void *v)
+{
+ kfree(v);
+ t_stop(m, NULL);
+}
+
static void *
__next(struct seq_file *m, void *v, loff_t *pos, int type)
{
@@ -1537,8 +1893,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
!(flags & EVENT_FILE_FL_SOFT_DISABLED))
strcpy(buf, "1");
- if (flags & EVENT_FILE_FL_SOFT_DISABLED ||
- flags & EVENT_FILE_FL_SOFT_MODE)
+ if (atomic_read(&file->sm_ref) != 0)
strcat(buf, "*");
strcat(buf, "\n");
@@ -1558,21 +1913,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
+ guard(mutex)(&event_mutex);
+
switch (val) {
case 0:
case 1:
- ret = -ENODEV;
- mutex_lock(&event_mutex);
file = event_file_file(filp);
- if (likely(file)) {
- ret = tracing_update_buffers(file->tr);
- if (ret < 0) {
- mutex_unlock(&event_mutex);
- return ret;
- }
- ret = ftrace_event_enable_disable(file, val);
- }
- mutex_unlock(&event_mutex);
+ if (!file)
+ return -ENODEV;
+ ret = tracing_update_buffers(file->tr);
+ if (ret < 0)
+ return ret;
+ ret = ftrace_event_enable_disable(file, val);
+ if (ret < 0)
+ return ret;
break;
default:
@@ -1581,31 +1935,31 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
*ppos += cnt;
- return ret ? ret : cnt;
+ return cnt;
}
-static ssize_t
-system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
- loff_t *ppos)
+/*
+ * Returns:
+ * 0 : no events exist?
+ * 1 : all events are disabled
+ * 2 : all events are enabled
+ * 3 : some events are enabled and some are enabled
+ */
+int trace_events_enabled(struct trace_array *tr, const char *system)
{
- const char set_to_char[4] = { '?', '0', '1', 'X' };
- struct trace_subsystem_dir *dir = filp->private_data;
- struct event_subsystem *system = dir->subsystem;
struct trace_event_call *call;
struct trace_event_file *file;
- struct trace_array *tr = dir->tr;
- char buf[2];
int set = 0;
- int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
+
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) ||
!trace_event_name(call) || !call->class || !call->class->reg)
continue;
- if (system && strcmp(call->class->system, system->name) != 0)
+ if (system && strcmp(call->class->system, system) != 0)
continue;
/*
@@ -1621,7 +1975,23 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
if (set == 3)
break;
}
- mutex_unlock(&event_mutex);
+
+ return set;
+}
+
+static ssize_t
+system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ const char set_to_char[4] = { '?', '0', '1', 'X' };
+ struct trace_subsystem_dir *dir = filp->private_data;
+ struct event_subsystem *system = dir->subsystem;
+ struct trace_array *tr = dir->tr;
+ char buf[2];
+ int set;
+ int ret;
+
+ set = trace_events_enabled(tr, system ? system->name : NULL);
buf[0] = set_to_char[set];
buf[1] = '\n';
@@ -1659,7 +2029,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (system)
name = system->name;
- ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
+ ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val, NULL);
if (ret)
goto out;
@@ -1817,12 +2187,12 @@ static int trace_format_open(struct inode *inode, struct file *file)
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- int id = (long)event_file_data(filp);
+ /* id is directly in i_private and available for inode's lifetime. */
+ int id = (long)file_inode(filp)->i_private;
char buf[32];
int len;
- if (unlikely(!id))
- return -ENODEV;
+ WARN_ON(!id);
len = sprintf(buf, "%d\n", id);
@@ -1841,7 +2211,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
if (*ppos)
return 0;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kmalloc_obj(*s);
if (!s)
return -ENOMEM;
@@ -1880,12 +2250,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
file = event_file_file(filp);
- if (file) {
- if (file->flags & EVENT_FILE_FL_FREED)
- err = -ENODEV;
- else
- err = apply_event_filter(file, buf);
- }
+ if (file)
+ err = apply_event_filter(file, buf);
mutex_unlock(&event_mutex);
kfree(buf);
@@ -1906,7 +2272,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
struct event_subsystem *system = NULL;
int ret;
- if (tracing_is_disabled())
+ if (unlikely(tracing_disabled))
return -ENODEV;
/* Make sure the system still exists */
@@ -1955,7 +2321,7 @@ static int system_tr_open(struct inode *inode, struct file *filp)
int ret;
/* Make a temporary dir that has no system but points to tr */
- dir = kzalloc(sizeof(*dir), GFP_KERNEL);
+ dir = kzalloc_obj(*dir);
if (!dir)
return -ENOMEM;
@@ -2001,7 +2367,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
if (*ppos)
return 0;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kmalloc_obj(*s);
if (!s)
return -ENOMEM;
@@ -2051,7 +2417,7 @@ show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *
if (*ppos)
return 0;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kmalloc_obj(*s);
if (!s)
return -ENOMEM;
@@ -2075,7 +2441,7 @@ show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t
if (*ppos)
return 0;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kmalloc_obj(*s);
if (!s)
return -ENOMEM;
@@ -2157,7 +2523,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
if (ret < 0)
return ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
if (type == TRACE_PIDS) {
filtered_pids = rcu_dereference_protected(tr->filtered_pids,
@@ -2173,7 +2539,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
if (ret < 0)
- goto out;
+ return ret;
if (type == TRACE_PIDS)
rcu_assign_pointer(tr->filtered_pids, pid_list);
@@ -2198,11 +2564,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
*/
on_each_cpu(ignore_task_cpu, tr, 1);
- out:
- mutex_unlock(&event_mutex);
-
- if (ret > 0)
- *ppos += ret;
+ *ppos += ret;
return ret;
}
@@ -2223,6 +2585,8 @@ ftrace_event_npid_write(struct file *filp, const char __user *ubuf,
static int ftrace_event_avail_open(struct inode *inode, struct file *file);
static int ftrace_event_set_open(struct inode *inode, struct file *file);
+static int ftrace_event_show_filters_open(struct inode *inode, struct file *file);
+static int ftrace_event_show_triggers_open(struct inode *inode, struct file *file);
static int ftrace_event_set_pid_open(struct inode *inode, struct file *file);
static int ftrace_event_set_npid_open(struct inode *inode, struct file *file);
static int ftrace_event_release(struct inode *inode, struct file *file);
@@ -2237,7 +2601,21 @@ static const struct seq_operations show_event_seq_ops = {
static const struct seq_operations show_set_event_seq_ops = {
.start = s_start,
.next = s_next,
- .show = t_show,
+ .show = s_show,
+ .stop = s_stop,
+};
+
+static const struct seq_operations show_show_event_filters_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .show = t_show_filters,
+ .stop = t_stop,
+};
+
+static const struct seq_operations show_show_event_triggers_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .show = t_show_triggers,
.stop = t_stop,
};
@@ -2270,6 +2648,20 @@ static const struct file_operations ftrace_set_event_fops = {
.release = ftrace_event_release,
};
+static const struct file_operations ftrace_show_event_filters_fops = {
+ .open = ftrace_event_show_filters_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations ftrace_show_event_triggers_fops = {
+ .open = ftrace_event_show_triggers_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
static const struct file_operations ftrace_set_event_pid_fops = {
.open = ftrace_event_set_pid_open,
.read = seq_read,
@@ -2414,6 +2806,34 @@ ftrace_event_set_open(struct inode *inode, struct file *file)
return ret;
}
+/**
+ * ftrace_event_show_filters_open - open interface for set_event_filters
+ * @inode: The inode of the file
+ * @file: The file being opened
+ *
+ * Connects the set_event_filters file to the sequence operations
+ * required to iterate over and display active event filters.
+ */
+static int
+ftrace_event_show_filters_open(struct inode *inode, struct file *file)
+{
+ return ftrace_event_open(inode, file, &show_show_event_filters_seq_ops);
+}
+
+/**
+ * ftrace_event_show_triggers_open - open interface for show_event_triggers
+ * @inode: The inode of the file
+ * @file: The file being opened
+ *
+ * Connects the show_event_triggers file to the sequence operations
+ * required to iterate over and display active event triggers.
+ */
+static int
+ftrace_event_show_triggers_open(struct inode *inode, struct file *file)
+{
+ return ftrace_event_open(inode, file, &show_show_event_triggers_seq_ops);
+}
+
static int
ftrace_event_set_pid_open(struct inode *inode, struct file *file)
{
@@ -2462,7 +2882,7 @@ create_new_subsystem(const char *name)
struct event_subsystem *system;
/* need to create new entry */
- system = kmalloc(sizeof(*system), GFP_KERNEL);
+ system = kmalloc_obj(*system);
if (!system)
return NULL;
@@ -2473,7 +2893,7 @@ create_new_subsystem(const char *name)
if (!system->name)
goto out_free;
- system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+ system->filter = kzalloc_obj(struct event_filter);
if (!system->filter)
goto out_free;
@@ -2541,7 +2961,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
}
}
- dir = kmalloc(sizeof(*dir), GFP_KERNEL);
+ dir = kmalloc_obj(*dir);
if (!dir)
goto out_fail;
@@ -2552,8 +2972,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
- /* ftrace only has directories no files */
- if (strcmp(name, "ftrace") == 0)
+ /* ftrace only has directories no files, readonly instance too. */
+ if (strcmp(name, "ftrace") == 0 || trace_array_is_readonly(tr))
nr_entries = 0;
else
nr_entries = ARRAY_SIZE(system_entries);
@@ -2718,28 +3138,30 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
int ret;
static struct eventfs_entry event_entries[] = {
{
- .name = "enable",
+ .name = "format",
.callback = event_callback,
- .release = event_release,
},
+#ifdef CONFIG_PERF_EVENTS
{
- .name = "filter",
+ .name = "id",
.callback = event_callback,
},
+#endif
+#define NR_RO_EVENT_ENTRIES (1 + IS_ENABLED(CONFIG_PERF_EVENTS))
+/* Readonly files must be above this line and counted by NR_RO_EVENT_ENTRIES. */
{
- .name = "trigger",
+ .name = "enable",
.callback = event_callback,
+ .release = event_release,
},
{
- .name = "format",
+ .name = "filter",
.callback = event_callback,
},
-#ifdef CONFIG_PERF_EVENTS
{
- .name = "id",
+ .name = "trigger",
.callback = event_callback,
},
-#endif
#ifdef CONFIG_HIST_TRIGGERS
{
.name = "hist",
@@ -2772,7 +3194,10 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
if (!e_events)
return -ENOMEM;
- nr_entries = ARRAY_SIZE(event_entries);
+ if (trace_array_is_readonly(tr))
+ nr_entries = NR_RO_EVENT_ENTRIES;
+ else
+ nr_entries = ARRAY_SIZE(event_entries);
name = trace_event_name(call);
ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file);
@@ -2870,7 +3295,10 @@ __register_event(struct trace_event_call *call, struct module *mod)
if (ret < 0)
return ret;
+ down_write(&trace_event_sem);
list_add(&call->list, &ftrace_events);
+ up_write(&trace_event_sem);
+
if (call->flags & TRACE_EVENT_FL_DYNAMIC)
atomic_set(&call->refcnt, 0);
else
@@ -2981,7 +3409,7 @@ static void add_str_to_module(struct module *module, char *str)
{
struct module_string *modstr;
- modstr = kmalloc(sizeof(*modstr), GFP_KERNEL);
+ modstr = kmalloc_obj(*modstr);
/*
* If we failed to allocate memory here, then we'll just
@@ -2998,43 +3426,120 @@ static void add_str_to_module(struct module *module, char *str)
list_add(&modstr->next, &module_strings);
}
+#define ATTRIBUTE_STR "__attribute__("
+#define ATTRIBUTE_STR_LEN (sizeof(ATTRIBUTE_STR) - 1)
+
+/* Remove all __attribute__() from @type. Return allocated string or @type. */
+static char *sanitize_field_type(const char *type)
+{
+ char *attr, *tmp, *next, *ret = (char *)type;
+ int depth;
+
+ next = (char *)type;
+ while ((attr = strstr(next, ATTRIBUTE_STR))) {
+ /* Retry if "__attribute__(" is a part of another word. */
+ if (attr != next && !isspace(attr[-1])) {
+ next = attr + ATTRIBUTE_STR_LEN;
+ continue;
+ }
+
+ if (ret == type) {
+ ret = kstrdup(type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!ret))
+ return NULL;
+ attr = ret + (attr - type);
+ }
+
+ /* the ATTRIBUTE_STR already has the first '(' */
+ depth = 1;
+ next = attr + ATTRIBUTE_STR_LEN;
+ do {
+ tmp = strpbrk(next, "()");
+ /* There is unbalanced parentheses */
+ if (WARN_ON_ONCE(!tmp)) {
+ kfree(ret);
+ return (char *)type;
+ }
+
+ if (*tmp == '(')
+ depth++;
+ else
+ depth--;
+ next = tmp + 1;
+ } while (depth > 0);
+ next = skip_spaces(next);
+ strcpy(attr, next);
+ next = attr;
+ }
+ return ret;
+}
+
+static char *find_replacable_eval(const char *type, const char *eval_string,
+ int len)
+{
+ char *ptr;
+
+ if (!eval_string)
+ return NULL;
+
+ ptr = strchr(type, '[');
+ if (!ptr)
+ return NULL;
+ ptr++;
+
+ if (!isalpha(*ptr) && *ptr != '_')
+ return NULL;
+
+ if (strncmp(eval_string, ptr, len) != 0)
+ return NULL;
+
+ return ptr;
+}
+
static void update_event_fields(struct trace_event_call *call,
struct trace_eval_map *map)
{
struct ftrace_event_field *field;
+ const char *eval_string = NULL;
struct list_head *head;
+ int len = 0;
char *ptr;
char *str;
- int len = strlen(map->eval_string);
/* Dynamic events should never have field maps */
- if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
+ if (call->flags & TRACE_EVENT_FL_DYNAMIC)
return;
+ if (map) {
+ eval_string = map->eval_string;
+ len = strlen(map->eval_string);
+ }
+
head = trace_get_fields(call);
list_for_each_entry(field, head, link) {
- ptr = strchr(field->type, '[');
- if (!ptr)
- continue;
- ptr++;
-
- if (!isalpha(*ptr) && *ptr != '_')
- continue;
+ str = sanitize_field_type(field->type);
+ if (!str)
+ return;
- if (strncmp(map->eval_string, ptr, len) != 0)
- continue;
+ ptr = find_replacable_eval(str, eval_string, len);
+ if (ptr) {
+ if (str == field->type) {
+ str = kstrdup(field->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!str))
+ return;
+ ptr = str + (ptr - field->type);
+ }
- str = kstrdup(field->type, GFP_KERNEL);
- if (WARN_ON_ONCE(!str))
- return;
- ptr = str + (ptr - field->type);
- ptr = eval_replace(ptr, map, len);
- /* enum/sizeof string smaller than value */
- if (WARN_ON_ONCE(!ptr)) {
- kfree(str);
- continue;
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
+ if (WARN_ON_ONCE(!ptr)) {
+ kfree(str);
+ continue;
+ }
}
+ if (str == field->type)
+ continue;
/*
* If the event is part of a module, then we need to free the string
* when the module is removed. Otherwise, it will stay allocated
@@ -3044,14 +3549,18 @@ static void update_event_fields(struct trace_event_call *call,
add_str_to_module(call->module, str);
field->type = str;
+ if (field->filter_type == FILTER_OTHER)
+ field->filter_type = filter_assign_type(field->type);
}
}
-void trace_event_eval_update(struct trace_eval_map **map, int len)
+/* Update all events for replacing eval and sanitizing */
+void trace_event_update_all(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
bool first = false;
+ bool updated;
int last_i;
int i;
@@ -3064,6 +3573,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
last_system = call->class->system;
}
+ updated = false;
/*
* Since calls are grouped by systems, the likelihood that the
* next call in the iteration belongs to the same system as the
@@ -3083,8 +3593,12 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
}
update_event_printk(call, map[i]);
update_event_fields(call, map[i]);
+ updated = true;
}
}
+ /* If not updated yet, update field for sanitizing. */
+ if (!updated)
+ update_event_fields(call, NULL);
cond_resched();
}
up_write(&trace_event_sem);
@@ -3111,6 +3625,20 @@ static bool event_in_systems(struct trace_event_call *call,
return !*p || isspace(*p) || *p == ',';
}
+#ifdef CONFIG_HIST_TRIGGERS
+/*
+ * Wake up waiter on the hist_poll_wq from irq_work because the hist trigger
+ * may happen in any context.
+ */
+static void hist_poll_event_irq_work(struct irq_work *work)
+{
+ wake_up_all(&hist_poll_wq);
+}
+
+DEFINE_IRQ_WORK(hist_poll_work, hist_poll_event_irq_work);
+DECLARE_WAIT_QUEUE_HEAD(hist_poll_wq);
+#endif
+
static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
@@ -3155,20 +3683,27 @@ static struct boot_triggers {
} bootup_triggers[MAX_BOOT_TRIGGERS];
static char bootup_trigger_buf[COMMAND_LINE_SIZE];
+static int boot_trigger_buf_len;
static int nr_boot_triggers;
static __init int setup_trace_triggers(char *str)
{
char *trigger;
char *buf;
+ int len = boot_trigger_buf_len;
int i;
- strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
+ if (len >= COMMAND_LINE_SIZE)
+ return 1;
+
+ strscpy(bootup_trigger_buf + len, str, COMMAND_LINE_SIZE - len);
trace_set_ring_buffer_expanded(NULL);
disable_tracing_selftest("running event triggers");
- buf = bootup_trigger_buf;
- for (i = 0; i < MAX_BOOT_TRIGGERS; i++) {
+ buf = bootup_trigger_buf + len;
+ boot_trigger_buf_len += strlen(buf) + 1;
+
+ for (i = nr_boot_triggers; i < MAX_BOOT_TRIGGERS; i++) {
trigger = strsep(&buf, ",");
if (!trigger)
break;
@@ -3269,13 +3804,13 @@ int trace_add_event_call(struct trace_event_call *call)
int ret;
lockdep_assert_held(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
ret = __register_event(call, NULL);
- if (ret >= 0)
- __add_event_to_tracers(call);
+ if (ret < 0)
+ return ret;
- mutex_unlock(&trace_types_lock);
+ __add_event_to_tracers(call);
return ret;
}
EXPORT_SYMBOL_GPL(trace_add_event_call);
@@ -3304,7 +3839,7 @@ static int probe_remove_event_call(struct trace_event_call *call)
continue;
/*
* We can't rely on ftrace_event_enable_disable(enable => 0)
- * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress
+ * we are going to do, soft mode can suppress
* TRACE_REG_UNREGISTER.
*/
if (file->flags & EVENT_FILE_FL_ENABLED)
@@ -3355,6 +3890,28 @@ EXPORT_SYMBOL_GPL(trace_remove_event_call);
event++)
#ifdef CONFIG_MODULES
+static void update_mod_cache(struct trace_array *tr, struct module *mod)
+{
+ struct event_mod_load *event_mod, *n;
+
+ list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) {
+ if (strcmp(event_mod->module, mod->name) != 0)
+ continue;
+
+ __ftrace_set_clr_event_nolock(tr, event_mod->match,
+ event_mod->system,
+ event_mod->event, 1, mod->name);
+ free_event_mod(event_mod);
+ }
+}
+
+static void update_cache_events(struct module *mod)
+{
+ struct trace_array *tr;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list)
+ update_mod_cache(tr, mod);
+}
static void trace_module_add_events(struct module *mod)
{
@@ -3377,6 +3934,8 @@ static void trace_module_add_events(struct module *mod)
__register_event(*call, mod);
__add_event_to_tracers(*call);
}
+
+ update_cache_events(mod);
}
static void trace_module_remove_events(struct module *mod)
@@ -3391,7 +3950,7 @@ static void trace_module_remove_events(struct module *mod)
if (call->module == mod)
__trace_remove_event_call(call);
}
- /* Check for any strings allocade for this module */
+ /* Check for any strings allocated for this module */
list_for_each_entry_safe(modstr, m, &module_strings, next) {
if (modstr->module != mod)
continue;
@@ -3446,6 +4005,8 @@ __trace_add_event_dirs(struct trace_array *tr)
struct trace_event_call *call;
int ret;
+ lockdep_assert_held(&trace_event_sem);
+
list_for_each_entry(call, &ftrace_events, list) {
ret = __trace_add_new_event(call, tr);
if (ret < 0)
@@ -3529,30 +4090,21 @@ struct trace_event_file *trace_get_event_file(const char *instance,
return ERR_PTR(ret);
}
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
file = find_event_file(tr, system, event);
if (!file) {
trace_array_put(tr);
- ret = -EINVAL;
- goto out;
+ return ERR_PTR(-EINVAL);
}
/* Don't let event modules unload while in use */
ret = trace_event_try_get_ref(file->event_call);
if (!ret) {
trace_array_put(tr);
- ret = -EBUSY;
- goto out;
+ return ERR_PTR(-EBUSY);
}
- ret = 0;
- out:
- mutex_unlock(&event_mutex);
-
- if (ret)
- file = ERR_PTR(ret);
-
return file;
}
EXPORT_SYMBOL_GPL(trace_get_event_file);
@@ -3577,11 +4129,6 @@ void trace_put_event_file(struct trace_event_file *file)
EXPORT_SYMBOL_GPL(trace_put_event_file);
#ifdef CONFIG_DYNAMIC_FTRACE
-
-/* Avoid typos */
-#define ENABLE_EVENT_STR "enable_event"
-#define DISABLE_EVENT_STR "disable_event"
-
struct event_probe_data {
struct trace_event_file *file;
unsigned long count;
@@ -3702,7 +4249,7 @@ static int free_probe_data(void *data)
edata->ref--;
if (!edata->ref) {
- /* Remove the SOFT_MODE flag */
+ /* Remove soft mode */
__ftrace_event_enable_disable(edata->file, 0, 1);
trace_event_put_ref(edata->file->event_call);
kfree(edata);
@@ -3770,6 +4317,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
struct trace_event_file *file;
struct ftrace_probe_ops *ops;
struct event_probe_data *data;
+ unsigned long count = -1;
const char *system;
const char *event;
char *number;
@@ -3789,12 +4337,11 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
event = strsep(&param, ":");
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- ret = -EINVAL;
file = find_event_file(tr, system, event);
if (!file)
- goto out;
+ return -EINVAL;
enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
@@ -3803,74 +4350,62 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
else
ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
- if (glob[0] == '!') {
- ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
- goto out;
- }
-
- ret = -ENOMEM;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- goto out;
-
- data->enable = enable;
- data->count = -1;
- data->file = file;
+ if (glob[0] == '!')
+ return unregister_ftrace_function_probe_func(glob+1, tr, ops);
- if (!param)
- goto out_reg;
+ if (param) {
+ number = strsep(&param, ":");
- number = strsep(&param, ":");
+ if (!strlen(number))
+ return -EINVAL;
- ret = -EINVAL;
- if (!strlen(number))
- goto out_free;
-
- /*
- * We use the callback data field (which is a pointer)
- * as our counter.
- */
- ret = kstrtoul(number, 0, &data->count);
- if (ret)
- goto out_free;
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, &count);
+ if (ret)
+ return ret;
+ }
- out_reg:
/* Don't let event modules unload while probe registered */
ret = trace_event_try_get_ref(file->event_call);
- if (!ret) {
- ret = -EBUSY;
- goto out_free;
- }
+ if (!ret)
+ return -EBUSY;
ret = __ftrace_event_enable_disable(file, 1, 1);
if (ret < 0)
goto out_put;
+ ret = -ENOMEM;
+ data = kzalloc_obj(*data);
+ if (!data)
+ goto out_put;
+
+ data->enable = enable;
+ data->count = count;
+ data->file = file;
+
ret = register_ftrace_function_probe(glob, tr, ops, data);
/*
* The above returns on success the # of functions enabled,
* but if it didn't find any functions it returns zero.
* Consider no functions a failure too.
*/
- if (!ret) {
- ret = -ENOENT;
- goto out_disable;
- } else if (ret < 0)
- goto out_disable;
+
/* Just return zero, not the number of enabled functions */
- ret = 0;
- out:
- mutex_unlock(&event_mutex);
- return ret;
+ if (ret > 0)
+ return 0;
+
+ kfree(data);
+
+ if (!ret)
+ ret = -ENOENT;
- out_disable:
__ftrace_event_enable_disable(file, 0, 1);
out_put:
trace_event_put_ref(file->event_call);
- out_free:
- kfree(data);
- goto out;
+ return ret;
}
static struct ftrace_func_command event_enable_cmd = {
@@ -3969,7 +4504,11 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
static __init int setup_trace_event(char *str)
{
- strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
+ if (bootup_event_buf[0] != '\0')
+ strlcat(bootup_event_buf, ",", COMMAND_LINE_SIZE);
+
+ strlcat(bootup_event_buf, str, COMMAND_LINE_SIZE);
+
trace_set_ring_buffer_expanded(NULL);
disable_tracing_selftest("running event tracing");
@@ -4008,25 +4547,44 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
int nr_entries;
static struct eventfs_entry events_entries[] = {
{
- .name = "enable",
+ .name = "header_page",
.callback = events_callback,
},
{
- .name = "header_page",
+ .name = "header_event",
.callback = events_callback,
},
+#define NR_RO_TOP_ENTRIES 2
+/* Readonly files must be above this line and counted by NR_RO_TOP_ENTRIES. */
{
- .name = "header_event",
+ .name = "enable",
.callback = events_callback,
},
};
- entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
- tr, &ftrace_set_event_fops);
- if (!entry)
- return -ENOMEM;
+ if (!trace_array_is_readonly(tr)) {
+ entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
+ tr, &ftrace_set_event_fops);
+ if (!entry)
+ return -ENOMEM;
- nr_entries = ARRAY_SIZE(events_entries);
+ /* There are not as crucial, just warn if they are not created */
+ trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr,
+ &ftrace_show_event_filters_fops);
+
+ trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr,
+ &ftrace_show_event_triggers_fops);
+
+ trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
+ tr, &ftrace_set_event_pid_fops);
+
+ trace_create_file("set_event_notrace_pid",
+ TRACE_MODE_WRITE, parent, tr,
+ &ftrace_set_event_notrace_pid_fops);
+ nr_entries = ARRAY_SIZE(events_entries);
+ } else {
+ nr_entries = NR_RO_TOP_ENTRIES;
+ }
e_events = eventfs_create_events_dir("events", parent, events_entries,
nr_entries, tr);
@@ -4035,15 +4593,6 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
return -ENOMEM;
}
- /* There are not as crucial, just warn if they are not created */
-
- trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
- tr, &ftrace_set_event_pid_fops);
-
- trace_create_file("set_event_notrace_pid",
- TRACE_MODE_WRITE, parent, tr,
- &ftrace_set_event_notrace_pid_fops);
-
tr->event_dir = e_events;
return 0;
@@ -4093,20 +4642,17 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
{
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
ret = create_event_toplevel_files(parent, tr);
if (ret)
- goto out_unlock;
+ return ret;
down_write(&trace_event_sem);
__trace_early_add_event_dirs(tr);
up_write(&trace_event_sem);
- out_unlock:
- mutex_unlock(&event_mutex);
-
- return ret;
+ return 0;
}
/* Must be called with event_mutex held */
@@ -4121,7 +4667,7 @@ int event_trace_del_tracer(struct trace_array *tr)
__ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS);
/* Disable any running events */
- __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
+ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0, NULL);
/* Make sure no more events are being executed */
tracepoint_synchronize_unregister();
@@ -4143,26 +4689,22 @@ static __init int event_trace_memsetup(void)
return 0;
}
-__init void
-early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
+/*
+ * Helper function to enable or disable a comma-separated list of events
+ * from the bootup buffer.
+ */
+static __init void __early_set_events(struct trace_array *tr, char *buf, bool enable)
{
char *token;
- int ret;
-
- while (true) {
- token = strsep(&buf, ",");
-
- if (!token)
- break;
+ while ((token = strsep(&buf, ","))) {
if (*token) {
- /* Restarting syscalls requires that we stop them first */
- if (disable_first)
+ if (enable) {
+ if (ftrace_set_clr_event(tr, token, 1))
+ pr_warn("Failed to enable trace event: %s\n", token);
+ } else {
ftrace_set_clr_event(tr, token, 0);
-
- ret = ftrace_set_clr_event(tr, token, 1);
- if (ret)
- pr_warn("Failed to enable trace event: %s\n", token);
+ }
}
/* Put back the comma to allow this to be called again */
@@ -4171,6 +4713,32 @@ early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
}
}
+/**
+ * early_enable_events - enable events from the bootup buffer
+ * @tr: The trace array to enable the events in
+ * @buf: The buffer containing the comma separated list of events
+ * @disable_first: If true, disable all events in @buf before enabling them
+ *
+ * This function enables events from the bootup buffer. If @disable_first
+ * is true, it will first disable all events in the buffer before enabling
+ * them.
+ *
+ * For syscall events, which rely on a global refcount to register the
+ * SYSCALL_WORK_SYSCALL_TRACEPOINT flag (especially for pid 1), we must
+ * ensure the refcount hits zero before re-enabling them. A simple
+ * "disable then enable" per-event is not enough if multiple syscalls are
+ * used, as the refcount will stay above zero. Thus, we need a two-phase
+ * approach: disable all, then enable all.
+ */
+__init void
+early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
+{
+ if (disable_first)
+ __early_set_events(tr, buf, false);
+
+ __early_set_events(tr, buf, true);
+}
+
static __init int event_trace_enable(void)
{
struct trace_array *tr = top_trace_array();
@@ -4405,7 +4973,7 @@ static __init void event_trace_self_tests(void)
pr_info("Testing event system %s: ", system->name);
- ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error enabling system %s\n",
system->name);
@@ -4414,7 +4982,7 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
- ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error disabling system %s\n",
system->name);
@@ -4429,7 +4997,7 @@ static __init void event_trace_self_tests(void)
pr_info("Running tests on all trace events:\n");
pr_info("Testing all events: ");
- ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error enabling all events\n");
return;
@@ -4438,7 +5006,7 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
/* reset sysname */
- ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error disabling all events\n");
return;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 78051de581e7..609325f57942 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -142,7 +142,7 @@ static bool is_not(const char *str)
}
/**
- * struct prog_entry - a singe entry in the filter program
+ * struct prog_entry - a single entry in the filter program
* @target: Index to jump to on a branch (actually one minus the index)
* @when_to_branch: The value of the result of the predicate to do a branch
* @pred: The predicate to execute.
@@ -485,10 +485,10 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
nr_preds += 2; /* For TRUE and FALSE */
- op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL);
+ op_stack = kmalloc_objs(*op_stack, nr_parens);
if (!op_stack)
return ERR_PTR(-ENOMEM);
- prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL);
+ prog_stack = kzalloc_objs(*prog_stack, nr_preds);
if (!prog_stack) {
parse_error(pe, -ENOMEM, 0);
goto out_free;
@@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str)
kstr = ubuf->buffer;
/* For safety, do not trust the string pointer */
- if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE))
+ if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0)
return NULL;
return kstr;
}
@@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str)
/* user space address? */
ustr = (char __user *)str;
- if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE))
+ if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0)
return NULL;
return kstr;
@@ -1213,7 +1213,7 @@ static void append_filter_err(struct trace_array *tr,
if (WARN_ON(!filter->filter_string))
return;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kmalloc_obj(*s);
if (!s)
return;
trace_seq_init(s);
@@ -1250,7 +1250,9 @@ static void append_filter_err(struct trace_array *tr,
static inline struct event_filter *event_filter(struct trace_event_file *file)
{
- return file->filter;
+ return rcu_dereference_protected(file->filter,
+ lockdep_is_held(&event_mutex));
+
}
/* caller must hold event_mutex */
@@ -1320,7 +1322,7 @@ void free_event_filter(struct event_filter *filter)
static inline void __remove_filter(struct trace_event_file *file)
{
filter_disable(file);
- remove_filter_string(file->filter);
+ remove_filter_string(event_filter(file));
}
static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
@@ -1335,22 +1337,149 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
}
}
+struct filter_list {
+ struct list_head list;
+ struct event_filter *filter;
+};
+
+struct filter_head {
+ struct list_head list;
+ union {
+ struct rcu_head rcu;
+ struct rcu_work rwork;
+ };
+};
+
+static void free_filter_list(struct filter_head *filter_list)
+{
+ struct filter_list *filter_item, *tmp;
+
+ list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) {
+ __free_filter(filter_item->filter);
+ list_del(&filter_item->list);
+ kfree(filter_item);
+ }
+ kfree(filter_list);
+}
+
+static void free_filter_list_work(struct work_struct *work)
+{
+ struct filter_head *filter_list;
+
+ filter_list = container_of(to_rcu_work(work), struct filter_head, rwork);
+ free_filter_list(filter_list);
+}
+
+static void free_filter_list_tasks(struct rcu_head *rhp)
+{
+ struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
+
+ INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work);
+ queue_rcu_work(system_dfl_wq, &filter_list->rwork);
+}
+
+/*
+ * The tracepoint_synchronize_unregister() is a double rcu call.
+ * It calls synchronize_rcu_tasks_trace() followed by synchronize_rcu().
+ * Instead of waiting for it, simply call these via the call_rcu*()
+ * variants.
+ */
+static void delay_free_filter(struct filter_head *head)
+{
+ call_rcu_tasks_trace(&head->rcu, free_filter_list_tasks);
+}
+
+static void try_delay_free_filter(struct event_filter *filter)
+{
+ struct filter_head *head;
+ struct filter_list *item;
+
+ head = kmalloc_obj(*head);
+ if (!head)
+ goto free_now;
+
+ INIT_LIST_HEAD(&head->list);
+
+ item = kmalloc_obj(*item);
+ if (!item) {
+ kfree(head);
+ goto free_now;
+ }
+
+ item->filter = filter;
+ list_add_tail(&item->list, &head->list);
+ delay_free_filter(head);
+ return;
+
+ free_now:
+ /* Make sure the filter is not being used */
+ tracepoint_synchronize_unregister();
+ __free_filter(filter);
+}
+
static inline void __free_subsystem_filter(struct trace_event_file *file)
{
- __free_filter(file->filter);
+ __free_filter(event_filter(file));
file->filter = NULL;
}
+static inline void event_set_filter(struct trace_event_file *file,
+ struct event_filter *filter)
+{
+ rcu_assign_pointer(file->filter, filter);
+}
+
+static inline void event_clear_filter(struct trace_event_file *file)
+{
+ RCU_INIT_POINTER(file->filter, NULL);
+}
+
static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
- struct trace_array *tr)
+ struct trace_array *tr,
+ struct event_filter *filter)
{
struct trace_event_file *file;
+ struct filter_head *head;
+ struct filter_list *item;
+
+ head = kmalloc_obj(*head);
+ if (!head)
+ goto free_now;
+
+ INIT_LIST_HEAD(&head->list);
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir)
continue;
+ item = kmalloc_obj(*item);
+ if (!item)
+ goto free_now;
+ item->filter = event_filter(file);
+ list_add_tail(&item->list, &head->list);
+ event_clear_filter(file);
+ }
+
+ item = kmalloc_obj(*item);
+ if (!item)
+ goto free_now;
+
+ item->filter = filter;
+ list_add_tail(&item->list, &head->list);
+
+ delay_free_filter(head);
+ return;
+ free_now:
+ tracepoint_synchronize_unregister();
+
+ if (head)
+ free_filter_list(head);
+
+ list_for_each_entry(file, &tr->events, list) {
+ if (file->system != dir || !file->filter)
+ continue;
__free_subsystem_filter(file);
}
+ __free_filter(filter);
}
int filter_assign_type(const char *type)
@@ -1579,7 +1708,7 @@ static int parse_pred(const char *str, void *data,
s = i;
- pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+ pred = kzalloc_obj(*pred);
if (!pred)
return -ENOMEM;
@@ -1690,7 +1819,7 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL);
+ pred->regex = kzalloc_obj(*pred->regex);
if (!pred->regex)
goto err_mem;
pred->regex->len = len;
@@ -1855,7 +1984,7 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL);
+ pred->regex = kzalloc_obj(*pred->regex);
if (!pred->regex)
goto err_mem;
pred->regex->len = len;
@@ -2120,22 +2249,6 @@ static inline void event_set_filtered_flag(struct trace_event_file *file)
trace_buffered_event_enable();
}
-static inline void event_set_filter(struct trace_event_file *file,
- struct event_filter *filter)
-{
- rcu_assign_pointer(file->filter, filter);
-}
-
-static inline void event_clear_filter(struct trace_event_file *file)
-{
- RCU_INIT_POINTER(file->filter, NULL);
-}
-
-struct filter_list {
- struct list_head list;
- struct event_filter *filter;
-};
-
static int process_system_preds(struct trace_subsystem_dir *dir,
struct trace_array *tr,
struct filter_parse_error *pe,
@@ -2144,17 +2257,22 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
struct trace_event_file *file;
struct filter_list *filter_item;
struct event_filter *filter = NULL;
- struct filter_list *tmp;
- LIST_HEAD(filter_list);
+ struct filter_head *filter_list;
bool fail = true;
int err;
+ filter_list = kmalloc_obj(*filter_list);
+ if (!filter_list)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&filter_list->list);
+
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir)
continue;
- filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ filter = kzalloc_obj(*filter);
if (!filter)
goto fail_mem;
@@ -2171,11 +2289,11 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
event_set_filtered_flag(file);
- filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
+ filter_item = kzalloc_obj(*filter_item);
if (!filter_item)
goto fail_mem;
- list_add_tail(&filter_item->list, &filter_list);
+ list_add_tail(&filter_item->list, &filter_list->list);
/*
* Regardless of if this returned an error, we still
* replace the filter for the call.
@@ -2195,31 +2313,22 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
* Do a synchronize_rcu() and to ensure all calls are
* done with them before we free them.
*/
- tracepoint_synchronize_unregister();
- list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
- __free_filter(filter_item->filter);
- list_del(&filter_item->list);
- kfree(filter_item);
- }
+ delay_free_filter(filter_list);
return 0;
fail:
/* No call succeeded */
- list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
- list_del(&filter_item->list);
- kfree(filter_item);
- }
+ free_filter_list(filter_list);
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
__free_filter(filter);
+
/* If any call succeeded, we still need to sync */
if (!fail)
- tracepoint_synchronize_unregister();
- list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
- __free_filter(filter_item->filter);
- list_del(&filter_item->list);
- kfree(filter_item);
- }
+ delay_free_filter(filter_list);
+ else
+ free_filter_list(filter_list);
+
return -ENOMEM;
}
@@ -2234,14 +2343,14 @@ static int create_filter_start(char *filter_string, bool set_str,
if (WARN_ON_ONCE(*pse || *filterp))
return -EINVAL;
- filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ filter = kzalloc_obj(*filter);
if (filter && set_str) {
filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
if (!filter->filter_string)
err = -ENOMEM;
}
- pe = kzalloc(sizeof(*pe), GFP_KERNEL);
+ pe = kzalloc_obj(*pe);
if (!filter || !pe || err) {
kfree(pe);
@@ -2361,9 +2470,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
event_clear_filter(file);
- /* Make sure the filter is not being used */
- tracepoint_synchronize_unregister();
- __free_filter(filter);
+ try_delay_free_filter(filter);
return 0;
}
@@ -2387,11 +2494,8 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
event_set_filter(file, filter);
- if (tmp) {
- /* Make sure the call is done with the filter */
- tracepoint_synchronize_unregister();
- __free_filter(tmp);
- }
+ if (tmp)
+ try_delay_free_filter(tmp);
}
return err;
@@ -2405,13 +2509,11 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
struct event_filter *filter = NULL;
int err = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
/* Make sure the system still has events */
- if (!dir->nr_events) {
- err = -ENODEV;
- goto out_unlock;
- }
+ if (!dir->nr_events)
+ return -ENODEV;
if (!strcmp(strstrip(filter_string), "0")) {
filter_free_subsystem_preds(dir, tr);
@@ -2419,10 +2521,8 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
filter = system->filter;
system->filter = NULL;
/* Ensure all filters are no longer used */
- tracepoint_synchronize_unregister();
- filter_free_subsystem_filters(dir, tr);
- __free_filter(filter);
- goto out_unlock;
+ filter_free_subsystem_filters(dir, tr, filter);
+ return 0;
}
err = create_system_filter(dir, filter_string, &filter);
@@ -2434,8 +2534,6 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
__free_filter(system->filter);
system->filter = filter;
}
-out_unlock:
- mutex_unlock(&event_mutex);
return err;
}
@@ -2612,17 +2710,15 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
struct event_filter *filter = NULL;
struct trace_event_call *call;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
call = event->tp_event;
- err = -EINVAL;
if (!call)
- goto out_unlock;
+ return -EINVAL;
- err = -EEXIST;
if (event->filter)
- goto out_unlock;
+ return -EEXIST;
err = create_filter(NULL, call, filter_str, false, &filter);
if (err)
@@ -2637,9 +2733,6 @@ free_filter:
if (err || ftrace_event_is_function(call))
__free_filter(filter);
-out_unlock:
- mutex_unlock(&event_mutex);
-
return err;
}
@@ -2819,6 +2912,10 @@ static __init int ftrace_test_event_filter(void)
if (i == DATA_CNT)
printk(KERN_CONT "OK\n");
+ /* Need to call ftrace_test_filter to prevent a warning */
+ if (!trace_ftrace_test_filter_enabled())
+ trace_ftrace_test_filter(1, 2, 3, 4, 5, 6, 7, 8);
+
return 0;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 9c058aa8baf3..eb2c2bc8bc3d 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -105,37 +105,44 @@ enum field_op_id {
FIELD_OP_MULT,
};
+#define FIELD_FUNCS \
+ C(NOP, "nop"), \
+ C(VAR_REF, "var_ref"), \
+ C(COUNTER, "counter"), \
+ C(CONST, "const"), \
+ C(LOG2, "log2"), \
+ C(BUCKET, "bucket"), \
+ C(TIMESTAMP, "timestamp"), \
+ C(CPU, "cpu"), \
+ C(COMM, "comm"), \
+ C(STRING, "string"), \
+ C(DYNSTRING, "dynstring"), \
+ C(RELDYNSTRING, "reldynstring"), \
+ C(PSTRING, "pstring"), \
+ C(S64, "s64"), \
+ C(U64, "u64"), \
+ C(S32, "s32"), \
+ C(U32, "u32"), \
+ C(S16, "s16"), \
+ C(U16, "u16"), \
+ C(S8, "s8"), \
+ C(U8, "u8"), \
+ C(UMINUS, "uminus"), \
+ C(MINUS, "minus"), \
+ C(PLUS, "plus"), \
+ C(DIV, "div"), \
+ C(MULT, "mult"), \
+ C(DIV_POWER2, "div_power2"), \
+ C(DIV_NOT_POWER2, "div_not_power2"), \
+ C(DIV_MULT_SHIFT, "div_mult_shift"), \
+ C(EXECNAME, "execname"), \
+ C(STACK, "stack"),
+
+#undef C
+#define C(a, b) HIST_FIELD_FN_##a
+
enum hist_field_fn {
- HIST_FIELD_FN_NOP,
- HIST_FIELD_FN_VAR_REF,
- HIST_FIELD_FN_COUNTER,
- HIST_FIELD_FN_CONST,
- HIST_FIELD_FN_LOG2,
- HIST_FIELD_FN_BUCKET,
- HIST_FIELD_FN_TIMESTAMP,
- HIST_FIELD_FN_CPU,
- HIST_FIELD_FN_STRING,
- HIST_FIELD_FN_DYNSTRING,
- HIST_FIELD_FN_RELDYNSTRING,
- HIST_FIELD_FN_PSTRING,
- HIST_FIELD_FN_S64,
- HIST_FIELD_FN_U64,
- HIST_FIELD_FN_S32,
- HIST_FIELD_FN_U32,
- HIST_FIELD_FN_S16,
- HIST_FIELD_FN_U16,
- HIST_FIELD_FN_S8,
- HIST_FIELD_FN_U8,
- HIST_FIELD_FN_UMINUS,
- HIST_FIELD_FN_MINUS,
- HIST_FIELD_FN_PLUS,
- HIST_FIELD_FN_DIV,
- HIST_FIELD_FN_MULT,
- HIST_FIELD_FN_DIV_POWER2,
- HIST_FIELD_FN_DIV_NOT_POWER2,
- HIST_FIELD_FN_DIV_MULT_SHIFT,
- HIST_FIELD_FN_EXECNAME,
- HIST_FIELD_FN_STACK,
+ FIELD_FUNCS
};
/*
@@ -506,6 +513,7 @@ enum hist_field_flags {
HIST_FIELD_FL_CONST = 1 << 18,
HIST_FIELD_FL_PERCENT = 1 << 19,
HIST_FIELD_FL_GRAPH = 1 << 20,
+ HIST_FIELD_FL_COMM = 1 << 21,
};
struct var_defs {
@@ -724,7 +732,7 @@ static struct track_data *track_data_alloc(unsigned int key_len,
struct action_data *action_data,
struct hist_trigger_data *hist_data)
{
- struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL);
+ struct track_data *data = kzalloc_obj(*data);
struct hist_elt_data *elt_data;
if (!data)
@@ -740,7 +748,7 @@ static struct track_data *track_data_alloc(unsigned int key_len,
data->action_data = action_data;
data->hist_data = hist_data;
- elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
+ elt_data = kzalloc_obj(*elt_data);
if (!elt_data) {
track_data_free(data);
return ERR_PTR(-ENOMEM);
@@ -885,6 +893,15 @@ static u64 hist_field_cpu(struct hist_field *hist_field,
return cpu;
}
+static u64 hist_field_comm(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ return (u64)(unsigned long)current->comm;
+}
+
/**
* check_field_for_var_ref - Check if a VAR_REF field references a variable
* @hist_field: The VAR_REF field to check
@@ -1069,7 +1086,7 @@ static int save_hist_vars(struct hist_trigger_data *hist_data)
if (tracing_check_open_get_tr(tr))
return -ENODEV;
- var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
+ var_data = kzalloc_obj(*var_data);
if (!var_data) {
trace_array_put(tr);
return -ENOMEM;
@@ -1338,17 +1355,22 @@ static const char *hist_field_name(struct hist_field *field,
field_name = hist_field_name(field->operands[0], ++level);
else if (field->flags & HIST_FIELD_FL_CPU)
field_name = "common_cpu";
+ else if (field->flags & HIST_FIELD_FL_COMM)
+ field_name = "common_comm";
else if (field->flags & HIST_FIELD_FL_EXPR ||
field->flags & HIST_FIELD_FL_VAR_REF) {
if (field->system) {
static char full_name[MAX_FILTER_STR_VAL];
+ static char *fmt;
+ int len;
- strcat(full_name, field->system);
- strcat(full_name, ".");
- strcat(full_name, field->event_name);
- strcat(full_name, ".");
- strcat(full_name, field->name);
- field_name = full_name;
+ fmt = field->flags & HIST_FIELD_FL_VAR_REF ? "%s.%s.$%s" : "%s.%s.%s";
+
+ len = snprintf(full_name, sizeof(full_name), fmt,
+ field->system, field->event_name,
+ field->name);
+ if (len < sizeof(full_name))
+ field_name = full_name;
} else
field_name = field->name;
} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
@@ -1529,7 +1551,7 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str)
struct hist_trigger_attrs *attrs;
int ret = 0;
- attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
+ attrs = kzalloc_obj(*attrs);
if (!attrs)
return ERR_PTR(-ENOMEM);
@@ -1627,7 +1649,7 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
struct hist_field *hist_field;
unsigned int i, n_str;
- elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
+ elt_data = kzalloc_obj(*elt_data);
if (!elt_data)
return -ENOMEM;
@@ -1721,9 +1743,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
static void expr_field_str(struct hist_field *field, char *expr)
{
- if (field->flags & HIST_FIELD_FL_VAR_REF)
- strcat(expr, "$");
- else if (field->flags & HIST_FIELD_FL_CONST) {
+ if (field->flags & HIST_FIELD_FL_VAR_REF) {
+ if (!field->system)
+ strcat(expr, "$");
+ } else if (field->flags & HIST_FIELD_FL_CONST) {
char str[HIST_CONST_DIGITS_MAX];
snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant);
@@ -1943,7 +1966,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (field && is_function_field(field))
return NULL;
- hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
+ hist_field = kzalloc_obj(struct hist_field);
if (!hist_field)
return NULL;
@@ -2015,6 +2038,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto out;
}
+ if (flags & HIST_FIELD_FL_COMM) {
+ hist_field->fn_num = HIST_FIELD_FN_COMM;
+ hist_field->size = MAX_FILTER_STR_VAL;
+ hist_field->type = "char[]";
+ goto out;
+ }
+
if (WARN_ON_ONCE(!field))
goto out;
@@ -2037,6 +2067,15 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING;
else
hist_field->fn_num = HIST_FIELD_FN_PSTRING;
+ } else if (field->filter_type == FILTER_STACKTRACE) {
+ flags |= HIST_FIELD_FL_STACKTRACE;
+
+ hist_field->size = MAX_FILTER_STR_VAL;
+ hist_field->type = kstrdup_const(field->type, GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
+
+ hist_field->fn_num = HIST_FIELD_FN_STACK;
} else {
hist_field->size = field->size;
hist_field->is_signed = field->is_signed;
@@ -2359,9 +2398,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
hist_data->attrs->ts_in_usecs = true;
} else if (strcmp(field_name, "common_stacktrace") == 0) {
*flags |= HIST_FIELD_FL_STACKTRACE;
- } else if (strcmp(field_name, "common_cpu") == 0)
+ } else if (strcmp(field_name, "common_cpu") == 0) {
*flags |= HIST_FIELD_FL_CPU;
- else if (strcmp(field_name, "hitcount") == 0)
+ } else if (strcmp(field_name, "common_comm") == 0) {
+ *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING;
+ } else if (strcmp(field_name, "hitcount") == 0)
*flags |= HIST_FIELD_FL_HITCOUNT;
else {
field = trace_find_event_field(file->event_call, field_name);
@@ -2377,6 +2418,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
*flags |= HIST_FIELD_FL_CPU;
} else if (field && field->filter_type == FILTER_STACKTRACE) {
*flags |= HIST_FIELD_FL_STACKTRACE;
+ } else if (field && field->filter_type == FILTER_COMM) {
+ *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING;
} else {
hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
errpos(field_name));
@@ -3010,7 +3053,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
if (!IS_ERR_OR_NULL(event_var))
return event_var;
- var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL);
+ var_hist = kzalloc_obj(*var_hist);
if (!var_hist)
return ERR_PTR(-ENOMEM);
@@ -3124,7 +3167,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
u64 var_val;
/* Make sure stacktrace can fit in the string variable length */
- BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) >= STR_VAR_LEN_MAX);
+ BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) > STR_VAR_LEN_MAX);
for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
struct field_var *field_var = field_vars[i];
@@ -3192,7 +3235,7 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data,
goto out;
}
- var = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
+ var = kzalloc_obj(struct hist_field);
if (!var) {
var = ERR_PTR(-ENOMEM);
goto out;
@@ -3248,14 +3291,16 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
var = create_var(hist_data, file, field_name, val->size, val->type);
if (IS_ERR(var)) {
hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name));
- kfree(val);
+ destroy_hist_field(val, 0);
ret = PTR_ERR(var);
goto err;
}
- field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL);
+ field_var = kzalloc_obj(struct field_var);
if (!field_var) {
- kfree(val);
+ destroy_hist_field(val, 0);
+ kfree_const(var->type);
+ kfree(var->var.name);
kfree(var);
ret = -ENOMEM;
goto err;
@@ -3790,7 +3835,7 @@ static struct action_data *track_data_parse(struct hist_trigger_data *hist_data,
int ret = -EINVAL;
char *var_str;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc_obj(*data);
if (!data)
return ERR_PTR(-ENOMEM);
@@ -4157,7 +4202,7 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
struct action_data *data;
int ret = -EINVAL;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc_obj(*data);
if (!data)
return ERR_PTR(-ENOMEM);
@@ -4327,6 +4372,8 @@ static u64 hist_fn_call(struct hist_field *hist_field,
return hist_field_timestamp(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_CPU:
return hist_field_cpu(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_COMM:
+ return hist_field_comm(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_STRING:
return hist_field_string(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_DYNSTRING:
@@ -5093,7 +5140,7 @@ create_hist_data(unsigned int map_bits,
struct hist_trigger_data *hist_data;
int ret = 0;
- hist_data = kzalloc(sizeof(*hist_data), GFP_KERNEL);
+ hist_data = kzalloc_obj(*hist_data);
if (!hist_data)
return ERR_PTR(-ENOMEM);
@@ -5212,22 +5259,25 @@ static inline void add_to_key(char *compound_key, void *key,
size_t size = key_field->size;
if (key_field->flags & HIST_FIELD_FL_STRING) {
- struct ftrace_event_field *field;
- field = key_field->field;
- if (field->filter_type == FILTER_DYN_STRING ||
- field->filter_type == FILTER_RDYN_STRING)
- size = *(u32 *)(rec + field->offset) >> 16;
- else if (field->filter_type == FILTER_STATIC_STRING)
- size = field->size;
+ if (key_field->flags & HIST_FIELD_FL_COMM) {
+ size = strlen((char *)key);
+ } else {
+ struct ftrace_event_field *field;
+
+ field = key_field->field;
+ if (field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_RDYN_STRING)
+ size = *(u32 *)(rec + field->offset) >> 16;
+ else if (field->filter_type == FILTER_STATIC_STRING)
+ size = field->size;
+ }
/* ensure NULL-termination */
if (size > key_field->size - 1)
size = key_field->size - 1;
-
- strncpy(compound_key + key_field->offset, (char *)key, size);
- } else
- memcpy(compound_key + key_field->offset, key, size);
+ }
+ memcpy(compound_key + key_field->offset, key, size);
}
static void
@@ -5246,17 +5296,94 @@ hist_trigger_actions(struct hist_trigger_data *hist_data,
}
}
+/*
+ * The hist_pad structure is used to save information to create
+ * a histogram from the histogram trigger. It's too big to store
+ * on the stack, so when the histogram trigger is initialized
+ * a percpu array of 4 hist_pad structures is allocated.
+ * This will cover every context from normal, softirq, irq and NMI
+ * in the very unlikely event that a trigger happens at each of
+ * these contexts and interrupts a currently active trigger.
+ */
+struct hist_pad {
+ unsigned long entries[HIST_STACKTRACE_DEPTH];
+ u64 var_ref_vals[TRACING_MAP_VARS_MAX];
+ char compound_key[HIST_KEY_SIZE_MAX];
+};
+
+static struct hist_pad __percpu *hist_pads;
+static DEFINE_PER_CPU(int, hist_pad_cnt);
+static refcount_t hist_pad_ref;
+
+/* One hist_pad for every context (normal, softirq, irq, NMI) */
+#define MAX_HIST_CNT 4
+
+static int alloc_hist_pad(void)
+{
+ lockdep_assert_held(&event_mutex);
+
+ if (refcount_read(&hist_pad_ref)) {
+ refcount_inc(&hist_pad_ref);
+ return 0;
+ }
+
+ hist_pads = __alloc_percpu(sizeof(struct hist_pad) * MAX_HIST_CNT,
+ __alignof__(struct hist_pad));
+ if (!hist_pads)
+ return -ENOMEM;
+
+ refcount_set(&hist_pad_ref, 1);
+ return 0;
+}
+
+static void free_hist_pad(void)
+{
+ lockdep_assert_held(&event_mutex);
+
+ if (!refcount_dec_and_test(&hist_pad_ref))
+ return;
+
+ free_percpu(hist_pads);
+ hist_pads = NULL;
+}
+
+static struct hist_pad *get_hist_pad(void)
+{
+ struct hist_pad *hist_pad;
+ int cnt;
+
+ if (WARN_ON_ONCE(!hist_pads))
+ return NULL;
+
+ preempt_disable();
+
+ hist_pad = per_cpu_ptr(hist_pads, smp_processor_id());
+
+ if (this_cpu_read(hist_pad_cnt) == MAX_HIST_CNT) {
+ preempt_enable();
+ return NULL;
+ }
+
+ cnt = this_cpu_inc_return(hist_pad_cnt) - 1;
+
+ return &hist_pad[cnt];
+}
+
+static void put_hist_pad(void)
+{
+ this_cpu_dec(hist_pad_cnt);
+ preempt_enable();
+}
+
static void event_hist_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe)
{
struct hist_trigger_data *hist_data = data->private_data;
bool use_compound_key = (hist_data->n_keys > 1);
- unsigned long entries[HIST_STACKTRACE_DEPTH];
- u64 var_ref_vals[TRACING_MAP_VARS_MAX];
- char compound_key[HIST_KEY_SIZE_MAX];
struct tracing_map_elt *elt = NULL;
struct hist_field *key_field;
+ struct hist_pad *hist_pad;
u64 field_contents;
void *key = NULL;
unsigned int i;
@@ -5264,12 +5391,18 @@ static void event_hist_trigger(struct event_trigger_data *data,
if (unlikely(!rbe))
return;
- memset(compound_key, 0, hist_data->key_size);
+ hist_pad = get_hist_pad();
+ if (!hist_pad)
+ return;
+
+ memset(hist_pad->compound_key, 0, hist_data->key_size);
for_each_hist_key_field(i, hist_data) {
key_field = hist_data->fields[i];
if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
+ unsigned long *entries = hist_pad->entries;
+
memset(entries, 0, HIST_STACKTRACE_SIZE);
if (key_field->field) {
unsigned long *stack, n_entries;
@@ -5293,24 +5426,31 @@ static void event_hist_trigger(struct event_trigger_data *data,
}
if (use_compound_key)
- add_to_key(compound_key, key, key_field, rec);
+ add_to_key(hist_pad->compound_key, key, key_field, rec);
}
if (use_compound_key)
- key = compound_key;
+ key = hist_pad->compound_key;
if (hist_data->n_var_refs &&
- !resolve_var_refs(hist_data, key, var_ref_vals, false))
- return;
+ !resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, false))
+ goto out;
elt = tracing_map_insert(hist_data->map, key);
if (!elt)
- return;
+ goto out;
- hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals);
+ hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, hist_pad->var_ref_vals);
+
+ if (resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, true)) {
+ hist_trigger_actions(hist_data, elt, buffer, rec, rbe,
+ key, hist_pad->var_ref_vals);
+ }
- if (resolve_var_refs(hist_data, key, var_ref_vals, true))
- hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals);
+ hist_poll_wakeup();
+
+ out:
+ put_hist_pad();
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -5538,8 +5678,7 @@ static int print_entries(struct seq_file *m,
(HIST_FIELD_FL_PERCENT | HIST_FIELD_FL_GRAPH)))
continue;
if (!stats) {
- stats = kcalloc(hist_data->n_vals, sizeof(*stats),
- GFP_KERNEL);
+ stats = kzalloc_objs(*stats, hist_data->n_vals);
if (!stats) {
n_entries = -ENOMEM;
goto out;
@@ -5575,7 +5714,7 @@ static void hist_trigger_show(struct seq_file *m,
seq_puts(m, "\n\n");
seq_puts(m, "# event histogram\n#\n# trigger info: ");
- data->ops->print(m, data);
+ data->cmd_ops->print(m, data);
seq_puts(m, "#\n\n");
hist_data = data->private_data;
@@ -5590,52 +5729,144 @@ static void hist_trigger_show(struct seq_file *m,
n_entries, (u64)atomic64_read(&hist_data->map->drops));
}
+struct hist_file_data {
+ struct file *file;
+ u64 last_read;
+ u64 last_act;
+};
+
+static u64 get_hist_hit_count(struct trace_event_file *event_file)
+{
+ struct hist_trigger_data *hist_data;
+ struct event_trigger_data *data;
+ u64 ret = 0;
+
+ list_for_each_entry(data, &event_file->triggers, list) {
+ if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ hist_data = data->private_data;
+ ret += atomic64_read(&hist_data->map->hits);
+ }
+ }
+ return ret;
+}
+
static int hist_show(struct seq_file *m, void *v)
{
+ struct hist_file_data *hist_file = m->private;
struct event_trigger_data *data;
struct trace_event_file *event_file;
- int n = 0, ret = 0;
+ int n = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- event_file = event_file_file(m->private);
- if (unlikely(!event_file)) {
- ret = -ENODEV;
- goto out_unlock;
- }
+ event_file = event_file_file(hist_file->file);
+ if (unlikely(!event_file))
+ return -ENODEV;
list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_show(m, data, n++);
}
+ hist_file->last_read = get_hist_hit_count(event_file);
+ /*
+ * Update last_act too so that poll()/POLLPRI can wait for the next
+ * event after any syscall on hist file.
+ */
+ hist_file->last_act = hist_file->last_read;
+
+ return 0;
+}
- out_unlock:
- mutex_unlock(&event_mutex);
+static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct trace_event_file *event_file;
+ struct seq_file *m = file->private_data;
+ struct hist_file_data *hist_file = m->private;
+ __poll_t ret = 0;
+ u64 cnt;
+
+ guard(mutex)(&event_mutex);
+
+ event_file = event_file_file(file);
+ if (!event_file)
+ return EPOLLERR;
+
+ hist_poll_wait(file, wait);
+
+ cnt = get_hist_hit_count(event_file);
+ if (hist_file->last_read != cnt)
+ ret |= EPOLLIN | EPOLLRDNORM;
+ if (hist_file->last_act != cnt) {
+ hist_file->last_act = cnt;
+ ret |= EPOLLPRI;
+ }
return ret;
}
+static int event_hist_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+ struct hist_file_data *hist_file = m->private;
+
+ kfree(hist_file);
+ return tracing_single_release_file_tr(inode, file);
+}
+
static int event_hist_open(struct inode *inode, struct file *file)
{
+ struct trace_event_file *event_file;
+ struct hist_file_data *hist_file;
int ret;
ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
- /* Clear private_data to avoid warning in single_open() */
- file->private_data = NULL;
- return single_open(file, hist_show, file);
+ guard(mutex)(&event_mutex);
+
+ event_file = event_file_file(file);
+ if (!event_file) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ hist_file = kzalloc_obj(*hist_file);
+ if (!hist_file) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ hist_file->file = file;
+ hist_file->last_act = get_hist_hit_count(event_file);
+
+ ret = single_open(file, hist_show, hist_file);
+ if (ret) {
+ kfree(hist_file);
+ goto err;
+ }
+
+ return 0;
+err:
+ tracing_release_file_tr(inode, file);
+ return ret;
}
const struct file_operations event_hist_fops = {
.open = event_hist_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = tracing_single_release_file_tr,
+ .release = event_hist_release,
+ .poll = event_hist_poll,
};
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+
+#undef C
+#define C(a, b) b
+
+static const char * const field_funcs[] = { FIELD_FUNCS };
+
static void hist_field_debug_show_flags(struct seq_file *m,
unsigned long flags)
{
@@ -5700,6 +5931,7 @@ static int hist_field_debug_show(struct seq_file *m,
seq_printf(m, " type: %s\n", field->type);
seq_printf(m, " size: %u\n", field->size);
seq_printf(m, " is_signed: %u\n", field->is_signed);
+ seq_printf(m, " function: hist_field_%s()\n", field_funcs[field->fn_num]);
return 0;
}
@@ -5809,7 +6041,7 @@ static void hist_trigger_debug_show(struct seq_file *m,
seq_puts(m, "\n\n");
seq_puts(m, "# event histogram\n#\n# trigger info: ");
- data->ops->print(m, data);
+ data->cmd_ops->print(m, data);
seq_puts(m, "#\n\n");
hist_data = data->private_data;
@@ -5873,25 +6105,19 @@ static int hist_debug_show(struct seq_file *m, void *v)
{
struct event_trigger_data *data;
struct trace_event_file *event_file;
- int n = 0, ret = 0;
+ int n = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
event_file = event_file_file(m->private);
- if (unlikely(!event_file)) {
- ret = -ENODEV;
- goto out_unlock;
- }
+ if (unlikely(!event_file))
+ return -ENODEV;
list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_debug_show(m, data, n++);
}
-
- out_unlock:
- mutex_unlock(&event_mutex);
-
- return ret;
+ return 0;
}
static int event_hist_debug_open(struct inode *inode, struct file *file)
@@ -5902,9 +6128,10 @@ static int event_hist_debug_open(struct inode *inode, struct file *file)
if (ret)
return ret;
- /* Clear private_data to avoid warning in single_open() */
- file->private_data = NULL;
- return single_open(file, hist_debug_show, file);
+ ret = single_open(file, hist_debug_show, file);
+ if (ret)
+ tracing_release_file_tr(inode, file);
+ return ret;
}
const struct file_operations event_hist_debug_fops = {
@@ -5924,12 +6151,15 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
if (hist_field->flags & HIST_FIELD_FL_CPU)
seq_puts(m, "common_cpu");
+ if (hist_field->flags & HIST_FIELD_FL_COMM)
+ seq_puts(m, "common_comm");
else if (hist_field->flags & HIST_FIELD_FL_CONST)
seq_printf(m, "%llu", hist_field->constant);
else if (field_name) {
if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
hist_field->flags & HIST_FIELD_FL_ALIAS)
- seq_putc(m, '$');
+ if (!hist_field->system)
+ seq_putc(m, '$');
seq_printf(m, "%s", field_name);
} else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
seq_puts(m, "common_timestamp");
@@ -6070,6 +6300,9 @@ static int event_hist_trigger_init(struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
+ if (alloc_hist_pad() < 0)
+ return -ENOMEM;
+
if (!data->ref && hist_data->attrs->name)
save_named_trigger(hist_data->attrs->name, data);
@@ -6114,24 +6347,24 @@ static void event_hist_trigger_free(struct event_trigger_data *data)
destroy_hist_data(hist_data);
}
+ free_hist_pad();
}
-static struct event_trigger_ops event_hist_trigger_ops = {
- .trigger = event_hist_trigger,
- .print = event_hist_trigger_print,
- .init = event_hist_trigger_init,
- .free = event_hist_trigger_free,
-};
-
static int event_hist_trigger_named_init(struct event_trigger_data *data)
{
+ int ret;
+
data->ref++;
save_named_trigger(data->named_data->name, data);
- event_hist_trigger_init(data->named_data);
+ ret = event_hist_trigger_init(data->named_data);
+ if (ret < 0) {
+ kfree(data->cmd_ops);
+ data->cmd_ops = &trigger_hist_cmd;
+ }
- return 0;
+ return ret;
}
static void event_hist_trigger_named_free(struct event_trigger_data *data)
@@ -6143,24 +6376,14 @@ static void event_hist_trigger_named_free(struct event_trigger_data *data)
data->ref--;
if (!data->ref) {
+ struct event_command *cmd_ops = data->cmd_ops;
+
del_named_trigger(data);
trigger_data_free(data);
+ kfree(cmd_ops);
}
}
-static struct event_trigger_ops event_hist_trigger_named_ops = {
- .trigger = event_hist_trigger,
- .print = event_hist_trigger_print,
- .init = event_hist_trigger_named_init,
- .free = event_hist_trigger_named_free,
-};
-
-static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
- char *param)
-{
- return &event_hist_trigger_ops;
-}
-
static void hist_clear(struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -6308,6 +6531,26 @@ static bool existing_hist_update_only(char *glob,
return updated;
}
+/*
+ * Set or disable using the per CPU trace_buffer_event when possible.
+ */
+static int tracing_set_filter_buffering(struct trace_array *tr, bool set)
+{
+ guard(mutex)(&trace_types_lock);
+
+ if (set && tr->no_filter_buffering_ref++)
+ return 0;
+
+ if (!set) {
+ if (WARN_ON_ONCE(!tr->no_filter_buffering_ref))
+ return -EINVAL;
+
+ --tr->no_filter_buffering_ref;
+ }
+
+ return 0;
+}
+
static int hist_register_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
@@ -6354,13 +6597,24 @@ static int hist_register_trigger(char *glob,
data->paused = true;
if (named_data) {
+ struct event_command *cmd_ops;
+
data->private_data = named_data->private_data;
set_named_trigger_data(data, named_data);
- data->ops = &event_hist_trigger_named_ops;
+ /* Copy the command ops and update some of the functions */
+ cmd_ops = kmalloc_obj(*cmd_ops);
+ if (!cmd_ops) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ *cmd_ops = *data->cmd_ops;
+ cmd_ops->init = event_hist_trigger_named_init;
+ cmd_ops->free = event_hist_trigger_named_free;
+ data->cmd_ops = cmd_ops;
}
- if (data->ops->init) {
- ret = data->ops->init(data);
+ if (data->cmd_ops->init) {
+ ret = data->cmd_ops->init(data);
if (ret < 0)
goto out;
}
@@ -6474,8 +6728,8 @@ static void hist_unregister_trigger(char *glob,
}
}
- if (test && test->ops->free)
- test->ops->free(test);
+ if (test && test->cmd_ops->free)
+ test->cmd_ops->free(test);
if (hist_data->enable_timestamps) {
if (!hist_data->remove || test)
@@ -6527,8 +6781,8 @@ static void hist_unreg_all(struct trace_event_file *file)
update_cond_flag(file);
if (hist_data->enable_timestamps)
tracing_set_filter_buffering(file->tr, false);
- if (test->ops->free)
- test->ops->free(test);
+ if (test->cmd_ops->free)
+ test->cmd_ops->free(test);
}
}
}
@@ -6618,7 +6872,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
return PTR_ERR(hist_data);
}
- trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data);
+ trigger_data = trigger_data_alloc(cmd_ops, cmd, param, hist_data);
if (!trigger_data) {
ret = -ENOMEM;
goto out_free;
@@ -6649,27 +6903,27 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
if (existing_hist_update_only(glob, trigger_data, file))
goto out_free;
- ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
- if (ret < 0)
- goto out_free;
+ if (!get_named_trigger_data(trigger_data)) {
- if (get_named_trigger_data(trigger_data))
- goto enable;
+ ret = create_actions(hist_data);
+ if (ret)
+ goto out_free;
- ret = create_actions(hist_data);
- if (ret)
- goto out_unreg;
+ if (has_hist_vars(hist_data) || hist_data->n_var_refs) {
+ ret = save_hist_vars(hist_data);
+ if (ret)
+ goto out_free;
+ }
- if (has_hist_vars(hist_data) || hist_data->n_var_refs) {
- ret = save_hist_vars(hist_data);
+ ret = tracing_map_init(hist_data->map);
if (ret)
- goto out_unreg;
+ goto out_free;
}
- ret = tracing_map_init(hist_data->map);
- if (ret)
- goto out_unreg;
-enable:
+ ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+ if (ret < 0)
+ goto out_free;
+
ret = hist_trigger_enable(trigger_data, file);
if (ret)
goto out_unreg;
@@ -6686,11 +6940,9 @@ enable:
out_unreg:
event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
out_free:
- event_trigger_reset_filter(cmd_ops, trigger_data);
-
remove_hist_vars(hist_data);
- kfree(trigger_data);
+ trigger_data_free(trigger_data);
destroy_hist_data(hist_data);
goto out;
@@ -6704,8 +6956,11 @@ static struct event_command trigger_hist_cmd = {
.reg = hist_register_trigger,
.unreg = hist_unregister_trigger,
.unreg_all = hist_unreg_all,
- .get_trigger_ops = event_hist_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = event_hist_trigger,
+ .print = event_hist_trigger_print,
+ .init = event_hist_trigger_init,
+ .free = event_hist_trigger_free,
};
__init int register_trigger_hist_cmd(void)
@@ -6737,66 +6992,6 @@ hist_enable_trigger(struct event_trigger_data *data,
}
}
-static void
-hist_enable_count_trigger(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *event)
-{
- if (!data->count)
- return;
-
- if (data->count != -1)
- (data->count)--;
-
- hist_enable_trigger(data, buffer, rec, event);
-}
-
-static struct event_trigger_ops hist_enable_trigger_ops = {
- .trigger = hist_enable_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static struct event_trigger_ops hist_enable_count_trigger_ops = {
- .trigger = hist_enable_count_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static struct event_trigger_ops hist_disable_trigger_ops = {
- .trigger = hist_enable_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static struct event_trigger_ops hist_disable_count_trigger_ops = {
- .trigger = hist_enable_count_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static struct event_trigger_ops *
-hist_enable_get_trigger_ops(char *cmd, char *param)
-{
- struct event_trigger_ops *ops;
- bool enable;
-
- enable = (strcmp(cmd, ENABLE_HIST_STR) == 0);
-
- if (enable)
- ops = param ? &hist_enable_count_trigger_ops :
- &hist_enable_trigger_ops;
- else
- ops = param ? &hist_disable_count_trigger_ops :
- &hist_disable_trigger_ops;
-
- return ops;
-}
-
static void hist_enable_unreg_all(struct trace_event_file *file)
{
struct event_trigger_data *test, *n;
@@ -6806,8 +7001,8 @@ static void hist_enable_unreg_all(struct trace_event_file *file)
list_del_rcu(&test->list);
update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
- if (test->ops->free)
- test->ops->free(test);
+ if (test->cmd_ops->free)
+ test->cmd_ops->free(test);
}
}
}
@@ -6819,8 +7014,12 @@ static struct event_command trigger_hist_enable_cmd = {
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
.unreg_all = hist_enable_unreg_all,
- .get_trigger_ops = hist_enable_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = hist_enable_trigger,
+ .count_func = event_trigger_count,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
};
static struct event_command trigger_hist_disable_cmd = {
@@ -6830,8 +7029,12 @@ static struct event_command trigger_hist_disable_cmd = {
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
.unreg_all = hist_enable_unreg_all,
- .get_trigger_ops = hist_enable_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = hist_enable_trigger,
+ .count_func = event_trigger_count,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
};
static __init void unregister_trigger_hist_enable_disable_cmds(void)
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index c82b401a294d..39ac4eba0702 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -49,16 +49,11 @@ static char *last_cmd;
static int errpos(const char *str)
{
- int ret = 0;
-
- mutex_lock(&lastcmd_mutex);
+ guard(mutex)(&lastcmd_mutex);
if (!str || !last_cmd)
- goto out;
+ return 0;
- ret = err_pos(last_cmd, str);
- out:
- mutex_unlock(&lastcmd_mutex);
- return ret;
+ return err_pos(last_cmd, str);
}
static void last_cmd_set(const char *str)
@@ -74,14 +69,12 @@ static void last_cmd_set(const char *str)
static void synth_err(u8 err_type, u16 err_pos)
{
- mutex_lock(&lastcmd_mutex);
+ guard(mutex)(&lastcmd_mutex);
if (!last_cmd)
- goto out;
+ return;
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
- out:
- mutex_unlock(&lastcmd_mutex);
}
static int create_synth_event(const char *raw_command);
@@ -137,7 +130,9 @@ static int synth_event_define_fields(struct trace_event_call *call)
struct synth_event *event = call->data;
unsigned int i, size, n_u64;
char *name, *type;
+ int filter_type;
bool is_signed;
+ bool is_stack;
int ret = 0;
for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
@@ -145,8 +140,12 @@ static int synth_event_define_fields(struct trace_event_call *call)
is_signed = event->fields[i]->is_signed;
type = event->fields[i]->type;
name = event->fields[i]->name;
+ is_stack = event->fields[i]->is_stack;
+
+ filter_type = is_stack ? FILTER_STACKTRACE : FILTER_OTHER;
+
ret = trace_define_field(call, type, name, offset, size,
- is_signed, FILTER_OTHER);
+ is_signed, filter_type);
if (ret)
break;
@@ -214,7 +213,7 @@ static int synth_field_string_size(char *type)
if (len == 0)
return 0; /* variable-length string */
- strncpy(buf, start, len);
+ memcpy(buf, start, len);
buf[len] = '\0';
err = kstrtouint(buf, 0, &size);
@@ -312,7 +311,7 @@ static const char *synth_field_fmt(char *type)
else if (strcmp(type, "gfp_t") == 0)
fmt = "%x";
else if (synth_field_is_string(type))
- fmt = "%.*s";
+ fmt = "%s";
else if (synth_field_is_stack(type))
fmt = "%s";
@@ -366,7 +365,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
fmt = synth_field_fmt(se->fields[i]->type);
/* parameter types */
- if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
+ if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
trace_seq_printf(s, "%s ", fmt);
snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
@@ -377,13 +376,11 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
union trace_synth_field *data = &entry->fields[n_u64];
trace_seq_printf(s, print_fmt, se->fields[i]->name,
- STR_VAR_LEN_MAX,
(char *)entry + data->as_dynamic.offset,
i == se->n_fields - 1 ? "" : " ");
n_u64++;
} else {
trace_seq_printf(s, print_fmt, se->fields[i]->name,
- STR_VAR_LEN_MAX,
(char *)&entry->fields[n_u64].as_u64,
i == se->n_fields - 1 ? "" : " ");
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
@@ -398,7 +395,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
n_u64++;
} else {
struct trace_print_flags __flags[] = {
- __def_gfpflag_names, {-1, NULL} };
+ __def_gfpflag_names };
char *space = (i == se->n_fields - 1 ? "" : " ");
print_synth_event_num_val(s, print_fmt,
@@ -411,7 +408,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
trace_seq_puts(s, " (");
trace_print_flags_seq(s, "|",
entry->fields[n_u64].as_u64,
- __flags);
+ __flags, ARRAY_SIZE(__flags));
trace_seq_putc(s, ')');
}
n_u64++;
@@ -502,9 +499,9 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
return len;
}
-static notrace void trace_event_raw_event_synth(void *__data,
- u64 *var_ref_vals,
- unsigned int *var_ref_idx)
+static void trace_event_raw_event_synth(void *__data,
+ u64 *var_ref_vals,
+ unsigned int *var_ref_idx)
{
unsigned int i, n_u64, val_idx, len, data_size = 0;
struct trace_event_file *trace_file = __data;
@@ -544,12 +541,12 @@ static notrace void trace_event_raw_event_synth(void *__data,
* is being performed within another event.
*/
buffer = trace_file->tr->array_buffer.buffer;
- ring_buffer_nest_start(buffer);
+ guard(ring_buffer_nest)(buffer);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + fields_size);
if (!entry)
- goto out;
+ return;
for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
val_idx = var_ref_idx[i];
@@ -592,8 +589,6 @@ static notrace void trace_event_raw_event_synth(void *__data,
}
trace_event_buffer_commit(&fbuffer);
-out:
- ring_buffer_nest_end(buffer);
}
static void free_synth_event_print_fmt(struct trace_event_call *call)
@@ -619,7 +614,7 @@ static int __set_synth_event_print_fmt(struct synth_event *event,
fmt = synth_field_fmt(event->fields[i]->type);
pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
event->fields[i]->name, fmt,
- i == event->n_fields - 1 ? "" : ", ");
+ i == event->n_fields - 1 ? "" : " ");
}
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
@@ -716,7 +711,7 @@ static struct synth_field *parse_synth_field(int argc, char **argv,
*field_version = check_field_version(prefix, field_type, field_name);
- field = kzalloc(sizeof(*field), GFP_KERNEL);
+ field = kzalloc_obj(*field);
if (!field)
return ERR_PTR(-ENOMEM);
@@ -824,7 +819,7 @@ static struct tracepoint *alloc_synth_tracepoint(char *name)
{
struct tracepoint *tp;
- tp = kzalloc(sizeof(*tp), GFP_KERNEL);
+ tp = kzalloc_obj(*tp);
if (!tp)
return ERR_PTR(-ENOMEM);
@@ -859,6 +854,38 @@ static struct trace_event_fields synth_event_fields_array[] = {
{}
};
+static int synth_event_reg(struct trace_event_call *call,
+ enum trace_reg type, void *data)
+{
+ struct synth_event *event = container_of(call, struct synth_event, call);
+
+ switch (type) {
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+#endif
+ case TRACE_REG_REGISTER:
+ if (!try_module_get(event->mod))
+ return -EBUSY;
+ break;
+ default:
+ break;
+ }
+
+ int ret = trace_event_reg(call, type, data);
+
+ switch (type) {
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_UNREGISTER:
+#endif
+ case TRACE_REG_UNREGISTER:
+ module_put(event->mod);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
static int register_synth_event(struct synth_event *event)
{
struct trace_event_call *call = &event->call;
@@ -888,7 +915,7 @@ static int register_synth_event(struct synth_event *event)
goto out;
}
call->flags = TRACE_EVENT_FL_TRACEPOINT;
- call->class->reg = trace_event_reg;
+ call->class->reg = synth_event_reg;
call->class->probe = trace_event_raw_event_synth;
call->data = event;
call->tp = event->tp;
@@ -946,7 +973,7 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields,
unsigned int i, j, n_dynamic_fields = 0;
struct synth_event *event;
- event = kzalloc(sizeof(*event), GFP_KERNEL);
+ event = kzalloc_obj(*event);
if (!event) {
event = ERR_PTR(-ENOMEM);
goto out;
@@ -959,7 +986,7 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields,
goto out;
}
- event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
+ event->fields = kzalloc_objs(*event->fields, n_fields);
if (!event->fields) {
free_synth_event(event);
event = ERR_PTR(-ENOMEM);
@@ -971,9 +998,8 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields,
n_dynamic_fields++;
if (n_dynamic_fields) {
- event->dynamic_fields = kcalloc(n_dynamic_fields,
- sizeof(*event->dynamic_fields),
- GFP_KERNEL);
+ event->dynamic_fields = kzalloc_objs(*event->dynamic_fields,
+ n_dynamic_fields);
if (!event->dynamic_fields) {
free_synth_event(event);
event = ERR_PTR(-ENOMEM);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index a5e3d6acf1e1..655db2e82513 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -6,6 +6,7 @@
*/
#include <linux/security.h>
+#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
@@ -17,15 +18,133 @@
static LIST_HEAD(trigger_commands);
static DEFINE_MUTEX(trigger_cmd_mutex);
+static struct task_struct *trigger_kthread;
+static struct llist_head trigger_data_free_list;
+static DEFINE_MUTEX(trigger_data_kthread_mutex);
+
+static int trigger_kthread_fn(void *ignore);
+
+static void trigger_create_kthread_locked(void)
+{
+ lockdep_assert_held(&trigger_data_kthread_mutex);
+
+ if (!trigger_kthread) {
+ struct task_struct *kthread;
+
+ kthread = kthread_create(trigger_kthread_fn, NULL,
+ "trigger_data_free");
+ if (!IS_ERR(kthread))
+ WRITE_ONCE(trigger_kthread, kthread);
+ }
+}
+
+static void trigger_data_free_queued_locked(void)
+{
+ struct event_trigger_data *data, *tmp;
+ struct llist_node *llnodes;
+
+ lockdep_assert_held(&trigger_data_kthread_mutex);
+
+ llnodes = llist_del_all(&trigger_data_free_list);
+ if (!llnodes)
+ return;
+
+ tracepoint_synchronize_unregister();
+
+ llist_for_each_entry_safe(data, tmp, llnodes, llist)
+ kfree(data);
+}
+
+/* Bulk garbage collection of event_trigger_data elements */
+static int trigger_kthread_fn(void *ignore)
+{
+ struct event_trigger_data *data, *tmp;
+ struct llist_node *llnodes;
+
+ /* Once this task starts, it lives forever */
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (llist_empty(&trigger_data_free_list))
+ schedule();
+
+ __set_current_state(TASK_RUNNING);
+
+ llnodes = llist_del_all(&trigger_data_free_list);
+
+ /* make sure current triggers exit before free */
+ tracepoint_synchronize_unregister();
+
+ llist_for_each_entry_safe(data, tmp, llnodes, llist)
+ kfree(data);
+ }
+
+ return 0;
+}
+
void trigger_data_free(struct event_trigger_data *data)
{
+ if (!data)
+ return;
+
if (data->cmd_ops->set_filter)
data->cmd_ops->set_filter(NULL, data, NULL);
- /* make sure current triggers exit before free */
- tracepoint_synchronize_unregister();
+ /*
+ * Boot-time trigger registration can fail before kthread creation
+ * works. Keep the deferred-free semantics during boot and let late
+ * init start the kthread to drain the list.
+ */
+ if (system_state == SYSTEM_BOOTING && !trigger_kthread) {
+ llist_add(&data->llist, &trigger_data_free_list);
+ return;
+ }
+
+ if (unlikely(!trigger_kthread)) {
+ guard(mutex)(&trigger_data_kthread_mutex);
- kfree(data);
+ trigger_create_kthread_locked();
+ /* Check again after taking mutex */
+ if (!trigger_kthread) {
+ llist_add(&data->llist, &trigger_data_free_list);
+ /* Drain the queued frees synchronously if creation failed. */
+ trigger_data_free_queued_locked();
+ return;
+ }
+ }
+
+ llist_add(&data->llist, &trigger_data_free_list);
+ wake_up_process(trigger_kthread);
+}
+
+static int __init trigger_data_free_init(void)
+{
+ guard(mutex)(&trigger_data_kthread_mutex);
+
+ if (llist_empty(&trigger_data_free_list))
+ return 0;
+
+ trigger_create_kthread_locked();
+ if (trigger_kthread)
+ wake_up_process(trigger_kthread);
+ else
+ trigger_data_free_queued_locked();
+
+ return 0;
+}
+late_initcall(trigger_data_free_init);
+
+static inline void data_ops_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *event)
+{
+ const struct event_command *cmd_ops = data->cmd_ops;
+
+ if (data->flags & EVENT_TRIGGER_FL_COUNT) {
+ if (!cmd_ops->count_func(data, buffer, rec, event))
+ return;
+ }
+
+ cmd_ops->trigger(data, buffer, rec, event);
}
/**
@@ -70,7 +189,7 @@ event_triggers_call(struct trace_event_file *file,
if (data->paused)
continue;
if (!rec) {
- data->ops->trigger(data, buffer, rec, event);
+ data_ops_trigger(data, buffer, rec, event);
continue;
}
filter = rcu_dereference_sched(data->filter);
@@ -80,7 +199,7 @@ event_triggers_call(struct trace_event_file *file,
tt |= data->cmd_ops->trigger_type;
continue;
}
- data->ops->trigger(data, buffer, rec, event);
+ data_ops_trigger(data, buffer, rec, event);
}
return tt;
}
@@ -122,7 +241,7 @@ event_triggers_post_call(struct trace_event_file *file,
if (data->paused)
continue;
if (data->cmd_ops->trigger_type & tt)
- data->ops->trigger(data, NULL, NULL, NULL);
+ data_ops_trigger(data, NULL, NULL, NULL);
}
}
EXPORT_SYMBOL_GPL(event_triggers_post_call);
@@ -191,7 +310,7 @@ static int trigger_show(struct seq_file *m, void *v)
}
data = list_entry(v, struct event_trigger_data, list);
- data->ops->print(m, data);
+ data->cmd_ops->print(m, data);
return 0;
}
@@ -211,12 +330,10 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
if (ret)
return ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- if (unlikely(!event_file_file(file))) {
- mutex_unlock(&event_mutex);
+ if (unlikely(!event_file_file(file)))
return -ENODEV;
- }
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC)) {
@@ -239,8 +356,6 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
}
}
- mutex_unlock(&event_mutex);
-
return ret;
}
@@ -248,9 +363,9 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
{
char *command, *next;
struct event_command *p;
- int ret = -EINVAL;
- next = buff = skip_spaces(buff);
+ next = buff = strim(buff);
+
command = strsep(&next, ": \t");
if (next) {
next = skip_spaces(next);
@@ -259,17 +374,14 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
}
command = (command[0] != '!') ? command : command + 1;
- mutex_lock(&trigger_cmd_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
list_for_each_entry(p, &trigger_commands, list) {
- if (strcmp(p->name, command) == 0) {
- ret = p->parse(p, file, buff, command, next);
- goto out_unlock;
- }
+ if (strcmp(p->name, command) == 0)
+ return p->parse(p, file, buff, command, next);
}
- out_unlock:
- mutex_unlock(&trigger_cmd_mutex);
- return ret;
+ return -EINVAL;
}
static ssize_t event_trigger_regex_write(struct file *file,
@@ -278,7 +390,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
{
struct trace_event_file *event_file;
ssize_t ret;
- char *buf;
+ char *buf __free(kfree) = NULL;
if (!cnt)
return 0;
@@ -290,37 +402,25 @@ static ssize_t event_trigger_regex_write(struct file *file,
if (IS_ERR(buf))
return PTR_ERR(buf);
- strim(buf);
+ guard(mutex)(&event_mutex);
- mutex_lock(&event_mutex);
event_file = event_file_file(file);
- if (unlikely(!event_file)) {
- mutex_unlock(&event_mutex);
- kfree(buf);
+ if (unlikely(!event_file))
return -ENODEV;
- }
- ret = trigger_process_regex(event_file, buf);
- mutex_unlock(&event_mutex);
- kfree(buf);
+ ret = trigger_process_regex(event_file, buf);
if (ret < 0)
- goto out;
+ return ret;
*ppos += cnt;
- ret = cnt;
- out:
- return ret;
+ return cnt;
}
static int event_trigger_regex_release(struct inode *inode, struct file *file)
{
- mutex_lock(&event_mutex);
-
if (file->f_mode & FMODE_READ)
seq_release(inode, file);
- mutex_unlock(&event_mutex);
-
return 0;
}
@@ -359,20 +459,16 @@ const struct file_operations event_trigger_fops = {
__init int register_event_command(struct event_command *cmd)
{
struct event_command *p;
- int ret = 0;
- mutex_lock(&trigger_cmd_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
list_for_each_entry(p, &trigger_commands, list) {
- if (strcmp(cmd->name, p->name) == 0) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ if (strcmp(cmd->name, p->name) == 0)
+ return -EBUSY;
}
list_add(&cmd->list, &trigger_commands);
- out_unlock:
- mutex_unlock(&trigger_cmd_mutex);
- return ret;
+ return 0;
}
/*
@@ -382,24 +478,51 @@ __init int register_event_command(struct event_command *cmd)
__init int unregister_event_command(struct event_command *cmd)
{
struct event_command *p, *n;
- int ret = -ENODEV;
- mutex_lock(&trigger_cmd_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
list_for_each_entry_safe(p, n, &trigger_commands, list) {
if (strcmp(cmd->name, p->name) == 0) {
- ret = 0;
list_del_init(&p->list);
- goto out_unlock;
+ return 0;
}
}
- out_unlock:
- mutex_unlock(&trigger_cmd_mutex);
- return ret;
+ return -ENODEV;
+}
+
+/**
+ * event_trigger_count - Optional count function for event triggers
+ * @data: Trigger-specific data
+ * @buffer: The ring buffer that the event is being written to
+ * @rec: The trace entry for the event, NULL for unconditional invocation
+ * @event: The event meta data in the ring buffer
+ *
+ * For triggers that can take a count parameter that doesn't do anything
+ * special, they can use this function to assign to their .count_func
+ * field.
+ *
+ * This simply does a count down of the @data->count field.
+ *
+ * If the @data->count is greater than zero, it will decrement it.
+ *
+ * Returns false if @data->count is zero, otherwise true.
+ */
+bool event_trigger_count(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *event)
+{
+ if (!data->count)
+ return false;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ return true;
}
/**
- * event_trigger_print - Generic event_trigger_ops @print implementation
+ * event_trigger_print - Generic event_command @print implementation
* @name: The name of the event trigger
* @m: The seq_file being printed to
* @data: Trigger-specific data
@@ -434,7 +557,7 @@ event_trigger_print(const char *name, struct seq_file *m,
}
/**
- * event_trigger_init - Generic event_trigger_ops @init implementation
+ * event_trigger_init - Generic event_command @init implementation
* @data: Trigger-specific data
*
* Common implementation of event trigger initialization.
@@ -451,7 +574,7 @@ int event_trigger_init(struct event_trigger_data *data)
}
/**
- * event_trigger_free - Generic event_trigger_ops @free implementation
+ * event_trigger_free - Generic event_command @free implementation
* @data: Trigger-specific data
*
* Common implementation of event trigger de-initialization.
@@ -513,8 +636,8 @@ clear_event_triggers(struct trace_array *tr)
list_for_each_entry_safe(data, n, &file->triggers, list) {
trace_event_trigger_enable_disable(file, 0);
list_del_rcu(&data->list);
- if (data->ops->free)
- data->ops->free(data);
+ if (data->cmd_ops->free)
+ data->cmd_ops->free(data);
}
}
}
@@ -573,16 +696,14 @@ static int register_trigger(char *glob,
lockdep_assert_held(&event_mutex);
list_for_each_entry(test, &file->triggers, list) {
- if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) {
- ret = -EEXIST;
- goto out;
- }
+ if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type)
+ return -EEXIST;
}
- if (data->ops->init) {
- ret = data->ops->init(data);
+ if (data->cmd_ops->init) {
+ ret = data->cmd_ops->init(data);
if (ret < 0)
- goto out;
+ return ret;
}
list_add_rcu(&data->list, &file->triggers);
@@ -593,7 +714,6 @@ static int register_trigger(char *glob,
list_del_rcu(&data->list);
update_cond_flag(file);
}
-out:
return ret;
}
@@ -619,8 +739,8 @@ static bool try_unregister_trigger(char *glob,
}
if (data) {
- if (data->ops->free)
- data->ops->free(data);
+ if (data->cmd_ops->free)
+ data->cmd_ops->free(data);
return true;
}
@@ -668,7 +788,7 @@ static void unregister_trigger(char *glob,
* param - text following cmd and ':' and stripped of filter
* filter - the optional filter text following (and including) 'if'
*
- * To illustrate the use of these componenents, here are some concrete
+ * To illustrate the use of these components, here are some concrete
* examples. For the following triggers:
*
* echo 'traceon:5 if pid == 0' > trigger
@@ -791,7 +911,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param,
if (!param_and_filter) {
if (param_required)
ret = -EINVAL;
- goto out;
+ return ret;
}
/*
@@ -802,7 +922,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param,
*/
if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) {
*filter = param_and_filter;
- goto out;
+ return ret;
}
/*
@@ -820,44 +940,45 @@ int event_trigger_separate_filter(char *param_and_filter, char **param,
if (!**filter)
*filter = NULL;
}
-out:
return ret;
}
/**
- * event_trigger_alloc - allocate and init event_trigger_data for a trigger
+ * trigger_data_alloc - allocate and init event_trigger_data for a trigger
* @cmd_ops: The event_command operations for the trigger
* @cmd: The cmd string
* @param: The param string
* @private_data: User data to associate with the event trigger
*
* Allocate an event_trigger_data instance and initialize it. The
- * @cmd_ops are used along with the @cmd and @param to get the
- * trigger_ops to assign to the event_trigger_data. @private_data can
- * also be passed in and associated with the event_trigger_data.
+ * @cmd_ops defines how the trigger will operate. If @param is set,
+ * and @cmd_ops->trigger_ops->count_func is non NULL, then the
+ * data->count is set to @param and before the trigger is executed, the
+ * @cmd_ops->trigger_ops->count_func() is called. If that function returns
+ * false, the @cmd_ops->trigger_ops->trigger() function will not be called.
+ * @private_data can also be passed in and associated with the
+ * event_trigger_data.
*
- * Use event_trigger_free() to free an event_trigger_data object.
+ * Use trigger_data_free() to free an event_trigger_data object.
*
* Return: The trigger_data object success, NULL otherwise
*/
-struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops,
- char *cmd,
- char *param,
- void *private_data)
+struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops,
+ char *cmd,
+ char *param,
+ void *private_data)
{
struct event_trigger_data *trigger_data;
- struct event_trigger_ops *trigger_ops;
-
- trigger_ops = cmd_ops->get_trigger_ops(cmd, param);
- trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+ trigger_data = kzalloc_obj(*trigger_data);
if (!trigger_data)
return NULL;
trigger_data->count = -1;
- trigger_data->ops = trigger_ops;
trigger_data->cmd_ops = cmd_ops;
trigger_data->private_data = private_data;
+ if (param && cmd_ops->count_func)
+ trigger_data->flags |= EVENT_TRIGGER_FL_COUNT;
INIT_LIST_HEAD(&trigger_data->list);
INIT_LIST_HEAD(&trigger_data->named_list);
@@ -1010,15 +1131,14 @@ event_trigger_parse(struct event_command *cmd_ops,
return ret;
ret = -ENOMEM;
- trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file);
+ trigger_data = trigger_data_alloc(cmd_ops, cmd, param, file);
if (!trigger_data)
- goto out;
+ return ret;
if (remove) {
event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
- kfree(trigger_data);
- ret = 0;
- goto out;
+ trigger_data_free(trigger_data);
+ return 0;
}
ret = event_trigger_parse_num(param, trigger_data);
@@ -1038,13 +1158,12 @@ event_trigger_parse(struct event_command *cmd_ops,
/* Down the counter of trigger_data or free it if not used anymore */
event_trigger_free(trigger_data);
- out:
return ret;
out_free:
event_trigger_reset_filter(cmd_ops, trigger_data);
- kfree(trigger_data);
- goto out;
+ trigger_data_free(trigger_data);
+ return ret;
}
/**
@@ -1078,10 +1197,10 @@ int set_trigger_filter(char *filter_str,
s = strsep(&filter_str, " \t");
if (!strlen(s) || strcmp(s, "if") != 0)
- goto out;
+ return ret;
if (!filter_str)
- goto out;
+ return ret;
/* The filter is for the 'trigger' event, not the triggered event */
ret = create_event_filter(file->tr, file->event_call,
@@ -1125,7 +1244,6 @@ int set_trigger_filter(char *filter_str,
ret = -ENOMEM;
}
}
- out:
return ret;
}
@@ -1285,45 +1403,35 @@ traceon_trigger(struct event_trigger_data *data,
{
struct trace_event_file *file = data->private_data;
- if (file) {
- if (tracer_tracing_is_on(file->tr))
- return;
-
- tracer_tracing_on(file->tr);
+ if (WARN_ON_ONCE(!file))
return;
- }
- if (tracing_is_on())
+ if (tracer_tracing_is_on(file->tr))
return;
- tracing_on();
+ tracer_tracing_on(file->tr);
}
-static void
-traceon_count_trigger(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *event)
+static bool
+traceon_count_func(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *event)
{
struct trace_event_file *file = data->private_data;
- if (file) {
- if (tracer_tracing_is_on(file->tr))
- return;
- } else {
- if (tracing_is_on())
- return;
- }
+ if (WARN_ON_ONCE(!file))
+ return false;
+
+ if (tracer_tracing_is_on(file->tr))
+ return false;
if (!data->count)
- return;
+ return false;
if (data->count != -1)
(data->count)--;
- if (file)
- tracer_tracing_on(file->tr);
- else
- tracing_on();
+ return true;
}
static void
@@ -1333,45 +1441,35 @@ traceoff_trigger(struct event_trigger_data *data,
{
struct trace_event_file *file = data->private_data;
- if (file) {
- if (!tracer_tracing_is_on(file->tr))
- return;
-
- tracer_tracing_off(file->tr);
+ if (WARN_ON_ONCE(!file))
return;
- }
- if (!tracing_is_on())
+ if (!tracer_tracing_is_on(file->tr))
return;
- tracing_off();
+ tracer_tracing_off(file->tr);
}
-static void
-traceoff_count_trigger(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *event)
+static bool
+traceoff_count_func(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *event)
{
struct trace_event_file *file = data->private_data;
- if (file) {
- if (!tracer_tracing_is_on(file->tr))
- return;
- } else {
- if (!tracing_is_on())
- return;
- }
+ if (WARN_ON_ONCE(!file))
+ return false;
+
+ if (!tracer_tracing_is_on(file->tr))
+ return false;
if (!data->count)
- return;
+ return false;
if (data->count != -1)
(data->count)--;
- if (file)
- tracer_tracing_off(file->tr);
- else
- tracing_off();
+ return true;
}
static int
@@ -1388,58 +1486,18 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data)
data->filter_str);
}
-static struct event_trigger_ops traceon_trigger_ops = {
- .trigger = traceon_trigger,
- .print = traceon_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static struct event_trigger_ops traceon_count_trigger_ops = {
- .trigger = traceon_count_trigger,
- .print = traceon_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static struct event_trigger_ops traceoff_trigger_ops = {
- .trigger = traceoff_trigger,
- .print = traceoff_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static struct event_trigger_ops traceoff_count_trigger_ops = {
- .trigger = traceoff_count_trigger,
- .print = traceoff_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static struct event_trigger_ops *
-onoff_get_trigger_ops(char *cmd, char *param)
-{
- struct event_trigger_ops *ops;
-
- /* we register both traceon and traceoff to this callback */
- if (strcmp(cmd, "traceon") == 0)
- ops = param ? &traceon_count_trigger_ops :
- &traceon_trigger_ops;
- else
- ops = param ? &traceoff_count_trigger_ops :
- &traceoff_trigger_ops;
-
- return ops;
-}
-
static struct event_command trigger_traceon_cmd = {
.name = "traceon",
.trigger_type = ETT_TRACE_ONOFF,
.parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
- .get_trigger_ops = onoff_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = traceon_trigger,
+ .count_func = traceon_count_func,
+ .print = traceon_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
};
static struct event_command trigger_traceoff_cmd = {
@@ -1449,8 +1507,12 @@ static struct event_command trigger_traceoff_cmd = {
.parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
- .get_trigger_ops = onoff_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = traceoff_trigger,
+ .count_func = traceoff_count_func,
+ .print = traceoff_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
};
#ifdef CONFIG_TRACER_SNAPSHOT
@@ -1461,24 +1523,10 @@ snapshot_trigger(struct event_trigger_data *data,
{
struct trace_event_file *file = data->private_data;
- if (file)
- tracing_snapshot_instance(file->tr);
- else
- tracing_snapshot();
-}
-
-static void
-snapshot_count_trigger(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *event)
-{
- if (!data->count)
+ if (WARN_ON_ONCE(!file))
return;
- if (data->count != -1)
- (data->count)--;
-
- snapshot_trigger(data, buffer, rec, event);
+ tracing_snapshot_instance(file->tr);
}
static int
@@ -1512,34 +1560,18 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data)
data->filter_str);
}
-static struct event_trigger_ops snapshot_trigger_ops = {
- .trigger = snapshot_trigger,
- .print = snapshot_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static struct event_trigger_ops snapshot_count_trigger_ops = {
- .trigger = snapshot_count_trigger,
- .print = snapshot_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static struct event_trigger_ops *
-snapshot_get_trigger_ops(char *cmd, char *param)
-{
- return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops;
-}
-
static struct event_command trigger_snapshot_cmd = {
.name = "snapshot",
.trigger_type = ETT_SNAPSHOT,
.parse = event_trigger_parse,
.reg = register_snapshot_trigger,
.unreg = unregister_snapshot_trigger,
- .get_trigger_ops = snapshot_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = snapshot_trigger,
+ .count_func = event_trigger_count,
+ .print = snapshot_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
};
static __init int register_trigger_snapshot_cmd(void)
@@ -1580,24 +1612,10 @@ stacktrace_trigger(struct event_trigger_data *data,
{
struct trace_event_file *file = data->private_data;
- if (file)
- __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP);
- else
- trace_dump_stack(STACK_SKIP);
-}
-
-static void
-stacktrace_count_trigger(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *event)
-{
- if (!data->count)
+ if (WARN_ON_ONCE(!file))
return;
- if (data->count != -1)
- (data->count)--;
-
- stacktrace_trigger(data, buffer, rec, event);
+ __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP);
}
static int
@@ -1607,26 +1625,6 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
data->filter_str);
}
-static struct event_trigger_ops stacktrace_trigger_ops = {
- .trigger = stacktrace_trigger,
- .print = stacktrace_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static struct event_trigger_ops stacktrace_count_trigger_ops = {
- .trigger = stacktrace_count_trigger,
- .print = stacktrace_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static struct event_trigger_ops *
-stacktrace_get_trigger_ops(char *cmd, char *param)
-{
- return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops;
-}
-
static struct event_command trigger_stacktrace_cmd = {
.name = "stacktrace",
.trigger_type = ETT_STACKTRACE,
@@ -1634,8 +1632,12 @@ static struct event_command trigger_stacktrace_cmd = {
.parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
- .get_trigger_ops = stacktrace_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = stacktrace_trigger,
+ .count_func = event_trigger_count,
+ .print = stacktrace_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
};
static __init int register_trigger_stacktrace_cmd(void)
@@ -1670,24 +1672,24 @@ event_enable_trigger(struct event_trigger_data *data,
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
}
-static void
-event_enable_count_trigger(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *event)
+static bool
+event_enable_count_func(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
if (!data->count)
- return;
+ return false;
/* Skip if the event is in a state we want to switch to */
if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
- return;
+ return false;
if (data->count != -1)
(data->count)--;
- event_enable_trigger(data, buffer, rec, event);
+ return true;
}
int event_enable_trigger_print(struct seq_file *m,
@@ -1732,34 +1734,6 @@ void event_enable_trigger_free(struct event_trigger_data *data)
}
}
-static struct event_trigger_ops event_enable_trigger_ops = {
- .trigger = event_enable_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static struct event_trigger_ops event_enable_count_trigger_ops = {
- .trigger = event_enable_count_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static struct event_trigger_ops event_disable_trigger_ops = {
- .trigger = event_enable_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static struct event_trigger_ops event_disable_count_trigger_ops = {
- .trigger = event_enable_count_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
int event_enable_trigger_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
char *glob, char *cmd, char *param_and_filter)
@@ -1793,7 +1767,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
ret = -EINVAL;
event_enable_file = find_event_file(tr, system, event);
if (!event_enable_file)
- goto out;
+ return ret;
#ifdef CONFIG_HIST_TRIGGERS
hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) ||
@@ -1806,18 +1780,18 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
#endif
ret = -ENOMEM;
- enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL);
+ enable_data = kzalloc_obj(*enable_data);
if (!enable_data)
- goto out;
+ return ret;
enable_data->hist = hist;
enable_data->enable = enable;
enable_data->file = event_enable_file;
- trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data);
+ trigger_data = trigger_data_alloc(cmd_ops, cmd, param, enable_data);
if (!trigger_data) {
kfree(enable_data);
- goto out;
+ return ret;
}
if (remove) {
@@ -1825,7 +1799,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
kfree(trigger_data);
kfree(enable_data);
ret = 0;
- goto out;
+ return ret;
}
/* Up the trigger_data count to make sure nothing frees it on failure */
@@ -1855,7 +1829,6 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
goto out_disable;
event_trigger_free(trigger_data);
- out:
return ret;
out_disable:
trace_event_enable_disable(event_enable_file, 0, 1);
@@ -1866,7 +1839,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
event_trigger_free(trigger_data);
kfree(enable_data);
- goto out;
+ return ret;
}
int event_enable_register_trigger(char *glob,
@@ -1886,15 +1859,14 @@ int event_enable_register_trigger(char *glob,
(test->cmd_ops->trigger_type ==
data->cmd_ops->trigger_type) &&
(test_enable_data->file == enable_data->file)) {
- ret = -EEXIST;
- goto out;
+ return -EEXIST;
}
}
- if (data->ops->init) {
- ret = data->ops->init(data);
+ if (data->cmd_ops->init) {
+ ret = data->cmd_ops->init(data);
if (ret < 0)
- goto out;
+ return ret;
}
list_add_rcu(&data->list, &file->triggers);
@@ -1905,7 +1877,6 @@ int event_enable_register_trigger(char *glob,
list_del_rcu(&data->list);
update_cond_flag(file);
}
-out:
return ret;
}
@@ -1933,30 +1904,8 @@ void event_enable_unregister_trigger(char *glob,
}
}
- if (data && data->ops->free)
- data->ops->free(data);
-}
-
-static struct event_trigger_ops *
-event_enable_get_trigger_ops(char *cmd, char *param)
-{
- struct event_trigger_ops *ops;
- bool enable;
-
-#ifdef CONFIG_HIST_TRIGGERS
- enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) ||
- (strcmp(cmd, ENABLE_HIST_STR) == 0));
-#else
- enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
-#endif
- if (enable)
- ops = param ? &event_enable_count_trigger_ops :
- &event_enable_trigger_ops;
- else
- ops = param ? &event_disable_count_trigger_ops :
- &event_disable_trigger_ops;
-
- return ops;
+ if (data && data->cmd_ops->free)
+ data->cmd_ops->free(data);
}
static struct event_command trigger_enable_cmd = {
@@ -1965,8 +1914,12 @@ static struct event_command trigger_enable_cmd = {
.parse = event_enable_trigger_parse,
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
- .get_trigger_ops = event_enable_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = event_enable_trigger,
+ .count_func = event_enable_count_func,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
};
static struct event_command trigger_disable_cmd = {
@@ -1975,8 +1928,12 @@ static struct event_command trigger_disable_cmd = {
.parse = event_enable_trigger_parse,
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
- .get_trigger_ops = event_enable_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = event_enable_trigger,
+ .count_func = event_enable_count_func,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
};
static __init void unregister_trigger_enable_disable_cmds(void)
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 17bcad8f79de..c4ba484f7b38 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -370,7 +370,7 @@ static struct user_event_group *user_event_group_create(void)
{
struct user_event_group *group;
- group = kzalloc(sizeof(*group), GFP_KERNEL);
+ group = kzalloc_obj(*group);
if (!group)
return NULL;
@@ -455,7 +455,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work)
if (ret && ret != -ENOENT) {
struct user_event *user = enabler->event;
- pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n",
+ pr_warn("user_events: Fault for mm: 0x%p @ 0x%llx event: %s\n",
mm->mm, (unsigned long long)uaddr, EVENT_NAME(user));
}
@@ -496,7 +496,7 @@ static bool user_event_enabler_queue_fault(struct user_event_mm *mm,
{
struct user_event_enabler_fault *fault;
- fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
+ fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT);
if (!fault)
return false;
@@ -637,7 +637,7 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig,
if (unlikely(test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(orig))))
return true;
- enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT | __GFP_ACCOUNT);
+ enabler = kzalloc_obj(*enabler, GFP_NOWAIT | __GFP_ACCOUNT);
if (!enabler)
return false;
@@ -706,7 +706,7 @@ static struct user_event_mm *user_event_mm_alloc(struct task_struct *t)
{
struct user_event_mm *user_mm;
- user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT);
+ user_mm = kzalloc_obj(*user_mm, GFP_KERNEL_ACCOUNT);
if (!user_mm)
return NULL;
@@ -835,7 +835,7 @@ void user_event_mm_remove(struct task_struct *t)
* so we use a work queue after call_rcu() to run within.
*/
INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put);
- queue_rcu_work(system_wq, &mm->put_rwork);
+ queue_rcu_work(system_percpu_wq, &mm->put_rwork);
}
void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
@@ -892,7 +892,7 @@ static struct user_event_enabler
if (!user_mm)
return NULL;
- enabler = kzalloc(sizeof(*enabler), GFP_KERNEL_ACCOUNT);
+ enabler = kzalloc_obj(*enabler, GFP_KERNEL_ACCOUNT);
if (!enabler)
goto out;
@@ -1041,7 +1041,7 @@ static int user_field_array_size(const char *type)
static int user_field_size(const char *type)
{
- /* long is not allowed from a user, since it's ambigious in size */
+ /* long is not allowed from a user, since it's ambiguous in size */
if (strcmp(type, "s64") == 0)
return sizeof(s64);
if (strcmp(type, "u64") == 0)
@@ -1079,7 +1079,7 @@ static int user_field_size(const char *type)
if (str_has_prefix(type, "__rel_loc "))
return sizeof(u32);
- /* Uknown basic type, error */
+ /* Unknown basic type, error */
return -EINVAL;
}
@@ -1113,7 +1113,7 @@ static int user_event_add_field(struct user_event *user, const char *type,
struct ftrace_event_field *field;
int validator_flags = 0;
- field = kmalloc(sizeof(*field), GFP_KERNEL_ACCOUNT);
+ field = kmalloc_obj(*field, GFP_KERNEL_ACCOUNT);
if (!field)
return -ENOMEM;
@@ -1132,7 +1132,7 @@ add_validator:
if (strstr(type, "char") != NULL)
validator_flags |= VALIDATOR_ENSURE_NULL;
- validator = kmalloc(sizeof(*validator), GFP_KERNEL_ACCOUNT);
+ validator = kmalloc_obj(*validator, GFP_KERNEL_ACCOUNT);
if (!validator) {
kfree(field);
@@ -1449,12 +1449,7 @@ static struct trace_event_functions user_event_funcs = {
static int user_event_set_call_visible(struct user_event *user, bool visible)
{
- int ret;
- const struct cred *old_cred;
- struct cred *cred;
-
- cred = prepare_creds();
-
+ CLASS(prepare_creds, cred)();
if (!cred)
return -ENOMEM;
@@ -1469,17 +1464,12 @@ static int user_event_set_call_visible(struct user_event *user, bool visible)
*/
cred->fsuid = GLOBAL_ROOT_UID;
- old_cred = override_creds(cred);
-
- if (visible)
- ret = trace_add_event_call(&user->call);
- else
- ret = trace_remove_event_call(&user->call);
+ scoped_with_creds(cred) {
+ if (visible)
+ return trace_add_event_call(&user->call);
- revert_creds(old_cred);
- put_cred(cred);
-
- return ret;
+ return trace_remove_event_call(&user->call);
+ }
}
static int destroy_user_event(struct user_event *user)
@@ -2115,7 +2105,7 @@ static int user_event_parse(struct user_event_group *group, char *name,
return 0;
}
- user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT);
+ user = kzalloc_obj(*user, GFP_KERNEL_ACCOUNT);
if (!user)
return -ENOMEM;
@@ -2325,7 +2315,7 @@ static int user_events_open(struct inode *node, struct file *file)
if (!group)
return -ENOENT;
- info = kzalloc(sizeof(*info), GFP_KERNEL_ACCOUNT);
+ info = kzalloc_obj(*info, GFP_KERNEL_ACCOUNT);
if (!info)
return -ENOMEM;
@@ -2475,7 +2465,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
/*
* Prevent users from using the same address and bit multiple times
* within the same mm address space. This can cause unexpected behavior
- * for user processes that is far easier to debug if this is explictly
+ * for user processes that is far easier to debug if this is explicitly
* an error upon registering.
*/
if (current_user_event_enabler_exists((unsigned long)reg.enable_addr,
@@ -2793,11 +2783,8 @@ static int user_seq_show(struct seq_file *m, void *p)
seq_printf(m, "%s", EVENT_TP_NAME(user));
- if (status != 0)
- seq_puts(m, " #");
-
if (status != 0) {
- seq_puts(m, " Used by");
+ seq_puts(m, " # Used by");
if (status & EVENT_STATUS_FTRACE)
seq_puts(m, " ftrace");
if (status & EVENT_STATUS_PERF)
@@ -2899,7 +2886,7 @@ static int set_max_user_events_sysctl(const struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table user_event_sysctls[] = {
+static const struct ctl_table user_event_sysctls[] = {
{
.procname = "user_events_max",
.data = &max_user_events,
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 1698fc22afa0..32a42ef31855 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -42,11 +42,14 @@ static int ftrace_event_register(struct trace_event_call *call,
#undef __field_fn
#define __field_fn(type, item) type item;
+#undef __field_packed
+#define __field_packed(type, item) type item;
+
#undef __field_desc
#define __field_desc(type, container, item) type item;
-#undef __field_packed
-#define __field_packed(type, container, item) type item;
+#undef __field_desc_packed
+#define __field_desc_packed(type, container, item) type item;
#undef __array
#define __array(type, item, size) type item[size];
@@ -104,11 +107,14 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef __field_fn
#define __field_fn(_type, _item) __field_ext(_type, _item, FILTER_TRACE_FN)
+#undef __field_packed
+#define __field_packed(_type, _item) __field_ext_packed(_type, _item, FILTER_OTHER)
+
#undef __field_desc
#define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER)
-#undef __field_packed
-#define __field_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER)
+#undef __field_desc_packed
+#define __field_desc_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER)
#undef __array
#define __array(_type, _item, _len) { \
@@ -146,11 +152,14 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \
#undef __field_fn
#define __field_fn(type, item)
+#undef __field_packed
+#define __field_packed(type, item)
+
#undef __field_desc
#define __field_desc(type, container, item)
-#undef __field_packed
-#define __field_packed(type, container, item)
+#undef __field_desc_packed
+#define __field_desc_packed(type, container, item)
#undef __array
#define __array(type, item, len)
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index c62d1629cffe..9f5f08c0e7c2 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -4,15 +4,18 @@
* Copyright (C) 2022 Google LLC.
*/
#define pr_fmt(fmt) "trace_fprobe: " fmt
-#include <asm/ptrace.h>
#include <linux/fprobe.h>
+#include <linux/list.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/security.h>
#include <linux/tracepoint.h>
#include <linux/uaccess.h>
+#include <asm/ptrace.h>
+
#include "trace_dynevent.h"
#include "trace_probe.h"
#include "trace_probe_kernel.h"
@@ -21,7 +24,6 @@
#define FPROBE_EVENT_SYSTEM "fprobes"
#define TRACEPOINT_EVENT_SYSTEM "tracepoints"
#define RETHOOK_MAXACTIVE_MAX 4096
-#define TRACEPOINT_STUB ERR_PTR(-ENOENT)
static int trace_fprobe_create(const char *raw_command);
static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev);
@@ -38,15 +40,157 @@ static struct dyn_event_operations trace_fprobe_ops = {
.match = trace_fprobe_match,
};
+/* List of tracepoint_user */
+static LIST_HEAD(tracepoint_user_list);
+static DEFINE_MUTEX(tracepoint_user_mutex);
+
+/* While living tracepoint_user, @tpoint can be NULL and @refcount != 0. */
+struct tracepoint_user {
+ struct list_head list;
+ const char *name;
+ struct tracepoint *tpoint;
+ unsigned int refcount;
+};
+
+/* NOTE: you must lock tracepoint_user_mutex. */
+#define for_each_tracepoint_user(tuser) \
+ list_for_each_entry(tuser, &tracepoint_user_list, list)
+
+static int tracepoint_user_register(struct tracepoint_user *tuser)
+{
+ struct tracepoint *tpoint = tuser->tpoint;
+
+ if (!tpoint)
+ return 0;
+
+ return tracepoint_probe_register_prio_may_exist(tpoint,
+ tpoint->probestub, NULL, 0);
+}
+
+static void tracepoint_user_unregister(struct tracepoint_user *tuser)
+{
+ if (!tuser->tpoint)
+ return;
+
+ WARN_ON_ONCE(tracepoint_probe_unregister(tuser->tpoint, tuser->tpoint->probestub, NULL));
+ tuser->tpoint = NULL;
+}
+
+static unsigned long tracepoint_user_ip(struct tracepoint_user *tuser)
+{
+ if (!tuser->tpoint)
+ return 0UL;
+
+ return (unsigned long)tuser->tpoint->probestub;
+}
+
+static void __tracepoint_user_free(struct tracepoint_user *tuser)
+{
+ if (!tuser)
+ return;
+ kfree(tuser->name);
+ kfree(tuser);
+}
+
+DEFINE_FREE(tuser_free, struct tracepoint_user *, __tracepoint_user_free(_T))
+
+static struct tracepoint_user *__tracepoint_user_init(const char *name, struct tracepoint *tpoint)
+{
+ struct tracepoint_user *tuser __free(tuser_free) = NULL;
+ int ret;
+
+ tuser = kzalloc_obj(*tuser);
+ if (!tuser)
+ return NULL;
+ tuser->name = kstrdup(name, GFP_KERNEL);
+ if (!tuser->name)
+ return NULL;
+
+ /* Register tracepoint if it is loaded. */
+ if (tpoint) {
+ tuser->tpoint = tpoint;
+ ret = tracepoint_user_register(tuser);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ tuser->refcount = 1;
+ INIT_LIST_HEAD(&tuser->list);
+ list_add(&tuser->list, &tracepoint_user_list);
+
+ return_ptr(tuser);
+}
+
+static struct tracepoint *find_tracepoint(const char *tp_name,
+ struct module **tp_mod);
+
+/*
+ * Get tracepoint_user if exist, or allocate new one and register it.
+ * If tracepoint is on a module, get its refcounter too.
+ * This returns errno or NULL (not loaded yet) or tracepoint_user.
+ */
+static struct tracepoint_user *tracepoint_user_find_get(const char *name, struct module **pmod)
+{
+ struct module *mod __free(module_put) = NULL;
+ struct tracepoint_user *tuser;
+ struct tracepoint *tpoint;
+
+ if (!name || !pmod)
+ return ERR_PTR(-EINVAL);
+
+ /* Get and lock the module which has tracepoint. */
+ tpoint = find_tracepoint(name, &mod);
+
+ guard(mutex)(&tracepoint_user_mutex);
+ /* Search existing tracepoint_user */
+ for_each_tracepoint_user(tuser) {
+ if (!strcmp(tuser->name, name)) {
+ tuser->refcount++;
+ *pmod = no_free_ptr(mod);
+ return tuser;
+ }
+ }
+
+ /* The corresponding tracepoint_user is not found. */
+ tuser = __tracepoint_user_init(name, tpoint);
+ if (!IS_ERR_OR_NULL(tuser))
+ *pmod = no_free_ptr(mod);
+
+ return tuser;
+}
+
+static void tracepoint_user_put(struct tracepoint_user *tuser)
+{
+ scoped_guard(mutex, &tracepoint_user_mutex) {
+ if (--tuser->refcount > 0)
+ return;
+
+ list_del(&tuser->list);
+ tracepoint_user_unregister(tuser);
+ }
+
+ __tracepoint_user_free(tuser);
+}
+
+DEFINE_FREE(tuser_put, struct tracepoint_user *,
+ if (!IS_ERR_OR_NULL(_T))
+ tracepoint_user_put(_T))
+
/*
* Fprobe event core functions
*/
+
+/*
+ * @tprobe is true for tracepoint probe.
+ * @tuser can be NULL if the trace_fprobe is disabled or the tracepoint is not
+ * loaded with a module. If @tuser != NULL, this trace_fprobe is enabled.
+ */
struct trace_fprobe {
struct dyn_event devent;
struct fprobe fp;
const char *symbol;
- struct tracepoint *tpoint;
- struct module *mod;
+ bool tprobe;
+ struct tracepoint_user *tuser;
struct trace_probe tp;
};
@@ -76,7 +220,7 @@ static bool trace_fprobe_is_return(struct trace_fprobe *tf)
static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf)
{
- return tf->tpoint != NULL;
+ return tf->tprobe;
}
static const char *trace_fprobe_symbol(struct trace_fprobe *tf)
@@ -134,7 +278,7 @@ static int
process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
void *dest, void *base)
{
- struct pt_regs *regs = rec;
+ struct ftrace_regs *fregs = rec;
unsigned long val;
int ret;
@@ -142,17 +286,17 @@ retry:
/* 1st stage: get value from context */
switch (code->op) {
case FETCH_OP_STACK:
- val = regs_get_kernel_stack_nth(regs, code->param);
+ val = ftrace_regs_get_kernel_stack_nth(fregs, code->param);
break;
case FETCH_OP_STACKP:
- val = kernel_stack_pointer(regs);
+ val = ftrace_regs_get_stack_pointer(fregs);
break;
case FETCH_OP_RETVAL:
- val = regs_return_value(regs);
+ val = ftrace_regs_get_return_value(fregs);
break;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
case FETCH_OP_ARG:
- val = regs_get_kernel_argument(regs, code->param);
+ val = ftrace_regs_get_argument(fregs, code->param);
break;
case FETCH_OP_EDATA:
val = *(unsigned long *)((unsigned long)edata + code->offset);
@@ -175,7 +319,7 @@ NOKPROBE_SYMBOL(process_fetch_insn)
/* function entry handler */
static nokprobe_inline void
__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- struct pt_regs *regs,
+ struct ftrace_regs *fregs,
struct trace_event_file *trace_file)
{
struct fentry_trace_entry_head *entry;
@@ -189,41 +333,71 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tf->tp, regs, NULL);
+ dsize = __get_data_size(&tf->tp, fregs, NULL);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tf->tp.size + dsize);
if (!entry)
return;
- fbuffer.regs = regs;
+ fbuffer.regs = ftrace_get_regs(fregs);
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->ip = entry_ip;
- store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
static void
fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- struct pt_regs *regs)
+ struct ftrace_regs *fregs)
{
struct event_file_link *link;
trace_probe_for_each_link_rcu(link, &tf->tp)
- __fentry_trace_func(tf, entry_ip, regs, link->file);
+ __fentry_trace_func(tf, entry_ip, fregs, link->file);
}
NOKPROBE_SYMBOL(fentry_trace_func);
+static nokprobe_inline
+void store_fprobe_entry_data(void *edata, struct trace_probe *tp, struct ftrace_regs *fregs)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ unsigned long val = 0;
+ int i;
+
+ if (!earg)
+ return;
+
+ for (i = 0; i < earg->size; i++) {
+ struct fetch_insn *code = &earg->code[i];
+
+ switch (code->op) {
+ case FETCH_OP_ARG:
+ val = ftrace_regs_get_argument(fregs, code->param);
+ break;
+ case FETCH_OP_ST_EDATA:
+ *(unsigned long *)((unsigned long)edata + code->offset) = val;
+ break;
+ case FETCH_OP_END:
+ goto end;
+ default:
+ break;
+ }
+ }
+end:
+ return;
+}
+
/* function exit handler */
static int trace_fprobe_entry_handler(struct fprobe *fp, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
if (tf->tp.entry_arg)
- store_trace_entry_data(entry_data, &tf->tp, regs);
+ store_fprobe_entry_data(entry_data, &tf->tp, fregs);
return 0;
}
@@ -231,7 +405,7 @@ NOKPROBE_SYMBOL(trace_fprobe_entry_handler)
static nokprobe_inline void
__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data, struct trace_event_file *trace_file)
{
struct fexit_trace_entry_head *entry;
@@ -245,60 +419,63 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tf->tp, regs, entry_data);
+ dsize = __get_data_size(&tf->tp, fregs, entry_data);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tf->tp.size + dsize);
if (!entry)
return;
- fbuffer.regs = regs;
+ fbuffer.regs = ftrace_get_regs(fregs);
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = entry_ip;
entry->ret_ip = ret_ip;
- store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
static void
fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs, void *entry_data)
+ unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data)
{
struct event_file_link *link;
trace_probe_for_each_link_rcu(link, &tf->tp)
- __fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data, link->file);
+ __fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data, link->file);
}
NOKPROBE_SYMBOL(fexit_trace_func);
#ifdef CONFIG_PERF_EVENTS
static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
- struct pt_regs *regs)
+ struct ftrace_regs *fregs)
{
struct trace_event_call *call = trace_probe_event_call(&tf->tp);
struct fentry_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
+ struct pt_regs *regs;
int rctx;
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return 0;
- dsize = __get_data_size(&tf->tp, regs, NULL);
+ dsize = __get_data_size(&tf->tp, fregs, NULL);
__size = sizeof(*entry) + tf->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_alloc(size, NULL, &rctx);
+ entry = perf_trace_buf_alloc(size, &regs, &rctx);
if (!entry)
return 0;
+ regs = ftrace_fill_perf_regs(fregs, regs);
+
entry->ip = entry_ip;
memset(&entry[1], 0, dsize);
- store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
return 0;
@@ -307,31 +484,34 @@ NOKPROBE_SYMBOL(fentry_perf_func);
static void
fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_event_call *call = trace_probe_event_call(&tf->tp);
struct fexit_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
+ struct pt_regs *regs;
int rctx;
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return;
- dsize = __get_data_size(&tf->tp, regs, entry_data);
+ dsize = __get_data_size(&tf->tp, fregs, entry_data);
__size = sizeof(*entry) + tf->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_alloc(size, NULL, &rctx);
+ entry = perf_trace_buf_alloc(size, &regs, &rctx);
if (!entry)
return;
+ regs = ftrace_fill_perf_regs(fregs, regs);
+
entry->func = entry_ip;
entry->ret_ip = ret_ip;
- store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
}
@@ -339,33 +519,36 @@ NOKPROBE_SYMBOL(fexit_perf_func);
#endif /* CONFIG_PERF_EVENTS */
static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
+ unsigned int flags = trace_probe_load_flag(&tf->tp);
int ret = 0;
- if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
- fentry_trace_func(tf, entry_ip, regs);
+ if (flags & TP_FLAG_TRACE)
+ fentry_trace_func(tf, entry_ip, fregs);
+
#ifdef CONFIG_PERF_EVENTS
- if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
- ret = fentry_perf_func(tf, entry_ip, regs);
+ if (flags & TP_FLAG_PROFILE)
+ ret = fentry_perf_func(tf, entry_ip, fregs);
#endif
return ret;
}
NOKPROBE_SYMBOL(fentry_dispatcher);
static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
+ unsigned int flags = trace_probe_load_flag(&tf->tp);
- if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
- fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data);
+ if (flags & TP_FLAG_TRACE)
+ fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data);
#ifdef CONFIG_PERF_EVENTS
- if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
- fexit_perf_func(tf, entry_ip, ret_ip, regs, entry_data);
+ if (flags & TP_FLAG_PROFILE)
+ fexit_perf_func(tf, entry_ip, ret_ip, fregs, entry_data);
#endif
}
NOKPROBE_SYMBOL(fexit_dispatcher);
@@ -374,51 +557,49 @@ static void free_trace_fprobe(struct trace_fprobe *tf)
{
if (tf) {
trace_probe_cleanup(&tf->tp);
+ if (tf->tuser)
+ tracepoint_user_put(tf->tuser);
kfree(tf->symbol);
kfree(tf);
}
}
+/* Since alloc_trace_fprobe() can return error, check the pointer is ERR too. */
+DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) free_trace_fprobe(_T))
+
/*
* Allocate new trace_probe and initialize it (including fprobe).
*/
static struct trace_fprobe *alloc_trace_fprobe(const char *group,
const char *event,
const char *symbol,
- struct tracepoint *tpoint,
- struct module *mod,
- int maxactive,
- int nargs, bool is_return)
+ int nargs, bool is_return,
+ bool is_tracepoint)
{
- struct trace_fprobe *tf;
+ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
int ret = -ENOMEM;
- tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL);
+ tf = kzalloc_flex(*tf, tp.args, nargs);
if (!tf)
return ERR_PTR(ret);
tf->symbol = kstrdup(symbol, GFP_KERNEL);
if (!tf->symbol)
- goto error;
+ return ERR_PTR(-ENOMEM);
if (is_return)
tf->fp.exit_handler = fexit_dispatcher;
else
tf->fp.entry_handler = fentry_dispatcher;
- tf->tpoint = tpoint;
- tf->mod = mod;
- tf->fp.nr_maxactive = maxactive;
+ tf->tprobe = is_tracepoint;
ret = trace_probe_init(&tf->tp, event, group, false, nargs);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
dyn_event_init(&tf->devent, &trace_fprobe_ops);
- return tf;
-error:
- free_trace_fprobe(tf);
- return ERR_PTR(ret);
+ return_ptr(tf);
}
static struct trace_fprobe *find_trace_fprobe(const char *event,
@@ -434,98 +615,6 @@ static struct trace_fprobe *find_trace_fprobe(const char *event,
return NULL;
}
-static inline int __enable_trace_fprobe(struct trace_fprobe *tf)
-{
- if (trace_fprobe_is_registered(tf))
- enable_fprobe(&tf->fp);
-
- return 0;
-}
-
-static void __disable_trace_fprobe(struct trace_probe *tp)
-{
- struct trace_fprobe *tf;
-
- list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
- if (!trace_fprobe_is_registered(tf))
- continue;
- disable_fprobe(&tf->fp);
- }
-}
-
-/*
- * Enable trace_probe
- * if the file is NULL, enable "perf" handler, or enable "trace" handler.
- */
-static int enable_trace_fprobe(struct trace_event_call *call,
- struct trace_event_file *file)
-{
- struct trace_probe *tp;
- struct trace_fprobe *tf;
- bool enabled;
- int ret = 0;
-
- tp = trace_probe_primary_from_call(call);
- if (WARN_ON_ONCE(!tp))
- return -ENODEV;
- enabled = trace_probe_is_enabled(tp);
-
- /* This also changes "enabled" state */
- if (file) {
- ret = trace_probe_add_file(tp, file);
- if (ret)
- return ret;
- } else
- trace_probe_set_flag(tp, TP_FLAG_PROFILE);
-
- if (!enabled) {
- list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
- /* TODO: check the fprobe is gone */
- __enable_trace_fprobe(tf);
- }
- }
-
- return 0;
-}
-
-/*
- * Disable trace_probe
- * if the file is NULL, disable "perf" handler, or disable "trace" handler.
- */
-static int disable_trace_fprobe(struct trace_event_call *call,
- struct trace_event_file *file)
-{
- struct trace_probe *tp;
-
- tp = trace_probe_primary_from_call(call);
- if (WARN_ON_ONCE(!tp))
- return -ENODEV;
-
- if (file) {
- if (!trace_probe_get_file_link(tp, file))
- return -ENOENT;
- if (!trace_probe_has_single_file(tp))
- goto out;
- trace_probe_clear_flag(tp, TP_FLAG_TRACE);
- } else
- trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
-
- if (!trace_probe_is_enabled(tp))
- __disable_trace_fprobe(tp);
-
- out:
- if (file)
- /*
- * Synchronization is done in below function. For perf event,
- * file == NULL and perf_trace_event_unreg() calls
- * tracepoint_synchronize_unregister() to ensure synchronize
- * event. We don't need to care about it.
- */
- trace_probe_remove_file(tp, file);
-
- return 0;
-}
-
/* Event entry printers */
static enum print_line_t
print_fentry_event(struct trace_iterator *iter, int flags,
@@ -543,7 +632,7 @@ print_fentry_event(struct trace_iterator *iter, int flags,
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
- if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+ if (!seq_print_ip_sym_offset(s, field->ip, flags))
goto out;
trace_seq_putc(s, ')');
@@ -573,12 +662,12 @@ print_fexit_event(struct trace_iterator *iter, int flags,
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
- if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+ if (!seq_print_ip_sym_offset(s, field->ret_ip, flags))
goto out;
trace_seq_puts(s, " <- ");
- if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+ if (!seq_print_ip_sym_no_offset(s, field->func, flags))
goto out;
trace_seq_putc(s, ')');
@@ -677,20 +766,52 @@ static int unregister_fprobe_event(struct trace_fprobe *tf)
static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf)
{
- struct tracepoint *tpoint = tf->tpoint;
- unsigned long ip = (unsigned long)tpoint->probestub;
+ struct tracepoint_user *tuser __free(tuser_put) = NULL;
+ struct module *mod __free(module_put) = NULL;
+ unsigned long ip;
+ int ret;
+
+ if (WARN_ON_ONCE(tf->tuser))
+ return -EINVAL;
+
+ /* If the tracepoint is in a module, it must be locked in this function. */
+ tuser = tracepoint_user_find_get(tf->symbol, &mod);
+ /* This tracepoint is not loaded yet */
+ if (IS_ERR(tuser))
+ return PTR_ERR(tuser);
+ if (!tuser)
+ return -ENOMEM;
+
+ /* Register fprobe only if the tracepoint is loaded. */
+ if (tuser->tpoint) {
+ ip = tracepoint_user_ip(tuser);
+ if (WARN_ON_ONCE(!ip))
+ return -ENOENT;
+
+ ret = register_fprobe_ips(&tf->fp, &ip, 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ tf->tuser = no_free_ptr(tuser);
+ return 0;
+}
+
+/* Returns an error if the target function is not available, or 0 */
+static int trace_fprobe_verify_target(struct trace_fprobe *tf)
+{
int ret;
+ /* Tracepoint should have a stub function. */
+ if (trace_fprobe_is_tracepoint(tf))
+ return 0;
+
/*
- * Here, we do 2 steps to enable fprobe on a tracepoint.
- * At first, put __probestub_##TP function on the tracepoint
- * and put a fprobe on the stub function.
+ * Note: since we don't lock the module, even if this succeeded,
+ * register_fprobe() later can fail.
*/
- ret = tracepoint_probe_register_prio_may_exist(tpoint,
- tpoint->probestub, NULL, 0);
- if (ret < 0)
- return ret;
- return register_fprobe_ips(&tf->fp, &ip, 1);
+ ret = fprobe_count_ips_from_filter(tf->symbol, NULL);
+ return (ret < 0) ? ret : 0;
}
/* Internal register function - just handle fprobe and flags */
@@ -712,20 +833,10 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
return ret;
}
- /* Set/clear disabled flag according to tp->flag */
- if (trace_probe_is_enabled(&tf->tp))
- tf->fp.flags &= ~FPROBE_FL_DISABLED;
- else
- tf->fp.flags |= FPROBE_FL_DISABLED;
-
- if (trace_fprobe_is_tracepoint(tf)) {
-
- /* This tracepoint is not loaded yet */
- if (tf->tpoint == TRACEPOINT_STUB)
- return 0;
+ tf->fp.flags &= ~FPROBE_FL_DISABLED;
+ if (trace_fprobe_is_tracepoint(tf))
return __regsiter_tracepoint_fprobe(tf);
- }
/* TODO: handle filter, nofilter or symbol list */
return register_fprobe(&tf->fp, tf->symbol, NULL);
@@ -734,15 +845,11 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
/* Internal unregister function - just handle fprobe and flags */
static void __unregister_trace_fprobe(struct trace_fprobe *tf)
{
- if (trace_fprobe_is_registered(tf)) {
+ if (trace_fprobe_is_registered(tf))
unregister_fprobe(&tf->fp);
- memset(&tf->fp, 0, sizeof(tf->fp));
- if (trace_fprobe_is_tracepoint(tf)) {
- tracepoint_probe_unregister(tf->tpoint,
- tf->tpoint->probestub, NULL);
- tf->tpoint = NULL;
- tf->mod = NULL;
- }
+ if (tf->tuser) {
+ tracepoint_user_put(tf->tuser);
+ tf->tuser = NULL;
}
}
@@ -802,7 +909,7 @@ static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig,
return false;
}
-static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
+static int append_trace_fprobe_event(struct trace_fprobe *tf, struct trace_fprobe *to)
{
int ret;
@@ -830,7 +937,7 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
if (ret)
return ret;
- ret = __register_trace_fprobe(tf);
+ ret = trace_fprobe_verify_target(tf);
if (ret)
trace_probe_unlink(&tf->tp);
else
@@ -839,20 +946,18 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
return ret;
}
-/* Register a trace_probe and probe_event */
-static int register_trace_fprobe(struct trace_fprobe *tf)
+/* Register a trace_probe and probe_event, and check the fprobe is available. */
+static int register_trace_fprobe_event(struct trace_fprobe *tf)
{
struct trace_fprobe *old_tf;
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
old_tf = find_trace_fprobe(trace_probe_name(&tf->tp),
trace_probe_group_name(&tf->tp));
- if (old_tf) {
- ret = append_trace_fprobe(tf, old_tf);
- goto end;
- }
+ if (old_tf)
+ return append_trace_fprobe_event(tf, old_tf);
/* Register new event */
ret = register_fprobe_event(tf);
@@ -862,18 +967,16 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
trace_probe_log_err(0, EVENT_EXIST);
} else
pr_warn("Failed to register probe event(%d)\n", ret);
- goto end;
+ return ret;
}
- /* Register fprobe */
- ret = __register_trace_fprobe(tf);
+ /* Verify fprobe is sane. */
+ ret = trace_fprobe_verify_target(tf);
if (ret < 0)
unregister_fprobe_event(tf);
else
dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp));
-end:
- mutex_unlock(&event_mutex);
return ret;
}
@@ -888,14 +991,15 @@ static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mo
struct __find_tracepoint_cb_data *data = priv;
if (!data->tpoint && !strcmp(data->tp_name, tp->name)) {
- data->tpoint = tp;
- if (!data->mod) {
+ /* If module is not specified, try getting module refcount. */
+ if (!data->mod && mod) {
+ /* If failed to get refcount, ignore this tracepoint. */
+ if (!try_module_get(mod))
+ return;
+
data->mod = mod;
- if (!try_module_get(data->mod)) {
- data->tpoint = NULL;
- data->mod = NULL;
- }
}
+ data->tpoint = tp;
}
}
@@ -908,11 +1012,9 @@ static void __find_tracepoint_cb(struct tracepoint *tp, void *priv)
}
/*
- * Find a tracepoint from kernel and module. If the tracepoint is in a module,
- * this increments the module refcount to prevent unloading until the
- * trace_fprobe is registered to the list. After registering the trace_fprobe
- * on the trace_fprobe list, the module refcount is decremented because
- * tracepoint_probe_module_cb will handle it.
+ * Find a tracepoint from kernel and module. If the tracepoint is on the module,
+ * the module's refcount is incremented and returned as *@tp_mod. Thus, if it is
+ * not NULL, caller must call module_put(*tp_mod) after used the tracepoint.
*/
static struct tracepoint *find_tracepoint(const char *tp_name,
struct module **tp_mod)
@@ -933,15 +1035,10 @@ static struct tracepoint *find_tracepoint(const char *tp_name,
}
#ifdef CONFIG_MODULES
-static void reenable_trace_fprobe(struct trace_fprobe *tf)
-{
- struct trace_probe *tp = &tf->tp;
-
- list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
- __enable_trace_fprobe(tf);
- }
-}
-
+/*
+ * Find a tracepoint from specified module. In this case, this does not get the
+ * module's refcount. The caller must ensure the module is not freed.
+ */
static struct tracepoint *find_tracepoint_in_module(struct module *mod,
const char *tp_name)
{
@@ -954,33 +1051,94 @@ static struct tracepoint *find_tracepoint_in_module(struct module *mod,
return data.tpoint;
}
+/* These are CONFIG_MODULES=y specific functions. */
+static bool tracepoint_user_within_module(struct tracepoint_user *tuser,
+ struct module *mod)
+{
+ return within_module(tracepoint_user_ip(tuser), mod);
+}
+
+static int tracepoint_user_register_again(struct tracepoint_user *tuser,
+ struct tracepoint *tpoint)
+{
+ tuser->tpoint = tpoint;
+ return tracepoint_user_register(tuser);
+}
+
+static void tracepoint_user_unregister_clear(struct tracepoint_user *tuser)
+{
+ tracepoint_user_unregister(tuser);
+ tuser->tpoint = NULL;
+}
+
+/* module callback for tracepoint_user */
static int __tracepoint_probe_module_cb(struct notifier_block *self,
unsigned long val, void *data)
{
struct tp_module *tp_mod = data;
+ struct tracepoint_user *tuser;
struct tracepoint *tpoint;
+
+ if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING)
+ return NOTIFY_DONE;
+
+ mutex_lock(&tracepoint_user_mutex);
+ for_each_tracepoint_user(tuser) {
+ if (val == MODULE_STATE_COMING) {
+ /* This is not a tracepoint in this module. Skip it. */
+ tpoint = find_tracepoint_in_module(tp_mod->mod, tuser->name);
+ if (!tpoint)
+ continue;
+ WARN_ON_ONCE(tracepoint_user_register_again(tuser, tpoint));
+ } else if (val == MODULE_STATE_GOING &&
+ tracepoint_user_within_module(tuser, tp_mod->mod)) {
+ /* Unregister all tracepoint_user in this module. */
+ tracepoint_user_unregister_clear(tuser);
+ }
+ }
+ mutex_unlock(&tracepoint_user_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block tracepoint_module_nb = {
+ .notifier_call = __tracepoint_probe_module_cb,
+};
+
+/* module callback for tprobe events */
+static int __tprobe_event_module_cb(struct notifier_block *self,
+ unsigned long val, void *data)
+{
struct trace_fprobe *tf;
struct dyn_event *pos;
+ struct module *mod = data;
if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING)
return NOTIFY_DONE;
mutex_lock(&event_mutex);
for_each_trace_fprobe(tf, pos) {
- if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) {
- tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol);
- if (tpoint) {
- tf->tpoint = tpoint;
- tf->mod = tp_mod->mod;
- if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) &&
- trace_probe_is_enabled(&tf->tp))
- reenable_trace_fprobe(tf);
- }
- } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) {
- tracepoint_probe_unregister(tf->tpoint,
- tf->tpoint->probestub, NULL);
- tf->tpoint = NULL;
- tf->mod = NULL;
+ /* Skip fprobe and disabled tprobe events. */
+ if (!trace_fprobe_is_tracepoint(tf) || !tf->tuser)
+ continue;
+
+ /* Before this notification, tracepoint notifier has already done. */
+ if (val == MODULE_STATE_COMING &&
+ tracepoint_user_within_module(tf->tuser, mod)) {
+ unsigned long ip = tracepoint_user_ip(tf->tuser);
+
+ WARN_ON_ONCE(register_fprobe_ips(&tf->fp, &ip, 1));
+ } else if (val == MODULE_STATE_GOING &&
+ /*
+ * tracepoint_user_within_module() does not work here because
+ * tracepoint_user is already unregistered and cleared tpoint.
+ * Instead, checking whether the fprobe is registered but
+ * tpoint is cleared(unregistered). Such unbalance probes
+ * must be adjusted anyway.
+ */
+ trace_fprobe_is_registered(tf) &&
+ !tf->tuser->tpoint) {
+ unregister_fprobe(&tf->fp);
}
}
mutex_unlock(&event_mutex);
@@ -988,8 +1146,11 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self,
return NOTIFY_DONE;
}
-static struct notifier_block tracepoint_module_nb = {
- .notifier_call = __tracepoint_probe_module_cb,
+/* NOTE: this must be called after tracepoint callback */
+static struct notifier_block tprobe_event_module_nb = {
+ .notifier_call = __tprobe_event_module_cb,
+ /* Make sure this is later than tracepoint module notifier. */
+ .priority = -10,
};
#endif /* CONFIG_MODULES */
@@ -1018,6 +1179,19 @@ static int parse_symbol_and_return(int argc, const char *argv[],
if (*is_return)
return 0;
+ if (is_tracepoint) {
+ tmp = *symbol;
+ while (*tmp && (isalnum(*tmp) || *tmp == '_'))
+ tmp++;
+ if (*tmp) {
+ /* find a wrong character. */
+ trace_probe_log_err(tmp - *symbol, BAD_TP_NAME);
+ kfree(*symbol);
+ *symbol = NULL;
+ return -EINVAL;
+ }
+ }
+
/* If there is $retval, this should be a return fprobe. */
for (i = 2; i < argc; i++) {
tmp = strstr(argv[i], "$retval");
@@ -1025,6 +1199,8 @@ static int parse_symbol_and_return(int argc, const char *argv[],
if (is_tracepoint) {
trace_probe_log_set_index(i);
trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE);
+ kfree(*symbol);
+ *symbol = NULL;
return -EINVAL;
}
*is_return = true;
@@ -1034,7 +1210,8 @@ static int parse_symbol_and_return(int argc, const char *argv[],
return 0;
}
-static int __trace_fprobe_create(int argc, const char *argv[])
+static int trace_fprobe_create_internal(int argc, const char *argv[],
+ struct traceprobe_parse_context *ctx)
{
/*
* Argument syntax:
@@ -1060,24 +1237,19 @@ static int __trace_fprobe_create(int argc, const char *argv[])
* Type of args:
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
- struct trace_fprobe *tf = NULL;
- int i, len, new_argc = 0, ret = 0;
- bool is_return = false;
- char *symbol = NULL;
+ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
const char *event = NULL, *group = FPROBE_EVENT_SYSTEM;
- const char **new_argv = NULL;
- int maxactive = 0;
- char buf[MAX_EVENT_NAME_LEN];
- char gbuf[MAX_EVENT_NAME_LEN];
- char sbuf[KSYM_NAME_LEN];
- char abuf[MAX_BTF_ARGS_LEN];
- char *dbuf = NULL;
+ struct module *mod __free(module_put) = NULL;
+ const char **new_argv __free(kfree) = NULL;
+ char *symbol __free(kfree) = NULL;
+ char *ebuf __free(kfree) = NULL;
+ char *gbuf __free(kfree) = NULL;
+ char *sbuf __free(kfree) = NULL;
+ char *abuf __free(kfree) = NULL;
+ char *dbuf __free(kfree) = NULL;
+ int i, new_argc = 0, ret = 0;
bool is_tracepoint = false;
- struct module *tp_mod = NULL;
- struct tracepoint *tpoint = NULL;
- struct traceprobe_parse_context ctx = {
- .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
- };
+ bool is_return = false;
if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2)
return -ECANCELED;
@@ -1087,35 +1259,13 @@ static int __trace_fprobe_create(int argc, const char *argv[])
group = TRACEPOINT_EVENT_SYSTEM;
}
- trace_probe_log_init("trace_fprobe", argc, argv);
-
- event = strchr(&argv[0][1], ':');
- if (event)
- event++;
-
- if (isdigit(argv[0][1])) {
- if (event)
- len = event - &argv[0][1] - 1;
- else
- len = strlen(&argv[0][1]);
- if (len > MAX_EVENT_NAME_LEN - 1) {
- trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
- }
- memcpy(buf, &argv[0][1], len);
- buf[len] = '\0';
- ret = kstrtouint(buf, 0, &maxactive);
- if (ret || !maxactive) {
+ if (argv[0][1] != '\0') {
+ if (argv[0][1] != ':') {
+ trace_probe_log_set_index(0);
trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
- }
- /* fprobe rethook instances are iterated over via a list. The
- * maximum should stay reasonable.
- */
- if (maxactive > RETHOOK_MAXACTIVE_MAX) {
- trace_probe_log_err(1, MAXACT_TOO_BIG);
- goto parse_error;
+ return -EINVAL;
}
+ event = &argv[0][2];
}
trace_probe_log_set_index(1);
@@ -1123,109 +1273,114 @@ static int __trace_fprobe_create(int argc, const char *argv[])
/* a symbol(or tracepoint) must be specified */
ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint);
if (ret < 0)
- goto parse_error;
-
- if (!is_return && maxactive) {
- trace_probe_log_set_index(0);
- trace_probe_log_err(1, BAD_MAXACT_TYPE);
- goto parse_error;
- }
+ return -EINVAL;
trace_probe_log_set_index(0);
if (event) {
+ gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!gbuf)
+ return -ENOMEM;
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
- goto parse_error;
+ return -EINVAL;
}
if (!event) {
+ ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!ebuf)
+ return -ENOMEM;
/* Make a new event name */
if (is_tracepoint)
- snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s",
+ snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s%s",
isdigit(*symbol) ? "_" : "", symbol);
else
- snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol,
+ snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s__%s", symbol,
is_return ? "exit" : "entry");
- sanitize_event_name(buf);
- event = buf;
+ sanitize_event_name(ebuf);
+ event = ebuf;
}
if (is_return)
- ctx.flags |= TPARG_FL_RETURN;
+ ctx->flags |= TPARG_FL_RETURN;
else
- ctx.flags |= TPARG_FL_FENTRY;
+ ctx->flags |= TPARG_FL_FENTRY;
+ ctx->funcname = NULL;
if (is_tracepoint) {
- ctx.flags |= TPARG_FL_TPOINT;
- tpoint = find_tracepoint(symbol, &tp_mod);
+ /* Get tracepoint and lock its module until the end of the registration. */
+ struct tracepoint *tpoint;
+
+ ctx->flags |= TPARG_FL_TPOINT;
+ mod = NULL;
+ tpoint = find_tracepoint(symbol, &mod);
if (tpoint) {
- ctx.funcname = kallsyms_lookup(
- (unsigned long)tpoint->probestub,
- NULL, NULL, NULL, sbuf);
- } else if (IS_ENABLED(CONFIG_MODULES)) {
- /* This *may* be loaded afterwards */
- tpoint = TRACEPOINT_STUB;
- ctx.funcname = symbol;
- } else {
- trace_probe_log_set_index(1);
- trace_probe_log_err(0, NO_TRACEPOINT);
- goto parse_error;
+ sbuf = kmalloc(KSYM_NAME_LEN, GFP_KERNEL);
+ if (!sbuf)
+ return -ENOMEM;
+ ctx->funcname = kallsyms_lookup((unsigned long)tpoint->probestub,
+ NULL, NULL, NULL, sbuf);
}
- } else
- ctx.funcname = symbol;
+ }
+ if (!ctx->funcname)
+ ctx->funcname = symbol;
+ abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL);
+ if (!abuf)
+ return -ENOMEM;
argc -= 2; argv += 2;
new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
- abuf, MAX_BTF_ARGS_LEN, &ctx);
- if (IS_ERR(new_argv)) {
- ret = PTR_ERR(new_argv);
- new_argv = NULL;
- goto out;
- }
+ abuf, MAX_BTF_ARGS_LEN, ctx);
+ if (IS_ERR(new_argv))
+ return PTR_ERR(new_argv);
if (new_argv) {
argc = new_argc;
argv = new_argv;
}
if (argc > MAX_TRACE_ARGS) {
- ret = -E2BIG;
- goto out;
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_ARGS);
+ return -E2BIG;
}
ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
if (ret)
- goto out;
+ return ret;
/* setup a probe */
- tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod,
- maxactive, argc, is_return);
+ tf = alloc_trace_fprobe(group, event, symbol, argc, is_return, is_tracepoint);
if (IS_ERR(tf)) {
ret = PTR_ERR(tf);
/* This must return -ENOMEM, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM);
- goto out; /* We know tf is not allocated */
+ return ret;
}
/* parse arguments */
for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
- ctx.offset = 0;
- ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx);
+ ctx->offset = 0;
+ ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], ctx);
if (ret)
- goto error; /* This can be -ENOMEM */
+ return ret; /* This can be -ENOMEM */
}
if (is_return && tf->tp.entry_arg) {
tf->fp.entry_handler = trace_fprobe_entry_handler;
tf->fp.entry_data_size = traceprobe_get_entry_data_size(&tf->tp);
+ if (ALIGN(tf->fp.entry_data_size, sizeof(long)) > MAX_FPROBE_DATA_SIZE) {
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_EARGS);
+ return -E2BIG;
+ }
}
ret = traceprobe_set_print_fmt(&tf->tp,
is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL);
if (ret < 0)
- goto error;
+ return ret;
- ret = register_trace_fprobe(tf);
+ ret = register_trace_fprobe_event(tf);
if (ret) {
trace_probe_log_set_index(1);
if (ret == -EILSEQ)
@@ -1234,29 +1389,35 @@ static int __trace_fprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_PROBE_ADDR);
else if (ret != -ENOMEM && ret != -EEXIST)
trace_probe_log_err(0, FAIL_REG_PROBE);
- goto error;
+ return -EINVAL;
}
-out:
- if (tp_mod)
- module_put(tp_mod);
- traceprobe_finish_parse(&ctx);
+ /* 'tf' is successfully registered. To avoid freeing, assign NULL. */
+ tf = NULL;
+
+ return 0;
+}
+
+static int trace_fprobe_create_cb(int argc, const char *argv[])
+{
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
+ int ret;
+
+ ctx = kzalloc_obj(*ctx);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE;
+
+ trace_probe_log_init("trace_fprobe", argc, argv);
+ ret = trace_fprobe_create_internal(argc, argv, ctx);
trace_probe_log_clear();
- kfree(new_argv);
- kfree(symbol);
- kfree(dbuf);
return ret;
-
-parse_error:
- ret = -EINVAL;
-error:
- free_trace_fprobe(tf);
- goto out;
}
static int trace_fprobe_create(const char *raw_command)
{
- return trace_probe_create(raw_command, __trace_fprobe_create);
+ return trace_probe_create(raw_command, trace_fprobe_create_cb);
}
static int trace_fprobe_release(struct dyn_event *ev)
@@ -1278,8 +1439,6 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev)
seq_putc(m, 't');
else
seq_putc(m, 'f');
- if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive)
- seq_printf(m, "%d", tf->fp.nr_maxactive);
seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp),
trace_probe_name(&tf->tp));
@@ -1294,6 +1453,88 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev)
}
/*
+ * Enable trace_probe
+ * if the file is NULL, enable "perf" handler, or enable "trace" handler.
+ */
+static int enable_trace_fprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *tp;
+ struct trace_fprobe *tf;
+ bool enabled;
+ int ret = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This also changes "enabled" state */
+ if (file) {
+ ret = trace_probe_add_file(tp, file);
+ if (ret)
+ return ret;
+ } else
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
+
+ if (!enabled) {
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ ret = __register_trace_fprobe(tf);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Disable trace_probe
+ * if the file is NULL, disable "perf" handler, or disable "trace" handler.
+ */
+static int disable_trace_fprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_fprobe *tf;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ if (file) {
+ if (!trace_probe_get_file_link(tp, file))
+ return -ENOENT;
+ if (!trace_probe_has_single_file(tp))
+ goto out;
+ trace_probe_clear_flag(tp, TP_FLAG_TRACE);
+ } else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+
+ if (!trace_probe_is_enabled(tp)) {
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ unregister_fprobe(&tf->fp);
+ if (tf->tuser) {
+ tracepoint_user_put(tf->tuser);
+ tf->tuser = NULL;
+ }
+ }
+ }
+
+ out:
+ if (file)
+ /*
+ * Synchronization is done in below function. For perf event,
+ * file == NULL and perf_trace_event_unreg() calls
+ * tracepoint_synchronize_unregister() to ensure synchronize
+ * event. We don't need to care about it.
+ */
+ trace_probe_remove_file(tp, file);
+
+ return 0;
+}
+
+/*
* called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex.
*/
static int fprobe_register(struct trace_event_call *event,
@@ -1338,6 +1579,9 @@ static __init int init_fprobe_trace_early(void)
ret = register_tracepoint_module_notifier(&tracepoint_module_nb);
if (ret)
return ret;
+ ret = register_module_notifier(&tprobe_event_module_nb);
+ if (ret)
+ return ret;
#endif
return 0;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index d358c9935164..f283391a4dc8 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -25,6 +25,9 @@ static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
static void
+function_args_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
+static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
static void
@@ -42,9 +45,10 @@ enum {
TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */
TRACE_FUNC_OPT_STACK = 0x1,
TRACE_FUNC_OPT_NO_REPEATS = 0x2,
+ TRACE_FUNC_OPT_ARGS = 0x4,
/* Update this to next highest bit. */
- TRACE_FUNC_OPT_HIGHEST_BIT = 0x4
+ TRACE_FUNC_OPT_HIGHEST_BIT = 0x8
};
#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1)
@@ -57,7 +61,7 @@ int ftrace_allocate_ftrace_ops(struct trace_array *tr)
if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
return 0;
- ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+ ops = kzalloc_obj(*ops);
if (!ops)
return -ENOMEM;
@@ -114,6 +118,8 @@ static ftrace_func_t select_trace_function(u32 flags_val)
switch (flags_val & TRACE_FUNC_OPT_MASK) {
case TRACE_FUNC_NO_OPTS:
return function_trace_call;
+ case TRACE_FUNC_OPT_ARGS:
+ return function_args_trace_call;
case TRACE_FUNC_OPT_STACK:
return function_stack_trace_call;
case TRACE_FUNC_OPT_NO_REPEATS:
@@ -148,11 +154,11 @@ static int function_trace_init(struct trace_array *tr)
if (!tr->ops)
return -ENOMEM;
- func = select_trace_function(func_flags.val);
+ func = select_trace_function(tr->current_trace_flags->val);
if (!func)
return -EINVAL;
- if (!handle_func_repeats(tr, func_flags.val))
+ if (!handle_func_repeats(tr, tr->current_trace_flags->val))
return -ENOMEM;
ftrace_init_array_ops(tr, func);
@@ -203,7 +209,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = op->private;
- struct trace_array_cpu *data;
unsigned int trace_ctx;
int bit;
@@ -216,11 +221,31 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
parent_ip = function_get_true_parent_ip(parent_ip, fregs);
+ trace_ctx = tracing_gen_ctx_dec();
+
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+
+ ftrace_test_recursion_unlock(bit);
+}
+
+static void
+function_args_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
+{
+ struct trace_array *tr = op->private;
+ unsigned int trace_ctx;
+ int bit;
+
+ if (unlikely(!tr->function_enabled))
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
trace_ctx = tracing_gen_ctx();
- data = this_cpu_ptr(tr->array_buffer.data);
- if (!atomic_read(&data->disabled))
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, fregs);
ftrace_test_recursion_unlock(bit);
}
@@ -266,11 +291,11 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
parent_ip = function_get_true_parent_ip(parent_ip, fregs);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
+ disabled = local_inc_return(&data->disabled);
if (likely(disabled == 1)) {
trace_ctx = tracing_gen_ctx_flags(flags);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
#ifdef CONFIG_UNWINDER_FRAME_POINTER
if (ftrace_pids_enabled(op))
skip++;
@@ -278,7 +303,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
__trace_stack(tr, trace_ctx, skip);
}
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
local_irq_restore(flags);
}
@@ -319,9 +344,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
{
struct trace_func_repeats *last_info;
struct trace_array *tr = op->private;
- struct trace_array_cpu *data;
unsigned int trace_ctx;
- unsigned long flags;
int bit;
if (unlikely(!tr->function_enabled))
@@ -332,8 +355,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
return;
parent_ip = function_get_true_parent_ip(parent_ip, fregs);
- data = this_cpu_ptr(tr->array_buffer.data);
- if (atomic_read(&data->disabled))
+ if (!tracer_tracing_is_on(tr))
goto out;
/*
@@ -347,11 +369,10 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
if (is_repeat_check(tr, last_info, ip, parent_ip))
goto out;
- local_save_flags(flags);
- trace_ctx = tracing_gen_ctx_flags(flags);
+ trace_ctx = tracing_gen_ctx_dec();
process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
out:
ftrace_test_recursion_unlock(bit);
@@ -381,7 +402,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
parent_ip = function_get_true_parent_ip(parent_ip, fregs);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
+ disabled = local_inc_return(&data->disabled);
if (likely(disabled == 1)) {
last_info = per_cpu_ptr(tr->last_func_repeats, cpu);
@@ -391,12 +412,12 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx_flags(flags);
process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
__trace_stack(tr, trace_ctx, STACK_SKIP);
}
out:
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
local_irq_restore(flags);
}
@@ -405,6 +426,9 @@ static struct tracer_opt func_opts[] = {
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
{ TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) },
+#ifdef CONFIG_FUNCTION_TRACE_ARGS
+ { TRACER_OPT(func-args, TRACE_FUNC_OPT_ARGS) },
+#endif
{ } /* Always set a last empty entry */
};
@@ -435,14 +459,14 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
u32 new_flags;
/* Do nothing if already set. */
- if (!!set == !!(func_flags.val & bit))
+ if (!!set == !!(tr->current_trace_flags->val & bit))
return 0;
/* We can change this flag only when not running. */
if (tr->current_trace != &function_trace)
return 0;
- new_flags = (func_flags.val & ~bit) | (set ? bit : 0);
+ new_flags = (tr->current_trace_flags->val & ~bit) | (set ? bit : 0);
func = select_trace_function(new_flags);
if (!func)
return -EINVAL;
@@ -467,7 +491,7 @@ static struct tracer function_trace __tracer_data =
.init = function_trace_init,
.reset = function_trace_reset,
.start = function_trace_start,
- .flags = &func_flags,
+ .default_flags = &func_flags,
.set_flag = func_set_flag,
.allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
@@ -599,11 +623,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
static __always_inline void trace_stack(struct trace_array *tr)
{
- unsigned int trace_ctx;
-
- trace_ctx = tracing_gen_ctx();
-
- __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP);
+ __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP);
}
static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 5504b5e4e7b4..0d2d3a2ea7dd 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -16,9 +16,12 @@
#include "trace.h"
#include "trace_output.h"
-/* When set, irq functions will be ignored */
+/* When set, irq functions might be ignored */
static int ftrace_graph_skip_irqs;
+/* Do not record function time when task is sleeping */
+int fgraph_no_sleep_time;
+
struct fgraph_cpu_data {
pid_t last_pid;
int depth;
@@ -27,14 +30,26 @@ struct fgraph_cpu_data {
unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
};
+struct fgraph_ent_args {
+ struct ftrace_graph_ent_entry ent;
+ /* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */
+ unsigned long args[FTRACE_REGS_MAX_ARGS];
+};
+
+struct fgraph_retaddr_ent_args {
+ struct fgraph_retaddr_ent_entry ent;
+ /* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */
+ unsigned long args[FTRACE_REGS_MAX_ARGS];
+};
+
struct fgraph_data {
struct fgraph_cpu_data __percpu *cpu_data;
/* Place to preserve last processed entry. */
union {
- struct ftrace_graph_ent_entry ent;
- struct fgraph_retaddr_ent_entry rent;
- } ent;
+ struct fgraph_ent_args ent;
+ struct fgraph_retaddr_ent_args rent;
+ };
struct ftrace_graph_ret_entry ret;
int failed;
int cpu;
@@ -71,14 +86,13 @@ static struct tracer_opt trace_opts[] = {
/* Display function return address ? */
{ TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) },
#endif
+#ifdef CONFIG_FUNCTION_TRACE_ARGS
+ /* Display function arguments ? */
+ { TRACER_OPT(funcgraph-args, TRACE_GRAPH_ARGS) },
+#endif
/* Include sleep time (scheduled out) between entry and return */
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
-#ifdef CONFIG_FUNCTION_PROFILER
- /* Include time within nested functions */
- { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) },
-#endif
-
{ } /* Empty entry */
};
@@ -86,13 +100,13 @@ static struct tracer_flags tracer_flags = {
/* Don't display overruns, proc, or tail by default */
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS |
- TRACE_GRAPH_SLEEP_TIME | TRACE_GRAPH_GRAPH_TIME,
+ TRACE_GRAPH_SLEEP_TIME,
.opts = trace_opts
};
-static bool tracer_flags_is_set(u32 flags)
+static bool tracer_flags_is_set(struct trace_array *tr, u32 flags)
{
- return (tracer_flags.val & flags) == flags;
+ return (tr->current_trace_flags->val & flags) == flags;
}
/*
@@ -110,43 +124,73 @@ static void
print_graph_duration(struct trace_array *tr, unsigned long long duration,
struct trace_seq *s, u32 flags);
-int __trace_graph_entry(struct trace_array *tr,
- struct ftrace_graph_ent *trace,
- unsigned int trace_ctx)
+static int __graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx, struct ftrace_regs *fregs)
{
struct ring_buffer_event *event;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
+ int size;
- event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
- sizeof(*entry), trace_ctx);
+ /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */
+ size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long));
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, size, trace_ctx);
if (!event)
return 0;
- entry = ring_buffer_event_data(event);
- entry->graph_ent = *trace;
+
+ entry = ring_buffer_event_data(event);
+ entry->graph_ent = *trace;
+
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ if (fregs) {
+ for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++)
+ entry->args[i] = ftrace_regs_get_argument(fregs, i);
+ }
+#endif
+
trace_buffer_unlock_commit_nostack(buffer, event);
return 1;
}
+int __trace_graph_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx)
+{
+ return __graph_entry(tr, trace, trace_ctx, NULL);
+}
+
#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
int __trace_graph_retaddr_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned int trace_ctx,
- unsigned long retaddr)
+ unsigned long retaddr,
+ struct ftrace_regs *fregs)
{
struct ring_buffer_event *event;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct fgraph_retaddr_ent_entry *entry;
+ int size;
+
+ /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */
+ size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long));
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT,
- sizeof(*entry), trace_ctx);
+ size, trace_ctx);
if (!event)
return 0;
entry = ring_buffer_event_data(event);
- entry->graph_ent.func = trace->func;
- entry->graph_ent.depth = trace->depth;
- entry->graph_ent.retaddr = retaddr;
+ entry->graph_rent.ent = *trace;
+ entry->graph_rent.retaddr = retaddr;
+
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ if (fregs) {
+ for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++)
+ entry->args[i] = ftrace_regs_get_argument(fregs, i);
+ }
+#endif
+
trace_buffer_unlock_commit_nostack(buffer, event);
return 1;
@@ -155,17 +199,21 @@ int __trace_graph_retaddr_entry(struct trace_array *tr,
int __trace_graph_retaddr_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned int trace_ctx,
- unsigned long retaddr)
+ unsigned long retaddr,
+ struct ftrace_regs *fregs)
{
return 1;
}
#endif
-static inline int ftrace_graph_ignore_irqs(void)
+static inline int ftrace_graph_ignore_irqs(struct trace_array *tr)
{
if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
return 0;
+ if (tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS))
+ return 0;
+
return in_hardirq();
}
@@ -174,18 +222,15 @@ struct fgraph_times {
unsigned long long sleeptime; /* may be optional! */
};
-int trace_graph_entry(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+static int graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
- struct trace_array_cpu *data;
struct fgraph_times *ftimes;
- unsigned long flags;
unsigned int trace_ctx;
- long disabled;
- int ret;
- int cpu;
+ int ret = 0;
if (*task_var & TRACE_GRAPH_NOTRACE)
return 0;
@@ -198,7 +243,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
* returning from the function.
*/
if (ftrace_graph_notrace_addr(trace->func)) {
- *task_var |= TRACE_GRAPH_NOTRACE_BIT;
+ *task_var |= TRACE_GRAPH_NOTRACE;
/*
* Need to return 1 to have the return called
* that will clear the NOTRACE bit.
@@ -206,22 +251,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
return 1;
}
- if (!ftrace_trace_task(tr))
- return 0;
-
if (ftrace_graph_ignore_func(gops, trace))
return 0;
- if (ftrace_graph_ignore_irqs())
+ if (ftrace_graph_ignore_irqs(tr))
return 0;
- if (fgraph_sleep_time) {
- /* Only need to record the calltime */
- ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime));
- } else {
+ if (fgraph_no_sleep_time &&
+ !tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) {
ftimes = fgraph_reserve_data(gops->idx, sizeof(*ftimes));
if (ftimes)
ftimes->sleeptime = current->ftrace_sleeptime;
+ } else {
+ /* Only need to record the calltime */
+ ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime));
}
if (!ftimes)
return 0;
@@ -235,29 +278,33 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
if (tracing_thresh)
return 1;
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- trace_ctx = tracing_gen_ctx_flags(flags);
- if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
- tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR))) {
- unsigned long retaddr = ftrace_graph_top_ret_addr(current);
-
- ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr);
- } else
- ret = __trace_graph_entry(tr, trace, trace_ctx);
+ trace_ctx = tracing_gen_ctx();
+ if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
+ tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_RETADDR)) {
+ unsigned long retaddr = ftrace_graph_top_ret_addr(current);
+ ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx,
+ retaddr, fregs);
} else {
- ret = 0;
+ ret = __graph_entry(tr, trace, trace_ctx, fregs);
}
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-
return ret;
}
+int trace_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ return graph_entry(trace, gops, NULL);
+}
+
+static int trace_graph_entry_args(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ return graph_entry(trace, gops, fregs);
+}
+
static void
__trace_graph_function(struct trace_array *tr,
unsigned long ip, unsigned int trace_ctx)
@@ -270,12 +317,10 @@ __trace_graph_function(struct trace_array *tr,
struct ftrace_graph_ret ret = {
.func = ip,
.depth = 0,
- .calltime = time,
- .rettime = time,
};
__trace_graph_entry(tr, &ent, trace_ctx);
- __trace_graph_return(tr, &ret, trace_ctx);
+ __trace_graph_return(tr, &ret, trace_ctx, time, time);
}
void
@@ -287,8 +332,9 @@ trace_graph_function(struct trace_array *tr,
}
void __trace_graph_return(struct trace_array *tr,
- struct ftrace_graph_ret *trace,
- unsigned int trace_ctx)
+ struct ftrace_graph_ret *trace,
+ unsigned int trace_ctx,
+ u64 calltime, u64 rettime)
{
struct ring_buffer_event *event;
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -300,31 +346,36 @@ void __trace_graph_return(struct trace_array *tr,
return;
entry = ring_buffer_event_data(event);
entry->ret = *trace;
+ entry->calltime = calltime;
+ entry->rettime = rettime;
trace_buffer_unlock_commit_nostack(buffer, event);
}
-static void handle_nosleeptime(struct ftrace_graph_ret *trace,
+static void handle_nosleeptime(struct trace_array *tr,
+ struct ftrace_graph_ret *trace,
struct fgraph_times *ftimes,
int size)
{
- if (fgraph_sleep_time || size < sizeof(*ftimes))
+ if (size < sizeof(*ftimes))
+ return;
+
+ if (!fgraph_no_sleep_time || tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME))
return;
ftimes->calltime += current->ftrace_sleeptime - ftimes->sleeptime;
}
void trace_graph_return(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops, struct ftrace_regs *fregs)
{
unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
- struct trace_array_cpu *data;
struct fgraph_times *ftimes;
- unsigned long flags;
unsigned int trace_ctx;
- long disabled;
+ u64 calltime, rettime;
int size;
- int cpu;
+
+ rettime = trace_clock_local();
ftrace_graph_addr_finish(gops, trace);
@@ -337,32 +388,31 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
if (!ftimes)
return;
- handle_nosleeptime(trace, ftimes, size);
+ handle_nosleeptime(tr, trace, ftimes, size);
- trace->calltime = ftimes->calltime;
+ calltime = ftimes->calltime;
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- trace_ctx = tracing_gen_ctx_flags(flags);
- __trace_graph_return(tr, trace, trace_ctx);
- }
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
+ trace_ctx = tracing_gen_ctx();
+ __trace_graph_return(tr, trace, trace_ctx, calltime, rettime);
}
static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
+ unsigned long *task_var = fgraph_get_task_var(gops);
struct fgraph_times *ftimes;
+ struct trace_array *tr;
+ unsigned int trace_ctx;
+ u64 calltime, rettime;
int size;
+ rettime = trace_clock_local();
+
ftrace_graph_addr_finish(gops, trace);
- if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
- trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
+ if (*task_var & TRACE_GRAPH_NOTRACE) {
+ *task_var &= ~TRACE_GRAPH_NOTRACE;
return;
}
@@ -370,15 +420,16 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
if (!ftimes)
return;
- handle_nosleeptime(trace, ftimes, size);
+ tr = gops->private;
+ handle_nosleeptime(tr, trace, ftimes, size);
- trace->calltime = ftimes->calltime;
+ calltime = ftimes->calltime;
- if (tracing_thresh &&
- (trace->rettime - ftimes->calltime < tracing_thresh))
+ if (tracing_thresh && (rettime - calltime < tracing_thresh))
return;
- else
- trace_graph_return(trace, gops);
+
+ trace_ctx = tracing_gen_ctx();
+ __trace_graph_return(tr, trace, trace_ctx, calltime, rettime);
}
static struct fgraph_ops funcgraph_ops = {
@@ -390,7 +441,7 @@ int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
{
struct fgraph_ops *gops;
- gops = kzalloc(sizeof(*gops), GFP_KERNEL);
+ gops = kzalloc_obj(*gops);
if (!gops)
return -ENOMEM;
@@ -421,14 +472,23 @@ static int graph_trace_init(struct trace_array *tr)
{
int ret;
- tr->gops->entryfunc = trace_graph_entry;
+ if (tracer_flags_is_set(tr, TRACE_GRAPH_ARGS))
+ tr->gops->entryfunc = trace_graph_entry_args;
+ else
+ tr->gops->entryfunc = trace_graph_entry;
if (tracing_thresh)
tr->gops->retfunc = trace_graph_thresh_return;
else
tr->gops->retfunc = trace_graph_return;
- /* Make gops functions are visible before we start tracing */
+ if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS))
+ ftrace_graph_skip_irqs++;
+
+ if (!tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME))
+ fgraph_no_sleep_time++;
+
+ /* Make gops functions visible before we start tracing */
smp_mb();
ret = register_ftrace_graph(tr->gops);
@@ -439,8 +499,42 @@ static int graph_trace_init(struct trace_array *tr)
return 0;
}
+static struct tracer graph_trace;
+
+static int ftrace_graph_trace_args(struct trace_array *tr, int set)
+{
+ trace_func_graph_ent_t entry;
+
+ if (set)
+ entry = trace_graph_entry_args;
+ else
+ entry = trace_graph_entry;
+
+ /* See if there's any changes */
+ if (tr->gops->entryfunc == entry)
+ return 0;
+
+ unregister_ftrace_graph(tr->gops);
+
+ tr->gops->entryfunc = entry;
+
+ /* Make gops functions visible before we start tracing */
+ smp_mb();
+ return register_ftrace_graph(tr->gops);
+}
+
static void graph_trace_reset(struct trace_array *tr)
{
+ if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS))
+ ftrace_graph_skip_irqs--;
+ if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0))
+ ftrace_graph_skip_irqs = 0;
+
+ if (!tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME))
+ fgraph_no_sleep_time--;
+ if (WARN_ON_ONCE(fgraph_no_sleep_time < 0))
+ fgraph_no_sleep_time = 0;
+
tracing_stop_cmdline_record();
unregister_ftrace_graph(tr->gops);
}
@@ -469,7 +563,7 @@ static void print_graph_proc(struct trace_seq *s, pid_t pid)
{
char comm[TASK_COMM_LEN];
/* sign + log10(MAX_INT) + '\0' */
- char pid_str[11];
+ char pid_str[12];
int spaces = 0;
int len;
int i;
@@ -583,10 +677,9 @@ get_return_for_leaf(struct trace_iterator *iter,
* Save current and next entries for later reference
* if the output fails.
*/
- if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT))
- data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr;
- else
- data->ent.ent = *curr;
+ int size = min_t(int, sizeof(data->rent), iter->ent_size);
+
+ memcpy(&data->rent, curr, size);
/*
* If the next event is not a return type, then
* we only care about what type it is. Otherwise we can
@@ -649,7 +742,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
addr >= (unsigned long)__irqentry_text_end)
return;
- if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) {
/* Absolute time */
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
print_graph_abs_time(iter->ts, s);
@@ -669,7 +762,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
}
/* Latency format */
- if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
+ if (tr->trace_flags & TRACE_ITER(LATENCY_FMT))
print_graph_lat_fmt(s, ent);
}
@@ -723,7 +816,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
struct trace_seq *s, u32 flags)
{
if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
- !(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
+ !(tr->trace_flags & TRACE_ITER(CONTEXT_INFO)))
return;
/* No real adata, just filling the column with spaces */
@@ -764,7 +857,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e
trace_seq_puts(s, " /*");
trace_seq_puts(s, " <-");
- seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET);
+ seq_print_ip_sym_offset(s, entry->graph_rent.retaddr, trace_flags);
if (comment)
trace_seq_puts(s, " */");
@@ -778,7 +871,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e
static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry,
struct ftrace_graph_ret *graph_ret, void *func,
- u32 opt_flags, u32 trace_flags)
+ u32 opt_flags, u32 trace_flags, int args_size)
{
unsigned long err_code = 0;
unsigned long retval = 0;
@@ -812,11 +905,16 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr
if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT)
print_retaddr = false;
- trace_seq_printf(s, "%ps();", func);
+ trace_seq_printf(s, "%ps", func);
+
+ if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) {
+ print_function_args(s, FGRAPH_ENTRY_ARGS(entry), (unsigned long)func);
+ trace_seq_putc(s, ';');
+ } else
+ trace_seq_puts(s, "();");
+
if (print_retval || print_retaddr)
trace_seq_puts(s, " /*");
- else
- trace_seq_putc(s, '\n');
} else {
print_retaddr = false;
trace_seq_printf(s, "} /* %ps", func);
@@ -834,12 +932,13 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr
}
if (!entry || print_retval || print_retaddr)
- trace_seq_puts(s, " */\n");
+ trace_seq_puts(s, " */");
}
#else
-#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0)
+#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags, args_size) \
+ do {} while (0)
#endif
@@ -855,15 +954,16 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
- unsigned long func;
+ unsigned long ret_func;
+ int args_size;
int cpu = iter->cpu;
int i;
+ args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args);
+
graph_ret = &ret_entry->ret;
call = &entry->graph_ent;
- duration = graph_ret->rettime - graph_ret->calltime;
-
- func = call->func + iter->tr->text_delta;
+ duration = ret_entry->rettime - ret_entry->calltime;
if (data) {
struct fgraph_cpu_data *cpu_data;
@@ -890,16 +990,25 @@ print_graph_entry_leaf(struct trace_iterator *iter,
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
trace_seq_putc(s, ' ');
+ ret_func = graph_ret->func + iter->tr->text_delta;
+
/*
* Write out the function return value or return address
*/
if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) {
print_graph_retval(s, entry, graph_ret,
(void *)graph_ret->func + iter->tr->text_delta,
- flags, tr->trace_flags);
+ flags, tr->trace_flags, args_size);
} else {
- trace_seq_printf(s, "%ps();\n", (void *)func);
+ trace_seq_printf(s, "%ps", (void *)ret_func);
+
+ if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) {
+ print_function_args(s, FGRAPH_ENTRY_ARGS(entry), ret_func);
+ trace_seq_putc(s, ';');
+ } else
+ trace_seq_puts(s, "();");
}
+ trace_seq_putc(s, '\n');
print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
cpu, iter->ent->pid, flags);
@@ -916,6 +1025,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
unsigned long func;
+ int args_size;
int i;
if (data) {
@@ -940,7 +1050,17 @@ print_graph_entry_nested(struct trace_iterator *iter,
func = call->func + iter->tr->text_delta;
- trace_seq_printf(s, "%ps() {", (void *)func);
+ trace_seq_printf(s, "%ps", (void *)func);
+
+ args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args);
+
+ if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long))
+ print_function_args(s, FGRAPH_ENTRY_ARGS(entry), func);
+ else
+ trace_seq_puts(s, "()");
+
+ trace_seq_puts(s, " {");
+
if (flags & __TRACE_GRAPH_PRINT_RETADDR &&
entry->ent.type == TRACE_GRAPH_RETADDR_ENT)
print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry,
@@ -973,7 +1093,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
/* Interrupt */
print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
- if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
+ if (!(tr->trace_flags & TRACE_ITER(CONTEXT_INFO)))
return;
/* Absolute time */
@@ -995,7 +1115,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
}
/* Latency format */
- if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
+ if (tr->trace_flags & TRACE_ITER(LATENCY_FMT))
print_graph_lat_fmt(s, ent);
return;
@@ -1110,21 +1230,41 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
struct trace_iterator *iter, u32 flags)
{
struct fgraph_data *data = iter->private;
- struct ftrace_graph_ent *call = &field->graph_ent;
+ struct ftrace_graph_ent *call;
struct ftrace_graph_ret_entry *leaf_ret;
static enum print_line_t ret;
int cpu = iter->cpu;
+ /*
+ * print_graph_entry() may consume the current event,
+ * thus @field may become invalid, so we need to save it.
+ * This function is shared by ftrace_graph_ent_entry and
+ * fgraph_retaddr_ent_entry, the size of the latter one
+ * is larger, but it is very small and can be safely saved
+ * at the stack.
+ */
+ struct ftrace_graph_ent_entry *entry;
+ struct fgraph_retaddr_ent_entry *rentry;
+ u8 save_buf[sizeof(*rentry) + FTRACE_REGS_MAX_ARGS * sizeof(long)];
+
+ /* The ent_size is expected to be as big as the entry */
+ if (iter->ent_size > sizeof(save_buf))
+ iter->ent_size = sizeof(save_buf);
+
+ entry = (void *)save_buf;
+ memcpy(entry, field, iter->ent_size);
+
+ call = &entry->graph_ent;
if (check_irq_entry(iter, flags, call->func, call->depth))
return TRACE_TYPE_HANDLED;
print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
- leaf_ret = get_return_for_leaf(iter, field);
+ leaf_ret = get_return_for_leaf(iter, entry);
if (leaf_ret)
- ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
+ ret = print_graph_entry_leaf(iter, entry, leaf_ret, s, flags);
else
- ret = print_graph_entry_nested(iter, field, s, cpu, flags);
+ ret = print_graph_entry_nested(iter, entry, s, cpu, flags);
if (data) {
/*
@@ -1142,11 +1282,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
}
static enum print_line_t
-print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
+print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s,
struct trace_entry *ent, struct trace_iterator *iter,
u32 flags)
{
- unsigned long long duration = trace->rettime - trace->calltime;
+ struct ftrace_graph_ret *trace = &retentry->ret;
+ u64 calltime = retentry->calltime;
+ u64 rettime = retentry->rettime;
+ unsigned long long duration = rettime - calltime;
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
unsigned long func;
@@ -1195,7 +1338,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* funcgraph-retval option is enabled.
*/
if (flags & __TRACE_GRAPH_PRINT_RETVAL) {
- print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags);
+ print_graph_retval(s, NULL, trace, (void *)func, flags,
+ tr->trace_flags, 0);
} else {
/*
* If the return function does not have a matching entry,
@@ -1205,10 +1349,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* that if the funcgraph-tail option is enabled.
*/
if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
- trace_seq_puts(s, "}\n");
+ trace_seq_puts(s, "}");
else
- trace_seq_printf(s, "} /* %ps */\n", (void *)func);
+ trace_seq_printf(s, "} /* %ps */", (void *)func);
}
+ trace_seq_putc(s, '\n');
/* Overrun */
if (flags & TRACE_GRAPH_PRINT_OVERRUN)
@@ -1323,31 +1468,28 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
switch (entry->type) {
case TRACE_GRAPH_ENT: {
- /*
- * print_graph_entry() may consume the current event,
- * thus @field may become invalid, so we need to save it.
- * sizeof(struct ftrace_graph_ent_entry) is very small,
- * it can be safely saved at the stack.
- */
- struct ftrace_graph_ent_entry saved;
trace_assign_type(field, entry);
- saved = *field;
- return print_graph_entry(&saved, s, iter, flags);
+ return print_graph_entry(field, s, iter, flags);
}
#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
case TRACE_GRAPH_RETADDR_ENT: {
- struct fgraph_retaddr_ent_entry saved;
+ /*
+ * ftrace_graph_ent_entry and fgraph_retaddr_ent_entry have
+ * similar functions and memory layouts. The only difference
+ * is that the latter one has an extra retaddr member, so
+ * they can share most of the logic.
+ */
struct fgraph_retaddr_ent_entry *rfield;
trace_assign_type(rfield, entry);
- saved = *rfield;
- return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags);
+ return print_graph_entry((struct ftrace_graph_ent_entry *)rfield,
+ s, iter, flags);
}
#endif
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
trace_assign_type(field, entry);
- return print_graph_return(&field->ret, s, entry, iter, flags);
+ return print_graph_return(field, s, entry, iter, flags);
}
case TRACE_STACK:
case TRACE_FN:
@@ -1364,7 +1506,8 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
static enum print_line_t
print_graph_function(struct trace_iterator *iter)
{
- return print_graph_function_flags(iter, tracer_flags.val);
+ struct trace_array *tr = iter->tr;
+ return print_graph_function_flags(iter, tr->current_trace_flags->val);
}
static enum print_line_t
@@ -1400,7 +1543,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
static void __print_graph_headers_flags(struct trace_array *tr,
struct seq_file *s, u32 flags)
{
- int lat = tr->trace_flags & TRACE_ITER_LATENCY_FMT;
+ int lat = tr->trace_flags & TRACE_ITER(LATENCY_FMT);
if (lat)
print_lat_header(s, flags);
@@ -1440,7 +1583,10 @@ static void __print_graph_headers_flags(struct trace_array *tr,
static void print_graph_headers(struct seq_file *s)
{
- print_graph_headers_flags(s, tracer_flags.val);
+ struct trace_iterator *iter = s->private;
+ struct trace_array *tr = iter->tr;
+
+ print_graph_headers_flags(s, tr->current_trace_flags->val);
}
void print_graph_headers_flags(struct seq_file *s, u32 flags)
@@ -1448,10 +1594,10 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
struct trace_iterator *iter = s->private;
struct trace_array *tr = iter->tr;
- if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
+ if (!(tr->trace_flags & TRACE_ITER(CONTEXT_INFO)))
return;
- if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) {
+ if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) {
/* print nothing if the buffers are empty */
if (trace_empty(iter))
return;
@@ -1474,7 +1620,7 @@ void graph_trace_open(struct trace_iterator *iter)
/* We can be called in atomic context via ftrace_dump() */
gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
- data = kzalloc(sizeof(*data), gfpflags);
+ data = kzalloc_obj(*data, gfpflags);
if (!data)
goto out_err;
@@ -1511,20 +1657,63 @@ void graph_trace_close(struct trace_iterator *iter)
if (data) {
free_percpu(data->cpu_data);
kfree(data);
+ iter->private = NULL;
}
}
static int
func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
- if (bit == TRACE_GRAPH_PRINT_IRQS)
- ftrace_graph_skip_irqs = !set;
+/*
+ * The function profiler gets updated even if function graph
+ * isn't the current tracer. Handle it separately.
+ */
+#ifdef CONFIG_FUNCTION_PROFILER
+ if (bit == TRACE_GRAPH_SLEEP_TIME && (tr->flags & TRACE_ARRAY_FL_GLOBAL) &&
+ !!set == fprofile_no_sleep_time) {
+ if (set) {
+ fgraph_no_sleep_time--;
+ if (WARN_ON_ONCE(fgraph_no_sleep_time < 0))
+ fgraph_no_sleep_time = 0;
+ fprofile_no_sleep_time = false;
+ } else {
+ fgraph_no_sleep_time++;
+ fprofile_no_sleep_time = true;
+ }
+ }
+#endif
+
+ /* Do nothing if the current tracer is not this tracer */
+ if (tr->current_trace != &graph_trace)
+ return 0;
- if (bit == TRACE_GRAPH_SLEEP_TIME)
- ftrace_graph_sleep_time_control(set);
+ /* Do nothing if already set. */
+ if (!!set == !!(tr->current_trace_flags->val & bit))
+ return 0;
- if (bit == TRACE_GRAPH_GRAPH_TIME)
- ftrace_graph_graph_time_control(set);
+ switch (bit) {
+ case TRACE_GRAPH_SLEEP_TIME:
+ if (set) {
+ fgraph_no_sleep_time--;
+ if (WARN_ON_ONCE(fgraph_no_sleep_time < 0))
+ fgraph_no_sleep_time = 0;
+ } else {
+ fgraph_no_sleep_time++;
+ }
+ break;
+
+ case TRACE_GRAPH_PRINT_IRQS:
+ if (set)
+ ftrace_graph_skip_irqs--;
+ else
+ ftrace_graph_skip_irqs++;
+ if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0))
+ ftrace_graph_skip_irqs = 0;
+ break;
+
+ case TRACE_GRAPH_ARGS:
+ return ftrace_graph_trace_args(tr, set);
+ }
return 0;
}
@@ -1561,7 +1750,7 @@ static struct tracer graph_trace __tracer_data = {
.reset = graph_trace_reset,
.print_line = print_graph_function,
.print_header = print_graph_headers,
- .flags = &tracer_flags,
+ .default_flags = &tracer_flags,
.set_flag = func_graph_set_flag,
.allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index b65353ec2837..3fe274b84f1c 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -102,9 +102,9 @@ struct hwlat_sample {
/* keep the global state somewhere. */
static struct hwlat_data {
- struct mutex lock; /* protect changes */
+ struct mutex lock; /* protect changes */
- u64 count; /* total since reset */
+ atomic64_t count; /* total since reset */
u64 sample_window; /* total sampling window (on+off) */
u64 sample_width; /* active sampling portion of window */
@@ -193,8 +193,7 @@ void trace_hwlat_callback(bool enter)
* get_sample - sample the CPU TSC and look for likely hardware latencies
*
* Used to repeatedly capture the CPU TSC (or similar), looking for potential
- * hardware-induced latency. Called with interrupts disabled and with
- * hwlat_data.lock held.
+ * hardware-induced latency. Called with interrupts disabled.
*/
static int get_sample(void)
{
@@ -204,6 +203,7 @@ static int get_sample(void)
time_type start, t1, t2, last_t2;
s64 diff, outer_diff, total, last_total = 0;
u64 sample = 0;
+ u64 sample_width = READ_ONCE(hwlat_data.sample_width);
u64 thresh = tracing_thresh;
u64 outer_sample = 0;
int ret = -1;
@@ -267,7 +267,7 @@ static int get_sample(void)
if (diff > sample)
sample = diff; /* only want highest value */
- } while (total <= hwlat_data.sample_width);
+ } while (total <= sample_width);
barrier(); /* finish the above in the view for NMIs */
trace_hwlat_callback_enabled = false;
@@ -285,8 +285,7 @@ static int get_sample(void)
if (kdata->nmi_total_ts)
do_div(kdata->nmi_total_ts, NSEC_PER_USEC);
- hwlat_data.count++;
- s.seqnum = hwlat_data.count;
+ s.seqnum = atomic64_inc_return(&hwlat_data.count);
s.duration = sample;
s.outer_duration = outer_sample;
s.nmi_total_ts = kdata->nmi_total_ts;
@@ -325,12 +324,9 @@ static void move_to_next_cpu(void)
cpus_read_lock();
cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
- next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
+ next_cpu = cpumask_next_wrap(raw_smp_processor_id(), current_mask);
cpus_read_unlock();
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(current_mask);
-
if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
goto change_mode;
@@ -835,7 +831,7 @@ static int hwlat_tracer_init(struct trace_array *tr)
hwlat_trace = tr;
- hwlat_data.count = 0;
+ atomic64_set(&hwlat_data.count, 0);
tr->max_latency = 0;
save_tracing_thresh = tracing_thresh;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index fce064e20570..17673905907c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -63,7 +63,7 @@ irq_trace(void)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static int irqsoff_display_graph(struct trace_array *tr, int set);
-# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH)
+# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER(DISPLAY_GRAPH))
#else
static inline int irqsoff_display_graph(struct trace_array *tr, int set)
{
@@ -123,12 +123,12 @@ static int func_prolog_dec(struct trace_array *tr,
return 0;
*data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&(*data)->disabled);
+ disabled = local_inc_return(&(*data)->disabled);
if (likely(disabled == 1))
return 1;
- atomic_dec(&(*data)->disabled);
+ local_dec(&(*data)->disabled);
return 0;
}
@@ -150,9 +150,9 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx_flags(flags);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, fregs);
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
}
#endif /* CONFIG_FUNCTION_TRACER */
@@ -176,13 +176,15 @@ static int irqsoff_display_graph(struct trace_array *tr, int set)
}
static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
- int ret;
+ u64 *calltime;
+ int ret = 0;
if (ftrace_graph_ignore_func(gops, trace))
return 0;
@@ -199,29 +201,41 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
if (!func_prolog_dec(tr, &data, &flags))
return 0;
- trace_ctx = tracing_gen_ctx_flags(flags);
- ret = __trace_graph_entry(tr, trace, trace_ctx);
- atomic_dec(&data->disabled);
+ calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
+ if (calltime) {
+ *calltime = trace_clock_local();
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
+ }
+ local_dec(&data->disabled);
return ret;
}
static void irqsoff_graph_return(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
+ u64 *calltime;
+ u64 rettime;
+ int size;
ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_dec(tr, &data, &flags))
return;
- trace_ctx = tracing_gen_ctx_flags(flags);
- __trace_graph_return(tr, trace, trace_ctx);
- atomic_dec(&data->disabled);
+ rettime = trace_clock_local();
+ calltime = fgraph_retrieve_data(gops->idx, &size);
+ if (calltime) {
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
+ }
+ local_dec(&data->disabled);
}
static struct fgraph_ops fgraph_ops = {
@@ -233,8 +247,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
- else
- iter->private = NULL;
}
static void irqsoff_trace_close(struct trace_iterator *iter)
@@ -278,11 +290,17 @@ __trace_function(struct trace_array *tr,
if (is_graph(tr))
trace_graph_function(tr, ip, parent_ip, trace_ctx);
else
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
}
#else
-#define __trace_function trace_function
+static inline void
+__trace_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned int trace_ctx)
+{
+ return trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+}
static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
{
@@ -376,6 +394,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
int cpu;
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
+ long disabled;
if (!tracer_enabled || !tracing_is_enabled())
return;
@@ -387,20 +406,22 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- if (unlikely(!data) || atomic_read(&data->disabled))
+ if (unlikely(!data) || local_read(&data->disabled))
return;
- atomic_inc(&data->disabled);
+ disabled = local_inc_return(&data->disabled);
- data->critical_sequence = max_sequence;
- data->preempt_timestamp = ftrace_now(cpu);
- data->critical_start = parent_ip ? : ip;
+ if (disabled == 1) {
+ data->critical_sequence = max_sequence;
+ data->preempt_timestamp = ftrace_now(cpu);
+ data->critical_start = parent_ip ? : ip;
- __trace_function(tr, ip, parent_ip, tracing_gen_ctx());
+ __trace_function(tr, ip, parent_ip, tracing_gen_ctx());
- per_cpu(tracing_cpu, cpu) = 1;
+ per_cpu(tracing_cpu, cpu) = 1;
+ }
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
}
static nokprobe_inline void
@@ -410,6 +431,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ long disabled;
cpu = raw_smp_processor_id();
/* Always clear the tracing cpu on stopping the trace */
@@ -424,16 +446,19 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
data = per_cpu_ptr(tr->array_buffer.data, cpu);
if (unlikely(!data) ||
- !data->critical_start || atomic_read(&data->disabled))
+ !data->critical_start || local_read(&data->disabled))
return;
- atomic_inc(&data->disabled);
+ disabled = local_inc_return(&data->disabled);
- trace_ctx = tracing_gen_ctx();
- __trace_function(tr, ip, parent_ip, trace_ctx);
- check_critical_timing(tr, data, parent_ip ? : ip, cpu);
- data->critical_start = 0;
- atomic_dec(&data->disabled);
+ if (disabled == 1) {
+ trace_ctx = tracing_gen_ctx();
+ __trace_function(tr, ip, parent_ip, trace_ctx);
+ check_critical_timing(tr, data, parent_ip ? : ip, cpu);
+ data->critical_start = 0;
+ }
+
+ local_dec(&data->disabled);
}
/* start and stop critical timings used to for stoppage (in idle) */
@@ -460,8 +485,8 @@ static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
{
int ret;
- /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
- if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
+ /* 'set' is set if TRACE_ITER(FUNCTION) is about to be set */
+ if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER(FUNCTION))))
return 0;
if (graph)
@@ -490,7 +515,7 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph)
static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set)
{
- if (!(mask & TRACE_ITER_FUNCTION))
+ if (!(mask & TRACE_ITER(FUNCTION)))
return 0;
if (set)
@@ -511,7 +536,7 @@ static inline int irqsoff_function_set(struct trace_array *tr, u32 mask, int set
}
#endif /* CONFIG_FUNCTION_TRACER */
-static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
+static int irqsoff_flag_changed(struct trace_array *tr, u64 mask, int set)
{
struct tracer *tracer = tr->current_trace;
@@ -519,7 +544,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
return 0;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- if (mask & TRACE_ITER_DISPLAY_GRAPH)
+ if (mask & TRACE_ITER(DISPLAY_GRAPH))
return irqsoff_display_graph(tr, set);
#endif
@@ -557,10 +582,10 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
save_flags = tr->trace_flags;
/* non overwrite screws up the latency tracers */
- set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
- set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
+ set_tracer_flag(tr, TRACE_ITER(OVERWRITE), 1);
+ set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), 1);
/* without pause, we will produce garbage if another latency occurs */
- set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, 1);
+ set_tracer_flag(tr, TRACE_ITER(PAUSE_ON_TRACE), 1);
tr->max_latency = 0;
irqsoff_trace = tr;
@@ -580,15 +605,15 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
static void __irqsoff_tracer_reset(struct trace_array *tr)
{
- int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
- int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
- int pause_flag = save_flags & TRACE_ITER_PAUSE_ON_TRACE;
+ int lat_flag = save_flags & TRACE_ITER(LATENCY_FMT);
+ int overwrite_flag = save_flags & TRACE_ITER(OVERWRITE);
+ int pause_flag = save_flags & TRACE_ITER(PAUSE_ON_TRACE);
stop_irqsoff_tracer(tr, is_graph(tr));
- set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
- set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
- set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, pause_flag);
+ set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), lat_flag);
+ set_tracer_flag(tr, TRACE_ITER(OVERWRITE), overwrite_flag);
+ set_tracer_flag(tr, TRACE_ITER(PAUSE_ON_TRACE), pause_flag);
ftrace_reset_array_ops(tr);
irqsoff_busy = false;
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 1e72d20b3c2f..b30795f34079 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -31,7 +31,7 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
old_userobj = tr->trace_flags;
/* don't look at user memory in panic mode */
- tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+ tr->trace_flags &= ~TRACE_ITER(SYM_USEROBJ);
kdb_printf("Dumping ftrace buffer:\n");
if (skip_entries)
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter.buffer_iter[cpu] =
- ring_buffer_read_prepare(iter.array_buffer->buffer,
- cpu, GFP_ATOMIC);
- ring_buffer_read_start(iter.buffer_iter[cpu]);
+ ring_buffer_read_start(iter.array_buffer->buffer,
+ cpu, GFP_ATOMIC);
tracing_iter_reset(&iter, cpu);
}
} else {
iter.cpu_file = cpu_file;
iter.buffer_iter[cpu_file] =
- ring_buffer_read_prepare(iter.array_buffer->buffer,
+ ring_buffer_read_start(iter.array_buffer->buffer,
cpu_file, GFP_ATOMIC);
- ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
@@ -98,7 +96,6 @@ static int kdb_ftdump(int argc, const char **argv)
long cpu_file;
int err;
int cnt;
- int cpu;
if (argc > 2)
return KDB_ARGCOUNT;
@@ -120,9 +117,7 @@ static int kdb_ftdump(int argc, const char **argv)
trace_init_global_iter(&iter);
iter.buffer_iter = buffer_iter;
- for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
- }
+ tracer_tracing_disable(iter.tr);
/* A negative skip_entries means skip all but the last entries */
if (skip_entries < 0) {
@@ -135,9 +130,7 @@ static int kdb_ftdump(int argc, const char **argv)
ftrace_dump_buf(skip_entries, cpu_file);
- for_each_tracing_cpu(cpu) {
- atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
- }
+ tracer_tracing_enable(iter.tr);
kdb_trap_printk--;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 935a886af40c..a8420e6abb56 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -8,19 +8,20 @@
#define pr_fmt(fmt) "trace_kprobe: " fmt
#include <linux/bpf-cgroup.h>
-#include <linux/security.h>
+#include <linux/cleanup.h>
+#include <linux/error-injection.h>
#include <linux/module.h>
-#include <linux/uaccess.h>
#include <linux/rculist.h>
-#include <linux/error-injection.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
#include <asm/setup.h> /* for COMMAND_LINE_SIZE */
#include "trace_dynevent.h"
#include "trace_kprobe_selftest.h"
#include "trace_probe.h"
-#include "trace_probe_tmpl.h"
#include "trace_probe_kernel.h"
+#include "trace_probe_tmpl.h"
#define KPROBE_EVENT_SYSTEM "kprobes"
#define KRETPROBE_MAXACTIVE_MAX 4096
@@ -30,7 +31,8 @@ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata;
static int __init set_kprobe_boot_events(char *str)
{
- strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
+ trace_append_boot_param(kprobe_boot_events_buf, str, ';',
+ COMMAND_LINE_SIZE);
disable_tracing_selftest("running kprobe events");
return 1;
@@ -81,6 +83,7 @@ static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev)
#define for_each_trace_kprobe(pos, dpos) \
for_each_dyn_event(dpos) \
if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos)))
+#define trace_kprobe_list_empty() list_empty(&dyn_event_list)
static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
{
@@ -123,9 +126,8 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
if (!p)
return true;
*p = '\0';
- rcu_read_lock_sched();
- ret = !!find_module(tk->symbol);
- rcu_read_unlock_sched();
+ scoped_guard(rcu)
+ ret = !!find_module(tk->symbol);
*p = ':';
return ret;
@@ -257,6 +259,9 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
}
}
+DEFINE_FREE(free_trace_kprobe, struct trace_kprobe *,
+ if (!IS_ERR_OR_NULL(_T)) free_trace_kprobe(_T))
+
/*
* Allocate new trace_probe and initialize it (including kprobes).
*/
@@ -268,21 +273,21 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
int maxactive,
int nargs, bool is_return)
{
- struct trace_kprobe *tk;
+ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
int ret = -ENOMEM;
- tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL);
+ tk = kzalloc_flex(*tk, tp.args, nargs);
if (!tk)
return ERR_PTR(ret);
tk->nhit = alloc_percpu(unsigned long);
if (!tk->nhit)
- goto error;
+ return ERR_PTR(ret);
if (symbol) {
tk->symbol = kstrdup(symbol, GFP_KERNEL);
if (!tk->symbol)
- goto error;
+ return ERR_PTR(ret);
tk->rp.kp.symbol_name = tk->symbol;
tk->rp.kp.offset = offs;
} else
@@ -299,13 +304,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
ret = trace_probe_init(&tk->tp, event, group, false, nargs);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
dyn_event_init(&tk->devent, &trace_kprobe_ops);
- return tk;
-error:
- free_trace_kprobe(tk);
- return ERR_PTR(ret);
+ return_ptr(tk);
}
static struct trace_kprobe *find_trace_kprobe(const char *event,
@@ -634,7 +636,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
struct trace_kprobe *old_tk;
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
old_tk = find_trace_kprobe(trace_probe_name(&tk->tp),
trace_probe_group_name(&tk->tp));
@@ -642,11 +644,9 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) {
trace_probe_log_set_index(0);
trace_probe_log_err(0, DIFF_PROBE_TYPE);
- ret = -EEXIST;
- } else {
- ret = append_trace_kprobe(tk, old_tk);
+ return -EEXIST;
}
- goto end;
+ return append_trace_kprobe(tk, old_tk);
}
/* Register new event */
@@ -657,7 +657,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
trace_probe_log_err(0, EVENT_EXIST);
} else
pr_warn("Failed to register probe event(%d)\n", ret);
- goto end;
+ return ret;
}
/* Register k*probe */
@@ -672,8 +672,6 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
else
dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp));
-end:
- mutex_unlock(&event_mutex);
return ret;
}
@@ -706,7 +704,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
return NOTIFY_DONE;
/* Update probes on coming module */
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
for_each_trace_kprobe(tk, pos) {
if (trace_kprobe_within_module(tk, mod)) {
/* Don't need to check busy - this should have gone. */
@@ -718,7 +716,6 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
module_name(mod), ret);
}
}
- mutex_unlock(&event_mutex);
return NOTIFY_DONE;
}
@@ -769,6 +766,14 @@ static unsigned int number_of_same_symbols(const char *mod, const char *func_nam
if (!mod)
kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
+ /*
+ * If the symbol is found in vmlinux, use vmlinux resolution only.
+ * This prevents module symbols from shadowing vmlinux symbols
+ * and causing -EADDRNOTAVAIL for unqualified kprobe targets.
+ */
+ if (!mod && ctx.count > 0)
+ return ctx.count;
+
module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx);
return ctx.count;
@@ -800,12 +805,10 @@ static struct module *try_module_get_by_name(const char *name)
{
struct module *mod;
- rcu_read_lock_sched();
+ guard(rcu)();
mod = find_module(name);
if (mod && !try_module_get(mod))
mod = NULL;
- rcu_read_unlock_sched();
-
return mod;
}
#else
@@ -840,7 +843,8 @@ out:
static int trace_kprobe_entry_handler(struct kretprobe_instance *ri,
struct pt_regs *regs);
-static int __trace_kprobe_create(int argc, const char *argv[])
+static int trace_kprobe_create_internal(int argc, const char *argv[],
+ struct traceprobe_parse_context *ctx)
{
/*
* Argument syntax:
@@ -866,21 +870,21 @@ static int __trace_kprobe_create(int argc, const char *argv[])
* Type of args:
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
- struct trace_kprobe *tk = NULL;
- int i, len, new_argc = 0, ret = 0;
- bool is_return = false;
- char *symbol = NULL, *tmp = NULL;
- const char **new_argv = NULL;
+ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
+ const char **new_argv __free(kfree) = NULL;
+ int i, len, new_argc = 0, ret = 0;
+ char *symbol __free(kfree) = NULL;
+ char *ebuf __free(kfree) = NULL;
+ char *gbuf __free(kfree) = NULL;
+ char *abuf __free(kfree) = NULL;
+ char *dbuf __free(kfree) = NULL;
enum probe_print_type ptype;
+ bool is_return = false;
int maxactive = 0;
- long offset = 0;
void *addr = NULL;
- char buf[MAX_EVENT_NAME_LEN];
- char gbuf[MAX_EVENT_NAME_LEN];
- char abuf[MAX_BTF_ARGS_LEN];
- char *dbuf = NULL;
- struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
+ char *tmp = NULL;
+ long offset = 0;
switch (argv[0][0]) {
case 'r':
@@ -894,16 +898,16 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (argc < 2)
return -ECANCELED;
- trace_probe_log_init("trace_kprobe", argc, argv);
-
event = strchr(&argv[0][1], ':');
if (event)
event++;
if (isdigit(argv[0][1])) {
+ char *buf __free(kfree) = NULL;
+
if (!is_return) {
trace_probe_log_err(1, BAD_MAXACT_TYPE);
- goto parse_error;
+ return -EINVAL;
}
if (event)
len = event - &argv[0][1] - 1;
@@ -911,21 +915,23 @@ static int __trace_kprobe_create(int argc, const char *argv[])
len = strlen(&argv[0][1]);
if (len > MAX_EVENT_NAME_LEN - 1) {
trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
+ return -EINVAL;
}
- memcpy(buf, &argv[0][1], len);
+ buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
buf[len] = '\0';
ret = kstrtouint(buf, 0, &maxactive);
if (ret || !maxactive) {
trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
+ return -EINVAL;
}
/* kretprobes instances are iterated over via a list. The
* maximum should stay reasonable.
*/
if (maxactive > KRETPROBE_MAXACTIVE_MAX) {
trace_probe_log_err(1, MAXACT_TOO_BIG);
- goto parse_error;
+ return -EINVAL;
}
}
@@ -934,10 +940,9 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
trace_probe_log_set_index(1);
/* Check whether uprobe event specified */
- if (strchr(argv[1], '/') && strchr(argv[1], ':')) {
- ret = -ECANCELED;
- goto error;
- }
+ if (strchr(argv[1], '/') && strchr(argv[1], ':'))
+ return -ECANCELED;
+
/* a symbol specified */
symbol = kstrdup(argv[1], GFP_KERNEL);
if (!symbol)
@@ -950,7 +955,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
is_return = true;
} else {
trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX);
- goto parse_error;
+ return -EINVAL;
}
}
@@ -958,7 +963,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret || offset < 0 || offset > UINT_MAX) {
trace_probe_log_err(0, BAD_PROBE_ADDR);
- goto parse_error;
+ return -EINVAL;
}
ret = validate_probe_symbol(symbol);
if (ret) {
@@ -966,61 +971,71 @@ static int __trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, NON_UNIQ_SYMBOL);
else
trace_probe_log_err(0, BAD_PROBE_ADDR);
- goto parse_error;
+ return -EINVAL;
}
if (is_return)
- ctx.flags |= TPARG_FL_RETURN;
+ ctx->flags |= TPARG_FL_RETURN;
ret = kprobe_on_func_entry(NULL, symbol, offset);
if (ret == 0 && !is_return)
- ctx.flags |= TPARG_FL_FENTRY;
+ ctx->flags |= TPARG_FL_FENTRY;
/* Defer the ENOENT case until register kprobe */
if (ret == -EINVAL && is_return) {
trace_probe_log_err(0, BAD_RETPROBE);
- goto parse_error;
+ return -EINVAL;
}
}
trace_probe_log_set_index(0);
if (event) {
+ gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!gbuf)
+ return -ENOMEM;
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
- goto parse_error;
+ return ret;
}
if (!event) {
/* Make a new event name */
+ ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!ebuf)
+ return -ENOMEM;
if (symbol)
- snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
+ snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
is_return ? 'r' : 'p', symbol, offset);
else
- snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
+ snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_0x%p",
is_return ? 'r' : 'p', addr);
- sanitize_event_name(buf);
- event = buf;
+ sanitize_event_name(ebuf);
+ event = ebuf;
}
+ abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL);
+ if (!abuf)
+ return -ENOMEM;
argc -= 2; argv += 2;
- ctx.funcname = symbol;
+ ctx->funcname = symbol;
new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
- abuf, MAX_BTF_ARGS_LEN, &ctx);
+ abuf, MAX_BTF_ARGS_LEN, ctx);
if (IS_ERR(new_argv)) {
ret = PTR_ERR(new_argv);
new_argv = NULL;
- goto out;
+ return ret;
}
if (new_argv) {
argc = new_argc;
argv = new_argv;
}
if (argc > MAX_TRACE_ARGS) {
- ret = -E2BIG;
- goto out;
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_ARGS);
+ return -E2BIG;
}
ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
if (ret)
- goto out;
+ return ret;
/* setup a probe */
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
@@ -1029,16 +1044,16 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ret = PTR_ERR(tk);
/* This must return -ENOMEM, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM);
- goto out; /* We know tk is not allocated */
+ return ret; /* We know tk is not allocated */
}
/* parse arguments */
for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
- ctx.offset = 0;
- ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx);
+ ctx->offset = 0;
+ ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], ctx);
if (ret)
- goto error; /* This can be -ENOMEM */
+ return ret; /* This can be -ENOMEM */
}
/* entry handler for kretprobe */
if (is_return && tk->tp.entry_arg) {
@@ -1049,7 +1064,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
ret = traceprobe_set_print_fmt(&tk->tp, ptype);
if (ret < 0)
- goto error;
+ return ret;
ret = register_trace_kprobe(tk);
if (ret) {
@@ -1060,27 +1075,38 @@ static int __trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_PROBE_ADDR);
else if (ret != -ENOMEM && ret != -EEXIST)
trace_probe_log_err(0, FAIL_REG_PROBE);
- goto error;
+ return ret;
}
+ /*
+ * Here, 'tk' has been registered to the list successfully,
+ * so we don't need to free it.
+ */
+ tk = NULL;
+
+ return 0;
+}
+
+static int trace_kprobe_create_cb(int argc, const char *argv[])
+{
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
+ int ret;
+
+ ctx = kzalloc_obj(*ctx);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->flags = TPARG_FL_KERNEL;
+
+ trace_probe_log_init("trace_kprobe", argc, argv);
+
+ ret = trace_kprobe_create_internal(argc, argv, ctx);
-out:
- traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
- kfree(new_argv);
- kfree(symbol);
- kfree(dbuf);
return ret;
-
-parse_error:
- ret = -EINVAL;
-error:
- free_trace_kprobe(tk);
- goto out;
}
static int trace_kprobe_create(const char *raw_command)
{
- return trace_probe_create(raw_command, __trace_kprobe_create);
+ return trace_probe_create(raw_command, trace_kprobe_create_cb);
}
static int create_or_delete_trace_kprobe(const char *raw_command)
@@ -1090,7 +1116,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command)
if (raw_command[0] == '-')
return dyn_event_release(raw_command, &trace_kprobe_ops);
- ret = trace_kprobe_create(raw_command);
+ ret = dyn_event_create(raw_command, &trace_kprobe_ops);
return ret == -ECANCELED ? -EINVAL : ret;
}
@@ -1568,7 +1594,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
- if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+ if (!seq_print_ip_sym_offset(s, field->ip, flags))
goto out;
trace_seq_putc(s, ')');
@@ -1598,12 +1624,12 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
- if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+ if (!seq_print_ip_sym_offset(s, field->ret_ip, flags))
goto out;
trace_seq_puts(s, " <- ");
- if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+ if (!seq_print_ip_sym_no_offset(s, field->func, flags))
goto out;
trace_seq_putc(s, ')');
@@ -1799,14 +1825,15 @@ static int kprobe_register(struct trace_event_call *event,
static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
+ unsigned int flags = trace_probe_load_flag(&tk->tp);
int ret = 0;
raw_cpu_inc(*tk->nhit);
- if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE))
+ if (flags & TP_FLAG_TRACE)
kprobe_trace_func(tk, regs);
#ifdef CONFIG_PERF_EVENTS
- if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE))
+ if (flags & TP_FLAG_PROFILE)
ret = kprobe_perf_func(tk, regs);
#endif
return ret;
@@ -1818,6 +1845,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct kretprobe *rp = get_kretprobe(ri);
struct trace_kprobe *tk;
+ unsigned int flags;
/*
* There is a small chance that get_kretprobe(ri) returns NULL when
@@ -1830,10 +1858,11 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
tk = container_of(rp, struct trace_kprobe, rp);
raw_cpu_inc(*tk->nhit);
- if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE))
+ flags = trace_probe_load_flag(&tk->tp);
+ if (flags & TP_FLAG_TRACE)
kretprobe_trace_func(tk, ri, regs);
#ifdef CONFIG_PERF_EVENTS
- if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE))
+ if (flags & TP_FLAG_PROFILE)
kretprobe_perf_func(tk, ri, regs);
#endif
return 0; /* We don't tweak kernel, so just return 0 */
@@ -1896,7 +1925,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
bool is_return)
{
enum probe_print_type ptype;
- struct trace_kprobe *tk;
+ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
int ret;
char *event;
@@ -1927,19 +1956,14 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
ptype = trace_kprobe_is_return(tk) ?
PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
- if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) {
- ret = -ENOMEM;
- goto error;
- }
+ if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0)
+ return ERR_PTR(-ENOMEM);
ret = __register_trace_kprobe(tk);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
- return trace_probe_event_call(&tk->tp);
-error:
- free_trace_kprobe(tk);
- return ERR_PTR(ret);
+ return trace_probe_event_call(&(no_free_ptr(tk)->tp));
}
void destroy_local_trace_kprobe(struct trace_event_call *event_call)
@@ -1968,13 +1992,15 @@ static __init void enable_boot_kprobe_events(void)
struct trace_kprobe *tk;
struct dyn_event *pos;
- mutex_lock(&event_mutex);
+ if (trace_kprobe_list_empty())
+ return;
+
+ guard(mutex)(&event_mutex);
for_each_trace_kprobe(tk, pos) {
list_for_each_entry(file, &tr->events, list)
if (file->event_call == trace_probe_event_call(&tk->tp))
trace_event_enable_disable(file, 1, 0);
}
- mutex_unlock(&event_mutex);
}
static __init void setup_boot_kprobe_events(void)
@@ -2035,6 +2061,10 @@ static __init int init_kprobe_trace(void)
trace_create_file("kprobe_profile", TRACE_MODE_READ,
NULL, NULL, &kprobe_profile_ops);
+ /* If no 'kprobe_event=' cmd is provided, return directly. */
+ if (kprobe_boot_events_buf[0] == '\0')
+ return 0;
+
setup_boot_kprobe_events();
return 0;
@@ -2066,7 +2096,7 @@ static __init int kprobe_trace_self_tests_init(void)
struct trace_kprobe *tk;
struct trace_event_file *file;
- if (tracing_is_disabled())
+ if (unlikely(tracing_disabled))
return -ENODEV;
if (tracing_selftest_disabled)
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index ba5858866b2f..226cf66e0d68 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -101,7 +101,7 @@ static void mmio_pipe_open(struct trace_iterator *iter)
trace_seq_puts(s, "VERSION 20070824\n");
- hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
+ hiter = kzalloc_obj(*hiter);
if (!hiter)
return;
@@ -291,7 +291,6 @@ __init static int init_mmio_trace(void)
device_initcall(init_mmio_trace);
static void __trace_mmiotrace_rw(struct trace_array *tr,
- struct trace_array_cpu *data,
struct mmiotrace_rw *rw)
{
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -315,12 +314,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
void mmio_trace_rw(struct mmiotrace_rw *rw)
{
struct trace_array *tr = mmio_trace_array;
- struct trace_array_cpu *data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id());
- __trace_mmiotrace_rw(tr, data, rw);
+ __trace_mmiotrace_rw(tr, rw);
}
static void __trace_mmiotrace_map(struct trace_array *tr,
- struct trace_array_cpu *data,
struct mmiotrace_map *map)
{
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -344,12 +341,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
void mmio_trace_mapping(struct mmiotrace_map *map)
{
struct trace_array *tr = mmio_trace_array;
- struct trace_array_cpu *data;
-
- preempt_disable();
- data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id());
- __trace_mmiotrace_map(tr, data, map);
- preempt_enable();
+ __trace_mmiotrace_map(tr, map);
}
int mmio_trace_printk(const char *fmt, va_list args)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index b9f96c77527d..75678053b21c 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -24,6 +24,7 @@
#include <linux/sched/clock.h>
#include <uapi/linux/sched/types.h>
#include <linux/sched.h>
+#include <linux/string.h>
#include "trace.h"
#ifdef CONFIG_X86_LOCAL_APIC
@@ -57,6 +58,7 @@ enum osnoise_options_index {
OSN_PANIC_ON_STOP,
OSN_PREEMPT_DISABLE,
OSN_IRQ_DISABLE,
+ OSN_TIMERLAT_ALIGN,
OSN_MAX
};
@@ -65,7 +67,8 @@ static const char * const osnoise_options_str[OSN_MAX] = {
"OSNOISE_WORKLOAD",
"PANIC_ON_STOP",
"OSNOISE_PREEMPT_DISABLE",
- "OSNOISE_IRQ_DISABLE" };
+ "OSNOISE_IRQ_DISABLE",
+ "TIMERLAT_ALIGN" };
#define OSN_DEFAULT_OPTIONS 0x2
static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS;
@@ -121,7 +124,7 @@ static int osnoise_register_instance(struct trace_array *tr)
*/
lockdep_assert_held(&trace_types_lock);
- inst = kmalloc(sizeof(*inst), GFP_KERNEL);
+ inst = kmalloc_obj(*inst);
if (!inst)
return -ENOMEM;
@@ -250,6 +253,11 @@ struct timerlat_variables {
static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var);
/*
+ * timerlat wake-up offset for next thread with TIMERLAT_ALIGN set.
+ */
+static atomic64_t align_next;
+
+/*
* this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU
*/
static inline struct timerlat_variables *this_cpu_tmr_var(void)
@@ -267,16 +275,23 @@ static inline void tlat_var_reset(void)
/* Synchronize with the timerlat interfaces */
mutex_lock(&interface_lock);
+
/*
* So far, all the values are initialized as 0, so
* zeroing the structure is perfect.
*/
- for_each_cpu(cpu, cpu_online_mask) {
+ for_each_online_cpu(cpu) {
tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
if (tlat_var->kthread)
hrtimer_cancel(&tlat_var->timer);
memset(tlat_var, 0, sizeof(*tlat_var));
}
+ /*
+ * Reset also align_next, to be filled by a new offset by the first timerlat
+ * thread that wakes up, if TIMERLAT_ALIGN is set.
+ */
+ atomic64_set(&align_next, 0);
+
mutex_unlock(&interface_lock);
}
#else /* CONFIG_TIMERLAT_TRACER */
@@ -295,7 +310,7 @@ static inline void osn_var_reset(void)
* So far, all the values are initialized as 0, so
* zeroing the structure is perfect.
*/
- for_each_cpu(cpu, cpu_online_mask) {
+ for_each_online_cpu(cpu) {
osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
memset(osn_var, 0, sizeof(*osn_var));
}
@@ -316,33 +331,6 @@ static inline void osn_var_reset_all(void)
bool trace_osnoise_callback_enabled;
/*
- * osnoise sample structure definition. Used to store the statistics of a
- * sample run.
- */
-struct osnoise_sample {
- u64 runtime; /* runtime */
- u64 noise; /* noise */
- u64 max_sample; /* max single noise sample */
- int hw_count; /* # HW (incl. hypervisor) interference */
- int nmi_count; /* # NMIs during this sample */
- int irq_count; /* # IRQs during this sample */
- int softirq_count; /* # softirqs during this sample */
- int thread_count; /* # threads during this sample */
-};
-
-#ifdef CONFIG_TIMERLAT_TRACER
-/*
- * timerlat sample structure definition. Used to store the statistics of
- * a sample run.
- */
-struct timerlat_sample {
- u64 timer_latency; /* timer_latency */
- unsigned int seqnum; /* unique sequence */
- int context; /* timer context */
-};
-#endif
-
-/*
* Tracer data.
*/
static struct osnoise_data {
@@ -352,10 +340,11 @@ static struct osnoise_data {
u64 stop_tracing_total; /* stop trace in the final operation (report/thread) */
#ifdef CONFIG_TIMERLAT_TRACER
u64 timerlat_period; /* timerlat period */
+ u64 timerlat_align_us; /* timerlat alignment */
u64 print_stack; /* print IRQ stack if total > */
int timerlat_tracer; /* timerlat tracer */
#endif
- bool tainted; /* infor users and developers about a problem */
+ bool tainted; /* info users and developers about a problem */
} osnoise_data = {
.sample_period = DEFAULT_SAMPLE_PERIOD,
.sample_runtime = DEFAULT_SAMPLE_RUNTIME,
@@ -364,6 +353,7 @@ static struct osnoise_data {
#ifdef CONFIG_TIMERLAT_TRACER
.print_stack = 0,
.timerlat_period = DEFAULT_TIMERLAT_PERIOD,
+ .timerlat_align_us = 0,
.timerlat_tracer = 0,
#endif
};
@@ -497,7 +487,7 @@ static void print_osnoise_headers(struct seq_file *s)
* Record an osnoise_sample into the tracer buffer.
*/
static void
-__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
+__record_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
{
struct ring_buffer_event *event;
struct osnoise_entry *entry;
@@ -520,17 +510,19 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe
}
/*
- * Record an osnoise_sample on all osnoise instances.
+ * Record an osnoise_sample on all osnoise instances and fire trace event.
*/
-static void trace_osnoise_sample(struct osnoise_sample *sample)
+static void record_osnoise_sample(struct osnoise_sample *sample)
{
struct osnoise_instance *inst;
struct trace_buffer *buffer;
+ trace_osnoise_sample(sample);
+
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
buffer = inst->tr->array_buffer.buffer;
- __trace_osnoise_sample(sample, buffer);
+ __record_osnoise_sample(sample, buffer);
}
rcu_read_unlock();
}
@@ -574,7 +566,7 @@ static void print_timerlat_headers(struct seq_file *s)
#endif /* CONFIG_PREEMPT_RT */
static void
-__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
+__record_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
{
struct ring_buffer_event *event;
struct timerlat_entry *entry;
@@ -594,15 +586,17 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf
/*
* Record an timerlat_sample into the tracer buffer.
*/
-static void trace_timerlat_sample(struct timerlat_sample *sample)
+static void record_timerlat_sample(struct timerlat_sample *sample)
{
struct osnoise_instance *inst;
struct trace_buffer *buffer;
+ trace_timerlat_sample(sample);
+
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
buffer = inst->tr->array_buffer.buffer;
- __trace_timerlat_sample(sample, buffer);
+ __record_timerlat_sample(sample, buffer);
}
rcu_read_unlock();
}
@@ -660,8 +654,8 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u
entry = ring_buffer_event_data(event);
- memcpy(&entry->caller, fstack->calls, size);
entry->size = fstack->nr_entries;
+ memcpy(&entry->caller, fstack->calls, size);
trace_buffer_unlock_commit_nostack(buffer, event);
}
@@ -760,7 +754,7 @@ cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration)
/*
* get_int_safe_duration - Get the duration of a window
*
- * The irq, softirq and thread varaibles need to have its duration without
+ * The irq, softirq and thread variables need to have its duration without
* the interference from higher priority interrupts. Instead of keeping a
* variable to discount the interrupt interference from these variables, the
* starting time of these variables are pushed forward with the interrupt's
@@ -1229,6 +1223,8 @@ static void trace_sched_migrate_callback(void *data, struct task_struct *p, int
}
}
+static bool monitor_enabled;
+
static int register_migration_monitor(void)
{
int ret = 0;
@@ -1237,16 +1233,25 @@ static int register_migration_monitor(void)
* Timerlat thread migration check is only required when running timerlat in user-space.
* Thus, enable callback only if timerlat is set with no workload.
*/
- if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options))
+ if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ if (WARN_ON_ONCE(monitor_enabled))
+ return 0;
+
ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+ if (!ret)
+ monitor_enabled = true;
+ }
return ret;
}
static void unregister_migration_monitor(void)
{
- if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options))
- unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+ if (!monitor_enabled)
+ return;
+
+ unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+ monitor_enabled = false;
}
#else
static int register_migration_monitor(void)
@@ -1471,7 +1476,7 @@ static int run_osnoise(void)
stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC;
/*
- * Start timestemp
+ * Start timestamp
*/
start = time_get();
@@ -1531,27 +1536,25 @@ static int run_osnoise(void)
/*
* In some cases, notably when running on a nohz_full CPU with
- * a stopped tick PREEMPT_RCU has no way to account for QSs.
- * This will eventually cause unwarranted noise as PREEMPT_RCU
- * will force preemption as the means of ending the current
- * grace period. We avoid this problem by calling
- * rcu_momentary_eqs(), which performs a zero duration
- * EQS allowing PREEMPT_RCU to end the current grace period.
- * This call shouldn't be wrapped inside an RCU critical
- * section.
+ * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to
+ * account for QSs. This will eventually cause unwarranted
+ * noise as RCU forces preemption as the means of ending the
+ * current grace period. We avoid this by calling
+ * rcu_momentary_eqs(), which performs a zero duration EQS
+ * allowing RCU to end the current grace period. This call
+ * shouldn't be wrapped inside an RCU critical section.
*
- * Note that in non PREEMPT_RCU kernels QSs are handled through
- * cond_resched()
+ * Normally QSs for other cases are handled through cond_resched().
+ * For simplicity, however, we call rcu_momentary_eqs() for all
+ * configurations here.
*/
- if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
- if (!disable_irq)
- local_irq_disable();
+ if (!disable_irq)
+ local_irq_disable();
- rcu_momentary_eqs();
+ rcu_momentary_eqs();
- if (!disable_irq)
- local_irq_enable();
- }
+ if (!disable_irq)
+ local_irq_enable();
/*
* For the non-preemptive kernel config: let threads runs, if
@@ -1597,7 +1600,7 @@ static int run_osnoise(void)
/* Save interference stats info */
diff_osn_sample_stats(osn_var, &s);
- trace_osnoise_sample(&s);
+ record_osnoise_sample(&s);
notify_new_max_latency(max_noise);
@@ -1792,7 +1795,7 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
s.timer_latency = diff;
s.context = IRQ_CONTEXT;
- trace_timerlat_sample(&s);
+ record_timerlat_sample(&s);
if (osnoise_data.stop_tracing) {
if (time_to_us(diff) >= osnoise_data.stop_tracing) {
@@ -1843,6 +1846,26 @@ static int wait_next_period(struct timerlat_variables *tlat)
tlat->abs_period = (u64) ktime_to_ns(next_abs_period);
/*
+ * Align thread in the first cycle on each CPU to the set alignment
+ * if TIMERLAT_ALIGN is set.
+ *
+ * This is done by using an atomic64_t to store the next absolute period.
+ * The first thread that wakes up will set the atomic64_t to its
+ * absolute period, and the other threads will increment it by
+ * the alignment value.
+ */
+ if (test_bit(OSN_TIMERLAT_ALIGN, &osnoise_options) && !tlat->count
+ && atomic64_cmpxchg_relaxed(&align_next, 0, tlat->abs_period)) {
+ /*
+ * A thread has already set align_next, use it and increment it
+ * to be used by the next thread that wakes up after this one.
+ */
+ tlat->abs_period = atomic64_add_return_relaxed(
+ osnoise_data.timerlat_align_us * 1000, &align_next);
+ next_abs_period = ns_to_ktime(tlat->abs_period);
+ }
+
+ /*
* If the new abs_period is in the past, skip the activation.
*/
while (ktime_compare(now, next_abs_period) > 0) {
@@ -1890,12 +1913,11 @@ static int timerlat_main(void *data)
tlat->count = 0;
tlat->tracing_thread = false;
- hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- tlat->timer.function = timerlat_irq;
+ hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
tlat->kthread = current;
osn_var->pid = current->pid;
/*
- * Anotate the arrival time.
+ * Annotate the arrival time.
*/
tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);
@@ -1912,7 +1934,7 @@ static int timerlat_main(void *data)
s.timer_latency = diff;
s.context = THREAD_CONTEXT;
- trace_timerlat_sample(&s);
+ record_timerlat_sample(&s);
notify_new_max_latency(diff);
@@ -1992,7 +2014,7 @@ static void stop_per_cpu_kthreads(void)
}
/*
- * start_kthread - Start a workload tread
+ * start_kthread - Start a workload thread
*/
static int start_kthread(unsigned int cpu)
{
@@ -2021,7 +2043,6 @@ static int start_kthread(unsigned int cpu)
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
- stop_per_cpu_kthreads();
return -ENOMEM;
}
@@ -2083,26 +2104,21 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
{
unsigned int cpu = smp_processor_id();
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (!osnoise_has_registered_instances())
- goto out_unlock_trace;
+ return;
- mutex_lock(&interface_lock);
- cpus_read_lock();
+ guard(cpus_read_lock)();
+ guard(mutex)(&interface_lock);
if (!cpu_online(cpu))
- goto out_unlock;
+ return;
+
if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
- goto out_unlock;
+ return;
start_kthread(cpu);
-
-out_unlock:
- cpus_read_unlock();
- mutex_unlock(&interface_lock);
-out_unlock_trace:
- mutex_unlock(&trace_types_lock);
}
static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn);
@@ -2257,11 +2273,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
if (running)
stop_per_cpu_kthreads();
- mutex_lock(&interface_lock);
/*
* avoid CPU hotplug operations that might read options.
*/
cpus_read_lock();
+ mutex_lock(&interface_lock);
retval = cnt;
@@ -2277,8 +2293,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
clear_bit(option, &osnoise_options);
}
- cpus_read_unlock();
mutex_unlock(&interface_lock);
+ cpus_read_unlock();
if (running)
start_per_cpu_kthreads();
@@ -2300,31 +2316,22 @@ static ssize_t
osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count,
loff_t *ppos)
{
- char *mask_str;
+ char *mask_str __free(kfree) = NULL;
int len;
- mutex_lock(&interface_lock);
+ guard(mutex)(&interface_lock);
len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1;
mask_str = kmalloc(len, GFP_KERNEL);
- if (!mask_str) {
- count = -ENOMEM;
- goto out_unlock;
- }
+ if (!mask_str)
+ return -ENOMEM;
len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask));
- if (len >= count) {
- count = -EINVAL;
- goto out_free;
- }
+ if (len >= count)
+ return -EINVAL;
count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
-out_free:
- kfree(mask_str);
-out_unlock:
- mutex_unlock(&interface_lock);
-
return count;
}
@@ -2332,7 +2339,7 @@ out_unlock:
* osnoise_cpus_write - Write function for "cpus" entry
* @filp: The active open file structure
* @ubuf: The user buffer that contains the value to write
- * @cnt: The maximum number of bytes to write to "file"
+ * @count: The maximum number of bytes to write to "file"
* @ppos: The current position in @file
*
* This function provides a write implementation for the "cpus"
@@ -2350,13 +2357,14 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
{
cpumask_var_t osnoise_cpumask_new;
int running, err;
- char buf[256];
+ char *buf __free(kfree) = NULL;
- if (count >= 256)
- return -EINVAL;
+ if (count < 1)
+ return 0;
- if (copy_from_user(buf, ubuf, count))
- return -EFAULT;
+ buf = memdup_user_nul(ubuf, count);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL))
return -ENOMEM;
@@ -2373,16 +2381,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
if (running)
stop_per_cpu_kthreads();
- mutex_lock(&interface_lock);
/*
* osnoise_cpumask is read by CPU hotplug operations.
*/
cpus_read_lock();
+ mutex_lock(&interface_lock);
cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
- cpus_read_unlock();
mutex_unlock(&interface_lock);
+ cpus_read_unlock();
if (running)
start_per_cpu_kthreads();
@@ -2459,8 +2467,7 @@ static int timerlat_fd_open(struct inode *inode, struct file *file)
tlat = this_cpu_tmr_var();
tlat->count = 0;
- hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- tlat->timer.function = timerlat_irq;
+ hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
migrate_enable();
return 0;
@@ -2532,7 +2539,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
s.timer_latency = diff;
s.context = THREAD_URET;
- trace_timerlat_sample(&s);
+ record_timerlat_sample(&s);
notify_new_max_latency(diff);
@@ -2567,7 +2574,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
s.timer_latency = diff;
s.context = THREAD_CONTEXT;
- trace_timerlat_sample(&s);
+ record_timerlat_sample(&s);
if (osnoise_data.stop_tracing_total) {
if (time_to_us(diff) >= osnoise_data.stop_tracing_total) {
@@ -2679,6 +2686,17 @@ static struct trace_min_max_param timerlat_period = {
.min = &timerlat_min_period,
};
+/*
+ * osnoise/timerlat_align_us: align the first wakeup of all timerlat
+ * threads to a common boundary (in us). 0 means disabled.
+ */
+static struct trace_min_max_param timerlat_align_us = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.timerlat_align_us,
+ .max = NULL,
+ .min = NULL,
+};
+
static const struct file_operations timerlat_fd_fops = {
.open = timerlat_fd_open,
.read = timerlat_fd_read,
@@ -2734,7 +2752,7 @@ static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir)
* Why not using tracing instance per_cpu/ dir?
*
* Because osnoise/timerlat have a single workload, having
- * multiple files like these are wast of memory.
+ * multiple files like these are waste of memory.
*/
per_cpu = tracefs_create_dir("per_cpu", top_dir);
if (!per_cpu)
@@ -2775,6 +2793,11 @@ static int init_timerlat_tracefs(struct dentry *top_dir)
if (!tmp)
return -ENOMEM;
+ tmp = tracefs_create_file("timerlat_align_us", TRACE_MODE_WRITE, top_dir,
+ &timerlat_align_us, &trace_min_max_fops);
+ if (!tmp)
+ return -ENOMEM;
+
retval = osnoise_create_cpu_timerlat_fd(top_dir);
if (retval)
return retval;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 03d56f711ad1..a5ad76175d10 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -5,6 +5,7 @@
* Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
*
*/
+#include "trace.h"
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
@@ -12,15 +13,19 @@
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/idr.h>
+#include <linux/btf.h>
+#include <linux/bpf.h>
+#include <linux/hashtable.h>
#include "trace_output.h"
+#include "trace_btf.h"
-/* must be a power of 2 */
-#define EVENT_HASHSIZE 128
+/* 2^7 = 128 */
+#define EVENT_HASH_BITS 7
DECLARE_RWSEM(trace_event_sem);
-static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
+static DEFINE_HASHTABLE(event_hash, EVENT_HASH_BITS);
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
@@ -64,14 +69,15 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
const char *
trace_print_flags_seq(struct trace_seq *p, const char *delim,
unsigned long flags,
- const struct trace_print_flags *flag_array)
+ const struct trace_print_flags *flag_array,
+ size_t flag_array_size)
{
unsigned long mask;
const char *str;
const char *ret = trace_seq_buffer_ptr(p);
int i, first = 1;
- for (i = 0; flag_array[i].name && flags; i++) {
+ for (i = 0; i < flag_array_size && flags; i++) {
mask = flag_array[i].mask;
if ((flags & mask) != mask)
@@ -101,12 +107,13 @@ EXPORT_SYMBOL(trace_print_flags_seq);
const char *
trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
- const struct trace_print_flags *symbol_array)
+ const struct trace_print_flags *symbol_array,
+ size_t symbol_array_size)
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
- for (i = 0; symbol_array[i].name; i++) {
+ for (i = 0; i < symbol_array_size; i++) {
if (val != symbol_array[i].mask)
continue;
@@ -128,14 +135,15 @@ EXPORT_SYMBOL(trace_print_symbols_seq);
const char *
trace_print_flags_seq_u64(struct trace_seq *p, const char *delim,
unsigned long long flags,
- const struct trace_print_flags_u64 *flag_array)
+ const struct trace_print_flags_u64 *flag_array,
+ size_t flag_array_size)
{
unsigned long long mask;
const char *str;
const char *ret = trace_seq_buffer_ptr(p);
int i, first = 1;
- for (i = 0; flag_array[i].name && flags; i++) {
+ for (i = 0; i < flag_array_size && flags; i++) {
mask = flag_array[i].mask;
if ((flags & mask) != mask)
@@ -165,12 +173,13 @@ EXPORT_SYMBOL(trace_print_flags_seq_u64);
const char *
trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
- const struct trace_print_flags_u64 *symbol_array)
+ const struct trace_print_flags_u64 *symbol_array,
+ size_t symbol_array_size)
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
- for (i = 0; symbol_array[i].name; i++) {
+ for (i = 0; i < symbol_array_size; i++) {
if (val != symbol_array[i].mask)
continue;
@@ -189,13 +198,37 @@ trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
EXPORT_SYMBOL(trace_print_symbols_seq_u64);
#endif
+/**
+ * trace_print_bitmask_seq - print a bitmask to a sequence buffer
+ * @iter: The trace iterator for the current event instance
+ * @bitmask_ptr: The pointer to the bitmask data
+ * @bitmask_size: The size of the bitmask in bytes
+ *
+ * Prints a bitmask into a sequence buffer as either a hex string or a
+ * human-readable range list, depending on the instance's "bitmask-list"
+ * trace option. The bitmask is formatted into the iterator's temporary
+ * scratchpad rather than the primary sequence buffer. This avoids
+ * duplication and pointer-collision issues when the returned string is
+ * processed by a "%s" specifier in a TP_printk() macro.
+ *
+ * Returns a pointer to the formatted string within the temporary buffer.
+ */
const char *
-trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
+trace_print_bitmask_seq(struct trace_iterator *iter, void *bitmask_ptr,
unsigned int bitmask_size)
{
- const char *ret = trace_seq_buffer_ptr(p);
+ struct trace_seq *p = &iter->tmp_seq;
+ const struct trace_array *tr = iter->tr;
+ const char *ret;
+
+ trace_seq_init(p);
+ ret = trace_seq_buffer_ptr(p);
+
+ if (tr->trace_flags & TRACE_ITER(BITMASK_LIST))
+ trace_seq_bitmask_list(p, bitmask_ptr, bitmask_size * 8);
+ else
+ trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
- trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
trace_seq_putc(p, 0);
return ret;
@@ -415,7 +448,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
}
mmap_read_unlock(mm);
}
- if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+ if (ret && ((sym_flags & TRACE_ITER(SYM_ADDR)) || !file))
trace_seq_printf(s, " <" IP_FMT ">", ip);
return !trace_seq_has_overflowed(s);
}
@@ -428,9 +461,9 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
goto out;
}
- trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET);
+ trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER(SYM_OFFSET));
- if (sym_flags & TRACE_ITER_SYM_ADDR)
+ if (sym_flags & TRACE_ITER(SYM_ADDR))
trace_seq_printf(s, " <" IP_FMT ">", ip);
out:
@@ -564,7 +597,7 @@ static int
lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
{
struct trace_array *tr = iter->tr;
- unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE;
+ unsigned long verbose = tr->trace_flags & TRACE_ITER(VERBOSE);
unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
unsigned long long abs_ts = iter->ts - iter->array_buffer->time_start;
unsigned long long rel_ts = next_ts - iter->ts;
@@ -631,7 +664,7 @@ int trace_print_context(struct trace_iterator *iter)
trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid);
- if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
+ if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) {
unsigned int tgid = trace_find_tgid(entry->pid);
if (!tgid)
@@ -642,7 +675,7 @@ int trace_print_context(struct trace_iterator *iter)
trace_seq_printf(s, "[%03d] ", iter->cpu);
- if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
+ if (tr->trace_flags & TRACE_ITER(IRQ_INFO))
trace_print_lat_fmt(s, entry);
trace_print_time(s, iter, iter->ts);
@@ -656,7 +689,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
struct trace_entry *entry, *next_entry;
struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
- unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE);
+ unsigned long verbose = (tr->trace_flags & TRACE_ITER(VERBOSE));
u64 next_ts;
next_entry = trace_find_next_entry(iter, NULL, &next_ts);
@@ -684,6 +717,104 @@ int trace_print_lat_context(struct trace_iterator *iter)
return !trace_seq_has_overflowed(s);
}
+#ifdef CONFIG_FUNCTION_TRACE_ARGS
+void print_function_args(struct trace_seq *s, unsigned long *args,
+ unsigned long func)
+{
+ const struct btf_param *param;
+ const struct btf_type *t;
+ const struct btf_enum *enums;
+ const char *param_name;
+ char name[KSYM_NAME_LEN];
+ unsigned long arg;
+ struct btf *btf;
+ s32 tid, nr = 0;
+ int a, p, x, i;
+ u16 encode;
+
+ trace_seq_printf(s, "(");
+
+ if (!args)
+ goto out;
+ if (lookup_symbol_name(func, name))
+ goto out;
+
+ /* TODO: Pass module name here too */
+ t = btf_find_func_proto(name, &btf);
+ if (IS_ERR_OR_NULL(t))
+ goto out;
+
+ param = btf_get_func_param(t, &nr);
+ if (!param)
+ goto out_put;
+
+ for (a = 0, p = 0; p < nr; a++, p++) {
+ if (p)
+ trace_seq_puts(s, ", ");
+
+ /* This only prints what the arch allows (6 args by default) */
+ if (a == FTRACE_REGS_MAX_ARGS) {
+ trace_seq_puts(s, "...");
+ break;
+ }
+
+ arg = args[a];
+
+ param_name = btf_name_by_offset(btf, param[p].name_off);
+ if (param_name)
+ trace_seq_printf(s, "%s=", param_name);
+ t = btf_type_skip_modifiers(btf, param[p].type, &tid);
+
+ switch (t ? BTF_INFO_KIND(t->info) : BTF_KIND_UNKN) {
+ case BTF_KIND_UNKN:
+ trace_seq_putc(s, '?');
+ /* Still print unknown type values */
+ fallthrough;
+ case BTF_KIND_PTR:
+ trace_seq_printf(s, "0x%lx", arg);
+ break;
+ case BTF_KIND_INT:
+ encode = btf_int_encoding(t);
+ /* Print unsigned ints as hex */
+ if (encode & BTF_INT_SIGNED)
+ trace_seq_printf(s, "%ld", arg);
+ else
+ trace_seq_printf(s, "0x%lx", arg);
+ break;
+ case BTF_KIND_ENUM:
+ trace_seq_printf(s, "%ld", arg);
+ enums = btf_enum(t);
+ for (i = 0; i < btf_vlen(t); i++) {
+ if (arg == enums[i].val) {
+ trace_seq_printf(s, " [%s]",
+ btf_name_by_offset(btf,
+ enums[i].name_off));
+ break;
+ }
+ }
+ break;
+ default:
+ /* This does not handle complex arguments */
+ trace_seq_printf(s, "(%s)[0x%lx", btf_type_str(t), arg);
+ for (x = sizeof(long); x < t->size; x += sizeof(long)) {
+ trace_seq_putc(s, ':');
+ if (++a == FTRACE_REGS_MAX_ARGS) {
+ trace_seq_puts(s, "...]");
+ goto out_put;
+ }
+ trace_seq_printf(s, "0x%lx", args[a]);
+ }
+ trace_seq_putc(s, ']');
+ break;
+ }
+ }
+out_put:
+ btf_put(btf);
+out:
+ trace_seq_printf(s, ")");
+}
+#endif
+
/**
* ftrace_find_event - find a registered event
* @type: the type of event to look for
@@ -694,11 +825,8 @@ int trace_print_lat_context(struct trace_iterator *iter)
struct trace_event *ftrace_find_event(int type)
{
struct trace_event *event;
- unsigned key;
- key = type & (EVENT_HASHSIZE - 1);
-
- hlist_for_each_entry(event, &event_hash[key], node) {
+ hash_for_each_possible(event_hash, event, node, type) {
if (event->type == type)
return event;
}
@@ -753,7 +881,6 @@ void trace_event_read_unlock(void)
*/
int register_trace_event(struct trace_event *event)
{
- unsigned key;
int ret = 0;
down_write(&trace_event_sem);
@@ -786,9 +913,7 @@ int register_trace_event(struct trace_event *event)
if (event->funcs->binary == NULL)
event->funcs->binary = trace_nop_print;
- key = event->type & (EVENT_HASHSIZE - 1);
-
- hlist_add_head(&event->node, &event_hash[key]);
+ hash_add(event_hash, &event->node, event->type);
ret = event->type;
out:
@@ -803,7 +928,7 @@ EXPORT_SYMBOL_GPL(register_trace_event);
*/
int __unregister_trace_event(struct trace_event *event)
{
- hlist_del(&event->node);
+ hash_del(&event->node);
free_trace_event_type(event->type);
return 0;
}
@@ -857,10 +982,15 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
struct list_head *head)
{
struct ftrace_event_field *field;
+ struct trace_array *tr = iter->tr;
+ unsigned long long laddr;
+ unsigned long addr;
int offset;
int len;
int ret;
+ int i;
void *pos;
+ char *str;
list_for_each_entry_reverse(field, head, link) {
trace_seq_printf(&iter->seq, " %s=", field->name);
@@ -887,14 +1017,35 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
trace_seq_puts(&iter->seq, "<OVERFLOW>");
break;
}
- pos = (void *)iter->ent + offset;
- trace_seq_printf(&iter->seq, "%.*s", len, (char *)pos);
+ str = (char *)iter->ent + offset;
+ /* Check if there's any non printable strings */
+ for (i = 0; i < len; i++) {
+ if (str[i] && !(isascii(str[i]) && isprint(str[i])))
+ break;
+ }
+ if (i < len) {
+ for (i = 0; i < len; i++) {
+ if (isascii(str[i]) && isprint(str[i]))
+ trace_seq_putc(&iter->seq, str[i]);
+ else
+ trace_seq_putc(&iter->seq, '.');
+ }
+ trace_seq_puts(&iter->seq, " (");
+ for (i = 0; i < len; i++) {
+ if (i)
+ trace_seq_putc(&iter->seq, ':');
+ trace_seq_printf(&iter->seq, "%02x", str[i]);
+ }
+ trace_seq_putc(&iter->seq, ')');
+ } else {
+ trace_seq_printf(&iter->seq, "%.*s", len, str);
+ }
break;
case FILTER_PTR_STRING:
if (!iter->fmt_size)
trace_iter_expand_format(iter);
- pos = *(void **)pos;
- ret = strncpy_from_kernel_nofault(iter->fmt, pos,
+ addr = trace_adjust_address(tr, *(unsigned long *)pos);
+ ret = strncpy_from_kernel_nofault(iter->fmt, (void *)addr,
iter->fmt_size);
if (ret < 0)
trace_seq_printf(&iter->seq, "(0x%px)", pos);
@@ -903,8 +1054,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
pos, iter->fmt);
break;
case FILTER_TRACE_FN:
- pos = *(void **)pos;
- trace_seq_printf(&iter->seq, "%pS", pos);
+ addr = trace_adjust_address(tr, *(unsigned long *)pos);
+ trace_seq_printf(&iter->seq, "%pS", (void *)addr);
break;
case FILTER_CPU:
case FILTER_OTHER:
@@ -934,14 +1085,36 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
break;
}
- trace_seq_printf(&iter->seq, "0x%x (%d)",
- *(unsigned int *)pos,
- *(unsigned int *)pos);
+ addr = *(unsigned int *)pos;
+
+ /* Some fields reference offset from _stext. */
+ if (!strcmp(field->name, "caller_offs") ||
+ !strcmp(field->name, "parent_offs")) {
+ unsigned long ip;
+
+ ip = addr + (unsigned long)_stext;
+ ip = trace_adjust_address(tr, ip);
+ trace_seq_printf(&iter->seq, "%pS ", (void *)ip);
+ }
+
+ if (sizeof(long) == 4) {
+ addr = trace_adjust_address(tr, addr);
+ trace_seq_printf(&iter->seq, "%pS (%d)",
+ (void *)addr, (int)addr);
+ } else {
+ trace_seq_printf(&iter->seq, "0x%x (%d)",
+ (unsigned int)addr, (int)addr);
+ }
break;
case 8:
- trace_seq_printf(&iter->seq, "0x%llx (%lld)",
- *(unsigned long long *)pos,
- *(unsigned long long *)pos);
+ laddr = *(unsigned long long *)pos;
+ if (sizeof(long) == 8) {
+ laddr = trace_adjust_address(tr, (unsigned long)laddr);
+ trace_seq_printf(&iter->seq, "%pS (%lld)",
+ (void *)(long)laddr, laddr);
+ } else {
+ trace_seq_printf(&iter->seq, "0x%llx (%lld)", laddr, laddr);
+ }
break;
default:
trace_seq_puts(&iter->seq, "<INVALID-SIZE>");
@@ -961,11 +1134,12 @@ enum print_line_t print_event_fields(struct trace_iterator *iter,
struct trace_event_call *call;
struct list_head *head;
+ lockdep_assert_held_read(&trace_event_sem);
+
/* ftrace defined events have separate call structures */
if (event->type <= __TRACE_LAST_TYPE) {
bool found = false;
- down_read(&trace_event_sem);
list_for_each_entry(call, &ftrace_events, list) {
if (call->event.type == event->type) {
found = true;
@@ -975,7 +1149,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter,
if (call->event.type > __TRACE_LAST_TYPE)
break;
}
- up_read(&trace_event_sem);
if (!found) {
trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type);
goto out;
@@ -1005,14 +1178,17 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
}
static void print_fn_trace(struct trace_seq *s, unsigned long ip,
- unsigned long parent_ip, long delta, int flags)
+ unsigned long parent_ip, unsigned long *args,
+ struct trace_array *tr, int flags)
{
- ip += delta;
- parent_ip += delta;
+ ip = trace_adjust_address(tr, ip);
+ parent_ip = trace_adjust_address(tr, parent_ip);
seq_print_ip_sym(s, ip, flags);
+ if (args)
+ print_function_args(s, args, ip);
- if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
+ if ((flags & TRACE_ITER(PRINT_PARENT)) && parent_ip) {
trace_seq_puts(s, " <-");
seq_print_ip_sym(s, parent_ip, flags);
}
@@ -1024,10 +1200,18 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
+ unsigned long *args;
+ int args_size;
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
+ args_size = iter->ent_size - offsetof(struct ftrace_entry, args);
+ if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long))
+ args = field->args;
+ else
+ args = NULL;
+
+ print_fn_trace(s, field->ip, field->parent_ip, args, iter->tr, flags);
trace_seq_putc(s, '\n');
return trace_handle_return(s);
@@ -1248,7 +1432,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
unsigned long *p;
unsigned long *end;
- long delta = iter->tr->text_delta;
trace_assign_type(field, iter->ent);
end = (unsigned long *)((long)iter->ent + iter->ent_size);
@@ -1265,7 +1448,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n");
continue;
}
- seq_print_ip_sym(s, (*p) + delta, flags);
+ seq_print_ip_sym(s, trace_adjust_address(iter->tr, *p), flags);
trace_seq_putc(s, '\n');
}
@@ -1295,7 +1478,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
trace_seq_puts(s, "<user stack trace>\n");
- if (tr->trace_flags & TRACE_ITER_SYM_USEROBJ) {
+ if (tr->trace_flags & TRACE_ITER(SYM_USEROBJ)) {
struct task_struct *task;
/*
* we do the lookup on the thread group leader,
@@ -1345,12 +1528,12 @@ trace_hwlat_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, entry);
- trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld count:%d",
+ trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ptSp count:%d",
field->seqnum,
field->duration,
field->outer_duration,
- (long long)field->timestamp.tv_sec,
- field->timestamp.tv_nsec, field->count);
+ &field->timestamp,
+ field->count);
if (field->nmi_count) {
/*
@@ -1614,7 +1797,7 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- ip = field->ip + iter->tr->text_delta;
+ ip = trace_adjust_address(iter->tr, field->ip);
seq_print_ip_sym(s, ip, flags);
trace_seq_printf(s, ": %s", field->buf);
@@ -1700,7 +1883,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, NULL, iter->tr, flags);
trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
trace_print_time(s, iter,
iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index dca40f1f1da4..99b676733d46 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -16,6 +16,17 @@ extern int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
unsigned long sym_flags);
+static inline int seq_print_ip_sym_offset(struct trace_seq *s, unsigned long ip,
+ unsigned long sym_flags)
+{
+ return seq_print_ip_sym(s, ip, sym_flags | TRACE_ITER(SYM_OFFSET));
+}
+static inline int seq_print_ip_sym_no_offset(struct trace_seq *s, unsigned long ip,
+ unsigned long sym_flags)
+{
+ return seq_print_ip_sym(s, ip, sym_flags & ~TRACE_ITER(SYM_OFFSET));
+}
+
extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset);
extern int trace_print_context(struct trace_iterator *iter);
extern int trace_print_lat_context(struct trace_iterator *iter);
@@ -41,5 +52,14 @@ extern struct rw_semaphore trace_event_sem;
#define SEQ_PUT_HEX_FIELD(s, x) \
trace_seq_putmem_hex(s, &(x), sizeof(x))
+#ifdef CONFIG_FUNCTION_TRACE_ARGS
+void print_function_args(struct trace_seq *s, unsigned long *args,
+ unsigned long func);
+#else
+static inline void print_function_args(struct trace_seq *s, unsigned long *args,
+ unsigned long func) {
+ trace_seq_puts(s, "()");
+}
+#endif
#endif
diff --git a/kernel/trace/trace_pid.c b/kernel/trace/trace_pid.c
new file mode 100644
index 000000000000..7127c8de4174
--- /dev/null
+++ b/kernel/trace/trace_pid.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "trace.h"
+
+/**
+ * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
+ * @filtered_pids: The list of pids to check
+ * @search_pid: The PID to find in @filtered_pids
+ *
+ * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
+ */
+bool
+trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
+{
+ return trace_pid_list_is_set(filtered_pids, search_pid);
+}
+
+/**
+ * trace_ignore_this_task - should a task be ignored for tracing
+ * @filtered_pids: The list of pids to check
+ * @filtered_no_pids: The list of pids not to be traced
+ * @task: The task that should be ignored if not filtered
+ *
+ * Checks if @task should be traced or not from @filtered_pids.
+ * Returns true if @task should *NOT* be traced.
+ * Returns false if @task should be traced.
+ */
+bool
+trace_ignore_this_task(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list *filtered_no_pids,
+ struct task_struct *task)
+{
+ /*
+ * If filtered_no_pids is not empty, and the task's pid is listed
+ * in filtered_no_pids, then return true.
+ * Otherwise, if filtered_pids is empty, that means we can
+ * trace all tasks. If it has content, then only trace pids
+ * within filtered_pids.
+ */
+
+ return (filtered_pids &&
+ !trace_find_filtered_pid(filtered_pids, task->pid)) ||
+ (filtered_no_pids &&
+ trace_find_filtered_pid(filtered_no_pids, task->pid));
+}
+
+/**
+ * trace_filter_add_remove_task - Add or remove a task from a pid_list
+ * @pid_list: The list to modify
+ * @self: The current task for fork or NULL for exit
+ * @task: The task to add or remove
+ *
+ * If adding a task, if @self is defined, the task is only added if @self
+ * is also included in @pid_list. This happens on fork and tasks should
+ * only be added when the parent is listed. If @self is NULL, then the
+ * @task pid will be removed from the list, which would happen on exit
+ * of a task.
+ */
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+ struct task_struct *self,
+ struct task_struct *task)
+{
+ if (!pid_list)
+ return;
+
+ /* For forks, we only add if the forking task is listed */
+ if (self) {
+ if (!trace_find_filtered_pid(pid_list, self->pid))
+ return;
+ }
+
+ /* "self" is set for forks, and NULL for exits */
+ if (self)
+ trace_pid_list_set(pid_list, task->pid);
+ else
+ trace_pid_list_clear(pid_list, task->pid);
+}
+
+/**
+ * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
+ * @pid_list: The pid list to show
+ * @v: The last pid that was shown (+1 the actual pid to let zero be displayed)
+ * @pos: The position of the file
+ *
+ * This is used by the seq_file "next" operation to iterate the pids
+ * listed in a trace_pid_list structure.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
+{
+ long pid = (unsigned long)v;
+ unsigned int next;
+
+ (*pos)++;
+
+ /* pid already is +1 of the actual previous bit */
+ if (trace_pid_list_next(pid_list, pid, &next) < 0)
+ return NULL;
+
+ pid = next;
+
+ /* Return pid + 1 to allow zero to be represented */
+ return (void *)(pid + 1);
+}
+
+/**
+ * trace_pid_start - Used for seq_file to start reading pid lists
+ * @pid_list: The pid list to show
+ * @pos: The position of the file
+ *
+ * This is used by seq_file "start" operation to start the iteration
+ * of listing pids.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
+{
+ unsigned long pid;
+ unsigned int first;
+ loff_t l = 0;
+
+ if (trace_pid_list_first(pid_list, &first) < 0)
+ return NULL;
+
+ pid = first;
+
+ /* Return pid + 1 so that zero can be the exit value */
+ for (pid++; pid && l < *pos;
+ pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
+ ;
+ return (void *)pid;
+}
+
+/**
+ * trace_pid_show - show the current pid in seq_file processing
+ * @m: The seq_file structure to write into
+ * @v: A void pointer of the pid (+1) value to display
+ *
+ * Can be directly used by seq_file operations to display the current
+ * pid value.
+ */
+int trace_pid_show(struct seq_file *m, void *v)
+{
+ unsigned long pid = (unsigned long)v - 1;
+
+ seq_printf(m, "%lu\n", pid);
+ return 0;
+}
+
+/* 128 should be much more than enough */
+#define PID_BUF_SIZE 127
+
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list **new_pid_list,
+ const char __user *ubuf, size_t cnt)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_parser parser;
+ unsigned long val;
+ int nr_pids = 0;
+ ssize_t read = 0;
+ ssize_t ret;
+ loff_t pos;
+ pid_t pid;
+
+ if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
+ return -ENOMEM;
+
+ /*
+ * Always recreate a new array. The write is an all or nothing
+ * operation. Always create a new array when adding new pids by
+ * the user. If the operation fails, then the current list is
+ * not modified.
+ */
+ pid_list = trace_pid_list_alloc();
+ if (!pid_list) {
+ trace_parser_put(&parser);
+ return -ENOMEM;
+ }
+
+ if (filtered_pids) {
+ /* copy the current bits to the new max */
+ ret = trace_pid_list_first(filtered_pids, &pid);
+ while (!ret) {
+ ret = trace_pid_list_set(pid_list, pid);
+ if (ret < 0)
+ goto out;
+
+ ret = trace_pid_list_next(filtered_pids, pid + 1, &pid);
+ nr_pids++;
+ }
+ }
+
+ ret = 0;
+ while (cnt > 0) {
+
+ pos = 0;
+
+ ret = trace_get_user(&parser, ubuf, cnt, &pos);
+ if (ret < 0)
+ break;
+
+ read += ret;
+ ubuf += ret;
+ cnt -= ret;
+
+ if (!trace_parser_loaded(&parser))
+ break;
+
+ ret = -EINVAL;
+ if (kstrtoul(parser.buffer, 0, &val))
+ break;
+
+ pid = (pid_t)val;
+
+ if (trace_pid_list_set(pid_list, pid) < 0) {
+ ret = -1;
+ break;
+ }
+ nr_pids++;
+
+ trace_parser_clear(&parser);
+ ret = 0;
+ }
+ out:
+ trace_parser_put(&parser);
+
+ if (ret < 0) {
+ trace_pid_list_free(pid_list);
+ return ret;
+ }
+
+ if (!nr_pids) {
+ /* Cleared the list of pids */
+ trace_pid_list_free(pid_list);
+ pid_list = NULL;
+ }
+
+ *new_pid_list = pid_list;
+
+ return read;
+}
+
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 29f6e95439b6..3ea17af60169 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -69,7 +69,7 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
}
fmt = NULL;
- tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
+ tb_fmt = kmalloc_obj(*tb_fmt);
if (tb_fmt) {
fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
if (fmt) {
@@ -197,6 +197,7 @@ struct notifier_block module_trace_bprintk_format_nb = {
.notifier_call = module_trace_bprintk_format_notify,
};
+__printf(2, 3)
int __trace_bprintk(unsigned long ip, const char *fmt, ...)
{
int ret;
@@ -376,6 +377,436 @@ static const struct file_operations ftrace_formats_fops = {
.release = seq_release,
};
+static __always_inline bool printk_binsafe(struct trace_array *tr)
+{
+ /*
+ * The binary format of traceprintk can cause a crash if used
+ * by a buffer from another boot. Force the use of the
+ * non binary version of trace_printk if the trace_printk
+ * buffer is a boot mapped ring buffer.
+ */
+ return !(tr->flags & TRACE_ARRAY_FL_BOOT);
+}
+
+int __trace_array_puts(struct trace_array *tr, unsigned long ip,
+ const char *str, int size)
+{
+ struct ring_buffer_event *event;
+ struct trace_buffer *buffer;
+ struct print_entry *entry;
+ unsigned int trace_ctx;
+ int alloc;
+
+ if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
+ return 0;
+
+ if (unlikely(tracing_selftest_running &&
+ (tr->flags & TRACE_ARRAY_FL_GLOBAL)))
+ return 0;
+
+ if (unlikely(tracing_disabled))
+ return 0;
+
+ alloc = sizeof(*entry) + size + 2; /* possible \n added */
+
+ trace_ctx = tracing_gen_ctx();
+ buffer = tr->array_buffer.buffer;
+ guard(ring_buffer_nest)(buffer);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+ trace_ctx);
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+
+ memcpy(&entry->buf, str, size);
+
+ /* Add a newline if necessary */
+ if (entry->buf[size - 1] != '\n') {
+ entry->buf[size] = '\n';
+ entry->buf[size + 1] = '\0';
+ } else
+ entry->buf[size] = '\0';
+
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
+ return size;
+}
+EXPORT_SYMBOL_GPL(__trace_array_puts);
+
+/**
+ * __trace_puts - write a constant string into the trace buffer.
+ * @ip: The address of the caller
+ * @str: The constant string to write
+ */
+int __trace_puts(unsigned long ip, const char *str)
+{
+ return __trace_array_puts(printk_trace, ip, str, strlen(str));
+}
+EXPORT_SYMBOL_GPL(__trace_puts);
+
+/**
+ * __trace_bputs - write the pointer to a constant string into trace buffer
+ * @ip: The address of the caller
+ * @str: The constant string to write to the buffer to
+ */
+int __trace_bputs(unsigned long ip, const char *str)
+{
+ struct trace_array *tr = READ_ONCE(printk_trace);
+ struct ring_buffer_event *event;
+ struct trace_buffer *buffer;
+ struct bputs_entry *entry;
+ unsigned int trace_ctx;
+ int size = sizeof(struct bputs_entry);
+
+ if (!printk_binsafe(tr))
+ return __trace_puts(ip, str);
+
+ if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
+ return 0;
+
+ if (unlikely(tracing_selftest_running || tracing_disabled))
+ return 0;
+
+ trace_ctx = tracing_gen_ctx();
+ buffer = tr->array_buffer.buffer;
+
+ guard(ring_buffer_nest)(buffer);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
+ trace_ctx);
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->str = str;
+
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(__trace_bputs);
+
+/* created for use with alloc_percpu */
+struct trace_buffer_struct {
+ int nesting;
+ char buffer[4][TRACE_BUF_SIZE];
+};
+
+static struct trace_buffer_struct __percpu *trace_percpu_buffer;
+
+/*
+ * This allows for lockless recording. If we're nested too deeply, then
+ * this returns NULL.
+ */
+static char *get_trace_buf(void)
+{
+ struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
+
+ if (!trace_percpu_buffer || buffer->nesting >= 4)
+ return NULL;
+
+ buffer->nesting++;
+
+ /* Interrupts must see nesting incremented before we use the buffer */
+ barrier();
+ return &buffer->buffer[buffer->nesting - 1][0];
+}
+
+static void put_trace_buf(void)
+{
+ /* Don't let the decrement of nesting leak before this */
+ barrier();
+ this_cpu_dec(trace_percpu_buffer->nesting);
+}
+
+static int alloc_percpu_trace_buffer(void)
+{
+ struct trace_buffer_struct __percpu *buffers;
+
+ if (trace_percpu_buffer)
+ return 0;
+
+ buffers = alloc_percpu(struct trace_buffer_struct);
+ if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
+ return -ENOMEM;
+
+ trace_percpu_buffer = buffers;
+ return 0;
+}
+
+static int buffers_allocated;
+
+void trace_printk_init_buffers(void)
+{
+ if (buffers_allocated)
+ return;
+
+ if (alloc_percpu_trace_buffer())
+ return;
+
+ /* trace_printk() is for debug use only. Don't use it in production. */
+
+ pr_warn("\n");
+ pr_warn("**********************************************************\n");
+ pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warn("** **\n");
+ pr_warn("** trace_printk() being used. Allocating extra memory. **\n");
+ pr_warn("** **\n");
+ pr_warn("** This means that this is a DEBUG kernel and it is **\n");
+ pr_warn("** unsafe for production use. **\n");
+ pr_warn("** **\n");
+ pr_warn("** If you see this message and you are not debugging **\n");
+ pr_warn("** the kernel, report this immediately to your vendor! **\n");
+ pr_warn("** **\n");
+ pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warn("**********************************************************\n");
+
+ /* Expand the buffers to set size */
+ if (tracing_update_buffers(NULL) < 0)
+ pr_err("Failed to expand tracing buffers for trace_printk() calls\n");
+ else
+ buffers_allocated = 1;
+
+ /*
+ * trace_printk_init_buffers() can be called by modules.
+ * If that happens, then we need to start cmdline recording
+ * directly here.
+ */
+ if (system_state == SYSTEM_RUNNING)
+ tracing_start_cmdline_record();
+}
+EXPORT_SYMBOL_GPL(trace_printk_init_buffers);
+
+void trace_printk_start_comm(void)
+{
+ /* Start tracing comms if trace printk is set */
+ if (!buffers_allocated)
+ return;
+ tracing_start_cmdline_record();
+}
+
+void trace_printk_start_stop_comm(int enabled)
+{
+ if (!buffers_allocated)
+ return;
+
+ if (enabled)
+ tracing_start_cmdline_record();
+ else
+ tracing_stop_cmdline_record();
+}
+
+/**
+ * trace_vbprintk - write binary msg to tracing buffer
+ * @ip: The address of the caller
+ * @fmt: The string format to write to the buffer
+ * @args: Arguments for @fmt
+ */
+int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
+{
+ struct ring_buffer_event *event;
+ struct trace_buffer *buffer;
+ struct trace_array *tr = READ_ONCE(printk_trace);
+ struct bprint_entry *entry;
+ unsigned int trace_ctx;
+ char *tbuffer;
+ int len = 0, size;
+
+ if (!printk_binsafe(tr))
+ return trace_vprintk(ip, fmt, args);
+
+ if (unlikely(tracing_selftest_running || tracing_disabled))
+ return 0;
+
+ /* Don't pollute graph traces with trace_vprintk internals */
+ pause_graph_tracing();
+
+ trace_ctx = tracing_gen_ctx();
+ guard(preempt_notrace)();
+
+ tbuffer = get_trace_buf();
+ if (!tbuffer) {
+ len = 0;
+ goto out_nobuffer;
+ }
+
+ len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
+
+ if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
+ goto out_put;
+
+ size = sizeof(*entry) + sizeof(u32) * len;
+ buffer = tr->array_buffer.buffer;
+ scoped_guard(ring_buffer_nest, buffer) {
+ event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
+ trace_ctx);
+ if (!event)
+ goto out_put;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->fmt = fmt;
+
+ memcpy(entry->buf, tbuffer, sizeof(u32) * len);
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
+ }
+out_put:
+ put_trace_buf();
+
+out_nobuffer:
+ unpause_graph_tracing();
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_vbprintk);
+
+static __printf(3, 0)
+int __trace_array_vprintk(struct trace_buffer *buffer,
+ unsigned long ip, const char *fmt, va_list args)
+{
+ struct ring_buffer_event *event;
+ int len = 0, size;
+ struct print_entry *entry;
+ unsigned int trace_ctx;
+ char *tbuffer;
+
+ if (unlikely(tracing_disabled))
+ return 0;
+
+ /* Don't pollute graph traces with trace_vprintk internals */
+ pause_graph_tracing();
+
+ trace_ctx = tracing_gen_ctx();
+ guard(preempt_notrace)();
+
+
+ tbuffer = get_trace_buf();
+ if (!tbuffer) {
+ len = 0;
+ goto out_nobuffer;
+ }
+
+ len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
+
+ size = sizeof(*entry) + len + 1;
+ scoped_guard(ring_buffer_nest, buffer) {
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ trace_ctx);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+
+ memcpy(&entry->buf, tbuffer, len + 1);
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
+ }
+out:
+ put_trace_buf();
+
+out_nobuffer:
+ unpause_graph_tracing();
+
+ return len;
+}
+
+int trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args)
+{
+ if (tracing_selftest_running && (tr->flags & TRACE_ARRAY_FL_GLOBAL))
+ return 0;
+
+ return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
+}
+
+/**
+ * trace_array_printk - Print a message to a specific instance
+ * @tr: The instance trace_array descriptor
+ * @ip: The instruction pointer that this is called from.
+ * @fmt: The format to print (printf format)
+ *
+ * If a subsystem sets up its own instance, they have the right to
+ * printk strings into their tracing instance buffer using this
+ * function. Note, this function will not write into the top level
+ * buffer (use trace_printk() for that), as writing into the top level
+ * buffer should only have events that can be individually disabled.
+ * trace_printk() is only used for debugging a kernel, and should not
+ * be ever incorporated in normal use.
+ *
+ * trace_array_printk() can be used, as it will not add noise to the
+ * top level tracing buffer.
+ *
+ * Note, trace_array_init_printk() must be called on @tr before this
+ * can be used.
+ */
+int trace_array_printk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!tr)
+ return -ENOENT;
+
+ /* This is only allowed for created instances */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return 0;
+
+ if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = trace_array_vprintk(tr, ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_array_printk);
+
+/**
+ * trace_array_init_printk - Initialize buffers for trace_array_printk()
+ * @tr: The trace array to initialize the buffers for
+ *
+ * As trace_array_printk() only writes into instances, they are OK to
+ * have in the kernel (unlike trace_printk()). This needs to be called
+ * before trace_array_printk() can be used on a trace_array.
+ */
+int trace_array_init_printk(struct trace_array *tr)
+{
+ if (!tr)
+ return -ENOENT;
+
+ /* This is only allowed for created instances */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return -EINVAL;
+
+ return alloc_percpu_trace_buffer();
+}
+EXPORT_SYMBOL_GPL(trace_array_init_printk);
+
+int trace_array_printk_buf(struct trace_buffer *buffer,
+ unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!(printk_trace->trace_flags & TRACE_ITER(PRINTK)))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = __trace_array_vprintk(buffer, ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
+{
+ return trace_array_vprintk(printk_trace, ip, fmt, args);
+}
+EXPORT_SYMBOL_GPL(trace_vprintk);
+
static __init int init_trace_printk_function_export(void)
{
int ret;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 16a5e368e7b7..44c22d4e7881 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -13,8 +13,8 @@
#include <linux/bpf.h>
#include <linux/fs.h>
-#include "trace_btf.h"
+#include "trace_btf.h"
#include "trace_probe.h"
#undef C
@@ -154,22 +154,30 @@ fail:
}
static struct trace_probe_log trace_probe_log;
+extern struct mutex dyn_event_ops_mutex;
-void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
+const char *trace_probe_log_init(const char *subsystem, int argc, const char **argv)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
trace_probe_log.subsystem = subsystem;
trace_probe_log.argc = argc;
trace_probe_log.argv = argv;
trace_probe_log.index = 0;
+ return subsystem;
}
void trace_probe_log_clear(void)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
memset(&trace_probe_log, 0, sizeof(trace_probe_log));
}
void trace_probe_log_set_index(int index)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
trace_probe_log.index = index;
}
@@ -178,6 +186,8 @@ void __trace_probe_log_err(int offset, int err_type)
char *command, *p;
int i, len = 0, pos = 0;
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
if (!trace_probe_log.argv)
return;
@@ -205,7 +215,7 @@ void __trace_probe_log_err(int offset, int err_type)
p = command;
for (i = 0; i < trace_probe_log.argc; i++) {
len = strlen(trace_probe_log.argv[i]);
- strcpy(p, trace_probe_log.argv[i]);
+ memcpy(p, trace_probe_log.argv[i], len);
p[len] = ' ';
p += len + 1;
}
@@ -238,7 +248,25 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset)
return 0;
}
-/* @buf must has MAX_EVENT_NAME_LEN size */
+/**
+ * traceprobe_parse_event_name() - Parse a string into group and event names
+ * @pevent: A pointer to the string to be parsed.
+ * @pgroup: A pointer to the group name.
+ * @buf: A buffer to store the parsed group name.
+ * @offset: The offset of the string in the original user command, for logging.
+ *
+ * This parses a string with the format `[GROUP/][EVENT]` or `[GROUP.][EVENT]`
+ * (either GROUP or EVENT or both must be specified).
+ * Since the parsed group name is stored in @buf, the caller must ensure @buf
+ * is at least MAX_EVENT_NAME_LEN bytes.
+ *
+ * Return: 0 on success, or -EINVAL on failure.
+ *
+ * If success, *@pevent is updated to point to the event name part of the
+ * original string, or NULL if there is no event name.
+ * Also, *@pgroup is updated to point to the parsed group which is stored
+ * in @buf, or NULL if there is no group name.
+ */
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
char *buf, int offset)
{
@@ -489,7 +517,7 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx)
}
}
-/* Return 1 if the field separater is arrow operator ('->') */
+/* Return 1 if the field separator is arrow operator ('->') */
static int split_next_field(char *varname, char **next_field,
struct traceprobe_parse_context *ctx)
{
@@ -648,7 +676,7 @@ static int parse_btf_arg(char *varname,
ret = query_btf_context(ctx);
if (ret < 0 || ctx->nr_params == 0) {
trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
- return PTR_ERR(params);
+ return -ENOENT;
}
}
params = ctx->params;
@@ -770,19 +798,51 @@ static int check_prepare_btf_string_fetch(char *typename,
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+static void store_entry_arg_at(struct fetch_insn *code, int argnum, int offset)
+{
+ code[0].op = FETCH_OP_ARG;
+ code[0].param = argnum;
+ code[1].op = FETCH_OP_ST_EDATA;
+ code[1].offset = offset;
+}
+
+static int get_entry_arg_max_offset(struct probe_entry_arg *earg)
+{
+ int i, max_offset = 0;
+
+ /*
+ * earg->code[] array has an operation sequence which is run in
+ * the entry handler.
+ * The sequence stopped by FETCH_OP_END and each data stored in
+ * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA
+ * stores the data at the data buffer + its offset, and all data are
+ * "unsigned long" size. The offset must be increased when a data is
+ * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the
+ * code array.
+ */
+ for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i++) {
+ if (earg->code[i].op == FETCH_OP_ST_EDATA)
+ if (earg->code[i].offset > max_offset)
+ max_offset = earg->code[i].offset;
+ }
+ return max_offset;
+}
+
+/*
+ * Add the entry code to store the 'argnum'th parameter and return the offset
+ * in the entry data buffer where the data will be stored.
+ */
static int __store_entry_arg(struct trace_probe *tp, int argnum)
{
struct probe_entry_arg *earg = tp->entry_arg;
- bool match = false;
- int i, offset;
+ int i, offset, last_offset = 0;
if (!earg) {
- earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL);
+ earg = kzalloc_obj(*tp->entry_arg);
if (!earg)
return -ENOMEM;
earg->size = 2 * tp->nr_args + 1;
- earg->code = kcalloc(earg->size, sizeof(struct fetch_insn),
- GFP_KERNEL);
+ earg->code = kzalloc_objs(struct fetch_insn, earg->size);
if (!earg->code) {
kfree(earg);
return -ENOMEM;
@@ -791,54 +851,59 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum)
for (i = 0; i < earg->size; i++)
earg->code[i].op = FETCH_OP_END;
tp->entry_arg = earg;
+ store_entry_arg_at(earg->code, argnum, 0);
+ return 0;
}
- offset = 0;
- for (i = 0; i < earg->size - 1; i++) {
- switch (earg->code[i].op) {
- case FETCH_OP_END:
- earg->code[i].op = FETCH_OP_ARG;
- earg->code[i].param = argnum;
- earg->code[i + 1].op = FETCH_OP_ST_EDATA;
- earg->code[i + 1].offset = offset;
- return offset;
- case FETCH_OP_ARG:
- match = (earg->code[i].param == argnum);
- break;
- case FETCH_OP_ST_EDATA:
- offset = earg->code[i].offset;
- if (match)
- return offset;
- offset += sizeof(unsigned long);
- break;
- default:
- break;
- }
+ /*
+ * NOTE: if anyone change the following rule, please rewrite this.
+ * The entry code array is filled with the pair of
+ *
+ * [FETCH_OP_ARG(argnum)]
+ * [FETCH_OP_ST_EDATA(offset of entry data buffer)]
+ *
+ * and the rest of entries are filled with [FETCH_OP_END].
+ * The offset should be incremented, thus the last pair should
+ * have the largest offset.
+ */
+
+ /* Search the offset for the sprcified argnum. */
+ for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i += 2) {
+ if (WARN_ON_ONCE(earg->code[i].op != FETCH_OP_ARG))
+ return -EINVAL;
+
+ if (earg->code[i].param != argnum)
+ continue;
+
+ if (WARN_ON_ONCE(earg->code[i + 1].op != FETCH_OP_ST_EDATA))
+ return -EINVAL;
+
+ return earg->code[i + 1].offset;
}
- return -ENOSPC;
+ /* Not found, append new entry if possible. */
+ if (i >= earg->size - 1)
+ return -ENOSPC;
+
+ /* The last entry must have the largest offset. */
+ if (i != 0) {
+ if (WARN_ON_ONCE(earg->code[i - 1].op != FETCH_OP_ST_EDATA))
+ return -EINVAL;
+ last_offset = earg->code[i - 1].offset;
+ }
+
+ offset = last_offset + sizeof(unsigned long);
+ store_entry_arg_at(&earg->code[i], argnum, offset);
+ return offset;
}
int traceprobe_get_entry_data_size(struct trace_probe *tp)
{
struct probe_entry_arg *earg = tp->entry_arg;
- int i, size = 0;
if (!earg)
return 0;
- for (i = 0; i < earg->size; i++) {
- switch (earg->code[i].op) {
- case FETCH_OP_END:
- goto out;
- case FETCH_OP_ST_EDATA:
- size = earg->code[i].offset + sizeof(unsigned long);
- break;
- default:
- break;
- }
- }
-out:
- return size;
+ return get_entry_arg_max_offset(earg) + sizeof(unsigned long);
}
void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs)
@@ -897,8 +962,6 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
code->op = FETCH_OP_COMM;
return 0;
}
- /* backward compatibility */
- ctx->offset = 0;
goto inval;
}
@@ -1003,7 +1066,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs)
{
size_t len = strlen(str);
- if (str[len - 1] != '"') {
+ if (!len || str[len - 1] != '"') {
trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE);
return -EINVAL;
}
@@ -1409,7 +1472,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
struct traceprobe_parse_context *ctx)
{
struct fetch_insn *code, *tmp = NULL;
- char *type, *arg;
+ char *type, *arg __free(kfree) = NULL;
int ret, len;
len = strlen(argv);
@@ -1426,22 +1489,16 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
return -ENOMEM;
parg->comm = kstrdup(arg, GFP_KERNEL);
- if (!parg->comm) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!parg->comm)
+ return -ENOMEM;
type = parse_probe_arg_type(arg, parg, ctx);
- if (IS_ERR(type)) {
- ret = PTR_ERR(type);
- goto out;
- }
+ if (IS_ERR(type))
+ return PTR_ERR(type);
- code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
- if (!code) {
- ret = -ENOMEM;
- goto out;
- }
+ code = tmp = kzalloc_objs(*code, FETCH_INSN_MAX);
+ if (!code)
+ return -ENOMEM;
code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
ctx->last_type = NULL;
@@ -1464,6 +1521,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
parg->offset = *size;
*size += parg->type->size * (parg->count ?: 1);
+ if (*size > MAX_PROBE_EVENT_SIZE) {
+ ret = -E2BIG;
+ trace_probe_log_err(ctx->offset, EVENT_TOO_BIG);
+ goto fail;
+ }
+
if (parg->count) {
len = strlen(parg->type->fmttype) + 6;
parg->fmt = kmalloc(len, GFP_KERNEL);
@@ -1483,7 +1546,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
if (code->op == FETCH_OP_END)
break;
/* Shrink down the code buffer */
- parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL);
+ parg->code = kzalloc_objs(*code, code - tmp + 1);
if (!parg->code)
ret = -ENOMEM;
else
@@ -1497,8 +1560,6 @@ fail:
kfree(code->data);
}
kfree(tmp);
-out:
- kfree(arg);
return ret;
}
@@ -1668,7 +1729,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
{
const struct btf_param *params = NULL;
int i, j, n, used, ret, args_idx = -1;
- const char **new_argv = NULL;
+ const char **new_argv __free(kfree) = NULL;
ret = argv_has_var_arg(argc, argv, &args_idx, ctx);
if (ret < 0)
@@ -1707,7 +1768,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
ret = sprint_nth_btf_arg(n, "", buf + used,
bufsize - used, ctx);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
new_argv[j++] = buf + used;
used += ret + 1;
@@ -1721,25 +1782,20 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
n = simple_strtoul(argv[i] + 4, &type, 10);
if (type && !(*type == ':' || *type == '\0')) {
trace_probe_log_err(0, BAD_VAR);
- ret = -ENOENT;
- goto error;
+ return ERR_PTR(-ENOENT);
}
/* Note: $argN starts from $arg1 */
ret = sprint_nth_btf_arg(n - 1, type, buf + used,
bufsize - used, ctx);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
new_argv[j++] = buf + used;
used += ret + 1;
} else
new_argv[j++] = argv[i];
}
- return new_argv;
-
-error:
- kfree(new_argv);
- return ERR_PTR(ret);
+ return_ptr(new_argv);
}
/* @buf: *buf must be equal to NULL. Caller must to free *buf */
@@ -1747,14 +1803,14 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf)
{
int i, used, ret;
const int bufsize = MAX_DENTRY_ARGS_LEN;
- char *tmpbuf = NULL;
+ char *tmpbuf __free(kfree) = NULL;
if (*buf)
return -EINVAL;
used = 0;
for (i = 0; i < argc; i++) {
- char *tmp;
+ char *tmp __free(kfree) = NULL;
char *equal;
size_t arg_len;
@@ -1769,7 +1825,7 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf)
tmp = kstrdup(argv[i], GFP_KERNEL);
if (!tmp)
- goto nomem;
+ return -ENOMEM;
equal = strchr(tmp, '=');
if (equal)
@@ -1790,18 +1846,14 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf)
offsetof(struct file, f_path.dentry),
equal ? equal + 1 : tmp);
- kfree(tmp);
if (ret >= bufsize - used)
- goto nomem;
+ return -ENOMEM;
argv[i] = tmpbuf + used;
used += ret + 1;
}
- *buf = tmpbuf;
+ *buf = no_free_ptr(tmpbuf);
return 0;
-nomem:
- kfree(tmpbuf);
- return -ENOMEM;
}
void traceprobe_finish_parse(struct traceprobe_parse_context *ctx)
@@ -2100,7 +2152,7 @@ int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file)
{
struct event_file_link *link;
- link = kmalloc(sizeof(*link), GFP_KERNEL);
+ link = kmalloc_obj(*link);
if (!link)
return -ENOMEM;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 5803e6a41570..262d8707a3df 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -10,20 +10,22 @@
* Author: Srikar Dronamraju
*/
+#include <linux/bitops.h>
+#include <linux/btf.h>
+#include <linux/cleanup.h>
+#include <linux/kprobes.h>
+#include <linux/limits.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/smp.h>
-#include <linux/tracefs.h>
-#include <linux/types.h>
#include <linux/string.h>
-#include <linux/ptrace.h>
-#include <linux/perf_event.h>
-#include <linux/kprobes.h>
#include <linux/stringify.h>
-#include <linux/limits.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
#include <linux/uaccess.h>
-#include <linux/bitops.h>
-#include <linux/btf.h>
+
#include <asm/bitsperlong.h>
#include "trace.h"
@@ -36,7 +38,7 @@
#define MAX_BTF_ARGS_LEN 128
#define MAX_DENTRY_ARGS_LEN 256
#define MAX_STRING_SIZE PATH_MAX
-#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN)
+#define MAX_PROBE_EVENT_SIZE 3072
/* Reserved field names */
#define FIELD_STRING_IP "__probe_ip"
@@ -270,16 +272,21 @@ struct event_file_link {
struct list_head list;
};
+static inline unsigned int trace_probe_load_flag(struct trace_probe *tp)
+{
+ return smp_load_acquire(&tp->event->flags);
+}
+
static inline bool trace_probe_test_flag(struct trace_probe *tp,
unsigned int flag)
{
- return !!(tp->event->flags & flag);
+ return !!(trace_probe_load_flag(tp) & flag);
}
static inline void trace_probe_set_flag(struct trace_probe *tp,
unsigned int flag)
{
- tp->event->flags |= flag;
+ smp_store_release(&tp->event->flags, tp->event->flags | flag);
}
static inline void trace_probe_clear_flag(struct trace_probe *tp,
@@ -439,6 +446,14 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg);
* this MUST be called for clean up the context and return a resource.
*/
void traceprobe_finish_parse(struct traceprobe_parse_context *ctx);
+static inline void traceprobe_free_parse_ctx(struct traceprobe_parse_context *ctx)
+{
+ traceprobe_finish_parse(ctx);
+ kfree(ctx);
+}
+
+DEFINE_FREE(traceprobe_parse_context, struct traceprobe_parse_context *,
+ if (_T) traceprobe_free_parse_ctx(_T))
extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
@@ -481,6 +496,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \
C(BAD_RETPROBE, "Retprobe address must be an function entry"), \
C(NO_TRACEPOINT, "Tracepoint is not found"), \
+ C(BAD_TP_NAME, "Invalid character in tracepoint name"),\
C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \
C(NO_GROUP_NAME, "Group name is not specified"), \
C(GROUP_TOO_LONG, "Group name is too long"), \
@@ -544,7 +560,10 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NO_BTF_FIELD, "This field is not found."), \
C(BAD_BTF_TID, "Failed to get BTF type info."),\
C(BAD_TYPE4STR, "This type does not fit for string."),\
- C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),
+ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\
+ C(TOO_MANY_ARGS, "Too many arguments are specified"), \
+ C(TOO_MANY_EARGS, "Too many entry arguments specified"), \
+ C(EVENT_TOO_BIG, "Event too big (too many fields?)"),
#undef C
#define C(a, b) TP_ERR_##a
@@ -561,11 +580,13 @@ struct trace_probe_log {
int index;
};
-void trace_probe_log_init(const char *subsystem, int argc, const char **argv);
+const char *trace_probe_log_init(const char *subsystem, int argc, const char **argv);
void trace_probe_log_set_index(int index);
void trace_probe_log_clear(void);
void __trace_probe_log_err(int offset, int err);
+DEFINE_FREE(trace_probe_log_clear, const char *, if (_T) trace_probe_log_clear())
+
#define trace_probe_log_err(offs, err) \
__trace_probe_log_err(offs, TP_ERR_##err)
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 2caf0d2afb32..f39b37fcdb3b 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -232,7 +232,7 @@ array:
/* Sum up total data length for dynamic arrays (strings) */
static nokprobe_inline int
-__get_data_size(struct trace_probe *tp, struct pt_regs *regs, void *edata)
+__get_data_size(struct trace_probe *tp, void *regs, void *edata)
{
struct probe_arg *arg;
int i, len, ret = 0;
diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c
index a520b11afb0d..784fe1fbb866 100644
--- a/kernel/trace/trace_recursion_record.c
+++ b/kernel/trace/trace_recursion_record.c
@@ -129,7 +129,7 @@ static void *recursed_function_seq_start(struct seq_file *m, loff_t *pos)
ret = &recursed_functions[*pos];
}
- tseq = kzalloc(sizeof(*tseq), GFP_KERNEL);
+ tseq = kzalloc_obj(*tseq);
if (!tseq)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
new file mode 100644
index 000000000000..d6c3f94d67cd
--- /dev/null
+++ b/kernel/trace/trace_remote.c
@@ -0,0 +1,1384 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 - Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/kstrtox.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+#include <linux/tracefs.h>
+#include <linux/trace_remote.h>
+#include <linux/trace_seq.h>
+#include <linux/types.h>
+
+#include "trace.h"
+
+#define TRACEFS_DIR "remotes"
+#define TRACEFS_MODE_WRITE 0640
+#define TRACEFS_MODE_READ 0440
+
+enum tri_type {
+ TRI_CONSUMING,
+ TRI_NONCONSUMING,
+};
+
+struct trace_remote_iterator {
+ struct trace_remote *remote;
+ struct trace_seq seq;
+ struct delayed_work poll_work;
+ unsigned long lost_events;
+ u64 ts;
+ struct ring_buffer_iter *rb_iter;
+ struct ring_buffer_iter **rb_iters;
+ struct remote_event_hdr *evt;
+ int cpu;
+ int evt_cpu;
+ loff_t pos;
+ enum tri_type type;
+};
+
+struct trace_remote {
+ struct trace_remote_callbacks *cbs;
+ void *priv;
+ struct trace_buffer *trace_buffer;
+ struct trace_buffer_desc *trace_buffer_desc;
+ struct dentry *dentry;
+ struct eventfs_inode *eventfs;
+ struct remote_event *events;
+ unsigned long nr_events;
+ unsigned long trace_buffer_size;
+ struct ring_buffer_remote rb_remote;
+ struct mutex lock;
+ struct rw_semaphore reader_lock;
+ struct rw_semaphore *pcpu_reader_locks;
+ unsigned int nr_readers;
+ unsigned int poll_ms;
+ bool tracing_on;
+};
+
+static bool trace_remote_loaded(struct trace_remote *remote)
+{
+ return !!remote->trace_buffer;
+}
+
+static int trace_remote_load(struct trace_remote *remote)
+{
+ struct ring_buffer_remote *rb_remote = &remote->rb_remote;
+ struct trace_buffer_desc *desc;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (trace_remote_loaded(remote))
+ return 0;
+
+ desc = remote->cbs->load_trace_buffer(remote->trace_buffer_size, remote->priv);
+ if (IS_ERR(desc))
+ return PTR_ERR(desc);
+
+ rb_remote->desc = desc;
+ rb_remote->swap_reader_page = remote->cbs->swap_reader_page;
+ rb_remote->priv = remote->priv;
+ rb_remote->reset = remote->cbs->reset;
+ remote->trace_buffer = ring_buffer_alloc_remote(rb_remote);
+ if (!remote->trace_buffer) {
+ remote->cbs->unload_trace_buffer(desc, remote->priv);
+ return -ENOMEM;
+ }
+
+ remote->trace_buffer_desc = desc;
+
+ return 0;
+}
+
+static void trace_remote_try_unload(struct trace_remote *remote)
+{
+ lockdep_assert_held(&remote->lock);
+
+ if (!trace_remote_loaded(remote))
+ return;
+
+ /* The buffer is being read or writable */
+ if (remote->nr_readers || remote->tracing_on)
+ return;
+
+ /* The buffer has readable data */
+ if (!ring_buffer_empty(remote->trace_buffer))
+ return;
+
+ ring_buffer_free(remote->trace_buffer);
+ remote->trace_buffer = NULL;
+ remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv);
+}
+
+static int trace_remote_enable_tracing(struct trace_remote *remote)
+{
+ int ret;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (remote->tracing_on)
+ return 0;
+
+ ret = trace_remote_load(remote);
+ if (ret)
+ return ret;
+
+ ret = remote->cbs->enable_tracing(true, remote->priv);
+ if (ret) {
+ trace_remote_try_unload(remote);
+ return ret;
+ }
+
+ remote->tracing_on = true;
+
+ return 0;
+}
+
+static int trace_remote_disable_tracing(struct trace_remote *remote)
+{
+ int ret;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (!remote->tracing_on)
+ return 0;
+
+ ret = remote->cbs->enable_tracing(false, remote->priv);
+ if (ret)
+ return ret;
+
+ ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS);
+ remote->tracing_on = false;
+ trace_remote_try_unload(remote);
+
+ return 0;
+}
+
+static void trace_remote_reset(struct trace_remote *remote, int cpu)
+{
+ lockdep_assert_held(&remote->lock);
+
+ if (!trace_remote_loaded(remote))
+ return;
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ ring_buffer_reset(remote->trace_buffer);
+ else
+ ring_buffer_reset_cpu(remote->trace_buffer, cpu);
+
+ trace_remote_try_unload(remote);
+}
+
+static ssize_t
+tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct seq_file *seq = filp->private_data;
+ struct trace_remote *remote = seq->private;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&remote->lock);
+
+ ret = val ? trace_remote_enable_tracing(remote) : trace_remote_disable_tracing(remote);
+ if (ret)
+ return ret;
+
+ return cnt;
+}
+static int tracing_on_show(struct seq_file *s, void *unused)
+{
+ struct trace_remote *remote = s->private;
+
+ seq_printf(s, "%d\n", remote->tracing_on);
+
+ return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on);
+
+static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct seq_file *seq = filp->private_data;
+ struct trace_remote *remote = seq->private;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ /* KiB to Bytes */
+ if (!val || check_shl_overflow(val, 10, &val))
+ return -EINVAL;
+
+ guard(mutex)(&remote->lock);
+
+ if (trace_remote_loaded(remote))
+ return -EBUSY;
+
+ remote->trace_buffer_size = val;
+
+ return cnt;
+}
+
+static int buffer_size_kb_show(struct seq_file *s, void *unused)
+{
+ struct trace_remote *remote = s->private;
+
+ seq_printf(s, "%lu (%s)\n", remote->trace_buffer_size >> 10,
+ trace_remote_loaded(remote) ? "loaded" : "unloaded");
+
+ return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(buffer_size_kb);
+
+static int trace_remote_get(struct trace_remote *remote, int cpu)
+{
+ int ret;
+
+ if (remote->nr_readers == UINT_MAX)
+ return -EBUSY;
+
+ ret = trace_remote_load(remote);
+ if (ret)
+ return ret;
+
+ if (cpu != RING_BUFFER_ALL_CPUS && !remote->pcpu_reader_locks) {
+ int lock_cpu;
+
+ remote->pcpu_reader_locks = kcalloc(nr_cpu_ids, sizeof(*remote->pcpu_reader_locks),
+ GFP_KERNEL);
+ if (!remote->pcpu_reader_locks) {
+ trace_remote_try_unload(remote);
+ return -ENOMEM;
+ }
+
+ for_each_possible_cpu(lock_cpu)
+ init_rwsem(&remote->pcpu_reader_locks[lock_cpu]);
+ }
+
+ remote->nr_readers++;
+
+ return 0;
+}
+
+static void trace_remote_put(struct trace_remote *remote)
+{
+ if (WARN_ON(!remote->nr_readers))
+ return;
+
+ remote->nr_readers--;
+ if (remote->nr_readers)
+ return;
+
+ kfree(remote->pcpu_reader_locks);
+ remote->pcpu_reader_locks = NULL;
+
+ trace_remote_try_unload(remote);
+}
+
+static bool trace_remote_has_cpu(struct trace_remote *remote, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ return true;
+
+ return ring_buffer_poll_remote(remote->trace_buffer, cpu) == 0;
+}
+
+static void __poll_remote(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct trace_remote_iterator *iter;
+
+ iter = container_of(dwork, struct trace_remote_iterator, poll_work);
+ ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu);
+ schedule_delayed_work((struct delayed_work *)work,
+ msecs_to_jiffies(iter->remote->poll_ms));
+}
+
+static void __free_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
+{
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ ring_buffer_read_finish(iter->rb_iter);
+ return;
+ }
+
+ for_each_possible_cpu(cpu) {
+ if (iter->rb_iters[cpu])
+ ring_buffer_read_finish(iter->rb_iters[cpu]);
+ }
+
+ kfree(iter->rb_iters);
+}
+
+static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
+{
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ iter->rb_iter = ring_buffer_read_start(iter->remote->trace_buffer, cpu, GFP_KERNEL);
+
+ return iter->rb_iter ? 0 : -ENOMEM;
+ }
+
+ iter->rb_iters = kcalloc(nr_cpu_ids, sizeof(*iter->rb_iters), GFP_KERNEL);
+ if (!iter->rb_iters)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ iter->rb_iters[cpu] = ring_buffer_read_start(iter->remote->trace_buffer, cpu,
+ GFP_KERNEL);
+ if (!iter->rb_iters[cpu]) {
+ /* This CPU isn't part of trace_buffer. Skip it */
+ if (!trace_remote_has_cpu(iter->remote, cpu))
+ continue;
+
+ __free_ring_buffer_iter(iter, RING_BUFFER_ALL_CPUS);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+static struct trace_remote_iterator
+*trace_remote_iter(struct trace_remote *remote, int cpu, enum tri_type type)
+{
+ struct trace_remote_iterator *iter = NULL;
+ int ret;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (type == TRI_NONCONSUMING && !trace_remote_loaded(remote))
+ return NULL;
+
+ ret = trace_remote_get(remote, cpu);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!trace_remote_has_cpu(remote, cpu)) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ iter = kzalloc_obj(*iter);
+ if (iter) {
+ iter->remote = remote;
+ iter->cpu = cpu;
+ iter->type = type;
+ trace_seq_init(&iter->seq);
+
+ switch (type) {
+ case TRI_CONSUMING:
+ ring_buffer_poll_remote(remote->trace_buffer, cpu);
+ INIT_DELAYED_WORK(&iter->poll_work, __poll_remote);
+ schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms));
+ break;
+ case TRI_NONCONSUMING:
+ ret = __alloc_ring_buffer_iter(iter, cpu);
+ break;
+ }
+
+ if (ret)
+ goto err;
+
+ return iter;
+ }
+ ret = -ENOMEM;
+
+err:
+ kfree(iter);
+ trace_remote_put(remote);
+
+ return ERR_PTR(ret);
+}
+
+static void trace_remote_iter_free(struct trace_remote_iterator *iter)
+{
+ struct trace_remote *remote;
+
+ if (!iter)
+ return;
+
+ remote = iter->remote;
+
+ lockdep_assert_held(&remote->lock);
+
+ switch (iter->type) {
+ case TRI_CONSUMING:
+ cancel_delayed_work_sync(&iter->poll_work);
+ break;
+ case TRI_NONCONSUMING:
+ __free_ring_buffer_iter(iter, iter->cpu);
+ break;
+ }
+
+ kfree(iter);
+ trace_remote_put(remote);
+}
+
+static void trace_remote_iter_read_start(struct trace_remote_iterator *iter)
+{
+ struct trace_remote *remote = iter->remote;
+ int cpu = iter->cpu;
+
+ /* Acquire global reader lock */
+ if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING)
+ down_write(&remote->reader_lock);
+ else
+ down_read(&remote->reader_lock);
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ return;
+
+ /*
+ * No need for the remote lock here, iter holds a reference on
+ * remote->nr_readers
+ */
+
+ /* Get the per-CPU one */
+ if (WARN_ON_ONCE(!remote->pcpu_reader_locks))
+ return;
+
+ if (iter->type == TRI_CONSUMING)
+ down_write(&remote->pcpu_reader_locks[cpu]);
+ else
+ down_read(&remote->pcpu_reader_locks[cpu]);
+}
+
+static void trace_remote_iter_read_finished(struct trace_remote_iterator *iter)
+{
+ struct trace_remote *remote = iter->remote;
+ int cpu = iter->cpu;
+
+ /* Release per-CPU reader lock */
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ /*
+ * No need for the remote lock here, iter holds a reference on
+ * remote->nr_readers
+ */
+ if (iter->type == TRI_CONSUMING)
+ up_write(&remote->pcpu_reader_locks[cpu]);
+ else
+ up_read(&remote->pcpu_reader_locks[cpu]);
+ }
+
+ /* Release global reader lock */
+ if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING)
+ up_write(&remote->reader_lock);
+ else
+ up_read(&remote->reader_lock);
+}
+
+static struct ring_buffer_iter *__get_rb_iter(struct trace_remote_iterator *iter, int cpu)
+{
+ return iter->cpu != RING_BUFFER_ALL_CPUS ? iter->rb_iter : iter->rb_iters[cpu];
+}
+
+static struct ring_buffer_event *
+__peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events)
+{
+ struct ring_buffer_event *rb_evt;
+ struct ring_buffer_iter *rb_iter;
+
+ switch (iter->type) {
+ case TRI_CONSUMING:
+ return ring_buffer_peek(iter->remote->trace_buffer, cpu, ts, lost_events);
+ case TRI_NONCONSUMING:
+ rb_iter = __get_rb_iter(iter, cpu);
+ if (!rb_iter)
+ return NULL;
+
+ rb_evt = ring_buffer_iter_peek(rb_iter, ts);
+ if (!rb_evt)
+ return NULL;
+
+ *lost_events = ring_buffer_iter_dropped(rb_iter);
+
+ return rb_evt;
+ }
+
+ return NULL;
+}
+
+static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter)
+{
+ struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
+ struct ring_buffer_event *rb_evt;
+ int cpu = iter->cpu;
+
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ if (ring_buffer_empty_cpu(trace_buffer, cpu))
+ return false;
+
+ rb_evt = __peek_event(iter, cpu, &iter->ts, &iter->lost_events);
+ if (!rb_evt)
+ return false;
+
+ iter->evt_cpu = cpu;
+ iter->evt = ring_buffer_event_data(rb_evt);
+ return true;
+ }
+
+ iter->ts = U64_MAX;
+ for_each_possible_cpu(cpu) {
+ unsigned long lost_events;
+ u64 ts;
+
+ if (ring_buffer_empty_cpu(trace_buffer, cpu))
+ continue;
+
+ rb_evt = __peek_event(iter, cpu, &ts, &lost_events);
+ if (!rb_evt)
+ continue;
+
+ if (ts >= iter->ts)
+ continue;
+
+ iter->ts = ts;
+ iter->evt_cpu = cpu;
+ iter->evt = ring_buffer_event_data(rb_evt);
+ iter->lost_events = lost_events;
+ }
+
+ return iter->ts != U64_MAX;
+}
+
+static void trace_remote_iter_move(struct trace_remote_iterator *iter)
+{
+ struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
+
+ switch (iter->type) {
+ case TRI_CONSUMING:
+ ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL);
+ break;
+ case TRI_NONCONSUMING:
+ ring_buffer_iter_advance(__get_rb_iter(iter, iter->evt_cpu));
+ break;
+ }
+}
+
+static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id);
+
+static int trace_remote_iter_print_event(struct trace_remote_iterator *iter)
+{
+ struct remote_event *evt;
+ unsigned long usecs_rem;
+ u64 ts = iter->ts;
+
+ if (iter->lost_events)
+ trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+ iter->evt_cpu, iter->lost_events);
+
+ do_div(ts, 1000);
+ usecs_rem = do_div(ts, USEC_PER_SEC);
+
+ trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu,
+ ts, usecs_rem);
+
+ evt = trace_remote_find_event(iter->remote, iter->evt->id);
+ if (!evt)
+ trace_seq_printf(&iter->seq, "UNKNOWN id=%d\n", iter->evt->id);
+ else
+ evt->print(iter->evt, &iter->seq);
+
+ return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0;
+}
+
+static int trace_pipe_open(struct inode *inode, struct file *filp)
+{
+ struct trace_remote *remote = inode->i_private;
+ struct trace_remote_iterator *iter;
+ int cpu = tracing_get_cpu(inode);
+
+ guard(mutex)(&remote->lock);
+
+ iter = trace_remote_iter(remote, cpu, TRI_CONSUMING);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ filp->private_data = iter;
+
+ return IS_ERR(iter) ? PTR_ERR(iter) : 0;
+}
+
+static int trace_pipe_release(struct inode *inode, struct file *filp)
+{
+ struct trace_remote_iterator *iter = filp->private_data;
+ struct trace_remote *remote = iter->remote;
+
+ guard(mutex)(&remote->lock);
+
+ trace_remote_iter_free(iter);
+
+ return 0;
+}
+
+static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_remote_iterator *iter = filp->private_data;
+ struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
+ int ret;
+
+copy_to_user:
+ ret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+ if (ret != -EBUSY)
+ return ret;
+
+ trace_seq_init(&iter->seq);
+
+ ret = ring_buffer_wait(trace_buffer, iter->cpu, 0, NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ trace_remote_iter_read_start(iter);
+
+ while (trace_remote_iter_read_event(iter)) {
+ int prev_len = iter->seq.seq.len;
+
+ if (trace_remote_iter_print_event(iter)) {
+ iter->seq.seq.len = prev_len;
+ break;
+ }
+
+ trace_remote_iter_move(iter);
+ }
+
+ trace_remote_iter_read_finished(iter);
+
+ goto copy_to_user;
+}
+
+static const struct file_operations trace_pipe_fops = {
+ .open = trace_pipe_open,
+ .read = trace_pipe_read,
+ .release = trace_pipe_release,
+};
+
+static void *trace_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct trace_remote_iterator *iter = m->private;
+
+ ++*pos;
+
+ if (!iter || !trace_remote_iter_read_event(iter))
+ return NULL;
+
+ trace_remote_iter_move(iter);
+ iter->pos++;
+
+ return iter;
+}
+
+static void *trace_start(struct seq_file *m, loff_t *pos)
+{
+ struct trace_remote_iterator *iter = m->private;
+ loff_t i;
+
+ if (!iter)
+ return NULL;
+
+ trace_remote_iter_read_start(iter);
+
+ if (!*pos) {
+ iter->pos = -1;
+ return trace_next(m, NULL, &i);
+ }
+
+ i = iter->pos;
+ while (i < *pos) {
+ iter = trace_next(m, NULL, &i);
+ if (!iter)
+ return NULL;
+ }
+
+ return iter;
+}
+
+static int trace_show(struct seq_file *m, void *v)
+{
+ struct trace_remote_iterator *iter = v;
+
+ trace_seq_init(&iter->seq);
+
+ if (trace_remote_iter_print_event(iter)) {
+ seq_printf(m, "[EVENT %d PRINT TOO BIG]\n", iter->evt->id);
+ return 0;
+ }
+
+ return trace_print_seq(m, &iter->seq);
+}
+
+static void trace_stop(struct seq_file *m, void *v)
+{
+ struct trace_remote_iterator *iter = m->private;
+
+ if (iter)
+ trace_remote_iter_read_finished(iter);
+}
+
+static const struct seq_operations trace_sops = {
+ .start = trace_start,
+ .next = trace_next,
+ .show = trace_show,
+ .stop = trace_stop,
+};
+
+static int trace_open(struct inode *inode, struct file *filp)
+{
+ struct trace_remote *remote = inode->i_private;
+ struct trace_remote_iterator *iter = NULL;
+ int cpu = tracing_get_cpu(inode);
+ int ret;
+
+ if (!(filp->f_mode & FMODE_READ))
+ return 0;
+
+ guard(mutex)(&remote->lock);
+
+ iter = trace_remote_iter(remote, cpu, TRI_NONCONSUMING);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ ret = seq_open(filp, &trace_sops);
+ if (ret) {
+ trace_remote_iter_free(iter);
+ return ret;
+ }
+
+ ((struct seq_file *)filp->private_data)->private = (void *)iter;
+
+ return 0;
+}
+
+static int trace_release(struct inode *inode, struct file *filp)
+{
+ struct trace_remote_iterator *iter;
+
+ if (!(filp->f_mode & FMODE_READ))
+ return 0;
+
+ iter = ((struct seq_file *)filp->private_data)->private;
+ seq_release(inode, filp);
+
+ if (!iter)
+ return 0;
+
+ guard(mutex)(&iter->remote->lock);
+
+ trace_remote_iter_free(iter);
+
+ return 0;
+}
+
+static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct inode *inode = file_inode(filp);
+ struct trace_remote *remote = inode->i_private;
+ int cpu = tracing_get_cpu(inode);
+
+ guard(mutex)(&remote->lock);
+
+ trace_remote_reset(remote, cpu);
+
+ return cnt;
+}
+
+static const struct file_operations trace_fops = {
+ .open = trace_open,
+ .write = trace_write,
+ .read = seq_read,
+ .read_iter = seq_read_iter,
+ .release = trace_release,
+};
+
+static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
+{
+ struct dentry *remote_d, *percpu_d, *d;
+ static struct dentry *root;
+ static DEFINE_MUTEX(lock);
+ bool root_inited = false;
+ int cpu;
+
+ guard(mutex)(&lock);
+
+ if (!root) {
+ root = tracefs_create_dir(TRACEFS_DIR, NULL);
+ if (!root) {
+ pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n");
+ return -ENOMEM;
+ }
+ root_inited = true;
+ }
+
+ remote_d = tracefs_create_dir(name, root);
+ if (!remote_d) {
+ pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/\n", name);
+ goto err;
+ }
+
+ d = trace_create_file("tracing_on", TRACEFS_MODE_WRITE, remote_d, remote, &tracing_on_fops);
+ if (!d)
+ goto err;
+
+ d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
+ &buffer_size_kb_fops);
+ if (!d)
+ goto err;
+
+ d = trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, &trace_pipe_fops);
+ if (!d)
+ goto err;
+
+ d = trace_create_file("trace", TRACEFS_MODE_WRITE, remote_d, remote, &trace_fops);
+ if (!d)
+ goto err;
+
+ percpu_d = tracefs_create_dir("per_cpu", remote_d);
+ if (!percpu_d) {
+ pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name);
+ goto err;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct dentry *cpu_d;
+ char cpu_name[16];
+
+ snprintf(cpu_name, sizeof(cpu_name), "cpu%d", cpu);
+ cpu_d = tracefs_create_dir(cpu_name, percpu_d);
+ if (!cpu_d) {
+ pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/percpu/cpu%d\n",
+ name, cpu);
+ goto err;
+ }
+
+ d = trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu,
+ &trace_pipe_fops);
+ if (!d)
+ goto err;
+
+ d = trace_create_cpu_file("trace", TRACEFS_MODE_WRITE, cpu_d, remote, cpu,
+ &trace_fops);
+ if (!d)
+ goto err;
+ }
+
+ remote->dentry = remote_d;
+
+ return 0;
+
+err:
+ if (root_inited) {
+ tracefs_remove(root);
+ root = NULL;
+ } else {
+ tracefs_remove(remote_d);
+ }
+
+ return -ENOMEM;
+}
+
+static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
+ struct remote_event *events, size_t nr_events);
+
+/**
+ * trace_remote_register() - Register a Tracefs remote
+ * @name: Name of the remote, used for the Tracefs remotes/ directory.
+ * @cbs: Set of callbacks used to control the remote.
+ * @priv: Private data, passed to each callback from @cbs.
+ * @events: Array of events. &remote_event.name and &remote_event.id must be
+ * filled by the caller.
+ * @nr_events: Number of events in the @events array.
+ *
+ * A trace remote is an entity, outside of the kernel (most likely firmware or
+ * hypervisor) capable of writing events into a Tracefs compatible ring-buffer.
+ * The kernel would then act as a reader.
+ *
+ * The registered remote will be found under the Tracefs directory
+ * remotes/<name>.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv,
+ struct remote_event *events, size_t nr_events)
+{
+ struct trace_remote *remote;
+ int ret;
+
+ remote = kzalloc_obj(*remote);
+ if (!remote)
+ return -ENOMEM;
+
+ remote->cbs = cbs;
+ remote->priv = priv;
+ remote->trace_buffer_size = 7 << 10;
+ remote->poll_ms = 100;
+ mutex_init(&remote->lock);
+ init_rwsem(&remote->reader_lock);
+
+ if (trace_remote_init_tracefs(name, remote)) {
+ kfree(remote);
+ return -ENOMEM;
+ }
+
+ ret = trace_remote_register_events(name, remote, events, nr_events);
+ if (ret) {
+ pr_err("Failed to register events for trace remote '%s' (%d)\n",
+ name, ret);
+ return ret;
+ }
+
+ ret = cbs->init ? cbs->init(remote->dentry, priv) : 0;
+ if (ret)
+ pr_err("Init failed for trace remote '%s' (%d)\n", name, ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_remote_register);
+
+/**
+ * trace_remote_free_buffer() - Free trace buffer allocated with trace_remote_alloc_buffer()
+ * @desc: Descriptor of the per-CPU ring-buffers, originally filled by
+ * trace_remote_alloc_buffer()
+ *
+ * Most likely called from &trace_remote_callbacks.unload_trace_buffer.
+ */
+void trace_remote_free_buffer(struct trace_buffer_desc *desc)
+{
+ struct ring_buffer_desc *rb_desc;
+ int cpu;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, desc) {
+ unsigned int id;
+
+ free_page(rb_desc->meta_va);
+
+ for (id = 0; id < rb_desc->nr_page_va; id++)
+ free_page(rb_desc->page_va[id]);
+ }
+}
+EXPORT_SYMBOL_GPL(trace_remote_free_buffer);
+
+/**
+ * trace_remote_alloc_buffer() - Dynamically allocate a trace buffer
+ * @desc: Uninitialized trace_buffer_desc
+ * @desc_size: Size of the trace_buffer_desc. Must be at least equal to
+ * trace_buffer_desc_size()
+ * @buffer_size: Size in bytes of each per-CPU ring-buffer
+ * @cpumask: CPUs to allocate a ring-buffer for
+ *
+ * Helper to dynamically allocate a set of pages (enough to cover @buffer_size)
+ * for each CPU from @cpumask and fill @desc. Most likely called from
+ * &trace_remote_callbacks.load_trace_buffer.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size,
+ const struct cpumask *cpumask)
+{
+ unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1;
+ void *desc_end = desc + desc_size;
+ struct ring_buffer_desc *rb_desc;
+ int cpu, ret = -ENOMEM;
+
+ if (desc_size < struct_size(desc, __data, 0))
+ return -EINVAL;
+
+ desc->nr_cpus = 0;
+ desc->struct_len = struct_size(desc, __data, 0);
+
+ rb_desc = (struct ring_buffer_desc *)&desc->__data[0];
+
+ for_each_cpu(cpu, cpumask) {
+ unsigned int id;
+
+ if ((void *)rb_desc + struct_size(rb_desc, page_va, nr_pages) > desc_end) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ rb_desc->cpu = cpu;
+ rb_desc->nr_page_va = 0;
+ rb_desc->meta_va = (unsigned long)__get_free_page(GFP_KERNEL);
+ if (!rb_desc->meta_va)
+ goto err;
+
+ for (id = 0; id < nr_pages; id++) {
+ rb_desc->page_va[id] = (unsigned long)__get_free_page(GFP_KERNEL);
+ if (!rb_desc->page_va[id])
+ goto err;
+
+ rb_desc->nr_page_va++;
+ }
+ desc->nr_cpus++;
+ desc->struct_len += offsetof(struct ring_buffer_desc, page_va);
+ desc->struct_len += struct_size(rb_desc, page_va, rb_desc->nr_page_va);
+ rb_desc = __next_ring_buffer_desc(rb_desc);
+ }
+
+ return 0;
+
+err:
+ trace_remote_free_buffer(desc);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer);
+
+static int
+trace_remote_enable_event(struct trace_remote *remote, struct remote_event *evt, bool enable)
+{
+ int ret;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (evt->enabled == enable)
+ return 0;
+
+ ret = remote->cbs->enable_event(evt->id, enable, remote->priv);
+ if (ret)
+ return ret;
+
+ evt->enabled = enable;
+
+ return 0;
+}
+
+static int remote_event_enable_show(struct seq_file *s, void *unused)
+{
+ struct remote_event *evt = s->private;
+
+ seq_printf(s, "%d\n", evt->enabled);
+
+ return 0;
+}
+
+static ssize_t remote_event_enable_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = filp->private_data;
+ struct remote_event *evt = seq->private;
+ struct trace_remote *remote = evt->remote;
+ u8 enable;
+ int ret;
+
+ ret = kstrtou8_from_user(ubuf, count, 10, &enable);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&remote->lock);
+
+ ret = trace_remote_enable_event(remote, evt, enable);
+ if (ret)
+ return ret;
+
+ return count;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(remote_event_enable);
+
+static int remote_event_id_show(struct seq_file *s, void *unused)
+{
+ struct remote_event *evt = s->private;
+
+ seq_printf(s, "%d\n", evt->id);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(remote_event_id);
+
+static int remote_event_format_show(struct seq_file *s, void *unused)
+{
+ size_t offset = sizeof(struct remote_event_hdr);
+ struct remote_event *evt = s->private;
+ struct trace_event_fields *field;
+
+ seq_printf(s, "name: %s\n", evt->name);
+ seq_printf(s, "ID: %d\n", evt->id);
+ seq_puts(s,
+ "format:\n\tfield:unsigned short common_type;\toffset:0;\tsize:2;\tsigned:0;\n\n");
+
+ field = &evt->fields[0];
+ while (field->name) {
+ seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%u;\tsigned:%d;\n",
+ field->type, field->name, offset, field->size,
+ field->is_signed);
+ offset += field->size;
+ field++;
+ }
+
+ if (field != &evt->fields[0])
+ seq_puts(s, "\n");
+
+ seq_printf(s, "print fmt: %s\n", evt->print_fmt);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(remote_event_format);
+
+static int remote_event_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ if (!strcmp(name, "enable")) {
+ *mode = TRACEFS_MODE_WRITE;
+ *fops = &remote_event_enable_fops;
+ return 1;
+ }
+
+ if (!strcmp(name, "id")) {
+ *mode = TRACEFS_MODE_READ;
+ *fops = &remote_event_id_fops;
+ return 1;
+ }
+
+ if (!strcmp(name, "format")) {
+ *mode = TRACEFS_MODE_READ;
+ *fops = &remote_event_format_fops;
+ return 1;
+ }
+
+ return 0;
+}
+
+static ssize_t remote_events_dir_enable_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct trace_remote *remote = file_inode(filp)->i_private;
+ int i, ret;
+ u8 enable;
+
+ ret = kstrtou8_from_user(ubuf, count, 10, &enable);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&remote->lock);
+
+ for (i = 0; i < remote->nr_events; i++) {
+ struct remote_event *evt = &remote->events[i];
+
+ trace_remote_enable_event(remote, evt, enable);
+ }
+
+ return count;
+}
+
+static ssize_t remote_events_dir_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct trace_remote *remote = file_inode(filp)->i_private;
+ const char enabled_char[] = {'0', '1', 'X'};
+ char enabled_str[] = " \n";
+ int i, enabled = -1;
+
+ guard(mutex)(&remote->lock);
+
+ for (i = 0; i < remote->nr_events; i++) {
+ struct remote_event *evt = &remote->events[i];
+
+ if (enabled == -1) {
+ enabled = evt->enabled;
+ } else if (enabled != evt->enabled) {
+ enabled = 2;
+ break;
+ }
+ }
+
+ enabled_str[0] = enabled_char[enabled == -1 ? 0 : enabled];
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, enabled_str, 2);
+}
+
+static const struct file_operations remote_events_dir_enable_fops = {
+ .write = remote_events_dir_enable_write,
+ .read = remote_events_dir_enable_read,
+};
+
+static ssize_t
+remote_events_dir_header_page_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_seq *s;
+ int ret;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ ring_buffer_print_page_header(NULL, s);
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s));
+ kfree(s);
+
+ return ret;
+}
+
+static const struct file_operations remote_events_dir_header_page_fops = {
+ .read = remote_events_dir_header_page_read,
+};
+
+static ssize_t
+remote_events_dir_header_event_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_seq *s;
+ int ret;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ ring_buffer_print_entry_header(s);
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s));
+ kfree(s);
+
+ return ret;
+}
+
+static const struct file_operations remote_events_dir_header_event_fops = {
+ .read = remote_events_dir_header_event_read,
+};
+
+static int remote_events_dir_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ if (!strcmp(name, "enable")) {
+ *mode = TRACEFS_MODE_WRITE;
+ *fops = &remote_events_dir_enable_fops;
+ return 1;
+ }
+
+ if (!strcmp(name, "header_page")) {
+ *mode = TRACEFS_MODE_READ;
+ *fops = &remote_events_dir_header_page_fops;
+ return 1;
+ }
+
+ if (!strcmp(name, "header_event")) {
+ *mode = TRACEFS_MODE_READ;
+ *fops = &remote_events_dir_header_event_fops;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote,
+ struct remote_event *evt)
+{
+ struct eventfs_inode *eventfs = remote->eventfs;
+ static struct eventfs_entry dir_entries[] = {
+ {
+ .name = "enable",
+ .callback = remote_events_dir_callback,
+ }, {
+ .name = "header_page",
+ .callback = remote_events_dir_callback,
+ }, {
+ .name = "header_event",
+ .callback = remote_events_dir_callback,
+ }
+ };
+ static struct eventfs_entry entries[] = {
+ {
+ .name = "enable",
+ .callback = remote_event_callback,
+ }, {
+ .name = "id",
+ .callback = remote_event_callback,
+ }, {
+ .name = "format",
+ .callback = remote_event_callback,
+ }
+ };
+ bool eventfs_create = false;
+
+ if (!eventfs) {
+ eventfs = eventfs_create_events_dir("events", remote->dentry, dir_entries,
+ ARRAY_SIZE(dir_entries), remote);
+ if (IS_ERR(eventfs))
+ return PTR_ERR(eventfs);
+
+ /*
+ * Create similar hierarchy as local events even if a single system is supported at
+ * the moment
+ */
+ eventfs = eventfs_create_dir(remote_name, eventfs, NULL, 0, NULL);
+ if (IS_ERR(eventfs))
+ return PTR_ERR(eventfs);
+
+ remote->eventfs = eventfs;
+ eventfs_create = true;
+ }
+
+ eventfs = eventfs_create_dir(evt->name, eventfs, entries, ARRAY_SIZE(entries), evt);
+ if (IS_ERR(eventfs)) {
+ if (eventfs_create) {
+ eventfs_remove_events_dir(remote->eventfs);
+ remote->eventfs = NULL;
+ }
+ return PTR_ERR(eventfs);
+ }
+
+ return 0;
+}
+
+static int trace_remote_attach_events(struct trace_remote *remote, struct remote_event *events,
+ size_t nr_events)
+{
+ int i;
+
+ for (i = 0; i < nr_events; i++) {
+ struct remote_event *evt = &events[i];
+
+ if (evt->remote)
+ return -EEXIST;
+
+ evt->remote = remote;
+
+ /* We need events to be sorted for efficient lookup */
+ if (i && evt->id <= events[i - 1].id)
+ return -EINVAL;
+ }
+
+ remote->events = events;
+ remote->nr_events = nr_events;
+
+ return 0;
+}
+
+static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
+ struct remote_event *events, size_t nr_events)
+{
+ int i, ret;
+
+ ret = trace_remote_attach_events(remote, events, nr_events);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < nr_events; i++) {
+ struct remote_event *evt = &events[i];
+
+ ret = trace_remote_init_eventfs(remote_name, remote, evt);
+ if (ret)
+ pr_warn("Failed to init eventfs for event '%s' (%d)",
+ evt->name, ret);
+ }
+
+ return 0;
+}
+
+static int __cmp_events(const void *key, const void *data)
+{
+ const struct remote_event *evt = data;
+ int id = (int)((long)key);
+
+ return id - (int)evt->id;
+}
+
+static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id)
+{
+ return bsearch((const void *)(unsigned long)id, remote->events, remote->nr_events,
+ sizeof(*remote->events), __cmp_events);
+}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 573b5d8e8a28..e9f0ff962660 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -224,7 +224,6 @@ static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
/* Place map_cmdline_to_pid array right after saved_cmdlines */
s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];
- s->cmdline_idx = 0;
memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
@@ -248,6 +247,8 @@ int trace_save_cmdline(struct task_struct *tsk)
if (!tsk->pid)
return 1;
+ BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT));
+
tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
/*
@@ -442,9 +443,8 @@ int trace_alloc_tgid_map(void)
if (tgid_map)
return 0;
- tgid_map_max = pid_max;
- map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
- GFP_KERNEL);
+ tgid_map_max = init_pid_ns.pid_max;
+ map = kvzalloc_objs(*tgid_map, tgid_map_max + 1);
if (!map)
return -ENOMEM;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index d6c7f18daa15..8faa73d3bba1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -41,7 +41,7 @@ static void stop_func_tracer(struct trace_array *tr, int graph);
static int save_flags;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH)
+# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER(DISPLAY_GRAPH))
#else
# define is_graph(tr) false
#endif
@@ -83,14 +83,14 @@ func_prolog_preempt_disable(struct trace_array *tr,
goto out_enable;
*data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&(*data)->disabled);
+ disabled = local_inc_return(&(*data)->disabled);
if (unlikely(disabled != 1))
goto out;
return 1;
out:
- atomic_dec(&(*data)->disabled);
+ local_dec(&(*data)->disabled);
out_enable:
preempt_enable_notrace();
@@ -113,11 +113,13 @@ static int wakeup_display_graph(struct trace_array *tr, int set)
}
static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ u64 *calltime;
int ret = 0;
if (ftrace_graph_ignore_func(gops, trace))
@@ -135,28 +137,40 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return 0;
- ret = __trace_graph_entry(tr, trace, trace_ctx);
- atomic_dec(&data->disabled);
+ calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
+ if (calltime) {
+ *calltime = trace_clock_local();
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
+ }
+ local_dec(&data->disabled);
preempt_enable_notrace();
return ret;
}
static void wakeup_graph_return(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ u64 *calltime;
+ u64 rettime;
+ int size;
ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
- __trace_graph_return(tr, trace, trace_ctx);
- atomic_dec(&data->disabled);
+ rettime = trace_clock_local();
+
+ calltime = fgraph_retrieve_data(gops->idx, &size);
+ if (calltime)
+ __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
+ local_dec(&data->disabled);
preempt_enable_notrace();
return;
}
@@ -170,8 +184,6 @@ static void wakeup_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
- else
- iter->private = NULL;
}
static void wakeup_trace_close(struct trace_iterator *iter)
@@ -224,10 +236,10 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
return;
local_irq_save(flags);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, fregs);
local_irq_restore(flags);
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
preempt_enable_notrace();
}
@@ -235,8 +247,8 @@ static int register_wakeup_function(struct trace_array *tr, int graph, int set)
{
int ret;
- /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
- if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
+ /* 'set' is set if TRACE_ITER(FUNCTION) is about to be set */
+ if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER(FUNCTION))))
return 0;
if (graph)
@@ -265,7 +277,7 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph)
static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
{
- if (!(mask & TRACE_ITER_FUNCTION))
+ if (!(mask & TRACE_ITER(FUNCTION)))
return 0;
if (set)
@@ -309,10 +321,10 @@ __trace_function(struct trace_array *tr,
if (is_graph(tr))
trace_graph_function(tr, ip, parent_ip, trace_ctx);
else
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
}
-static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
+static int wakeup_flag_changed(struct trace_array *tr, u64 mask, int set)
{
struct tracer *tracer = tr->current_trace;
@@ -320,7 +332,7 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
return 0;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- if (mask & TRACE_ITER_DISPLAY_GRAPH)
+ if (mask & TRACE_ITER(DISPLAY_GRAPH))
return wakeup_display_graph(tr, set);
#endif
@@ -455,7 +467,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
- disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
+ disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
if (likely(disabled != 1))
goto out;
@@ -492,7 +504,7 @@ out_unlock:
arch_spin_unlock(&wakeup_lock);
local_irq_restore(flags);
out:
- atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
+ local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
}
static void __wakeup_reset(struct trace_array *tr)
@@ -547,7 +559,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
- disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
+ disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
if (unlikely(disabled != 1))
goto out;
@@ -594,7 +606,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
out_locked:
arch_spin_unlock(&wakeup_lock);
out:
- atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
+ local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
}
static void start_wakeup_tracer(struct trace_array *tr)
@@ -669,8 +681,8 @@ static int __wakeup_tracer_init(struct trace_array *tr)
save_flags = tr->trace_flags;
/* non overwrite screws up the latency tracers */
- set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
- set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
+ set_tracer_flag(tr, TRACE_ITER(OVERWRITE), 1);
+ set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), 1);
tr->max_latency = 0;
wakeup_trace = tr;
@@ -713,15 +725,15 @@ static int wakeup_dl_tracer_init(struct trace_array *tr)
static void wakeup_tracer_reset(struct trace_array *tr)
{
- int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
- int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+ int lat_flag = save_flags & TRACE_ITER(LATENCY_FMT);
+ int overwrite_flag = save_flags & TRACE_ITER(OVERWRITE);
stop_wakeup_tracer(tr);
/* make sure we put back any tasks we are tracing */
wakeup_reset(tr);
- set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
- set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
+ set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), lat_flag);
+ set_tracer_flag(tr, TRACE_ITER(OVERWRITE), overwrite_flag);
ftrace_reset_array_ops(tr);
wakeup_busy = false;
}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 38b5754790c9..929c84075315 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -248,7 +248,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt)
goto out;
/* Add a dynamic probe */
- dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL);
+ dyn_ops = kzalloc_obj(*dyn_ops);
if (!dyn_ops) {
printk("MEMORY ERROR ");
goto out;
@@ -774,7 +774,8 @@ struct fgraph_fixture {
};
static __init int store_entry(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
const char *type = fixture->store_type_name;
@@ -807,7 +808,8 @@ static __init int store_entry(struct ftrace_graph_ent *trace,
}
static __init void store_return(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
const char *type = fixture->store_type_name;
@@ -1025,7 +1027,8 @@ static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
/* This is harmlessly racy, we want to approximately detect a hang */
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
@@ -1039,7 +1042,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
return 0;
}
- return trace_graph_entry(trace, gops);
+ return trace_graph_entry(trace, gops, fregs);
}
static struct fgraph_ops fgraph_ops __initdata = {
@@ -1222,7 +1225,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
/* check both trace buffers */
ret = trace_test_buffer(&tr->array_buffer, NULL);
if (!ret)
- ret = trace_test_buffer(&tr->max_buffer, &count);
+ ret = trace_test_buffer(&tr->snapshot_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -1284,7 +1287,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
/* check both trace buffers */
ret = trace_test_buffer(&tr->array_buffer, NULL);
if (!ret)
- ret = trace_test_buffer(&tr->max_buffer, &count);
+ ret = trace_test_buffer(&tr->snapshot_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -1352,7 +1355,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
if (ret)
goto out;
- ret = trace_test_buffer(&tr->max_buffer, &count);
+ ret = trace_test_buffer(&tr->snapshot_buffer, &count);
if (ret)
goto out;
@@ -1382,7 +1385,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
if (ret)
goto out;
- ret = trace_test_buffer(&tr->max_buffer, &count);
+ ret = trace_test_buffer(&tr->snapshot_buffer, &count);
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
@@ -1510,7 +1513,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
/* check both trace buffers */
ret = trace_test_buffer(&tr->array_buffer, NULL);
if (!ret)
- ret = trace_test_buffer(&tr->max_buffer, &count);
+ ret = trace_test_buffer(&tr->snapshot_buffer, &count);
trace->reset(tr);
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index c158d65a8a88..85f6f10d107f 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -15,7 +15,7 @@
*
* A write to the buffer will either succeed or fail. That is, unlike
* sprintf() there will not be a partial write (well it may write into
- * the buffer but it wont update the pointers). This allows users to
+ * the buffer but it won't update the pointers). This allows users to
* try to write something into the trace_seq buffer and if it fails
* they can flush it and try again.
*
@@ -106,7 +106,7 @@ EXPORT_SYMBOL_GPL(trace_seq_printf);
* Writes a ASCII representation of a bitmask string into @s.
*/
void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
- int nmaskbits)
+ int nmaskbits)
{
unsigned int save_len = s->seq.len;
@@ -125,6 +125,33 @@ void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
EXPORT_SYMBOL_GPL(trace_seq_bitmask);
/**
+ * trace_seq_bitmask_list - write a bitmask array in its list representation
+ * @s: trace sequence descriptor
+ * @maskp: points to an array of unsigned longs that represent a bitmask
+ * @nmaskbits: The number of bits that are valid in @maskp
+ *
+ * Writes a list representation (e.g., 0-3,5-7) of a bitmask string into @s.
+ */
+void trace_seq_bitmask_list(struct trace_seq *s, const unsigned long *maskp,
+ int nmaskbits)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ seq_buf_printf(&s->seq, "%*pbl", nmaskbits, maskp);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ }
+}
+EXPORT_SYMBOL_GPL(trace_seq_bitmask_list);
+
+/**
* trace_seq_vprintf - sequence printing of trace information
* @s: trace sequence descriptor
* @fmt: printf format string
diff --git a/kernel/trace/trace_snapshot.c b/kernel/trace/trace_snapshot.c
new file mode 100644
index 000000000000..07b43c9863a2
--- /dev/null
+++ b/kernel/trace/trace_snapshot.c
@@ -0,0 +1,1066 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fsnotify.h>
+
+#include <asm/setup.h> /* COMMAND_LINE_SIZE */
+
+#include "trace.h"
+
+/* Used if snapshot allocated at boot */
+static bool allocate_snapshot;
+static bool snapshot_at_boot;
+
+static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata;
+static int boot_snapshot_index;
+
+static int __init boot_alloc_snapshot(char *str)
+{
+ char *slot = boot_snapshot_info + boot_snapshot_index;
+ int left = sizeof(boot_snapshot_info) - boot_snapshot_index;
+ int ret;
+
+ if (str[0] == '=') {
+ str++;
+ if (strlen(str) >= left)
+ return -1;
+
+ ret = snprintf(slot, left, "%s\t", str);
+ boot_snapshot_index += ret;
+ } else {
+ allocate_snapshot = true;
+ /* We also need the main ring buffer expanded */
+ trace_set_ring_buffer_expanded(NULL);
+ }
+ return 1;
+}
+__setup("alloc_snapshot", boot_alloc_snapshot);
+
+
+static int __init boot_snapshot(char *str)
+{
+ snapshot_at_boot = true;
+ boot_alloc_snapshot(str);
+ return 1;
+}
+__setup("ftrace_boot_snapshot", boot_snapshot);
+static void tracing_snapshot_instance_cond(struct trace_array *tr,
+ void *cond_data)
+{
+ unsigned long flags;
+
+ if (in_nmi()) {
+ trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+ trace_array_puts(tr, "*** snapshot is being ignored ***\n");
+ return;
+ }
+
+ if (!tr->allocated_snapshot) {
+ trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
+ trace_array_puts(tr, "*** stopping trace here! ***\n");
+ tracer_tracing_off(tr);
+ return;
+ }
+
+ if (tr->mapped) {
+ trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
+ /* Note, snapshot can not be used when the tracer uses it */
+ if (tracer_uses_snapshot(tr->current_trace)) {
+ trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
+ local_irq_save(flags);
+ update_max_tr(tr, current, smp_processor_id(), cond_data);
+ local_irq_restore(flags);
+}
+
+void tracing_snapshot_instance(struct trace_array *tr)
+{
+ tracing_snapshot_instance_cond(tr, NULL);
+}
+
+/**
+ * tracing_snapshot_cond - conditionally take a snapshot of the current buffer.
+ * @tr: The tracing instance to snapshot
+ * @cond_data: The data to be tested conditionally, and possibly saved
+ *
+ * This is the same as tracing_snapshot() except that the snapshot is
+ * conditional - the snapshot will only happen if the
+ * cond_snapshot.update() implementation receiving the cond_data
+ * returns true, which means that the trace array's cond_snapshot
+ * update() operation used the cond_data to determine whether the
+ * snapshot should be taken, and if it was, presumably saved it along
+ * with the snapshot.
+ */
+void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
+{
+ tracing_snapshot_instance_cond(tr, cond_data);
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
+
+/**
+ * tracing_cond_snapshot_data - get the user data associated with a snapshot
+ * @tr: The tracing instance
+ *
+ * When the user enables a conditional snapshot using
+ * tracing_snapshot_cond_enable(), the user-defined cond_data is saved
+ * with the snapshot. This accessor is used to retrieve it.
+ *
+ * Should not be called from cond_snapshot.update(), since it takes
+ * the tr->max_lock lock, which the code calling
+ * cond_snapshot.update() has already done.
+ *
+ * Returns the cond_data associated with the trace array's snapshot.
+ */
+void *tracing_cond_snapshot_data(struct trace_array *tr)
+{
+ void *cond_data = NULL;
+
+ local_irq_disable();
+ arch_spin_lock(&tr->max_lock);
+
+ if (tr->cond_snapshot)
+ cond_data = tr->cond_snapshot->cond_data;
+
+ arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
+
+ return cond_data;
+}
+EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
+
+/* resize @tr's buffer to the size of @size_tr's entries */
+int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
+ struct array_buffer *size_buf, int cpu_id)
+{
+ int cpu, ret = 0;
+
+ if (cpu_id == RING_BUFFER_ALL_CPUS) {
+ for_each_tracing_cpu(cpu) {
+ ret = ring_buffer_resize(trace_buf->buffer,
+ per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
+ if (ret < 0)
+ break;
+ per_cpu_ptr(trace_buf->data, cpu)->entries =
+ per_cpu_ptr(size_buf->data, cpu)->entries;
+ }
+ } else {
+ ret = ring_buffer_resize(trace_buf->buffer,
+ per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
+ if (ret == 0)
+ per_cpu_ptr(trace_buf->data, cpu_id)->entries =
+ per_cpu_ptr(size_buf->data, cpu_id)->entries;
+ }
+
+ return ret;
+}
+
+int tracing_alloc_snapshot_instance(struct trace_array *tr)
+{
+ int order;
+ int ret;
+
+ if (!tr->allocated_snapshot) {
+
+ /* Make the snapshot buffer have the same order as main buffer */
+ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order);
+ if (ret < 0)
+ return ret;
+
+ /* allocate spare buffer */
+ ret = resize_buffer_duplicate_size(&tr->snapshot_buffer,
+ &tr->array_buffer, RING_BUFFER_ALL_CPUS);
+ if (ret < 0)
+ return ret;
+
+ tr->allocated_snapshot = true;
+ }
+
+ return 0;
+}
+
+void free_snapshot(struct trace_array *tr)
+{
+ /*
+ * We don't free the ring buffer. instead, resize it because
+ * The max_tr ring buffer has some state (e.g. ring->clock) and
+ * we want preserve it.
+ */
+ ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, 0);
+ ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
+ trace_set_buffer_entries(&tr->snapshot_buffer, 1);
+ tracing_reset_online_cpus(&tr->snapshot_buffer);
+ tr->allocated_snapshot = false;
+}
+
+int tracing_arm_snapshot_locked(struct trace_array *tr)
+{
+ int ret;
+
+ lockdep_assert_held(&trace_types_lock);
+
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (tr->snapshot == UINT_MAX || tr->mapped) {
+ spin_unlock(&tr->snapshot_trigger_lock);
+ return -EBUSY;
+ }
+
+ tr->snapshot++;
+ spin_unlock(&tr->snapshot_trigger_lock);
+
+ ret = tracing_alloc_snapshot_instance(tr);
+ if (ret) {
+ spin_lock(&tr->snapshot_trigger_lock);
+ tr->snapshot--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+ }
+
+ return ret;
+}
+
+int tracing_arm_snapshot(struct trace_array *tr)
+{
+ guard(mutex)(&trace_types_lock);
+ return tracing_arm_snapshot_locked(tr);
+}
+
+void tracing_disarm_snapshot(struct trace_array *tr)
+{
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (!WARN_ON(!tr->snapshot))
+ tr->snapshot--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+}
+
+/**
+ * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
+ *
+ * This is similar to tracing_snapshot(), but it will allocate the
+ * snapshot buffer if it isn't already allocated. Use this only
+ * where it is safe to sleep, as the allocation may sleep.
+ *
+ * This causes a swap between the snapshot buffer and the current live
+ * tracing buffer. You can use this to take snapshots of the live
+ * trace when some condition is triggered, but continue to trace.
+ */
+void tracing_snapshot_alloc(void)
+{
+ int ret;
+
+ ret = tracing_alloc_snapshot();
+ if (ret < 0)
+ return;
+
+ tracing_snapshot();
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+
+/**
+ * tracing_snapshot_cond_enable - enable conditional snapshot for an instance
+ * @tr: The tracing instance
+ * @cond_data: User data to associate with the snapshot
+ * @update: Implementation of the cond_snapshot update function
+ *
+ * Check whether the conditional snapshot for the given instance has
+ * already been enabled, or if the current tracer is already using a
+ * snapshot; if so, return -EBUSY, else create a cond_snapshot and
+ * save the cond_data and update function inside.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
+ cond_update_fn_t update)
+{
+ struct cond_snapshot *cond_snapshot __free(kfree) =
+ kzalloc_obj(*cond_snapshot);
+ int ret;
+
+ if (!cond_snapshot)
+ return -ENOMEM;
+
+ cond_snapshot->cond_data = cond_data;
+ cond_snapshot->update = update;
+
+ guard(mutex)(&trace_types_lock);
+
+ if (tracer_uses_snapshot(tr->current_trace))
+ return -EBUSY;
+
+ /*
+ * The cond_snapshot can only change to NULL without the
+ * trace_types_lock. We don't care if we race with it going
+ * to NULL, but we want to make sure that it's not set to
+ * something other than NULL when we get here, which we can
+ * do safely with only holding the trace_types_lock and not
+ * having to take the max_lock.
+ */
+ if (tr->cond_snapshot)
+ return -EBUSY;
+
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
+ return ret;
+
+ local_irq_disable();
+ arch_spin_lock(&tr->max_lock);
+ tr->cond_snapshot = no_free_ptr(cond_snapshot);
+ arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
+
+/**
+ * tracing_snapshot_cond_disable - disable conditional snapshot for an instance
+ * @tr: The tracing instance
+ *
+ * Check whether the conditional snapshot for the given instance is
+ * enabled; if so, free the cond_snapshot associated with it,
+ * otherwise return -EINVAL.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int tracing_snapshot_cond_disable(struct trace_array *tr)
+{
+ int ret = 0;
+
+ local_irq_disable();
+ arch_spin_lock(&tr->max_lock);
+
+ if (!tr->cond_snapshot)
+ ret = -EINVAL;
+ else {
+ kfree(tr->cond_snapshot);
+ tr->cond_snapshot = NULL;
+ }
+
+ arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
+
+ tracing_disarm_snapshot(tr);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef LATENCY_FS_NOTIFY
+static struct workqueue_struct *fsnotify_wq;
+
+static void latency_fsnotify_workfn(struct work_struct *work)
+{
+ struct trace_array *tr = container_of(work, struct trace_array,
+ fsnotify_work);
+ fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
+}
+
+static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
+{
+ struct trace_array *tr = container_of(iwork, struct trace_array,
+ fsnotify_irqwork);
+ queue_work(fsnotify_wq, &tr->fsnotify_work);
+}
+
+__init static int latency_fsnotify_init(void)
+{
+ fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
+ WQ_UNBOUND | WQ_HIGHPRI, 0);
+ if (!fsnotify_wq) {
+ pr_err("Unable to allocate tr_max_lat_wq\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+late_initcall_sync(latency_fsnotify_init);
+
+void latency_fsnotify(struct trace_array *tr)
+{
+ if (!fsnotify_wq)
+ return;
+ /*
+ * We cannot call queue_work(&tr->fsnotify_work) from here because it's
+ * possible that we are called from __schedule() or do_idle(), which
+ * could cause a deadlock.
+ */
+ irq_work_queue(&tr->fsnotify_irqwork);
+}
+#endif /* LATENCY_FS_NOTIFY */
+
+static const struct file_operations tracing_max_lat_fops;
+
+void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer)
+{
+#ifdef LATENCY_FS_NOTIFY
+ INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
+ init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
+#endif
+ tr->d_max_latency = trace_create_file("tracing_max_latency",
+ TRACE_MODE_WRITE,
+ d_tracer, tr,
+ &tracing_max_lat_fops);
+}
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
+ */
+static void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+ struct array_buffer *trace_buf = &tr->array_buffer;
+ struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
+ struct array_buffer *max_buf = &tr->snapshot_buffer;
+ struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
+
+ max_buf->cpu = cpu;
+ max_buf->time_start = data->preempt_timestamp;
+
+ max_data->saved_latency = tr->max_latency;
+ max_data->critical_start = data->critical_start;
+ max_data->critical_end = data->critical_end;
+
+ strscpy(max_data->comm, tsk->comm);
+ max_data->pid = tsk->pid;
+ /*
+ * If tsk == current, then use current_uid(), as that does not use
+ * RCU. The irq tracer can be called out of RCU scope.
+ */
+ if (tsk == current)
+ max_data->uid = current_uid();
+ else
+ max_data->uid = task_uid(tsk);
+
+ max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+ max_data->policy = tsk->policy;
+ max_data->rt_priority = tsk->rt_priority;
+
+ /* record this tasks comm */
+ tracing_record_cmdline(tsk);
+ latency_fsnotify(tr);
+}
+#else
+static inline void __update_max_tr(struct trace_array *tr,
+ struct task_struct *tsk, int cpu) { }
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
+/**
+ * update_max_tr - snapshot all trace buffers from global_trace to max_tr
+ * @tr: tracer
+ * @tsk: the task with the latency
+ * @cpu: The cpu that initiated the trace.
+ * @cond_data: User data associated with a conditional snapshot
+ *
+ * Flip the buffers between the @tr and the max_tr and record information
+ * about which task was the cause of this latency.
+ */
+void
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
+ void *cond_data)
+{
+ if (tr->stop_count)
+ return;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (!tr->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
+ return;
+ }
+
+ arch_spin_lock(&tr->max_lock);
+
+ /* Inherit the recordable setting from array_buffer */
+ if (ring_buffer_record_is_set_on(tr->array_buffer.buffer))
+ ring_buffer_record_on(tr->snapshot_buffer.buffer);
+ else
+ ring_buffer_record_off(tr->snapshot_buffer.buffer);
+
+ if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) {
+ arch_spin_unlock(&tr->max_lock);
+ return;
+ }
+
+ swap(tr->array_buffer.buffer, tr->snapshot_buffer.buffer);
+
+ __update_max_tr(tr, tsk, cpu);
+
+ arch_spin_unlock(&tr->max_lock);
+
+ /* Any waiters on the old snapshot buffer need to wake up */
+ ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
+}
+
+/**
+ * update_max_tr_single - only copy one trace over, and reset the rest
+ * @tr: tracer
+ * @tsk: task with the latency
+ * @cpu: the cpu of the buffer to copy.
+ *
+ * Flip the trace of a single CPU buffer between the @tr and the max_tr.
+ */
+void
+update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+ int ret;
+
+ if (tr->stop_count)
+ return;
+
+ WARN_ON_ONCE(!irqs_disabled());
+ if (!tr->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
+ return;
+ }
+
+ arch_spin_lock(&tr->max_lock);
+
+ ret = ring_buffer_swap_cpu(tr->snapshot_buffer.buffer, tr->array_buffer.buffer, cpu);
+
+ if (ret == -EBUSY) {
+ /*
+ * We failed to swap the buffer due to a commit taking
+ * place on this CPU. We fail to record, but we reset
+ * the max trace buffer (no one writes directly to it)
+ * and flag that it failed.
+ * Another reason is resize is in progress.
+ */
+ trace_array_printk_buf(tr->snapshot_buffer.buffer, _THIS_IP_,
+ "Failed to swap buffers due to commit or resize in progress\n");
+ }
+
+ WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
+
+ __update_max_tr(tr, tsk, cpu);
+ arch_spin_unlock(&tr->max_lock);
+}
+
+static void show_snapshot_main_help(struct seq_file *m)
+{
+ seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
+ "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer.\n"
+ "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
+}
+
+static void show_snapshot_percpu_help(struct seq_file *m)
+{
+ seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+ seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer for this cpu.\n");
+#else
+ seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
+ "# Must use main snapshot file to allocate.\n");
+#endif
+ seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
+}
+
+void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
+{
+ if (iter->tr->allocated_snapshot)
+ seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
+ else
+ seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
+
+ seq_puts(m, "# Snapshot commands:\n");
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ show_snapshot_main_help(m);
+ else
+ show_snapshot_percpu_help(m);
+}
+
+static int tracing_snapshot_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ struct trace_iterator *iter;
+ struct seq_file *m;
+ int ret;
+
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
+
+ if (file->f_mode & FMODE_READ) {
+ iter = __tracing_open(inode, file, true);
+ if (IS_ERR(iter))
+ ret = PTR_ERR(iter);
+ } else {
+ /* Writes still need the seq_file to hold the private data */
+ ret = -ENOMEM;
+ m = kzalloc_obj(*m);
+ if (!m)
+ goto out;
+ iter = kzalloc_obj(*iter);
+ if (!iter) {
+ kfree(m);
+ goto out;
+ }
+ ret = 0;
+
+ iter->tr = tr;
+ iter->array_buffer = &tr->snapshot_buffer;
+ iter->cpu_file = tracing_get_cpu(inode);
+ m->private = iter;
+ file->private_data = m;
+ }
+out:
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
+}
+
+static void tracing_swap_cpu_buffer(void *tr)
+{
+ update_max_tr_single((struct trace_array *)tr, current, smp_processor_id());
+}
+
+static ssize_t
+tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct seq_file *m = filp->private_data;
+ struct trace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
+ unsigned long val;
+ int ret;
+
+ ret = tracing_update_buffers(tr);
+ if (ret < 0)
+ return ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&trace_types_lock);
+
+ if (tracer_uses_snapshot(tr->current_trace))
+ return -EBUSY;
+
+ local_irq_disable();
+ arch_spin_lock(&tr->max_lock);
+ if (tr->cond_snapshot)
+ ret = -EBUSY;
+ arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
+ if (ret)
+ return ret;
+
+ switch (val) {
+ case 0:
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
+ return -EINVAL;
+ if (tr->allocated_snapshot)
+ free_snapshot(tr);
+ break;
+ case 1:
+/* Only allow per-cpu swap if the ring buffer supports it */
+#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
+ return -EINVAL;
+#endif
+ if (tr->allocated_snapshot)
+ ret = resize_buffer_duplicate_size(&tr->snapshot_buffer,
+ &tr->array_buffer, iter->cpu_file);
+
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
+ return ret;
+
+ /* Now, we're going to swap */
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
+ local_irq_disable();
+ update_max_tr(tr, current, smp_processor_id(), NULL);
+ local_irq_enable();
+ } else {
+ smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
+ (void *)tr, 1);
+ }
+ tracing_disarm_snapshot(tr);
+ break;
+ default:
+ if (tr->allocated_snapshot) {
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ tracing_reset_online_cpus(&tr->snapshot_buffer);
+ else
+ tracing_reset_cpu(&tr->snapshot_buffer, iter->cpu_file);
+ }
+ break;
+ }
+
+ if (ret >= 0) {
+ *ppos += cnt;
+ ret = cnt;
+ }
+
+ return ret;
+}
+
+static int tracing_snapshot_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+ int ret;
+
+ ret = tracing_release(inode, file);
+
+ if (file->f_mode & FMODE_READ)
+ return ret;
+
+ /* If write only, the seq_file is just a stub */
+ if (m)
+ kfree(m->private);
+ kfree(m);
+
+ return 0;
+}
+
+static int snapshot_raw_open(struct inode *inode, struct file *filp)
+{
+ struct ftrace_buffer_info *info;
+ int ret;
+
+ /* The following checks for tracefs lockdown */
+ ret = tracing_buffers_open(inode, filp);
+ if (ret < 0)
+ return ret;
+
+ info = filp->private_data;
+
+ if (tracer_uses_snapshot(info->iter.trace)) {
+ tracing_buffers_release(inode, filp);
+ return -EBUSY;
+ }
+
+ info->iter.snapshot = true;
+ info->iter.array_buffer = &info->iter.tr->snapshot_buffer;
+
+ return ret;
+}
+
+const struct file_operations snapshot_fops = {
+ .open = tracing_snapshot_open,
+ .read = seq_read,
+ .write = tracing_snapshot_write,
+ .llseek = tracing_lseek,
+ .release = tracing_snapshot_release,
+};
+
+const struct file_operations snapshot_raw_fops = {
+ .open = snapshot_raw_open,
+ .read = tracing_buffers_read,
+ .release = tracing_buffers_release,
+ .splice_read = tracing_buffers_splice_read,
+};
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+
+ return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos);
+}
+
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+
+ return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos);
+}
+
+static const struct file_operations tracing_max_lat_fops = {
+ .open = tracing_open_generic_tr,
+ .read = tracing_max_lat_read,
+ .write = tracing_max_lat_write,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
+int get_snapshot_map(struct trace_array *tr)
+{
+ int err = 0;
+
+ /*
+ * Called with mmap_lock held. lockdep would be unhappy if we would now
+ * take trace_types_lock. Instead use the specific
+ * snapshot_trigger_lock.
+ */
+ spin_lock(&tr->snapshot_trigger_lock);
+
+ if (tr->snapshot || tr->mapped == UINT_MAX)
+ err = -EBUSY;
+ else
+ tr->mapped++;
+
+ spin_unlock(&tr->snapshot_trigger_lock);
+
+ /* Wait for update_max_tr() to observe iter->tr->mapped */
+ if (tr->mapped == 1)
+ synchronize_rcu();
+
+ return err;
+
+}
+
+void put_snapshot_map(struct trace_array *tr)
+{
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (!WARN_ON(!tr->mapped))
+ tr->mapped--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static void
+ftrace_snapshot(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
+{
+ tracing_snapshot_instance(tr);
+}
+
+static void
+ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
+{
+ struct ftrace_func_mapper *mapper = data;
+ long *count = NULL;
+
+ if (mapper)
+ count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
+
+ if (count) {
+
+ if (*count <= 0)
+ return;
+
+ (*count)--;
+ }
+
+ tracing_snapshot_instance(tr);
+}
+
+static int
+ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ struct ftrace_func_mapper *mapper = data;
+ long *count = NULL;
+
+ seq_printf(m, "%ps:", (void *)ip);
+
+ seq_puts(m, "snapshot");
+
+ if (mapper)
+ count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
+
+ if (count)
+ seq_printf(m, ":count=%ld\n", *count);
+ else
+ seq_puts(m, ":unlimited\n");
+
+ return 0;
+}
+
+static int
+ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
+ unsigned long ip, void *init_data, void **data)
+{
+ struct ftrace_func_mapper *mapper = *data;
+
+ if (!mapper) {
+ mapper = allocate_ftrace_func_mapper();
+ if (!mapper)
+ return -ENOMEM;
+ *data = mapper;
+ }
+
+ return ftrace_func_mapper_add_ip(mapper, ip, init_data);
+}
+
+static void
+ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
+ unsigned long ip, void *data)
+{
+ struct ftrace_func_mapper *mapper = data;
+
+ if (!ip) {
+ if (!mapper)
+ return;
+ free_ftrace_func_mapper(mapper, NULL);
+ return;
+ }
+
+ ftrace_func_mapper_remove_ip(mapper, ip);
+}
+
+static struct ftrace_probe_ops snapshot_probe_ops = {
+ .func = ftrace_snapshot,
+ .print = ftrace_snapshot_print,
+};
+
+static struct ftrace_probe_ops snapshot_count_probe_ops = {
+ .func = ftrace_count_snapshot,
+ .print = ftrace_snapshot_print,
+ .init = ftrace_snapshot_init,
+ .free = ftrace_snapshot_free,
+};
+
+static int
+ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+ void *count = (void *)-1;
+ char *number;
+ int ret;
+
+ if (!tr)
+ return -ENODEV;
+
+ /* hash funcs only work with set_ftrace_filter */
+ if (!enable)
+ return -EINVAL;
+
+ ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
+
+ if (glob[0] == '!') {
+ ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
+ if (!ret)
+ tracing_disarm_snapshot(tr);
+
+ return ret;
+ }
+
+ if (!param)
+ goto out_reg;
+
+ number = strsep(&param, ":");
+
+ if (!strlen(number))
+ goto out_reg;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, (unsigned long *)&count);
+ if (ret)
+ return ret;
+
+ out_reg:
+ ret = tracing_arm_snapshot(tr);
+ if (ret < 0)
+ return ret;
+
+ ret = register_ftrace_function_probe(glob, tr, ops, count);
+ if (ret < 0)
+ tracing_disarm_snapshot(tr);
+
+ return ret < 0 ? ret : 0;
+}
+
+static struct ftrace_func_command ftrace_snapshot_cmd = {
+ .name = "snapshot",
+ .func = ftrace_trace_snapshot_callback,
+};
+
+__init int register_snapshot_cmd(void)
+{
+ return register_ftrace_command(&ftrace_snapshot_cmd);
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+int trace_allocate_snapshot(struct trace_array *tr, int size)
+{
+ int ret;
+
+ /* Fix mapped buffer trace arrays do not have snapshot buffers */
+ if (tr->range_addr_start)
+ return 0;
+
+ /* allocate_snapshot can only be true during system boot */
+ ret = allocate_trace_buffer(tr, &tr->snapshot_buffer,
+ allocate_snapshot ? size : 1);
+ if (ret < 0)
+ return -ENOMEM;
+
+ tr->allocated_snapshot = allocate_snapshot;
+
+ allocate_snapshot = false;
+ return 0;
+}
+
+__init static bool tr_needs_alloc_snapshot(const char *name)
+{
+ char *test;
+ int len = strlen(name);
+ bool ret;
+
+ if (!boot_snapshot_index)
+ return false;
+
+ if (strncmp(name, boot_snapshot_info, len) == 0 &&
+ boot_snapshot_info[len] == '\t')
+ return true;
+
+ test = kmalloc(strlen(name) + 3, GFP_KERNEL);
+ if (!test)
+ return false;
+
+ sprintf(test, "\t%s\t", name);
+ ret = strstr(boot_snapshot_info, test) == NULL;
+ kfree(test);
+ return ret;
+}
+
+__init void do_allocate_snapshot(const char *name)
+{
+ if (!tr_needs_alloc_snapshot(name))
+ return;
+
+ /*
+ * When allocate_snapshot is set, the next call to
+ * allocate_trace_buffers() (called by trace_array_get_by_name())
+ * will allocate the snapshot buffer. That will also clear
+ * this flag.
+ */
+ allocate_snapshot = true;
+}
+
+void __init ftrace_boot_snapshot(void)
+{
+ struct trace_array *tr;
+
+ if (!snapshot_at_boot)
+ return;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->allocated_snapshot)
+ continue;
+
+ tracing_snapshot_instance(tr);
+ trace_array_puts(tr, "** Boot snapshot taken **\n");
+ }
+}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 7f9572a37333..0aa2514a6593 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -32,7 +32,7 @@ static arch_spinlock_t stack_trace_max_lock =
DEFINE_PER_CPU(int, disable_stack_tracer);
static DEFINE_MUTEX(stack_sysctl_mutex);
-int stack_tracer_enabled;
+static int stack_tracer_enabled;
static void print_max_stack(void)
{
@@ -520,20 +520,18 @@ stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer,
int was_enabled;
int ret;
- mutex_lock(&stack_sysctl_mutex);
+ guard(mutex)(&stack_sysctl_mutex);
was_enabled = !!stack_tracer_enabled;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (was_enabled == !!stack_tracer_enabled))
- goto out;
+ return ret;
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
else
unregister_ftrace_function(&trace_ops);
- out:
- mutex_unlock(&stack_sysctl_mutex);
return ret;
}
@@ -544,7 +542,7 @@ static __init int enable_stacktrace(char *str)
int len;
if ((len = str_has_prefix(str, "_filter=")))
- strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
+ strscpy(stack_trace_filter_buf, str + len);
stack_tracer_enabled = 1;
return 1;
@@ -580,3 +578,23 @@ static __init int stack_trace_init(void)
}
device_initcall(stack_trace_init);
+
+
+static const struct ctl_table trace_stack_sysctl_table[] = {
+ {
+ .procname = "stack_tracer_enabled",
+ .data = &stack_tracer_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = stack_trace_sysctl,
+ },
+};
+
+static int __init init_trace_stack_sysctls(void)
+{
+ register_sysctl_init("kernel", trace_stack_sysctl_table);
+ return 0;
+}
+subsys_initcall(init_trace_stack_sysctls);
+
+
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index bb247beec447..856ece13b7dc 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -77,7 +77,7 @@ static int insert_stat(struct rb_root *root, void *stat, cmp_func_t cmp)
struct rb_node **new = &(root->rb_node), *parent = NULL;
struct stat_node *data;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc_obj(*data);
if (!data)
return -ENOMEM;
data->stat = stat;
@@ -128,7 +128,7 @@ static int stat_seq_init(struct stat_session *session)
int ret = 0;
int i;
- mutex_lock(&session->stat_mutex);
+ guard(mutex)(&session->stat_mutex);
__reset_stat_session(session);
if (!ts->stat_cmp)
@@ -136,11 +136,11 @@ static int stat_seq_init(struct stat_session *session)
stat = ts->stat_start(ts);
if (!stat)
- goto exit;
+ return 0;
ret = insert_stat(root, stat, ts->stat_cmp);
if (ret)
- goto exit;
+ return ret;
/*
* Iterate over the tracer stat entries and store them in an rbtree.
@@ -157,13 +157,10 @@ static int stat_seq_init(struct stat_session *session)
goto exit_free_rbtree;
}
-exit:
- mutex_unlock(&session->stat_mutex);
return ret;
exit_free_rbtree:
__reset_stat_session(session);
- mutex_unlock(&session->stat_mutex);
return ret;
}
@@ -308,7 +305,7 @@ static int init_stat_file(struct stat_session *session)
int register_stat_tracer(struct tracer_stat *trace)
{
struct stat_session *session, *node;
- int ret = -EINVAL;
+ int ret;
if (!trace)
return -EINVAL;
@@ -316,18 +313,18 @@ int register_stat_tracer(struct tracer_stat *trace)
if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
return -EINVAL;
+ guard(mutex)(&all_stat_sessions_mutex);
+
/* Already registered? */
- mutex_lock(&all_stat_sessions_mutex);
list_for_each_entry(node, &all_stat_sessions, session_list) {
if (node->ts == trace)
- goto out;
+ return -EINVAL;
}
- ret = -ENOMEM;
/* Init the session */
- session = kzalloc(sizeof(*session), GFP_KERNEL);
+ session = kzalloc_obj(*session);
if (!session)
- goto out;
+ return -ENOMEM;
session->ts = trace;
INIT_LIST_HEAD(&session->session_list);
@@ -336,16 +333,13 @@ int register_stat_tracer(struct tracer_stat *trace)
ret = init_stat_file(session);
if (ret) {
destroy_session(session);
- goto out;
+ return ret;
}
- ret = 0;
/* Register */
list_add_tail(&session->session_list, &all_stat_sessions);
- out:
- mutex_unlock(&all_stat_sessions_mutex);
- return ret;
+ return 0;
}
void unregister_stat_tracer(struct tracer_stat *trace)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 46aab0ab9350..8ad72e17d8eb 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <trace/syscall.h>
#include <trace/events/syscalls.h>
+#include <linux/kernel_stat.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
@@ -123,6 +124,118 @@ const char *get_syscall_name(int syscall)
return entry->name;
}
+/* Added to user strings or arrays when max limit is reached */
+#define EXTRA "..."
+
+static void get_dynamic_len_ptr(struct syscall_trace_enter *trace,
+ struct syscall_metadata *entry,
+ int *offset_p, int *len_p, unsigned char **ptr_p)
+{
+ unsigned char *ptr;
+ int offset = *offset_p;
+ int val;
+
+ /* This arg points to a user space string */
+ ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
+ val = *(int *)ptr;
+
+ /* The value is a dynamic string (len << 16 | offset) */
+ ptr = (void *)trace + (val & 0xffff);
+ *len_p = val >> 16;
+ offset += 4;
+
+ *ptr_p = ptr;
+ *offset_p = offset;
+}
+
+static enum print_line_t
+sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
+ struct trace_seq *s, struct trace_event *event)
+{
+ unsigned char *ptr;
+ int offset = 0;
+ int bits, len;
+ bool done = false;
+ static const struct trace_print_flags __flags[] =
+ {
+ { O_TMPFILE, "O_TMPFILE" },
+ { O_WRONLY, "O_WRONLY" },
+ { O_RDWR, "O_RDWR" },
+ { O_CREAT, "O_CREAT" },
+ { O_EXCL, "O_EXCL" },
+ { O_NOCTTY, "O_NOCTTY" },
+ { O_TRUNC, "O_TRUNC" },
+ { O_APPEND, "O_APPEND" },
+ { O_NONBLOCK, "O_NONBLOCK" },
+ { O_DSYNC, "O_DSYNC" },
+ { O_DIRECT, "O_DIRECT" },
+ { O_LARGEFILE, "O_LARGEFILE" },
+ { O_DIRECTORY, "O_DIRECTORY" },
+ { O_NOFOLLOW, "O_NOFOLLOW" },
+ { O_NOATIME, "O_NOATIME" },
+ { O_CLOEXEC, "O_CLOEXEC" },
+ };
+
+ trace_seq_printf(s, "%s(", entry->name);
+
+ for (int i = 0; !done && i < entry->nb_args; i++) {
+
+ if (trace_seq_has_overflowed(s))
+ goto end;
+
+ if (i)
+ trace_seq_puts(s, ", ");
+
+ switch (i) {
+ case 2:
+ bits = trace->args[2];
+
+ trace_seq_puts(s, "flags: ");
+
+ /* No need to show mode when not creating the file */
+ if (!(bits & (O_CREAT|O_TMPFILE)))
+ done = true;
+
+ if (!(bits & O_ACCMODE)) {
+ if (!bits) {
+ trace_seq_puts(s, "O_RDONLY");
+ continue;
+ }
+ trace_seq_puts(s, "O_RDONLY|");
+ }
+
+ trace_print_flags_seq(s, "|", bits, __flags, ARRAY_SIZE(__flags));
+ /*
+ * trace_print_flags_seq() adds a '\0' to the
+ * buffer, but this needs to append more to the seq.
+ */
+ if (!trace_seq_has_overflowed(s))
+ trace_seq_pop(s);
+
+ continue;
+ case 3:
+ trace_seq_printf(s, "%s: 0%03o", entry->args[i],
+ (unsigned int)trace->args[i]);
+ continue;
+ }
+
+ trace_seq_printf(s, "%s: %lu", entry->args[i],
+ trace->args[i]);
+
+ if (!(BIT(i) & entry->user_mask))
+ continue;
+
+ get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
+ trace_seq_printf(s, " \"%.*s\"", len, ptr);
+ }
+
+ trace_seq_putc(s, ')');
+end:
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event)
@@ -132,7 +245,9 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_entry *ent = iter->ent;
struct syscall_trace_enter *trace;
struct syscall_metadata *entry;
- int i, syscall;
+ int i, syscall, val, len;
+ unsigned char *ptr;
+ int offset = 0;
trace = (typeof(trace))ent;
syscall = trace->nr;
@@ -146,21 +261,80 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
goto end;
}
+ switch (entry->syscall_nr) {
+ case __NR_openat:
+ if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
+ return sys_enter_openat_print(trace, entry, s, event);
+ break;
+ default:
+ break;
+ }
+
trace_seq_printf(s, "%s(", entry->name);
for (i = 0; i < entry->nb_args; i++) {
+ bool printable = false;
+ char *str;
if (trace_seq_has_overflowed(s))
goto end;
+ if (i)
+ trace_seq_puts(s, ", ");
+
/* parameter types */
- if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
+ if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
trace_seq_printf(s, "%s ", entry->types[i]);
/* parameter values */
- trace_seq_printf(s, "%s: %lx%s", entry->args[i],
- trace->args[i],
- i == entry->nb_args - 1 ? "" : ", ");
+ if (trace->args[i] < 10)
+ trace_seq_printf(s, "%s: %lu", entry->args[i],
+ trace->args[i]);
+ else
+ trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
+ trace->args[i]);
+
+ if (!(BIT(i) & entry->user_mask))
+ continue;
+
+ get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
+
+ if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
+ trace_seq_printf(s, " \"%.*s\"", len, ptr);
+ continue;
+ }
+
+ val = trace->args[entry->user_arg_size];
+
+ str = ptr;
+ trace_seq_puts(s, " (");
+ for (int x = 0; x < len; x++, ptr++) {
+ if (isascii(*ptr) && isprint(*ptr))
+ printable = true;
+ if (x)
+ trace_seq_putc(s, ':');
+ trace_seq_printf(s, "%02x", *ptr);
+ }
+ if (len < val)
+ trace_seq_printf(s, ", %s", EXTRA);
+
+ trace_seq_putc(s, ')');
+
+ /* If nothing is printable, don't bother printing anything */
+ if (!printable)
+ continue;
+
+ trace_seq_puts(s, " \"");
+ for (int x = 0; x < len; x++) {
+ if (isascii(str[x]) && isprint(str[x]))
+ trace_seq_putc(s, str[x]);
+ else
+ trace_seq_putc(s, '.');
+ }
+ if (len < val)
+ trace_seq_printf(s, "\"%s", EXTRA);
+ else
+ trace_seq_putc(s, '"');
}
trace_seq_putc(s, ')');
@@ -206,26 +380,107 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
.size = sizeof(_type), .align = __alignof__(_type), \
.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }
+/* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+static int __init
+sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
+{
+ int pos = 0;
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\",");
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ " ((unsigned long)(REC->dfd)),");
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ " ((unsigned long)(REC->filename)),");
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ " __get_str(__filename_val),");
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ " (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", ");
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ " REC->flags ? __print_flags(REC->flags, \"|\", ");
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_RDWR\" }, ", O_RDWR);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_CREAT\" }, ", O_CREAT);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_EXCL\" }, ", O_EXCL);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_APPEND\" }, ", O_APPEND);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC);
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ " ((unsigned long)(REC->mode))");
+ return pos;
+}
+
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
+ bool is_string = entry->user_arg_is_str;
int i;
int pos = 0;
- /* When len=0, we just calculate the needed length */
-#define LEN_OR_ZERO (len ? len - pos : 0)
+ switch (entry->syscall_nr) {
+ case __NR_openat:
+ return sys_enter_openat_print_fmt(entry, buf, len);
+ default:
+ break;
+ }
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
for (i = 0; i < entry->nb_args; i++) {
- pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
- entry->args[i], sizeof(unsigned long),
- i == entry->nb_args - 1 ? "" : ", ");
+ if (i)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
+ entry->args[i], sizeof(unsigned long));
+
+ if (!(BIT(i) & entry->user_mask))
+ continue;
+
+ /* Add the format for the user space string or array */
+ if (entry->user_arg_size < 0 || is_string)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
}
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
for (i = 0; i < entry->nb_args; i++) {
pos += snprintf(buf + pos, LEN_OR_ZERO,
", ((unsigned long)(REC->%s))", entry->args[i]);
+ if (!(BIT(i) & entry->user_mask))
+ continue;
+ /* The user space data for arg has name __<arg>_val */
+ if (entry->user_arg_size < 0 || is_string) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
+ entry->args[i]);
+ } else {
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
+ entry->args[i]);
+ }
}
#undef LEN_OR_ZERO
@@ -271,8 +526,11 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
+ unsigned long mask;
+ char *arg;
int offset = offsetof(typeof(trace), args);
int ret = 0;
+ int len;
int i;
for (i = 0; i < meta->nb_args; i++) {
@@ -285,9 +543,320 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
offset += sizeof(unsigned long);
}
+ if (ret || !meta->user_mask)
+ return ret;
+
+ mask = meta->user_mask;
+
+ while (mask) {
+ int idx = ffs(mask) - 1;
+ mask &= ~BIT(idx);
+
+ /*
+ * User space data is faulted into a temporary buffer and then
+ * added as a dynamic string or array to the end of the event.
+ * The user space data name for the arg pointer is
+ * "__<arg>_val".
+ */
+ len = strlen(meta->args[idx]) + sizeof("___val");
+ arg = kmalloc(len, GFP_KERNEL);
+ if (WARN_ON_ONCE(!arg)) {
+ meta->user_mask = 0;
+ return -ENOMEM;
+ }
+
+ snprintf(arg, len, "__%s_val", meta->args[idx]);
+
+ ret = trace_define_field(call, "__data_loc char[]",
+ arg, offset, sizeof(int), 0,
+ FILTER_OTHER);
+ if (ret) {
+ kfree(arg);
+ break;
+ }
+ offset += 4;
+ }
return ret;
}
+/*
+ * Create a per CPU temporary buffer to copy user space pointers into.
+ *
+ * SYSCALL_FAULT_USER_MAX is the amount to copy from user space.
+ * (defined in kernel/trace/trace.h)
+
+ * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the
+ * nul terminating byte and possibly appended EXTRA (4 bytes).
+ *
+ * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
+ * to copy memory from user space addresses into that will hold
+ * 3 args as only 3 args are allowed to be copied from system calls.
+ */
+#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4)
+#define SYSCALL_FAULT_MAX_CNT 3
+#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)
+
+/* Use the tracing per CPU buffer infrastructure to copy from user space */
+struct syscall_user_buffer {
+ struct trace_user_buf_info buf;
+ struct rcu_head rcu;
+};
+
+static struct syscall_user_buffer *syscall_buffer;
+
+static int syscall_fault_buffer_enable(void)
+{
+ struct syscall_user_buffer *sbuf;
+ int ret;
+
+ lockdep_assert_held(&syscall_trace_lock);
+
+ if (syscall_buffer) {
+ trace_user_fault_get(&syscall_buffer->buf);
+ return 0;
+ }
+
+ sbuf = kmalloc_obj(*sbuf);
+ if (!sbuf)
+ return -ENOMEM;
+
+ ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ);
+ if (ret < 0) {
+ kfree(sbuf);
+ return ret;
+ }
+
+ WRITE_ONCE(syscall_buffer, sbuf);
+
+ return 0;
+}
+
+static void rcu_free_syscall_buffer(struct rcu_head *rcu)
+{
+ struct syscall_user_buffer *sbuf =
+ container_of(rcu, struct syscall_user_buffer, rcu);
+
+ trace_user_fault_destroy(&sbuf->buf);
+ kfree(sbuf);
+}
+
+
+static void syscall_fault_buffer_disable(void)
+{
+ struct syscall_user_buffer *sbuf = syscall_buffer;
+
+ lockdep_assert_held(&syscall_trace_lock);
+
+ if (trace_user_fault_put(&sbuf->buf))
+ return;
+
+ WRITE_ONCE(syscall_buffer, NULL);
+ call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer);
+}
+
+struct syscall_args {
+ char *ptr_array[SYSCALL_FAULT_MAX_CNT];
+ int read[SYSCALL_FAULT_MAX_CNT];
+ int uargs;
+};
+
+static int syscall_copy_user(char *buf, const char __user *ptr,
+ size_t size, void *data)
+{
+ struct syscall_args *args = data;
+ int ret;
+
+ for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
+ ptr = (char __user *)args->ptr_array[i];
+ ret = strncpy_from_user(buf, ptr, size);
+ args->read[i] = ret;
+ }
+ return 0;
+}
+
+static int syscall_copy_user_array(char *buf, const char __user *ptr,
+ size_t size, void *data)
+{
+ struct syscall_args *args = data;
+ int ret;
+
+ for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
+ ptr = (char __user *)args->ptr_array[i];
+ ret = __copy_from_user(buf, ptr, size);
+ args->read[i] = ret ? -1 : size;
+ }
+ return 0;
+}
+
+static char *sys_fault_user(unsigned int buf_size,
+ struct syscall_metadata *sys_data,
+ struct syscall_user_buffer *sbuf,
+ unsigned long *args,
+ unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
+{
+ trace_user_buf_copy syscall_copy = syscall_copy_user;
+ unsigned long mask = sys_data->user_mask;
+ unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
+ struct syscall_args sargs;
+ bool array = false;
+ char *buffer;
+ char *buf;
+ int ret;
+ int i = 0;
+
+ /* The extra is appended to the user data in the buffer */
+ BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
+ SYSCALL_FAULT_ARG_SZ);
+
+ /*
+ * If this system call event has a size argument, use
+ * it to define how much of user space memory to read,
+ * and read it as an array and not a string.
+ */
+ if (sys_data->user_arg_size >= 0) {
+ array = true;
+ size = args[sys_data->user_arg_size];
+ if (size > SYSCALL_FAULT_ARG_SZ - 1)
+ size = SYSCALL_FAULT_ARG_SZ - 1;
+ syscall_copy = syscall_copy_user_array;
+ }
+
+ while (mask) {
+ int idx = ffs(mask) - 1;
+ mask &= ~BIT(idx);
+
+ if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
+ break;
+
+ /* Get the pointer to user space memory to read */
+ sargs.ptr_array[i++] = (char *)args[idx];
+ }
+
+ sargs.uargs = i;
+
+ /* Clear the values that are not used */
+ for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
+ data_size[i] = -1; /* Denotes no pointer */
+ }
+
+ /* A zero size means do not even try */
+ if (!buf_size)
+ return NULL;
+
+ buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
+ syscall_copy, &sargs);
+ if (!buffer)
+ return NULL;
+
+ buf = buffer;
+ for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
+
+ ret = sargs.read[i];
+ if (ret < 0)
+ continue;
+ buf[ret] = '\0';
+
+ /* For strings, replace any non-printable characters with '.' */
+ if (!array) {
+ for (int x = 0; x < ret; x++) {
+ if (!isprint(buf[x]))
+ buf[x] = '.';
+ }
+
+ size = min(buf_size, SYSCALL_FAULT_USER_MAX);
+
+ /*
+ * If the text was truncated due to our max limit,
+ * add "..." to the string.
+ */
+ if (ret > size) {
+ strscpy(buf + size, EXTRA, sizeof(EXTRA));
+ ret = size + sizeof(EXTRA);
+ } else {
+ buf[ret++] = '\0';
+ }
+ } else {
+ ret = min((unsigned int)ret, buf_size);
+ }
+ data_size[i] = ret;
+ }
+
+ return buffer;
+}
+
+static int
+syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
+ char **buffer, int *size, int *user_sizes, int *uargs,
+ int buf_size)
+{
+ struct syscall_user_buffer *sbuf;
+ int i;
+
+ /* If the syscall_buffer is NULL, tracing is being shutdown */
+ sbuf = READ_ONCE(syscall_buffer);
+ if (!sbuf)
+ return -1;
+
+ *buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);
+ /*
+ * user_size is the amount of data to append.
+ * Need to add 4 for the meta field that points to
+ * the user memory at the end of the event and also
+ * stores its size.
+ */
+ for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) {
+ if (user_sizes[i] < 0)
+ break;
+ *size += user_sizes[i] + 4;
+ }
+ /* Save the number of user read arguments of this syscall */
+ *uargs = i;
+ return 0;
+}
+
+static void syscall_put_data(struct syscall_metadata *sys_data,
+ struct syscall_trace_enter *entry,
+ char *buffer, int size, int *user_sizes, int uargs)
+{
+ char *buf = buffer;
+ void *ptr;
+ int val;
+
+ /*
+ * Set the pointer to point to the meta data of the event
+ * that has information about the stored user space memory.
+ */
+ ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
+
+ /*
+ * The meta data will store the offset of the user data from
+ * the beginning of the event. That is after the static arguments
+ * and the meta data fields.
+ */
+ val = (ptr - (void *)entry) + 4 * uargs;
+
+ for (int i = 0; i < uargs; i++) {
+
+ if (i)
+ val += user_sizes[i - 1];
+
+ /* Store the offset and the size into the meta data */
+ *(int *)ptr = val | (user_sizes[i] << 16);
+
+ /* Skip the meta data */
+ ptr += 4;
+ }
+
+ for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
+ /* Nothing to do if the user space was empty or faulted */
+ if (!user_sizes[i])
+ continue;
+
+ memcpy(ptr, buf, user_sizes[i]);
+ ptr += user_sizes[i];
+ }
+}
+
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
struct trace_array *tr = data;
@@ -296,22 +865,24 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct trace_event_buffer fbuffer;
unsigned long args[6];
+ char *user_ptr;
+ int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
int syscall_nr;
- int size;
+ int size = 0;
+ int uargs = 0;
+ bool mayfault;
/*
* Syscall probe called with preemption enabled, but the ring
* buffer and per-cpu data require preemption to be disabled.
*/
might_fault();
- guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
- /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
- trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
+ trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
if (!trace_file)
return;
@@ -322,7 +893,20 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
if (!sys_data)
return;
- size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
+ /* Check if this syscall event faults in user space memory */
+ mayfault = sys_data->user_mask != 0;
+
+ guard(preempt_notrace)();
+
+ syscall_get_arguments(current, regs, args);
+
+ if (mayfault) {
+ if (syscall_get_data(sys_data, args, &user_ptr,
+ &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
+ return;
+ }
+
+ size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
if (!entry)
@@ -330,9 +914,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
entry = ring_buffer_event_data(fbuffer.event);
entry->nr = syscall_nr;
- syscall_get_arguments(current, regs, args);
+
memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
+ if (mayfault)
+ syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);
+
trace_event_buffer_commit(&fbuffer);
}
@@ -356,8 +943,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
- /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
- trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
+ trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
if (!trace_file)
return;
@@ -382,39 +968,50 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
static int reg_event_syscall_enter(struct trace_event_file *file,
struct trace_event_call *call)
{
+ struct syscall_metadata *sys_data = call->data;
struct trace_array *tr = file->tr;
int ret = 0;
int num;
- num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ num = sys_data->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
- mutex_lock(&syscall_trace_lock);
- if (!tr->sys_refcount_enter)
+ guard(mutex)(&syscall_trace_lock);
+ if (sys_data->user_mask) {
+ ret = syscall_fault_buffer_enable();
+ if (ret < 0)
+ return ret;
+ }
+ if (!tr->sys_refcount_enter) {
ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
- if (!ret) {
- rcu_assign_pointer(tr->enter_syscall_files[num], file);
- tr->sys_refcount_enter++;
+ if (ret < 0) {
+ if (sys_data->user_mask)
+ syscall_fault_buffer_disable();
+ return ret;
+ }
}
- mutex_unlock(&syscall_trace_lock);
- return ret;
+ WRITE_ONCE(tr->enter_syscall_files[num], file);
+ tr->sys_refcount_enter++;
+ return 0;
}
static void unreg_event_syscall_enter(struct trace_event_file *file,
struct trace_event_call *call)
{
+ struct syscall_metadata *sys_data = call->data;
struct trace_array *tr = file->tr;
int num;
- num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ num = sys_data->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
- mutex_lock(&syscall_trace_lock);
+ guard(mutex)(&syscall_trace_lock);
tr->sys_refcount_enter--;
- RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);
+ WRITE_ONCE(tr->enter_syscall_files[num], NULL);
if (!tr->sys_refcount_enter)
unregister_trace_sys_enter(ftrace_syscall_enter, tr);
- mutex_unlock(&syscall_trace_lock);
+ if (sys_data->user_mask)
+ syscall_fault_buffer_disable();
}
static int reg_event_syscall_exit(struct trace_event_file *file,
@@ -431,7 +1028,7 @@ static int reg_event_syscall_exit(struct trace_event_file *file,
if (!tr->sys_refcount_exit)
ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
if (!ret) {
- rcu_assign_pointer(tr->exit_syscall_files[num], file);
+ WRITE_ONCE(tr->exit_syscall_files[num], file);
tr->sys_refcount_exit++;
}
mutex_unlock(&syscall_trace_lock);
@@ -449,12 +1046,221 @@ static void unreg_event_syscall_exit(struct trace_event_file *file,
return;
mutex_lock(&syscall_trace_lock);
tr->sys_refcount_exit--;
- RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL);
+ WRITE_ONCE(tr->exit_syscall_files[num], NULL);
if (!tr->sys_refcount_exit)
unregister_trace_sys_exit(ftrace_syscall_exit, tr);
mutex_unlock(&syscall_trace_lock);
}
+/*
+ * For system calls that reference user space memory that can
+ * be recorded into the event, set the system call meta data's user_mask
+ * to the "args" index that points to the user space memory to retrieve.
+ */
+static void check_faultable_syscall(struct trace_event_call *call, int nr)
+{
+ struct syscall_metadata *sys_data = call->data;
+ unsigned long mask;
+
+ /* Only work on entry */
+ if (sys_data->enter_event != call)
+ return;
+
+ sys_data->user_arg_size = -1;
+
+ switch (nr) {
+ /* user arg 1 with size arg at 2 */
+ case __NR_write:
+#ifdef __NR_mq_timedsend
+ case __NR_mq_timedsend:
+#endif
+ case __NR_pwrite64:
+ sys_data->user_mask = BIT(1);
+ sys_data->user_arg_size = 2;
+ break;
+ /* user arg 0 with size arg at 1 as string */
+ case __NR_setdomainname:
+ case __NR_sethostname:
+ sys_data->user_mask = BIT(0);
+ sys_data->user_arg_size = 1;
+ sys_data->user_arg_is_str = 1;
+ break;
+#ifdef __NR_kexec_file_load
+ /* user arg 4 with size arg at 3 as string */
+ case __NR_kexec_file_load:
+ sys_data->user_mask = BIT(4);
+ sys_data->user_arg_size = 3;
+ sys_data->user_arg_is_str = 1;
+ break;
+#endif
+ /* user arg at position 0 */
+#ifdef __NR_access
+ case __NR_access:
+#endif
+ case __NR_acct:
+ case __NR_chdir:
+#ifdef __NR_chown
+ case __NR_chown:
+#endif
+#ifdef __NR_chmod
+ case __NR_chmod:
+#endif
+ case __NR_chroot:
+#ifdef __NR_creat
+ case __NR_creat:
+#endif
+ case __NR_delete_module:
+ case __NR_execve:
+ case __NR_fsopen:
+#ifdef __NR_lchown
+ case __NR_lchown:
+#endif
+#ifdef __NR_open
+ case __NR_open:
+#endif
+ case __NR_memfd_create:
+#ifdef __NR_mkdir
+ case __NR_mkdir:
+#endif
+#ifdef __NR_mknod
+ case __NR_mknod:
+#endif
+ case __NR_mq_open:
+ case __NR_mq_unlink:
+#ifdef __NR_readlink
+ case __NR_readlink:
+#endif
+#ifdef __NR_rmdir
+ case __NR_rmdir:
+#endif
+ case __NR_shmdt:
+#ifdef __NR_statfs
+ case __NR_statfs:
+#endif
+ case __NR_swapon:
+ case __NR_swapoff:
+#ifdef __NR_truncate
+ case __NR_truncate:
+#endif
+#ifdef __NR_unlink
+ case __NR_unlink:
+#endif
+ case __NR_umount2:
+#ifdef __NR_utime
+ case __NR_utime:
+#endif
+#ifdef __NR_utimes
+ case __NR_utimes:
+#endif
+ sys_data->user_mask = BIT(0);
+ break;
+ /* user arg at position 1 */
+ case __NR_execveat:
+ case __NR_faccessat:
+ case __NR_faccessat2:
+ case __NR_finit_module:
+ case __NR_fchmodat:
+ case __NR_fchmodat2:
+ case __NR_fchownat:
+ case __NR_fgetxattr:
+ case __NR_flistxattr:
+ case __NR_fsetxattr:
+ case __NR_fspick:
+ case __NR_fremovexattr:
+#ifdef __NR_futimesat
+ case __NR_futimesat:
+#endif
+ case __NR_inotify_add_watch:
+ case __NR_mkdirat:
+ case __NR_mknodat:
+ case __NR_mount_setattr:
+ case __NR_name_to_handle_at:
+#ifdef __NR_newfstatat
+ case __NR_newfstatat:
+#endif
+ case __NR_openat:
+ case __NR_openat2:
+ case __NR_open_tree:
+ case __NR_open_tree_attr:
+ case __NR_readlinkat:
+ case __NR_quotactl:
+ case __NR_syslog:
+ case __NR_statx:
+ case __NR_unlinkat:
+#ifdef __NR_utimensat
+ case __NR_utimensat:
+#endif
+ sys_data->user_mask = BIT(1);
+ break;
+ /* user arg at position 2 */
+ case __NR_init_module:
+ case __NR_fsconfig:
+ sys_data->user_mask = BIT(2);
+ break;
+ /* user arg at position 4 */
+ case __NR_fanotify_mark:
+ sys_data->user_mask = BIT(4);
+ break;
+ /* 2 user args, 0 and 1 */
+ case __NR_add_key:
+ case __NR_getxattr:
+ case __NR_lgetxattr:
+ case __NR_lremovexattr:
+#ifdef __NR_link
+ case __NR_link:
+#endif
+ case __NR_listxattr:
+ case __NR_llistxattr:
+ case __NR_lsetxattr:
+ case __NR_pivot_root:
+ case __NR_removexattr:
+#ifdef __NR_rename
+ case __NR_rename:
+#endif
+ case __NR_request_key:
+ case __NR_setxattr:
+#ifdef __NR_symlink
+ case __NR_symlink:
+#endif
+ sys_data->user_mask = BIT(0) | BIT(1);
+ break;
+ /* 2 user args, 0 and 2 */
+ case __NR_symlinkat:
+ sys_data->user_mask = BIT(0) | BIT(2);
+ break;
+ /* 2 user args, 1 and 3 */
+ case __NR_getxattrat:
+ case __NR_linkat:
+ case __NR_listxattrat:
+ case __NR_move_mount:
+#ifdef __NR_renameat
+ case __NR_renameat:
+#endif
+ case __NR_renameat2:
+ case __NR_removexattrat:
+ case __NR_setxattrat:
+ sys_data->user_mask = BIT(1) | BIT(3);
+ break;
+ case __NR_mount: /* Just dev_name and dir_name, TODO add type */
+ sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
+ break;
+ default:
+ sys_data->user_mask = 0;
+ return;
+ }
+
+ if (sys_data->user_arg_size < 0)
+ return;
+
+ /*
+ * The user_arg_size can only be used when the system call
+ * is reading only a single address from user space.
+ */
+ mask = sys_data->user_mask;
+ if (WARN_ON(mask & (mask - 1)))
+ sys_data->user_arg_size = -1;
+}
+
static int __init init_syscall_trace(struct trace_event_call *call)
{
int id;
@@ -467,6 +1273,8 @@ static int __init init_syscall_trace(struct trace_event_call *call)
return -ENOSYS;
}
+ check_faultable_syscall(call, num);
+
if (set_syscall_print_fmt(call) < 0)
return -ENOMEM;
@@ -528,9 +1336,8 @@ void __init init_ftrace_syscalls(void)
void *ret;
if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
- syscalls_metadata = kcalloc(NR_syscalls,
- sizeof(*syscalls_metadata),
- GFP_KERNEL);
+ syscalls_metadata = kzalloc_objs(*syscalls_metadata,
+ NR_syscalls);
if (!syscalls_metadata) {
WARN_ON(1);
return;
@@ -594,9 +1401,14 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
struct hlist_head *head;
unsigned long args[6];
bool valid_prog_array;
+ bool mayfault;
+ char *user_ptr;
+ int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
+ int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
int syscall_nr;
int rctx;
- int size;
+ int size = 0;
+ int uargs = 0;
/*
* Syscall probe called with preemption enabled, but the ring
@@ -615,13 +1427,24 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!sys_data)
return;
+ syscall_get_arguments(current, regs, args);
+
+ /* Check if this syscall event faults in user space memory */
+ mayfault = sys_data->user_mask != 0;
+
+ if (mayfault) {
+ if (syscall_get_data(sys_data, args, &user_ptr,
+ &size, user_sizes, &uargs, buf_size) < 0)
+ return;
+ }
+
head = this_cpu_ptr(sys_data->enter_event->perf_events);
valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
if (!valid_prog_array && hlist_empty(head))
return;
/* get the size after alignment with the u32 buffer size field */
- size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
+ size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -630,9 +1453,11 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
rec->nr = syscall_nr;
- syscall_get_arguments(current, regs, args);
memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
+ if (mayfault)
+ syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);
+
if ((valid_prog_array &&
!perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
hlist_empty(head)) {
@@ -647,36 +1472,46 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
static int perf_sysenter_enable(struct trace_event_call *call)
{
- int ret = 0;
+ struct syscall_metadata *sys_data = call->data;
int num;
+ int ret;
- num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ num = sys_data->syscall_nr;
- mutex_lock(&syscall_trace_lock);
- if (!sys_perf_refcount_enter)
+ guard(mutex)(&syscall_trace_lock);
+ if (sys_data->user_mask) {
+ ret = syscall_fault_buffer_enable();
+ if (ret < 0)
+ return ret;
+ }
+ if (!sys_perf_refcount_enter) {
ret = register_trace_sys_enter(perf_syscall_enter, NULL);
- if (ret) {
- pr_info("event trace: Could not activate syscall entry trace point");
- } else {
- set_bit(num, enabled_perf_enter_syscalls);
- sys_perf_refcount_enter++;
+ if (ret) {
+ pr_info("event trace: Could not activate syscall entry trace point");
+ if (sys_data->user_mask)
+ syscall_fault_buffer_disable();
+ return ret;
+ }
}
- mutex_unlock(&syscall_trace_lock);
- return ret;
+ set_bit(num, enabled_perf_enter_syscalls);
+ sys_perf_refcount_enter++;
+ return 0;
}
static void perf_sysenter_disable(struct trace_event_call *call)
{
+ struct syscall_metadata *sys_data = call->data;
int num;
- num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ num = sys_data->syscall_nr;
- mutex_lock(&syscall_trace_lock);
+ guard(mutex)(&syscall_trace_lock);
sys_perf_refcount_enter--;
clear_bit(num, enabled_perf_enter_syscalls);
if (!sys_perf_refcount_enter)
unregister_trace_sys_enter(perf_syscall_enter, NULL);
- mutex_unlock(&syscall_trace_lock);
+ if (sys_data->user_mask)
+ syscall_fault_buffer_disable();
}
static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
@@ -753,22 +1588,21 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
static int perf_sysexit_enable(struct trace_event_call *call)
{
- int ret = 0;
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- mutex_lock(&syscall_trace_lock);
- if (!sys_perf_refcount_exit)
- ret = register_trace_sys_exit(perf_syscall_exit, NULL);
- if (ret) {
- pr_info("event trace: Could not activate syscall exit trace point");
- } else {
- set_bit(num, enabled_perf_exit_syscalls);
- sys_perf_refcount_exit++;
+ guard(mutex)(&syscall_trace_lock);
+ if (!sys_perf_refcount_exit) {
+ int ret = register_trace_sys_exit(perf_syscall_exit, NULL);
+ if (ret) {
+ pr_info("event trace: Could not activate syscall exit trace point");
+ return ret;
+ }
}
- mutex_unlock(&syscall_trace_lock);
- return ret;
+ set_bit(num, enabled_perf_exit_syscalls);
+ sys_perf_refcount_exit++;
+ return 0;
}
static void perf_sysexit_disable(struct trace_event_call *call)
@@ -777,12 +1611,11 @@ static void perf_sysexit_disable(struct trace_event_call *call)
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- mutex_lock(&syscall_trace_lock);
+ guard(mutex)(&syscall_trace_lock);
sys_perf_refcount_exit--;
clear_bit(num, enabled_perf_exit_syscalls);
if (!sys_perf_refcount_exit)
unregister_trace_sys_exit(perf_syscall_exit, NULL);
- mutex_unlock(&syscall_trace_lock);
}
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4875e7f5de3d..2cabf8a23ec5 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -8,17 +8,19 @@
#define pr_fmt(fmt) "trace_uprobe: " fmt
#include <linux/bpf-cgroup.h>
-#include <linux/security.h>
+#include <linux/cleanup.h>
#include <linux/ctype.h>
+#include <linux/filter.h>
#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <linux/uprobes.h>
#include <linux/namei.h>
-#include <linux/string.h>
-#include <linux/rculist.h>
-#include <linux/filter.h>
#include <linux/percpu.h>
+#include <linux/rculist.h>
+#include <linux/security.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/uprobes.h>
+#include "trace.h"
#include "trace_dynevent.h"
#include "trace_probe.h"
#include "trace_probe_tmpl.h"
@@ -336,7 +338,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
struct trace_uprobe *tu;
int ret;
- tu = kzalloc(struct_size(tu, tp.args, nargs), GFP_KERNEL);
+ tu = kzalloc_flex(*tu, tp.args, nargs);
if (!tu)
return ERR_PTR(-ENOMEM);
@@ -498,11 +500,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
struct trace_uprobe *old_tu;
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
ret = validate_ref_ctr_offset(tu);
if (ret)
- goto end;
+ return ret;
/* register as an event */
old_tu = find_probe_event(trace_probe_name(&tu->tp),
@@ -511,11 +513,9 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
if (is_ret_probe(tu) != is_ret_probe(old_tu)) {
trace_probe_log_set_index(0);
trace_probe_log_err(0, DIFF_PROBE_TYPE);
- ret = -EEXIST;
- } else {
- ret = append_trace_uprobe(tu, old_tu);
+ return -EEXIST;
}
- goto end;
+ return append_trace_uprobe(tu, old_tu);
}
ret = register_uprobe_event(tu);
@@ -525,31 +525,33 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
trace_probe_log_err(0, EVENT_EXIST);
} else
pr_warn("Failed to register probe event(%d)\n", ret);
- goto end;
+ return ret;
}
dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));
-end:
- mutex_unlock(&event_mutex);
-
return ret;
}
+DEFINE_FREE(free_trace_uprobe, struct trace_uprobe *, if (_T) free_trace_uprobe(_T))
+
/*
* Argument syntax:
* - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS]
*/
static int __trace_uprobe_create(int argc, const char **argv)
{
- struct trace_uprobe *tu;
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
+ struct trace_uprobe *tu __free(free_trace_uprobe) = NULL;
+ const char *trlog __free(trace_probe_log_clear) = NULL;
const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
- char *arg, *filename, *rctr, *rctr_end, *tmp;
- char buf[MAX_EVENT_NAME_LEN];
- char gbuf[MAX_EVENT_NAME_LEN];
- enum probe_print_type ptype;
- struct path path;
+ struct path path __free(path_put) = {};
unsigned long offset, ref_ctr_offset;
+ char *filename __free(kfree) = NULL;
+ char *arg, *rctr, *rctr_end, *tmp;
+ char *gbuf __free(kfree) = NULL;
+ char *buf __free(kfree) = NULL;
+ enum probe_print_type ptype;
bool is_return = false;
int i, ret;
@@ -567,8 +569,14 @@ static int __trace_uprobe_create(int argc, const char **argv)
if (argc < 2)
return -ECANCELED;
- if (argc - 2 > MAX_TRACE_ARGS)
+
+ trlog = trace_probe_log_init("trace_uprobe", argc, argv);
+
+ if (argc - 2 > MAX_TRACE_ARGS) {
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_ARGS);
return -E2BIG;
+ }
if (argv[0][1] == ':')
event = &argv[0][2];
@@ -582,26 +590,20 @@ static int __trace_uprobe_create(int argc, const char **argv)
/* Find the last occurrence, in case the path contains ':' too. */
arg = strrchr(filename, ':');
- if (!arg || !isdigit(arg[1])) {
- kfree(filename);
+ if (!arg || !isdigit(arg[1]))
return -ECANCELED;
- }
- trace_probe_log_init("trace_uprobe", argc, argv);
trace_probe_log_set_index(1); /* filename is the 2nd argument */
*arg++ = '\0';
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret) {
trace_probe_log_err(0, FILE_NOT_FOUND);
- kfree(filename);
- trace_probe_log_clear();
return ret;
}
if (!d_is_reg(path.dentry)) {
trace_probe_log_err(0, NO_REGULAR_FILE);
- ret = -EINVAL;
- goto fail_address_parse;
+ return -EINVAL;
}
/* Parse reference counter offset if specified. */
@@ -609,16 +611,14 @@ static int __trace_uprobe_create(int argc, const char **argv)
if (rctr) {
rctr_end = strchr(rctr, ')');
if (!rctr_end) {
- ret = -EINVAL;
rctr_end = rctr + strlen(rctr);
trace_probe_log_err(rctr_end - filename,
REFCNT_OPEN_BRACE);
- goto fail_address_parse;
+ return -EINVAL;
} else if (rctr_end[1] != '\0') {
- ret = -EINVAL;
trace_probe_log_err(rctr_end + 1 - filename,
BAD_REFCNT_SUFFIX);
- goto fail_address_parse;
+ return -EINVAL;
}
*rctr++ = '\0';
@@ -626,7 +626,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
ret = kstrtoul(rctr, 0, &ref_ctr_offset);
if (ret) {
trace_probe_log_err(rctr - filename, BAD_REFCNT);
- goto fail_address_parse;
+ return ret;
}
}
@@ -638,8 +638,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
is_return = true;
} else {
trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX);
- ret = -EINVAL;
- goto fail_address_parse;
+ return -EINVAL;
}
}
@@ -647,16 +646,20 @@ static int __trace_uprobe_create(int argc, const char **argv)
ret = kstrtoul(arg, 0, &offset);
if (ret) {
trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS);
- goto fail_address_parse;
+ return ret;
}
/* setup a probe */
trace_probe_log_set_index(0);
if (event) {
+ gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!gbuf)
+ return -ENOMEM;
+
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
- goto fail_address_parse;
+ return ret;
}
if (!event) {
@@ -664,15 +667,16 @@ static int __trace_uprobe_create(int argc, const char **argv)
char *ptr;
tail = kstrdup(kbasename(filename), GFP_KERNEL);
- if (!tail) {
- ret = -ENOMEM;
- goto fail_address_parse;
- }
+ if (!tail)
+ return -ENOMEM;
ptr = strpbrk(tail, ".-_");
if (ptr)
*ptr = '\0';
+ buf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
event = buf;
kfree(tail);
@@ -686,45 +690,36 @@ static int __trace_uprobe_create(int argc, const char **argv)
ret = PTR_ERR(tu);
/* This must return -ENOMEM otherwise there is a bug */
WARN_ON_ONCE(ret != -ENOMEM);
- goto fail_address_parse;
+ return ret;
}
tu->offset = offset;
tu->ref_ctr_offset = ref_ctr_offset;
tu->path = path;
- tu->filename = filename;
+ /* Clear @path so that it will not freed by path_put() */
+ memset(&path, 0, sizeof(path));
+ tu->filename = no_free_ptr(filename);
+
+ ctx = kzalloc_obj(*ctx);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER;
/* parse arguments */
for (i = 0; i < argc; i++) {
- struct traceprobe_parse_context ctx = {
- .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER,
- };
-
trace_probe_log_set_index(i + 2);
- ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx);
- traceprobe_finish_parse(&ctx);
+ ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx);
if (ret)
- goto error;
+ return ret;
}
ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
ret = traceprobe_set_print_fmt(&tu->tp, ptype);
if (ret < 0)
- goto error;
+ return ret;
ret = register_trace_uprobe(tu);
if (!ret)
- goto out;
-
-error:
- free_trace_uprobe(tu);
-out:
- trace_probe_log_clear();
- return ret;
-
-fail_address_parse:
- trace_probe_log_clear();
- path_put(&path);
- kfree(filename);
+ tu = NULL;
return ret;
}
@@ -741,7 +736,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command)
if (raw_command[0] == '-')
return dyn_event_release(raw_command, &trace_uprobe_ops);
- ret = trace_uprobe_create(raw_command);
+ ret = dyn_event_create(raw_command, &trace_uprobe_ops);
return ret == -ECANCELED ? -EINVAL : ret;
}
@@ -1489,7 +1484,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
: BPF_FD_TYPE_UPROBE;
*filename = tu->filename;
*probe_offset = tu->offset;
- *probe_addr = 0;
+ *probe_addr = tu->ref_ctr_offset;
return 0;
}
#endif /* CONFIG_PERF_EVENTS */
@@ -1534,6 +1529,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
struct uprobe_cpu_buffer *ucb = NULL;
+ unsigned int flags;
int ret = 0;
tu = container_of(con, struct trace_uprobe, consumer);
@@ -1548,11 +1544,12 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
- if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
+ flags = trace_probe_load_flag(&tu->tp);
+ if (flags & TP_FLAG_TRACE)
ret |= uprobe_trace_func(tu, regs, &ucb);
#ifdef CONFIG_PERF_EVENTS
- if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
+ if (flags & TP_FLAG_PROFILE)
ret |= uprobe_perf_func(tu, regs, &ucb);
#endif
uprobe_buffer_put(ucb);
@@ -1566,6 +1563,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
struct uprobe_cpu_buffer *ucb = NULL;
+ unsigned int flags;
tu = container_of(con, struct trace_uprobe, consumer);
@@ -1577,11 +1575,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
- if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
+ flags = trace_probe_load_flag(&tu->tp);
+ if (flags & TP_FLAG_TRACE)
uretprobe_trace_func(tu, func, regs, &ucb);
#ifdef CONFIG_PERF_EVENTS
- if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
+ if (flags & TP_FLAG_PROFILE)
uretprobe_perf_func(tu, func, regs, &ucb);
#endif
uprobe_buffer_put(ucb);
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 1921ade45be3..0dd7927df22a 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -324,7 +324,7 @@ static struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
struct tracing_map_array *a;
unsigned int i;
- a = kzalloc(sizeof(*a), GFP_KERNEL);
+ a = kzalloc_obj(*a);
if (!a)
return NULL;
@@ -386,13 +386,11 @@ static void tracing_map_elt_init_fields(struct tracing_map_elt *elt)
}
}
-static void tracing_map_elt_free(struct tracing_map_elt *elt)
+static void __tracing_map_elt_free(struct tracing_map_elt *elt)
{
if (!elt)
return;
- if (elt->map->ops && elt->map->ops->elt_free)
- elt->map->ops->elt_free(elt);
kfree(elt->fields);
kfree(elt->vars);
kfree(elt->var_set);
@@ -400,12 +398,23 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
kfree(elt);
}
+static void tracing_map_elt_free(struct tracing_map_elt *elt)
+{
+ if (!elt)
+ return;
+
+ /* Only objects initialized with alloc_elt() should be passed to free_elt().*/
+ if (elt->map->ops && elt->map->ops->elt_free)
+ elt->map->ops->elt_free(elt);
+ __tracing_map_elt_free(elt);
+}
+
static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
{
struct tracing_map_elt *elt;
int err = 0;
- elt = kzalloc(sizeof(*elt), GFP_KERNEL);
+ elt = kzalloc_obj(*elt);
if (!elt)
return ERR_PTR(-ENOMEM);
@@ -417,19 +426,19 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
goto free;
}
- elt->fields = kcalloc(map->n_fields, sizeof(*elt->fields), GFP_KERNEL);
+ elt->fields = kzalloc_objs(*elt->fields, map->n_fields);
if (!elt->fields) {
err = -ENOMEM;
goto free;
}
- elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL);
+ elt->vars = kzalloc_objs(*elt->vars, map->n_vars);
if (!elt->vars) {
err = -ENOMEM;
goto free;
}
- elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL);
+ elt->var_set = kzalloc_objs(*elt->var_set, map->n_vars);
if (!elt->var_set) {
err = -ENOMEM;
goto free;
@@ -444,7 +453,7 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
}
return elt;
free:
- tracing_map_elt_free(elt);
+ __tracing_map_elt_free(elt);
return ERR_PTR(err);
}
@@ -777,7 +786,7 @@ struct tracing_map *tracing_map_create(unsigned int map_bits,
map_bits > TRACING_MAP_BITS_MAX)
return ERR_PTR(-EINVAL);
- map = kzalloc(sizeof(*map), GFP_KERNEL);
+ map = kzalloc_obj(*map);
if (!map)
return ERR_PTR(-ENOMEM);
@@ -949,7 +958,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
{
struct tracing_map_sort_entry *sort_entry;
- sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL);
+ sort_entry = kzalloc_obj(*sort_entry);
if (!sort_entry)
return NULL;
@@ -1076,7 +1085,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
struct tracing_map_sort_entry *sort_entry, **entries;
int i, n_entries, ret;
- entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts));
+ entries = vmalloc_array(map->max_elts, sizeof(sort_entry));
if (!entries)
return -ENOMEM;
diff --git a/kernel/trace/undefsyms_base.c b/kernel/trace/undefsyms_base.c
new file mode 100644
index 000000000000..e65baf58e6ff
--- /dev/null
+++ b/kernel/trace/undefsyms_base.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * simple_ring_buffer is used by the pKVM hypervisor which does not have access
+ * to all kernel symbols. Whatever is undefined when compiling this file is
+ * compiler and tooling-generated symbols that can safely be ignored for
+ * simple_ring_buffer.
+ */
+
+#include <linux/atomic.h>
+#include <linux/string.h>
+#include <asm/page.h>
+
+void undefsyms_base(void *p, int n);
+
+static char page[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+void undefsyms_base(void *p, int n)
+{
+ char buffer[256] = { 0 };
+
+ u32 u = 0;
+ memset((char * volatile)page, 8, PAGE_SIZE);
+ memset((char * volatile)buffer, 8, sizeof(buffer));
+ memcpy((void * volatile)p, buffer, sizeof(buffer));
+ cmpxchg((u32 * volatile)&u, 0, 8);
+ WARN_ON(n == 0xdeadbeef);
+}