From 50a3242d84ee1625b0bfef29b95f935958dccfbe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Oct 2024 10:49:25 -0400 Subject: tracing: Fix trace_check_vprintf() when tp_printk is used When the tp_printk kernel command line is used, the trace events go directly to printk(). It is still checked via the trace_check_vprintf() function to make sure the pointers of the trace event are legit. The addition of reading buffers from previous boots required adding a delta between the addresses of the previous boot and the current boot so that the pointers in the old buffer can still be used. But this required adding a trace_array pointer to acquire the delta offsets. The tp_printk code does not provide a trace_array (tr) pointer, so when the offsets were examined, a NULL pointer dereference happened and the kernel crashed. If the trace_array does not exist, just default the delta offsets to zero, as that also means the trace event is not being read from a previous boot. Link: https://lore.kernel.org/all/Zv3z5UsG_jsO9_Tb@aschofie-mobl2.lan/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241003104925.4e1b1fd9@gandalf.local.home Fixes: 07714b4bb3f98 ("tracing: Handle old buffer mappings for event strings and functions") Reported-by: Alison Schofield Tested-by: Alison Schofield Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c01375adc471..1c69ca1f1088 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3697,8 +3697,8 @@ static void test_can_verify(void) void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, va_list ap) { - long text_delta = iter->tr->text_delta; - long data_delta = iter->tr->data_delta; + long text_delta = 0; + long data_delta = 0; const char *p = fmt; const char *str; bool good; @@ -3710,6 +3710,17 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, if (static_branch_unlikely(&trace_no_verify)) goto print; + /* + * When the kernel is booted with the tp_printk command line + * parameter, trace events go directly through to printk(). + * It also is checked by this function, but it does not + * have an associated trace_array (tr) for it. + */ + if (iter->tr) { + text_delta = iter->tr->text_delta; + data_delta = iter->tr->data_delta; + } + /* Don't bother checking when doing a ftrace_dump() */ if (iter->fmt == static_fmt_buf) goto print; -- cgit v1.2.3 From 0bb0a5c12ecf36ad561542bbb95f96355e036a02 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Tue, 24 Sep 2024 17:45:11 +0800 Subject: tracing/timerlat: Fix duplicated kthread creation due to CPU online/offline osnoise_hotplug_workfn() is the asynchronous online callback for "trace/osnoise:online". It may be congested when a CPU goes online and offline repeatedly and is invoked for multiple times after a certain online. This will lead to kthread leak and timer corruption. Add a check in start_kthread() to prevent this situation. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240924094515.3561410-2-liwei391@huawei.com Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations") Signed-off-by: Wei Li Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 1439064f65d6..d1a539913a5f 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2007,6 +2007,10 @@ static int start_kthread(unsigned int cpu) void *main = osnoise_main; char comm[24]; + /* Do not start a new thread if it is already running */ + if (per_cpu(per_cpu_osnoise_var, cpu).kthread) + return 0; + if (timerlat_enabled()) { snprintf(comm, 24, "timerlat/%d", cpu); main = timerlat_main; @@ -2061,11 +2065,10 @@ static int start_per_cpu_kthreads(void) if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) { struct task_struct *kthread; - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); if (!WARN_ON(!kthread)) kthread_stop(kthread); } - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } for_each_cpu(cpu, current_mask) { -- cgit v1.2.3 From b484a02c9cedf8703eff8f0756f94618004bd165 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Tue, 24 Sep 2024 17:45:12 +0800 Subject: tracing/timerlat: Drop interface_lock in stop_kthread() stop_kthread() is the offline callback for "trace/osnoise:online", since commit 5bfbcd1ee57b ("tracing/timerlat: Add interface_lock around clearing of kthread in stop_kthread()"), the following ABBA deadlock scenario is introduced: T1 | T2 [BP] | T3 [AP] osnoise_hotplug_workfn() | work_for_cpu_fn() | cpuhp_thread_fun() | _cpu_down() | osnoise_cpu_die() mutex_lock(&interface_lock) | | stop_kthread() | cpus_write_lock() | mutex_lock(&interface_lock) cpus_read_lock() | cpuhp_kick_ap() | As the interface_lock here in just for protecting the "kthread" field of the osn_var, use xchg() instead to fix this issue. Also use for_each_online_cpu() back in stop_per_cpu_kthreads() as it can take cpu_read_lock() again. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240924094515.3561410-3-liwei391@huawei.com Fixes: 5bfbcd1ee57b ("tracing/timerlat: Add interface_lock around clearing of kthread in stop_kthread()") Signed-off-by: Wei Li Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index d1a539913a5f..e22567174dd3 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1953,12 +1953,8 @@ static void stop_kthread(unsigned int cpu) { struct task_struct *kthread; - mutex_lock(&interface_lock); - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); if (kthread) { - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; - mutex_unlock(&interface_lock); - if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) && !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) { kthread_stop(kthread); @@ -1972,7 +1968,6 @@ static void stop_kthread(unsigned int cpu) put_task_struct(kthread); } } else { - mutex_unlock(&interface_lock); /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { /* @@ -1994,8 +1989,12 @@ static void stop_per_cpu_kthreads(void) { int cpu; - for_each_possible_cpu(cpu) + cpus_read_lock(); + + for_each_online_cpu(cpu) stop_kthread(cpu); + + cpus_read_unlock(); } /* -- cgit v1.2.3 From 829e0c9f0855f26b3ae830d17b24aec103f7e915 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Tue, 24 Sep 2024 17:45:13 +0800 Subject: tracing/timerlat: Fix a race during cpuhp processing There is another found exception that the "timerlat/1" thread was scheduled on CPU0, and lead to timer corruption finally: ``` ODEBUG: init active (active state 0) object: ffff888237c2e108 object type: hrtimer hint: timerlat_irq+0x0/0x220 WARNING: CPU: 0 PID: 426 at lib/debugobjects.c:518 debug_print_object+0x7d/0xb0 Modules linked in: CPU: 0 UID: 0 PID: 426 Comm: timerlat/1 Not tainted 6.11.0-rc7+ #45 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 RIP: 0010:debug_print_object+0x7d/0xb0 ... Call Trace: ? __warn+0x7c/0x110 ? debug_print_object+0x7d/0xb0 ? report_bug+0xf1/0x1d0 ? prb_read_valid+0x17/0x20 ? handle_bug+0x3f/0x70 ? exc_invalid_op+0x13/0x60 ? asm_exc_invalid_op+0x16/0x20 ? debug_print_object+0x7d/0xb0 ? debug_print_object+0x7d/0xb0 ? __pfx_timerlat_irq+0x10/0x10 __debug_object_init+0x110/0x150 hrtimer_init+0x1d/0x60 timerlat_main+0xab/0x2d0 ? __pfx_timerlat_main+0x10/0x10 kthread+0xb7/0xe0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2d/0x40 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 ``` After tracing the scheduling event, it was discovered that the migration of the "timerlat/1" thread was performed during thread creation. Further analysis confirmed that it is because the CPU online processing for osnoise is implemented through workers, which is asynchronous with the offline processing. When the worker was scheduled to create a thread, the CPU may has already been removed from the cpu_online_mask during the offline process, resulting in the inability to select the right CPU: T1 | T2 [CPUHP_ONLINE] | cpu_device_down() osnoise_hotplug_workfn() | | cpus_write_lock() | takedown_cpu(1) | cpus_write_unlock() [CPUHP_OFFLINE] | cpus_read_lock() | start_kthread(1) | cpus_read_unlock() | To fix this, skip online processing if the CPU is already offline. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240924094515.3561410-4-liwei391@huawei.com Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations") Signed-off-by: Wei Li Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e22567174dd3..a50ed23bee77 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2097,6 +2097,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) mutex_lock(&interface_lock); cpus_read_lock(); + if (!cpu_online(cpu)) + goto out_unlock; if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) goto out_unlock; -- cgit v1.2.3 From 2a13ca2e8abb12ee43ada8a107dadca83f140937 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Tue, 24 Sep 2024 17:45:14 +0800 Subject: tracing/hwlat: Fix a race during cpuhp processing The cpuhp online/offline processing race also exists in percpu-mode hwlat tracer in theory, apply the fix too. That is: T1 | T2 [CPUHP_ONLINE] | cpu_device_down() hwlat_hotplug_workfn() | | cpus_write_lock() | takedown_cpu(1) | cpus_write_unlock() [CPUHP_OFFLINE] | cpus_read_lock() | start_kthread(1) | cpus_read_unlock() | Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240924094515.3561410-5-liwei391@huawei.com Fixes: ba998f7d9531 ("trace/hwlat: Support hotplug operations") Signed-off-by: Wei Li Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_hwlat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b791524a6536..3bd6071441ad 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -520,6 +520,8 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy) if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU) goto out_unlock; + if (!cpu_online(cpu)) + goto out_unlock; if (!cpumask_test_cpu(cpu, tr->tracing_cpumask)) goto out_unlock; -- cgit v1.2.3 From 912da2c384d510ce40c5af9c3adc316afa4ec547 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Oct 2024 14:32:42 -0400 Subject: ring-buffer: Do not have boot mapped buffers hook to CPU hotplug The boot mapped ring buffer has its buffer mapped at a fixed location found at boot up. It is not dynamic. It cannot grow or be expanded when new CPUs come online. Do not hook fixed memory mapped ring buffers to the CPU hotplug callback, otherwise it can cause a crash when it tries to add the buffer to the memory that is already fully occupied. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241008143242.25e20801@gandalf.local.home Fixes: be68d63a139bd ("ring-buffer: Add ring_buffer_alloc_range()") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 77dc0b25140e..fb04445f92c3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2337,9 +2337,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers[cpu]) goto fail_free_buffers; - ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); - if (ret < 0) - goto fail_free_buffers; + /* If already mapped, do not hook to CPU hotplug */ + if (!start) { + ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + if (ret < 0) + goto fail_free_buffers; + } mutex_init(&buffer->mutex); -- cgit v1.2.3 From 2cf9733891a460a16a209fcc20fbd138605b13b8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Oct 2024 16:52:24 -0400 Subject: ring-buffer: Fix refcount setting of boot mapped buffers A ring buffer which has its buffered mapped at boot up to fixed memory should not be freed. Other buffers can be. The ref counting setup was wrong for both. It made the not mapped buffers ref count have zero, and the boot mapped buffer a ref count of 1. But an normally allocated buffer should be 1, where it can be removed. Keep the ref count of a normal boot buffer with its setup ref count (do not decrement it), and increment the fixed memory boot mapped buffer's ref count. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241011165224.33dd2624@gandalf.local.home Fixes: e645535a954ad ("tracing: Add option to use memmapped memory for trace boot instance") Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1c69ca1f1088..a8f52b6527ca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10621,10 +10621,10 @@ __init static void enable_instances(void) * cannot be deleted by user space, so keep the reference * to it. */ - if (start) + if (start) { tr->flags |= TRACE_ARRAY_FL_BOOT; - else - trace_array_put(tr); + tr->ref++; + } while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); -- cgit v1.2.3 From 09661f75e75cb6c1d2d8326a70c311d46729235f Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 15 Oct 2024 13:24:29 +0200 Subject: ring-buffer: Fix reader locking when changing the sub buffer order The function ring_buffer_subbuf_order_set() updates each ring_buffer_per_cpu and installs new sub buffers that match the requested page order. This operation may be invoked concurrently with readers that rely on some of the modified data, such as the head bit (RB_PAGE_HEAD), or the ring_buffer_per_cpu.pages and reader_page pointers. However, no exclusive access is acquired by ring_buffer_subbuf_order_set(). Modifying the mentioned data while a reader also operates on them can then result in incorrect memory access and various crashes. Fix the problem by taking the reader_lock when updating a specific ring_buffer_per_cpu in ring_buffer_subbuf_order_set(). Link: https://lore.kernel.org/linux-trace-kernel/20240715145141.5528-1-petr.pavlu@suse.com/ Link: https://lore.kernel.org/linux-trace-kernel/20241010195849.2f77cc3f@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20241011112850.17212b25@gandalf.local.home/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241015112440.26987-1-petr.pavlu@suse.com Fixes: 8e7b58c27b3c ("ring-buffer: Just update the subbuffers when changing their allocation order") Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fb04445f92c3..3ea4f7bb1837 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6728,39 +6728,38 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } for_each_buffer_cpu(buffer, cpu) { + struct buffer_data_page *old_free_data_page; + struct list_head old_pages; + unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* Clear the head bit to make the link list normal to read */ rb_head_page_deactivate(cpu_buffer); - /* Now walk the list and free all the old sub buffers */ - list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - /* The above loop stopped an the last page needing to be freed */ - bpage = list_entry(cpu_buffer->pages, struct buffer_page, list); - free_buffer_page(bpage); - - /* Free the current reader page */ - free_buffer_page(cpu_buffer->reader_page); + /* + * Collect buffers from the cpu_buffer pages list and the + * reader_page on old_pages, so they can be freed later when not + * under a spinlock. The pages list is a linked list with no + * head, adding old_pages turns it into a regular list with + * old_pages being the head. + */ + list_add(&old_pages, cpu_buffer->pages); + list_add(&cpu_buffer->reader_page->list, &old_pages); /* One page was allocated for the reader page */ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, struct buffer_page, list); list_del_init(&cpu_buffer->reader_page->list); - /* The cpu_buffer pages are a link list with no head */ + /* Install the new pages, remove the head from the list */ cpu_buffer->pages = cpu_buffer->new_pages.next; - cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev; - cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next; - - /* Clear the new_pages list */ - INIT_LIST_HEAD(&cpu_buffer->new_pages); + list_del_init(&cpu_buffer->new_pages); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); @@ -6769,11 +6768,20 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; cpu_buffer->nr_pages_to_update = 0; - free_pages((unsigned long)cpu_buffer->free_page, old_order); + old_free_data_page = cpu_buffer->free_page; cpu_buffer->free_page = NULL; rb_head_page_activate(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + /* Free old sub buffers */ + list_for_each_entry_safe(bpage, tmp, &old_pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + free_pages((unsigned long)old_free_data_page, old_order); + rb_check_pages(cpu_buffer); } -- cgit v1.2.3