From bd0c9706430240e3b7e9323361bb25066540d2a8 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Mon, 21 Dec 2020 08:27:15 -0800
Subject: tracing: Add printf attribute to log function

Attributing the function allows the compiler to more thoroughly
check the use of the function with -Wformat and similar flags.

Link: https://lkml.kernel.org/r/20201221162715.3757291-1-trix@redhat.com

Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/trace.h b/include/linux/trace.h
index 886a4ffd9d45..be1e130ed87c 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -34,8 +34,9 @@ int unregister_ftrace_export(struct trace_export *export);
 struct trace_array;
 
 void trace_printk_init_buffers(void);
+__printf(3, 4)
 int trace_array_printk(struct trace_array *tr, unsigned long ip,
-		const char *fmt, ...);
+		       const char *fmt, ...);
 int trace_array_init_printk(struct trace_array *tr);
 void trace_array_put(struct trace_array *tr);
 struct trace_array *trace_array_get_by_name(const char *name);
-- 
cgit v1.2.3


From b3ca59f6fe79eb83a360e885f49730d07d31bf79 Mon Sep 17 00:00:00 2001
From: Qiujun Huang <hqjagain@gmail.com>
Date: Thu, 31 Dec 2020 10:35:58 -0500
Subject: tracing: Update trace_ignore_this_task() kernel-doc comment

Update kernel-doc parameter after
commit b3b1e6ededa4 ("ftrace: Create set_ftrace_notrace_pid to not trace tasks")
added @filtered_no_pids.

Link: https://lkml.kernel.org/r/20201231153558.4804-1-hqjagain@gmail.com

Signed-off-by: Qiujun Huang <hqjagain@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b8a2d786b503..9e4f4043a3df 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -530,6 +530,7 @@ trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
 /**
  * trace_ignore_this_task - should a task be ignored for tracing
  * @filtered_pids: The list of pids to check
+ * @filtered_no_pids: The list of pids not to be traced
  * @task: The task that should be ignored if not filtered
  *
  * Checks if @task should be traced or not from @filtered_pids.
@@ -780,7 +781,7 @@ u64 ftrace_now(int cpu)
 }
 
 /**
- * tracing_is_enabled - Show if global_trace has been disabled
+ * tracing_is_enabled - Show if global_trace has been enabled
  *
  * Shows if the global trace has been enabled or not. It uses the
  * mirror flag "buffer_disabled" to be used in fast paths such as for
-- 
cgit v1.2.3


From 18d14ebdbd07abdd6466f252d431ce6f0e70e38a Mon Sep 17 00:00:00 2001
From: Qiujun Huang <hqjagain@gmail.com>
Date: Wed, 30 Dec 2020 22:05:21 +0800
Subject: tracing: Remove get/put_cpu() from function_trace_init

Since commit b6f11df26fdc ("trace: Call tracing_reset_online_cpus before
tracer->init()"), get/put_cpu() are not needed anymore.
We can use raw_smp_processor_id() instead.

Link: https://lkml.kernel.org/r/20201230140521.31920-1-hqjagain@gmail.com

Signed-off-by: Qiujun Huang <hqjagain@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_functions.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c5095dd28e20..f67aec5bb771 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -106,8 +106,7 @@ static int function_trace_init(struct trace_array *tr)
 
 	ftrace_init_array_ops(tr, func);
 
-	tr->array_buffer.cpu = get_cpu();
-	put_cpu();
+	tr->array_buffer.cpu = raw_smp_processor_id();
 
 	tracing_start_cmdline_record();
 	tracing_start_function_trace(tr);
-- 
cgit v1.2.3


From 6689bed36c52e34d772603118b0a31a0a5c11013 Mon Sep 17 00:00:00 2001
From: Qiujun Huang <hqjagain@gmail.com>
Date: Fri, 25 Dec 2020 22:03:56 +0800
Subject: ring-buffer: Remove cpu_buffer argument from the rb_inc_page()

The cpu_buffer argument is not used inside the rb_inc_page() after
commit 3adc54fa82a6 ("ring-buffer: make the buffer a true circular link
list").
And cpu_buffer argument is not used inside the two functions too,
rb_is_head_page/rb_set_list_to_head.

Link: https://lkml.kernel.org/r/20201225140356.23008-1-hqjagain@gmail.com

Signed-off-by: Qiujun Huang <hqjagain@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ec08f948dd80..8fccee76a5f3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1112,8 +1112,7 @@ static struct list_head *rb_list_head(struct list_head *list)
  * its flags will be non zero.
  */
 static inline int
-rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
-		struct buffer_page *page, struct list_head *list)
+rb_is_head_page(struct buffer_page *page, struct list_head *list)
 {
 	unsigned long val;
 
@@ -1142,8 +1141,7 @@ static bool rb_is_reader_page(struct buffer_page *page)
 /*
  * rb_set_list_to_head - set a list_head to be pointing to head.
  */
-static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
-				struct list_head *list)
+static void rb_set_list_to_head(struct list_head *list)
 {
 	unsigned long *ptr;
 
@@ -1166,7 +1164,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
 	/*
 	 * Set the previous list pointer to have the HEAD flag.
 	 */
-	rb_set_list_to_head(cpu_buffer, head->list.prev);
+	rb_set_list_to_head(head->list.prev);
 }
 
 static void rb_list_head_clear(struct list_head *list)
@@ -1241,8 +1239,7 @@ static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
 				old_flag, RB_PAGE_NORMAL);
 }
 
-static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
-			       struct buffer_page **bpage)
+static inline void rb_inc_page(struct buffer_page **bpage)
 {
 	struct list_head *p = rb_list_head((*bpage)->list.next);
 
@@ -1274,11 +1271,11 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
 	 */
 	for (i = 0; i < 3; i++) {
 		do {
-			if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
+			if (rb_is_head_page(page, page->list.prev)) {
 				cpu_buffer->head_page = page;
 				return page;
 			}
-			rb_inc_page(cpu_buffer, &page);
+			rb_inc_page(&page);
 		} while (page != head);
 	}
 
@@ -1824,7 +1821,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
 		cond_resched();
 
 		to_remove_page = tmp_iter_page;
-		rb_inc_page(cpu_buffer, &tmp_iter_page);
+		rb_inc_page(&tmp_iter_page);
 
 		/* update the counters */
 		page_entries = rb_page_entries(to_remove_page);
@@ -2271,7 +2268,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
 	if (iter->head_page == cpu_buffer->reader_page)
 		iter->head_page = rb_set_head_page(cpu_buffer);
 	else
-		rb_inc_page(cpu_buffer, &iter->head_page);
+		rb_inc_page(&iter->head_page);
 
 	iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
 	iter->head = 0;
@@ -2374,7 +2371,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
 	 * want the outer most commit to reset it.
 	 */
 	new_head = next_page;
-	rb_inc_page(cpu_buffer, &new_head);
+	rb_inc_page(&new_head);
 
 	ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
 				    RB_PAGE_NORMAL);
@@ -2526,7 +2523,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
 	next_page = tail_page;
 
-	rb_inc_page(cpu_buffer, &next_page);
+	rb_inc_page(&next_page);
 
 	/*
 	 * If for some reason, we had an interrupt storm that made
@@ -2552,7 +2549,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	 * the buffer, unless the commit page is still on the
 	 * reader page.
 	 */
-	if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
+	if (rb_is_head_page(next_page, &tail_page->list)) {
 
 		/*
 		 * If the commit is not on the reader page, then
@@ -2879,7 +2876,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 			return;
 		local_set(&cpu_buffer->commit_page->page->commit,
 			  rb_page_write(cpu_buffer->commit_page));
-		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+		rb_inc_page(&cpu_buffer->commit_page);
 		/* add barrier to keep gcc from optimizing too much */
 		barrier();
 	}
@@ -3638,14 +3635,14 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
 	 * Because the commit page may be on the reader page we
 	 * start with the next page and check the end loop there.
 	 */
-	rb_inc_page(cpu_buffer, &bpage);
+	rb_inc_page(&bpage);
 	start = bpage;
 	do {
 		if (bpage->page == (void *)addr) {
 			local_dec(&bpage->entries);
 			return;
 		}
-		rb_inc_page(cpu_buffer, &bpage);
+		rb_inc_page(&bpage);
 	} while (bpage != start);
 
 	/* commit not part of this buffer?? */
@@ -4367,7 +4364,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->pages = reader->list.prev;
 
 	/* The reader page will be pointing to the new head */
-	rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
+	rb_set_list_to_head(&cpu_buffer->reader_page->list);
 
 	/*
 	 * We want to make sure we read the overruns after we set up our
@@ -4406,7 +4403,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	 * Now make the new head point back to the reader page.
 	 */
 	rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
-	rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+	rb_inc_page(&cpu_buffer->head_page);
 
 	local_inc(&cpu_buffer->pages_read);
 
-- 
cgit v1.2.3


From c6358bacdcdb64eb75192a135b66d3d3e57ad2e5 Mon Sep 17 00:00:00 2001
From: Qiujun Huang <hqjagain@gmail.com>
Date: Thu, 24 Dec 2020 22:46:34 +0800
Subject: ring-buffer: Drop unneeded check in ring_buffer_resize()

Remove the cpumask check, as we has done it at the beginning of
the function.
Also fix a typo. s/also the on the/also on the/

Link: https://lkml.kernel.org/r/20201224144634.3210-1-hqjagain@gmail.com

Signed-off-by: Qiujun Huang <hqjagain@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8fccee76a5f3..b9dad3500041 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2059,10 +2059,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 
 		put_online_cpus();
 	} else {
-		/* Make sure this CPU has been initialized */
-		if (!cpumask_test_cpu(cpu_id, buffer->cpumask))
-			goto out;
-
 		cpu_buffer = buffer->buffers[cpu_id];
 
 		if (nr_pages == cpu_buffer->nr_pages)
@@ -2580,7 +2576,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 			 * have filled up the buffer with events
 			 * from interrupts and such, and wrapped.
 			 *
-			 * Note, if the tail page is also the on the
+			 * Note, if the tail page is also on the
 			 * reader_page, we let it move out.
 			 */
 			if (unlikely((cpu_buffer->commit_page !=
-- 
cgit v1.2.3


From 36590c50b2d0729952511129916beeea30d31d81 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Jan 2021 20:45:08 +0100
Subject: tracing: Merge irqflags + preempt counter.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The state of the interrupts (irqflags) and the preemption counter are
both passed down to tracing_generic_entry_update(). Only one bit of
irqflags is actually required: The on/off state. The complete 32bit
of the preemption counter isn't needed. Just whether of the upper bits
(softirq, hardirq and NMI) are set and the preemption depth is needed.

The irqflags and the preemption counter could be evaluated early and the
information stored in an integer `trace_ctx'.
tracing_generic_entry_update() would use the upper bits as the
TRACE_FLAG_* and the lower 8bit as the disabled-preemption depth
(considering that one must be substracted from the counter in one
special cases).

The actual preemption value is not used except for the tracing record.
The `irqflags' variable is mostly used only for the tracing record. An
exception here is for instance wakeup_tracer_call() or
probe_wakeup_sched_switch() which explicilty disable interrupts and use
that `irqflags' to save (and restore) the IRQ state and to record the
state.

Struct trace_event_buffer has also the `pc' and flags' members which can
be replaced with `trace_ctx' since their actual value is not used
outside of trace recording.

This will reduce tracing_generic_entry_update() to simply assign values
to struct trace_entry. The evaluation of the TRACE_FLAG_* bits is moved
to _tracing_gen_ctx_flags() which replaces preempt_count() and
local_save_flags() invocations.

As an example, ftrace_syscall_enter() may invoke:
- trace_buffer_lock_reserve() -> … -> tracing_generic_entry_update()
- event_trigger_unlock_commit()
  -> ftrace_trace_stack() -> … -> tracing_generic_entry_update()
  -> ftrace_trace_userstack() -> … -> tracing_generic_entry_update()

In this case the TRACE_FLAG_* bits were evaluated three times. By using
the `trace_ctx' they are evaluated once and assigned three times.

A build with all tracers enabled on x86-64 with and without the patch:

    text     data      bss      dec      hex    filename
21970669 17084168  7639260 46694097  2c87ed1 vmlinux.old
21970293 17084168  7639260 46693721  2c87d59 vmlinux.new

text shrank by 379 bytes, data remained constant.

Link: https://lkml.kernel.org/r/20210125194511.3924915-2-bigeasy@linutronix.de

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h         |  25 +++--
 kernel/trace/blktrace.c              |  17 +--
 kernel/trace/trace.c                 | 208 +++++++++++++++++++----------------
 kernel/trace/trace.h                 |  38 +++----
 kernel/trace/trace_branch.c          |   6 +-
 kernel/trace/trace_event_perf.c      |   5 +-
 kernel/trace/trace_events.c          |  18 +--
 kernel/trace/trace_events_inject.c   |   6 +-
 kernel/trace/trace_functions.c       |  28 ++---
 kernel/trace/trace_functions_graph.c |  32 +++---
 kernel/trace/trace_hwlat.c           |   7 +-
 kernel/trace/trace_irqsoff.c         |  86 ++++++---------
 kernel/trace/trace_kprobe.c          |  10 +-
 kernel/trace/trace_mmiotrace.c       |  14 ++-
 kernel/trace/trace_sched_wakeup.c    |  71 ++++++------
 kernel/trace/trace_syscalls.c        |  20 ++--
 kernel/trace/trace_uprobe.c          |   4 +-
 17 files changed, 287 insertions(+), 308 deletions(-)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index d321fe5ad1a1..091250b0895a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -148,17 +148,29 @@ enum print_line_t {
 
 enum print_line_t trace_handle_return(struct trace_seq *s);
 
-void tracing_generic_entry_update(struct trace_entry *entry,
-				  unsigned short type,
-				  unsigned long flags,
-				  int pc);
+static inline void tracing_generic_entry_update(struct trace_entry *entry,
+						unsigned short type,
+						unsigned int trace_ctx)
+{
+	struct task_struct *tsk = current;
+
+	entry->preempt_count		= trace_ctx & 0xff;
+	entry->pid			= (tsk) ? tsk->pid : 0;
+	entry->type			= type;
+	entry->flags =			trace_ctx >> 16;
+}
+
+unsigned int tracing_gen_ctx_flags(unsigned long irqflags);
+unsigned int tracing_gen_ctx(void);
+unsigned int tracing_gen_ctx_dec(void);
+
 struct trace_event_file;
 
 struct ring_buffer_event *
 trace_event_buffer_lock_reserve(struct trace_buffer **current_buffer,
 				struct trace_event_file *trace_file,
 				int type, unsigned long len,
-				unsigned long flags, int pc);
+				unsigned int trace_ctx);
 
 #define TRACE_RECORD_CMDLINE	BIT(0)
 #define TRACE_RECORD_TGID	BIT(1)
@@ -232,8 +244,7 @@ struct trace_event_buffer {
 	struct ring_buffer_event	*event;
 	struct trace_event_file		*trace_file;
 	void				*entry;
-	unsigned long			flags;
-	int				pc;
+	unsigned int			trace_ctx;
 	struct pt_regs			*regs;
 };
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fb0fe4c66b84..c54eae2ab208 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -72,17 +72,17 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 	struct blk_io_trace *t;
 	struct ring_buffer_event *event = NULL;
 	struct trace_buffer *buffer = NULL;
-	int pc = 0;
+	unsigned int trace_ctx = 0;
 	int cpu = smp_processor_id();
 	bool blk_tracer = blk_tracer_enabled;
 	ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
 
 	if (blk_tracer) {
 		buffer = blk_tr->array_buffer.buffer;
-		pc = preempt_count();
+		trace_ctx = tracing_gen_ctx_flags(0);
 		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
 						  sizeof(*t) + len + cgid_len,
-						  0, pc);
+						  trace_ctx);
 		if (!event)
 			return;
 		t = ring_buffer_event_data(event);
@@ -107,7 +107,7 @@ record_it:
 		memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
 
 		if (blk_tracer)
-			trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
+			trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
 	}
 }
 
@@ -222,8 +222,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	struct blk_io_trace *t;
 	unsigned long flags = 0;
 	unsigned long *sequence;
+	unsigned int trace_ctx = 0;
 	pid_t pid;
-	int cpu, pc = 0;
+	int cpu;
 	bool blk_tracer = blk_tracer_enabled;
 	ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
 
@@ -252,10 +253,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		tracing_record_cmdline(current);
 
 		buffer = blk_tr->array_buffer.buffer;
-		pc = preempt_count();
+		trace_ctx = tracing_gen_ctx_flags(0);
 		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
 						  sizeof(*t) + pdu_len + cgid_len,
-						  0, pc);
+						  trace_ctx);
 		if (!event)
 			return;
 		t = ring_buffer_event_data(event);
@@ -301,7 +302,7 @@ record_it:
 			memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
 
 		if (blk_tracer) {
-			trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
+			trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
 			return;
 		}
 	}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9e4f4043a3df..0b3cce6ecf52 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -176,7 +176,7 @@ static union trace_eval_map_item *trace_eval_maps;
 int tracing_set_tracer(struct trace_array *tr, const char *buf);
 static void ftrace_trace_userstack(struct trace_array *tr,
 				   struct trace_buffer *buffer,
-				   unsigned long flags, int pc);
+				   unsigned int trace_ctx);
 
 #define MAX_TRACER_SIZE		100
 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -906,23 +906,23 @@ static inline void trace_access_lock_init(void)
 
 #ifdef CONFIG_STACKTRACE
 static void __ftrace_trace_stack(struct trace_buffer *buffer,
-				 unsigned long flags,
-				 int skip, int pc, struct pt_regs *regs);
+				 unsigned int trace_ctx,
+				 int skip, struct pt_regs *regs);
 static inline void ftrace_trace_stack(struct trace_array *tr,
 				      struct trace_buffer *buffer,
-				      unsigned long flags,
-				      int skip, int pc, struct pt_regs *regs);
+				      unsigned int trace_ctx,
+				      int skip, struct pt_regs *regs);
 
 #else
 static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
-					unsigned long flags,
-					int skip, int pc, struct pt_regs *regs)
+					unsigned int trace_ctx,
+					int skip, struct pt_regs *regs)
 {
 }
 static inline void ftrace_trace_stack(struct trace_array *tr,
 				      struct trace_buffer *buffer,
-				      unsigned long flags,
-				      int skip, int pc, struct pt_regs *regs)
+				      unsigned long trace_ctx,
+				      int skip, struct pt_regs *regs)
 {
 }
 
@@ -930,24 +930,24 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
 
 static __always_inline void
 trace_event_setup(struct ring_buffer_event *event,
-		  int type, unsigned long flags, int pc)
+		  int type, unsigned int trace_ctx)
 {
 	struct trace_entry *ent = ring_buffer_event_data(event);
 
-	tracing_generic_entry_update(ent, type, flags, pc);
+	tracing_generic_entry_update(ent, type, trace_ctx);
 }
 
 static __always_inline struct ring_buffer_event *
 __trace_buffer_lock_reserve(struct trace_buffer *buffer,
 			  int type,
 			  unsigned long len,
-			  unsigned long flags, int pc)
+			  unsigned int trace_ctx)
 {
 	struct ring_buffer_event *event;
 
 	event = ring_buffer_lock_reserve(buffer, len);
 	if (event != NULL)
-		trace_event_setup(event, type, flags, pc);
+		trace_event_setup(event, type, trace_ctx);
 
 	return event;
 }
@@ -1008,25 +1008,22 @@ int __trace_puts(unsigned long ip, const char *str, int size)
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
 	struct print_entry *entry;
-	unsigned long irq_flags;
+	unsigned int trace_ctx;
 	int alloc;
-	int pc;
 
 	if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
 		return 0;
 
-	pc = preempt_count();
-
 	if (unlikely(tracing_selftest_running || tracing_disabled))
 		return 0;
 
 	alloc = sizeof(*entry) + size + 2; /* possible \n added */
 
-	local_save_flags(irq_flags);
+	trace_ctx = tracing_gen_ctx();
 	buffer = global_trace.array_buffer.buffer;
 	ring_buffer_nest_start(buffer);
-	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, 
-					    irq_flags, pc);
+	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+					    trace_ctx);
 	if (!event) {
 		size = 0;
 		goto out;
@@ -1045,7 +1042,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
 		entry->buf[size] = '\0';
 
 	__buffer_unlock_commit(buffer, event);
-	ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
+	ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
  out:
 	ring_buffer_nest_end(buffer);
 	return size;
@@ -1062,25 +1059,22 @@ int __trace_bputs(unsigned long ip, const char *str)
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
 	struct bputs_entry *entry;
-	unsigned long irq_flags;
+	unsigned int trace_ctx;
 	int size = sizeof(struct bputs_entry);
 	int ret = 0;
-	int pc;
 
 	if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
 		return 0;
 
-	pc = preempt_count();
-
 	if (unlikely(tracing_selftest_running || tracing_disabled))
 		return 0;
 
-	local_save_flags(irq_flags);
+	trace_ctx = tracing_gen_ctx();
 	buffer = global_trace.array_buffer.buffer;
 
 	ring_buffer_nest_start(buffer);
 	event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
-					    irq_flags, pc);
+					    trace_ctx);
 	if (!event)
 		goto out;
 
@@ -1089,7 +1083,7 @@ int __trace_bputs(unsigned long ip, const char *str)
 	entry->str			= str;
 
 	__buffer_unlock_commit(buffer, event);
-	ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
+	ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
 
 	ret = 1;
  out:
@@ -2585,36 +2579,69 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
 }
 EXPORT_SYMBOL_GPL(trace_handle_return);
 
-void
-tracing_generic_entry_update(struct trace_entry *entry, unsigned short type,
-			     unsigned long flags, int pc)
+unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
 {
-	struct task_struct *tsk = current;
+	unsigned int trace_flags = 0;
+	unsigned int pc;
+
+	pc = preempt_count();
 
-	entry->preempt_count		= pc & 0xff;
-	entry->pid			= (tsk) ? tsk->pid : 0;
-	entry->type			= type;
-	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
-		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+	if (irqs_disabled_flags(irqflags))
+		trace_flags |= TRACE_FLAG_IRQS_OFF;
 #else
-		TRACE_FLAG_IRQS_NOSUPPORT |
+	trace_flags |= TRACE_FLAG_IRQS_NOSUPPORT;
 #endif
-		((pc & NMI_MASK    ) ? TRACE_FLAG_NMI     : 0) |
-		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
-		((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
-		(tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
-		(test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
+
+	if (pc & NMI_MASK)
+		trace_flags |= TRACE_FLAG_NMI;
+	if (pc & HARDIRQ_MASK)
+		trace_flags |= TRACE_FLAG_HARDIRQ;
+
+	if (pc & SOFTIRQ_OFFSET)
+		trace_flags |= TRACE_FLAG_SOFTIRQ;
+
+	if (tif_need_resched())
+		trace_flags |= TRACE_FLAG_NEED_RESCHED;
+	if (test_preempt_need_resched())
+		trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
+	return (trace_flags << 16) | (pc & 0xff);
+}
+
+unsigned int tracing_gen_ctx(void)
+{
+	unsigned long irqflags;
+
+#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
+	local_save_flags(irqflags);
+#else
+	irqflags = 0;
+#endif
+	return tracing_gen_ctx_flags(irqflags);
+}
+
+unsigned int tracing_gen_ctx_dec(void)
+{
+	unsigned int trace_ctx;
+
+	trace_ctx = tracing_gen_ctx();
+
+	/*
+	 * Subtract one from the preeption counter if preemption is enabled,
+	 * see trace_event_buffer_reserve()for details.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPTION))
+		trace_ctx--;
+	return trace_ctx;
 }
-EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
 
 struct ring_buffer_event *
 trace_buffer_lock_reserve(struct trace_buffer *buffer,
 			  int type,
 			  unsigned long len,
-			  unsigned long flags, int pc)
+			  unsigned int trace_ctx)
 {
-	return __trace_buffer_lock_reserve(buffer, type, len, flags, pc);
+	return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx);
 }
 
 DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -2734,7 +2761,7 @@ struct ring_buffer_event *
 trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
 			  struct trace_event_file *trace_file,
 			  int type, unsigned long len,
-			  unsigned long flags, int pc)
+			  unsigned int trace_ctx)
 {
 	struct ring_buffer_event *entry;
 	int val;
@@ -2747,15 +2774,15 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
 		/* Try to use the per cpu buffer first */
 		val = this_cpu_inc_return(trace_buffered_event_cnt);
 		if (val == 1) {
-			trace_event_setup(entry, type, flags, pc);
+			trace_event_setup(entry, type, trace_ctx);
 			entry->array[0] = len;
 			return entry;
 		}
 		this_cpu_dec(trace_buffered_event_cnt);
 	}
 
-	entry = __trace_buffer_lock_reserve(*current_rb,
-					    type, len, flags, pc);
+	entry = __trace_buffer_lock_reserve(*current_rb, type, len,
+					    trace_ctx);
 	/*
 	 * If tracing is off, but we have triggers enabled
 	 * we still need to look at the event data. Use the temp_buffer
@@ -2764,8 +2791,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
 	 */
 	if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
 		*current_rb = temp_buffer;
-		entry = __trace_buffer_lock_reserve(*current_rb,
-						    type, len, flags, pc);
+		entry = __trace_buffer_lock_reserve(*current_rb, type, len,
+						    trace_ctx);
 	}
 	return entry;
 }
@@ -2851,7 +2878,7 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
 		ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT);
 	event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer,
 				    fbuffer->event, fbuffer->entry,
-				    fbuffer->flags, fbuffer->pc, fbuffer->regs);
+				    fbuffer->trace_ctx, fbuffer->regs);
 }
 EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
 
@@ -2867,7 +2894,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct trace_buffer *buffer,
 				     struct ring_buffer_event *event,
-				     unsigned long flags, int pc,
+				     unsigned int trace_ctx,
 				     struct pt_regs *regs)
 {
 	__buffer_unlock_commit(buffer, event);
@@ -2878,8 +2905,8 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 	 * and mmiotrace, but that's ok if they lose a function or
 	 * two. They are not that meaningful.
 	 */
-	ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
-	ftrace_trace_userstack(tr, buffer, flags, pc);
+	ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs);
+	ftrace_trace_userstack(tr, buffer, trace_ctx);
 }
 
 /*
@@ -2893,9 +2920,8 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
 }
 
 void
-trace_function(struct trace_array *tr,
-	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
-	       int pc)
+trace_function(struct trace_array *tr, unsigned long ip, unsigned long
+	       parent_ip, unsigned int trace_ctx)
 {
 	struct trace_event_call *call = &event_function;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -2903,7 +2929,7 @@ trace_function(struct trace_array *tr,
 	struct ftrace_entry *entry;
 
 	event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
-					    flags, pc);
+					    trace_ctx);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -2937,8 +2963,8 @@ static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
 static DEFINE_PER_CPU(int, ftrace_stack_reserve);
 
 static void __ftrace_trace_stack(struct trace_buffer *buffer,
-				 unsigned long flags,
-				 int skip, int pc, struct pt_regs *regs)
+				 unsigned int trace_ctx,
+				 int skip, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &event_kernel_stack;
 	struct ring_buffer_event *event;
@@ -2985,7 +3011,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
 
 	size = nr_entries * sizeof(unsigned long);
 	event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
-					    sizeof(*entry) + size, flags, pc);
+					    sizeof(*entry) + size, trace_ctx);
 	if (!event)
 		goto out;
 	entry = ring_buffer_event_data(event);
@@ -3006,22 +3032,22 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
 
 static inline void ftrace_trace_stack(struct trace_array *tr,
 				      struct trace_buffer *buffer,
-				      unsigned long flags,
-				      int skip, int pc, struct pt_regs *regs)
+				      unsigned int trace_ctx,
+				      int skip, struct pt_regs *regs)
 {
 	if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
 		return;
 
-	__ftrace_trace_stack(buffer, flags, skip, pc, regs);
+	__ftrace_trace_stack(buffer, trace_ctx, skip, regs);
 }
 
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
-		   int pc)
+void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
+		   int skip)
 {
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 
 	if (rcu_is_watching()) {
-		__ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+		__ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
 		return;
 	}
 
@@ -3035,7 +3061,7 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
 		return;
 
 	rcu_irq_enter_irqson();
-	__ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+	__ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
 	rcu_irq_exit_irqson();
 }
 
@@ -3045,19 +3071,15 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
  */
 void trace_dump_stack(int skip)
 {
-	unsigned long flags;
-
 	if (tracing_disabled || tracing_selftest_running)
 		return;
 
-	local_save_flags(flags);
-
 #ifndef CONFIG_UNWINDER_ORC
 	/* Skip 1 to skip this function. */
 	skip++;
 #endif
 	__ftrace_trace_stack(global_trace.array_buffer.buffer,
-			     flags, skip, preempt_count(), NULL);
+			     tracing_gen_ctx(), skip, NULL);
 }
 EXPORT_SYMBOL_GPL(trace_dump_stack);
 
@@ -3066,7 +3088,7 @@ static DEFINE_PER_CPU(int, user_stack_count);
 
 static void
 ftrace_trace_userstack(struct trace_array *tr,
-		       struct trace_buffer *buffer, unsigned long flags, int pc)
+		       struct trace_buffer *buffer, unsigned int trace_ctx)
 {
 	struct trace_event_call *call = &event_user_stack;
 	struct ring_buffer_event *event;
@@ -3093,7 +3115,7 @@ ftrace_trace_userstack(struct trace_array *tr,
 	__this_cpu_inc(user_stack_count);
 
 	event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
-					    sizeof(*entry), flags, pc);
+					    sizeof(*entry), trace_ctx);
 	if (!event)
 		goto out_drop_count;
 	entry	= ring_buffer_event_data(event);
@@ -3113,7 +3135,7 @@ ftrace_trace_userstack(struct trace_array *tr,
 #else /* CONFIG_USER_STACKTRACE_SUPPORT */
 static void ftrace_trace_userstack(struct trace_array *tr,
 				   struct trace_buffer *buffer,
-				   unsigned long flags, int pc)
+				   unsigned int trace_ctx)
 {
 }
 #endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
@@ -3243,9 +3265,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	struct trace_buffer *buffer;
 	struct trace_array *tr = &global_trace;
 	struct bprint_entry *entry;
-	unsigned long flags;
+	unsigned int trace_ctx;
 	char *tbuffer;
-	int len = 0, size, pc;
+	int len = 0, size;
 
 	if (unlikely(tracing_selftest_running || tracing_disabled))
 		return 0;
@@ -3253,7 +3275,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	/* Don't pollute graph traces with trace_vprintk internals */
 	pause_graph_tracing();
 
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 	preempt_disable_notrace();
 
 	tbuffer = get_trace_buf();
@@ -3267,12 +3289,11 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
 		goto out_put;
 
-	local_save_flags(flags);
 	size = sizeof(*entry) + sizeof(u32) * len;
 	buffer = tr->array_buffer.buffer;
 	ring_buffer_nest_start(buffer);
 	event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
-					    flags, pc);
+					    trace_ctx);
 	if (!event)
 		goto out;
 	entry = ring_buffer_event_data(event);
@@ -3282,7 +3303,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	memcpy(entry->buf, tbuffer, sizeof(u32) * len);
 	if (!call_filter_check_discard(call, entry, buffer, event)) {
 		__buffer_unlock_commit(buffer, event);
-		ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
+		ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
 	}
 
 out:
@@ -3305,9 +3326,9 @@ __trace_array_vprintk(struct trace_buffer *buffer,
 {
 	struct trace_event_call *call = &event_print;
 	struct ring_buffer_event *event;
-	int len = 0, size, pc;
+	int len = 0, size;
 	struct print_entry *entry;
-	unsigned long flags;
+	unsigned int trace_ctx;
 	char *tbuffer;
 
 	if (tracing_disabled || tracing_selftest_running)
@@ -3316,7 +3337,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
 	/* Don't pollute graph traces with trace_vprintk internals */
 	pause_graph_tracing();
 
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 	preempt_disable_notrace();
 
 
@@ -3328,11 +3349,10 @@ __trace_array_vprintk(struct trace_buffer *buffer,
 
 	len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
 
-	local_save_flags(flags);
 	size = sizeof(*entry) + len + 1;
 	ring_buffer_nest_start(buffer);
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
-					    flags, pc);
+					    trace_ctx);
 	if (!event)
 		goto out;
 	entry = ring_buffer_event_data(event);
@@ -3341,7 +3361,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
 	memcpy(&entry->buf, tbuffer, len + 1);
 	if (!call_filter_check_discard(call, entry, buffer, event)) {
 		__buffer_unlock_commit(buffer, event);
-		ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
+		ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL);
 	}
 
 out:
@@ -6654,7 +6674,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	enum event_trigger_type tt = ETT_NONE;
 	struct trace_buffer *buffer;
 	struct print_entry *entry;
-	unsigned long irq_flags;
 	ssize_t written;
 	int size;
 	int len;
@@ -6674,7 +6693,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 
 	BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
 
-	local_save_flags(irq_flags);
 	size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
 
 	/* If less than "<faulted>", then make sure we can still add that */
@@ -6683,7 +6701,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 
 	buffer = tr->array_buffer.buffer;
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
-					    irq_flags, preempt_count());
+					    tracing_gen_ctx());
 	if (unlikely(!event))
 		/* Ring buffer disabled, return as if not open for write */
 		return -EBADF;
@@ -6735,7 +6753,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
 	struct raw_data_entry *entry;
-	unsigned long irq_flags;
 	ssize_t written;
 	int size;
 	int len;
@@ -6757,14 +6774,13 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
 
 	BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
 
-	local_save_flags(irq_flags);
 	size = sizeof(*entry) + cnt;
 	if (cnt < FAULT_SIZE_ID)
 		size += FAULT_SIZE_ID - cnt;
 
 	buffer = tr->array_buffer.buffer;
 	event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
-					    irq_flags, preempt_count());
+					    tracing_gen_ctx());
 	if (!event)
 		/* Ring buffer disabled, return as if not open for write */
 		return -EBADF;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e448d2da0b99..8daf3a0758b1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -589,8 +589,7 @@ struct ring_buffer_event *
 trace_buffer_lock_reserve(struct trace_buffer *buffer,
 			  int type,
 			  unsigned long len,
-			  unsigned long flags,
-			  int pc);
+			  unsigned int trace_ctx);
 
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
@@ -615,11 +614,11 @@ unsigned long trace_total_entries(struct trace_array *tr);
 void trace_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,
-		    unsigned long flags, int pc);
+		    unsigned int trace_ctx);
 void trace_graph_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,
-		    unsigned long flags, int pc);
+		    unsigned int trace_ctx);
 void trace_latency_header(struct seq_file *m);
 void trace_default_header(struct seq_file *m);
 void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
@@ -687,11 +686,10 @@ static inline void latency_fsnotify(struct trace_array *tr) { }
 #endif
 
 #ifdef CONFIG_STACKTRACE
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
-		   int pc);
+void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip);
 #else
-static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
-				 int skip, int pc)
+static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
+				 int skip)
 {
 }
 #endif /* CONFIG_STACKTRACE */
@@ -831,10 +829,10 @@ extern void graph_trace_open(struct trace_iterator *iter);
 extern void graph_trace_close(struct trace_iterator *iter);
 extern int __trace_graph_entry(struct trace_array *tr,
 			       struct ftrace_graph_ent *trace,
-			       unsigned long flags, int pc);
+			       unsigned int trace_ctx);
 extern void __trace_graph_return(struct trace_array *tr,
 				 struct ftrace_graph_ret *trace,
-				 unsigned long flags, int pc);
+				 unsigned int trace_ctx);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern struct ftrace_hash __rcu *ftrace_graph_hash;
@@ -1297,15 +1295,15 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct trace_buffer *buffer,
 				     struct ring_buffer_event *event,
-				     unsigned long flags, int pc,
+				     unsigned int trcace_ctx,
 				     struct pt_regs *regs);
 
 static inline void trace_buffer_unlock_commit(struct trace_array *tr,
 					      struct trace_buffer *buffer,
 					      struct ring_buffer_event *event,
-					      unsigned long flags, int pc)
+					      unsigned int trace_ctx)
 {
-	trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
+	trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL);
 }
 
 DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -1366,8 +1364,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
  * @buffer: The ring buffer that the event is being written to
  * @event: The event meta data in the ring buffer
  * @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
+ * @trace_ctx: The tracing context flags.
  *
  * This is a helper function to handle triggers that require data
  * from the event itself. It also tests the event against filters and
@@ -1377,12 +1374,12 @@ static inline void
 event_trigger_unlock_commit(struct trace_event_file *file,
 			    struct trace_buffer *buffer,
 			    struct ring_buffer_event *event,
-			    void *entry, unsigned long irq_flags, int pc)
+			    void *entry, unsigned int trace_ctx)
 {
 	enum event_trigger_type tt = ETT_NONE;
 
 	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
-		trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
+		trace_buffer_unlock_commit(file->tr, buffer, event, trace_ctx);
 
 	if (tt)
 		event_triggers_post_call(file, tt);
@@ -1394,8 +1391,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
  * @buffer: The ring buffer that the event is being written to
  * @event: The event meta data in the ring buffer
  * @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
+ * @trace_ctx: The tracing context flags.
  *
  * This is a helper function to handle triggers that require data
  * from the event itself. It also tests the event against filters and
@@ -1408,14 +1404,14 @@ static inline void
 event_trigger_unlock_commit_regs(struct trace_event_file *file,
 				 struct trace_buffer *buffer,
 				 struct ring_buffer_event *event,
-				 void *entry, unsigned long irq_flags, int pc,
+				 void *entry, unsigned int trace_ctx,
 				 struct pt_regs *regs)
 {
 	enum event_trigger_type tt = ETT_NONE;
 
 	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
 		trace_buffer_unlock_commit_regs(file->tr, buffer, event,
-						irq_flags, pc, regs);
+						trace_ctx, regs);
 
 	if (tt)
 		event_triggers_post_call(file, tt);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index eff099123aa2..e47fdb4c92fb 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -37,7 +37,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 	struct ring_buffer_event *event;
 	struct trace_branch *entry;
 	unsigned long flags;
-	int pc;
+	unsigned int trace_ctx;
 	const char *p;
 
 	if (current->trace_recursion & TRACE_BRANCH_BIT)
@@ -59,10 +59,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 	if (atomic_read(&data->disabled))
 		goto out;
 
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx_flags(flags);
 	buffer = tr->array_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
-					  sizeof(*entry), flags, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event)
 		goto out;
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a71181655958..288ad2c274fb 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -421,11 +421,8 @@ NOKPROBE_SYMBOL(perf_trace_buf_alloc);
 void perf_trace_buf_update(void *record, u16 type)
 {
 	struct trace_entry *entry = record;
-	int pc = preempt_count();
-	unsigned long flags;
 
-	local_save_flags(flags);
-	tracing_generic_entry_update(entry, type, flags, pc);
+	tracing_generic_entry_update(entry, type, tracing_gen_ctx());
 }
 NOKPROBE_SYMBOL(perf_trace_buf_update);
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e9d28eeccb7e..20ccce3e4ffb 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -258,22 +258,19 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
 	    trace_event_ignore_this_pid(trace_file))
 		return NULL;
 
-	local_save_flags(fbuffer->flags);
-	fbuffer->pc = preempt_count();
 	/*
 	 * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
 	 * preemption (adding one to the preempt_count). Since we are
 	 * interested in the preempt_count at the time the tracepoint was
 	 * hit, we need to subtract one to offset the increment.
 	 */
-	if (IS_ENABLED(CONFIG_PREEMPTION))
-		fbuffer->pc--;
+	fbuffer->trace_ctx = tracing_gen_ctx_dec();
 	fbuffer->trace_file = trace_file;
 
 	fbuffer->event =
 		trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file,
 						event_call->event.type, len,
-						fbuffer->flags, fbuffer->pc);
+						fbuffer->trace_ctx);
 	if (!fbuffer->event)
 		return NULL;
 
@@ -3678,12 +3675,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
 	struct trace_buffer *buffer;
 	struct ring_buffer_event *event;
 	struct ftrace_entry *entry;
-	unsigned long flags;
+	unsigned int trace_ctx;
 	long disabled;
 	int cpu;
-	int pc;
 
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 	preempt_disable_notrace();
 	cpu = raw_smp_processor_id();
 	disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
@@ -3691,11 +3687,9 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
 	if (disabled != 1)
 		goto out;
 
-	local_save_flags(flags);
-
 	event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file,
 						TRACE_FN, sizeof(*entry),
-						flags, pc);
+						trace_ctx);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
@@ -3703,7 +3697,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
 	entry->parent_ip		= parent_ip;
 
 	event_trigger_unlock_commit(&event_trace_file, buffer, event,
-				    entry, flags, pc);
+				    entry, trace_ctx);
  out:
 	atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
 	preempt_enable_notrace();
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index 22bcf7c51d1e..c188045c5f97 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -192,7 +192,6 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size)
 static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
 {
 	struct ftrace_event_field *field;
-	unsigned long irq_flags;
 	void *entry = NULL;
 	int entry_size;
 	u64 val = 0;
@@ -203,9 +202,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
 	if (!entry)
 		return -ENOMEM;
 
-	local_save_flags(irq_flags);
-	tracing_generic_entry_update(entry, call->event.type, irq_flags,
-				     preempt_count());
+	tracing_generic_entry_update(entry, call->event.type,
+				     tracing_gen_ctx());
 
 	while ((len = parse_field(str, call, &field, &val)) > 0) {
 		if (is_function_field(field))
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index f67aec5bb771..f93723ca66bc 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -131,10 +131,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
 {
 	struct trace_array *tr = op->private;
 	struct trace_array_cpu *data;
-	unsigned long flags;
+	unsigned int trace_ctx;
 	int bit;
 	int cpu;
-	int pc;
 
 	if (unlikely(!tr->function_enabled))
 		return;
@@ -143,15 +142,14 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
 	if (bit < 0)
 		return;
 
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 	preempt_disable_notrace();
 
 	cpu = smp_processor_id();
 	data = per_cpu_ptr(tr->array_buffer.data, cpu);
-	if (!atomic_read(&data->disabled)) {
-		local_save_flags(flags);
-		trace_function(tr, ip, parent_ip, flags, pc);
-	}
+	if (!atomic_read(&data->disabled))
+		trace_function(tr, ip, parent_ip, trace_ctx);
+
 	ftrace_test_recursion_unlock(bit);
 	preempt_enable_notrace();
 }
@@ -183,7 +181,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
 	unsigned long flags;
 	long disabled;
 	int cpu;
-	int pc;
+	unsigned int trace_ctx;
 
 	if (unlikely(!tr->function_enabled))
 		return;
@@ -198,9 +196,9 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
 	disabled = atomic_inc_return(&data->disabled);
 
 	if (likely(disabled == 1)) {
-		pc = preempt_count();
-		trace_function(tr, ip, parent_ip, flags, pc);
-		__trace_stack(tr, flags, STACK_SKIP, pc);
+		trace_ctx = tracing_gen_ctx_flags(flags);
+		trace_function(tr, ip, parent_ip, trace_ctx);
+		__trace_stack(tr, trace_ctx, STACK_SKIP);
 	}
 
 	atomic_dec(&data->disabled);
@@ -403,13 +401,11 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
 
 static __always_inline void trace_stack(struct trace_array *tr)
 {
-	unsigned long flags;
-	int pc;
+	unsigned int trace_ctx;
 
-	local_save_flags(flags);
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 
-	__trace_stack(tr, flags, FTRACE_STACK_SKIP, pc);
+	__trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP);
 }
 
 static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d874dec87131..0aa6e6faa943 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -96,8 +96,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
 
 int __trace_graph_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
-				unsigned long flags,
-				int pc)
+				unsigned int trace_ctx)
 {
 	struct trace_event_call *call = &event_funcgraph_entry;
 	struct ring_buffer_event *event;
@@ -105,7 +104,7 @@ int __trace_graph_entry(struct trace_array *tr,
 	struct ftrace_graph_ent_entry *entry;
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
-					  sizeof(*entry), flags, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event)
 		return 0;
 	entry	= ring_buffer_event_data(event);
@@ -129,10 +128,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	struct trace_array *tr = graph_array;
 	struct trace_array_cpu *data;
 	unsigned long flags;
+	unsigned int trace_ctx;
 	long disabled;
 	int ret;
 	int cpu;
-	int pc;
 
 	if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
 		return 0;
@@ -174,8 +173,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	data = per_cpu_ptr(tr->array_buffer.data, cpu);
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
-		pc = preempt_count();
-		ret = __trace_graph_entry(tr, trace, flags, pc);
+		trace_ctx = tracing_gen_ctx_flags(flags);
+		ret = __trace_graph_entry(tr, trace, trace_ctx);
 	} else {
 		ret = 0;
 	}
@@ -188,7 +187,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 
 static void
 __trace_graph_function(struct trace_array *tr,
-		unsigned long ip, unsigned long flags, int pc)
+		unsigned long ip, unsigned int trace_ctx)
 {
 	u64 time = trace_clock_local();
 	struct ftrace_graph_ent ent = {
@@ -202,22 +201,21 @@ __trace_graph_function(struct trace_array *tr,
 		.rettime  = time,
 	};
 
-	__trace_graph_entry(tr, &ent, flags, pc);
-	__trace_graph_return(tr, &ret, flags, pc);
+	__trace_graph_entry(tr, &ent, trace_ctx);
+	__trace_graph_return(tr, &ret, trace_ctx);
 }
 
 void
 trace_graph_function(struct trace_array *tr,
 		unsigned long ip, unsigned long parent_ip,
-		unsigned long flags, int pc)
+		unsigned int trace_ctx)
 {
-	__trace_graph_function(tr, ip, flags, pc);
+	__trace_graph_function(tr, ip, trace_ctx);
 }
 
 void __trace_graph_return(struct trace_array *tr,
 				struct ftrace_graph_ret *trace,
-				unsigned long flags,
-				int pc)
+				unsigned int trace_ctx)
 {
 	struct trace_event_call *call = &event_funcgraph_exit;
 	struct ring_buffer_event *event;
@@ -225,7 +223,7 @@ void __trace_graph_return(struct trace_array *tr,
 	struct ftrace_graph_ret_entry *entry;
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
-					  sizeof(*entry), flags, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -239,9 +237,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 	struct trace_array *tr = graph_array;
 	struct trace_array_cpu *data;
 	unsigned long flags;
+	unsigned int trace_ctx;
 	long disabled;
 	int cpu;
-	int pc;
 
 	ftrace_graph_addr_finish(trace);
 
@@ -255,8 +253,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 	data = per_cpu_ptr(tr->array_buffer.data, cpu);
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
-		pc = preempt_count();
-		__trace_graph_return(tr, trace, flags, pc);
+		trace_ctx = tracing_gen_ctx_flags(flags);
+		__trace_graph_return(tr, trace, trace_ctx);
 	}
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index c0df9b97f147..34dc1a712dcb 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -108,14 +108,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct hwlat_entry *entry;
-	unsigned long flags;
-	int pc;
-
-	pc = preempt_count();
-	local_save_flags(flags);
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
-					  flags, pc);
+					  tracing_gen_ctx());
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 6756379b661f..590b3d51afae 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -143,11 +143,14 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
+	unsigned int trace_ctx;
 
 	if (!func_prolog_dec(tr, &data, &flags))
 		return;
 
-	trace_function(tr, ip, parent_ip, flags, preempt_count());
+	trace_ctx = tracing_gen_ctx_flags(flags);
+
+	trace_function(tr, ip, parent_ip, trace_ctx);
 
 	atomic_dec(&data->disabled);
 }
@@ -177,8 +180,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
+	unsigned int trace_ctx;
 	int ret;
-	int pc;
 
 	if (ftrace_graph_ignore_func(trace))
 		return 0;
@@ -195,8 +198,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
 	if (!func_prolog_dec(tr, &data, &flags))
 		return 0;
 
-	pc = preempt_count();
-	ret = __trace_graph_entry(tr, trace, flags, pc);
+	trace_ctx = tracing_gen_ctx_flags(flags);
+	ret = __trace_graph_entry(tr, trace, trace_ctx);
 	atomic_dec(&data->disabled);
 
 	return ret;
@@ -207,15 +210,15 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
-	int pc;
+	unsigned int trace_ctx;
 
 	ftrace_graph_addr_finish(trace);
 
 	if (!func_prolog_dec(tr, &data, &flags))
 		return;
 
-	pc = preempt_count();
-	__trace_graph_return(tr, trace, flags, pc);
+	trace_ctx = tracing_gen_ctx_flags(flags);
+	__trace_graph_return(tr, trace, trace_ctx);
 	atomic_dec(&data->disabled);
 }
 
@@ -267,12 +270,12 @@ static void irqsoff_print_header(struct seq_file *s)
 static void
 __trace_function(struct trace_array *tr,
 		 unsigned long ip, unsigned long parent_ip,
-		 unsigned long flags, int pc)
+		 unsigned int trace_ctx)
 {
 	if (is_graph(tr))
-		trace_graph_function(tr, ip, parent_ip, flags, pc);
+		trace_graph_function(tr, ip, parent_ip, trace_ctx);
 	else
-		trace_function(tr, ip, parent_ip, flags, pc);
+		trace_function(tr, ip, parent_ip, trace_ctx);
 }
 
 #else
@@ -322,15 +325,13 @@ check_critical_timing(struct trace_array *tr,
 {
 	u64 T0, T1, delta;
 	unsigned long flags;
-	int pc;
+	unsigned int trace_ctx;
 
 	T0 = data->preempt_timestamp;
 	T1 = ftrace_now(cpu);
 	delta = T1-T0;
 
-	local_save_flags(flags);
-
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 
 	if (!report_latency(tr, delta))
 		goto out;
@@ -341,9 +342,9 @@ check_critical_timing(struct trace_array *tr,
 	if (!report_latency(tr, delta))
 		goto out_unlock;
 
-	__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+	__trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx);
 	/* Skip 5 functions to get to the irq/preempt enable function */
-	__trace_stack(tr, flags, 5, pc);
+	__trace_stack(tr, trace_ctx, 5);
 
 	if (data->critical_sequence != max_sequence)
 		goto out_unlock;
@@ -363,16 +364,15 @@ out_unlock:
 out:
 	data->critical_sequence = max_sequence;
 	data->preempt_timestamp = ftrace_now(cpu);
-	__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+	__trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx);
 }
 
 static nokprobe_inline void
-start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
 {
 	int cpu;
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
-	unsigned long flags;
 
 	if (!tracer_enabled || !tracing_is_enabled())
 		return;
@@ -393,9 +393,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
 	data->preempt_timestamp = ftrace_now(cpu);
 	data->critical_start = parent_ip ? : ip;
 
-	local_save_flags(flags);
-
-	__trace_function(tr, ip, parent_ip, flags, pc);
+	__trace_function(tr, ip, parent_ip, tracing_gen_ctx());
 
 	per_cpu(tracing_cpu, cpu) = 1;
 
@@ -403,12 +401,12 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
 }
 
 static nokprobe_inline void
-stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 {
 	int cpu;
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
-	unsigned long flags;
+	unsigned int trace_ctx;
 
 	cpu = raw_smp_processor_id();
 	/* Always clear the tracing cpu on stopping the trace */
@@ -428,8 +426,8 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
 
 	atomic_inc(&data->disabled);
 
-	local_save_flags(flags);
-	__trace_function(tr, ip, parent_ip, flags, pc);
+	trace_ctx = tracing_gen_ctx();
+	__trace_function(tr, ip, parent_ip, trace_ctx);
 	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
 	data->critical_start = 0;
 	atomic_dec(&data->disabled);
@@ -438,20 +436,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
 /* start and stop critical timings used to for stoppage (in idle) */
 void start_critical_timings(void)
 {
-	int pc = preempt_count();
-
-	if (preempt_trace(pc) || irq_trace())
-		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
+	if (preempt_trace(preempt_count()) || irq_trace())
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 EXPORT_SYMBOL_GPL(start_critical_timings);
 NOKPROBE_SYMBOL(start_critical_timings);
 
 void stop_critical_timings(void)
 {
-	int pc = preempt_count();
-
-	if (preempt_trace(pc) || irq_trace())
-		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
+	if (preempt_trace(preempt_count()) || irq_trace())
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 EXPORT_SYMBOL_GPL(stop_critical_timings);
 NOKPROBE_SYMBOL(stop_critical_timings);
@@ -613,19 +607,15 @@ static void irqsoff_tracer_stop(struct trace_array *tr)
  */
 void tracer_hardirqs_on(unsigned long a0, unsigned long a1)
 {
-	unsigned int pc = preempt_count();
-
-	if (!preempt_trace(pc) && irq_trace())
-		stop_critical_timing(a0, a1, pc);
+	if (!preempt_trace(preempt_count()) && irq_trace())
+		stop_critical_timing(a0, a1);
 }
 NOKPROBE_SYMBOL(tracer_hardirqs_on);
 
 void tracer_hardirqs_off(unsigned long a0, unsigned long a1)
 {
-	unsigned int pc = preempt_count();
-
-	if (!preempt_trace(pc) && irq_trace())
-		start_critical_timing(a0, a1, pc);
+	if (!preempt_trace(preempt_count()) && irq_trace())
+		start_critical_timing(a0, a1);
 }
 NOKPROBE_SYMBOL(tracer_hardirqs_off);
 
@@ -665,18 +655,14 @@ static struct tracer irqsoff_tracer __read_mostly =
 #ifdef CONFIG_PREEMPT_TRACER
 void tracer_preempt_on(unsigned long a0, unsigned long a1)
 {
-	int pc = preempt_count();
-
-	if (preempt_trace(pc) && !irq_trace())
-		stop_critical_timing(a0, a1, pc);
+	if (preempt_trace(preempt_count()) && !irq_trace())
+		stop_critical_timing(a0, a1);
 }
 
 void tracer_preempt_off(unsigned long a0, unsigned long a1)
 {
-	int pc = preempt_count();
-
-	if (preempt_trace(pc) && !irq_trace())
-		start_critical_timing(a0, a1, pc);
+	if (preempt_trace(preempt_count()) && !irq_trace())
+		start_critical_timing(a0, a1);
 }
 
 static int preemptoff_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 56c7fbff7bd7..f6c459aba8a6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1386,8 +1386,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
 	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
-	local_save_flags(fbuffer.flags);
-	fbuffer.pc = preempt_count();
+	fbuffer.trace_ctx = tracing_gen_ctx();
 	fbuffer.trace_file = trace_file;
 
 	dsize = __get_data_size(&tk->tp, regs);
@@ -1396,7 +1395,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
 		trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
 					call->event.type,
 					sizeof(*entry) + tk->tp.size + dsize,
-					fbuffer.flags, fbuffer.pc);
+					fbuffer.trace_ctx);
 	if (!fbuffer.event)
 		return;
 
@@ -1434,8 +1433,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
-	local_save_flags(fbuffer.flags);
-	fbuffer.pc = preempt_count();
+	fbuffer.trace_ctx = tracing_gen_ctx();
 	fbuffer.trace_file = trace_file;
 
 	dsize = __get_data_size(&tk->tp, regs);
@@ -1443,7 +1441,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
 					call->event.type,
 					sizeof(*entry) + tk->tp.size + dsize,
-					fbuffer.flags, fbuffer.pc);
+					fbuffer.trace_ctx);
 	if (!fbuffer.event)
 		return;
 
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 84582bf1ed5f..7221ae0b4c47 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -300,10 +300,11 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_rw *entry;
-	int pc = preempt_count();
+	unsigned int trace_ctx;
 
+	trace_ctx = tracing_gen_ctx_flags(0);
 	event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
-					  sizeof(*entry), 0, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event) {
 		atomic_inc(&dropped_count);
 		return;
@@ -312,7 +313,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 	entry->rw			= *rw;
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
+		trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
 }
 
 void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -330,10 +331,11 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_map *entry;
-	int pc = preempt_count();
+	unsigned int trace_ctx;
 
+	trace_ctx = tracing_gen_ctx_flags(0);
 	event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
-					  sizeof(*entry), 0, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event) {
 		atomic_inc(&dropped_count);
 		return;
@@ -342,7 +344,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 	entry->map			= *map;
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
+		trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
 }
 
 void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index c0181066dbe9..e5778d1d7a5b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -67,7 +67,7 @@ static bool function_enabled;
 static int
 func_prolog_preempt_disable(struct trace_array *tr,
 			    struct trace_array_cpu **data,
-			    int *pc)
+			    unsigned int *trace_ctx)
 {
 	long disabled;
 	int cpu;
@@ -75,7 +75,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
 	if (likely(!wakeup_task))
 		return 0;
 
-	*pc = preempt_count();
+	*trace_ctx = tracing_gen_ctx();
 	preempt_disable_notrace();
 
 	cpu = raw_smp_processor_id();
@@ -116,8 +116,8 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
 {
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
-	unsigned long flags;
-	int pc, ret = 0;
+	unsigned int trace_ctx;
+	int ret = 0;
 
 	if (ftrace_graph_ignore_func(trace))
 		return 0;
@@ -131,11 +131,10 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
 	if (ftrace_graph_notrace_addr(trace->func))
 		return 1;
 
-	if (!func_prolog_preempt_disable(tr, &data, &pc))
+	if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
 		return 0;
 
-	local_save_flags(flags);
-	ret = __trace_graph_entry(tr, trace, flags, pc);
+	ret = __trace_graph_entry(tr, trace, trace_ctx);
 	atomic_dec(&data->disabled);
 	preempt_enable_notrace();
 
@@ -146,16 +145,14 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)
 {
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
-	unsigned long flags;
-	int pc;
+	unsigned int trace_ctx;
 
 	ftrace_graph_addr_finish(trace);
 
-	if (!func_prolog_preempt_disable(tr, &data, &pc))
+	if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
 		return;
 
-	local_save_flags(flags);
-	__trace_graph_return(tr, trace, flags, pc);
+	__trace_graph_return(tr, trace, trace_ctx);
 	atomic_dec(&data->disabled);
 
 	preempt_enable_notrace();
@@ -217,13 +214,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
-	int pc;
+	unsigned int trace_ctx;
 
-	if (!func_prolog_preempt_disable(tr, &data, &pc))
+	if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
 		return;
 
 	local_irq_save(flags);
-	trace_function(tr, ip, parent_ip, flags, pc);
+	trace_function(tr, ip, parent_ip, trace_ctx);
 	local_irq_restore(flags);
 
 	atomic_dec(&data->disabled);
@@ -303,12 +300,12 @@ static void wakeup_print_header(struct seq_file *s)
 static void
 __trace_function(struct trace_array *tr,
 		 unsigned long ip, unsigned long parent_ip,
-		 unsigned long flags, int pc)
+		 unsigned int trace_ctx)
 {
 	if (is_graph(tr))
-		trace_graph_function(tr, ip, parent_ip, flags, pc);
+		trace_graph_function(tr, ip, parent_ip, trace_ctx);
 	else
-		trace_function(tr, ip, parent_ip, flags, pc);
+		trace_function(tr, ip, parent_ip, trace_ctx);
 }
 
 static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -375,7 +372,7 @@ static void
 tracing_sched_switch_trace(struct trace_array *tr,
 			   struct task_struct *prev,
 			   struct task_struct *next,
-			   unsigned long flags, int pc)
+			   unsigned int trace_ctx)
 {
 	struct trace_event_call *call = &event_context_switch;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -383,7 +380,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	struct ctx_switch_entry *entry;
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
-					  sizeof(*entry), flags, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -396,14 +393,14 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_cpu	= task_cpu(next);
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
+		trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
 }
 
 static void
 tracing_sched_wakeup_trace(struct trace_array *tr,
 			   struct task_struct *wakee,
 			   struct task_struct *curr,
-			   unsigned long flags, int pc)
+			   unsigned int trace_ctx)
 {
 	struct trace_event_call *call = &event_wakeup;
 	struct ring_buffer_event *event;
@@ -411,7 +408,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
-					  sizeof(*entry), flags, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -424,7 +421,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_cpu			= task_cpu(wakee);
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
+		trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
 }
 
 static void notrace
@@ -436,7 +433,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
 	unsigned long flags;
 	long disabled;
 	int cpu;
-	int pc;
+	unsigned int trace_ctx;
 
 	tracing_record_cmdline(prev);
 
@@ -455,8 +452,6 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
 	if (next != wakeup_task)
 		return;
 
-	pc = preempt_count();
-
 	/* disable local data, not wakeup_cpu data */
 	cpu = raw_smp_processor_id();
 	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
@@ -464,6 +459,8 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
 		goto out;
 
 	local_irq_save(flags);
+	trace_ctx = tracing_gen_ctx_flags(flags);
+
 	arch_spin_lock(&wakeup_lock);
 
 	/* We could race with grabbing wakeup_lock */
@@ -473,9 +470,9 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
 	/* The task we are waiting for is waking up */
 	data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
 
-	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
-	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
-	__trace_stack(wakeup_trace, flags, 0, pc);
+	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, trace_ctx);
+	tracing_sched_switch_trace(wakeup_trace, prev, next, trace_ctx);
+	__trace_stack(wakeup_trace, trace_ctx, 0);
 
 	T0 = data->preempt_timestamp;
 	T1 = ftrace_now(cpu);
@@ -527,9 +524,8 @@ probe_wakeup(void *ignore, struct task_struct *p)
 {
 	struct trace_array_cpu *data;
 	int cpu = smp_processor_id();
-	unsigned long flags;
 	long disabled;
-	int pc;
+	unsigned int trace_ctx;
 
 	if (likely(!tracer_enabled))
 		return;
@@ -550,11 +546,12 @@ probe_wakeup(void *ignore, struct task_struct *p)
 	    (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
 		return;
 
-	pc = preempt_count();
 	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
 	if (unlikely(disabled != 1))
 		goto out;
 
+	trace_ctx = tracing_gen_ctx();
+
 	/* interrupts should be off from try_to_wake_up */
 	arch_spin_lock(&wakeup_lock);
 
@@ -581,19 +578,17 @@ probe_wakeup(void *ignore, struct task_struct *p)
 
 	wakeup_task = get_task_struct(p);
 
-	local_save_flags(flags);
-
 	data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
 	data->preempt_timestamp = ftrace_now(cpu);
-	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
-	__trace_stack(wakeup_trace, flags, 0, pc);
+	tracing_sched_wakeup_trace(wakeup_trace, p, current, trace_ctx);
+	__trace_stack(wakeup_trace, trace_ctx, 0);
 
 	/*
 	 * We must be careful in using CALLER_ADDR2. But since wake_up
 	 * is not called by an assembly function  (where as schedule is)
 	 * it should be safe to use it here.
 	 */
-	__trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	__trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, trace_ctx);
 
 out_locked:
 	arch_spin_unlock(&wakeup_lock);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d85a2f0f316b..8bfcd3b09422 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -298,9 +298,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	struct syscall_metadata *sys_data;
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
-	unsigned long irq_flags;
+	unsigned int trace_ctx;
 	unsigned long args[6];
-	int pc;
 	int syscall_nr;
 	int size;
 
@@ -322,12 +321,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 
 	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
-	local_save_flags(irq_flags);
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 
 	buffer = tr->array_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer,
-			sys_data->enter_event->event.type, size, irq_flags, pc);
+			sys_data->enter_event->event.type, size, trace_ctx);
 	if (!event)
 		return;
 
@@ -337,7 +335,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
 
 	event_trigger_unlock_commit(trace_file, buffer, event, entry,
-				    irq_flags, pc);
+				    trace_ctx);
 }
 
 static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -348,8 +346,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	struct syscall_metadata *sys_data;
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
-	unsigned long irq_flags;
-	int pc;
+	unsigned int trace_ctx;
 	int syscall_nr;
 
 	syscall_nr = trace_get_syscall_nr(current, regs);
@@ -368,13 +365,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	local_save_flags(irq_flags);
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 
 	buffer = tr->array_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer,
 			sys_data->exit_event->event.type, sizeof(*entry),
-			irq_flags, pc);
+			trace_ctx);
 	if (!event)
 		return;
 
@@ -383,7 +379,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	entry->ret = syscall_get_return_value(current, regs);
 
 	event_trigger_unlock_commit(trace_file, buffer, event, entry,
-				    irq_flags, pc);
+				    trace_ctx);
 }
 
 static int reg_event_syscall_enter(struct trace_event_file *file,
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3cf7128e1ad3..a1ed96a7a462 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -961,7 +961,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
 	size = esize + tu->tp.size + dsize;
 	event = trace_event_buffer_lock_reserve(&buffer, trace_file,
-						call->event.type, size, 0, 0);
+						call->event.type, size, 0);
 	if (!event)
 		return;
 
@@ -977,7 +977,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 
 	memcpy(data, ucb->buf, tu->tp.size + dsize);
 
-	event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0);
+	event_trigger_unlock_commit(trace_file, buffer, event, entry, 0);
 }
 
 /* uprobe handler */
-- 
cgit v1.2.3


From 0c02006e6f5b0a3e73499bbf5943d9174c5ed640 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Jan 2021 20:45:09 +0100
Subject: tracing: Inline tracing_gen_ctx_flags()

Inline tracing_gen_ctx_flags(). This allows to have one ifdef
CONFIG_TRACE_IRQFLAGS_SUPPORT.

This requires to move `trace_flag_type' so tracing_gen_ctx_flags() can
use it.

Link: https://lkml.kernel.org/r/20210125194511.3924915-3-bigeasy@linutronix.de

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/20210125140323.6b1ff20c@gandalf.local.home
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 54 +++++++++++++++++++++++++++++++++++++++++---
 kernel/trace/trace.c         | 38 ++-----------------------------
 kernel/trace/trace.h         | 19 ----------------
 3 files changed, 53 insertions(+), 58 deletions(-)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 091250b0895a..67ae708de40d 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -160,9 +160,57 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry,
 	entry->flags =			trace_ctx >> 16;
 }
 
-unsigned int tracing_gen_ctx_flags(unsigned long irqflags);
-unsigned int tracing_gen_ctx(void);
-unsigned int tracing_gen_ctx_dec(void);
+unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status);
+
+enum trace_flag_type {
+	TRACE_FLAG_IRQS_OFF		= 0x01,
+	TRACE_FLAG_IRQS_NOSUPPORT	= 0x02,
+	TRACE_FLAG_NEED_RESCHED		= 0x04,
+	TRACE_FLAG_HARDIRQ		= 0x08,
+	TRACE_FLAG_SOFTIRQ		= 0x10,
+	TRACE_FLAG_PREEMPT_RESCHED	= 0x20,
+	TRACE_FLAG_NMI			= 0x40,
+};
+
+#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
+static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
+{
+	unsigned int irq_status = irqs_disabled_flags(irqflags) ?
+		TRACE_FLAG_IRQS_OFF : 0;
+	return tracing_gen_ctx_irq_test(irq_status);
+}
+static inline unsigned int tracing_gen_ctx(void)
+{
+	unsigned long irqflags;
+
+	local_save_flags(irqflags);
+	return tracing_gen_ctx_flags(irqflags);
+}
+#else
+
+static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
+{
+	return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT);
+}
+static inline unsigned int tracing_gen_ctx(void)
+{
+	return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT);
+}
+#endif
+
+static inline unsigned int tracing_gen_ctx_dec(void)
+{
+	unsigned int trace_ctx;
+
+	trace_ctx = tracing_gen_ctx();
+	/*
+	 * Subtract one from the preeption counter if preemption is enabled,
+	 * see trace_event_buffer_reserve()for details.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPTION))
+		trace_ctx--;
+	return trace_ctx;
+}
 
 struct trace_event_file;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0b3cce6ecf52..584fa2a1304a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2579,20 +2579,13 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
 }
 EXPORT_SYMBOL_GPL(trace_handle_return);
 
-unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
+unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
 {
-	unsigned int trace_flags = 0;
+	unsigned int trace_flags = irqs_status;
 	unsigned int pc;
 
 	pc = preempt_count();
 
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
-	if (irqs_disabled_flags(irqflags))
-		trace_flags |= TRACE_FLAG_IRQS_OFF;
-#else
-	trace_flags |= TRACE_FLAG_IRQS_NOSUPPORT;
-#endif
-
 	if (pc & NMI_MASK)
 		trace_flags |= TRACE_FLAG_NMI;
 	if (pc & HARDIRQ_MASK)
@@ -2608,33 +2601,6 @@ unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
 	return (trace_flags << 16) | (pc & 0xff);
 }
 
-unsigned int tracing_gen_ctx(void)
-{
-	unsigned long irqflags;
-
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
-	local_save_flags(irqflags);
-#else
-	irqflags = 0;
-#endif
-	return tracing_gen_ctx_flags(irqflags);
-}
-
-unsigned int tracing_gen_ctx_dec(void)
-{
-	unsigned int trace_ctx;
-
-	trace_ctx = tracing_gen_ctx();
-
-	/*
-	 * Subtract one from the preeption counter if preemption is enabled,
-	 * see trace_event_buffer_reserve()for details.
-	 */
-	if (IS_ENABLED(CONFIG_PREEMPTION))
-		trace_ctx--;
-	return trace_ctx;
-}
-
 struct ring_buffer_event *
 trace_buffer_lock_reserve(struct trace_buffer *buffer,
 			  int type,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8daf3a0758b1..93fb08ab8bb6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -136,25 +136,6 @@ struct kretprobe_trace_entry_head {
 	unsigned long		ret_ip;
 };
 
-/*
- * trace_flag_type is an enumeration that holds different
- * states when a trace occurs. These are:
- *  IRQS_OFF		- interrupts were disabled
- *  IRQS_NOSUPPORT	- arch does not support irqs_disabled_flags
- *  NEED_RESCHED	- reschedule is requested
- *  HARDIRQ		- inside an interrupt handler
- *  SOFTIRQ		- inside a softirq handler
- */
-enum trace_flag_type {
-	TRACE_FLAG_IRQS_OFF		= 0x01,
-	TRACE_FLAG_IRQS_NOSUPPORT	= 0x02,
-	TRACE_FLAG_NEED_RESCHED		= 0x04,
-	TRACE_FLAG_HARDIRQ		= 0x08,
-	TRACE_FLAG_SOFTIRQ		= 0x10,
-	TRACE_FLAG_PREEMPT_RESCHED	= 0x20,
-	TRACE_FLAG_NMI			= 0x40,
-};
-
 #define TRACE_BUF_SIZE		1024
 
 struct trace_array;
-- 
cgit v1.2.3


From fe427886bf41279085e0707cced41150dbcd8512 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Jan 2021 20:45:10 +0100
Subject: tracing: Use in_serving_softirq() to deduct softirq status.

PREEMPT_RT does not report "serving softirq" because the tracing core
looks at the preemption counter while PREEMPT_RT does not update it
while processing softirqs in order to remain preemptible. The
information is stored somewhere else.
The in_serving_softirq() macro and the SOFTIRQ_OFFSET define are still
working but not on the preempt-counter.

Use in_serving_softirq() macro which works on PREEMPT_RT. On !PREEMPT_RT
the compiler (gcc-10 / clang-11) is smart enough to optimize the
in_serving_softirq() related read of the preemption counter away.
The only difference I noticed by using in_serving_softirq() on
!PREEMPT_RT is that gcc-10 implemented tracing_gen_ctx_flags() as
reading FLAG, jmp _tracing_gen_ctx_flags(). Without in_serving_softirq()
it inlined _tracing_gen_ctx_flags() into tracing_gen_ctx_flags().

Link: https://lkml.kernel.org/r/20210125194511.3924915-4-bigeasy@linutronix.de

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 584fa2a1304a..75620c29e904 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2590,8 +2590,7 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
 		trace_flags |= TRACE_FLAG_NMI;
 	if (pc & HARDIRQ_MASK)
 		trace_flags |= TRACE_FLAG_HARDIRQ;
-
-	if (pc & SOFTIRQ_OFFSET)
+	if (in_serving_softirq())
 		trace_flags |= TRACE_FLAG_SOFTIRQ;
 
 	if (tif_need_resched())
-- 
cgit v1.2.3


From 5817708493bec547914d23dcdd064919ac409f33 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Jan 2021 20:45:11 +0100
Subject: tracing: Remove NULL check from current in
 tracing_generic_entry_update().

I can't imagine when or why `current' would return a NULL pointer. This
check was added in commit
      72829bc3d63cd ("ftrace: move enums to ftrace.h and make helper function global")

but it doesn't give me hint why it was needed.

Assume `current' never returns a NULL pointer and remove the check.

Link: https://lkml.kernel.org/r/20210125194511.3924915-5-bigeasy@linutronix.de

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 67ae708de40d..5d1eeac4bfbe 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -152,10 +152,8 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry,
 						unsigned short type,
 						unsigned int trace_ctx)
 {
-	struct task_struct *tsk = current;
-
 	entry->preempt_count		= trace_ctx & 0xff;
-	entry->pid			= (tsk) ? tsk->pid : 0;
+	entry->pid			= current->pid;
 	entry->type			= type;
 	entry->flags =			trace_ctx >> 16;
 }
-- 
cgit v1.2.3


From 28cc65a173819ccb049d6335ebe82c1be054e9bb Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 16 Dec 2020 11:40:51 +0000
Subject: tracing: Fix spelling mistake in Kconfig "infinit" -> "infinite"

There is a spelling mistake in the Kconfig help text. Fix it.

Link: https://lkml.kernel.org/r/20201216114051.12056-1-colin.king@canonical.com

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index c1a62ae7e812..4f976f8d9a38 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -545,7 +545,7 @@ config KPROBE_EVENTS_ON_NOTRACE
 	  using kprobe events.
 
 	  If kprobes can use ftrace instead of breakpoint, ftrace related
-	  functions are protected from kprobe-events to prevent an infinit
+	  functions are protected from kprobe-events to prevent an infinite
 	  recursion or any unexpected execution path which leads to a kernel
 	  crash.
 
-- 
cgit v1.2.3


From 39bcdd6a964b2d80fcec2f70f11896b1db6fb572 Mon Sep 17 00:00:00 2001
From: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Date: Tue, 12 Jan 2021 10:20:08 +0530
Subject: tracing: Fix spelling of controlling in uprobes

s/controling/controlling/p

Link: https://lkml.kernel.org/r/20210112045008.29834-1-unixbhaskar@gmail.com

Signed-off-by: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_uprobe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a1ed96a7a462..9d9440303075 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1635,7 +1635,7 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call)
 }
 #endif /* CONFIG_PERF_EVENTS */
 
-/* Make a trace interface for controling probe points */
+/* Make a trace interface for controlling probe points */
 static __init int init_uprobe_trace(void)
 {
 	int ret;
-- 
cgit v1.2.3


From 557d50e79df7fe527bd5f93b26cf2bcdaaabce7a Mon Sep 17 00:00:00 2001
From: Bean Huo <beanhuo@micron.com>
Date: Tue, 12 Jan 2021 12:12:02 +0100
Subject: tracing: Fix a kernel doc warning

Add description for trace_array_put() parameter.

kernel/trace/trace.c:464: warning: Function parameter or member 'this_tr' not described in 'trace_array_put'

Link: https://lkml.kernel.org/r/20210112111202.23508-1-huobean@gmail.com

Signed-off-by: Bean Huo <beanhuo@micron.com>
[ Merged as one of the original fixes was already fixed by someone else ]
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 75620c29e904..7fd432334ff5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -454,6 +454,7 @@ static void __trace_array_put(struct trace_array *this_tr)
 
 /**
  * trace_array_put - Decrement the reference counter for this trace array.
+ * @this_tr : pointer to the trace array
  *
  * NOTE: Use this when we no longer need the trace array returned by
  * trace_array_get_by_name(). This ensures the trace array can be later
-- 
cgit v1.2.3


From f2a99ddfd0aaff5f5c53ea1f652b5160ba5ee9b7 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Fri, 15 Jan 2021 07:33:48 -0800
Subject: tracing: Remove definition of DEBUG in trace_mmiotrace.c

Defining DEBUG should only be done in development.
So remove DEBUG.

Link: https://lkml.kernel.org/r/20210115153348.131791-1-trix@redhat.com

Signed-off-by: Tom Rix <trix@redhat.com>
Reviewed-by: Karol Herbst <kherbst@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_mmiotrace.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 7221ae0b4c47..64e77b513697 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -5,8 +5,6 @@
  * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
  */
 
-#define DEBUG 1
-
 #include <linux/kernel.h>
 #include <linux/mmiotrace.h>
 #include <linux/pci.h>
-- 
cgit v1.2.3


From befe6d946551d65cddbd32b9cb0170b0249fd5ed Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 18 Nov 2020 09:34:05 -0500
Subject: tracepoint: Do not fail unregistering a probe due to memory failure

The list of tracepoint callbacks is managed by an array that is protected
by RCU. To update this array, a new array is allocated, the updates are
copied over to the new array, and then the list of functions for the
tracepoint is switched over to the new array. After a completion of an RCU
grace period, the old array is freed.

This process happens for both adding a callback as well as removing one.
But on removing a callback, if the new array fails to be allocated, the
callback is not removed, and may be used after it is freed by the clients
of the tracepoint.

There's really no reason to fail if the allocation for a new array fails
when removing a function. Instead, the function can simply be replaced by a
stub function that could be cleaned up on the next modification of the
array. That is, instead of calling the function registered to the
tracepoint, it would call a stub function in its place.

Link: https://lore.kernel.org/r/20201115055256.65625-1-mmullins@mmlx.us
Link: https://lore.kernel.org/r/20201116175107.02db396d@gandalf.local.home
Link: https://lore.kernel.org/r/20201117211836.54acaef2@oasis.local.home
Link: https://lkml.kernel.org/r/20201118093405.7a6d2290@gandalf.local.home

[ Note, this version does use undefined compiler behavior (assuming that
  a stub function with no parameters or return, can be called by a location
  that thinks it has parameters but still no return value. Static calls
  do the same thing, so this trick is not without precedent.

  There's another solution that uses RCU tricks and is more complex, but
  can be an alternative if this solution becomes an issue.

  Link: https://lore.kernel.org/lkml/20210127170721.58bce7cc@gandalf.local.home/
]

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: Andrii Nakryiko <andriin@fb.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@chromium.org>
Cc: netdev <netdev@vger.kernel.org>
Cc: bpf <bpf@vger.kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Florian Weimer <fw@deneb.enyo.de>
Fixes: 97e1c18e8d17b ("tracing: Kernel Tracepoints")
Reported-by: syzbot+83aa762ef23b6f0d1991@syzkaller.appspotmail.com
Reported-by: syzbot+d29e58bb557324e55e5e@syzkaller.appspotmail.com
Reported-by: Matt Mullins <mmullins@mmlx.us>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Tested-by: Matt Mullins <mmullins@mmlx.us>
---
 kernel/tracepoint.c | 80 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 64 insertions(+), 16 deletions(-)

diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 7261fa0f5e3c..e8f20ae29c18 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -53,6 +53,12 @@ struct tp_probes {
 	struct tracepoint_func probes[];
 };
 
+/* Called in removal of a func but failed to allocate a new tp_funcs */
+static void tp_stub_func(void)
+{
+	return;
+}
+
 static inline void *allocate_probes(int count)
 {
 	struct tp_probes *p  = kmalloc(struct_size(p, probes, count),
@@ -131,6 +137,7 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
 {
 	struct tracepoint_func *old, *new;
 	int nr_probes = 0;
+	int stub_funcs = 0;
 	int pos = -1;
 
 	if (WARN_ON(!tp_func->func))
@@ -147,14 +154,34 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
 			if (old[nr_probes].func == tp_func->func &&
 			    old[nr_probes].data == tp_func->data)
 				return ERR_PTR(-EEXIST);
+			if (old[nr_probes].func == tp_stub_func)
+				stub_funcs++;
 		}
 	}
-	/* + 2 : one for new probe, one for NULL func */
-	new = allocate_probes(nr_probes + 2);
+	/* + 2 : one for new probe, one for NULL func - stub functions */
+	new = allocate_probes(nr_probes + 2 - stub_funcs);
 	if (new == NULL)
 		return ERR_PTR(-ENOMEM);
 	if (old) {
-		if (pos < 0) {
+		if (stub_funcs) {
+			/* Need to copy one at a time to remove stubs */
+			int probes = 0;
+
+			pos = -1;
+			for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+				if (old[nr_probes].func == tp_stub_func)
+					continue;
+				if (pos < 0 && old[nr_probes].prio < prio)
+					pos = probes++;
+				new[probes++] = old[nr_probes];
+			}
+			nr_probes = probes;
+			if (pos < 0)
+				pos = probes;
+			else
+				nr_probes--; /* Account for insertion */
+
+		} else if (pos < 0) {
 			pos = nr_probes;
 			memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
 		} else {
@@ -188,8 +215,9 @@ static void *func_remove(struct tracepoint_func **funcs,
 	/* (N -> M), (N > 1, M >= 0) probes */
 	if (tp_func->func) {
 		for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
-			if (old[nr_probes].func == tp_func->func &&
-			     old[nr_probes].data == tp_func->data)
+			if ((old[nr_probes].func == tp_func->func &&
+			     old[nr_probes].data == tp_func->data) ||
+			    old[nr_probes].func == tp_stub_func)
 				nr_del++;
 		}
 	}
@@ -208,14 +236,32 @@ static void *func_remove(struct tracepoint_func **funcs,
 		/* N -> M, (N > 1, M > 0) */
 		/* + 1 for NULL */
 		new = allocate_probes(nr_probes - nr_del + 1);
-		if (new == NULL)
-			return ERR_PTR(-ENOMEM);
-		for (i = 0; old[i].func; i++)
-			if (old[i].func != tp_func->func
-					|| old[i].data != tp_func->data)
-				new[j++] = old[i];
-		new[nr_probes - nr_del].func = NULL;
-		*funcs = new;
+		if (new) {
+			for (i = 0; old[i].func; i++)
+				if ((old[i].func != tp_func->func
+				     || old[i].data != tp_func->data)
+				    && old[i].func != tp_stub_func)
+					new[j++] = old[i];
+			new[nr_probes - nr_del].func = NULL;
+			*funcs = new;
+		} else {
+			/*
+			 * Failed to allocate, replace the old function
+			 * with calls to tp_stub_func.
+			 */
+			for (i = 0; old[i].func; i++)
+				if (old[i].func == tp_func->func &&
+				    old[i].data == tp_func->data) {
+					old[i].func = tp_stub_func;
+					/* Set the prio to the next event. */
+					if (old[i + 1].func)
+						old[i].prio =
+							old[i + 1].prio;
+					else
+						old[i].prio = -1;
+				}
+			*funcs = old;
+		}
 	}
 	debug_print_probes(*funcs);
 	return old;
@@ -295,10 +341,12 @@ static int tracepoint_remove_func(struct tracepoint *tp,
 	tp_funcs = rcu_dereference_protected(tp->funcs,
 			lockdep_is_held(&tracepoints_mutex));
 	old = func_remove(&tp_funcs, func);
-	if (IS_ERR(old)) {
-		WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
+	if (WARN_ON_ONCE(IS_ERR(old)))
 		return PTR_ERR(old);
-	}
+
+	if (tp_funcs == old)
+		/* Failed allocating new tp_funcs, replaced func with stub */
+		return 0;
 
 	if (!tp_funcs) {
 		/* Removed last function */
-- 
cgit v1.2.3


From 4b9091e1c1948dea3b0b097496f308ede897d665 Mon Sep 17 00:00:00 2001
From: Song Chen <chensong_2000@189.cn>
Date: Thu, 28 Jan 2021 09:35:13 +0800
Subject: kernel: trace: preemptirq_delay_test: add cpu affinity

The kernel thread executing test can run on any cpu, which might be
different cpu latency tracer is running on, as a result, the
big latency caused by preemptirq delay test can't be detected.

Therefore, the argument cpu_affinity is added to be passed to test,
ensure it's running on the same cpu with latency tracer.

e.g.
cyclictest -p 90 -m -c 0 -i 1000 -a 3
modprobe preemptirq_delay_test test_mode=preempt delay=500 \
burst_size=3 cpu_affinity=3

Link: https://lkml.kernel.org/r/1611797713-20965-1-git-send-email-chensong_2000@189.cn

Signed-off-by: Song Chen <chensong_2000@189.cn>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/Kconfig                 |  4 ++++
 kernel/trace/preemptirq_delay_test.c | 14 ++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4f976f8d9a38..799dbcfe65ad 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -886,6 +886,10 @@ config PREEMPTIRQ_DELAY_TEST
 	  irq-disabled critical sections for 500us:
 	  modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3
 
+	  What's more, if you want to attach the test on the cpu which the latency
+	  tracer is running on, specify cpu_affinity=cpu_num at the end of the
+	  command.
+
 	  If unsure, say N
 
 config SYNTH_EVENT_GEN_TEST
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 312d1a0ca3b6..8c4ffd076162 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -21,13 +21,16 @@
 static ulong delay = 100;
 static char test_mode[12] = "irq";
 static uint burst_size = 1;
+static int  cpu_affinity = -1;
 
 module_param_named(delay, delay, ulong, 0444);
 module_param_string(test_mode, test_mode, 12, 0444);
 module_param_named(burst_size, burst_size, uint, 0444);
+module_param_named(cpu_affinity, cpu_affinity, int, 0444);
 MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)");
 MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)");
 MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)");
+MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on");
 
 static struct completion done;
 
@@ -36,7 +39,9 @@ static struct completion done;
 static void busy_wait(ulong time)
 {
 	u64 start, end;
+
 	start = trace_clock_local();
+
 	do {
 		end = trace_clock_local();
 		if (kthread_should_stop())
@@ -47,6 +52,7 @@ static void busy_wait(ulong time)
 static __always_inline void irqoff_test(void)
 {
 	unsigned long flags;
+
 	local_irq_save(flags);
 	busy_wait(delay);
 	local_irq_restore(flags);
@@ -113,6 +119,14 @@ static int preemptirq_delay_run(void *data)
 {
 	int i;
 	int s = MIN(burst_size, NR_TEST_FUNCS);
+	struct cpumask cpu_mask;
+
+	if (cpu_affinity > -1) {
+		cpumask_clear(&cpu_mask);
+		cpumask_set_cpu(cpu_affinity, &cpu_mask);
+		if (set_cpus_allowed_ptr(current, &cpu_mask))
+			pr_err("cpu_affinity:%d, failed\n", cpu_affinity);
+	}
 
 	for (i = 0; i < s; i++)
 		(testfuncs[i])(i);
-- 
cgit v1.2.3


From 2d396cb3b12641adcde91104a7c89994f2e876a6 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 5 Feb 2021 18:04:39 -0500
Subject: tracing: Do not create "enable" or "filter" files for ftrace event
 subsystem

The ftrace event subsystem is only created for showing the format files of
events created by the ftrace tracers, and are not trace events. The ftrace
subsystem currently has both the "enable" and "filter" files that in other
subsystems are used to enable/disable all events within the subsystem or set
a filter for all the subsystem events.

As ftrace subsystem events do not use enable or filter operations, these
files are useless in the ftrace subsystem. Remove them.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 20ccce3e4ffb..c1e90611fe22 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2097,16 +2097,20 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 	dir->subsystem = system;
 	file->system = dir;
 
-	entry = tracefs_create_file("filter", 0644, dir->entry, dir,
-				    &ftrace_subsystem_filter_fops);
-	if (!entry) {
-		kfree(system->filter);
-		system->filter = NULL;
-		pr_warn("Could not create tracefs '%s/filter' entry\n", name);
-	}
+	/* the ftrace system is special, do not create enable or filter files */
+	if (strcmp(name, "ftrace") != 0) {
+
+		entry = tracefs_create_file("filter", 0644, dir->entry, dir,
+					    &ftrace_subsystem_filter_fops);
+		if (!entry) {
+			kfree(system->filter);
+			system->filter = NULL;
+			pr_warn("Could not create tracefs '%s/filter' entry\n", name);
+		}
 
-	trace_create_file("enable", 0644, dir->entry, dir,
-			  &ftrace_system_enable_fops);
+		trace_create_file("enable", 0644, dir->entry, dir,
+				  &ftrace_system_enable_fops);
+	}
 
 	list_add(&dir->list, &tr->systems);
 
-- 
cgit v1.2.3


From 1746fd4416ed5510fe9fdd6a93e49a436187b680 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 8 Feb 2021 15:09:23 -0500
Subject: tracepoints: Remove unnecessary "data_args" macro parameter

While working on a clean up that would restructure the difference between
architectures that have static calls vs those that do not, I was stumbling
over the "data_args" parameter that includes "__data" in the arguments. The
issue was that one version didn't even need it, while the other one did.
Instead of injecting a "__data = NULL;" into the macro for the unneeded
version, just remove it completely.

The original idea behind data_args is that there may be a case of a
tracepoint with no arguments. But this is considered bad practice, and all
tracepoints should pass something to that location (that's what tracepoints
were created for).

Link: https://lkml.kernel.org/r/20210208201050.768074128@goodmis.org

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 966ed8980327..42bb5b753b33 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -160,13 +160,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 /*
  * it_func[0] is never NULL because there is at least one element in the array
  * when the array itself is non NULL.
- *
- * Note, the proto and args passed in includes "__data" as the first parameter.
- * The reason for this is to handle the "void" prototype. If a tracepoint
- * has a "void" prototype, then it is invalid to declare a function
- * as "(void *, void)".
  */
-#define __DO_TRACE(name, proto, args, cond, rcuidle)			\
+#define __DO_TRACE(name, args, cond, rcuidle)				\
 	do {								\
 		struct tracepoint_func *it_func_ptr;			\
 		int __maybe_unused __idx = 0;				\
@@ -194,7 +189,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 			rcu_dereference_raw((&__tracepoint_##name)->funcs); \
 		if (it_func_ptr) {					\
 			__data = (it_func_ptr)->data;			\
-			__DO_TRACE_CALL(name)(args);			\
+			__DO_TRACE_CALL(name)(__data, args);		\
 		}							\
 									\
 		if (rcuidle) {						\
@@ -206,17 +201,16 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	} while (0)
 
 #ifndef MODULE
-#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) \
+#define __DECLARE_TRACE_RCU(name, proto, args, cond)			\
 	static inline void trace_##name##_rcuidle(proto)		\
 	{								\
 		if (static_key_false(&__tracepoint_##name.key))		\
 			__DO_TRACE(name,				\
-				TP_PROTO(data_proto),			\
-				TP_ARGS(data_args),			\
+				TP_ARGS(args),				\
 				TP_CONDITION(cond), 1);			\
 	}
 #else
-#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args)
+#define __DECLARE_TRACE_RCU(name, proto, args, cond)
 #endif
 
 /*
@@ -231,7 +225,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  * even when this tracepoint is off. This code has no purpose other than
  * poking RCU a bit.
  */
-#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto)		\
 	extern int __traceiter_##name(data_proto);			\
 	DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name);	\
 	extern struct tracepoint __tracepoint_##name;			\
@@ -239,8 +233,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	{								\
 		if (static_key_false(&__tracepoint_##name.key))		\
 			__DO_TRACE(name,				\
-				TP_PROTO(data_proto),			\
-				TP_ARGS(data_args),			\
+				TP_ARGS(args),				\
 				TP_CONDITION(cond), 0);			\
 		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
 			rcu_read_lock_sched_notrace();			\
@@ -249,7 +242,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		}							\
 	}								\
 	__DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args),		\
-		PARAMS(cond), PARAMS(data_proto), PARAMS(data_args))	\
+			    PARAMS(cond))				\
 	static inline int						\
 	register_trace_##name(void (*probe)(data_proto), void *data)	\
 	{								\
@@ -332,7 +325,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 
 
 #else /* !TRACEPOINTS_ENABLED */
-#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto)		\
 	static inline void trace_##name(proto)				\
 	{ }								\
 	static inline void trace_##name##_rcuidle(proto)		\
@@ -412,14 +405,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 #define DECLARE_TRACE(name, proto, args)				\
 	__DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),		\
 			cpu_online(raw_smp_processor_id()),		\
-			PARAMS(void *__data, proto),			\
-			PARAMS(__data, args))
+			PARAMS(void *__data, proto))
 
 #define DECLARE_TRACE_CONDITION(name, proto, args, cond)		\
 	__DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),		\
 			cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \
-			PARAMS(void *__data, proto),			\
-			PARAMS(__data, args))
+			PARAMS(void *__data, proto))
 
 #define TRACE_EVENT_FLAGS(event, flag)
 
-- 
cgit v1.2.3


From d9a1be1be331fc857d3fe29f86c3a305950b35a9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 8 Feb 2021 15:09:24 -0500
Subject: tracepoints: Do not punish non static call users

With static calls, a tracepoint can call the callback directly if there is
only one callback registered to that tracepoint. When there is more than
one, the static call will call the tracepoint's "iterator" function, which
needs to reload the tracepoint's "funcs" array again, as it could have
changed since the first time it was loaded.

But an arch without static calls is punished by having to load the
tracepoint's "funcs" array twice. Once in the DO_TRACE macro, and once
again in the iterator macro.

For archs without static calls, there's no reason to load the array macro
in the first place, since the iterator function will do it anyway.

Change the __DO_TRACE_CALL() macro to do the load and call of the
tracepoints funcs array only for architectures with static calls, and just
call the iterator function directly for architectures without static calls.

Link: https://lkml.kernel.org/r/20210208201050.909329787@goodmis.org

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 42bb5b753b33..2aad1c10821a 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -152,9 +152,19 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 #ifdef TRACEPOINTS_ENABLED
 
 #ifdef CONFIG_HAVE_STATIC_CALL
-#define __DO_TRACE_CALL(name)	static_call(tp_func_##name)
+#define __DO_TRACE_CALL(name, args)					\
+	do {								\
+		struct tracepoint_func *it_func_ptr;			\
+		void *__data;						\
+		it_func_ptr =						\
+			rcu_dereference_raw((&__tracepoint_##name)->funcs); \
+		if (it_func_ptr) {					\
+			__data = (it_func_ptr)->data;			\
+			static_call(tp_func_##name)(__data, args);	\
+		}							\
+	} while (0)
 #else
-#define __DO_TRACE_CALL(name)	__traceiter_##name
+#define __DO_TRACE_CALL(name, args)	__traceiter_##name(NULL, args)
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
 /*
@@ -163,9 +173,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  */
 #define __DO_TRACE(name, args, cond, rcuidle)				\
 	do {								\
-		struct tracepoint_func *it_func_ptr;			\
 		int __maybe_unused __idx = 0;				\
-		void *__data;						\
 									\
 		if (!(cond))						\
 			return;						\
@@ -185,12 +193,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 			rcu_irq_enter_irqson();				\
 		}							\
 									\
-		it_func_ptr =						\
-			rcu_dereference_raw((&__tracepoint_##name)->funcs); \
-		if (it_func_ptr) {					\
-			__data = (it_func_ptr)->data;			\
-			__DO_TRACE_CALL(name)(__data, args);		\
-		}							\
+		__DO_TRACE_CALL(name, TP_ARGS(args));			\
 									\
 		if (rcuidle) {						\
 			rcu_irq_exit_irqson();				\
-- 
cgit v1.2.3


From 7211f0a25781ace5f79b272318b4c60b5dcfd413 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 4 Feb 2021 14:30:04 -0500
Subject: tracepoints: Code clean up

Restructure the code a bit to make it simpler, fix some formatting problems
and add READ_ONCE/WRITE_ONCE to make sure there's no compiler load/store
tearing to the variables that can be accessed across CPUs.

Started with Mathieu Desnoyers's patch:

  Link: https://lore.kernel.org/lkml/20210203175741.20665-1-mathieu.desnoyers@efficios.com/

And will keep his signature, but I will take the responsibility of this
being correct, and keep the authorship.

Link: https://lkml.kernel.org/r/20210204143004.61126582@gandalf.local.home

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h |  2 +-
 kernel/tracepoint.c        | 91 ++++++++++++++++++----------------------------
 2 files changed, 36 insertions(+), 57 deletions(-)

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 2aad1c10821a..9cfb099da58f 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -305,7 +305,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 			rcu_dereference_raw((&__tracepoint_##_name)->funcs); \
 		if (it_func_ptr) {					\
 			do {						\
-				it_func = (it_func_ptr)->func;		\
+				it_func = READ_ONCE((it_func_ptr)->func); \
 				__data = (it_func_ptr)->data;		\
 				((void(*)(void *, proto))(it_func))(__data, args); \
 			} while ((++it_func_ptr)->func);		\
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index e8f20ae29c18..9f478d29b926 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -136,9 +136,9 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
 	 int prio)
 {
 	struct tracepoint_func *old, *new;
-	int nr_probes = 0;
-	int stub_funcs = 0;
-	int pos = -1;
+	int iter_probes;	/* Iterate over old probe array. */
+	int nr_probes = 0;	/* Counter for probes */
+	int pos = -1;		/* Insertion position into new array */
 
 	if (WARN_ON(!tp_func->func))
 		return ERR_PTR(-EINVAL);
@@ -147,54 +147,38 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
 	old = *funcs;
 	if (old) {
 		/* (N -> N+1), (N != 0, 1) probes */
-		for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
-			/* Insert before probes of lower priority */
-			if (pos < 0 && old[nr_probes].prio < prio)
-				pos = nr_probes;
-			if (old[nr_probes].func == tp_func->func &&
-			    old[nr_probes].data == tp_func->data)
+		for (iter_probes = 0; old[iter_probes].func; iter_probes++) {
+			if (old[iter_probes].func == tp_stub_func)
+				continue;	/* Skip stub functions. */
+			if (old[iter_probes].func == tp_func->func &&
+			    old[iter_probes].data == tp_func->data)
 				return ERR_PTR(-EEXIST);
-			if (old[nr_probes].func == tp_stub_func)
-				stub_funcs++;
+			nr_probes++;
 		}
 	}
-	/* + 2 : one for new probe, one for NULL func - stub functions */
-	new = allocate_probes(nr_probes + 2 - stub_funcs);
+	/* + 2 : one for new probe, one for NULL func */
+	new = allocate_probes(nr_probes + 2);
 	if (new == NULL)
 		return ERR_PTR(-ENOMEM);
 	if (old) {
-		if (stub_funcs) {
-			/* Need to copy one at a time to remove stubs */
-			int probes = 0;
-
-			pos = -1;
-			for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
-				if (old[nr_probes].func == tp_stub_func)
-					continue;
-				if (pos < 0 && old[nr_probes].prio < prio)
-					pos = probes++;
-				new[probes++] = old[nr_probes];
-			}
-			nr_probes = probes;
-			if (pos < 0)
-				pos = probes;
-			else
-				nr_probes--; /* Account for insertion */
-
-		} else if (pos < 0) {
-			pos = nr_probes;
-			memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
-		} else {
-			/* Copy higher priority probes ahead of the new probe */
-			memcpy(new, old, pos * sizeof(struct tracepoint_func));
-			/* Copy the rest after it. */
-			memcpy(new + pos + 1, old + pos,
-			       (nr_probes - pos) * sizeof(struct tracepoint_func));
+		nr_probes = 0;
+		for (iter_probes = 0; old[iter_probes].func; iter_probes++) {
+			if (old[iter_probes].func == tp_stub_func)
+				continue;
+			/* Insert before probes of lower priority */
+			if (pos < 0 && old[iter_probes].prio < prio)
+				pos = nr_probes++;
+			new[nr_probes++] = old[iter_probes];
 		}
-	} else
+		if (pos < 0)
+			pos = nr_probes++;
+		/* nr_probes now points to the end of the new array */
+	} else {
 		pos = 0;
+		nr_probes = 1; /* must point at end of array */
+	}
 	new[pos] = *tp_func;
-	new[nr_probes + 1].func = NULL;
+	new[nr_probes].func = NULL;
 	*funcs = new;
 	debug_print_probes(*funcs);
 	return old;
@@ -237,11 +221,12 @@ static void *func_remove(struct tracepoint_func **funcs,
 		/* + 1 for NULL */
 		new = allocate_probes(nr_probes - nr_del + 1);
 		if (new) {
-			for (i = 0; old[i].func; i++)
-				if ((old[i].func != tp_func->func
-				     || old[i].data != tp_func->data)
-				    && old[i].func != tp_stub_func)
+			for (i = 0; old[i].func; i++) {
+				if ((old[i].func != tp_func->func ||
+				     old[i].data != tp_func->data) &&
+				    old[i].func != tp_stub_func)
 					new[j++] = old[i];
+			}
 			new[nr_probes - nr_del].func = NULL;
 			*funcs = new;
 		} else {
@@ -249,17 +234,11 @@ static void *func_remove(struct tracepoint_func **funcs,
 			 * Failed to allocate, replace the old function
 			 * with calls to tp_stub_func.
 			 */
-			for (i = 0; old[i].func; i++)
+			for (i = 0; old[i].func; i++) {
 				if (old[i].func == tp_func->func &&
-				    old[i].data == tp_func->data) {
-					old[i].func = tp_stub_func;
-					/* Set the prio to the next event. */
-					if (old[i + 1].func)
-						old[i].prio =
-							old[i + 1].prio;
-					else
-						old[i].prio = -1;
-				}
+				    old[i].data == tp_func->data)
+					WRITE_ONCE(old[i].func, tp_stub_func);
+			}
 			*funcs = old;
 		}
 	}
-- 
cgit v1.2.3


From a1320e0c2fc834264fa2125ef5c40670dbb5b736 Mon Sep 17 00:00:00 2001
From: Jinyang He <hejinyang@loongson.cn>
Date: Thu, 4 Feb 2021 11:34:31 +0800
Subject: ftrace: Remove unused ftrace_force_update()

ftrace_force_update() is committed by Commit e1c08bdd9fa7 ("ftrace: force
recording") and removed by Commit cb7be3b2fc2c ("ftrace: remove daemon").
Remove it in header file.

Link: https://lkml.kernel.org/r/1612409671-8249-1-git-send-email-hejinyang@loongson.cn

Signed-off-by: Jinyang He <hejinyang@loongson.cn>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 9a8ce28e4485..86e5028bfa20 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -485,7 +485,6 @@ struct dyn_ftrace {
 	struct dyn_arch_ftrace	arch;
 };
 
-int ftrace_force_update(void);
 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
 			 int remove, int reset);
 int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
@@ -740,7 +739,6 @@ extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
 #else /* CONFIG_DYNAMIC_FTRACE */
 static inline int skip_trace(unsigned long ip) { return 0; }
-static inline int ftrace_force_update(void) { return 0; }
 static inline void ftrace_disable_daemon(void) { }
 static inline void ftrace_enable_daemon(void) { }
 static inline void ftrace_module_init(struct module *mod) { }
-- 
cgit v1.2.3


From 33b1d14668859626bf96958e38042b0ed8a22a68 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 3 Feb 2021 23:59:27 +0900
Subject: kprobes: Warn if the kprobe is reregistered

Warn if the kprobe is reregistered, since there must be
a software bug (actively used resource must not be re-registered)
and caller must be fixed.

Link: https://lkml.kernel.org/r/161236436734.194052.4058506306336814476.stgit@devnote2

Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@linux.ibm.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/kprobes.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d5a3eb74a657..dd1d027455c4 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1520,13 +1520,16 @@ valid:
 	return ap;
 }
 
-/* Return error if the kprobe is being re-registered */
-static inline int check_kprobe_rereg(struct kprobe *p)
+/*
+ * Warn and return error if the kprobe is being re-registered since
+ * there must be a software bug.
+ */
+static inline int warn_kprobe_rereg(struct kprobe *p)
 {
 	int ret = 0;
 
 	mutex_lock(&kprobe_mutex);
-	if (__get_valid_kprobe(p))
+	if (WARN_ON_ONCE(__get_valid_kprobe(p)))
 		ret = -EINVAL;
 	mutex_unlock(&kprobe_mutex);
 
@@ -1614,7 +1617,7 @@ int register_kprobe(struct kprobe *p)
 		return PTR_ERR(addr);
 	p->addr = addr;
 
-	ret = check_kprobe_rereg(p);
+	ret = warn_kprobe_rereg(p);
 	if (ret)
 		return ret;
 
@@ -1995,7 +1998,7 @@ int register_kretprobe(struct kretprobe *rp)
 		return ret;
 
 	/* If only rp->kp.addr is specified, check reregistering kprobes */
-	if (rp->kp.addr && check_kprobe_rereg(&rp->kp))
+	if (rp->kp.addr && warn_kprobe_rereg(&rp->kp))
 		return -EINVAL;
 
 	if (kretprobe_blacklist_size) {
-- 
cgit v1.2.3


From d262271d04830e4b5009f4a5cc64934d86b49832 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 1 Feb 2021 13:48:11 -0600
Subject: tracing/dynevent: Delegate parsing to create function

Delegate command parsing to each create function so that the
command syntax can be customized.

This requires changes to the kprobe/uprobe/synthetic event handling,
which are also included here.

Link: https://lkml.kernel.org/r/e488726f49cbdbc01568618f8680584306c4c79f.1612208610.git.zanussi@kernel.org

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
[ zanussi@kernel.org: added synthetic event modifications ]
Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c              | 23 ++-------------
 kernel/trace/trace.h              |  3 +-
 kernel/trace/trace_dynevent.c     | 35 ++++++++++++++---------
 kernel/trace/trace_dynevent.h     |  4 +--
 kernel/trace/trace_events_synth.c | 60 ++++++++++++++++++++++++++++++---------
 kernel/trace/trace_kprobe.c       | 33 +++++++++++----------
 kernel/trace/trace_probe.c        | 17 +++++++++++
 kernel/trace/trace_probe.h        |  1 +
 kernel/trace/trace_uprobe.c       | 17 +++++++----
 9 files changed, 120 insertions(+), 73 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7fd432334ff5..b79bcacdd6f9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9412,30 +9412,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 }
 EXPORT_SYMBOL_GPL(ftrace_dump);
 
-int trace_run_command(const char *buf, int (*createfn)(int, char **))
-{
-	char **argv;
-	int argc, ret;
-
-	argc = 0;
-	ret = 0;
-	argv = argv_split(GFP_KERNEL, buf, &argc);
-	if (!argv)
-		return -ENOMEM;
-
-	if (argc)
-		ret = createfn(argc, argv);
-
-	argv_free(argv);
-
-	return ret;
-}
-
 #define WRITE_BUFSIZE  4096
 
 ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
 				size_t count, loff_t *ppos,
-				int (*createfn)(int, char **))
+				int (*createfn)(const char *))
 {
 	char *kbuf, *buf, *tmp;
 	int ret = 0;
@@ -9483,7 +9464,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
 			if (tmp)
 				*tmp = '\0';
 
-			ret = trace_run_command(buf, createfn);
+			ret = createfn(buf);
 			if (ret)
 				goto out;
 			buf += size;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 93fb08ab8bb6..a9e13bd5a41b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1807,10 +1807,9 @@ extern int tracing_set_cpumask(struct trace_array *tr,
 
 #define MAX_EVENT_NAME_LEN	64
 
-extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
 extern ssize_t trace_parse_run_command(struct file *file,
 		const char __user *buffer, size_t count, loff_t *ppos,
-		int (*createfn)(int, char**));
+		int (*createfn)(const char *));
 
 extern unsigned int err_pos(char *cmd, const char *str);
 extern void tracing_log_err(struct trace_array *tr,
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 4f967d5cd917..dc971a68dda4 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -31,23 +31,31 @@ int dyn_event_register(struct dyn_event_operations *ops)
 	return 0;
 }
 
-int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
+int dyn_event_release(const char *raw_command, struct dyn_event_operations *type)
 {
 	struct dyn_event *pos, *n;
 	char *system = NULL, *event, *p;
-	int ret = -ENOENT;
+	int argc, ret = -ENOENT;
+	char **argv;
+
+	argv = argv_split(GFP_KERNEL, raw_command, &argc);
+	if (!argv)
+		return -ENOMEM;
 
 	if (argv[0][0] == '-') {
-		if (argv[0][1] != ':')
-			return -EINVAL;
+		if (argv[0][1] != ':') {
+			ret = -EINVAL;
+			goto out;
+		}
 		event = &argv[0][2];
 	} else {
 		event = strchr(argv[0], ':');
-		if (!event)
-			return -EINVAL;
+		if (!event) {
+			ret = -EINVAL;
+			goto out;
+		}
 		event++;
 	}
-	argc--; argv++;
 
 	p = strchr(event, '/');
 	if (p) {
@@ -63,7 +71,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
 		if (type && type != pos->ops)
 			continue;
 		if (!pos->ops->match(system, event,
-				argc, (const char **)argv, pos))
+				argc - 1, (const char **)argv + 1, pos))
 			continue;
 
 		ret = pos->ops->free(pos);
@@ -71,21 +79,22 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
 			break;
 	}
 	mutex_unlock(&event_mutex);
-
+out:
+	argv_free(argv);
 	return ret;
 }
 
-static int create_dyn_event(int argc, char **argv)
+static int create_dyn_event(const char *raw_command)
 {
 	struct dyn_event_operations *ops;
 	int ret = -ENODEV;
 
-	if (argv[0][0] == '-' || argv[0][0] == '!')
-		return dyn_event_release(argc, argv, NULL);
+	if (raw_command[0] == '-' || raw_command[0] == '!')
+		return dyn_event_release(raw_command, NULL);
 
 	mutex_lock(&dyn_event_ops_mutex);
 	list_for_each_entry(ops, &dyn_event_ops_list, list) {
-		ret = ops->create(argc, (const char **)argv);
+		ret = ops->create(raw_command);
 		if (!ret || ret != -ECANCELED)
 			break;
 	}
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index d6f72dcb7269..7754936b57ee 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -39,7 +39,7 @@ struct dyn_event;
  */
 struct dyn_event_operations {
 	struct list_head	list;
-	int (*create)(int argc, const char *argv[]);
+	int (*create)(const char *raw_command);
 	int (*show)(struct seq_file *m, struct dyn_event *ev);
 	bool (*is_busy)(struct dyn_event *ev);
 	int (*free)(struct dyn_event *ev);
@@ -97,7 +97,7 @@ void *dyn_event_seq_start(struct seq_file *m, loff_t *pos);
 void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
 void dyn_event_seq_stop(struct seq_file *m, void *v);
 int dyn_events_release_all(struct dyn_event_operations *type);
-int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type);
+int dyn_event_release(const char *raw_command, struct dyn_event_operations *type);
 
 /*
  * for_each_dyn_event	-	iterate over the dyn_event list
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 5a8bc0b421f1..b2588a5650c9 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -62,7 +62,7 @@ static void synth_err(u8 err_type, u8 err_pos)
 			err_type, err_pos);
 }
 
-static int create_synth_event(int argc, const char **argv);
+static int create_synth_event(const char *raw_command);
 static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
 static int synth_event_release(struct dyn_event *ev);
 static bool synth_event_is_busy(struct dyn_event *ev);
@@ -1383,18 +1383,30 @@ int synth_event_delete(const char *event_name)
 }
 EXPORT_SYMBOL_GPL(synth_event_delete);
 
-static int create_or_delete_synth_event(int argc, char **argv)
+static int create_or_delete_synth_event(const char *raw_command)
 {
-	const char *name = argv[0];
-	int ret;
+	char **argv, *name = NULL;
+	int argc = 0, ret = 0;
+
+	argv = argv_split(GFP_KERNEL, raw_command, &argc);
+	if (!argv)
+		return -ENOMEM;
+
+	if (!argc)
+		goto free;
+
+	name = argv[0];
 
 	/* trace_run_command() ensures argc != 0 */
 	if (name[0] == '!') {
 		ret = synth_event_delete(name + 1);
-		return ret;
+		goto free;
 	}
 
 	ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
+free:
+	argv_free(argv);
+
 	return ret == -ECANCELED ? -EINVAL : ret;
 }
 
@@ -1403,7 +1415,7 @@ static int synth_event_run_command(struct dynevent_cmd *cmd)
 	struct synth_event *se;
 	int ret;
 
-	ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event);
+	ret = create_or_delete_synth_event(cmd->seq.buffer);
 	if (ret)
 		return ret;
 
@@ -1939,23 +1951,43 @@ int synth_event_trace_end(struct synth_event_trace_state *trace_state)
 }
 EXPORT_SYMBOL_GPL(synth_event_trace_end);
 
-static int create_synth_event(int argc, const char **argv)
+static int create_synth_event(const char *raw_command)
 {
-	const char *name = argv[0];
-	int len;
+	char **argv, *name;
+	int len, argc = 0, ret = 0;
+
+	argv = argv_split(GFP_KERNEL, raw_command, &argc);
+	if (!argv) {
+		ret = -ENOMEM;
+		return ret;
+	}
 
-	if (name[0] != 's' || name[1] != ':')
-		return -ECANCELED;
+	if (!argc)
+		goto free;
+
+	name = argv[0];
+
+	if (name[0] != 's' || name[1] != ':') {
+		ret = -ECANCELED;
+		goto free;
+	}
 	name += 2;
 
 	/* This interface accepts group name prefix */
 	if (strchr(name, '/')) {
 		len = str_has_prefix(name, SYNTH_SYSTEM "/");
-		if (len == 0)
-			return -EINVAL;
+		if (len == 0) {
+			ret = -EINVAL;
+			goto free;
+		}
 		name += len;
 	}
-	return __create_synth_event(argc - 1, name, argv + 1);
+
+	ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
+free:
+	argv_free(argv);
+
+	return ret;
 }
 
 static int synth_event_release(struct dyn_event *ev)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f6c459aba8a6..8a1cb0878cbc 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,7 +35,7 @@ static int __init set_kprobe_boot_events(char *str)
 }
 __setup("kprobe_event=", set_kprobe_boot_events);
 
-static int trace_kprobe_create(int argc, const char **argv);
+static int trace_kprobe_create(const char *raw_command);
 static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev);
 static int trace_kprobe_release(struct dyn_event *ev);
 static bool trace_kprobe_is_busy(struct dyn_event *ev);
@@ -711,7 +711,7 @@ static inline void sanitize_event_name(char *name)
 			*name = '_';
 }
 
-static int trace_kprobe_create(int argc, const char *argv[])
+static int __trace_kprobe_create(int argc, const char *argv[])
 {
 	/*
 	 * Argument syntax:
@@ -910,20 +910,25 @@ error:
 	goto out;
 }
 
-static int create_or_delete_trace_kprobe(int argc, char **argv)
+static int trace_kprobe_create(const char *raw_command)
+{
+	return trace_probe_create(raw_command, __trace_kprobe_create);
+}
+
+static int create_or_delete_trace_kprobe(const char *raw_command)
 {
 	int ret;
 
-	if (argv[0][0] == '-')
-		return dyn_event_release(argc, argv, &trace_kprobe_ops);
+	if (raw_command[0] == '-')
+		return dyn_event_release(raw_command, &trace_kprobe_ops);
 
-	ret = trace_kprobe_create(argc, (const char **)argv);
+	ret = trace_kprobe_create(raw_command);
 	return ret == -ECANCELED ? -EINVAL : ret;
 }
 
 static int trace_kprobe_run_command(struct dynevent_cmd *cmd)
 {
-	return trace_run_command(cmd->seq.buffer, create_or_delete_trace_kprobe);
+	return create_or_delete_trace_kprobe(cmd->seq.buffer);
 }
 
 /**
@@ -1084,7 +1089,7 @@ int kprobe_event_delete(const char *name)
 
 	snprintf(buf, MAX_EVENT_NAME_LEN, "-:%s", name);
 
-	return trace_run_command(buf, create_or_delete_trace_kprobe);
+	return create_or_delete_trace_kprobe(buf);
 }
 EXPORT_SYMBOL_GPL(kprobe_event_delete);
 
@@ -1886,7 +1891,7 @@ static __init void setup_boot_kprobe_events(void)
 		if (p)
 			*p++ = '\0';
 
-		ret = trace_run_command(cmd, create_or_delete_trace_kprobe);
+		ret = create_or_delete_trace_kprobe(cmd);
 		if (ret)
 			pr_warn("Failed to add event(%d): %s\n", ret, cmd);
 
@@ -1980,8 +1985,7 @@ static __init int kprobe_trace_self_tests_init(void)
 
 	pr_info("Testing kprobe tracing: ");
 
-	ret = trace_run_command("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)",
-				create_or_delete_trace_kprobe);
+	ret = create_or_delete_trace_kprobe("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)");
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on probing function entry.\n");
 		warn++;
@@ -2002,8 +2006,7 @@ static __init int kprobe_trace_self_tests_init(void)
 		}
 	}
 
-	ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target $retval",
-				create_or_delete_trace_kprobe);
+	ret = create_or_delete_trace_kprobe("r:testprobe2 kprobe_trace_selftest_target $retval");
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on probing function return.\n");
 		warn++;
@@ -2076,13 +2079,13 @@ static __init int kprobe_trace_self_tests_init(void)
 				trace_probe_event_call(&tk->tp), file);
 	}
 
-	ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe);
+	ret = create_or_delete_trace_kprobe("-:testprobe");
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on deleting a probe.\n");
 		warn++;
 	}
 
-	ret = trace_run_command("-:testprobe2", create_or_delete_trace_kprobe);
+	ret = create_or_delete_trace_kprobe("-:testprobe2");
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on deleting a probe.\n");
 		warn++;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d2867ccc6aca..ec589a4612df 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1134,3 +1134,20 @@ bool trace_probe_match_command_args(struct trace_probe *tp,
 	}
 	return true;
 }
+
+int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **))
+{
+	int argc = 0, ret = 0;
+	char **argv;
+
+	argv = argv_split(GFP_KERNEL, raw_command, &argc);
+	if (!argv)
+		return -ENOMEM;
+
+	if (argc)
+		ret = createfn(argc, (const char **)argv);
+
+	argv_free(argv);
+
+	return ret;
+}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 2f703a20c724..7ce4027089ee 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -341,6 +341,7 @@ struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp,
 int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b);
 bool trace_probe_match_command_args(struct trace_probe *tp,
 				    int argc, const char **argv);
+int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **));
 
 #define trace_probe_for_each_link(pos, tp)	\
 	list_for_each_entry(pos, &(tp)->event->files, list)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9d9440303075..9b50869a5ddb 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -34,7 +34,7 @@ struct uprobe_trace_entry_head {
 #define DATAOF_TRACE_ENTRY(entry, is_return)		\
 	((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
 
-static int trace_uprobe_create(int argc, const char **argv);
+static int trace_uprobe_create(const char *raw_command);
 static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev);
 static int trace_uprobe_release(struct dyn_event *ev);
 static bool trace_uprobe_is_busy(struct dyn_event *ev);
@@ -530,7 +530,7 @@ end:
  * Argument syntax:
  *  - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS]
  */
-static int trace_uprobe_create(int argc, const char **argv)
+static int __trace_uprobe_create(int argc, const char **argv)
 {
 	struct trace_uprobe *tu;
 	const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
@@ -716,14 +716,19 @@ fail_address_parse:
 	return ret;
 }
 
-static int create_or_delete_trace_uprobe(int argc, char **argv)
+int trace_uprobe_create(const char *raw_command)
+{
+	return trace_probe_create(raw_command, __trace_uprobe_create);
+}
+
+static int create_or_delete_trace_uprobe(const char *raw_command)
 {
 	int ret;
 
-	if (argv[0][0] == '-')
-		return dyn_event_release(argc, argv, &trace_uprobe_ops);
+	if (raw_command[0] == '-')
+		return dyn_event_release(raw_command, &trace_uprobe_ops);
 
-	ret = trace_uprobe_create(argc, (const char **)argv);
+	ret = trace_uprobe_create(raw_command);
 	return ret == -ECANCELED ? -EINVAL : ret;
 }
 
-- 
cgit v1.2.3


From c9e759b1e8456a460f258fcfe9682003fcf03938 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Mon, 1 Feb 2021 13:48:12 -0600
Subject: tracing: Rework synthetic event command parsing

Now that command parsing has been delegated to the create functions
and we're no longer constrained by argv_split(), we can modify the
synthetic event command parser to better match the higher-level
structure of the synthetic event commands, which is basically an event
name followed by a set of semicolon-separated fields.

Since we're also now passed the raw command, we can also save it
directly and can get rid of save_cmdstr().

Link: https://lkml.kernel.org/r/cb9e2be92d992ce59f2b4f132264a5d467f3933f.1612208610.git.zanussi@kernel.org

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_synth.c | 245 ++++++++++++++++++++++----------------
 1 file changed, 143 insertions(+), 102 deletions(-)

diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index b2588a5650c9..4f6c5a104ee2 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -48,7 +48,7 @@ static int errpos(const char *str)
 	return err_pos(last_cmd, str);
 }
 
-static void last_cmd_set(char *str)
+static void last_cmd_set(const char *str)
 {
 	if (!str)
 		return;
@@ -579,18 +579,14 @@ static void free_synth_field(struct synth_field *field)
 	kfree(field);
 }
 
-static struct synth_field *parse_synth_field(int argc, const char **argv,
-					     int *consumed)
+static struct synth_field *parse_synth_field(int argc, char **argv)
 {
-	struct synth_field *field;
 	const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
-	int len, ret = -ENOMEM;
+	int len, consumed, ret = -ENOMEM;
+	struct synth_field *field;
 	struct seq_buf s;
 	ssize_t size;
 
-	if (field_type[0] == ';')
-		field_type++;
-
 	if (!strcmp(field_type, "unsigned")) {
 		if (argc < 3) {
 			synth_err(SYNTH_ERR_INCOMPLETE_TYPE, errpos(field_type));
@@ -599,10 +595,20 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
 		prefix = "unsigned ";
 		field_type = argv[1];
 		field_name = argv[2];
-		*consumed = 3;
+		consumed = 3;
 	} else {
 		field_name = argv[1];
-		*consumed = 2;
+		consumed = 2;
+	}
+
+	if (consumed < argc) {
+		synth_err(SYNTH_ERR_INVALID_FIELD, errpos(field_type));
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!field_name) {
+		synth_err(SYNTH_ERR_INVALID_FIELD, errpos(field_type));
+		return ERR_PTR(-EINVAL);
 	}
 
 	field = kzalloc(sizeof(*field), GFP_KERNEL);
@@ -613,8 +619,6 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
 	array = strchr(field_name, '[');
 	if (array)
 		len -= strlen(array);
-	else if (field_name[len - 1] == ';')
-		len--;
 
 	field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
 	if (!field->name)
@@ -626,8 +630,6 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
 		goto free;
 	}
 
-	if (field_type[0] == ';')
-		field_type++;
 	len = strlen(field_type) + 1;
 
 	if (array)
@@ -644,11 +646,8 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
 	if (prefix)
 		seq_buf_puts(&s, prefix);
 	seq_buf_puts(&s, field_type);
-	if (array) {
+	if (array)
 		seq_buf_puts(&s, array);
-		if (s.buffer[s.len - 1] == ';')
-			s.len--;
-	}
 	if (WARN_ON_ONCE(!seq_buf_buffer_left(&s)))
 		goto free;
 
@@ -1160,46 +1159,12 @@ int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name,
 }
 EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start);
 
-static int save_cmdstr(int argc, const char *name, const char **argv)
-{
-	struct seq_buf s;
-	char *buf;
-	int i;
-
-	buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	seq_buf_init(&s, buf, MAX_DYNEVENT_CMD_LEN);
-
-	seq_buf_puts(&s, name);
-
-	for (i = 0; i < argc; i++) {
-		seq_buf_putc(&s, ' ');
-		seq_buf_puts(&s, argv[i]);
-	}
-
-	if (!seq_buf_buffer_left(&s)) {
-		synth_err(SYNTH_ERR_CMD_TOO_LONG, 0);
-		kfree(buf);
-		return -EINVAL;
-	}
-	buf[s.len] = 0;
-	last_cmd_set(buf);
-
-	kfree(buf);
-	return 0;
-}
-
-static int __create_synth_event(int argc, const char *name, const char **argv)
+static int __create_synth_event(const char *name, const char *raw_fields)
 {
+	char **argv, *field_str, *tmp_fields, *saved_fields = NULL;
 	struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
+	int i, argc, n_fields = 0, ret = 0;
 	struct synth_event *event = NULL;
-	int i, consumed = 0, n_fields = 0, ret = 0;
-
-	ret = save_cmdstr(argc, name, argv);
-	if (ret)
-		return ret;
 
 	/*
 	 * Argument syntax:
@@ -1208,46 +1173,60 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
 	 *      where 'field' = type field_name
 	 */
 
-	if (name[0] == '\0' || argc < 1) {
+	if (name[0] == '\0') {
 		synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0);
 		return -EINVAL;
 	}
 
-	mutex_lock(&event_mutex);
-
 	if (!is_good_name(name)) {
 		synth_err(SYNTH_ERR_BAD_NAME, errpos(name));
-		ret = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
+	mutex_lock(&event_mutex);
+
 	event = find_synth_event(name);
 	if (event) {
 		synth_err(SYNTH_ERR_EVENT_EXISTS, errpos(name));
 		ret = -EEXIST;
-		goto out;
+		goto err;
 	}
 
-	for (i = 0; i < argc - 1; i++) {
-		if (strcmp(argv[i], ";") == 0)
-			continue;
-		if (n_fields == SYNTH_FIELDS_MAX) {
-			synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
-			ret = -EINVAL;
+	tmp_fields = saved_fields = kstrdup(raw_fields, GFP_KERNEL);
+	if (!tmp_fields) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	while ((field_str = strsep(&tmp_fields, ";")) != NULL) {
+		argv = argv_split(GFP_KERNEL, field_str, &argc);
+		if (!argv) {
+			ret = -ENOMEM;
 			goto err;
 		}
 
-		field = parse_synth_field(argc - i, &argv[i], &consumed);
+		if (!argc)
+			continue;
+
+		field = parse_synth_field(argc, argv);
 		if (IS_ERR(field)) {
+			argv_free(argv);
 			ret = PTR_ERR(field);
 			goto err;
 		}
+
+		argv_free(argv);
+
 		fields[n_fields++] = field;
-		i += consumed - 1;
+		if (n_fields == SYNTH_FIELDS_MAX) {
+			synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
+			ret = -EINVAL;
+			goto err;
+		}
 	}
 
-	if (i < argc && strcmp(argv[i], ";") != 0) {
-		synth_err(SYNTH_ERR_INVALID_FIELD, errpos(argv[i]));
+	if (n_fields == 0) {
+		synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0);
 		ret = -EINVAL;
 		goto err;
 	}
@@ -1266,6 +1245,8 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
  out:
 	mutex_unlock(&event_mutex);
 
+	kfree(saved_fields);
+
 	return ret;
  err:
 	for (i = 0; i < n_fields; i++)
@@ -1383,31 +1364,79 @@ int synth_event_delete(const char *event_name)
 }
 EXPORT_SYMBOL_GPL(synth_event_delete);
 
-static int create_or_delete_synth_event(const char *raw_command)
+static int check_command(const char *raw_command)
 {
-	char **argv, *name = NULL;
-	int argc = 0, ret = 0;
+	char **argv = NULL, *cmd, *saved_cmd, *name_and_field;
+	int argc, ret = 0;
 
-	argv = argv_split(GFP_KERNEL, raw_command, &argc);
-	if (!argv)
+	cmd = saved_cmd = kstrdup(raw_command, GFP_KERNEL);
+	if (!cmd)
 		return -ENOMEM;
 
-	if (!argc)
+	name_and_field = strsep(&cmd, ";");
+	if (!name_and_field) {
+		ret = -EINVAL;
+		goto free;
+	}
+
+	if (name_and_field[0] == '!')
+		goto free;
+
+	argv = argv_split(GFP_KERNEL, name_and_field, &argc);
+	if (!argv) {
+		ret = -ENOMEM;
 		goto free;
+	}
+	argv_free(argv);
+
+	if (argc < 3)
+		ret = -EINVAL;
+free:
+	kfree(saved_cmd);
 
-	name = argv[0];
+	return ret;
+}
+
+static int create_or_delete_synth_event(const char *raw_command)
+{
+	char *name = NULL, *fields, *p;
+	int ret = 0;
+
+	raw_command = skip_spaces(raw_command);
+	if (raw_command[0] == '\0')
+		return ret;
+
+	last_cmd_set(raw_command);
+
+	ret = check_command(raw_command);
+	if (ret) {
+		synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0);
+		return ret;
+	}
+
+	p = strpbrk(raw_command, " \t");
+	if (!p && raw_command[0] != '!') {
+		synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0);
+		ret = -EINVAL;
+		goto free;
+	}
+
+	name = kmemdup_nul(raw_command, p ? p - raw_command : strlen(raw_command), GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
 
-	/* trace_run_command() ensures argc != 0 */
 	if (name[0] == '!') {
 		ret = synth_event_delete(name + 1);
 		goto free;
 	}
 
-	ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
+	fields = skip_spaces(p);
+
+	ret = __create_synth_event(name, fields);
 free:
-	argv_free(argv);
+	kfree(name);
 
-	return ret == -ECANCELED ? -EINVAL : ret;
+	return ret;
 }
 
 static int synth_event_run_command(struct dynevent_cmd *cmd)
@@ -1953,39 +1982,51 @@ EXPORT_SYMBOL_GPL(synth_event_trace_end);
 
 static int create_synth_event(const char *raw_command)
 {
-	char **argv, *name;
-	int len, argc = 0, ret = 0;
+	char *fields, *p;
+	const char *name;
+	int len, ret = 0;
 
-	argv = argv_split(GFP_KERNEL, raw_command, &argc);
-	if (!argv) {
-		ret = -ENOMEM;
+	raw_command = skip_spaces(raw_command);
+	if (raw_command[0] == '\0')
 		return ret;
-	}
 
-	if (!argc)
-		goto free;
+	last_cmd_set(raw_command);
 
-	name = argv[0];
+	p = strpbrk(raw_command, " \t");
+	if (!p)
+		return -EINVAL;
 
-	if (name[0] != 's' || name[1] != ':') {
-		ret = -ECANCELED;
-		goto free;
-	}
+	fields = skip_spaces(p);
+
+	name = raw_command;
+
+	if (name[0] != 's' || name[1] != ':')
+		return -ECANCELED;
 	name += 2;
 
 	/* This interface accepts group name prefix */
 	if (strchr(name, '/')) {
 		len = str_has_prefix(name, SYNTH_SYSTEM "/");
-		if (len == 0) {
-			ret = -EINVAL;
-			goto free;
-		}
+		if (len == 0)
+			return -EINVAL;
 		name += len;
 	}
 
-	ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
-free:
-	argv_free(argv);
+	len = name - raw_command;
+
+	ret = check_command(raw_command + len);
+	if (ret) {
+		synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0);
+		return ret;
+	}
+
+	name = kmemdup_nul(raw_command + len, p - raw_command - len, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	ret = __create_synth_event(name, fields);
+
+	kfree(name);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 8d3e8165232322eb32b1404f97690d05fdfd94ef Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Mon, 1 Feb 2021 13:48:13 -0600
Subject: tracing: Update synth command errors

Since array types are handled differently, errors referencing them
also need to be handled differently.  Add and use a new
INVALID_ARRAY_SPEC error.  Also add INVALID_CMD and INVALID_DYN_CMD to
catch and display the correct form for badly-formed commands, which
can also be used in place of CMD_INCOMPLETE, which is removed, and
remove CMD_TOO_LONG, since it's no longer used.

Link: https://lkml.kernel.org/r/b9dd434dc6458dcff11adc6ed616fe93a8794770.1612208610.git.zanussi@kernel.org

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_synth.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 4f6c5a104ee2..aace72426e99 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -23,13 +23,14 @@
 #undef ERRORS
 #define ERRORS	\
 	C(BAD_NAME,		"Illegal name"),		\
-	C(CMD_INCOMPLETE,	"Incomplete command"),		\
+	C(INVALID_CMD,		"Command must be of the form: <name> field[;field] ..."),\
+	C(INVALID_DYN_CMD,	"Command must be of the form: s or -:[synthetic/]<name> field[;field] ..."),\
 	C(EVENT_EXISTS,		"Event already exists"),	\
 	C(TOO_MANY_FIELDS,	"Too many fields"),		\
 	C(INCOMPLETE_TYPE,	"Incomplete type"),		\
 	C(INVALID_TYPE,		"Invalid type"),		\
-	C(INVALID_FIELD,	"Invalid field"),		\
-	C(CMD_TOO_LONG,		"Command too long"),
+	C(INVALID_FIELD,        "Invalid field"),		\
+	C(INVALID_ARRAY_SPEC,	"Invalid array specification"),
 
 #undef C
 #define C(a, b)		SYNTH_ERR_##a
@@ -655,7 +656,10 @@ static struct synth_field *parse_synth_field(int argc, char **argv)
 
 	size = synth_field_size(field->type);
 	if (size < 0) {
-		synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type));
+		if (array)
+			synth_err(SYNTH_ERR_INVALID_ARRAY_SPEC, errpos(field_name));
+		else
+			synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type));
 		ret = -EINVAL;
 		goto free;
 	} else if (size == 0) {
@@ -1174,7 +1178,7 @@ static int __create_synth_event(const char *name, const char *raw_fields)
 	 */
 
 	if (name[0] == '\0') {
-		synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0);
+		synth_err(SYNTH_ERR_INVALID_CMD, 0);
 		return -EINVAL;
 	}
 
@@ -1226,7 +1230,7 @@ static int __create_synth_event(const char *name, const char *raw_fields)
 	}
 
 	if (n_fields == 0) {
-		synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0);
+		synth_err(SYNTH_ERR_INVALID_CMD, 0);
 		ret = -EINVAL;
 		goto err;
 	}
@@ -1410,13 +1414,13 @@ static int create_or_delete_synth_event(const char *raw_command)
 
 	ret = check_command(raw_command);
 	if (ret) {
-		synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0);
+		synth_err(SYNTH_ERR_INVALID_CMD, 0);
 		return ret;
 	}
 
 	p = strpbrk(raw_command, " \t");
 	if (!p && raw_command[0] != '!') {
-		synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0);
+		synth_err(SYNTH_ERR_INVALID_CMD, 0);
 		ret = -EINVAL;
 		goto free;
 	}
@@ -1993,8 +1997,10 @@ static int create_synth_event(const char *raw_command)
 	last_cmd_set(raw_command);
 
 	p = strpbrk(raw_command, " \t");
-	if (!p)
+	if (!p) {
+		synth_err(SYNTH_ERR_INVALID_CMD, 0);
 		return -EINVAL;
+	}
 
 	fields = skip_spaces(p);
 
@@ -2007,8 +2013,10 @@ static int create_synth_event(const char *raw_command)
 	/* This interface accepts group name prefix */
 	if (strchr(name, '/')) {
 		len = str_has_prefix(name, SYNTH_SYSTEM "/");
-		if (len == 0)
+		if (len == 0) {
+			synth_err(SYNTH_ERR_INVALID_DYN_CMD, 0);
 			return -EINVAL;
+		}
 		name += len;
 	}
 
@@ -2016,7 +2024,7 @@ static int create_synth_event(const char *raw_command)
 
 	ret = check_command(raw_command + len);
 	if (ret) {
-		synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0);
+		synth_err(SYNTH_ERR_INVALID_CMD, 0);
 		return ret;
 	}
 
-- 
cgit v1.2.3


From 8b5ab6bd0b293408ed8c9450831f879ce9903ea2 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Mon, 1 Feb 2021 13:48:14 -0600
Subject: tracing: Add a backward-compatibility check for synthetic event
 creation

The synthetic event parsing rework now requires semicolons between
synthetic event fields.  That requirement breaks existing users who
might already have used the old synthetic event command format, so
this adds an inner loop that can parse more than one field, if
present, between semicolons.  For each field, parse_synth_field()
checks in which version that field was introduced, using
check_field_version().  The caller, __create_synth_event() can then use
that version information to determine whether or not to enforce the
requirement on the command as a whole.

In the future, if/when new features are added, the requirement will be
that any field/string containing the new feature must use semicolons,
and the check_field_version() check can then check for those and
enforce it.  Using a version number allows this scheme to be extended
if necessary.

Link: https://lkml.kernel.org/r/74fcc500d561b40ce91c5ee94818c70c6b0c9330.1612208610.git.zanussi@kernel.org

[ zanussi: added check_field_version() comment from rostedt@goodmis.org ]
Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_synth.c | 93 +++++++++++++++++++++++++++++++--------
 1 file changed, 74 insertions(+), 19 deletions(-)

diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index aace72426e99..2979a96595b4 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -580,11 +580,29 @@ static void free_synth_field(struct synth_field *field)
 	kfree(field);
 }
 
-static struct synth_field *parse_synth_field(int argc, char **argv)
+static int check_field_version(const char *prefix, const char *field_type,
+			       const char *field_name)
+{
+	/*
+	 * For backward compatibility, the old synthetic event command
+	 * format did not require semicolons, and in order to not
+	 * break user space, that old format must still work. If a new
+	 * feature is added, then the format that uses the new feature
+	 * will be required to have semicolons, as nothing that uses
+	 * the old format would be using the new, yet to be created,
+	 * feature. When a new feature is added, this will detect it,
+	 * and return a number greater than 1, and require the format
+	 * to use semicolons.
+	 */
+	return 1;
+}
+
+static struct synth_field *parse_synth_field(int argc, char **argv,
+					     int *consumed, int *field_version)
 {
 	const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
-	int len, consumed, ret = -ENOMEM;
 	struct synth_field *field;
+	int len, ret = -ENOMEM;
 	struct seq_buf s;
 	ssize_t size;
 
@@ -596,15 +614,10 @@ static struct synth_field *parse_synth_field(int argc, char **argv)
 		prefix = "unsigned ";
 		field_type = argv[1];
 		field_name = argv[2];
-		consumed = 3;
+		*consumed += 3;
 	} else {
 		field_name = argv[1];
-		consumed = 2;
-	}
-
-	if (consumed < argc) {
-		synth_err(SYNTH_ERR_INVALID_FIELD, errpos(field_type));
-		return ERR_PTR(-EINVAL);
+		*consumed += 2;
 	}
 
 	if (!field_name) {
@@ -612,6 +625,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv)
 		return ERR_PTR(-EINVAL);
 	}
 
+	*field_version = check_field_version(prefix, field_type, field_name);
+
 	field = kzalloc(sizeof(*field), GFP_KERNEL);
 	if (!field)
 		return ERR_PTR(-ENOMEM);
@@ -1167,6 +1182,7 @@ static int __create_synth_event(const char *name, const char *raw_fields)
 {
 	char **argv, *field_str, *tmp_fields, *saved_fields = NULL;
 	struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
+	int consumed, cmd_version = 1, n_fields_this_loop;
 	int i, argc, n_fields = 0, ret = 0;
 	struct synth_event *event = NULL;
 
@@ -1212,21 +1228,60 @@ static int __create_synth_event(const char *name, const char *raw_fields)
 		if (!argc)
 			continue;
 
-		field = parse_synth_field(argc, argv);
-		if (IS_ERR(field)) {
-			argv_free(argv);
-			ret = PTR_ERR(field);
-			goto err;
-		}
+		n_fields_this_loop = 0;
+		consumed = 0;
+		while (argc > consumed) {
+			int field_version;
+
+			field = parse_synth_field(argc - consumed,
+						  argv + consumed, &consumed,
+						  &field_version);
+			if (IS_ERR(field)) {
+				argv_free(argv);
+				ret = PTR_ERR(field);
+				goto err;
+			}
 
-		argv_free(argv);
+			/*
+			 * Track the highest version of any field we
+			 * found in the command.
+			 */
+			if (field_version > cmd_version)
+				cmd_version = field_version;
+
+			/*
+			 * Now sort out what is and isn't valid for
+			 * each supported version.
+			 *
+			 * If we see more than 1 field per loop, it
+			 * means we have multiple fields between
+			 * semicolons, and that's something we no
+			 * longer support in a version 2 or greater
+			 * command.
+			 */
+			if (cmd_version > 1 && n_fields_this_loop >= 1) {
+				synth_err(SYNTH_ERR_INVALID_CMD, errpos(field_str));
+				ret = -EINVAL;
+				goto err;
+			}
+
+			fields[n_fields++] = field;
+			if (n_fields == SYNTH_FIELDS_MAX) {
+				synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
+				ret = -EINVAL;
+				goto err;
+			}
+
+			n_fields_this_loop++;
+		}
 
-		fields[n_fields++] = field;
-		if (n_fields == SYNTH_FIELDS_MAX) {
-			synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
+		if (consumed < argc) {
+			synth_err(SYNTH_ERR_INVALID_CMD, 0);
 			ret = -EINVAL;
 			goto err;
 		}
+
+		argv_free(argv);
 	}
 
 	if (n_fields == 0) {
-- 
cgit v1.2.3


From b5734e997e1117afb479ffda500e36fa91aea3e8 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Mon, 1 Feb 2021 13:48:15 -0600
Subject: selftests/ftrace: Update synthetic event syntax errors

Some of the synthetic event errors and positions have changed in the
code - update those and add several more tests.

Also add a runtime check to ensure that the kernel supports dynamic
strings in synthetic events, which these tests require.

Link: https://lkml.kernel.org/r/51402656433455baead34f068c6e9466b64df9c0.1612208610.git.zanussi@kernel.org

Fixes: 81ff92a93d95 (selftests/ftrace: Add test case for synthetic event syntax errors)
Reported-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 .../trigger-synthetic_event_syntax_errors.tc       | 35 +++++++++++++++++-----
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic_event_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic_event_syntax_errors.tc
index ada594fe16cb..955e3ceea44b 100644
--- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic_event_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic_event_syntax_errors.tc
@@ -1,19 +1,38 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
 # description: event trigger - test synthetic_events syntax parser errors
-# requires: synthetic_events error_log
+# requires: synthetic_events error_log "char name[]' >> synthetic_events":README
 
 check_error() { # command-with-error-pos-by-^
     ftrace_errlog_check 'synthetic_events' "$1" 'synthetic_events'
 }
 
+check_dyn_error() { # command-with-error-pos-by-^
+    ftrace_errlog_check 'synthetic_events' "$1" 'dynamic_events'
+}
+
 check_error 'myevent ^chr arg'			# INVALID_TYPE
-check_error 'myevent ^char str[];; int v'	# INVALID_TYPE
-check_error 'myevent char ^str]; int v'		# INVALID_NAME
-check_error 'myevent char ^str;[]'		# INVALID_NAME
-check_error 'myevent ^char str[; int v'		# INVALID_TYPE
-check_error '^mye;vent char str[]'		# BAD_NAME
-check_error 'myevent char str[]; ^int'		# INVALID_FIELD
-check_error '^myevent'				# INCOMPLETE_CMD
+check_error 'myevent ^unsigned arg'		# INCOMPLETE_TYPE
+
+check_error 'myevent char ^str]; int v'		# BAD_NAME
+check_error '^mye-vent char str[]'		# BAD_NAME
+check_error 'myevent char ^st-r[]'		# BAD_NAME
+
+check_error 'myevent char str;^[]'		# INVALID_FIELD
+check_error 'myevent char str; ^int'		# INVALID_FIELD
+
+check_error 'myevent char ^str[; int v'		# INVALID_ARRAY_SPEC
+check_error 'myevent char ^str[kdjdk]'		# INVALID_ARRAY_SPEC
+check_error 'myevent char ^str[257]'		# INVALID_ARRAY_SPEC
+
+check_error '^mye;vent char str[]'		# INVALID_CMD
+check_error '^myevent ; char str[]'		# INVALID_CMD
+check_error '^myevent; char str[]'		# INVALID_CMD
+check_error '^myevent ;char str[]'		# INVALID_CMD
+check_error '^; char str[]'			# INVALID_CMD
+check_error '^;myevent char str[]'		# INVALID_CMD
+check_error '^myevent'				# INVALID_CMD
+
+check_dyn_error '^s:junk/myevent char str['	# INVALID_DYN_CMD
 
 exit 0
-- 
cgit v1.2.3


From 7d5367539ae902a9208c5918e21b0f89e5feb29c Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Mon, 1 Feb 2021 13:48:16 -0600
Subject: selftests/ftrace: Add '!event' synthetic event syntax check

Add a check confirming that '!event' alone will remove a synthetic
event.

Link: https://lkml.kernel.org/r/1dff3f03d18542cece08c10d6323d8a8dba11e42.1612208610.git.zanussi@kernel.org

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 .../test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc      | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc
index 59216f3cfb12..2968cdc7df30 100644
--- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc
@@ -32,6 +32,10 @@ grep "myevent[[:space:]]u64 var1" synthetic_events
 # it is not possible to add same name event
 ! echo "myevent u64 var2" >> synthetic_events
 
+# make sure !synthetic event doesn't require a field
+echo "!myevent" >> synthetic_events
+echo "myevent u64 var1" >> synthetic_events
+
 # Non-append open will cleanup all events and add new one
 echo "myevent u64 var2" > synthetic_events
 
-- 
cgit v1.2.3


From efbbdaa22bb78761bff8dfdde027ad04bedd47ce Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 15 Oct 2020 23:55:07 +0900
Subject: tracing: Show real address for trace event arguments

To help debugging kernel, show real address for trace event arguments
in tracefs/trace{,pipe} instead of hashed pointer value.

Since ftrace human-readable format uses vsprintf(), all %p are
translated to hash values instead of pointer address.

However, when debugging the kernel, raw address value gives a
hint when comparing with the memory mapping in the kernel.
(Those are sometimes used with crash log, which is not hashed too)
So converting %p with %px when calling trace_seq_printf().

Moreover, this is not improving the security because the tracefs
can be used only by root user and the raw address values are readable
from tracefs/percpu/cpu*/trace_pipe_raw file.

Link: https://lkml.kernel.org/r/160277370703.29307.5134475491761971203.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  4 +++
 include/trace/trace_events.h |  2 +-
 kernel/trace/trace.c         | 71 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.h         |  2 ++
 kernel/trace/trace_output.c  | 12 +++++++-
 5 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 5d1eeac4bfbe..7077fec653bb 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -55,6 +55,8 @@ struct trace_event;
 
 int trace_raw_output_prep(struct trace_iterator *iter,
 			  struct trace_event *event);
+extern __printf(2, 3)
+void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...);
 
 /*
  * The trace entry - the most basic unit of tracing. This is what
@@ -87,6 +89,8 @@ struct trace_iterator {
 	unsigned long		iter_flags;
 	void			*temp;	/* temp holder */
 	unsigned int		temp_size;
+	char			*fmt;	/* modified format holder */
+	unsigned int		fmt_size;
 
 	/* trace_seq for __print_flags() and __print_symbolic() etc. */
 	struct trace_seq	tmp_seq;
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 7785961d82ba..e6a8b3febc49 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -364,7 +364,7 @@ trace_raw_output_##call(struct trace_iterator *iter, int flags,		\
 	if (ret != TRACE_TYPE_HANDLED)					\
 		return ret;						\
 									\
-	trace_seq_printf(s, print);					\
+	trace_event_printf(iter, print);				\
 									\
 	return trace_handle_return(s);					\
 }									\
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b79bcacdd6f9..39f8a537196e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3530,6 +3530,62 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
 	return next;
 }
 
+#define STATIC_FMT_BUF_SIZE	128
+static char static_fmt_buf[STATIC_FMT_BUF_SIZE];
+
+static char *trace_iter_expand_format(struct trace_iterator *iter)
+{
+	char *tmp;
+
+	if (iter->fmt == static_fmt_buf)
+		return NULL;
+
+	tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE,
+		       GFP_KERNEL);
+	if (tmp) {
+		iter->fmt_size += STATIC_FMT_BUF_SIZE;
+		iter->fmt = tmp;
+	}
+
+	return tmp;
+}
+
+const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
+{
+	const char *p, *new_fmt;
+	char *q;
+
+	if (WARN_ON_ONCE(!fmt))
+		return fmt;
+
+	p = fmt;
+	new_fmt = q = iter->fmt;
+	while (*p) {
+		if (unlikely(q - new_fmt + 3 > iter->fmt_size)) {
+			if (!trace_iter_expand_format(iter))
+				return fmt;
+
+			q += iter->fmt - new_fmt;
+			new_fmt = iter->fmt;
+		}
+
+		*q++ = *p++;
+
+		/* Replace %p with %px */
+		if (p[-1] == '%') {
+			if (p[0] == '%') {
+				*q++ = *p++;
+			} else if (p[0] == 'p' && !isalnum(p[1])) {
+				*q++ = *p++;
+				*q++ = 'x';
+			}
+		}
+	}
+	*q = '\0';
+
+	return new_fmt;
+}
+
 #define STATIC_TEMP_BUF_SIZE	128
 static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4);
 
@@ -4322,6 +4378,16 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->temp)
 		iter->temp_size = 128;
 
+	/*
+	 * trace_event_printf() may need to modify given format
+	 * string to replace %p with %px so that it shows real address
+	 * instead of hash value. However, that is only for the event
+	 * tracing, other tracer may not need. Defer the allocation
+	 * until it is needed.
+	 */
+	iter->fmt = NULL;
+	iter->fmt_size = 0;
+
 	/*
 	 * We make a copy of the current tracer to avoid concurrent
 	 * changes on it while we are reading.
@@ -4473,6 +4539,7 @@ static int tracing_release(struct inode *inode, struct file *file)
 
 	mutex_destroy(&iter->mutex);
 	free_cpumask_var(iter->started);
+	kfree(iter->fmt);
 	kfree(iter->temp);
 	kfree(iter->trace);
 	kfree(iter->buffer_iter);
@@ -9331,9 +9398,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 
 	/* Simulate the iterator */
 	trace_init_global_iter(&iter);
-	/* Can not use kmalloc for iter.temp */
+	/* Can not use kmalloc for iter.temp and iter.fmt */
 	iter.temp = static_temp_buf;
 	iter.temp_size = STATIC_TEMP_BUF_SIZE;
+	iter.fmt = static_fmt_buf;
+	iter.fmt_size = STATIC_FMT_BUF_SIZE;
 
 	for_each_tracing_cpu(cpu) {
 		atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a9e13bd5a41b..6c3ea6f95e68 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -581,6 +581,8 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
 					struct ring_buffer_event *event);
 
+const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
+
 int trace_empty(struct trace_iterator *iter);
 
 void *trace_find_next_entry_inc(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 92b1575ae0ca..61255bad7e01 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -312,13 +312,23 @@ int trace_raw_output_prep(struct trace_iterator *iter,
 }
 EXPORT_SYMBOL(trace_raw_output_prep);
 
+void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	trace_seq_vprintf(&iter->seq, trace_event_format(iter, fmt), ap);
+	va_end(ap);
+}
+EXPORT_SYMBOL(trace_event_printf);
+
 static int trace_output_raw(struct trace_iterator *iter, char *name,
 			    char *fmt, va_list ap)
 {
 	struct trace_seq *s = &iter->seq;
 
 	trace_seq_printf(s, "%s: ", name);
-	trace_seq_vprintf(s, fmt, ap);
+	trace_seq_vprintf(s, trace_event_format(iter, fmt), ap);
 
 	return trace_handle_return(s);
 }
-- 
cgit v1.2.3


From 1600cbcf6944dee9f87f52cbceb0b6895c29efd0 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 15 Oct 2020 23:55:16 +0900
Subject: tracing: Update the stage 3 of trace event macro comment

Update the comment of the 3rd stage of trace event macro
expansion code. Now there are 2 macros makes different
trace_raw_output_<call>() functions.

Link: https://lkml.kernel.org/r/160277371605.29307.8586817119278606720.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/trace/trace_events.h | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index e6a8b3febc49..8268bf747d6f 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -231,9 +231,11 @@ TRACE_MAKE_SYSTEM_STR();
  * {
  *	struct trace_seq *s = &iter->seq;
  *	struct trace_event_raw_<call> *field; <-- defined in stage 1
- *	struct trace_entry *entry;
  *	struct trace_seq *p = &iter->tmp_seq;
- *	int ret;
+ *
+ * -------(for event)-------
+ *
+ *	struct trace_entry *entry;
  *
  *	entry = iter->ent;
  *
@@ -245,14 +247,23 @@ TRACE_MAKE_SYSTEM_STR();
  *	field = (typeof(field))entry;
  *
  *	trace_seq_init(p);
- *	ret = trace_seq_printf(s, "%s: ", <call>);
- *	if (ret)
- *		ret = trace_seq_printf(s, <TP_printk> "\n");
- *	if (!ret)
- *		return TRACE_TYPE_PARTIAL_LINE;
+ *	return trace_output_call(iter, <call>, <TP_printk> "\n");
  *
- *	return TRACE_TYPE_HANDLED;
- * }
+ * ------(or, for event class)------
+ *
+ *	int ret;
+ *
+ *	field = (typeof(field))iter->ent;
+ *
+ *	ret = trace_raw_output_prep(iter, trace_event);
+ *	if (ret != TRACE_TYPE_HANDLED)
+ *		return ret;
+ *
+ *	trace_event_printf(iter, <TP_printk> "\n");
+ *
+ *	return trace_handle_return(s);
+ * -------
+ *  }
  *
  * This is the method used to print the raw event to the trace
  * output format. Note, this is not needed if the data is read
-- 
cgit v1.2.3


From a345a6718bd5689f7cff7715e902d8739e0d5beb Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 15 Oct 2020 23:55:25 +0900
Subject: tracing: Add ptr-hash option to show the hashed pointer value

Add tracefs/options/hash-ptr option to show hashed pointer
value by %p in event printk format string.

For the security reason, normal printk will show the hashed
pointer value (encrypted by random number) with %p to printk
buffer to hide the real address. But the tracefs/trace always
shows real address for debug. To bridge those outputs, add an
option to switch the output format. Ftrace users can use it
to find the hashed value corresponding to the real address
in trace log.

Link: https://lkml.kernel.org/r/160277372504.29307.14909828808982012211.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/trace/ftrace.rst | 6 ++++++
 kernel/trace/trace.c           | 3 +++
 kernel/trace/trace.h           | 1 +
 3 files changed, 10 insertions(+)

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 87cf5c010d5d..62c98e9bbdd9 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -1159,6 +1159,12 @@ Here are the available options:
 	This simulates the original behavior of the trace file.
 	When the file is closed, tracing will be enabled again.
 
+  hash-ptr
+        When set, "%p" in the event printk format displays the
+        hashed pointer value instead of real address.
+        This will be useful if you want to find out which hashed
+        value is corresponding to the real value in trace log.
+
   record-cmd
 	When any event or tracer is enabled, a hook is enabled
 	in the sched_switch trace point to fill comm cache
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 39f8a537196e..16e252d39016 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3558,6 +3558,9 @@ const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
 	if (WARN_ON_ONCE(!fmt))
 		return fmt;
 
+	if (iter->tr->trace_flags & TRACE_ITER_HASH_PTR)
+		return fmt;
+
 	p = fmt;
 	new_fmt = q = iter->fmt;
 	while (*p) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6c3ea6f95e68..dec13ff66077 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1175,6 +1175,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 		C(MARKERS,		"markers"),		\
 		C(EVENT_FORK,		"event-fork"),		\
 		C(PAUSE_ON_TRACE,	"pause-on-trace"),	\
+		C(HASH_PTR,		"hash-ptr"),	/* Print hashed pointer */ \
 		FUNCTION_FLAGS					\
 		FGRAPH_FLAGS					\
 		STACK_FLAGS					\
-- 
cgit v1.2.3


From 99e22ce73c59ac2d6d08893af376483ca7d62850 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 12 Feb 2021 11:51:06 -0500
Subject: tracing: Make hash-ptr option default

Since the original behavior of the trace events is to hash the %p pointers,
make that the default, and have developers have to enable the option in
order to have them unhashed.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 16e252d39016..f5e8e39d6f57 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -408,7 +408,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
 	 TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |			\
 	 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO |		\
 	 TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |			\
-	 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS)
+	 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS |			\
+	 TRACE_ITER_HASH_PTR)
 
 /* trace_options that are only supported by global_trace */
 #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK |			\
-- 
cgit v1.2.3


From e23db805da2dfc39e5281b5efd3e36d132aa83af Mon Sep 17 00:00:00 2001
From: Viktor Rosendahl <Viktor.Rosendahl@bmw.de>
Date: Fri, 12 Feb 2021 14:44:21 +0100
Subject: tracing/tools: Add the latency-collector to tools directory

This is a tool that is intended to work around the fact that the
preemptoff, irqsoff, and preemptirqsoff tracers only work in
overwrite mode. The idea is to act randomly in such a way that we
do not systematically lose any latencies, so that if enough testing
is done, all latencies will be captured. If the same burst of
latencies is repeated, then sooner or later we will have captured all
the latencies.

It also works with the wakeup_dl, wakeup_rt, and wakeup tracers.
However, in that case it is probably not useful to use the random
sleep functionality.

The reason why it may be desirable to catch all latencies with a long
test campaign is that for some organizations, it's necessary to test
the kernel in the field and not practical for developers to work
iteratively with field testers. Because of cost and project schedules
it is not possible to start a new test campaign every time a latency
problem has been fixed.

It uses inotify to detect changes to /sys/kernel/tracing/trace.
When a latency is detected, it will either sleep or print
immediately, depending on a function that act as an unfair coin
toss.

If immediate print is chosen, it means that we open
/sys/kernel/tracing/trace and thereby cause a blackout period
that will hide any subsequent latencies.

If sleep is chosen, it means that we wait before opening
/sys/kernel/tracing/trace, by default for 1000 ms, to see if
there is another latency during this period. If there is, then we will
lose the previous latency. The coin will be tossed again with a
different probability, and we will either print the new latency, or
possibly a subsequent one.

The probability for the unfair coin toss is chosen so that there
is equal probability to obtain any of the latencies in a burst.
However, this assumes that we make an assumption of how many
latencies there can be. By default  the program assumes that there
are no more than 2 latencies in a burst, the probability of immediate
printout will be:

1/2 and 1

Thus, the probability of getting each of the two latencies will be 1/2.

If we ever find that there is more than one latency in a series,
meaning that we reach the probability of 1, then the table will be
expanded to:

1/3, 1/2, and 1

Thus, we assume that there are no more than three latencies and each
with a probability of 1/3 of being captured. If the probability of 1
is reached in the new table, that is we see more than two closely
occurring latencies, then the table will again be extended, and so
on.

On my systems, it seems like this scheme works fairly well, as
long as the latencies we trace are long enough, 300 us seems to be
enough. This userspace program receive the inotify event at the end
of a latency, and it has time until the end of the next latency
to react, that is to open /sys/kernel/tracing/trace. Thus,
if we trace latencies that are >300 us, then we have at least 300 us
to react.

The minimum latency will of course not be 300 us on all systems, it
will depend on the hardware, kernel version, workload and
configuration.

Example usage:

In one shell, give the following command:
sudo latency-collector -rvv -t preemptirqsoff -s 2000 -a 3

This will trace latencies > 2000us with the preemptirqsoff tracer,
using random sleep with maximum verbosity, with a probability
table initialized to a size of 3.

In another shell, generate a few bursts of latencies:

root@host:~# modprobe preemptirq_delay_test delay=3000 test_mode=alternate
burst_size=3
root@host:~# echo 1 > /sys/kernel/preemptirq_delay_test/trigger
root@host:~# echo 1 > /sys/kernel/preemptirq_delay_test/trigger
root@host:~# echo 1 > /sys/kernel/preemptirq_delay_test/trigger
root@host:~# echo 1 > /sys/kernel/preemptirq_delay_test/trigger

If all goes well, you should be getting stack traces that shows
all the different latencies, i.e. you should see all the three
functions preemptirqtest_0, preemptirqtest_1, preemptirqtest_2 in the
stack traces.

Link: https://lkml.kernel.org/r/20210212134421.172750-2-Viktor.Rosendahl@bmw.de

Signed-off-by: Viktor Rosendahl <Viktor.Rosendahl@bmw.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 tools/Makefile                            |   14 +-
 tools/tracing/Makefile                    |   19 +
 tools/tracing/latency/.gitignore          |    2 +
 tools/tracing/latency/Makefile            |   24 +
 tools/tracing/latency/latency-collector.c | 2108 +++++++++++++++++++++++++++++
 5 files changed, 2161 insertions(+), 6 deletions(-)
 create mode 100644 tools/tracing/Makefile
 create mode 100644 tools/tracing/latency/.gitignore
 create mode 100644 tools/tracing/latency/Makefile
 create mode 100644 tools/tracing/latency/latency-collector.c

diff --git a/tools/Makefile b/tools/Makefile
index 85af6ebbce91..7e9d34ddd74c 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -31,6 +31,7 @@ help:
 	@echo '  bootconfig             - boot config tool'
 	@echo '  spi                    - spi tools'
 	@echo '  tmon                   - thermal monitoring and tuning tool'
+	@echo '  tracing                - misc tracing tools'
 	@echo '  turbostat              - Intel CPU idle stats and freq reporting tool'
 	@echo '  usb                    - USB testing tools'
 	@echo '  virtio                 - vhost test module'
@@ -64,7 +65,7 @@ acpi: FORCE
 cpupower: FORCE
 	$(call descend,power/$@)
 
-cgroup firewire hv guest bootconfig spi usb virtio vm bpf iio gpio objtool leds wmi pci firmware debugging: FORCE
+cgroup firewire hv guest bootconfig spi usb virtio vm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE
 	$(call descend,$@)
 
 bpf/%: FORCE
@@ -103,7 +104,7 @@ all: acpi cgroup cpupower gpio hv firewire liblockdep \
 		perf selftests bootconfig spi turbostat usb \
 		virtio vm bpf x86_energy_perf_policy \
 		tmon freefall iio objtool kvm_stat wmi \
-		pci debugging
+		pci debugging tracing
 
 acpi_install:
 	$(call descend,power/$(@:_install=),install)
@@ -111,7 +112,7 @@ acpi_install:
 cpupower_install:
 	$(call descend,power/$(@:_install=),install)
 
-cgroup_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install vm_install bpf_install objtool_install wmi_install pci_install debugging_install:
+cgroup_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install vm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install:
 	$(call descend,$(@:_install=),install)
 
 liblockdep_install:
@@ -137,7 +138,8 @@ install: acpi_install cgroup_install cpupower_install gpio_install \
 		perf_install selftests_install turbostat_install usb_install \
 		virtio_install vm_install bpf_install x86_energy_perf_policy_install \
 		tmon_install freefall_install objtool_install kvm_stat_install \
-		wmi_install pci_install debugging_install intel-speed-select_install
+		wmi_install pci_install debugging_install intel-speed-select_install \
+		tracing_install
 
 acpi_clean:
 	$(call descend,power/acpi,clean)
@@ -145,7 +147,7 @@ acpi_clean:
 cpupower_clean:
 	$(call descend,power/cpupower,clean)
 
-cgroup_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean vm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean:
+cgroup_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean vm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean:
 	$(call descend,$(@:_clean=),clean)
 
 liblockdep_clean:
@@ -184,6 +186,6 @@ clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \
 		vm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
 		freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \
 		gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \
-		intel-speed-select_clean
+		intel-speed-select_clean tracing_clean
 
 .PHONY: FORCE
diff --git a/tools/tracing/Makefile b/tools/tracing/Makefile
new file mode 100644
index 000000000000..87e0ec48e2e7
--- /dev/null
+++ b/tools/tracing/Makefile
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+include ../scripts/Makefile.include
+
+all: latency
+
+clean: latency_clean
+
+install: latency_install
+
+latency:
+	$(call descend,latency)
+
+latency_install:
+	$(call descend,latency,install)
+
+latency_clean:
+	$(call descend,latency,clean)
+
+.PHONY: all install clean latency latency_install latency_clean
diff --git a/tools/tracing/latency/.gitignore b/tools/tracing/latency/.gitignore
new file mode 100644
index 000000000000..0863960761e7
--- /dev/null
+++ b/tools/tracing/latency/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+latency-collector
diff --git a/tools/tracing/latency/Makefile b/tools/tracing/latency/Makefile
new file mode 100644
index 000000000000..40c4ddaf8be1
--- /dev/null
+++ b/tools/tracing/latency/Makefile
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for vm tools
+#
+VAR_CFLAGS := $(shell pkg-config --cflags libtracefs 2>/dev/null)
+VAR_LDLIBS := $(shell pkg-config --libs libtracefs 2>/dev/null)
+
+TARGETS = latency-collector
+CFLAGS = -Wall -Wextra -g -O2 $(VAR_CFLAGS)
+LDFLAGS = -lpthread $(VAR_LDLIBS)
+
+all: $(TARGETS)
+
+%: %.c
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+clean:
+	$(RM) latency-collector
+
+prefix ?= /usr/local
+sbindir ?= ${prefix}/sbin
+
+install: all
+	install -d $(DESTDIR)$(sbindir)
+	install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir)
diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c
new file mode 100644
index 000000000000..57b20802e71b
--- /dev/null
+++ b/tools/tracing/latency/latency-collector.c
@@ -0,0 +1,2108 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017, 2018, 2019, 2021 BMW Car IT GmbH
+ * Author: Viktor Rosendahl (viktor.rosendahl@bmw.de)
+ */
+
+#define _GNU_SOURCE
+#define _POSIX_C_SOURCE 200809L
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <sched.h>
+#include <linux/unistd.h>
+#include <signal.h>
+#include <sys/inotify.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <tracefs.h>
+
+static const char *prg_name;
+static const char *prg_unknown = "unknown program name";
+
+static int fd_stdout;
+
+static int sched_policy;
+static bool sched_policy_set;
+
+static int sched_pri;
+static bool sched_pri_set;
+
+static bool trace_enable = true;
+static bool setup_ftrace = true;
+static bool use_random_sleep;
+
+#define TRACE_OPTS				\
+	C(FUNC_TR, "function-trace"),		\
+	C(DISP_GR, "display-graph"),		\
+	C(NR,       NULL)
+
+#undef C
+#define C(a, b) OPTIDX_##a
+
+enum traceopt {
+	TRACE_OPTS
+};
+
+#undef C
+#define C(a, b)  b
+
+static const char *const optstr[] = {
+	TRACE_OPTS
+};
+
+enum errhandling {
+	ERR_EXIT = 0,
+	ERR_WARN,
+	ERR_CLEANUP,
+};
+
+static bool use_options[OPTIDX_NR];
+
+static char inotify_buffer[655360];
+
+#define likely(x)      __builtin_expect(!!(x), 1)
+#define unlikely(x)    __builtin_expect(!!(x), 0)
+#define bool2str(x)    (x ? "true":"false")
+
+#define DEFAULT_NR_PRINTER_THREADS (3)
+static unsigned int nr_threads = DEFAULT_NR_PRINTER_THREADS;
+
+#define DEFAULT_TABLE_SIZE (2)
+static unsigned int table_startsize = DEFAULT_TABLE_SIZE;
+
+static int verbosity;
+
+#define verbose_sizechange() (verbosity >= 1)
+#define verbose_lostevent()  (verbosity >= 2)
+#define verbose_ftrace()     (verbosity >= 1)
+
+#define was_changed(ORIG, CUR) (strcmp(ORIG, CUR) != 0)
+#define needs_change(CUR, WANTED) (strcmp(CUR, WANTED) != 0)
+
+static const char *debug_tracefile;
+static const char *debug_tracefile_dflt;
+static const char *debug_maxlat;
+static const char *debug_maxlat_dflt;
+static const char * const DEBUG_NOFILE = "[file not found]";
+
+static const char * const TR_MAXLAT  = "tracing_max_latency";
+static const char * const TR_THRESH  = "tracing_thresh";
+static const char * const TR_CURRENT = "current_tracer";
+static const char * const TR_OPTIONS = "trace_options";
+
+static const char * const NOP_TRACER = "nop";
+
+static const char * const OPT_NO_PREFIX = "no";
+
+#define DFLT_THRESHOLD_US "0"
+static const char *threshold = DFLT_THRESHOLD_US;
+
+#define DEV_URANDOM     "/dev/urandom"
+#define RT_DEFAULT_PRI (99)
+#define DEFAULT_PRI    (0)
+
+#define USEC_PER_MSEC (1000L)
+#define NSEC_PER_USEC (1000L)
+#define NSEC_PER_MSEC (USEC_PER_MSEC * NSEC_PER_USEC)
+
+#define MSEC_PER_SEC (1000L)
+#define USEC_PER_SEC (USEC_PER_MSEC * MSEC_PER_SEC)
+#define NSEC_PER_SEC (NSEC_PER_MSEC * MSEC_PER_SEC)
+
+#define SLEEP_TIME_MS_DEFAULT (1000L)
+#define TRY_PRINTMUTEX_MS (1000)
+
+static long sleep_time = (USEC_PER_MSEC * SLEEP_TIME_MS_DEFAULT);
+
+static const char * const queue_full_warning =
+"Could not queue trace for printing. It is likely that events happen faster\n"
+"than what they can be printed. Probably partly because of random sleeping\n";
+
+static const char * const no_tracer_msg =
+"Could not find any tracers! Running this program as root may help!\n";
+
+static const char * const no_latency_tr_msg =
+"No latency tracers are supported by your kernel!\n";
+
+struct policy {
+	const char *name;
+	int policy;
+	int default_pri;
+};
+
+static const struct policy policies[] = {
+	{ "other", SCHED_OTHER, DEFAULT_PRI    },
+	{ "batch", SCHED_BATCH, DEFAULT_PRI    },
+	{ "idle",  SCHED_IDLE,  DEFAULT_PRI    },
+	{ "rr",    SCHED_RR,    RT_DEFAULT_PRI },
+	{ "fifo",  SCHED_FIFO,  RT_DEFAULT_PRI },
+	{ NULL,    0,           DEFAULT_PRI    }
+};
+
+/*
+ * The default tracer will be the first on this list that is supported by the
+ * currently running Linux kernel.
+ */
+static const char * const relevant_tracers[] = {
+	"preemptirqsoff",
+	"preemptoff",
+	"irqsoff",
+	"wakeup",
+	"wakeup_rt",
+	"wakeup_dl",
+	NULL
+};
+
+/* This is the list of tracers for which random sleep makes sense */
+static const char * const random_tracers[] = {
+	"preemptirqsoff",
+	"preemptoff",
+	"irqsoff",
+	NULL
+};
+
+static const char *current_tracer;
+static bool force_tracer;
+
+struct ftrace_state {
+	char *tracer;
+	char *thresh;
+	bool opt[OPTIDX_NR];
+	bool opt_valid[OPTIDX_NR];
+	pthread_mutex_t mutex;
+};
+
+struct entry {
+	int ticket;
+	int ticket_completed_ref;
+};
+
+struct print_state {
+	int ticket_counter;
+	int ticket_completed;
+	pthread_mutex_t mutex;
+	pthread_cond_t cond;
+	int cnt;
+	pthread_mutex_t cnt_mutex;
+};
+
+struct short_msg {
+	char buf[160];
+	int len;
+};
+
+static struct print_state printstate;
+static struct ftrace_state save_state;
+volatile sig_atomic_t signal_flag;
+
+#define PROB_TABLE_MAX_SIZE (1000)
+
+int probabilities[PROB_TABLE_MAX_SIZE];
+
+struct sleep_table {
+	int *table;
+	int size;
+	pthread_mutex_t mutex;
+};
+
+static struct sleep_table sleeptable;
+
+#define QUEUE_SIZE (10)
+
+struct queue {
+	struct entry entries[QUEUE_SIZE];
+	int next_prod_idx;
+	int next_cons_idx;
+	pthread_mutex_t mutex;
+	pthread_cond_t cond;
+};
+
+#define MAX_THREADS (40)
+
+struct queue printqueue;
+pthread_t printthread[MAX_THREADS];
+pthread_mutex_t print_mtx;
+#define PRINT_BUFFER_SIZE (16 * 1024 * 1024)
+
+static void cleanup_exit(int status);
+static int set_trace_opt(const char *opt, bool value);
+
+static __always_inline void *malloc_or_die(size_t size)
+{
+	void *ptr = malloc(size);
+
+	if (unlikely(ptr == NULL)) {
+		warn("malloc() failed");
+		cleanup_exit(EXIT_FAILURE);
+	}
+	return ptr;
+}
+
+static __always_inline void *malloc_or_die_nocleanup(size_t size)
+{
+	void *ptr = malloc(size);
+
+	if (unlikely(ptr == NULL))
+		err(0, "malloc() failed");
+	return ptr;
+}
+
+static __always_inline void write_or_die(int fd, const char *buf, size_t count)
+{
+	ssize_t r;
+
+	do {
+		r = write(fd, buf, count);
+		if (unlikely(r < 0)) {
+			if (errno == EINTR)
+				continue;
+			warn("write() failed");
+			cleanup_exit(EXIT_FAILURE);
+		}
+		count -= r;
+		buf += r;
+	} while (count > 0);
+}
+
+static __always_inline void clock_gettime_or_die(clockid_t clk_id,
+						 struct timespec *tp)
+{
+	int r = clock_gettime(clk_id, tp);
+
+	if (unlikely(r != 0))
+		err(EXIT_FAILURE, "clock_gettime() failed");
+}
+
+static __always_inline void sigemptyset_or_die(sigset_t *s)
+{
+	if (unlikely(sigemptyset(s) != 0)) {
+		warn("sigemptyset() failed");
+		cleanup_exit(EXIT_FAILURE);
+	}
+}
+
+static __always_inline void sigaddset_or_die(sigset_t *s, int signum)
+{
+	if (unlikely(sigaddset(s, signum) != 0)) {
+		warn("sigemptyset() failed");
+		cleanup_exit(EXIT_FAILURE);
+	}
+}
+
+static __always_inline void sigaction_or_die(int signum,
+					     const struct sigaction *act,
+					     struct sigaction *oldact)
+{
+	if (unlikely(sigaction(signum, act, oldact) != 0)) {
+		warn("sigaction() failed");
+		cleanup_exit(EXIT_FAILURE);
+	}
+}
+
+static void open_stdout(void)
+{
+	if (setvbuf(stdout, NULL, _IONBF, 0) != 0)
+		err(EXIT_FAILURE, "setvbuf() failed");
+	fd_stdout = fileno(stdout);
+	if (fd_stdout < 0)
+		err(EXIT_FAILURE, "fileno() failed");
+}
+
+/*
+ * It's not worth it to call cleanup_exit() from mutex functions because
+ * cleanup_exit() uses mutexes.
+ */
+static __always_inline void mutex_lock(pthread_mutex_t *mtx)
+{
+	errno = pthread_mutex_lock(mtx);
+	if (unlikely(errno))
+		err(EXIT_FAILURE, "pthread_mutex_lock() failed");
+}
+
+
+static __always_inline void mutex_unlock(pthread_mutex_t *mtx)
+{
+	errno = pthread_mutex_unlock(mtx);
+	if (unlikely(errno))
+		err(EXIT_FAILURE, "pthread_mutex_unlock() failed");
+}
+
+static __always_inline void cond_signal(pthread_cond_t *cond)
+{
+	errno = pthread_cond_signal(cond);
+	if (unlikely(errno))
+		err(EXIT_FAILURE, "pthread_cond_signal() failed");
+}
+
+static __always_inline void cond_wait(pthread_cond_t *restrict cond,
+				      pthread_mutex_t *restrict mutex)
+{
+	errno = pthread_cond_wait(cond, mutex);
+	if (unlikely(errno))
+		err(EXIT_FAILURE, "pthread_cond_wait() failed");
+}
+
+static __always_inline void cond_broadcast(pthread_cond_t *cond)
+{
+	errno = pthread_cond_broadcast(cond);
+	if (unlikely(errno))
+		err(EXIT_FAILURE, "pthread_cond_broadcast() failed");
+}
+
+static __always_inline void
+mutex_init(pthread_mutex_t *mutex,
+	   const pthread_mutexattr_t *attr)
+{
+	errno = pthread_mutex_init(mutex, attr);
+	if (errno)
+		err(EXIT_FAILURE, "pthread_mutex_init() failed");
+}
+
+static __always_inline void mutexattr_init(pthread_mutexattr_t *attr)
+{
+	errno = pthread_mutexattr_init(attr);
+	if (errno)
+		err(EXIT_FAILURE, "pthread_mutexattr_init() failed");
+}
+
+static __always_inline void mutexattr_destroy(pthread_mutexattr_t *attr)
+{
+	errno = pthread_mutexattr_destroy(attr);
+	if (errno)
+		err(EXIT_FAILURE, "pthread_mutexattr_destroy() failed");
+}
+
+static __always_inline void mutexattr_settype(pthread_mutexattr_t *attr,
+					      int type)
+{
+	errno = pthread_mutexattr_settype(attr, type);
+	if (errno)
+		err(EXIT_FAILURE, "pthread_mutexattr_settype() failed");
+}
+
+static __always_inline void condattr_init(pthread_condattr_t *attr)
+{
+	errno = pthread_condattr_init(attr);
+	if (errno)
+		err(EXIT_FAILURE, "pthread_condattr_init() failed");
+}
+
+static __always_inline void condattr_destroy(pthread_condattr_t *attr)
+{
+	errno = pthread_condattr_destroy(attr);
+	if (errno)
+		err(EXIT_FAILURE, "pthread_condattr_destroy() failed");
+}
+
+static __always_inline void condattr_setclock(pthread_condattr_t *attr,
+					      clockid_t clock_id)
+{
+	errno = pthread_condattr_setclock(attr, clock_id);
+	if (unlikely(errno))
+		err(EXIT_FAILURE, "pthread_condattr_setclock() failed");
+}
+
+static __always_inline void cond_init(pthread_cond_t *cond,
+				      const pthread_condattr_t *attr)
+{
+	errno = pthread_cond_init(cond, attr);
+	if (errno)
+		err(EXIT_FAILURE, "pthread_cond_init() failed");
+}
+
+static __always_inline int
+cond_timedwait(pthread_cond_t *restrict cond,
+	       pthread_mutex_t *restrict mutex,
+	       const struct timespec *restrict abstime)
+{
+	errno = pthread_cond_timedwait(cond, mutex, abstime);
+	if (errno && errno != ETIMEDOUT)
+		err(EXIT_FAILURE, "pthread_cond_timedwait() failed");
+	return errno;
+}
+
+static void init_printstate(void)
+{
+	pthread_condattr_t cattr;
+
+	printstate.ticket_counter = 0;
+	printstate.ticket_completed = 0;
+	printstate.cnt = 0;
+
+	mutex_init(&printstate.mutex, NULL);
+
+	condattr_init(&cattr);
+	condattr_setclock(&cattr, CLOCK_MONOTONIC);
+	cond_init(&printstate.cond, &cattr);
+	condattr_destroy(&cattr);
+}
+
+static void init_print_mtx(void)
+{
+	pthread_mutexattr_t mattr;
+
+	mutexattr_init(&mattr);
+	mutexattr_settype(&mattr, PTHREAD_MUTEX_RECURSIVE);
+	mutex_init(&print_mtx, &mattr);
+	mutexattr_destroy(&mattr);
+
+}
+
+static void signal_blocking(int how)
+{
+	sigset_t s;
+
+	sigemptyset_or_die(&s);
+	sigaddset_or_die(&s, SIGHUP);
+	sigaddset_or_die(&s, SIGTERM);
+	sigaddset_or_die(&s, SIGINT);
+
+	errno = pthread_sigmask(how, &s, NULL);
+	if (unlikely(errno)) {
+		warn("pthread_sigmask() failed");
+		cleanup_exit(EXIT_FAILURE);
+	}
+}
+
+static void signal_handler(int num)
+{
+	signal_flag = num;
+}
+
+static void setup_sig_handler(void)
+{
+	struct sigaction sa;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_handler = signal_handler;
+
+	sigaction_or_die(SIGHUP, &sa, NULL);
+	sigaction_or_die(SIGTERM, &sa, NULL);
+	sigaction_or_die(SIGINT, &sa, NULL);
+}
+
+static void process_signal(int signal)
+{
+	char *name;
+
+	name = strsignal(signal);
+	if (name == NULL)
+		printf("Received signal %d\n", signal);
+	else
+		printf("Received signal %d (%s)\n", signal, name);
+	cleanup_exit(EXIT_SUCCESS);
+}
+
+static __always_inline void check_signals(void)
+{
+	int signal = signal_flag;
+
+	if (unlikely(signal))
+		process_signal(signal);
+}
+
+static __always_inline void get_time_in_future(struct timespec *future,
+					       long time_us)
+{
+	long nsec;
+
+	clock_gettime_or_die(CLOCK_MONOTONIC, future);
+	future->tv_sec += time_us / USEC_PER_SEC;
+	nsec = future->tv_nsec + (time_us * NSEC_PER_USEC) % NSEC_PER_SEC;
+	if (nsec >= NSEC_PER_SEC) {
+		future->tv_nsec = nsec % NSEC_PER_SEC;
+		future->tv_sec += 1;
+	}
+}
+
+static __always_inline bool time_has_passed(const struct timespec *time)
+{
+	struct timespec now;
+
+	clock_gettime_or_die(CLOCK_MONOTONIC, &now);
+	if (now.tv_sec > time->tv_sec)
+		return true;
+	if (now.tv_sec < time->tv_sec)
+		return false;
+	return (now.tv_nsec >= time->tv_nsec);
+}
+
+static bool mutex_trylock_limit(pthread_mutex_t *mutex, int time_ms)
+{
+	long time_us = time_ms * USEC_PER_MSEC;
+	struct timespec limit;
+
+	get_time_in_future(&limit, time_us);
+	do {
+		errno =  pthread_mutex_trylock(mutex);
+		if (errno && errno != EBUSY)
+			err(EXIT_FAILURE, "pthread_mutex_trylock() failed");
+	} while (errno && !time_has_passed(&limit));
+	return errno == 0;
+}
+
+static void restore_trace_opts(const struct ftrace_state *state,
+				const bool *cur)
+{
+	int i;
+	int r;
+
+	for (i = 0; i < OPTIDX_NR; i++)
+		if (state->opt_valid[i] && state->opt[i] != cur[i]) {
+			r = set_trace_opt(optstr[i], state->opt[i]);
+			if (r < 0)
+				warnx("Failed to restore the %s option to %s",
+				      optstr[i], bool2str(state->opt[i]));
+			else if (verbose_ftrace())
+				printf("Restored the %s option in %s to %s\n",
+				       optstr[i], TR_OPTIONS,
+				       bool2str(state->opt[i]));
+		}
+}
+
+static char *read_file(const char *file, enum errhandling h)
+{
+	int psize;
+	char *r;
+	static const char *emsg = "Failed to read the %s file";
+
+	r = tracefs_instance_file_read(NULL, file, &psize);
+	if (!r) {
+		if (h) {
+			warn(emsg, file);
+			if (h == ERR_CLEANUP)
+				cleanup_exit(EXIT_FAILURE);
+		} else
+			errx(EXIT_FAILURE, emsg, file);
+	}
+
+	if (r && r[psize - 1] == '\n')
+		r[psize - 1] = '\0';
+	return r;
+}
+
+static void restore_file(const char *file, char **saved, const char *cur)
+{
+	if (*saved && was_changed(*saved, cur)) {
+		if (tracefs_instance_file_write(NULL, file, *saved) < 0)
+			warnx("Failed to restore %s to %s!", file, *saved);
+		else if (verbose_ftrace())
+			printf("Restored %s to %s\n", file, *saved);
+		free(*saved);
+		*saved = NULL;
+	}
+}
+
+static void restore_ftrace(void)
+{
+	mutex_lock(&save_state.mutex);
+
+	restore_file(TR_CURRENT, &save_state.tracer, current_tracer);
+	restore_file(TR_THRESH, &save_state.thresh, threshold);
+	restore_trace_opts(&save_state, use_options);
+
+	mutex_unlock(&save_state.mutex);
+}
+
+static void cleanup_exit(int status)
+{
+	char *maxlat;
+
+	if (!setup_ftrace)
+		exit(status);
+
+	/*
+	 * We try the print_mtx for 1 sec in order to avoid garbled
+	 * output if possible, but if it cannot be obtained we proceed anyway.
+	 */
+	mutex_trylock_limit(&print_mtx, TRY_PRINTMUTEX_MS);
+
+	maxlat = read_file(TR_MAXLAT, ERR_WARN);
+	if (maxlat) {
+		printf("The maximum detected latency was: %sus\n", maxlat);
+		free(maxlat);
+	}
+
+	restore_ftrace();
+	/*
+	 * We do not need to unlock the print_mtx here because we will exit at
+	 * the end of this function. Unlocking print_mtx causes problems if a
+	 * print thread happens to be waiting for the mutex because we have
+	 * just changed the ftrace settings to the original and thus the
+	 * print thread would output incorrect data from ftrace.
+	 */
+	exit(status);
+}
+
+static void init_save_state(void)
+{
+	pthread_mutexattr_t mattr;
+
+	mutexattr_init(&mattr);
+	mutexattr_settype(&mattr, PTHREAD_MUTEX_RECURSIVE);
+	mutex_init(&save_state.mutex, &mattr);
+	mutexattr_destroy(&mattr);
+
+	save_state.tracer = NULL;
+	save_state.thresh = NULL;
+	save_state.opt_valid[OPTIDX_FUNC_TR] = false;
+	save_state.opt_valid[OPTIDX_DISP_GR] = false;
+}
+
+static int printstate_next_ticket(struct entry *req)
+{
+	int r;
+
+	r = ++(printstate.ticket_counter);
+	req->ticket = r;
+	req->ticket_completed_ref = printstate.ticket_completed;
+	cond_broadcast(&printstate.cond);
+	return r;
+}
+
+static __always_inline
+void printstate_mark_req_completed(const struct entry *req)
+{
+	if (req->ticket > printstate.ticket_completed)
+		printstate.ticket_completed = req->ticket;
+}
+
+static __always_inline
+bool printstate_has_new_req_arrived(const struct entry *req)
+{
+	return (printstate.ticket_counter != req->ticket);
+}
+
+static __always_inline int printstate_cnt_inc(void)
+{
+	int value;
+
+	mutex_lock(&printstate.cnt_mutex);
+	value = ++printstate.cnt;
+	mutex_unlock(&printstate.cnt_mutex);
+	return value;
+}
+
+static __always_inline int printstate_cnt_dec(void)
+{
+	int value;
+
+	mutex_lock(&printstate.cnt_mutex);
+	value = --printstate.cnt;
+	mutex_unlock(&printstate.cnt_mutex);
+	return value;
+}
+
+static __always_inline int printstate_cnt_read(void)
+{
+	int value;
+
+	mutex_lock(&printstate.cnt_mutex);
+	value = printstate.cnt;
+	mutex_unlock(&printstate.cnt_mutex);
+	return value;
+}
+
+static __always_inline
+bool prev_req_won_race(const struct entry *req)
+{
+	return (printstate.ticket_completed != req->ticket_completed_ref);
+}
+
+static void sleeptable_resize(int size, bool printout, struct short_msg *msg)
+{
+	int bytes;
+
+	if (printout) {
+		msg->len = 0;
+		if (unlikely(size > PROB_TABLE_MAX_SIZE))
+			bytes = snprintf(msg->buf, sizeof(msg->buf),
+"Cannot increase probability table to %d (maximum size reached)\n", size);
+		else
+			bytes = snprintf(msg->buf, sizeof(msg->buf),
+"Increasing probability table to %d\n", size);
+		if (bytes < 0)
+			warn("snprintf() failed");
+		else
+			msg->len = bytes;
+	}
+
+	if (unlikely(size < 0)) {
+		/* Should never happen */
+		warnx("Bad program state at %s:%d", __FILE__, __LINE__);
+		cleanup_exit(EXIT_FAILURE);
+		return;
+	}
+	sleeptable.size = size;
+	sleeptable.table = &probabilities[PROB_TABLE_MAX_SIZE - size];
+}
+
+static void init_probabilities(void)
+{
+	int i;
+	int j = 1000;
+
+	for (i = 0; i < PROB_TABLE_MAX_SIZE; i++) {
+		probabilities[i] = 1000 / j;
+		j--;
+	}
+	mutex_init(&sleeptable.mutex, NULL);
+}
+
+static int table_get_probability(const struct entry *req,
+				 struct short_msg *msg)
+{
+	int diff = req->ticket - req->ticket_completed_ref;
+	int rval = 0;
+
+	msg->len = 0;
+	diff--;
+	/* Should never happen...*/
+	if (unlikely(diff < 0)) {
+		warnx("Programmer assumption error at %s:%d\n", __FILE__,
+		      __LINE__);
+		cleanup_exit(EXIT_FAILURE);
+	}
+	mutex_lock(&sleeptable.mutex);
+	if (diff >= (sleeptable.size - 1)) {
+		rval = sleeptable.table[sleeptable.size - 1];
+		sleeptable_resize(sleeptable.size + 1, verbose_sizechange(),
+				  msg);
+	} else {
+		rval = sleeptable.table[diff];
+	}
+	mutex_unlock(&sleeptable.mutex);
+	return rval;
+}
+
+static void init_queue(struct queue *q)
+{
+	q->next_prod_idx = 0;
+	q->next_cons_idx = 0;
+	mutex_init(&q->mutex, NULL);
+	errno = pthread_cond_init(&q->cond, NULL);
+	if (errno)
+		err(EXIT_FAILURE, "pthread_cond_init() failed");
+}
+
+static __always_inline int queue_len(const struct queue *q)
+{
+	if (q->next_prod_idx >= q->next_cons_idx)
+		return q->next_prod_idx - q->next_cons_idx;
+	else
+		return QUEUE_SIZE - q->next_cons_idx + q->next_prod_idx;
+}
+
+static __always_inline int queue_nr_free(const struct queue *q)
+{
+	int nr_free = QUEUE_SIZE - queue_len(q);
+
+	/*
+	 * If there is only one slot left we will anyway lie and claim that the
+	 * queue is full because adding an element will make it appear empty
+	 */
+	if (nr_free == 1)
+		nr_free = 0;
+	return nr_free;
+}
+
+static __always_inline void queue_idx_inc(int *idx)
+{
+	*idx = (*idx + 1) % QUEUE_SIZE;
+}
+
+static __always_inline void queue_push_to_back(struct queue *q,
+					      const struct entry *e)
+{
+	q->entries[q->next_prod_idx] = *e;
+	queue_idx_inc(&q->next_prod_idx);
+}
+
+static __always_inline struct entry queue_pop_from_front(struct queue *q)
+{
+	struct entry e = q->entries[q->next_cons_idx];
+
+	queue_idx_inc(&q->next_cons_idx);
+	return e;
+}
+
+static __always_inline void queue_cond_signal(struct queue *q)
+{
+	cond_signal(&q->cond);
+}
+
+static __always_inline void queue_cond_wait(struct queue *q)
+{
+	cond_wait(&q->cond, &q->mutex);
+}
+
+static __always_inline int queue_try_to_add_entry(struct queue *q,
+						  const struct entry *e)
+{
+	int r = 0;
+
+	mutex_lock(&q->mutex);
+	if (queue_nr_free(q) > 0) {
+		queue_push_to_back(q, e);
+		cond_signal(&q->cond);
+	} else
+		r = -1;
+	mutex_unlock(&q->mutex);
+	return r;
+}
+
+static struct entry queue_wait_for_entry(struct queue *q)
+{
+	struct entry e;
+
+	mutex_lock(&q->mutex);
+	while (true) {
+		if (queue_len(&printqueue) > 0) {
+			e = queue_pop_from_front(q);
+			break;
+		}
+		queue_cond_wait(q);
+	}
+	mutex_unlock(&q->mutex);
+
+	return e;
+}
+
+static const struct policy *policy_from_name(const char *name)
+{
+	const struct policy *p = &policies[0];
+
+	while (p->name != NULL) {
+		if (!strcmp(name, p->name))
+			return p;
+		p++;
+	}
+	return NULL;
+}
+
+static const char *policy_name(int policy)
+{
+	const struct policy *p = &policies[0];
+	static const char *rval = "unknown";
+
+	while (p->name != NULL) {
+		if (p->policy == policy)
+			return p->name;
+		p++;
+	}
+	return rval;
+}
+
+static bool is_relevant_tracer(const char *name)
+{
+	unsigned int i;
+
+	for (i = 0; relevant_tracers[i]; i++)
+		if (!strcmp(name, relevant_tracers[i]))
+			return true;
+	return false;
+}
+
+static bool random_makes_sense(const char *name)
+{
+	unsigned int i;
+
+	for (i = 0; random_tracers[i]; i++)
+		if (!strcmp(name, random_tracers[i]))
+			return true;
+	return false;
+}
+
+static void show_available(void)
+{
+	char **tracers;
+	int found = 0;
+	int i;
+
+	tracers = tracefs_tracers(NULL);
+	for (i = 0; tracers && tracers[i]; i++) {
+		if (is_relevant_tracer(tracers[i]))
+			found++;
+	}
+
+	if (!tracers) {
+		warnx(no_tracer_msg);
+		return;
+	}
+
+	if (!found) {
+		warnx(no_latency_tr_msg);
+		tracefs_list_free(tracers);
+		return;
+	}
+
+	printf("The following latency tracers are available on your system:\n");
+	for (i = 0; tracers[i]; i++) {
+		if (is_relevant_tracer(tracers[i]))
+			printf("%s\n", tracers[i]);
+	}
+	tracefs_list_free(tracers);
+}
+
+static bool tracer_valid(const char *name, bool *notracer)
+{
+	char **tracers;
+	int i;
+	bool rval = false;
+
+	*notracer = false;
+	tracers = tracefs_tracers(NULL);
+	if (!tracers) {
+		*notracer = true;
+		return false;
+	}
+	for (i = 0; tracers[i]; i++)
+		if (!strcmp(tracers[i], name)) {
+			rval = true;
+			break;
+		}
+	tracefs_list_free(tracers);
+	return rval;
+}
+
+static const char *find_default_tracer(void)
+{
+	int i;
+	bool notracer;
+	bool valid;
+
+	for (i = 0; relevant_tracers[i]; i++) {
+		valid = tracer_valid(relevant_tracers[i], &notracer);
+		if (notracer)
+			errx(EXIT_FAILURE, no_tracer_msg);
+		if (valid)
+			return relevant_tracers[i];
+	}
+	return NULL;
+}
+
+static bool toss_coin(struct drand48_data *buffer, unsigned int prob)
+{
+	long r;
+
+	if (unlikely(lrand48_r(buffer, &r))) {
+		warnx("lrand48_r() failed");
+		cleanup_exit(EXIT_FAILURE);
+	}
+	r = r % 1000L;
+	if (r < prob)
+		return true;
+	else
+		return false;
+}
+
+
+static long go_to_sleep(const struct entry *req)
+{
+	struct timespec future;
+	long delay = sleep_time;
+
+	get_time_in_future(&future, delay);
+
+	mutex_lock(&printstate.mutex);
+	while (!printstate_has_new_req_arrived(req)) {
+		cond_timedwait(&printstate.cond, &printstate.mutex, &future);
+		if (time_has_passed(&future))
+			break;
+	};
+
+	if (printstate_has_new_req_arrived(req))
+		delay = -1;
+	mutex_unlock(&printstate.mutex);
+
+	return delay;
+}
+
+
+static void set_priority(void)
+{
+	int r;
+	pid_t pid;
+	struct sched_param param;
+
+	memset(&param, 0, sizeof(param));
+	param.sched_priority = sched_pri;
+
+	pid = getpid();
+	r = sched_setscheduler(pid, sched_policy, &param);
+
+	if (r != 0)
+		err(EXIT_FAILURE, "sched_setscheduler() failed");
+}
+
+pid_t latency_collector_gettid(void)
+{
+	return (pid_t) syscall(__NR_gettid);
+}
+
+static void print_priority(void)
+{
+	pid_t tid;
+	int policy;
+	int r;
+	struct sched_param param;
+
+	tid = latency_collector_gettid();
+	r = pthread_getschedparam(pthread_self(), &policy, &param);
+	if (r != 0) {
+		warn("pthread_getschedparam() failed");
+		cleanup_exit(EXIT_FAILURE);
+	}
+	mutex_lock(&print_mtx);
+	printf("Thread %d runs with scheduling policy %s and priority %d\n",
+	       tid, policy_name(policy), param.sched_priority);
+	mutex_unlock(&print_mtx);
+}
+
+static __always_inline
+void __print_skipmessage(const struct short_msg *resize_msg,
+			 const struct timespec *timestamp, char *buffer,
+			 size_t bufspace, const struct entry *req, bool excuse,
+			 const char *str)
+{
+	ssize_t bytes = 0;
+	char *p = &buffer[0];
+	long us, sec;
+	int r;
+
+	sec = timestamp->tv_sec;
+	us = timestamp->tv_nsec / 1000;
+
+	if (resize_msg != NULL && resize_msg->len > 0) {
+		strncpy(p, resize_msg->buf, resize_msg->len);
+		bytes += resize_msg->len;
+		p += resize_msg->len;
+		bufspace -= resize_msg->len;
+	}
+
+	if (excuse)
+		r = snprintf(p, bufspace,
+"%ld.%06ld Latency %d printout skipped due to %s\n",
+			     sec, us, req->ticket, str);
+	else
+		r = snprintf(p, bufspace, "%ld.%06ld Latency %d detected\n",
+			    sec, us, req->ticket);
+
+	if (r < 0)
+		warn("snprintf() failed");
+	else
+		bytes += r;
+
+	/* These prints could happen concurrently */
+	mutex_lock(&print_mtx);
+	write_or_die(fd_stdout, buffer, bytes);
+	mutex_unlock(&print_mtx);
+}
+
+static void print_skipmessage(const struct short_msg *resize_msg,
+			      const struct timespec *timestamp, char *buffer,
+			      size_t bufspace, const struct entry *req,
+			      bool excuse)
+{
+	__print_skipmessage(resize_msg, timestamp, buffer, bufspace, req,
+			    excuse, "random delay");
+}
+
+static void print_lostmessage(const struct timespec *timestamp, char *buffer,
+			      size_t bufspace, const struct entry *req,
+			      const char *reason)
+{
+	__print_skipmessage(NULL, timestamp, buffer, bufspace, req, true,
+			    reason);
+}
+
+static void print_tracefile(const struct short_msg *resize_msg,
+			    const struct timespec *timestamp, char *buffer,
+			    size_t bufspace, long slept,
+			    const struct entry *req)
+{
+	static const int reserve = 256;
+	char *p = &buffer[0];
+	ssize_t bytes = 0;
+	ssize_t bytes_tot = 0;
+	long us, sec;
+	long slept_ms;
+	int trace_fd;
+
+	/* Save some space for the final string and final null char */
+	bufspace = bufspace - reserve - 1;
+
+	if (resize_msg != NULL && resize_msg->len > 0) {
+		bytes = resize_msg->len;
+		strncpy(p, resize_msg->buf, bytes);
+		bytes_tot += bytes;
+		p += bytes;
+		bufspace -= bytes;
+	}
+
+	trace_fd = open(debug_tracefile, O_RDONLY);
+
+	if (trace_fd < 0) {
+		warn("open() failed on %s", debug_tracefile);
+		return;
+	}
+
+	sec = timestamp->tv_sec;
+	us = timestamp->tv_nsec / 1000;
+
+	if (slept != 0) {
+		slept_ms = slept / 1000;
+		bytes = snprintf(p, bufspace,
+"%ld.%06ld Latency %d randomly sleep for %ld ms before print\n",
+				 sec, us, req->ticket, slept_ms);
+	} else {
+		bytes = snprintf(p, bufspace,
+				 "%ld.%06ld Latency %d immediate print\n", sec,
+				 us, req->ticket);
+	}
+
+	if (bytes < 0) {
+		warn("snprintf() failed");
+		return;
+	}
+	p += bytes;
+	bufspace -= bytes;
+	bytes_tot += bytes;
+
+	bytes = snprintf(p, bufspace,
+">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BEGIN <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"
+		);
+
+	if (bytes < 0) {
+		warn("snprintf() failed");
+		return;
+	}
+
+	p += bytes;
+	bufspace -= bytes;
+	bytes_tot += bytes;
+
+	do {
+		bytes = read(trace_fd, p, bufspace);
+		if (bytes < 0) {
+			if (errno == EINTR)
+				continue;
+			warn("read() failed on %s", debug_tracefile);
+			if (unlikely(close(trace_fd) != 0))
+				warn("close() failed on %s", debug_tracefile);
+			return;
+		}
+		if (bytes == 0)
+			break;
+		p += bytes;
+		bufspace -= bytes;
+		bytes_tot += bytes;
+	} while (true);
+
+	if (unlikely(close(trace_fd) != 0))
+		warn("close() failed on %s", debug_tracefile);
+
+	printstate_cnt_dec();
+	/* Add the reserve space back to the budget for the final string */
+	bufspace += reserve;
+
+	bytes = snprintf(p, bufspace,
+			 ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> END <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n");
+
+	if (bytes < 0) {
+		warn("snprintf() failed");
+		return;
+	}
+
+	bytes_tot += bytes;
+
+	/* These prints could happen concurrently */
+	mutex_lock(&print_mtx);
+	write_or_die(fd_stdout, buffer, bytes_tot);
+	mutex_unlock(&print_mtx);
+}
+
+static char *get_no_opt(const char *opt)
+{
+	char *no_opt;
+	int s;
+
+	s = strlen(opt) + strlen(OPT_NO_PREFIX) + 1;
+	/* We may be called from cleanup_exit() via set_trace_opt() */
+	no_opt = malloc_or_die_nocleanup(s);
+	strcpy(no_opt, OPT_NO_PREFIX);
+	strcat(no_opt, opt);
+	return no_opt;
+}
+
+static char *find_next_optstr(const char *allopt, const char **next)
+{
+	const char *begin;
+	const char *end;
+	char *r;
+	int s = 0;
+
+	if (allopt == NULL)
+		return NULL;
+
+	for (begin = allopt; *begin != '\0'; begin++) {
+		if (isgraph(*begin))
+			break;
+	}
+
+	if (*begin == '\0')
+		return NULL;
+
+	for (end = begin; *end != '\0' && isgraph(*end); end++)
+		s++;
+
+	r = malloc_or_die_nocleanup(s + 1);
+	strncpy(r, begin, s);
+	r[s] = '\0';
+	*next = begin + s;
+	return r;
+}
+
+static bool get_trace_opt(const char *allopt, const char *opt, bool *found)
+{
+	*found = false;
+	char *no_opt;
+	char *str;
+	const char *next = allopt;
+	bool rval = false;
+
+	no_opt = get_no_opt(opt);
+
+	do {
+		str = find_next_optstr(next, &next);
+		if (str == NULL)
+			break;
+		if (!strcmp(str, opt)) {
+			*found = true;
+			rval = true;
+			free(str);
+			break;
+		}
+		if (!strcmp(str, no_opt)) {
+			*found = true;
+			rval = false;
+			free(str);
+			break;
+		}
+		free(str);
+	} while (true);
+	free(no_opt);
+
+	return rval;
+}
+
+static int set_trace_opt(const char *opt, bool value)
+{
+	char *str;
+	int r;
+
+	if (value)
+		str = strdup(opt);
+	else
+		str = get_no_opt(opt);
+
+	r = tracefs_instance_file_write(NULL, TR_OPTIONS, str);
+	free(str);
+	return r;
+}
+
+void save_trace_opts(struct ftrace_state *state)
+{
+	char *allopt;
+	int psize;
+	int i;
+
+	allopt = tracefs_instance_file_read(NULL, TR_OPTIONS, &psize);
+	if (!allopt)
+		errx(EXIT_FAILURE, "Failed to read the %s file\n", TR_OPTIONS);
+
+	for (i = 0; i < OPTIDX_NR; i++)
+		state->opt[i] = get_trace_opt(allopt, optstr[i],
+					      &state->opt_valid[i]);
+
+	free(allopt);
+}
+
+static void write_file(const char *file, const char *cur, const char *new,
+		       enum errhandling h)
+{
+	int r;
+	static const char *emsg = "Failed to write to the %s file!";
+
+	/* Do nothing if we now that the current and new value are equal */
+	if (cur && !needs_change(cur, new))
+		return;
+
+	r = tracefs_instance_file_write(NULL, file, new);
+	if (r < 0) {
+		if (h) {
+			warnx(emsg, file);
+			if (h == ERR_CLEANUP)
+				cleanup_exit(EXIT_FAILURE);
+		} else
+			errx(EXIT_FAILURE, emsg, file);
+	}
+	if (verbose_ftrace()) {
+		mutex_lock(&print_mtx);
+		printf("%s was set to %s\n", file, new);
+		mutex_unlock(&print_mtx);
+	}
+}
+
+static void reset_max_latency(void)
+{
+	write_file(TR_MAXLAT, NULL, "0", ERR_CLEANUP);
+}
+
+static void save_and_disable_tracer(void)
+{
+	char *orig_th;
+	char *tracer;
+	bool need_nop = false;
+
+	mutex_lock(&save_state.mutex);
+
+	save_trace_opts(&save_state);
+	tracer = read_file(TR_CURRENT, ERR_EXIT);
+	orig_th = read_file(TR_THRESH, ERR_EXIT);
+
+	if (needs_change(tracer, NOP_TRACER)) {
+		mutex_lock(&print_mtx);
+		if (force_tracer) {
+			printf(
+				"The %s tracer is already in use but proceeding anyway!\n",
+				tracer);
+		} else {
+			printf(
+				"The %s tracer is already in use, cowardly bailing out!\n"
+				"This could indicate that another program or instance is tracing.\n"
+				"Use the -F [--force] option to disregard the current tracer.\n", tracer);
+			exit(0);
+		}
+		mutex_unlock(&print_mtx);
+		need_nop = true;
+	}
+
+	save_state.tracer =  tracer;
+	save_state.thresh = orig_th;
+
+	if (need_nop)
+		write_file(TR_CURRENT, NULL, NOP_TRACER, ERR_EXIT);
+
+	mutex_unlock(&save_state.mutex);
+}
+
+void set_trace_opts(struct ftrace_state *state, bool *new)
+{
+	int i;
+	int r;
+
+	/*
+	 * We only set options if we earlier detected that the option exists in
+	 * the trace_options file and that the wanted setting is different from
+	 * the one we saw in save_and_disable_tracer()
+	 */
+	for (i = 0; i < OPTIDX_NR; i++)
+		if (state->opt_valid[i] &&
+		    state->opt[i] != new[i]) {
+			r = set_trace_opt(optstr[i], new[i]);
+			if (r < 0) {
+				warnx("Failed to set the %s option to %s",
+				      optstr[i], bool2str(new[i]));
+				cleanup_exit(EXIT_FAILURE);
+			}
+			if (verbose_ftrace()) {
+				mutex_lock(&print_mtx);
+				printf("%s in %s was set to %s\n", optstr[i],
+				       TR_OPTIONS, bool2str(new[i]));
+				mutex_unlock(&print_mtx);
+			}
+		}
+}
+
+static void enable_tracer(void)
+{
+	mutex_lock(&save_state.mutex);
+	set_trace_opts(&save_state, use_options);
+
+	write_file(TR_THRESH, save_state.thresh, threshold, ERR_CLEANUP);
+	write_file(TR_CURRENT, NOP_TRACER, current_tracer, ERR_CLEANUP);
+
+	mutex_unlock(&save_state.mutex);
+}
+
+static void tracing_loop(void)
+{
+	int ifd = inotify_init();
+	int wd;
+	const ssize_t bufsize = sizeof(inotify_buffer);
+	const ssize_t istructsize = sizeof(struct inotify_event);
+	char *buf = &inotify_buffer[0];
+	ssize_t nr_read;
+	char *p;
+	int modified;
+	struct inotify_event *event;
+	struct entry req;
+	char *buffer;
+	const size_t bufspace = PRINT_BUFFER_SIZE;
+	struct timespec timestamp;
+
+	print_priority();
+
+	buffer = malloc_or_die(bufspace);
+
+	if (ifd < 0)
+		err(EXIT_FAILURE, "inotify_init() failed!");
+
+
+	if (setup_ftrace) {
+		/*
+		 * We must disable the tracer before resetting the max_latency
+		 */
+		save_and_disable_tracer();
+		/*
+		 * We must reset the max_latency before the inotify_add_watch()
+		 * call.
+		 */
+		reset_max_latency();
+	}
+
+	wd = inotify_add_watch(ifd, debug_maxlat, IN_MODIFY);
+	if (wd < 0)
+		err(EXIT_FAILURE, "inotify_add_watch() failed!");
+
+	if (setup_ftrace)
+		enable_tracer();
+
+	signal_blocking(SIG_UNBLOCK);
+
+	while (true) {
+		modified = 0;
+		check_signals();
+		nr_read = read(ifd, buf, bufsize);
+		check_signals();
+		if (nr_read < 0) {
+			if (errno == EINTR)
+				continue;
+			warn("read() failed on inotify fd!");
+			cleanup_exit(EXIT_FAILURE);
+		}
+		if (nr_read == bufsize)
+			warnx("inotify() buffer filled, skipping events");
+		if (nr_read < istructsize) {
+			warnx("read() returned too few bytes on inotify fd");
+			cleanup_exit(EXIT_FAILURE);
+		}
+
+		for (p = buf; p < buf + nr_read;) {
+			event = (struct inotify_event *) p;
+			if ((event->mask & IN_MODIFY) != 0)
+				modified++;
+			p += istructsize + event->len;
+		}
+		while (modified > 0) {
+			check_signals();
+			mutex_lock(&printstate.mutex);
+			check_signals();
+			printstate_next_ticket(&req);
+			if (printstate_cnt_read() > 0) {
+				printstate_mark_req_completed(&req);
+				mutex_unlock(&printstate.mutex);
+				if (verbose_lostevent()) {
+					clock_gettime_or_die(CLOCK_MONOTONIC,
+							     &timestamp);
+					print_lostmessage(&timestamp, buffer,
+							  bufspace, &req,
+							  "inotify loop");
+				}
+				break;
+			}
+			mutex_unlock(&printstate.mutex);
+			if (queue_try_to_add_entry(&printqueue, &req) != 0) {
+				/* These prints could happen concurrently */
+				check_signals();
+				mutex_lock(&print_mtx);
+				check_signals();
+				write_or_die(fd_stdout, queue_full_warning,
+					     sizeof(queue_full_warning));
+				mutex_unlock(&print_mtx);
+			}
+			modified--;
+		}
+	}
+}
+
+static void *do_printloop(void *arg)
+{
+	const size_t bufspace = PRINT_BUFFER_SIZE;
+	char *buffer;
+	long *rseed = (long *) arg;
+	struct drand48_data drandbuf;
+	long slept = 0;
+	struct entry req;
+	int prob = 0;
+	struct timespec timestamp;
+	struct short_msg resize_msg;
+
+	print_priority();
+
+	if (srand48_r(*rseed, &drandbuf) != 0) {
+		warn("srand48_r() failed!\n");
+		cleanup_exit(EXIT_FAILURE);
+	}
+
+	buffer = malloc_or_die(bufspace);
+
+	while (true) {
+		req = queue_wait_for_entry(&printqueue);
+		clock_gettime_or_die(CLOCK_MONOTONIC, &timestamp);
+		mutex_lock(&printstate.mutex);
+		if (prev_req_won_race(&req)) {
+			printstate_mark_req_completed(&req);
+			mutex_unlock(&printstate.mutex);
+			if (verbose_lostevent())
+				print_lostmessage(&timestamp, buffer, bufspace,
+						  &req, "print loop");
+			continue;
+		}
+		mutex_unlock(&printstate.mutex);
+
+		/*
+		 * Toss a coin to decide if we want to sleep before printing
+		 * out the backtrace. The reason for this is that opening
+		 * /sys/kernel/debug/tracing/trace will cause a blackout of
+		 * hundreds of ms, where no latencies will be noted by the
+		 * latency tracer. Thus by randomly sleeping we try to avoid
+		 * missing traces systematically due to this. With this option
+		 * we will sometimes get the first latency, some other times
+		 * some of the later ones, in case of closely spaced traces.
+		 */
+		if (trace_enable && use_random_sleep) {
+			slept = 0;
+			prob = table_get_probability(&req, &resize_msg);
+			if (!toss_coin(&drandbuf, prob))
+				slept = go_to_sleep(&req);
+			if (slept >= 0) {
+				/* A print is ongoing */
+				printstate_cnt_inc();
+				/*
+				 * We will do the printout below so we have to
+				 * mark it as completed while we still have the
+				 * mutex.
+				 */
+				mutex_lock(&printstate.mutex);
+				printstate_mark_req_completed(&req);
+				mutex_unlock(&printstate.mutex);
+			}
+		}
+		if (trace_enable) {
+			/*
+			 * slept < 0  means that we detected another
+			 * notification in go_to_sleep() above
+			 */
+			if (slept >= 0)
+				/*
+				 * N.B. printstate_cnt_dec(); will be called
+				 * inside print_tracefile()
+				 */
+				print_tracefile(&resize_msg, &timestamp, buffer,
+						bufspace, slept, &req);
+			else
+				print_skipmessage(&resize_msg, &timestamp,
+						  buffer, bufspace, &req, true);
+		} else {
+			print_skipmessage(&resize_msg, &timestamp, buffer,
+					  bufspace, &req, false);
+		}
+	}
+	return NULL;
+}
+
+static void start_printthread(void)
+{
+	unsigned int i;
+	long *seed;
+	int ufd;
+
+	ufd = open(DEV_URANDOM, O_RDONLY);
+	if (nr_threads > MAX_THREADS) {
+		warnx(
+"Number of requested print threads was %d, max number is %d\n",
+		      nr_threads, MAX_THREADS);
+		nr_threads = MAX_THREADS;
+	}
+	for (i = 0; i < nr_threads; i++) {
+		seed = malloc_or_die(sizeof(*seed));
+		if (ufd <  0 ||
+		    read(ufd, seed, sizeof(*seed)) != sizeof(*seed)) {
+			printf(
+"Warning! Using trivial random nummer seed, since %s not available\n",
+			DEV_URANDOM);
+			fflush(stdout);
+			*seed = i;
+		}
+		errno = pthread_create(&printthread[i], NULL, do_printloop,
+				       seed);
+		if (errno)
+			err(EXIT_FAILURE, "pthread_create()");
+	}
+	if (ufd > 0 && close(ufd) != 0)
+		warn("close() failed");
+}
+
+static void show_usage(void)
+{
+	printf(
+"Usage: %s [OPTION]...\n\n"
+"Collect closely occurring latencies from %s\n"
+"with any of the following tracers: preemptirqsoff, preemptoff, irqsoff, "
+"wakeup,\nwakeup_dl, or wakeup_rt.\n\n"
+
+"The occurrence of a latency is detected by monitoring the file\n"
+"%s with inotify.\n\n"
+
+"The following options are supported:\n\n"
+
+"-l, --list\t\tList the latency tracers that are supported by the\n"
+"\t\t\tcurrently running Linux kernel. If you don't see the\n"
+"\t\t\ttracer that you want, you will probably need to\n"
+"\t\t\tchange your kernel config and build a new kernel.\n\n"
+
+"-t, --tracer TR\t\tUse the tracer TR. The default is to use the first\n"
+"\t\t\ttracer that is supported by the kernel in the following\n"
+"\t\t\torder of precedence:\n\n"
+"\t\t\tpreemptirqsoff\n"
+"\t\t\tpreemptoff\n"
+"\t\t\tirqsoff\n"
+"\t\t\twakeup\n"
+"\t\t\twakeup_rt\n"
+"\t\t\twakeup_dl\n"
+"\n"
+"\t\t\tIf TR is not on the list above, then a warning will be\n"
+"\t\t\tprinted.\n\n"
+
+"-F, --force\t\tProceed even if another ftrace tracer is active. Without\n"
+"\t\t\tthis option, the program will refuse to start tracing if\n"
+"\t\t\tany other tracer than the nop tracer is active.\n\n"
+
+"-s, --threshold TH\tConfigure ftrace to use a threshold of TH microseconds\n"
+"\t\t\tfor the tracer. The default is 0, which means that\n"
+"\t\t\ttracing_max_latency will be used. tracing_max_latency is\n"
+"\t\t\tset to 0 when the program is started and contains the\n"
+"\t\t\tmaximum of the latencies that have been encountered.\n\n"
+
+"-f, --function\t\tEnable the function-trace option in trace_options. With\n"
+"\t\t\tthis option, ftrace will trace the functions that are\n"
+"\t\t\texecuted during a latency, without it we only get the\n"
+"\t\t\tbeginning, end, and backtrace.\n\n"
+
+"-g, --graph\t\tEnable the display-graph option in trace_option. This\n"
+"\t\t\toption causes ftrace to show the functionph of how\n"
+"\t\t\tfunctions are calling other functions.\n\n"
+
+"-c, --policy POL\tRun the program with scheduling policy POL. POL can be\n"
+"\t\t\tother, batch, idle, rr or fifo. The default is rr. When\n"
+"\t\t\tusing rr or fifo, remember that these policies may cause\n"
+"\t\t\tother tasks to experience latencies.\n\n"
+
+"-p, --priority PRI\tRun the program with priority PRI. The acceptable range\n"
+"\t\t\tof PRI depends on the scheduling policy.\n\n"
+
+"-n, --notrace\t\tIf latency is detected, do not print out the content of\n"
+"\t\t\tthe trace file to standard output\n\n"
+
+"-t, --threads NRTHR\tRun NRTHR threads for printing. Default is %d.\n\n"
+
+"-r, --random\t\tArbitrarily sleep a certain amount of time, default\n"
+"\t\t\t%ld ms, before reading the trace file. The\n"
+"\t\t\tprobabilities for sleep are chosen so that the\n"
+"\t\t\tprobability of obtaining any of a cluster of closely\n"
+"\t\t\toccurring latencies are equal, i.e. we will randomly\n"
+"\t\t\tchoose which one we collect from the trace file.\n\n"
+"\t\t\tThis option is probably only useful with the irqsoff,\n"
+"\t\t\tpreemptoff, and preemptirqsoff tracers.\n\n"
+
+"-a, --nrlat NRLAT\tFor the purpose of arbitrary delay, assume that there\n"
+"\t\t\tare no more than NRLAT clustered latencies. If NRLAT\n"
+"\t\t\tlatencies are detected during a run, this value will\n"
+"\t\t\tautomatically be increased to NRLAT + 1 and then to\n"
+"\t\t\tNRLAT + 2 and so on. The default is %d. This option\n"
+"\t\t\timplies -r. We need to know this number in order to\n"
+"\t\t\tbe able to calculate the probabilities of sleeping.\n"
+"\t\t\tSpecifically, the probabilities of not sleeping, i.e. to\n"
+"\t\t\tdo an immediate printout will be:\n\n"
+"\t\t\t1/NRLAT  1/(NRLAT - 1) ... 1/3  1/2  1\n\n"
+"\t\t\tThe probability of sleeping will be:\n\n"
+"\t\t\t1 - P, where P is from the series above\n\n"
+"\t\t\tThis descending probability will cause us to choose\n"
+"\t\t\tan occurrence at random. Observe that the final\n"
+"\t\t\tprobability is 0, it is when we reach this probability\n"
+"\t\t\tthat we increase NRLAT automatically. As an example,\n"
+"\t\t\twith the default value of 2, the probabilities will be:\n\n"
+"\t\t\t1/2  0\n\n"
+"\t\t\tThis means, when a latency is detected we will sleep\n"
+"\t\t\twith 50%% probability. If we ever detect another latency\n"
+"\t\t\tduring the sleep period, then the probability of sleep\n"
+"\t\t\twill be 0%% and the table will be expanded to:\n\n"
+"\t\t\t1/3  1/2  0\n\n"
+
+"-v, --verbose\t\tIncrease the verbosity. If this option is given once,\n"
+"\t\t\tthen print a message every time that the NRLAT value\n"
+"\t\t\tis automatically increased. It also causes a message to\n"
+"\t\t\tbe printed when the ftrace settings are changed. If this\n"
+"\t\t\toption is given at least twice, then also print a\n"
+"\t\t\twarning for lost events.\n\n"
+
+"-u, --time TIME\t\tArbitrarily sleep for a specified time TIME ms before\n"
+"\t\t\tprinting out the trace from the trace file. The default\n"
+"\t\t\tis %ld ms. This option implies -r.\n\n"
+
+"-x, --no-ftrace\t\tDo not configure ftrace. This assume that the user\n"
+"\t\t\tconfigures the ftrace files in sysfs such as\n"
+"\t\t\t/sys/kernel/tracing/current_tracer or equivalent.\n\n"
+
+"-i, --tracefile FILE\tUse FILE as trace file. The default is\n"
+"\t\t\t%s.\n"
+"\t\t\tThis options implies -x\n\n"
+
+"-m, --max-lat FILE\tUse FILE as tracing_max_latency file. The default is\n"
+"\t\t\t%s.\n"
+"\t\t\tThis options implies -x\n\n"
+,
+prg_name, debug_tracefile_dflt, debug_maxlat_dflt, DEFAULT_NR_PRINTER_THREADS,
+SLEEP_TIME_MS_DEFAULT, DEFAULT_TABLE_SIZE, SLEEP_TIME_MS_DEFAULT,
+debug_tracefile_dflt, debug_maxlat_dflt);
+}
+
+static void find_tracefiles(void)
+{
+	debug_tracefile_dflt = tracefs_get_tracing_file("trace");
+	if (debug_tracefile_dflt == NULL) {
+		/* This is needed in show_usage() */
+		debug_tracefile_dflt = DEBUG_NOFILE;
+	}
+
+	debug_maxlat_dflt = tracefs_get_tracing_file("tracing_max_latency");
+	if (debug_maxlat_dflt == NULL) {
+		/* This is needed in show_usage() */
+		debug_maxlat_dflt = DEBUG_NOFILE;
+	}
+
+	debug_tracefile = debug_tracefile_dflt;
+	debug_maxlat = debug_maxlat_dflt;
+}
+
+bool alldigits(const char *s)
+{
+	for (; *s != '\0'; s++)
+		if (!isdigit(*s))
+			return false;
+	return true;
+}
+
+void check_alldigits(const char *optarg, const char *argname)
+{
+	if (!alldigits(optarg))
+		errx(EXIT_FAILURE,
+		     "The %s parameter expects a decimal argument\n", argname);
+}
+
+static void scan_arguments(int argc, char *argv[])
+{
+	int c;
+	int i;
+	int option_idx = 0;
+
+	static struct option long_options[] = {
+		{ "list",       no_argument,            0, 'l' },
+		{ "tracer",	required_argument,	0, 't' },
+		{ "force",      no_argument,            0, 'F' },
+		{ "threshold",  required_argument,      0, 's' },
+		{ "function",   no_argument,            0, 'f' },
+		{ "graph",      no_argument,            0, 'g' },
+		{ "policy",	required_argument,	0, 'c' },
+		{ "priority",	required_argument,	0, 'p' },
+		{ "help",	no_argument,		0, 'h' },
+		{ "notrace",	no_argument,		0, 'n' },
+		{ "random",	no_argument,		0, 'r' },
+		{ "nrlat",	required_argument,	0, 'a' },
+		{ "threads",	required_argument,	0, 'e' },
+		{ "time",	required_argument,	0, 'u' },
+		{ "verbose",	no_argument,		0, 'v' },
+		{ "no-ftrace",  no_argument,            0, 'x' },
+		{ "tracefile",	required_argument,	0, 'i' },
+		{ "max-lat",	required_argument,	0, 'm' },
+		{ 0,		0,			0,  0  }
+	};
+	const struct policy *p;
+	int max, min;
+	int value;
+	bool notracer, valid;
+
+	/*
+	 * We must do this before parsing the arguments because show_usage()
+	 * needs to display these.
+	 */
+	find_tracefiles();
+
+	while (true) {
+		c = getopt_long(argc, argv, "lt:Fs:fgc:p:hnra:e:u:vxi:m:",
+				long_options, &option_idx);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'l':
+			show_available();
+			exit(0);
+			break;
+		case 't':
+			current_tracer = strdup(optarg);
+			if (!is_relevant_tracer(current_tracer)) {
+				warnx("%s is not a known latency tracer!\n",
+				      current_tracer);
+			}
+			valid = tracer_valid(current_tracer, &notracer);
+			if (notracer)
+				errx(EXIT_FAILURE, no_tracer_msg);
+			if (!valid)
+				errx(EXIT_FAILURE,
+"The tracer %s is not supported by your kernel!\n", current_tracer);
+			break;
+		case 'F':
+			force_tracer = true;
+			break;
+		case 's':
+			check_alldigits(optarg, "-s [--threshold]");
+			threshold = strdup(optarg);
+			break;
+		case 'f':
+			use_options[OPTIDX_FUNC_TR] = true;
+			break;
+		case 'g':
+			use_options[OPTIDX_DISP_GR] = true;
+			break;
+		case 'c':
+			p = policy_from_name(optarg);
+			if (p != NULL) {
+				sched_policy = p->policy;
+				sched_policy_set = true;
+				if (!sched_pri_set) {
+					sched_pri = p->default_pri;
+					sched_pri_set = true;
+				}
+			} else {
+				warnx("Unknown scheduling %s\n", optarg);
+				show_usage();
+				exit(0);
+			}
+			break;
+		case 'p':
+			check_alldigits(optarg, "-p [--priority]");
+			sched_pri = atoi(optarg);
+			sched_pri_set = true;
+			break;
+		case 'h':
+			show_usage();
+			exit(0);
+			break;
+		case 'n':
+			trace_enable = false;
+			use_random_sleep = false;
+			break;
+		case 'e':
+			check_alldigits(optarg, "-e [--threads]");
+			value = atoi(optarg);
+			if (value > 0)
+				nr_threads = value;
+			else {
+				warnx("NRTHR must be > 0\n");
+				show_usage();
+				exit(0);
+			}
+			break;
+		case 'u':
+			check_alldigits(optarg, "-u [--time]");
+			value = atoi(optarg);
+			if (value < 0) {
+				warnx("TIME must be >= 0\n");
+				show_usage();
+				;
+			}
+			trace_enable = true;
+			use_random_sleep = true;
+			sleep_time = value * USEC_PER_MSEC;
+			break;
+		case 'v':
+			verbosity++;
+			break;
+		case 'r':
+			trace_enable = true;
+			use_random_sleep = true;
+			break;
+		case 'a':
+			check_alldigits(optarg, "-a [--nrlat]");
+			value = atoi(optarg);
+			if (value <= 0) {
+				warnx("NRLAT must be > 0\n");
+				show_usage();
+				exit(0);
+			}
+			trace_enable = true;
+			use_random_sleep = true;
+			table_startsize = value;
+			break;
+		case 'x':
+			setup_ftrace = false;
+			break;
+		case 'i':
+			setup_ftrace = false;
+			debug_tracefile = strdup(optarg);
+			break;
+		case 'm':
+			setup_ftrace = false;
+			debug_maxlat = strdup(optarg);
+			break;
+		default:
+			show_usage();
+			exit(0);
+			break;
+		}
+	}
+
+	if (setup_ftrace) {
+		if (!current_tracer) {
+			current_tracer = find_default_tracer();
+			if (!current_tracer)
+				errx(EXIT_FAILURE,
+"No default tracer found and tracer not specified\n");
+		}
+
+		if (use_random_sleep && !random_makes_sense(current_tracer)) {
+			warnx("WARNING: The tracer is %s and random sleep has",
+			      current_tracer);
+			fprintf(stderr,
+"been enabled. Random sleep is intended for the following tracers:\n");
+			for (i = 0; random_tracers[i]; i++)
+				fprintf(stderr, "%s\n", random_tracers[i]);
+			fprintf(stderr, "\n");
+		}
+	}
+
+	if (debug_tracefile == DEBUG_NOFILE ||
+	    debug_maxlat == DEBUG_NOFILE)
+		errx(EXIT_FAILURE,
+"Could not find tracing directory e.g. /sys/kernel/tracing\n");
+
+	if (!sched_policy_set) {
+		sched_policy = SCHED_RR;
+		sched_policy_set = true;
+		if (!sched_pri_set) {
+			sched_pri = RT_DEFAULT_PRI;
+			sched_pri_set = true;
+		}
+	}
+
+	max = sched_get_priority_max(sched_policy);
+	min = sched_get_priority_min(sched_policy);
+
+	if (sched_pri < min) {
+		printf(
+"ATTENTION: Increasing priority to minimum, which is %d\n", min);
+		sched_pri = min;
+	}
+	if (sched_pri > max) {
+		printf(
+"ATTENTION: Reducing priority to maximum, which is %d\n", max);
+		sched_pri = max;
+	}
+}
+
+static void show_params(void)
+{
+	printf(
+"\n"
+"Running with scheduling policy %s and priority %d. Using %d print threads.\n",
+		policy_name(sched_policy), sched_pri, nr_threads);
+	if (trace_enable) {
+		if (use_random_sleep) {
+			printf(
+"%s will be printed with random delay\n"
+"Start size of the probability table:\t\t\t%d\n"
+"Print a message when the prob. table changes size:\t%s\n"
+"Print a warning when an event has been lost:\t\t%s\n"
+"Sleep time is:\t\t\t\t\t\t%ld ms\n",
+debug_tracefile,
+table_startsize,
+bool2str(verbose_sizechange()),
+bool2str(verbose_lostevent()),
+sleep_time / USEC_PER_MSEC);
+		} else {
+			printf("%s will be printed immediately\n",
+			       debug_tracefile);
+		}
+	} else {
+		printf("%s will not be printed\n",
+		       debug_tracefile);
+	}
+	if (setup_ftrace) {
+		printf("Tracer:\t\t\t\t\t\t\t%s\n"
+		       "%s option:\t\t\t\t\t%s\n"
+		       "%s option:\t\t\t\t\t%s\n",
+		       current_tracer,
+		       optstr[OPTIDX_FUNC_TR],
+		       bool2str(use_options[OPTIDX_FUNC_TR]),
+		       optstr[OPTIDX_DISP_GR],
+		       bool2str(use_options[OPTIDX_DISP_GR]));
+		if (!strcmp(threshold, "0"))
+			printf("Threshold:\t\t\t\t\t\ttracing_max_latency\n");
+		else
+			printf("Threshold:\t\t\t\t\t\t%s\n", threshold);
+	}
+	printf("\n");
+}
+
+int main(int argc, char *argv[])
+{
+	init_save_state();
+	signal_blocking(SIG_BLOCK);
+	setup_sig_handler();
+	open_stdout();
+
+	if (argc >= 1)
+		prg_name = argv[0];
+	else
+		prg_name = prg_unknown;
+
+	scan_arguments(argc, argv);
+	show_params();
+
+	init_printstate();
+	init_print_mtx();
+	if (use_random_sleep) {
+		init_probabilities();
+		if (verbose_sizechange())
+			printf("Initializing probability table to %d\n",
+			       table_startsize);
+		sleeptable_resize(table_startsize, false, NULL);
+	}
+	set_priority();
+	init_queue(&printqueue);
+	start_printthread();
+	tracing_loop();
+	return 0;
+}
-- 
cgit v1.2.3


From c85c9a2c6e368dc94907e63babb18a9788e5c9b6 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 18 Feb 2021 23:29:23 +0900
Subject: kprobes: Fix to delay the kprobes jump optimization

Commit 36dadef23fcc ("kprobes: Init kprobes in early_initcall")
moved the kprobe setup in early_initcall(), which includes kprobe
jump optimization.
The kprobes jump optimizer involves synchronize_rcu_tasks() which
depends on the ksoftirqd and rcu_spawn_tasks_*(). However, since
those are setup in core_initcall(), kprobes jump optimizer can not
run at the early_initcall().

To avoid this issue, make the kprobe optimization disabled in the
early_initcall() and enables it in subsys_initcall().

Note that non-optimized kprobes is still available after
early_initcall(). Only jump optimization is delayed.

Link: https://lkml.kernel.org/r/161365856280.719838.12423085451287256713.stgit@devnote2

Fixes: 36dadef23fcc ("kprobes: Init kprobes in early_initcall")
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: RCU <rcu@vger.kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Daniel Axtens <dja@axtens.net>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "Theodore Y . Ts'o" <tytso@mit.edu>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Cc: stable@vger.kernel.org
Reported-by: Paul E. McKenney <paulmck@kernel.org>
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reported-by: Uladzislau Rezki <urezki@gmail.com>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/kprobes.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index dd1d027455c4..745f08fdd7a6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -861,7 +861,6 @@ out:
 	cpus_read_unlock();
 }
 
-#ifdef CONFIG_SYSCTL
 static void optimize_all_kprobes(void)
 {
 	struct hlist_head *head;
@@ -887,6 +886,7 @@ out:
 	mutex_unlock(&kprobe_mutex);
 }
 
+#ifdef CONFIG_SYSCTL
 static void unoptimize_all_kprobes(void)
 {
 	struct hlist_head *head;
@@ -2500,18 +2500,14 @@ static int __init init_kprobes(void)
 		}
 	}
 
-#if defined(CONFIG_OPTPROBES)
-#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
-	/* Init kprobe_optinsn_slots */
-	kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
-#endif
-	/* By default, kprobes can be optimized */
-	kprobes_allow_optimization = true;
-#endif
-
 	/* By default, kprobes are armed */
 	kprobes_all_disarmed = false;
 
+#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
+	/* Init kprobe_optinsn_slots for allocation */
+	kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
+#endif
+
 	err = arch_init_kprobes();
 	if (!err)
 		err = register_die_notifier(&kprobe_exceptions_nb);
@@ -2526,6 +2522,21 @@ static int __init init_kprobes(void)
 }
 early_initcall(init_kprobes);
 
+#if defined(CONFIG_OPTPROBES)
+static int __init init_optprobes(void)
+{
+	/*
+	 * Enable kprobe optimization - this kicks the optimizer which
+	 * depends on synchronize_rcu_tasks() and ksoftirqd, that is
+	 * not spawned in early initcall. So delay the optimization.
+	 */
+	optimize_all_kprobes();
+
+	return 0;
+}
+subsys_initcall(init_optprobes);
+#endif
+
 #ifdef CONFIG_DEBUG_FS
 static void report_probe(struct seq_file *pi, struct kprobe *p,
 		const char *sym, int offset, char *modname, struct kprobe *pp)
-- 
cgit v1.2.3