From 6b3e458806e34f1142592f786d3eb0ebac209cc2 Mon Sep 17 00:00:00 2001
From: Günther Noack <gnoack@google.com>
Date: Thu, 19 Feb 2026 16:43:35 +0100
Subject: HID: Document memory allocation properties of report_fixup()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The memory pointer returned by the report_fixup() hook does not get
freed by the caller.  Instead, report_fixup() must return (in return
value and *rsize) a memory buffer with at least the same lifetime as
the input buffer (defined by rdesc and original *rsize).

This is usually achieved using one of the following techniques:

* Returning a pointer and size to a sub-portion of the input buffer
* Returning a pointer to a static buffer
* Allocating a buffer with a devm_*() function,
  which will automatically get freed when the device is removed.

Signed-off-by: Günther Noack <gnoack@google.com>
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hid.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index dce862cafbbd..2990b9f94cb5 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -836,6 +836,12 @@ struct hid_usage_id {
  * raw_event and event should return negative on error, any other value will
  * pass the event on to .event() typically return 0 for success.
  *
+ * report_fixup must return a report descriptor pointer whose lifetime is at
+ * least that of the input rdesc.  This is usually done by mutating the input
+ * rdesc and returning it or a sub-portion of it.  In case a new buffer is
+ * allocated and returned, the implementation of report_fixup is responsible for
+ * freeing it later.
+ *
  * input_mapping shall return a negative value to completely ignore this usage
  * (e.g. doubled or invalid usage), zero to continue with parsing of this
  * usage by generic code (no special handling needed) or positive to skip
-- 
cgit v1.2.3


From 2a7b7652b1bb3fadc3bd47d622bfb127a93ab6b0 Mon Sep 17 00:00:00 2001
From: Leif Skunberg <diamondback@cohunt.app>
Date: Tue, 10 Feb 2026 14:21:29 +0100
Subject: platform/x86: int3472: Handle GPIO type 0x10 (DOVDD)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Lenovo ThinkPad X1 Fold 16 Gen 1 has an OV5675 sensor (ACPI HID
OVTI5675) behind an INT3472 discrete PMIC controller. The INT3472
_DSM returns GPIO type 0x10 for one of the pins, which controls the
DOVDD (digital I/O power) regulator enable.

Type 0x10 is not currently handled by the driver, causing the GPIO to
be ignored with a warning. Add INT3472_GPIO_TYPE_DOVDD (0x10) and
handle it as a regulator with con_id "dovdd" to match the supply name
used by sensor drivers (e.g. ov5675).

Also increase GPIO_SUPPLY_NAME_LENGTH from 5 to 6 to accommodate
the "dovdd" name (5 chars + null terminator).

Signed-off-by: Leif Skunberg <diamondback@cohunt.app>
Reviewed-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
Link: https://patch.msgid.link/20260210132129.17943-1-diamondback@cohunt.app
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/intel/int3472/discrete.c | 7 +++++++
 include/linux/platform_data/x86/int3472.h     | 5 +++--
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/intel/int3472/discrete.c b/drivers/platform/x86/intel/int3472/discrete.c
index 1455d9a7afca..1c65ce87cde0 100644
--- a/drivers/platform/x86/intel/int3472/discrete.c
+++ b/drivers/platform/x86/intel/int3472/discrete.c
@@ -223,6 +223,10 @@ static void int3472_get_con_id_and_polarity(struct int3472_discrete_device *int3
 		*con_id = "avdd";
 		*gpio_flags = GPIO_ACTIVE_HIGH;
 		break;
+	case INT3472_GPIO_TYPE_DOVDD:
+		*con_id = "dovdd";
+		*gpio_flags = GPIO_ACTIVE_HIGH;
+		break;
 	case INT3472_GPIO_TYPE_HANDSHAKE:
 		*con_id = "dvdd";
 		*gpio_flags = GPIO_ACTIVE_HIGH;
@@ -251,6 +255,7 @@ static void int3472_get_con_id_and_polarity(struct int3472_discrete_device *int3
  * 0x0b Power enable
  * 0x0c Clock enable
  * 0x0d Privacy LED
+ * 0x10 DOVDD (digital I/O voltage)
  * 0x13 Hotplug detect
  *
  * There are some known platform specific quirks where that does not quite
@@ -332,6 +337,7 @@ static int skl_int3472_handle_gpio_resources(struct acpi_resource *ares,
 	case INT3472_GPIO_TYPE_CLK_ENABLE:
 	case INT3472_GPIO_TYPE_PRIVACY_LED:
 	case INT3472_GPIO_TYPE_POWER_ENABLE:
+	case INT3472_GPIO_TYPE_DOVDD:
 	case INT3472_GPIO_TYPE_HANDSHAKE:
 		gpio = skl_int3472_gpiod_get_from_temp_lookup(int3472, agpio, con_id, gpio_flags);
 		if (IS_ERR(gpio)) {
@@ -356,6 +362,7 @@ static int skl_int3472_handle_gpio_resources(struct acpi_resource *ares,
 		case INT3472_GPIO_TYPE_POWER_ENABLE:
 			second_sensor = int3472->quirks.avdd_second_sensor;
 			fallthrough;
+		case INT3472_GPIO_TYPE_DOVDD:
 		case INT3472_GPIO_TYPE_HANDSHAKE:
 			ret = skl_int3472_register_regulator(int3472, gpio, enable_time_us,
 							     con_id, second_sensor);
diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h
index b1b837583d54..dbe745dc88d5 100644
--- a/include/linux/platform_data/x86/int3472.h
+++ b/include/linux/platform_data/x86/int3472.h
@@ -26,6 +26,7 @@
 #define INT3472_GPIO_TYPE_POWER_ENABLE				0x0b
 #define INT3472_GPIO_TYPE_CLK_ENABLE				0x0c
 #define INT3472_GPIO_TYPE_PRIVACY_LED				0x0d
+#define INT3472_GPIO_TYPE_DOVDD					0x10
 #define INT3472_GPIO_TYPE_HANDSHAKE				0x12
 #define INT3472_GPIO_TYPE_HOTPLUG_DETECT			0x13
 
@@ -33,8 +34,8 @@
 #define INT3472_MAX_SENSOR_GPIOS				3
 #define INT3472_MAX_REGULATORS					3
 
-/* E.g. "avdd\0" */
-#define GPIO_SUPPLY_NAME_LENGTH				5
+/* E.g. "dovdd\0" */
+#define GPIO_SUPPLY_NAME_LENGTH				6
 /* 12 chars for acpi_dev_name() + "-", e.g. "ABCD1234:00-" */
 #define GPIO_REGULATOR_NAME_LENGTH				(12 + GPIO_SUPPLY_NAME_LENGTH)
 /* lower- and upper-case mapping */
-- 
cgit v1.2.3


From 201ceb94aa1def0024a7c18ce643e5f65026be06 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 23 Feb 2026 19:37:51 -0800
Subject: kunit: irq: Ensure timer doesn't fire too frequently

Fix a bug where kunit_run_irq_test() could hang if the system is too
slow.  This was noticed with the crypto library tests in certain VMs.

Specifically, if kunit_irq_test_timer_func() and the associated hrtimer
code took over 5us to run, then the CPU would spend all its time
executing that code in hardirq context.  As a result, the task executing
kunit_run_irq_test() never had a chance to run, exit the loop, and
cancel the timer.

To fix it, make kunit_irq_test_timer_func() increase the timer interval
when the other contexts aren't having a chance to run.

Fixes: 950a81224e8b ("lib/crypto: tests: Add hash-test-template.h and gen-hash-testvecs.py")
Cc: stable@vger.kernel.org
Reviewed-by: David Gow <david@davidgow.net>
Link: https://lore.kernel.org/r/20260224033751.97615-1-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/kunit/run-in-irq-context.h | 44 ++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/kunit/run-in-irq-context.h b/include/kunit/run-in-irq-context.h
index c89b1b1b12dd..bfe60d6cf28d 100644
--- a/include/kunit/run-in-irq-context.h
+++ b/include/kunit/run-in-irq-context.h
@@ -12,16 +12,16 @@
 #include <linux/hrtimer.h>
 #include <linux/workqueue.h>
 
-#define KUNIT_IRQ_TEST_HRTIMER_INTERVAL us_to_ktime(5)
-
 struct kunit_irq_test_state {
 	bool (*func)(void *test_specific_state);
 	void *test_specific_state;
 	bool task_func_reported_failure;
 	bool hardirq_func_reported_failure;
 	bool softirq_func_reported_failure;
+	atomic_t task_func_calls;
 	atomic_t hardirq_func_calls;
 	atomic_t softirq_func_calls;
+	ktime_t interval;
 	struct hrtimer timer;
 	struct work_struct bh_work;
 };
@@ -30,14 +30,25 @@ static enum hrtimer_restart kunit_irq_test_timer_func(struct hrtimer *timer)
 {
 	struct kunit_irq_test_state *state =
 		container_of(timer, typeof(*state), timer);
+	int task_calls, hardirq_calls, softirq_calls;
 
 	WARN_ON_ONCE(!in_hardirq());
-	atomic_inc(&state->hardirq_func_calls);
+	task_calls = atomic_read(&state->task_func_calls);
+	hardirq_calls = atomic_inc_return(&state->hardirq_func_calls);
+	softirq_calls = atomic_read(&state->softirq_func_calls);
+
+	/*
+	 * If the timer is firing too often for the softirq or task to ever have
+	 * a chance to run, increase the timer interval.  This is needed on very
+	 * slow systems.
+	 */
+	if (hardirq_calls >= 20 && (softirq_calls == 0 || task_calls == 0))
+		state->interval = ktime_add_ns(state->interval, 250);
 
 	if (!state->func(state->test_specific_state))
 		state->hardirq_func_reported_failure = true;
 
-	hrtimer_forward_now(&state->timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL);
+	hrtimer_forward_now(&state->timer, state->interval);
 	queue_work(system_bh_wq, &state->bh_work);
 	return HRTIMER_RESTART;
 }
@@ -86,10 +97,14 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *),
 	struct kunit_irq_test_state state = {
 		.func = func,
 		.test_specific_state = test_specific_state,
+		/*
+		 * Start with a 5us timer interval.  If the system can't keep
+		 * up, kunit_irq_test_timer_func() will increase it.
+		 */
+		.interval = us_to_ktime(5),
 	};
 	unsigned long end_jiffies;
-	int hardirq_calls, softirq_calls;
-	bool allctx = false;
+	int task_calls, hardirq_calls, softirq_calls;
 
 	/*
 	 * Set up a hrtimer (the way we access hardirq context) and a work
@@ -104,21 +119,18 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *),
 	 * and hardirq), or 1 second, whichever comes first.
 	 */
 	end_jiffies = jiffies + HZ;
-	hrtimer_start(&state.timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL,
-		      HRTIMER_MODE_REL_HARD);
-	for (int task_calls = 0, calls = 0;
-	     ((calls < max_iterations) || !allctx) &&
-	     !time_after(jiffies, end_jiffies);
-	     task_calls++) {
+	hrtimer_start(&state.timer, state.interval, HRTIMER_MODE_REL_HARD);
+	do {
 		if (!func(test_specific_state))
 			state.task_func_reported_failure = true;
 
+		task_calls = atomic_inc_return(&state.task_func_calls);
 		hardirq_calls = atomic_read(&state.hardirq_func_calls);
 		softirq_calls = atomic_read(&state.softirq_func_calls);
-		calls = task_calls + hardirq_calls + softirq_calls;
-		allctx = (task_calls > 0) && (hardirq_calls > 0) &&
-			 (softirq_calls > 0);
-	}
+	} while ((task_calls + hardirq_calls + softirq_calls < max_iterations ||
+		  (task_calls == 0 || hardirq_calls == 0 ||
+		   softirq_calls == 0)) &&
+		 !time_after(jiffies, end_jiffies));
 
 	/* Cancel the timer and work. */
 	hrtimer_cancel(&state.timer);
-- 
cgit v1.2.3


From 28aaa9c39945b7925a1cc1d513c8f21ed38f5e4f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 26 Feb 2026 10:43:55 +0100
Subject: kthread: consolidate kthread exit paths to prevent use-after-free

Guillaume reported crashes via corrupted RCU callback function pointers
during KUnit testing. The crash was traced back to the pidfs rhashtable
conversion which replaced the 24-byte rb_node with an 8-byte rhash_head
in struct pid, shrinking it from 160 to 144 bytes.

struct kthread (without CONFIG_BLK_CGROUP) is also 144 bytes. With
CONFIG_SLAB_MERGE_DEFAULT and SLAB_HWCACHE_ALIGN both round up to
192 bytes and share the same slab cache. struct pid.rcu.func and
struct kthread.affinity_node both sit at offset 0x78.

When a kthread exits via make_task_dead() it bypasses kthread_exit() and
misses the affinity_node cleanup. free_kthread_struct() frees the memory
while the node is still linked into the global kthread_affinity_list. A
subsequent list_del() by another kthread writes through dangling list
pointers into the freed and reused memory, corrupting the pid's
rcu.func pointer.

Instead of patching free_kthread_struct() to handle the missed cleanup,
consolidate all kthread exit paths. Turn kthread_exit() into a macro
that calls do_exit() and add kthread_do_exit() which is called from
do_exit() for any task with PF_KTHREAD set. This guarantees that
kthread-specific cleanup always happens regardless of the exit path -
make_task_dead(), direct do_exit(), or kthread_exit().

Replace __to_kthread() with a new tsk_is_kthread() accessor in the
public header. Export do_exit() since module code using the
kthread_exit() macro now needs it directly.

Reported-by: Guillaume Tucker <gtucker@gtucker.io>
Tested-by: Guillaume Tucker <gtucker@gtucker.io>
Tested-by: Mark Brown <broonie@kernel.org>
Tested-by: David Gow <davidgow@google.com>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/all/20260224-mittlerweile-besessen-2738831ae7f6@brauner
Co-developed-by: Linus Torvalds <torvalds@linux-foundation.org>
Fixes: 4d13f4304fa4 ("kthread: Implement preferred affinity")
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/kthread.h | 21 ++++++++++++++++++++-
 kernel/exit.c           |  6 ++++++
 kernel/kthread.c        | 41 +++++------------------------------------
 3 files changed, 31 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index c92c1149ee6e..a01a474719a7 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -7,6 +7,24 @@
 
 struct mm_struct;
 
+/* opaque kthread data */
+struct kthread;
+
+/*
+ * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
+ * always remain a kthread.  For kthreads p->worker_private always
+ * points to a struct kthread.  For tasks that are not kthreads
+ * p->worker_private is used to point to other things.
+ *
+ * Return NULL for any task that is not a kthread.
+ */
+static inline struct kthread *tsk_is_kthread(struct task_struct *p)
+{
+	if (p->flags & PF_KTHREAD)
+		return p->worker_private;
+	return NULL;
+}
+
 __printf(4, 5)
 struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 					   void *data,
@@ -98,9 +116,10 @@ void *kthread_probe_data(struct task_struct *k);
 int kthread_park(struct task_struct *k);
 void kthread_unpark(struct task_struct *k);
 void kthread_parkme(void);
-void kthread_exit(long result) __noreturn;
+#define kthread_exit(result) do_exit(result)
 void kthread_complete_and_exit(struct completion *, long) __noreturn;
 int kthreads_update_housekeeping(void);
+void kthread_do_exit(struct kthread *, long);
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
diff --git a/kernel/exit.c b/kernel/exit.c
index 8a87021211ae..ede3117fa7d4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -896,11 +896,16 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
 void __noreturn do_exit(long code)
 {
 	struct task_struct *tsk = current;
+	struct kthread *kthread;
 	int group_dead;
 
 	WARN_ON(irqs_disabled());
 	WARN_ON(tsk->plug);
 
+	kthread = tsk_is_kthread(tsk);
+	if (unlikely(kthread))
+		kthread_do_exit(kthread, code);
+
 	kcov_task_exit(tsk);
 	kmsan_task_exit(tsk);
 
@@ -1013,6 +1018,7 @@ void __noreturn do_exit(long code)
 	lockdep_free_task(tsk);
 	do_task_dead();
 }
+EXPORT_SYMBOL(do_exit);
 
 void __noreturn make_task_dead(int signr)
 {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 20451b624b67..791210daf8b4 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -85,24 +85,6 @@ static inline struct kthread *to_kthread(struct task_struct *k)
 	return k->worker_private;
 }
 
-/*
- * Variant of to_kthread() that doesn't assume @p is a kthread.
- *
- * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
- * always remain a kthread.  For kthreads p->worker_private always
- * points to a struct kthread.  For tasks that are not kthreads
- * p->worker_private is used to point to other things.
- *
- * Return NULL for any task that is not a kthread.
- */
-static inline struct kthread *__to_kthread(struct task_struct *p)
-{
-	void *kthread = p->worker_private;
-	if (kthread && !(p->flags & PF_KTHREAD))
-		kthread = NULL;
-	return kthread;
-}
-
 void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
 {
 	struct kthread *kthread = to_kthread(tsk);
@@ -193,7 +175,7 @@ EXPORT_SYMBOL_GPL(kthread_should_park);
 
 bool kthread_should_stop_or_park(void)
 {
-	struct kthread *kthread = __to_kthread(current);
+	struct kthread *kthread = tsk_is_kthread(current);
 
 	if (!kthread)
 		return false;
@@ -234,7 +216,7 @@ EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
  */
 void *kthread_func(struct task_struct *task)
 {
-	struct kthread *kthread = __to_kthread(task);
+	struct kthread *kthread = tsk_is_kthread(task);
 	if (kthread)
 		return kthread->threadfn;
 	return NULL;
@@ -266,7 +248,7 @@ EXPORT_SYMBOL_GPL(kthread_data);
  */
 void *kthread_probe_data(struct task_struct *task)
 {
-	struct kthread *kthread = __to_kthread(task);
+	struct kthread *kthread = tsk_is_kthread(task);
 	void *data = NULL;
 
 	if (kthread)
@@ -309,19 +291,8 @@ void kthread_parkme(void)
 }
 EXPORT_SYMBOL_GPL(kthread_parkme);
 
-/**
- * kthread_exit - Cause the current kthread return @result to kthread_stop().
- * @result: The integer value to return to kthread_stop().
- *
- * While kthread_exit can be called directly, it exists so that
- * functions which do some additional work in non-modular code such as
- * module_put_and_kthread_exit can be implemented.
- *
- * Does not return.
- */
-void __noreturn kthread_exit(long result)
+void kthread_do_exit(struct kthread *kthread, long result)
 {
-	struct kthread *kthread = to_kthread(current);
 	kthread->result = result;
 	if (!list_empty(&kthread->affinity_node)) {
 		mutex_lock(&kthread_affinity_lock);
@@ -333,9 +304,7 @@ void __noreturn kthread_exit(long result)
 			kthread->preferred_affinity = NULL;
 		}
 	}
-	do_exit(0);
 }
-EXPORT_SYMBOL(kthread_exit);
 
 /**
  * kthread_complete_and_exit - Exit the current kthread.
@@ -683,7 +652,7 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu)
 
 bool kthread_is_per_cpu(struct task_struct *p)
 {
-	struct kthread *kthread = __to_kthread(p);
+	struct kthread *kthread = tsk_is_kthread(p);
 	if (!kthread)
 		return false;
 
-- 
cgit v1.2.3


From a0b4c7a49137ed21279f354eb59f49ddae8dffc2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 26 Feb 2026 13:32:33 +0000
Subject: netfs: Fix unbuffered/DIO writes to dispatch subrequests in strict
 sequence

Fix netfslib such that when it's making an unbuffered or DIO write, to make
sure that it sends each subrequest strictly sequentially, waiting till the
previous one is 'committed' before sending the next so that we don't have
pieces landing out of order and potentially leaving a hole if an error
occurs (ENOSPC for example).

This is done by copying in just those bits of issuing, collecting and
retrying subrequests that are necessary to do one subrequest at a time.
Retrying, in particular, is simpler because if the current subrequest needs
retrying, the source iterator can just be copied again and the subrequest
prepped and issued again without needing to be concerned about whether it
needs merging with the previous or next in the sequence.

Note that the issuing loop waits for a subrequest to complete right after
issuing it, but this wait could be moved elsewhere allowing preparatory
steps to be performed whilst the subrequest is in progress.  In particular,
once content encryption is available in netfslib, that could be done whilst
waiting, as could cleanup of buffers that have been completed.

Fixes: 153a9961b551 ("netfs: Implement unbuffered/DIO write support")
Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://patch.msgid.link/58526.1772112753@warthog.procyon.org.uk
Tested-by: Steve French <sfrench@samba.org>
Reviewed-by: Paulo Alcantara (Red Hat) <pc@manguebit.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/netfs/direct_write.c      | 228 ++++++++++++++++++++++++++++++++++++++++---
 fs/netfs/internal.h          |   4 +-
 fs/netfs/write_collect.c     |  21 ----
 fs/netfs/write_issue.c       |  41 +-------
 include/trace/events/netfs.h |   4 +-
 5 files changed, 221 insertions(+), 77 deletions(-)

(limited to 'include')

diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index a9d1c3b2c084..dd1451bf7543 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -9,6 +9,202 @@
 #include <linux/uio.h>
 #include "internal.h"
 
+/*
+ * Perform the cleanup rituals after an unbuffered write is complete.
+ */
+static void netfs_unbuffered_write_done(struct netfs_io_request *wreq)
+{
+	struct netfs_inode *ictx = netfs_inode(wreq->inode);
+
+	_enter("R=%x", wreq->debug_id);
+
+	/* Okay, declare that all I/O is complete. */
+	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+	if (!wreq->error)
+		netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);
+
+	if (wreq->origin == NETFS_DIO_WRITE &&
+	    wreq->mapping->nrpages) {
+		/* mmap may have got underfoot and we may now have folios
+		 * locally covering the region we just wrote.  Attempt to
+		 * discard the folios, but leave in place any modified locally.
+		 * ->write_iter() is prevented from interfering by the DIO
+		 * counter.
+		 */
+		pgoff_t first = wreq->start >> PAGE_SHIFT;
+		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+
+		invalidate_inode_pages2_range(wreq->mapping, first, last);
+	}
+
+	if (wreq->origin == NETFS_DIO_WRITE)
+		inode_dio_end(wreq->inode);
+
+	_debug("finished");
+	netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
+	/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */
+
+	if (wreq->iocb) {
+		size_t written = umin(wreq->transferred, wreq->len);
+
+		wreq->iocb->ki_pos += written;
+		if (wreq->iocb->ki_complete) {
+			trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
+			wreq->iocb->ki_complete(wreq->iocb, wreq->error ?: written);
+		}
+		wreq->iocb = VFS_PTR_POISON;
+	}
+
+	netfs_clear_subrequests(wreq);
+}
+
+/*
+ * Collect the subrequest results of unbuffered write subrequests.
+ */
+static void netfs_unbuffered_write_collect(struct netfs_io_request *wreq,
+					   struct netfs_io_stream *stream,
+					   struct netfs_io_subrequest *subreq)
+{
+	trace_netfs_collect_sreq(wreq, subreq);
+
+	spin_lock(&wreq->lock);
+	list_del_init(&subreq->rreq_link);
+	spin_unlock(&wreq->lock);
+
+	wreq->transferred += subreq->transferred;
+	iov_iter_advance(&wreq->buffer.iter, subreq->transferred);
+
+	stream->collected_to = subreq->start + subreq->transferred;
+	wreq->collected_to = stream->collected_to;
+	netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);
+
+	trace_netfs_collect_stream(wreq, stream);
+	trace_netfs_collect_state(wreq, wreq->collected_to, 0);
+}
+
+/*
+ * Write data to the server without going through the pagecache and without
+ * writing it to the local cache.  We dispatch the subrequests serially and
+ * wait for each to complete before dispatching the next, lest we leave a gap
+ * in the data written due to a failure such as ENOSPC.  We could, however
+ * attempt to do preparation such as content encryption for the next subreq
+ * whilst the current is in progress.
+ */
+static int netfs_unbuffered_write(struct netfs_io_request *wreq)
+{
+	struct netfs_io_subrequest *subreq = NULL;
+	struct netfs_io_stream *stream = &wreq->io_streams[0];
+	int ret;
+
+	_enter("%llx", wreq->len);
+
+	if (wreq->origin == NETFS_DIO_WRITE)
+		inode_dio_begin(wreq->inode);
+
+	stream->collected_to = wreq->start;
+
+	for (;;) {
+		bool retry = false;
+
+		if (!subreq) {
+			netfs_prepare_write(wreq, stream, wreq->start + wreq->transferred);
+			subreq = stream->construct;
+			stream->construct = NULL;
+			stream->front = NULL;
+		}
+
+		/* Check if (re-)preparation failed. */
+		if (unlikely(test_bit(NETFS_SREQ_FAILED, &subreq->flags))) {
+			netfs_write_subrequest_terminated(subreq, subreq->error);
+			wreq->error = subreq->error;
+			break;
+		}
+
+		iov_iter_truncate(&subreq->io_iter, wreq->len - wreq->transferred);
+		if (!iov_iter_count(&subreq->io_iter))
+			break;
+
+		subreq->len = netfs_limit_iter(&subreq->io_iter, 0,
+					       stream->sreq_max_len,
+					       stream->sreq_max_segs);
+		iov_iter_truncate(&subreq->io_iter, subreq->len);
+		stream->submit_extendable_to = subreq->len;
+
+		trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+		stream->issue_write(subreq);
+
+		/* Async, need to wait. */
+		netfs_wait_for_in_progress_stream(wreq, stream);
+
+		if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+			retry = true;
+		} else if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
+			ret = subreq->error;
+			wreq->error = ret;
+			netfs_see_subrequest(subreq, netfs_sreq_trace_see_failed);
+			subreq = NULL;
+			break;
+		}
+		ret = 0;
+
+		if (!retry) {
+			netfs_unbuffered_write_collect(wreq, stream, subreq);
+			subreq = NULL;
+			if (wreq->transferred >= wreq->len)
+				break;
+			if (!wreq->iocb && signal_pending(current)) {
+				ret = wreq->transferred ? -EINTR : -ERESTARTSYS;
+				trace_netfs_rreq(wreq, netfs_rreq_trace_intr);
+				break;
+			}
+			continue;
+		}
+
+		/* We need to retry the last subrequest, so first reset the
+		 * iterator, taking into account what, if anything, we managed
+		 * to transfer.
+		 */
+		subreq->error = -EAGAIN;
+		trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+		if (subreq->transferred > 0)
+			iov_iter_advance(&wreq->buffer.iter, subreq->transferred);
+
+		if (stream->source == NETFS_UPLOAD_TO_SERVER &&
+		    wreq->netfs_ops->retry_request)
+			wreq->netfs_ops->retry_request(wreq, stream);
+
+		__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+		__clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
+		__clear_bit(NETFS_SREQ_FAILED, &subreq->flags);
+		subreq->io_iter		= wreq->buffer.iter;
+		subreq->start		= wreq->start + wreq->transferred;
+		subreq->len		= wreq->len   - wreq->transferred;
+		subreq->transferred	= 0;
+		subreq->retry_count	+= 1;
+		stream->sreq_max_len	= UINT_MAX;
+		stream->sreq_max_segs	= INT_MAX;
+
+		netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+		stream->prepare_write(subreq);
+
+		__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+		netfs_stat(&netfs_n_wh_retry_write_subreq);
+	}
+
+	netfs_unbuffered_write_done(wreq);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+static void netfs_unbuffered_write_async(struct work_struct *work)
+{
+	struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);
+
+	netfs_unbuffered_write(wreq);
+	netfs_put_request(wreq, netfs_rreq_trace_put_complete);
+}
+
 /*
  * Perform an unbuffered write where we may have to do an RMW operation on an
  * encrypted file.  This can also be used for direct I/O writes.
@@ -70,35 +266,35 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
 			 */
 			wreq->buffer.iter = *iter;
 		}
+
+		wreq->len = iov_iter_count(&wreq->buffer.iter);
 	}
 
 	__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
-	if (async)
-		__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);
 
 	/* Copy the data into the bounce buffer and encrypt it. */
 	// TODO
 
 	/* Dispatch the write. */
 	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
-	if (async)
-		wreq->iocb = iocb;
-	wreq->len = iov_iter_count(&wreq->buffer.iter);
-	ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
-	if (ret < 0) {
-		_debug("begin = %zd", ret);
-		goto out;
-	}
 
-	if (!async) {
-		ret = netfs_wait_for_write(wreq);
-		if (ret > 0)
-			iocb->ki_pos += ret;
-	} else {
+	if (async) {
+		INIT_WORK(&wreq->work, netfs_unbuffered_write_async);
+		wreq->iocb = iocb;
+		queue_work(system_dfl_wq, &wreq->work);
 		ret = -EIOCBQUEUED;
+	} else {
+		ret = netfs_unbuffered_write(wreq);
+		if (ret < 0) {
+			_debug("begin = %zd", ret);
+		} else {
+			iocb->ki_pos += wreq->transferred;
+			ret = wreq->transferred ?: wreq->error;
+		}
+
+		netfs_put_request(wreq, netfs_rreq_trace_put_complete);
 	}
 
-out:
 	netfs_put_request(wreq, netfs_rreq_trace_put_return);
 	return ret;
 
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 4319611f5354..d436e20d3418 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -198,6 +198,9 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
 						struct file *file,
 						loff_t start,
 						enum netfs_io_origin origin);
+void netfs_prepare_write(struct netfs_io_request *wreq,
+			 struct netfs_io_stream *stream,
+			 loff_t start);
 void netfs_reissue_write(struct netfs_io_stream *stream,
 			 struct netfs_io_subrequest *subreq,
 			 struct iov_iter *source);
@@ -212,7 +215,6 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c
 			       struct folio **writethrough_cache);
 ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
 			       struct folio *writethrough_cache);
-int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);
 
 /*
  * write_retry.c
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 61eab34ea67e..83eb3dc1adf8 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -399,27 +399,6 @@ bool netfs_write_collection(struct netfs_io_request *wreq)
 		ictx->ops->invalidate_cache(wreq);
 	}
 
-	if ((wreq->origin == NETFS_UNBUFFERED_WRITE ||
-	     wreq->origin == NETFS_DIO_WRITE) &&
-	    !wreq->error)
-		netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);
-
-	if (wreq->origin == NETFS_DIO_WRITE &&
-	    wreq->mapping->nrpages) {
-		/* mmap may have got underfoot and we may now have folios
-		 * locally covering the region we just wrote.  Attempt to
-		 * discard the folios, but leave in place any modified locally.
-		 * ->write_iter() is prevented from interfering by the DIO
-		 * counter.
-		 */
-		pgoff_t first = wreq->start >> PAGE_SHIFT;
-		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
-		invalidate_inode_pages2_range(wreq->mapping, first, last);
-	}
-
-	if (wreq->origin == NETFS_DIO_WRITE)
-		inode_dio_end(wreq->inode);
-
 	_debug("finished");
 	netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
 	/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 34894da5a23e..437268f65640 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -154,9 +154,9 @@ EXPORT_SYMBOL(netfs_prepare_write_failed);
  * Prepare a write subrequest.  We need to allocate a new subrequest
  * if we don't have one.
  */
-static void netfs_prepare_write(struct netfs_io_request *wreq,
-				struct netfs_io_stream *stream,
-				loff_t start)
+void netfs_prepare_write(struct netfs_io_request *wreq,
+			 struct netfs_io_stream *stream,
+			 loff_t start)
 {
 	struct netfs_io_subrequest *subreq;
 	struct iov_iter *wreq_iter = &wreq->buffer.iter;
@@ -698,41 +698,6 @@ ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_c
 	return ret;
 }
 
-/*
- * Write data to the server without going through the pagecache and without
- * writing it to the local cache.
- */
-int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len)
-{
-	struct netfs_io_stream *upload = &wreq->io_streams[0];
-	ssize_t part;
-	loff_t start = wreq->start;
-	int error = 0;
-
-	_enter("%zx", len);
-
-	if (wreq->origin == NETFS_DIO_WRITE)
-		inode_dio_begin(wreq->inode);
-
-	while (len) {
-		// TODO: Prepare content encryption
-
-		_debug("unbuffered %zx", len);
-		part = netfs_advance_write(wreq, upload, start, len, false);
-		start += part;
-		len -= part;
-		rolling_buffer_advance(&wreq->buffer, part);
-		if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags))
-			netfs_wait_for_paused_write(wreq);
-		if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
-			break;
-	}
-
-	netfs_end_issue_write(wreq);
-	_leave(" = %d", error);
-	return error;
-}
-
 /*
  * Write some of a pending folio data back to the server and/or the cache.
  */
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 64a382fbc31a..2d366be46a1c 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -57,6 +57,7 @@
 	EM(netfs_rreq_trace_done,		"DONE   ")	\
 	EM(netfs_rreq_trace_end_copy_to_cache,	"END-C2C")	\
 	EM(netfs_rreq_trace_free,		"FREE   ")	\
+	EM(netfs_rreq_trace_intr,		"INTR   ")	\
 	EM(netfs_rreq_trace_ki_complete,	"KI-CMPL")	\
 	EM(netfs_rreq_trace_recollect,		"RECLLCT")	\
 	EM(netfs_rreq_trace_redirty,		"REDIRTY")	\
@@ -169,7 +170,8 @@
 	EM(netfs_sreq_trace_put_oom,		"PUT OOM    ")	\
 	EM(netfs_sreq_trace_put_wip,		"PUT WIP    ")	\
 	EM(netfs_sreq_trace_put_work,		"PUT WORK   ")	\
-	E_(netfs_sreq_trace_put_terminated,	"PUT TERM   ")
+	EM(netfs_sreq_trace_put_terminated,	"PUT TERM   ")	\
+	E_(netfs_sreq_trace_see_failed,		"SEE FAILED ")
 
 #define netfs_folio_traces					\
 	EM(netfs_folio_is_uptodate,		"mod-uptodate")	\
-- 
cgit v1.2.3


From 8678591b47469fe16357234efef9b260317b8be4 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Wed, 25 Feb 2026 15:02:51 -0700
Subject: kbuild: Split .modinfo out from ELF_DETAILS

Commit 3e86e4d74c04 ("kbuild: keep .modinfo section in
vmlinux.unstripped") added .modinfo to ELF_DETAILS while removing it
from COMMON_DISCARDS, as it was needed in vmlinux.unstripped and
ELF_DETAILS was present in all architecture specific vmlinux linker
scripts. While this shuffle is fine for vmlinux, ELF_DETAILS and
COMMON_DISCARDS may be used by other linker scripts, such as the s390
and x86 compressed boot images, which may not expect to have a .modinfo
section. In certain circumstances, this could result in a bootloader
failing to load the compressed kernel [1].

Commit ddc6cbef3ef1 ("s390/boot/vmlinux.lds.S: Ensure bzImage ends with
SecureBoot trailer") recently addressed this for the s390 bzImage but
the same bug remains for arm, parisc, and x86. The presence of .modinfo
in the x86 bzImage was the root cause of the issue worked around with
commit d50f21091358 ("kbuild: align modinfo section for Secureboot
Authenticode EDK2 compat"). misc.c in arch/x86/boot/compressed includes
lib/decompress_unzstd.c, which in turn includes lib/xxhash.c and its
MODULE_LICENSE / MODULE_DESCRIPTION macros due to the STATIC definition.

Split .modinfo out from ELF_DETAILS into its own macro and handle it in
all vmlinux linker scripts. Discard .modinfo in the places where it was
previously being discarded from being in COMMON_DISCARDS, as it has
never been necessary in those uses.

Cc: stable@vger.kernel.org
Fixes: 3e86e4d74c04 ("kbuild: keep .modinfo section in vmlinux.unstripped")
Reported-by: Ed W <lists@wildgooses.com>
Closes: https://lore.kernel.org/587f25e0-a80e-46a5-9f01-87cb40cfa377@wildgooses.com/ [1]
Tested-by: Ed W <lists@wildgooses.com> # x86_64
Link: https://patch.msgid.link/20260225-separate-modinfo-from-elf-details-v1-1-387ced6baf4b@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
---
 arch/alpha/kernel/vmlinux.lds.S           | 1 +
 arch/arc/kernel/vmlinux.lds.S             | 1 +
 arch/arm/boot/compressed/vmlinux.lds.S    | 1 +
 arch/arm/kernel/vmlinux-xip.lds.S         | 1 +
 arch/arm/kernel/vmlinux.lds.S             | 1 +
 arch/arm64/kernel/vmlinux.lds.S           | 1 +
 arch/csky/kernel/vmlinux.lds.S            | 1 +
 arch/hexagon/kernel/vmlinux.lds.S         | 1 +
 arch/loongarch/kernel/vmlinux.lds.S       | 1 +
 arch/m68k/kernel/vmlinux-nommu.lds        | 1 +
 arch/m68k/kernel/vmlinux-std.lds          | 1 +
 arch/m68k/kernel/vmlinux-sun3.lds         | 1 +
 arch/mips/kernel/vmlinux.lds.S            | 1 +
 arch/nios2/kernel/vmlinux.lds.S           | 1 +
 arch/openrisc/kernel/vmlinux.lds.S        | 1 +
 arch/parisc/boot/compressed/vmlinux.lds.S | 1 +
 arch/parisc/kernel/vmlinux.lds.S          | 1 +
 arch/powerpc/kernel/vmlinux.lds.S         | 1 +
 arch/riscv/kernel/vmlinux.lds.S           | 1 +
 arch/s390/kernel/vmlinux.lds.S            | 1 +
 arch/sh/kernel/vmlinux.lds.S              | 1 +
 arch/sparc/kernel/vmlinux.lds.S           | 1 +
 arch/um/kernel/dyn.lds.S                  | 1 +
 arch/um/kernel/uml.lds.S                  | 1 +
 arch/x86/boot/compressed/vmlinux.lds.S    | 2 +-
 arch/x86/kernel/vmlinux.lds.S             | 1 +
 include/asm-generic/vmlinux.lds.h         | 4 +++-
 27 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S
index 2efa7dfc798a..2d136c63db16 100644
--- a/arch/alpha/kernel/vmlinux.lds.S
+++ b/arch/alpha/kernel/vmlinux.lds.S
@@ -71,6 +71,7 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 
 	DISCARDS
diff --git a/arch/arc/kernel/vmlinux.lds.S b/arch/arc/kernel/vmlinux.lds.S
index 61a1b2b96e1d..6af63084ff28 100644
--- a/arch/arc/kernel/vmlinux.lds.S
+++ b/arch/arc/kernel/vmlinux.lds.S
@@ -123,6 +123,7 @@ SECTIONS
 	_end = . ;
 
 	STABS_DEBUG
+	MODINFO
 	ELF_DETAILS
 	DISCARDS
 
diff --git a/arch/arm/boot/compressed/vmlinux.lds.S b/arch/arm/boot/compressed/vmlinux.lds.S
index d411abd4310e..2d916647df03 100644
--- a/arch/arm/boot/compressed/vmlinux.lds.S
+++ b/arch/arm/boot/compressed/vmlinux.lds.S
@@ -21,6 +21,7 @@ SECTIONS
     COMMON_DISCARDS
     *(.ARM.exidx*)
     *(.ARM.extab*)
+    *(.modinfo)
     *(.note.*)
     *(.rel.*)
     *(.printk_index)
diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S
index f2e8d4fac068..5afb725998ec 100644
--- a/arch/arm/kernel/vmlinux-xip.lds.S
+++ b/arch/arm/kernel/vmlinux-xip.lds.S
@@ -154,6 +154,7 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ARM_DETAILS
 
 	ARM_ASSERTS
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index d592a203f9c6..c07843c3c53d 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -153,6 +153,7 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ARM_DETAILS
 
 	ARM_ASSERTS
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index ad6133b89e7a..2964aad0362e 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -349,6 +349,7 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 
 	HEAD_SYMBOLS
diff --git a/arch/csky/kernel/vmlinux.lds.S b/arch/csky/kernel/vmlinux.lds.S
index d718961786d2..81943981b3af 100644
--- a/arch/csky/kernel/vmlinux.lds.S
+++ b/arch/csky/kernel/vmlinux.lds.S
@@ -109,6 +109,7 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 
 	DISCARDS
diff --git a/arch/hexagon/kernel/vmlinux.lds.S b/arch/hexagon/kernel/vmlinux.lds.S
index 1150b77fa281..aae22283b5e0 100644
--- a/arch/hexagon/kernel/vmlinux.lds.S
+++ b/arch/hexagon/kernel/vmlinux.lds.S
@@ -62,6 +62,7 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 	.hexagon.attributes 0 : { *(.hexagon.attributes) }
 
diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S
index 08ea921cdec1..d0e1377a041d 100644
--- a/arch/loongarch/kernel/vmlinux.lds.S
+++ b/arch/loongarch/kernel/vmlinux.lds.S
@@ -147,6 +147,7 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 
 #ifdef CONFIG_EFI_STUB
diff --git a/arch/m68k/kernel/vmlinux-nommu.lds b/arch/m68k/kernel/vmlinux-nommu.lds
index 2624fc18c131..45d7f4b0177b 100644
--- a/arch/m68k/kernel/vmlinux-nommu.lds
+++ b/arch/m68k/kernel/vmlinux-nommu.lds
@@ -85,6 +85,7 @@ SECTIONS {
 	_end = .;
 
 	STABS_DEBUG
+	MODINFO
 	ELF_DETAILS
 
 	/* Sections to be discarded */
diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds
index 1ccdd04ae462..7326586afe15 100644
--- a/arch/m68k/kernel/vmlinux-std.lds
+++ b/arch/m68k/kernel/vmlinux-std.lds
@@ -58,6 +58,7 @@ SECTIONS
   _end = . ;
 
   STABS_DEBUG
+  MODINFO
   ELF_DETAILS
 
   /* Sections to be discarded */
diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds
index f13ddcc2af5c..1b19fef201fb 100644
--- a/arch/m68k/kernel/vmlinux-sun3.lds
+++ b/arch/m68k/kernel/vmlinux-sun3.lds
@@ -51,6 +51,7 @@ __init_begin = .;
   _end = . ;
 
   STABS_DEBUG
+  MODINFO
   ELF_DETAILS
 
   /* Sections to be discarded */
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index 2b708fac8d2c..579b2cc1995a 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -217,6 +217,7 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 
 	/* These must appear regardless of  .  */
diff --git a/arch/nios2/kernel/vmlinux.lds.S b/arch/nios2/kernel/vmlinux.lds.S
index 37b958055064..206f92445bfa 100644
--- a/arch/nios2/kernel/vmlinux.lds.S
+++ b/arch/nios2/kernel/vmlinux.lds.S
@@ -57,6 +57,7 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 
 	DISCARDS
diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S
index 049bff45f612..9b29c3211774 100644
--- a/arch/openrisc/kernel/vmlinux.lds.S
+++ b/arch/openrisc/kernel/vmlinux.lds.S
@@ -101,6 +101,7 @@ SECTIONS
 	/* Throw in the debugging sections */
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 
         /* Sections to be discarded -- must be last */
diff --git a/arch/parisc/boot/compressed/vmlinux.lds.S b/arch/parisc/boot/compressed/vmlinux.lds.S
index ab7b43990857..87d24cc824b6 100644
--- a/arch/parisc/boot/compressed/vmlinux.lds.S
+++ b/arch/parisc/boot/compressed/vmlinux.lds.S
@@ -90,6 +90,7 @@ SECTIONS
 	/* Sections to be discarded */
 	DISCARDS
 	/DISCARD/ : {
+		*(.modinfo)
 #ifdef CONFIG_64BIT
 		/* temporary hack until binutils is fixed to not emit these
 		 * for static binaries
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index b445e47903cf..0ca93d6d7235 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -165,6 +165,7 @@ SECTIONS
 	_end = . ;
 
 	STABS_DEBUG
+	MODINFO
 	ELF_DETAILS
 	.note 0 : { *(.note) }
 
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 15850296c0a9..8fc11d6565bf 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -397,6 +397,7 @@ SECTIONS
 	_end = . ;
 
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 
 	DISCARDS
diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
index 61bd5ba6680a..997f9eb3b22b 100644
--- a/arch/riscv/kernel/vmlinux.lds.S
+++ b/arch/riscv/kernel/vmlinux.lds.S
@@ -170,6 +170,7 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 	.riscv.attributes 0 : { *(.riscv.attributes) }
 
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 53bcbb91bb9b..2b62395e35bf 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -221,6 +221,7 @@ SECTIONS
 	/* Debugging sections.	*/
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 
 	/*
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 008c30289eaa..169c63fb3c1d 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -89,6 +89,7 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 
 	DISCARDS
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index f1b86eb30340..7ea510d9b42f 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -191,6 +191,7 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+	MODINFO
 	ELF_DETAILS
 
 	DISCARDS
diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S
index a36b7918a011..ad3cefeff2ac 100644
--- a/arch/um/kernel/dyn.lds.S
+++ b/arch/um/kernel/dyn.lds.S
@@ -172,6 +172,7 @@ SECTIONS
 
   STABS_DEBUG
   DWARF_DEBUG
+  MODINFO
   ELF_DETAILS
 
   DISCARDS
diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S
index a409d4b66114..30aa24348d60 100644
--- a/arch/um/kernel/uml.lds.S
+++ b/arch/um/kernel/uml.lds.S
@@ -113,6 +113,7 @@ SECTIONS
 
   STABS_DEBUG
   DWARF_DEBUG
+  MODINFO
   ELF_DETAILS
 
   DISCARDS
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 587ce3e7c504..e0b152715d9c 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -88,7 +88,7 @@ SECTIONS
 	/DISCARD/ : {
 		*(.dynamic) *(.dynsym) *(.dynstr) *(.dynbss)
 		*(.hash) *(.gnu.hash)
-		*(.note.*)
+		*(.note.*) *(.modinfo)
 	}
 
 	.got.plt (INFO) : {
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3a24a3fc55f5..4711a35e706c 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -427,6 +427,7 @@ SECTIONS
 	.llvm_bb_addr_map : { *(.llvm_bb_addr_map) }
 #endif
 
+	MODINFO
 	ELF_DETAILS
 
 	DISCARDS
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index eeb070f330bd..1e1580febe4b 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -848,12 +848,14 @@
 
 /* Required sections not related to debugging. */
 #define ELF_DETAILS							\
-		.modinfo : { *(.modinfo) . = ALIGN(8); }		\
 		.comment 0 : { *(.comment) }				\
 		.symtab 0 : { *(.symtab) }				\
 		.strtab 0 : { *(.strtab) }				\
 		.shstrtab 0 : { *(.shstrtab) }
 
+#define MODINFO								\
+		.modinfo : { *(.modinfo) . = ALIGN(8); }
+
 #ifdef CONFIG_GENERIC_BUG
 #define BUG_TABLE							\
 	. = ALIGN(8);							\
-- 
cgit v1.2.3


From 3350c2b3f2b8a3b985a020a4ef4f2f050a4b6a1d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 25 Feb 2026 21:12:29 -0800
Subject: platform_data/mlxreg: mlxreg.h: fix all kernel-doc warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use the correct kernel-doc format & notation to eliminate
kernel-doc warnings:

Warning: include/linux/platform_data/mlxreg.h:24 Enum value
 'MLX_WDT_TYPE1' not described in enum 'mlxreg_wdt_type'
Warning: include/linux/platform_data/mlxreg.h:24 Enum value
 'MLX_WDT_TYPE2' not described in enum 'mlxreg_wdt_type'
Warning: include/linux/platform_data/mlxreg.h:24 Enum value
 'MLX_WDT_TYPE3' not described in enum 'mlxreg_wdt_type'
Warning: include/linux/platform_data/mlxreg.h:37 bad line:
 PHYs ready / unready state;
Warning: include/linux/platform_data/mlxreg.h:153 struct member 'np'
 not described in 'mlxreg_core_data'
Warning: include/linux/platform_data/mlxreg.h:153 struct member 'hpdev'
 not described in 'mlxreg_core_data'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20260226051232.549537-1-rdunlap@infradead.org
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/platform_data/mlxreg.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h
index f6cca7a035c7..50b6be57da66 100644
--- a/include/linux/platform_data/mlxreg.h
+++ b/include/linux/platform_data/mlxreg.h
@@ -13,10 +13,10 @@
 /**
  * enum mlxreg_wdt_type - type of HW watchdog
  *
- * TYPE1 HW watchdog implementation exist in old systems.
- * All new systems have TYPE2 HW watchdog.
- * TYPE3 HW watchdog can exist on all systems with new CPLD.
- * TYPE3 is selected by WD capability bit.
+ * @MLX_WDT_TYPE1: HW watchdog implementation in old systems.
+ * @MLX_WDT_TYPE2: All new systems have TYPE2 HW watchdog.
+ * @MLX_WDT_TYPE3: HW watchdog that can exist on all systems with new CPLD.
+ *   TYPE3 is selected by WD capability bit.
  */
 enum mlxreg_wdt_type {
 	MLX_WDT_TYPE1,
@@ -35,7 +35,7 @@ enum mlxreg_wdt_type {
  * @MLXREG_HOTPLUG_LC_SYNCED: entry for line card synchronization events, coming
  *			      after hardware-firmware synchronization handshake;
  * @MLXREG_HOTPLUG_LC_READY: entry for line card ready events, indicating line card
-			     PHYs ready / unready state;
+ *			     PHYs ready / unready state;
  * @MLXREG_HOTPLUG_LC_ACTIVE: entry for line card active events, indicating firmware
  *			      availability / unavailability for the ports on line card;
  * @MLXREG_HOTPLUG_LC_THERMAL: entry for line card thermal shutdown events, positive
@@ -123,8 +123,8 @@ struct mlxreg_hotplug_device {
  * @reg_pwr: attribute power register;
  * @reg_ena: attribute enable register;
  * @mode: access mode;
- * @np - pointer to node platform associated with attribute;
- * @hpdev - hotplug device data;
+ * @np: pointer to node platform associated with attribute;
+ * @hpdev: hotplug device data;
  * @notifier: pointer to event notifier block;
  * @health_cntr: dynamic device health indication counter;
  * @attached: true if device has been attached after good health indication;
-- 
cgit v1.2.3


From e6b899f08066e744f89df16ceb782e06868bd148 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 26 Feb 2026 14:50:09 +0100
Subject: nsfs: tighten permission checks for ns iteration ioctls

Even privileged services should not necessarily be able to see other
privileged service's namespaces so they can't leak information to each
other. Use may_see_all_namespaces() helper that centralizes this policy
until the nstree adapts.

Link: https://patch.msgid.link/20260226-work-visibility-fixes-v1-1-d2c2853313bd@kernel.org
Fixes: a1d220d9dafa ("nsfs: iterate through mount namespaces")
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@kernel.org # v6.12+
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/nsfs.c                 | 13 +++++++++++++
 include/linux/ns_common.h |  2 ++
 kernel/nscommon.c         |  6 ++++++
 3 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/fs/nsfs.c b/fs/nsfs.c
index db91de208645..be36c10c38cf 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -199,6 +199,17 @@ static bool nsfs_ioctl_valid(unsigned int cmd)
 	return false;
 }
 
+static bool may_use_nsfs_ioctl(unsigned int cmd)
+{
+	switch (_IOC_NR(cmd)) {
+	case _IOC_NR(NS_MNT_GET_NEXT):
+		fallthrough;
+	case _IOC_NR(NS_MNT_GET_PREV):
+		return may_see_all_namespaces();
+	}
+	return true;
+}
+
 static long ns_ioctl(struct file *filp, unsigned int ioctl,
 			unsigned long arg)
 {
@@ -214,6 +225,8 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
 
 	if (!nsfs_ioctl_valid(ioctl))
 		return -ENOIOCTLCMD;
+	if (!may_use_nsfs_ioctl(ioctl))
+		return -EPERM;
 
 	ns = get_proc_ns(file_inode(filp));
 	switch (ioctl) {
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 825f5865bfc5..c8e227a3f9e2 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -55,6 +55,8 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns)
 
 #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns)))
 
+bool may_see_all_namespaces(void);
+
 static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns)
 {
 	return atomic_read(&ns->__ns_ref_active);
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index bdc3c86231d3..3166c1fd844a 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -309,3 +309,9 @@ void __ns_ref_active_get(struct ns_common *ns)
 			return;
 	}
 }
+
+bool may_see_all_namespaces(void)
+{
+	return (task_active_pid_ns(current) == &init_pid_ns) &&
+	       ns_capable_noaudit(init_pid_ns.user_ns, CAP_SYS_ADMIN);
+}
-- 
cgit v1.2.3


From 62413a9c3cb183afb9bb6e94dd68caf4e4145f4c Mon Sep 17 00:00:00 2001
From: Paul Moses <p@1g4.org>
Date: Mon, 23 Feb 2026 15:05:44 +0000
Subject: net/sched: act_gate: snapshot parameters with RCU on replace

The gate action can be replaced while the hrtimer callback or dump path is
walking the schedule list.

Convert the parameters to an RCU-protected snapshot and swap updates under
tcf_lock, freeing the previous snapshot via call_rcu(). When REPLACE omits
the entry list, preserve the existing schedule so the effective state is
unchanged.

Fixes: a51c328df310 ("net: qos: introduce a gate control flow action")
Cc: stable@vger.kernel.org
Signed-off-by: Paul Moses <p@1g4.org>
Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Victor Nogueira <victor@mojatatu.com>
Link: https://patch.msgid.link/20260223150512.2251594-2-p@1g4.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_gate.h |  33 ++++--
 net/sched/act_gate.c         | 265 ++++++++++++++++++++++++++++++-------------
 2 files changed, 212 insertions(+), 86 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h
index b147a3bb1a46..e0fded18e18c 100644
--- a/include/net/tc_act/tc_gate.h
+++ b/include/net/tc_act/tc_gate.h
@@ -32,6 +32,7 @@ struct tcf_gate_params {
 	s32			tcfg_clockid;
 	size_t			num_entries;
 	struct list_head	entries;
+	struct rcu_head		rcu;
 };
 
 #define GATE_ACT_GATE_OPEN	BIT(0)
@@ -39,7 +40,7 @@ struct tcf_gate_params {
 
 struct tcf_gate {
 	struct tc_action	common;
-	struct tcf_gate_params	param;
+	struct tcf_gate_params __rcu *param;
 	u8			current_gate_status;
 	ktime_t			current_close_time;
 	u32			current_entry_octets;
@@ -51,47 +52,65 @@ struct tcf_gate {
 
 #define to_gate(a) ((struct tcf_gate *)a)
 
+static inline struct tcf_gate_params *tcf_gate_params_locked(const struct tc_action *a)
+{
+	struct tcf_gate *gact = to_gate(a);
+
+	return rcu_dereference_protected(gact->param,
+					 lockdep_is_held(&gact->tcf_lock));
+}
+
 static inline s32 tcf_gate_prio(const struct tc_action *a)
 {
+	struct tcf_gate_params *p;
 	s32 tcfg_prio;
 
-	tcfg_prio = to_gate(a)->param.tcfg_priority;
+	p = tcf_gate_params_locked(a);
+	tcfg_prio = p->tcfg_priority;
 
 	return tcfg_prio;
 }
 
 static inline u64 tcf_gate_basetime(const struct tc_action *a)
 {
+	struct tcf_gate_params *p;
 	u64 tcfg_basetime;
 
-	tcfg_basetime = to_gate(a)->param.tcfg_basetime;
+	p = tcf_gate_params_locked(a);
+	tcfg_basetime = p->tcfg_basetime;
 
 	return tcfg_basetime;
 }
 
 static inline u64 tcf_gate_cycletime(const struct tc_action *a)
 {
+	struct tcf_gate_params *p;
 	u64 tcfg_cycletime;
 
-	tcfg_cycletime = to_gate(a)->param.tcfg_cycletime;
+	p = tcf_gate_params_locked(a);
+	tcfg_cycletime = p->tcfg_cycletime;
 
 	return tcfg_cycletime;
 }
 
 static inline u64 tcf_gate_cycletimeext(const struct tc_action *a)
 {
+	struct tcf_gate_params *p;
 	u64 tcfg_cycletimeext;
 
-	tcfg_cycletimeext = to_gate(a)->param.tcfg_cycletime_ext;
+	p = tcf_gate_params_locked(a);
+	tcfg_cycletimeext = p->tcfg_cycletime_ext;
 
 	return tcfg_cycletimeext;
 }
 
 static inline u32 tcf_gate_num_entries(const struct tc_action *a)
 {
+	struct tcf_gate_params *p;
 	u32 num_entries;
 
-	num_entries = to_gate(a)->param.num_entries;
+	p = tcf_gate_params_locked(a);
+	num_entries = p->num_entries;
 
 	return num_entries;
 }
@@ -105,7 +124,7 @@ static inline struct action_gate_entry
 	u32 num_entries;
 	int i = 0;
 
-	p = &to_gate(a)->param;
+	p = tcf_gate_params_locked(a);
 	num_entries = p->num_entries;
 
 	list_for_each_entry(entry, &p->entries, list)
diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c
index 686eaed81b81..fdbfcaa3e2ab 100644
--- a/net/sched/act_gate.c
+++ b/net/sched/act_gate.c
@@ -32,9 +32,12 @@ static ktime_t gate_get_time(struct tcf_gate *gact)
 	return KTIME_MAX;
 }
 
-static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start)
+static void tcf_gate_params_free_rcu(struct rcu_head *head);
+
+static void gate_get_start_time(struct tcf_gate *gact,
+				const struct tcf_gate_params *param,
+				ktime_t *start)
 {
-	struct tcf_gate_params *param = &gact->param;
 	ktime_t now, base, cycle;
 	u64 n;
 
@@ -69,12 +72,14 @@ static enum hrtimer_restart gate_timer_func(struct hrtimer *timer)
 {
 	struct tcf_gate *gact = container_of(timer, struct tcf_gate,
 					     hitimer);
-	struct tcf_gate_params *p = &gact->param;
 	struct tcfg_gate_entry *next;
+	struct tcf_gate_params *p;
 	ktime_t close_time, now;
 
 	spin_lock(&gact->tcf_lock);
 
+	p = rcu_dereference_protected(gact->param,
+				      lockdep_is_held(&gact->tcf_lock));
 	next = gact->next_entry;
 
 	/* cycle start, clear pending bit, clear total octets */
@@ -225,6 +230,35 @@ static void release_entry_list(struct list_head *entries)
 	}
 }
 
+static int tcf_gate_copy_entries(struct tcf_gate_params *dst,
+				 const struct tcf_gate_params *src,
+				 struct netlink_ext_ack *extack)
+{
+	struct tcfg_gate_entry *entry;
+	int i = 0;
+
+	list_for_each_entry(entry, &src->entries, list) {
+		struct tcfg_gate_entry *new;
+
+		new = kzalloc(sizeof(*new), GFP_ATOMIC);
+		if (!new) {
+			NL_SET_ERR_MSG(extack, "Not enough memory for entry");
+			return -ENOMEM;
+		}
+
+		new->index      = entry->index;
+		new->gate_state = entry->gate_state;
+		new->interval   = entry->interval;
+		new->ipv        = entry->ipv;
+		new->maxoctets  = entry->maxoctets;
+		list_add_tail(&new->list, &dst->entries);
+		i++;
+	}
+
+	dst->num_entries = i;
+	return 0;
+}
+
 static int parse_gate_list(struct nlattr *list_attr,
 			   struct tcf_gate_params *sched,
 			   struct netlink_ext_ack *extack)
@@ -270,24 +304,44 @@ release_list:
 	return err;
 }
 
-static void gate_setup_timer(struct tcf_gate *gact, u64 basetime,
-			     enum tk_offsets tko, s32 clockid,
-			     bool do_init)
+static bool gate_timer_needs_cancel(u64 basetime, u64 old_basetime,
+				    enum tk_offsets tko,
+				    enum tk_offsets old_tko,
+				    s32 clockid, s32 old_clockid)
 {
-	if (!do_init) {
-		if (basetime == gact->param.tcfg_basetime &&
-		    tko == gact->tk_offset &&
-		    clockid == gact->param.tcfg_clockid)
-			return;
+	return basetime != old_basetime ||
+	       clockid != old_clockid ||
+	       tko != old_tko;
+}
 
-		spin_unlock_bh(&gact->tcf_lock);
-		hrtimer_cancel(&gact->hitimer);
-		spin_lock_bh(&gact->tcf_lock);
+static int gate_clock_resolve(s32 clockid, enum tk_offsets *tko,
+			      struct netlink_ext_ack *extack)
+{
+	switch (clockid) {
+	case CLOCK_REALTIME:
+		*tko = TK_OFFS_REAL;
+		return 0;
+	case CLOCK_MONOTONIC:
+		*tko = TK_OFFS_MAX;
+		return 0;
+	case CLOCK_BOOTTIME:
+		*tko = TK_OFFS_BOOT;
+		return 0;
+	case CLOCK_TAI:
+		*tko = TK_OFFS_TAI;
+		return 0;
+	default:
+		NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
+		return -EINVAL;
 	}
-	gact->param.tcfg_basetime = basetime;
-	gact->param.tcfg_clockid = clockid;
-	gact->tk_offset = tko;
-	hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, HRTIMER_MODE_ABS_SOFT);
+}
+
+static void gate_setup_timer(struct tcf_gate *gact, s32 clockid,
+			     enum tk_offsets tko)
+{
+	WRITE_ONCE(gact->tk_offset, tko);
+	hrtimer_setup(&gact->hitimer, gate_timer_func, clockid,
+		      HRTIMER_MODE_ABS_SOFT);
 }
 
 static int tcf_gate_init(struct net *net, struct nlattr *nla,
@@ -296,15 +350,22 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id);
-	enum tk_offsets tk_offset = TK_OFFS_TAI;
+	u64 cycletime = 0, basetime = 0, cycletime_ext = 0;
+	struct tcf_gate_params *p = NULL, *old_p = NULL;
+	enum tk_offsets old_tk_offset = TK_OFFS_TAI;
+	const struct tcf_gate_params *cur_p = NULL;
 	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_GATE_MAX + 1];
+	enum tk_offsets tko = TK_OFFS_TAI;
 	struct tcf_chain *goto_ch = NULL;
-	u64 cycletime = 0, basetime = 0;
-	struct tcf_gate_params *p;
+	s32 timer_clockid = CLOCK_TAI;
+	bool use_old_entries = false;
+	s32 old_clockid = CLOCK_TAI;
+	bool need_cancel = false;
 	s32 clockid = CLOCK_TAI;
 	struct tcf_gate *gact;
 	struct tc_gate *parm;
+	u64 old_basetime = 0;
 	int ret = 0, err;
 	u32 gflags = 0;
 	s32 prio = -1;
@@ -321,26 +382,8 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 	if (!tb[TCA_GATE_PARMS])
 		return -EINVAL;
 
-	if (tb[TCA_GATE_CLOCKID]) {
+	if (tb[TCA_GATE_CLOCKID])
 		clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]);
-		switch (clockid) {
-		case CLOCK_REALTIME:
-			tk_offset = TK_OFFS_REAL;
-			break;
-		case CLOCK_MONOTONIC:
-			tk_offset = TK_OFFS_MAX;
-			break;
-		case CLOCK_BOOTTIME:
-			tk_offset = TK_OFFS_BOOT;
-			break;
-		case CLOCK_TAI:
-			tk_offset = TK_OFFS_TAI;
-			break;
-		default:
-			NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
-			return -EINVAL;
-		}
-	}
 
 	parm = nla_data(tb[TCA_GATE_PARMS]);
 	index = parm->index;
@@ -366,6 +409,60 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 		return -EEXIST;
 	}
 
+	gact = to_gate(*a);
+
+	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+	if (err < 0)
+		goto release_idr;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p) {
+		err = -ENOMEM;
+		goto chain_put;
+	}
+	INIT_LIST_HEAD(&p->entries);
+
+	use_old_entries = !tb[TCA_GATE_ENTRY_LIST];
+	if (!use_old_entries) {
+		err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack);
+		if (err < 0)
+			goto err_free;
+		use_old_entries = !err;
+	}
+
+	if (ret == ACT_P_CREATED && use_old_entries) {
+		NL_SET_ERR_MSG(extack, "The entry list is empty");
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	if (ret != ACT_P_CREATED) {
+		rcu_read_lock();
+		cur_p = rcu_dereference(gact->param);
+
+		old_basetime  = cur_p->tcfg_basetime;
+		old_clockid   = cur_p->tcfg_clockid;
+		old_tk_offset = READ_ONCE(gact->tk_offset);
+
+		basetime      = old_basetime;
+		cycletime_ext = cur_p->tcfg_cycletime_ext;
+		prio          = cur_p->tcfg_priority;
+		gflags        = cur_p->tcfg_flags;
+
+		if (!tb[TCA_GATE_CLOCKID])
+			clockid = old_clockid;
+
+		err = 0;
+		if (use_old_entries) {
+			err = tcf_gate_copy_entries(p, cur_p, extack);
+			if (!err && !tb[TCA_GATE_CYCLE_TIME])
+				cycletime = cur_p->tcfg_cycletime;
+		}
+		rcu_read_unlock();
+		if (err)
+			goto err_free;
+	}
+
 	if (tb[TCA_GATE_PRIORITY])
 		prio = nla_get_s32(tb[TCA_GATE_PRIORITY]);
 
@@ -375,25 +472,26 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 	if (tb[TCA_GATE_FLAGS])
 		gflags = nla_get_u32(tb[TCA_GATE_FLAGS]);
 
-	gact = to_gate(*a);
-	if (ret == ACT_P_CREATED)
-		INIT_LIST_HEAD(&gact->param.entries);
+	if (tb[TCA_GATE_CYCLE_TIME])
+		cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]);
 
-	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
-	if (err < 0)
-		goto release_idr;
+	if (tb[TCA_GATE_CYCLE_TIME_EXT])
+		cycletime_ext = nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]);
 
-	spin_lock_bh(&gact->tcf_lock);
-	p = &gact->param;
+	err = gate_clock_resolve(clockid, &tko, extack);
+	if (err)
+		goto err_free;
+	timer_clockid = clockid;
 
-	if (tb[TCA_GATE_CYCLE_TIME])
-		cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]);
+	need_cancel = ret != ACT_P_CREATED &&
+		      gate_timer_needs_cancel(basetime, old_basetime,
+					      tko, old_tk_offset,
+					      timer_clockid, old_clockid);
 
-	if (tb[TCA_GATE_ENTRY_LIST]) {
-		err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack);
-		if (err < 0)
-			goto chain_put;
-	}
+	if (need_cancel)
+		hrtimer_cancel(&gact->hitimer);
+
+	spin_lock_bh(&gact->tcf_lock);
 
 	if (!cycletime) {
 		struct tcfg_gate_entry *entry;
@@ -402,22 +500,20 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 		list_for_each_entry(entry, &p->entries, list)
 			cycle = ktime_add_ns(cycle, entry->interval);
 		cycletime = cycle;
-		if (!cycletime) {
-			err = -EINVAL;
-			goto chain_put;
-		}
 	}
 	p->tcfg_cycletime = cycletime;
+	p->tcfg_cycletime_ext = cycletime_ext;
 
-	if (tb[TCA_GATE_CYCLE_TIME_EXT])
-		p->tcfg_cycletime_ext =
-			nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]);
-
-	gate_setup_timer(gact, basetime, tk_offset, clockid,
-			 ret == ACT_P_CREATED);
+	if (need_cancel || ret == ACT_P_CREATED)
+		gate_setup_timer(gact, timer_clockid, tko);
 	p->tcfg_priority = prio;
 	p->tcfg_flags = gflags;
-	gate_get_start_time(gact, &start);
+	p->tcfg_basetime = basetime;
+	p->tcfg_clockid = timer_clockid;
+	gate_get_start_time(gact, p, &start);
+
+	old_p = rcu_replace_pointer(gact->param, p,
+				    lockdep_is_held(&gact->tcf_lock));
 
 	gact->current_close_time = start;
 	gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING;
@@ -434,11 +530,15 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
 
+	if (old_p)
+		call_rcu(&old_p->rcu, tcf_gate_params_free_rcu);
+
 	return ret;
 
+err_free:
+	release_entry_list(&p->entries);
+	kfree(p);
 chain_put:
-	spin_unlock_bh(&gact->tcf_lock);
-
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
 release_idr:
@@ -446,21 +546,29 @@ release_idr:
 	 * without taking tcf_lock.
 	 */
 	if (ret == ACT_P_CREATED)
-		gate_setup_timer(gact, gact->param.tcfg_basetime,
-				 gact->tk_offset, gact->param.tcfg_clockid,
-				 true);
+		gate_setup_timer(gact, timer_clockid, tko);
+
 	tcf_idr_release(*a, bind);
 	return err;
 }
 
+static void tcf_gate_params_free_rcu(struct rcu_head *head)
+{
+	struct tcf_gate_params *p = container_of(head, struct tcf_gate_params, rcu);
+
+	release_entry_list(&p->entries);
+	kfree(p);
+}
+
 static void tcf_gate_cleanup(struct tc_action *a)
 {
 	struct tcf_gate *gact = to_gate(a);
 	struct tcf_gate_params *p;
 
-	p = &gact->param;
 	hrtimer_cancel(&gact->hitimer);
-	release_entry_list(&p->entries);
+	p = rcu_dereference_protected(gact->param, 1);
+	if (p)
+		call_rcu(&p->rcu, tcf_gate_params_free_rcu);
 }
 
 static int dumping_entry(struct sk_buff *skb,
@@ -509,10 +617,9 @@ static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a,
 	struct nlattr *entry_list;
 	struct tcf_t t;
 
-	spin_lock_bh(&gact->tcf_lock);
-	opt.action = gact->tcf_action;
-
-	p = &gact->param;
+	rcu_read_lock();
+	opt.action = READ_ONCE(gact->tcf_action);
+	p = rcu_dereference(gact->param);
 
 	if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
@@ -552,12 +659,12 @@ static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a,
 	tcf_tm_dump(&t, &gact->tcf_tm);
 	if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD))
 		goto nla_put_failure;
-	spin_unlock_bh(&gact->tcf_lock);
+	rcu_read_unlock();
 
 	return skb->len;
 
 nla_put_failure:
-	spin_unlock_bh(&gact->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	return -1;
 }
-- 
cgit v1.2.3


From 29252397bcc1e0a1f85e5c3bee59c325f5c26341 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 25 Feb 2026 20:35:45 +0000
Subject: inet: annotate data-races around isk->inet_num

UDP/TCP lookups are using RCU, thus isk->inet_num accesses
should use READ_ONCE() and WRITE_ONCE() where needed.

Fixes: 3ab5aee7fe84 ("net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20260225203545.1512417-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet6_hashtables.h | 2 +-
 include/net/inet_hashtables.h  | 2 +-
 include/net/ip.h               | 2 +-
 net/ipv4/inet_hashtables.c     | 8 ++++----
 net/ipv4/tcp_diag.c            | 2 +-
 net/ipv6/inet6_hashtables.c    | 3 ++-
 6 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 282e29237d93..c16de5b7963f 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -175,7 +175,7 @@ static inline bool inet6_match(const struct net *net, const struct sock *sk,
 {
 	if (!net_eq(sock_net(sk), net) ||
 	    sk->sk_family != AF_INET6 ||
-	    sk->sk_portpair != ports ||
+	    READ_ONCE(sk->sk_portpair) != ports ||
 	    !ipv6_addr_equal(&sk->sk_v6_daddr, saddr) ||
 	    !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
 		return false;
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index ac05a52d9e13..5a979dcab538 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -345,7 +345,7 @@ static inline bool inet_match(const struct net *net, const struct sock *sk,
 			      int dif, int sdif)
 {
 	if (!net_eq(sock_net(sk), net) ||
-	    sk->sk_portpair != ports ||
+	    READ_ONCE(sk->sk_portpair) != ports ||
 	    sk->sk_addrpair != cookie)
 	        return false;
 
diff --git a/include/net/ip.h b/include/net/ip.h
index 69d5cef46004..7f9abd457e01 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -101,7 +101,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm,
 
 	ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if);
 	ipcm->addr = inet->inet_saddr;
-	ipcm->protocol = inet->inet_num;
+	ipcm->protocol = READ_ONCE(inet->inet_num);
 }
 
 #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fca980772c81..9bfccc283fa6 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -200,7 +200,7 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 		    struct inet_bind2_bucket *tb2, unsigned short port)
 {
-	inet_sk(sk)->inet_num = port;
+	WRITE_ONCE(inet_sk(sk)->inet_num, port);
 	inet_csk(sk)->icsk_bind_hash = tb;
 	inet_csk(sk)->icsk_bind2_hash = tb2;
 	sk_add_bind_node(sk, &tb2->owners);
@@ -224,7 +224,7 @@ static void __inet_put_port(struct sock *sk)
 	spin_lock(&head->lock);
 	tb = inet_csk(sk)->icsk_bind_hash;
 	inet_csk(sk)->icsk_bind_hash = NULL;
-	inet_sk(sk)->inet_num = 0;
+	WRITE_ONCE(inet_sk(sk)->inet_num, 0);
 	sk->sk_userlocks &= ~SOCK_CONNECT_BIND;
 
 	spin_lock(&head2->lock);
@@ -352,7 +352,7 @@ static inline int compute_score(struct sock *sk, const struct net *net,
 {
 	int score = -1;
 
-	if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
+	if (net_eq(sock_net(sk), net) && READ_ONCE(sk->sk_num) == hnum &&
 			!ipv6_only_sock(sk)) {
 		if (sk->sk_rcv_saddr != daddr)
 			return -1;
@@ -1206,7 +1206,7 @@ error:
 
 		sk->sk_hash = 0;
 		inet_sk(sk)->inet_sport = 0;
-		inet_sk(sk)->inet_num = 0;
+		WRITE_ONCE(inet_sk(sk)->inet_num, 0);
 
 		if (tw)
 			inet_twsk_bind_unhash(tw, hinfo);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index d83efd91f461..7935702e394b 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -509,7 +509,7 @@ next_chunk:
 			if (r->sdiag_family != AF_UNSPEC &&
 			    sk->sk_family != r->sdiag_family)
 				goto next_normal;
-			if (r->id.idiag_sport != htons(sk->sk_num) &&
+			if (r->id.idiag_sport != htons(READ_ONCE(sk->sk_num)) &&
 			    r->id.idiag_sport)
 				goto next_normal;
 			if (r->id.idiag_dport != sk->sk_dport &&
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 5e1da088d8e1..182d38e6d6d8 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -95,7 +95,8 @@ static inline int compute_score(struct sock *sk, const struct net *net,
 {
 	int score = -1;
 
-	if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
+	if (net_eq(sock_net(sk), net) &&
+	    READ_ONCE(inet_sk(sk)->inet_num) == hnum &&
 	    sk->sk_family == PF_INET6) {
 		if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
 			return -1;
-- 
cgit v1.2.3


From 11cb63b0d1a0685e0831ae3c77223e002ef18189 Mon Sep 17 00:00:00 2001
From: Victor Nogueira <victor@mojatatu.com>
Date: Wed, 25 Feb 2026 10:43:48 -0300
Subject: net/sched: Only allow act_ct to bind to clsact/ingress qdiscs and
 shared blocks

As Paolo said earlier [1]:

"Since the blamed commit below, classify can return TC_ACT_CONSUMED while
the current skb being held by the defragmentation engine. As reported by
GangMin Kim, if such packet is that may cause a UaF when the defrag engine
later on tries to tuch again such packet."

act_ct was never meant to be used in the egress path, however some users
are attaching it to egress today [2]. Attempting to reach a middle
ground, we noticed that, while most qdiscs are not handling
TC_ACT_CONSUMED, clsact/ingress qdiscs are. With that in mind, we
address the issue by only allowing act_ct to bind to clsact/ingress
qdiscs and shared blocks. That way it's still possible to attach act_ct to
egress (albeit only with clsact).

[1] https://lore.kernel.org/netdev/674b8cbfc385c6f37fb29a1de08d8fe5c2b0fbee.1771321118.git.pabeni@redhat.com/
[2] https://lore.kernel.org/netdev/cc6bfb4a-4a2b-42d8-b9ce-7ef6644fb22b@ovn.org/

Reported-by: GangMin Kim <km.kim1503@gmail.com>
Fixes: 3f14b377d01d ("net/sched: act_ct: fix skb leak and crash on ooo frags")
CC: stable@vger.kernel.org
Signed-off-by: Victor Nogueira <victor@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20260225134349.1287037-1-victor@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/act_api.h | 1 +
 net/sched/act_ct.c    | 6 ++++++
 net/sched/cls_api.c   | 7 +++++++
 3 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index e1e8f0f7dacb..d11b79107930 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -70,6 +70,7 @@ struct tc_action {
 #define TCA_ACT_FLAGS_REPLACE	(1U << (TCA_ACT_FLAGS_USER_BITS + 2))
 #define TCA_ACT_FLAGS_NO_RTNL	(1U << (TCA_ACT_FLAGS_USER_BITS + 3))
 #define TCA_ACT_FLAGS_AT_INGRESS	(1U << (TCA_ACT_FLAGS_USER_BITS + 4))
+#define TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT	(1U << (TCA_ACT_FLAGS_USER_BITS + 5))
 
 /* Update lastuse only if needed, to avoid dirtying a cache line.
  * We use a temp variable to avoid fetching jiffies twice.
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index bd51522c7953..7d5e50c921a0 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -1360,6 +1360,12 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
 		return -EINVAL;
 	}
 
+	if (bind && !(flags & TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT)) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Attaching ct to a non ingress/clsact qdisc is unsupported");
+		return -EOPNOTSUPP;
+	}
+
 	err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
 	if (err < 0)
 		return err;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 343309e9009b..4829c27446e3 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -2228,6 +2228,11 @@ static bool is_qdisc_ingress(__u32 classid)
 	return (TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS));
 }
 
+static bool is_ingress_or_clsact(struct tcf_block *block, struct Qdisc *q)
+{
+	return tcf_block_shared(block) || (q && !!(q->flags & TCQ_F_INGRESS));
+}
+
 static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
 			  struct netlink_ext_ack *extack)
 {
@@ -2420,6 +2425,8 @@ replay:
 		flags |= TCA_ACT_FLAGS_NO_RTNL;
 	if (is_qdisc_ingress(parent))
 		flags |= TCA_ACT_FLAGS_AT_INGRESS;
+	if (is_ingress_or_clsact(block, q))
+		flags |= TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT;
 	err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
 			      flags, extack);
 	if (err == 0) {
-- 
cgit v1.2.3


From 15fba71533bcdfaa8eeba69a5a5a2927afdf664a Mon Sep 17 00:00:00 2001
From: Valentin Spreckels <valentin@spreckels.dev>
Date: Thu, 26 Feb 2026 20:54:09 +0100
Subject: net: usb: r8152: add TRENDnet TUC-ET2G

The TRENDnet TUC-ET2G is a RTL8156 based usb ethernet adapter. Add its
vendor and product IDs.

Signed-off-by: Valentin Spreckels <valentin@spreckels.dev>
Link: https://patch.msgid.link/20260226195409.7891-2-valentin@spreckels.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/usb/r8152.c   | 1 +
 include/linux/usb/r8152.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 4af85728ac4f..0c83bbbea2e7 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -10054,6 +10054,7 @@ static const struct usb_device_id rtl8152_table[] = {
 	{ USB_DEVICE(VENDOR_ID_DLINK,   0xb301) },
 	{ USB_DEVICE(VENDOR_ID_DELL,    0xb097) },
 	{ USB_DEVICE(VENDOR_ID_ASUS,    0x1976) },
+	{ USB_DEVICE(VENDOR_ID_TRENDNET, 0xe02b) },
 	{}
 };
 
diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h
index 2ca60828f28b..1502b2a355f9 100644
--- a/include/linux/usb/r8152.h
+++ b/include/linux/usb/r8152.h
@@ -32,6 +32,7 @@
 #define VENDOR_ID_DLINK			0x2001
 #define VENDOR_ID_DELL			0x413c
 #define VENDOR_ID_ASUS			0x0b05
+#define VENDOR_ID_TRENDNET		0x20f4
 
 #if IS_REACHABLE(CONFIG_USB_RTL8152)
 extern u8 rtl8152_get_version(struct usb_interface *intf);
-- 
cgit v1.2.3


From 05ac3846ffe53fc63e454eb195ce8a6bab7a6a88 Mon Sep 17 00:00:00 2001
From: Shenghao Ding <shenghao-ding@ti.com>
Date: Fri, 27 Feb 2026 22:46:40 +0800
Subject: ALSA: hda/tas2781: A workaround solution to lower-vol issue among
 lower calibrated-impedance micro-speaker on TAS2781

On TAS2781, if the Speaker calibrated impedance is lower than default
value hard-coded inside the TAS2781, it will cuase vol lower than
normal. In order to fix this issue, the parameter of SineGainI need
updating.

Signed-off-by: Shenghao Ding <shenghao-ding@ti.com>
Tested-by: Matthew Schwartz <matthew.schwartz@linux.dev>
Link: https://patch.msgid.link/20260227144641.1243-1-shenghao-ding@ti.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/tas2781.h                        |  1 +
 sound/hda/codecs/side-codecs/tas2781_hda_i2c.c | 15 +---
 sound/soc/codecs/tas2781-fmwlib.c              | 94 ++++++++++++++++++++++++++
 3 files changed, 98 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h
index 7c03bdc951bb..e847cf51878c 100644
--- a/include/sound/tas2781.h
+++ b/include/sound/tas2781.h
@@ -151,6 +151,7 @@ struct tasdevice {
 	struct bulk_reg_val *cali_data_backup;
 	struct bulk_reg_val alp_cali_bckp;
 	struct tasdevice_fw *cali_data_fmw;
+	void *cali_specific;
 	unsigned int dev_addr;
 	unsigned int err_code;
 	unsigned char cur_book;
diff --git a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
index 74c3cf1e45e1..67240ce184e1 100644
--- a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
+++ b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
@@ -60,7 +60,6 @@ struct tas2781_hda_i2c_priv {
 	int (*save_calibration)(struct tas2781_hda *h);
 
 	int hda_chip_id;
-	bool skip_calibration;
 };
 
 static int tas2781_get_i2c_res(struct acpi_resource *ares, void *data)
@@ -479,8 +478,7 @@ static void tasdevice_dspfw_init(void *context)
 	/* If calibrated data occurs error, dsp will still works with default
 	 * calibrated data inside algo.
 	 */
-	if (!hda_priv->skip_calibration)
-		hda_priv->save_calibration(tas_hda);
+	hda_priv->save_calibration(tas_hda);
 }
 
 static void tasdev_fw_ready(const struct firmware *fmw, void *context)
@@ -535,7 +533,6 @@ static int tas2781_hda_bind(struct device *dev, struct device *master,
 	void *master_data)
 {
 	struct tas2781_hda *tas_hda = dev_get_drvdata(dev);
-	struct tas2781_hda_i2c_priv *hda_priv = tas_hda->hda_priv;
 	struct hda_component_parent *parent = master_data;
 	struct hda_component *comp;
 	struct hda_codec *codec;
@@ -564,14 +561,6 @@ static int tas2781_hda_bind(struct device *dev, struct device *master,
 		break;
 	}
 
-	/*
-	 * Using ASUS ROG Xbox Ally X (RC73XA) UEFI calibration data
-	 * causes audio dropouts during playback, use fallback data
-	 * from DSP firmware as a workaround.
-	 */
-	if (codec->core.subsystem_id == 0x10431384)
-		hda_priv->skip_calibration = true;
-
 	guard(pm_runtime_active_auto)(dev);
 
 	comp->dev = dev;
@@ -643,6 +632,7 @@ static int tas2781_hda_i2c_probe(struct i2c_client *clt)
 		 */
 		device_name = "TIAS2781";
 		hda_priv->hda_chip_id = HDA_TAS2781;
+		tas_hda->priv->chip_id = TAS2781;
 		hda_priv->save_calibration = tas2781_save_calibration;
 		tas_hda->priv->global_addr = TAS2781_GLOBAL_ADDR;
 	} else if (strstarts(dev_name(&clt->dev), "i2c-TXNW2770")) {
@@ -656,6 +646,7 @@ static int tas2781_hda_i2c_probe(struct i2c_client *clt)
 			     "i2c-TXNW2781:00-tas2781-hda.0")) {
 		device_name = "TXNW2781";
 		hda_priv->hda_chip_id = HDA_TAS2781;
+		tas_hda->priv->chip_id = TAS2781;
 		hda_priv->save_calibration = tas2781_save_calibration;
 		tas_hda->priv->global_addr = TAS2781_GLOBAL_ADDR;
 	} else if (strstr(dev_name(&clt->dev), "INT8866")) {
diff --git a/sound/soc/codecs/tas2781-fmwlib.c b/sound/soc/codecs/tas2781-fmwlib.c
index c969eb38704e..5798d518d94c 100644
--- a/sound/soc/codecs/tas2781-fmwlib.c
+++ b/sound/soc/codecs/tas2781-fmwlib.c
@@ -32,6 +32,10 @@
 #define TAS2781_YRAM1_PAGE			42
 #define TAS2781_YRAM1_START_REG			88
 
+#define TAS2781_PG_REG		TASDEVICE_REG(0x00, 0x00, 0x7c)
+#define TAS2781_PG_1_0		0xA0
+#define TAS2781_PG_2_0		0xA8
+
 #define TAS2781_YRAM2_START_PAGE		43
 #define TAS2781_YRAM2_END_PAGE			49
 #define TAS2781_YRAM2_START_REG			8
@@ -98,6 +102,12 @@ struct blktyp_devidx_map {
 	unsigned char dev_idx;
 };
 
+struct tas2781_cali_specific {
+	unsigned char sin_gni[4];
+	int sin_gni_reg;
+	bool is_sin_gn_flush;
+};
+
 static const char deviceNumber[TASDEVICE_DSP_TAS_MAX_DEVICE] = {
 	1, 2, 1, 2, 1, 1, 0, 2, 4, 3, 1, 2, 3, 4, 1, 2
 };
@@ -2454,6 +2464,84 @@ static int tasdevice_load_data(struct tasdevice_priv *tas_priv,
 	return ret;
 }
 
+static int tas2781_cali_preproc(struct tasdevice_priv *priv, int i)
+{
+	struct tas2781_cali_specific *spec = priv->tasdevice[i].cali_specific;
+	struct calidata *cali_data = &priv->cali_data;
+	struct cali_reg *p = &cali_data->cali_reg_array;
+	unsigned char *data = cali_data->data;
+	int rc;
+
+	/*
+	 * On TAS2781, if the Speaker calibrated impedance is lower than
+	 * default value hard-coded inside the TAS2781, it will cuase vol
+	 * lower than normal. In order to fix this issue, the parameter of
+	 * SineGainI need updating.
+	 */
+	if (spec == NULL) {
+		int k = i * (cali_data->cali_dat_sz_per_dev + 1);
+		int re_org, re_cal, corrected_sin_gn, pg_id;
+		unsigned char r0_deflt[4];
+
+		spec = devm_kzalloc(priv->dev, sizeof(*spec), GFP_KERNEL);
+		if (spec == NULL)
+			return -ENOMEM;
+		priv->tasdevice[i].cali_specific = spec;
+		rc = tasdevice_dev_bulk_read(priv, i, p->r0_reg, r0_deflt, 4);
+		if (rc < 0) {
+			dev_err(priv->dev, "invalid RE from %d = %d\n", i, rc);
+			return rc;
+		}
+		/*
+		 * SineGainI need to be re-calculated, calculate the high 16
+		 * bits.
+		 */
+		re_org = r0_deflt[0] << 8 | r0_deflt[1];
+		re_cal = data[k + 1] << 8 | data[k + 2];
+		if (re_org > re_cal) {
+			rc = tasdevice_dev_read(priv, i, TAS2781_PG_REG,
+						 &pg_id);
+			if (rc < 0) {
+				dev_err(priv->dev, "invalid PG id %d = %d\n",
+					i, rc);
+				return rc;
+			}
+
+			spec->sin_gni_reg = (pg_id == TAS2781_PG_1_0) ?
+				TASDEVICE_REG(0, 0x1b, 0x34) :
+				TASDEVICE_REG(0, 0x18, 0x1c);
+
+			rc = tasdevice_dev_bulk_read(priv, i,
+						      spec->sin_gni_reg,
+						      spec->sin_gni, 4);
+			if (rc < 0) {
+				dev_err(priv->dev, "wrong sinegaini %d = %d\n",
+					i, rc);
+				return rc;
+			}
+			corrected_sin_gn = re_org * ((spec->sin_gni[0] << 8) +
+						       spec->sin_gni[1]);
+			corrected_sin_gn /= re_cal;
+			spec->sin_gni[0] = corrected_sin_gn >> 8;
+			spec->sin_gni[1] = corrected_sin_gn & 0xff;
+
+			spec->is_sin_gn_flush = true;
+		}
+	}
+
+	if (spec->is_sin_gn_flush) {
+		rc = tasdevice_dev_bulk_write(priv, i, spec->sin_gni_reg,
+						       spec->sin_gni, 4);
+		if (rc < 0) {
+			dev_err(priv->dev, "update failed %d = %d\n",
+				i, rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
 static void tasdev_load_calibrated_data(struct tasdevice_priv *priv, int i)
 {
 	struct calidata *cali_data = &priv->cali_data;
@@ -2469,6 +2557,12 @@ static void tasdev_load_calibrated_data(struct tasdevice_priv *priv, int i)
 	}
 	k++;
 
+	if (priv->chip_id == TAS2781) {
+		rc = tas2781_cali_preproc(priv, i);
+		if (rc < 0)
+			return;
+	}
+
 	rc = tasdevice_dev_bulk_write(priv, i, p->r0_reg, &(data[k]), 4);
 	if (rc < 0) {
 		dev_err(priv->dev, "chn %d r0_reg bulk_wr err = %d\n", i, rc);
-- 
cgit v1.2.3


From 0ed2e8bf61d6d5df1d78f4e24b682dff4c394e17 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 28 Feb 2026 04:56:20 -0700
Subject: io_uring: correct comment for IORING_SETUP_TASKRUN_FLAG

Sync with a recent liburing fix, which corrects the comment explaining
when the IORING_SETUP_TASKRUN_FLAG setup flag is valid to use. May be
use with COOP_TASKRUN or DEFER_TASKRUN, not useful without either of
this task_work mechanisms being used.

Link: https://github.com/axboe/liburing/pull/1543
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6750c383a2ab..1ff16141c8a5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -188,7 +188,8 @@ enum io_uring_sqe_flags_bit {
 /*
  * If COOP_TASKRUN is set, get notified if task work is available for
  * running and a kernel transition would be needed to run it. This sets
- * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
+ * IORING_SQ_TASKRUN in the sq ring flags. Not valid without COOP_TASKRUN
+ * or DEFER_TASKRUN.
  */
 #define IORING_SETUP_TASKRUN_FLAG	(1U << 9)
 #define IORING_SETUP_SQE128		(1U << 10) /* SQEs are 128 byte */
-- 
cgit v1.2.3


From 60abb0ac11dccd6b98fd9182bc5f85b621688861 Mon Sep 17 00:00:00 2001
From: "Nikhil P. Rao" <nikhil.rao@amd.com>
Date: Wed, 25 Feb 2026 00:00:26 +0000
Subject: xsk: Fix fragment node deletion to prevent buffer leak

After commit b692bf9a7543 ("xsk: Get rid of xdp_buff_xsk::xskb_list_node"),
the list_node field is reused for both the xskb pool list and the buffer
free list, this causes a buffer leak as described below.

xp_free() checks if a buffer is already on the free list using
list_empty(&xskb->list_node). When list_del() is used to remove a node
from the xskb pool list, it doesn't reinitialize the node pointers.
This means list_empty() will return false even after the node has been
removed, causing xp_free() to incorrectly skip adding the buffer to the
free list.

Fix this by using list_del_init() instead of list_del() in all fragment
handling paths, this ensures the list node is reinitialized after removal,
allowing the list_empty() to work correctly.

Fixes: b692bf9a7543 ("xsk: Get rid of xdp_buff_xsk::xskb_list_node")
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Nikhil P. Rao <nikhil.rao@amd.com>
Link: https://patch.msgid.link/20260225000456.107806-2-nikhil.rao@amd.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/xdp_sock_drv.h | 6 +++---
 net/xdp/xsk.c              | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 242e34f771cc..aefc368449d5 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -122,7 +122,7 @@ static inline void xsk_buff_free(struct xdp_buff *xdp)
 		goto out;
 
 	list_for_each_entry_safe(pos, tmp, xskb_list, list_node) {
-		list_del(&pos->list_node);
+		list_del_init(&pos->list_node);
 		xp_free(pos);
 	}
 
@@ -157,7 +157,7 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first)
 	frag = list_first_entry_or_null(&xskb->pool->xskb_list,
 					struct xdp_buff_xsk, list_node);
 	if (frag) {
-		list_del(&frag->list_node);
+		list_del_init(&frag->list_node);
 		ret = &frag->xdp;
 	}
 
@@ -168,7 +168,7 @@ static inline void xsk_buff_del_frag(struct xdp_buff *xdp)
 {
 	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
 
-	list_del(&xskb->list_node);
+	list_del_init(&xskb->list_node);
 }
 
 static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 3b46bc635c43..be882a8d473c 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -186,7 +186,7 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 		err = __xsk_rcv_zc(xs, pos, len, contd);
 		if (err)
 			goto err;
-		list_del(&pos->list_node);
+		list_del_init(&pos->list_node);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 9351cf3fd92dc1349bb75f2f7f7324607dcf596f Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Thu, 26 Feb 2026 11:01:37 +0000
Subject: ASoC: cs35l56: Only patch ASP registers if the DAI is part of a
 DAIlink

Move the ASP register patches to a separate struct and apply this from the
ASP DAI probe() function so that the registers are only patched if the DAI
is part of a DAI link.

Some systems use the ASP as a special-purpose interconnect and on these
systems the ASP registers are configured by a third party (the firmware,
the BIOS, or another device using the amp's secondary host control
interface).

If the machine driver does not hook up the ASP DAI then the ASP registers
must be omitted from the patch to prevent overwriting the third party
configuration.

If the machine driver includes the ASP DAI in a DAI link, this implies that
the machine driver and higher components (such as alsa-ucm) are taking
ownership of the ASP. In this case the ASP registers are patched to known
defaults and the machine driver should configure the ASP.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20260226110137.1664562-1-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h           |  1 +
 sound/soc/codecs/cs35l56-shared.c | 16 +++++++++++++++-
 sound/soc/codecs/cs35l56.c        |  8 ++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index ae1e1489b671..28f9f5940ab6 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -406,6 +406,7 @@ extern const char * const cs35l56_cal_set_status_text[3];
 extern const char * const cs35l56_tx_input_texts[CS35L56_NUM_INPUT_SRC];
 extern const unsigned int cs35l56_tx_input_values[CS35L56_NUM_INPUT_SRC];
 
+int cs35l56_set_asp_patch(struct cs35l56_base *cs35l56_base);
 int cs35l56_set_patch(struct cs35l56_base *cs35l56_base);
 int cs35l56_mbox_send(struct cs35l56_base *cs35l56_base, unsigned int command);
 int cs35l56_firmware_shutdown(struct cs35l56_base *cs35l56_base);
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index 4707f28bfca2..af87ebae98cb 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -26,7 +26,7 @@
 
 #include "cs35l56.h"
 
-static const struct reg_sequence cs35l56_patch[] = {
+static const struct reg_sequence cs35l56_asp_patch[] = {
 	/*
 	 * Firmware can change these to non-defaults to satisfy SDCA.
 	 * Ensure that they are at known defaults.
@@ -43,6 +43,20 @@ static const struct reg_sequence cs35l56_patch[] = {
 	{ CS35L56_ASP1TX2_INPUT,		0x00000000 },
 	{ CS35L56_ASP1TX3_INPUT,		0x00000000 },
 	{ CS35L56_ASP1TX4_INPUT,		0x00000000 },
+};
+
+int cs35l56_set_asp_patch(struct cs35l56_base *cs35l56_base)
+{
+	return regmap_register_patch(cs35l56_base->regmap, cs35l56_asp_patch,
+				     ARRAY_SIZE(cs35l56_asp_patch));
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_set_asp_patch, "SND_SOC_CS35L56_SHARED");
+
+static const struct reg_sequence cs35l56_patch[] = {
+	/*
+	 * Firmware can change these to non-defaults to satisfy SDCA.
+	 * Ensure that they are at known defaults.
+	 */
 	{ CS35L56_SWIRE_DP3_CH1_INPUT,		0x00000018 },
 	{ CS35L56_SWIRE_DP3_CH2_INPUT,		0x00000019 },
 	{ CS35L56_SWIRE_DP3_CH3_INPUT,		0x00000029 },
diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c
index 2ff8b172b76e..3bf9e8fc34a2 100644
--- a/sound/soc/codecs/cs35l56.c
+++ b/sound/soc/codecs/cs35l56.c
@@ -348,6 +348,13 @@ static int cs35l56_dsp_event(struct snd_soc_dapm_widget *w,
 	return wm_adsp_event(w, kcontrol, event);
 }
 
+static int cs35l56_asp_dai_probe(struct snd_soc_dai *codec_dai)
+{
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(codec_dai->component);
+
+	return cs35l56_set_asp_patch(&cs35l56->base);
+}
+
 static int cs35l56_asp_dai_set_fmt(struct snd_soc_dai *codec_dai, unsigned int fmt)
 {
 	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(codec_dai->component);
@@ -552,6 +559,7 @@ static int cs35l56_asp_dai_set_sysclk(struct snd_soc_dai *dai,
 }
 
 static const struct snd_soc_dai_ops cs35l56_ops = {
+	.probe = cs35l56_asp_dai_probe,
 	.set_fmt = cs35l56_asp_dai_set_fmt,
 	.set_tdm_slot = cs35l56_asp_dai_set_tdm_slot,
 	.hw_params = cs35l56_asp_dai_hw_params,
-- 
cgit v1.2.3


From b570f37a2ce480be26c665345c5514686a8a0274 Mon Sep 17 00:00:00 2001
From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Date: Tue, 10 Feb 2026 12:56:53 +0100
Subject: mm: Fix a hmm_range_fault() livelock / starvation problem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If hmm_range_fault() fails a folio_trylock() in do_swap_page,
trying to acquire the lock of a device-private folio for migration,
to ram, the function will spin until it succeeds grabbing the lock.

However, if the process holding the lock is depending on a work
item to be completed, which is scheduled on the same CPU as the
spinning hmm_range_fault(), that work item might be starved and
we end up in a livelock / starvation situation which is never
resolved.

This can happen, for example if the process holding the
device-private folio lock is stuck in
   migrate_device_unmap()->lru_add_drain_all()
sinc lru_add_drain_all() requires a short work-item
to be run on all online cpus to complete.

A prerequisite for this to happen is:
a) Both zone device and system memory folios are considered in
   migrate_device_unmap(), so that there is a reason to call
   lru_add_drain_all() for a system memory folio while a
   folio lock is held on a zone device folio.
b) The zone device folio has an initial mapcount > 1 which causes
   at least one migration PTE entry insertion to be deferred to
   try_to_migrate(), which can happen after the call to
   lru_add_drain_all().
c) No or voluntary only preemption.

This all seems pretty unlikely to happen, but indeed is hit by
the "xe_exec_system_allocator" igt test.

Resolve this by waiting for the folio to be unlocked if the
folio_trylock() fails in do_swap_page().

Rename migration_entry_wait_on_locked() to
softleaf_entry_wait_unlock() and update its documentation to
indicate the new use-case.

Future code improvements might consider moving
the lru_add_drain_all() call in migrate_device_unmap() to be
called *after* all pages have migration entries inserted.
That would eliminate also b) above.

v2:
- Instead of a cond_resched() in hmm_range_fault(),
  eliminate the problem by waiting for the folio to be unlocked
  in do_swap_page() (Alistair Popple, Andrew Morton)
v3:
- Add a stub migration_entry_wait_on_locked() for the
  !CONFIG_MIGRATION case. (Kernel Test Robot)
v4:
- Rename migrate_entry_wait_on_locked() to
  softleaf_entry_wait_on_locked() and update docs (Alistair Popple)
v5:
- Add a WARN_ON_ONCE() for the !CONFIG_MIGRATION
  version of softleaf_entry_wait_on_locked().
- Modify wording around function names in the commit message
  (Andrew Morton)

Suggested-by: Alistair Popple <apopple@nvidia.com>
Fixes: 1afaeb8293c9 ("mm/migrate: Trylock device page in do_swap_page")
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: linux-mm@kvack.org
Cc: <dri-devel@lists.freedesktop.org>
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: <stable@vger.kernel.org> # v6.15+
Reviewed-by: John Hubbard <jhubbard@nvidia.com> #v3
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Link: https://patch.msgid.link/20260210115653.92413-1-thomas.hellstrom@linux.intel.com
(cherry picked from commit a69d1ab971a624c6f112cea61536569d579c3215)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 include/linux/migrate.h | 10 +++++++++-
 mm/filemap.c            | 15 ++++++++++-----
 mm/memory.c             |  3 ++-
 mm/migrate.c            |  8 ++++----
 mm/migrate_device.c     |  2 +-
 5 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 26ca00c325d9..d5af2b7f577b 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list);
 
 int migrate_huge_page_move_mapping(struct address_space *mapping,
 		struct folio *dst, struct folio *src);
-void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
+void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
 		__releases(ptl);
 void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
 int folio_migrate_mapping(struct address_space *mapping,
@@ -97,6 +97,14 @@ static inline int set_movable_ops(const struct movable_operations *ops, enum pag
 	return -ENOSYS;
 }
 
+static inline void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
+	__releases(ptl)
+{
+	WARN_ON_ONCE(1);
+
+	spin_unlock(ptl);
+}
+
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_NUMA_BALANCING
diff --git a/mm/filemap.c b/mm/filemap.c
index 6cd7974d4ada..406cef06b684 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1379,14 +1379,16 @@ repeat:
 
 #ifdef CONFIG_MIGRATION
 /**
- * migration_entry_wait_on_locked - Wait for a migration entry to be removed
- * @entry: migration swap entry.
+ * softleaf_entry_wait_on_locked - Wait for a migration entry or
+ * device_private entry to be removed.
+ * @entry: migration or device_private swap entry.
  * @ptl: already locked ptl. This function will drop the lock.
  *
- * Wait for a migration entry referencing the given page to be removed. This is
+ * Wait for a migration entry referencing the given page, or device_private
+ * entry referencing a dvice_private page to be unlocked. This is
  * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
  * this can be called without taking a reference on the page. Instead this
- * should be called while holding the ptl for the migration entry referencing
+ * should be called while holding the ptl for @entry referencing
  * the page.
  *
  * Returns after unlocking the ptl.
@@ -1394,7 +1396,7 @@ repeat:
  * This follows the same logic as folio_wait_bit_common() so see the comments
  * there.
  */
-void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
+void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
 	__releases(ptl)
 {
 	struct wait_page_queue wait_page;
@@ -1428,6 +1430,9 @@ void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
 	 * If a migration entry exists for the page the migration path must hold
 	 * a valid reference to the page, and it must take the ptl to remove the
 	 * migration entry. So the page is valid until the ptl is dropped.
+	 * Similarly any path attempting to drop the last reference to a
+	 * device-private page needs to grab the ptl to remove the device-private
+	 * entry.
 	 */
 	spin_unlock(ptl);
 
diff --git a/mm/memory.c b/mm/memory.c
index 07778814b4a8..2f815a34d924 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4763,7 +4763,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 				unlock_page(vmf->page);
 				put_page(vmf->page);
 			} else {
-				pte_unmap_unlock(vmf->pte, vmf->ptl);
+				pte_unmap(vmf->pte);
+				softleaf_entry_wait_on_locked(entry, vmf->ptl);
 			}
 		} else if (softleaf_is_hwpoison(entry)) {
 			ret = VM_FAULT_HWPOISON;
diff --git a/mm/migrate.c b/mm/migrate.c
index 1bf2cf8c44dd..2c3d489ecf51 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -500,7 +500,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 	if (!softleaf_is_migration(entry))
 		goto out;
 
-	migration_entry_wait_on_locked(entry, ptl);
+	softleaf_entry_wait_on_locked(entry, ptl);
 	return;
 out:
 	spin_unlock(ptl);
@@ -532,10 +532,10 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p
 		 * If migration entry existed, safe to release vma lock
 		 * here because the pgtable page won't be freed without the
 		 * pgtable lock released.  See comment right above pgtable
-		 * lock release in migration_entry_wait_on_locked().
+		 * lock release in softleaf_entry_wait_on_locked().
 		 */
 		hugetlb_vma_unlock_read(vma);
-		migration_entry_wait_on_locked(entry, ptl);
+		softleaf_entry_wait_on_locked(entry, ptl);
 		return;
 	}
 
@@ -553,7 +553,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
 	ptl = pmd_lock(mm, pmd);
 	if (!pmd_is_migration_entry(*pmd))
 		goto unlock;
-	migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl);
+	softleaf_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl);
 	return;
 unlock:
 	spin_unlock(ptl);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 0a8b31939640..8079676c8f1f 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -176,7 +176,7 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
 		}
 
 		if (softleaf_is_migration(entry)) {
-			migration_entry_wait_on_locked(entry, ptl);
+			softleaf_entry_wait_on_locked(entry, ptl);
 			spin_unlock(ptl);
 			return -EAGAIN;
 		}
-- 
cgit v1.2.3


From af4e9ef3d78420feb8fe58cd9a1ab80c501b3c08 Mon Sep 17 00:00:00 2001
From: David Laight <david.laight.linux@gmail.com>
Date: Mon, 2 Mar 2026 13:27:51 +0000
Subject: uaccess: Fix scoped_user_read_access() for 'pointer to const'

If a 'const struct foo __user *ptr' is used for the address passed to
scoped_user_read_access() then you get a warning/error

  uaccess.h:691:1: error: initialization discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers]

for the

  void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl)

assignment.

Fix by using 'auto' for both _tmpptr and the redeclaration of uptr.
Replace the CLASS() with explicit __cleanup() functions on uptr.

Fixes: e497310b4ffb ("uaccess: Provide scoped user access regions")
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-and-tested-by: Christophe Leroy (CS GROUP) <chleroy@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/uaccess.h | 54 ++++++++++++++++++-------------------------------
 1 file changed, 20 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 1f3804245c06..809e4f7dfdbd 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -647,36 +647,22 @@ static inline void user_access_restore(unsigned long flags) { }
 /* Define RW variant so the below _mode macro expansion works */
 #define masked_user_rw_access_begin(u)	masked_user_access_begin(u)
 #define user_rw_access_begin(u, s)	user_access_begin(u, s)
-#define user_rw_access_end()		user_access_end()
 
 /* Scoped user access */
-#define USER_ACCESS_GUARD(_mode)				\
-static __always_inline void __user *				\
-class_user_##_mode##_begin(void __user *ptr)			\
-{								\
-	return ptr;						\
-}								\
-								\
-static __always_inline void					\
-class_user_##_mode##_end(void __user *ptr)			\
-{								\
-	user_##_mode##_access_end();				\
-}								\
-								\
-DEFINE_CLASS(user_ ##_mode## _access, void __user *,		\
-	     class_user_##_mode##_end(_T),			\
-	     class_user_##_mode##_begin(ptr), void __user *ptr)	\
-								\
-static __always_inline class_user_##_mode##_access_t		\
-class_user_##_mode##_access_ptr(void __user *scope)		\
-{								\
-	return scope;						\
-}
 
-USER_ACCESS_GUARD(read)
-USER_ACCESS_GUARD(write)
-USER_ACCESS_GUARD(rw)
-#undef USER_ACCESS_GUARD
+/* Cleanup wrapper functions */
+static __always_inline void __scoped_user_read_access_end(const void *p)
+{
+	user_read_access_end();
+};
+static __always_inline void __scoped_user_write_access_end(const void *p)
+{
+	user_write_access_end();
+};
+static __always_inline void __scoped_user_rw_access_end(const void *p)
+{
+	user_access_end();
+};
 
 /**
  * __scoped_user_access_begin - Start a scoped user access
@@ -750,13 +736,13 @@ USER_ACCESS_GUARD(rw)
  *
  * Don't use directly. Use scoped_masked_user_$MODE_access() instead.
  */
-#define __scoped_user_access(mode, uptr, size, elbl)					\
-for (bool done = false; !done; done = true)						\
-	for (void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \
-	     !done; done = true)							\
-		for (CLASS(user_##mode##_access, scope)(_tmpptr); !done; done = true)	\
-			/* Force modified pointer usage within the scope */		\
-			for (const typeof(uptr) uptr = _tmpptr; !done; done = true)
+#define __scoped_user_access(mode, uptr, size, elbl)				\
+for (bool done = false; !done; done = true)					\
+	for (auto _tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl);	\
+	     !done; done = true)						\
+		/* Force modified pointer usage within the scope */		\
+		for (const auto uptr  __cleanup(__scoped_user_##mode##_access_end) = \
+		     _tmpptr; !done; done = true)
 
 /**
  * scoped_user_read_access_size - Start a scoped user read access with given size
-- 
cgit v1.2.3


From a116bac87118903925108e57781bbfc7a7eea27b Mon Sep 17 00:00:00 2001
From: "Isaac J. Manjarres" <isaacmanjarres@google.com>
Date: Mon, 2 Mar 2026 16:23:09 -0800
Subject: dma-buf: Include ioctl.h in UAPI header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

include/uapi/linux/dma-buf.h uses several macros from ioctl.h to define
its ioctl commands. However, it does not include ioctl.h itself. So,
if userspace source code tries to include the dma-buf.h file without
including ioctl.h, it can result in build failures.

Therefore, include ioctl.h in the dma-buf UAPI header.

Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
Reviewed-by: T.J. Mercier <tjmercier@google.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Link: https://lore.kernel.org/r/20260303002309.1401849-1-isaacmanjarres@google.com
---
 include/uapi/linux/dma-buf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h
index 5a6fda66d9ad..e827c9d20c5d 100644
--- a/include/uapi/linux/dma-buf.h
+++ b/include/uapi/linux/dma-buf.h
@@ -20,6 +20,7 @@
 #ifndef _DMA_BUF_UAPI_H_
 #define _DMA_BUF_UAPI_H_
 
+#include <linux/ioctl.h>
 #include <linux/types.h>
 
 /**
-- 
cgit v1.2.3


From 479d589b40b836442bbdadc3fdb37f001bb67f26 Mon Sep 17 00:00:00 2001
From: Jiayuan Chen <jiayuan.chen@shopee.com>
Date: Thu, 26 Feb 2026 16:03:01 +0800
Subject: bpf/bonding: reject vlan+srcmac xmit_hash_policy change when XDP is
 loaded

bond_option_mode_set() already rejects mode changes that would make a
loaded XDP program incompatible via bond_xdp_check().  However,
bond_option_xmit_hash_policy_set() has no such guard.

For 802.3ad and balance-xor modes, bond_xdp_check() returns false when
xmit_hash_policy is vlan+srcmac, because the 802.1q payload is usually
absent due to hardware offload.  This means a user can:

1. Attach a native XDP program to a bond in 802.3ad/balance-xor mode
   with a compatible xmit_hash_policy (e.g. layer2+3).
2. Change xmit_hash_policy to vlan+srcmac while XDP remains loaded.

This leaves bond->xdp_prog set but bond_xdp_check() now returning false
for the same device.  When the bond is later destroyed, dev_xdp_uninstall()
calls bond_xdp_set(dev, NULL, NULL) to remove the program, which hits
the bond_xdp_check() guard and returns -EOPNOTSUPP, triggering:

WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL))

Fix this by rejecting xmit_hash_policy changes to vlan+srcmac when an
XDP program is loaded on a bond in 802.3ad or balance-xor mode.

commit 39a0876d595b ("net, bonding: Disallow vlan+srcmac with XDP")
introduced bond_xdp_check() which returns false for 802.3ad/balance-xor
modes when xmit_hash_policy is vlan+srcmac.  The check was wired into
bond_xdp_set() to reject XDP attachment with an incompatible policy, but
the symmetric path -- preventing xmit_hash_policy from being changed to an
incompatible value after XDP is already loaded -- was left unguarded in
bond_option_xmit_hash_policy_set().

Note:
commit 094ee6017ea0 ("bonding: check xdp prog when set bond mode")
later added a similar guard to bond_option_mode_set(), but
bond_option_xmit_hash_policy_set() remained unprotected.

Reported-by: syzbot+5a287bcdc08104bc3132@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/6995aff6.050a0220.2eeac1.014e.GAE@google.com/T/
Fixes: 39a0876d595b ("net, bonding: Disallow vlan+srcmac with XDP")
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
Link: https://patch.msgid.link/20260226080306.98766-2-jiayuan.chen@linux.dev
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/bonding/bond_main.c    | 9 +++++++--
 drivers/net/bonding/bond_options.c | 2 ++
 include/net/bonding.h              | 1 +
 3 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a1de08ee3815..14ed91391fcc 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -324,7 +324,7 @@ static bool bond_sk_check(struct bonding *bond)
 	}
 }
 
-bool bond_xdp_check(struct bonding *bond, int mode)
+bool __bond_xdp_check(int mode, int xmit_policy)
 {
 	switch (mode) {
 	case BOND_MODE_ROUNDROBIN:
@@ -335,7 +335,7 @@ bool bond_xdp_check(struct bonding *bond, int mode)
 		/* vlan+srcmac is not supported with XDP as in most cases the 802.1q
 		 * payload is not in the packet due to hardware offload.
 		 */
-		if (bond->params.xmit_policy != BOND_XMIT_POLICY_VLAN_SRCMAC)
+		if (xmit_policy != BOND_XMIT_POLICY_VLAN_SRCMAC)
 			return true;
 		fallthrough;
 	default:
@@ -343,6 +343,11 @@ bool bond_xdp_check(struct bonding *bond, int mode)
 	}
 }
 
+bool bond_xdp_check(struct bonding *bond, int mode)
+{
+	return __bond_xdp_check(mode, bond->params.xmit_policy);
+}
+
 /*---------------------------------- VLAN -----------------------------------*/
 
 /* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid,
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index dcee384c2f06..7380cc4ee75a 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -1575,6 +1575,8 @@ static int bond_option_fail_over_mac_set(struct bonding *bond,
 static int bond_option_xmit_hash_policy_set(struct bonding *bond,
 					    const struct bond_opt_value *newval)
 {
+	if (bond->xdp_prog && !__bond_xdp_check(BOND_MODE(bond), newval->value))
+		return -EOPNOTSUPP;
 	netdev_dbg(bond->dev, "Setting xmit hash policy to %s (%llu)\n",
 		   newval->string, newval->value);
 	bond->params.xmit_policy = newval->value;
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 4ad5521e7731..395c6e281c5f 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -699,6 +699,7 @@ void bond_debug_register(struct bonding *bond);
 void bond_debug_unregister(struct bonding *bond);
 void bond_debug_reregister(struct bonding *bond);
 const char *bond_mode_name(int mode);
+bool __bond_xdp_check(int mode, int xmit_policy);
 bool bond_xdp_check(struct bonding *bond, int mode);
 void bond_setup(struct net_device *bond_dev);
 unsigned int bond_get_num_tx_queues(void);
-- 
cgit v1.2.3


From 710f5c76580306cdb9ec51fac8fcf6a8faff7821 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 27 Feb 2026 17:26:03 +0000
Subject: indirect_call_wrapper: do not reevaluate function pointer

We have an increasing number of READ_ONCE(xxx->function)
combined with INDIRECT_CALL_[1234]() helpers.

Unfortunately this forces INDIRECT_CALL_[1234]() to read
xxx->function many times, which is not what we wanted.

Fix these macros so that xxx->function value is not reloaded.

$ scripts/bloat-o-meter -t vmlinux.0 vmlinux
add/remove: 0/0 grow/shrink: 1/65 up/down: 122/-1084 (-962)
Function                                     old     new   delta
ip_push_pending_frames                        59     181    +122
ip6_finish_output                            687     681      -6
__udp_enqueue_schedule_skb                  1078    1072      -6
ioam6_output                                2319    2312      -7
xfrm4_rcv_encap_finish2                       64      56      -8
xfrm4_output                                 297     289      -8
vrf_ip_local_out                             278     270      -8
vrf_ip6_local_out                            278     270      -8
seg6_input_finish                             64      56      -8
rpl_output                                   700     692      -8
ipmr_forward_finish                          124     116      -8
ip_forward_finish                            143     135      -8
ip6mr_forward2_finish                        100      92      -8
ip6_forward_finish                            73      65      -8
input_action_end_bpf                        1091    1083      -8
dst_input                                     52      44      -8
__xfrm6_output                               801     793      -8
__xfrm4_output                                83      75      -8
bpf_input                                    500     491      -9
__tcp_check_space                            530     521      -9
input_action_end_dt6                         291     280     -11
vti6_tnl_xmit                               1634    1622     -12
bpf_xmit                                    1203    1191     -12
rpl_input                                    497     483     -14
rawv6_send_hdrinc                           1355    1341     -14
ndisc_send_skb                              1030    1016     -14
ipv6_srh_rcv                                1377    1363     -14
ip_send_unicast_reply                       1253    1239     -14
ip_rcv_finish                                226     212     -14
ip6_rcv_finish                               300     286     -14
input_action_end_x_core                      205     191     -14
input_action_end_x                           355     341     -14
input_action_end_t                           205     191     -14
input_action_end_dx6_finish                  127     113     -14
input_action_end_dx4_finish                  373     359     -14
input_action_end_dt4                         426     412     -14
input_action_end_core                        186     172     -14
input_action_end_b6_encap                    292     278     -14
input_action_end_b6                          198     184     -14
igmp6_send                                  1332    1318     -14
ip_sublist_rcv                               864     848     -16
ip6_sublist_rcv                             1091    1075     -16
ipv6_rpl_srh_rcv                            1937    1920     -17
xfrm_policy_queue_process                   1246    1228     -18
seg6_output_core                             903     885     -18
mld_sendpack                                 856     836     -20
NF_HOOK                                      756     736     -20
vti_tunnel_xmit                             1447    1426     -21
input_action_end_dx6                         664     642     -22
input_action_end                            1502    1480     -22
sock_sendmsg_nosec                           134     111     -23
ip6mr_forward2                               388     364     -24
sock_recvmsg_nosec                           134     109     -25
seg6_input_core                              836     810     -26
ip_send_skb                                  172     146     -26
ip_local_out                                 140     114     -26
ip6_local_out                                140     114     -26
__sock_sendmsg                               162     136     -26
__ip_queue_xmit                             1196    1170     -26
__ip_finish_output                           405     379     -26
ipmr_queue_fwd_xmit                          373     346     -27
sock_recvmsg                                 173     145     -28
ip6_xmit                                    1635    1607     -28
xfrm_output_resume                          1418    1389     -29
ip_build_and_send_pkt                        625     591     -34
dst_output                                   504     432     -72
Total: Before=25217686, After=25216724, chg -0.00%

Fixes: 283c16a2dfd3 ("indirect call wrappers: helpers to speed-up indirect calls of builtin")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20260227172603.1700433-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/indirect_call_wrapper.h | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h
index 35227d47cfc9..dc272b514a01 100644
--- a/include/linux/indirect_call_wrapper.h
+++ b/include/linux/indirect_call_wrapper.h
@@ -16,22 +16,26 @@
  */
 #define INDIRECT_CALL_1(f, f1, ...)					\
 	({								\
-		likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__);	\
+		typeof(f) __f1 = (f);					\
+		likely(__f1 == f1) ? f1(__VA_ARGS__) : __f1(__VA_ARGS__);	\
 	})
 #define INDIRECT_CALL_2(f, f2, f1, ...)					\
 	({								\
-		likely(f == f2) ? f2(__VA_ARGS__) :			\
-				  INDIRECT_CALL_1(f, f1, __VA_ARGS__);	\
+		typeof(f) __f2 = (f);					\
+		likely(__f2 == f2) ? f2(__VA_ARGS__) :			\
+				  INDIRECT_CALL_1(__f2, f1, __VA_ARGS__);	\
 	})
 #define INDIRECT_CALL_3(f, f3, f2, f1, ...)					\
 	({									\
-		likely(f == f3) ? f3(__VA_ARGS__) :				\
-				  INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__);	\
+		typeof(f) __f3 = (f);						\
+		likely(__f3 == f3) ? f3(__VA_ARGS__) :				\
+				  INDIRECT_CALL_2(__f3, f2, f1, __VA_ARGS__);	\
 	})
 #define INDIRECT_CALL_4(f, f4, f3, f2, f1, ...)					\
 	({									\
-		likely(f == f4) ? f4(__VA_ARGS__) :				\
-				  INDIRECT_CALL_3(f, f3, f2, f1, __VA_ARGS__);	\
+		typeof(f) __f4 = (f);						\
+		likely(__f4 == f4) ? f4(__VA_ARGS__) :				\
+				  INDIRECT_CALL_3(__f4, f3, f2, f1, __VA_ARGS__);	\
 	})
 
 #define INDIRECT_CALLABLE_DECLARE(f)	f
-- 
cgit v1.2.3


From 9de68394a61528d40f575c3e6719cc75c56f62c3 Mon Sep 17 00:00:00 2001
From: Danilo Krummrich <dakr@kernel.org>
Date: Mon, 2 Mar 2026 01:25:44 +0100
Subject: Revert "driver core: enforce device_lock for driver_match_device()"

This reverts commit dc23806a7c47 ("driver core: enforce device_lock for
driver_match_device()") and commit 289b14592cef ("driver core: fix
inverted "locked" suffix of driver_match_device()").

While technically correct, there is a major downside to this approach:

When a device is already present in the system and a driver is
registered on the same bus, we iterate over all devices registered on
this bus to see if one of them matches. If we come across an already
bound one where the corresponding driver crashed while holding the
device lock (e.g. in probe()) we can't make any progress anymore.

However, drivers are typically the least tested code in the kernel and
hence it is a case that is likely to happen regularly. Besides hurting
developer ergonomics, it potentially decreases chances of shutting
things down cleanly and obtaining logs in production environments as
well [1].

This came up in the context of a firewire bug, which only in combination
with the reverted commit, caused the machine to hang [2]. Additionally,
it was observed in [3].

Thus, revert commit dc23806a7c47 ("driver core: enforce device_lock for
driver_match_device()") and add a brief note clarifying that an
implementer of struct bus_type must not expect match() to be called with
the device lock held.

Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [1]
Link: https://lore.kernel.org/all/67f655bb-4d81-4609-b008-68d200255dd2@davidgow.net/ [2]
Link: https://lore.kernel.org/lkml/CALbr=LZ4v7N=tO1vgOsyj9AS+XuNbn6kG-QcF+PacdMjSo0iyw@mail.gmail.com/ [3]
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Closes: https://lore.kernel.org/driver-core/CAHk-=wgJ_L1C=HjcYJotg_zrZEmiLFJaoic+PWthjuQrutrfJw@mail.gmail.com/
Reviewed-by: Gui-Dong Han <hanguidong02@gmail.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/20260302002545.19389-1-dakr@kernel.org
[ Add additional Link: reference. - Danilo ]
Signed-off-by: Danilo Krummrich <dakr@kernel.org>
---
 drivers/base/base.h        | 11 +----------
 drivers/base/dd.c          |  2 +-
 include/linux/device/bus.h |  2 ++
 3 files changed, 4 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/base/base.h b/drivers/base/base.h
index 79d031d2d845..1af95ac68b77 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -179,19 +179,10 @@ void device_release_driver_internal(struct device *dev, const struct device_driv
 void driver_detach(const struct device_driver *drv);
 void driver_deferred_probe_del(struct device *dev);
 void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf);
-static inline int driver_match_device_locked(const struct device_driver *drv,
-					     struct device *dev)
-{
-	device_lock_assert(dev);
-
-	return drv->bus->match ? drv->bus->match(dev, drv) : 1;
-}
-
 static inline int driver_match_device(const struct device_driver *drv,
 				      struct device *dev)
 {
-	guard(device)(dev);
-	return driver_match_device_locked(drv, dev);
+	return drv->bus->match ? drv->bus->match(dev, drv) : 1;
 }
 
 static inline void dev_sync_state(struct device *dev)
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 0354f209529c..bea8da5f8a3a 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -928,7 +928,7 @@ static int __device_attach_driver(struct device_driver *drv, void *_data)
 	bool async_allowed;
 	int ret;
 
-	ret = driver_match_device_locked(drv, dev);
+	ret = driver_match_device(drv, dev);
 	if (ret == 0) {
 		/* no match */
 		return 0;
diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index 99c3c83ea520..63de5f053c33 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -35,6 +35,8 @@ struct fwnode_handle;
  *		otherwise. It may also return error code if determining that
  *		the driver supports the device is not possible. In case of
  *		-EPROBE_DEFER it will queue the device for deferred probing.
+ *		Note: This callback may be invoked with or without the device
+ *		lock held.
  * @uevent:	Called when a device is added, removed, or a few other things
  *		that generate uevents to add the environment variables.
  * @probe:	Called when a new device or driver add to this bus, and callback
-- 
cgit v1.2.3


From 4ee7fa6cf78ff26d783d39e2949d14c4c1cd5e7f Mon Sep 17 00:00:00 2001
From: Yung Chih Su <yuuchihsu@gmail.com>
Date: Mon, 2 Mar 2026 14:02:47 +0800
Subject: net: ipv4: fix ARM64 alignment fault in multipath hash seed

`struct sysctl_fib_multipath_hash_seed` contains two u32 fields
(user_seed and mp_seed), making it an 8-byte structure with a 4-byte
alignment requirement.

In `fib_multipath_hash_from_keys()`, the code evaluates the entire
struct atomically via `READ_ONCE()`:

    mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed;

While this silently works on GCC by falling back to unaligned regular
loads which the ARM64 kernel tolerates, it causes a fatal kernel panic
when compiled with Clang and LTO enabled.

Commit e35123d83ee3 ("arm64: lto: Strengthen READ_ONCE() to acquire
when CONFIG_LTO=y") strengthens `READ_ONCE()` to use Load-Acquire
instructions (`ldar` / `ldapr`) to prevent compiler reordering bugs
under Clang LTO. Since the macro evaluates the full 8-byte struct,
Clang emits a 64-bit `ldar` instruction. ARM64 architecture strictly
requires `ldar` to be naturally aligned, thus executing it on a 4-byte
aligned address triggers a strict Alignment Fault (FSC = 0x21).

Fix the read side by moving the `READ_ONCE()` directly to the `u32`
member, which emits a safe 32-bit `ldar Wn`.

Furthermore, Eric Dumazet pointed out that `WRITE_ONCE()` on the entire
struct in `proc_fib_multipath_hash_set_seed()` is also flawed. Analysis
shows that Clang splits this 8-byte write into two separate 32-bit
`str` instructions. While this avoids an alignment fault, it destroys
atomicity and exposes a tear-write vulnerability. Fix this by
explicitly splitting the write into two 32-bit `WRITE_ONCE()`
operations.

Finally, add the missing `READ_ONCE()` when reading `user_seed` in
`proc_fib_multipath_hash_seed()` to ensure proper pairing and
concurrency safety.

Fixes: 4ee2a8cace3f ("net: ipv4: Add a sysctl to set multipath hash seed")
Signed-off-by: Yung Chih Su <yuuchihsu@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20260302060247.7066-1-yuuchihsu@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip_fib.h       | 2 +-
 net/ipv4/sysctl_net_ipv4.c | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index b4495c38e0a0..318593743b6e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -559,7 +559,7 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net,
 	siphash_aligned_key_t hash_key;
 	u32 mp_seed;
 
-	mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed;
+	mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.mp_seed);
 	fib_multipath_hash_construct_key(&hash_key, mp_seed);
 
 	return flow_hash_from_keys_seed(keys, &hash_key);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 643763bc2142..5654cc9c8a0b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -486,7 +486,8 @@ static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed)
 			    proc_fib_multipath_hash_rand_seed),
 	};
 
-	WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed, new);
+	WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.user_seed, new.user_seed);
+	WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.mp_seed, new.mp_seed);
 }
 
 static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write,
@@ -500,7 +501,7 @@ static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write
 	int ret;
 
 	mphs = &net->ipv4.sysctl_fib_multipath_hash_seed;
-	user_seed = mphs->user_seed;
+	user_seed = READ_ONCE(mphs->user_seed);
 
 	tmp = *table;
 	tmp.data = &user_seed;
-- 
cgit v1.2.3


From e39bb9e02b68942f8e9359d2a3efe7d37ae6be0e Mon Sep 17 00:00:00 2001
From: Qing Wang <wangqing7171@gmail.com>
Date: Fri, 27 Feb 2026 10:58:42 +0800
Subject: tracing: Fix WARN_ON in tracing_buffers_mmap_close

When a process forks, the child process copies the parent's VMAs but the
user_mapped reference count is not incremented. As a result, when both the
parent and child processes exit, tracing_buffers_mmap_close() is called
twice. On the second call, user_mapped is already 0, causing the function to
return -ENODEV and triggering a WARN_ON.

Normally, this isn't an issue as the memory is mapped with VM_DONTCOPY set.
But this is only a hint, and the application can call
madvise(MADVISE_DOFORK) which resets the VM_DONTCOPY flag. When the
application does that, it can trigger this issue on fork.

Fix it by incrementing the user_mapped reference count without re-mapping
the pages in the VMA's open callback.

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Link: https://patch.msgid.link/20260227025842.1085206-1-wangqing7171@gmail.com
Fixes: cf9f0f7c4c5bb ("tracing: Allow user-space mapping of the ring-buffer")
Reported-by: syzbot+3b5dd2030fe08afdf65d@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=3b5dd2030fe08afdf65d
Tested-by: syzbot+3b5dd2030fe08afdf65d@syzkaller.appspotmail.com
Signed-off-by: Qing Wang <wangqing7171@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  1 +
 kernel/trace/ring_buffer.c  | 21 +++++++++++++++++++++
 kernel/trace/trace.c        | 13 +++++++++++++
 3 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 876358cfe1b1..d862fa610270 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -248,6 +248,7 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node);
 
 int ring_buffer_map(struct trace_buffer *buffer, int cpu,
 		    struct vm_area_struct *vma);
+void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu);
 int ring_buffer_unmap(struct trace_buffer *buffer, int cpu);
 int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu);
 #endif /* _LINUX_RING_BUFFER_H */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f16f053ef77d..17d0ea0cc3e6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7310,6 +7310,27 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
 	return err;
 }
 
+/*
+ * This is called when a VMA is duplicated (e.g., on fork()) to increment
+ * the user_mapped counter without remapping pages.
+ */
+void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+
+	if (WARN_ON(!cpumask_test_cpu(cpu, buffer->cpumask)))
+		return;
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	guard(mutex)(&cpu_buffer->mapping_lock);
+
+	if (cpu_buffer->user_mapped)
+		__rb_inc_dec_mapped(cpu_buffer, true);
+	else
+		WARN(1, "Unexpected buffer stat, it should be mapped");
+}
+
 int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 23de3719f495..1e7c032a72d2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8213,6 +8213,18 @@ static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
 static inline void put_snapshot_map(struct trace_array *tr) { }
 #endif
 
+/*
+ * This is called when a VMA is duplicated (e.g., on fork()) to increment
+ * the user_mapped counter without remapping pages.
+ */
+static void tracing_buffers_mmap_open(struct vm_area_struct *vma)
+{
+	struct ftrace_buffer_info *info = vma->vm_file->private_data;
+	struct trace_iterator *iter = &info->iter;
+
+	ring_buffer_map_dup(iter->array_buffer->buffer, iter->cpu_file);
+}
+
 static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
 {
 	struct ftrace_buffer_info *info = vma->vm_file->private_data;
@@ -8232,6 +8244,7 @@ static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long a
 }
 
 static const struct vm_operations_struct tracing_buffers_vmops = {
+	.open		= tracing_buffers_mmap_open,
 	.close		= tracing_buffers_mmap_close,
 	.may_split      = tracing_buffers_may_split,
 };
-- 
cgit v1.2.3


From ace7dcc8181373a0338efa1686c5e36eb121dff2 Mon Sep 17 00:00:00 2001
From: Jouni Högander <jouni.hogander@intel.com>
Date: Wed, 25 Feb 2026 09:42:20 +0200
Subject: drm/dp: Add definition for Panel Replay full-line granularity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DP specification is saying value 0xff 0xff in PANEL REPLAY SELECTIVE UPDATE
X GRANULARITY CAPABILITY registers (0xb2 and 0xb3) means full-line
granularity. Add definition for this.

Cc: dri-devel@lists.freedesktop.org
Signed-off-by: Jouni Högander <jouni.hogander@intel.com>
Reviewed-by: Uma Shankar <uma.shankar@intel.com>
Acked-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Link: https://patch.msgid.link/20260225074221.1744330-1-jouni.hogander@intel.com
(cherry picked from commit b93311673263bb98a200ab1cb6304f969bdada5c)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 include/drm/display/drm_dp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/drm/display/drm_dp.h b/include/drm/display/drm_dp.h
index e4eebabab975..8b15d3eeb716 100644
--- a/include/drm/display/drm_dp.h
+++ b/include/drm/display/drm_dp.h
@@ -571,6 +571,8 @@
 # define DP_PANEL_REPLAY_LINK_OFF_SUPPORTED_IN_PR_AFTER_ADAPTIVE_SYNC_SDP	(1 << 7)
 
 #define DP_PANEL_REPLAY_CAP_X_GRANULARITY		0xb2
+# define DP_PANEL_REPLAY_FULL_LINE_GRANULARITY		0xffff
+
 #define DP_PANEL_REPLAY_CAP_Y_GRANULARITY		0xb4
 
 /* Link Configuration */
-- 
cgit v1.2.3


From 82169dace41cbaa951341b0f80f4570be3b2dec0 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 18 Feb 2026 10:52:04 +0100
Subject: xenbus: add xenbus_device parameter to xenbus_read_driver_state()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to prepare checking the xenbus device status in
xenbus_read_driver_state(), add the pointer to struct xenbus_device
as a parameter.

Tested-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: "Martin K. Petersen" <martin.petersen@oracle.com> # SCSI
Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>	# drivers/pci/xen-pcifront.c
Signed-off-by: Juergen Gross <jgross@suse.com>
Message-ID: <20260218095205.453657-2-jgross@suse.com>
---
 drivers/net/xen-netfront.c                 | 34 +++++++++++++++---------------
 drivers/pci/xen-pcifront.c                 |  8 +++----
 drivers/scsi/xen-scsifront.c               |  2 +-
 drivers/xen/xen-pciback/xenbus.c           | 10 ++++-----
 drivers/xen/xenbus/xenbus_client.c         |  4 +++-
 drivers/xen/xenbus/xenbus_probe.c          |  6 +++---
 drivers/xen/xenbus/xenbus_probe_frontend.c |  2 +-
 include/xen/xenbus.h                       |  3 ++-
 8 files changed, 36 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 7c2220366623..e2da977c9c50 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1646,7 +1646,7 @@ static int xennet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 
 	/* avoid the race with XDP headroom adjustment */
 	wait_event(module_wq,
-		   xenbus_read_driver_state(np->xbdev->otherend) ==
+		   xenbus_read_driver_state(np->xbdev, np->xbdev->otherend) ==
 		   XenbusStateReconfigured);
 	np->netfront_xdp_enabled = true;
 
@@ -1764,9 +1764,9 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev)
 	do {
 		xenbus_switch_state(dev, XenbusStateInitialising);
 		err = wait_event_timeout(module_wq,
-				 xenbus_read_driver_state(dev->otherend) !=
+				 xenbus_read_driver_state(dev, dev->otherend) !=
 				 XenbusStateClosed &&
-				 xenbus_read_driver_state(dev->otherend) !=
+				 xenbus_read_driver_state(dev, dev->otherend) !=
 				 XenbusStateUnknown, XENNET_TIMEOUT);
 	} while (!err);
 
@@ -2627,31 +2627,31 @@ static void xennet_bus_close(struct xenbus_device *dev)
 {
 	int ret;
 
-	if (xenbus_read_driver_state(dev->otherend) == XenbusStateClosed)
+	if (xenbus_read_driver_state(dev, dev->otherend) == XenbusStateClosed)
 		return;
 	do {
 		xenbus_switch_state(dev, XenbusStateClosing);
 		ret = wait_event_timeout(module_wq,
-				   xenbus_read_driver_state(dev->otherend) ==
-				   XenbusStateClosing ||
-				   xenbus_read_driver_state(dev->otherend) ==
-				   XenbusStateClosed ||
-				   xenbus_read_driver_state(dev->otherend) ==
-				   XenbusStateUnknown,
-				   XENNET_TIMEOUT);
+				xenbus_read_driver_state(dev, dev->otherend) ==
+				XenbusStateClosing ||
+				xenbus_read_driver_state(dev, dev->otherend) ==
+				XenbusStateClosed ||
+				xenbus_read_driver_state(dev, dev->otherend) ==
+				XenbusStateUnknown,
+				XENNET_TIMEOUT);
 	} while (!ret);
 
-	if (xenbus_read_driver_state(dev->otherend) == XenbusStateClosed)
+	if (xenbus_read_driver_state(dev, dev->otherend) == XenbusStateClosed)
 		return;
 
 	do {
 		xenbus_switch_state(dev, XenbusStateClosed);
 		ret = wait_event_timeout(module_wq,
-				   xenbus_read_driver_state(dev->otherend) ==
-				   XenbusStateClosed ||
-				   xenbus_read_driver_state(dev->otherend) ==
-				   XenbusStateUnknown,
-				   XENNET_TIMEOUT);
+				xenbus_read_driver_state(dev, dev->otherend) ==
+				XenbusStateClosed ||
+				xenbus_read_driver_state(dev, dev->otherend) ==
+				XenbusStateUnknown,
+				XENNET_TIMEOUT);
 	} while (!ret);
 }
 
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 11636634ae51..cd22bf984024 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -856,7 +856,7 @@ static void pcifront_try_connect(struct pcifront_device *pdev)
 	int err;
 
 	/* Only connect once */
-	if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+	if (xenbus_read_driver_state(pdev->xdev, pdev->xdev->nodename) !=
 	    XenbusStateInitialised)
 		return;
 
@@ -876,7 +876,7 @@ static int pcifront_try_disconnect(struct pcifront_device *pdev)
 	enum xenbus_state prev_state;
 
 
-	prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
+	prev_state = xenbus_read_driver_state(pdev->xdev, pdev->xdev->nodename);
 
 	if (prev_state >= XenbusStateClosing)
 		goto out;
@@ -895,7 +895,7 @@ out:
 
 static void pcifront_attach_devices(struct pcifront_device *pdev)
 {
-	if (xenbus_read_driver_state(pdev->xdev->nodename) ==
+	if (xenbus_read_driver_state(pdev->xdev, pdev->xdev->nodename) ==
 	    XenbusStateReconfiguring)
 		pcifront_connect(pdev);
 }
@@ -909,7 +909,7 @@ static int pcifront_detach_devices(struct pcifront_device *pdev)
 	struct pci_dev *pci_dev;
 	char str[64];
 
-	state = xenbus_read_driver_state(pdev->xdev->nodename);
+	state = xenbus_read_driver_state(pdev->xdev, pdev->xdev->nodename);
 	if (state == XenbusStateInitialised) {
 		dev_dbg(&pdev->xdev->dev, "Handle skipped connect.\n");
 		/* We missed Connected and need to initialize. */
diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c
index 924025305753..ef74d4da5ab0 100644
--- a/drivers/scsi/xen-scsifront.c
+++ b/drivers/scsi/xen-scsifront.c
@@ -1175,7 +1175,7 @@ static void scsifront_backend_changed(struct xenbus_device *dev,
 			return;
 		}
 
-		if (xenbus_read_driver_state(dev->nodename) ==
+		if (xenbus_read_driver_state(dev, dev->nodename) ==
 		    XenbusStateInitialised)
 			scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN);
 
diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c
index b11e401f1b1e..4bd1c7a8957e 100644
--- a/drivers/xen/xen-pciback/xenbus.c
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -149,12 +149,12 @@ static int xen_pcibk_attach(struct xen_pcibk_device *pdev)
 
 	mutex_lock(&pdev->dev_lock);
 	/* Make sure we only do this setup once */
-	if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+	if (xenbus_read_driver_state(pdev->xdev, pdev->xdev->nodename) !=
 	    XenbusStateInitialised)
 		goto out;
 
 	/* Wait for frontend to state that it has published the configuration */
-	if (xenbus_read_driver_state(pdev->xdev->otherend) !=
+	if (xenbus_read_driver_state(pdev->xdev, pdev->xdev->otherend) !=
 	    XenbusStateInitialised)
 		goto out;
 
@@ -374,7 +374,7 @@ static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev,
 	dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
 
 	mutex_lock(&pdev->dev_lock);
-	if (xenbus_read_driver_state(pdev->xdev->nodename) != state)
+	if (xenbus_read_driver_state(pdev->xdev, pdev->xdev->nodename) != state)
 		goto out;
 
 	err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
@@ -572,7 +572,7 @@ static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev)
 	/* It's possible we could get the call to setup twice, so make sure
 	 * we're not already connected.
 	 */
-	if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+	if (xenbus_read_driver_state(pdev->xdev, pdev->xdev->nodename) !=
 	    XenbusStateInitWait)
 		goto out;
 
@@ -662,7 +662,7 @@ static void xen_pcibk_be_watch(struct xenbus_watch *watch,
 	struct xen_pcibk_device *pdev =
 	    container_of(watch, struct xen_pcibk_device, be_watch);
 
-	switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
+	switch (xenbus_read_driver_state(pdev->xdev, pdev->xdev->nodename)) {
 	case XenbusStateInitWait:
 		xen_pcibk_setup_backend(pdev);
 		break;
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 2dc874fb5506..e502435145ae 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -931,12 +931,14 @@ static int xenbus_unmap_ring_hvm(struct xenbus_device *dev, void *vaddr)
 
 /**
  * xenbus_read_driver_state - read state from a store path
+ * @dev: xenbus device pointer
  * @path: path for driver
  *
  * Returns: the state of the driver rooted at the given store path, or
  * XenbusStateUnknown if no state can be read.
  */
-enum xenbus_state xenbus_read_driver_state(const char *path)
+enum xenbus_state xenbus_read_driver_state(const struct xenbus_device *dev,
+					   const char *path)
 {
 	enum xenbus_state result;
 	int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 9f9011cd7447..2eed06ba5d38 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -191,7 +191,7 @@ void xenbus_otherend_changed(struct xenbus_watch *watch,
 		return;
 	}
 
-	state = xenbus_read_driver_state(dev->otherend);
+	state = xenbus_read_driver_state(dev, dev->otherend);
 
 	dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n",
 		state, xenbus_strstate(state), dev->otherend_watch.node, path);
@@ -364,7 +364,7 @@ void xenbus_dev_remove(struct device *_dev)
 	 * closed.
 	 */
 	if (!drv->allow_rebind ||
-	    xenbus_read_driver_state(dev->nodename) == XenbusStateClosing)
+	    xenbus_read_driver_state(dev, dev->nodename) == XenbusStateClosing)
 		xenbus_switch_state(dev, XenbusStateClosed);
 }
 EXPORT_SYMBOL_GPL(xenbus_dev_remove);
@@ -514,7 +514,7 @@ int xenbus_probe_node(struct xen_bus_type *bus,
 	size_t stringlen;
 	char *tmpstring;
 
-	enum xenbus_state state = xenbus_read_driver_state(nodename);
+	enum xenbus_state state = xenbus_read_driver_state(NULL, nodename);
 
 	if (state != XenbusStateInitialising) {
 		/* Device is not new, so ignore it.  This can happen if a
diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c
index f04707d1f667..ca04609730df 100644
--- a/drivers/xen/xenbus/xenbus_probe_frontend.c
+++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
@@ -253,7 +253,7 @@ static int print_device_status(struct device *dev, void *data)
 	} else if (xendev->state < XenbusStateConnected) {
 		enum xenbus_state rstate = XenbusStateUnknown;
 		if (xendev->otherend)
-			rstate = xenbus_read_driver_state(xendev->otherend);
+			rstate = xenbus_read_driver_state(xendev, xendev->otherend);
 		pr_warn("Timeout connecting to device: %s (local state %d, remote state %d)\n",
 			xendev->nodename, xendev->state, rstate);
 	}
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index c94caf852aea..15319da65b7f 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -228,7 +228,8 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
 int xenbus_alloc_evtchn(struct xenbus_device *dev, evtchn_port_t *port);
 int xenbus_free_evtchn(struct xenbus_device *dev, evtchn_port_t port);
 
-enum xenbus_state xenbus_read_driver_state(const char *path);
+enum xenbus_state xenbus_read_driver_state(const struct xenbus_device *dev,
+					   const char *path);
 
 __printf(3, 4)
 void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...);
-- 
cgit v1.2.3


From e2dcf9065536ab4a1b00828ff0d19f7d282dfecc Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 18 Feb 2026 10:52:05 +0100
Subject: xen/xenbus: better handle backend crash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the backend domain crashes, coordinated device cleanup is not
possible (as it involves waiting for the backend state change). In that
case, toolstack forcefully removes frontend xenstore entries.
xenbus_dev_changed() handles this case, and triggers device cleanup.
It's possible that toolstack manages to connect new device in that
place, before xenbus_dev_changed() notices the old one is missing. If
that happens, new one won't be probed and will forever remain in
XenbusStateInitialising.

Fix this by checking the frontend's state in Xenstore. In case it has
been reset to XenbusStateInitialising by Xen tools, consider this
being the result of an unplug+plug operation.

It's important that cleanup on such unplug doesn't modify Xenstore
entries (especially the "state" key) as it belong to the new device
to be probed - changing it would derail establishing connection to the
new backend (most likely, closing the device before it was even
connected). Handle this case by setting new xenbus_device->vanished
flag to true, and check it before changing state entry.

And even if xenbus_dev_changed() correctly detects the device was
forcefully removed, the cleanup handling is still racy. Since this whole
handling doesn't happened in a single Xenstore transaction, it's possible
that toolstack might put a new device there already. Avoid re-creating
the state key (which in the case of loosing the race would actually
close newly attached device).

The problem does not apply to frontend domain crash, as this case
involves coordinated cleanup.

Problem originally reported at
https://lore.kernel.org/xen-devel/aOZvivyZ9YhVWDLN@mail-itl/T/#t,
including reproduction steps.

Based-on-patch-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
Tested-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Message-ID: <20260218095205.453657-3-jgross@suse.com>
---
 drivers/xen/xenbus/xenbus_client.c | 13 +++++++++++--
 drivers/xen/xenbus/xenbus_probe.c  | 36 ++++++++++++++++++++++++++++++++++++
 include/xen/xenbus.h               |  1 +
 3 files changed, 48 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index e502435145ae..19672b08a680 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -226,8 +226,9 @@ __xenbus_switch_state(struct xenbus_device *dev,
 	struct xenbus_transaction xbt;
 	int current_state;
 	int err, abort;
+	bool vanished = false;
 
-	if (state == dev->state)
+	if (state == dev->state || dev->vanished)
 		return 0;
 
 again:
@@ -242,6 +243,10 @@ again:
 	err = xenbus_scanf(xbt, dev->nodename, "state", "%d", &current_state);
 	if (err != 1)
 		goto abort;
+	if (current_state != dev->state && current_state == XenbusStateInitialising) {
+		vanished = true;
+		goto abort;
+	}
 
 	err = xenbus_printf(xbt, dev->nodename, "state", "%d", state);
 	if (err) {
@@ -256,7 +261,7 @@ abort:
 		if (err == -EAGAIN && !abort)
 			goto again;
 		xenbus_switch_fatal(dev, depth, err, "ending transaction");
-	} else
+	} else if (!vanished)
 		dev->state = state;
 
 	return 0;
@@ -941,6 +946,10 @@ enum xenbus_state xenbus_read_driver_state(const struct xenbus_device *dev,
 					   const char *path)
 {
 	enum xenbus_state result;
+
+	if (dev && dev->vanished)
+		return XenbusStateUnknown;
+
 	int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
 	if (err)
 		result = XenbusStateUnknown;
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 2eed06ba5d38..eb260eceb4d2 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -444,6 +444,9 @@ static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
 		info.dev = NULL;
 		bus_for_each_dev(bus, NULL, &info, cleanup_dev);
 		if (info.dev) {
+			dev_warn(&info.dev->dev,
+				 "device forcefully removed from xenstore\n");
+			info.dev->vanished = true;
 			device_unregister(&info.dev->dev);
 			put_device(&info.dev->dev);
 		}
@@ -659,6 +662,39 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
 		return;
 
 	dev = xenbus_device_find(root, &bus->bus);
+	/*
+	 * Backend domain crash results in not coordinated frontend removal,
+	 * without going through XenbusStateClosing. If this is a new instance
+	 * of the same device Xen tools will have reset the state to
+	 * XenbusStateInitializing.
+	 * It might be that the backend crashed early during the init phase of
+	 * device setup, in which case the known state would have been
+	 * XenbusStateInitializing. So test the backend domid to match the
+	 * saved one. In case the new backend happens to have the same domid as
+	 * the old one, we can just carry on, as there is no inconsistency
+	 * resulting in this case.
+	 */
+	if (dev && !strcmp(bus->root, "device")) {
+		enum xenbus_state state = xenbus_read_driver_state(dev, dev->nodename);
+		unsigned int backend = xenbus_read_unsigned(root, "backend-id",
+							    dev->otherend_id);
+
+		if (state == XenbusStateInitialising &&
+		    (state != dev->state || backend != dev->otherend_id)) {
+			/*
+			 * State has been reset, assume the old one vanished
+			 * and new one needs to be probed.
+			 */
+			dev_warn(&dev->dev,
+				 "state reset occurred, reconnecting\n");
+			dev->vanished = true;
+		}
+		if (dev->vanished) {
+			device_unregister(&dev->dev);
+			put_device(&dev->dev);
+			dev = NULL;
+		}
+	}
 	if (!dev)
 		xenbus_probe_node(bus, type, root);
 	else
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index 15319da65b7f..8ca15743af7f 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -80,6 +80,7 @@ struct xenbus_device {
 	const char *devicetype;
 	const char *nodename;
 	const char *otherend;
+	bool vanished;
 	int otherend_id;
 	struct xenbus_watch otherend_watch;
 	struct device dev;
-- 
cgit v1.2.3


From 7f083faf59d14c04e01ec05a7507f036c965acf8 Mon Sep 17 00:00:00 2001
From: Koichiro Den <den@valinux.co.jp>
Date: Sat, 28 Feb 2026 23:53:07 +0900
Subject: net: sched: avoid qdisc_reset_all_tx_gt() vs dequeue race for
 lockless qdiscs

When shrinking the number of real tx queues,
netif_set_real_num_tx_queues() calls qdisc_reset_all_tx_gt() to flush
qdiscs for queues which will no longer be used.

qdisc_reset_all_tx_gt() currently serializes qdisc_reset() with
qdisc_lock(). However, for lockless qdiscs, the dequeue path is
serialized by qdisc_run_begin/end() using qdisc->seqlock instead, so
qdisc_reset() can run concurrently with __qdisc_run() and free skbs
while they are still being dequeued, leading to UAF.

This can easily be reproduced on e.g. virtio-net by imposing heavy
traffic while frequently changing the number of queue pairs:

  iperf3 -ub0 -c $peer -t 0 &
  while :; do
    ethtool -L eth0 combined 1
    ethtool -L eth0 combined 2
  done

With KASAN enabled, this leads to reports like:

  BUG: KASAN: slab-use-after-free in __qdisc_run+0x133f/0x1760
  ...
  Call Trace:
   <TASK>
   ...
   __qdisc_run+0x133f/0x1760
   __dev_queue_xmit+0x248f/0x3550
   ip_finish_output2+0xa42/0x2110
   ip_output+0x1a7/0x410
   ip_send_skb+0x2e6/0x480
   udp_send_skb+0xb0a/0x1590
   udp_sendmsg+0x13c9/0x1fc0
   ...
   </TASK>

  Allocated by task 1270 on cpu 5 at 44.558414s:
   ...
   alloc_skb_with_frags+0x84/0x7c0
   sock_alloc_send_pskb+0x69a/0x830
   __ip_append_data+0x1b86/0x48c0
   ip_make_skb+0x1e8/0x2b0
   udp_sendmsg+0x13a6/0x1fc0
   ...

  Freed by task 1306 on cpu 3 at 44.558445s:
   ...
   kmem_cache_free+0x117/0x5e0
   pfifo_fast_reset+0x14d/0x580
   qdisc_reset+0x9e/0x5f0
   netif_set_real_num_tx_queues+0x303/0x840
   virtnet_set_channels+0x1bf/0x260 [virtio_net]
   ethnl_set_channels+0x684/0xae0
   ethnl_default_set_doit+0x31a/0x890
   ...

Serialize qdisc_reset_all_tx_gt() against the lockless dequeue path by
taking qdisc->seqlock for TCQ_F_NOLOCK qdiscs, matching the
serialization model already used by dev_reset_queue().

Additionally clear QDISC_STATE_NON_EMPTY after reset so the qdisc state
reflects an empty queue, avoiding needless re-scheduling.

Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
Signed-off-by: Koichiro Den <den@valinux.co.jp>
Link: https://patch.msgid.link/20260228145307.3955532-1-den@valinux.co.jp
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sch_generic.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index c3a7268b567e..d5d55cb21686 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -778,13 +778,23 @@ static inline bool skb_skip_tc_classify(struct sk_buff *skb)
 static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
 {
 	struct Qdisc *qdisc;
+	bool nolock;
 
 	for (; i < dev->num_tx_queues; i++) {
 		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
 		if (qdisc) {
+			nolock = qdisc->flags & TCQ_F_NOLOCK;
+
+			if (nolock)
+				spin_lock_bh(&qdisc->seqlock);
 			spin_lock_bh(qdisc_lock(qdisc));
 			qdisc_reset(qdisc);
 			spin_unlock_bh(qdisc_lock(qdisc));
+			if (nolock) {
+				clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
+				clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
+				spin_unlock_bh(&qdisc->seqlock);
+			}
 		}
 	}
 }
-- 
cgit v1.2.3


From 165573e41f2f66ef98940cf65f838b2cb575d9d1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 2 Mar 2026 20:55:27 +0000
Subject: tcp: secure_seq: add back ports to TS offset

This reverts 28ee1b746f49 ("secure_seq: downgrade to per-host timestamp offsets")

tcp_tw_recycle went away in 2017.

Zhouyan Deng reported off-path TCP source port leakage via
SYN cookie side-channel that can be fixed in multiple ways.

One of them is to bring back TCP ports in TS offset randomization.

As a bonus, we perform a single siphash() computation
to provide both an ISN and a TS offset.

Fixes: 28ee1b746f49 ("secure_seq: downgrade to per-host timestamp offsets")
Reported-by: Zhouyan Deng <dengzhouyan_nwpu@163.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Florian Westphal <fw@strlen.de>
Link: https://patch.msgid.link/20260302205527.1982836-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/secure_seq.h | 45 ++++++++++++++++++++++-----
 include/net/tcp.h        |  6 ++--
 net/core/secure_seq.c    | 80 ++++++++++++++++++------------------------------
 net/ipv4/syncookies.c    | 11 +++++--
 net/ipv4/tcp_input.c     |  8 +++--
 net/ipv4/tcp_ipv4.c      | 37 ++++++++++------------
 net/ipv6/syncookies.c    | 11 +++++--
 net/ipv6/tcp_ipv6.c      | 37 ++++++++++------------
 8 files changed, 127 insertions(+), 108 deletions(-)

(limited to 'include')

diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index cddebafb9f77..6f996229167b 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -5,16 +5,47 @@
 #include <linux/types.h>
 
 struct net;
+extern struct net init_net;
+
+union tcp_seq_and_ts_off {
+	struct {
+		u32 seq;
+		u32 ts_off;
+	};
+	u64 hash64;
+};
 
 u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
 u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport);
-u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
-		   __be16 sport, __be16 dport);
-u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr);
-u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
-		     __be16 sport, __be16 dport);
-u32 secure_tcpv6_ts_off(const struct net *net,
-			const __be32 *saddr, const __be32 *daddr);
+union tcp_seq_and_ts_off
+secure_tcp_seq_and_ts_off(const struct net *net, __be32 saddr, __be32 daddr,
+			  __be16 sport, __be16 dport);
+
+static inline u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
+				 __be16 sport, __be16 dport)
+{
+	union tcp_seq_and_ts_off ts;
+
+	ts = secure_tcp_seq_and_ts_off(&init_net, saddr, daddr,
+				       sport, dport);
+
+	return ts.seq;
+}
+
+union tcp_seq_and_ts_off
+secure_tcpv6_seq_and_ts_off(const struct net *net, const __be32 *saddr,
+			    const __be32 *daddr,
+			    __be16 sport, __be16 dport);
+
+static inline u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
+				   __be16 sport, __be16 dport)
+{
+	union tcp_seq_and_ts_off ts;
+
+	ts = secure_tcpv6_seq_and_ts_off(&init_net, saddr, daddr,
+					 sport, dport);
 
+	return ts.seq;
+}
 #endif /* _NET_SECURE_SEQ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index eb8bf63fdafc..978eea2d5df0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -43,6 +43,7 @@
 #include <net/dst.h>
 #include <net/mptcp.h>
 #include <net/xfrm.h>
+#include <net/secure_seq.h>
 
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
@@ -2464,8 +2465,9 @@ struct tcp_request_sock_ops {
 				       struct flowi *fl,
 				       struct request_sock *req,
 				       u32 tw_isn);
-	u32 (*init_seq)(const struct sk_buff *skb);
-	u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb);
+	union tcp_seq_and_ts_off (*init_seq_and_ts_off)(
+					const struct net *net,
+					const struct sk_buff *skb);
 	int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
 			   struct flowi *fl, struct request_sock *req,
 			   struct tcp_fastopen_cookie *foc,
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 9a3965680451..6a6f2cda5aae 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -20,7 +20,6 @@
 #include <net/tcp.h>
 
 static siphash_aligned_key_t net_secret;
-static siphash_aligned_key_t ts_secret;
 
 #define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ)
 
@@ -28,11 +27,6 @@ static __always_inline void net_secret_init(void)
 {
 	net_get_random_once(&net_secret, sizeof(net_secret));
 }
-
-static __always_inline void ts_secret_init(void)
-{
-	net_get_random_once(&ts_secret, sizeof(ts_secret));
-}
 #endif
 
 #ifdef CONFIG_INET
@@ -53,28 +47,9 @@ static u32 seq_scale(u32 seq)
 #endif
 
 #if IS_ENABLED(CONFIG_IPV6)
-u32 secure_tcpv6_ts_off(const struct net *net,
-			const __be32 *saddr, const __be32 *daddr)
-{
-	const struct {
-		struct in6_addr saddr;
-		struct in6_addr daddr;
-	} __aligned(SIPHASH_ALIGNMENT) combined = {
-		.saddr = *(struct in6_addr *)saddr,
-		.daddr = *(struct in6_addr *)daddr,
-	};
-
-	if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
-		return 0;
-
-	ts_secret_init();
-	return siphash(&combined, offsetofend(typeof(combined), daddr),
-		       &ts_secret);
-}
-EXPORT_IPV6_MOD(secure_tcpv6_ts_off);
-
-u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
-		     __be16 sport, __be16 dport)
+union tcp_seq_and_ts_off
+secure_tcpv6_seq_and_ts_off(const struct net *net, const __be32 *saddr,
+			    const __be32 *daddr, __be16 sport, __be16 dport)
 {
 	const struct {
 		struct in6_addr saddr;
@@ -87,14 +62,20 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
 		.sport = sport,
 		.dport = dport
 	};
-	u32 hash;
+	union tcp_seq_and_ts_off st;
 
 	net_secret_init();
-	hash = siphash(&combined, offsetofend(typeof(combined), dport),
-		       &net_secret);
-	return seq_scale(hash);
+
+	st.hash64 = siphash(&combined, offsetofend(typeof(combined), dport),
+			    &net_secret);
+
+	if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
+		st.ts_off = 0;
+
+	st.seq = seq_scale(st.seq);
+	return st;
 }
-EXPORT_SYMBOL(secure_tcpv6_seq);
+EXPORT_SYMBOL(secure_tcpv6_seq_and_ts_off);
 
 u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport)
@@ -118,33 +99,30 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
 #endif
 
 #ifdef CONFIG_INET
-u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)
-{
-	if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
-		return 0;
-
-	ts_secret_init();
-	return siphash_2u32((__force u32)saddr, (__force u32)daddr,
-			    &ts_secret);
-}
-
 /* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
  * but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
  * it would be easy enough to have the former function use siphash_4u32, passing
  * the arguments as separate u32.
  */
-u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
-		   __be16 sport, __be16 dport)
+union tcp_seq_and_ts_off
+secure_tcp_seq_and_ts_off(const struct net *net, __be32 saddr, __be32 daddr,
+			  __be16 sport, __be16 dport)
 {
-	u32 hash;
+	u32 ports = (__force u32)sport << 16 | (__force u32)dport;
+	union tcp_seq_and_ts_off st;
 
 	net_secret_init();
-	hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
-			    (__force u32)sport << 16 | (__force u32)dport,
-			    &net_secret);
-	return seq_scale(hash);
+
+	st.hash64 = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+				 ports, &net_secret);
+
+	if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
+		st.ts_off = 0;
+
+	st.seq = seq_scale(st.seq);
+	return st;
 }
-EXPORT_SYMBOL_GPL(secure_tcp_seq);
+EXPORT_SYMBOL_GPL(secure_tcp_seq_and_ts_off);
 
 u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
 {
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 061751aabc8e..fc3affd9c801 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -378,9 +378,14 @@ static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk,
 	tcp_parse_options(net, skb, &tcp_opt, 0, NULL);
 
 	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
-		tsoff = secure_tcp_ts_off(net,
-					  ip_hdr(skb)->daddr,
-					  ip_hdr(skb)->saddr);
+		union tcp_seq_and_ts_off st;
+
+		st = secure_tcp_seq_and_ts_off(net,
+					       ip_hdr(skb)->daddr,
+					       ip_hdr(skb)->saddr,
+					       tcp_hdr(skb)->dest,
+					       tcp_hdr(skb)->source);
+		tsoff = st.ts_off;
 		tcp_opt.rcv_tsecr -= tsoff;
 	}
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7b03f2460751..cba89733d121 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7646,6 +7646,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
 	struct sock *fastopen_sk = NULL;
+	union tcp_seq_and_ts_off st;
 	struct request_sock *req;
 	bool want_cookie = false;
 	struct dst_entry *dst;
@@ -7715,9 +7716,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	if (!dst)
 		goto drop_and_free;
 
+	if (tmp_opt.tstamp_ok || (!want_cookie && !isn))
+		st = af_ops->init_seq_and_ts_off(net, skb);
+
 	if (tmp_opt.tstamp_ok) {
 		tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
-		tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
+		tcp_rsk(req)->ts_off = st.ts_off;
 	}
 	if (!want_cookie && !isn) {
 		int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
@@ -7739,7 +7743,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			goto drop_and_release;
 		}
 
-		isn = af_ops->init_seq(skb);
+		isn = st.seq;
 	}
 
 	tcp_ecn_create_request(req, skb, sk, dst);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 910c25cb24e1..c7b2463c2e25 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -105,17 +105,14 @@ static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
 
 static DEFINE_MUTEX(tcp_exit_batch_mutex);
 
-static u32 tcp_v4_init_seq(const struct sk_buff *skb)
+static union tcp_seq_and_ts_off
+tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)
 {
-	return secure_tcp_seq(ip_hdr(skb)->daddr,
-			      ip_hdr(skb)->saddr,
-			      tcp_hdr(skb)->dest,
-			      tcp_hdr(skb)->source);
-}
-
-static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
-{
-	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
+	return secure_tcp_seq_and_ts_off(net,
+					 ip_hdr(skb)->daddr,
+					 ip_hdr(skb)->saddr,
+					 tcp_hdr(skb)->dest,
+					 tcp_hdr(skb)->source);
 }
 
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -327,15 +324,16 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len
 	rt = NULL;
 
 	if (likely(!tp->repair)) {
+		union tcp_seq_and_ts_off st;
+
+		st = secure_tcp_seq_and_ts_off(net,
+					       inet->inet_saddr,
+					       inet->inet_daddr,
+					       inet->inet_sport,
+					       usin->sin_port);
 		if (!tp->write_seq)
-			WRITE_ONCE(tp->write_seq,
-				   secure_tcp_seq(inet->inet_saddr,
-						  inet->inet_daddr,
-						  inet->inet_sport,
-						  usin->sin_port));
-		WRITE_ONCE(tp->tsoffset,
-			   secure_tcp_ts_off(net, inet->inet_saddr,
-					     inet->inet_daddr));
+			WRITE_ONCE(tp->write_seq, st.seq);
+		WRITE_ONCE(tp->tsoffset, st.ts_off);
 	}
 
 	atomic_set(&inet->inet_id, get_random_u16());
@@ -1677,8 +1675,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 	.cookie_init_seq =	cookie_v4_init_sequence,
 #endif
 	.route_req	=	tcp_v4_route_req,
-	.init_seq	=	tcp_v4_init_seq,
-	.init_ts_off	=	tcp_v4_init_ts_off,
+	.init_seq_and_ts_off	=	tcp_v4_init_seq_and_ts_off,
 	.send_synack	=	tcp_v4_send_synack,
 };
 
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 7e007f013ec8..4f6f0d751d6c 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -151,9 +151,14 @@ static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk,
 	tcp_parse_options(net, skb, &tcp_opt, 0, NULL);
 
 	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
-		tsoff = secure_tcpv6_ts_off(net,
-					    ipv6_hdr(skb)->daddr.s6_addr32,
-					    ipv6_hdr(skb)->saddr.s6_addr32);
+		union tcp_seq_and_ts_off st;
+
+		st = secure_tcpv6_seq_and_ts_off(net,
+						 ipv6_hdr(skb)->daddr.s6_addr32,
+						 ipv6_hdr(skb)->saddr.s6_addr32,
+						 tcp_hdr(skb)->dest,
+						 tcp_hdr(skb)->source);
+		tsoff = st.ts_off;
 		tcp_opt.rcv_tsecr -= tsoff;
 	}
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5195a46b951e..bb09d5ccf599 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -105,18 +105,14 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 	}
 }
 
-static u32 tcp_v6_init_seq(const struct sk_buff *skb)
+static union tcp_seq_and_ts_off
+tcp_v6_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)
 {
-	return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32,
-				ipv6_hdr(skb)->saddr.s6_addr32,
-				tcp_hdr(skb)->dest,
-				tcp_hdr(skb)->source);
-}
-
-static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb)
-{
-	return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32,
-				   ipv6_hdr(skb)->saddr.s6_addr32);
+	return secure_tcpv6_seq_and_ts_off(net,
+					   ipv6_hdr(skb)->daddr.s6_addr32,
+					   ipv6_hdr(skb)->saddr.s6_addr32,
+					   tcp_hdr(skb)->dest,
+					   tcp_hdr(skb)->source);
 }
 
 static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
@@ -320,14 +316,16 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
 	sk_set_txhash(sk);
 
 	if (likely(!tp->repair)) {
+		union tcp_seq_and_ts_off st;
+
+		st = secure_tcpv6_seq_and_ts_off(net,
+						 np->saddr.s6_addr32,
+						 sk->sk_v6_daddr.s6_addr32,
+						 inet->inet_sport,
+						 inet->inet_dport);
 		if (!tp->write_seq)
-			WRITE_ONCE(tp->write_seq,
-				   secure_tcpv6_seq(np->saddr.s6_addr32,
-						    sk->sk_v6_daddr.s6_addr32,
-						    inet->inet_sport,
-						    inet->inet_dport));
-		tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32,
-						   sk->sk_v6_daddr.s6_addr32);
+			WRITE_ONCE(tp->write_seq, st.seq);
+		tp->tsoffset = st.ts_off;
 	}
 
 	if (tcp_fastopen_defer_connect(sk, &err))
@@ -817,8 +815,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 	.cookie_init_seq =	cookie_v6_init_sequence,
 #endif
 	.route_req	=	tcp_v6_route_req,
-	.init_seq	=	tcp_v6_init_seq,
-	.init_ts_off	=	tcp_v6_init_ts_off,
+	.init_seq_and_ts_off	= tcp_v6_init_seq_and_ts_off,
 	.send_synack	=	tcp_v6_send_synack,
 };
 
-- 
cgit v1.2.3


From b824c3e16c1904bf80df489e293d1e3cbf98896d Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 2 Mar 2026 17:26:31 +0100
Subject: net: Provide a PREEMPT_RT specific check for netdev_queue::_xmit_lock

After acquiring netdev_queue::_xmit_lock the number of the CPU owning
the lock is recorded in netdev_queue::xmit_lock_owner. This works as
long as the BH context is not preemptible.

On PREEMPT_RT the softirq context is preemptible and without the
softirq-lock it is possible to have multiple user in __dev_queue_xmit()
submitting a skb on the same CPU. This is fine in general but this means
also that the current CPU is recorded as netdev_queue::xmit_lock_owner.
This in turn leads to the recursion alert and the skb is dropped.

Instead checking the for CPU number, that owns the lock, PREEMPT_RT can
check if the lockowner matches the current task.

Add netif_tx_owned() which returns true if the current context owns the
lock by comparing the provided CPU number with the recorded number. This
resembles the current check by negating the condition (the current check
returns true if the lock is not owned).
On PREEMPT_RT use rt_mutex_owner() to return the lock owner and compare
the current task against it.
Use the new helper in __dev_queue_xmit() and netif_local_xmit_active()
which provides a similar check.
Update comments regarding pairing READ_ONCE().

Reported-by: Bert Karwatzki <spasswolf@web.de>
Closes: https://lore.kernel.org/all/20260216134333.412332-1-spasswolf@web.de
Fixes: 3253cb49cbad4 ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reported-by: Bert Karwatzki <spasswolf@web.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://patch.msgid.link/20260302162631.uGUyIqDT@linutronix.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h | 27 ++++++++++++++++++++++-----
 net/core/dev.c            |  5 +----
 net/core/netpoll.c        |  2 +-
 3 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d4e6e00bb90a..67e25f6d15a4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4711,7 +4711,7 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
 static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
 {
 	spin_lock(&txq->_xmit_lock);
-	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+	/* Pairs with READ_ONCE() in netif_tx_owned() */
 	WRITE_ONCE(txq->xmit_lock_owner, cpu);
 }
 
@@ -4729,7 +4729,7 @@ static inline void __netif_tx_release(struct netdev_queue *txq)
 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
 {
 	spin_lock_bh(&txq->_xmit_lock);
-	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+	/* Pairs with READ_ONCE() in netif_tx_owned() */
 	WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
 }
 
@@ -4738,7 +4738,7 @@ static inline bool __netif_tx_trylock(struct netdev_queue *txq)
 	bool ok = spin_trylock(&txq->_xmit_lock);
 
 	if (likely(ok)) {
-		/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+		/* Pairs with READ_ONCE() in netif_tx_owned() */
 		WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
 	}
 	return ok;
@@ -4746,14 +4746,14 @@ static inline bool __netif_tx_trylock(struct netdev_queue *txq)
 
 static inline void __netif_tx_unlock(struct netdev_queue *txq)
 {
-	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+	/* Pairs with READ_ONCE() in netif_tx_owned() */
 	WRITE_ONCE(txq->xmit_lock_owner, -1);
 	spin_unlock(&txq->_xmit_lock);
 }
 
 static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
 {
-	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+	/* Pairs with READ_ONCE() in netif_tx_owned() */
 	WRITE_ONCE(txq->xmit_lock_owner, -1);
 	spin_unlock_bh(&txq->_xmit_lock);
 }
@@ -4846,6 +4846,23 @@ static inline void netif_tx_disable(struct net_device *dev)
 	local_bh_enable();
 }
 
+#ifndef CONFIG_PREEMPT_RT
+static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu)
+{
+	/* Other cpus might concurrently change txq->xmit_lock_owner
+	 * to -1 or to their cpu id, but not to our id.
+	 */
+	return READ_ONCE(txq->xmit_lock_owner) == cpu;
+}
+
+#else
+static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu)
+{
+	return rt_mutex_owner(&txq->_xmit_lock.lock) == current;
+}
+
+#endif
+
 static inline void netif_addr_lock(struct net_device *dev)
 {
 	unsigned char nest_level = 0;
diff --git a/net/core/dev.c b/net/core/dev.c
index 20610a192ec7..14a83f2035b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4818,10 +4818,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 	if (dev->flags & IFF_UP) {
 		int cpu = smp_processor_id(); /* ok because BHs are off */
 
-		/* Other cpus might concurrently change txq->xmit_lock_owner
-		 * to -1 or to their cpu id, but not to our id.
-		 */
-		if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
+		if (!netif_tx_owned(txq, cpu)) {
 			bool is_list = false;
 
 			if (dev_xmit_recursion())
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a8558a52884f..cd74beffd209 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -132,7 +132,7 @@ static int netif_local_xmit_active(struct net_device *dev)
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 
-		if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id())
+		if (netif_tx_owned(txq, smp_processor_id()))
 			return 1;
 	}
 
-- 
cgit v1.2.3


From fb7fb4016300ac622c964069e286dc83166a5d52 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 2 Mar 2026 23:28:15 +0100
Subject: netfilter: nf_tables: clone set on flush only

Syzbot with fault injection triggered a failing memory allocation with
GFP_KERNEL which results in a WARN splat:

iter.err
WARNING: net/netfilter/nf_tables_api.c:845 at nft_map_deactivate+0x34e/0x3c0 net/netfilter/nf_tables_api.c:845, CPU#0: syz.0.17/5992
Modules linked in:
CPU: 0 UID: 0 PID: 5992 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2026
RIP: 0010:nft_map_deactivate+0x34e/0x3c0 net/netfilter/nf_tables_api.c:845
Code: 8b 05 86 5a 4e 09 48 3b 84 24 a0 00 00 00 75 62 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc cc e8 63 6d fa f7 90 <0f> 0b 90 43
+80 7c 35 00 00 0f 85 23 fe ff ff e9 26 fe ff ff 89 d9
RSP: 0018:ffffc900045af780 EFLAGS: 00010293
RAX: ffffffff89ca45bd RBX: 00000000fffffff4 RCX: ffff888028111e40
RDX: 0000000000000000 RSI: 00000000fffffff4 RDI: 0000000000000000
RBP: ffffc900045af870 R08: 0000000000400dc0 R09: 00000000ffffffff
R10: dffffc0000000000 R11: fffffbfff1d141db R12: ffffc900045af7e0
R13: 1ffff920008b5f24 R14: dffffc0000000000 R15: ffffc900045af920
FS:  000055557a6a5500(0000) GS:ffff888125496000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fb5ea271fc0 CR3: 000000003269e000 CR4: 00000000003526f0
Call Trace:
 <TASK>
 __nft_release_table+0xceb/0x11f0 net/netfilter/nf_tables_api.c:12115
 nft_rcv_nl_event+0xc25/0xdb0 net/netfilter/nf_tables_api.c:12187
 notifier_call_chain+0x19d/0x3a0 kernel/notifier.c:85
 blocking_notifier_call_chain+0x6a/0x90 kernel/notifier.c:380
 netlink_release+0x123b/0x1ad0 net/netlink/af_netlink.c:761
 __sock_release net/socket.c:662 [inline]
 sock_close+0xc3/0x240 net/socket.c:1455

Restrict set clone to the flush set command in the preparation phase.
Add NFT_ITER_UPDATE_CLONE and use it for this purpose, update the rbtree
and pipapo backends to only clone the set when this iteration type is
used.

As for the existing NFT_ITER_UPDATE type, update the pipapo backend to
use the existing set clone if available, otherwise use the existing set
representation. After this update, there is no need to clone a set that
is being deleted, this includes bound anonymous set.

An alternative approach to NFT_ITER_UPDATE_CLONE is to add a .clone
interface and call it from the flush set path.

Reported-by: syzbot+4924a0edc148e8b4b342@syzkaller.appspotmail.com
Fixes: 3f1d886cc7c3 ("netfilter: nft_set_pipapo: move cloning of match info to insert/removal path")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netfilter/nf_tables.h |  2 ++
 net/netfilter/nf_tables_api.c     | 10 +++++++++-
 net/netfilter/nft_set_hash.c      |  1 +
 net/netfilter/nft_set_pipapo.c    | 11 +++++++++--
 net/netfilter/nft_set_rbtree.c    |  8 +++++---
 5 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 426534a711b0..ea6f29ad7888 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -320,11 +320,13 @@ static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv)
  * @NFT_ITER_UNSPEC: unspecified, to catch errors
  * @NFT_ITER_READ: read-only iteration over set elements
  * @NFT_ITER_UPDATE: iteration under mutex to update set element state
+ * @NFT_ITER_UPDATE_CLONE: clone set before iteration under mutex to update element
  */
 enum nft_iter_type {
 	NFT_ITER_UNSPEC,
 	NFT_ITER_READ,
 	NFT_ITER_UPDATE,
+	NFT_ITER_UPDATE_CLONE,
 };
 
 struct nft_set;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index df67932d3e09..058f7004cb2b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -833,6 +833,11 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx,
 	}
 }
 
+/* Use NFT_ITER_UPDATE iterator even if this may be called from the preparation
+ * phase, the set clone might already exist from a previous command, or it might
+ * be a set that is going away and does not require a clone. The netns and
+ * netlink release paths also need to work on the live set.
+ */
 static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set)
 {
 	struct nft_set_iter iter = {
@@ -7903,9 +7908,12 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx,
 
 static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask)
 {
+	/* The set backend might need to clone the set, do it now from the
+	 * preparation phase, use NFT_ITER_UPDATE_CLONE iterator type.
+	 */
 	struct nft_set_iter iter = {
 		.genmask	= genmask,
-		.type		= NFT_ITER_UPDATE,
+		.type		= NFT_ITER_UPDATE_CLONE,
 		.fn		= nft_setelem_flush,
 	};
 
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 739b992bde59..b0e571c8e3f3 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -374,6 +374,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
 {
 	switch (iter->type) {
 	case NFT_ITER_UPDATE:
+	case NFT_ITER_UPDATE_CLONE:
 		/* only relevant for netlink dumps which use READ type */
 		WARN_ON_ONCE(iter->skip != 0);
 
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 7ef4b44471d3..c091898df710 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -2144,13 +2144,20 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
 	const struct nft_pipapo_match *m;
 
 	switch (iter->type) {
-	case NFT_ITER_UPDATE:
+	case NFT_ITER_UPDATE_CLONE:
 		m = pipapo_maybe_clone(set);
 		if (!m) {
 			iter->err = -ENOMEM;
 			return;
 		}
-
+		nft_pipapo_do_walk(ctx, set, m, iter);
+		break;
+	case NFT_ITER_UPDATE:
+		if (priv->clone)
+			m = priv->clone;
+		else
+			m = rcu_dereference_protected(priv->match,
+						      nft_pipapo_transaction_mutex_held(set));
 		nft_pipapo_do_walk(ctx, set, m, iter);
 		break;
 	case NFT_ITER_READ:
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 3f02e4478216..ee3d4f5b9ff7 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -861,13 +861,15 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
 	struct nft_rbtree *priv = nft_set_priv(set);
 
 	switch (iter->type) {
-	case NFT_ITER_UPDATE:
-		lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
-
+	case NFT_ITER_UPDATE_CLONE:
 		if (nft_array_may_resize(set) < 0) {
 			iter->err = -ENOMEM;
 			break;
 		}
+		fallthrough;
+	case NFT_ITER_UPDATE:
+		lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+
 		nft_rbtree_do_walk(ctx, set, iter);
 		break;
 	case NFT_ITER_READ:
-- 
cgit v1.2.3


From 9df95785d3d8302f7c066050117b04cd3c2048c2 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 3 Mar 2026 16:31:32 +0100
Subject: netfilter: nft_set_pipapo: split gc into unlink and reclaim phase

Yiming Qian reports Use-after-free in the pipapo set type:
  Under a large number of expired elements, commit-time GC can run for a very
  long time in a non-preemptible context, triggering soft lockup warnings and
  RCU stall reports (local denial of service).

We must split GC in an unlink and a reclaim phase.

We cannot queue elements for freeing until pointers have been swapped.
Expired elements are still exposed to both the packet path and userspace
dumpers via the live copy of the data structure.

call_rcu() does not protect us: dump operations or element lookups starting
after call_rcu has fired can still observe the free'd element, unless the
commit phase has made enough progress to swap the clone and live pointers
before any new reader has picked up the old version.

This a similar approach as done recently for the rbtree backend in commit
35f83a75529a ("netfilter: nft_set_rbtree: don't gc elements on insert").

Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges")
Reported-by: Yiming Qian <yimingqian591@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netfilter/nf_tables.h |  5 ++++
 net/netfilter/nf_tables_api.c     |  5 ----
 net/netfilter/nft_set_pipapo.c    | 51 +++++++++++++++++++++++++++++++++------
 net/netfilter/nft_set_pipapo.h    |  2 ++
 4 files changed, 50 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index ea6f29ad7888..e2d2bfc1f989 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1863,6 +1863,11 @@ struct nft_trans_gc {
 	struct rcu_head		rcu;
 };
 
+static inline int nft_trans_gc_space(const struct nft_trans_gc *trans)
+{
+	return NFT_TRANS_GC_BATCHCOUNT - trans->count;
+}
+
 static inline void nft_ctx_update(struct nft_ctx *ctx,
 				  const struct nft_trans *trans)
 {
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 058f7004cb2b..1862bd7fe804 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -10493,11 +10493,6 @@ static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
 	schedule_work(&trans_gc_work);
 }
 
-static int nft_trans_gc_space(struct nft_trans_gc *trans)
-{
-	return NFT_TRANS_GC_BATCHCOUNT - trans->count;
-}
-
 struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
 					      unsigned int gc_seq, gfp_t gfp)
 {
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index c091898df710..a34632ae6048 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -1680,11 +1680,11 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
 }
 
 /**
- * pipapo_gc() - Drop expired entries from set, destroy start and end elements
+ * pipapo_gc_scan() - Drop expired entries from set and link them to gc list
  * @set:	nftables API set representation
  * @m:		Matching data
  */
-static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
+static void pipapo_gc_scan(struct nft_set *set, struct nft_pipapo_match *m)
 {
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct net *net = read_pnet(&set->net);
@@ -1697,6 +1697,8 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
 	if (!gc)
 		return;
 
+	list_add(&gc->list, &priv->gc_head);
+
 	while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
 		union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
 		const struct nft_pipapo_field *f;
@@ -1724,9 +1726,13 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
 		 * NFT_SET_ELEM_DEAD_BIT.
 		 */
 		if (__nft_set_elem_expired(&e->ext, tstamp)) {
-			gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
-			if (!gc)
-				return;
+			if (!nft_trans_gc_space(gc)) {
+				gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+				if (!gc)
+					return;
+
+				list_add(&gc->list, &priv->gc_head);
+			}
 
 			nft_pipapo_gc_deactivate(net, set, e);
 			pipapo_drop(m, rulemap);
@@ -1740,10 +1746,30 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
 		}
 	}
 
-	gc = nft_trans_gc_catchall_sync(gc);
+	priv->last_gc = jiffies;
+}
+
+/**
+ * pipapo_gc_queue() - Free expired elements
+ * @set:	nftables API set representation
+ */
+static void pipapo_gc_queue(struct nft_set *set)
+{
+	struct nft_pipapo *priv = nft_set_priv(set);
+	struct nft_trans_gc *gc, *next;
+
+	/* always do a catchall cycle: */
+	gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
 	if (gc) {
+		gc = nft_trans_gc_catchall_sync(gc);
+		if (gc)
+			nft_trans_gc_queue_sync_done(gc);
+	}
+
+	/* always purge queued gc elements. */
+	list_for_each_entry_safe(gc, next, &priv->gc_head, list) {
+		list_del(&gc->list);
 		nft_trans_gc_queue_sync_done(gc);
-		priv->last_gc = jiffies;
 	}
 }
 
@@ -1797,6 +1823,10 @@ static void pipapo_reclaim_match(struct rcu_head *rcu)
  *
  * We also need to create a new working copy for subsequent insertions and
  * deletions.
+ *
+ * After the live copy has been replaced by the clone, we can safely queue
+ * expired elements that have been collected by pipapo_gc_scan() for
+ * memory reclaim.
  */
 static void nft_pipapo_commit(struct nft_set *set)
 {
@@ -1807,7 +1837,7 @@ static void nft_pipapo_commit(struct nft_set *set)
 		return;
 
 	if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
-		pipapo_gc(set, priv->clone);
+		pipapo_gc_scan(set, priv->clone);
 
 	old = rcu_replace_pointer(priv->match, priv->clone,
 				  nft_pipapo_transaction_mutex_held(set));
@@ -1815,6 +1845,8 @@ static void nft_pipapo_commit(struct nft_set *set)
 
 	if (old)
 		call_rcu(&old->rcu, pipapo_reclaim_match);
+
+	pipapo_gc_queue(set);
 }
 
 static void nft_pipapo_abort(const struct nft_set *set)
@@ -2279,6 +2311,7 @@ static int nft_pipapo_init(const struct nft_set *set,
 		f->mt = NULL;
 	}
 
+	INIT_LIST_HEAD(&priv->gc_head);
 	rcu_assign_pointer(priv->match, m);
 
 	return 0;
@@ -2328,6 +2361,8 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx,
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct nft_pipapo_match *m;
 
+	WARN_ON_ONCE(!list_empty(&priv->gc_head));
+
 	m = rcu_dereference_protected(priv->match, true);
 
 	if (priv->clone) {
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index eaab422aa56a..9aee9a9eaeb7 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -156,12 +156,14 @@ struct nft_pipapo_match {
  * @clone:	Copy where pending insertions and deletions are kept
  * @width:	Total bytes to be matched for one packet, including padding
  * @last_gc:	Timestamp of last garbage collection run, jiffies
+ * @gc_head:	list of nft_trans_gc to queue up for mem reclaim
  */
 struct nft_pipapo {
 	struct nft_pipapo_match __rcu *match;
 	struct nft_pipapo_match *clone;
 	int width;
 	unsigned long last_gc;
+	struct list_head gc_head;
 };
 
 struct nft_pipapo_elem;
-- 
cgit v1.2.3


From e2cedd400c3ec0302ffca2490e8751772906ac23 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Wed, 4 Mar 2026 09:06:02 -0500
Subject: net/sched: act_ife: Fix metalist update behavior

Whenever an ife action replace changes the metalist, instead of
replacing the old data on the metalist, the current ife code is appending
the new metadata. Aside from being innapropriate behavior, this may lead
to an unbounded addition of metadata to the metalist which might cause an
out of bounds error when running the encode op:

[  138.423369][    C1] ==================================================================
[  138.424317][    C1] BUG: KASAN: slab-out-of-bounds in ife_tlv_meta_encode (net/ife/ife.c:168)
[  138.424906][    C1] Write of size 4 at addr ffff8880077f4ffe by task ife_out_out_bou/255
[  138.425778][    C1] CPU: 1 UID: 0 PID: 255 Comm: ife_out_out_bou Not tainted 7.0.0-rc1-00169-gfbdfa8da05b6 #624 PREEMPT(full)
[  138.425795][    C1] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[  138.425800][    C1] Call Trace:
[  138.425804][    C1]  <IRQ>
[  138.425808][    C1]  dump_stack_lvl (lib/dump_stack.c:122)
[  138.425828][    C1]  print_report (mm/kasan/report.c:379 mm/kasan/report.c:482)
[  138.425839][    C1]  ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:221)
[  138.425844][    C1]  ? __virt_addr_valid (./arch/x86/include/asm/preempt.h:95 (discriminator 1) ./include/linux/rcupdate.h:975 (discriminator 1) ./include/linux/mmzone.h:2207 (discriminator 1) arch/x86/mm/physaddr.c:54 (discriminator 1))
[  138.425853][    C1]  ? ife_tlv_meta_encode (net/ife/ife.c:168)
[  138.425859][    C1]  kasan_report (mm/kasan/report.c:221 mm/kasan/report.c:597)
[  138.425868][    C1]  ? ife_tlv_meta_encode (net/ife/ife.c:168)
[  138.425878][    C1]  kasan_check_range (mm/kasan/generic.c:186 (discriminator 1) mm/kasan/generic.c:200 (discriminator 1))
[  138.425884][    C1]  __asan_memset (mm/kasan/shadow.c:84 (discriminator 2))
[  138.425889][    C1]  ife_tlv_meta_encode (net/ife/ife.c:168)
[  138.425893][    C1]  ? ife_tlv_meta_encode (net/ife/ife.c:171)
[  138.425898][    C1]  ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:221)
[  138.425903][    C1]  ife_encode_meta_u16 (net/sched/act_ife.c:57)
[  138.425910][    C1]  ? __pfx_do_raw_spin_lock (kernel/locking/spinlock_debug.c:114)
[  138.425916][    C1]  ? __asan_memcpy (mm/kasan/shadow.c:105 (discriminator 3))
[  138.425921][    C1]  ? __pfx_ife_encode_meta_u16 (net/sched/act_ife.c:45)
[  138.425927][    C1]  ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:221)
[  138.425931][    C1]  tcf_ife_act (net/sched/act_ife.c:847 net/sched/act_ife.c:879)

To solve this issue, fix the replace behavior by adding the metalist to
the ife rcu data structure.

Fixes: aa9fd9a325d51 ("sched: act: ife: update parameters via rcu handling")
Reported-by: Ruitong Liu <cnitlrt@gmail.com>
Tested-by: Ruitong Liu <cnitlrt@gmail.com>
Co-developed-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20260304140603.76500-1-jhs@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_ife.h |  4 +-
 net/sched/act_ife.c         | 93 +++++++++++++++++++++------------------------
 2 files changed, 45 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
index c7f24a2da1ca..24d4d5a62b3c 100644
--- a/include/net/tc_act/tc_ife.h
+++ b/include/net/tc_act/tc_ife.h
@@ -13,15 +13,13 @@ struct tcf_ife_params {
 	u8 eth_src[ETH_ALEN];
 	u16 eth_type;
 	u16 flags;
-
+	struct list_head metalist;
 	struct rcu_head rcu;
 };
 
 struct tcf_ife_info {
 	struct tc_action common;
 	struct tcf_ife_params __rcu *params;
-	/* list of metaids allowed */
-	struct list_head metalist;
 };
 #define to_ife(a) ((struct tcf_ife_info *)a)
 
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 79df81d12894..d5e8a91bb4eb 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -293,8 +293,8 @@ static int load_metaops_and_vet(u32 metaid, void *val, int len, bool rtnl_held)
 /* called when adding new meta information
 */
 static int __add_metainfo(const struct tcf_meta_ops *ops,
-			  struct tcf_ife_info *ife, u32 metaid, void *metaval,
-			  int len, bool atomic, bool exists)
+			  struct tcf_ife_params *p, u32 metaid, void *metaval,
+			  int len, bool atomic)
 {
 	struct tcf_meta_info *mi = NULL;
 	int ret = 0;
@@ -313,45 +313,40 @@ static int __add_metainfo(const struct tcf_meta_ops *ops,
 		}
 	}
 
-	if (exists)
-		spin_lock_bh(&ife->tcf_lock);
-	list_add_tail(&mi->metalist, &ife->metalist);
-	if (exists)
-		spin_unlock_bh(&ife->tcf_lock);
+	list_add_tail(&mi->metalist, &p->metalist);
 
 	return ret;
 }
 
 static int add_metainfo_and_get_ops(const struct tcf_meta_ops *ops,
-				    struct tcf_ife_info *ife, u32 metaid,
-				    bool exists)
+				    struct tcf_ife_params *p, u32 metaid)
 {
 	int ret;
 
 	if (!try_module_get(ops->owner))
 		return -ENOENT;
-	ret = __add_metainfo(ops, ife, metaid, NULL, 0, true, exists);
+	ret = __add_metainfo(ops, p, metaid, NULL, 0, true);
 	if (ret)
 		module_put(ops->owner);
 	return ret;
 }
 
-static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
-			int len, bool exists)
+static int add_metainfo(struct tcf_ife_params *p, u32 metaid, void *metaval,
+			int len)
 {
 	const struct tcf_meta_ops *ops = find_ife_oplist(metaid);
 	int ret;
 
 	if (!ops)
 		return -ENOENT;
-	ret = __add_metainfo(ops, ife, metaid, metaval, len, false, exists);
+	ret = __add_metainfo(ops, p, metaid, metaval, len, false);
 	if (ret)
 		/*put back what find_ife_oplist took */
 		module_put(ops->owner);
 	return ret;
 }
 
-static int use_all_metadata(struct tcf_ife_info *ife, bool exists)
+static int use_all_metadata(struct tcf_ife_params *p)
 {
 	struct tcf_meta_ops *o;
 	int rc = 0;
@@ -359,7 +354,7 @@ static int use_all_metadata(struct tcf_ife_info *ife, bool exists)
 
 	read_lock(&ife_mod_lock);
 	list_for_each_entry(o, &ifeoplist, list) {
-		rc = add_metainfo_and_get_ops(o, ife, o->metaid, exists);
+		rc = add_metainfo_and_get_ops(o, p, o->metaid);
 		if (rc == 0)
 			installed += 1;
 	}
@@ -371,7 +366,7 @@ static int use_all_metadata(struct tcf_ife_info *ife, bool exists)
 		return -EINVAL;
 }
 
-static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife)
+static int dump_metalist(struct sk_buff *skb, struct tcf_ife_params *p)
 {
 	struct tcf_meta_info *e;
 	struct nlattr *nest;
@@ -379,14 +374,14 @@ static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife)
 	int total_encoded = 0;
 
 	/*can only happen on decode */
-	if (list_empty(&ife->metalist))
+	if (list_empty(&p->metalist))
 		return 0;
 
 	nest = nla_nest_start_noflag(skb, TCA_IFE_METALST);
 	if (!nest)
 		goto out_nlmsg_trim;
 
-	list_for_each_entry(e, &ife->metalist, metalist) {
+	list_for_each_entry(e, &p->metalist, metalist) {
 		if (!e->ops->get(skb, e))
 			total_encoded += 1;
 	}
@@ -403,13 +398,11 @@ out_nlmsg_trim:
 	return -1;
 }
 
-/* under ife->tcf_lock */
-static void _tcf_ife_cleanup(struct tc_action *a)
+static void __tcf_ife_cleanup(struct tcf_ife_params *p)
 {
-	struct tcf_ife_info *ife = to_ife(a);
 	struct tcf_meta_info *e, *n;
 
-	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+	list_for_each_entry_safe(e, n, &p->metalist, metalist) {
 		list_del(&e->metalist);
 		if (e->metaval) {
 			if (e->ops->release)
@@ -422,18 +415,23 @@ static void _tcf_ife_cleanup(struct tc_action *a)
 	}
 }
 
+static void tcf_ife_cleanup_params(struct rcu_head *head)
+{
+	struct tcf_ife_params *p = container_of(head, struct tcf_ife_params,
+						rcu);
+
+	__tcf_ife_cleanup(p);
+	kfree(p);
+}
+
 static void tcf_ife_cleanup(struct tc_action *a)
 {
 	struct tcf_ife_info *ife = to_ife(a);
 	struct tcf_ife_params *p;
 
-	spin_lock_bh(&ife->tcf_lock);
-	_tcf_ife_cleanup(a);
-	spin_unlock_bh(&ife->tcf_lock);
-
 	p = rcu_dereference_protected(ife->params, 1);
 	if (p)
-		kfree_rcu(p, rcu);
+		call_rcu(&p->rcu, tcf_ife_cleanup_params);
 }
 
 static int load_metalist(struct nlattr **tb, bool rtnl_held)
@@ -455,8 +453,7 @@ static int load_metalist(struct nlattr **tb, bool rtnl_held)
 	return 0;
 }
 
-static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
-			     bool exists, bool rtnl_held)
+static int populate_metalist(struct tcf_ife_params *p, struct nlattr **tb)
 {
 	int len = 0;
 	int rc = 0;
@@ -468,7 +465,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
 			val = nla_data(tb[i]);
 			len = nla_len(tb[i]);
 
-			rc = add_metainfo(ife, i, val, len, exists);
+			rc = add_metainfo(p, i, val, len);
 			if (rc)
 				return rc;
 		}
@@ -523,6 +520,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 	p = kzalloc_obj(*p);
 	if (!p)
 		return -ENOMEM;
+	INIT_LIST_HEAD(&p->metalist);
 
 	if (tb[TCA_IFE_METALST]) {
 		err = nla_parse_nested_deprecated(tb2, IFE_META_MAX,
@@ -567,8 +565,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 	}
 
 	ife = to_ife(*a);
-	if (ret == ACT_P_CREATED)
-		INIT_LIST_HEAD(&ife->metalist);
 
 	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
 	if (err < 0)
@@ -600,8 +596,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (tb[TCA_IFE_METALST]) {
-		err = populate_metalist(ife, tb2, exists,
-					!(flags & TCA_ACT_FLAGS_NO_RTNL));
+		err = populate_metalist(p, tb2);
 		if (err)
 			goto metadata_parse_err;
 	} else {
@@ -610,7 +605,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 		 * as we can. You better have at least one else we are
 		 * going to bail out
 		 */
-		err = use_all_metadata(ife, exists);
+		err = use_all_metadata(p);
 		if (err)
 			goto metadata_parse_err;
 	}
@@ -626,13 +621,14 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
 	if (p)
-		kfree_rcu(p, rcu);
+		call_rcu(&p->rcu, tcf_ife_cleanup_params);
 
 	return ret;
 metadata_parse_err:
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
 release_idr:
+	__tcf_ife_cleanup(p);
 	kfree(p);
 	tcf_idr_release(*a, bind);
 	return err;
@@ -679,7 +675,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
 	if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type))
 		goto nla_put_failure;
 
-	if (dump_metalist(skb, ife)) {
+	if (dump_metalist(skb, p)) {
 		/*ignore failure to dump metalist */
 		pr_info("Failed to dump metalist\n");
 	}
@@ -693,13 +689,13 @@ nla_put_failure:
 	return -1;
 }
 
-static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
+static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_params *p,
 			      u16 metaid, u16 mlen, void *mdata)
 {
 	struct tcf_meta_info *e;
 
 	/* XXX: use hash to speed up */
-	list_for_each_entry(e, &ife->metalist, metalist) {
+	list_for_each_entry_rcu(e, &p->metalist, metalist) {
 		if (metaid == e->metaid) {
 			if (e->ops) {
 				/* We check for decode presence already */
@@ -716,10 +712,13 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
 {
 	struct tcf_ife_info *ife = to_ife(a);
 	int action = ife->tcf_action;
+	struct tcf_ife_params *p;
 	u8 *ifehdr_end;
 	u8 *tlv_data;
 	u16 metalen;
 
+	p = rcu_dereference_bh(ife->params);
+
 	bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb);
 	tcf_lastuse_update(&ife->tcf_tm);
 
@@ -745,7 +744,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
 			return TC_ACT_SHOT;
 		}
 
-		if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
+		if (find_decode_metaid(skb, p, mtype, dlen, curr_data)) {
 			/* abuse overlimits to count when we receive metadata
 			 * but dont have an ops for it
 			 */
@@ -769,12 +768,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
 /*XXX: check if we can do this at install time instead of current
  * send data path
 **/
-static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife)
+static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_params *p)
 {
-	struct tcf_meta_info *e, *n;
+	struct tcf_meta_info *e;
 	int tot_run_sz = 0, run_sz = 0;
 
-	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+	list_for_each_entry_rcu(e, &p->metalist, metalist) {
 		if (e->ops->check_presence) {
 			run_sz = e->ops->check_presence(skb, e);
 			tot_run_sz += run_sz;
@@ -795,7 +794,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 	   OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
 	   where ORIGDATA = original ethernet header ...
 	 */
-	u16 metalen = ife_get_sz(skb, ife);
+	u16 metalen = ife_get_sz(skb, p);
 	int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
 	unsigned int skboff = 0;
 	int new_len = skb->len + hdrm;
@@ -833,25 +832,21 @@ drop:
 	if (!ife_meta)
 		goto drop;
 
-	spin_lock(&ife->tcf_lock);
-
 	/* XXX: we dont have a clever way of telling encode to
 	 * not repeat some of the computations that are done by
 	 * ops->presence_check...
 	 */
-	list_for_each_entry(e, &ife->metalist, metalist) {
+	list_for_each_entry_rcu(e, &p->metalist, metalist) {
 		if (e->ops->encode) {
 			err = e->ops->encode(skb, (void *)(ife_meta + skboff),
 					     e);
 		}
 		if (err < 0) {
 			/* too corrupt to keep around if overwritten */
-			spin_unlock(&ife->tcf_lock);
 			goto drop;
 		}
 		skboff += err;
 	}
-	spin_unlock(&ife->tcf_lock);
 	oethh = (struct ethhdr *)skb->data;
 
 	if (!is_zero_ether_addr(p->eth_src))
-- 
cgit v1.2.3


From 16394d80539937d348dd3b9ea32415c54e67a81b Mon Sep 17 00:00:00 2001
From: Larysa Zaremba <larysa.zaremba@intel.com>
Date: Thu, 5 Mar 2026 12:12:43 +0100
Subject: xsk: introduce helper to determine rxq->frag_size

rxq->frag_size is basically a step between consecutive strictly aligned
frames. In ZC mode, chunk size fits exactly, but if chunks are unaligned,
there is no safe way to determine accessible space to grow tailroom.

Report frag_size to be zero, if chunks are unaligned, chunk_size otherwise.

Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Link: https://patch.msgid.link/20260305111253.2317394-3-larysa.zaremba@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/xdp_sock_drv.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index aefc368449d5..6b9ebae2dc95 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -51,6 +51,11 @@ static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool)
 	return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool);
 }
 
+static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool)
+{
+	return pool->unaligned ? 0 : xsk_pool_get_chunk_size(pool);
+}
+
 static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
 					 struct xdp_rxq_info *rxq)
 {
@@ -337,6 +342,11 @@ static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool)
 	return 0;
 }
 
+static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool)
+{
+	return 0;
+}
+
 static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
 					 struct xdp_rxq_info *rxq)
 {
-- 
cgit v1.2.3


From 75d9228982f23d68066ca0b7d87014c3eb8ddc85 Mon Sep 17 00:00:00 2001
From: Larysa Zaremba <larysa.zaremba@intel.com>
Date: Thu, 5 Mar 2026 12:12:48 +0100
Subject: libeth, idpf: use truesize as XDP RxQ info frag_size

The only user of frag_size field in XDP RxQ info is
bpf_xdp_frags_increase_tail(). It clearly expects whole buffer size instead
of DMA write size. Different assumptions in idpf driver configuration lead
to negative tailroom.

To make it worse, buffer sizes are not actually uniform in idpf when
splitq is enabled, as there are several buffer queues, so rxq->rx_buf_size
is meaningless in this case.

Use truesize of the first bufq in AF_XDP ZC, as there is only one. Disable
growing tail for regular splitq.

Fixes: ac8a861f632e ("idpf: prepare structures to support XDP")
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Link: https://patch.msgid.link/20260305111253.2317394-8-larysa.zaremba@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/intel/idpf/xdp.c   | 6 +++++-
 drivers/net/ethernet/intel/idpf/xsk.c   | 1 +
 drivers/net/ethernet/intel/libeth/xsk.c | 1 +
 include/net/libeth/xsk.h                | 3 +++
 4 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/idpf/xdp.c b/drivers/net/ethernet/intel/idpf/xdp.c
index 6ac9c6624c2a..cbccd4546768 100644
--- a/drivers/net/ethernet/intel/idpf/xdp.c
+++ b/drivers/net/ethernet/intel/idpf/xdp.c
@@ -47,12 +47,16 @@ static int __idpf_xdp_rxq_info_init(struct idpf_rx_queue *rxq, void *arg)
 {
 	const struct idpf_vport *vport = rxq->q_vector->vport;
 	const struct idpf_q_vec_rsrc *rsrc;
+	u32 frag_size = 0;
 	bool split;
 	int err;
 
+	if (idpf_queue_has(XSK, rxq))
+		frag_size = rxq->bufq_sets[0].bufq.truesize;
+
 	err = __xdp_rxq_info_reg(&rxq->xdp_rxq, vport->netdev, rxq->idx,
 				 rxq->q_vector->napi.napi_id,
-				 rxq->rx_buf_size);
+				 frag_size);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/intel/idpf/xsk.c b/drivers/net/ethernet/intel/idpf/xsk.c
index 676cbd80774d..d95d3efdfd36 100644
--- a/drivers/net/ethernet/intel/idpf/xsk.c
+++ b/drivers/net/ethernet/intel/idpf/xsk.c
@@ -403,6 +403,7 @@ int idpf_xskfq_init(struct idpf_buf_queue *bufq)
 	bufq->pending = fq.pending;
 	bufq->thresh = fq.thresh;
 	bufq->rx_buf_size = fq.buf_len;
+	bufq->truesize = fq.truesize;
 
 	if (!idpf_xskfq_refill(bufq))
 		netdev_err(bufq->pool->netdev,
diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c
index 846e902e31b6..4882951d5c9c 100644
--- a/drivers/net/ethernet/intel/libeth/xsk.c
+++ b/drivers/net/ethernet/intel/libeth/xsk.c
@@ -167,6 +167,7 @@ int libeth_xskfq_create(struct libeth_xskfq *fq)
 	fq->pending = fq->count;
 	fq->thresh = libeth_xdp_queue_threshold(fq->count);
 	fq->buf_len = xsk_pool_get_rx_frame_size(fq->pool);
+	fq->truesize = xsk_pool_get_rx_frag_step(fq->pool);
 
 	return 0;
 }
diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h
index 481a7b28e6f2..82b5d21aae87 100644
--- a/include/net/libeth/xsk.h
+++ b/include/net/libeth/xsk.h
@@ -597,6 +597,7 @@ __libeth_xsk_run_pass(struct libeth_xdp_buff *xdp,
  * @pending: current number of XSkFQEs to refill
  * @thresh: threshold below which the queue is refilled
  * @buf_len: HW-writeable length per each buffer
+ * @truesize: step between consecutive buffers, 0 if none exists
  * @nid: ID of the closest NUMA node with memory
  */
 struct libeth_xskfq {
@@ -614,6 +615,8 @@ struct libeth_xskfq {
 	u32			thresh;
 
 	u32			buf_len;
+	u32			truesize;
+
 	int			nid;
 };
 
-- 
cgit v1.2.3


From 1954c4f012206147c34acda8da04f827aa7d3ee3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 7 Mar 2026 20:07:15 +0000
Subject: eventpoll: Convert epoll_put_uevent() to scoped user access

Saves two function calls, and one stac/clac pair.

stac/clac is rather expensive on older cpus like Zen 2.

A synthetic network stress test gives a ~1.5% increase of pps
on AMD Zen 2.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/eventpoll.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index ccb478eb174b..ea9ca0e4172a 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -82,11 +82,14 @@ static inline struct epoll_event __user *
 epoll_put_uevent(__poll_t revents, __u64 data,
 		 struct epoll_event __user *uevent)
 {
-	if (__put_user(revents, &uevent->events) ||
-	    __put_user(data, &uevent->data))
-		return NULL;
-
+	scoped_user_write_access_size(uevent, sizeof(*uevent), efault) {
+		unsafe_put_user(revents, &uevent->events, efault);
+		unsafe_put_user(data, &uevent->data, efault);
+	}
 	return uevent+1;
+
+efault:
+	return NULL;
 }
 #endif
 
-- 
cgit v1.2.3