From 894af4a1cde61c3401f237184fb770f72ff12df8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 12 Apr 2025 13:56:01 +0200
Subject: objtool: Validate kCFI calls

Validate that all indirect calls adhere to kCFI rules. Notably doing
nocfi indirect call to a cfi function is broken.

Apparently some Rust 'core' code violates this and explodes when ran
with FineIBT.

All the ANNOTATE_NOCFI_SYM sites are prime targets for attackers.

 - runtime EFI is especially henous because it also needs to disable
   IBT. Basically calling unknown code without CFI protection at
   runtime is a massice security issue.

 - Kexec image handover; if you can exploit this, you get to keep it :-)

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Acked-by: Sean Christopherson <seanjc@google.com>
Link: https://lkml.kernel.org/r/20250714103441.496787279@infradead.org
---
 include/linux/objtool.h       | 10 ++++++++++
 include/linux/objtool_types.h |  1 +
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 366ad004d794..46ebaa46e6c5 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -184,6 +184,15 @@
  * WARN using UD2.
  */
 #define ANNOTATE_REACHABLE(label)	__ASM_ANNOTATE(label, ANNOTYPE_REACHABLE)
+/*
+ * This should not be used; it annotates away CFI violations. There are a few
+ * valid use cases like kexec handover to the next kernel image, and there is
+ * no security concern there.
+ *
+ * There are also a few real issues annotated away, like EFI because we can't
+ * control the EFI code.
+ */
+#define ANNOTATE_NOCFI_SYM(sym)		asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI))
 
 #else
 #define ANNOTATE_NOENDBR		ANNOTATE type=ANNOTYPE_NOENDBR
@@ -194,6 +203,7 @@
 #define ANNOTATE_INTRA_FUNCTION_CALL	ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL
 #define ANNOTATE_UNRET_BEGIN		ANNOTATE type=ANNOTYPE_UNRET_BEGIN
 #define ANNOTATE_REACHABLE		ANNOTATE type=ANNOTYPE_REACHABLE
+#define ANNOTATE_NOCFI_SYM		ANNOTATE type=ANNOTYPE_NOCFI
 #endif
 
 #if defined(CONFIG_NOINSTR_VALIDATION) && \
diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h
index df5d9fa84dba..aceac94632c8 100644
--- a/include/linux/objtool_types.h
+++ b/include/linux/objtool_types.h
@@ -65,5 +65,6 @@ struct unwind_hint {
 #define ANNOTYPE_IGNORE_ALTS		6
 #define ANNOTYPE_INTRA_FUNCTION_CALL	7
 #define ANNOTYPE_REACHABLE		8
+#define ANNOTYPE_NOCFI			9
 
 #endif /* _LINUX_OBJTOOL_TYPES_H */
-- 
cgit v1.2.3


From 038c7dc66e2744e5df57163b8f957745ae10d23e Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Wed, 3 Sep 2025 20:46:40 -0700
Subject: compiler_types.h: Move __nocfi out of compiler-specific header

Prepare for GCC KCFI support and move the __nocfi attribute from
compiler-clang.h to compiler_types.h. This was already gated by
CONFIG_CFI_CLANG, so this remains safe for non-KCFI GCC builds.

Signed-off-by: Kees Cook <kees@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/r/20250904034656.3670313-1-kees@kernel.org
---
 include/linux/compiler-clang.h | 5 -----
 include/linux/compiler_types.h | 4 +++-
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index fa4ffe037bc7..7a4568e421dc 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -96,11 +96,6 @@
 # define __noscs	__attribute__((__no_sanitize__("shadow-call-stack")))
 #endif
 
-#if __has_feature(kcfi)
-/* Disable CFI checking inside a function. */
-#define __nocfi		__attribute__((__no_sanitize__("kcfi")))
-#endif
-
 /*
  * Turn individual warnings and errors on and off locally, depending
  * on version.
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 16755431fc11..a910f9fa5341 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -432,7 +432,9 @@ struct ftrace_likely_data {
 # define __noscs
 #endif
 
-#ifndef __nocfi
+#if defined(CONFIG_CFI_CLANG)
+# define __nocfi		__attribute__((__no_sanitize__("kcfi")))
+#else
 # define __nocfi
 #endif
 
-- 
cgit v1.2.3


From 0b815825b1b0bd6762ca028e9b6631b002efb7ca Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Wed, 3 Sep 2025 20:46:45 -0700
Subject: x86/cfi: Remove __noinitretpoline and __noretpoline

Commit 66f793099a63 ("x86/retpoline: Avoid retpolines for built-in __init
functions") disabled retpolines in __init sections (__noinitretpoline)
as a precaution against potential issues with retpolines in early boot,
but it has not been a problem in practice (i.e. see Clang below).

Commit 87358710c1fb ("x86/retpoline: Support retpoline builds with Clang")
narrowed this to only GCC, as Clang doesn't have per-function control
over retpoline emission. As such, Clang has been booting with retpolines
in __init since retpoline support was introduced.

Clang KCFI has been instrumenting __init since CFI was introduced.

With the introduction of KCFI for GCC, KCFI instrumentation with
retpolines disabled means that objtool does not construct .retpoline_sites
section entries for the non-retpoline KCFI calls. At boot, the KCFI
rehashing code, via __apply_fineibt(), misses all __init KCFI calls
(since they are not retpolines), resulting in immediate hash mismatches:
all preambles are rehashed (via .cfi_sites) and none of the __init call
sites are rehashed.

Remove __noinitretpoline since it provides no meaningful utility and
creates problems with CFI. Additionally remove __noretpoline since it
is now unused.

Alternatively, cfi_rand_callers() could walk the .kcfi_traps section which
is exactly the list of KCFI instrumentation sites. But it seems better to
have as few differences in common instruction sequences between compilers
as possible, so better to remove the special handling of retpolines in
__init for GCC.

Signed-off-by: Kees Cook <kees@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250904034656.3670313-6-kees@kernel.org
---
 include/linux/compiler-gcc.h | 4 ----
 include/linux/init.h         | 8 --------
 2 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 5d07c469b571..5de824a0b3d7 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -35,10 +35,6 @@
 	(typeof(ptr)) (__ptr + (off));					\
 })
 
-#ifdef CONFIG_MITIGATION_RETPOLINE
-#define __noretpoline __attribute__((__indirect_branch__("keep")))
-#endif
-
 #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__)
 #define __latent_entropy __attribute__((latent_entropy))
 #endif
diff --git a/include/linux/init.h b/include/linux/init.h
index a60d32d227ee..17c1bc712e23 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -7,13 +7,6 @@
 #include <linux/stringify.h>
 #include <linux/types.h>
 
-/* Built-in __init functions needn't be compiled with retpoline */
-#if defined(__noretpoline) && !defined(MODULE)
-#define __noinitretpoline __noretpoline
-#else
-#define __noinitretpoline
-#endif
-
 /* These macros are used to mark some functions or 
  * initialized data (doesn't apply to uninitialized data)
  * as `initialization' functions. The kernel can take this
@@ -50,7 +43,6 @@
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
 #define __init		__section(".init.text") __cold __latent_entropy	\
-						__noinitretpoline	\
 						__no_kstack_erase
 #define __initdata	__section(".init.data")
 #define __initconst	__section(".init.rodata")
-- 
cgit v1.2.3


From 4e66293bb141df33d5eb1f922e16fe05913bf296 Mon Sep 17 00:00:00 2001
From: Bhanu Seshu Kumar Valluri <bhanuseshukumar@gmail.com>
Date: Wed, 1 Oct 2025 14:27:16 +0530
Subject: of: doc: Fix typo in doc comments.

synthetized => synthesized
definied => defined
sucess => success

Signed-off-by: Bhanu Seshu Kumar Valluri <bhanuseshukumar@gmail.com>
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 drivers/of/irq.c     | 2 +-
 drivers/of/overlay.c | 2 +-
 include/linux/of.h   | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index e7c12abd10ab..1ca66b6e267c 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -163,7 +163,7 @@ const __be32 *of_irq_parse_imap_parent(const __be32 *imap, int len, struct of_ph
  * @out_irq:	structure of_phandle_args updated by this function
  *
  * This function is a low-level interrupt tree walking function. It
- * can be used to do a partial walk with synthetized reg and interrupts
+ * can be used to do a partial walk with synthesized reg and interrupts
  * properties, for example when resolving PCI interrupts when no device
  * node exist for the parent. It takes an interrupt specifier structure as
  * input, walks the tree looking for any interrupt-map properties, translates
diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 1af6f52d0708..255e8362f600 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -135,7 +135,7 @@ static BLOCKING_NOTIFIER_HEAD(overlay_notify_chain);
  * @nb:		Notifier block to register
  *
  * Register for notification on overlay operations on device tree nodes. The
- * reported actions definied by @of_reconfig_change. The notifier callback
+ * reported actions defined by @of_reconfig_change. The notifier callback
  * furthermore receives a pointer to the affected device tree node.
  *
  * Note that a notifier callback is not supposed to store pointers to a device
diff --git a/include/linux/of.h b/include/linux/of.h
index 5e2c6ed9370a..121a288ca92d 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -1134,7 +1134,7 @@ static inline bool of_phandle_args_equal(const struct of_phandle_args *a1,
  * Search for a property in a device node and count the number of u8 elements
  * in it.
  *
- * Return: The number of elements on sucess, -EINVAL if the property does
+ * Return: The number of elements on success, -EINVAL if the property does
  * not exist or its length does not match a multiple of u8 and -ENODATA if the
  * property does not have a value.
  */
@@ -1153,7 +1153,7 @@ static inline int of_property_count_u8_elems(const struct device_node *np,
  * Search for a property in a device node and count the number of u16 elements
  * in it.
  *
- * Return: The number of elements on sucess, -EINVAL if the property does
+ * Return: The number of elements on success, -EINVAL if the property does
  * not exist or its length does not match a multiple of u16 and -ENODATA if the
  * property does not have a value.
  */
@@ -1172,7 +1172,7 @@ static inline int of_property_count_u16_elems(const struct device_node *np,
  * Search for a property in a device node and count the number of u32 elements
  * in it.
  *
- * Return: The number of elements on sucess, -EINVAL if the property does
+ * Return: The number of elements on success, -EINVAL if the property does
  * not exist or its length does not match a multiple of u32 and -ENODATA if the
  * property does not have a value.
  */
@@ -1191,7 +1191,7 @@ static inline int of_property_count_u32_elems(const struct device_node *np,
  * Search for a property in a device node and count the number of u64 elements
  * in it.
  *
- * Return: The number of elements on sucess, -EINVAL if the property does
+ * Return: The number of elements on success, -EINVAL if the property does
  * not exist or its length does not match a multiple of u64 and -ENODATA if the
  * property does not have a value.
  */
-- 
cgit v1.2.3


From 510d76646a6a7beaa49fc0da7282e285a3dfce97 Mon Sep 17 00:00:00 2001
From: Tang Yizhou <yizhou.tang@shopee.com>
Date: Tue, 23 Sep 2025 19:21:36 +0800
Subject: block: Update a comment of disk statistics

>From commit 074a7aca7afa ("block: move stats from disk to part0"),
we know that:

* {disk|all}_stat_*() are gone.

* disk_stat_lock/unlock() are renamed to part_stat_lock/unlock().

Therefore, outdated comments should be updated accordingly.

Fixes: 074a7aca7afa ("block: move stats from disk to part0")
Signed-off-by: Tang Yizhou <yizhou.tang@shopee.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/part_stat.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index eeeff2a04529..729415e91215 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -17,8 +17,8 @@ struct disk_stats {
 /*
  * Macros to operate on percpu disk statistics:
  *
- * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters and should
- * be called between disk_stat_lock() and disk_stat_unlock().
+ * part_stat_{add|sub|inc|dec}() modify the stat counters and should
+ * be called between part_stat_lock() and part_stat_unlock().
  *
  * part_stat_read() can be called at any time.
  */
-- 
cgit v1.2.3


From b71a6e2a1b710ea31c363a72c75f510ef35d8e69 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul@sk.com>
Date: Thu, 2 Oct 2025 17:12:35 +0900
Subject: i2c: rename wait_for_completion callback to wait_for_completion_cb

Functionally no change. Remove the ambiguity of 'wait_for_completion'.
It helps development of the DEPT dependency tracker, but seems favorable
in any case.

Signed-off-by: Byungchul Park <byungchul@sk.com>
[wsa: reworded commit message]
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 drivers/i2c/algos/i2c-algo-pca.c      | 2 +-
 drivers/i2c/busses/i2c-pca-isa.c      | 2 +-
 drivers/i2c/busses/i2c-pca-platform.c | 2 +-
 include/linux/i2c-algo-pca.h          | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/algos/i2c-algo-pca.c b/drivers/i2c/algos/i2c-algo-pca.c
index 74b66aec33d4..ee86df4cff4b 100644
--- a/drivers/i2c/algos/i2c-algo-pca.c
+++ b/drivers/i2c/algos/i2c-algo-pca.c
@@ -30,7 +30,7 @@ static int i2c_debug;
 #define pca_clock(adap) adap->i2c_clock
 #define pca_set_con(adap, val) pca_outw(adap, I2C_PCA_CON, val)
 #define pca_get_con(adap) pca_inw(adap, I2C_PCA_CON)
-#define pca_wait(adap) adap->wait_for_completion(adap->data)
+#define pca_wait(adap) adap->wait_for_completion_cb(adap->data)
 
 static void pca_reset(struct i2c_algo_pca_data *adap)
 {
diff --git a/drivers/i2c/busses/i2c-pca-isa.c b/drivers/i2c/busses/i2c-pca-isa.c
index 85e8cf58e8bf..0cbf2f509527 100644
--- a/drivers/i2c/busses/i2c-pca-isa.c
+++ b/drivers/i2c/busses/i2c-pca-isa.c
@@ -95,7 +95,7 @@ static struct i2c_algo_pca_data pca_isa_data = {
 	/* .data intentionally left NULL, not needed with ISA */
 	.write_byte		= pca_isa_writebyte,
 	.read_byte		= pca_isa_readbyte,
-	.wait_for_completion	= pca_isa_waitforcompletion,
+	.wait_for_completion_cb	= pca_isa_waitforcompletion,
 	.reset_chip		= pca_isa_resetchip,
 };
 
diff --git a/drivers/i2c/busses/i2c-pca-platform.c b/drivers/i2c/busses/i2c-pca-platform.c
index 87da8241b927..c0f35ebbe37d 100644
--- a/drivers/i2c/busses/i2c-pca-platform.c
+++ b/drivers/i2c/busses/i2c-pca-platform.c
@@ -180,7 +180,7 @@ static int i2c_pca_pf_probe(struct platform_device *pdev)
 	}
 
 	i2c->algo_data.data = i2c;
-	i2c->algo_data.wait_for_completion = i2c_pca_pf_waitforcompletion;
+	i2c->algo_data.wait_for_completion_cb = i2c_pca_pf_waitforcompletion;
 	if (i2c->gpio)
 		i2c->algo_data.reset_chip = i2c_pca_pf_resetchip;
 	else
diff --git a/include/linux/i2c-algo-pca.h b/include/linux/i2c-algo-pca.h
index 7c522fdd9ea7..e305bf32e40a 100644
--- a/include/linux/i2c-algo-pca.h
+++ b/include/linux/i2c-algo-pca.h
@@ -71,7 +71,7 @@ struct i2c_algo_pca_data {
 	void 				*data;	/* private low level data */
 	void (*write_byte)		(void *data, int reg, int val);
 	int  (*read_byte)		(void *data, int reg);
-	int  (*wait_for_completion)	(void *data);
+	int  (*wait_for_completion_cb)	(void *data);
 	void (*reset_chip)		(void *data);
 	/* For PCA9564, use one of the predefined frequencies:
 	 * 330000, 288000, 217000, 146000, 88000, 59000, 44000, 36000
-- 
cgit v1.2.3


From 1ed06c83506ecaaf1836ddeb7c65772ff86d8d53 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 7 Oct 2025 11:06:25 +0200
Subject: block: remove bio_iov_iter_get_pages

Switch the only caller to bio_iov_iter_get_pages, and explain why it does
not have any alignment requirements.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-map.c     | 6 +++++-
 include/linux/bio.h | 5 -----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-map.c b/block/blk-map.c
index 165f2234f00f..6cce652c7fa6 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -283,7 +283,11 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
 	bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask);
 	if (!bio)
 		return -ENOMEM;
-	ret = bio_iov_iter_get_pages(bio, iter);
+	/*
+	 * No alignment requirements on our part to support arbitrary
+	 * passthrough commands.
+	 */
+	ret = bio_iov_iter_get_pages_aligned(bio, iter, 0);
 	if (ret)
 		goto out_put;
 	ret = blk_rq_append_bio(rq, bio);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index a64a30131031..b01dae9506de 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -449,11 +449,6 @@ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
 int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter,
 		unsigned len_align_mask);
 
-static inline int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
-{
-	return bio_iov_iter_get_pages_aligned(bio, iter, 0);
-}
-
 void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter);
 void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
-- 
cgit v1.2.3


From 82dd5d763c9b718e2d655b9565e0a06a91bb83dc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 7 Oct 2025 11:06:26 +0200
Subject: block: rename bio_iov_iter_get_pages_aligned to
 bio_iov_iter_get_pages

Now that the bio_iov_iter_get_pages is free again, use it instead of
the more complicated now.  Also drop the unused export.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c            | 5 ++---
 block/blk-map.c        | 2 +-
 include/linux/bio.h    | 2 +-
 include/linux/blkdev.h | 2 +-
 4 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 3a1a848940dd..b3a79285c278 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1316,7 +1316,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
 }
 
 /**
- * bio_iov_iter_get_pages_aligned - add user or kernel pages to a bio
+ * bio_iov_iter_get_pages - add user or kernel pages to a bio
  * @bio: bio to add pages to
  * @iter: iov iterator describing the region to be added
  * @len_align_mask: the mask to align the total size to, 0 for any length
@@ -1336,7 +1336,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
  * MM encounters an error pinning the requested pages, it stops. Error
  * is returned only if 0 pages could be pinned.
  */
-int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter,
+int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
 			   unsigned len_align_mask)
 {
 	int ret = 0;
@@ -1360,7 +1360,6 @@ int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter,
 		return bio_iov_iter_align_down(bio, iter, len_align_mask);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages_aligned);
 
 static void submit_bio_wait_endio(struct bio *bio)
 {
diff --git a/block/blk-map.c b/block/blk-map.c
index 6cce652c7fa6..60faf036fb6e 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -287,7 +287,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
 	 * No alignment requirements on our part to support arbitrary
 	 * passthrough commands.
 	 */
-	ret = bio_iov_iter_get_pages_aligned(bio, iter, 0);
+	ret = bio_iov_iter_get_pages(bio, iter, 0);
 	if (ret)
 		goto out_put;
 	ret = blk_rq_append_bio(rq, bio);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index b01dae9506de..16c1c85613b7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -446,7 +446,7 @@ int submit_bio_wait(struct bio *bio);
 int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
 		size_t len, enum req_op op);
 
-int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter,
+int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
 		unsigned len_align_mask);
 
 void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 066e5309bd45..c97e8b0e67b6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1876,7 +1876,7 @@ static inline int bio_split_rw_at(struct bio *bio,
 static inline int bio_iov_iter_get_bdev_pages(struct bio *bio,
 		struct iov_iter *iter, struct block_device *bdev)
 {
-	return bio_iov_iter_get_pages_aligned(bio, iter,
+	return bio_iov_iter_get_pages(bio, iter,
 					bdev_logical_block_size(bdev) - 1);
 }
 
-- 
cgit v1.2.3


From 506aa235f6e0baa00bf792df82a5e9f618b7a5d8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 7 Oct 2025 11:06:28 +0200
Subject: block: move bio_iov_iter_get_bdev_pages to block/fops.c

Keep bio_iov_iter_get_bdev_pages local with the callers, as blindly
looking at the bdev logical block size is often not the best idea
unless on a block device.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/fops.c           | 13 ++++++++++---
 include/linux/blkdev.h |  7 -------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/block/fops.c b/block/fops.c
index c2c0396ea9ee..5e3db9fead77 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -43,6 +43,13 @@ static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
 			(bdev_logical_block_size(bdev) - 1);
 }
 
+static inline int blkdev_iov_iter_get_pages(struct bio *bio,
+		struct iov_iter *iter, struct block_device *bdev)
+{
+	return bio_iov_iter_get_pages(bio, iter,
+			bdev_logical_block_size(bdev) - 1);
+}
+
 #define DIO_INLINE_BIO_VECS 4
 
 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
@@ -78,7 +85,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 	if (iocb->ki_flags & IOCB_ATOMIC)
 		bio.bi_opf |= REQ_ATOMIC;
 
-	ret = bio_iov_iter_get_bdev_pages(&bio, iter, bdev);
+	ret = blkdev_iov_iter_get_pages(&bio, iter, bdev);
 	if (unlikely(ret))
 		goto out;
 	ret = bio.bi_iter.bi_size;
@@ -212,7 +219,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		bio->bi_end_io = blkdev_bio_end_io;
 		bio->bi_ioprio = iocb->ki_ioprio;
 
-		ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev);
+		ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
 		if (unlikely(ret)) {
 			bio->bi_status = BLK_STS_IOERR;
 			bio_endio(bio);
@@ -348,7 +355,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 		 */
 		bio_iov_bvec_set(bio, iter);
 	} else {
-		ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev);
+		ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
 		if (unlikely(ret))
 			goto out_bio_put;
 	}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c97e8b0e67b6..d4db5039836d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1873,13 +1873,6 @@ static inline int bio_split_rw_at(struct bio *bio,
 	return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment);
 }
 
-static inline int bio_iov_iter_get_bdev_pages(struct bio *bio,
-		struct iov_iter *iter, struct block_device *bdev)
-{
-	return bio_iov_iter_get_pages(bio, iter,
-					bdev_logical_block_size(bdev) - 1);
-}
-
 #define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }
 
 #endif /* _LINUX_BLKDEV_H */
-- 
cgit v1.2.3


From 8375b76517cb52bac0903071feedc218c45d74d2 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Sun, 21 Sep 2025 08:44:56 +0300
Subject: kho: replace kho_preserve_phys() with kho_preserve_pages()

to make it clear that KHO operates on pages rather than on a random
physical address.

The kho_preserve_pages() will be also used in upcoming support for vmalloc
preservation.

Link: https://lkml.kernel.org/r/20250921054458.4043761-3-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec_handover.h |  5 +++--
 kernel/kexec_handover.c        | 25 +++++++++++--------------
 mm/memblock.c                  |  4 +++-
 3 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index 559d13a3bc44..cec663b39861 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -18,6 +18,7 @@ enum kho_event {
 
 struct folio;
 struct notifier_block;
+struct page;
 
 #define DECLARE_KHOSER_PTR(name, type) \
 	union {                        \
@@ -43,7 +44,7 @@ bool kho_is_enabled(void);
 bool is_kho_boot(void);
 
 int kho_preserve_folio(struct folio *folio);
-int kho_preserve_phys(phys_addr_t phys, size_t size);
+int kho_preserve_pages(struct page *page, unsigned int nr_pages);
 struct folio *kho_restore_folio(phys_addr_t phys);
 int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
 int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
@@ -71,7 +72,7 @@ static inline int kho_preserve_folio(struct folio *folio)
 	return -EOPNOTSUPP;
 }
 
-static inline int kho_preserve_phys(phys_addr_t phys, size_t size)
+static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index b8d0d63f6145..1c44a55f758e 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -725,26 +725,23 @@ int kho_preserve_folio(struct folio *folio)
 EXPORT_SYMBOL_GPL(kho_preserve_folio);
 
 /**
- * kho_preserve_phys - preserve a physically contiguous range across kexec.
- * @phys: physical address of the range.
- * @size: size of the range.
+ * kho_preserve_pages - preserve contiguous pages across kexec
+ * @page: first page in the list.
+ * @nr_pages: number of pages.
  *
- * Instructs KHO to preserve the memory range from @phys to @phys + @size
- * across kexec.
+ * Preserve a contiguous list of order 0 pages. Must be restored using
+ * kho_restore_pages() to ensure the pages are restored properly as order 0.
  *
  * Return: 0 on success, error code on failure
  */
-int kho_preserve_phys(phys_addr_t phys, size_t size)
+int kho_preserve_pages(struct page *page, unsigned int nr_pages)
 {
-	unsigned long pfn = PHYS_PFN(phys);
+	struct kho_mem_track *track = &kho_out.ser.track;
+	const unsigned long start_pfn = page_to_pfn(page);
+	const unsigned long end_pfn = start_pfn + nr_pages;
+	unsigned long pfn = start_pfn;
 	unsigned long failed_pfn = 0;
-	const unsigned long start_pfn = pfn;
-	const unsigned long end_pfn = PHYS_PFN(phys + size);
 	int err = 0;
-	struct kho_mem_track *track = &kho_out.ser.track;
-
-	if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
-		return -EINVAL;
 
 	while (pfn < end_pfn) {
 		const unsigned int order =
@@ -764,7 +761,7 @@ int kho_preserve_phys(phys_addr_t phys, size_t size)
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(kho_preserve_phys);
+EXPORT_SYMBOL_GPL(kho_preserve_pages);
 
 /* Handling for debug/kho/out */
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 120a501a887a..e23e16618e9b 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2452,8 +2452,10 @@ static int reserve_mem_kho_finalize(struct kho_serialization *ser)
 
 	for (i = 0; i < reserved_mem_count; i++) {
 		struct reserve_mem_table *map = &reserved_mem_table[i];
+		struct page *page = phys_to_page(map->start);
+		unsigned int nr_pages = map->size >> PAGE_SHIFT;
 
-		err |= kho_preserve_phys(map->start, map->size);
+		err |= kho_preserve_pages(page, nr_pages);
 	}
 
 	err |= kho_preserve_folio(page_folio(kho_fdt));
-- 
cgit v1.2.3


From a667300bd53f272a3055238bcefe108f88836270 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Sun, 21 Sep 2025 08:44:57 +0300
Subject: kho: add support for preserving vmalloc allocations

A vmalloc allocation is preserved using binary structure similar to global
KHO memory tracker.  It's a linked list of pages where each page is an
array of physical address of pages in vmalloc area.

kho_preserve_vmalloc() hands out the physical address of the head page to
the caller.  This address is used as the argument to kho_vmalloc_restore()
to restore the mapping in the vmalloc address space and populate it with
the preserved pages.

[pasha.tatashin@soleen.com: free chunks using free_page() not kfree()]
  Link: https://lkml.kernel.org/r/mafs0a52idbeg.fsf@kernel.org
[akpm@linux-foundation.org: coding-style cleanups]
Link: https://lkml.kernel.org/r/20250921054458.4043761-4-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec_handover.h |  28 ++++
 kernel/kexec_handover.c        | 281 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 309 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index cec663b39861..25042c1d8d54 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -39,13 +39,24 @@ struct page;
 
 struct kho_serialization;
 
+struct kho_vmalloc_chunk;
+struct kho_vmalloc {
+	DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *);
+	unsigned int total_pages;
+	unsigned short flags;
+	unsigned short order;
+};
+
 #ifdef CONFIG_KEXEC_HANDOVER
 bool kho_is_enabled(void);
 bool is_kho_boot(void);
 
 int kho_preserve_folio(struct folio *folio);
 int kho_preserve_pages(struct page *page, unsigned int nr_pages);
+int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation);
 struct folio *kho_restore_folio(phys_addr_t phys);
+struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages);
+void *kho_restore_vmalloc(const struct kho_vmalloc *preservation);
 int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
 int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
 
@@ -77,11 +88,28 @@ static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages)
 	return -EOPNOTSUPP;
 }
 
+static inline int kho_preserve_vmalloc(void *ptr,
+				       struct kho_vmalloc *preservation)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct folio *kho_restore_folio(phys_addr_t phys)
 {
 	return NULL;
 }
 
+static inline struct page *kho_restore_pages(phys_addr_t phys,
+					     unsigned int nr_pages)
+{
+	return NULL;
+}
+
+static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
+{
+	return NULL;
+}
+
 static inline int kho_add_subtree(struct kho_serialization *ser,
 				  const char *name, void *fdt)
 {
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 1c44a55f758e..76f0940fb485 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -18,6 +18,7 @@
 #include <linux/memblock.h>
 #include <linux/notifier.h>
 #include <linux/page-isolation.h>
+#include <linux/vmalloc.h>
 
 #include <asm/early_ioremap.h>
 
@@ -274,6 +275,37 @@ struct folio *kho_restore_folio(phys_addr_t phys)
 }
 EXPORT_SYMBOL_GPL(kho_restore_folio);
 
+/**
+ * kho_restore_pages - restore list of contiguous order 0 pages.
+ * @phys: physical address of the first page.
+ * @nr_pages: number of pages.
+ *
+ * Restore a contiguous list of order 0 pages that was preserved with
+ * kho_preserve_pages().
+ *
+ * Return: 0 on success, error code on failure
+ */
+struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
+{
+	const unsigned long start_pfn = PHYS_PFN(phys);
+	const unsigned long end_pfn = start_pfn + nr_pages;
+	unsigned long pfn = start_pfn;
+
+	while (pfn < end_pfn) {
+		const unsigned int order =
+			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+		struct page *page = kho_restore_page(PFN_PHYS(pfn));
+
+		if (!page)
+			return NULL;
+		split_page(page, order);
+		pfn += 1 << order;
+	}
+
+	return pfn_to_page(start_pfn);
+}
+EXPORT_SYMBOL_GPL(kho_restore_pages);
+
 /* Serialize and deserialize struct kho_mem_phys across kexec
  *
  * Record all the bitmaps in a linked list of pages for the next kernel to
@@ -763,6 +795,255 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages)
 }
 EXPORT_SYMBOL_GPL(kho_preserve_pages);
 
+struct kho_vmalloc_hdr {
+	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
+};
+
+#define KHO_VMALLOC_SIZE				\
+	((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
+	 sizeof(phys_addr_t))
+
+struct kho_vmalloc_chunk {
+	struct kho_vmalloc_hdr hdr;
+	phys_addr_t phys[KHO_VMALLOC_SIZE];
+};
+
+static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);
+
+/* vmalloc flags KHO supports */
+#define KHO_VMALLOC_SUPPORTED_FLAGS	(VM_ALLOC | VM_ALLOW_HUGE_VMAP)
+
+/* KHO internal flags for vmalloc preservations */
+#define KHO_VMALLOC_ALLOC	0x0001
+#define KHO_VMALLOC_HUGE_VMAP	0x0002
+
+static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags)
+{
+	unsigned short kho_flags = 0;
+
+	if (vm_flags & VM_ALLOC)
+		kho_flags |= KHO_VMALLOC_ALLOC;
+	if (vm_flags & VM_ALLOW_HUGE_VMAP)
+		kho_flags |= KHO_VMALLOC_HUGE_VMAP;
+
+	return kho_flags;
+}
+
+static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags)
+{
+	unsigned int vm_flags = 0;
+
+	if (kho_flags & KHO_VMALLOC_ALLOC)
+		vm_flags |= VM_ALLOC;
+	if (kho_flags & KHO_VMALLOC_HUGE_VMAP)
+		vm_flags |= VM_ALLOW_HUGE_VMAP;
+
+	return vm_flags;
+}
+
+static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
+{
+	struct kho_vmalloc_chunk *chunk;
+	int err;
+
+	chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL);
+	if (!chunk)
+		return NULL;
+
+	err = kho_preserve_pages(virt_to_page(chunk), 1);
+	if (err)
+		goto err_free;
+	if (cur)
+		KHOSER_STORE_PTR(cur->hdr.next, chunk);
+	return chunk;
+
+err_free:
+	free_page((unsigned long)chunk);
+	return NULL;
+}
+
+static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk)
+{
+	struct kho_mem_track *track = &kho_out.ser.track;
+	unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
+
+	__kho_unpreserve(track, pfn, pfn + 1);
+
+	for (int i = 0; chunk->phys[i]; i++) {
+		pfn = PHYS_PFN(chunk->phys[i]);
+		__kho_unpreserve(track, pfn, pfn + 1);
+	}
+}
+
+static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc)
+{
+	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first);
+
+	while (chunk) {
+		struct kho_vmalloc_chunk *tmp = chunk;
+
+		kho_vmalloc_unpreserve_chunk(chunk);
+
+		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+		free_page((unsigned long)tmp);
+	}
+}
+
+/**
+ * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
+ * @ptr: pointer to the area in vmalloc address space
+ * @preservation: placeholder for preservation metadata
+ *
+ * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
+ * physical pages mapped at @ptr will be preserved and on successful return
+ * @preservation will hold the physical address of a structure that describes
+ * the preservation.
+ *
+ * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably
+ * restored on the same node
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
+{
+	struct kho_vmalloc_chunk *chunk;
+	struct vm_struct *vm = find_vm_area(ptr);
+	unsigned int order, flags, nr_contig_pages;
+	unsigned int idx = 0;
+	int err;
+
+	if (!vm)
+		return -EINVAL;
+
+	if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
+		return -EOPNOTSUPP;
+
+	flags = vmalloc_flags_to_kho(vm->flags);
+	order = get_vm_area_page_order(vm);
+
+	chunk = new_vmalloc_chunk(NULL);
+	if (!chunk)
+		return -ENOMEM;
+	KHOSER_STORE_PTR(preservation->first, chunk);
+
+	nr_contig_pages = (1 << order);
+	for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) {
+		phys_addr_t phys = page_to_phys(vm->pages[i]);
+
+		err = kho_preserve_pages(vm->pages[i], nr_contig_pages);
+		if (err)
+			goto err_free;
+
+		chunk->phys[idx++] = phys;
+		if (idx == ARRAY_SIZE(chunk->phys)) {
+			chunk = new_vmalloc_chunk(chunk);
+			if (!chunk)
+				goto err_free;
+			idx = 0;
+		}
+	}
+
+	preservation->total_pages = vm->nr_pages;
+	preservation->flags = flags;
+	preservation->order = order;
+
+	return 0;
+
+err_free:
+	kho_vmalloc_free_chunks(preservation);
+	return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
+
+/**
+ * kho_restore_vmalloc - recreates and populates an area in vmalloc address
+ * space from the preserved memory.
+ * @preservation: preservation metadata.
+ *
+ * Recreates an area in vmalloc address space and populates it with memory that
+ * was preserved using kho_preserve_vmalloc().
+ *
+ * Return: pointer to the area in the vmalloc address space, NULL on failure.
+ */
+void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
+{
+	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
+	unsigned int align, order, shift, vm_flags;
+	unsigned long total_pages, contig_pages;
+	unsigned long addr, size;
+	struct vm_struct *area;
+	struct page **pages;
+	unsigned int idx = 0;
+	int err;
+
+	vm_flags = kho_flags_to_vmalloc(preservation->flags);
+	if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
+		return NULL;
+
+	total_pages = preservation->total_pages;
+	pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return NULL;
+	order = preservation->order;
+	contig_pages = (1 << order);
+	shift = PAGE_SHIFT + order;
+	align = 1 << shift;
+
+	while (chunk) {
+		struct page *page;
+
+		for (int i = 0; chunk->phys[i]; i++) {
+			phys_addr_t phys = chunk->phys[i];
+
+			if (idx + contig_pages > total_pages)
+				goto err_free_pages_array;
+
+			page = kho_restore_pages(phys, contig_pages);
+			if (!page)
+				goto err_free_pages_array;
+
+			for (int j = 0; j < contig_pages; j++)
+				pages[idx++] = page;
+
+			phys += contig_pages * PAGE_SIZE;
+		}
+
+		page = kho_restore_pages(virt_to_phys(chunk), 1);
+		if (!page)
+			goto err_free_pages_array;
+		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+		__free_page(page);
+	}
+
+	if (idx != total_pages)
+		goto err_free_pages_array;
+
+	area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift,
+				  vm_flags, VMALLOC_START, VMALLOC_END,
+				  NUMA_NO_NODE, GFP_KERNEL,
+				  __builtin_return_address(0));
+	if (!area)
+		goto err_free_pages_array;
+
+	addr = (unsigned long)area->addr;
+	size = get_vm_area_size(area);
+	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
+	if (err)
+		goto err_free_vm_area;
+
+	area->nr_pages = total_pages;
+	area->pages = pages;
+
+	return area->addr;
+
+err_free_vm_area:
+	free_vm_area(area);
+err_free_pages_array:
+	kvfree(pages);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
+
 /* Handling for debug/kho/out */
 
 static struct dentry *debugfs_root;
-- 
cgit v1.2.3


From fcc0669c5aa681994c507b50f1c706c969d99730 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Mon, 22 Sep 2025 15:02:03 -0700
Subject: memcg: skip cgroup_file_notify if spinning is not allowed

Generally memcg charging is allowed from all the contexts including NMI
where even spinning on spinlock can cause locking issues.  However one
call chain was missed during the addition of memcg charging from any
context support.  That is try_charge_memcg() -> memcg_memory_event() ->
cgroup_file_notify().

The possible function call tree under cgroup_file_notify() can acquire
many different spin locks in spinning mode.  Some of them are
cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock.  So, let's
just skip cgroup_file_notify() from memcg charging if the context does not
allow spinning.

Alternative approach was also explored where instead of skipping
cgroup_file_notify(), we defer the memcg event processing to irq_work [1].
However it adds complexity and it was decided to keep things simple until
we need more memcg events with !allow_spinning requirement.

Link: https://lore.kernel.org/all/5qi2llyzf7gklncflo6gxoozljbm4h3tpnuv4u4ej4ztysvi6f@x44v7nz2wdzd/ [1]
Link: https://lkml.kernel.org/r/20250922220203.261714-1-shakeel.butt@linux.dev
Fixes: 3ac4638a734a ("memcg: make memcg_rstat_updated nmi safe")
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Closes: https://lore.kernel.org/all/20250905061919.439648-1-yepeilin@google.com/
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peilin Ye <yepeilin@google.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 26 +++++++++++++++++++-------
 mm/memcontrol.c            |  7 ++++---
 2 files changed, 23 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 16fe0306e50e..873e510d6f8d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1001,22 +1001,28 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
 	count_memcg_events_mm(mm, idx, 1);
 }
 
-static inline void memcg_memory_event(struct mem_cgroup *memcg,
-				      enum memcg_memory_event event)
+static inline void __memcg_memory_event(struct mem_cgroup *memcg,
+					enum memcg_memory_event event,
+					bool allow_spinning)
 {
 	bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
 			  event == MEMCG_SWAP_FAIL;
 
+	/* For now only MEMCG_MAX can happen with !allow_spinning context. */
+	VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX);
+
 	atomic_long_inc(&memcg->memory_events_local[event]);
-	if (!swap_event)
+	if (!swap_event && allow_spinning)
 		cgroup_file_notify(&memcg->events_local_file);
 
 	do {
 		atomic_long_inc(&memcg->memory_events[event]);
-		if (swap_event)
-			cgroup_file_notify(&memcg->swap_events_file);
-		else
-			cgroup_file_notify(&memcg->events_file);
+		if (allow_spinning) {
+			if (swap_event)
+				cgroup_file_notify(&memcg->swap_events_file);
+			else
+				cgroup_file_notify(&memcg->events_file);
+		}
 
 		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 			break;
@@ -1026,6 +1032,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
 		 !mem_cgroup_is_root(memcg));
 }
 
+static inline void memcg_memory_event(struct mem_cgroup *memcg,
+				      enum memcg_memory_event event)
+{
+	__memcg_memory_event(memcg, event, true);
+}
+
 static inline void memcg_memory_event_mm(struct mm_struct *mm,
 					 enum memcg_memory_event event)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e090f29eb03b..4deda33625f4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2307,12 +2307,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	bool drained = false;
 	bool raised_max_event = false;
 	unsigned long pflags;
+	bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
 
 retry:
 	if (consume_stock(memcg, nr_pages))
 		return 0;
 
-	if (!gfpflags_allow_spinning(gfp_mask))
+	if (!allow_spinning)
 		/* Avoid the refill and flush of the older stock */
 		batch = nr_pages;
 
@@ -2348,7 +2349,7 @@ retry:
 	if (!gfpflags_allow_blocking(gfp_mask))
 		goto nomem;
 
-	memcg_memory_event(mem_over_limit, MEMCG_MAX);
+	__memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
 	raised_max_event = true;
 
 	psi_memstall_enter(&pflags);
@@ -2415,7 +2416,7 @@ force:
 	 * a MEMCG_MAX event.
 	 */
 	if (!raised_max_event)
-		memcg_memory_event(mem_over_limit, MEMCG_MAX);
+		__memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
 
 	/*
 	 * The allocation either can't fail or will lead to more memory
-- 
cgit v1.2.3


From f04aad36a07cc17b7a5d5b9a2d386ce6fae63e93 Mon Sep 17 00:00:00 2001
From: Jakub Acs <acsjakub@amazon.de>
Date: Wed, 1 Oct 2025 09:03:52 +0000
Subject: mm/ksm: fix flag-dropping behavior in ksm_madvise

syzkaller discovered the following crash: (kernel BUG)

[   44.607039] ------------[ cut here ]------------
[   44.607422] kernel BUG at mm/userfaultfd.c:2067!
[   44.608148] Oops: invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN NOPTI
[   44.608814] CPU: 1 UID: 0 PID: 2475 Comm: reproducer Not tainted 6.16.0-rc6 #1 PREEMPT(none)
[   44.609635] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[   44.610695] RIP: 0010:userfaultfd_release_all+0x3a8/0x460

<snip other registers, drop unreliable trace>

[   44.617726] Call Trace:
[   44.617926]  <TASK>
[   44.619284]  userfaultfd_release+0xef/0x1b0
[   44.620976]  __fput+0x3f9/0xb60
[   44.621240]  fput_close_sync+0x110/0x210
[   44.622222]  __x64_sys_close+0x8f/0x120
[   44.622530]  do_syscall_64+0x5b/0x2f0
[   44.622840]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   44.623244] RIP: 0033:0x7f365bb3f227

Kernel panics because it detects UFFD inconsistency during
userfaultfd_release_all().  Specifically, a VMA which has a valid pointer
to vma->vm_userfaultfd_ctx, but no UFFD flags in vma->vm_flags.

The inconsistency is caused in ksm_madvise(): when user calls madvise()
with MADV_UNMEARGEABLE on a VMA that is registered for UFFD in MINOR mode,
it accidentally clears all flags stored in the upper 32 bits of
vma->vm_flags.

Assuming x86_64 kernel build, unsigned long is 64-bit and unsigned int and
int are 32-bit wide.  This setup causes the following mishap during the &=
~VM_MERGEABLE assignment.

VM_MERGEABLE is a 32-bit constant of type unsigned int, 0x8000'0000.
After ~ is applied, it becomes 0x7fff'ffff unsigned int, which is then
promoted to unsigned long before the & operation.  This promotion fills
upper 32 bits with leading 0s, as we're doing unsigned conversion (and
even for a signed conversion, this wouldn't help as the leading bit is 0).
& operation thus ends up AND-ing vm_flags with 0x0000'0000'7fff'ffff
instead of intended 0xffff'ffff'7fff'ffff and hence accidentally clears
the upper 32-bits of its value.

Fix it by changing `VM_MERGEABLE` constant to unsigned long, using the
BIT() macro.

Note: other VM_* flags are not affected: This only happens to the
VM_MERGEABLE flag, as the other VM_* flags are all constants of type int
and after ~ operation, they end up with leading 1 and are thus converted
to unsigned long with leading 1s.

Note 2:
After commit 31defc3b01d9 ("userfaultfd: remove (VM_)BUG_ON()s"), this is
no longer a kernel BUG, but a WARNING at the same place:

[   45.595973] WARNING: CPU: 1 PID: 2474 at mm/userfaultfd.c:2067

but the root-cause (flag-drop) remains the same.

[akpm@linux-foundation.org: rust bindgen wasn't able to handle BIT(), from Miguel]
  Link: https://lore.kernel.org/oe-kbuild-all/202510030449.VfSaAjvd-lkp@intel.com/
Link: https://lkml.kernel.org/r/20251001090353.57523-2-acsjakub@amazon.de
Fixes: 7677f7fd8be7 ("userfaultfd: add minor fault registration mode")
Signed-off-by: Jakub Acs <acsjakub@amazon.de>
Signed-off-by: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: SeongJae Park <sj@kernel.org>
Tested-by: Alice Ryhl <aliceryhl@google.com>
Tested-by: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
Cc: Xu Xin <xu.xin16@zte.com.cn>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h              | 2 +-
 rust/bindings/bindings_helper.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 06978b4dbeb8..70a2a76007d4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -323,7 +323,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
 #define VM_HUGEPAGE	0x20000000	/* MADV_HUGEPAGE marked this vma */
 #define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
-#define VM_MERGEABLE	0x80000000	/* KSM may merge identical pages */
+#define VM_MERGEABLE	BIT(31)		/* KSM may merge identical pages */
 
 #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
 #define VM_HIGH_ARCH_BIT_0	32	/* bit only usable on 64-bit architectures */
diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
index 04b75d4d01c3..2e43c66635a2 100644
--- a/rust/bindings/bindings_helper.h
+++ b/rust/bindings/bindings_helper.h
@@ -108,6 +108,7 @@ const xa_mark_t RUST_CONST_HELPER_XA_PRESENT = XA_PRESENT;
 
 const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC = XA_FLAGS_ALLOC;
 const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC1 = XA_FLAGS_ALLOC1;
+const vm_flags_t RUST_CONST_HELPER_VM_MERGEABLE = VM_MERGEABLE;
 
 #if IS_ENABLED(CONFIG_ANDROID_BINDER_IPC_RUST)
 #include "../../drivers/android/binder/rust_binder.h"
-- 
cgit v1.2.3


From 27c0a7b05d13a0dc54ed0b95fc12218210fdea1a Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Thu, 31 Jul 2025 12:02:27 -0700
Subject: libceph: Use HMAC-SHA256 library instead of crypto_shash

Use the HMAC-SHA256 library functions instead of crypto_shash.  This is
simpler and faster.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h |  4 ++-
 net/ceph/Kconfig               |  3 +-
 net/ceph/messenger_v2.c        | 77 ++++++++++++------------------------------
 3 files changed, 26 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 1717cc57cdac..4b49592a738f 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -2,6 +2,7 @@
 #ifndef __FS_CEPH_MESSENGER_H
 #define __FS_CEPH_MESSENGER_H
 
+#include <crypto/sha2.h>
 #include <linux/bvec.h>
 #include <linux/crypto.h>
 #include <linux/kref.h>
@@ -412,7 +413,8 @@ struct ceph_connection_v2_info {
 	struct ceph_msg_data_cursor in_cursor;
 	struct ceph_msg_data_cursor out_cursor;
 
-	struct crypto_shash *hmac_tfm;  /* post-auth signature */
+	struct hmac_sha256_key hmac_key;  /* post-auth signature */
+	bool hmac_key_set;
 	struct crypto_aead *gcm_tfm;  /* on-wire encryption */
 	struct aead_request *gcm_req;
 	struct crypto_wait gcm_wait;
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
index 0aa21fcbf6ec..ea60e3ef0834 100644
--- a/net/ceph/Kconfig
+++ b/net/ceph/Kconfig
@@ -6,8 +6,7 @@ config CEPH_LIB
 	select CRYPTO_AES
 	select CRYPTO_CBC
 	select CRYPTO_GCM
-	select CRYPTO_HMAC
-	select CRYPTO_SHA256
+	select CRYPTO_LIB_SHA256
 	select CRYPTO
 	select KEYS
 	default n
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
index 5483b4eed94e..c54c8b5a6526 100644
--- a/net/ceph/messenger_v2.c
+++ b/net/ceph/messenger_v2.c
@@ -709,7 +709,7 @@ static int setup_crypto(struct ceph_connection *con,
 
 	dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n",
 	     __func__, con, con->v2.con_mode, session_key_len, con_secret_len);
-	WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req);
+	WARN_ON(con->v2.hmac_key_set || con->v2.gcm_tfm || con->v2.gcm_req);
 
 	if (con->v2.con_mode != CEPH_CON_MODE_CRC &&
 	    con->v2.con_mode != CEPH_CON_MODE_SECURE) {
@@ -723,22 +723,8 @@ static int setup_crypto(struct ceph_connection *con,
 		return 0;  /* auth_none */
 	}
 
-	noio_flag = memalloc_noio_save();
-	con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
-	memalloc_noio_restore(noio_flag);
-	if (IS_ERR(con->v2.hmac_tfm)) {
-		ret = PTR_ERR(con->v2.hmac_tfm);
-		con->v2.hmac_tfm = NULL;
-		pr_err("failed to allocate hmac tfm context: %d\n", ret);
-		return ret;
-	}
-
-	ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key,
-				  session_key_len);
-	if (ret) {
-		pr_err("failed to set hmac key: %d\n", ret);
-		return ret;
-	}
+	hmac_sha256_preparekey(&con->v2.hmac_key, session_key, session_key_len);
+	con->v2.hmac_key_set = true;
 
 	if (con->v2.con_mode == CEPH_CON_MODE_CRC) {
 		WARN_ON(con_secret_len);
@@ -793,38 +779,26 @@ static int setup_crypto(struct ceph_connection *con,
 	return 0;  /* auth_x, secure mode */
 }
 
-static int ceph_hmac_sha256(struct ceph_connection *con,
-			    const struct kvec *kvecs, int kvec_cnt, u8 *hmac)
+static void ceph_hmac_sha256(struct ceph_connection *con,
+			     const struct kvec *kvecs, int kvec_cnt,
+			     u8 hmac[SHA256_DIGEST_SIZE])
 {
-	SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm);  /* tfm arg is ignored */
-	int ret;
+	struct hmac_sha256_ctx ctx;
 	int i;
 
-	dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con,
-	     con->v2.hmac_tfm, kvec_cnt);
+	dout("%s con %p hmac_key_set %d kvec_cnt %d\n", __func__, con,
+	     con->v2.hmac_key_set, kvec_cnt);
 
-	if (!con->v2.hmac_tfm) {
+	if (!con->v2.hmac_key_set) {
 		memset(hmac, 0, SHA256_DIGEST_SIZE);
-		return 0;  /* auth_none */
+		return;  /* auth_none */
 	}
 
-	desc->tfm = con->v2.hmac_tfm;
-	ret = crypto_shash_init(desc);
-	if (ret)
-		goto out;
-
-	for (i = 0; i < kvec_cnt; i++) {
-		ret = crypto_shash_update(desc, kvecs[i].iov_base,
-					  kvecs[i].iov_len);
-		if (ret)
-			goto out;
-	}
-
-	ret = crypto_shash_final(desc, hmac);
-
-out:
-	shash_desc_zero(desc);
-	return ret;  /* auth_x, both plain and secure modes */
+	/* auth_x, both plain and secure modes */
+	hmac_sha256_init(&ctx, &con->v2.hmac_key);
+	for (i = 0; i < kvec_cnt; i++)
+		hmac_sha256_update(&ctx, kvecs[i].iov_base, kvecs[i].iov_len);
+	hmac_sha256_final(&ctx, hmac);
 }
 
 static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce)
@@ -1455,17 +1429,14 @@ static int prepare_auth_request_more(struct ceph_connection *con,
 static int prepare_auth_signature(struct ceph_connection *con)
 {
 	void *buf;
-	int ret;
 
 	buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE,
 						  con_secure(con)));
 	if (!buf)
 		return -ENOMEM;
 
-	ret = ceph_hmac_sha256(con, con->v2.in_sign_kvecs,
-			       con->v2.in_sign_kvec_cnt, CTRL_BODY(buf));
-	if (ret)
-		return ret;
+	ceph_hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt,
+			 CTRL_BODY(buf));
 
 	return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf,
 			       SHA256_DIGEST_SIZE);
@@ -2460,10 +2431,8 @@ static int process_auth_signature(struct ceph_connection *con,
 		return -EINVAL;
 	}
 
-	ret = ceph_hmac_sha256(con, con->v2.out_sign_kvecs,
-			       con->v2.out_sign_kvec_cnt, hmac);
-	if (ret)
-		return ret;
+	ceph_hmac_sha256(con, con->v2.out_sign_kvecs, con->v2.out_sign_kvec_cnt,
+			 hmac);
 
 	ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad);
 	if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) {
@@ -3814,10 +3783,8 @@ void ceph_con_v2_reset_protocol(struct ceph_connection *con)
 	memzero_explicit(&con->v2.in_gcm_nonce, CEPH_GCM_IV_LEN);
 	memzero_explicit(&con->v2.out_gcm_nonce, CEPH_GCM_IV_LEN);
 
-	if (con->v2.hmac_tfm) {
-		crypto_free_shash(con->v2.hmac_tfm);
-		con->v2.hmac_tfm = NULL;
-	}
+	memzero_explicit(&con->v2.hmac_key, sizeof(con->v2.hmac_key));
+	con->v2.hmac_key_set = false;
 	if (con->v2.gcm_req) {
 		aead_request_free(con->v2.gcm_req);
 		con->v2.gcm_req = NULL;
-- 
cgit v1.2.3


From 59699a5a7114f09f890e86c09a6b32afb5eaa64c Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Wed, 6 Aug 2025 11:48:53 +0200
Subject: libceph: make ceph_con_get_out_msg() return the message pointer

The caller in messenger_v1.c loads it anyway, so let's keep the
pointer in the register instead of reloading it from memory.  This
eliminates a tiny bit of unnecessary overhead.

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h | 2 +-
 net/ceph/messenger.c           | 4 ++--
 net/ceph/messenger_v1.c        | 3 +--
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 4b49592a738f..9ebcac2981fd 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -550,7 +550,7 @@ void ceph_addr_set_port(struct ceph_entity_addr *addr, int p);
 void ceph_con_process_message(struct ceph_connection *con);
 int ceph_con_in_msg_alloc(struct ceph_connection *con,
 			  struct ceph_msg_header *hdr, int *skip);
-void ceph_con_get_out_msg(struct ceph_connection *con);
+struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con);
 
 /* messenger_v1.c */
 int ceph_con_v1_try_read(struct ceph_connection *con);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 9f6d860411cb..b6c7bfc03503 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2110,7 +2110,7 @@ int ceph_con_in_msg_alloc(struct ceph_connection *con,
 	return ret;
 }
 
-void ceph_con_get_out_msg(struct ceph_connection *con)
+struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con)
 {
 	struct ceph_msg *msg;
 
@@ -2141,7 +2141,7 @@ void ceph_con_get_out_msg(struct ceph_connection *con)
 	 * message or in case of a fault.
 	 */
 	WARN_ON(con->out_msg);
-	con->out_msg = ceph_msg_get(msg);
+	return con->out_msg = ceph_msg_get(msg);
 }
 
 /*
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index 0cb61c76b9b8..eebe4e19d75a 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -210,8 +210,7 @@ static void prepare_write_message(struct ceph_connection *con)
 			&con->v1.out_temp_ack);
 	}
 
-	ceph_con_get_out_msg(con);
-	m = con->out_msg;
+	m = ceph_con_get_out_msg(con);
 
 	dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
 	     m, con->out_seq, le16_to_cpu(m->hdr.type),
-- 
cgit v1.2.3


From 7399212dcf64d90a6ab239bdd98bd325d922fc7e Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Wed, 6 Aug 2025 11:48:54 +0200
Subject: libceph: pass the message pointer instead of loading con->out_msg

This pointer is in a register anyway, so let's use that instead of
reloading from memory everywhere.

[ idryomov: formatting ]

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h |   4 +-
 net/ceph/messenger.c           |   4 +-
 net/ceph/messenger_v1.c        |  45 ++++++-----
 net/ceph/messenger_v2.c        | 168 +++++++++++++++++++++--------------------
 4 files changed, 114 insertions(+), 107 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 9ebcac2981fd..6aa4c6478c9f 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -555,7 +555,7 @@ struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con);
 /* messenger_v1.c */
 int ceph_con_v1_try_read(struct ceph_connection *con);
 int ceph_con_v1_try_write(struct ceph_connection *con);
-void ceph_con_v1_revoke(struct ceph_connection *con);
+void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg);
 void ceph_con_v1_revoke_incoming(struct ceph_connection *con);
 bool ceph_con_v1_opened(struct ceph_connection *con);
 void ceph_con_v1_reset_session(struct ceph_connection *con);
@@ -564,7 +564,7 @@ void ceph_con_v1_reset_protocol(struct ceph_connection *con);
 /* messenger_v2.c */
 int ceph_con_v2_try_read(struct ceph_connection *con);
 int ceph_con_v2_try_write(struct ceph_connection *con);
-void ceph_con_v2_revoke(struct ceph_connection *con);
+void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg);
 void ceph_con_v2_revoke_incoming(struct ceph_connection *con);
 bool ceph_con_v2_opened(struct ceph_connection *con);
 void ceph_con_v2_reset_session(struct ceph_connection *con);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index b6c7bfc03503..08a6a083609f 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1793,9 +1793,9 @@ void ceph_msg_revoke(struct ceph_msg *msg)
 		WARN_ON(con->state != CEPH_CON_S_OPEN);
 		dout("%s con %p msg %p was sending\n", __func__, con, msg);
 		if (ceph_msgr2(from_msgr(con->msgr)))
-			ceph_con_v2_revoke(con);
+			ceph_con_v2_revoke(con, msg);
 		else
-			ceph_con_v1_revoke(con);
+			ceph_con_v1_revoke(con, msg);
 		ceph_msg_put(con->out_msg);
 		con->out_msg = NULL;
 	} else {
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index eebe4e19d75a..cc4a36ef8462 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -169,10 +169,9 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
  * Prepare footer for currently outgoing message, and finish things
  * off.  Assumes out_kvec* are already valid.. we just add on to the end.
  */
-static void prepare_write_message_footer(struct ceph_connection *con)
+static void prepare_write_message_footer(struct ceph_connection *con,
+					 struct ceph_msg *m)
 {
-	struct ceph_msg *m = con->out_msg;
-
 	m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
 
 	dout("prepare_write_message_footer %p\n", con);
@@ -230,31 +229,31 @@ static void prepare_write_message(struct ceph_connection *con)
 
 	/* fill in hdr crc and finalize hdr */
 	crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
-	con->out_msg->hdr.crc = cpu_to_le32(crc);
-	memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr));
+	m->hdr.crc = cpu_to_le32(crc);
+	memcpy(&con->v1.out_hdr, &m->hdr, sizeof(con->v1.out_hdr));
 
 	/* fill in front and middle crc, footer */
 	crc = crc32c(0, m->front.iov_base, m->front.iov_len);
-	con->out_msg->footer.front_crc = cpu_to_le32(crc);
+	m->footer.front_crc = cpu_to_le32(crc);
 	if (m->middle) {
 		crc = crc32c(0, m->middle->vec.iov_base,
 				m->middle->vec.iov_len);
-		con->out_msg->footer.middle_crc = cpu_to_le32(crc);
+		m->footer.middle_crc = cpu_to_le32(crc);
 	} else
-		con->out_msg->footer.middle_crc = 0;
+		m->footer.middle_crc = 0;
 	dout("%s front_crc %u middle_crc %u\n", __func__,
-	     le32_to_cpu(con->out_msg->footer.front_crc),
-	     le32_to_cpu(con->out_msg->footer.middle_crc));
-	con->out_msg->footer.flags = 0;
+	     le32_to_cpu(m->footer.front_crc),
+	     le32_to_cpu(m->footer.middle_crc));
+	m->footer.flags = 0;
 
 	/* is there a data payload? */
-	con->out_msg->footer.data_crc = 0;
+	m->footer.data_crc = 0;
 	if (m->data_length) {
-		prepare_message_data(con->out_msg, m->data_length);
+		prepare_message_data(m, m->data_length);
 		con->v1.out_more = 1;  /* data + footer will follow */
 	} else {
 		/* no, queue up footer too and be done */
-		prepare_write_message_footer(con);
+		prepare_write_message_footer(con, m);
 	}
 
 	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
@@ -461,9 +460,9 @@ out:
  *  0 -> socket full, but more to do
  * <0 -> error
  */
-static int write_partial_message_data(struct ceph_connection *con)
+static int write_partial_message_data(struct ceph_connection *con,
+				      struct ceph_msg *msg)
 {
-	struct ceph_msg *msg = con->out_msg;
 	struct ceph_msg_data_cursor *cursor = &msg->cursor;
 	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
 	u32 crc;
@@ -515,7 +514,7 @@ static int write_partial_message_data(struct ceph_connection *con)
 	else
 		msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
 	con_out_kvec_reset(con);
-	prepare_write_message_footer(con);
+	prepare_write_message_footer(con, msg);
 
 	return 1;	/* must return > 0 to indicate success */
 }
@@ -1471,6 +1470,7 @@ bad_tag:
  */
 int ceph_con_v1_try_write(struct ceph_connection *con)
 {
+	struct ceph_msg *msg;
 	int ret = 1;
 
 	dout("try_write start %p state %d\n", con, con->state);
@@ -1517,14 +1517,15 @@ more:
 	}
 
 	/* msg pages? */
-	if (con->out_msg) {
+	msg = con->out_msg;
+	if (msg) {
 		if (con->v1.out_msg_done) {
-			ceph_msg_put(con->out_msg);
+			ceph_msg_put(msg);
 			con->out_msg = NULL;   /* we're done with this one */
 			goto do_next;
 		}
 
-		ret = write_partial_message_data(con);
+		ret = write_partial_message_data(con, msg);
 		if (ret == 1)
 			goto more;  /* we need to send the footer, too! */
 		if (ret == 0)
@@ -1563,10 +1564,8 @@ out:
 	return ret;
 }
 
-void ceph_con_v1_revoke(struct ceph_connection *con)
+void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 {
-	struct ceph_msg *msg = con->out_msg;
-
 	WARN_ON(con->v1.out_skip);
 	/* footer */
 	if (con->v1.out_msg_done) {
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
index c54c8b5a6526..b44e936f3865 100644
--- a/net/ceph/messenger_v2.c
+++ b/net/ceph/messenger_v2.c
@@ -1560,10 +1560,11 @@ static int prepare_ack(struct ceph_connection *con)
 	return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8);
 }
 
-static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted)
+static void prepare_epilogue_plain(struct ceph_connection *con,
+				   struct ceph_msg *msg, bool aborted)
 {
 	dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con,
-	     con->out_msg, aborted, con->v2.out_epil.front_crc,
+	     msg, aborted, con->v2.out_epil.front_crc,
 	     con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc);
 
 	encode_epilogue_plain(con, aborted);
@@ -1574,10 +1575,9 @@ static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted)
  * For "used" empty segments, crc is -1.  For unused (trailing)
  * segments, crc is 0.
  */
-static void prepare_message_plain(struct ceph_connection *con)
+static void prepare_message_plain(struct ceph_connection *con,
+				  struct ceph_msg *msg)
 {
-	struct ceph_msg *msg = con->out_msg;
-
 	prepare_head_plain(con, con->v2.out_buf,
 			   sizeof(struct ceph_msg_header2), NULL, 0, false);
 
@@ -1618,7 +1618,7 @@ static void prepare_message_plain(struct ceph_connection *con)
 		con->v2.out_state = OUT_S_QUEUE_DATA;
 	} else {
 		con->v2.out_epil.data_crc = 0;
-		prepare_epilogue_plain(con, false);
+		prepare_epilogue_plain(con, msg, false);
 		con->v2.out_state = OUT_S_FINISH_MESSAGE;
 	}
 }
@@ -1630,7 +1630,8 @@ static void prepare_message_plain(struct ceph_connection *con)
  * allocate pages for the entire tail of the message (currently up
  * to ~32M) and two sgs arrays (up to ~256K each)...
  */
-static int prepare_message_secure(struct ceph_connection *con)
+static int prepare_message_secure(struct ceph_connection *con,
+				  struct ceph_msg *msg)
 {
 	void *zerop = page_address(ceph_zero_page);
 	struct sg_table enc_sgt = {};
@@ -1645,7 +1646,7 @@ static int prepare_message_secure(struct ceph_connection *con)
 	if (ret)
 		return ret;
 
-	tail_len = tail_onwire_len(con->out_msg, true);
+	tail_len = tail_onwire_len(msg, true);
 	if (!tail_len) {
 		/*
 		 * Empty message: once the head is written,
@@ -1656,7 +1657,7 @@ static int prepare_message_secure(struct ceph_connection *con)
 	}
 
 	encode_epilogue_secure(con, false);
-	ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop,
+	ret = setup_message_sgs(&sgt, msg, zerop, zerop, zerop,
 				&con->v2.out_epil, NULL, 0, false);
 	if (ret)
 		goto out;
@@ -1685,7 +1686,7 @@ static int prepare_message_secure(struct ceph_connection *con)
 		goto out;
 
 	dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con,
-	     con->out_msg, sgt.orig_nents, enc_page_cnt);
+	     msg, sgt.orig_nents, enc_page_cnt);
 	con->v2.out_state = OUT_S_QUEUE_ENC_PAGE;
 
 out:
@@ -1694,19 +1695,19 @@ out:
 	return ret;
 }
 
-static int prepare_message(struct ceph_connection *con)
+static int prepare_message(struct ceph_connection *con, struct ceph_msg *msg)
 {
 	int lens[] = {
 		sizeof(struct ceph_msg_header2),
-		front_len(con->out_msg),
-		middle_len(con->out_msg),
-		data_len(con->out_msg)
+		front_len(msg),
+		middle_len(msg),
+		data_len(msg)
 	};
 	struct ceph_frame_desc desc;
 	int ret;
 
 	dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con,
-	     con->out_msg, lens[0], lens[1], lens[2], lens[3]);
+	     msg, lens[0], lens[1], lens[2], lens[3]);
 
 	if (con->in_seq > con->in_seq_acked) {
 		dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
@@ -1717,15 +1718,15 @@ static int prepare_message(struct ceph_connection *con)
 	reset_out_kvecs(con);
 	init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4);
 	encode_preamble(&desc, con->v2.out_buf);
-	fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr,
+	fill_header2(CTRL_BODY(con->v2.out_buf), &msg->hdr,
 		     con->in_seq_acked);
 
 	if (con_secure(con)) {
-		ret = prepare_message_secure(con);
+		ret = prepare_message_secure(con, msg);
 		if (ret)
 			return ret;
 	} else {
-		prepare_message_plain(con);
+		prepare_message_plain(con, msg);
 	}
 
 	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
@@ -3153,20 +3154,20 @@ int ceph_con_v2_try_read(struct ceph_connection *con)
 	}
 }
 
-static void queue_data(struct ceph_connection *con)
+static void queue_data(struct ceph_connection *con, struct ceph_msg *msg)
 {
 	struct bio_vec bv;
 
 	con->v2.out_epil.data_crc = -1;
-	ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg,
-				  data_len(con->out_msg));
+	ceph_msg_data_cursor_init(&con->v2.out_cursor, msg,
+				  data_len(msg));
 
 	get_bvec_at(&con->v2.out_cursor, &bv);
 	set_out_bvec(con, &bv, true);
 	con->v2.out_state = OUT_S_QUEUE_DATA_CONT;
 }
 
-static void queue_data_cont(struct ceph_connection *con)
+static void queue_data_cont(struct ceph_connection *con, struct ceph_msg *msg)
 {
 	struct bio_vec bv;
 
@@ -3187,7 +3188,7 @@ static void queue_data_cont(struct ceph_connection *con)
 	 * we are done.
 	 */
 	reset_out_kvecs(con);
-	prepare_epilogue_plain(con, false);
+	prepare_epilogue_plain(con, msg, false);
 	con->v2.out_state = OUT_S_FINISH_MESSAGE;
 }
 
@@ -3219,7 +3220,7 @@ static void queue_enc_page(struct ceph_connection *con)
 	con->v2.out_state = OUT_S_FINISH_MESSAGE;
 }
 
-static void queue_zeros(struct ceph_connection *con)
+static void queue_zeros(struct ceph_connection *con, struct ceph_msg *msg)
 {
 	dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero);
 
@@ -3236,7 +3237,7 @@ static void queue_zeros(struct ceph_connection *con)
 	 * Once it's written, we are done patching up for the revoke.
 	 */
 	reset_out_kvecs(con);
-	prepare_epilogue_plain(con, true);
+	prepare_epilogue_plain(con, msg, true);
 	con->v2.out_state = OUT_S_FINISH_MESSAGE;
 }
 
@@ -3263,6 +3264,7 @@ static void finish_message(struct ceph_connection *con)
 
 static int populate_out_iter(struct ceph_connection *con)
 {
+	struct ceph_msg *msg;
 	int ret;
 
 	dout("%s con %p state %d out_state %d\n", __func__, con, con->state,
@@ -3278,18 +3280,18 @@ static int populate_out_iter(struct ceph_connection *con)
 	switch (con->v2.out_state) {
 	case OUT_S_QUEUE_DATA:
 		WARN_ON(!con->out_msg);
-		queue_data(con);
+		queue_data(con, con->out_msg);
 		goto populated;
 	case OUT_S_QUEUE_DATA_CONT:
 		WARN_ON(!con->out_msg);
-		queue_data_cont(con);
+		queue_data_cont(con, con->out_msg);
 		goto populated;
 	case OUT_S_QUEUE_ENC_PAGE:
 		queue_enc_page(con);
 		goto populated;
 	case OUT_S_QUEUE_ZEROS:
 		WARN_ON(con->out_msg);  /* revoked */
-		queue_zeros(con);
+		queue_zeros(con, con->out_msg);
 		goto populated;
 	case OUT_S_FINISH_MESSAGE:
 		finish_message(con);
@@ -3309,8 +3311,8 @@ static int populate_out_iter(struct ceph_connection *con)
 			return ret;
 		}
 	} else if (!list_empty(&con->out_queue)) {
-		ceph_con_get_out_msg(con);
-		ret = prepare_message(con);
+		msg = ceph_con_get_out_msg(con);
+		ret = prepare_message(con, msg);
 		if (ret) {
 			pr_err("prepare_message failed: %d\n", ret);
 			return ret;
@@ -3422,17 +3424,18 @@ static u32 crc32c_zeros(u32 crc, int zero_len)
 	return crc;
 }
 
-static void prepare_zero_front(struct ceph_connection *con, int resid)
+static void prepare_zero_front(struct ceph_connection *con,
+			       struct ceph_msg *msg, int resid)
 {
 	int sent;
 
-	WARN_ON(!resid || resid > front_len(con->out_msg));
-	sent = front_len(con->out_msg) - resid;
+	WARN_ON(!resid || resid > front_len(msg));
+	sent = front_len(msg) - resid;
 	dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
 
 	if (sent) {
 		con->v2.out_epil.front_crc =
-			crc32c(-1, con->out_msg->front.iov_base, sent);
+			crc32c(-1, msg->front.iov_base, sent);
 		con->v2.out_epil.front_crc =
 			crc32c_zeros(con->v2.out_epil.front_crc, resid);
 	} else {
@@ -3443,17 +3446,18 @@ static void prepare_zero_front(struct ceph_connection *con, int resid)
 	out_zero_add(con, resid);
 }
 
-static void prepare_zero_middle(struct ceph_connection *con, int resid)
+static void prepare_zero_middle(struct ceph_connection *con,
+				struct ceph_msg *msg, int resid)
 {
 	int sent;
 
-	WARN_ON(!resid || resid > middle_len(con->out_msg));
-	sent = middle_len(con->out_msg) - resid;
+	WARN_ON(!resid || resid > middle_len(msg));
+	sent = middle_len(msg) - resid;
 	dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
 
 	if (sent) {
 		con->v2.out_epil.middle_crc =
-			crc32c(-1, con->out_msg->middle->vec.iov_base, sent);
+			crc32c(-1, msg->middle->vec.iov_base, sent);
 		con->v2.out_epil.middle_crc =
 			crc32c_zeros(con->v2.out_epil.middle_crc, resid);
 	} else {
@@ -3464,61 +3468,64 @@ static void prepare_zero_middle(struct ceph_connection *con, int resid)
 	out_zero_add(con, resid);
 }
 
-static void prepare_zero_data(struct ceph_connection *con)
+static void prepare_zero_data(struct ceph_connection *con,
+			      struct ceph_msg *msg)
 {
 	dout("%s con %p\n", __func__, con);
-	con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg));
-	out_zero_add(con, data_len(con->out_msg));
+	con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(msg));
+	out_zero_add(con, data_len(msg));
 }
 
-static void revoke_at_queue_data(struct ceph_connection *con)
+static void revoke_at_queue_data(struct ceph_connection *con,
+				 struct ceph_msg *msg)
 {
 	int boundary;
 	int resid;
 
-	WARN_ON(!data_len(con->out_msg));
+	WARN_ON(!data_len(msg));
 	WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
 	resid = iov_iter_count(&con->v2.out_iter);
 
-	boundary = front_len(con->out_msg) + middle_len(con->out_msg);
+	boundary = front_len(msg) + middle_len(msg);
 	if (resid > boundary) {
 		resid -= boundary;
 		WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
 		dout("%s con %p was sending head\n", __func__, con);
-		if (front_len(con->out_msg))
-			prepare_zero_front(con, front_len(con->out_msg));
-		if (middle_len(con->out_msg))
-			prepare_zero_middle(con, middle_len(con->out_msg));
-		prepare_zero_data(con);
+		if (front_len(msg))
+			prepare_zero_front(con, msg, front_len(msg));
+		if (middle_len(msg))
+			prepare_zero_middle(con, msg, middle_len(msg));
+		prepare_zero_data(con, msg);
 		WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
 		con->v2.out_state = OUT_S_QUEUE_ZEROS;
 		return;
 	}
 
-	boundary = middle_len(con->out_msg);
+	boundary = middle_len(msg);
 	if (resid > boundary) {
 		resid -= boundary;
 		dout("%s con %p was sending front\n", __func__, con);
-		prepare_zero_front(con, resid);
-		if (middle_len(con->out_msg))
-			prepare_zero_middle(con, middle_len(con->out_msg));
-		prepare_zero_data(con);
-		queue_zeros(con);
+		prepare_zero_front(con, msg, resid);
+		if (middle_len(msg))
+			prepare_zero_middle(con, msg, middle_len(msg));
+		prepare_zero_data(con, msg);
+		queue_zeros(con, msg);
 		return;
 	}
 
 	WARN_ON(!resid);
 	dout("%s con %p was sending middle\n", __func__, con);
-	prepare_zero_middle(con, resid);
-	prepare_zero_data(con);
-	queue_zeros(con);
+	prepare_zero_middle(con, msg, resid);
+	prepare_zero_data(con, msg);
+	queue_zeros(con, msg);
 }
 
-static void revoke_at_queue_data_cont(struct ceph_connection *con)
+static void revoke_at_queue_data_cont(struct ceph_connection *con,
+				      struct ceph_msg *msg)
 {
 	int sent, resid;  /* current piece of data */
 
-	WARN_ON(!data_len(con->out_msg));
+	WARN_ON(!data_len(msg));
 	WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter));
 	resid = iov_iter_count(&con->v2.out_iter);
 	WARN_ON(!resid || resid > con->v2.out_bvec.bv_len);
@@ -3537,10 +3544,11 @@ static void revoke_at_queue_data_cont(struct ceph_connection *con)
 
 	con->v2.out_iter.count -= resid;
 	out_zero_add(con, con->v2.out_cursor.total_resid);
-	queue_zeros(con);
+	queue_zeros(con, msg);
 }
 
-static void revoke_at_finish_message(struct ceph_connection *con)
+static void revoke_at_finish_message(struct ceph_connection *con,
+				     struct ceph_msg *msg)
 {
 	int boundary;
 	int resid;
@@ -3548,39 +3556,39 @@ static void revoke_at_finish_message(struct ceph_connection *con)
 	WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
 	resid = iov_iter_count(&con->v2.out_iter);
 
-	if (!front_len(con->out_msg) && !middle_len(con->out_msg) &&
-	    !data_len(con->out_msg)) {
+	if (!front_len(msg) && !middle_len(msg) &&
+	    !data_len(msg)) {
 		WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN);
 		dout("%s con %p was sending head (empty message) - noop\n",
 		     __func__, con);
 		return;
 	}
 
-	boundary = front_len(con->out_msg) + middle_len(con->out_msg) +
+	boundary = front_len(msg) + middle_len(msg) +
 		   CEPH_EPILOGUE_PLAIN_LEN;
 	if (resid > boundary) {
 		resid -= boundary;
 		WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
 		dout("%s con %p was sending head\n", __func__, con);
-		if (front_len(con->out_msg))
-			prepare_zero_front(con, front_len(con->out_msg));
-		if (middle_len(con->out_msg))
-			prepare_zero_middle(con, middle_len(con->out_msg));
+		if (front_len(msg))
+			prepare_zero_front(con, msg, front_len(msg));
+		if (middle_len(msg))
+			prepare_zero_middle(con, msg, middle_len(msg));
 		con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
 		WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
 		con->v2.out_state = OUT_S_QUEUE_ZEROS;
 		return;
 	}
 
-	boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN;
+	boundary = middle_len(msg) + CEPH_EPILOGUE_PLAIN_LEN;
 	if (resid > boundary) {
 		resid -= boundary;
 		dout("%s con %p was sending front\n", __func__, con);
-		prepare_zero_front(con, resid);
-		if (middle_len(con->out_msg))
-			prepare_zero_middle(con, middle_len(con->out_msg));
+		prepare_zero_front(con, msg, resid);
+		if (middle_len(msg))
+			prepare_zero_middle(con, msg, middle_len(msg));
 		con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
-		queue_zeros(con);
+		queue_zeros(con, msg);
 		return;
 	}
 
@@ -3588,9 +3596,9 @@ static void revoke_at_finish_message(struct ceph_connection *con)
 	if (resid > boundary) {
 		resid -= boundary;
 		dout("%s con %p was sending middle\n", __func__, con);
-		prepare_zero_middle(con, resid);
+		prepare_zero_middle(con, msg, resid);
 		con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
-		queue_zeros(con);
+		queue_zeros(con, msg);
 		return;
 	}
 
@@ -3598,7 +3606,7 @@ static void revoke_at_finish_message(struct ceph_connection *con)
 	dout("%s con %p was sending epilogue - noop\n", __func__, con);
 }
 
-void ceph_con_v2_revoke(struct ceph_connection *con)
+void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 {
 	WARN_ON(con->v2.out_zero);
 
@@ -3611,13 +3619,13 @@ void ceph_con_v2_revoke(struct ceph_connection *con)
 
 	switch (con->v2.out_state) {
 	case OUT_S_QUEUE_DATA:
-		revoke_at_queue_data(con);
+		revoke_at_queue_data(con, msg);
 		break;
 	case OUT_S_QUEUE_DATA_CONT:
-		revoke_at_queue_data_cont(con);
+		revoke_at_queue_data_cont(con, msg);
 		break;
 	case OUT_S_FINISH_MESSAGE:
-		revoke_at_finish_message(con);
+		revoke_at_finish_message(con, msg);
 		break;
 	default:
 		WARN(1, "bad out_state %d", con->v2.out_state);
-- 
cgit v1.2.3


From 207696b17f38e869e59889b44d395ab24bb678d3 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Thu, 18 Sep 2025 22:30:18 +0300
Subject: tpm: use a map for tpm2_calc_ordinal_duration()

The current shenanigans for duration calculation introduce too much
complexity for a trivial problem, and further the code is hard to patch and
maintain.

Address these issues with a flat look-up table, which is easy to understand
and patch. If leaf driver specific patching is required in future, it is
easy enough to make a copy of this table during driver initialization and
add the chip parameter back.

'chip->duration' is retained for TPM 1.x.

As the first entry for this new behavior address TCG spec update mentioned
in this issue:

https://github.com/raspberrypi/linux/issues/7054

Therefore, for TPM_SelfTest the duration is set to 3000 ms.

This does not categorize a as bug, given that this is introduced to the
spec after the feature was originally made.

Reviewed-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm-interface.c |   2 +-
 drivers/char/tpm/tpm.h           |   2 +-
 drivers/char/tpm/tpm2-cmd.c      | 127 ++++++++++-----------------------------
 include/linux/tpm.h              |   5 +-
 4 files changed, 37 insertions(+), 99 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index b71725827743..c9f173001d0e 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -52,7 +52,7 @@ MODULE_PARM_DESC(suspend_pcr,
 unsigned long tpm_calc_ordinal_duration(struct tpm_chip *chip, u32 ordinal)
 {
 	if (chip->flags & TPM_CHIP_FLAG_TPM2)
-		return tpm2_calc_ordinal_duration(chip, ordinal);
+		return tpm2_calc_ordinal_duration(ordinal);
 	else
 		return tpm1_calc_ordinal_duration(chip, ordinal);
 }
diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index 7bb87fa5f7a1..2726bd38e5ac 100644
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -299,7 +299,7 @@ ssize_t tpm2_get_tpm_pt(struct tpm_chip *chip, u32 property_id,
 ssize_t tpm2_get_pcr_allocation(struct tpm_chip *chip);
 int tpm2_auto_startup(struct tpm_chip *chip);
 void tpm2_shutdown(struct tpm_chip *chip, u16 shutdown_type);
-unsigned long tpm2_calc_ordinal_duration(struct tpm_chip *chip, u32 ordinal);
+unsigned long tpm2_calc_ordinal_duration(u32 ordinal);
 int tpm2_probe(struct tpm_chip *chip);
 int tpm2_get_cc_attrs_tbl(struct tpm_chip *chip);
 int tpm2_find_cc(struct tpm_chip *chip, u32 cc);
diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index 524d802ede26..7d77f6fbc152 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -28,120 +28,57 @@ static struct tpm2_hash tpm2_hash_map[] = {
 
 int tpm2_get_timeouts(struct tpm_chip *chip)
 {
-	/* Fixed timeouts for TPM2 */
 	chip->timeout_a = msecs_to_jiffies(TPM2_TIMEOUT_A);
 	chip->timeout_b = msecs_to_jiffies(TPM2_TIMEOUT_B);
 	chip->timeout_c = msecs_to_jiffies(TPM2_TIMEOUT_C);
 	chip->timeout_d = msecs_to_jiffies(TPM2_TIMEOUT_D);
-
-	/* PTP spec timeouts */
-	chip->duration[TPM_SHORT] = msecs_to_jiffies(TPM2_DURATION_SHORT);
-	chip->duration[TPM_MEDIUM] = msecs_to_jiffies(TPM2_DURATION_MEDIUM);
-	chip->duration[TPM_LONG] = msecs_to_jiffies(TPM2_DURATION_LONG);
-
-	/* Key creation commands long timeouts */
-	chip->duration[TPM_LONG_LONG] =
-		msecs_to_jiffies(TPM2_DURATION_LONG_LONG);
-
 	chip->flags |= TPM_CHIP_FLAG_HAVE_TIMEOUTS;
-
 	return 0;
 }
 
-/**
- * tpm2_ordinal_duration_index() - returns an index to the chip duration table
- * @ordinal: TPM command ordinal.
- *
- * The function returns an index to the chip duration table
- * (enum tpm_duration), that describes the maximum amount of
- * time the chip could take to return the result for a  particular ordinal.
- *
- * The values of the MEDIUM, and LONG durations are taken
- * from the PC Client Profile (PTP) specification (750, 2000 msec)
- *
- * LONG_LONG is for commands that generates keys which empirically takes
- * a longer time on some systems.
- *
- * Return:
- * * TPM_MEDIUM
- * * TPM_LONG
- * * TPM_LONG_LONG
- * * TPM_UNDEFINED
+/*
+ * Contains the maximum durations in milliseconds for TPM2 commands.
  */
-static u8 tpm2_ordinal_duration_index(u32 ordinal)
-{
-	switch (ordinal) {
-	/* Startup */
-	case TPM2_CC_STARTUP:                 /* 144 */
-		return TPM_MEDIUM;
-
-	case TPM2_CC_SELF_TEST:               /* 143 */
-		return TPM_LONG;
-
-	case TPM2_CC_GET_RANDOM:              /* 17B */
-		return TPM_LONG;
-
-	case TPM2_CC_SEQUENCE_UPDATE:         /* 15C */
-		return TPM_MEDIUM;
-	case TPM2_CC_SEQUENCE_COMPLETE:       /* 13E */
-		return TPM_MEDIUM;
-	case TPM2_CC_EVENT_SEQUENCE_COMPLETE: /* 185 */
-		return TPM_MEDIUM;
-	case TPM2_CC_HASH_SEQUENCE_START:     /* 186 */
-		return TPM_MEDIUM;
-
-	case TPM2_CC_VERIFY_SIGNATURE:        /* 177 */
-		return TPM_LONG_LONG;
-
-	case TPM2_CC_PCR_EXTEND:              /* 182 */
-		return TPM_MEDIUM;
-
-	case TPM2_CC_HIERARCHY_CONTROL:       /* 121 */
-		return TPM_LONG;
-	case TPM2_CC_HIERARCHY_CHANGE_AUTH:   /* 129 */
-		return TPM_LONG;
-
-	case TPM2_CC_GET_CAPABILITY:          /* 17A */
-		return TPM_MEDIUM;
-
-	case TPM2_CC_NV_READ:                 /* 14E */
-		return TPM_LONG;
-
-	case TPM2_CC_CREATE_PRIMARY:          /* 131 */
-		return TPM_LONG_LONG;
-	case TPM2_CC_CREATE:                  /* 153 */
-		return TPM_LONG_LONG;
-	case TPM2_CC_CREATE_LOADED:           /* 191 */
-		return TPM_LONG_LONG;
-
-	default:
-		return TPM_UNDEFINED;
-	}
-}
+static const struct {
+	unsigned long ordinal;
+	unsigned long duration;
+} tpm2_ordinal_duration_map[] = {
+	{TPM2_CC_STARTUP, 750},
+	{TPM2_CC_SELF_TEST, 3000},
+	{TPM2_CC_GET_RANDOM, 2000},
+	{TPM2_CC_SEQUENCE_UPDATE, 750},
+	{TPM2_CC_SEQUENCE_COMPLETE, 750},
+	{TPM2_CC_EVENT_SEQUENCE_COMPLETE, 750},
+	{TPM2_CC_HASH_SEQUENCE_START, 750},
+	{TPM2_CC_VERIFY_SIGNATURE, 30000},
+	{TPM2_CC_PCR_EXTEND, 750},
+	{TPM2_CC_HIERARCHY_CONTROL, 2000},
+	{TPM2_CC_HIERARCHY_CHANGE_AUTH, 2000},
+	{TPM2_CC_GET_CAPABILITY, 750},
+	{TPM2_CC_NV_READ, 2000},
+	{TPM2_CC_CREATE_PRIMARY, 30000},
+	{TPM2_CC_CREATE, 30000},
+	{TPM2_CC_CREATE_LOADED, 30000},
+};
 
 /**
- * tpm2_calc_ordinal_duration() - calculate the maximum command duration
- * @chip:    TPM chip to use.
+ * tpm2_calc_ordinal_duration() - Calculate the maximum command duration
  * @ordinal: TPM command ordinal.
  *
- * The function returns the maximum amount of time the chip could take
- * to return the result for a particular ordinal in jiffies.
- *
- * Return: A maximal duration time for an ordinal in jiffies.
+ * Returns the maximum amount of time the chip is expected by kernel to
+ * take in jiffies.
  */
-unsigned long tpm2_calc_ordinal_duration(struct tpm_chip *chip, u32 ordinal)
+unsigned long tpm2_calc_ordinal_duration(u32 ordinal)
 {
-	unsigned int index;
+	int i;
 
-	index = tpm2_ordinal_duration_index(ordinal);
+	for (i = 0; i < ARRAY_SIZE(tpm2_ordinal_duration_map); i++)
+		if (ordinal == tpm2_ordinal_duration_map[i].ordinal)
+			return msecs_to_jiffies(tpm2_ordinal_duration_map[i].duration);
 
-	if (index != TPM_UNDEFINED)
-		return chip->duration[index];
-	else
-		return msecs_to_jiffies(TPM2_DURATION_DEFAULT);
+	return msecs_to_jiffies(TPM2_DURATION_DEFAULT);
 }
 
-
 struct tpm2_pcr_read_out {
 	__be32	update_cnt;
 	__be32	pcr_selects_cnt;
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index b0e9eb5ef022..dc0338a783f3 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -228,10 +228,11 @@ enum tpm2_timeouts {
 	TPM2_TIMEOUT_B          =   4000,
 	TPM2_TIMEOUT_C          =    200,
 	TPM2_TIMEOUT_D          =     30,
+};
+
+enum tpm2_durations {
 	TPM2_DURATION_SHORT     =     20,
-	TPM2_DURATION_MEDIUM    =    750,
 	TPM2_DURATION_LONG      =   2000,
-	TPM2_DURATION_LONG_LONG = 300000,
 	TPM2_DURATION_DEFAULT   = 120000,
 };
 
-- 
cgit v1.2.3


From a8482d2c9071d75c920eba0db36428898250ea57 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Sat, 11 Oct 2025 12:31:53 +0200
Subject: Revert "i2c: boardinfo: Annotate code used in init phase only"

This reverts commit 1a2b423be6a89dd07d5fc27ea042be68697a6a49 because we
got a regression report and need time to find out the details.

Reported-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Closes: https://lore.kernel.org/r/29ec0082-4dd4-4120-acd2-44b35b4b9487@oss.qualcomm.com
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 drivers/i2c/i2c-boardinfo.c | 4 ++--
 include/linux/i2c.h         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-boardinfo.c b/drivers/i2c/i2c-boardinfo.c
index 338800321f8b..4df8ad092df3 100644
--- a/drivers/i2c/i2c-boardinfo.c
+++ b/drivers/i2c/i2c-boardinfo.c
@@ -22,7 +22,7 @@ EXPORT_SYMBOL_GPL(__i2c_board_lock);
 LIST_HEAD(__i2c_board_list);
 EXPORT_SYMBOL_GPL(__i2c_board_list);
 
-int __i2c_first_dynamic_bus_num __ro_after_init;
+int __i2c_first_dynamic_bus_num;
 EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num);
 
 
@@ -48,7 +48,7 @@ EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num);
  * The board info passed can safely be __initdata, but be careful of embedded
  * pointers (for platform_data, functions, etc) since that won't be copied.
  */
-int __init i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len)
+int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len)
 {
 	int status;
 
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 11a19241e360..20fd41b51d5c 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -499,7 +499,7 @@ static inline struct i2c_client *i2c_verify_client(struct device *dev)
  * Modules for add-on boards must use other calls.
  */
 #ifdef CONFIG_I2C_BOARDINFO
-int __init
+int
 i2c_register_board_info(int busnum, struct i2c_board_info const *info,
 			unsigned n);
 #else
-- 
cgit v1.2.3


From 12d724f2852d094d68dccaf5101e0ef89a971cde Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Thu, 9 Oct 2025 19:46:00 +0900
Subject: ata: libata-core: relax checks in ata_read_log_directory()

Commit 6d4405b16d37 ("ata: libata-core: Cache the general purpose log
directory") introduced caching of a device general purpose log directory
to avoid repeated access to this log page during device scan. This
change also added a check on this log page to verify that the log page
version is 0x0001 as mandated by the ACS specifications.

And it turns out that some devices do not bother reporting this version,
instead reporting a version 0, resulting in error messages such as:

ata6.00: Invalid log directory version 0x0000

and to the device being marked as not supporting the general purpose log
directory log page.

Since before commit 6d4405b16d37 the log page version check did not
exist and things were still working correctly for these devices, relax
ata_read_log_directory() version check and only warn about the invalid
log page version number without disabling access to the log directory
page.

Fixes: 6d4405b16d37 ("ata: libata-core: Cache the general purpose log directory")
Cc: stable@vger.kernel.org
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220635
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
 drivers/ata/libata-core.c | 11 ++++-------
 include/linux/libata.h    |  6 ++++++
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index ff53f5f029b4..2a210719c4ce 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2174,13 +2174,10 @@ static int ata_read_log_directory(struct ata_device *dev)
 	}
 
 	version = get_unaligned_le16(&dev->gp_log_dir[0]);
-	if (version != 0x0001) {
-		ata_dev_err(dev, "Invalid log directory version 0x%04x\n",
-			    version);
-		ata_clear_log_directory(dev);
-		dev->quirks |= ATA_QUIRK_NO_LOG_DIR;
-		return -EINVAL;
-	}
+	if (version != 0x0001)
+		ata_dev_warn_once(dev,
+				  "Invalid log directory version 0x%04x\n",
+				  version);
 
 	return 0;
 }
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 21de0935775d..7a98de1cc995 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1594,6 +1594,12 @@ do {								\
 #define ata_dev_dbg(dev, fmt, ...)				\
 	ata_dev_printk(debug, dev, fmt, ##__VA_ARGS__)
 
+#define ata_dev_warn_once(dev, fmt, ...)			\
+	pr_warn_once("ata%u.%02u: " fmt,			\
+		     (dev)->link->ap->print_id,			\
+		     (dev)->link->pmp + (dev)->devno,		\
+		     ##__VA_ARGS__)
+
 static inline void ata_print_version_once(const struct device *dev,
 					  const char *version)
 {
-- 
cgit v1.2.3


From e4d0c909bf8328d986bf3aadba0c33a72b5ae30d Mon Sep 17 00:00:00 2001
From: Kamil Horák - 2N <kamilh@axis.com>
Date: Thu, 9 Oct 2025 15:06:56 +0200
Subject: net: phy: bcm54811: Fix GMII/MII/MII-Lite selection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Broadcom bcm54811 is hardware-strapped to select among RGMII and
GMII/MII/MII-Lite modes. However, the corresponding bit, RGMII Enable
in Miscellaneous Control Register must be also set to select desired
RGMII or MII(-lite)/GMII mode.

Fixes: 3117a11fff5af9e7 ("net: phy: bcm54811: PHY initialization")
Signed-off-by: Kamil Horák - 2N <kamilh@axis.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251009130656.1308237-2-kamilh@axis.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/broadcom.c | 20 +++++++++++++++++++-
 include/linux/brcmphy.h    |  1 +
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 3459a0e9d8b9..cb306f9e80cc 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -405,7 +405,7 @@ static int bcm5481x_set_brrmode(struct phy_device *phydev, bool on)
 static int bcm54811_config_init(struct phy_device *phydev)
 {
 	struct bcm54xx_phy_priv *priv = phydev->priv;
-	int err, reg, exp_sync_ethernet;
+	int err, reg, exp_sync_ethernet, aux_rgmii_en;
 
 	/* Enable CLK125 MUX on LED4 if ref clock is enabled. */
 	if (!(phydev->dev_flags & PHY_BRCM_RX_REFCLK_UNUSED)) {
@@ -434,6 +434,24 @@ static int bcm54811_config_init(struct phy_device *phydev)
 	if (err < 0)
 		return err;
 
+	/* Enable RGMII if configured */
+	if (phy_interface_is_rgmii(phydev))
+		aux_rgmii_en = MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_EN |
+			       MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN;
+	else
+		aux_rgmii_en = 0;
+
+	/* Also writing Reserved bits 6:5 because the documentation requires
+	 * them to be written to 0b11
+	 */
+	err = bcm54xx_auxctl_write(phydev,
+				   MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
+				   MII_BCM54XX_AUXCTL_MISC_WREN |
+				   aux_rgmii_en |
+				   MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RSVD);
+	if (err < 0)
+		return err;
+
 	return bcm5481x_set_brrmode(phydev, priv->brr_mode);
 }
 
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 15c35655f482..115a964f3006 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -137,6 +137,7 @@
 
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC			0x07
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN	0x0010
+#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RSVD		0x0060
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_EN	0x0080
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN	0x0100
 #define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX		0x0200
-- 
cgit v1.2.3