Merge tag 'locking-core-2026-04-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking updates from Ingo Molnar: "Mutexes: - Add killable flavor to guard definitions (Davidlohr Bueso) - Remove the list_head from struct mutex (Matthew Wilcox) - Rename mutex_init_lockep() (Davidlohr Bueso) rwsems: - Remove the list_head from struct rw_semaphore and replace it with a single pointer (Matthew Wilcox) - Fix logic error in rwsem_del_waiter() (Andrei Vagin) Semaphores: - Remove the list_head from struct semaphore (Matthew Wilcox) Jump labels: - Use ATOMIC_INIT() for initialization of .enabled (Thomas Weißschuh) - Remove workaround for old compilers in initializations (Thomas Weißschuh) Lock context analysis changes and improvements: - Add context analysis for rwsems (Peter Zijlstra) - Fix rwlock and spinlock lock context annotations (Bart Van Assche) - Fix rwlock support in <linux/spinlock_up.h> (Bart Van Assche) - Add lock context annotations in the spinlock implementation (Bart Van Assche) - signal: Fix the lock_task_sighand() annotation (Bart Van Assche) - ww-mutex: Fix the ww_acquire_ctx function annotations (Bart Van Assche) - Add lock context support in do_raw_{read,write}_trylock() (Bart Van Assche) - arm64, compiler-context-analysis: Permit alias analysis through __READ_ONCE() with CONFIG_LTO=y (Marco Elver) - Add __cond_releases() (Peter Zijlstra) - Add context analysis for mutexes (Peter Zijlstra) - Add context analysis for rtmutexes (Peter Zijlstra) - Convert futexes to compiler context analysis (Peter Zijlstra) Rust integration updates: - Add atomic fetch_sub() implementation (Andreas Hindborg) - Refactor various rust_helper_ methods for expansion (Boqun Feng) - Add Atomic<*{mut,const} T> support (Boqun Feng) - Add atomic operation helpers over raw pointers (Boqun Feng) - Add performance-optimal Flag type for atomic booleans, to avoid slow byte-sized RMWs on architectures that don't support them. (FUJITA Tomonori) - Misc cleanups and fixes (Andreas Hindborg, Boqun Feng, FUJITA Tomonori) LTO support updates: - arm64: Optimize __READ_ONCE() with CONFIG_LTO=y (Marco Elver) - compiler: Simplify generic RELOC_HIDE() (Marco Elver) Miscellaneous fixes and cleanups by Peter Zijlstra, Randy Dunlap, Thomas Weißschuh, Davidlohr Bueso and Mikhail Gavrilov" * tag 'locking-core-2026-04-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (39 commits) compiler: Simplify generic RELOC_HIDE() locking: Add lock context annotations in the spinlock implementation locking: Add lock context support in do_raw_{read,write}_trylock() locking: Fix rwlock support in <linux/spinlock_up.h> lockdep: Raise default stack trace limits when KASAN is enabled cleanup: Optimize guards jump_label: remove workaround for old compilers in initializations jump_label: use ATOMIC_INIT() for initialization of .enabled futex: Convert to compiler context analysis locking/rwsem: Fix logic error in rwsem_del_waiter() locking/rwsem: Add context analysis locking/rtmutex: Add context analysis locking/mutex: Add context analysis compiler-context-analysys: Add __cond_releases() locking/mutex: Remove the list_head from struct mutex locking/semaphore: Remove the list_head from struct semaphore locking/rwsem: Remove the list_head from struct rw_semaphore rust: atomic: Update a safety comment in impl of `fetch_add()` rust: sync: atomic: Update documentation for `fetch_add()` rust: sync: atomic: Add fetch_sub() ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2026-04-14 12:36:25 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2026-04-14 12:36:25 -0700
commit: 7393febcb1b2082c0484952729cbebfe4dc508d5 (patch)
tree: d561808391b363749ab77512def195da22566db3
parent: e80d033851b3bc94c3d254ac66660ddd0a49d72c (diff)
parent: a21c1e961de28b95099a9ca2c3774b2eee1a33bb (diff)
download: lwn-7393febcb1b2082c0484952729cbebfe4dc508d5.tar.gz
lwn-7393febcb1b2082c0484952729cbebfe4dc508d5.zip
45 files changed, 925 insertions, 327 deletions
diff --git a/arch/arm64/include/asm/rwonce.h b/arch/arm64/include/asm/rwonce.h
index fc0fb42b0b64..0f3a01d30f66 100644
--- a/arch/arm64/include/asm/rwonce.h
+++ b/arch/arm64/include/asm/rwonce.h
@@ -20,6 +20,17 @@
 	ARM64_HAS_LDAPR)
 
 /*
+ * Replace this with typeof_unqual() when minimum compiler versions are
+ * increased to GCC 14 and Clang 19. For the time being, we need this
+ * workaround, which relies on function return values dropping qualifiers.
+ */
+#define __rwonce_typeof_unqual(x) typeof(({				\
+	__diag_push()							\
+	__diag_ignore_all("-Wignored-qualifiers", "")			\
+	((typeof(x)(*)(void))0)();					\
+	__diag_pop() }))
+
+/*
  * When building with LTO, there is an increased risk of the compiler
  * converting an address dependency headed by a READ_ONCE() invocation
  * into a control dependency and consequently allowing for harmful
@@ -31,9 +42,12 @@
  */
 #define __READ_ONCE(x)							\
 ({									\
-	typeof(&(x)) __x = &(x);					\
-	int atomic = 1;							\
-	union { __unqual_scalar_typeof(*__x) __val; char __c[1]; } __u;	\
+	auto __x = &(x);						\
+	auto __ret = (__rwonce_typeof_unqual(*__x) *)__x;		\
+	/* Hides alias reassignment from Clang's -Wthread-safety. */	\
+	auto __retp = &__ret;						\
+	union { typeof(*__ret) __val; char __c[1]; } __u;		\
+	*__retp = &__u.__val;						\
 	switch (sizeof(x)) {						\
 	case 1:								\
 		asm volatile(__LOAD_RCPC(b, %w0, %1)			\
@@ -56,9 +70,9 @@
 			: "Q" (*__x) : "memory");			\
 		break;							\
 	default:							\
-		atomic = 0;						\
+		__u.__val = *(volatile typeof(*__x) *)__x;		\
 	}								\
-	atomic ? (typeof(*__x))__u.__val : (*(volatile typeof(*__x) *)__x);\
+	*__ret;								\
 })
 
 #endif	/* !BUILD_VDSO */
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index c8fe6e54daea..ed2162a97bd2 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -1263,7 +1263,7 @@ acpi_status acpi_os_delete_semaphore(acpi_handle handle)
 
 	ACPI_DEBUG_PRINT((ACPI_DB_MUTEX, "Deleting semaphore[%p].\n", handle));
 
-	BUG_ON(!list_empty(&sem->wait_list));
+	BUG_ON(sem->first_waiter);
 	kfree(sem);
 	sem = NULL;
 
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index 2a19215baae5..fbbcfd801cd0 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -25,7 +25,9 @@
  *			  argument and comparison of the previous
  *			  futex value with another constant.
  *
- * @encoded_op:	encoded operation to execute
+ * @op:		operation to execute
+ * @oparg:	argument of the operation
+ * @oval:	previous value at @uaddr on successful return
  * @uaddr:	pointer to user space address
  *
  * Return:
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index dbc4162921e9..ea95ca4bc11c 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -286,15 +286,18 @@ static __always_inline _type class_##_name##_constructor(_init_args)	\
 	__no_context_analysis						\
 { _type t = _init; return t; }
 
-#define EXTEND_CLASS(_name, ext, _init, _init_args...)			\
-typedef lock_##_name##_t lock_##_name##ext##_t;			\
+#define EXTEND_CLASS_COND(_name, ext, _cond, _init, _init_args...)	\
+typedef lock_##_name##_t lock_##_name##ext##_t;				\
 typedef class_##_name##_t class_##_name##ext##_t;			\
-static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \
-{ class_##_name##_destructor(p); }					\
+static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *_T) \
+{ if (_cond) return; class_##_name##_destructor(_T); }			\
 static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
 	__no_context_analysis \
 { class_##_name##_t t = _init; return t; }
 
+#define EXTEND_CLASS(_name, ext, _init, _init_args...)			\
+	EXTEND_CLASS_COND(_name, ext, 0, _init, _init_args)
+
 #define CLASS(_name, var)						\
 	class_##_name##_t var __cleanup(class_##_name##_destructor) =	\
 		class_##_name##_constructor
@@ -394,12 +397,12 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
 	__DEFINE_GUARD_LOCK_PTR(_name, _T)
 
 #define DEFINE_GUARD(_name, _type, _lock, _unlock) \
-	DEFINE_CLASS(_name, _type, if (!__GUARD_IS_ERR(_T)) { _unlock; }, ({ _lock; _T; }), _type _T); \
+	DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \
 	DEFINE_CLASS_IS_GUARD(_name)
 
 #define DEFINE_GUARD_COND_4(_name, _ext, _lock, _cond) \
 	__DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \
-	EXTEND_CLASS(_name, _ext, \
+	EXTEND_CLASS_COND(_name, _ext, __GUARD_IS_ERR(*_T), \
 		     ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \
 		     class_##_name##_t _T) \
 	static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
@@ -488,7 +491,7 @@ typedef struct {							\
 static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \
 	__no_context_analysis						\
 {									\
-	if (!__GUARD_IS_ERR(_T->lock)) { _unlock; }			\
+	if (_T->lock) { _unlock; }					\
 }									\
 									\
 __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock)
@@ -565,7 +568,7 @@ __DEFINE_LOCK_GUARD_0(_name, _lock)
 
 #define DEFINE_LOCK_GUARD_1_COND_4(_name, _ext, _lock, _cond)		\
 	__DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true);		\
-	EXTEND_CLASS(_name, _ext,					\
+	EXTEND_CLASS_COND(_name, _ext, __GUARD_IS_ERR(_T->lock),	\
 		     ({ class_##_name##_t _t = { .lock = l }, *_T = &_t;\
 		        int _RET = (_lock);                             \
 		        if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\
diff --git a/include/linux/compiler-context-analysis.h b/include/linux/compiler-context-analysis.h
index 00c074a2ccb0..a9317571e6af 100644
--- a/include/linux/compiler-context-analysis.h
+++ b/include/linux/compiler-context-analysis.h
@@ -320,6 +320,38 @@ static inline void _context_unsafe_alias(void **p) { }
  */
 #define __releases(...)		__releases_ctx_lock(__VA_ARGS__)
 
+/*
+ * Clang's analysis does not care precisely about the value, only that it is
+ * either zero or non-zero. So the __cond_acquires() interface might be
+ * misleading if we say that @ret is the value returned if acquired. Instead,
+ * provide symbolic variants which we translate.
+ */
+#define __cond_acquires_impl_not_true(x, ...)     __try_acquires##__VA_ARGS__##_ctx_lock(0, x)
+#define __cond_acquires_impl_not_false(x, ...)    __try_acquires##__VA_ARGS__##_ctx_lock(1, x)
+#define __cond_acquires_impl_not_nonzero(x, ...)  __try_acquires##__VA_ARGS__##_ctx_lock(0, x)
+#define __cond_acquires_impl_not_0(x, ...)        __try_acquires##__VA_ARGS__##_ctx_lock(1, x)
+#define __cond_acquires_impl_not_nonnull(x, ...)  __try_acquires##__VA_ARGS__##_ctx_lock(0, x)
+#define __cond_acquires_impl_not_NULL(x, ...)     __try_acquires##__VA_ARGS__##_ctx_lock(1, x)
+
+/**
+ * __cond_releases() - function attribute, function conditionally
+ *                     releases a context lock exclusively
+ * @ret: abstract value returned by function if context lock releases
+ * @x: context lock instance pointer
+ *
+ * Function attribute declaring that the function conditionally releases the
+ * given context lock instance @x exclusively. The associated context(s) must
+ * be active on entry. The function return value @ret denotes when the context
+ * lock is released.
+ *
+ * @ret may be one of: true, false, nonzero, 0, nonnull, NULL.
+ *
+ * NOTE: clang does not have a native attribute for this; instead implement
+ *       it as an unconditional release and a conditional acquire for the
+ *       inverted condition -- which is semantically equivalent.
+ */
+#define __cond_releases(ret, x) __releases(x) __cond_acquires_impl_not_##ret(x)
+
 /**
  * __acquire() - function to acquire context lock exclusively
  * @x: context lock instance pointer
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index af16624b29fd..cb2f6050bdf7 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -149,10 +149,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 #endif
 
 #ifndef RELOC_HIDE
-# define RELOC_HIDE(ptr, off)					\
-  ({ unsigned long __ptr;					\
-     __ptr = (unsigned long) (ptr);				\
-    (typeof(ptr)) (__ptr + (off)); })
+# define RELOC_HIDE(ptr, off) ((typeof(ptr))((unsigned long)(ptr) + (off)))
 #endif
 
 #define absolute_pointer(val)	RELOC_HIDE((void *)(val), 0)
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index fdb79dd1ebd8..b9c7b0ebf7b9 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -87,13 +87,6 @@ struct static_key {
 	atomic_t enabled;
 #ifdef CONFIG_JUMP_LABEL
 /*
- * Note:
- *   To make anonymous unions work with old compilers, the static
- *   initialization of them requires brackets. This creates a dependency
- *   on the order of the struct with the initializers. If any fields
- *   are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need
- *   to be modified.
- *
  * bit 0 => 1 if key is initially true
  *	    0 if initially false
  * bit 1 => 1 if points to struct static_key_mod
@@ -238,19 +231,12 @@ extern void static_key_enable_cpuslocked(struct static_key *key);
 extern void static_key_disable_cpuslocked(struct static_key *key);
 extern enum jump_label_type jump_label_init_type(struct jump_entry *entry);
 
-/*
- * We should be using ATOMIC_INIT() for initializing .enabled, but
- * the inclusion of atomic.h is problematic for inclusion of jump_label.h
- * in 'low-level' headers. Thus, we are initializing .enabled with a
- * raw value, but have added a BUILD_BUG_ON() to catch any issues in
- * jump_label_init() see: kernel/jump_label.c.
- */
 #define STATIC_KEY_INIT_TRUE					\
-	{ .enabled = { 1 },					\
-	  { .type = JUMP_TYPE_TRUE } }
+	{ .enabled = ATOMIC_INIT(1),				\
+	  .type = JUMP_TYPE_TRUE }
 #define STATIC_KEY_INIT_FALSE					\
-	{ .enabled = { 0 },					\
-	  { .type = JUMP_TYPE_FALSE } }
+	{ .enabled = ATOMIC_INIT(0),				\
+	  .type = JUMP_TYPE_FALSE }
 
 #else  /* !CONFIG_JUMP_LABEL */
 
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index ecaa0440f6ec..734048c02f4f 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -79,7 +79,7 @@ do {									\
 #define __MUTEX_INITIALIZER(lockname) \
 		{ .owner = ATOMIC_LONG_INIT(0) \
 		, .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
-		, .wait_list = LIST_HEAD_INIT(lockname.wait_list) \
+		, .first_waiter = NULL \
 		__DEBUG_MUTEX_INITIALIZER(lockname) \
 		__DEP_MAP_MUTEX_INITIALIZER(lockname) }
 
@@ -87,12 +87,12 @@ do {									\
 	struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key);
+void mutex_init_lockdep(struct mutex *lock, const char *name, struct lock_class_key *key);
 
 static inline void __mutex_init(struct mutex *lock, const char *name,
 				struct lock_class_key *key)
 {
-	mutex_init_lockep(lock, name, key);
+	mutex_init_lockdep(lock, name, key);
 }
 #else
 extern void mutex_init_generic(struct mutex *lock);
@@ -146,7 +146,7 @@ static inline void __mutex_init(struct mutex *lock, const char *name,
 {
 	mutex_rt_init_generic(lock);
 }
-#endif /* !CONFIG_LOCKDEP */
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
 #endif /* CONFIG_PREEMPT_RT */
 
 #ifdef CONFIG_DEBUG_MUTEXES
@@ -183,7 +183,7 @@ static inline int __must_check __devm_mutex_init(struct device *dev, struct mute
  */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass) __acquires(lock);
-extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
+extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock) __acquires(lock);
 extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
 					unsigned int subclass) __cond_acquires(0, lock);
 extern int __must_check _mutex_lock_killable(struct mutex *lock,
@@ -253,6 +253,7 @@ extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) __cond_a
 DEFINE_LOCK_GUARD_1(mutex, struct mutex, mutex_lock(_T->lock), mutex_unlock(_T->lock))
 DEFINE_LOCK_GUARD_1_COND(mutex, _try, mutex_trylock(_T->lock))
 DEFINE_LOCK_GUARD_1_COND(mutex, _intr, mutex_lock_interruptible(_T->lock), _RET == 0)
+DEFINE_LOCK_GUARD_1_COND(mutex, _kill, mutex_lock_killable(_T->lock), _RET == 0)
 DEFINE_LOCK_GUARD_1(mutex_init, struct mutex, mutex_init(_T->lock), /* */)
 
 DECLARE_LOCK_GUARD_1_ATTRS(mutex,	__acquires(_T), __releases(*(struct mutex **)_T))
@@ -261,6 +262,8 @@ DECLARE_LOCK_GUARD_1_ATTRS(mutex_try,	__acquires(_T), __releases(*(struct mutex
 #define class_mutex_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_try, _T)
 DECLARE_LOCK_GUARD_1_ATTRS(mutex_intr,	__acquires(_T), __releases(*(struct mutex **)_T))
 #define class_mutex_intr_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_intr, _T)
+DECLARE_LOCK_GUARD_1_ATTRS(mutex_kill,	__acquires(_T), __releases(*(struct mutex **)_T))
+#define class_mutex_kill_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_kill, _T)
 DECLARE_LOCK_GUARD_1_ATTRS(mutex_init,	__acquires(_T), __releases(*(struct mutex **)_T))
 #define class_mutex_init_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_init, _T)
 
diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h
index 80975935ec48..24ed599fdda8 100644
--- a/include/linux/mutex_types.h
+++ b/include/linux/mutex_types.h
@@ -44,7 +44,7 @@ context_lock_struct(mutex) {
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 	struct optimistic_spin_queue osq; /* Spinner MCS lock */
 #endif
-	struct list_head	wait_list;
+	struct mutex_waiter	*first_waiter __guarded_by(&wait_lock);
 #ifdef CONFIG_DEBUG_MUTEXES
 	void			*magic;
 #endif
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index ede4c6bf6f22..78e7e588817c 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -22,8 +22,8 @@ extern int max_lock_depth;
 
 struct rt_mutex_base {
 	raw_spinlock_t		wait_lock;
-	struct rb_root_cached   waiters;
-	struct task_struct	*owner;
+	struct rb_root_cached   waiters __guarded_by(&wait_lock);
+	struct task_struct	*owner  __guarded_by(&wait_lock);
 };
 
 #define __RT_MUTEX_BASE_INITIALIZER(rtbasename)				\
@@ -41,7 +41,7 @@ struct rt_mutex_base {
  */
 static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock)
 {
-	return READ_ONCE(lock->owner) != NULL;
+	return data_race(READ_ONCE(lock->owner) != NULL);
 }
 
 #ifdef CONFIG_RT_MUTEXES
@@ -49,7 +49,7 @@ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock)
 
 static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
 {
-	unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
+	unsigned long owner = (unsigned long) data_race(READ_ONCE(lock->owner));
 
 	return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
 }
diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index 3390d21c95dd..4e67cd934d8f 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -30,17 +30,27 @@ do {								\
 
 #ifdef CONFIG_DEBUG_SPINLOCK
  extern void do_raw_read_lock(rwlock_t *lock) __acquires_shared(lock);
- extern int do_raw_read_trylock(rwlock_t *lock);
+ extern int do_raw_read_trylock(rwlock_t *lock) __cond_acquires_shared(true, lock);
  extern void do_raw_read_unlock(rwlock_t *lock) __releases_shared(lock);
  extern void do_raw_write_lock(rwlock_t *lock) __acquires(lock);
- extern int do_raw_write_trylock(rwlock_t *lock);
+extern int do_raw_write_trylock(rwlock_t *lock) __cond_acquires(true, lock);
  extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock);
 #else
 # define do_raw_read_lock(rwlock)	do {__acquire_shared(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0)
-# define do_raw_read_trylock(rwlock)	arch_read_trylock(&(rwlock)->raw_lock)
+static inline int do_raw_read_trylock(rwlock_t *rwlock)
+	__cond_acquires_shared(true, rwlock)
+	__no_context_analysis
+{
+	return arch_read_trylock(&(rwlock)->raw_lock);
+}
 # define do_raw_read_unlock(rwlock)	do {arch_read_unlock(&(rwlock)->raw_lock); __release_shared(lock); } while (0)
 # define do_raw_write_lock(rwlock)	do {__acquire(lock); arch_write_lock(&(rwlock)->raw_lock); } while (0)
-# define do_raw_write_trylock(rwlock)	arch_write_trylock(&(rwlock)->raw_lock)
+static inline int do_raw_write_trylock(rwlock_t *rwlock)
+	__cond_acquires(true, rwlock)
+	__no_context_analysis
+{
+	return arch_write_trylock(&(rwlock)->raw_lock);
+}
 # define do_raw_write_unlock(rwlock)	do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0)
 #endif
 
diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index 61a852609eab..9e02a5f28cd1 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -23,7 +23,7 @@ void __lockfunc _raw_write_lock_bh(rwlock_t *lock)	__acquires(lock);
 void __lockfunc _raw_read_lock_irq(rwlock_t *lock)	__acquires_shared(lock);
 void __lockfunc _raw_write_lock_irq(rwlock_t *lock)	__acquires(lock);
 unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
-							__acquires(lock);
+							__acquires_shared(lock);
 unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
 							__acquires(lock);
 int __lockfunc _raw_read_trylock(rwlock_t *lock)	__cond_acquires_shared(true, lock);
@@ -36,7 +36,7 @@ void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)	__releases_shared(lock);
 void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)	__releases(lock);
 void __lockfunc
 _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
-							__releases(lock);
+							__releases_shared(lock);
 void __lockfunc
 _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 							__releases(lock);
@@ -116,6 +116,7 @@ _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 #endif
 
 static inline int __raw_read_trylock(rwlock_t *lock)
+	__cond_acquires_shared(true, lock)
 {
 	preempt_disable();
 	if (do_raw_read_trylock(lock)) {
@@ -127,6 +128,7 @@ static inline int __raw_read_trylock(rwlock_t *lock)
 }
 
 static inline int __raw_write_trylock(rwlock_t *lock)
+	__cond_acquires(true, lock)
 {
 	preempt_disable();
 	if (do_raw_write_trylock(lock)) {
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 9bf1d93d3d7b..6a1a7bae5f81 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -57,7 +57,7 @@ context_lock_struct(rw_semaphore) {
 	struct optimistic_spin_queue osq; /* spinner MCS lock */
 #endif
 	raw_spinlock_t wait_lock;
-	struct list_head wait_list;
+	struct rwsem_waiter *first_waiter __guarded_by(&wait_lock);
 #ifdef CONFIG_DEBUG_RWSEMS
 	void *magic;
 #endif
@@ -106,7 +106,7 @@ static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *
 	  .owner = ATOMIC_LONG_INIT(0),				\
 	  __RWSEM_OPT_INIT(name)				\
 	  .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\
-	  .wait_list = LIST_HEAD_INIT((name).wait_list),	\
+	  .first_waiter = NULL,					\
 	  __RWSEM_DEBUG_INIT(name)				\
 	  __RWSEM_DEP_MAP_INIT(name) }
 
@@ -129,9 +129,9 @@ do {								\
  * rwsem to see if somebody from an incompatible type is wanting access to the
  * lock.
  */
-static inline int rwsem_is_contended(struct rw_semaphore *sem)
+static inline bool rwsem_is_contended(struct rw_semaphore *sem)
 {
-	return !list_empty(&sem->wait_list);
+	return data_race(sem->first_waiter != NULL);
 }
 
 #if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index f842c86b806f..584ae88b435e 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -740,7 +740,7 @@ static inline int thread_group_empty(struct task_struct *p)
 
 extern struct sighand_struct *lock_task_sighand(struct task_struct *task,
 						unsigned long *flags)
-	__acquires(&task->sighand->siglock);
+	__cond_acquires(nonnull, &task->sighand->siglock);
 
 static inline void unlock_task_sighand(struct task_struct *task,
 						unsigned long *flags)
diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h
index 89706157e622..a4c8651ef021 100644
--- a/include/linux/semaphore.h
+++ b/include/linux/semaphore.h
@@ -15,7 +15,7 @@
 struct semaphore {
 	raw_spinlock_t		lock;
 	unsigned int		count;
-	struct list_head	wait_list;
+	struct semaphore_waiter *first_waiter;
 
 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
 	unsigned long		last_holder;
@@ -33,7 +33,7 @@ struct semaphore {
 {									\
 	.lock		= __RAW_SPIN_LOCK_UNLOCKED((name).lock),	\
 	.count		= n,						\
-	.wait_list	= LIST_HEAD_INIT((name).wait_list)		\
+	.first_waiter	= NULL						\
 	__LAST_HOLDER_SEMAPHORE_INITIALIZER				\
 }
 
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index e1e2f144af9b..241277cd34cf 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -178,7 +178,7 @@ do {									\
 
 #ifdef CONFIG_DEBUG_SPINLOCK
  extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
- extern int do_raw_spin_trylock(raw_spinlock_t *lock);
+ extern int do_raw_spin_trylock(raw_spinlock_t *lock) __cond_acquires(true, lock);
  extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock);
 #else
 static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
@@ -189,6 +189,7 @@ static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
 }
 
 static inline int do_raw_spin_trylock(raw_spinlock_t *lock)
+	__cond_acquires(true, lock)
 {
 	int ret = arch_spin_trylock(&(lock)->raw_lock);
 
diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
index 1e84e71ca495..3a50976471d7 100644
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -48,16 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 	lock->slock = 1;
 }
 
-/*
- * Read-write spinlocks. No debug version.
- */
-#define arch_read_lock(lock)		do { barrier(); (void)(lock); } while (0)
-#define arch_write_lock(lock)		do { barrier(); (void)(lock); } while (0)
-#define arch_read_trylock(lock)	({ barrier(); (void)(lock); 1; })
-#define arch_write_trylock(lock)	({ barrier(); (void)(lock); 1; })
-#define arch_read_unlock(lock)		do { barrier(); (void)(lock); } while (0)
-#define arch_write_unlock(lock)	do { barrier(); (void)(lock); } while (0)
-
 #else /* DEBUG_SPINLOCK */
 #define arch_spin_is_locked(lock)	((void)(lock), 0)
 /* for sched/core.c and kernel_lock.c: */
@@ -68,4 +58,14 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 
 #define arch_spin_is_contended(lock)	(((void)(lock), 0))
 
+/*
+ * Read-write spinlocks. No debug version.
+ */
+#define arch_read_lock(lock)		do { barrier(); (void)(lock); } while (0)
+#define arch_write_lock(lock)		do { barrier(); (void)(lock); } while (0)
+#define arch_read_trylock(lock)	({ barrier(); (void)(lock); 1; })
+#define arch_write_trylock(lock)	({ barrier(); (void)(lock); 1; })
+#define arch_read_unlock(lock)		do { barrier(); (void)(lock); } while (0)
+#define arch_write_unlock(lock)	do { barrier(); (void)(lock); } while (0)
+
 #endif /* __LINUX_SPINLOCK_UP_H */
diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 85b1fff02fde..0c95ead5a297 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -181,7 +181,7 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
  * data structures.
  */
 static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
-	__releases(ctx) __acquires_shared(ctx) __no_context_analysis
+	__must_hold(ctx)
 {
 #ifdef DEBUG_WW_MUTEXES
 	lockdep_assert_held(ctx);
@@ -199,7 +199,7 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
  * mutexes have been released with ww_mutex_unlock.
  */
 static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
-	__releases_shared(ctx) __no_context_analysis
+	__releases(ctx) __no_context_analysis
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	mutex_release(&ctx->first_lock_dep_map, _THIS_IP_);
diff --git a/kernel/futex/Makefile b/kernel/futex/Makefile
index b77188d1fa07..dce70f8a322b 100644
--- a/kernel/futex/Makefile
+++ b/kernel/futex/Makefile
@@ -1,3 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 
+CONTEXT_ANALYSIS := y
+
 obj-y += core.o syscalls.o pi.o requeue.o waitwake.o
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 31e83a09789e..ff2a4fb2993f 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -864,7 +864,6 @@ void __futex_unqueue(struct futex_q *q)
 
 /* The key must be already stored in q->key. */
 void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
-	__acquires(&hb->lock)
 {
 	/*
 	 * Increment the counter before taking the lock so that
@@ -879,10 +878,10 @@ void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
 	q->lock_ptr = &hb->lock;
 
 	spin_lock(&hb->lock);
+	__acquire(q->lock_ptr);
 }
 
 void futex_q_unlock(struct futex_hash_bucket *hb)
-	__releases(&hb->lock)
 {
 	futex_hb_waiters_dec(hb);
 	spin_unlock(&hb->lock);
@@ -1443,12 +1442,15 @@ static void futex_cleanup(struct task_struct *tsk)
 void futex_exit_recursive(struct task_struct *tsk)
 {
 	/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
-	if (tsk->futex_state == FUTEX_STATE_EXITING)
+	if (tsk->futex_state == FUTEX_STATE_EXITING) {
+		__assume_ctx_lock(&tsk->futex_exit_mutex);
 		mutex_unlock(&tsk->futex_exit_mutex);
+	}
 	tsk->futex_state = FUTEX_STATE_DEAD;
 }
 
 static void futex_cleanup_begin(struct task_struct *tsk)
+	__acquires(&tsk->futex_exit_mutex)
 {
 	/*
 	 * Prevent various race issues against a concurrent incoming waiter
@@ -1475,6 +1477,7 @@ static void futex_cleanup_begin(struct task_struct *tsk)
 }
 
 static void futex_cleanup_end(struct task_struct *tsk, int state)
+	__releases(&tsk->futex_exit_mutex)
 {
 	/*
 	 * Lockless store. The only side effect is that an observer might
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 30c2afa03889..9f6bf6f585fc 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -217,7 +217,7 @@ enum futex_access {
 
 extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
 			 enum futex_access rw);
-extern void futex_q_lockptr_lock(struct futex_q *q);
+extern void futex_q_lockptr_lock(struct futex_q *q) __acquires(q->lock_ptr);
 extern struct hrtimer_sleeper *
 futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
 		  int flags, u64 range_ns);
@@ -311,9 +311,11 @@ extern int futex_unqueue(struct futex_q *q);
 static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
 			       struct task_struct *task)
 	__releases(&hb->lock)
+	__releases(q->lock_ptr)
 {
 	__futex_queue(q, hb, task);
 	spin_unlock(&hb->lock);
+	__release(q->lock_ptr);
 }
 
 extern void futex_unqueue_pi(struct futex_q *q);
@@ -358,9 +360,12 @@ static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb)
 #endif
 }
 
-extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb);
-extern void futex_q_unlock(struct futex_hash_bucket *hb);
+extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
+	__acquires(&hb->lock)
+	__acquires(q->lock_ptr);
 
+extern void futex_q_unlock(struct futex_hash_bucket *hb)
+	__releases(&hb->lock);
 
 extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 				union futex_key *key,
@@ -379,6 +384,9 @@ extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked);
  */
 static inline void
 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+	__acquires(&hb1->lock)
+	__acquires(&hb2->lock)
+	__no_context_analysis
 {
 	if (hb1 > hb2)
 		swap(hb1, hb2);
@@ -390,6 +398,9 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 
 static inline void
 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+	__releases(&hb1->lock)
+	__releases(&hb2->lock)
+	__no_context_analysis
 {
 	spin_unlock(&hb1->lock);
 	if (hb1 != hb2)
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 7808068fa59e..643199fdbe62 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -389,6 +389,7 @@ static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
 	 * Initialize the pi_mutex in locked state and make @p
 	 * the owner of it:
 	 */
+	__assume_ctx_lock(&pi_state->pi_mutex.wait_lock);
 	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
 
 	/* Store the key for possible exit cleanups: */
@@ -614,6 +615,8 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 static int wake_futex_pi(u32 __user *uaddr, u32 uval,
 			 struct futex_pi_state *pi_state,
 			 struct rt_mutex_waiter *top_waiter)
+	__must_hold(&pi_state->pi_mutex.wait_lock)
+	__releases(&pi_state->pi_mutex.wait_lock)
 {
 	struct task_struct *new_owner;
 	bool postunlock = false;
@@ -670,6 +673,8 @@ out_unlock:
 
 static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 				  struct task_struct *argowner)
+	__must_hold(&q->pi_state->pi_mutex.wait_lock)
+	__must_hold(q->lock_ptr)
 {
 	struct futex_pi_state *pi_state = q->pi_state;
 	struct task_struct *oldowner, *newowner;
@@ -967,6 +972,7 @@ retry_private:
 				 * - EAGAIN: The user space value changed.
 				 */
 				futex_q_unlock(hb);
+				__release(q.lock_ptr);
 				/*
 				 * Handle the case where the owner is in the middle of
 				 * exiting. Wait for the exit to complete otherwise
@@ -1091,6 +1097,7 @@ no_block:
 		if (res)
 			ret = (res < 0) ? res : 0;
 
+		__release(&hb->lock);
 		futex_unqueue_pi(&q);
 		spin_unlock(q.lock_ptr);
 		if (q.drop_hb_ref) {
@@ -1102,10 +1109,12 @@ no_block:
 
 out_unlock_put_key:
 		futex_q_unlock(hb);
+		__release(q.lock_ptr);
 		goto out;
 
 uaddr_faulted:
 		futex_q_unlock(hb);
+		__release(q.lock_ptr);
 
 		ret = fault_in_user_writeable(uaddr);
 		if (ret)
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 1c2dd03f11ec..ceed9d879059 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -462,6 +462,7 @@ retry:
 			}
 
 			futex_q_unlock(hb);
+			__release(q->lock_ptr);
 		}
 		__set_current_state(TASK_RUNNING);
 
@@ -628,6 +629,7 @@ retry_private:
 
 		if (ret) {
 			futex_q_unlock(hb);
+			__release(q->lock_ptr);
 
 			ret = get_user(uval, uaddr);
 			if (ret)
@@ -641,11 +643,13 @@ retry_private:
 
 		if (uval != val) {
 			futex_q_unlock(hb);
+			__release(q->lock_ptr);
 			return -EWOULDBLOCK;
 		}
 
 		if (key2 && futex_match(&q->key, key2)) {
 			futex_q_unlock(hb);
+			__release(q->lock_ptr);
 			return -EINVAL;
 		}
 
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 7cb19e601426..e851e4b37d0e 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -529,15 +529,6 @@ void __init jump_label_init(void)
 	struct static_key *key = NULL;
 	struct jump_entry *iter;
 
-	/*
-	 * Since we are initializing the static_key.enabled field with
-	 * with the 'raw' int values (to avoid pulling in atomic.h) in
-	 * jump_label.h, let's make sure that is safe. There are only two
-	 * cases to check since we initialize to 0 or 1.
-	 */
-	BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0);
-	BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1);
-
 	if (static_key_initialized)
 		return;
 
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index a114949eeed5..cee1901d4cff 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,6 +3,11 @@
 # and is generally not a function of system call inputs.
 KCOV_INSTRUMENT		:= n
 
+CONTEXT_ANALYSIS_mutex.o := y
+CONTEXT_ANALYSIS_rtmutex_api.o := y
+CONTEXT_ANALYSIS_ww_rt_mutex.o := y
+CONTEXT_ANALYSIS_rwsem.o := y
+
 obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
 
 # Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance.
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 2c6b02d4699b..94930d506bcf 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -37,9 +37,8 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
 void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
 {
 	lockdep_assert_held(&lock->wait_lock);
-	DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
+	DEBUG_LOCKS_WARN_ON(!lock->first_waiter);
 	DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
-	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
 }
 
 void debug_mutex_free_waiter(struct mutex_waiter *waiter)
@@ -62,7 +61,6 @@ void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 {
 	struct mutex *blocked_on = __get_task_blocked_on(task);
 
-	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
 	DEBUG_LOCKS_WARN_ON(waiter->task != task);
 	DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock);
 
@@ -74,7 +72,6 @@ void debug_mutex_unlock(struct mutex *lock)
 {
 	if (likely(debug_locks)) {
 		DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-		DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
 	}
 }
 
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 2a1d165b3167..427187ff02db 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -46,8 +46,9 @@
 static void __mutex_init_generic(struct mutex *lock)
 {
 	atomic_long_set(&lock->owner, 0);
-	raw_spin_lock_init(&lock->wait_lock);
-	INIT_LIST_HEAD(&lock->wait_list);
+	scoped_guard (raw_spinlock_init, &lock->wait_lock) {
+		lock->first_waiter = NULL;
+	}
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 	osq_lock_init(&lock->osq);
 #endif
@@ -150,6 +151,7 @@ EXPORT_SYMBOL(mutex_init_generic);
  * follow with a __mutex_trylock() before failing.
  */
 static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
+	__cond_acquires(true, lock)
 {
 	unsigned long curr = (unsigned long)current;
 	unsigned long zero = 0UL;
@@ -163,6 +165,7 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
 }
 
 static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
+	__cond_releases(true, lock)
 {
 	unsigned long curr = (unsigned long)current;
 
@@ -171,7 +174,7 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
 
 #else /* !CONFIG_DEBUG_LOCK_ALLOC */
 
-void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key)
+void mutex_init_lockdep(struct mutex *lock, const char *name, struct lock_class_key *key)
 {
 	__mutex_init_generic(lock);
 
@@ -181,7 +184,7 @@ void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_k
 	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
 	lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
 }
-EXPORT_SYMBOL(mutex_init_lockep);
+EXPORT_SYMBOL(mutex_init_lockdep);
 #endif /* !CONFIG_DEBUG_LOCK_ALLOC */
 
 static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag)
@@ -194,33 +197,44 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag)
 	atomic_long_andnot(flag, &lock->owner);
 }
 
-static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter)
-{
-	return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter;
-}
-
 /*
  * Add @waiter to a given location in the lock wait_list and set the
  * FLAG_WAITERS flag if it's the first waiter.
  */
 static void
 __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
-		   struct list_head *list)
+		   struct mutex_waiter *first)
+	__must_hold(&lock->wait_lock)
 {
 	hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX);
 	debug_mutex_add_waiter(lock, waiter, current);
 
-	list_add_tail(&waiter->list, list);
-	if (__mutex_waiter_is_first(lock, waiter))
+	if (!first)
+		first = lock->first_waiter;
+
+	if (first) {
+		list_add_tail(&waiter->list, &first->list);
+	} else {
+		INIT_LIST_HEAD(&waiter->list);
+		lock->first_waiter = waiter;
 		__mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
+	}
 }
 
 static void
 __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+	__must_hold(&lock->wait_lock)
 {
-	list_del(&waiter->list);
-	if (likely(list_empty(&lock->wait_list)))
+	if (list_empty(&waiter->list)) {
 		__mutex_clear_flag(lock, MUTEX_FLAGS);
+		lock->first_waiter = NULL;
+	} else {
+		if (lock->first_waiter == waiter) {
+			lock->first_waiter = list_first_entry(&waiter->list,
+							      struct mutex_waiter, list);
+		}
+		list_del(&waiter->list);
+	}
 
 	debug_mutex_remove_waiter(lock, waiter, current);
 	hung_task_clear_blocker();
@@ -259,7 +273,8 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
  * We also put the fastpath first in the kernel image, to make sure the
  * branch is predicted by the CPU as default-untaken.
  */
-static void __sched __mutex_lock_slowpath(struct mutex *lock);
+static void __sched __mutex_lock_slowpath(struct mutex *lock)
+	__acquires(lock);
 
 /**
  * mutex_lock - acquire the mutex
@@ -340,7 +355,7 @@ bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
 	 * Similarly, stop spinning if we are no longer the
 	 * first waiter.
 	 */
-	if (waiter && !__mutex_waiter_is_first(lock, waiter))
+	if (waiter && data_race(lock->first_waiter != waiter))
 		return false;
 
 	return true;
@@ -525,7 +540,8 @@ mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
 }
 #endif
 
-static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip);
+static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
+	__releases(lock);
 
 /**
  * mutex_unlock - release the mutex
@@ -565,6 +581,7 @@ EXPORT_SYMBOL(mutex_unlock);
  * of a unlocked mutex is not allowed.
  */
 void __sched ww_mutex_unlock(struct ww_mutex *lock)
+	__no_context_analysis
 {
 	__ww_mutex_unlock(lock);
 	mutex_unlock(&lock->base);
@@ -578,6 +595,7 @@ static __always_inline int __sched
 __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass,
 		    struct lockdep_map *nest_lock, unsigned long ip,
 		    struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+	__cond_acquires(0, lock)
 {
 	DEFINE_WAKE_Q(wake_q);
 	struct mutex_waiter waiter;
@@ -645,7 +663,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 
 	if (!use_ww_ctx) {
 		/* add waiting tasks to the end of the waitqueue (FIFO): */
-		__mutex_add_waiter(lock, &waiter, &lock->wait_list);
+		__mutex_add_waiter(lock, &waiter, NULL);
 	} else {
 		/*
 		 * Add in stamp order, waking up waiters that must kill
@@ -691,7 +709,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 
 		schedule_preempt_disabled();
 
-		first = __mutex_waiter_is_first(lock, &waiter);
+		first = lock->first_waiter == &waiter;
 
 		/*
 		 * As we likely have been woken up by task
@@ -734,8 +752,7 @@ acquired:
 		 * Wound-Wait; we stole the lock (!first_waiter), check the
 		 * waiters as anyone might want to wound us.
 		 */
-		if (!ww_ctx->is_wait_die &&
-		    !__mutex_waiter_is_first(lock, &waiter))
+		if (!ww_ctx->is_wait_die && lock->first_waiter != &waiter)
 			__ww_mutex_check_waiters(lock, ww_ctx, &wake_q);
 	}
 
@@ -772,6 +789,7 @@ err_early_kill:
 static int __sched
 __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
 	     struct lockdep_map *nest_lock, unsigned long ip)
+	__cond_acquires(0, lock)
 {
 	return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
 }
@@ -779,6 +797,7 @@ __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
 static int __sched
 __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
 		unsigned long ip, struct ww_acquire_ctx *ww_ctx)
+	__cond_acquires(0, lock)
 {
 	return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
 }
@@ -826,6 +845,7 @@ void __sched
 mutex_lock_nested(struct mutex *lock, unsigned int subclass)
 {
 	__mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+	__acquire(lock);
 }
 
 EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -834,6 +854,7 @@ void __sched
 _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
 {
 	__mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
+	__acquire(lock);
 }
 EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
 
@@ -862,12 +883,14 @@ mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
 	token = io_schedule_prepare();
 	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
 			    subclass, NULL, _RET_IP_, NULL, 0);
+	__acquire(lock);
 	io_schedule_finish(token);
 }
 EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
 
 static inline int
 ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+	__cond_releases(nonzero, lock)
 {
 #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
 	unsigned tmp;
@@ -929,13 +952,16 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
  * Release the lock, slowpath:
  */
 static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
+	__releases(lock)
 {
 	struct task_struct *next = NULL;
+	struct mutex_waiter *waiter;
 	DEFINE_WAKE_Q(wake_q);
 	unsigned long owner;
 	unsigned long flags;
 
 	mutex_release(&lock->dep_map, ip);
+	__release(lock);
 
 	/*
 	 * Release the lock before (potentially) taking the spinlock such that
@@ -962,12 +988,8 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 
 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
 	debug_mutex_unlock(lock);
-	if (!list_empty(&lock->wait_list)) {
-		/* get the first entry from the wait-list: */
-		struct mutex_waiter *waiter =
-			list_first_entry(&lock->wait_list,
-					 struct mutex_waiter, list);
-
+	waiter = lock->first_waiter;
+	if (waiter) {
 		next = waiter->task;
 
 		debug_mutex_wake_waiter(lock, waiter);
@@ -1061,24 +1083,29 @@ EXPORT_SYMBOL_GPL(mutex_lock_io);
 
 static noinline void __sched
 __mutex_lock_slowpath(struct mutex *lock)
+	__acquires(lock)
 {
 	__mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+	__acquire(lock);
 }
 
 static noinline int __sched
 __mutex_lock_killable_slowpath(struct mutex *lock)
+	__cond_acquires(0, lock)
 {
 	return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
 }
 
 static noinline int __sched
 __mutex_lock_interruptible_slowpath(struct mutex *lock)
+	__cond_acquires(0, lock)
 {
 	return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
 }
 
 static noinline int __sched
 __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+	__cond_acquires(0, lock)
 {
 	return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0,
 			       _RET_IP_, ctx);
@@ -1087,6 +1114,7 @@ __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 static noinline int __sched
 __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
 					    struct ww_acquire_ctx *ctx)
+	__cond_acquires(0, lock)
 {
 	return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0,
 			       _RET_IP_, ctx);
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 9ad4da8cea00..b94ef40c1f48 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -7,6 +7,7 @@
  *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  */
 #ifndef CONFIG_PREEMPT_RT
+#include <linux/mutex.h>
 /*
  * This is the control structure for tasks blocked on mutex, which resides
  * on the blocked task's kernel stack:
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index c80902eacd79..ccaba6148b61 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -94,6 +94,7 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
 
 static __always_inline struct task_struct *
 rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
+	__must_hold(&lock->wait_lock)
 {
 	unsigned long val = (unsigned long)owner;
 
@@ -105,6 +106,7 @@ rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
 
 static __always_inline void
 rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+	__must_hold(&lock->wait_lock)
 {
 	/*
 	 * lock->wait_lock is held but explicit acquire semantics are needed
@@ -114,12 +116,14 @@ rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
 }
 
 static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
+	__must_hold(&lock->wait_lock)
 {
 	/* lock->wait_lock is held so the unlock provides release semantics. */
 	WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
 }
 
 static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
+	__must_hold(&lock->wait_lock)
 {
 	lock->owner = (struct task_struct *)
 			((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
@@ -127,6 +131,7 @@ static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
 
 static __always_inline void
 fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
+	__must_hold(&lock->wait_lock)
 {
 	unsigned long owner, *p = (unsigned long *) &lock->owner;
 
@@ -328,6 +333,7 @@ static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
 }
 
 static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
+	__must_hold(&lock->wait_lock)
 {
 	lock->owner = (struct task_struct *)
 			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
@@ -1206,6 +1212,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
 					   struct ww_acquire_ctx *ww_ctx,
 					   enum rtmutex_chainwalk chwalk,
 					   struct wake_q_head *wake_q)
+	__must_hold(&lock->wait_lock)
 {
 	struct task_struct *owner = rt_mutex_owner(lock);
 	struct rt_mutex_waiter *top_waiter = waiter;
@@ -1249,6 +1256,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
 
 		/* Check whether the waiter should back out immediately */
 		rtm = container_of(lock, struct rt_mutex, rtmutex);
+		__assume_ctx_lock(&rtm->rtmutex.wait_lock);
 		res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q);
 		if (res) {
 			raw_spin_lock(&task->pi_lock);
@@ -1356,6 +1364,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
 }
 
 static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
+	__must_hold(&lock->wait_lock)
 {
 	int ret = try_to_take_rt_mutex(lock, current, NULL);
 
@@ -1505,7 +1514,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
 		 *  - the VCPU on which owner runs is preempted
 		 */
 		if (!owner_on_cpu(owner) || need_resched() ||
-		    !rt_mutex_waiter_is_top_waiter(lock, waiter)) {
+		    !data_race(rt_mutex_waiter_is_top_waiter(lock, waiter))) {
 			res = false;
 			break;
 		}
@@ -1538,6 +1547,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
  */
 static void __sched remove_waiter(struct rt_mutex_base *lock,
 				  struct rt_mutex_waiter *waiter)
+	__must_hold(&lock->wait_lock)
 {
 	bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
 	struct task_struct *owner = rt_mutex_owner(lock);
@@ -1613,6 +1623,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
 	struct task_struct *owner;
 	int ret = 0;
 
+	__assume_ctx_lock(&rtm->rtmutex.wait_lock);
+
 	lockevent_inc(rtmutex_slow_block);
 	for (;;) {
 		/* Try to acquire the lock: */
@@ -1658,6 +1670,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
 static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
 					     struct rt_mutex_base *lock,
 					     struct rt_mutex_waiter *w)
+	__must_hold(&lock->wait_lock)
 {
 	/*
 	 * If the result is not -EDEADLOCK or the caller requested
@@ -1694,11 +1707,13 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
 				       enum rtmutex_chainwalk chwalk,
 				       struct rt_mutex_waiter *waiter,
 				       struct wake_q_head *wake_q)
+	__must_hold(&lock->wait_lock)
 {
 	struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
 	struct ww_mutex *ww = ww_container_of(rtm);
 	int ret;
 
+	__assume_ctx_lock(&rtm->rtmutex.wait_lock);
 	lockdep_assert_held(&lock->wait_lock);
 	lockevent_inc(rtmutex_slowlock);
 
@@ -1750,6 +1765,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
 					     struct ww_acquire_ctx *ww_ctx,
 					     unsigned int state,
 					     struct wake_q_head *wake_q)
+	__must_hold(&lock->wait_lock)
 {
 	struct rt_mutex_waiter waiter;
 	int ret;
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 59dbd29cb219..124219aea46e 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -526,6 +526,7 @@ static __always_inline int __mutex_lock_common(struct mutex *lock,
 					       unsigned int subclass,
 					       struct lockdep_map *nest_lock,
 					       unsigned long ip)
+	__acquires(lock) __no_context_analysis
 {
 	int ret;
 
@@ -647,6 +648,7 @@ EXPORT_SYMBOL(mutex_trylock);
 #endif /* !CONFIG_DEBUG_LOCK_ALLOC */
 
 void __sched mutex_unlock(struct mutex *lock)
+	__releases(lock) __no_context_analysis
 {
 	mutex_release(&lock->dep_map, _RET_IP_);
 	__rt_mutex_unlock(&lock->rtmutex);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index cf6ddd1b23a2..c38b7bdea7b3 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -79,12 +79,18 @@ struct rt_wake_q_head {
  * PI-futex support (proxy locking functions, etc.):
  */
 extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
-				       struct task_struct *proxy_owner);
-extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock);
+				       struct task_struct *proxy_owner)
+	__must_hold(&lock->wait_lock);
+
+extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
+	__must_hold(&lock->wait_lock);
+
 extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
 				     struct rt_mutex_waiter *waiter,
 				     struct task_struct *task,
-				     struct wake_q_head *);
+				     struct wake_q_head *)
+	__must_hold(&lock->wait_lock);
+
 extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
 				     struct rt_mutex_waiter *waiter,
 				     struct task_struct *task);
@@ -94,8 +100,9 @@ extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
 extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
 				 struct rt_mutex_waiter *waiter);
 
-extern int rt_mutex_futex_trylock(struct rt_mutex_base *l);
-extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l);
+extern int rt_mutex_futex_trylock(struct rt_mutex_base *lock);
+extern int __rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+	__must_hold(&lock->wait_lock);
 
 extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock);
 extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
@@ -109,6 +116,7 @@ extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh);
  */
 #ifdef CONFIG_RT_MUTEXES
 static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock)
+	__must_hold(&lock->wait_lock)
 {
 	return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
 }
@@ -120,6 +128,7 @@ static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock)
  */
 static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
 						 struct rt_mutex_waiter *waiter)
+	__must_hold(&lock->wait_lock)
 {
 	struct rb_node *leftmost = rb_first_cached(&lock->waiters);
 
@@ -127,6 +136,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
 }
 
 static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
+	__must_hold(&lock->wait_lock)
 {
 	struct rb_node *leftmost = rb_first_cached(&lock->waiters);
 	struct rt_mutex_waiter *w = NULL;
@@ -170,9 +180,10 @@ enum rtmutex_chainwalk {
 
 static inline void __rt_mutex_base_init(struct rt_mutex_base *lock)
 {
-	raw_spin_lock_init(&lock->wait_lock);
-	lock->waiters = RB_ROOT_CACHED;
-	lock->owner = NULL;
+	scoped_guard (raw_spinlock_init, &lock->wait_lock) {
+		lock->waiters = RB_ROOT_CACHED;
+		lock->owner = NULL;
+	}
 }
 
 /* Debug functions */
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 9f4322c07486..82e078c0665a 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -186,6 +186,7 @@ static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
 
 static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias,
 					 unsigned long flags)
+	__releases(&rwb->rtmutex.wait_lock)
 {
 	struct rt_mutex_base *rtm = &rwb->rtmutex;
 
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 24df4d98f7d2..bf647097369c 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -72,7 +72,7 @@
 		#c, atomic_long_read(&(sem)->count),		\
 		(unsigned long) sem->magic,			\
 		atomic_long_read(&(sem)->owner), (long)current,	\
-		list_empty(&(sem)->wait_list) ? "" : "not "))	\
+		rwsem_is_contended(sem) ? "" : "not "))		\
 			debug_locks_off();			\
 	} while (0)
 #else
@@ -320,9 +320,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
 	sem->magic = sem;
 #endif
 	atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
-	raw_spin_lock_init(&sem->wait_lock);
-	INIT_LIST_HEAD(&sem->wait_list);
 	atomic_long_set(&sem->owner, 0L);
+	scoped_guard (raw_spinlock_init, &sem->wait_lock) {
+		sem->first_waiter = NULL;
+	}
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	osq_lock_init(&sem->osq);
 #endif
@@ -341,8 +342,6 @@ struct rwsem_waiter {
 	unsigned long timeout;
 	bool handoff_set;
 };
-#define rwsem_first_waiter(sem) \
-	list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
 
 enum rwsem_wake_type {
 	RWSEM_WAKE_ANY,		/* Wake whatever's at head of wait list */
@@ -365,12 +364,22 @@ enum rwsem_wake_type {
  */
 #define MAX_READERS_WAKEUP	0x100
 
-static inline void
-rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+static inline
+bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+	__must_hold(&sem->wait_lock)
 {
-	lockdep_assert_held(&sem->wait_lock);
-	list_add_tail(&waiter->list, &sem->wait_list);
-	/* caller will set RWSEM_FLAG_WAITERS */
+	if (list_empty(&waiter->list)) {
+		sem->first_waiter = NULL;
+		return false;
+	}
+
+	if (sem->first_waiter == waiter) {
+		sem->first_waiter = list_first_entry(&waiter->list,
+						     struct rwsem_waiter, list);
+	}
+	list_del(&waiter->list);
+
+	return true;
 }
 
 /*
@@ -385,14 +394,24 @@ static inline bool
 rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
 {
 	lockdep_assert_held(&sem->wait_lock);
-	list_del(&waiter->list);
-	if (likely(!list_empty(&sem->wait_list)))
+	if (__rwsem_del_waiter(sem, waiter))
 		return true;
-
 	atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
 	return false;
 }
 
+static inline
+struct rwsem_waiter *next_waiter(const struct rw_semaphore *sem,
+				 const struct rwsem_waiter *waiter)
+	__must_hold(&sem->wait_lock)
+{
+	struct rwsem_waiter *next = list_first_entry(&waiter->list,
+						     struct rwsem_waiter, list);
+	if (next == sem->first_waiter)
+		return NULL;
+	return next;
+}
+
 /*
  * handle the lock release when processes blocked on it that can now run
  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -411,7 +430,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
 			    enum rwsem_wake_type wake_type,
 			    struct wake_q_head *wake_q)
 {
-	struct rwsem_waiter *waiter, *tmp;
+	struct rwsem_waiter *waiter, *next;
 	long oldcount, woken = 0, adjustment = 0;
 	struct list_head wlist;
 
@@ -421,7 +440,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
 	 * Take a peek at the queue head waiter such that we can determine
 	 * the wakeup(s) to perform.
 	 */
-	waiter = rwsem_first_waiter(sem);
+	waiter = sem->first_waiter;
 
 	if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
 		if (wake_type == RWSEM_WAKE_ANY) {
@@ -506,25 +525,28 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
 	 *    put them into wake_q to be woken up later.
 	 */
 	INIT_LIST_HEAD(&wlist);
-	list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
+	do {
+		next = next_waiter(sem, waiter);
 		if (waiter->type == RWSEM_WAITING_FOR_WRITE)
 			continue;
 
 		woken++;
 		list_move_tail(&waiter->list, &wlist);
+		if (sem->first_waiter == waiter)
+			sem->first_waiter = next;
 
 		/*
 		 * Limit # of readers that can be woken up per wakeup call.
 		 */
 		if (unlikely(woken >= MAX_READERS_WAKEUP))
 			break;
-	}
+	} while ((waiter = next) != NULL);
 
 	adjustment = woken * RWSEM_READER_BIAS - adjustment;
 	lockevent_cond_inc(rwsem_wake_reader, woken);
 
 	oldcount = atomic_long_read(&sem->count);
-	if (list_empty(&sem->wait_list)) {
+	if (!sem->first_waiter) {
 		/*
 		 * Combined with list_move_tail() above, this implies
 		 * rwsem_del_waiter().
@@ -545,7 +567,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
 		atomic_long_add(adjustment, &sem->count);
 
 	/* 2nd pass */
-	list_for_each_entry_safe(waiter, tmp, &wlist, list) {
+	list_for_each_entry_safe(waiter, next, &wlist, list) {
 		struct task_struct *tsk;
 
 		tsk = waiter->task;
@@ -577,7 +599,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
 		      struct wake_q_head *wake_q)
 		      __releases(&sem->wait_lock)
 {
-	bool first = rwsem_first_waiter(sem) == waiter;
+	bool first = sem->first_waiter == waiter;
 
 	wake_q_init(wake_q);
 
@@ -602,8 +624,9 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
  */
 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 					struct rwsem_waiter *waiter)
+	__must_hold(&sem->wait_lock)
 {
-	struct rwsem_waiter *first = rwsem_first_waiter(sem);
+	struct rwsem_waiter *first = sem->first_waiter;
 	long count, new;
 
 	lockdep_assert_held(&sem->wait_lock);
@@ -639,7 +662,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 			new |= RWSEM_WRITER_LOCKED;
 			new &= ~RWSEM_FLAG_HANDOFF;
 
-			if (list_is_singular(&sem->wait_list))
+			if (list_empty(&first->list))
 				new &= ~RWSEM_FLAG_WAITERS;
 		}
 	} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
@@ -659,7 +682,8 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	 * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
 	 * success.
 	 */
-	list_del(&waiter->list);
+	__rwsem_del_waiter(sem, waiter);
+
 	rwsem_set_owner(sem);
 	return true;
 }
@@ -994,7 +1018,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
 {
 	long adjustment = -RWSEM_READER_BIAS;
 	long rcnt = (count >> RWSEM_READER_SHIFT);
-	struct rwsem_waiter waiter;
+	struct rwsem_waiter waiter, *first;
 	DEFINE_WAKE_Q(wake_q);
 
 	/*
@@ -1019,7 +1043,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
 		 */
 		if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
 			raw_spin_lock_irq(&sem->wait_lock);
-			if (!list_empty(&sem->wait_list))
+			if (sem->first_waiter)
 				rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
 						&wake_q);
 			raw_spin_unlock_irq(&sem->wait_lock);
@@ -1035,7 +1059,8 @@ queue:
 	waiter.handoff_set = false;
 
 	raw_spin_lock_irq(&sem->wait_lock);
-	if (list_empty(&sem->wait_list)) {
+	first = sem->first_waiter;
+	if (!first) {
 		/*
 		 * In case the wait queue is empty and the lock isn't owned
 		 * by a writer, this reader can exit the slowpath and return
@@ -1051,8 +1076,11 @@ queue:
 			return sem;
 		}
 		adjustment += RWSEM_FLAG_WAITERS;
+		INIT_LIST_HEAD(&waiter.list);
+		sem->first_waiter = &waiter;
+	} else {
+		list_add_tail(&waiter.list, &first->list);
 	}
-	rwsem_add_waiter(sem, &waiter);
 
 	/* we're now waiting on the lock, but no longer actively locking */
 	count = atomic_long_add_return(adjustment, &sem->count);
@@ -1110,7 +1138,7 @@ out_nolock:
 static struct rw_semaphore __sched *
 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 {
-	struct rwsem_waiter waiter;
+	struct rwsem_waiter waiter, *first;
 	DEFINE_WAKE_Q(wake_q);
 
 	/* do optimistic spinning and steal lock if possible */
@@ -1129,10 +1157,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	waiter.handoff_set = false;
 
 	raw_spin_lock_irq(&sem->wait_lock);
-	rwsem_add_waiter(sem, &waiter);
 
-	/* we're now waiting on the lock */
-	if (rwsem_first_waiter(sem) != &waiter) {
+	first = sem->first_waiter;
+	if (first) {
+		list_add_tail(&waiter.list, &first->list);
 		rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
 				       &wake_q);
 		if (!wake_q_empty(&wake_q)) {
@@ -1145,6 +1173,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 			raw_spin_lock_irq(&sem->wait_lock);
 		}
 	} else {
+		INIT_LIST_HEAD(&waiter.list);
+		sem->first_waiter = &waiter;
 		atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
 	}
 
@@ -1218,7 +1248,7 @@ static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (!list_empty(&sem->wait_list))
+	if (sem->first_waiter)
 		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 
 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -1239,7 +1269,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (!list_empty(&sem->wait_list))
+	if (sem->first_waiter)
 		rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
 
 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -1532,6 +1562,7 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
  * lock for reading
  */
 void __sched down_read(struct rw_semaphore *sem)
+	__no_context_analysis
 {
 	might_sleep();
 	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
@@ -1541,6 +1572,7 @@ void __sched down_read(struct rw_semaphore *sem)
 EXPORT_SYMBOL(down_read);
 
 int __sched down_read_interruptible(struct rw_semaphore *sem)
+	__no_context_analysis
 {
 	might_sleep();
 	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
@@ -1555,6 +1587,7 @@ int __sched down_read_interruptible(struct rw_semaphore *sem)
 EXPORT_SYMBOL(down_read_interruptible);
 
 int __sched down_read_killable(struct rw_semaphore *sem)
+	__no_context_analysis
 {
 	might_sleep();
 	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
@@ -1572,6 +1605,7 @@ EXPORT_SYMBOL(down_read_killable);
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
 int down_read_trylock(struct rw_semaphore *sem)
+	__no_context_analysis
 {
 	int ret = __down_read_trylock(sem);
 
@@ -1585,6 +1619,7 @@ EXPORT_SYMBOL(down_read_trylock);
  * lock for writing
  */
 void __sched down_write(struct rw_semaphore *sem)
+	__no_context_analysis
 {
 	might_sleep();
 	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
@@ -1596,6 +1631,7 @@ EXPORT_SYMBOL(down_write);
  * lock for writing
  */
 int __sched down_write_killable(struct rw_semaphore *sem)
+	__no_context_analysis
 {
 	might_sleep();
 	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
@@ -1614,6 +1650,7 @@ EXPORT_SYMBOL(down_write_killable);
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
 int down_write_trylock(struct rw_semaphore *sem)
+	__no_context_analysis
 {
 	int ret = __down_write_trylock(sem);
 
@@ -1628,6 +1665,7 @@ EXPORT_SYMBOL(down_write_trylock);
  * release a read lock
  */
 void up_read(struct rw_semaphore *sem)
+	__no_context_analysis
 {
 	rwsem_release(&sem->dep_map, _RET_IP_);
 	__up_read(sem);
@@ -1638,6 +1676,7 @@ EXPORT_SYMBOL(up_read);
  * release a write lock
  */
 void up_write(struct rw_semaphore *sem)
+	__no_context_analysis
 {
 	rwsem_release(&sem->dep_map, _RET_IP_);
 	__up_write(sem);
@@ -1648,6 +1687,7 @@ EXPORT_SYMBOL(up_write);
  * downgrade write lock to read lock
  */
 void downgrade_write(struct rw_semaphore *sem)
+	__no_context_analysis
 {
 	lock_downgrade(&sem->dep_map, _RET_IP_);
 	__downgrade_write(sem);
@@ -1657,6 +1697,7 @@ EXPORT_SYMBOL(downgrade_write);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 void down_read_nested(struct rw_semaphore *sem, int subclass)
+	__no_context_analysis
 {
 	might_sleep();
 	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
@@ -1665,6 +1706,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
 EXPORT_SYMBOL(down_read_nested);
 
 int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
+	__no_context_analysis
 {
 	might_sleep();
 	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
@@ -1679,6 +1721,7 @@ int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
 EXPORT_SYMBOL(down_read_killable_nested);
 
 void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
+	__no_context_analysis
 {
 	might_sleep();
 	rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
@@ -1687,6 +1730,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
 EXPORT_SYMBOL(_down_write_nest_lock);
 
 void down_read_non_owner(struct rw_semaphore *sem)
+	__no_context_analysis
 {
 	might_sleep();
 	__down_read(sem);
@@ -1701,6 +1745,7 @@ void down_read_non_owner(struct rw_semaphore *sem)
 EXPORT_SYMBOL(down_read_non_owner);
 
 void down_write_nested(struct rw_semaphore *sem, int subclass)
+	__no_context_analysis
 {
 	might_sleep();
 	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
@@ -1709,6 +1754,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
 EXPORT_SYMBOL(down_write_nested);
 
 int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
+	__no_context_analysis
 {
 	might_sleep();
 	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
@@ -1724,6 +1770,7 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
 EXPORT_SYMBOL(down_write_killable_nested);
 
 void up_read_non_owner(struct rw_semaphore *sem)
+	__no_context_analysis
 {
 	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
 	__up_read(sem);
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 3ef032e22f7e..74d41433ba13 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -21,7 +21,7 @@
  * too.
  *
  * The ->count variable represents how many more tasks can acquire this
- * semaphore.  If it's zero, there may be tasks waiting on the wait_list.
+ * semaphore.  If it's zero, there may be waiters.
  */
 
 #include <linux/compiler.h>
@@ -226,7 +226,7 @@ void __sched up(struct semaphore *sem)
 
 	hung_task_sem_clear_if_holder(sem);
 
-	if (likely(list_empty(&sem->wait_list)))
+	if (likely(!sem->first_waiter))
 		sem->count++;
 	else
 		__up(sem, &wake_q);
@@ -244,6 +244,21 @@ struct semaphore_waiter {
 	bool up;
 };
 
+static inline
+void sem_del_waiter(struct semaphore *sem, struct semaphore_waiter *waiter)
+{
+	if (list_empty(&waiter->list)) {
+		sem->first_waiter = NULL;
+		return;
+	}
+
+	if (sem->first_waiter == waiter) {
+		sem->first_waiter = list_first_entry(&waiter->list,
+						     struct semaphore_waiter, list);
+	}
+	list_del(&waiter->list);
+}
+
 /*
  * Because this function is inlined, the 'state' parameter will be
  * constant, and thus optimised away by the compiler.  Likewise the
@@ -252,9 +267,15 @@ struct semaphore_waiter {
 static inline int __sched ___down_common(struct semaphore *sem, long state,
 								long timeout)
 {
-	struct semaphore_waiter waiter;
-
-	list_add_tail(&waiter.list, &sem->wait_list);
+	struct semaphore_waiter waiter, *first;
+
+	first = sem->first_waiter;
+	if (first) {
+		list_add_tail(&waiter.list, &first->list);
+	} else {
+		INIT_LIST_HEAD(&waiter.list);
+		sem->first_waiter = &waiter;
+	}
 	waiter.task = current;
 	waiter.up = false;
 
@@ -274,11 +295,11 @@ static inline int __sched ___down_common(struct semaphore *sem, long state,
 	}
 
  timed_out:
-	list_del(&waiter.list);
+	sem_del_waiter(sem, &waiter);
 	return -ETIME;
 
  interrupted:
-	list_del(&waiter.list);
+	sem_del_waiter(sem, &waiter);
 	return -EINTR;
 }
 
@@ -321,9 +342,9 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
 static noinline void __sched __up(struct semaphore *sem,
 				  struct wake_q_head *wake_q)
 {
-	struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
-						struct semaphore_waiter, list);
-	list_del(&waiter->list);
+	struct semaphore_waiter *waiter = sem->first_waiter;
+
+	sem_del_waiter(sem, waiter);
 	waiter->up = true;
 	wake_q_add(wake_q, waiter->task);
 }
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 7685defd7c52..b42d293da38b 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -64,8 +64,9 @@ EXPORT_PER_CPU_SYMBOL(__mmiowb_state);
  * time (making _this_ CPU preemptible if possible), and we also signal
  * towards that other CPU that it should break the lock ASAP.
  */
-#define BUILD_LOCK_OPS(op, locktype)					\
+#define BUILD_LOCK_OPS(op, locktype, lock_ctx_op)			\
 static void __lockfunc __raw_##op##_lock(locktype##_t *lock)		\
+	lock_ctx_op(lock)						\
 {									\
 	for (;;) {							\
 		preempt_disable();					\
@@ -78,6 +79,7 @@ static void __lockfunc __raw_##op##_lock(locktype##_t *lock)		\
 }									\
 									\
 static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
+	lock_ctx_op(lock)						\
 {									\
 	unsigned long flags;						\
 									\
@@ -96,11 +98,13 @@ static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
 }									\
 									\
 static void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock)	\
+	lock_ctx_op(lock)						\
 {									\
 	_raw_##op##_lock_irqsave(lock);					\
 }									\
 									\
 static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)		\
+	lock_ctx_op(lock)						\
 {									\
 	unsigned long flags;						\
 									\
@@ -123,11 +127,11 @@ static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)		\
  *         __[spin|read|write]_lock_irqsave()
  *         __[spin|read|write]_lock_bh()
  */
-BUILD_LOCK_OPS(spin, raw_spinlock);
+BUILD_LOCK_OPS(spin, raw_spinlock, __acquires);
 
 #ifndef CONFIG_PREEMPT_RT
-BUILD_LOCK_OPS(read, rwlock);
-BUILD_LOCK_OPS(write, rwlock);
+BUILD_LOCK_OPS(read, rwlock, __acquires_shared);
+BUILD_LOCK_OPS(write, rwlock, __acquires);
 #endif
 
 #endif
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 31a785afee6c..b1834ab7e782 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -4,24 +4,21 @@
 
 #define MUTEX		mutex
 #define MUTEX_WAITER	mutex_waiter
+#define WAIT_LOCK	wait_lock
 
 static inline struct mutex_waiter *
 __ww_waiter_first(struct mutex *lock)
+	__must_hold(&lock->wait_lock)
 {
-	struct mutex_waiter *w;
-
-	w = list_first_entry(&lock->wait_list, struct mutex_waiter, list);
-	if (list_entry_is_head(w, &lock->wait_list, list))
-		return NULL;
-
-	return w;
+	return lock->first_waiter;
 }
 
 static inline struct mutex_waiter *
 __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w)
+	__must_hold(&lock->wait_lock)
 {
 	w = list_next_entry(w, list);
-	if (list_entry_is_head(w, &lock->wait_list, list))
+	if (lock->first_waiter == w)
 		return NULL;
 
 	return w;
@@ -29,9 +26,10 @@ __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w)
 
 static inline struct mutex_waiter *
 __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w)
+	__must_hold(&lock->wait_lock)
 {
 	w = list_prev_entry(w, list);
-	if (list_entry_is_head(w, &lock->wait_list, list))
+	if (lock->first_waiter == w)
 		return NULL;
 
 	return w;
@@ -39,23 +37,20 @@ __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w)
 
 static inline struct mutex_waiter *
 __ww_waiter_last(struct mutex *lock)
+	__must_hold(&lock->wait_lock)
 {
-	struct mutex_waiter *w;
-
-	w = list_last_entry(&lock->wait_list, struct mutex_waiter, list);
-	if (list_entry_is_head(w, &lock->wait_list, list))
-		return NULL;
+	struct mutex_waiter *w = lock->first_waiter;
 
+	if (w)
+		w = list_prev_entry(w, list);
 	return w;
 }
 
 static inline void
 __ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos)
+	__must_hold(&lock->wait_lock)
 {
-	struct list_head *p = &lock->wait_list;
-	if (pos)
-		p = &pos->list;
-	__mutex_add_waiter(lock, waiter, p);
+	__mutex_add_waiter(lock, waiter, pos);
 }
 
 static inline struct task_struct *
@@ -71,16 +66,19 @@ __ww_mutex_has_waiters(struct mutex *lock)
 }
 
 static inline void lock_wait_lock(struct mutex *lock, unsigned long *flags)
+	__acquires(&lock->wait_lock)
 {
 	raw_spin_lock_irqsave(&lock->wait_lock, *flags);
 }
 
 static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags)
+	__releases(&lock->wait_lock)
 {
 	raw_spin_unlock_irqrestore(&lock->wait_lock, *flags);
 }
 
 static inline void lockdep_assert_wait_lock_held(struct mutex *lock)
+	__must_hold(&lock->wait_lock)
 {
 	lockdep_assert_held(&lock->wait_lock);
 }
@@ -89,9 +87,11 @@ static inline void lockdep_assert_wait_lock_held(struct mutex *lock)
 
 #define MUTEX		rt_mutex
 #define MUTEX_WAITER	rt_mutex_waiter
+#define WAIT_LOCK	rtmutex.wait_lock
 
 static inline struct rt_mutex_waiter *
 __ww_waiter_first(struct rt_mutex *lock)
+	__must_hold(&lock->rtmutex.wait_lock)
 {
 	struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
 	if (!n)
@@ -119,6 +119,7 @@ __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
 
 static inline struct rt_mutex_waiter *
 __ww_waiter_last(struct rt_mutex *lock)
+	__must_hold(&lock->rtmutex.wait_lock)
 {
 	struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
 	if (!n)
@@ -140,21 +141,25 @@ __ww_mutex_owner(struct rt_mutex *lock)
 
 static inline bool
 __ww_mutex_has_waiters(struct rt_mutex *lock)
+	__must_hold(&lock->rtmutex.wait_lock)
 {
 	return rt_mutex_has_waiters(&lock->rtmutex);
 }
 
 static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags)
+	__acquires(&lock->rtmutex.wait_lock)
 {
 	raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags);
 }
 
 static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags)
+	__releases(&lock->rtmutex.wait_lock)
 {
 	raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags);
 }
 
 static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock)
+	__must_hold(&lock->rtmutex.wait_lock)
 {
 	lockdep_assert_held(&lock->rtmutex.wait_lock);
 }
@@ -307,6 +312,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
 			     struct ww_acquire_ctx *ww_ctx,
 			     struct ww_acquire_ctx *hold_ctx,
 			     struct wake_q_head *wake_q)
+	__must_hold(&lock->WAIT_LOCK)
 {
 	struct task_struct *owner = __ww_mutex_owner(lock);
 
@@ -371,6 +377,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
 static void
 __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx,
 			 struct wake_q_head *wake_q)
+	__must_hold(&lock->WAIT_LOCK)
 {
 	struct MUTEX_WAITER *cur;
 
@@ -397,6 +404,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	DEFINE_WAKE_Q(wake_q);
 	unsigned long flags;
+	bool has_waiters;
 
 	ww_mutex_lock_acquired(lock, ctx);
 
@@ -418,7 +426,8 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 	 * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
 	 * and/or !empty list.
 	 */
-	if (likely(!__ww_mutex_has_waiters(&lock->base)))
+	has_waiters = data_race(__ww_mutex_has_waiters(&lock->base));
+	if (likely(!has_waiters))
 		return;
 
 	/*
@@ -464,6 +473,7 @@ __ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
 static inline int
 __ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
 		      struct ww_acquire_ctx *ctx)
+	__must_hold(&lock->WAIT_LOCK)
 {
 	struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
 	struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
@@ -514,6 +524,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
 		      struct MUTEX *lock,
 		      struct ww_acquire_ctx *ww_ctx,
 		      struct wake_q_head *wake_q)
+	__must_hold(&lock->WAIT_LOCK)
 {
 	struct MUTEX_WAITER *cur, *pos = NULL;
 	bool is_wait_die;
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
index c7196de838ed..e07fb3b96bc3 100644
--- a/kernel/locking/ww_rt_mutex.c
+++ b/kernel/locking/ww_rt_mutex.c
@@ -90,6 +90,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 EXPORT_SYMBOL(ww_mutex_lock_interruptible);
 
 void __sched ww_mutex_unlock(struct ww_mutex *lock)
+	__no_context_analysis
 {
 	struct rt_mutex *rtm = &lock->base;
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 94eb2667b7e1..aac60b6cfa4b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1618,14 +1618,22 @@ config LOCKDEP_STACK_TRACE_BITS
 	int "Size for MAX_STACK_TRACE_ENTRIES (as Nth power of 2)"
 	depends on LOCKDEP && !LOCKDEP_SMALL
 	range 10 26
+	default 21 if KASAN
 	default 19
 	help
 	  Try increasing this value if you hit "BUG: MAX_STACK_TRACE_ENTRIES too low!" message.
 
+	  KASAN significantly increases stack trace consumption because its
+	  slab tracking interacts with lockdep's dependency validation under
+	  PREEMPT_FULL, creating a feedback loop.  The higher default when
+	  KASAN is enabled costs ~12MB extra, which is negligible compared to
+	  KASAN's own shadow memory overhead.
+
 config LOCKDEP_STACK_TRACE_HASH_BITS
 	int "Size for STACK_TRACE_HASH_SIZE (as Nth power of 2)"
 	depends on LOCKDEP && !LOCKDEP_SMALL
 	range 10 26
+	default 16 if KASAN
 	default 14
 	help
 	  Try increasing this value if you need large STACK_TRACE_HASH_SIZE.
diff --git a/lib/test_context-analysis.c b/lib/test_context-analysis.c
index 140efa8a9763..06b4a6a028e0 100644
--- a/lib/test_context-analysis.c
+++ b/lib/test_context-analysis.c
@@ -596,3 +596,14 @@ static void __used test_ww_mutex_lock_ctx(struct test_ww_mutex_data *d)
 
 	ww_mutex_destroy(&d->mtx);
 }
+
+static DEFINE_PER_CPU(raw_spinlock_t, test_per_cpu_lock);
+
+static void __used test_per_cpu(int cpu)
+{
+	raw_spin_lock(&per_cpu(test_per_cpu_lock, cpu));
+	raw_spin_unlock(&per_cpu(test_per_cpu_lock, cpu));
+
+	raw_spin_lock(per_cpu_ptr(&test_per_cpu_lock, cpu));
+	raw_spin_unlock(per_cpu_ptr(&test_per_cpu_lock, cpu));
+}
diff --git a/rust/helpers/atomic_ext.c b/rust/helpers/atomic_ext.c
index 7d0c2bd340da..c267d5190529 100644
--- a/rust/helpers/atomic_ext.c
+++ b/rust/helpers/atomic_ext.c
@@ -4,45 +4,39 @@
 #include <asm/rwonce.h>
 #include <linux/atomic.h>
 
-__rust_helper s8 rust_helper_atomic_i8_read(s8 *ptr)
-{
-	return READ_ONCE(*ptr);
+#define GEN_READ_HELPER(tname, type)						\
+__rust_helper type rust_helper_atomic_##tname##_read(type *ptr)			\
+{										\
+	return READ_ONCE(*ptr);							\
 }
 
-__rust_helper s8 rust_helper_atomic_i8_read_acquire(s8 *ptr)
-{
-	return smp_load_acquire(ptr);
+#define GEN_SET_HELPER(tname, type)						\
+__rust_helper void rust_helper_atomic_##tname##_set(type *ptr, type val)	\
+{										\
+	WRITE_ONCE(*ptr, val);							\
 }
 
-__rust_helper s16 rust_helper_atomic_i16_read(s16 *ptr)
-{
-	return READ_ONCE(*ptr);
+#define GEN_READ_ACQUIRE_HELPER(tname, type)					\
+__rust_helper type rust_helper_atomic_##tname##_read_acquire(type *ptr)		\
+{										\
+	return smp_load_acquire(ptr);						\
 }
 
-__rust_helper s16 rust_helper_atomic_i16_read_acquire(s16 *ptr)
-{
-	return smp_load_acquire(ptr);
+#define GEN_SET_RELEASE_HELPER(tname, type)					\
+__rust_helper void rust_helper_atomic_##tname##_set_release(type *ptr, type val)\
+{										\
+	smp_store_release(ptr, val);						\
 }
 
-__rust_helper void rust_helper_atomic_i8_set(s8 *ptr, s8 val)
-{
-	WRITE_ONCE(*ptr, val);
-}
+#define GEN_READ_SET_HELPERS(tname, type)					\
+	GEN_READ_HELPER(tname, type)						\
+	GEN_SET_HELPER(tname, type)						\
+	GEN_READ_ACQUIRE_HELPER(tname, type)					\
+	GEN_SET_RELEASE_HELPER(tname, type)					\
 
-__rust_helper void rust_helper_atomic_i8_set_release(s8 *ptr, s8 val)
-{
-	smp_store_release(ptr, val);
-}
-
-__rust_helper void rust_helper_atomic_i16_set(s16 *ptr, s16 val)
-{
-	WRITE_ONCE(*ptr, val);
-}
-
-__rust_helper void rust_helper_atomic_i16_set_release(s16 *ptr, s16 val)
-{
-	smp_store_release(ptr, val);
-}
+GEN_READ_SET_HELPERS(i8, s8)
+GEN_READ_SET_HELPERS(i16, s16)
+GEN_READ_SET_HELPERS(ptr, const void *)
 
 /*
  * xchg helpers depend on ARCH_SUPPORTS_ATOMIC_RMW and on the
@@ -51,45 +45,22 @@ __rust_helper void rust_helper_atomic_i16_set_release(s16 *ptr, s16 val)
  * The architectures that currently support Rust (x86_64, armv7,
  * arm64, riscv, and loongarch) satisfy these requirements.
  */
-__rust_helper s8 rust_helper_atomic_i8_xchg(s8 *ptr, s8 new)
-{
-	return xchg(ptr, new);
+#define GEN_XCHG_HELPER(tname, type, suffix)					\
+__rust_helper type								\
+rust_helper_atomic_##tname##_xchg##suffix(type *ptr, type new)			\
+{										\
+	return xchg##suffix(ptr, new);					\
 }
 
-__rust_helper s16 rust_helper_atomic_i16_xchg(s16 *ptr, s16 new)
-{
-	return xchg(ptr, new);
-}
+#define GEN_XCHG_HELPERS(tname, type)						\
+	GEN_XCHG_HELPER(tname, type, )						\
+	GEN_XCHG_HELPER(tname, type, _acquire)					\
+	GEN_XCHG_HELPER(tname, type, _release)					\
+	GEN_XCHG_HELPER(tname, type, _relaxed)					\
 
-__rust_helper s8 rust_helper_atomic_i8_xchg_acquire(s8 *ptr, s8 new)
-{
-	return xchg_acquire(ptr, new);
-}
-
-__rust_helper s16 rust_helper_atomic_i16_xchg_acquire(s16 *ptr, s16 new)
-{
-	return xchg_acquire(ptr, new);
-}
-
-__rust_helper s8 rust_helper_atomic_i8_xchg_release(s8 *ptr, s8 new)
-{
-	return xchg_release(ptr, new);
-}
-
-__rust_helper s16 rust_helper_atomic_i16_xchg_release(s16 *ptr, s16 new)
-{
-	return xchg_release(ptr, new);
-}
-
-__rust_helper s8 rust_helper_atomic_i8_xchg_relaxed(s8 *ptr, s8 new)
-{
-	return xchg_relaxed(ptr, new);
-}
-
-__rust_helper s16 rust_helper_atomic_i16_xchg_relaxed(s16 *ptr, s16 new)
-{
-	return xchg_relaxed(ptr, new);
-}
+GEN_XCHG_HELPERS(i8, s8)
+GEN_XCHG_HELPERS(i16, s16)
+GEN_XCHG_HELPERS(ptr, const void *)
 
 /*
  * try_cmpxchg helpers depend on ARCH_SUPPORTS_ATOMIC_RMW and on the
@@ -98,42 +69,19 @@ __rust_helper s16 rust_helper_atomic_i16_xchg_relaxed(s16 *ptr, s16 new)
  * The architectures that currently support Rust (x86_64, armv7,
  * arm64, riscv, and loongarch) satisfy these requirements.
  */
-__rust_helper bool rust_helper_atomic_i8_try_cmpxchg(s8 *ptr, s8 *old, s8 new)
-{
-	return try_cmpxchg(ptr, old, new);
-}
-
-__rust_helper bool rust_helper_atomic_i16_try_cmpxchg(s16 *ptr, s16 *old, s16 new)
-{
-	return try_cmpxchg(ptr, old, new);
-}
-
-__rust_helper bool rust_helper_atomic_i8_try_cmpxchg_acquire(s8 *ptr, s8 *old, s8 new)
-{
-	return try_cmpxchg_acquire(ptr, old, new);
-}
-
-__rust_helper bool rust_helper_atomic_i16_try_cmpxchg_acquire(s16 *ptr, s16 *old, s16 new)
-{
-	return try_cmpxchg_acquire(ptr, old, new);
-}
-
-__rust_helper bool rust_helper_atomic_i8_try_cmpxchg_release(s8 *ptr, s8 *old, s8 new)
-{
-	return try_cmpxchg_release(ptr, old, new);
-}
-
-__rust_helper bool rust_helper_atomic_i16_try_cmpxchg_release(s16 *ptr, s16 *old, s16 new)
-{
-	return try_cmpxchg_release(ptr, old, new);
-}
-
-__rust_helper bool rust_helper_atomic_i8_try_cmpxchg_relaxed(s8 *ptr, s8 *old, s8 new)
-{
-	return try_cmpxchg_relaxed(ptr, old, new);
-}
-
-__rust_helper bool rust_helper_atomic_i16_try_cmpxchg_relaxed(s16 *ptr, s16 *old, s16 new)
-{
-	return try_cmpxchg_relaxed(ptr, old, new);
-}
+#define GEN_TRY_CMPXCHG_HELPER(tname, type, suffix)				\
+__rust_helper bool								\
+rust_helper_atomic_##tname##_try_cmpxchg##suffix(type *ptr, type *old, type new)\
+{										\
+	return try_cmpxchg##suffix(ptr, old, new);				\
+}
+
+#define GEN_TRY_CMPXCHG_HELPERS(tname, type)					\
+	GEN_TRY_CMPXCHG_HELPER(tname, type, )					\
+	GEN_TRY_CMPXCHG_HELPER(tname, type, _acquire)				\
+	GEN_TRY_CMPXCHG_HELPER(tname, type, _release)				\
+	GEN_TRY_CMPXCHG_HELPER(tname, type, _relaxed)				\
+
+GEN_TRY_CMPXCHG_HELPERS(i8, s8)
+GEN_TRY_CMPXCHG_HELPERS(i16, s16)
+GEN_TRY_CMPXCHG_HELPERS(ptr, const void *)
diff --git a/rust/kernel/list/arc.rs b/rust/kernel/list/arc.rs
index a9a2b0178f65..209b1173c826 100644
--- a/rust/kernel/list/arc.rs
+++ b/rust/kernel/list/arc.rs
@@ -6,7 +6,7 @@
 
 use crate::alloc::{AllocError, Flags};
 use crate::prelude::*;
-use crate::sync::atomic::{ordering, Atomic};
+use crate::sync::atomic::{ordering, AtomicFlag};
 use crate::sync::{Arc, ArcBorrow, UniqueArc};
 use core::marker::PhantomPinned;
 use core::ops::Deref;
@@ -450,7 +450,7 @@ where
 /// If the boolean is `false`, then there is no [`ListArc`] for this value.
 #[repr(transparent)]
 pub struct AtomicTracker<const ID: u64 = 0> {
-    inner: Atomic<bool>,
+    inner: AtomicFlag,
     // This value needs to be pinned to justify the INVARIANT: comment in `AtomicTracker::new`.
     _pin: PhantomPinned,
 }
@@ -461,12 +461,12 @@ impl<const ID: u64> AtomicTracker<ID> {
         // INVARIANT: Pin-init initializers can't be used on an existing `Arc`, so this value will
         // not be constructed in an `Arc` that already has a `ListArc`.
         Self {
-            inner: Atomic::new(false),
+            inner: AtomicFlag::new(false),
             _pin: PhantomPinned,
         }
     }
 
-    fn project_inner(self: Pin<&mut Self>) -> &mut Atomic<bool> {
+    fn project_inner(self: Pin<&mut Self>) -> &mut AtomicFlag {
         // SAFETY: The `inner` field is not structurally pinned, so we may obtain a mutable
         // reference to it even if we only have a pinned reference to `self`.
         unsafe { &mut Pin::into_inner_unchecked(self).inner }
diff --git a/rust/kernel/sync/atomic.rs b/rust/kernel/sync/atomic.rs
index 4aebeacb961a..9cd009d57e35 100644
--- a/rust/kernel/sync/atomic.rs
+++ b/rust/kernel/sync/atomic.rs
@@ -51,6 +51,10 @@ use ordering::OrderingType;
 #[repr(transparent)]
 pub struct Atomic<T: AtomicType>(AtomicRepr<T::Repr>);
 
+// SAFETY: `Atomic<T>` is safe to transfer between execution contexts because of the safety
+// requirement of `AtomicType`.
+unsafe impl<T: AtomicType> Send for Atomic<T> {}
+
 // SAFETY: `Atomic<T>` is safe to share among execution contexts because all accesses are atomic.
 unsafe impl<T: AtomicType> Sync for Atomic<T> {}
 
@@ -68,6 +72,11 @@ unsafe impl<T: AtomicType> Sync for Atomic<T> {}
 ///
 /// - [`Self`] must have the same size and alignment as [`Self::Repr`].
 /// - [`Self`] must be [round-trip transmutable] to  [`Self::Repr`].
+/// - [`Self`] must be safe to transfer between execution contexts, if it's [`Send`], this is
+///   automatically satisfied. The exception is pointer types that are even though marked as
+///   `!Send` (e.g. raw pointers and [`NonNull<T>`]) but requiring `unsafe` to do anything
+///   meaningful on them. This is because transferring pointer values between execution contexts is
+///   safe as long as the actual `unsafe` dereferencing is justified.
 ///
 /// Note that this is more relaxed than requiring the bi-directional transmutability (i.e.
 /// [`transmute()`] is always sound between `U` and `T`) because of the support for atomic
@@ -108,7 +117,8 @@ unsafe impl<T: AtomicType> Sync for Atomic<T> {}
 /// [`transmute()`]: core::mem::transmute
 /// [round-trip transmutable]: AtomicType#round-trip-transmutability
 /// [Examples]: AtomicType#examples
-pub unsafe trait AtomicType: Sized + Send + Copy {
+/// [`NonNull<T>`]: core::ptr::NonNull
+pub unsafe trait AtomicType: Sized + Copy {
     /// The backing atomic implementation type.
     type Repr: AtomicImpl;
 }
@@ -204,10 +214,7 @@ impl<T: AtomicType> Atomic<T> {
     /// // no data race.
     /// unsafe { Atomic::from_ptr(foo_a_ptr) }.store(2, Release);
     /// ```
-    pub unsafe fn from_ptr<'a>(ptr: *mut T) -> &'a Self
-    where
-        T: Sync,
-    {
+    pub unsafe fn from_ptr<'a>(ptr: *mut T) -> &'a Self {
         // CAST: `T` and `Atomic<T>` have the same size, alignment and bit validity.
         // SAFETY: Per function safety requirement, `ptr` is a valid pointer and the object will
         // live long enough. It's safe to return a `&Atomic<T>` because function safety requirement
@@ -235,6 +242,17 @@ impl<T: AtomicType> Atomic<T> {
     /// Returns a mutable reference to the underlying atomic `T`.
     ///
     /// This is safe because the mutable reference of the atomic `T` guarantees exclusive access.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use kernel::sync::atomic::{Atomic, Relaxed};
+    ///
+    /// let mut atomic_val = Atomic::new(0u32);
+    /// let val_mut = atomic_val.get_mut();
+    /// *val_mut = 101;
+    /// assert_eq!(101, atomic_val.load(Relaxed));
+    /// ```
     pub fn get_mut(&mut self) -> &mut T {
         // CAST: `T` and `T::Repr` has the same size and alignment per the safety requirement of
         // `AtomicType`, and per the type invariants `self.0` is a valid `T`, therefore the casting
@@ -527,16 +545,14 @@ where
     /// use kernel::sync::atomic::{Atomic, Acquire, Full, Relaxed};
     ///
     /// let x = Atomic::new(42);
-    ///
     /// assert_eq!(42, x.load(Relaxed));
-    ///
-    /// assert_eq!(54, { x.fetch_add(12, Acquire); x.load(Relaxed) });
+    /// assert_eq!(42, x.fetch_add(12, Acquire));
+    /// assert_eq!(54, x.load(Relaxed));
     ///
     /// let x = Atomic::new(42);
-    ///
     /// assert_eq!(42, x.load(Relaxed));
-    ///
-    /// assert_eq!(54, { x.fetch_add(12, Full); x.load(Relaxed) } );
+    /// assert_eq!(42, x.fetch_add(12, Full));
+    /// assert_eq!(54, x.load(Relaxed));
     /// ```
     #[inline(always)]
     pub fn fetch_add<Rhs, Ordering: ordering::Ordering>(&self, v: Rhs, _: Ordering) -> T
@@ -559,4 +575,276 @@ where
         // SAFETY: `ret` comes from reading `self.0`, which is a valid `T` per type invariants.
         unsafe { from_repr(ret) }
     }
+
+    /// Atomic fetch and subtract.
+    ///
+    /// Atomically updates `*self` to `(*self).wrapping_sub(v)`, and returns the value of `*self`
+    /// before the update.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use kernel::sync::atomic::{Atomic, Acquire, Full, Relaxed};
+    ///
+    /// let x = Atomic::new(42);
+    /// assert_eq!(42, x.load(Relaxed));
+    /// assert_eq!(42, x.fetch_sub(12, Acquire));
+    /// assert_eq!(30, x.load(Relaxed));
+    ///
+    /// let x = Atomic::new(42);
+    /// assert_eq!(42, x.load(Relaxed));
+    /// assert_eq!(42, x.fetch_sub(12, Full));
+    /// assert_eq!(30, x.load(Relaxed));
+    /// ```
+    #[inline(always)]
+    pub fn fetch_sub<Rhs, Ordering: ordering::Ordering>(&self, v: Rhs, _: Ordering) -> T
+    where
+        // Types that support addition also support subtraction.
+        T: AtomicAdd<Rhs>,
+    {
+        let v = T::rhs_into_delta(v);
+
+        // INVARIANT: `self.0` is a valid `T` after `atomic_fetch_sub*()` due to safety requirement
+        // of `AtomicAdd`.
+        let ret = {
+            match Ordering::TYPE {
+                OrderingType::Full => T::Repr::atomic_fetch_sub(&self.0, v),
+                OrderingType::Acquire => T::Repr::atomic_fetch_sub_acquire(&self.0, v),
+                OrderingType::Release => T::Repr::atomic_fetch_sub_release(&self.0, v),
+                OrderingType::Relaxed => T::Repr::atomic_fetch_sub_relaxed(&self.0, v),
+            }
+        };
+
+        // SAFETY: `ret` comes from reading `self.0`, which is a valid `T` per type invariants.
+        unsafe { from_repr(ret) }
+    }
+}
+
+#[cfg(any(CONFIG_X86_64, CONFIG_UML, CONFIG_ARM, CONFIG_ARM64))]
+#[repr(C)]
+#[derive(Clone, Copy)]
+struct Flag {
+    bool_field: bool,
+}
+
+/// # Invariants
+///
+/// `padding` must be all zeroes.
+#[cfg(not(any(CONFIG_X86_64, CONFIG_UML, CONFIG_ARM, CONFIG_ARM64)))]
+#[repr(C, align(4))]
+#[derive(Clone, Copy)]
+struct Flag {
+    #[cfg(target_endian = "big")]
+    padding: [u8; 3],
+    bool_field: bool,
+    #[cfg(target_endian = "little")]
+    padding: [u8; 3],
+}
+
+impl Flag {
+    #[inline(always)]
+    const fn new(b: bool) -> Self {
+        // INVARIANT: `padding` is all zeroes.
+        Self {
+            bool_field: b,
+            #[cfg(not(any(CONFIG_X86_64, CONFIG_UML, CONFIG_ARM, CONFIG_ARM64)))]
+            padding: [0; 3],
+        }
+    }
+}
+
+// SAFETY: `Flag` and `Repr` have the same size and alignment, and `Flag` is round-trip
+// transmutable to the selected representation (`i8` or `i32`).
+unsafe impl AtomicType for Flag {
+    #[cfg(any(CONFIG_X86_64, CONFIG_UML, CONFIG_ARM, CONFIG_ARM64))]
+    type Repr = i8;
+    #[cfg(not(any(CONFIG_X86_64, CONFIG_UML, CONFIG_ARM, CONFIG_ARM64)))]
+    type Repr = i32;
+}
+
+/// An atomic flag type intended to be backed by performance-optimal integer type.
+///
+/// The backing integer type is an implementation detail; it may vary by architecture and change
+/// in the future.
+///
+/// [`AtomicFlag`] is generally preferable to [`Atomic<bool>`] when you need read-modify-write
+/// (RMW) operations (e.g. [`Atomic::xchg()`]/[`Atomic::cmpxchg()`]) or when [`Atomic<bool>`] does
+/// not save memory due to padding. On some architectures that do not support byte-sized atomic
+/// RMW operations, RMW operations on [`Atomic<bool>`] are slower.
+///
+/// If you only use [`Atomic::load()`]/[`Atomic::store()`], [`Atomic<bool>`] is fine.
+///
+/// # Examples
+///
+/// ```
+/// use kernel::sync::atomic::{AtomicFlag, Relaxed};
+///
+/// let flag = AtomicFlag::new(false);
+/// assert_eq!(false, flag.load(Relaxed));
+/// flag.store(true, Relaxed);
+/// assert_eq!(true, flag.load(Relaxed));
+/// ```
+pub struct AtomicFlag(Atomic<Flag>);
+
+impl AtomicFlag {
+    /// Creates a new atomic flag.
+    #[inline(always)]
+    pub const fn new(b: bool) -> Self {
+        Self(Atomic::new(Flag::new(b)))
+    }
+
+    /// Returns a mutable reference to the underlying flag as a [`bool`].
+    ///
+    /// This is safe because the mutable reference of the atomic flag guarantees exclusive access.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use kernel::sync::atomic::{AtomicFlag, Relaxed};
+    ///
+    /// let mut atomic_flag = AtomicFlag::new(false);
+    /// assert_eq!(false, atomic_flag.load(Relaxed));
+    /// *atomic_flag.get_mut() = true;
+    /// assert_eq!(true, atomic_flag.load(Relaxed));
+    /// ```
+    #[inline(always)]
+    pub fn get_mut(&mut self) -> &mut bool {
+        &mut self.0.get_mut().bool_field
+    }
+
+    /// Loads the value from the atomic flag.
+    #[inline(always)]
+    pub fn load<Ordering: ordering::AcquireOrRelaxed>(&self, o: Ordering) -> bool {
+        self.0.load(o).bool_field
+    }
+
+    /// Stores a value to the atomic flag.
+    #[inline(always)]
+    pub fn store<Ordering: ordering::ReleaseOrRelaxed>(&self, v: bool, o: Ordering) {
+        self.0.store(Flag::new(v), o);
+    }
+
+    /// Stores a value to the atomic flag and returns the previous value.
+    #[inline(always)]
+    pub fn xchg<Ordering: ordering::Ordering>(&self, new: bool, o: Ordering) -> bool {
+        self.0.xchg(Flag::new(new), o).bool_field
+    }
+
+    /// Store a value to the atomic flag if the current value is equal to `old`.
+    #[inline(always)]
+    pub fn cmpxchg<Ordering: ordering::Ordering>(
+        &self,
+        old: bool,
+        new: bool,
+        o: Ordering,
+    ) -> Result<bool, bool> {
+        match self.0.cmpxchg(Flag::new(old), Flag::new(new), o) {
+            Ok(_) => Ok(old),
+            Err(f) => Err(f.bool_field),
+        }
+    }
+}
+
+/// Atomic load over raw pointers.
+///
+/// This function provides a short-cut of `Atomic::from_ptr().load(..)`, and can be used to work
+/// with C side on synchronizations:
+///
+/// - `atomic_load(.., Relaxed)` maps to `READ_ONCE()` when used for inter-thread communication.
+/// - `atomic_load(.., Acquire)` maps to `smp_load_acquire()`.
+///
+/// # Safety
+///
+/// - `ptr` is a valid pointer to `T` and aligned to `align_of::<T>()`.
+/// - If there is a concurrent store from kernel (C or Rust), it has to be atomic.
+#[doc(alias("READ_ONCE", "smp_load_acquire"))]
+#[inline(always)]
+pub unsafe fn atomic_load<T: AtomicType, Ordering: ordering::AcquireOrRelaxed>(
+    ptr: *mut T,
+    o: Ordering,
+) -> T
+where
+    T::Repr: AtomicBasicOps,
+{
+    // SAFETY: Per the function safety requirement, `ptr` is valid and aligned to
+    // `align_of::<T>()`, and all concurrent stores from kernel are atomic, hence no data race per
+    // LKMM.
+    unsafe { Atomic::from_ptr(ptr) }.load(o)
+}
+
+/// Atomic store over raw pointers.
+///
+/// This function provides a short-cut of `Atomic::from_ptr().load(..)`, and can be used to work
+/// with C side on synchronizations:
+///
+/// - `atomic_store(.., Relaxed)` maps to `WRITE_ONCE()` when used for inter-thread communication.
+/// - `atomic_load(.., Release)` maps to `smp_store_release()`.
+///
+/// # Safety
+///
+/// - `ptr` is a valid pointer to `T` and aligned to `align_of::<T>()`.
+/// - If there is a concurrent access from kernel (C or Rust), it has to be atomic.
+#[doc(alias("WRITE_ONCE", "smp_store_release"))]
+#[inline(always)]
+pub unsafe fn atomic_store<T: AtomicType, Ordering: ordering::ReleaseOrRelaxed>(
+    ptr: *mut T,
+    v: T,
+    o: Ordering,
+) where
+    T::Repr: AtomicBasicOps,
+{
+    // SAFETY: Per the function safety requirement, `ptr` is valid and aligned to
+    // `align_of::<T>()`, and all concurrent accesses from kernel are atomic, hence no data race
+    // per LKMM.
+    unsafe { Atomic::from_ptr(ptr) }.store(v, o);
+}
+
+/// Atomic exchange over raw pointers.
+///
+/// This function provides a short-cut of `Atomic::from_ptr().xchg(..)`, and can be used to work
+/// with C side on synchronizations.
+///
+/// # Safety
+///
+/// - `ptr` is a valid pointer to `T` and aligned to `align_of::<T>()`.
+/// - If there is a concurrent access from kernel (C or Rust), it has to be atomic.
+#[inline(always)]
+pub unsafe fn xchg<T: AtomicType, Ordering: ordering::Ordering>(
+    ptr: *mut T,
+    new: T,
+    o: Ordering,
+) -> T
+where
+    T::Repr: AtomicExchangeOps,
+{
+    // SAFETY: Per the function safety requirement, `ptr` is valid and aligned to
+    // `align_of::<T>()`, and all concurrent accesses from kernel are atomic, hence no data race
+    // per LKMM.
+    unsafe { Atomic::from_ptr(ptr) }.xchg(new, o)
+}
+
+/// Atomic compare and exchange over raw pointers.
+///
+/// This function provides a short-cut of `Atomic::from_ptr().cmpxchg(..)`, and can be used to work
+/// with C side on synchronizations.
+///
+/// # Safety
+///
+/// - `ptr` is a valid pointer to `T` and aligned to `align_of::<T>()`.
+/// - If there is a concurrent access from kernel (C or Rust), it has to be atomic.
+#[doc(alias("try_cmpxchg"))]
+#[inline(always)]
+pub unsafe fn cmpxchg<T: AtomicType, Ordering: ordering::Ordering>(
+    ptr: *mut T,
+    old: T,
+    new: T,
+    o: Ordering,
+) -> Result<T, T>
+where
+    T::Repr: AtomicExchangeOps,
+{
+    // SAFETY: Per the function safety requirement, `ptr` is valid and aligned to
+    // `align_of::<T>()`, and all concurrent accesses from kernel are atomic, hence no data race
+    // per LKMM.
+    unsafe { Atomic::from_ptr(ptr) }.cmpxchg(old, new, o)
 }
diff --git a/rust/kernel/sync/atomic/internal.rs b/rust/kernel/sync/atomic/internal.rs
index 0dac58bca2b3..ad810c2172ec 100644
--- a/rust/kernel/sync/atomic/internal.rs
+++ b/rust/kernel/sync/atomic/internal.rs
@@ -7,6 +7,7 @@
 use crate::bindings;
 use crate::macros::paste;
 use core::cell::UnsafeCell;
+use ffi::c_void;
 
 mod private {
     /// Sealed trait marker to disable customized impls on atomic implementation traits.
@@ -14,10 +15,11 @@ mod private {
 }
 
 // The C side supports atomic primitives only for `i32` and `i64` (`atomic_t` and `atomic64_t`),
-// while the Rust side also layers provides atomic support for `i8` and `i16`
-// on top of lower-level C primitives.
+// while the Rust side also provides atomic support for `i8`, `i16` and `*const c_void` on top of
+// lower-level C primitives.
 impl private::Sealed for i8 {}
 impl private::Sealed for i16 {}
+impl private::Sealed for *const c_void {}
 impl private::Sealed for i32 {}
 impl private::Sealed for i64 {}
 
@@ -26,10 +28,10 @@ impl private::Sealed for i64 {}
 /// This trait is sealed, and only types that map directly to the C side atomics
 /// or can be implemented with lower-level C primitives are allowed to implement this:
 ///
-/// - `i8` and `i16` are implemented with lower-level C primitives.
+/// - `i8`, `i16` and `*const c_void` are implemented with lower-level C primitives.
 /// - `i32` map to `atomic_t`
 /// - `i64` map to `atomic64_t`
-pub trait AtomicImpl: Sized + Send + Copy + private::Sealed {
+pub trait AtomicImpl: Sized + Copy + private::Sealed {
     /// The type of the delta in arithmetic or logical operations.
     ///
     /// For example, in `atomic_add(ptr, v)`, it's the type of `v`. Usually it's the same type of
@@ -37,20 +39,31 @@ pub trait AtomicImpl: Sized + Send + Copy + private::Sealed {
     type Delta;
 }
 
-// The current helpers of load/store uses `{WRITE,READ}_ONCE()` hence the atomicity is only
-// guaranteed against read-modify-write operations if the architecture supports native atomic RmW.
-#[cfg(CONFIG_ARCH_SUPPORTS_ATOMIC_RMW)]
+// The current helpers of load/store of atomic `i8`, `i16` and pointers use `{WRITE,READ}_ONCE()`
+// hence the atomicity is only guaranteed against read-modify-write operations if the architecture
+// supports native atomic RmW.
+//
+// In the future when a CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=n architecture plans to support Rust, the
+// load/store helpers that guarantee atomicity against RmW operations (usually via a lock) need to
+// be added.
+crate::static_assert!(
+    cfg!(CONFIG_ARCH_SUPPORTS_ATOMIC_RMW),
+    "The current implementation of atomic i8/i16/ptr relies on the architecure being \
+    ARCH_SUPPORTS_ATOMIC_RMW"
+);
+
 impl AtomicImpl for i8 {
     type Delta = Self;
 }
 
-// The current helpers of load/store uses `{WRITE,READ}_ONCE()` hence the atomicity is only
-// guaranteed against read-modify-write operations if the architecture supports native atomic RmW.
-#[cfg(CONFIG_ARCH_SUPPORTS_ATOMIC_RMW)]
 impl AtomicImpl for i16 {
     type Delta = Self;
 }
 
+impl AtomicImpl for *const c_void {
+    type Delta = isize;
+}
+
 // `atomic_t` implements atomic operations on `i32`.
 impl AtomicImpl for i32 {
     type Delta = Self;
@@ -262,7 +275,7 @@ macro_rules! declare_and_impl_atomic_methods {
 }
 
 declare_and_impl_atomic_methods!(
-    [ i8 => atomic_i8, i16 => atomic_i16, i32 => atomic, i64 => atomic64 ]
+    [ i8 => atomic_i8, i16 => atomic_i16, *const c_void => atomic_ptr, i32 => atomic, i64 => atomic64 ]
     /// Basic atomic operations
     pub trait AtomicBasicOps {
         /// Atomic read (load).
@@ -280,7 +293,7 @@ declare_and_impl_atomic_methods!(
 );
 
 declare_and_impl_atomic_methods!(
-    [ i8 => atomic_i8, i16 => atomic_i16, i32 => atomic, i64 => atomic64 ]
+    [ i8 => atomic_i8, i16 => atomic_i16, *const c_void => atomic_ptr, i32 => atomic, i64 => atomic64 ]
     /// Exchange and compare-and-exchange atomic operations
     pub trait AtomicExchangeOps {
         /// Atomic exchange.
@@ -324,7 +337,12 @@ declare_and_impl_atomic_methods!(
         /// Atomically updates `*a` to `(*a).wrapping_add(v)`, and returns the value of `*a`
         /// before the update.
         fn fetch_add[acquire, release, relaxed](a: &AtomicRepr<Self>, v: Self::Delta) -> Self {
-            // SAFETY: `a.as_ptr()` is valid and properly aligned.
+            // SAFETY: `a.as_ptr()` guarantees the returned pointer is valid and properly aligned.
+            unsafe { bindings::#call(v, a.as_ptr().cast()) }
+        }
+
+        fn fetch_sub[acquire, release, relaxed](a: &AtomicRepr<Self>, v: Self::Delta) -> Self {
+            // SAFETY: `a.as_ptr()` guarantees the returned pointer is valid and properly aligned.
             unsafe { bindings::#call(v, a.as_ptr().cast()) }
         }
     }
diff --git a/rust/kernel/sync/atomic/predefine.rs b/rust/kernel/sync/atomic/predefine.rs
index 67a0406d3ea4..1d53834fcb12 100644
--- a/rust/kernel/sync/atomic/predefine.rs
+++ b/rust/kernel/sync/atomic/predefine.rs
@@ -4,6 +4,7 @@
 
 use crate::static_assert;
 use core::mem::{align_of, size_of};
+use ffi::c_void;
 
 // Ensure size and alignment requirements are checked.
 static_assert!(size_of::<bool>() == size_of::<i8>());
@@ -28,6 +29,26 @@ unsafe impl super::AtomicType for i16 {
     type Repr = i16;
 }
 
+// SAFETY:
+//
+// - `*mut T` has the same size and alignment with `*const c_void`, and is round-trip
+//   transmutable to `*const c_void`.
+// - `*mut T` is safe to transfer between execution contexts. See the safety requirement of
+//   [`AtomicType`].
+unsafe impl<T: Sized> super::AtomicType for *mut T {
+    type Repr = *const c_void;
+}
+
+// SAFETY:
+//
+// - `*const T` has the same size and alignment with `*const c_void`, and is round-trip
+//   transmutable to `*const c_void`.
+// - `*const T` is safe to transfer between execution contexts. See the safety requirement of
+//   [`AtomicType`].
+unsafe impl<T: Sized> super::AtomicType for *const T {
+    type Repr = *const c_void;
+}
+
 // SAFETY: `i32` has the same size and alignment with itself, and is round-trip transmutable to
 // itself.
 unsafe impl super::AtomicType for i32 {
@@ -157,6 +178,14 @@ mod tests {
 
             assert_eq!(v, x.load(Relaxed));
         });
+
+        for_each_type!(42 in [i8, i16, i32, i64, u32, u64, isize, usize] |v| {
+            let x = Atomic::new(v);
+            let ptr = x.as_ptr();
+
+            // SAFETY: `ptr` is a valid pointer and no concurrent access.
+            assert_eq!(v, unsafe { atomic_load(ptr, Relaxed) });
+        });
     }
 
     #[test]
@@ -167,6 +196,17 @@ mod tests {
             x.store(v, Release);
             assert_eq!(v, x.load(Acquire));
         });
+
+        for_each_type!(42 in [i8, i16, i32, i64, u32, u64, isize, usize] |v| {
+            let x = Atomic::new(0);
+            let ptr = x.as_ptr();
+
+            // SAFETY: `ptr` is a valid pointer and no concurrent access.
+            unsafe { atomic_store(ptr, v, Release) };
+
+            // SAFETY: `ptr` is a valid pointer and no concurrent access.
+            assert_eq!(v, unsafe { atomic_load(ptr, Acquire) });
+        });
     }
 
     #[test]
@@ -180,6 +220,18 @@ mod tests {
             assert_eq!(old, x.xchg(new, Full));
             assert_eq!(new, x.load(Relaxed));
         });
+
+        for_each_type!(42 in [i8, i16, i32, i64, u32, u64, isize, usize] |v| {
+            let x = Atomic::new(v);
+            let ptr = x.as_ptr();
+
+            let old = v;
+            let new = v + 1;
+
+            // SAFETY: `ptr` is a valid pointer and no concurrent access.
+            assert_eq!(old, unsafe { xchg(ptr, new, Full) });
+            assert_eq!(new, x.load(Relaxed));
+        });
     }
 
     #[test]
@@ -195,6 +247,21 @@ mod tests {
             assert_eq!(Ok(old), x.cmpxchg(old, new, Relaxed));
             assert_eq!(new, x.load(Relaxed));
         });
+
+        for_each_type!(42 in [i8, i16, i32, i64, u32, u64, isize, usize] |v| {
+            let x = Atomic::new(v);
+            let ptr = x.as_ptr();
+
+            let old = v;
+            let new = v + 1;
+
+            // SAFETY: `ptr` is a valid pointer and no concurrent access.
+            assert_eq!(Err(old), unsafe { cmpxchg(ptr, new, new, Full) });
+            assert_eq!(old, x.load(Relaxed));
+            // SAFETY: `ptr` is a valid pointer and no concurrent access.
+            assert_eq!(Ok(old), unsafe { cmpxchg(ptr, old, new, Relaxed) });
+            assert_eq!(new, x.load(Relaxed));
+        });
     }
 
     #[test]
@@ -226,4 +293,46 @@ mod tests {
         assert_eq!(false, x.load(Relaxed));
         assert_eq!(Ok(false), x.cmpxchg(false, true, Full));
     }
+
+    #[test]
+    fn atomic_ptr_tests() {
+        let mut v = 42;
+        let mut u = 43;
+        let x = Atomic::new(&raw mut v);
+
+        assert_eq!(x.load(Acquire), &raw mut v);
+        assert_eq!(x.cmpxchg(&raw mut u, &raw mut u, Relaxed), Err(&raw mut v));
+        assert_eq!(x.cmpxchg(&raw mut v, &raw mut u, Relaxed), Ok(&raw mut v));
+        assert_eq!(x.load(Relaxed), &raw mut u);
+
+        let x = Atomic::new(&raw const v);
+
+        assert_eq!(x.load(Acquire), &raw const v);
+        assert_eq!(
+            x.cmpxchg(&raw const u, &raw const u, Relaxed),
+            Err(&raw const v)
+        );
+        assert_eq!(
+            x.cmpxchg(&raw const v, &raw const u, Relaxed),
+            Ok(&raw const v)
+        );
+        assert_eq!(x.load(Relaxed), &raw const u);
+    }
+
+    #[test]
+    fn atomic_flag_tests() {
+        let mut flag = AtomicFlag::new(false);
+
+        assert_eq!(false, flag.load(Relaxed));
+
+        *flag.get_mut() = true;
+        assert_eq!(true, flag.load(Relaxed));
+
+        assert_eq!(true, flag.xchg(false, Relaxed));
+        assert_eq!(false, flag.load(Relaxed));
+
+        *flag.get_mut() = true;
+        assert_eq!(Ok(true), flag.cmpxchg(true, false, Full));
+        assert_eq!(false, flag.load(Relaxed));
+    }
 }
diff --git a/scripts/context-analysis-suppression.txt b/scripts/context-analysis-suppression.txt
index fd8951d06706..1c51b6153f08 100644
--- a/scripts/context-analysis-suppression.txt
+++ b/scripts/context-analysis-suppression.txt
@@ -24,6 +24,7 @@ src:*include/linux/mutex*.h=emit
 src:*include/linux/rcupdate.h=emit
 src:*include/linux/refcount.h=emit
 src:*include/linux/rhashtable.h=emit
+src:*include/linux/rtmutex*.h=emit
 src:*include/linux/rwlock*.h=emit
 src:*include/linux/rwsem.h=emit
 src:*include/linux/sched*=emit
author	Linus Torvalds <torvalds@linux-foundation.org>	2026-04-14 12:36:25 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-04-14 12:36:25 -0700
commit	7393febcb1b2082c0484952729cbebfe4dc508d5 (patch)
tree	d561808391b363749ab77512def195da22566db3
parent	e80d033851b3bc94c3d254ac66660ddd0a49d72c (diff)
parent	a21c1e961de28b95099a9ca2c3774b2eee1a33bb (diff)
download	lwn-7393febcb1b2082c0484952729cbebfe4dc508d5.tar.gz lwn-7393febcb1b2082c0484952729cbebfe4dc508d5.zip