summaryrefslogtreecommitdiff
path: root/include/asm-generic/rqspinlock.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/asm-generic/rqspinlock.h')
-rw-r--r--include/asm-generic/rqspinlock.h250
1 files changed, 250 insertions, 0 deletions
diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h
new file mode 100644
index 000000000000..6d4244d643df
--- /dev/null
+++ b/include/asm-generic/rqspinlock.h
@@ -0,0 +1,250 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Resilient Queued Spin Lock
+ *
+ * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
+ *
+ * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
+ */
+#ifndef __ASM_GENERIC_RQSPINLOCK_H
+#define __ASM_GENERIC_RQSPINLOCK_H
+
+#include <linux/types.h>
+#include <vdso/time64.h>
+#include <linux/percpu.h>
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#endif
+
+struct rqspinlock {
+ union {
+ atomic_t val;
+ u32 locked;
+ };
+};
+
+/* Even though this is same as struct rqspinlock, we need to emit a distinct
+ * type in BTF for BPF programs.
+ */
+struct bpf_res_spin_lock {
+ u32 val;
+};
+
+struct qspinlock;
+#ifdef CONFIG_QUEUED_SPINLOCKS
+typedef struct qspinlock rqspinlock_t;
+#else
+typedef struct rqspinlock rqspinlock_t;
+#endif
+
+extern int resilient_tas_spin_lock(rqspinlock_t *lock);
+#ifdef CONFIG_QUEUED_SPINLOCKS
+extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
+#endif
+
+#ifndef resilient_virt_spin_lock_enabled
+static __always_inline bool resilient_virt_spin_lock_enabled(void)
+{
+ return false;
+}
+#endif
+
+#ifndef resilient_virt_spin_lock
+static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Default timeout for waiting loops is 0.25 seconds
+ */
+#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)
+
+/*
+ * Choose 31 as it makes rqspinlock_held cacheline-aligned.
+ */
+#define RES_NR_HELD 31
+
+struct rqspinlock_held {
+ int cnt;
+ void *locks[RES_NR_HELD];
+};
+
+DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
+
+static __always_inline void grab_held_lock_entry(void *lock)
+{
+ int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);
+
+ if (unlikely(cnt > RES_NR_HELD)) {
+ /* Still keep the inc so we decrement later. */
+ return;
+ }
+
+ /*
+ * Implied compiler barrier in per-CPU operations; otherwise we can have
+ * the compiler reorder inc with write to table, allowing interrupts to
+ * overwrite and erase our write to the table (as on interrupt exit it
+ * will be reset to NULL).
+ *
+ * It is fine for cnt inc to be reordered wrt remote readers though,
+ * they won't observe our entry until the cnt update is visible, that's
+ * all.
+ */
+ this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
+}
+
+/*
+ * We simply don't support out-of-order unlocks, and keep the logic simple here.
+ * The verifier prevents BPF programs from unlocking out-of-order, and the same
+ * holds for in-kernel users.
+ *
+ * It is possible to run into misdetection scenarios of AA deadlocks on the same
+ * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries
+ * out of order (due to lock A, lock B, unlock A, unlock B) pattern. The correct
+ * logic to preserve right entries in the table would be to walk the array of
+ * held locks and swap and clear out-of-order entries, but that's too
+ * complicated and we don't have a compelling use case for out of order unlocking.
+ */
+static __always_inline void release_held_lock_entry(void)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+
+ if (unlikely(rqh->cnt > RES_NR_HELD))
+ goto dec;
+ WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
+dec:
+ /*
+ * Reordering of clearing above with inc and its write in
+ * grab_held_lock_entry that came before us (in same acquisition
+ * attempt) is ok, we either see a valid entry or NULL when it's
+ * visible.
+ *
+ * But this helper is invoked when we unwind upon failing to acquire the
+ * lock. Unlike the unlock path which constitutes a release store after
+ * we clear the entry, we need to emit a write barrier here. Otherwise,
+ * we may have a situation as follows:
+ *
+ * <error> for lock B
+ * release_held_lock_entry
+ *
+ * try_cmpxchg_acquire for lock A
+ * grab_held_lock_entry
+ *
+ * Lack of any ordering means reordering may occur such that dec, inc
+ * are done before entry is overwritten. This permits a remote lock
+ * holder of lock B (which this CPU failed to acquire) to now observe it
+ * as being attempted on this CPU, and may lead to misdetection (if this
+ * CPU holds a lock it is attempting to acquire, leading to false ABBA
+ * diagnosis).
+ *
+ * In case of unlock, we will always do a release on the lock word after
+ * releasing the entry, ensuring that other CPUs cannot hold the lock
+ * (and make conclusions about deadlocks) until the entry has been
+ * cleared on the local CPU, preventing any anomalies. Reordering is
+ * still possible there, but a remote CPU cannot observe a lock in our
+ * table which it is already holding, since visibility entails our
+ * release store for the said lock has not retired.
+ *
+ * In theory we don't have a problem if the dec and WRITE_ONCE above get
+ * reordered with each other, we either notice an empty NULL entry on
+ * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which
+ * cannot be observed (if dec precedes WRITE_ONCE).
+ *
+ * Emit the write barrier _before_ the dec, this permits dec-inc
+ * reordering but that is harmless as we'd have new entry set to NULL
+ * already, i.e. they cannot precede the NULL store above.
+ */
+ smp_wmb();
+ this_cpu_dec(rqspinlock_held_locks.cnt);
+}
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+/**
+ * res_spin_lock - acquire a queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * Return:
+ * * 0 - Lock was acquired successfully.
+ * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
+ * * -ETIMEDOUT - Lock acquisition failed because of timeout.
+ */
+static __always_inline int res_spin_lock(rqspinlock_t *lock)
+{
+ int val = 0;
+
+ if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
+ grab_held_lock_entry(lock);
+ return 0;
+ }
+ return resilient_queued_spin_lock_slowpath(lock, val);
+}
+
+#else
+
+#define res_spin_lock(lock) resilient_tas_spin_lock(lock)
+
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+static __always_inline void res_spin_unlock(rqspinlock_t *lock)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+
+ if (unlikely(rqh->cnt > RES_NR_HELD))
+ goto unlock;
+ WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
+unlock:
+ /*
+ * Release barrier, ensures correct ordering. See release_held_lock_entry
+ * for details. Perform release store instead of queued_spin_unlock,
+ * since we use this function for test-and-set fallback as well. When we
+ * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
+ *
+ * Like release_held_lock_entry, we can do the release before the dec.
+ * We simply care about not seeing the 'lock' in our table from a remote
+ * CPU once the lock has been released, which doesn't rely on the dec.
+ *
+ * Unlike smp_wmb(), release is not a two way fence, hence it is
+ * possible for a inc to move up and reorder with our clearing of the
+ * entry. This isn't a problem however, as for a misdiagnosis of ABBA,
+ * the remote CPU needs to hold this lock, which won't be released until
+ * the store below is done, which would ensure the entry is overwritten
+ * to NULL, etc.
+ */
+ smp_store_release(&lock->locked, 0);
+ this_cpu_dec(rqspinlock_held_locks.cnt);
+}
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
+#else
+#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; })
+#endif
+
+#define raw_res_spin_lock(lock) \
+ ({ \
+ int __ret; \
+ preempt_disable(); \
+ __ret = res_spin_lock(lock); \
+ if (__ret) \
+ preempt_enable(); \
+ __ret; \
+ })
+
+#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })
+
+#define raw_res_spin_lock_irqsave(lock, flags) \
+ ({ \
+ int __ret; \
+ local_irq_save(flags); \
+ __ret = raw_res_spin_lock(lock); \
+ if (__ret) \
+ local_irq_restore(flags); \
+ __ret; \
+ })
+
+#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
+
+#endif /* __ASM_GENERIC_RQSPINLOCK_H */