diff options
Diffstat (limited to 'include/linux/mmap_lock.h')
| -rw-r--r-- | include/linux/mmap_lock.h | 419 |
1 files changed, 412 insertions, 7 deletions
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 45a21faa3ff6..04b8f61ece5d 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -1,12 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H +/* Avoid a dependency loop by declaring here. */ +extern int rcuwait_wake_up(struct rcuwait *w); + #include <linux/lockdep.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/rwsem.h> #include <linux/tracepoint-defs.h> #include <linux/types.h> +#include <linux/cleanup.h> +#include <linux/sched/mm.h> #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), @@ -72,6 +78,43 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) #ifdef CONFIG_PER_VMA_LOCK +#ifdef CONFIG_LOCKDEP +#define __vma_lockdep_map(vma) (&vma->vmlock_dep_map) +#else +#define __vma_lockdep_map(vma) NULL +#endif + +/* + * VMA locks do not behave like most ordinary locks found in the kernel, so we + * cannot quite have full lockdep tracking in the way we would ideally prefer. + * + * Read locks act as shared locks which exclude an exclusive lock being + * taken. We therefore mark these accordingly on read lock acquire/release. + * + * Write locks are acquired exclusively per-VMA, but released in a shared + * fashion, that is upon vma_end_write_all(), we update the mmap's seqcount such + * that write lock is released. + * + * We therefore cannot track write locks per-VMA, nor do we try. Mitigating this + * is the fact that, of course, we do lockdep-track the mmap lock rwsem which + * must be held when taking a VMA write lock. + * + * We do, however, want to indicate that during either acquisition of a VMA + * write lock or detachment of a VMA that we require the lock held be exclusive, + * so we utilise lockdep to do so. 
+ */ +#define __vma_lockdep_acquire_read(vma) \ + lock_acquire_shared(__vma_lockdep_map(vma), 0, 1, NULL, _RET_IP_) +#define __vma_lockdep_release_read(vma) \ + lock_release(__vma_lockdep_map(vma), _RET_IP_) +#define __vma_lockdep_acquire_exclusive(vma) \ + lock_acquire_exclusive(__vma_lockdep_map(vma), 0, 0, NULL, _RET_IP_) +#define __vma_lockdep_release_exclusive(vma) \ + lock_release(__vma_lockdep_map(vma), _RET_IP_) +/* Only meaningful if CONFIG_LOCK_STAT is defined. */ +#define __vma_lockdep_stat_mark_acquired(vma) \ + lock_acquired(__vma_lockdep_map(vma), _RET_IP_) + static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); @@ -104,6 +147,343 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int return read_seqcount_retry(&mm->mm_lock_seq, seq); } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key lockdep_key; + + lockdep_init_map(__vma_lockdep_map(vma), "vm_lock", &lockdep_key, 0); +#endif + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); + vma->vm_lock_seq = UINT_MAX; +} + +/* + * This function determines whether the input VMA reference count describes a + * VMA which has excluded all VMA read locks. + * + * In the case of a detached VMA, we may incorrectly indicate that readers are + * excluded when one remains, because in that scenario we target a refcount of + * VM_REFCNT_EXCLUDE_READERS_FLAG, rather than the attached target of + * VM_REFCNT_EXCLUDE_READERS_FLAG + 1. + * + * However, the race window for that is very small so it is unlikely. + * + * Returns: true if readers are excluded, false otherwise. + */ +static inline bool __vma_are_readers_excluded(int refcnt) +{ + /* + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. 
+ */ + return (refcnt & VM_REFCNT_EXCLUDE_READERS_FLAG) && + refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1; +} + +/* + * Actually decrement the VMA reference count. + * + * The function returns the reference count as it was immediately after the + * decrement took place. If it returns zero, the VMA is now detached. + */ +static inline __must_check unsigned int +__vma_refcount_put_return(struct vm_area_struct *vma) +{ + int oldcnt; + + if (__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) + return 0; + + return oldcnt - 1; +} + +/** + * vma_refcount_put() - Drop reference count in VMA vm_refcnt field due to a + * read-lock being dropped. + * @vma: The VMA whose reference count we wish to decrement. + * + * If we were the last reader, wake up threads waiting to obtain an exclusive + * lock. + */ +static inline void vma_refcount_put(struct vm_area_struct *vma) +{ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt. */ + struct mm_struct *mm = vma->vm_mm; + int newcnt; + + __vma_lockdep_release_read(vma); + newcnt = __vma_refcount_put_return(vma); + + /* + * __vma_start_exclude_readers() may be sleeping waiting for readers to + * drop their reference count, so wake it up if we were the last reader + * blocking it from being acquired. + * + * We may be raced by other readers temporarily incrementing the + * reference count, though the race window is very small, this might + * cause spurious wakeups. + */ + if (newcnt && __vma_are_readers_excluded(newcnt)) + rcuwait_wake_up(&mm->vma_writer_wait); +} + +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. 
+ */ +static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +{ + int oldcnt; + + mmap_assert_locked(vma->vm_mm); + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VM_REFCNT_LIMIT))) + return false; + + __vma_lockdep_acquire_read(vma); + return true; +} + +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline bool vma_start_read_locked(struct vm_area_struct *vma) +{ + return vma_start_read_locked_nested(vma, 0); +} + +static inline void vma_end_read(struct vm_area_struct *vma) +{ + vma_refcount_put(vma); +} + +static inline unsigned int __vma_raw_mm_seqnum(struct vm_area_struct *vma) +{ + const struct mm_struct *mm = vma->vm_mm; + + /* We must hold an exclusive write lock for this access to be valid. */ + mmap_assert_write_locked(vma->vm_mm); + return mm->mm_lock_seq.sequence; +} + +/* + * Determine whether a VMA is write-locked. Must be invoked ONLY if the mmap + * write lock is held. + * + * Returns true if write-locked, otherwise false. + */ +static inline bool __is_vma_write_locked(struct vm_area_struct *vma) +{ + /* + * current task is holding mmap_write_lock, both vma->vm_lock_seq and + * mm->mm_lock_seq can't be concurrently modified. + */ + return vma->vm_lock_seq == __vma_raw_mm_seqnum(vma); +} + +int __vma_start_write(struct vm_area_struct *vma, int state); + +/* + * Begin writing to a VMA. + * Exclude concurrent readers under the per-VMA lock until the currently + * write-locked mmap_lock is dropped or downgraded. 
+ */ +static inline void vma_start_write(struct vm_area_struct *vma) +{ + if (__is_vma_write_locked(vma)) + return; + + __vma_start_write(vma, TASK_UNINTERRUPTIBLE); +} + +/** + * vma_start_write_killable - Begin writing to a VMA. + * @vma: The VMA we are going to modify. + * + * Exclude concurrent readers under the per-VMA lock until the currently + * write-locked mmap_lock is dropped or downgraded. + * + * Context: May sleep while waiting for readers to drop the vma read lock. + * Caller must already hold the mmap_lock for write. + * + * Return: 0 for a successful acquisition. -EINTR if a fatal signal was + * received. + */ +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + if (__is_vma_write_locked(vma)) + return 0; + + return __vma_start_write(vma, TASK_KILLABLE); +} + +/** + * vma_assert_write_locked() - assert that @vma holds a VMA write lock. + * @vma: The VMA to assert. + */ +static inline void vma_assert_write_locked(struct vm_area_struct *vma) +{ + VM_WARN_ON_ONCE_VMA(!__is_vma_write_locked(vma), vma); +} + +/** + * vma_assert_locked() - assert that @vma holds either a VMA read or a VMA write + * lock and is not detached. + * @vma: The VMA to assert. + */ +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + unsigned int refcnt; + + if (IS_ENABLED(CONFIG_LOCKDEP)) { + if (!lock_is_held(__vma_lockdep_map(vma))) + vma_assert_write_locked(vma); + return; + } + + /* + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. + */ + refcnt = refcount_read(&vma->vm_refcnt); + + /* + * In this case we're either read-locked, write-locked with temporary + * readers, or in the midst of excluding readers, all of which means + * we're locked. + */ + if (refcnt > 1) + return; + + /* It is a bug for the VMA to be detached here. 
*/ + VM_WARN_ON_ONCE_VMA(!refcnt, vma); + + /* + * OK, the VMA has a reference count of 1 which means it is either + * unlocked and attached or write-locked, so assert that it is + * write-locked. + */ + vma_assert_write_locked(vma); +} + +/** + * vma_assert_stabilised() - assert that this VMA cannot be changed from + * underneath us either by having a VMA or mmap lock held. + * @vma: The VMA whose stability we wish to assess. + * + * If lockdep is enabled we can precisely ensure stability via either an mmap + * lock owned by us or a specific VMA lock. + * + * With lockdep disabled we may sometimes race with other threads acquiring the + * mmap read lock simultaneous with our VMA read lock. + */ +static inline void vma_assert_stabilised(struct vm_area_struct *vma) +{ + /* + * If another thread owns an mmap lock, it may go away at any time, and + * thus is no guarantee of stability. + * + * If lockdep is enabled we can accurately determine if an mmap lock is + * held and owned by us. Otherwise we must approximate. + * + * It doesn't necessarily mean we are not stabilised however, as we may + * hold a VMA read lock (not a write lock as this would require an owned + * mmap lock). + * + * If (assuming lockdep is not enabled) we were to assert a VMA read + * lock first we may also run into issues, as other threads can hold VMA + * read locks simultaneously with us. + * + * Therefore if lockdep is not enabled we risk a false negative (i.e. no + * assert fired). If accurate checking is required, enable lockdep. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + if (lockdep_is_held(&vma->vm_mm->mmap_lock)) + return; + } else { + if (rwsem_is_locked(&vma->vm_mm->mmap_lock)) + return; + } + + /* + * We're not stabilised by the mmap lock, so assert that we're + * stabilised by a VMA lock. 
+ */ + vma_assert_locked(vma); +} + +static inline bool vma_is_attached(struct vm_area_struct *vma) +{ + return refcount_read(&vma->vm_refcnt); +} + +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. + */ +static inline void vma_assert_attached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(!vma_is_attached(vma)); +} + +static inline void vma_assert_detached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(vma_is_attached(vma)); +} + +static inline void vma_mark_attached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_detached(vma); + refcount_set_release(&vma->vm_refcnt, 1); +} + +void __vma_exclude_readers_for_detach(struct vm_area_struct *vma); + +static inline void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + + /* + * The VMA still being attached (refcnt > 0) - is unlikely, because the + * vma has been already write-locked and readers can increment vm_refcnt + * only temporarily before they check vm_lock_seq, realize the vma is + * locked and drop back the vm_refcnt. That is a narrow window for + * observing a raised vm_refcnt. + * + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. + */ + if (likely(!__vma_refcount_put_return(vma))) + return; + + __vma_exclude_readers_for_detach(vma); +} + +struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address); + +/* + * Locks next vma pointed by the iterator. Confirms the locked vma has not + * been modified and will retry under mmap_lock protection if modification + * was detected. Should be called from read RCU section. + * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the + * process was interrupted. 
+ */ +struct vm_area_struct *lock_next_vma(struct mm_struct *mm, + struct vma_iterator *iter, + unsigned long address); + #else /* CONFIG_PER_VMA_LOCK */ static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} @@ -119,15 +499,37 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int { return true; } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} +static inline void vma_end_read(struct vm_area_struct *vma) {} +static inline void vma_start_write(struct vm_area_struct *vma) {} +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) { return 0; } +static inline void vma_assert_write_locked(struct vm_area_struct *vma) + { mmap_assert_write_locked(vma->vm_mm); } +static inline void vma_assert_attached(struct vm_area_struct *vma) {} +static inline void vma_assert_detached(struct vm_area_struct *vma) {} +static inline void vma_mark_attached(struct vm_area_struct *vma) {} +static inline void vma_mark_detached(struct vm_area_struct *vma) {} + +static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address) +{ + return NULL; +} -#endif /* CONFIG_PER_VMA_LOCK */ +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); +} -static inline void mmap_init_lock(struct mm_struct *mm) +static inline void vma_assert_stabilised(struct vm_area_struct *vma) { - init_rwsem(&mm->mmap_lock); - mm_lock_seqcount_init(mm); + /* If no VMA locks, then either mmap lock suffices to stabilise. 
*/ + mmap_assert_locked(vma->vm_mm); } +#endif /* CONFIG_PER_VMA_LOCK */ + static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); @@ -144,7 +546,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) __mmap_lock_trace_acquire_returned(mm, true, true); } -static inline int mmap_write_lock_killable(struct mm_struct *mm) +static inline int __must_check mmap_write_lock_killable(struct mm_struct *mm) { int ret; @@ -191,7 +593,7 @@ static inline void mmap_read_lock(struct mm_struct *mm) __mmap_lock_trace_acquire_returned(mm, false, true); } -static inline int mmap_read_lock_killable(struct mm_struct *mm) +static inline int __must_check mmap_read_lock_killable(struct mm_struct *mm) { int ret; @@ -201,7 +603,7 @@ static inline int mmap_read_lock_killable(struct mm_struct *mm) return ret; } -static inline bool mmap_read_trylock(struct mm_struct *mm) +static inline bool __must_check mmap_read_trylock(struct mm_struct *mm) { bool ret; @@ -217,6 +619,9 @@ static inline void mmap_read_unlock(struct mm_struct *mm) up_read(&mm->mmap_lock); } +DEFINE_GUARD(mmap_read_lock, struct mm_struct *, + mmap_read_lock(_T), mmap_read_unlock(_T)) + static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); |
