diff options
Diffstat (limited to 'mm/userfaultfd.c')
-rw-r--r-- | mm/userfaultfd.c | 145 |
1 files changed, 111 insertions, 34 deletions
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index af3dfc3633db..fbf2cf62ab9f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -18,6 +18,7 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> #include "internal.h" +#include "swap.h" static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) @@ -85,14 +86,10 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, mmap_read_lock(mm); vma = find_vma_and_prepare_anon(mm, address); if (!IS_ERR(vma)) { - /* - * We cannot use vma_start_read() as it may fail due to - * false locked (see comment in vma_start_read()). We - * can avoid that by directly locking vm_lock under - * mmap_lock, which guarantees that nobody can lock the - * vma for write (vma_start_write()) under us. - */ - down_read(&vma->vm_lock->lock); + bool locked = vma_start_read_locked(vma); + + if (!locked) + vma = ERR_PTR(-EAGAIN); } mmap_read_unlock(mm); @@ -1076,16 +1073,14 @@ out: return err; } -static int move_swap_pte(struct mm_struct *mm, +static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, - spinlock_t *dst_ptl, spinlock_t *src_ptl) + spinlock_t *dst_ptl, spinlock_t *src_ptl, + struct folio *src_folio) { - if (!pte_swp_exclusive(orig_src_pte)) - return -EBUSY; - double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, @@ -1094,6 +1089,16 @@ static int move_swap_pte(struct mm_struct *mm, return -EAGAIN; } + /* + * The src_folio resides in the swapcache, requiring an update to its + * index and mapping to align with the dst_vma, where a swap-in may + * occur and hit the swapcache after moving the PTE. + */ + if (src_folio) { + folio_move_anon_rmap(src_folio, dst_vma); + src_folio->index = linear_page_index(dst_vma, dst_addr); + } + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); @@ -1141,6 +1146,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, __u64 mode) { swp_entry_t entry; + struct swap_info_struct *si = NULL; pte_t orig_src_pte, orig_dst_pte; pte_t src_folio_pte; spinlock_t *src_ptl, *dst_ptl; @@ -1240,6 +1246,7 @@ retry: */ if (!src_folio) { struct folio *folio; + bool locked; /* * Pin the page while holding the lock to be sure the @@ -1259,14 +1266,28 @@ retry: goto out; } + locked = folio_trylock(folio); + /* + * We avoid waiting for folio lock with a raised + * refcount for large folios because extra refcounts + * will result in split_folio() failing later and + * retrying. If multiple tasks are trying to move a + * large folio we can end up livelocking. + */ + if (!locked && folio_test_large(folio)) { + spin_unlock(src_ptl); + err = -EAGAIN; + goto out; + } + folio_get(folio); src_folio = folio; src_folio_pte = orig_src_pte; spin_unlock(src_ptl); - if (!folio_trylock(src_folio)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + if (!locked) { + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ folio_lock(src_folio); @@ -1282,8 +1303,8 @@ retry: /* at this point we have src_folio locked */ if (folio_test_large(src_folio)) { /* split_folio() can block */ - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; err = split_folio(src_folio); if (err) @@ -1308,8 +1329,8 @@ retry: goto out; } if (!anon_vma_trylock_write(src_anon_vma)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ anon_vma_lock_write(src_anon_vma); @@ -1322,11 +1343,13 @@ retry: orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl, src_folio); } else { + struct folio *folio = NULL; + entry = pte_to_swp_entry(orig_src_pte); if (non_swap_entry(entry)) { if (is_migration_entry(entry)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; migration_entry_wait(mm, src_pmd, src_addr); err = -EAGAIN; @@ -1335,9 +1358,53 @@ retry: goto out; } - err = move_swap_pte(mm, dst_addr, src_addr, dst_pte, src_pte, - orig_dst_pte, orig_src_pte, dst_pmd, - dst_pmdval, dst_ptl, src_ptl); + if (!pte_swp_exclusive(orig_src_pte)) { + err = -EBUSY; + goto out; + } + + si = get_swap_device(entry); + if (unlikely(!si)) { + err = -EAGAIN; + goto out; + } + /* + * Verify the existence of the swapcache. If present, the folio's + * index and mapping must be updated even when the PTE is a swap + * entry. The anon_vma lock is not taken during this process since + * the folio has already been unmapped, and the swap entry is + * exclusive, preventing rmap walks. + * + * For large folios, return -EBUSY immediately, as split_folio() + * also returns -EBUSY when attempting to split unmapped large + * folios in the swapcache. This issue needs to be resolved + * separately to allow proper handling. + */ + if (!src_folio) + folio = filemap_get_folio(swap_address_space(entry), + swap_cache_index(entry)); + if (!IS_ERR_OR_NULL(folio)) { + if (folio_test_large(folio)) { + err = -EBUSY; + folio_put(folio); + goto out; + } + src_folio = folio; + src_folio_pte = orig_src_pte; + if (!folio_trylock(src_folio)) { + pte_unmap(src_pte); + pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + put_swap_device(si); + si = NULL; + /* now we can block and wait */ + folio_lock(src_folio); + goto retry; + } + } + err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, + dst_ptl, src_ptl, src_folio); } out: @@ -1354,6 +1421,8 @@ out: if (src_pte) pte_unmap(src_pte); mmu_notifier_invalidate_range_end(&range); + if (si) + put_swap_device(si); return err; } @@ -1490,16 +1559,24 @@ static int uffd_move_lock(struct mm_struct *mm, mmap_read_lock(mm); err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); - if (!err) { - /* - * See comment in uffd_lock_vma() as to why not using - * vma_start_read() here. - */ - down_read(&(*dst_vmap)->vm_lock->lock); - if (*dst_vmap != *src_vmap) - down_read_nested(&(*src_vmap)->vm_lock->lock, - SINGLE_DEPTH_NESTING); + if (err) + goto out; + + if (!vma_start_read_locked(*dst_vmap)) { + err = -EAGAIN; + goto out; } + + /* Nothing further to do if both vmas are locked. */ + if (*dst_vmap == *src_vmap) + goto out; + + if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) { + /* Undo dst_vmap locking if src_vmap failed to lock */ + vma_end_read(*dst_vmap); + err = -EAGAIN; + } +out: mmap_read_unlock(mm); return err; } |