Diffstat (limited to 'mm/rmap.c')
-rw-r--r--   mm/rmap.c   324
1 file changed, 205 insertions, 119 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index 7b9879ef442d..ab099405151f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1,8 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * mm/rmap.c - physical to virtual reverse mappings
  *
  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
- * Released under the General Public License (GPL).
  *
  * Simple, low overhead reverse mapping scheme.
  * Please try to keep this thing as modular as possible.
@@ -82,6 +82,7 @@
 #include <trace/events/migrate.h>
 
 #include "internal.h"
+#include "swap.h"
 
 static struct kmem_cache *anon_vma_cachep;
 static struct kmem_cache *anon_vma_chain_cachep;
@@ -146,14 +147,13 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
         kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
 }
 
-static void anon_vma_chain_link(struct vm_area_struct *vma,
-                                struct anon_vma_chain *avc,
-                                struct anon_vma *anon_vma)
+static void anon_vma_chain_assign(struct vm_area_struct *vma,
+                                  struct anon_vma_chain *avc,
+                                  struct anon_vma *anon_vma)
 {
         avc->vma = vma;
         avc->anon_vma = anon_vma;
         list_add(&avc->same_vma, &vma->anon_vma_chain);
-        anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
 }
 
 /**
@@ -210,7 +210,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
         spin_lock(&mm->page_table_lock);
         if (likely(!vma->anon_vma)) {
                 vma->anon_vma = anon_vma;
-                anon_vma_chain_link(vma, avc, anon_vma);
+                anon_vma_chain_assign(vma, avc, anon_vma);
+                anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
                 anon_vma->num_active_vmas++;
                 allocated = NULL;
                 avc = NULL;
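With the interval-tree insertion dropped from the renamed helper, linking an AVC becomes a two-step idiom: anon_vma_chain_assign() queues the AVC on the VMA's own chain, and the rmap-visible step, anon_vma_interval_tree_insert(), happens separately under the anon_vma lock, as __anon_vma_prepare() does in the hunk above. A minimal sketch of the idiom follows; the wrapper function is hypothetical and only valid inside mm/rmap.c, and only the helpers it calls appear in this patch:

static int example_link_one(struct vm_area_struct *vma,
                            struct anon_vma *anon_vma)
{
        struct anon_vma_chain *avc = anon_vma_chain_alloc(GFP_KERNEL);

        if (!avc)
                return -ENOMEM;

        /* Step 1: attach the AVC to the VMA; no anon_vma lock needed yet. */
        anon_vma_chain_assign(vma, avc, anon_vma);

        /* Step 2: make it visible to rmap, under the anon_vma lock. */
        anon_vma_lock_write(anon_vma);
        anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
        anon_vma_unlock_write(anon_vma);
        return 0;
}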
@@ -231,97 +232,141 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
         return -ENOMEM;
 }
 
-/*
- * This is a useful helper function for locking the anon_vma root as
- * we traverse the vma->anon_vma_chain, looping over anon_vma's that
- * have the same vma.
- *
- * Such anon_vma's should have the same root, so you'd expect to see
- * just a single mutex_lock for the whole traversal.
- */
-static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
-{
-        struct anon_vma *new_root = anon_vma->root;
-        if (new_root != root) {
-                if (WARN_ON_ONCE(root))
-                        up_write(&root->rwsem);
-                root = new_root;
-                down_write(&root->rwsem);
-        }
-        return root;
+static void check_anon_vma_clone(struct vm_area_struct *dst,
+                                 struct vm_area_struct *src,
+                                 enum vma_operation operation)
+{
+        /* The write lock must be held. */
+        mmap_assert_write_locked(src->vm_mm);
+        /* If not a fork then must be on same mm. */
+        VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm);
+
+        /* If we have anything to do src->anon_vma must be provided. */
+        VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
+        VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma);
+        /* We are establishing a new anon_vma_chain. */
+        VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain));
+        /*
+         * On fork, dst->anon_vma is set NULL (temporarily). Otherwise, anon_vma
+         * must be the same across dst and src.
+         */
+        VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
+        /*
+         * Essentially equivalent to above - if not a no-op, we should expect
+         * dst->anon_vma to be set for everything except a fork.
+         */
+        VM_WARN_ON_ONCE(operation != VMA_OP_FORK && src->anon_vma &&
+                        !dst->anon_vma);
+        /* For the anon_vma to be compatible, it can only be singular. */
+        VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED &&
                        !list_is_singular(&src->anon_vma_chain));
+#ifdef CONFIG_PER_VMA_LOCK
+        /* Only merging an unfaulted VMA leaves the destination attached. */
+        VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED &&
+                        vma_is_attached(dst));
+#endif
 }
 
-static inline void unlock_anon_vma_root(struct anon_vma *root)
+static void maybe_reuse_anon_vma(struct vm_area_struct *dst,
+                                 struct anon_vma *anon_vma)
 {
-        if (root)
-                up_write(&root->rwsem);
+        /* If already populated, nothing to do. */
+        if (dst->anon_vma)
+                return;
+
+        /*
+         * We reuse an anon_vma if any linking VMAs were unmapped and it has
+         * only a single child at most.
+         */
+        if (anon_vma->num_active_vmas > 0)
+                return;
+        if (anon_vma->num_children > 1)
+                return;
+
+        dst->anon_vma = anon_vma;
+        anon_vma->num_active_vmas++;
 }
 
-/*
- * Attach the anon_vmas from src to dst.
- * Returns 0 on success, -ENOMEM on failure.
- *
- * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
- * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
- * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
- * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
- * call, we can identify this case by checking (!dst->anon_vma &&
- * src->anon_vma).
- *
- * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
- * and reuse existing anon_vma which has no vmas and only one child anon_vma.
- * This prevents degradation of anon_vma hierarchy to endless linear chain in
- * case of constantly forking task. On the other hand, an anon_vma with more
- * than one child isn't reused even if there was no alive vma, thus rmap
- * walker has a good chance of avoiding scanning the whole hierarchy when it
- * searches where page is mapped.
+static void cleanup_partial_anon_vmas(struct vm_area_struct *vma);
+
+/**
+ * anon_vma_clone - Establishes new anon_vma_chain objects in @dst linking to
+ * all of the anon_vma objects contained within @src anon_vma_chain's.
+ * @dst: The destination VMA with an empty anon_vma_chain.
+ * @src: The source VMA we wish to duplicate.
+ * @operation: The type of operation which resulted in the clone.
+ *
+ * This is the heart of the VMA side of the anon_vma implementation - we invoke
+ * this function whenever we need to set up a new VMA's anon_vma state.
+ *
+ * This is invoked for:
+ *
+ * - VMA Merge, but only when @dst is unfaulted and @src is faulted - meaning we
+ *   clone @src into @dst.
+ * - VMA split.
+ * - VMA (m)remap.
+ * - Fork of faulted VMA.
+ *
+ * In all cases other than fork this is simply a duplication. Fork additionally
+ * adds a new active anon_vma.
+ *
+ * ONLY in the case of fork do we try to 'reuse' existing anon_vma's in an
+ * anon_vma hierarchy, reusing anon_vma's which have no VMA associated with them
+ * but do have a single child. This is to avoid waste of memory when repeatedly
+ * forking.
+ *
+ * Returns: 0 on success, -ENOMEM on failure.
  */
-int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+                   enum vma_operation operation)
 {
         struct anon_vma_chain *avc, *pavc;
-        struct anon_vma *root = NULL;
-
-        list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
-                struct anon_vma *anon_vma;
-
-                avc = anon_vma_chain_alloc(GFP_NOWAIT);
-                if (unlikely(!avc)) {
-                        unlock_anon_vma_root(root);
-                        root = NULL;
-                        avc = anon_vma_chain_alloc(GFP_KERNEL);
-                        if (!avc)
-                                goto enomem_failure;
-                }
-                anon_vma = pavc->anon_vma;
-                root = lock_anon_vma_root(root, anon_vma);
-                anon_vma_chain_link(dst, avc, anon_vma);
+        struct anon_vma *active_anon_vma = src->anon_vma;
 
-                /*
-                 * Reuse existing anon_vma if it has no vma and only one
-                 * anon_vma child.
-                 *
-                 * Root anon_vma is never reused:
-                 * it has self-parent reference and at least one child.
-                 */
-                if (!dst->anon_vma && src->anon_vma &&
-                    anon_vma->num_children < 2 &&
-                    anon_vma->num_active_vmas == 0)
-                        dst->anon_vma = anon_vma;
+        check_anon_vma_clone(dst, src, operation);
+
+        if (!active_anon_vma)
+                return 0;
+
+        /*
+         * Allocate AVCs. We don't need an anon_vma lock for this as we
+         * are not updating the anon_vma rbtree nor are we changing
+         * anon_vma statistics.
+         *
+         * Either src, dst have the same mm for which we hold an exclusive mmap
+         * write lock, or we are forking and we hold it on src->vm_mm and dst is
+         * not yet accessible to other threads so there's no possibility of the
+         * unlinked AVC's being observed yet.
+         */
+        list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
+                avc = anon_vma_chain_alloc(GFP_KERNEL);
+                if (!avc)
+                        goto enomem_failure;
+
+                anon_vma_chain_assign(dst, avc, pavc->anon_vma);
         }
-        if (dst->anon_vma)
+
+        /*
+         * Now link the anon_vma's back to the newly inserted AVCs.
+         * Note that all anon_vma's share the same root.
+         */
+        anon_vma_lock_write(src->anon_vma);
+        list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) {
+                struct anon_vma *anon_vma = avc->anon_vma;
+
+                anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
+                if (operation == VMA_OP_FORK)
+                        maybe_reuse_anon_vma(dst, anon_vma);
+        }
+
+        if (operation != VMA_OP_FORK)
                 dst->anon_vma->num_active_vmas++;
-        unlock_anon_vma_root(root);
+
+        anon_vma_unlock_write(active_anon_vma);
         return 0;
 
  enomem_failure:
-        /*
-         * dst->anon_vma is dropped here otherwise its num_active_vmas can
-         * be incorrectly decremented in unlink_anon_vmas().
-         * We can safely do this because callers of anon_vma_clone() don't care
-         * about dst->anon_vma if anon_vma_clone() failed.
-         */
-        dst->anon_vma = NULL;
-        unlink_anon_vmas(dst);
+        cleanup_partial_anon_vmas(dst);
         return -ENOMEM;
 }
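For callers, the visible change is the extra operation argument plus a simpler error contract: on -ENOMEM the partially built chain has already been torn down via cleanup_partial_anon_vmas(), so @dst needs no unlink_anon_vmas() call. A hedged sketch of a merge-style caller follows; the function itself is hypothetical, and only anon_vma_clone(), VMA_OP_MERGE_UNFAULTED and the expectations encoded in check_anon_vma_clone() come from this patch:

static int example_merge_into_unfaulted(struct vm_area_struct *dst,
                                         struct vm_area_struct *src)
{
        /*
         * For a non-fork operation check_anon_vma_clone() expects the
         * caller to have made dst->anon_vma match src->anon_vma already
         * (assumed caller behaviour, illustration only).
         */
        dst->anon_vma = src->anon_vma;

        /* On failure dst->anon_vma_chain has already been emptied. */
        return anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED);
}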
@@ -334,7 +379,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 {
         struct anon_vma_chain *avc;
         struct anon_vma *anon_vma;
-        int error;
+        int rc;
 
         /* Don't bother if the parent process has no anon_vma here. */
         if (!pvma->anon_vma)
@@ -343,27 +388,35 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
         /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
         vma->anon_vma = NULL;
 
+        anon_vma = anon_vma_alloc();
+        if (!anon_vma)
+                return -ENOMEM;
+        avc = anon_vma_chain_alloc(GFP_KERNEL);
+        if (!avc) {
+                put_anon_vma(anon_vma);
+                return -ENOMEM;
+        }
+
         /*
          * First, attach the new VMA to the parent VMA's anon_vmas,
          * so rmap can find non-COWed pages in child processes.
          */
-        error = anon_vma_clone(vma, pvma);
-        if (error)
-                return error;
-
-        /* An existing anon_vma has been reused, all done then. */
-        if (vma->anon_vma)
-                return 0;
+        rc = anon_vma_clone(vma, pvma, VMA_OP_FORK);
+        /* An error arose or an existing anon_vma was reused, all done then. */
+        if (rc || vma->anon_vma) {
+                put_anon_vma(anon_vma);
+                anon_vma_chain_free(avc);
+                return rc;
+        }
 
-        /* Then add our own anon_vma. */
-        anon_vma = anon_vma_alloc();
-        if (!anon_vma)
-                goto out_error;
-        anon_vma->num_active_vmas++;
-        avc = anon_vma_chain_alloc(GFP_KERNEL);
-        if (!avc)
-                goto out_error_free_anon_vma;
+        /*
+         * OK no reuse, so add our own anon_vma.
+         *
+         * Since it is not linked anywhere we can safely manipulate anon_vma
+         * fields without a lock.
+         */
+        anon_vma->num_active_vmas = 1;
 
         /*
          * The root anon_vma's rwsem is the lock actually used when we
          * lock any of the anon_vmas in this anon_vma tree.
@@ -378,24 +431,59 @@
         get_anon_vma(anon_vma->root);
         /* Mark this anon_vma as the one where our new (COWed) pages go. */
         vma->anon_vma = anon_vma;
+        anon_vma_chain_assign(vma, avc, anon_vma);
+        /* Now let rmap see it. */
         anon_vma_lock_write(anon_vma);
-        anon_vma_chain_link(vma, avc, anon_vma);
+        anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
         anon_vma->parent->num_children++;
         anon_vma_unlock_write(anon_vma);
 
         return 0;
+}
 
- out_error_free_anon_vma:
-        put_anon_vma(anon_vma);
- out_error:
-        unlink_anon_vmas(vma);
-        return -ENOMEM;
+/*
+ * In the unfortunate case of anon_vma_clone() failing to allocate memory we
+ * have to clean things up.
+ *
+ * Since we allocate anon_vma_chain's before we insert them into the interval
+ * trees, we simply have to free up the AVC's and remove the entries from the
+ * VMA's anon_vma_chain.
+ */
+static void cleanup_partial_anon_vmas(struct vm_area_struct *vma)
+{
+        struct anon_vma_chain *avc, *next;
+
+        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+                list_del(&avc->same_vma);
+                anon_vma_chain_free(avc);
+        }
 }
 
+/**
+ * unlink_anon_vmas() - remove all links between a VMA and anon_vma's, freeing
+ * anon_vma_chain objects.
+ * @vma: The VMA whose links to anon_vma objects is to be severed.
+ *
+ * As part of the process anon_vma_chain's are freed,
+ * anon_vma->num_children,num_active_vmas is updated as required and, if the
+ * relevant anon_vma references no further VMAs, its reference count is
+ * decremented.
+ */
 void unlink_anon_vmas(struct vm_area_struct *vma)
 {
         struct anon_vma_chain *avc, *next;
-        struct anon_vma *root = NULL;
+        struct anon_vma *active_anon_vma = vma->anon_vma;
+
+        /* Always hold mmap lock, read-lock on unmap possibly. */
+        mmap_assert_locked(vma->vm_mm);
+
+        /* Unfaulted is a no-op. */
+        if (!active_anon_vma) {
+                VM_WARN_ON_ONCE(!list_empty(&vma->anon_vma_chain));
+                return;
+        }
+
+        anon_vma_lock_write(active_anon_vma);
 
         /*
          * Unlink each anon_vma chained to the VMA.  This list is ordered
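The new kernel-doc above pins down what unlink_anon_vmas() leaves behind. A short illustrative teardown path is sketched here; the wrapper is hypothetical, while the locking rule and the re-prepare-on-fault behaviour are taken from the comments in this patch:

static void example_unmap_teardown(struct vm_area_struct *vma)
{
        /* unlink_anon_vmas() asserts this; a read lock suffices on unmap. */
        mmap_assert_locked(vma->vm_mm);

        /*
         * Severs every AVC link, decrements num_active_vmas on the active
         * anon_vma and leaves vma->anon_vma == NULL, so a later fault will
         * simply run __anon_vma_prepare() again.
         */
        unlink_anon_vmas(vma);
}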
@@ -404,7 +492,6 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                 struct anon_vma *anon_vma = avc->anon_vma;
 
-                root = lock_anon_vma_root(root, anon_vma);
                 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
 
                 /*
@@ -419,16 +506,15 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
                 list_del(&avc->same_vma);
                 anon_vma_chain_free(avc);
         }
-        if (vma->anon_vma) {
-                vma->anon_vma->num_active_vmas--;
-                /*
-                 * vma would still be needed after unlink, and anon_vma will be prepared
-                 * when handle fault.
-                 */
-                vma->anon_vma = NULL;
-        }
-        unlock_anon_vma_root(root);
+        active_anon_vma->num_active_vmas--;
+        /*
+         * vma would still be needed after unlink, and anon_vma will be prepared
+         * when handle fault.
+         */
+        vma->anon_vma = NULL;
+        anon_vma_unlock_write(active_anon_vma);
+
         /*
          * Iterate the list once more, it now only contains empty and unlinked
@@ -2147,7 +2233,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                                 goto discard;
                         }
 
-                        if (swap_duplicate(entry) < 0) {
+                        if (folio_dup_swap(folio, subpage) < 0) {
                                 set_pte_at(mm, address, pvmw.pte, pteval);
                                 goto walk_abort;
                         }
@@ -2158,7 +2244,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                          * so we'll not check/care.
                          */
                         if (arch_unmap_one(mm, vma, address, pteval) < 0) {
-                                swap_free(entry);
+                                folio_put_swap(folio, subpage);
                                 set_pte_at(mm, address, pvmw.pte, pteval);
                                 goto walk_abort;
                         }
@@ -2166,7 +2252,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                         /* See folio_try_share_anon_rmap(): clear PTE first. */
                         if (anon_exclusive &&
                             folio_try_share_anon_rmap_pte(folio, subpage)) {
-                                swap_free(entry);
+                                folio_put_swap(folio, subpage);
                                 set_pte_at(mm, address, pvmw.pte, pteval);
                                 goto walk_abort;
                         }
