Diffstat (limited to 'mm/rmap.c')
-rw-r--r--  mm/rmap.c  324
1 file changed, 205 insertions, 119 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index 7b9879ef442d..ab099405151f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1,8 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* mm/rmap.c - physical to virtual reverse mappings
*
* Copyright 2001, Rik van Riel <riel@conectiva.com.br>
- * Released under the General Public License (GPL).
*
* Simple, low overhead reverse mapping scheme.
* Please try to keep this thing as modular as possible.
@@ -82,6 +82,7 @@
#include <trace/events/migrate.h>
#include "internal.h"
+#include "swap.h"
static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;
@@ -146,14 +147,13 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}
-static void anon_vma_chain_link(struct vm_area_struct *vma,
- struct anon_vma_chain *avc,
- struct anon_vma *anon_vma)
+static void anon_vma_chain_assign(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc,
+ struct anon_vma *anon_vma)
{
avc->vma = vma;
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}
/**
@@ -210,7 +210,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
- anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_chain_assign(vma, avc, anon_vma);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
anon_vma->num_active_vmas++;
allocated = NULL;
avc = NULL;
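
Note on the helper change above: the old anon_vma_chain_link() both put the AVC on the VMA's own anon_vma_chain list and inserted it into the anon_vma's interval tree. The renamed anon_vma_chain_assign() now only does the list step, so the interval-tree insertion, which is what actually publishes the AVC to rmap and needs the anon_vma write lock, is issued explicitly by the caller, as __anon_vma_prepare() does here. A minimal userspace sketch of that split (invented types, a pthread mutex standing in for the anon_vma rwsem, not kernel code):

    #include <pthread.h>
    #include <stdlib.h>

    struct avc {
            struct avc *next;       /* stand-in for the per-VMA same_vma list */
            int in_tree;            /* stand-in for interval-tree membership */
    };

    static pthread_mutex_t anon_vma_lock = PTHREAD_MUTEX_INITIALIZER;

    /* "assign": purely local setup, touches no shared state, needs no lock. */
    static void avc_assign(struct avc **chain, struct avc *avc)
    {
            avc->next = *chain;
            *chain = avc;
    }

    /* "insert": publish to the shared structure, under the lock. */
    static void avc_publish(struct avc *avc)
    {
            pthread_mutex_lock(&anon_vma_lock);
            avc->in_tree = 1;       /* kernel: anon_vma_interval_tree_insert() */
            pthread_mutex_unlock(&anon_vma_lock);
    }
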
@@ -231,97 +232,141 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
return -ENOMEM;
}
-/*
- * This is a useful helper function for locking the anon_vma root as
- * we traverse the vma->anon_vma_chain, looping over anon_vma's that
- * have the same vma.
- *
- * Such anon_vma's should have the same root, so you'd expect to see
- * just a single mutex_lock for the whole traversal.
- */
-static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
-{
- struct anon_vma *new_root = anon_vma->root;
- if (new_root != root) {
- if (WARN_ON_ONCE(root))
- up_write(&root->rwsem);
- root = new_root;
- down_write(&root->rwsem);
- }
- return root;
+static void check_anon_vma_clone(struct vm_area_struct *dst,
+ struct vm_area_struct *src,
+ enum vma_operation operation)
+{
+ /* The write lock must be held. */
+ mmap_assert_write_locked(src->vm_mm);
+ /* If not a fork then must be on same mm. */
+ VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm);
+
+ /* If we have anything to do, src->anon_vma must be provided. */
+ VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
+ VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma);
+ /* We are establishing a new anon_vma_chain. */
+ VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain));
+ /*
+ * On fork, dst->anon_vma is set NULL (temporarily). Otherwise, anon_vma
+ * must be the same across dst and src.
+ */
+ VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
+ /*
+ * Essentially equivalent to above - if not a no-op, we should expect
+ * dst->anon_vma to be set for everything except a fork.
+ */
+ VM_WARN_ON_ONCE(operation != VMA_OP_FORK && src->anon_vma &&
+ !dst->anon_vma);
+ /* For the anon_vma to be compatible, it can only be singular. */
+ VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED &&
+ !list_is_singular(&src->anon_vma_chain));
+#ifdef CONFIG_PER_VMA_LOCK
+ /* Only merging an unfaulted VMA leaves the destination attached. */
+ VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED &&
+ vma_is_attached(dst));
+#endif
}
-static inline void unlock_anon_vma_root(struct anon_vma *root)
+static void maybe_reuse_anon_vma(struct vm_area_struct *dst,
+ struct anon_vma *anon_vma)
{
- if (root)
- up_write(&root->rwsem);
+ /* If already populated, nothing to do. */
+ if (dst->anon_vma)
+ return;
+
+ /*
+ * We reuse an anon_vma if all VMAs that linked it have been unmapped and
+ * it has at most a single child.
+ */
+ if (anon_vma->num_active_vmas > 0)
+ return;
+ if (anon_vma->num_children > 1)
+ return;
+
+ dst->anon_vma = anon_vma;
+ anon_vma->num_active_vmas++;
}
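
The reuse rule in maybe_reuse_anon_vma() above is the same heuristic the old comment below describes: on fork, an anon_vma is only taken over when no live VMA links it any more and it has at most one child. That stops a constantly forking task from degrading the hierarchy into an endless linear chain, while refusing anon_vma's with two or more children keeps rmap walks from having to scan the whole hierarchy. A compact restatement of the predicate as standalone C (an illustrative model, not the kernel structures):

    #include <stdbool.h>

    struct anon_vma_model {
            int num_active_vmas;    /* VMAs currently linking this anon_vma */
            int num_children;       /* anon_vmas forked off this one */
    };

    /*
     * Reusable on fork: every VMA that linked it has gone away and it has
     * at most one child, so adopting it cannot widen existing rmap walks.
     */
    static bool anon_vma_reusable(const struct anon_vma_model *av)
    {
            return av->num_active_vmas == 0 && av->num_children <= 1;
    }
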
-/*
- * Attach the anon_vmas from src to dst.
- * Returns 0 on success, -ENOMEM on failure.
- *
- * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
- * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
- * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
- * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
- * call, we can identify this case by checking (!dst->anon_vma &&
- * src->anon_vma).
- *
- * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
- * and reuse existing anon_vma which has no vmas and only one child anon_vma.
- * This prevents degradation of anon_vma hierarchy to endless linear chain in
- * case of constantly forking task. On the other hand, an anon_vma with more
- * than one child isn't reused even if there was no alive vma, thus rmap
- * walker has a good chance of avoiding scanning the whole hierarchy when it
- * searches where page is mapped.
+static void cleanup_partial_anon_vmas(struct vm_area_struct *vma);
+
+/**
+ * anon_vma_clone - Establishes new anon_vma_chain objects in @dst linking to
+ * all of the anon_vma objects referenced by @src's anon_vma_chain.
+ * @dst: The destination VMA with an empty anon_vma_chain.
+ * @src: The source VMA we wish to duplicate.
+ * @operation: The type of operation which resulted in the clone.
+ *
+ * This is the heart of the VMA side of the anon_vma implementation - we invoke
+ * this function whenever we need to set up a new VMA's anon_vma state.
+ *
+ * This is invoked for:
+ *
+ * - VMA Merge, but only when @dst is unfaulted and @src is faulted - meaning we
+ * clone @src into @dst.
+ * - VMA split.
+ * - VMA (m)remap.
+ * - Fork of faulted VMA.
+ *
+ * In all cases other than fork this is simply a duplication. Fork additionally
+ * adds a new active anon_vma.
+ *
+ * ONLY in the case of fork do we try to 'reuse' existing anon_vma's in an
+ * anon_vma hierarchy, reusing anon_vma's which have no VMA associated with them
+ * but do have a single child. This is to avoid waste of memory when repeatedly
+ * forking.
+ *
+ * Returns: 0 on success, -ENOMEM on failure.
*/
-int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+ enum vma_operation operation)
{
struct anon_vma_chain *avc, *pavc;
- struct anon_vma *root = NULL;
-
- list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
- struct anon_vma *anon_vma;
-
- avc = anon_vma_chain_alloc(GFP_NOWAIT);
- if (unlikely(!avc)) {
- unlock_anon_vma_root(root);
- root = NULL;
- avc = anon_vma_chain_alloc(GFP_KERNEL);
- if (!avc)
- goto enomem_failure;
- }
- anon_vma = pavc->anon_vma;
- root = lock_anon_vma_root(root, anon_vma);
- anon_vma_chain_link(dst, avc, anon_vma);
+ struct anon_vma *active_anon_vma = src->anon_vma;
- /*
- * Reuse existing anon_vma if it has no vma and only one
- * anon_vma child.
- *
- * Root anon_vma is never reused:
- * it has self-parent reference and at least one child.
- */
- if (!dst->anon_vma && src->anon_vma &&
- anon_vma->num_children < 2 &&
- anon_vma->num_active_vmas == 0)
- dst->anon_vma = anon_vma;
+ check_anon_vma_clone(dst, src, operation);
+
+ if (!active_anon_vma)
+ return 0;
+
+ /*
+ * Allocate AVCs. We don't need an anon_vma lock for this as we
+ * are not updating the anon_vma rbtree nor are we changing
+ * anon_vma statistics.
+ *
+ * Either src and dst have the same mm for which we hold an exclusive mmap
+ * write lock, or we are forking and we hold it on src->vm_mm and dst is
+ * not yet accessible to other threads, so there's no possibility of the
+ * unlinked AVCs being observed yet.
+ */
+ list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto enomem_failure;
+
+ anon_vma_chain_assign(dst, avc, pavc->anon_vma);
}
- if (dst->anon_vma)
+
+ /*
+ * Now link the anon_vma's back to the newly added AVCs.
+ * Note that all anon_vma's share the same root.
+ */
+ anon_vma_lock_write(src->anon_vma);
+ list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
+ if (operation == VMA_OP_FORK)
+ maybe_reuse_anon_vma(dst, anon_vma);
+ }
+
+ if (operation != VMA_OP_FORK)
dst->anon_vma->num_active_vmas++;
- unlock_anon_vma_root(root);
+
+ anon_vma_unlock_write(active_anon_vma);
return 0;
enomem_failure:
- /*
- * dst->anon_vma is dropped here otherwise its num_active_vmas can
- * be incorrectly decremented in unlink_anon_vmas().
- * We can safely do this because callers of anon_vma_clone() don't care
- * about dst->anon_vma if anon_vma_clone() failed.
- */
- dst->anon_vma = NULL;
- unlink_anon_vmas(dst);
+ cleanup_partial_anon_vmas(dst);
return -ENOMEM;
}
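
The overall shape of the new anon_vma_clone() is worth spelling out. The old code took the root rwsem via lock_anon_vma_root() while walking src's chain and, whenever a GFP_NOWAIT allocation failed, had to drop that lock and retry with GFP_KERNEL before retaking it. The rewrite allocates every AVC up front with GFP_KERNEL while unlocked (safe because nothing has been published to rmap yet), then takes the shared root lock exactly once to insert the whole batch into the interval trees. Roughly, as a self-contained userspace model (invented names, a pthread mutex in place of the root rwsem):

    #include <pthread.h>
    #include <stdlib.h>

    struct avc {
            struct avc *next;       /* the per-VMA chain */
            int in_tree;            /* interval-tree membership */
    };

    static pthread_mutex_t root_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Clone src's chain into *dst; *dst is assumed to start out empty. */
    static int clone_chain(struct avc **dst, const struct avc *src)
    {
            const struct avc *pavc;
            struct avc *avc, *next;

            /* Phase 1: allocate and link locally; may sleep, may fail. */
            for (pavc = src; pavc; pavc = pavc->next) {
                    avc = calloc(1, sizeof(*avc));
                    if (!avc)
                            goto enomem;
                    avc->next = *dst;
                    *dst = avc;
            }

            /* Phase 2: one lock acquisition publishes everything. */
            pthread_mutex_lock(&root_lock);
            for (avc = *dst; avc; avc = avc->next)
                    avc->in_tree = 1;   /* kernel: anon_vma_interval_tree_insert() */
            pthread_mutex_unlock(&root_lock);
            return 0;

    enomem:
            /* Nothing was published, so cleanup is a plain list walk. */
            for (avc = *dst; avc; avc = next) {
                    next = avc->next;
                    free(avc);
            }
            *dst = NULL;
            return -1;
    }

Because a failed allocation leaves nothing in any interval tree, the error path only needs cleanup_partial_anon_vmas() to walk and free the VMA-local list, which is exactly what the enomem_failure label above relies on.
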
@@ -334,7 +379,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
struct anon_vma_chain *avc;
struct anon_vma *anon_vma;
- int error;
+ int rc;
/* Don't bother if the parent process has no anon_vma here. */
if (!pvma->anon_vma)
@@ -343,27 +388,35 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
vma->anon_vma = NULL;
+ anon_vma = anon_vma_alloc();
+ if (!anon_vma)
+ return -ENOMEM;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc) {
+ put_anon_vma(anon_vma);
+ return -ENOMEM;
+ }
+
/*
* First, attach the new VMA to the parent VMA's anon_vmas,
* so rmap can find non-COWed pages in child processes.
*/
- error = anon_vma_clone(vma, pvma);
- if (error)
- return error;
-
- /* An existing anon_vma has been reused, all done then. */
- if (vma->anon_vma)
- return 0;
+ rc = anon_vma_clone(vma, pvma, VMA_OP_FORK);
+ /* An error arose or an existing anon_vma was reused, all done then. */
+ if (rc || vma->anon_vma) {
+ put_anon_vma(anon_vma);
+ anon_vma_chain_free(avc);
+ return rc;
+ }
- /* Then add our own anon_vma. */
- anon_vma = anon_vma_alloc();
- if (!anon_vma)
- goto out_error;
- anon_vma->num_active_vmas++;
- avc = anon_vma_chain_alloc(GFP_KERNEL);
- if (!avc)
- goto out_error_free_anon_vma;
+ /*
+ * OK no reuse, so add our own anon_vma.
+ *
+ * Since it is not linked anywhere we can safely manipulate anon_vma
+ * fields without a lock.
+ */
+ anon_vma->num_active_vmas = 1;
/*
* The root anon_vma's rwsem is the lock actually used when we
* lock any of the anon_vmas in this anon_vma tree.
@@ -378,24 +431,59 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
+ anon_vma_chain_assign(vma, avc, anon_vma);
+ /* Now let rmap see it. */
anon_vma_lock_write(anon_vma);
- anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
anon_vma->parent->num_children++;
anon_vma_unlock_write(anon_vma);
return 0;
+}
- out_error_free_anon_vma:
- put_anon_vma(anon_vma);
- out_error:
- unlink_anon_vmas(vma);
- return -ENOMEM;
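
With anon_vma_fork() reworked, the out_error/out_error_free_anon_vma unwinding removed above is no longer needed: the child's anon_vma and AVC are allocated before anon_vma_clone() runs, and if the clone fails or an existing anon_vma is reused they are simply freed again, so there is never half-constructed state to back out of. A sketch of that preallocate-then-commit pattern (userspace stand-ins, not the kernel API):

    #include <stdbool.h>
    #include <stdlib.h>

    struct resource { int committed; };

    /* Stand-in for anon_vma_clone(): may fail, or report that reuse happened. */
    static int do_clone(bool *reused)
    {
            *reused = false;
            return 0;
    }

    static int fork_like_setup(struct resource **out)
    {
            struct resource *res;
            bool reused;
            int rc;

            /* Allocate up front everything that might be needed. */
            res = calloc(1, sizeof(*res));
            if (!res)
                    return -1;

            rc = do_clone(&reused);
            if (rc || reused) {
                    /* Clone failed or nothing new is needed: free it again. */
                    free(res);
                    *out = NULL;
                    return rc;
            }

            /* Commit phase: no further allocations, so nothing to unwind. */
            res->committed = 1;
            *out = res;
            return 0;
    }
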
+/*
+ * In the unfortunate case of anon_vma_clone() failing to allocate memory we
+ * have to clean things up.
+ *
+ * Since we allocate anon_vma_chain's before we insert them into the interval
+ * trees, we simply have to free up the AVC's and remove the entries from the
+ * VMA's anon_vma_chain.
+ */
+static void cleanup_partial_anon_vmas(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc, *next;
+
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
+ }
}
+/**
+ * unlink_anon_vmas() - remove all links between a VMA and anon_vma's, freeing
+ * anon_vma_chain objects.
+ * @vma: The VMA whose links to anon_vma objects are to be severed.
+ *
+ * As part of the process anon_vma_chain's are freed,
+ * anon_vma->num_children and num_active_vmas are updated as required and, if the
+ * relevant anon_vma references no further VMAs, its reference count is
+ * decremented.
+ */
void unlink_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
- struct anon_vma *root = NULL;
+ struct anon_vma *active_anon_vma = vma->anon_vma;
+
+ /* The mmap lock is always held, possibly only for read on unmap. */
+ mmap_assert_locked(vma->vm_mm);
+
+ /* Unfaulted is a no-op. */
+ if (!active_anon_vma) {
+ VM_WARN_ON_ONCE(!list_empty(&vma->anon_vma_chain));
+ return;
+ }
+
+ anon_vma_lock_write(active_anon_vma);
/*
* Unlink each anon_vma chained to the VMA. This list is ordered
@@ -404,7 +492,6 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
- root = lock_anon_vma_root(root, anon_vma);
anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
/*
@@ -419,16 +506,15 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
- if (vma->anon_vma) {
- vma->anon_vma->num_active_vmas--;
- /*
- * vma would still be needed after unlink, and anon_vma will be prepared
- * when handle fault.
- */
- vma->anon_vma = NULL;
- }
- unlock_anon_vma_root(root);
+ active_anon_vma->num_active_vmas--;
+ /*
+ * vma would still be needed after unlink, and anon_vma will be prepared
+ * when handling a fault.
+ */
+ vma->anon_vma = NULL;
+ anon_vma_unlock_write(active_anon_vma);
+
/*
* Iterate the list once more, it now only contains empty and unlinked
@@ -2147,7 +2233,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
goto discard;
}
- if (swap_duplicate(entry) < 0) {
+ if (folio_dup_swap(folio, subpage) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
@@ -2158,7 +2244,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* so we'll not check/care.
*/
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
- swap_free(entry);
+ folio_put_swap(folio, subpage);
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
@@ -2166,7 +2252,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* See folio_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
folio_try_share_anon_rmap_pte(folio, subpage)) {
- swap_free(entry);
+ folio_put_swap(folio, subpage);
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
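
Finally, the try_to_unmap_one() hunks replace the swap-entry based swap_duplicate()/swap_free() calls with the folio-based folio_dup_swap()/folio_put_swap() helpers, but the unwind discipline stays the same: once the swap reference has been taken, every later failure path must drop it and restore the original PTE before aborting the walk. A generic illustration of that acquire-then-unwind style (hypothetical helpers, not the kernel functions):

    #include <stdbool.h>

    /* Hypothetical stand-ins: take a reference, then steps that may fail. */
    static bool take_ref(void)     { return true; }
    static void drop_ref(void)     { }
    static void restore_pte(void)  { }
    static bool arch_step(void)    { return true; }
    static bool share_step(void)   { return true; }

    static int unmap_one_step(void)
    {
            if (!take_ref()) {
                    restore_pte();  /* put the cleared PTE back */
                    return -1;
            }
            if (!arch_step())
                    goto abort;
            if (!share_step())
                    goto abort;
            return 0;

    abort:
            drop_ref();             /* kernel: folio_put_swap() */
            restore_pte();          /* kernel: set_pte_at() restores the PTE */
            return -1;
    }
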