mm/hugetlb: make walk_hugetlb_range() safe to pmd unshare

Since walk_hugetlb_range() walks the pgtable, it needs the vma lock to make sure the pgtable page will not be freed concurrently.

Link: https://lkml.kernel.org/r/20221216155226.2043738-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
author: Peter Xu <peterx@redhat.com> 2022-12-16 10:52:26 -0500
committer: Andrew Morton <akpm@linux-foundation.org> 2023-01-18 17:12:39 -0800
commit: dd361e5033cf36c51acab996ea17748b81cedb38 (patch)
tree: 5b3eccbb296ae07feecfdfe622c6e051689308b3
parent: eefc7fa53608920203a1402ecf7255ecfa8bb030 (diff)
download: lwn-dd361e5033cf36c51acab996ea17748b81cedb38.tar.gz
lwn-dd361e5033cf36c51acab996ea17748b81cedb38.zip
3 files changed, 26 insertions, 2 deletions
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 959f52e5867d..27a6df448ee5 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -21,7 +21,16 @@ struct mm_walk;
  *			depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
  *			Any folded depths (where PTRS_PER_P?D is equal to 1)
  *			are skipped.
- * @hugetlb_entry:	if set, called for each hugetlb entry
+ * @hugetlb_entry:	if set, called for each hugetlb entry. This hook
+ *			function is called with the vma lock held, in order to
+ *			protect against a concurrent freeing of the pte_t* or
+ *			the ptl. In some cases, the hook function needs to drop
+ *			and retake the vma lock in order to avoid deadlocks
+ *			while calling other functions. In such cases the hook
+ *			function must either refrain from accessing the pte or
+ *			ptl after dropping the vma lock, or else revalidate
+ *			those items after re-acquiring the vma lock and before
+ *			accessing them.
  * @test_walk:		caller specific callback function to determine whether
  *			we walk over the current vma or not. Returning 0 means
  *			"do page table walk over the current vma", returning
diff --git a/mm/hmm.c b/mm/hmm.c
index 601a99ce3c84..6a151c09de5e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -492,8 +492,21 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	required_fault =
 		hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
 	if (required_fault) {
+		int ret;
+
 		spin_unlock(ptl);
-		return hmm_vma_fault(addr, end, required_fault, walk);
+		hugetlb_vma_unlock_read(vma);
+		/*
+		 * Avoid deadlock: drop the vma lock before calling
+		 * hmm_vma_fault(), which will itself potentially take and
+		 * drop the vma lock. This is also correct from a
+		 * protection point of view, because there is no further
+		 * use here of either pte or ptl after dropping the vma
+		 * lock.
+		 */
+		ret = hmm_vma_fault(addr, end, required_fault, walk);
+		hugetlb_vma_lock_read(vma);
+		return ret;
 	}
 
 	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7f1c9b274906..d98564a7be57 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -302,6 +302,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 	const struct mm_walk_ops *ops = walk->ops;
 	int err = 0;
 
+	hugetlb_vma_lock_read(vma);
 	do {
 		next = hugetlb_entry_end(h, addr, end);
 		pte = huge_pte_offset(walk->mm, addr & hmask, sz);
@@ -314,6 +315,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 		if (err)
 			break;
 	} while (addr = next, addr != end);
+	hugetlb_vma_unlock_read(vma);
 
 	return err;
 }
author	Peter Xu <peterx@redhat.com>	2022-12-16 10:52:26 -0500
committer	Andrew Morton <akpm@linux-foundation.org>	2023-01-18 17:12:39 -0800
commit	dd361e5033cf36c51acab996ea17748b81cedb38 (patch)
tree	5b3eccbb296ae07feecfdfe622c6e051689308b3
parent	eefc7fa53608920203a1402ecf7255ecfa8bb030 (diff)
download	lwn-dd361e5033cf36c51acab996ea17748b81cedb38.tar.gz lwn-dd361e5033cf36c51acab996ea17748b81cedb38.zip