From 332c151c710ad404e6e67eba7ae899ad8333333f Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 22 May 2023 17:43:10 -0700 Subject: arm64: mte: simplify swap tag restoration logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As a result of the patches "mm: Call arch_swap_restore() from do_swap_page()" and "mm: Call arch_swap_restore() from unuse_pte()", there are no circumstances in which a swapped-in page is installed in a page table without first having arch_swap_restore() called on it. Therefore, we no longer need the logic in set_pte_at() that restores the tags, so remove it. Link: https://lkml.kernel.org/r/20230523004312.1807357-4-pcc@google.com Link: https://linux-review.googlesource.com/id/I8ad54476f3b2d0144ccd8ce0c1d7a2963e5ff6f3 Signed-off-by: Peter Collingbourne Reviewed-by: Steven Price Reviewed-by: Catalin Marinas Cc: Alexandru Elisei Cc: Chinwen Chang Cc: David Hildenbrand Cc: Evgenii Stepanov Cc: Greg Kroah-Hartman Cc: kasan-dev@googlegroups.com Cc: "Kuan-Ying Lee (李冠穎)" Cc: Qun-Wei Lin Cc: Suren Baghdasaryan Cc: Vincenzo Frascino Cc: Will Deacon Cc: "Huang, Ying" Signed-off-by: Andrew Morton --- arch/arm64/include/asm/mte.h | 4 ++-- arch/arm64/include/asm/pgtable.h | 14 ++------------ arch/arm64/kernel/mte.c | 37 +++++++------------------------------ 3 files changed, 11 insertions(+), 44 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index c028afb1cd0b..4cedbaa16f41 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -90,7 +90,7 @@ static inline bool try_page_mte_tagging(struct page *page) } void mte_zero_clear_page_tags(void *addr); -void mte_sync_tags(pte_t old_pte, pte_t pte); +void mte_sync_tags(pte_t pte); void mte_copy_page_tags(void *kto, const void *kfrom); void mte_thread_init_user(void); void mte_thread_switch(struct task_struct *next); @@ -122,7 +122,7 @@ static inline bool try_page_mte_tagging(struct page *page) static inline void mte_zero_clear_page_tags(void *addr) { } -static inline void mte_sync_tags(pte_t old_pte, pte_t pte) +static inline void mte_sync_tags(pte_t pte) { } static inline void mte_copy_page_tags(void *kto, const void *kfrom) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0bd18de9fd97..e8a252e62b12 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -337,18 +337,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, * don't expose tags (instruction fetches don't check tags). */ if (system_supports_mte() && pte_access_permitted(pte, false) && - !pte_special(pte)) { - pte_t old_pte = READ_ONCE(*ptep); - /* - * We only need to synchronise if the new PTE has tags enabled - * or if swapping in (in which case another mapping may have - * set tags in the past even if this PTE isn't tagged).
- * (!pte_none() && !pte_present()) is an open coded version of - * is_swap_pte() - */ - if (pte_tagged(pte) || (!pte_none(old_pte) && !pte_present(old_pte))) - mte_sync_tags(old_pte, pte); - } + !pte_special(pte) && pte_tagged(pte)) + mte_sync_tags(pte); __check_safe_pte_update(mm, ptep, pte); diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 4c5ef9b20065..4edecaac8f91 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -35,41 +35,18 @@ DEFINE_STATIC_KEY_FALSE(mte_async_or_asymm_mode); EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); #endif -static void mte_sync_page_tags(struct page *page, pte_t old_pte, - bool check_swap, bool pte_is_tagged) -{ - if (check_swap && is_swap_pte(old_pte)) { - swp_entry_t entry = pte_to_swp_entry(old_pte); - - if (!non_swap_entry(entry)) - mte_restore_tags(entry, page); - } - - if (!pte_is_tagged) - return; - - if (try_page_mte_tagging(page)) { - mte_clear_page_tags(page_address(page)); - set_page_mte_tagged(page); - } -} - -void mte_sync_tags(pte_t old_pte, pte_t pte) +void mte_sync_tags(pte_t pte) { struct page *page = pte_page(pte); long i, nr_pages = compound_nr(page); - bool check_swap = nr_pages == 1; - bool pte_is_tagged = pte_tagged(pte); - - /* Early out if there's nothing to do */ - if (!check_swap && !pte_is_tagged) - return; /* if PG_mte_tagged is set, tags have already been initialised */ - for (i = 0; i < nr_pages; i++, page++) - if (!page_mte_tagged(page)) - mte_sync_page_tags(page, old_pte, check_swap, - pte_is_tagged); + for (i = 0; i < nr_pages; i++, page++) { + if (try_page_mte_tagging(page)) { + mte_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } + } /* ensure the tags are visible before the PTE is set */ smp_wmb(); -- cgit v1.2.3 From aa232204c4689427cefa55fe975692b57291523a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:31 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pte_clear Remove unused addr in page_table_check_pte_clear and __page_table_check_pte_clear. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 4 ++-- include/linux/page_table_check.h | 11 ++++------- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 7 +++---- 6 files changed, 12 insertions(+), 16 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e8a252e62b12..f7ea51f9c1c1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -928,7 +928,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); - page_table_check_pte_clear(mm, address, pte); + page_table_check_pte_clear(mm, pte); return pte; } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 75970ee2bda2..5e07312cd3e1 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -529,7 +529,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0)); - page_table_check_pte_clear(mm, address, pte); + page_table_check_pte_clear(mm, pte); return pte; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5700bb337987..5085e838b860 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1068,7 +1068,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); return pte; } @@ -1084,7 +1084,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 01e16c7696ec..35c53c4b94d3 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -14,8 +14,7 @@ extern struct static_key_true page_table_check_disabled; extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); -void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t pte); +void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, @@ -46,13 +45,12 @@ static inline void page_table_check_free(struct page *page, unsigned int order) __page_table_check_zero(page, order); } -static inline void page_table_check_pte_clear(struct mm_struct *mm, - unsigned long addr, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_clear(mm, addr, pte); + __page_table_check_pte_clear(mm, pte); } static inline void page_table_check_pmd_clear(struct mm_struct *mm, @@ -123,8 +121,7 @@ static inline void page_table_check_free(struct page *page, unsigned int order) { } -static inline void page_table_check_pte_clear(struct 
mm_struct *mm, - unsigned long addr, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 9fa34be65159..a1ccb13c4853 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -322,7 +322,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); - page_table_check_pte_clear(mm, address, pte); + page_table_check_pte_clear(mm, pte); return pte; } #endif diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 53a9a1e4f342..a1015fc4d045 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -149,8 +149,7 @@ void __page_table_check_zero(struct page *page, unsigned int order) page_ext_put(page_ext); } -void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t pte) +void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (&init_mm == mm) return; @@ -191,7 +190,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); + __page_table_check_pte_clear(mm, ptep_get(ptep)); if (pte_user_accessible_page(pte)) { page_table_check_set(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, pte_write(pte)); @@ -241,7 +240,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm, if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { - __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); + __page_table_check_pte_clear(mm, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } -- cgit v1.2.3 From 1831414cd729a34af937d56ad684a66599de6344 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:32 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_clear Remove unused addr in page_table_check_pmd_clear and __page_table_check_pmd_clear. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 ++++------- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 5 ++--- 6 files changed, 10 insertions(+), 14 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f7ea51f9c1c1..6e3387ec6013 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -940,7 +940,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, { pmd_t pmd = __pmd(xchg_relaxed(&pmd_val(*pmdp), 0)); - page_table_check_pmd_clear(mm, address, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 5e07312cd3e1..388c3af8a9f9 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -742,7 +742,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, { pmd_t pmd = __pmd(atomic_long_xchg((atomic_long_t *)pmdp, 0)); - page_table_check_pmd_clear(mm, address, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5085e838b860..5d71f933d933 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1133,7 +1133,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long { pmd_t pmd = native_pmdp_get_and_clear(pmdp); - page_table_check_pmd_clear(mm, addr, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 35c53c4b94d3..0f777bca5283 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -15,8 +15,7 @@ extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); -void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, - pmd_t pmd); +void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, @@ -53,13 +52,12 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) __page_table_check_pte_clear(mm, pte); } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, - unsigned long addr, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmd_clear(mm, addr, pmd); + __page_table_check_pmd_clear(mm, pmd); } static inline void page_table_check_pud_clear(struct mm_struct *mm, @@ -125,8 +123,7 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, - unsigned long addr, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a1ccb13c4853..3edef5ed008f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -425,7 +425,7 @@ static 
inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_t pmd = *pmdp; pmd_clear(pmdp); - page_table_check_pmd_clear(mm, address, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index a1015fc4d045..51f2274c0a20 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -160,8 +160,7 @@ void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) } EXPORT_SYMBOL(__page_table_check_pte_clear); -void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, - pmd_t pmd) +void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (&init_mm == mm) return; @@ -204,7 +203,7 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pmd_clear(mm, addr, *pmdp); + __page_table_check_pmd_clear(mm, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, pmd_write(pmd)); -- cgit v1.2.3 From 1066293d426d3000793c3c3b4276ef38b63ada4a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:34 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pte_set Remove unused addr in __page_table_check_pte_set and page_table_check_pte_set. Link: https://lkml.kernel.org/r/20230713172636.1705415-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 ++++------- mm/page_table_check.c | 3 +-- 5 files changed, 8 insertions(+), 12 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6e3387ec6013..f0a8dcbca04a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -348,7 +348,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, addr, ptep, pte); + page_table_check_pte_set(mm, ptep, pte); return __set_pte_at(mm, addr, ptep, pte); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 388c3af8a9f9..90063afe8d36 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -499,7 +499,7 @@ static inline void __set_pte_at(struct mm_struct *mm, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - page_table_check_pte_set(mm, addr, ptep, pteval); + page_table_check_pte_set(mm, ptep, pteval); __set_pte_at(mm, addr, ptep, pteval); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f07c610c3458..d14f0d92f04b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1023,7 +1023,7 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, addr, ptep, pte); + page_table_check_pte_set(mm, ptep, pte); set_pte(ptep, pte); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 5c9dc848a1bc..63ebd9fcf28b 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -17,8 +17,7 @@ void __page_table_check_zero(struct page *page, unsigned int 
order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); -void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); +void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, @@ -67,14 +66,13 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) __page_table_check_pud_clear(mm, pud); } -static inline void page_table_check_pte_set(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, +static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_set(mm, addr, ptep, pte); + __page_table_check_pte_set(mm, ptep, pte); } static inline void page_table_check_pmd_set(struct mm_struct *mm, @@ -129,8 +127,7 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } -static inline void page_table_check_pte_set(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, +static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) { } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 2643135bf45c..fc20ddc3a63e 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -182,8 +182,7 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } EXPORT_SYMBOL(__page_table_check_pud_clear); -void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) { if (&init_mm == mm) return; -- cgit v1.2.3 From a3b837130b5865521fa8662aceaa6ebc8d29389a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:35 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_set Remove unused addr in __page_table_check_pmd_set and page_table_check_pmd_set. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-8-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 4 ++-- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- include/linux/page_table_check.h | 11 ++++------- mm/page_table_check.c | 3 +-- 5 files changed, 11 insertions(+), 15 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f0a8dcbca04a..1fbf8d3f42b1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -524,7 +524,7 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); } @@ -976,7 +976,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); } #endif diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 90063afe8d36..a30658b2611b 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -687,7 +687,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); } @@ -758,7 +758,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd))); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d14f0d92f04b..9cc26cb0bc9f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1030,7 +1030,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); set_pmd(pmdp, pmd); } @@ -1167,7 +1167,7 @@ static inline int pud_write(pud_t pud) static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 63ebd9fcf28b..dd58dfb0e643 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -18,8 +18,7 @@ void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t 
pte); -void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd); +void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, @@ -75,14 +74,13 @@ static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, __page_table_check_pte_set(mm, ptep, pte); } -static inline void page_table_check_pmd_set(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp, +static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmd_set(mm, addr, pmdp, pmd); + __page_table_check_pmd_set(mm, pmdp, pmd); } static inline void page_table_check_pud_set(struct mm_struct *mm, @@ -132,8 +130,7 @@ static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, { } -static inline void page_table_check_pmd_set(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp, +static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index fc20ddc3a63e..033956704a64 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -195,8 +195,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) } EXPORT_SYMBOL(__page_table_check_pte_set); -void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) +void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (&init_mm == mm) return; -- cgit v1.2.3 From 6d144436d954311f2dbacb5bf7b084042448d83e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:36 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pud_set Remove unused addr in __page_table_check_pud_set and page_table_check_pud_set. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-9-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 ++++------- mm/page_table_check.c | 3 +-- 5 files changed, 8 insertions(+), 12 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 1fbf8d3f42b1..fe4b913589ee 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -531,7 +531,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index a30658b2611b..44377f0d7c35 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -694,7 +694,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9cc26cb0bc9f..ada1bbf12961 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1037,7 +1037,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); native_set_pud(pudp, pud); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index dd58dfb0e643..7f6b9bf926c5 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -19,8 +19,7 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); -void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud); +void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd); @@ -83,14 +82,13 @@ static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, __page_table_check_pmd_set(mm, pmdp, pmd); } -static inline void page_table_check_pud_set(struct mm_struct *mm, - unsigned long addr, pud_t *pudp, +static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pud_set(mm, addr, pudp, pud); + __page_table_check_pud_set(mm, pudp, pud); } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, @@ -135,8 +133,7 @@ static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, { } -static inline void page_table_check_pud_set(struct mm_struct *mm, 
- unsigned long addr, pud_t *pudp, +static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 033956704a64..84c8163984e5 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -208,8 +208,7 @@ void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) } EXPORT_SYMBOL(__page_table_check_pmd_set); -void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud) +void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (&init_mm == mm) return; -- cgit v1.2.3 From 8f03d74f716313892908819a389ff9ede483c3a5 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:19 +0800 Subject: arm64: mm: add wrapper function ioremap_prot() Since the hook functions ioremap_allowed() and iounmap_allowed() will be obsoleted, add wrapper function ioremap_prot() to contain the specific handling in addition to the generic_ioremap_prot() invocation. Link: https://lkml.kernel.org/r/20230706154520.11257-19-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Acked-by: Catalin Marinas Cc: Catalin Marinas Cc: Will Deacon Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm64/include/asm/io.h | 3 +-- arch/arm64/mm/ioremap.c | 10 ++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 51d92abf945e..3b694511b98f 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -139,8 +139,7 @@ extern void __memset_io(volatile void __iomem *, int, size_t); * I/O memory mapping functions. */ -bool ioremap_allowed(phys_addr_t phys_addr, size_t size, unsigned long prot); -#define ioremap_allowed ioremap_allowed +#define ioremap_prot ioremap_prot #define _PAGE_IOREMAP PROT_DEVICE_nGnRE diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index c5af103d4ad4..269f2f63ab7d 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -3,20 +3,22 @@ #include #include -bool ioremap_allowed(phys_addr_t phys_addr, size_t size, unsigned long prot) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { unsigned long last_addr = phys_addr + size - 1; /* Don't allow outside PHYS_MASK */ if (last_addr & ~PHYS_MASK) - return false; /* Don't allow RAM to be mapped.
*/ if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr)))) - return false; + return NULL; - return true; + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } +EXPORT_SYMBOL(ioremap_prot); /* * Must be called after early_fixmap_init -- cgit v1.2.3 From 43b3dfdd04553171488cb11d46d21948b6b90e27 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 17 Jul 2023 21:10:04 +0800 Subject: arm64: support batched/deferred tlb shootdown during page reclamation/migration On x86, batched and deferred tlb shootdown has led to a 90% performance increase in tlb shootdown. On arm64, HW can do tlb shootdown without software IPI, but sync tlbi is still quite expensive. Even running the simplest program which requires swapout can prove this is true: #include <sys/mman.h> #include <string.h> #include <stdio.h> #include <unistd.h> int main() { #define SIZE (1 * 1024 * 1024) volatile unsigned char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); memset(p, 0x88, SIZE); for (int k = 0; k < 10000; k++) { /* swap in */ for (int i = 0; i < SIZE; i += 4096) { (void)p[i]; } /* swap out */ madvise(p, SIZE, MADV_PAGEOUT); } } Perf result on Snapdragon 888 with 8 cores, using zRAM as the swap block device. ~ # perf record taskset -c 4 ./a.out [ perf record: Woken up 10 times to write data ] [ perf record: Captured and wrote 2.297 MB perf.data (60084 samples) ] ~ # perf report # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 60K of event 'cycles' # Event count (approx.): 35706225414 # # Overhead Command Shared Object Symbol # ........ ....... ................. ...... # 21.07% a.out [kernel.kallsyms] [k] _raw_spin_unlock_irq 8.23% a.out [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 6.67% a.out [kernel.kallsyms] [k] filemap_map_pages 6.16% a.out [kernel.kallsyms] [k] __zram_bvec_write 5.36% a.out [kernel.kallsyms] [k] ptep_clear_flush 3.71% a.out [kernel.kallsyms] [k] _raw_spin_lock 3.49% a.out [kernel.kallsyms] [k] memset64 1.63% a.out [kernel.kallsyms] [k] clear_page 1.42% a.out [kernel.kallsyms] [k] _raw_spin_unlock 1.26% a.out [kernel.kallsyms] [k] mod_zone_state.llvm.8525150236079521930 1.23% a.out [kernel.kallsyms] [k] xas_load 1.15% a.out [kernel.kallsyms] [k] zram_slot_lock ptep_clear_flush() takes 5.36% of CPU time in the micro-benchmark swapping in/out a page mapped by only one process. If the page is mapped by multiple processes, typically more than 100 on a phone, the overhead would be much higher, as we have to run the tlb flush 100 times for one single page. Plus, tlb flush overhead will increase with the number of CPU cores due to the bad scalability of tlb shootdown in HW, so those ARM64 servers should expect much higher overhead. Further perf annotate shows that 95% of the cpu time of ptep_clear_flush() is actually spent in the final dsb() waiting for the completion of the tlb flush. This provides us a very good chance to leverage the existing batched tlb support in the kernel: the minimal modification is to send only async tlbi in the first stage and issue the dsb when we have to sync in the second stage. With the above micro-benchmark, the elapsed time to finish the program decreases by around 5%. Typical elapsed time w/o patch: ~ # time taskset -c 4 ./a.out 0.21user 14.34system 0:14.69elapsed w/ patch: ~ # time taskset -c 4 ./a.out 0.22user 13.45system 0:13.80elapsed Also tested with the benchmark in this commit on a Kunpeng920 arm64 server; an improvement of around 12.5% was observed with the command `time ./swap_bench`. w/o w/ real 0m13.460s 0m11.771s user 0m0.248s 0m0.279s sys 0m12.039s 0m11.458s Originally a 16.99% overhead of ptep_clear_flush() was noticed, which has been eliminated by this patch: [root@localhost yang]# perf record -- ./swap_bench && perf report [...] 16.99% swap_bench [kernel.kallsyms] [k] ptep_clear_flush It was tested on 4-, 8- and 128-CPU platforms and shown to be beneficial on large systems, but it may not bring improvement on small systems such as a 4-CPU platform. This patch also improves the performance of page migration. Migrating the pages of pmbench between node 0 and node 1 100 times for 1G of memory, it decreases the time used by around 20% (from 18.338318910 sec to 13.981866350 sec) by saving the time spent in ptep_clear_flush(). Link: https://lkml.kernel.org/r/20230717131004.12662-5-yangyicong@huawei.com Tested-by: Yicong Yang Tested-by: Xin Hao Tested-by: Punit Agrawal Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Kefeng Wang Reviewed-by: Xin Hao Reviewed-by: Anshuman Khandual Reviewed-by: Catalin Marinas Cc: Anshuman Khandual Cc: Jonathan Corbet Cc: Nadav Amit Cc: Mel Gorman Cc: Arnd Bergmann Cc: Barry Song Cc: Darren Hart Cc: Jonathan Cameron Cc: lipeifeng Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Zeng Tao Signed-off-by: Andrew Morton --- Documentation/features/vm/TLB/arch-support.txt | 2 +- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/tlbbatch.h | 12 +++++++ arch/arm64/include/asm/tlbflush.h | 44 ++++++++++++++++++++++++-- 4 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 arch/arm64/include/asm/tlbbatch.h (limited to 'arch/arm64') diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt index 7f049c251a79..76208db88f3b 100644 --- a/Documentation/features/vm/TLB/arch-support.txt +++ b/Documentation/features/vm/TLB/arch-support.txt @@ -9,7 +9,7 @@ | alpha: | TODO | | arc: | TODO | | arm: | TODO | - | arm64: | N/A | + | arm64: | ok | | csky: | TODO | | hexagon: | TODO | | ia64: | TODO | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a2511b30d0f6..751d8c8821db 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -96,6 +96,7 @@ config ARM64 select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_PAGE_TABLE_CHECK select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT diff --git a/arch/arm64/include/asm/tlbbatch.h b/arch/arm64/include/asm/tlbbatch.h new file mode 100644 index 000000000000..fedb0b87b8db --- /dev/null +++ b/arch/arm64/include/asm/tlbbatch.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ARCH_ARM64_TLBBATCH_H +#define _ARCH_ARM64_TLBBATCH_H + +struct arch_tlbflush_unmap_batch { + /* + * For arm64, HW can do tlb shootdown, so we don't + * need to record cpumask for sending IPI + */ +}; + +#endif /* _ARCH_ARM64_TLBBATCH_H */ diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 412a3b9a3c25..3456866c6a1d 100644 ---
a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -254,17 +254,23 @@ static inline void flush_tlb_mm(struct mm_struct *mm) dsb(ish); } -static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, - unsigned long uaddr) +static inline void __flush_tlb_page_nosync(struct mm_struct *mm, + unsigned long uaddr) { unsigned long addr; dsb(ishst); - addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); + addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); __tlbi_user(vale1is, addr); } +static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, + unsigned long uaddr) +{ + return __flush_tlb_page_nosync(vma->vm_mm, uaddr); +} + static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) { @@ -272,6 +278,38 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, dsb(ish); } +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) +{ +#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI + /* + * TLB flush deferral is not required on systems which are affected by + * ARM64_WORKAROUND_REPEAT_TLBI, as __tlbi()/__tlbi_user() implementation + * will have two consecutive TLBI instructions with a dsb(ish) in between + * defeating the purpose (i.e save overall 'dsb ish' cost). + */ + if (unlikely(cpus_have_const_cap(ARM64_WORKAROUND_REPEAT_TLBI))) + return false; +#endif + return true; +} + +static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm, + unsigned long uaddr) +{ + __flush_tlb_page_nosync(mm, uaddr); +} + +static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) +{ + dsb(ish); +} + +static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) +{ + dsb(ish); +} + /* * This is meant to avoid soft lock-ups on large TLB flushing ranges and not * necessarily a performance improvement. -- cgit v1.2.3 From 6bbd42e2df8f90b022fa0b82986d58ecb30b9dcc Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:05 +1000 Subject: mmu_notifiers: call invalidate_range() when invalidating TLBs invalidate_range() is going to become an architecture-specific mmu notifier used to keep the TLB of secondary MMUs such as an IOMMU in sync with the CPU page tables. Currently it is called from separate code paths to the main CPU TLB invalidations. This can lead to a secondary TLB not getting invalidated when required and makes it hard to reason about when exactly the secondary TLB is invalidated. To fix this, move the notifier call to the architecture-specific TLB maintenance functions for architectures that have secondary MMUs requiring explicit software invalidations. This fixes an SMMU bug on ARM64. On ARM64, PTE permission upgrades require a TLB invalidation. This invalidation is done by the architecture-specific ptep_set_access_flags() which calls flush_tlb_page() if required. However this doesn't call the notifier, resulting in infinite faults being generated by devices using the SMMU if it has previously cached a read-only PTE in its TLB. Moving the invalidations into the TLB invalidation functions ensures all invalidations happen at the same time as the CPU invalidation. The architecture-specific flush_tlb_all() routines do not call the notifier as none of the IOMMUs require this.
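For context, a secondary-TLB user consumes these flushes through the invalidate_range() mmu notifier callback. A minimal sketch of such a consumer follows; my_dev, its notifier field and my_dev_flush_iotlb() are hypothetical names used purely for illustration, while the callback signature, mmu_notifier_ops and container_of() usage are the real notifier API at this point in the series:

	/* Hypothetical driver for a device with its own TLB (not a real API) */
	static void my_dev_invalidate_range(struct mmu_notifier *mn,
					    struct mm_struct *mm,
					    unsigned long start, unsigned long end)
	{
		struct my_dev *dev = container_of(mn, struct my_dev, notifier);

		/* Drop whatever the device TLB has cached for [start, end) */
		my_dev_flush_iotlb(dev, start, end);	/* hypothetical helper */
	}

	static const struct mmu_notifier_ops my_dev_mmu_ops = {
		.invalidate_range = my_dev_invalidate_range,
	};

With this patch, that callback fires from the architecture TLB-maintenance functions themselves, so the device TLB can no longer be left stale after a CPU TLB invalidation such as the one issued by ptep_set_access_flags().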
Link: https://lkml.kernel.org/r/0287ae32d91393a582897d6c4db6f7456b1001f2.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Suggested-by: Jason Gunthorpe Tested-by: SeongJae Park Acked-by: Catalin Marinas Reviewed-by: Jason Gunthorpe Tested-by: Luis Chamberlain Cc: Andrew Donnellan Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 5 +++++ arch/powerpc/include/asm/book3s/64/tlbflush.h | 1 + arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 1 + arch/powerpc/mm/book3s64/radix_tlb.c | 4 ++++ arch/x86/include/asm/tlbflush.h | 2 ++ arch/x86/mm/tlb.c | 2 ++ include/asm-generic/tlb.h | 1 - 7 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3456866c6a1d..a99349d10c0e 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -252,6 +253,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); dsb(ish); + mmu_notifier_invalidate_range(mm, 0, -1UL); } static inline void __flush_tlb_page_nosync(struct mm_struct *mm, @@ -263,6 +265,8 @@ static inline void __flush_tlb_page_nosync(struct mm_struct *mm, addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); __tlbi_user(vale1is, addr); + mmu_notifier_invalidate_range(mm, uaddr & PAGE_MASK, + (uaddr & PAGE_MASK) + PAGE_SIZE); } static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, @@ -396,6 +400,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, scale++; } dsb(ish); + mmu_notifier_invalidate_range(vma->vm_mm, start, end); } static inline void flush_tlb_range(struct vm_area_struct *vma, diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 0d0c1447ecf0..dca0477b0709 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -5,6 +5,7 @@ #define MMU_NO_CONTEXT ~0UL #include +#include #include #include diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index 5e3195568525..f3fb49fd32fe 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -39,6 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); else radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); + mmu_notifier_invalidate_range(vma->vm_mm, start, end); } void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 0bd4866d9824..4d44902a4962 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -987,6 +987,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } } preempt_enable(); + mmu_notifier_invalidate_range(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); @@ -1020,6 +1021,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } preempt_enable(); + mmu_notifier_invalidate_range(mm, 0, -1UL); } void 
radix__flush_all_mm(struct mm_struct *mm) @@ -1228,6 +1230,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } out: preempt_enable(); + mmu_notifier_invalidate_range(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -1392,6 +1395,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, } out: preempt_enable(); + mmu_notifier_invalidate_range(mm, start, end); } void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 837e4a50281a..0a5432364c5a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -3,6 +3,7 @@ #define _ASM_X86_TLBFLUSH_H #include +#include #include #include @@ -282,6 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); + mmu_notifier_invalidate_range(mm, 0, -1UL); } static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 267acf27480a..93b2f81f09da 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -1036,6 +1037,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_flush_tlb_info(); put_cpu(); + mmu_notifier_invalidate_range(mm, start, end); } diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index b46617207c93..bc32a2284c56 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -456,7 +456,6 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) return; tlb_flush(tlb); - mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); __tlb_reset_range(tlb); } -- cgit v1.2.3 From 1af5a8109904b7f00828e7f9f63f5695b42f8215 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:07 +1000 Subject: mmu_notifiers: rename invalidate_range notifier There are two main use cases for mmu notifiers. One is by KVM, which uses mmu_notifier_invalidate_range_start()/end() to manage a software TLB. The other is to manage hardware TLBs, which need to use the invalidate_range() callback because HW can establish new TLB entries at any time. Hence using start/end() can lead to memory corruption, as these callbacks happen too soon/late during page unmap. mmu notifier users should therefore use either the start()/end() callbacks or the invalidate_range() callbacks. To make this usage clearer, rename the invalidate_range() callback to arch_invalidate_secondary_tlbs() and update the documentation.
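After the rename, the hypothetical consumer sketched earlier would hook the renamed callback (its handler renamed to my_dev_invalidate_secondary_tlbs() with an unchanged signature), and a subscription must now pick exactly one model: __mmu_notifier_register() rejects ops that implement both arch_invalidate_secondary_tlbs() and invalidate_range_start()/end(). A sketch of the registration side, with my_dev and my_dev_bind_mm() again purely illustrative:

	static const struct mmu_notifier_ops my_dev_mmu_ops = {
		.arch_invalidate_secondary_tlbs = my_dev_invalidate_secondary_tlbs,
	};

	static int my_dev_bind_mm(struct my_dev *dev, struct mm_struct *mm)
	{
		dev->notifier.ops = &my_dev_mmu_ops;
		/* Fails with -EINVAL if the ops mixed both callback models */
		return mmu_notifier_register(&dev->notifier, mm);
	}

Since arch_invalidate_secondary_tlbs() is called while the ptl spin-lock is held, the handler (and the device flush it performs) must not sleep.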
Link: https://lkml.kernel.org/r/6f77248cd25545c8020a54b4e567e8b72be4dca1.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Suggested-by: Jason Gunthorpe Acked-by: Catalin Marinas Reviewed-by: Jason Gunthorpe Cc: Andrew Donnellan Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: SeongJae Park Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 6 ++-- arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 2 +- arch/powerpc/mm/book3s64/radix_tlb.c | 8 ++--- arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/mm/tlb.c | 2 +- drivers/iommu/amd/iommu_v2.c | 10 +++--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 13 +++---- drivers/iommu/intel/svm.c | 8 ++--- drivers/misc/ocxl/link.c | 8 ++--- include/linux/mmu_notifier.h | 48 ++++++++++++------------- mm/huge_memory.c | 4 +-- mm/hugetlb.c | 7 ++-- mm/mmu_notifier.c | 21 ++++++++--- 13 files changed, 76 insertions(+), 63 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index a99349d10c0e..84a05a0bd2b6 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -253,7 +253,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); dsb(ish); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void __flush_tlb_page_nosync(struct mm_struct *mm, @@ -265,7 +265,7 @@ static inline void __flush_tlb_page_nosync(struct mm_struct *mm, addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); __tlbi_user(vale1is, addr); - mmu_notifier_invalidate_range(mm, uaddr & PAGE_MASK, + mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK, (uaddr & PAGE_MASK) + PAGE_SIZE); } @@ -400,7 +400,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, scale++; } dsb(ish); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } static inline void flush_tlb_range(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index f3fb49fd32fe..17075c78d4bc 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -39,7 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); else radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 4d44902a4962..06e647ef19d1 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -987,7 +987,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } } preempt_enable(); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); @@ -1021,7 +1021,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } 
preempt_enable(); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } void radix__flush_all_mm(struct mm_struct *mm) @@ -1230,7 +1230,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } out: preempt_enable(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -1395,7 +1395,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, } out: preempt_enable(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 0a5432364c5a..6ab42caaa67a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -283,7 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 93b2f81f09da..2d253919b3e8 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1037,7 +1037,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_flush_tlb_info(); put_cpu(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c index 261352a23271..2596466cd5a6 100644 --- a/drivers/iommu/amd/iommu_v2.c +++ b/drivers/iommu/amd/iommu_v2.c @@ -355,9 +355,9 @@ static struct pasid_state *mn_to_state(struct mmu_notifier *mn) return container_of(mn, struct pasid_state, mn); } -static void mn_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void mn_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) { struct pasid_state *pasid_state; struct device_state *dev_state; @@ -391,8 +391,8 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops iommu_mn = { - .release = mn_release, - .invalidate_range = mn_invalidate_range, + .release = mn_release, + .arch_invalidate_secondary_tlbs = mn_arch_invalidate_secondary_tlbs, }; static void set_pri_tag_status(struct pasid_state *pasid_state, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 2a19784b698e..dbc812a0e57e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -186,9 +186,10 @@ static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd) } } -static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) { struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn); struct arm_smmu_domain *smmu_domain = smmu_mn->domain; @@ -247,9 +248,9 @@ static void arm_smmu_mmu_notifier_free(struct 
mmu_notifier *mn) } static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = { - .invalidate_range = arm_smmu_mm_invalidate_range, - .release = arm_smmu_mm_release, - .free_notifier = arm_smmu_mmu_notifier_free, + .arch_invalidate_secondary_tlbs = arm_smmu_mm_arch_invalidate_secondary_tlbs, + .release = arm_smmu_mm_release, + .free_notifier = arm_smmu_mmu_notifier_free, }; /* Allocate or get existing MMU notifier for this {domain, mm} pair */ diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index e95b339e9cdc..8f6d68006ab6 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -219,9 +219,9 @@ static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address, } /* Pages have been freed at this point */ -static void intel_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void intel_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) { struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); @@ -256,7 +256,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops intel_mmuops = { .release = intel_mm_release, - .invalidate_range = intel_invalidate_range, + .arch_invalidate_secondary_tlbs = intel_arch_invalidate_secondary_tlbs, }; static DEFINE_MUTEX(pasid_mutex); diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index 4cf4c55a5f00..c06c699c0e7b 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -491,9 +491,9 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle) } EXPORT_SYMBOL_GPL(ocxl_link_release); -static void invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) { struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier); struct ocxl_link *link = pe_data->link; @@ -509,7 +509,7 @@ static void invalidate_range(struct mmu_notifier *mn, } static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = { - .invalidate_range = invalidate_range, + .arch_invalidate_secondary_tlbs = arch_invalidate_secondary_tlbs, }; static u64 calculate_cfg_state(bool kernel) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index f2e9edc6aa43..6e3c857606f1 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -187,27 +187,27 @@ struct mmu_notifier_ops { const struct mmu_notifier_range *range); /* - * invalidate_range() is either called between - * invalidate_range_start() and invalidate_range_end() when the - * VM has to free pages that where unmapped, but before the - * pages are actually freed, or outside of _start()/_end() when - * a (remote) TLB is necessary. + * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB + * which shares page-tables with the CPU. The + * invalidate_range_start()/end() callbacks should not be implemented as + * invalidate_secondary_tlbs() already catches the points in time when + * an external TLB needs to be flushed. * - * If invalidate_range() is used to manage a non-CPU TLB with - * shared page-tables, it not necessary to implement the - * invalidate_range_start()/end() notifiers, as - * invalidate_range() already catches the points in time when an - * external TLB range needs to be flushed. 
For more in depth - * discussion on this see Documentation/mm/mmu_notifier.rst + * This requires arch_invalidate_secondary_tlbs() to be called while + * holding the ptl spin-lock and therefore this callback is not allowed + * to sleep. * - * Note that this function might be called with just a sub-range - * of what was passed to invalidate_range_start()/end(), if - * called between those functions. + * This is called by architecture code whenever invalidating a TLB + * entry. It is assumed that any secondary TLB has the same rules for + * when invalidations are required. If this is not the case architecture + * code will need to call this explicitly when required for secondary + * TLB invalidation. */ - void (*invalidate_range)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + void (*arch_invalidate_secondary_tlbs)( + struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long start, + unsigned long end); /* * These callbacks are used with the get/put interface to manage the @@ -396,8 +396,8 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); -extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end); +extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end); extern bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); @@ -483,11 +483,11 @@ mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) __mmu_notifier_invalidate_range_end(range); } -static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range(mm, start, end); + __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) @@ -664,7 +664,7 @@ void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } -static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, +static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3ece117de898..e0420de0e2e0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2120,8 +2120,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling - * mmu_notifier_invalidate_range() see comments below inside - * __split_huge_pmd() ? + * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below + * inside __split_huge_pmd() ? 
* * We are going from a zero huge page write protected to zero * small page also write protected so it does not seems useful diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4672752b0b17..5ef7bccda50c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6649,8 +6649,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, else flush_hugetlb_tlb_range(vma, start, end); /* - * No need to call mmu_notifier_invalidate_range() we are downgrading - * page table protection not changing it to point to a new page. + * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are + * downgrading page table protection not changing it to point to a new + * page. * * See Documentation/mm/mmu_notifier.rst */ @@ -7294,7 +7295,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); /* - * No need to call mmu_notifier_invalidate_range(), see + * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 453a156d93c0..ec3b068cbbe6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -585,8 +585,8 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) lock_map_release(&__mmu_notifier_invalidate_range_start_map); } -void __mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end) +void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct mmu_notifier *subscription; int id; @@ -595,9 +595,10 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, hlist_for_each_entry_rcu(subscription, &mm->notifier_subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { - if (subscription->ops->invalidate_range) - subscription->ops->invalidate_range(subscription, mm, - start, end); + if (subscription->ops->arch_invalidate_secondary_tlbs) + subscription->ops->arch_invalidate_secondary_tlbs( + subscription, mm, + start, end); } srcu_read_unlock(&srcu, id); } @@ -616,6 +617,16 @@ int __mmu_notifier_register(struct mmu_notifier *subscription, mmap_assert_write_locked(mm); BUG_ON(atomic_read(&mm->mm_users) <= 0); + /* + * Subsystems should only register for invalidate_secondary_tlbs() or + * invalidate_range_start()/end() callbacks, not both. + */ + if (WARN_ON_ONCE(subscription && + (subscription->ops->arch_invalidate_secondary_tlbs && + (subscription->ops->invalidate_range_start || + subscription->ops->invalidate_range_end)))) + return -EINVAL; + if (!mm->notifier_subscriptions) { /* * kmalloc cannot be called under mm_take_all_locks(), but we -- cgit v1.2.3 From 284e05920498788c5df1a7dd6424adb426498e1c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:01 +0100 Subject: mm: remove CONFIG_PER_VMA_LOCK ifdefs Patch series "Handle most file-backed faults under the VMA lock", v3. This patchset adds the ability to handle page faults on parts of files which are already in the page cache without taking the mmap lock. This patch (of 10): Provide lock_vma_under_rcu() when CONFIG_PER_VMA_LOCK is not defined to eliminate ifdefs in the users. 
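(An illustrative aside, not part of the patch: a minimal sketch of the ifdef-free caller pattern this stub enables. The helper name sketch_do_page_fault and the simplified error handling are assumptions made for the sketch; lock_vma_under_rcu(), handle_mm_fault(), vma_end_read() and lock_mm_and_find_vma() are the real interfaces used by the fault handlers changed below.)

	static vm_fault_t sketch_do_page_fault(struct mm_struct *mm, unsigned long addr,
					       unsigned int mm_flags, struct pt_regs *regs)
	{
		struct vm_area_struct *vma;
		vm_fault_t fault;

		/* When CONFIG_PER_VMA_LOCK is off, the stub below always
		 * returns NULL, so this branch compiles away and the handler
		 * falls through to the mmap_lock path with no #ifdef. */
		vma = lock_vma_under_rcu(mm, addr);
		if (!vma)
			goto lock_mmap;
		fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
		vma_end_read(vma);
		if (!(fault & VM_FAULT_RETRY))
			return fault;		/* handled under the VMA lock */
	lock_mmap:
		/* conventional path, as in the unchanged remainder of the handlers */
		vma = lock_mm_and_find_vma(mm, addr, regs);
		if (!vma)
			return VM_FAULT_SIGSEGV;	/* simplified for the sketch */
		fault = handle_mm_fault(vma, addr, mm_flags, regs);
		mmap_read_unlock(mm);
		return fault;
	}

Because the stub unconditionally returns NULL, the per-VMA branch is dead code on !CONFIG_PER_VMA_LOCK kernels, which is exactly why the #ifdef/#endif pairs removed below are no longer needed.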
Link: https://lkml.kernel.org/r/20230724185410.1124082-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230724185410.1124082-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Punit Agrawal Cc: Arjun Roy Cc: Eric Dumazet Signed-off-by: Andrew Morton --- arch/arm64/mm/fault.c | 2 -- arch/powerpc/mm/fault.c | 4 ---- arch/riscv/mm/fault.c | 4 ---- arch/s390/mm/fault.c | 2 -- arch/x86/mm/fault.c | 4 ---- include/linux/mm.h | 6 ++++++ 6 files changed, 6 insertions(+), 16 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3fe516b32577..103fcbdc6552 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -587,7 +587,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); -#ifdef CONFIG_PER_VMA_LOCK if (!(mm_flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -615,7 +614,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, return 0; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, addr, regs); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5bfdf6ecfa96..fafce6bdeff0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -469,7 +469,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, if (is_exec) flags |= FAULT_FLAG_INSTRUCTION; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -501,7 +500,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, return user_mode(regs) ? 0 : SIGBUS; lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ /* When running in the kernel we expect faults to occur only to * addresses in user space. 
All other faults represent errors in the @@ -551,9 +549,7 @@ retry: mmap_read_unlock(current->mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (unlikely(fault & VM_FAULT_ERROR)) return mm_fault_error(regs, address, fault); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 6ea2cce4cc17..046732fcb48c 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -283,7 +283,6 @@ void handle_page_fault(struct pt_regs *regs) flags |= FAULT_FLAG_WRITE; else if (cause == EXC_INST_PAGE_FAULT) flags |= FAULT_FLAG_INSTRUCTION; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -311,7 +310,6 @@ void handle_page_fault(struct pt_regs *regs) return; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, addr, regs); @@ -368,9 +366,7 @@ retry: mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (unlikely(fault & VM_FAULT_ERROR)) { tsk->thread.bad_cause = cause; mm_fault_error(regs, addr, fault); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 2f123429a291..6f6b9881e55e 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -407,7 +407,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) access = VM_WRITE; if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); @@ -432,7 +431,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ mmap_read_lock(mm); gmap = NULL; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e8711b2cafaf..787da09d24f3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1328,7 +1328,6 @@ void do_user_addr_fault(struct pt_regs *regs, } #endif -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -1358,7 +1357,6 @@ void do_user_addr_fault(struct pt_regs *regs, return; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, address, regs); @@ -1418,9 +1416,7 @@ retry: } mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (likely(!(fault & VM_FAULT_ERROR))) return; diff --git a/include/linux/mm.h b/include/linux/mm.h index ded514ee2588..21299a0cfbca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -742,6 +742,12 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} +static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address) +{ + return NULL; +} + #endif /* CONFIG_PER_VMA_LOCK */ /* -- cgit v1.2.3 From 6a718bd2ed4a589e7e50e7d028d24e087139dd03 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Tue, 1 Aug 2023 20:42:03 +0800 Subject: arm64: tlbflush: add some comments for TLB batched flushing Add comments for arch_flush_tlb_batched_pending() and arch_tlbbatch_flush() to illustrate why only a DSB is needed. 
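(An illustrative aside, not part of the patch: a minimal sketch of the sequence the two new comments describe. The locals batch, start and end are assumptions made for the sketch; arch_tlbbatch_add_pending() and arch_tlbbatch_flush() are the interfaces being documented.)

	struct arch_tlbflush_unmap_batch batch = {};
	unsigned long uaddr;

	/* one TLBI is issued per page, none of them followed by a DSB ... */
	for (uaddr = start; uaddr < end; uaddr += PAGE_SIZE)
		arch_tlbbatch_add_pending(&batch, mm, uaddr);

	/* ... then a single DSB waits for all of them at once, instead of
	 * paying for a TLBI;DSB pair on every page */
	arch_tlbbatch_flush(&batch);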
Link: https://lkml.kernel.org/r/20230801124203.62164-1-yangyicong@huawei.com Cc: Catalin Marinas Signed-off-by: Yicong Yang Reviewed-by: Alistair Popple Reviewed-by: Catalin Marinas Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 84a05a0bd2b6..55b50e1d4a84 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -304,11 +304,26 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b __flush_tlb_page_nosync(mm, uaddr); } +/* + * If mprotect/munmap/etc occurs during TLB batched flushing, we need to + * synchronise all the TLBI issued with a DSB to avoid the race mentioned in + * flush_tlb_batched_pending(). + */ static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) { dsb(ish); } +/* + * To support TLB batched flush for multiple pages unmapping, we only send + * the TLBI for each page in arch_tlbbatch_add_pending() and wait for the + * completion at the end in arch_tlbbatch_flush(). Since we've already issued + * TLBI for each page so only a DSB is needed to synchronise its effect on the + * other CPUs. + * + * This will save the time waiting on DSB comparing issuing a TLBI;DSB sequence + * for each page. + */ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { dsb(ish); -- cgit v1.2.3 From 9cf6a060f95578c8147bdacdf55a1eaaa182ce49 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 2 Aug 2023 09:27:31 +0800 Subject: arm64: hugetlb: enable __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE It is better to use the huge page size instead of PAGE_SIZE for the stride when flushing hugepages, which reduces the number of loop iterations in __flush_tlb_range(). Let's support the arch-specific flush_hugetlb_tlb_range(), which is used in hugetlb_unshare_all_pmds(), move_hugetlb_page_tables() and hugetlb_change_protection() for now. Note: hugepages based on the contiguous bit have to be invalidated individually, since the contiguous PTE bit is just a hint; the hardware may or may not take it into account. Link: https://lkml.kernel.org/r/20230802012731.62512-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Muchun Song Reviewed-by: Catalin Marinas Cc: Barry Song <21cnbao@gmail.com> Cc: Joel Fernandes (Google) Cc: Kalesh Singh Cc: "Kirill A.
Shutemov" Cc: Mike Kravetz Cc: Mina Almasry Cc: Will Deacon Cc: William Kucharski Signed-off-by: Andrew Morton --- arch/arm64/include/asm/hugetlb.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 6a4a1ab8eb23..a91d6219aa78 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -60,4 +60,19 @@ extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #include +#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE +static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ + unsigned long stride = huge_page_size(hstate_vma(vma)); + + if (stride == PMD_SIZE) + __flush_tlb_range(vma, start, end, stride, false, 2); + else if (stride == PUD_SIZE) + __flush_tlb_range(vma, start, end, stride, false, 1); + else + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0); +} + #endif /* __ASM_HUGETLB_H */ -- cgit v1.2.3 From 04d5ea46a15149a12f79c686b6a1ffc9c3233272 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:44:56 +0530 Subject: mm/memory_hotplug: simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig Patch series "Add support for memmap on memory feature on ppc64", v8. This patch series update memmap on memory feature to fall back to memmap allocation outside the memory block if the alignment rules are not met. This makes the feature more useful on architectures like ppc64 where alignment rules are different with 64K page size. This patch (of 6): Instead of adding menu entry with all supported architectures, add mm/Kconfig variable and select the same from supported architectures. No functional change in this patch. Link: https://lkml.kernel.org/r/20230808091501.287660-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20230808091501.287660-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 4 +--- arch/x86/Kconfig | 4 +--- mm/Kconfig | 3 +++ 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 751d8c8821db..a7a88322bf46 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -78,6 +78,7 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION select ARCH_KEEP_MEMBLOCK + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_GNU_PROPERTY select ARCH_USE_MEMTEST @@ -349,9 +350,6 @@ config GENERIC_CSUM config GENERIC_CALIBRATE_DELAY def_bool y -config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE - def_bool y - config SMP def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 78224aa76409..d0258e92a8af 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -102,6 +102,7 @@ config X86 select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO @@ -2610,9 +2611,6 @@ config ARCH_HAS_ADD_PAGES def_bool y depends on ARCH_ENABLE_MEMORY_HOTPLUG -config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE - def_bool y - menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/mm/Kconfig 
b/mm/Kconfig index 5fe49c030961..721dc88423c7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -571,6 +571,9 @@ config MHP_MEMMAP_ON_MEMORY endif # MEMORY_HOTPLUG +config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + bool + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. -- cgit v1.2.3 From 11b4fa8b2a56c0e7b9db4fe21c134a4cba414657 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:00 -0700 Subject: arm64: convert various functions to use ptdescs As part of the conversions to replace the pgtable constructors/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Link: https://lkml.kernel.org/r/20230807230513.102486-19-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Acked-by: Catalin Marinas Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlb.h | 14 ++++++++------ arch/arm64/mm/mmu.c | 7 ++++--- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index c995d1f4594f..2c29239d05c3 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -75,18 +75,20 @@ static inline void tlb_flush(struct mmu_gather *tlb) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_pte_page_dtor(pte); - tlb_remove_table(tlb, pte); + struct ptdesc *ptdesc = page_ptdesc(pte); + + pagetable_pte_dtor(ptdesc); + tlb_remove_ptdesc(tlb, ptdesc); } #if CONFIG_PGTABLE_LEVELS > 2 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) { - struct page *page = virt_to_page(pmdp); + struct ptdesc *ptdesc = virt_to_ptdesc(pmdp); - pgtable_pmd_page_dtor(page); - tlb_remove_table(tlb, page); + pagetable_pmd_dtor(ptdesc); + tlb_remove_ptdesc(tlb, ptdesc); } #endif @@ -94,7 +96,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp, unsigned long addr) { - tlb_remove_table(tlb, virt_to_page(pudp)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pudp)); } #endif diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 95d360805f8a..47781bec6171 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -426,6 +426,7 @@ static phys_addr_t __pgd_pgtable_alloc(int shift) static phys_addr_t pgd_pgtable_alloc(int shift) { phys_addr_t pa = __pgd_pgtable_alloc(shift); + struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa)); /* * Call proper page table ctor in case later we need to @@ -433,12 +434,12 @@ static phys_addr_t pgd_pgtable_alloc(int shift) * this pre-allocated page table. * * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is - * folded, and if so pgtable_pmd_page_ctor() becomes nop. + * folded, and if so pagetable_pmd_ctor() becomes nop.
*/ if (shift == PAGE_SHIFT) - BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa))); + BUG_ON(!pagetable_pte_ctor(ptdesc)); else if (shift == PMD_SHIFT) - BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa))); + BUG_ON(!pagetable_pmd_ctor(ptdesc)); return pa; } -- cgit v1.2.3 From 1de8c835a936859f01a91174474a8b96d61b0400 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 13 Aug 2023 21:53:17 -0700 Subject: arm64: include asm/cacheflush.h in asm/hugetlb.h PG_dcache_clean is used in asm/hugetlb.h but defined in asm/cacheflush.h: builds rely on an accident of that being included via linux/mempolicy.h, but it is better to include it directly (like arch/sh/include/asm/hugetlb.h does). Link: https://lkml.kernel.org/r/bd77cc1b-e83b-f276-9e27-c19e7c9119aa@google.com Signed-off-by: Hugh Dickins Cc: Catalin Marinas Cc: Mike Kravetz Cc: Palmer Dabbelt Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index a91d6219aa78..f43a38ac1779 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -10,6 +10,7 @@ #ifndef __ASM_HUGETLB_H #define __ASM_HUGETLB_H +#include <asm/cacheflush.h> #include <asm/page.h> #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION -- cgit v1.2.3 From 4089eef0e6ac1a179c58304c657b3df3bb6fe509 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:54 -0700 Subject: mm: drop per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED handle_mm_fault() returning VM_FAULT_RETRY or VM_FAULT_COMPLETED means mmap_lock has been released. However, with per-VMA locks the behavior is different and the caller should still release it. To make the rules consistent for the caller, drop the per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED. Currently the only path returning VM_FAULT_RETRY under per-VMA locks is do_swap_page(), and no path returns VM_FAULT_COMPLETED for now. [willy@infradead.org: fix riscv] Link: https://lkml.kernel.org/r/CAJuCfpE6GWEx1rPBmNpUfoD5o-gNFz9-UFywzCE2PbEGBiVz7g@mail.gmail.com Link: https://lkml.kernel.org/r/20230630211957.1341547-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Peter Xu Tested-by: Conor Dooley Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R.
Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- arch/arm64/mm/fault.c | 3 ++- arch/powerpc/mm/fault.c | 3 ++- arch/riscv/mm/fault.c | 3 ++- arch/s390/mm/fault.c | 3 ++- arch/x86/mm/fault.c | 3 ++- mm/memory.c | 12 ++++++++++++ 6 files changed, 22 insertions(+), 5 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 103fcbdc6552..2e5d1e238af9 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -599,7 +599,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto lock_mmap; } fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index fafce6bdeff0..b1723094d464 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -488,7 +488,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 046732fcb48c..6115d7514972 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -296,7 +296,8 @@ void handle_page_fault(struct pt_regs *regs) } fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 6f6b9881e55e..a063774ba584 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -417,7 +417,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); if (likely(!(fault & VM_FAULT_ERROR))) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 787da09d24f3..2e861b9360c7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1340,7 +1340,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/mm/memory.c b/mm/memory.c index f9c3ad489823..b9c3780fd426 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3747,6 +3747,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->flags & FAULT_FLAG_VMA_LOCK) { ret = VM_FAULT_RETRY; + vma_end_read(vma); goto out; } @@ -5248,6 +5249,17 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, !is_cow_mapping(vma->vm_flags))) return VM_FAULT_SIGSEGV; } +#ifdef CONFIG_PER_VMA_LOCK + /* + * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of + * the assumption that lock is dropped on 
VM_FAULT_RETRY. + */ + if (WARN_ON_ONCE((*flags & + (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) == + (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT))) + return VM_FAULT_SIGSEGV; +#endif + return 0; } -- cgit v1.2.3 From a379322022c0961fe0b638cdd842d3c38eeff92c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:30 +0100 Subject: mm: convert page_table_check_pte_set() to page_table_check_ptes_set() Tell the page table check how many PTEs & PFNs we want it to check. Link: https://lkml.kernel.org/r/20230802151406.3735276-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (IBM) Acked-by: Pasha Tatashin Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 13 +++++++------ mm/page_table_check.c | 16 +++++++++------- 5 files changed, 19 insertions(+), 16 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index fe4b913589ee..445b18d7a47c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -348,7 +348,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, ptep, pte); + page_table_check_ptes_set(mm, ptep, pte, 1); return __set_pte_at(mm, addr, ptep, pte); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 44377f0d7c35..01e4aabc8898 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -499,7 +499,7 @@ static inline void __set_pte_at(struct mm_struct *mm, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - page_table_check_pte_set(mm, ptep, pteval); + page_table_check_ptes_set(mm, ptep, pteval, 1); __set_pte_at(mm, addr, ptep, pteval); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ada1bbf12961..cd0b6337d03c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1023,7 +1023,7 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, ptep, pte); + page_table_check_ptes_set(mm, ptep, pte, 1); set_pte(ptep, pte); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 7f6b9bf926c5..6722941c7cb8 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -17,7 +17,8 @@ void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); -void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); +void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, + unsigned int nr); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, @@ -64,13 +65,13 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) 
__page_table_check_pud_clear(mm, pud); } -static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, - pte_t pte) +static inline void page_table_check_ptes_set(struct mm_struct *mm, + pte_t *ptep, pte_t pte, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_set(mm, ptep, pte); + __page_table_check_ptes_set(mm, ptep, pte, nr); } static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, @@ -123,8 +124,8 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } -static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, - pte_t pte) +static inline void page_table_check_ptes_set(struct mm_struct *mm, + pte_t *ptep, pte_t pte, unsigned int nr) { } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 46e77c12c81e..af69c3c8f7c2 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -182,18 +182,20 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } EXPORT_SYMBOL(__page_table_check_pud_clear); -void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) +void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, + unsigned int nr) { + unsigned int i; + if (&init_mm == mm) return; - __page_table_check_pte_clear(mm, ptep_get(ptep)); - if (pte_user_accessible_page(pte)) { - page_table_check_set(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, - pte_write(pte)); - } + for (i = 0; i < nr; i++) + __page_table_check_pte_clear(mm, ptep_get(ptep + i)); + if (pte_user_accessible_page(pte)) + page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } -EXPORT_SYMBOL(__page_table_check_pte_set); +EXPORT_SYMBOL(__page_table_check_ptes_set); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { -- cgit v1.2.3 From 4a169d61c2ede9fdf27103e1f454d4a0401d9025 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:38 +0100 Subject: arm64: implement the new page table range API Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio(). Change the PG_dcache_clean flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Catalin Marinas Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/arm64/include/asm/cacheflush.h | 4 +++- arch/arm64/include/asm/pgtable.h | 26 +++++++++++++++++++------- arch/arm64/mm/flush.c | 36 ++++++++++++++---------------------- 3 files changed, 36 insertions(+), 30 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 37185e978aeb..d115451ed263 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -114,7 +114,7 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *, #define copy_to_user_page copy_to_user_page /* - * flush_dcache_page is used when the kernel has written to the page + * flush_dcache_folio is used when the kernel has written to the page * cache page at virtual address page->virtual. 
* * If this page isn't mapped (ie, page_mapping == NULL), or it might @@ -127,6 +127,8 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *, */ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); +void flush_dcache_folio(struct folio *); +#define flush_dcache_folio flush_dcache_folio static __always_inline void icache_inval_all_pou(void) { diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 445b18d7a47c..76bba654b5d7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -345,12 +345,21 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, pte); } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - page_table_check_ptes_set(mm, ptep, pte, 1); - return __set_pte_at(mm, addr, ptep, pte); +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + page_table_check_ptes_set(mm, ptep, pte, nr); + + for (;;) { + __set_pte_at(mm, addr, ptep, pte); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + pte_val(pte) += PAGE_SIZE; + } } +#define set_ptes set_ptes /* * Huge pte definitions. @@ -1049,8 +1058,9 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) /* * On AArch64, the cache coherency is handled via the set_pte_at() function. */ -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + unsigned int nr) { /* * We don't do anything here, so there's a very small chance of @@ -1059,6 +1069,8 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, */ } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) #ifdef CONFIG_ARM64_PA_BITS_52 diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 4e6476094952..013eead9b695 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -51,20 +51,13 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, void __sync_icache_dcache(pte_t pte) { - struct page *page = pte_page(pte); + struct folio *folio = page_folio(pte_page(pte)); - /* - * HugeTLB pages are always fully mapped, so only setting head page's - * PG_dcache_clean flag is enough. - */ - if (PageHuge(page)) - page = compound_head(page); - - if (!test_bit(PG_dcache_clean, &page->flags)) { - sync_icache_aliases((unsigned long)page_address(page), - (unsigned long)page_address(page) + - page_size(page)); - set_bit(PG_dcache_clean, &page->flags); + if (!test_bit(PG_dcache_clean, &folio->flags)) { + sync_icache_aliases((unsigned long)folio_address(folio), + (unsigned long)folio_address(folio) + + folio_size(folio)); + set_bit(PG_dcache_clean, &folio->flags); } } EXPORT_SYMBOL_GPL(__sync_icache_dcache); @@ -74,17 +67,16 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); * it as dirty for later flushing when mapped in user space (if executable, * see __sync_icache_dcache). */ -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { - /* - * HugeTLB pages are always fully mapped and only head page will be - * set PG_dcache_clean (see comments in __sync_icache_dcache()). 
- */ - if (PageHuge(page)) - page = compound_head(page); + if (test_bit(PG_dcache_clean, &folio->flags)) + clear_bit(PG_dcache_clean, &folio->flags); +} +EXPORT_SYMBOL(flush_dcache_folio); - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); +void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); } EXPORT_SYMBOL(flush_dcache_page); -- cgit v1.2.3 From 00de2c9f26b15f1a6f2af516dd8ec5f8d28189b7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 10 Aug 2023 09:32:41 +0000 Subject: arm64: mm: use ptep_clear() instead of pte_clear() in clear_flush() In clear_flush(), the original pte may be a present entry, so we should use ptep_clear() to let page_table_check track the pte-clearing operation; otherwise it may cause a false positive in a subsequent set_pte_at(). Link: https://lkml.kernel.org/r/20230810093241.1181142-1-qi.zheng@linux.dev Fixes: 42b2547137f5 ("arm64/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK") Signed-off-by: Qi Zheng Acked-by: Will Deacon Cc: Catalin Marinas Cc: Kefeng Wang Cc: Muchun Song Cc: Pasha Tatashin Cc: Qi Zheng Signed-off-by: Andrew Morton --- arch/arm64/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/arm64') diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 21716c940682..9c52718ea750 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -236,7 +236,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - pte_clear(mm, addr, ptep); + ptep_clear(mm, addr, ptep); flush_tlb_range(&vma, saddr, addr); } -- cgit v1.2.3 From cfeed8ffe55b37fa10286aaaa1369da00cb88440 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 21 Aug 2023 18:08:46 +0200 Subject: mm/swap: stop using page->private on tail pages for THP_SWAP Patch series "mm/swap: stop using page->private on tail pages for THP_SWAP + cleanups". This series stops using page->private on tail pages for THP_SWAP, replaces folio->private by folio->swap for swapcache folios, and starts using "new_folio" for tail pages that we are splitting, to remove the usage of page->private for swapcache handling completely. This patch (of 4): Let's stop using page->private on tail pages, making it possible to just unconditionally reuse that field in the tail pages of large folios. The remaining usage of the private field for THP_SWAP is in the THP splitting code (mm/huge_memory.c), which we'll handle separately later. Update the THP_SWAP documentation and the sanity checks in mm_types.h and __split_huge_page_tail().
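(An illustrative aside, not part of the patch: the core idea as a minimal sketch, mirroring the page_swap_entry() helper added to include/linux/swap.h below; folio, page and entry are hypothetical locals.)

	/* the folio (head page) stores the swap entry exactly once ... */
	swp_entry_t entry = folio_swap_entry(folio);

	/* ... and any subpage's entry is derived from its index in the
	 * folio, instead of being read back from tail->private */
	entry.val += folio_page_idx(folio, page);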
[david@redhat.com: stop using page->private on tail pages for THP_SWAP] Link: https://lkml.kernel.org/r/6f0a82a3-6948-20d9-580b-be1dbf415701@redhat.com Link: https://lkml.kernel.org/r/20230821160849.531668-1-david@redhat.com Link: https://lkml.kernel.org/r/20230821160849.531668-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Catalin Marinas [arm64] Reviewed-by: Yosry Ahmed Cc: Dan Streetman Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Seth Jennings Cc: Vitaly Wool Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/mteswap.c | 5 +++-- include/linux/mm_types.h | 12 +----------- include/linux/swap.h | 9 +++++++++ mm/huge_memory.c | 15 ++++++--------- mm/memory.c | 2 +- mm/rmap.c | 2 +- mm/swap_state.c | 5 +++-- mm/swapfile.c | 4 ++-- 8 files changed, 26 insertions(+), 28 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index cd508ba80ab1..a31833e3ddc5 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -33,8 +33,9 @@ int mte_save_tags(struct page *page) mte_save_page_tags(page_address(page), tag_storage); - /* page_private contains the swap entry.val set in do_swap_page */ - ret = xa_store(&mte_pages, page_private(page), tag_storage, GFP_KERNEL); + /* lookup the swap entry.val from the page */ + ret = xa_store(&mte_pages, page_swap_entry(page).val, tag_storage, + GFP_KERNEL); if (WARN(xa_is_err(ret), "Failed to store MTE tags")) { mte_free_tag_storage(tag_storage); return xa_err(ret); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2b9d8be28361..55cd4bc57b8d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -322,11 +322,8 @@ struct folio { atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; - /* 4 byte gap here */ - /* private: the union with struct page is transitional */ - /* Fix THP_SWAP to not use tail->private */ - unsigned long _private_1; #endif + /* private: the union with struct page is transitional */ }; struct page __page_1; }; @@ -347,9 +344,6 @@ struct folio { /* public: */ struct list_head _deferred_list; /* private: the union with struct page is transitional */ - unsigned long _avail_2a; - /* Fix THP_SWAP to not use tail->private */ - unsigned long _private_2a; }; struct page __page_2; }; @@ -374,9 +368,6 @@ FOLIO_MATCH(memcg_data, memcg_data); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); -#ifdef CONFIG_64BIT -FOLIO_MATCH(private, _private_1); -#endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ @@ -385,7 +376,6 @@ FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); FOLIO_MATCH(flags, _flags_2a); FOLIO_MATCH(compound_head, _head_2a); -FOLIO_MATCH(private, _private_2a); #undef FOLIO_MATCH /** diff --git a/include/linux/swap.h b/include/linux/swap.h index bb5adc604144..e5cf58a1cf9e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -339,6 +339,15 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio) return entry; } +static inline swp_entry_t page_swap_entry(struct page *page) +{ + struct folio *folio = page_folio(page); + swp_entry_t entry = folio_swap_entry(folio); + + entry.val += folio_page_idx(folio, page); + return entry; +} + static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) { folio->private = (void *)entry.val; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cb4432792b88..a28e9fe16585 100644 --- 
a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2446,18 +2446,15 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; /* - * page->private should not be set in tail pages with the exception - * of swap cache pages that store the swp_entry_t in tail pages. - * Fix up and warn once if private is unexpectedly set. - * - * What of 32-bit systems, on which folio->_pincount overlays - * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and - * pincount must be 0 for folio_ref_freeze() to have succeeded. + * page->private should not be set in tail pages. Fix up and warn once + * if private is unexpectedly set. */ - if (!folio_test_swapcache(page_folio(head))) { - VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); + if (unlikely(page_tail->private)) { + VM_WARN_ON_ONCE_PAGE(true, page_tail); page_tail->private = 0; } + if (PageSwapCache(head)) + set_page_private(page_tail, (unsigned long)head->private + tail); /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); diff --git a/mm/memory.c b/mm/memory.c index 9d7fb721a680..d104a38e8545 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3879,7 +3879,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * changed. */ if (unlikely(!folio_test_swapcache(folio) || - page_private(page) != entry.val)) + page_swap_entry(page).val != entry.val)) goto out_page; /* diff --git a/mm/rmap.c b/mm/rmap.c index 1f04debdc87a..ec7f8e6c9e48 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1647,7 +1647,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ dec_mm_counter(mm, mm_counter(&folio->page)); } else if (folio_test_anon(folio)) { - swp_entry_t entry = { .val = page_private(subpage) }; + swp_entry_t entry = page_swap_entry(subpage); pte_t swp_pte; /* * Store the swap location in the pte. diff --git a/mm/swap_state.c b/mm/swap_state.c index 01f15139b7d9..2f2417810052 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -100,6 +100,7 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, folio_ref_add(folio, nr); folio_set_swapcache(folio); + folio_set_swap_entry(folio, entry); do { xas_lock_irq(&xas); @@ -113,7 +114,6 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, if (shadowp) *shadowp = old; } - set_page_private(folio_page(folio, i), entry.val + i); xas_store(&xas, folio); xas_next(&xas); } @@ -154,9 +154,10 @@ void __delete_from_swap_cache(struct folio *folio, for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, shadow); VM_BUG_ON_PAGE(entry != folio, entry); - set_page_private(folio_page(folio, i), 0); xas_next(&xas); } + entry.val = 0; + folio_set_swap_entry(folio, entry); folio_clear_swapcache(folio); address_space->nrpages -= nr; __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); diff --git a/mm/swapfile.c b/mm/swapfile.c index d46933adf789..bd9d904671b9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3369,7 +3369,7 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry) struct swap_info_struct *page_swap_info(struct page *page) { - swp_entry_t entry = { .val = page_private(page) }; + swp_entry_t entry = page_swap_entry(page); return swp_swap_info(entry); } @@ -3384,7 +3384,7 @@ EXPORT_SYMBOL_GPL(swapcache_mapping); pgoff_t __page_file_index(struct page *page) { - swp_entry_t swap = { .val = page_private(page) }; + swp_entry_t swap = page_swap_entry(page); return swp_offset(swap); } EXPORT_SYMBOL_GPL(__page_file_index); -- cgit v1.2.3
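(A closing illustrative aside, not part of the patch: the invariant that holds after this change, as a minimal sketch with hypothetical locals folio, entry and i. add_to_swap_cache() now stores the entry once on the folio, and tail-page entries are computed on demand rather than duplicated into each tail page's private field.)

	/* stored once, on the folio (see the add_to_swap_cache() hunk) */
	folio_set_swap_entry(folio, entry);

	/* every subpage's entry is derived, never stored */
	for (i = 0; i < folio_nr_pages(folio); i++)
		VM_WARN_ON_ONCE(page_swap_entry(folio_page(folio, i)).val !=
				entry.val + i);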