diff options
Diffstat (limited to 'arch/arm64/mm')
| -rw-r--r-- | arch/arm64/mm/Makefile | 2 | ||||
| -rw-r--r-- | arch/arm64/mm/cache.S | 57 | ||||
| -rw-r--r-- | arch/arm64/mm/context.c | 8 | ||||
| -rw-r--r-- | arch/arm64/mm/contpte.c | 337 | ||||
| -rw-r--r-- | arch/arm64/mm/copypage.c | 11 | ||||
| -rw-r--r-- | arch/arm64/mm/dma-mapping.c | 4 | ||||
| -rw-r--r-- | arch/arm64/mm/extable.c | 40 | ||||
| -rw-r--r-- | arch/arm64/mm/fault.c | 212 | ||||
| -rw-r--r-- | arch/arm64/mm/flush.c | 8 | ||||
| -rw-r--r-- | arch/arm64/mm/gcs.c | 24 | ||||
| -rw-r--r-- | arch/arm64/mm/hugetlbpage.c | 106 | ||||
| -rw-r--r-- | arch/arm64/mm/init.c | 104 | ||||
| -rw-r--r-- | arch/arm64/mm/ioremap.c | 10 | ||||
| -rw-r--r-- | arch/arm64/mm/kasan_init.c | 10 | ||||
| -rw-r--r-- | arch/arm64/mm/mmap.c | 16 | ||||
| -rw-r--r-- | arch/arm64/mm/mmu.c | 1113 | ||||
| -rw-r--r-- | arch/arm64/mm/pageattr.c | 185 | ||||
| -rw-r--r-- | arch/arm64/mm/pgd.c | 2 | ||||
| -rw-r--r-- | arch/arm64/mm/physaddr.c | 2 | ||||
| -rw-r--r-- | arch/arm64/mm/proc.S | 99 | ||||
| -rw-r--r-- | arch/arm64/mm/ptdump.c | 65 | ||||
| -rw-r--r-- | arch/arm64/mm/ptdump_debugfs.c | 3 | ||||
| -rw-r--r-- | arch/arm64/mm/trans_pgd.c | 42 |
23 files changed, 1811 insertions, 649 deletions
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index fc92170a8f37..c26489cf96cd 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -5,7 +5,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ context.o proc.o pageattr.o fixmap.o obj-$(CONFIG_ARM64_CONTPTE) += contpte.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_PTDUMP_CORE) += ptdump.o +obj-$(CONFIG_PTDUMP) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o obj-$(CONFIG_TRANS_TABLE) += trans_pgd.o obj-$(CONFIG_TRANS_TABLE) += trans_pgd-asm.o diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 503567c864fd..ab75c050f559 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -132,17 +132,7 @@ alternative_else_nop_endif ret SYM_FUNC_END(dcache_clean_pou) -/* - * dcache_inval_poc(start, end) - * - * Ensure that any D-cache lines for the interval [start, end) - * are invalidated. Any partial lines at the ends of the interval are - * also cleaned to PoC to prevent data loss. - * - * - start - kernel start address of region - * - end - kernel end address of region - */ -SYM_FUNC_START(__pi_dcache_inval_poc) +.macro __dcache_inval_poc_nosync dcache_line_size x2, x3 sub x3, x2, #1 tst x1, x3 // end cache line aligned? @@ -158,12 +148,42 @@ SYM_FUNC_START(__pi_dcache_inval_poc) 3: add x0, x0, x2 cmp x0, x1 b.lo 2b +.endm + +/* + * dcache_inval_poc(start, end) + * + * Ensure that any D-cache lines for the interval [start, end) + * are invalidated. Any partial lines at the ends of the interval are + * also cleaned to PoC to prevent data loss. + * + * - start - kernel start address of region + * - end - kernel end address of region + */ +SYM_FUNC_START(__pi_dcache_inval_poc) + __dcache_inval_poc_nosync dsb sy ret SYM_FUNC_END(__pi_dcache_inval_poc) SYM_FUNC_ALIAS(dcache_inval_poc, __pi_dcache_inval_poc) /* + * dcache_inval_poc_nosync(start, end) + * + * Issue the instructions of D-cache lines for the interval [start, end) + * for invalidation. Not necessarily cleaned to PoC till an explicit dsb + * sy is issued later + * + * - start - kernel start address of region + * - end - kernel end address of region + */ +SYM_FUNC_START(__pi_dcache_inval_poc_nosync) + __dcache_inval_poc_nosync + ret +SYM_FUNC_END(__pi_dcache_inval_poc_nosync) +SYM_FUNC_ALIAS(dcache_inval_poc_nosync, __pi_dcache_inval_poc_nosync) + +/* * dcache_clean_poc(start, end) * * Ensure that any D-cache lines for the interval [start, end) @@ -179,6 +199,21 @@ SYM_FUNC_END(__pi_dcache_clean_poc) SYM_FUNC_ALIAS(dcache_clean_poc, __pi_dcache_clean_poc) /* + * dcache_clean_poc_nosync(start, end) + * + * Issue the instructions of D-cache lines for the interval [start, end). + * not necessarily cleaned to the PoC till an explicit dsb sy afterward. + * + * - start - virtual start address of region + * - end - virtual end address of region + */ +SYM_FUNC_START(__pi_dcache_clean_poc_nosync) + dcache_by_line_op_nosync cvac, x0, x1, x2, x3 + ret +SYM_FUNC_END(__pi_dcache_clean_poc_nosync) +SYM_FUNC_ALIAS(dcache_clean_poc_nosync, __pi_dcache_clean_poc_nosync) + +/* * dcache_clean_pop(start, end) * * Ensure that any D-cache lines for the interval [start, end) diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index b2ac06246327..0f4a28b87469 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -354,15 +354,15 @@ void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm) /* Skip CNP for the reserved ASID */ if (system_supports_cnp() && asid) - ttbr0 |= TTBR_CNP_BIT; + ttbr0 |= TTBRx_EL1_CnP; /* SW PAN needs a copy of the ASID in TTBR0 for entry */ if (IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN)) - ttbr0 |= FIELD_PREP(TTBR_ASID_MASK, asid); + ttbr0 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid); /* Set ASID in TTBR1 since TCR.A1 is set */ - ttbr1 &= ~TTBR_ASID_MASK; - ttbr1 |= FIELD_PREP(TTBR_ASID_MASK, asid); + ttbr1 &= ~TTBRx_EL1_ASID_MASK; + ttbr1 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid); cpu_set_reserved_ttbr0_nosync(); write_sysreg(ttbr1, ttbr1_el1); diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 55107d27d3f8..2de12656b4d8 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -26,6 +26,26 @@ static inline pte_t *contpte_align_down(pte_t *ptep) return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); } +static inline pte_t *contpte_align_addr_ptep(unsigned long *start, + unsigned long *end, pte_t *ptep, + unsigned int nr) +{ + /* + * Note: caller must ensure these nr PTEs are consecutive (present) + * PTEs that map consecutive pages of the same large folio within a + * single VMA and a single page table. + */ + if (pte_cont(__ptep_get(ptep + nr - 1))) + *end = ALIGN(*end, CONT_PTE_SIZE); + + if (pte_cont(__ptep_get(ptep))) { + *start = ALIGN_DOWN(*start, CONT_PTE_SIZE); + ptep = contpte_align_down(ptep); + } + + return ptep; +} + static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { @@ -68,7 +88,145 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr, pte = pte_mkyoung(pte); } - __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3); + /* + * On eliding the __tlb_flush_range() under BBML2+noabort: + * + * NOTE: Instead of using N=16 as the contiguous block length, we use + * N=4 for clarity. + * + * NOTE: 'n' and 'c' are used to denote the "contiguous bit" being + * unset and set, respectively. + * + * We worry about two cases where contiguous bit is used: + * - When folding N smaller non-contiguous ptes as 1 contiguous block. + * - When unfolding a contiguous block into N smaller non-contiguous ptes. + * + * Currently, the BBML0 folding case looks as follows: + * + * 0) Initial page-table layout: + * + * +----+----+----+----+ + * |RO,n|RO,n|RO,n|RW,n| <--- last page being set as RO + * +----+----+----+----+ + * + * 1) Aggregate AF + dirty flags using __ptep_get_and_clear(): + * + * +----+----+----+----+ + * | 0 | 0 | 0 | 0 | + * +----+----+----+----+ + * + * 2) __flush_tlb_range(): + * + * |____ tlbi + dsb ____| + * + * 3) __set_ptes() to repaint contiguous block: + * + * +----+----+----+----+ + * |RO,c|RO,c|RO,c|RO,c| + * +----+----+----+----+ + * + * 4) The kernel will eventually __flush_tlb() for changed page: + * + * |____| <--- tlbi + dsb + * + * As expected, the intermediate tlbi+dsb ensures that other PEs + * only ever see an invalid (0) entry, or the new contiguous TLB entry. + * The final tlbi+dsb will always throw away the newly installed + * contiguous TLB entry, which is a micro-optimisation opportunity, + * but does not affect correctness. + * + * In the BBML2 case, the change is avoiding the intermediate tlbi+dsb. + * This means a few things, but notably other PEs will still "see" any + * stale cached TLB entries. This could lead to a "contiguous bit + * misprogramming" issue until the final tlbi+dsb of the changed page, + * which would clear out both the stale (RW,n) entry and the new (RO,c) + * contiguous entry installed in its place. + * + * What this is saying, is the following: + * + * +----+----+----+----+ + * |RO,n|RO,n|RO,n|RW,n| <--- old page tables, all non-contiguous + * +----+----+----+----+ + * + * +----+----+----+----+ + * |RO,c|RO,c|RO,c|RO,c| <--- new page tables, all contiguous + * +----+----+----+----+ + * /\ + * || + * + * If both the old single (RW,n) and new contiguous (RO,c) TLB entries + * are present, and a write is made to this address, do we fault or + * is the write permitted (via amalgamation)? + * + * The relevant Arm ARM DDI 0487L.a requirements are RNGLXZ and RJQQTC, + * and together state that when BBML1 or BBML2 are implemented, either + * a TLB conflict abort is raised (which we expressly forbid), or will + * "produce an OA, access permissions, and memory attributes that are + * consistent with any of the programmed translation table values". + * + * That is to say, will either raise a TLB conflict, or produce one of + * the cached TLB entries, but never amalgamate. + * + * Thus, as the page tables are only considered "consistent" after + * the final tlbi+dsb (which evicts both the single stale (RW,n) TLB + * entry as well as the new contiguous (RO,c) TLB entry), omitting the + * initial tlbi+dsb is correct. + * + * It is also important to note that at the end of the BBML2 folding + * case, we are still left with potentially all N TLB entries still + * cached (the N-1 non-contiguous ptes, and the single contiguous + * block). However, over time, natural TLB pressure will cause the + * non-contiguous pte TLB entries to be flushed, leaving only the + * contiguous block TLB entry. This means that omitting the tlbi+dsb is + * not only correct, but also keeps our eventual performance benefits. + * + * For the unfolding case, BBML0 looks as follows: + * + * 0) Initial page-table layout: + * + * +----+----+----+----+ + * |RW,c|RW,c|RW,c|RW,c| <--- last page being set as RO + * +----+----+----+----+ + * + * 1) Aggregate AF + dirty flags using __ptep_get_and_clear(): + * + * +----+----+----+----+ + * | 0 | 0 | 0 | 0 | + * +----+----+----+----+ + * + * 2) __flush_tlb_range(): + * + * |____ tlbi + dsb ____| + * + * 3) __set_ptes() to repaint as non-contiguous: + * + * +----+----+----+----+ + * |RW,n|RW,n|RW,n|RW,n| + * +----+----+----+----+ + * + * 4) Update changed page permissions: + * + * +----+----+----+----+ + * |RW,n|RW,n|RW,n|RO,n| <--- last page permissions set + * +----+----+----+----+ + * + * 5) The kernel will eventually __flush_tlb() for changed page: + * + * |____| <--- tlbi + dsb + * + * For BBML2, we again remove the intermediate tlbi+dsb. Here, there + * are no issues, as the final tlbi+dsb covering the changed page is + * guaranteed to remove the original large contiguous (RW,c) TLB entry, + * as well as the intermediate (RW,n) TLB entry; the next access will + * install the new (RO,n) TLB entry and the page tables are only + * considered "consistent" after the final tlbi+dsb, so software must + * be prepared for this inconsistency prior to finishing the mm dance + * regardless. + */ + + if (!system_supports_bbml2_noabort()) + __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, 3, + TLBF_NOWALKCACHE); __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); } @@ -169,17 +327,46 @@ pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) for (i = 0; i < CONT_PTES; i++, ptep++) { pte = __ptep_get(ptep); - if (pte_dirty(pte)) + if (pte_dirty(pte)) { orig_pte = pte_mkdirty(orig_pte); - - if (pte_young(pte)) + for (; i < CONT_PTES; i++, ptep++) { + pte = __ptep_get(ptep); + if (pte_young(pte)) { + orig_pte = pte_mkyoung(orig_pte); + break; + } + } + break; + } + + if (pte_young(pte)) { orig_pte = pte_mkyoung(orig_pte); + i++; + ptep++; + for (; i < CONT_PTES; i++, ptep++) { + pte = __ptep_get(ptep); + if (pte_dirty(pte)) { + orig_pte = pte_mkdirty(orig_pte); + break; + } + } + break; + } } return orig_pte; } EXPORT_SYMBOL_GPL(contpte_ptep_get); +static inline bool contpte_is_consistent(pte_t pte, unsigned long pfn, + pgprot_t orig_prot) +{ + pgprot_t prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + + return pte_valid_cont(pte) && pte_pfn(pte) == pfn && + pgprot_val(prot) == pgprot_val(orig_prot); +} + pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) { /* @@ -202,7 +389,6 @@ pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) pgprot_t orig_prot; unsigned long pfn; pte_t orig_pte; - pgprot_t prot; pte_t *ptep; pte_t pte; int i; @@ -219,18 +405,44 @@ retry: for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) { pte = __ptep_get(ptep); - prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); - if (!pte_valid_cont(pte) || - pte_pfn(pte) != pfn || - pgprot_val(prot) != pgprot_val(orig_prot)) + if (!contpte_is_consistent(pte, pfn, orig_prot)) goto retry; - if (pte_dirty(pte)) + if (pte_dirty(pte)) { orig_pte = pte_mkdirty(orig_pte); + for (; i < CONT_PTES; i++, ptep++, pfn++) { + pte = __ptep_get(ptep); + + if (!contpte_is_consistent(pte, pfn, orig_prot)) + goto retry; + + if (pte_young(pte)) { + orig_pte = pte_mkyoung(orig_pte); + break; + } + } + break; + } - if (pte_young(pte)) + if (pte_young(pte)) { orig_pte = pte_mkyoung(orig_pte); + i++; + ptep++; + pfn++; + for (; i < CONT_PTES; i++, ptep++, pfn++) { + pte = __ptep_get(ptep); + + if (!contpte_is_consistent(pte, pfn, orig_prot)) + goto retry; + + if (pte_dirty(pte)) { + orig_pte = pte_mkdirty(orig_pte); + break; + } + } + break; + } } return orig_pte; @@ -297,8 +509,8 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes); -int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +bool contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { /* * ptep_clear_flush_young() technically requires us to clear the access @@ -307,41 +519,44 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, * contig range when the range is covered by a single folio, we can get * away with clearing young for the whole contig range here, so we avoid * having to unfold. + * + * The 'nr' means consecutive (present) PTEs that map consecutive pages + * of the same large folio in a single VMA and a single page table. */ - int young = 0; - int i; - - ptep = contpte_align_down(ptep); - addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + unsigned long end = addr + nr * PAGE_SIZE; + bool young = false; - for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + ptep = contpte_align_addr_ptep(&addr, &end, ptep, nr); + for (; addr != end; ptep++, addr += PAGE_SIZE) young |= __ptep_test_and_clear_young(vma, addr, ptep); return young; } -EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young); +EXPORT_SYMBOL_GPL(contpte_test_and_clear_young_ptes); -int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +bool contpte_clear_flush_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { - int young; + bool young; - young = contpte_ptep_test_and_clear_young(vma, addr, ptep); + young = contpte_test_and_clear_young_ptes(vma, addr, ptep, nr); if (young) { + unsigned long end = addr + nr * PAGE_SIZE; + + contpte_align_addr_ptep(&addr, &end, ptep, nr); /* * See comment in __ptep_clear_flush_young(); same rationale for * eliding the trailing DSB applies here. */ - addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); - __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE, - PAGE_SIZE, true, 3); + __flush_tlb_range(vma, addr, end, PAGE_SIZE, 3, + TLBF_NOWALKCACHE | TLBF_NOSYNC); } return young; } -EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young); +EXPORT_SYMBOL_GPL(contpte_clear_flush_young_ptes); void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) @@ -378,17 +593,31 @@ void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long start = addr; unsigned long end = start + nr * PAGE_SIZE; - if (pte_cont(__ptep_get(ptep + nr - 1))) - end = ALIGN(end, CONT_PTE_SIZE); + ptep = contpte_align_addr_ptep(&start, &end, ptep, nr); + __clear_young_dirty_ptes(vma, start, ptep, (end - start) / PAGE_SIZE, flags); +} +EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes); - if (pte_cont(__ptep_get(ptep))) { - start = ALIGN_DOWN(start, CONT_PTE_SIZE); - ptep = contpte_align_down(ptep); +static bool contpte_all_subptes_match_access_flags(pte_t *ptep, pte_t entry) +{ + pte_t *cont_ptep = contpte_align_down(ptep); + /* + * PFNs differ per sub-PTE. Match only bits consumed by + * __ptep_set_access_flags(): AF, DIRTY and write permission. + */ + const pteval_t cmp_mask = PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY; + pteval_t entry_cmp = pte_val(entry) & cmp_mask; + int i; + + for (i = 0; i < CONT_PTES; i++) { + pteval_t pte_cmp = pte_val(__ptep_get(cont_ptep + i)) & cmp_mask; + + if (pte_cmp != entry_cmp) + return false; } - __clear_young_dirty_ptes(vma, start, ptep, (end - start) / PAGE_SIZE, flags); + return true; } -EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes); int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -399,14 +628,38 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma, int i; /* - * Gather the access/dirty bits for the contiguous range. If nothing has - * changed, its a noop. + * Check whether all sub-PTEs in the CONT block already match the + * requested access flags/write permission, using raw per-PTE values + * rather than the gathered ptep_get() view. + * + * __ptep_set_access_flags() can update AF, dirty and write + * permission, but only to make the mapping more permissive. + * + * ptep_get() gathers AF/dirty state across the whole CONT block, + * which is correct for a CPU with FEAT_HAFDBS. But page-table + * walkers that evaluate each descriptor individually (e.g. a CPU + * without DBM support, or an SMMU without HTTU, or with HA/HD + * disabled in CD.TCR) can keep faulting on the target sub-PTE if + * only a sibling has been updated. Gathering can therefore cause + * false no-ops when only a sibling has been updated: + * - write faults: target still has PTE_RDONLY (needs PTE_RDONLY cleared) + * - read faults: target still lacks PTE_AF + * + * Per Arm ARM (DDI 0487) D8.7.1, any sub-PTE in a CONT range may + * become the effective cached translation, so all entries must have + * consistent attributes. Check the full CONT block before returning + * no-op, and when any sub-PTE mismatches, proceed to update the whole + * range. */ - orig_pte = pte_mknoncont(ptep_get(ptep)); - if (pte_val(orig_pte) == pte_val(entry)) + if (contpte_all_subptes_match_access_flags(ptep, entry)) return 0; /* + * Use raw target pte (not gathered) for write-bit unfold decision. + */ + orig_pte = pte_mknoncont(__ptep_get(ptep)); + + /* * We can fix up access/dirty bits without having to unfold the contig * range. But if the write bit is changing, we must unfold. */ @@ -431,8 +684,10 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma, __ptep_set_access_flags(vma, addr, ptep, entry, 0); if (dirty) - __flush_tlb_range(vma, start_addr, addr, - PAGE_SIZE, true, 3); + __flush_tlb_range(vma, start_addr, + start_addr + CONT_PTE_SIZE, + PAGE_SIZE, 3, + TLBF_NOWALKCACHE | TLBF_NOBROADCAST); } else { __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte); __ptep_set_access_flags(vma, addr, ptep, entry, dirty); diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index a86c897017df..cd5912ba617b 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -35,7 +35,7 @@ void copy_highpage(struct page *to, struct page *from) from != folio_page(src, 0)) return; - WARN_ON_ONCE(!folio_try_hugetlb_mte_tagging(dst)); + folio_try_hugetlb_mte_tagging(dst); /* * Populate tags for all subpages. @@ -51,8 +51,13 @@ void copy_highpage(struct page *to, struct page *from) } folio_set_hugetlb_mte_tagged(dst); } else if (page_mte_tagged(from)) { - /* It's a new page, shouldn't have been tagged yet */ - WARN_ON_ONCE(!try_page_mte_tagging(to)); + /* + * Most of the time it's a new page that shouldn't have been + * tagged yet. However, folio migration can end up reusing the + * same page without untagging it. Ignore the warning if the + * page is already tagged. + */ + try_page_mte_tagging(to); mte_copy_page_tags(kto, kfrom); set_page_mte_tagged(to); diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index b2b5792b2caa..ae1ae0280eef 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, { unsigned long start = (unsigned long)phys_to_virt(paddr); - dcache_clean_poc(start, start + size); + dcache_clean_poc_nosync(start, start + size); } void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, @@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, if (dir == DMA_TO_DEVICE) return; - dcache_inval_poc(start, start + size); + dcache_inval_poc_nosync(start, start + size); } void arch_dma_prep_coherent(struct page *page, size_t size) diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index 228d681a8715..6e0528831cd3 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -8,8 +8,33 @@ #include <linux/uaccess.h> #include <asm/asm-extable.h> +#include <asm/esr.h> #include <asm/ptrace.h> +static bool cpy_faulted_on_uaccess(const struct exception_table_entry *ex, + unsigned long esr) +{ + bool uaccess_is_write = FIELD_GET(EX_DATA_UACCESS_WRITE, ex->data); + bool fault_on_write = esr & ESR_ELx_WNR; + + return uaccess_is_write == fault_on_write; +} + +bool insn_may_access_user(unsigned long addr, unsigned long esr) +{ + const struct exception_table_entry *ex = search_exception_tables(addr); + + if (!ex) + return false; + + switch (ex->type) { + case EX_TYPE_UACCESS_CPY: + return cpy_faulted_on_uaccess(ex, esr); + default: + return true; + } +} + static inline unsigned long get_ex_fixup(const struct exception_table_entry *ex) { @@ -29,6 +54,17 @@ static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex, return true; } +static bool ex_handler_uaccess_cpy(const struct exception_table_entry *ex, + struct pt_regs *regs, unsigned long esr) +{ + /* Do not fix up faults on kernel memory accesses */ + if (!cpy_faulted_on_uaccess(ex, esr)) + return false; + + regs->pc = get_ex_fixup(ex); + return true; +} + static bool ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex, struct pt_regs *regs) @@ -56,7 +92,7 @@ ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex, return true; } -bool fixup_exception(struct pt_regs *regs) +bool fixup_exception(struct pt_regs *regs, unsigned long esr) { const struct exception_table_entry *ex; @@ -70,6 +106,8 @@ bool fixup_exception(struct pt_regs *regs) case EX_TYPE_UACCESS_ERR_ZERO: case EX_TYPE_KACCESS_ERR_ZERO: return ex_handler_uaccess_err_zero(ex, regs); + case EX_TYPE_UACCESS_CPY: + return ex_handler_uaccess_cpy(ex, regs, esr); case EX_TYPE_LOAD_UNALIGNED_ZEROPAD: return ex_handler_load_unaligned_zeropad(ex, regs); } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index ef63651099a9..739800835920 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -43,6 +43,7 @@ #include <asm/system_misc.h> #include <asm/tlbflush.h> #include <asm/traps.h> +#include <asm/virt.h> struct fault_info { int (*fn)(unsigned long far, unsigned long esr, @@ -53,18 +54,12 @@ struct fault_info { }; static const struct fault_info fault_info[]; -static struct fault_info debug_fault_info[]; static inline const struct fault_info *esr_to_fault_info(unsigned long esr) { return fault_info + (esr & ESR_ELx_FSC); } -static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr) -{ - return debug_fault_info + DBG_ESR_EVT(esr); -} - static void data_abort_decode(unsigned long esr) { unsigned long iss2 = ESR_ELx_ISS2(esr); @@ -210,12 +205,13 @@ static void show_pte(unsigned long addr) * * Returns whether or not the PTE actually changed. */ -int __ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty) +int __ptep_set_access_flags_anysz(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty, unsigned long pgsize) { pteval_t old_pteval, pteval; pte_t pte = __ptep_get(ptep); + int level; if (pte_same(pte, entry)) return 0; @@ -239,9 +235,32 @@ int __ptep_set_access_flags(struct vm_area_struct *vma, pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); } while (pteval != old_pteval); - /* Invalidate a stale read-only entry */ - if (dirty) - flush_tlb_page(vma, address); + /* + * Invalidate the local stale read-only entry. Remote stale entries + * may still cause page faults and be invalidated via + * flush_tlb_fix_spurious_fault(). + */ + if (dirty) { + switch (pgsize) { + case PAGE_SIZE: + level = 3; + break; + case PMD_SIZE: + level = 2; + break; +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + level = 1; + break; +#endif + default: + level = TLBI_TTL_UNKNOWN; + WARN_ON(1); + } + + __flush_tlb_range(vma, address, address + pgsize, pgsize, level, + TLBF_NOWALKCACHE | TLBF_NOBROADCAST); + } return 1; } @@ -271,6 +290,15 @@ static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr return false; } +static bool is_pkvm_stage2_abort(unsigned int esr) +{ + /* + * S1PTW should only ever be set in ESR_EL1 if the pkvm hypervisor + * injected a stage-2 abort -- see host_inject_mem_abort(). + */ + return is_pkvm_initialized() && (esr & ESR_ELx_S1PTW); +} + static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) @@ -291,8 +319,14 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, * If we now have a valid translation, treat the translation fault as * spurious. */ - if (!(par & SYS_PAR_EL1_F)) + if (!(par & SYS_PAR_EL1_F)) { + if (is_pkvm_stage2_abort(esr)) { + par &= SYS_PAR_EL1_PA; + return pkvm_force_reclaim_guest_page(par); + } + return true; + } /* * If we got a different type of fault from the AT instruction, @@ -375,12 +409,14 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, * Are we prepared to handle this kernel fault? * We are almost certainly not prepared to handle instruction faults. */ - if (!is_el1_instruction_abort(esr) && fixup_exception(regs)) + if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr)) return; - if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs), - "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr)) + if (is_spurious_el1_translation_fault(addr, esr, regs)) { + WARN_RATELIMIT(!is_pkvm_stage2_abort(esr), + "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr); return; + } if (is_el1_mte_sync_tag_check_fault(esr)) { do_tag_recovery(addr, esr, regs); @@ -397,6 +433,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, msg = "read from unreadable memory"; } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; + } else if (is_pkvm_stage2_abort(esr)) { + msg = "access to hypervisor-protected memory"; } else { if (esr_fsc_is_translation_fault(esr) && kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) @@ -487,17 +525,29 @@ static void do_bad_area(unsigned long far, unsigned long esr, } } -static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma, - unsigned int mm_flags) +static bool fault_from_pkey(struct vm_area_struct *vma, unsigned int mm_flags) { - unsigned long iss2 = ESR_ELx_ISS2(esr); - if (!system_supports_poe()) return false; - if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay)) - return true; - + /* + * We do not check whether an Overlay fault has occurred because we + * cannot make a decision based solely on its value: + * + * - If Overlay is set, a fault did occur due to POE, but it may be + * spurious in those cases where we update POR_EL0 without ISB (e.g. + * on context-switch). We would then need to manually check POR_EL0 + * against vma_pkey(vma), which is exactly what + * arch_vma_access_permitted() does. + * + * - If Overlay is not set, we may still need to report a pkey fault. + * This is the case if an access was made within a mapping but with no + * page mapped, and POR_EL0 forbids the access (according to + * vma_pkey()). Such access will result in a SIGSEGV regardless + * because core code checks arch_vma_access_permitted(), but in order + * to report the correct error code - SEGV_PKUERR - we must handle + * that case here. + */ return !arch_vma_access_permitted(vma, mm_flags & FAULT_FLAG_WRITE, mm_flags & FAULT_FLAG_INSTRUCTION, @@ -549,7 +599,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, const struct fault_info *inf; struct mm_struct *mm = current->mm; vm_fault_t fault; - unsigned long vm_flags; + vm_flags_t vm_flags; unsigned int mm_flags = FAULT_FLAG_DEFAULT; unsigned long addr = untagged_addr(far); struct vm_area_struct *vma; @@ -606,11 +656,18 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, die_kernel_fault("execution of user memory", addr, esr, regs); - if (!search_exception_tables(regs->pc)) + if (!insn_may_access_user(regs->pc, esr)) die_kernel_fault("access to user memory outside uaccess routines", addr, esr, regs); } + if (is_pkvm_stage2_abort(esr)) { + if (!user_mode(regs)) + goto no_context; + arm64_force_sig_fault(SIGSEGV, SEGV_ACCERR, far, "stage-2 fault"); + return 0; + } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (!(mm_flags & FAULT_FLAG_USER)) @@ -635,7 +692,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto bad_area; } - if (fault_from_pkey(esr, vma, mm_flags)) { + if (fault_from_pkey(vma, mm_flags)) { pkey = vma_pkey(vma); vma_end_read(vma); fault = 0; @@ -679,7 +736,7 @@ retry: goto bad_area; } - if (fault_from_pkey(esr, vma, mm_flags)) { + if (fault_from_pkey(vma, mm_flags)) { pkey = vma_pkey(vma); mmap_read_unlock(mm); fault = 0; @@ -826,6 +883,7 @@ static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs) */ siaddr = untagged_addr(far); } + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK); arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr); return 0; @@ -837,9 +895,12 @@ static int do_tag_check_fault(unsigned long far, unsigned long esr, /* * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN * for tag check faults. Set them to corresponding bits in the untagged - * address. + * address if ARM64_MTE_FAR isn't supported. + * Otherwise, bits 63:60 of FAR_EL1 are not UNKNOWN. */ - far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK); + if (!cpus_have_cap(ARM64_MTE_FAR)) + far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK); + do_bad_area(far, esr, regs); return 0; } @@ -939,75 +1000,6 @@ void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs) NOKPROBE_SYMBOL(do_sp_pc_abort); /* - * __refdata because early_brk64 is __init, but the reference to it is - * clobbered at arch_initcall time. - * See traps.c and debug-monitors.c:debug_traps_init(). - */ -static struct fault_info __refdata debug_fault_info[] = { - { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" }, - { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" }, - { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 3" }, - { do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" }, - { do_bad, SIGKILL, SI_KERNEL, "aarch32 vector catch" }, - { early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 7" }, -}; - -void __init hook_debug_fault_code(int nr, - int (*fn)(unsigned long, unsigned long, struct pt_regs *), - int sig, int code, const char *name) -{ - BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info)); - - debug_fault_info[nr].fn = fn; - debug_fault_info[nr].sig = sig; - debug_fault_info[nr].code = code; - debug_fault_info[nr].name = name; -} - -/* - * In debug exception context, we explicitly disable preemption despite - * having interrupts disabled. - * This serves two purposes: it makes it much less likely that we would - * accidentally schedule in exception context and it will force a warning - * if we somehow manage to schedule by accident. - */ -static void debug_exception_enter(struct pt_regs *regs) -{ - preempt_disable(); - - /* This code is a bit fragile. Test it. */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work"); -} -NOKPROBE_SYMBOL(debug_exception_enter); - -static void debug_exception_exit(struct pt_regs *regs) -{ - preempt_enable_no_resched(); -} -NOKPROBE_SYMBOL(debug_exception_exit); - -void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr, - struct pt_regs *regs) -{ - const struct fault_info *inf = esr_to_debug_fault_info(esr); - unsigned long pc = instruction_pointer(regs); - - debug_exception_enter(regs); - - if (user_mode(regs) && !is_ttbr0_addr(pc)) - arm64_apply_bp_hardening(); - - if (inf->fn(addr_if_watchpoint, esr, regs)) { - arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr); - } - - debug_exception_exit(regs); -} -NOKPROBE_SYMBOL(do_debug_exception); - -/* * Used during anonymous page fault handling. */ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, @@ -1026,10 +1018,24 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, return vma_alloc_folio(flags, 0, vma, vaddr); } -void tag_clear_highpage(struct page *page) +bool tag_clear_highpages(struct page *page, int numpages, bool clear_pages) { - /* Newly allocated page, shouldn't have been tagged yet */ - WARN_ON_ONCE(!try_page_mte_tagging(page)); - mte_zero_clear_page_tags(page_address(page)); - set_page_mte_tagged(page); + /* + * Check if MTE is supported and fall back to clear_highpage(). + * get_huge_zero_folio() unconditionally passes __GFP_ZEROTAGS and + * post_alloc_hook() will invoke tag_clear_highpages(). + */ + if (!system_supports_mte()) + return clear_pages; + + /* Newly allocated pages, shouldn't have been tagged yet */ + for (int i = 0; i < numpages; i++, page++) { + WARN_ON_ONCE(!try_page_mte_tagging(page)); + if (clear_pages) + mte_zero_clear_page_tags(page_address(page)); + else + mte_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } + return false; } diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 013eead9b695..fbf08b543c3f 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -53,11 +53,11 @@ void __sync_icache_dcache(pte_t pte) { struct folio *folio = page_folio(pte_page(pte)); - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { sync_icache_aliases((unsigned long)folio_address(folio), (unsigned long)folio_address(folio) + folio_size(folio)); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL_GPL(__sync_icache_dcache); @@ -69,8 +69,8 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); */ void flush_dcache_folio(struct folio *folio) { - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); } EXPORT_SYMBOL(flush_dcache_folio); diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c index 5c46ec527b1c..680749611a9a 100644 --- a/arch/arm64/mm/gcs.c +++ b/arch/arm64/mm/gcs.c @@ -12,19 +12,7 @@ static unsigned long alloc_gcs(unsigned long addr, unsigned long size) { - int flags = MAP_ANONYMOUS | MAP_PRIVATE; - struct mm_struct *mm = current->mm; - unsigned long mapped_addr, unused; - - if (addr) - flags |= MAP_FIXED_NOREPLACE; - - mmap_write_lock(mm); - mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, - VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); - mmap_write_unlock(mm); - - return mapped_addr; + return vm_mmap_shadow_stack(addr, size, 0); } static unsigned long gcs_size(unsigned long size) @@ -157,12 +145,6 @@ void gcs_free(struct task_struct *task) if (!system_supports_gcs()) return; - /* - * When fork() with CLONE_VM fails, the child (tsk) already - * has a GCS allocated, and exit_thread() calls this function - * to free it. In this case the parent (current) and the - * child share the same mm struct. - */ if (!task->mm || task->mm != current->mm) return; @@ -205,8 +187,8 @@ int arch_set_shadow_stack_status(struct task_struct *task, unsigned long arg) size = gcs_size(0); gcs = alloc_gcs(0, size); - if (!gcs) - return -ENOMEM; + if (IS_ERR_VALUE(gcs)) + return gcs; task->thread.gcspr_el0 = gcs + size - sizeof(u64); task->thread.gcs_base = gcs; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index b3a7fafe8892..30772a909aea 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -36,16 +36,12 @@ * huge pages could still be served from those areas. */ #ifdef CONFIG_CMA -void __init arm64_hugetlb_cma_reserve(void) +unsigned int arch_hugetlb_cma_order(void) { - int order; - if (pud_sect_supported()) - order = PUD_SHIFT - PAGE_SHIFT; - else - order = CONT_PMD_SHIFT - PAGE_SHIFT; + return PUD_SHIFT - PAGE_SHIFT; - hugetlb_cma_reserve(order); + return CONT_PMD_SHIFT - PAGE_SHIFT; } #endif /* CONFIG_CMA */ @@ -129,7 +125,7 @@ pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) if (!pte_present(orig_pte) || !pte_cont(orig_pte)) return orig_pte; - ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize); + ncontig = find_num_contig(mm, addr, ptep, &pgsize); for (i = 0; i < ncontig; i++, ptep++) { pte_t pte = __ptep_get(ptep); @@ -159,12 +155,12 @@ static pte_t get_clear_contig(struct mm_struct *mm, pte_t pte, tmp_pte; bool present; - pte = __ptep_get_and_clear(mm, addr, ptep); + pte = __ptep_get_and_clear_anysz(mm, addr, ptep, pgsize); present = pte_present(pte); while (--ncontig) { ptep++; addr += pgsize; - tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + tmp_pte = __ptep_get_and_clear_anysz(mm, addr, ptep, pgsize); if (present) { if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); @@ -183,8 +179,9 @@ static pte_t get_clear_contig_flush(struct mm_struct *mm, { pte_t orig_pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig); struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + unsigned long end = addr + (pgsize * ncontig); - flush_tlb_range(&vma, addr, addr + (pgsize * ncontig)); + __flush_hugetlb_tlb_range(&vma, addr, end, pgsize, TLBF_NOWALKCACHE); return orig_pte; } @@ -207,9 +204,12 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - __ptep_get_and_clear(mm, addr, ptep); + __ptep_get_and_clear_anysz(mm, addr, ptep, pgsize); - flush_tlb_range(&vma, saddr, addr); + if (mm == &init_mm) + flush_tlb_kernel_range(saddr, addr); + else + __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, TLBF_NOWALKCACHE); } void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, @@ -218,30 +218,20 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, size_t pgsize; int i; int ncontig; - unsigned long pfn, dpfn; - pgprot_t hugeprot; ncontig = num_contig_ptes(sz, &pgsize); if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - __set_ptes(mm, addr, ptep, pte, 1); - return; - } - - if (!pte_cont(pte)) { - __set_ptes(mm, addr, ptep, pte, 1); + __set_ptes_anysz(mm, addr, ptep, pte, 1, pgsize); return; } - pfn = pte_pfn(pte); - dpfn = pgsize >> PAGE_SHIFT; - hugeprot = pte_pgprot(pte); - - clear_flush(mm, addr, ptep, pgsize, ncontig); + /* Only need to "break" if transitioning valid -> valid. */ + if (pte_cont(pte) && pte_valid(__ptep_get(ptep))) + clear_flush(mm, addr, ptep, pgsize, ncontig); - for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes_anysz(mm, addr, ptep, pte, ncontig, pgsize); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -334,7 +324,9 @@ unsigned long hugetlb_mask_last_page(struct hstate *h) switch (hp_size) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - return PGDIR_SIZE - PUD_SIZE; + if (pud_sect_supported()) + return PGDIR_SIZE - PUD_SIZE; + break; #endif case CONT_PMD_SIZE: return PUD_SIZE - CONT_PMD_SIZE; @@ -356,23 +348,21 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) switch (pagesize) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - entry = pud_pte(pud_mkhuge(pte_pud(entry))); + if (pud_sect_supported()) + return pud_pte(pud_mkhuge(pte_pud(entry))); break; #endif case CONT_PMD_SIZE: - entry = pmd_pte(pmd_mkcont(pte_pmd(entry))); - fallthrough; + return pmd_pte(pmd_mkhuge(pmd_mkcont(pte_pmd(entry)))); case PMD_SIZE: - entry = pmd_pte(pmd_mkhuge(pte_pmd(entry))); - break; + return pmd_pte(pmd_mkhuge(pte_pmd(entry))); case CONT_PTE_SIZE: - entry = pte_mkcont(entry); - break; + return pte_mkcont(entry); default: - pr_warn("%s: unrecognized huge page size 0x%lx\n", - __func__, pagesize); break; } + pr_warn("%s: unrecognized huge page size 0x%lx\n", + __func__, pagesize); return entry; } @@ -431,23 +421,23 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - int ncontig, i; + int ncontig; size_t pgsize = 0; - unsigned long pfn = pte_pfn(pte), dpfn; struct mm_struct *mm = vma->vm_mm; - pgprot_t hugeprot; pte_t orig_pte; - if (!pte_cont(pte)) - return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); + VM_WARN_ON(!pte_present(pte)); + ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize); - ncontig = find_num_contig(mm, addr, ptep, &pgsize); - dpfn = pgsize >> PAGE_SHIFT; + if (!pte_cont(pte)) + return __ptep_set_access_flags_anysz(vma, addr, ptep, pte, + dirty, pgsize); if (!__cont_access_flags_changed(ptep, pte, ncontig)) return 0; orig_pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); + VM_WARN_ON(!pte_present(orig_pte)); /* Make sure we don't lose the dirty or young state */ if (pte_dirty(orig_pte)) @@ -456,38 +446,31 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, if (pte_young(orig_pte)) pte = pte_mkyoung(pte); - hugeprot = pte_pgprot(pte); - for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); - + __set_ptes_anysz(mm, addr, ptep, pte, ncontig, pgsize); return 1; } void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - unsigned long pfn, dpfn; - pgprot_t hugeprot; - int ncontig, i; + int ncontig; size_t pgsize; pte_t pte; - if (!pte_cont(__ptep_get(ptep))) { + pte = __ptep_get(ptep); + VM_WARN_ON(!pte_present(pte)); + + if (!pte_cont(pte)) { __ptep_set_wrprotect(mm, addr, ptep); return; } ncontig = find_num_contig(mm, addr, ptep, &pgsize); - dpfn = pgsize >> PAGE_SHIFT; pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); pte = pte_wrprotect(pte); - hugeprot = pte_pgprot(pte); - pfn = pte_pfn(pte); - - for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes_anysz(mm, addr, ptep, pte, ncontig, pgsize); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, @@ -497,10 +480,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(__ptep_get(ptep))) - return ptep_clear_flush(vma, addr, ptep); - - ncontig = find_num_contig(mm, addr, ptep, &pgsize); + ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize); return get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index ccdef53872a0..97987f850a33 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -98,21 +98,19 @@ static void __init arch_reserve_crashkernel(void) { unsigned long long low_size = 0; unsigned long long crash_base, crash_size; - char *cmdline = boot_command_line; bool high = false; int ret; if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; - ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, - &low_size, &high); + &low_size, NULL, &high); if (ret) return; - reserve_crashkernel_generic(cmdline, crash_size, crash_base, - low_size, high); + reserve_crashkernel_generic(crash_size, crash_base, low_size, high); } static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) @@ -120,9 +118,22 @@ static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) return min(zone_limit, memblock_end_of_DRAM() - 1) + 1; } -static void __init zone_sizes_init(void) +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + phys_addr_t __maybe_unused dma32_phys_limit = + max_zone_phys(DMA_BIT_MASK(32)); + +#ifdef CONFIG_ZONE_DMA + max_zone_pfns[ZONE_DMA] = PFN_DOWN(max_zone_phys(zone_dma_limit)); +#endif +#ifdef CONFIG_ZONE_DMA32 + max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit); +#endif + max_zone_pfns[ZONE_NORMAL] = max_pfn; +} + +static void __init dma_limits_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; phys_addr_t __maybe_unused acpi_zone_dma_limit; phys_addr_t __maybe_unused dt_zone_dma_limit; phys_addr_t __maybe_unused dma32_phys_limit = @@ -141,18 +152,13 @@ static void __init zone_sizes_init(void) if (memblock_start_of_DRAM() < U32_MAX) zone_dma_limit = min(zone_dma_limit, U32_MAX); arm64_dma_phys_limit = max_zone_phys(zone_dma_limit); - max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); #endif #ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit); if (!arm64_dma_phys_limit) arm64_dma_phys_limit = dma32_phys_limit; #endif if (!arm64_dma_phys_limit) arm64_dma_phys_limit = PHYS_MASK + 1; - max_zone_pfns[ZONE_NORMAL] = max_pfn; - - free_area_init(max_zone_pfns); } int pfn_is_map_memory(unsigned long pfn) @@ -245,7 +251,7 @@ void __init arm64_memblock_init(void) */ if (memory_limit != PHYS_ADDR_MAX) { memblock_mem_limit_remove_map(memory_limit); - memblock_add(__pa_symbol(_text), (u64)(_end - _text)); + memblock_add(__pa_symbol(_text), (resource_size_t)(_end - _text)); } if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { @@ -254,8 +260,8 @@ void __init arm64_memblock_init(void) * initrd to become inaccessible via the linear mapping. * Otherwise, this is a no-op */ - u64 base = phys_initrd_start & PAGE_MASK; - u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; + phys_addr_t base = phys_initrd_start & PAGE_MASK; + resource_size_t size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; /* * We can only add back the initrd memory if we don't end up @@ -277,31 +283,11 @@ void __init arm64_memblock_init(void) } } - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - extern u16 memstart_offset_seed; - u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); - int parange = cpuid_feature_extract_unsigned_field( - mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); - s64 range = linear_region_size - - BIT(id_aa64mmfr0_parange_to_phys_shift(parange)); - - /* - * If the size of the linear region exceeds, by a sufficient - * margin, the size of the region that the physical memory can - * span, randomize the linear region as well. - */ - if (memstart_offset_seed > 0 && range >= (s64)ARM64_MEMSTART_ALIGN) { - range /= ARM64_MEMSTART_ALIGN; - memstart_addr -= ARM64_MEMSTART_ALIGN * - ((range * memstart_offset_seed) >> 16); - } - } - /* * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. */ - memblock_reserve(__pa_symbol(_stext), _end - _stext); + memblock_reserve(__pa_symbol(_text), _end - _text); if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { /* the generic initrd code expects virtual addresses */ initrd_start = __phys_to_virt(phys_initrd_start); @@ -309,8 +295,6 @@ void __init arm64_memblock_init(void) } early_init_fdt_scan_reserved_mem(); - - high_memory = __va(memblock_end_of_DRAM() - 1) + 1; } void __init bootmem_init(void) @@ -327,23 +311,8 @@ void __init bootmem_init(void) arch_numa_init(); - /* - * must be done after arch_numa_init() which calls numa_init() to - * initialize node_online_map that gets used in hugetlb_cma_reserve() - * while allocating required CMA size across online nodes. - */ -#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) - arm64_hugetlb_cma_reserve(); -#endif - kvm_hyp_reserve(); - - /* - * sparse_init() tries to allocate memory from memblock, so must be - * done after the fixed reservations - */ - sparse_init(); - zone_sizes_init(); + dma_limits_init(); /* * Reserve the CMA area after arm64_dma_phys_limit was initialised. @@ -359,12 +328,12 @@ void __init bootmem_init(void) memblock_dump_all(); } -/* - * mem_init() marks the free areas in the mem_map and tells us how much memory - * is free. This is done after various parts of the system have claimed their - * memory after the kernel image. - */ -void __init mem_init(void) +void __init arch_setup_zero_pages(void) +{ + __zero_page = phys_to_page(__pa_symbol(empty_zero_page)); +} + +void __init arch_mm_preinit(void) { unsigned int flags = SWIOTLB_VERBOSE; bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit); @@ -386,10 +355,6 @@ void __init mem_init(void) } swiotlb_init(swiotlb, flags); - swiotlb_update_mem_attributes(); - - /* this will put all unused low memory onto the freelists */ - memblock_free_all(); /* * Check boundaries twice: Some fundamental inconsistencies can be @@ -416,6 +381,14 @@ void __init mem_init(void) } } +bool page_alloc_available __ro_after_init; + +void __init mem_init(void) +{ + page_alloc_available = true; + swiotlb_update_mem_attributes(); +} + void free_initmem(void) { void *lm_init_begin = lm_alias(__init_begin); @@ -424,9 +397,6 @@ void free_initmem(void) WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE)); WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE)); - /* Delete __init region from memblock.reserved. */ - memblock_free(lm_init_begin, lm_init_end - lm_init_begin); - free_reserved_area(lm_init_begin, lm_init_end, POISON_FREE_INITMEM, "unused kernel"); /* diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index 6cc0b7e7eb03..6eef48ef6278 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -14,18 +14,18 @@ int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook) return 0; } -void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot) +void __iomem *__ioremap_prot(phys_addr_t phys_addr, size_t size, + pgprot_t pgprot) { unsigned long last_addr = phys_addr + size - 1; - pgprot_t pgprot = __pgprot(prot); /* Don't allow outside PHYS_MASK */ if (last_addr & ~PHYS_MASK) return NULL; /* Don't allow RAM to be mapped. */ - if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr)))) + if (WARN_ONCE(pfn_is_map_memory(__phys_to_pfn(phys_addr)), + "ioremap attempted on RAM pfn\n")) return NULL; /* @@ -39,7 +39,7 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, return generic_ioremap_prot(phys_addr, size, pgprot); } -EXPORT_SYMBOL(ioremap_prot); +EXPORT_SYMBOL(__ioremap_prot); /* * Must be called after early_fixmap_init diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index b65a29440a0c..abeb81bf6ebd 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -190,7 +190,7 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, */ static bool __init root_level_aligned(u64 addr) { - int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * (PAGE_SHIFT - 3); + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * PTDESC_TABLE_SHIFT; return (addr % (PAGE_SIZE << shift)) == 0; } @@ -245,7 +245,7 @@ static int __init root_level_idx(u64 addr) */ u64 vabits = IS_ENABLED(CONFIG_ARM64_64K_PAGES) ? VA_BITS : vabits_actual; - int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * (PAGE_SHIFT - 3); + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * PTDESC_TABLE_SHIFT; return (addr & ~_PAGE_OFFSET(vabits)) >> (shift + PAGE_SHIFT); } @@ -269,7 +269,7 @@ static void __init clone_next_level(u64 addr, pgd_t *tmp_pg_dir, pud_t *pud) */ static int __init next_level_idx(u64 addr) { - int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * (PAGE_SHIFT - 3); + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * PTDESC_TABLE_SHIFT; return (addr >> (shift + PAGE_SHIFT)) % PTRS_PER_PTE; } @@ -399,14 +399,12 @@ void __init kasan_init(void) { kasan_init_shadow(); kasan_init_depth(); -#if defined(CONFIG_KASAN_GENERIC) + kasan_init_generic(); /* * Generic KASAN is now fully initialized. * Software and Hardware Tag-Based modes still require * kasan_init_sw_tags() and kasan_init_hw_tags() correspondingly. */ - pr_info("KernelAddressSanitizer initialized (generic)\n"); -#endif } #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 07aeab8a7606..92b2f5097a96 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -34,6 +34,8 @@ static pgprot_t protection_map[16] __ro_after_init = { [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC }; +static ptdesc_t gcs_page_prot __ro_after_init = _PAGE_GCS_RO; + /* * You really shouldn't be using read() or write() on /dev/mem. This might go * away in the future. @@ -73,21 +75,27 @@ static int __init adjust_protection_map(void) protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY; } - if (lpa2_is_enabled()) + if (lpa2_is_enabled()) { for (int i = 0; i < ARRAY_SIZE(protection_map); i++) pgprot_val(protection_map[i]) &= ~PTE_SHARED; + gcs_page_prot &= ~PTE_SHARED; + } return 0; } arch_initcall(adjust_protection_map); -pgprot_t vm_get_page_prot(unsigned long vm_flags) +pgprot_t vm_get_page_prot(vm_flags_t vm_flags) { - pteval_t prot; + ptdesc_t prot; /* Short circuit GCS to avoid bloating the table. */ if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { - prot = _PAGE_GCS_RO; + /* Honour mprotect(PROT_NONE) on shadow stack mappings */ + if (vm_flags & VM_ACCESS_FLAGS) + prot = gcs_page_prot; + else + prot = pgprot_val(protection_map[VM_NONE]); } else { prot = pgprot_val(protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1dfe1a8efdbe..dd85e093ffdb 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -26,6 +26,9 @@ #include <linux/set_memory.h> #include <linux/kfence.h> #include <linux/pkeys.h> +#include <linux/mm_inline.h> +#include <linux/pagewalk.h> +#include <linux/stop_machine.h> #include <asm/barrier.h> #include <asm/cputype.h> @@ -46,6 +49,8 @@ #define NO_CONT_MAPPINGS BIT(1) #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ +DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); + u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); @@ -59,13 +64,6 @@ static bool rodata_is_rw __ro_after_init = true; */ long __section(".mmuoff.data.write") __early_cpu_boot_status; -/* - * Empty_zero_page is a special page that is used for zero-initialized data - * and COW. - */ -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - static DEFINE_SPINLOCK(swapper_pgdir_lock); static DEFINE_MUTEX(fixmap_lock); @@ -107,7 +105,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static phys_addr_t __init early_pgtable_alloc(int shift) +static phys_addr_t __init early_pgtable_alloc(enum pgtable_level pgtable_level) { phys_addr_t phys; @@ -189,17 +187,17 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, } while (ptep++, addr += PAGE_SIZE, addr != end); } -static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, - unsigned long end, phys_addr_t phys, - pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), - int flags) +static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, + unsigned long end, phys_addr_t phys, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_level), + int flags) { unsigned long next; pmd_t pmd = READ_ONCE(*pmdp); pte_t *ptep; - BUG_ON(pmd_sect(pmd)); + BUG_ON(pmd_leaf(pmd)); if (pmd_none(pmd)) { pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; phys_addr_t pte_phys; @@ -207,7 +205,9 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, if (flags & NO_EXEC_MAPPINGS) pmdval |= PMD_TABLE_PXN; BUG_ON(!pgtable_alloc); - pte_phys = pgtable_alloc(PAGE_SHIFT); + pte_phys = pgtable_alloc(PGTABLE_LEVEL_PTE); + if (pte_phys == INVALID_PHYS_ADDR) + return -ENOMEM; ptep = pte_set_fixmap(pte_phys); init_clear_pgtable(ptep); ptep += pte_index(addr); @@ -239,11 +239,13 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, * walker. */ pte_clear_fixmap(); + + return 0; } -static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, - phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), int flags) +static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { unsigned long next; @@ -264,21 +266,29 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd), READ_ONCE(pmd_val(*pmdp)))); } else { - alloc_init_cont_pte(pmdp, addr, next, phys, prot, - pgtable_alloc, flags); + int ret; + + ret = alloc_init_cont_pte(pmdp, addr, next, phys, prot, + pgtable_alloc, flags); + if (ret) + return ret; BUG_ON(pmd_val(old_pmd) != 0 && pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp))); } phys += next - addr; } while (pmdp++, addr = next, addr != end); + + return 0; } -static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, - unsigned long end, phys_addr_t phys, - pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), int flags) +static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, + unsigned long end, phys_addr_t phys, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_level), + int flags) { + int ret; unsigned long next; pud_t pud = READ_ONCE(*pudp); pmd_t *pmdp; @@ -286,7 +296,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, /* * Check for initial section mappings in the pgd/pud. */ - BUG_ON(pud_sect(pud)); + BUG_ON(pud_leaf(pud)); if (pud_none(pud)) { pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; phys_addr_t pmd_phys; @@ -294,7 +304,9 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, if (flags & NO_EXEC_MAPPINGS) pudval |= PUD_TABLE_PXN; BUG_ON(!pgtable_alloc); - pmd_phys = pgtable_alloc(PMD_SHIFT); + pmd_phys = pgtable_alloc(PGTABLE_LEVEL_PMD); + if (pmd_phys == INVALID_PHYS_ADDR) + return -ENOMEM; pmdp = pmd_set_fixmap(pmd_phys); init_clear_pgtable(pmdp); pmdp += pmd_index(addr); @@ -314,20 +326,26 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); - init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags); + ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags); + if (ret) + goto out; pmdp += pmd_index(next) - pmd_index(addr); phys += next - addr; } while (addr = next, addr != end); +out: pmd_clear_fixmap(); + + return ret; } -static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, - phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), - int flags) +static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_level), + int flags) { + int ret = 0; unsigned long next; p4d_t p4d = READ_ONCE(*p4dp); pud_t *pudp; @@ -339,7 +357,9 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, if (flags & NO_EXEC_MAPPINGS) p4dval |= P4D_TABLE_PXN; BUG_ON(!pgtable_alloc); - pud_phys = pgtable_alloc(PUD_SHIFT); + pud_phys = pgtable_alloc(PGTABLE_LEVEL_PUD); + if (pud_phys == INVALID_PHYS_ADDR) + return -ENOMEM; pudp = pud_set_fixmap(pud_phys); init_clear_pgtable(pudp); pudp += pud_index(addr); @@ -369,8 +389,10 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, BUG_ON(!pgattr_change_is_safe(pud_val(old_pud), READ_ONCE(pud_val(*pudp)))); } else { - alloc_init_cont_pmd(pudp, addr, next, phys, prot, - pgtable_alloc, flags); + ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot, + pgtable_alloc, flags); + if (ret) + goto out; BUG_ON(pud_val(old_pud) != 0 && pud_val(old_pud) != READ_ONCE(pud_val(*pudp))); @@ -378,14 +400,18 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, phys += next - addr; } while (pudp++, addr = next, addr != end); +out: pud_clear_fixmap(); + + return ret; } -static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, - phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), - int flags) +static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_level), + int flags) { + int ret; unsigned long next; pgd_t pgd = READ_ONCE(*pgdp); p4d_t *p4dp; @@ -397,7 +423,9 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, if (flags & NO_EXEC_MAPPINGS) pgdval |= PGD_TABLE_PXN; BUG_ON(!pgtable_alloc); - p4d_phys = pgtable_alloc(P4D_SHIFT); + p4d_phys = pgtable_alloc(PGTABLE_LEVEL_P4D); + if (p4d_phys == INVALID_PHYS_ADDR) + return -ENOMEM; p4dp = p4d_set_fixmap(p4d_phys); init_clear_pgtable(p4dp); p4dp += p4d_index(addr); @@ -412,8 +440,10 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, next = p4d_addr_end(addr, end); - alloc_init_pud(p4dp, addr, next, phys, prot, - pgtable_alloc, flags); + ret = alloc_init_pud(p4dp, addr, next, phys, prot, + pgtable_alloc, flags); + if (ret) + goto out; BUG_ON(p4d_val(old_p4d) != 0 && p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp))); @@ -421,15 +451,19 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, phys += next - addr; } while (p4dp++, addr = next, addr != end); +out: p4d_clear_fixmap(); + + return ret; } -static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, - unsigned long virt, phys_addr_t size, - pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), - int flags) +static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_level), + int flags) { + int ret; unsigned long addr, end, next; pgd_t *pgdp = pgd_offset_pgd(pgdir, virt); @@ -438,7 +472,7 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, * within a page, we cannot map the region as the caller expects. */ if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) - return; + return -EINVAL; phys &= PAGE_MASK; addr = virt & PAGE_MASK; @@ -446,59 +480,511 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, do { next = pgd_addr_end(addr, end); - alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc, - flags); + ret = alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc, + flags); + if (ret) + return ret; phys += next - addr; } while (pgdp++, addr = next, addr != end); + + return 0; } -static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, - unsigned long virt, phys_addr_t size, - pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), - int flags) +static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_level), + int flags) { + int ret; + mutex_lock(&fixmap_lock); - __create_pgd_mapping_locked(pgdir, phys, virt, size, prot, - pgtable_alloc, flags); + ret = __create_pgd_mapping_locked(pgdir, phys, virt, size, prot, + pgtable_alloc, flags); mutex_unlock(&fixmap_lock); + + return ret; } -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -extern __alias(__create_pgd_mapping_locked) -void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, - phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), int flags); -#endif +static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_level), + int flags) +{ + int ret; -static phys_addr_t __pgd_pgtable_alloc(int shift) + ret = __create_pgd_mapping(pgdir, phys, virt, size, prot, pgtable_alloc, + flags); + if (ret) + panic("Failed to create page tables\n"); +} + +static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, + enum pgtable_level pgtable_level) { /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */ - void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO); + struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0); + phys_addr_t pa; + + if (!ptdesc) + return INVALID_PHYS_ADDR; + + pa = page_to_phys(ptdesc_page(ptdesc)); + + switch (pgtable_level) { + case PGTABLE_LEVEL_PTE: + BUG_ON(!pagetable_pte_ctor(mm, ptdesc)); + break; + case PGTABLE_LEVEL_PMD: + BUG_ON(!pagetable_pmd_ctor(mm, ptdesc)); + break; + case PGTABLE_LEVEL_PUD: + pagetable_pud_ctor(ptdesc); + break; + case PGTABLE_LEVEL_P4D: + pagetable_p4d_ctor(ptdesc); + break; + case PGTABLE_LEVEL_PGD: + VM_WARN_ON(1); + break; + } + + return pa; +} + +static phys_addr_t +pgd_pgtable_alloc_init_mm_gfp(enum pgtable_level pgtable_level, gfp_t gfp) +{ + return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_level); +} - BUG_ON(!ptr); - return __pa(ptr); +static phys_addr_t __maybe_unused +pgd_pgtable_alloc_init_mm(enum pgtable_level pgtable_level) +{ + return pgd_pgtable_alloc_init_mm_gfp(pgtable_level, GFP_PGTABLE_KERNEL); +} + +static phys_addr_t +pgd_pgtable_alloc_special_mm(enum pgtable_level pgtable_level) +{ + return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_level); } -static phys_addr_t pgd_pgtable_alloc(int shift) +static void split_contpte(pte_t *ptep) { - phys_addr_t pa = __pgd_pgtable_alloc(shift); - struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa)); + int i; + + ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); + for (i = 0; i < CONT_PTES; i++, ptep++) + __set_pte(ptep, pte_mknoncont(__ptep_get(ptep))); +} + +static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont) +{ + pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; + unsigned long pfn = pmd_pfn(pmd); + pgprot_t prot = pmd_pgprot(pmd); + phys_addr_t pte_phys; + pte_t *ptep; + int i; + + pte_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PTE, gfp); + if (pte_phys == INVALID_PHYS_ADDR) + return -ENOMEM; + ptep = (pte_t *)phys_to_virt(pte_phys); + + if (pgprot_val(prot) & PMD_SECT_PXN) + tableprot |= PMD_TABLE_PXN; + + prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE); + if (!pmd_valid(pmd)) + prot = pte_pgprot(pte_mkinvalid(pfn_pte(0, prot))); + prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); + if (to_cont) + prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++) + __set_pte(ptep, pfn_pte(pfn, prot)); /* - * Call proper page table ctor in case later we need to - * call core mm functions like apply_to_page_range() on - * this pre-allocated page table. + * Ensure the pte entries are visible to the table walker by the time + * the pmd entry that points to the ptes is visible. + */ + dsb(ishst); + __pmd_populate(pmdp, pte_phys, tableprot); + + return 0; +} + +static void split_contpmd(pmd_t *pmdp) +{ + int i; + + pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS); + for (i = 0; i < CONT_PMDS; i++, pmdp++) + set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp))); +} + +static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont) +{ + pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; + unsigned int step = PMD_SIZE >> PAGE_SHIFT; + unsigned long pfn = pud_pfn(pud); + pgprot_t prot = pud_pgprot(pud); + phys_addr_t pmd_phys; + pmd_t *pmdp; + int i; + + pmd_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PMD, gfp); + if (pmd_phys == INVALID_PHYS_ADDR) + return -ENOMEM; + pmdp = (pmd_t *)phys_to_virt(pmd_phys); + + if (pgprot_val(prot) & PMD_SECT_PXN) + tableprot |= PUD_TABLE_PXN; + + prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); + if (!pud_valid(pud)) + prot = pmd_pgprot(pmd_mkinvalid(pfn_pmd(0, prot))); + prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); + if (to_cont) + prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step) + set_pmd(pmdp, pfn_pmd(pfn, prot)); + + /* + * Ensure the pmd entries are visible to the table walker by the time + * the pud entry that points to the pmds is visible. + */ + dsb(ishst); + __pud_populate(pudp, pmd_phys, tableprot); + + return 0; +} + +static int split_kernel_leaf_mapping_locked(unsigned long addr) +{ + pgd_t *pgdp, pgd; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + int ret = 0; + + /* + * PGD: If addr is PGD aligned then addr already describes a leaf + * boundary. If not present then there is nothing to split. + */ + if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr) + goto out; + pgdp = pgd_offset_k(addr); + pgd = pgdp_get(pgdp); + if (!pgd_present(pgd)) + goto out; + + /* + * P4D: If addr is P4D aligned then addr already describes a leaf + * boundary. If not present then there is nothing to split. + */ + if (ALIGN_DOWN(addr, P4D_SIZE) == addr) + goto out; + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + if (!p4d_present(p4d)) + goto out; + + /* + * PUD: If addr is PUD aligned then addr already describes a leaf + * boundary. If not present then there is nothing to split. Otherwise, + * if we have a pud leaf, split to contpmd. + */ + if (ALIGN_DOWN(addr, PUD_SIZE) == addr) + goto out; + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + if (!pud_present(pud)) + goto out; + if (pud_leaf(pud)) { + ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true); + if (ret) + goto out; + } + + /* + * CONTPMD: If addr is CONTPMD aligned then addr already describes a + * leaf boundary. If not present then there is nothing to split. + * Otherwise, if we have a contpmd leaf, split to pmd. + */ + if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr) + goto out; + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + if (!pmd_present(pmd)) + goto out; + if (pmd_leaf(pmd)) { + if (pmd_cont(pmd)) + split_contpmd(pmdp); + /* + * PMD: If addr is PMD aligned then addr already describes a + * leaf boundary. Otherwise, split to contpte. + */ + if (ALIGN_DOWN(addr, PMD_SIZE) == addr) + goto out; + ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true); + if (ret) + goto out; + } + + /* + * CONTPTE: If addr is CONTPTE aligned then addr already describes a + * leaf boundary. If not present then there is nothing to split. + * Otherwise, if we have a contpte leaf, split to pte. + */ + if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr) + goto out; + ptep = pte_offset_kernel(pmdp, addr); + pte = __ptep_get(ptep); + if (!pte_present(pte)) + goto out; + if (pte_cont(pte)) + split_contpte(ptep); + +out: + return ret; +} + +static inline bool force_pte_mapping(void) +{ + const bool bbml2 = system_capabilities_finalized() ? + system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort(); + + if (debug_pagealloc_enabled()) + return true; + if (bbml2) + return false; + return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world(); +} + +static DEFINE_MUTEX(pgtable_split_lock); +static bool linear_map_requires_bbml2; + +int split_kernel_leaf_mapping(unsigned long start, unsigned long end) +{ + int ret; + + /* + * If the region is within a pte-mapped area, there is no need to try to + * split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may + * change permissions from atomic context so for those cases (which are + * always pte-mapped), we must not go any further because taking the + * mutex below may sleep. Do not call force_pte_mapping() here because + * it could return a confusing result if called from a secondary cpu + * prior to finalizing caps. Instead, linear_map_requires_bbml2 gives us + * what we need. + */ + if (!linear_map_requires_bbml2 || is_kfence_address((void *)start)) + return 0; + + if (!system_supports_bbml2_noabort()) { + /* + * !BBML2_NOABORT systems should not be trying to change + * permissions on anything that is not pte-mapped in the first + * place. Just return early and let the permission change code + * raise a warning if not already pte-mapped. + */ + if (system_capabilities_finalized()) + return 0; + + /* + * Boot-time: split_kernel_leaf_mapping_locked() allocates from + * page allocator. Can't split until it's available. + */ + if (WARN_ON(!page_alloc_available)) + return -EBUSY; + + /* + * Boot-time: Started secondary cpus but don't know if they + * support BBML2_NOABORT yet. Can't allow splitting in this + * window in case they don't. + */ + if (WARN_ON(num_online_cpus() > 1)) + return -EBUSY; + } + + /* + * Ensure start and end are at least page-aligned since this is the + * finest granularity we can split to. + */ + if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end)) + return -EINVAL; + + mutex_lock(&pgtable_split_lock); + lazy_mmu_mode_enable(); + + /* + * The split_kernel_leaf_mapping_locked() may sleep, it is not a + * problem for ARM64 since ARM64's lazy MMU implementation allows + * sleeping. * - * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is - * folded, and if so pagetable_pte_ctor() becomes nop. + * Optimize for the common case of splitting out a single page from a + * larger mapping. Here we can just split on the "least aligned" of + * start and end and this will guarantee that there must also be a split + * on the more aligned address since the both addresses must be in the + * same contpte block and it must have been split to ptes. */ - if (shift == PAGE_SHIFT) - BUG_ON(!pagetable_pte_ctor(ptdesc)); - else if (shift == PMD_SHIFT) - BUG_ON(!pagetable_pmd_ctor(ptdesc)); + if (end - start == PAGE_SIZE) { + start = __ffs(start) < __ffs(end) ? start : end; + ret = split_kernel_leaf_mapping_locked(start); + } else { + ret = split_kernel_leaf_mapping_locked(start); + if (!ret) + ret = split_kernel_leaf_mapping_locked(end); + } - return pa; + lazy_mmu_mode_disable(); + mutex_unlock(&pgtable_split_lock); + return ret; +} + +static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + gfp_t gfp = *(gfp_t *)walk->private; + pud_t pud = pudp_get(pudp); + int ret = 0; + + if (pud_leaf(pud)) + ret = split_pud(pudp, pud, gfp, false); + + return ret; +} + +static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + gfp_t gfp = *(gfp_t *)walk->private; + pmd_t pmd = pmdp_get(pmdp); + int ret = 0; + + if (pmd_leaf(pmd)) { + if (pmd_cont(pmd)) + split_contpmd(pmdp); + ret = split_pmd(pmdp, pmd, gfp, false); + + /* + * We have split the pmd directly to ptes so there is no need to + * visit each pte to check if they are contpte. + */ + walk->action = ACTION_CONTINUE; + } + + return ret; +} + +static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t pte = __ptep_get(ptep); + + if (pte_cont(pte)) + split_contpte(ptep); + + return 0; +} + +static const struct mm_walk_ops split_to_ptes_ops = { + .pud_entry = split_to_ptes_pud_entry, + .pmd_entry = split_to_ptes_pmd_entry, + .pte_entry = split_to_ptes_pte_entry, +}; + +static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp) +{ + int ret; + + lazy_mmu_mode_enable(); + ret = walk_kernel_page_table_range_lockless(start, end, + &split_to_ptes_ops, NULL, &gfp); + lazy_mmu_mode_disable(); + + return ret; +} + +u32 idmap_kpti_bbml2_flag; + +static void __init init_idmap_kpti_bbml2_flag(void) +{ + WRITE_ONCE(idmap_kpti_bbml2_flag, 1); + /* Must be visible to other CPUs before stop_machine() is called. */ + smp_mb(); +} + +static int __init linear_map_split_to_ptes(void *__unused) +{ + /* + * Repainting the linear map must be done by CPU0 (the boot CPU) because + * that's the only CPU that we know supports BBML2. The other CPUs will + * be held in a waiting area with the idmap active. + */ + if (!smp_processor_id()) { + unsigned long lstart = _PAGE_OFFSET(vabits_actual); + unsigned long lend = PAGE_END; + unsigned long kstart = (unsigned long)lm_alias(_stext); + unsigned long kend = (unsigned long)lm_alias(__init_begin); + int ret; + + /* + * Wait for all secondary CPUs to be put into the waiting area. + */ + smp_cond_load_acquire(&idmap_kpti_bbml2_flag, VAL == num_online_cpus()); + + /* + * Walk all of the linear map [lstart, lend), except the kernel + * linear map alias [kstart, kend), and split all mappings to + * PTE. The kernel alias remains static throughout runtime so + * can continue to be safely mapped with large mappings. + */ + ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC); + if (!ret) + ret = range_split_to_ptes(kend, lend, GFP_ATOMIC); + if (ret) + panic("Failed to split linear map\n"); + flush_tlb_kernel_range(lstart, lend); + + /* + * Relies on dsb in flush_tlb_kernel_range() to avoid reordering + * before any page table split operations. + */ + WRITE_ONCE(idmap_kpti_bbml2_flag, 0); + } else { + typedef void (wait_split_fn)(void); + extern wait_split_fn wait_linear_map_split_to_ptes; + wait_split_fn *wait_fn; + + wait_fn = (void *)__pa_symbol(wait_linear_map_split_to_ptes); + + /* + * At least one secondary CPU doesn't support BBML2 so cannot + * tolerate the size of the live mappings changing. So have the + * secondary CPUs wait for the boot CPU to make the changes + * with the idmap active and init_mm inactive. + */ + cpu_install_idmap(); + wait_fn(); + cpu_uninstall_idmap(); + } + + return 0; +} + +void __init linear_map_maybe_split_to_ptes(void) +{ + if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()) { + init_idmap_kpti_bbml2_flag(); + stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask); + } } /* @@ -514,8 +1000,8 @@ void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, - NO_CONT_MAPPINGS); + early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, + NO_CONT_MAPPINGS); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, @@ -529,8 +1015,8 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, if (page_mappings_only) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; - __create_pgd_mapping(mm->pgd, phys, virt, size, prot, - pgd_pgtable_alloc, flags); + early_create_pgd_mapping(mm->pgd, phys, virt, size, prot, + pgd_pgtable_alloc_special_mm, flags); } static void update_mapping_prot(phys_addr_t phys, unsigned long virt, @@ -542,8 +1028,8 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, - NO_CONT_MAPPINGS); + early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, + NO_CONT_MAPPINGS); /* flush the TLBs after updating live kernel mappings */ flush_tlb_kernel_range(virt, virt + size); @@ -552,8 +1038,8 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start, phys_addr_t end, pgprot_t prot, int flags) { - __create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start, - prot, early_pgtable_alloc, flags); + early_create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start, + prot, early_pgtable_alloc, flags); } void __init mark_linear_text_alias_ro(void) @@ -561,8 +1047,8 @@ void __init mark_linear_text_alias_ro(void) /* * Remove the write permissions from the linear alias of .text/.rodata */ - update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext), - (unsigned long)__init_begin - (unsigned long)_stext, + update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), + (unsigned long)__init_begin - (unsigned long)_text, PAGE_KERNEL_RO); } @@ -613,6 +1099,33 @@ static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); __kfence_pool = phys_to_virt(kfence_pool); } + +bool arch_kfence_init_pool(void) +{ + unsigned long start = (unsigned long)__kfence_pool; + unsigned long end = start + KFENCE_POOL_SIZE; + int ret; + + /* Exit early if we know the linear map is already pte-mapped. */ + if (force_pte_mapping()) + return true; + + /* Kfence pool is already pte-mapped for the early init case. */ + if (kfence_early_init) + return true; + + mutex_lock(&pgtable_split_lock); + ret = range_split_to_ptes(start, end, GFP_PGTABLE_KERNEL); + mutex_unlock(&pgtable_split_lock); + + /* + * Since the system supports bbml2_noabort, tlb invalidation is not + * required here; the pgtable mappings have been split to pte but larger + * entries may safely linger in the TLB. + */ + + return !ret; +} #else /* CONFIG_KFENCE */ static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; } @@ -623,7 +1136,7 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { static void __init map_mem(pgd_t *pgdp) { static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); - phys_addr_t kernel_start = __pa_symbol(_stext); + phys_addr_t kernel_start = __pa_symbol(_text); phys_addr_t kernel_end = __pa_symbol(__init_begin); phys_addr_t start, end; phys_addr_t early_kfence_pool; @@ -645,7 +1158,9 @@ static void __init map_mem(pgd_t *pgdp) early_kfence_pool = arm64_kfence_alloc_pool(); - if (can_set_direct_map()) + linear_map_requires_bbml2 = !force_pte_mapping() && can_set_direct_map(); + + if (force_pte_mapping()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* @@ -670,7 +1185,7 @@ static void __init map_mem(pgd_t *pgdp) } /* - * Map the linear alias of the [_stext, __init_begin) interval + * Map the linear alias of the [_text, __init_begin) interval * as non-executable now, and remove the write permission in * mark_linear_text_alias_ro() below (which will be called after * alternative patching has completed). This makes the contents @@ -697,6 +1212,10 @@ void mark_rodata_ro(void) WRITE_ONCE(rodata_is_rw, false); update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); + /* mark the range between _text and _stext as read only. */ + update_mapping_prot(__pa_symbol(_text), (unsigned long)_text, + (unsigned long)_stext - (unsigned long)_text, + PAGE_KERNEL_RO); } static void __init declare_vma(struct vm_struct *vma, @@ -722,7 +1241,97 @@ static void __init declare_vma(struct vm_struct *vma, } #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -static pgprot_t kernel_exec_prot(void) +#define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT)) + +static phys_addr_t kpti_ng_temp_alloc __initdata; + +static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_level pgtable_level) +{ + kpti_ng_temp_alloc -= PAGE_SIZE; + return kpti_ng_temp_alloc; +} + +static int __init __kpti_install_ng_mappings(void *__unused) +{ + typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long); + extern kpti_remap_fn idmap_kpti_install_ng_mappings; + kpti_remap_fn *remap_fn; + + int cpu = smp_processor_id(); + int levels = CONFIG_PGTABLE_LEVELS; + int order = order_base_2(levels); + u64 kpti_ng_temp_pgd_pa = 0; + pgd_t *kpti_ng_temp_pgd; + u64 alloc = 0; + + if (levels == 5 && !pgtable_l5_enabled()) + levels = 4; + else if (levels == 4 && !pgtable_l4_enabled()) + levels = 3; + + remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); + + if (!cpu) { + int ret; + + alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); + kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE); + kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd); + + // + // Create a minimal page table hierarchy that permits us to map + // the swapper page tables temporarily as we traverse them. + // + // The physical pages are laid out as follows: + // + // +--------+-/-------+-/------ +-/------ +-\\\--------+ + // : PTE[] : | PMD[] : | PUD[] : | P4D[] : ||| PGD[] : + // +--------+-\-------+-\------ +-\------ +-///--------+ + // ^ + // The first page is mapped into this hierarchy at a PMD_SHIFT + // aligned virtual address, so that we can manipulate the PTE + // level entries while the mapping is active. The first entry + // covers the PTE[] page itself, the remaining entries are free + // to be used as a ad-hoc fixmap. + // + ret = __create_pgd_mapping_locked(kpti_ng_temp_pgd, __pa(alloc), + KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL, + kpti_ng_pgd_alloc, 0); + if (ret) + panic("Failed to create page tables\n"); + } + + cpu_install_idmap(); + remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA); + cpu_uninstall_idmap(); + + if (!cpu) { + free_pages(alloc, order); + arm64_use_ng_mappings = true; + } + + return 0; +} + +void __init kpti_install_ng_mappings(void) +{ + /* Check whether KPTI is going to be used */ + if (!arm64_kernel_unmapped_at_el0()) + return; + + /* + * We don't need to rewrite the page-tables if either we've done + * it already or we have KASLR enabled and therefore have not + * created any global mappings at all. + */ + if (arm64_use_ng_mappings) + return; + + init_idmap_kpti_bbml2_flag(); + stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask); +} + +static pgprot_t __init kernel_exec_prot(void) { return rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; } @@ -742,9 +1351,9 @@ static int __init map_entry_trampoline(void) /* Map only the text into the trampoline page table */ memset(tramp_pg_dir, 0, PGD_SIZE); - __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, - entry_tramp_text_size(), prot, - __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS); + early_create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, + entry_tramp_text_size(), prot, + pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS); /* Map both the text and data into the kernel page table */ for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++) @@ -767,38 +1376,41 @@ static void __init declare_kernel_vmas(void) { static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT]; - declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD); + declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD); declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD); declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD); declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD); declare_vma(&vmlinux_seg[4], _data, _end, 0); } -void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, - int level, pte_t *tbl, bool may_use_cont, u64 va_offset); +void __pi_map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset); static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init, - kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; + kpti_bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; static void __init create_idmap(void) { - u64 start = __pa_symbol(__idmap_text_start); - u64 end = __pa_symbol(__idmap_text_end); - u64 ptep = __pa_symbol(idmap_ptes); + phys_addr_t start = __pa_symbol(__idmap_text_start); + phys_addr_t end = __pa_symbol(__idmap_text_end); + phys_addr_t ptep = __pa_symbol(idmap_ptes); __pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX, IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, __phys_to_virt(ptep) - ptep); - if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) { - extern u32 __idmap_kpti_flag; - u64 pa = __pa_symbol(&__idmap_kpti_flag); + if (linear_map_requires_bbml2 || + (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings)) { + phys_addr_t pa = __pa_symbol(&idmap_kpti_bbml2_flag); /* * The KPTI G-to-nG conversion code needs a read-write mapping - * of its synchronization flag in the ID map. + * of its synchronization flag in the ID map. This is also used + * when splitting the linear map to ptes if a secondary CPU + * doesn't support bbml2. */ - ptep = __pa_symbol(kpti_ptes); + ptep = __pa_symbol(kpti_bbml2_ptes); __pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL, IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, __phys_to_virt(ptep) - ptep); @@ -823,7 +1435,7 @@ static void free_hotplug_page_range(struct page *page, size_t size, vmem_altmap_free(altmap, size >> PAGE_SHIFT); } else { WARN_ON(PageReserved(page)); - free_pages((unsigned long)page_address(page), get_order(size)); + __free_pages(page, get_order(size)); } } @@ -865,10 +1477,14 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, WARN_ON(!pte_present(pte)); __pte_clear(&init_mm, addr, ptep); - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - if (free_mapped) + if (free_mapped) { + /* CONT blocks are not supported in the vmemmap */ + WARN_ON(pte_cont(pte)); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); free_hotplug_page_range(pte_page(pte), PAGE_SIZE, altmap); + } + /* unmap_hotplug_range() flushes TLB for !free_mapped */ } while (addr += PAGE_SIZE, addr < end); } @@ -887,17 +1503,16 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr, continue; WARN_ON(!pmd_present(pmd)); - if (pmd_sect(pmd)) { + if (pmd_leaf(pmd)) { pmd_clear(pmdp); - - /* - * One TLBI should be sufficient here as the PMD_SIZE - * range is mapped with a single block entry. - */ - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - if (free_mapped) + if (free_mapped) { + /* CONT blocks are not supported in the vmemmap */ + WARN_ON(pmd_cont(pmd)); + flush_tlb_kernel_range(addr, addr + PMD_SIZE); free_hotplug_page_range(pmd_page(pmd), PMD_SIZE, altmap); + } + /* unmap_hotplug_range() flushes TLB for !free_mapped */ continue; } WARN_ON(!pmd_table(pmd)); @@ -920,17 +1535,14 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr, continue; WARN_ON(!pud_present(pud)); - if (pud_sect(pud)) { + if (pud_leaf(pud)) { pud_clear(pudp); - - /* - * One TLBI should be sufficient here as the PUD_SIZE - * range is mapped with a single block entry. - */ - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - if (free_mapped) + if (free_mapped) { + flush_tlb_kernel_range(addr, addr + PUD_SIZE); free_hotplug_page_range(pud_page(pud), PUD_SIZE, altmap); + } + /* unmap_hotplug_range() flushes TLB for !free_mapped */ continue; } WARN_ON(!pud_table(pud)); @@ -960,6 +1572,7 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr, static void unmap_hotplug_range(unsigned long addr, unsigned long end, bool free_mapped, struct vmem_altmap *altmap) { + unsigned long start = addr; unsigned long next; pgd_t *pgdp, pgd; @@ -981,6 +1594,9 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end, WARN_ON(!pgd_present(pgd)); unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap); } while (addr = next, addr < end); + + if (!free_mapped) + flush_tlb_kernel_range(start, end); } static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, @@ -1034,7 +1650,7 @@ static void free_empty_pmd_table(pud_t *pudp, unsigned long addr, if (pmd_none(pmd)) continue; - WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd)); + WARN_ON(!pmd_present(pmd) || !pmd_table(pmd)); free_empty_pte_table(pmdp, addr, next, floor, ceiling); } while (addr = next, addr < end); @@ -1074,7 +1690,7 @@ static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr, if (pud_none(pud)) continue; - WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud)); + WARN_ON(!pud_present(pud) || !pud_table(pud)); free_empty_pmd_table(pudp, addr, next, floor, ceiling); } while (addr = next, addr < end); @@ -1170,7 +1786,7 @@ int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, { vmemmap_verify((pte_t *)pmdp, node, addr, next); - return pmd_sect(READ_ONCE(*pmdp)); + return pmd_leaf(READ_ONCE(*pmdp)); } int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, @@ -1234,7 +1850,7 @@ void p4d_clear_huge(p4d_t *p4dp) int pud_clear_huge(pud_t *pudp) { - if (!pud_sect(READ_ONCE(*pudp))) + if (!pud_leaf(READ_ONCE(*pudp))) return 0; pud_clear(pudp); return 1; @@ -1242,13 +1858,14 @@ int pud_clear_huge(pud_t *pudp) int pmd_clear_huge(pmd_t *pmdp) { - if (!pmd_sect(READ_ONCE(*pmdp))) + if (!pmd_leaf(READ_ONCE(*pmdp))) return 0; pmd_clear(pmdp); return 1; } -int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) +static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr, + bool acquire_mmap_lock) { pte_t *table; pmd_t pmd; @@ -1260,13 +1877,25 @@ int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) return 1; } + /* See comment in pud_free_pmd_page for static key logic */ table = pte_offset_kernel(pmdp, addr); pmd_clear(pmdp); __flush_tlb_kernel_pgtable(addr); + if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) { + mmap_read_lock(&init_mm); + mmap_read_unlock(&init_mm); + } + pte_free_kernel(NULL, table); return 1; } +int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) +{ + /* If ptdump is walking the pagetables, acquire init_mm.mmap_lock */ + return __pmd_free_pte_page(pmdp, addr, /* acquire_mmap_lock = */ true); +} + int pud_free_pmd_page(pud_t *pudp, unsigned long addr) { pmd_t *table; @@ -1282,15 +1911,36 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr) } table = pmd_offset(pudp, addr); + + /* + * Our objective is to prevent ptdump from reading a PMD table which has + * been freed. In this race, if pud_free_pmd_page observes the key on + * (which got flipped by ptdump) then the mmap lock sequence here will, + * as a result of the mmap write lock/unlock sequence in ptdump, give + * us the correct synchronization. If not, this means that ptdump has + * yet not started walking the pagetables - the sequence of barriers + * issued by __flush_tlb_kernel_pgtable() guarantees that ptdump will + * observe an empty PUD. + */ + pud_clear(pudp); + __flush_tlb_kernel_pgtable(addr); + if (static_branch_unlikely(&arm64_ptdump_lock_key)) { + mmap_read_lock(&init_mm); + mmap_read_unlock(&init_mm); + } + pmdp = table; next = addr; end = addr + PUD_SIZE; do { - pmd_free_pte_page(pmdp, next); + if (pmd_present(pmdp_get(pmdp))) + /* + * PMD has been isolated, so ptdump won't see it. No + * need to acquire init_mm.mmap_lock. + */ + __pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false); } while (pmdp++, next += PMD_SIZE, next != end); - pud_clear(pudp); - __flush_tlb_kernel_pgtable(addr); pmd_free(NULL, table); return 1; } @@ -1310,8 +1960,8 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) struct range arch_get_mappable_range(void) { struct range mhp_range; - u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); - u64 end_linear_pa = __pa(PAGE_END - 1); + phys_addr_t start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); + phys_addr_t end_linear_pa = __pa(PAGE_END - 1); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { /* @@ -1346,25 +1996,31 @@ int arch_add_memory(int nid, u64 start, u64 size, VM_BUG_ON(!mhp_range_allowed(start, size, true)); - if (can_set_direct_map()) + if (force_pte_mapping()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; - __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), - size, params->pgprot, __pgd_pgtable_alloc, - flags); + ret = __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), + size, params->pgprot, pgd_pgtable_alloc_init_mm, + flags); + if (ret) + goto err; memblock_clear_nomap(start, size); ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params); if (ret) - __remove_pgd_mapping(swapper_pg_dir, - __phys_to_virt(start), size); - else { - max_pfn = PFN_UP(start + size); - max_low_pfn = max_pfn; - } + goto err; + + /* Address of hotplugged memory can be smaller */ + max_pfn = max(max_pfn, PFN_UP(start + size)); + max_low_pfn = max_pfn; + return 0; + +err: + __remove_pgd_mapping(swapper_pg_dir, + __phys_to_virt(start), size); return ret; } @@ -1377,6 +2033,107 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size); } + +static bool addr_splits_kernel_leaf(unsigned long addr) +{ + pgd_t *pgdp, pgd; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + + /* + * If the given address points at a the start address of + * a possible leaf, we certainly won't split. Otherwise, + * check if we would actually split a leaf by traversing + * the page tables further. + */ + if (IS_ALIGNED(addr, PGDIR_SIZE)) + return false; + + pgdp = pgd_offset_k(addr); + pgd = pgdp_get(pgdp); + if (!pgd_present(pgd)) + return false; + + if (IS_ALIGNED(addr, P4D_SIZE)) + return false; + + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + if (!p4d_present(p4d)) + return false; + + if (IS_ALIGNED(addr, PUD_SIZE)) + return false; + + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + if (!pud_present(pud)) + return false; + + if (pud_leaf(pud)) + return true; + + if (IS_ALIGNED(addr, CONT_PMD_SIZE)) + return false; + + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + if (!pmd_present(pmd)) + return false; + + if (pmd_cont(pmd)) + return true; + + if (IS_ALIGNED(addr, PMD_SIZE)) + return false; + + if (pmd_leaf(pmd)) + return true; + + if (IS_ALIGNED(addr, CONT_PTE_SIZE)) + return false; + + ptep = pte_offset_kernel(pmdp, addr); + pte = __ptep_get(ptep); + if (!pte_present(pte)) + return false; + + if (pte_cont(pte)) + return true; + + return !IS_ALIGNED(addr, PAGE_SIZE); +} + +static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long phys_start, phys_end, start, end; + + phys_start = PFN_PHYS(pfn); + phys_end = phys_start + nr_pages * PAGE_SIZE; + + /* PFN range's linear map edges are leaf entry aligned */ + start = __phys_to_virt(phys_start); + end = __phys_to_virt(phys_end); + if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) { + pr_warn("[%lx %lx] splits a leaf entry in linear map\n", + phys_start, phys_end); + return false; + } + + /* PFN range's vmemmap edges are leaf entry aligned */ + BUILD_BUG_ON(!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)); + start = (unsigned long)pfn_to_page(pfn); + end = (unsigned long)pfn_to_page(pfn + nr_pages); + if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) { + pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n", + phys_start, phys_end); + return false; + } + return true; +} + /* * This memory hotplug notifier helps prevent boot memory from being * inadvertently removed as it blocks pfn range offlining process in @@ -1385,8 +2142,11 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) * In future if and when boot memory could be removed, this notifier * should be dropped and free_hotplug_page_range() should handle any * reserved pages allocated during boot. + * + * This also blocks any memory remove that would have caused a split + * in leaf entry in kernel linear or vmemmap mapping. */ -static int prevent_bootmem_remove_notifier(struct notifier_block *nb, +static int prevent_memory_remove_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct mem_section *ms; @@ -1432,11 +2192,15 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb, return NOTIFY_DONE; } } + + if (!can_unmap_without_split(pfn, arg->nr_pages)) + return NOTIFY_BAD; + return NOTIFY_OK; } -static struct notifier_block prevent_bootmem_remove_nb = { - .notifier_call = prevent_bootmem_remove_notifier, +static struct notifier_block prevent_memory_remove_nb = { + .notifier_call = prevent_memory_remove_notifier, }; /* @@ -1486,7 +2250,7 @@ static void validate_bootmem_online(void) } } -static int __init prevent_bootmem_remove_init(void) +static int __init prevent_memory_remove_init(void) { int ret = 0; @@ -1494,33 +2258,50 @@ static int __init prevent_bootmem_remove_init(void) return ret; validate_bootmem_online(); - ret = register_memory_notifier(&prevent_bootmem_remove_nb); + ret = register_memory_notifier(&prevent_memory_remove_nb); if (ret) pr_err("%s: Notifier registration failed %d\n", __func__, ret); return ret; } -early_initcall(prevent_bootmem_remove_init); +early_initcall(prevent_memory_remove_init); #endif -pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) +pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, unsigned int nr) { + pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr); + if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { /* * Break-before-make (BBM) is required for all user space mappings * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(ptep_get(ptep))) - return ptep_clear_flush(vma, addr, ptep); + if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte)) + __flush_tlb_range(vma, addr, nr * PAGE_SIZE, + PAGE_SIZE, 3, TLBF_NOWALKCACHE); } - return ptep_get_and_clear(vma->vm_mm, addr, ptep); + + return pte; +} + +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) +{ + return modify_prot_start_ptes(vma, addr, ptep, 1); +} + +void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte, + unsigned int nr) +{ + set_ptes(vma->vm_mm, addr, ptep, pte, nr); } void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { - set_pte_at(vma->vm_mm, addr, ptep, pte); + modify_prot_commit_ptes(vma, addr, ptep, old_pte, pte, 1); } /* @@ -1538,7 +2319,7 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); if (cnp) - ttbr1 |= TTBR_CNP_BIT; + ttbr1 |= TTBRx_EL1_CnP; replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); @@ -1556,11 +2337,10 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) } #ifdef CONFIG_ARCH_HAS_PKEYS -int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) +int arch_set_user_pkey_access(int pkey, unsigned long init_val) { - u64 new_por = POE_RXW; + u64 new_por; u64 old_por; - u64 pkey_shift; if (!system_supports_poe()) return -ENOSPC; @@ -1574,7 +2354,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i return -EINVAL; /* Set the bits we need in POR: */ - new_por = POE_RXW; + new_por = POE_RWX; if (init_val & PKEY_DISABLE_WRITE) new_por &= ~POE_W; if (init_val & PKEY_DISABLE_ACCESS) @@ -1585,12 +2365,11 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i new_por &= ~POE_X; /* Shift the bits in to the correct place in POR for pkey: */ - pkey_shift = pkey * POR_BITS_PER_PKEY; - new_por <<= pkey_shift; + new_por = POR_ELx_PERM_PREP(pkey, new_por); /* Get old POR and mask off any old bits in place: */ old_por = read_sysreg_s(SYS_POR_EL0); - old_por &= ~(POE_MASK << pkey_shift); + old_por &= ~(POE_MASK << POR_ELx_PERM_SHIFT(pkey)); /* Write old part along with new part: */ write_sysreg_s(old_por | new_por, SYS_POR_EL0); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 39fd1f7ff02a..ce035e1b4eaf 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -8,6 +8,7 @@ #include <linux/mem_encrypt.h> #include <linux/sched.h> #include <linux/vmalloc.h> +#include <linux/pagewalk.h> #include <asm/cacheflush.h> #include <asm/pgtable-prot.h> @@ -20,7 +21,71 @@ struct page_change_data { pgprot_t clear_mask; }; -bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED); +static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk) +{ + struct page_change_data *masks = walk->private; + + /* + * Some users clear and set bits which alias each other (e.g. PTE_NG and + * PTE_PRESENT_INVALID). It is therefore important that we always clear + * first then set. + */ + val &= ~(pgprot_val(masks->clear_mask)); + val |= (pgprot_val(masks->set_mask)); + + return val; +} + +static int pageattr_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t val = pudp_get(pud); + + if (pud_leaf(val)) { + if (WARN_ON_ONCE((next - addr) != PUD_SIZE)) + return -EINVAL; + val = __pud(set_pageattr_masks(pud_val(val), walk)); + set_pud(pud, val); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t val = pmdp_get(pmd); + + if (pmd_leaf(val)) { + if (WARN_ON_ONCE((next - addr) != PMD_SIZE)) + return -EINVAL; + val = __pmd(set_pageattr_masks(pmd_val(val), walk)); + set_pmd(pmd, val); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int pageattr_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t val = __ptep_get(pte); + + val = __pte(set_pageattr_masks(pte_val(val), walk)); + __set_pte(pte, val); + + return 0; +} + +static const struct mm_walk_ops pageattr_ops = { + .pud_entry = pageattr_pud_entry, + .pmd_entry = pageattr_pmd_entry, + .pte_entry = pageattr_pte_entry, +}; + +bool rodata_full __ro_after_init = true; bool can_set_direct_map(void) { @@ -37,39 +102,47 @@ bool can_set_direct_map(void) arm64_kfence_can_set_direct_map() || is_realm_world(); } -static int change_page_range(pte_t *ptep, unsigned long addr, void *data) +static int update_range_prot(unsigned long start, unsigned long size, + pgprot_t set_mask, pgprot_t clear_mask) { - struct page_change_data *cdata = data; - pte_t pte = __ptep_get(ptep); + struct page_change_data data; + int ret; + + data.set_mask = set_mask; + data.clear_mask = clear_mask; + + ret = split_kernel_leaf_mapping(start, start + size); + if (WARN_ON_ONCE(ret)) + return ret; - pte = clear_pte_bit(pte, cdata->clear_mask); - pte = set_pte_bit(pte, cdata->set_mask); + lazy_mmu_mode_enable(); - __set_pte(ptep, pte); - return 0; + /* + * The caller must ensure that the range we are operating on does not + * partially overlap a block mapping, or a cont mapping. Any such case + * must be eliminated by splitting the mapping. + */ + ret = walk_kernel_page_table_range_lockless(start, start + size, + &pageattr_ops, NULL, &data); + lazy_mmu_mode_disable(); + + return ret; } -/* - * This function assumes that the range is mapped with PAGE_SIZE pages. - */ static int __change_memory_common(unsigned long start, unsigned long size, - pgprot_t set_mask, pgprot_t clear_mask) + pgprot_t set_mask, pgprot_t clear_mask) { - struct page_change_data data; int ret; - data.set_mask = set_mask; - data.clear_mask = clear_mask; - - ret = apply_to_page_range(&init_mm, start, size, change_page_range, - &data); + ret = update_range_prot(start, size, set_mask, clear_mask); /* - * If the memory is being made valid without changing any other bits - * then a TLBI isn't required as a non-valid entry cannot be cached in - * the TLB. + * If the memory is being switched from present-invalid to valid without + * changing any other bits then a TLBI isn't required as a non-valid + * entry cannot be cached in the TLB. */ - if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask)) + if (pgprot_val(set_mask) != PTE_PRESENT_VALID_KERNEL || + pgprot_val(clear_mask) != PTE_PRESENT_INVALID) flush_tlb_kernel_range(start, start + size); return ret; } @@ -81,7 +154,7 @@ static int change_memory_common(unsigned long addr, int numpages, unsigned long size = PAGE_SIZE * numpages; unsigned long end = start + size; struct vm_struct *area; - int i; + int ret; if (!PAGE_ALIGNED(addr)) { start &= PAGE_MASK; @@ -96,16 +169,17 @@ static int change_memory_common(unsigned long addr, int numpages, * we are operating on does not result in such splitting. * * Let's restrict ourselves to mappings created by vmalloc (or vmap). - * Those are guaranteed to consist entirely of page mappings, and - * splitting is never needed. + * Disallow VM_ALLOW_HUGE_VMAP mappings to guarantee that only page + * mappings are updated and splitting is never needed. * * So check whether the [addr, addr + size) interval is entirely * covered by precisely one VM area that has the VM_ALLOC flag set. */ area = find_vm_area((void *)addr); if (!area || - end > (unsigned long)kasan_reset_tag(area->addr) + area->size || - !(area->flags & VM_ALLOC)) + ((unsigned long)kasan_reset_tag((void *)end) > + (unsigned long)kasan_reset_tag(area->addr) + area->size) || + ((area->flags & (VM_ALLOC | VM_ALLOW_HUGE_VMAP)) != VM_ALLOC)) return -EINVAL; if (!numpages) @@ -117,9 +191,14 @@ static int change_memory_common(unsigned long addr, int numpages, */ if (rodata_full && (pgprot_val(set_mask) == PTE_RDONLY || pgprot_val(clear_mask) == PTE_RDONLY)) { - for (i = 0; i < area->nr_pages; i++) { - __change_memory_common((u64)page_address(area->pages[i]), - PAGE_SIZE, set_mask, clear_mask); + unsigned long idx = ((unsigned long)kasan_reset_tag((void *)start) - + (unsigned long)kasan_reset_tag(area->addr)) + >> PAGE_SHIFT; + for (; numpages; idx++, numpages--) { + ret = __change_memory_common((u64)page_address(area->pages[idx]), + PAGE_SIZE, set_mask, clear_mask); + if (ret) + return ret; } } @@ -164,42 +243,36 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) { if (enable) return __change_memory_common(addr, PAGE_SIZE * numpages, - __pgprot(PTE_VALID), - __pgprot(0)); + __pgprot(PTE_PRESENT_VALID_KERNEL), + __pgprot(PTE_PRESENT_INVALID)); else return __change_memory_common(addr, PAGE_SIZE * numpages, - __pgprot(0), - __pgprot(PTE_VALID)); + __pgprot(PTE_PRESENT_INVALID), + __pgprot(PTE_PRESENT_VALID_KERNEL)); } int set_direct_map_invalid_noflush(struct page *page) { - struct page_change_data data = { - .set_mask = __pgprot(0), - .clear_mask = __pgprot(PTE_VALID), - }; + pgprot_t clear_mask = __pgprot(PTE_PRESENT_VALID_KERNEL); + pgprot_t set_mask = __pgprot(PTE_PRESENT_INVALID); if (!can_set_direct_map()) return 0; - return apply_to_page_range(&init_mm, - (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + return update_range_prot((unsigned long)page_address(page), + PAGE_SIZE, set_mask, clear_mask); } int set_direct_map_default_noflush(struct page *page) { - struct page_change_data data = { - .set_mask = __pgprot(PTE_VALID | PTE_WRITE), - .clear_mask = __pgprot(PTE_RDONLY), - }; + pgprot_t set_mask = __pgprot(PTE_PRESENT_VALID_KERNEL | PTE_WRITE); + pgprot_t clear_mask = __pgprot(PTE_PRESENT_INVALID | PTE_RDONLY); if (!can_set_direct_map()) return 0; - return apply_to_page_range(&init_mm, - (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + return update_range_prot((unsigned long)page_address(page), + PAGE_SIZE, set_mask, clear_mask); } static int __set_memory_enc_dec(unsigned long addr, @@ -229,8 +302,8 @@ static int __set_memory_enc_dec(unsigned long addr, * entries or Synchronous External Aborts caused by RIPAS_EMPTY */ ret = __change_memory_common(addr, PAGE_SIZE * numpages, - __pgprot(set_prot), - __pgprot(clear_prot | PTE_VALID)); + __pgprot(set_prot | PTE_PRESENT_INVALID), + __pgprot(clear_prot | PTE_PRESENT_VALID_KERNEL)); if (ret) return ret; @@ -244,8 +317,8 @@ static int __set_memory_enc_dec(unsigned long addr, return ret; return __change_memory_common(addr, PAGE_SIZE * numpages, - __pgprot(PTE_VALID), - __pgprot(0)); + __pgprot(PTE_PRESENT_VALID_KERNEL), + __pgprot(PTE_PRESENT_INVALID)); } static int realm_set_memory_encrypted(unsigned long addr, int numpages) @@ -337,15 +410,15 @@ bool kernel_page_present(struct page *page) pud = READ_ONCE(*pudp); if (pud_none(pud)) return false; - if (pud_sect(pud)) - return true; + if (pud_leaf(pud)) + return pud_valid(pud); pmdp = pmd_offset(pudp, addr); pmd = READ_ONCE(*pmdp); if (pmd_none(pmd)) return false; - if (pmd_sect(pmd)) - return true; + if (pmd_leaf(pmd)) + return pmd_valid(pmd); ptep = pte_offset_kernel(pmdp, addr); return pte_valid(__ptep_get(ptep)); diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 8160cff35089..bf5110b91e2f 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -56,7 +56,7 @@ void __init pgtable_cache_init(void) * With 52-bit physical addresses, the architecture requires the * top-level table to be aligned to at least 64 bytes. */ - BUILD_BUG_ON(PGD_SIZE < 64); + BUILD_BUG_ON(!IS_ALIGNED(PGD_SIZE, 64)); #endif /* diff --git a/arch/arm64/mm/physaddr.c b/arch/arm64/mm/physaddr.c index cde44c13dda1..7d94e09b01b3 100644 --- a/arch/arm64/mm/physaddr.c +++ b/arch/arm64/mm/physaddr.c @@ -10,7 +10,7 @@ phys_addr_t __virt_to_phys(unsigned long x) { WARN(!__is_lm_address(__tag_reset(x)), - "virt_to_phys used for non-linear address: %pK (%pS)\n", + "virt_to_phys used for non-linear address: %p (%pS)\n", (void *)x, (void *)x); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index fb30c8804f87..22866b49be37 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -23,15 +23,18 @@ #include <asm/sysreg.h> #ifdef CONFIG_ARM64_64K_PAGES -#define TCR_TG_FLAGS TCR_TG0_64K | TCR_TG1_64K +#define TCR_TG_FLAGS ((TCR_EL1_TG0_64K << TCR_EL1_TG0_SHIFT) |\ + (TCR_EL1_TG1_64K << TCR_EL1_TG1_SHIFT)) #elif defined(CONFIG_ARM64_16K_PAGES) -#define TCR_TG_FLAGS TCR_TG0_16K | TCR_TG1_16K +#define TCR_TG_FLAGS ((TCR_EL1_TG0_16K << TCR_EL1_TG0_SHIFT) |\ + (TCR_EL1_TG1_16K << TCR_EL1_TG1_SHIFT)) #else /* CONFIG_ARM64_4K_PAGES */ -#define TCR_TG_FLAGS TCR_TG0_4K | TCR_TG1_4K +#define TCR_TG_FLAGS ((TCR_EL1_TG0_4K << TCR_EL1_TG0_SHIFT) |\ + (TCR_EL1_TG1_4K << TCR_EL1_TG1_SHIFT)) #endif #ifdef CONFIG_RANDOMIZE_BASE -#define TCR_KASLR_FLAGS TCR_NFD1 +#define TCR_KASLR_FLAGS TCR_EL1_NFD1 #else #define TCR_KASLR_FLAGS 0 #endif @@ -40,23 +43,30 @@ #define TCR_CACHE_FLAGS TCR_IRGN_WBWA | TCR_ORGN_WBWA #ifdef CONFIG_KASAN_SW_TAGS -#define TCR_KASAN_SW_FLAGS TCR_TBI1 | TCR_TBID1 +#define TCR_KASAN_SW_FLAGS TCR_EL1_TBI1 | TCR_EL1_TBID1 #else #define TCR_KASAN_SW_FLAGS 0 #endif -#ifdef CONFIG_KASAN_HW_TAGS -#define TCR_MTE_FLAGS TCR_TCMA1 | TCR_TBI1 | TCR_TBID1 -#elif defined(CONFIG_ARM64_MTE) +#ifdef CONFIG_ARM64_MTE /* * The mte_zero_clear_page_tags() implementation uses DC GZVA, which relies on - * TBI being enabled at EL1. + * TBI being enabled at EL1. TCMA1 is needed to treat accesses with the + * match-all tag (0xF) as Tag Unchecked, irrespective of the SCTLR_EL1.TCF + * setting. */ -#define TCR_MTE_FLAGS TCR_TBI1 | TCR_TBID1 +#define TCR_MTE_FLAGS TCR_EL1_TCMA1 | TCR_EL1_TBI1 | TCR_EL1_TBID1 #else #define TCR_MTE_FLAGS 0 #endif +#define TCR_IRGN_WBWA ((TCR_EL1_IRGN0_WBWA << TCR_EL1_IRGN0_SHIFT) |\ + (TCR_EL1_IRGN1_WBWA << TCR_EL1_IRGN1_SHIFT)) +#define TCR_ORGN_WBWA ((TCR_EL1_ORGN0_WBWA << TCR_EL1_ORGN0_SHIFT) |\ + (TCR_EL1_ORGN1_WBWA << TCR_EL1_ORGN1_SHIFT)) +#define TCR_SHARED ((TCR_EL1_SH0_INNER << TCR_EL1_SH0_SHIFT) |\ + (TCR_EL1_SH1_INNER << TCR_EL1_SH1_SHIFT)) + /* * Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory and * changed during mte_cpu_setup to Normal Tagged if the system supports MTE. @@ -100,6 +110,10 @@ SYM_FUNC_START(cpu_do_suspend) * call stack. */ str x18, [x0, #96] +alternative_if ARM64_HAS_TCR2 + mrs x2, REG_TCR2_EL1 + str x2, [x0, #104] +alternative_else_nop_endif ret SYM_FUNC_END(cpu_do_suspend) @@ -129,11 +143,15 @@ SYM_FUNC_START(cpu_do_resume) /* Don't change t0sz here, mask those bits when restoring */ mrs x7, tcr_el1 - bfi x8, x7, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH + bfi x8, x7, TCR_EL1_T0SZ_SHIFT, TCR_EL1_T0SZ_WIDTH msr tcr_el1, x8 msr vbar_el1, x9 msr mdscr_el1, x10 +alternative_if ARM64_HAS_TCR2 + ldr x2, [x0, #104] + msr REG_TCR2_EL1, x2 +alternative_else_nop_endif msr sctlr_el1, x12 set_this_cpu_offset x13 @@ -245,10 +263,6 @@ SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1) * * Called exactly once from stop_machine context by each CPU found during boot. */ - .pushsection ".data", "aw", %progbits -SYM_DATA(__idmap_kpti_flag, .long 1) - .popsection - SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) cpu .req w0 temp_pte .req x0 @@ -273,7 +287,7 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) mov x5, x3 // preserve temp_pte arg mrs swapper_ttb, ttbr1_el1 - adr_l flag_ptr, __idmap_kpti_flag + adr_l flag_ptr, idmap_kpti_bbml2_flag cbnz cpu, __idmap_kpti_secondary @@ -416,7 +430,25 @@ alternative_else_nop_endif __idmap_kpti_secondary: /* Uninstall swapper before surgery begins */ __idmap_cpu_set_reserved_ttbr1 x16, x17 + b scondary_cpu_wait + + .unreq swapper_ttb + .unreq flag_ptr +SYM_FUNC_END(idmap_kpti_install_ng_mappings) + .popsection +#endif + + .pushsection ".idmap.text", "a" +SYM_TYPED_FUNC_START(wait_linear_map_split_to_ptes) + /* Must be same registers as in idmap_kpti_install_ng_mappings */ + swapper_ttb .req x3 + flag_ptr .req x4 + + mrs swapper_ttb, ttbr1_el1 + adr_l flag_ptr, idmap_kpti_bbml2_flag + __idmap_cpu_set_reserved_ttbr1 x16, x17 +scondary_cpu_wait: /* Increment the flag to let the boot CPU we're ready */ 1: ldxr w16, [flag_ptr] add w16, w16, #1 @@ -436,9 +468,8 @@ __idmap_kpti_secondary: .unreq swapper_ttb .unreq flag_ptr -SYM_FUNC_END(idmap_kpti_install_ng_mappings) +SYM_FUNC_END(wait_linear_map_split_to_ptes) .popsection -#endif /* * __cpu_setup @@ -454,7 +485,7 @@ SYM_FUNC_START(__cpu_setup) dsb nsh msr cpacr_el1, xzr // Reset cpacr_el1 - mov x1, #1 << 12 // Reset mdscr_el1 and disable + mov x1, MDSCR_EL1_TDCC // Reset mdscr_el1 and disable msr mdscr_el1, x1 // access to the DCC from EL0 reset_pmuserenr_el0 x1 // Disable PMU access from EL0 reset_amuserenr_el0 x1 // Disable AMU access from EL0 @@ -468,8 +499,8 @@ SYM_FUNC_START(__cpu_setup) tcr2 .req x15 mov_q mair, MAIR_EL1_SET mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \ - TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ - TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS + TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_EL1_AS | \ + TCR_EL1_TBI0 | TCR_EL1_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS mov tcr2, xzr tcr_clear_errata_bits tcr, x9, x5 @@ -479,7 +510,7 @@ SYM_FUNC_START(__cpu_setup) alternative_if ARM64_HAS_VA52 tcr_set_t1sz tcr, x9 #ifdef CONFIG_ARM64_LPA2 - orr tcr, tcr, #TCR_DS + orr tcr, tcr, #TCR_EL1_DS #endif alternative_else_nop_endif #endif @@ -487,7 +518,7 @@ alternative_else_nop_endif /* * Set the IPS bits in TCR_EL1. */ - tcr_compute_pa_size tcr, #TCR_IPS_SHIFT, x5, x6 + tcr_compute_pa_size tcr, #TCR_EL1_IPS_SHIFT, x5, x6 #ifdef CONFIG_ARM64_HW_AFDBM /* * Enable hardware update of the Access Flags bit. @@ -497,7 +528,7 @@ alternative_else_nop_endif mrs x9, ID_AA64MMFR1_EL1 ubfx x9, x9, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, #4 cbz x9, 1f - orr tcr, tcr, #TCR_HA // hardware Access flag update + orr tcr, tcr, #TCR_EL1_HA // hardware Access flag update #ifdef CONFIG_ARM64_HAFT cmp x9, ID_AA64MMFR1_EL1_HAFDBS_HAFT b.lt 1f @@ -512,28 +543,12 @@ alternative_else_nop_endif ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 cbz x1, .Lskip_indirection - /* - * The PROT_* macros describing the various memory types may resolve to - * C expressions if they include the PTE_MAYBE_* macros, and so they - * can only be used from C code. The PIE_E* constants below are also - * defined in terms of those macros, but will mask out those - * PTE_MAYBE_* constants, whether they are set or not. So #define them - * as 0x0 here so we can evaluate the PIE_E* constants in asm context. - */ - -#define PTE_MAYBE_NG 0 -#define PTE_MAYBE_SHARED 0 - - mov_q x0, PIE_E0 + mov_q x0, PIE_E0_ASM msr REG_PIRE0_EL1, x0 - mov_q x0, PIE_E1 + mov_q x0, PIE_E1_ASM msr REG_PIR_EL1, x0 -#undef PTE_MAYBE_NG -#undef PTE_MAYBE_SHARED - orr tcr2, tcr2, TCR2_EL1_PIE - msr REG_TCR2_EL1, x0 .Lskip_indirection: diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 688fbe0271ca..ab9899ca1e5f 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -80,8 +80,8 @@ static const struct ptdump_prot_bits pte_bits[] = { .set = "CON", .clear = " ", }, { - .mask = PTE_TABLE_BIT | PTE_VALID, - .val = PTE_VALID, + .mask = PMD_TYPE_MASK, + .val = PMD_TYPE_SECT, .set = "BLK", .clear = " ", }, { @@ -189,12 +189,12 @@ static void note_prot_wx(struct ptdump_pg_state *st, unsigned long addr) } void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - u64 val) + pteval_t val) { struct ptdump_pg_state *st = container_of(pt_st, struct ptdump_pg_state, ptdump); struct ptdump_pg_level *pg_level = st->pg_level; static const char units[] = "KMGTPE"; - u64 prot = 0; + ptdesc_t prot = 0; /* check if the current level has been folded dynamically */ if (st->mm && ((level == 1 && mm_p4d_folded(st->mm)) || @@ -251,6 +251,45 @@ void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } +void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); +} + +static void arm64_ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm) +{ + static_branch_inc(&arm64_ptdump_lock_key); + ptdump_walk_pgd(st, mm, NULL); + static_branch_dec(&arm64_ptdump_lock_key); +} + void ptdump_walk(struct seq_file *s, struct ptdump_info *info) { unsigned long end = ~0UL; @@ -266,7 +305,12 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info) .pg_level = &kernel_pg_levels[0], .level = -1, .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]){ {info->base_addr, end}, {0, 0} @@ -274,7 +318,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info) } }; - ptdump_walk_pgd(&st.ptdump, info->mm, NULL); + arm64_ptdump_walk_pgd(&st.ptdump, info->mm); } static void __init ptdump_initialize(void) @@ -303,7 +347,12 @@ bool ptdump_check_wx(void) .level = -1, .check_wx = true, .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {_PAGE_OFFSET(vabits_actual), ~0UL}, {0, 0} @@ -311,7 +360,7 @@ bool ptdump_check_wx(void) } }; - ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + arm64_ptdump_walk_pgd(&st.ptdump, &init_mm); if (st.wx_pages || st.uxn_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c index 68bf1a125502..1e308328c079 100644 --- a/arch/arm64/mm/ptdump_debugfs.c +++ b/arch/arm64/mm/ptdump_debugfs.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/debugfs.h> -#include <linux/memory_hotplug.h> #include <linux/seq_file.h> #include <asm/ptdump.h> @@ -9,9 +8,7 @@ static int ptdump_show(struct seq_file *m, void *v) { struct ptdump_info *info = m->private; - get_online_mems(); ptdump_walk(m, info); - put_online_mems(); return 0; } DEFINE_SHOW_ATTRIBUTE(ptdump); diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 18543b603c77..cca9706a875c 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -31,36 +31,6 @@ static void *trans_alloc(struct trans_pgd_info *info) return info->trans_alloc_page(info->trans_alloc_arg); } -static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) -{ - pte_t pte = __ptep_get(src_ptep); - - if (pte_valid(pte)) { - /* - * Resume will overwrite areas that may be marked - * read only (code, rodata). Clear the RDONLY bit from - * the temporary mappings we use during restore. - */ - __set_pte(dst_ptep, pte_mkwrite_novma(pte)); - } else if (!pte_none(pte)) { - /* - * debug_pagealloc will removed the PTE_VALID bit if - * the page isn't in use by the resume kernel. It may have - * been in use by the original kernel, in which case we need - * to put it back in our copy to do the restore. - * - * Other cases include kfence / vmalloc / memfd_secret which - * may call `set_direct_map_invalid_noflush()`. - * - * Before marking this entry valid, check the pfn should - * be mapped. - */ - BUG_ON(!pfn_valid(pte_pfn(pte))); - - __set_pte(dst_ptep, pte_mkvalid(pte_mkwrite_novma(pte))); - } -} - static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, unsigned long end) { @@ -76,7 +46,11 @@ static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp, src_ptep = pte_offset_kernel(src_pmdp, start); do { - _copy_pte(dst_ptep, src_ptep, addr); + pte_t pte = __ptep_get(src_ptep); + + if (pte_none(pte)) + continue; + __set_pte(dst_ptep, pte_mkvalid_k(pte_mkwrite_novma(pte))); } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end); return 0; @@ -109,8 +83,7 @@ static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp, if (copy_pte(info, dst_pmdp, src_pmdp, addr, next)) return -ENOMEM; } else { - set_pmd(dst_pmdp, - __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY)); + set_pmd(dst_pmdp, pmd_mkvalid_k(pmd_mkwrite_novma(pmd))); } } while (dst_pmdp++, src_pmdp++, addr = next, addr != end); @@ -145,8 +118,7 @@ static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp, if (copy_pmd(info, dst_pudp, src_pudp, addr, next)) return -ENOMEM; } else { - set_pud(dst_pudp, - __pud(pud_val(pud) & ~PUD_SECT_RDONLY)); + set_pud(dst_pudp, pud_mkvalid_k(pud_mkwrite_novma(pud))); } } while (dst_pudp++, src_pudp++, addr = next, addr != end); |
