summaryrefslogtreecommitdiff
path: root/arch/arm64/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/mm')
-rw-r--r--arch/arm64/mm/Makefile2
-rw-r--r--arch/arm64/mm/cache.S57
-rw-r--r--arch/arm64/mm/context.c8
-rw-r--r--arch/arm64/mm/contpte.c337
-rw-r--r--arch/arm64/mm/copypage.c11
-rw-r--r--arch/arm64/mm/dma-mapping.c4
-rw-r--r--arch/arm64/mm/extable.c40
-rw-r--r--arch/arm64/mm/fault.c212
-rw-r--r--arch/arm64/mm/flush.c8
-rw-r--r--arch/arm64/mm/gcs.c24
-rw-r--r--arch/arm64/mm/hugetlbpage.c106
-rw-r--r--arch/arm64/mm/init.c104
-rw-r--r--arch/arm64/mm/ioremap.c10
-rw-r--r--arch/arm64/mm/kasan_init.c10
-rw-r--r--arch/arm64/mm/mmap.c16
-rw-r--r--arch/arm64/mm/mmu.c1113
-rw-r--r--arch/arm64/mm/pageattr.c185
-rw-r--r--arch/arm64/mm/pgd.c2
-rw-r--r--arch/arm64/mm/physaddr.c2
-rw-r--r--arch/arm64/mm/proc.S99
-rw-r--r--arch/arm64/mm/ptdump.c65
-rw-r--r--arch/arm64/mm/ptdump_debugfs.c3
-rw-r--r--arch/arm64/mm/trans_pgd.c42
23 files changed, 1811 insertions, 649 deletions
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index fc92170a8f37..c26489cf96cd 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -5,7 +5,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
context.o proc.o pageattr.o fixmap.o
obj-$(CONFIG_ARM64_CONTPTE) += contpte.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
+obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o
obj-$(CONFIG_TRANS_TABLE) += trans_pgd.o
obj-$(CONFIG_TRANS_TABLE) += trans_pgd-asm.o
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 503567c864fd..ab75c050f559 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -132,17 +132,7 @@ alternative_else_nop_endif
ret
SYM_FUNC_END(dcache_clean_pou)
-/*
- * dcache_inval_poc(start, end)
- *
- * Ensure that any D-cache lines for the interval [start, end)
- * are invalidated. Any partial lines at the ends of the interval are
- * also cleaned to PoC to prevent data loss.
- *
- * - start - kernel start address of region
- * - end - kernel end address of region
- */
-SYM_FUNC_START(__pi_dcache_inval_poc)
+.macro __dcache_inval_poc_nosync
dcache_line_size x2, x3
sub x3, x2, #1
tst x1, x3 // end cache line aligned?
@@ -158,12 +148,42 @@ SYM_FUNC_START(__pi_dcache_inval_poc)
3: add x0, x0, x2
cmp x0, x1
b.lo 2b
+.endm
+
+/*
+ * dcache_inval_poc(start, end)
+ *
+ * Ensure that any D-cache lines for the interval [start, end)
+ * are invalidated. Any partial lines at the ends of the interval are
+ * also cleaned to PoC to prevent data loss.
+ *
+ * - start - kernel start address of region
+ * - end - kernel end address of region
+ */
+SYM_FUNC_START(__pi_dcache_inval_poc)
+ __dcache_inval_poc_nosync
dsb sy
ret
SYM_FUNC_END(__pi_dcache_inval_poc)
SYM_FUNC_ALIAS(dcache_inval_poc, __pi_dcache_inval_poc)
/*
+ * dcache_inval_poc_nosync(start, end)
+ *
+ * Issue the instructions of D-cache lines for the interval [start, end)
+ * for invalidation. Not necessarily cleaned to PoC till an explicit dsb
+ * sy is issued later
+ *
+ * - start - kernel start address of region
+ * - end - kernel end address of region
+ */
+SYM_FUNC_START(__pi_dcache_inval_poc_nosync)
+ __dcache_inval_poc_nosync
+ ret
+SYM_FUNC_END(__pi_dcache_inval_poc_nosync)
+SYM_FUNC_ALIAS(dcache_inval_poc_nosync, __pi_dcache_inval_poc_nosync)
+
+/*
* dcache_clean_poc(start, end)
*
* Ensure that any D-cache lines for the interval [start, end)
@@ -179,6 +199,21 @@ SYM_FUNC_END(__pi_dcache_clean_poc)
SYM_FUNC_ALIAS(dcache_clean_poc, __pi_dcache_clean_poc)
/*
+ * dcache_clean_poc_nosync(start, end)
+ *
+ * Issue the instructions of D-cache lines for the interval [start, end).
+ * not necessarily cleaned to the PoC till an explicit dsb sy afterward.
+ *
+ * - start - virtual start address of region
+ * - end - virtual end address of region
+ */
+SYM_FUNC_START(__pi_dcache_clean_poc_nosync)
+ dcache_by_line_op_nosync cvac, x0, x1, x2, x3
+ ret
+SYM_FUNC_END(__pi_dcache_clean_poc_nosync)
+SYM_FUNC_ALIAS(dcache_clean_poc_nosync, __pi_dcache_clean_poc_nosync)
+
+/*
* dcache_clean_pop(start, end)
*
* Ensure that any D-cache lines for the interval [start, end)
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index b2ac06246327..0f4a28b87469 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -354,15 +354,15 @@ void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm)
/* Skip CNP for the reserved ASID */
if (system_supports_cnp() && asid)
- ttbr0 |= TTBR_CNP_BIT;
+ ttbr0 |= TTBRx_EL1_CnP;
/* SW PAN needs a copy of the ASID in TTBR0 for entry */
if (IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN))
- ttbr0 |= FIELD_PREP(TTBR_ASID_MASK, asid);
+ ttbr0 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid);
/* Set ASID in TTBR1 since TCR.A1 is set */
- ttbr1 &= ~TTBR_ASID_MASK;
- ttbr1 |= FIELD_PREP(TTBR_ASID_MASK, asid);
+ ttbr1 &= ~TTBRx_EL1_ASID_MASK;
+ ttbr1 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid);
cpu_set_reserved_ttbr0_nosync();
write_sysreg(ttbr1, ttbr1_el1);
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 55107d27d3f8..2de12656b4d8 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -26,6 +26,26 @@ static inline pte_t *contpte_align_down(pte_t *ptep)
return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
}
+static inline pte_t *contpte_align_addr_ptep(unsigned long *start,
+ unsigned long *end, pte_t *ptep,
+ unsigned int nr)
+{
+ /*
+ * Note: caller must ensure these nr PTEs are consecutive (present)
+ * PTEs that map consecutive pages of the same large folio within a
+ * single VMA and a single page table.
+ */
+ if (pte_cont(__ptep_get(ptep + nr - 1)))
+ *end = ALIGN(*end, CONT_PTE_SIZE);
+
+ if (pte_cont(__ptep_get(ptep))) {
+ *start = ALIGN_DOWN(*start, CONT_PTE_SIZE);
+ ptep = contpte_align_down(ptep);
+ }
+
+ return ptep;
+}
+
static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned int nr)
{
@@ -68,7 +88,145 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr,
pte = pte_mkyoung(pte);
}
- __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
+ /*
+ * On eliding the __tlb_flush_range() under BBML2+noabort:
+ *
+ * NOTE: Instead of using N=16 as the contiguous block length, we use
+ * N=4 for clarity.
+ *
+ * NOTE: 'n' and 'c' are used to denote the "contiguous bit" being
+ * unset and set, respectively.
+ *
+ * We worry about two cases where contiguous bit is used:
+ * - When folding N smaller non-contiguous ptes as 1 contiguous block.
+ * - When unfolding a contiguous block into N smaller non-contiguous ptes.
+ *
+ * Currently, the BBML0 folding case looks as follows:
+ *
+ * 0) Initial page-table layout:
+ *
+ * +----+----+----+----+
+ * |RO,n|RO,n|RO,n|RW,n| <--- last page being set as RO
+ * +----+----+----+----+
+ *
+ * 1) Aggregate AF + dirty flags using __ptep_get_and_clear():
+ *
+ * +----+----+----+----+
+ * | 0 | 0 | 0 | 0 |
+ * +----+----+----+----+
+ *
+ * 2) __flush_tlb_range():
+ *
+ * |____ tlbi + dsb ____|
+ *
+ * 3) __set_ptes() to repaint contiguous block:
+ *
+ * +----+----+----+----+
+ * |RO,c|RO,c|RO,c|RO,c|
+ * +----+----+----+----+
+ *
+ * 4) The kernel will eventually __flush_tlb() for changed page:
+ *
+ * |____| <--- tlbi + dsb
+ *
+ * As expected, the intermediate tlbi+dsb ensures that other PEs
+ * only ever see an invalid (0) entry, or the new contiguous TLB entry.
+ * The final tlbi+dsb will always throw away the newly installed
+ * contiguous TLB entry, which is a micro-optimisation opportunity,
+ * but does not affect correctness.
+ *
+ * In the BBML2 case, the change is avoiding the intermediate tlbi+dsb.
+ * This means a few things, but notably other PEs will still "see" any
+ * stale cached TLB entries. This could lead to a "contiguous bit
+ * misprogramming" issue until the final tlbi+dsb of the changed page,
+ * which would clear out both the stale (RW,n) entry and the new (RO,c)
+ * contiguous entry installed in its place.
+ *
+ * What this is saying, is the following:
+ *
+ * +----+----+----+----+
+ * |RO,n|RO,n|RO,n|RW,n| <--- old page tables, all non-contiguous
+ * +----+----+----+----+
+ *
+ * +----+----+----+----+
+ * |RO,c|RO,c|RO,c|RO,c| <--- new page tables, all contiguous
+ * +----+----+----+----+
+ * /\
+ * ||
+ *
+ * If both the old single (RW,n) and new contiguous (RO,c) TLB entries
+ * are present, and a write is made to this address, do we fault or
+ * is the write permitted (via amalgamation)?
+ *
+ * The relevant Arm ARM DDI 0487L.a requirements are RNGLXZ and RJQQTC,
+ * and together state that when BBML1 or BBML2 are implemented, either
+ * a TLB conflict abort is raised (which we expressly forbid), or will
+ * "produce an OA, access permissions, and memory attributes that are
+ * consistent with any of the programmed translation table values".
+ *
+ * That is to say, will either raise a TLB conflict, or produce one of
+ * the cached TLB entries, but never amalgamate.
+ *
+ * Thus, as the page tables are only considered "consistent" after
+ * the final tlbi+dsb (which evicts both the single stale (RW,n) TLB
+ * entry as well as the new contiguous (RO,c) TLB entry), omitting the
+ * initial tlbi+dsb is correct.
+ *
+ * It is also important to note that at the end of the BBML2 folding
+ * case, we are still left with potentially all N TLB entries still
+ * cached (the N-1 non-contiguous ptes, and the single contiguous
+ * block). However, over time, natural TLB pressure will cause the
+ * non-contiguous pte TLB entries to be flushed, leaving only the
+ * contiguous block TLB entry. This means that omitting the tlbi+dsb is
+ * not only correct, but also keeps our eventual performance benefits.
+ *
+ * For the unfolding case, BBML0 looks as follows:
+ *
+ * 0) Initial page-table layout:
+ *
+ * +----+----+----+----+
+ * |RW,c|RW,c|RW,c|RW,c| <--- last page being set as RO
+ * +----+----+----+----+
+ *
+ * 1) Aggregate AF + dirty flags using __ptep_get_and_clear():
+ *
+ * +----+----+----+----+
+ * | 0 | 0 | 0 | 0 |
+ * +----+----+----+----+
+ *
+ * 2) __flush_tlb_range():
+ *
+ * |____ tlbi + dsb ____|
+ *
+ * 3) __set_ptes() to repaint as non-contiguous:
+ *
+ * +----+----+----+----+
+ * |RW,n|RW,n|RW,n|RW,n|
+ * +----+----+----+----+
+ *
+ * 4) Update changed page permissions:
+ *
+ * +----+----+----+----+
+ * |RW,n|RW,n|RW,n|RO,n| <--- last page permissions set
+ * +----+----+----+----+
+ *
+ * 5) The kernel will eventually __flush_tlb() for changed page:
+ *
+ * |____| <--- tlbi + dsb
+ *
+ * For BBML2, we again remove the intermediate tlbi+dsb. Here, there
+ * are no issues, as the final tlbi+dsb covering the changed page is
+ * guaranteed to remove the original large contiguous (RW,c) TLB entry,
+ * as well as the intermediate (RW,n) TLB entry; the next access will
+ * install the new (RO,n) TLB entry and the page tables are only
+ * considered "consistent" after the final tlbi+dsb, so software must
+ * be prepared for this inconsistency prior to finishing the mm dance
+ * regardless.
+ */
+
+ if (!system_supports_bbml2_noabort())
+ __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, 3,
+ TLBF_NOWALKCACHE);
__set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES);
}
@@ -169,17 +327,46 @@ pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte)
for (i = 0; i < CONT_PTES; i++, ptep++) {
pte = __ptep_get(ptep);
- if (pte_dirty(pte))
+ if (pte_dirty(pte)) {
orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
+ for (; i < CONT_PTES; i++, ptep++) {
+ pte = __ptep_get(ptep);
+ if (pte_young(pte)) {
+ orig_pte = pte_mkyoung(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
+
+ if (pte_young(pte)) {
orig_pte = pte_mkyoung(orig_pte);
+ i++;
+ ptep++;
+ for (; i < CONT_PTES; i++, ptep++) {
+ pte = __ptep_get(ptep);
+ if (pte_dirty(pte)) {
+ orig_pte = pte_mkdirty(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
}
return orig_pte;
}
EXPORT_SYMBOL_GPL(contpte_ptep_get);
+static inline bool contpte_is_consistent(pte_t pte, unsigned long pfn,
+ pgprot_t orig_prot)
+{
+ pgprot_t prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
+
+ return pte_valid_cont(pte) && pte_pfn(pte) == pfn &&
+ pgprot_val(prot) == pgprot_val(orig_prot);
+}
+
pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
{
/*
@@ -202,7 +389,6 @@ pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
pgprot_t orig_prot;
unsigned long pfn;
pte_t orig_pte;
- pgprot_t prot;
pte_t *ptep;
pte_t pte;
int i;
@@ -219,18 +405,44 @@ retry:
for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) {
pte = __ptep_get(ptep);
- prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
- if (!pte_valid_cont(pte) ||
- pte_pfn(pte) != pfn ||
- pgprot_val(prot) != pgprot_val(orig_prot))
+ if (!contpte_is_consistent(pte, pfn, orig_prot))
goto retry;
- if (pte_dirty(pte))
+ if (pte_dirty(pte)) {
orig_pte = pte_mkdirty(orig_pte);
+ for (; i < CONT_PTES; i++, ptep++, pfn++) {
+ pte = __ptep_get(ptep);
+
+ if (!contpte_is_consistent(pte, pfn, orig_prot))
+ goto retry;
+
+ if (pte_young(pte)) {
+ orig_pte = pte_mkyoung(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
- if (pte_young(pte))
+ if (pte_young(pte)) {
orig_pte = pte_mkyoung(orig_pte);
+ i++;
+ ptep++;
+ pfn++;
+ for (; i < CONT_PTES; i++, ptep++, pfn++) {
+ pte = __ptep_get(ptep);
+
+ if (!contpte_is_consistent(pte, pfn, orig_prot))
+ goto retry;
+
+ if (pte_dirty(pte)) {
+ orig_pte = pte_mkdirty(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
}
return orig_pte;
@@ -297,8 +509,8 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);
-int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+bool contpte_test_and_clear_young_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, unsigned int nr)
{
/*
* ptep_clear_flush_young() technically requires us to clear the access
@@ -307,41 +519,44 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
* contig range when the range is covered by a single folio, we can get
* away with clearing young for the whole contig range here, so we avoid
* having to unfold.
+ *
+ * The 'nr' means consecutive (present) PTEs that map consecutive pages
+ * of the same large folio in a single VMA and a single page table.
*/
- int young = 0;
- int i;
-
- ptep = contpte_align_down(ptep);
- addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+ unsigned long end = addr + nr * PAGE_SIZE;
+ bool young = false;
- for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
+ ptep = contpte_align_addr_ptep(&addr, &end, ptep, nr);
+ for (; addr != end; ptep++, addr += PAGE_SIZE)
young |= __ptep_test_and_clear_young(vma, addr, ptep);
return young;
}
-EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young);
+EXPORT_SYMBOL_GPL(contpte_test_and_clear_young_ptes);
-int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+bool contpte_clear_flush_young_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, unsigned int nr)
{
- int young;
+ bool young;
- young = contpte_ptep_test_and_clear_young(vma, addr, ptep);
+ young = contpte_test_and_clear_young_ptes(vma, addr, ptep, nr);
if (young) {
+ unsigned long end = addr + nr * PAGE_SIZE;
+
+ contpte_align_addr_ptep(&addr, &end, ptep, nr);
/*
* See comment in __ptep_clear_flush_young(); same rationale for
* eliding the trailing DSB applies here.
*/
- addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
- __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE,
- PAGE_SIZE, true, 3);
+ __flush_tlb_range(vma, addr, end, PAGE_SIZE, 3,
+ TLBF_NOWALKCACHE | TLBF_NOSYNC);
}
return young;
}
-EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young);
+EXPORT_SYMBOL_GPL(contpte_clear_flush_young_ptes);
void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned int nr)
@@ -378,17 +593,31 @@ void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
unsigned long start = addr;
unsigned long end = start + nr * PAGE_SIZE;
- if (pte_cont(__ptep_get(ptep + nr - 1)))
- end = ALIGN(end, CONT_PTE_SIZE);
+ ptep = contpte_align_addr_ptep(&start, &end, ptep, nr);
+ __clear_young_dirty_ptes(vma, start, ptep, (end - start) / PAGE_SIZE, flags);
+}
+EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes);
- if (pte_cont(__ptep_get(ptep))) {
- start = ALIGN_DOWN(start, CONT_PTE_SIZE);
- ptep = contpte_align_down(ptep);
+static bool contpte_all_subptes_match_access_flags(pte_t *ptep, pte_t entry)
+{
+ pte_t *cont_ptep = contpte_align_down(ptep);
+ /*
+ * PFNs differ per sub-PTE. Match only bits consumed by
+ * __ptep_set_access_flags(): AF, DIRTY and write permission.
+ */
+ const pteval_t cmp_mask = PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;
+ pteval_t entry_cmp = pte_val(entry) & cmp_mask;
+ int i;
+
+ for (i = 0; i < CONT_PTES; i++) {
+ pteval_t pte_cmp = pte_val(__ptep_get(cont_ptep + i)) & cmp_mask;
+
+ if (pte_cmp != entry_cmp)
+ return false;
}
- __clear_young_dirty_ptes(vma, start, ptep, (end - start) / PAGE_SIZE, flags);
+ return true;
}
-EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes);
int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
@@ -399,14 +628,38 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
int i;
/*
- * Gather the access/dirty bits for the contiguous range. If nothing has
- * changed, its a noop.
+ * Check whether all sub-PTEs in the CONT block already match the
+ * requested access flags/write permission, using raw per-PTE values
+ * rather than the gathered ptep_get() view.
+ *
+ * __ptep_set_access_flags() can update AF, dirty and write
+ * permission, but only to make the mapping more permissive.
+ *
+ * ptep_get() gathers AF/dirty state across the whole CONT block,
+ * which is correct for a CPU with FEAT_HAFDBS. But page-table
+ * walkers that evaluate each descriptor individually (e.g. a CPU
+ * without DBM support, or an SMMU without HTTU, or with HA/HD
+ * disabled in CD.TCR) can keep faulting on the target sub-PTE if
+ * only a sibling has been updated. Gathering can therefore cause
+ * false no-ops when only a sibling has been updated:
+ * - write faults: target still has PTE_RDONLY (needs PTE_RDONLY cleared)
+ * - read faults: target still lacks PTE_AF
+ *
+ * Per Arm ARM (DDI 0487) D8.7.1, any sub-PTE in a CONT range may
+ * become the effective cached translation, so all entries must have
+ * consistent attributes. Check the full CONT block before returning
+ * no-op, and when any sub-PTE mismatches, proceed to update the whole
+ * range.
*/
- orig_pte = pte_mknoncont(ptep_get(ptep));
- if (pte_val(orig_pte) == pte_val(entry))
+ if (contpte_all_subptes_match_access_flags(ptep, entry))
return 0;
/*
+ * Use raw target pte (not gathered) for write-bit unfold decision.
+ */
+ orig_pte = pte_mknoncont(__ptep_get(ptep));
+
+ /*
* We can fix up access/dirty bits without having to unfold the contig
* range. But if the write bit is changing, we must unfold.
*/
@@ -431,8 +684,10 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
__ptep_set_access_flags(vma, addr, ptep, entry, 0);
if (dirty)
- __flush_tlb_range(vma, start_addr, addr,
- PAGE_SIZE, true, 3);
+ __flush_tlb_range(vma, start_addr,
+ start_addr + CONT_PTE_SIZE,
+ PAGE_SIZE, 3,
+ TLBF_NOWALKCACHE | TLBF_NOBROADCAST);
} else {
__contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte);
__ptep_set_access_flags(vma, addr, ptep, entry, dirty);
diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c
index a86c897017df..cd5912ba617b 100644
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -35,7 +35,7 @@ void copy_highpage(struct page *to, struct page *from)
from != folio_page(src, 0))
return;
- WARN_ON_ONCE(!folio_try_hugetlb_mte_tagging(dst));
+ folio_try_hugetlb_mte_tagging(dst);
/*
* Populate tags for all subpages.
@@ -51,8 +51,13 @@ void copy_highpage(struct page *to, struct page *from)
}
folio_set_hugetlb_mte_tagged(dst);
} else if (page_mte_tagged(from)) {
- /* It's a new page, shouldn't have been tagged yet */
- WARN_ON_ONCE(!try_page_mte_tagging(to));
+ /*
+ * Most of the time it's a new page that shouldn't have been
+ * tagged yet. However, folio migration can end up reusing the
+ * same page without untagging it. Ignore the warning if the
+ * page is already tagged.
+ */
+ try_page_mte_tagging(to);
mte_copy_page_tags(kto, kfrom);
set_page_mte_tagged(to);
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index b2b5792b2caa..ae1ae0280eef 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
{
unsigned long start = (unsigned long)phys_to_virt(paddr);
- dcache_clean_poc(start, start + size);
+ dcache_clean_poc_nosync(start, start + size);
}
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
@@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
if (dir == DMA_TO_DEVICE)
return;
- dcache_inval_poc(start, start + size);
+ dcache_inval_poc_nosync(start, start + size);
}
void arch_dma_prep_coherent(struct page *page, size_t size)
diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
index 228d681a8715..6e0528831cd3 100644
--- a/arch/arm64/mm/extable.c
+++ b/arch/arm64/mm/extable.c
@@ -8,8 +8,33 @@
#include <linux/uaccess.h>
#include <asm/asm-extable.h>
+#include <asm/esr.h>
#include <asm/ptrace.h>
+static bool cpy_faulted_on_uaccess(const struct exception_table_entry *ex,
+ unsigned long esr)
+{
+ bool uaccess_is_write = FIELD_GET(EX_DATA_UACCESS_WRITE, ex->data);
+ bool fault_on_write = esr & ESR_ELx_WNR;
+
+ return uaccess_is_write == fault_on_write;
+}
+
+bool insn_may_access_user(unsigned long addr, unsigned long esr)
+{
+ const struct exception_table_entry *ex = search_exception_tables(addr);
+
+ if (!ex)
+ return false;
+
+ switch (ex->type) {
+ case EX_TYPE_UACCESS_CPY:
+ return cpy_faulted_on_uaccess(ex, esr);
+ default:
+ return true;
+ }
+}
+
static inline unsigned long
get_ex_fixup(const struct exception_table_entry *ex)
{
@@ -29,6 +54,17 @@ static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex,
return true;
}
+static bool ex_handler_uaccess_cpy(const struct exception_table_entry *ex,
+ struct pt_regs *regs, unsigned long esr)
+{
+ /* Do not fix up faults on kernel memory accesses */
+ if (!cpy_faulted_on_uaccess(ex, esr))
+ return false;
+
+ regs->pc = get_ex_fixup(ex);
+ return true;
+}
+
static bool
ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex,
struct pt_regs *regs)
@@ -56,7 +92,7 @@ ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex,
return true;
}
-bool fixup_exception(struct pt_regs *regs)
+bool fixup_exception(struct pt_regs *regs, unsigned long esr)
{
const struct exception_table_entry *ex;
@@ -70,6 +106,8 @@ bool fixup_exception(struct pt_regs *regs)
case EX_TYPE_UACCESS_ERR_ZERO:
case EX_TYPE_KACCESS_ERR_ZERO:
return ex_handler_uaccess_err_zero(ex, regs);
+ case EX_TYPE_UACCESS_CPY:
+ return ex_handler_uaccess_cpy(ex, regs, esr);
case EX_TYPE_LOAD_UNALIGNED_ZEROPAD:
return ex_handler_load_unaligned_zeropad(ex, regs);
}
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index ef63651099a9..739800835920 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -43,6 +43,7 @@
#include <asm/system_misc.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>
+#include <asm/virt.h>
struct fault_info {
int (*fn)(unsigned long far, unsigned long esr,
@@ -53,18 +54,12 @@ struct fault_info {
};
static const struct fault_info fault_info[];
-static struct fault_info debug_fault_info[];
static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
{
return fault_info + (esr & ESR_ELx_FSC);
}
-static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
-{
- return debug_fault_info + DBG_ESR_EVT(esr);
-}
-
static void data_abort_decode(unsigned long esr)
{
unsigned long iss2 = ESR_ELx_ISS2(esr);
@@ -210,12 +205,13 @@ static void show_pte(unsigned long addr)
*
* Returns whether or not the PTE actually changed.
*/
-int __ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep,
- pte_t entry, int dirty)
+int __ptep_set_access_flags_anysz(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty, unsigned long pgsize)
{
pteval_t old_pteval, pteval;
pte_t pte = __ptep_get(ptep);
+ int level;
if (pte_same(pte, entry))
return 0;
@@ -239,9 +235,32 @@ int __ptep_set_access_flags(struct vm_area_struct *vma,
pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
} while (pteval != old_pteval);
- /* Invalidate a stale read-only entry */
- if (dirty)
- flush_tlb_page(vma, address);
+ /*
+ * Invalidate the local stale read-only entry. Remote stale entries
+ * may still cause page faults and be invalidated via
+ * flush_tlb_fix_spurious_fault().
+ */
+ if (dirty) {
+ switch (pgsize) {
+ case PAGE_SIZE:
+ level = 3;
+ break;
+ case PMD_SIZE:
+ level = 2;
+ break;
+#ifndef __PAGETABLE_PMD_FOLDED
+ case PUD_SIZE:
+ level = 1;
+ break;
+#endif
+ default:
+ level = TLBI_TTL_UNKNOWN;
+ WARN_ON(1);
+ }
+
+ __flush_tlb_range(vma, address, address + pgsize, pgsize, level,
+ TLBF_NOWALKCACHE | TLBF_NOBROADCAST);
+ }
return 1;
}
@@ -271,6 +290,15 @@ static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr
return false;
}
+static bool is_pkvm_stage2_abort(unsigned int esr)
+{
+ /*
+ * S1PTW should only ever be set in ESR_EL1 if the pkvm hypervisor
+ * injected a stage-2 abort -- see host_inject_mem_abort().
+ */
+ return is_pkvm_initialized() && (esr & ESR_ELx_S1PTW);
+}
+
static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
unsigned long esr,
struct pt_regs *regs)
@@ -291,8 +319,14 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
* If we now have a valid translation, treat the translation fault as
* spurious.
*/
- if (!(par & SYS_PAR_EL1_F))
+ if (!(par & SYS_PAR_EL1_F)) {
+ if (is_pkvm_stage2_abort(esr)) {
+ par &= SYS_PAR_EL1_PA;
+ return pkvm_force_reclaim_guest_page(par);
+ }
+
return true;
+ }
/*
* If we got a different type of fault from the AT instruction,
@@ -375,12 +409,14 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
* Are we prepared to handle this kernel fault?
* We are almost certainly not prepared to handle instruction faults.
*/
- if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
+ if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr))
return;
- if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
- "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
+ if (is_spurious_el1_translation_fault(addr, esr, regs)) {
+ WARN_RATELIMIT(!is_pkvm_stage2_abort(esr),
+ "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr);
return;
+ }
if (is_el1_mte_sync_tag_check_fault(esr)) {
do_tag_recovery(addr, esr, regs);
@@ -397,6 +433,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
msg = "read from unreadable memory";
} else if (addr < PAGE_SIZE) {
msg = "NULL pointer dereference";
+ } else if (is_pkvm_stage2_abort(esr)) {
+ msg = "access to hypervisor-protected memory";
} else {
if (esr_fsc_is_translation_fault(esr) &&
kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
@@ -487,17 +525,29 @@ static void do_bad_area(unsigned long far, unsigned long esr,
}
}
-static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
- unsigned int mm_flags)
+static bool fault_from_pkey(struct vm_area_struct *vma, unsigned int mm_flags)
{
- unsigned long iss2 = ESR_ELx_ISS2(esr);
-
if (!system_supports_poe())
return false;
- if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay))
- return true;
-
+ /*
+ * We do not check whether an Overlay fault has occurred because we
+ * cannot make a decision based solely on its value:
+ *
+ * - If Overlay is set, a fault did occur due to POE, but it may be
+ * spurious in those cases where we update POR_EL0 without ISB (e.g.
+ * on context-switch). We would then need to manually check POR_EL0
+ * against vma_pkey(vma), which is exactly what
+ * arch_vma_access_permitted() does.
+ *
+ * - If Overlay is not set, we may still need to report a pkey fault.
+ * This is the case if an access was made within a mapping but with no
+ * page mapped, and POR_EL0 forbids the access (according to
+ * vma_pkey()). Such access will result in a SIGSEGV regardless
+ * because core code checks arch_vma_access_permitted(), but in order
+ * to report the correct error code - SEGV_PKUERR - we must handle
+ * that case here.
+ */
return !arch_vma_access_permitted(vma,
mm_flags & FAULT_FLAG_WRITE,
mm_flags & FAULT_FLAG_INSTRUCTION,
@@ -549,7 +599,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
const struct fault_info *inf;
struct mm_struct *mm = current->mm;
vm_fault_t fault;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
unsigned int mm_flags = FAULT_FLAG_DEFAULT;
unsigned long addr = untagged_addr(far);
struct vm_area_struct *vma;
@@ -606,11 +656,18 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
die_kernel_fault("execution of user memory",
addr, esr, regs);
- if (!search_exception_tables(regs->pc))
+ if (!insn_may_access_user(regs->pc, esr))
die_kernel_fault("access to user memory outside uaccess routines",
addr, esr, regs);
}
+ if (is_pkvm_stage2_abort(esr)) {
+ if (!user_mode(regs))
+ goto no_context;
+ arm64_force_sig_fault(SIGSEGV, SEGV_ACCERR, far, "stage-2 fault");
+ return 0;
+ }
+
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
if (!(mm_flags & FAULT_FLAG_USER))
@@ -635,7 +692,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
goto bad_area;
}
- if (fault_from_pkey(esr, vma, mm_flags)) {
+ if (fault_from_pkey(vma, mm_flags)) {
pkey = vma_pkey(vma);
vma_end_read(vma);
fault = 0;
@@ -679,7 +736,7 @@ retry:
goto bad_area;
}
- if (fault_from_pkey(esr, vma, mm_flags)) {
+ if (fault_from_pkey(vma, mm_flags)) {
pkey = vma_pkey(vma);
mmap_read_unlock(mm);
fault = 0;
@@ -826,6 +883,7 @@ static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
*/
siaddr = untagged_addr(far);
}
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK);
arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
return 0;
@@ -837,9 +895,12 @@ static int do_tag_check_fault(unsigned long far, unsigned long esr,
/*
* The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
* for tag check faults. Set them to corresponding bits in the untagged
- * address.
+ * address if ARM64_MTE_FAR isn't supported.
+ * Otherwise, bits 63:60 of FAR_EL1 are not UNKNOWN.
*/
- far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
+ if (!cpus_have_cap(ARM64_MTE_FAR))
+ far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
+
do_bad_area(far, esr, regs);
return 0;
}
@@ -939,75 +1000,6 @@ void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
NOKPROBE_SYMBOL(do_sp_pc_abort);
/*
- * __refdata because early_brk64 is __init, but the reference to it is
- * clobbered at arch_initcall time.
- * See traps.c and debug-monitors.c:debug_traps_init().
- */
-static struct fault_info __refdata debug_fault_info[] = {
- { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" },
- { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" },
- { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" },
- { do_bad, SIGKILL, SI_KERNEL, "unknown 3" },
- { do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" },
- { do_bad, SIGKILL, SI_KERNEL, "aarch32 vector catch" },
- { early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" },
- { do_bad, SIGKILL, SI_KERNEL, "unknown 7" },
-};
-
-void __init hook_debug_fault_code(int nr,
- int (*fn)(unsigned long, unsigned long, struct pt_regs *),
- int sig, int code, const char *name)
-{
- BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));
-
- debug_fault_info[nr].fn = fn;
- debug_fault_info[nr].sig = sig;
- debug_fault_info[nr].code = code;
- debug_fault_info[nr].name = name;
-}
-
-/*
- * In debug exception context, we explicitly disable preemption despite
- * having interrupts disabled.
- * This serves two purposes: it makes it much less likely that we would
- * accidentally schedule in exception context and it will force a warning
- * if we somehow manage to schedule by accident.
- */
-static void debug_exception_enter(struct pt_regs *regs)
-{
- preempt_disable();
-
- /* This code is a bit fragile. Test it. */
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
-}
-NOKPROBE_SYMBOL(debug_exception_enter);
-
-static void debug_exception_exit(struct pt_regs *regs)
-{
- preempt_enable_no_resched();
-}
-NOKPROBE_SYMBOL(debug_exception_exit);
-
-void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
- struct pt_regs *regs)
-{
- const struct fault_info *inf = esr_to_debug_fault_info(esr);
- unsigned long pc = instruction_pointer(regs);
-
- debug_exception_enter(regs);
-
- if (user_mode(regs) && !is_ttbr0_addr(pc))
- arm64_apply_bp_hardening();
-
- if (inf->fn(addr_if_watchpoint, esr, regs)) {
- arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
- }
-
- debug_exception_exit(regs);
-}
-NOKPROBE_SYMBOL(do_debug_exception);
-
-/*
* Used during anonymous page fault handling.
*/
struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
@@ -1026,10 +1018,24 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
return vma_alloc_folio(flags, 0, vma, vaddr);
}
-void tag_clear_highpage(struct page *page)
+bool tag_clear_highpages(struct page *page, int numpages, bool clear_pages)
{
- /* Newly allocated page, shouldn't have been tagged yet */
- WARN_ON_ONCE(!try_page_mte_tagging(page));
- mte_zero_clear_page_tags(page_address(page));
- set_page_mte_tagged(page);
+ /*
+ * Check if MTE is supported and fall back to clear_highpage().
+ * get_huge_zero_folio() unconditionally passes __GFP_ZEROTAGS and
+ * post_alloc_hook() will invoke tag_clear_highpages().
+ */
+ if (!system_supports_mte())
+ return clear_pages;
+
+ /* Newly allocated pages, shouldn't have been tagged yet */
+ for (int i = 0; i < numpages; i++, page++) {
+ WARN_ON_ONCE(!try_page_mte_tagging(page));
+ if (clear_pages)
+ mte_zero_clear_page_tags(page_address(page));
+ else
+ mte_clear_page_tags(page_address(page));
+ set_page_mte_tagged(page);
+ }
+ return false;
}
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index 013eead9b695..fbf08b543c3f 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -53,11 +53,11 @@ void __sync_icache_dcache(pte_t pte)
{
struct folio *folio = page_folio(pte_page(pte));
- if (!test_bit(PG_dcache_clean, &folio->flags)) {
+ if (!test_bit(PG_dcache_clean, &folio->flags.f)) {
sync_icache_aliases((unsigned long)folio_address(folio),
(unsigned long)folio_address(folio) +
folio_size(folio));
- set_bit(PG_dcache_clean, &folio->flags);
+ set_bit(PG_dcache_clean, &folio->flags.f);
}
}
EXPORT_SYMBOL_GPL(__sync_icache_dcache);
@@ -69,8 +69,8 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache);
*/
void flush_dcache_folio(struct folio *folio)
{
- if (test_bit(PG_dcache_clean, &folio->flags))
- clear_bit(PG_dcache_clean, &folio->flags);
+ if (test_bit(PG_dcache_clean, &folio->flags.f))
+ clear_bit(PG_dcache_clean, &folio->flags.f);
}
EXPORT_SYMBOL(flush_dcache_folio);
diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c
index 5c46ec527b1c..680749611a9a 100644
--- a/arch/arm64/mm/gcs.c
+++ b/arch/arm64/mm/gcs.c
@@ -12,19 +12,7 @@
static unsigned long alloc_gcs(unsigned long addr, unsigned long size)
{
- int flags = MAP_ANONYMOUS | MAP_PRIVATE;
- struct mm_struct *mm = current->mm;
- unsigned long mapped_addr, unused;
-
- if (addr)
- flags |= MAP_FIXED_NOREPLACE;
-
- mmap_write_lock(mm);
- mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
- VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
- mmap_write_unlock(mm);
-
- return mapped_addr;
+ return vm_mmap_shadow_stack(addr, size, 0);
}
static unsigned long gcs_size(unsigned long size)
@@ -157,12 +145,6 @@ void gcs_free(struct task_struct *task)
if (!system_supports_gcs())
return;
- /*
- * When fork() with CLONE_VM fails, the child (tsk) already
- * has a GCS allocated, and exit_thread() calls this function
- * to free it. In this case the parent (current) and the
- * child share the same mm struct.
- */
if (!task->mm || task->mm != current->mm)
return;
@@ -205,8 +187,8 @@ int arch_set_shadow_stack_status(struct task_struct *task, unsigned long arg)
size = gcs_size(0);
gcs = alloc_gcs(0, size);
- if (!gcs)
- return -ENOMEM;
+ if (IS_ERR_VALUE(gcs))
+ return gcs;
task->thread.gcspr_el0 = gcs + size - sizeof(u64);
task->thread.gcs_base = gcs;
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index b3a7fafe8892..30772a909aea 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -36,16 +36,12 @@
* huge pages could still be served from those areas.
*/
#ifdef CONFIG_CMA
-void __init arm64_hugetlb_cma_reserve(void)
+unsigned int arch_hugetlb_cma_order(void)
{
- int order;
-
if (pud_sect_supported())
- order = PUD_SHIFT - PAGE_SHIFT;
- else
- order = CONT_PMD_SHIFT - PAGE_SHIFT;
+ return PUD_SHIFT - PAGE_SHIFT;
- hugetlb_cma_reserve(order);
+ return CONT_PMD_SHIFT - PAGE_SHIFT;
}
#endif /* CONFIG_CMA */
@@ -129,7 +125,7 @@ pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
if (!pte_present(orig_pte) || !pte_cont(orig_pte))
return orig_pte;
- ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize);
+ ncontig = find_num_contig(mm, addr, ptep, &pgsize);
for (i = 0; i < ncontig; i++, ptep++) {
pte_t pte = __ptep_get(ptep);
@@ -159,12 +155,12 @@ static pte_t get_clear_contig(struct mm_struct *mm,
pte_t pte, tmp_pte;
bool present;
- pte = __ptep_get_and_clear(mm, addr, ptep);
+ pte = __ptep_get_and_clear_anysz(mm, addr, ptep, pgsize);
present = pte_present(pte);
while (--ncontig) {
ptep++;
addr += pgsize;
- tmp_pte = __ptep_get_and_clear(mm, addr, ptep);
+ tmp_pte = __ptep_get_and_clear_anysz(mm, addr, ptep, pgsize);
if (present) {
if (pte_dirty(tmp_pte))
pte = pte_mkdirty(pte);
@@ -183,8 +179,9 @@ static pte_t get_clear_contig_flush(struct mm_struct *mm,
{
pte_t orig_pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig);
struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+ unsigned long end = addr + (pgsize * ncontig);
- flush_tlb_range(&vma, addr, addr + (pgsize * ncontig));
+ __flush_hugetlb_tlb_range(&vma, addr, end, pgsize, TLBF_NOWALKCACHE);
return orig_pte;
}
@@ -207,9 +204,12 @@ static void clear_flush(struct mm_struct *mm,
unsigned long i, saddr = addr;
for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
- __ptep_get_and_clear(mm, addr, ptep);
+ __ptep_get_and_clear_anysz(mm, addr, ptep, pgsize);
- flush_tlb_range(&vma, saddr, addr);
+ if (mm == &init_mm)
+ flush_tlb_kernel_range(saddr, addr);
+ else
+ __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, TLBF_NOWALKCACHE);
}
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -218,30 +218,20 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
size_t pgsize;
int i;
int ncontig;
- unsigned long pfn, dpfn;
- pgprot_t hugeprot;
ncontig = num_contig_ptes(sz, &pgsize);
if (!pte_present(pte)) {
for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
- __set_ptes(mm, addr, ptep, pte, 1);
- return;
- }
-
- if (!pte_cont(pte)) {
- __set_ptes(mm, addr, ptep, pte, 1);
+ __set_ptes_anysz(mm, addr, ptep, pte, 1, pgsize);
return;
}
- pfn = pte_pfn(pte);
- dpfn = pgsize >> PAGE_SHIFT;
- hugeprot = pte_pgprot(pte);
-
- clear_flush(mm, addr, ptep, pgsize, ncontig);
+ /* Only need to "break" if transitioning valid -> valid. */
+ if (pte_cont(pte) && pte_valid(__ptep_get(ptep)))
+ clear_flush(mm, addr, ptep, pgsize, ncontig);
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
- __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
+ __set_ptes_anysz(mm, addr, ptep, pte, ncontig, pgsize);
}
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -334,7 +324,9 @@ unsigned long hugetlb_mask_last_page(struct hstate *h)
switch (hp_size) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SIZE:
- return PGDIR_SIZE - PUD_SIZE;
+ if (pud_sect_supported())
+ return PGDIR_SIZE - PUD_SIZE;
+ break;
#endif
case CONT_PMD_SIZE:
return PUD_SIZE - CONT_PMD_SIZE;
@@ -356,23 +348,21 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
switch (pagesize) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SIZE:
- entry = pud_pte(pud_mkhuge(pte_pud(entry)));
+ if (pud_sect_supported())
+ return pud_pte(pud_mkhuge(pte_pud(entry)));
break;
#endif
case CONT_PMD_SIZE:
- entry = pmd_pte(pmd_mkcont(pte_pmd(entry)));
- fallthrough;
+ return pmd_pte(pmd_mkhuge(pmd_mkcont(pte_pmd(entry))));
case PMD_SIZE:
- entry = pmd_pte(pmd_mkhuge(pte_pmd(entry)));
- break;
+ return pmd_pte(pmd_mkhuge(pte_pmd(entry)));
case CONT_PTE_SIZE:
- entry = pte_mkcont(entry);
- break;
+ return pte_mkcont(entry);
default:
- pr_warn("%s: unrecognized huge page size 0x%lx\n",
- __func__, pagesize);
break;
}
+ pr_warn("%s: unrecognized huge page size 0x%lx\n",
+ __func__, pagesize);
return entry;
}
@@ -431,23 +421,23 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t pte, int dirty)
{
- int ncontig, i;
+ int ncontig;
size_t pgsize = 0;
- unsigned long pfn = pte_pfn(pte), dpfn;
struct mm_struct *mm = vma->vm_mm;
- pgprot_t hugeprot;
pte_t orig_pte;
- if (!pte_cont(pte))
- return __ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+ VM_WARN_ON(!pte_present(pte));
+ ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize);
- ncontig = find_num_contig(mm, addr, ptep, &pgsize);
- dpfn = pgsize >> PAGE_SHIFT;
+ if (!pte_cont(pte))
+ return __ptep_set_access_flags_anysz(vma, addr, ptep, pte,
+ dirty, pgsize);
if (!__cont_access_flags_changed(ptep, pte, ncontig))
return 0;
orig_pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
+ VM_WARN_ON(!pte_present(orig_pte));
/* Make sure we don't lose the dirty or young state */
if (pte_dirty(orig_pte))
@@ -456,38 +446,31 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
if (pte_young(orig_pte))
pte = pte_mkyoung(pte);
- hugeprot = pte_pgprot(pte);
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
- __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
-
+ __set_ptes_anysz(mm, addr, ptep, pte, ncontig, pgsize);
return 1;
}
void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- unsigned long pfn, dpfn;
- pgprot_t hugeprot;
- int ncontig, i;
+ int ncontig;
size_t pgsize;
pte_t pte;
- if (!pte_cont(__ptep_get(ptep))) {
+ pte = __ptep_get(ptep);
+ VM_WARN_ON(!pte_present(pte));
+
+ if (!pte_cont(pte)) {
__ptep_set_wrprotect(mm, addr, ptep);
return;
}
ncontig = find_num_contig(mm, addr, ptep, &pgsize);
- dpfn = pgsize >> PAGE_SHIFT;
pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
pte = pte_wrprotect(pte);
- hugeprot = pte_pgprot(pte);
- pfn = pte_pfn(pte);
-
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
- __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
+ __set_ptes_anysz(mm, addr, ptep, pte, ncontig, pgsize);
}
pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
@@ -497,10 +480,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
size_t pgsize;
int ncontig;
- if (!pte_cont(__ptep_get(ptep)))
- return ptep_clear_flush(vma, addr, ptep);
-
- ncontig = find_num_contig(mm, addr, ptep, &pgsize);
+ ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize);
return get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
}
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index ccdef53872a0..97987f850a33 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -98,21 +98,19 @@ static void __init arch_reserve_crashkernel(void)
{
unsigned long long low_size = 0;
unsigned long long crash_base, crash_size;
- char *cmdline = boot_command_line;
bool high = false;
int ret;
if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
return;
- ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
+ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&crash_size, &crash_base,
- &low_size, &high);
+ &low_size, NULL, &high);
if (ret)
return;
- reserve_crashkernel_generic(cmdline, crash_size, crash_base,
- low_size, high);
+ reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
}
static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
@@ -120,9 +118,22 @@ static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
}
-static void __init zone_sizes_init(void)
+void __init arch_zone_limits_init(unsigned long *max_zone_pfns)
+{
+ phys_addr_t __maybe_unused dma32_phys_limit =
+ max_zone_phys(DMA_BIT_MASK(32));
+
+#ifdef CONFIG_ZONE_DMA
+ max_zone_pfns[ZONE_DMA] = PFN_DOWN(max_zone_phys(zone_dma_limit));
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
+#endif
+ max_zone_pfns[ZONE_NORMAL] = max_pfn;
+}
+
+static void __init dma_limits_init(void)
{
- unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
phys_addr_t __maybe_unused acpi_zone_dma_limit;
phys_addr_t __maybe_unused dt_zone_dma_limit;
phys_addr_t __maybe_unused dma32_phys_limit =
@@ -141,18 +152,13 @@ static void __init zone_sizes_init(void)
if (memblock_start_of_DRAM() < U32_MAX)
zone_dma_limit = min(zone_dma_limit, U32_MAX);
arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
- max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
- max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
if (!arm64_dma_phys_limit)
arm64_dma_phys_limit = dma32_phys_limit;
#endif
if (!arm64_dma_phys_limit)
arm64_dma_phys_limit = PHYS_MASK + 1;
- max_zone_pfns[ZONE_NORMAL] = max_pfn;
-
- free_area_init(max_zone_pfns);
}
int pfn_is_map_memory(unsigned long pfn)
@@ -245,7 +251,7 @@ void __init arm64_memblock_init(void)
*/
if (memory_limit != PHYS_ADDR_MAX) {
memblock_mem_limit_remove_map(memory_limit);
- memblock_add(__pa_symbol(_text), (u64)(_end - _text));
+ memblock_add(__pa_symbol(_text), (resource_size_t)(_end - _text));
}
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
@@ -254,8 +260,8 @@ void __init arm64_memblock_init(void)
* initrd to become inaccessible via the linear mapping.
* Otherwise, this is a no-op
*/
- u64 base = phys_initrd_start & PAGE_MASK;
- u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;
+ phys_addr_t base = phys_initrd_start & PAGE_MASK;
+ resource_size_t size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;
/*
* We can only add back the initrd memory if we don't end up
@@ -277,31 +283,11 @@ void __init arm64_memblock_init(void)
}
}
- if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
- extern u16 memstart_offset_seed;
- u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1);
- int parange = cpuid_feature_extract_unsigned_field(
- mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT);
- s64 range = linear_region_size -
- BIT(id_aa64mmfr0_parange_to_phys_shift(parange));
-
- /*
- * If the size of the linear region exceeds, by a sufficient
- * margin, the size of the region that the physical memory can
- * span, randomize the linear region as well.
- */
- if (memstart_offset_seed > 0 && range >= (s64)ARM64_MEMSTART_ALIGN) {
- range /= ARM64_MEMSTART_ALIGN;
- memstart_addr -= ARM64_MEMSTART_ALIGN *
- ((range * memstart_offset_seed) >> 16);
- }
- }
-
/*
* Register the kernel text, kernel data, initrd, and initial
* pagetables with memblock.
*/
- memblock_reserve(__pa_symbol(_stext), _end - _stext);
+ memblock_reserve(__pa_symbol(_text), _end - _text);
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
/* the generic initrd code expects virtual addresses */
initrd_start = __phys_to_virt(phys_initrd_start);
@@ -309,8 +295,6 @@ void __init arm64_memblock_init(void)
}
early_init_fdt_scan_reserved_mem();
-
- high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
}
void __init bootmem_init(void)
@@ -327,23 +311,8 @@ void __init bootmem_init(void)
arch_numa_init();
- /*
- * must be done after arch_numa_init() which calls numa_init() to
- * initialize node_online_map that gets used in hugetlb_cma_reserve()
- * while allocating required CMA size across online nodes.
- */
-#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
- arm64_hugetlb_cma_reserve();
-#endif
-
kvm_hyp_reserve();
-
- /*
- * sparse_init() tries to allocate memory from memblock, so must be
- * done after the fixed reservations
- */
- sparse_init();
- zone_sizes_init();
+ dma_limits_init();
/*
* Reserve the CMA area after arm64_dma_phys_limit was initialised.
@@ -359,12 +328,12 @@ void __init bootmem_init(void)
memblock_dump_all();
}
-/*
- * mem_init() marks the free areas in the mem_map and tells us how much memory
- * is free. This is done after various parts of the system have claimed their
- * memory after the kernel image.
- */
-void __init mem_init(void)
+void __init arch_setup_zero_pages(void)
+{
+ __zero_page = phys_to_page(__pa_symbol(empty_zero_page));
+}
+
+void __init arch_mm_preinit(void)
{
unsigned int flags = SWIOTLB_VERBOSE;
bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);
@@ -386,10 +355,6 @@ void __init mem_init(void)
}
swiotlb_init(swiotlb, flags);
- swiotlb_update_mem_attributes();
-
- /* this will put all unused low memory onto the freelists */
- memblock_free_all();
/*
* Check boundaries twice: Some fundamental inconsistencies can be
@@ -416,6 +381,14 @@ void __init mem_init(void)
}
}
+bool page_alloc_available __ro_after_init;
+
+void __init mem_init(void)
+{
+ page_alloc_available = true;
+ swiotlb_update_mem_attributes();
+}
+
void free_initmem(void)
{
void *lm_init_begin = lm_alias(__init_begin);
@@ -424,9 +397,6 @@ void free_initmem(void)
WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE));
WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE));
- /* Delete __init region from memblock.reserved. */
- memblock_free(lm_init_begin, lm_init_end - lm_init_begin);
-
free_reserved_area(lm_init_begin, lm_init_end,
POISON_FREE_INITMEM, "unused kernel");
/*
diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index 6cc0b7e7eb03..6eef48ef6278 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -14,18 +14,18 @@ int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook)
return 0;
}
-void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
- unsigned long prot)
+void __iomem *__ioremap_prot(phys_addr_t phys_addr, size_t size,
+ pgprot_t pgprot)
{
unsigned long last_addr = phys_addr + size - 1;
- pgprot_t pgprot = __pgprot(prot);
/* Don't allow outside PHYS_MASK */
if (last_addr & ~PHYS_MASK)
return NULL;
/* Don't allow RAM to be mapped. */
- if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr))))
+ if (WARN_ONCE(pfn_is_map_memory(__phys_to_pfn(phys_addr)),
+ "ioremap attempted on RAM pfn\n"))
return NULL;
/*
@@ -39,7 +39,7 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
return generic_ioremap_prot(phys_addr, size, pgprot);
}
-EXPORT_SYMBOL(ioremap_prot);
+EXPORT_SYMBOL(__ioremap_prot);
/*
* Must be called after early_fixmap_init
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index b65a29440a0c..abeb81bf6ebd 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -190,7 +190,7 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
*/
static bool __init root_level_aligned(u64 addr)
{
- int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * (PAGE_SHIFT - 3);
+ int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * PTDESC_TABLE_SHIFT;
return (addr % (PAGE_SIZE << shift)) == 0;
}
@@ -245,7 +245,7 @@ static int __init root_level_idx(u64 addr)
*/
u64 vabits = IS_ENABLED(CONFIG_ARM64_64K_PAGES) ? VA_BITS
: vabits_actual;
- int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * (PAGE_SHIFT - 3);
+ int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * PTDESC_TABLE_SHIFT;
return (addr & ~_PAGE_OFFSET(vabits)) >> (shift + PAGE_SHIFT);
}
@@ -269,7 +269,7 @@ static void __init clone_next_level(u64 addr, pgd_t *tmp_pg_dir, pud_t *pud)
*/
static int __init next_level_idx(u64 addr)
{
- int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * (PAGE_SHIFT - 3);
+ int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * PTDESC_TABLE_SHIFT;
return (addr >> (shift + PAGE_SHIFT)) % PTRS_PER_PTE;
}
@@ -399,14 +399,12 @@ void __init kasan_init(void)
{
kasan_init_shadow();
kasan_init_depth();
-#if defined(CONFIG_KASAN_GENERIC)
+ kasan_init_generic();
/*
* Generic KASAN is now fully initialized.
* Software and Hardware Tag-Based modes still require
* kasan_init_sw_tags() and kasan_init_hw_tags() correspondingly.
*/
- pr_info("KernelAddressSanitizer initialized (generic)\n");
-#endif
}
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 07aeab8a7606..92b2f5097a96 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -34,6 +34,8 @@ static pgprot_t protection_map[16] __ro_after_init = {
[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC
};
+static ptdesc_t gcs_page_prot __ro_after_init = _PAGE_GCS_RO;
+
/*
* You really shouldn't be using read() or write() on /dev/mem. This might go
* away in the future.
@@ -73,21 +75,27 @@ static int __init adjust_protection_map(void)
protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY;
}
- if (lpa2_is_enabled())
+ if (lpa2_is_enabled()) {
for (int i = 0; i < ARRAY_SIZE(protection_map); i++)
pgprot_val(protection_map[i]) &= ~PTE_SHARED;
+ gcs_page_prot &= ~PTE_SHARED;
+ }
return 0;
}
arch_initcall(adjust_protection_map);
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
{
- pteval_t prot;
+ ptdesc_t prot;
/* Short circuit GCS to avoid bloating the table. */
if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
- prot = _PAGE_GCS_RO;
+ /* Honour mprotect(PROT_NONE) on shadow stack mappings */
+ if (vm_flags & VM_ACCESS_FLAGS)
+ prot = gcs_page_prot;
+ else
+ prot = pgprot_val(protection_map[VM_NONE]);
} else {
prot = pgprot_val(protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 1dfe1a8efdbe..dd85e093ffdb 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -26,6 +26,9 @@
#include <linux/set_memory.h>
#include <linux/kfence.h>
#include <linux/pkeys.h>
+#include <linux/mm_inline.h>
+#include <linux/pagewalk.h>
+#include <linux/stop_machine.h>
#include <asm/barrier.h>
#include <asm/cputype.h>
@@ -46,6 +49,8 @@
#define NO_CONT_MAPPINGS BIT(1)
#define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
+DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);
+
u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);
@@ -59,13 +64,6 @@ static bool rodata_is_rw __ro_after_init = true;
*/
long __section(".mmuoff.data.write") __early_cpu_boot_status;
-/*
- * Empty_zero_page is a special page that is used for zero-initialized data
- * and COW.
- */
-unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
-EXPORT_SYMBOL(empty_zero_page);
-
static DEFINE_SPINLOCK(swapper_pgdir_lock);
static DEFINE_MUTEX(fixmap_lock);
@@ -107,7 +105,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
}
EXPORT_SYMBOL(phys_mem_access_prot);
-static phys_addr_t __init early_pgtable_alloc(int shift)
+static phys_addr_t __init early_pgtable_alloc(enum pgtable_level pgtable_level)
{
phys_addr_t phys;
@@ -189,17 +187,17 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
} while (ptep++, addr += PAGE_SIZE, addr != end);
}
-static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
- unsigned long end, phys_addr_t phys,
- pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
- int flags)
+static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, phys_addr_t phys,
+ pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
+ int flags)
{
unsigned long next;
pmd_t pmd = READ_ONCE(*pmdp);
pte_t *ptep;
- BUG_ON(pmd_sect(pmd));
+ BUG_ON(pmd_leaf(pmd));
if (pmd_none(pmd)) {
pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
phys_addr_t pte_phys;
@@ -207,7 +205,9 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
if (flags & NO_EXEC_MAPPINGS)
pmdval |= PMD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- pte_phys = pgtable_alloc(PAGE_SHIFT);
+ pte_phys = pgtable_alloc(PGTABLE_LEVEL_PTE);
+ if (pte_phys == INVALID_PHYS_ADDR)
+ return -ENOMEM;
ptep = pte_set_fixmap(pte_phys);
init_clear_pgtable(ptep);
ptep += pte_index(addr);
@@ -239,11 +239,13 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
* walker.
*/
pte_clear_fixmap();
+
+ return 0;
}
-static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
- phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags)
+static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ phys_addr_t phys, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags)
{
unsigned long next;
@@ -264,21 +266,29 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
READ_ONCE(pmd_val(*pmdp))));
} else {
- alloc_init_cont_pte(pmdp, addr, next, phys, prot,
- pgtable_alloc, flags);
+ int ret;
+
+ ret = alloc_init_cont_pte(pmdp, addr, next, phys, prot,
+ pgtable_alloc, flags);
+ if (ret)
+ return ret;
BUG_ON(pmd_val(old_pmd) != 0 &&
pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
}
phys += next - addr;
} while (pmdp++, addr = next, addr != end);
+
+ return 0;
}
-static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
- unsigned long end, phys_addr_t phys,
- pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags)
+static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
+ unsigned long end, phys_addr_t phys,
+ pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
+ int flags)
{
+ int ret;
unsigned long next;
pud_t pud = READ_ONCE(*pudp);
pmd_t *pmdp;
@@ -286,7 +296,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
/*
* Check for initial section mappings in the pgd/pud.
*/
- BUG_ON(pud_sect(pud));
+ BUG_ON(pud_leaf(pud));
if (pud_none(pud)) {
pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
phys_addr_t pmd_phys;
@@ -294,7 +304,9 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
if (flags & NO_EXEC_MAPPINGS)
pudval |= PUD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- pmd_phys = pgtable_alloc(PMD_SHIFT);
+ pmd_phys = pgtable_alloc(PGTABLE_LEVEL_PMD);
+ if (pmd_phys == INVALID_PHYS_ADDR)
+ return -ENOMEM;
pmdp = pmd_set_fixmap(pmd_phys);
init_clear_pgtable(pmdp);
pmdp += pmd_index(addr);
@@ -314,20 +326,26 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
(flags & NO_CONT_MAPPINGS) == 0)
__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
- init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
+ ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
+ if (ret)
+ goto out;
pmdp += pmd_index(next) - pmd_index(addr);
phys += next - addr;
} while (addr = next, addr != end);
+out:
pmd_clear_fixmap();
+
+ return ret;
}
-static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
- phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
- int flags)
+static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
+ phys_addr_t phys, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
+ int flags)
{
+ int ret = 0;
unsigned long next;
p4d_t p4d = READ_ONCE(*p4dp);
pud_t *pudp;
@@ -339,7 +357,9 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
if (flags & NO_EXEC_MAPPINGS)
p4dval |= P4D_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- pud_phys = pgtable_alloc(PUD_SHIFT);
+ pud_phys = pgtable_alloc(PGTABLE_LEVEL_PUD);
+ if (pud_phys == INVALID_PHYS_ADDR)
+ return -ENOMEM;
pudp = pud_set_fixmap(pud_phys);
init_clear_pgtable(pudp);
pudp += pud_index(addr);
@@ -369,8 +389,10 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
READ_ONCE(pud_val(*pudp))));
} else {
- alloc_init_cont_pmd(pudp, addr, next, phys, prot,
- pgtable_alloc, flags);
+ ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
+ pgtable_alloc, flags);
+ if (ret)
+ goto out;
BUG_ON(pud_val(old_pud) != 0 &&
pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
@@ -378,14 +400,18 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
phys += next - addr;
} while (pudp++, addr = next, addr != end);
+out:
pud_clear_fixmap();
+
+ return ret;
}
-static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
- phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
- int flags)
+static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
+ phys_addr_t phys, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
+ int flags)
{
+ int ret;
unsigned long next;
pgd_t pgd = READ_ONCE(*pgdp);
p4d_t *p4dp;
@@ -397,7 +423,9 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
if (flags & NO_EXEC_MAPPINGS)
pgdval |= PGD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- p4d_phys = pgtable_alloc(P4D_SHIFT);
+ p4d_phys = pgtable_alloc(PGTABLE_LEVEL_P4D);
+ if (p4d_phys == INVALID_PHYS_ADDR)
+ return -ENOMEM;
p4dp = p4d_set_fixmap(p4d_phys);
init_clear_pgtable(p4dp);
p4dp += p4d_index(addr);
@@ -412,8 +440,10 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
next = p4d_addr_end(addr, end);
- alloc_init_pud(p4dp, addr, next, phys, prot,
- pgtable_alloc, flags);
+ ret = alloc_init_pud(p4dp, addr, next, phys, prot,
+ pgtable_alloc, flags);
+ if (ret)
+ goto out;
BUG_ON(p4d_val(old_p4d) != 0 &&
p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));
@@ -421,15 +451,19 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
phys += next - addr;
} while (p4dp++, addr = next, addr != end);
+out:
p4d_clear_fixmap();
+
+ return ret;
}
-static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
- unsigned long virt, phys_addr_t size,
- pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
- int flags)
+static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
+ unsigned long virt, phys_addr_t size,
+ pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
+ int flags)
{
+ int ret;
unsigned long addr, end, next;
pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
@@ -438,7 +472,7 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
* within a page, we cannot map the region as the caller expects.
*/
if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
- return;
+ return -EINVAL;
phys &= PAGE_MASK;
addr = virt & PAGE_MASK;
@@ -446,59 +480,511 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
do {
next = pgd_addr_end(addr, end);
- alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
- flags);
+ ret = alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
+ flags);
+ if (ret)
+ return ret;
phys += next - addr;
} while (pgdp++, addr = next, addr != end);
+
+ return 0;
}
-static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
- unsigned long virt, phys_addr_t size,
- pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
- int flags)
+static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
+ unsigned long virt, phys_addr_t size,
+ pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
+ int flags)
{
+ int ret;
+
mutex_lock(&fixmap_lock);
- __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
- pgtable_alloc, flags);
+ ret = __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
+ pgtable_alloc, flags);
mutex_unlock(&fixmap_lock);
+
+ return ret;
}
-#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
-extern __alias(__create_pgd_mapping_locked)
-void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
- phys_addr_t size, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags);
-#endif
+static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
+ unsigned long virt, phys_addr_t size,
+ pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
+ int flags)
+{
+ int ret;
-static phys_addr_t __pgd_pgtable_alloc(int shift)
+ ret = __create_pgd_mapping(pgdir, phys, virt, size, prot, pgtable_alloc,
+ flags);
+ if (ret)
+ panic("Failed to create page tables\n");
+}
+
+static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
+ enum pgtable_level pgtable_level)
{
/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
- void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);
+ struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0);
+ phys_addr_t pa;
+
+ if (!ptdesc)
+ return INVALID_PHYS_ADDR;
+
+ pa = page_to_phys(ptdesc_page(ptdesc));
+
+ switch (pgtable_level) {
+ case PGTABLE_LEVEL_PTE:
+ BUG_ON(!pagetable_pte_ctor(mm, ptdesc));
+ break;
+ case PGTABLE_LEVEL_PMD:
+ BUG_ON(!pagetable_pmd_ctor(mm, ptdesc));
+ break;
+ case PGTABLE_LEVEL_PUD:
+ pagetable_pud_ctor(ptdesc);
+ break;
+ case PGTABLE_LEVEL_P4D:
+ pagetable_p4d_ctor(ptdesc);
+ break;
+ case PGTABLE_LEVEL_PGD:
+ VM_WARN_ON(1);
+ break;
+ }
+
+ return pa;
+}
+
+static phys_addr_t
+pgd_pgtable_alloc_init_mm_gfp(enum pgtable_level pgtable_level, gfp_t gfp)
+{
+ return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_level);
+}
- BUG_ON(!ptr);
- return __pa(ptr);
+static phys_addr_t __maybe_unused
+pgd_pgtable_alloc_init_mm(enum pgtable_level pgtable_level)
+{
+ return pgd_pgtable_alloc_init_mm_gfp(pgtable_level, GFP_PGTABLE_KERNEL);
+}
+
+static phys_addr_t
+pgd_pgtable_alloc_special_mm(enum pgtable_level pgtable_level)
+{
+ return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_level);
}
-static phys_addr_t pgd_pgtable_alloc(int shift)
+static void split_contpte(pte_t *ptep)
{
- phys_addr_t pa = __pgd_pgtable_alloc(shift);
- struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
+ int i;
+
+ ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
+ for (i = 0; i < CONT_PTES; i++, ptep++)
+ __set_pte(ptep, pte_mknoncont(__ptep_get(ptep)));
+}
+
+static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
+{
+ pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
+ unsigned long pfn = pmd_pfn(pmd);
+ pgprot_t prot = pmd_pgprot(pmd);
+ phys_addr_t pte_phys;
+ pte_t *ptep;
+ int i;
+
+ pte_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PTE, gfp);
+ if (pte_phys == INVALID_PHYS_ADDR)
+ return -ENOMEM;
+ ptep = (pte_t *)phys_to_virt(pte_phys);
+
+ if (pgprot_val(prot) & PMD_SECT_PXN)
+ tableprot |= PMD_TABLE_PXN;
+
+ prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE);
+ if (!pmd_valid(pmd))
+ prot = pte_pgprot(pte_mkinvalid(pfn_pte(0, prot)));
+ prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
+ if (to_cont)
+ prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+
+ for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
+ __set_pte(ptep, pfn_pte(pfn, prot));
/*
- * Call proper page table ctor in case later we need to
- * call core mm functions like apply_to_page_range() on
- * this pre-allocated page table.
+ * Ensure the pte entries are visible to the table walker by the time
+ * the pmd entry that points to the ptes is visible.
+ */
+ dsb(ishst);
+ __pmd_populate(pmdp, pte_phys, tableprot);
+
+ return 0;
+}
+
+static void split_contpmd(pmd_t *pmdp)
+{
+ int i;
+
+ pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
+ for (i = 0; i < CONT_PMDS; i++, pmdp++)
+ set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp)));
+}
+
+static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
+{
+ pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
+ unsigned int step = PMD_SIZE >> PAGE_SHIFT;
+ unsigned long pfn = pud_pfn(pud);
+ pgprot_t prot = pud_pgprot(pud);
+ phys_addr_t pmd_phys;
+ pmd_t *pmdp;
+ int i;
+
+ pmd_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PMD, gfp);
+ if (pmd_phys == INVALID_PHYS_ADDR)
+ return -ENOMEM;
+ pmdp = (pmd_t *)phys_to_virt(pmd_phys);
+
+ if (pgprot_val(prot) & PMD_SECT_PXN)
+ tableprot |= PUD_TABLE_PXN;
+
+ prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT);
+ if (!pud_valid(pud))
+ prot = pmd_pgprot(pmd_mkinvalid(pfn_pmd(0, prot)));
+ prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
+ if (to_cont)
+ prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+
+ for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step)
+ set_pmd(pmdp, pfn_pmd(pfn, prot));
+
+ /*
+ * Ensure the pmd entries are visible to the table walker by the time
+ * the pud entry that points to the pmds is visible.
+ */
+ dsb(ishst);
+ __pud_populate(pudp, pmd_phys, tableprot);
+
+ return 0;
+}
+
+static int split_kernel_leaf_mapping_locked(unsigned long addr)
+{
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+ int ret = 0;
+
+ /*
+ * PGD: If addr is PGD aligned then addr already describes a leaf
+ * boundary. If not present then there is nothing to split.
+ */
+ if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr)
+ goto out;
+ pgdp = pgd_offset_k(addr);
+ pgd = pgdp_get(pgdp);
+ if (!pgd_present(pgd))
+ goto out;
+
+ /*
+ * P4D: If addr is P4D aligned then addr already describes a leaf
+ * boundary. If not present then there is nothing to split.
+ */
+ if (ALIGN_DOWN(addr, P4D_SIZE) == addr)
+ goto out;
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = p4dp_get(p4dp);
+ if (!p4d_present(p4d))
+ goto out;
+
+ /*
+ * PUD: If addr is PUD aligned then addr already describes a leaf
+ * boundary. If not present then there is nothing to split. Otherwise,
+ * if we have a pud leaf, split to contpmd.
+ */
+ if (ALIGN_DOWN(addr, PUD_SIZE) == addr)
+ goto out;
+ pudp = pud_offset(p4dp, addr);
+ pud = pudp_get(pudp);
+ if (!pud_present(pud))
+ goto out;
+ if (pud_leaf(pud)) {
+ ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * CONTPMD: If addr is CONTPMD aligned then addr already describes a
+ * leaf boundary. If not present then there is nothing to split.
+ * Otherwise, if we have a contpmd leaf, split to pmd.
+ */
+ if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr)
+ goto out;
+ pmdp = pmd_offset(pudp, addr);
+ pmd = pmdp_get(pmdp);
+ if (!pmd_present(pmd))
+ goto out;
+ if (pmd_leaf(pmd)) {
+ if (pmd_cont(pmd))
+ split_contpmd(pmdp);
+ /*
+ * PMD: If addr is PMD aligned then addr already describes a
+ * leaf boundary. Otherwise, split to contpte.
+ */
+ if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
+ goto out;
+ ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * CONTPTE: If addr is CONTPTE aligned then addr already describes a
+ * leaf boundary. If not present then there is nothing to split.
+ * Otherwise, if we have a contpte leaf, split to pte.
+ */
+ if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr)
+ goto out;
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = __ptep_get(ptep);
+ if (!pte_present(pte))
+ goto out;
+ if (pte_cont(pte))
+ split_contpte(ptep);
+
+out:
+ return ret;
+}
+
+static inline bool force_pte_mapping(void)
+{
+ const bool bbml2 = system_capabilities_finalized() ?
+ system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
+
+ if (debug_pagealloc_enabled())
+ return true;
+ if (bbml2)
+ return false;
+ return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world();
+}
+
+static DEFINE_MUTEX(pgtable_split_lock);
+static bool linear_map_requires_bbml2;
+
+int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
+{
+ int ret;
+
+ /*
+ * If the region is within a pte-mapped area, there is no need to try to
+ * split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may
+ * change permissions from atomic context so for those cases (which are
+ * always pte-mapped), we must not go any further because taking the
+ * mutex below may sleep. Do not call force_pte_mapping() here because
+ * it could return a confusing result if called from a secondary cpu
+ * prior to finalizing caps. Instead, linear_map_requires_bbml2 gives us
+ * what we need.
+ */
+ if (!linear_map_requires_bbml2 || is_kfence_address((void *)start))
+ return 0;
+
+ if (!system_supports_bbml2_noabort()) {
+ /*
+ * !BBML2_NOABORT systems should not be trying to change
+ * permissions on anything that is not pte-mapped in the first
+ * place. Just return early and let the permission change code
+ * raise a warning if not already pte-mapped.
+ */
+ if (system_capabilities_finalized())
+ return 0;
+
+ /*
+ * Boot-time: split_kernel_leaf_mapping_locked() allocates from
+ * page allocator. Can't split until it's available.
+ */
+ if (WARN_ON(!page_alloc_available))
+ return -EBUSY;
+
+ /*
+ * Boot-time: Started secondary cpus but don't know if they
+ * support BBML2_NOABORT yet. Can't allow splitting in this
+ * window in case they don't.
+ */
+ if (WARN_ON(num_online_cpus() > 1))
+ return -EBUSY;
+ }
+
+ /*
+ * Ensure start and end are at least page-aligned since this is the
+ * finest granularity we can split to.
+ */
+ if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end))
+ return -EINVAL;
+
+ mutex_lock(&pgtable_split_lock);
+ lazy_mmu_mode_enable();
+
+ /*
+ * The split_kernel_leaf_mapping_locked() may sleep, it is not a
+ * problem for ARM64 since ARM64's lazy MMU implementation allows
+ * sleeping.
*
- * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
- * folded, and if so pagetable_pte_ctor() becomes nop.
+ * Optimize for the common case of splitting out a single page from a
+ * larger mapping. Here we can just split on the "least aligned" of
+ * start and end and this will guarantee that there must also be a split
+ * on the more aligned address since the both addresses must be in the
+ * same contpte block and it must have been split to ptes.
*/
- if (shift == PAGE_SHIFT)
- BUG_ON(!pagetable_pte_ctor(ptdesc));
- else if (shift == PMD_SHIFT)
- BUG_ON(!pagetable_pmd_ctor(ptdesc));
+ if (end - start == PAGE_SIZE) {
+ start = __ffs(start) < __ffs(end) ? start : end;
+ ret = split_kernel_leaf_mapping_locked(start);
+ } else {
+ ret = split_kernel_leaf_mapping_locked(start);
+ if (!ret)
+ ret = split_kernel_leaf_mapping_locked(end);
+ }
- return pa;
+ lazy_mmu_mode_disable();
+ mutex_unlock(&pgtable_split_lock);
+ return ret;
+}
+
+static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ gfp_t gfp = *(gfp_t *)walk->private;
+ pud_t pud = pudp_get(pudp);
+ int ret = 0;
+
+ if (pud_leaf(pud))
+ ret = split_pud(pudp, pud, gfp, false);
+
+ return ret;
+}
+
+static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ gfp_t gfp = *(gfp_t *)walk->private;
+ pmd_t pmd = pmdp_get(pmdp);
+ int ret = 0;
+
+ if (pmd_leaf(pmd)) {
+ if (pmd_cont(pmd))
+ split_contpmd(pmdp);
+ ret = split_pmd(pmdp, pmd, gfp, false);
+
+ /*
+ * We have split the pmd directly to ptes so there is no need to
+ * visit each pte to check if they are contpte.
+ */
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return ret;
+}
+
+static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t pte = __ptep_get(ptep);
+
+ if (pte_cont(pte))
+ split_contpte(ptep);
+
+ return 0;
+}
+
+static const struct mm_walk_ops split_to_ptes_ops = {
+ .pud_entry = split_to_ptes_pud_entry,
+ .pmd_entry = split_to_ptes_pmd_entry,
+ .pte_entry = split_to_ptes_pte_entry,
+};
+
+static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp)
+{
+ int ret;
+
+ lazy_mmu_mode_enable();
+ ret = walk_kernel_page_table_range_lockless(start, end,
+ &split_to_ptes_ops, NULL, &gfp);
+ lazy_mmu_mode_disable();
+
+ return ret;
+}
+
+u32 idmap_kpti_bbml2_flag;
+
+static void __init init_idmap_kpti_bbml2_flag(void)
+{
+ WRITE_ONCE(idmap_kpti_bbml2_flag, 1);
+ /* Must be visible to other CPUs before stop_machine() is called. */
+ smp_mb();
+}
+
+static int __init linear_map_split_to_ptes(void *__unused)
+{
+ /*
+ * Repainting the linear map must be done by CPU0 (the boot CPU) because
+ * that's the only CPU that we know supports BBML2. The other CPUs will
+ * be held in a waiting area with the idmap active.
+ */
+ if (!smp_processor_id()) {
+ unsigned long lstart = _PAGE_OFFSET(vabits_actual);
+ unsigned long lend = PAGE_END;
+ unsigned long kstart = (unsigned long)lm_alias(_stext);
+ unsigned long kend = (unsigned long)lm_alias(__init_begin);
+ int ret;
+
+ /*
+ * Wait for all secondary CPUs to be put into the waiting area.
+ */
+ smp_cond_load_acquire(&idmap_kpti_bbml2_flag, VAL == num_online_cpus());
+
+ /*
+ * Walk all of the linear map [lstart, lend), except the kernel
+ * linear map alias [kstart, kend), and split all mappings to
+ * PTE. The kernel alias remains static throughout runtime so
+ * can continue to be safely mapped with large mappings.
+ */
+ ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC);
+ if (!ret)
+ ret = range_split_to_ptes(kend, lend, GFP_ATOMIC);
+ if (ret)
+ panic("Failed to split linear map\n");
+ flush_tlb_kernel_range(lstart, lend);
+
+ /*
+ * Relies on dsb in flush_tlb_kernel_range() to avoid reordering
+ * before any page table split operations.
+ */
+ WRITE_ONCE(idmap_kpti_bbml2_flag, 0);
+ } else {
+ typedef void (wait_split_fn)(void);
+ extern wait_split_fn wait_linear_map_split_to_ptes;
+ wait_split_fn *wait_fn;
+
+ wait_fn = (void *)__pa_symbol(wait_linear_map_split_to_ptes);
+
+ /*
+ * At least one secondary CPU doesn't support BBML2 so cannot
+ * tolerate the size of the live mappings changing. So have the
+ * secondary CPUs wait for the boot CPU to make the changes
+ * with the idmap active and init_mm inactive.
+ */
+ cpu_install_idmap();
+ wait_fn();
+ cpu_uninstall_idmap();
+ }
+
+ return 0;
+}
+
+void __init linear_map_maybe_split_to_ptes(void)
+{
+ if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()) {
+ init_idmap_kpti_bbml2_flag();
+ stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask);
+ }
}
/*
@@ -514,8 +1000,8 @@ void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
&phys, virt);
return;
}
- __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
- NO_CONT_MAPPINGS);
+ early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
+ NO_CONT_MAPPINGS);
}
void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
@@ -529,8 +1015,8 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
if (page_mappings_only)
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
- __create_pgd_mapping(mm->pgd, phys, virt, size, prot,
- pgd_pgtable_alloc, flags);
+ early_create_pgd_mapping(mm->pgd, phys, virt, size, prot,
+ pgd_pgtable_alloc_special_mm, flags);
}
static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
@@ -542,8 +1028,8 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
return;
}
- __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
- NO_CONT_MAPPINGS);
+ early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
+ NO_CONT_MAPPINGS);
/* flush the TLBs after updating live kernel mappings */
flush_tlb_kernel_range(virt, virt + size);
@@ -552,8 +1038,8 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
phys_addr_t end, pgprot_t prot, int flags)
{
- __create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
- prot, early_pgtable_alloc, flags);
+ early_create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
+ prot, early_pgtable_alloc, flags);
}
void __init mark_linear_text_alias_ro(void)
@@ -561,8 +1047,8 @@ void __init mark_linear_text_alias_ro(void)
/*
* Remove the write permissions from the linear alias of .text/.rodata
*/
- update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext),
- (unsigned long)__init_begin - (unsigned long)_stext,
+ update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
+ (unsigned long)__init_begin - (unsigned long)_text,
PAGE_KERNEL_RO);
}
@@ -613,6 +1099,33 @@ static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
__kfence_pool = phys_to_virt(kfence_pool);
}
+
+bool arch_kfence_init_pool(void)
+{
+ unsigned long start = (unsigned long)__kfence_pool;
+ unsigned long end = start + KFENCE_POOL_SIZE;
+ int ret;
+
+ /* Exit early if we know the linear map is already pte-mapped. */
+ if (force_pte_mapping())
+ return true;
+
+ /* Kfence pool is already pte-mapped for the early init case. */
+ if (kfence_early_init)
+ return true;
+
+ mutex_lock(&pgtable_split_lock);
+ ret = range_split_to_ptes(start, end, GFP_PGTABLE_KERNEL);
+ mutex_unlock(&pgtable_split_lock);
+
+ /*
+ * Since the system supports bbml2_noabort, tlb invalidation is not
+ * required here; the pgtable mappings have been split to pte but larger
+ * entries may safely linger in the TLB.
+ */
+
+ return !ret;
+}
#else /* CONFIG_KFENCE */
static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
@@ -623,7 +1136,7 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
static void __init map_mem(pgd_t *pgdp)
{
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
- phys_addr_t kernel_start = __pa_symbol(_stext);
+ phys_addr_t kernel_start = __pa_symbol(_text);
phys_addr_t kernel_end = __pa_symbol(__init_begin);
phys_addr_t start, end;
phys_addr_t early_kfence_pool;
@@ -645,7 +1158,9 @@ static void __init map_mem(pgd_t *pgdp)
early_kfence_pool = arm64_kfence_alloc_pool();
- if (can_set_direct_map())
+ linear_map_requires_bbml2 = !force_pte_mapping() && can_set_direct_map();
+
+ if (force_pte_mapping())
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
/*
@@ -670,7 +1185,7 @@ static void __init map_mem(pgd_t *pgdp)
}
/*
- * Map the linear alias of the [_stext, __init_begin) interval
+ * Map the linear alias of the [_text, __init_begin) interval
* as non-executable now, and remove the write permission in
* mark_linear_text_alias_ro() below (which will be called after
* alternative patching has completed). This makes the contents
@@ -697,6 +1212,10 @@ void mark_rodata_ro(void)
WRITE_ONCE(rodata_is_rw, false);
update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
section_size, PAGE_KERNEL_RO);
+ /* mark the range between _text and _stext as read only. */
+ update_mapping_prot(__pa_symbol(_text), (unsigned long)_text,
+ (unsigned long)_stext - (unsigned long)_text,
+ PAGE_KERNEL_RO);
}
static void __init declare_vma(struct vm_struct *vma,
@@ -722,7 +1241,97 @@ static void __init declare_vma(struct vm_struct *vma,
}
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
-static pgprot_t kernel_exec_prot(void)
+#define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT))
+
+static phys_addr_t kpti_ng_temp_alloc __initdata;
+
+static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_level pgtable_level)
+{
+ kpti_ng_temp_alloc -= PAGE_SIZE;
+ return kpti_ng_temp_alloc;
+}
+
+static int __init __kpti_install_ng_mappings(void *__unused)
+{
+ typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
+ extern kpti_remap_fn idmap_kpti_install_ng_mappings;
+ kpti_remap_fn *remap_fn;
+
+ int cpu = smp_processor_id();
+ int levels = CONFIG_PGTABLE_LEVELS;
+ int order = order_base_2(levels);
+ u64 kpti_ng_temp_pgd_pa = 0;
+ pgd_t *kpti_ng_temp_pgd;
+ u64 alloc = 0;
+
+ if (levels == 5 && !pgtable_l5_enabled())
+ levels = 4;
+ else if (levels == 4 && !pgtable_l4_enabled())
+ levels = 3;
+
+ remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings);
+
+ if (!cpu) {
+ int ret;
+
+ alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
+ kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE);
+ kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd);
+
+ //
+ // Create a minimal page table hierarchy that permits us to map
+ // the swapper page tables temporarily as we traverse them.
+ //
+ // The physical pages are laid out as follows:
+ //
+ // +--------+-/-------+-/------ +-/------ +-\\\--------+
+ // : PTE[] : | PMD[] : | PUD[] : | P4D[] : ||| PGD[] :
+ // +--------+-\-------+-\------ +-\------ +-///--------+
+ // ^
+ // The first page is mapped into this hierarchy at a PMD_SHIFT
+ // aligned virtual address, so that we can manipulate the PTE
+ // level entries while the mapping is active. The first entry
+ // covers the PTE[] page itself, the remaining entries are free
+ // to be used as a ad-hoc fixmap.
+ //
+ ret = __create_pgd_mapping_locked(kpti_ng_temp_pgd, __pa(alloc),
+ KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL,
+ kpti_ng_pgd_alloc, 0);
+ if (ret)
+ panic("Failed to create page tables\n");
+ }
+
+ cpu_install_idmap();
+ remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA);
+ cpu_uninstall_idmap();
+
+ if (!cpu) {
+ free_pages(alloc, order);
+ arm64_use_ng_mappings = true;
+ }
+
+ return 0;
+}
+
+void __init kpti_install_ng_mappings(void)
+{
+ /* Check whether KPTI is going to be used */
+ if (!arm64_kernel_unmapped_at_el0())
+ return;
+
+ /*
+ * We don't need to rewrite the page-tables if either we've done
+ * it already or we have KASLR enabled and therefore have not
+ * created any global mappings at all.
+ */
+ if (arm64_use_ng_mappings)
+ return;
+
+ init_idmap_kpti_bbml2_flag();
+ stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask);
+}
+
+static pgprot_t __init kernel_exec_prot(void)
{
return rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
}
@@ -742,9 +1351,9 @@ static int __init map_entry_trampoline(void)
/* Map only the text into the trampoline page table */
memset(tramp_pg_dir, 0, PGD_SIZE);
- __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
- entry_tramp_text_size(), prot,
- __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS);
+ early_create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
+ entry_tramp_text_size(), prot,
+ pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS);
/* Map both the text and data into the kernel page table */
for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
@@ -767,38 +1376,41 @@ static void __init declare_kernel_vmas(void)
{
static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
- declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
+ declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD);
declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
declare_vma(&vmlinux_seg[4], _data, _end, 0);
}
-void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
- int level, pte_t *tbl, bool may_use_cont, u64 va_offset);
+void __pi_map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa,
+ pgprot_t prot, int level, pte_t *tbl, bool may_use_cont,
+ u64 va_offset);
static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
- kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
+ kpti_bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
static void __init create_idmap(void)
{
- u64 start = __pa_symbol(__idmap_text_start);
- u64 end = __pa_symbol(__idmap_text_end);
- u64 ptep = __pa_symbol(idmap_ptes);
+ phys_addr_t start = __pa_symbol(__idmap_text_start);
+ phys_addr_t end = __pa_symbol(__idmap_text_end);
+ phys_addr_t ptep = __pa_symbol(idmap_ptes);
__pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX,
IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
__phys_to_virt(ptep) - ptep);
- if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) {
- extern u32 __idmap_kpti_flag;
- u64 pa = __pa_symbol(&__idmap_kpti_flag);
+ if (linear_map_requires_bbml2 ||
+ (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings)) {
+ phys_addr_t pa = __pa_symbol(&idmap_kpti_bbml2_flag);
/*
* The KPTI G-to-nG conversion code needs a read-write mapping
- * of its synchronization flag in the ID map.
+ * of its synchronization flag in the ID map. This is also used
+ * when splitting the linear map to ptes if a secondary CPU
+ * doesn't support bbml2.
*/
- ptep = __pa_symbol(kpti_ptes);
+ ptep = __pa_symbol(kpti_bbml2_ptes);
__pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
__phys_to_virt(ptep) - ptep);
@@ -823,7 +1435,7 @@ static void free_hotplug_page_range(struct page *page, size_t size,
vmem_altmap_free(altmap, size >> PAGE_SHIFT);
} else {
WARN_ON(PageReserved(page));
- free_pages((unsigned long)page_address(page), get_order(size));
+ __free_pages(page, get_order(size));
}
}
@@ -865,10 +1477,14 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
WARN_ON(!pte_present(pte));
__pte_clear(&init_mm, addr, ptep);
- flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
- if (free_mapped)
+ if (free_mapped) {
+ /* CONT blocks are not supported in the vmemmap */
+ WARN_ON(pte_cont(pte));
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
free_hotplug_page_range(pte_page(pte),
PAGE_SIZE, altmap);
+ }
+ /* unmap_hotplug_range() flushes TLB for !free_mapped */
} while (addr += PAGE_SIZE, addr < end);
}
@@ -887,17 +1503,16 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
continue;
WARN_ON(!pmd_present(pmd));
- if (pmd_sect(pmd)) {
+ if (pmd_leaf(pmd)) {
pmd_clear(pmdp);
-
- /*
- * One TLBI should be sufficient here as the PMD_SIZE
- * range is mapped with a single block entry.
- */
- flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
- if (free_mapped)
+ if (free_mapped) {
+ /* CONT blocks are not supported in the vmemmap */
+ WARN_ON(pmd_cont(pmd));
+ flush_tlb_kernel_range(addr, addr + PMD_SIZE);
free_hotplug_page_range(pmd_page(pmd),
PMD_SIZE, altmap);
+ }
+ /* unmap_hotplug_range() flushes TLB for !free_mapped */
continue;
}
WARN_ON(!pmd_table(pmd));
@@ -920,17 +1535,14 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
continue;
WARN_ON(!pud_present(pud));
- if (pud_sect(pud)) {
+ if (pud_leaf(pud)) {
pud_clear(pudp);
-
- /*
- * One TLBI should be sufficient here as the PUD_SIZE
- * range is mapped with a single block entry.
- */
- flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
- if (free_mapped)
+ if (free_mapped) {
+ flush_tlb_kernel_range(addr, addr + PUD_SIZE);
free_hotplug_page_range(pud_page(pud),
PUD_SIZE, altmap);
+ }
+ /* unmap_hotplug_range() flushes TLB for !free_mapped */
continue;
}
WARN_ON(!pud_table(pud));
@@ -960,6 +1572,7 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
static void unmap_hotplug_range(unsigned long addr, unsigned long end,
bool free_mapped, struct vmem_altmap *altmap)
{
+ unsigned long start = addr;
unsigned long next;
pgd_t *pgdp, pgd;
@@ -981,6 +1594,9 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end,
WARN_ON(!pgd_present(pgd));
unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
} while (addr = next, addr < end);
+
+ if (!free_mapped)
+ flush_tlb_kernel_range(start, end);
}
static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
@@ -1034,7 +1650,7 @@ static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
if (pmd_none(pmd))
continue;
- WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
+ WARN_ON(!pmd_present(pmd) || !pmd_table(pmd));
free_empty_pte_table(pmdp, addr, next, floor, ceiling);
} while (addr = next, addr < end);
@@ -1074,7 +1690,7 @@ static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
if (pud_none(pud))
continue;
- WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
+ WARN_ON(!pud_present(pud) || !pud_table(pud));
free_empty_pmd_table(pudp, addr, next, floor, ceiling);
} while (addr = next, addr < end);
@@ -1170,7 +1786,7 @@ int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
{
vmemmap_verify((pte_t *)pmdp, node, addr, next);
- return pmd_sect(READ_ONCE(*pmdp));
+ return pmd_leaf(READ_ONCE(*pmdp));
}
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -1234,7 +1850,7 @@ void p4d_clear_huge(p4d_t *p4dp)
int pud_clear_huge(pud_t *pudp)
{
- if (!pud_sect(READ_ONCE(*pudp)))
+ if (!pud_leaf(READ_ONCE(*pudp)))
return 0;
pud_clear(pudp);
return 1;
@@ -1242,13 +1858,14 @@ int pud_clear_huge(pud_t *pudp)
int pmd_clear_huge(pmd_t *pmdp)
{
- if (!pmd_sect(READ_ONCE(*pmdp)))
+ if (!pmd_leaf(READ_ONCE(*pmdp)))
return 0;
pmd_clear(pmdp);
return 1;
}
-int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
+static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr,
+ bool acquire_mmap_lock)
{
pte_t *table;
pmd_t pmd;
@@ -1260,13 +1877,25 @@ int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
return 1;
}
+ /* See comment in pud_free_pmd_page for static key logic */
table = pte_offset_kernel(pmdp, addr);
pmd_clear(pmdp);
__flush_tlb_kernel_pgtable(addr);
+ if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) {
+ mmap_read_lock(&init_mm);
+ mmap_read_unlock(&init_mm);
+ }
+
pte_free_kernel(NULL, table);
return 1;
}
+int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
+{
+ /* If ptdump is walking the pagetables, acquire init_mm.mmap_lock */
+ return __pmd_free_pte_page(pmdp, addr, /* acquire_mmap_lock = */ true);
+}
+
int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
{
pmd_t *table;
@@ -1282,15 +1911,36 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
}
table = pmd_offset(pudp, addr);
+
+ /*
+ * Our objective is to prevent ptdump from reading a PMD table which has
+ * been freed. In this race, if pud_free_pmd_page observes the key on
+ * (which got flipped by ptdump) then the mmap lock sequence here will,
+ * as a result of the mmap write lock/unlock sequence in ptdump, give
+ * us the correct synchronization. If not, this means that ptdump has
+ * yet not started walking the pagetables - the sequence of barriers
+ * issued by __flush_tlb_kernel_pgtable() guarantees that ptdump will
+ * observe an empty PUD.
+ */
+ pud_clear(pudp);
+ __flush_tlb_kernel_pgtable(addr);
+ if (static_branch_unlikely(&arm64_ptdump_lock_key)) {
+ mmap_read_lock(&init_mm);
+ mmap_read_unlock(&init_mm);
+ }
+
pmdp = table;
next = addr;
end = addr + PUD_SIZE;
do {
- pmd_free_pte_page(pmdp, next);
+ if (pmd_present(pmdp_get(pmdp)))
+ /*
+ * PMD has been isolated, so ptdump won't see it. No
+ * need to acquire init_mm.mmap_lock.
+ */
+ __pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false);
} while (pmdp++, next += PMD_SIZE, next != end);
- pud_clear(pudp);
- __flush_tlb_kernel_pgtable(addr);
pmd_free(NULL, table);
return 1;
}
@@ -1310,8 +1960,8 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
struct range arch_get_mappable_range(void)
{
struct range mhp_range;
- u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
- u64 end_linear_pa = __pa(PAGE_END - 1);
+ phys_addr_t start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
+ phys_addr_t end_linear_pa = __pa(PAGE_END - 1);
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
/*
@@ -1346,25 +1996,31 @@ int arch_add_memory(int nid, u64 start, u64 size,
VM_BUG_ON(!mhp_range_allowed(start, size, true));
- if (can_set_direct_map())
+ if (force_pte_mapping())
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
- __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
- size, params->pgprot, __pgd_pgtable_alloc,
- flags);
+ ret = __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
+ size, params->pgprot, pgd_pgtable_alloc_init_mm,
+ flags);
+ if (ret)
+ goto err;
memblock_clear_nomap(start, size);
ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
params);
if (ret)
- __remove_pgd_mapping(swapper_pg_dir,
- __phys_to_virt(start), size);
- else {
- max_pfn = PFN_UP(start + size);
- max_low_pfn = max_pfn;
- }
+ goto err;
+
+ /* Address of hotplugged memory can be smaller */
+ max_pfn = max(max_pfn, PFN_UP(start + size));
+ max_low_pfn = max_pfn;
+ return 0;
+
+err:
+ __remove_pgd_mapping(swapper_pg_dir,
+ __phys_to_virt(start), size);
return ret;
}
@@ -1377,6 +2033,107 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}
+
+static bool addr_splits_kernel_leaf(unsigned long addr)
+{
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+
+ /*
+ * If the given address points at a the start address of
+ * a possible leaf, we certainly won't split. Otherwise,
+ * check if we would actually split a leaf by traversing
+ * the page tables further.
+ */
+ if (IS_ALIGNED(addr, PGDIR_SIZE))
+ return false;
+
+ pgdp = pgd_offset_k(addr);
+ pgd = pgdp_get(pgdp);
+ if (!pgd_present(pgd))
+ return false;
+
+ if (IS_ALIGNED(addr, P4D_SIZE))
+ return false;
+
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = p4dp_get(p4dp);
+ if (!p4d_present(p4d))
+ return false;
+
+ if (IS_ALIGNED(addr, PUD_SIZE))
+ return false;
+
+ pudp = pud_offset(p4dp, addr);
+ pud = pudp_get(pudp);
+ if (!pud_present(pud))
+ return false;
+
+ if (pud_leaf(pud))
+ return true;
+
+ if (IS_ALIGNED(addr, CONT_PMD_SIZE))
+ return false;
+
+ pmdp = pmd_offset(pudp, addr);
+ pmd = pmdp_get(pmdp);
+ if (!pmd_present(pmd))
+ return false;
+
+ if (pmd_cont(pmd))
+ return true;
+
+ if (IS_ALIGNED(addr, PMD_SIZE))
+ return false;
+
+ if (pmd_leaf(pmd))
+ return true;
+
+ if (IS_ALIGNED(addr, CONT_PTE_SIZE))
+ return false;
+
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = __ptep_get(ptep);
+ if (!pte_present(pte))
+ return false;
+
+ if (pte_cont(pte))
+ return true;
+
+ return !IS_ALIGNED(addr, PAGE_SIZE);
+}
+
+static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages)
+{
+ unsigned long phys_start, phys_end, start, end;
+
+ phys_start = PFN_PHYS(pfn);
+ phys_end = phys_start + nr_pages * PAGE_SIZE;
+
+ /* PFN range's linear map edges are leaf entry aligned */
+ start = __phys_to_virt(phys_start);
+ end = __phys_to_virt(phys_end);
+ if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) {
+ pr_warn("[%lx %lx] splits a leaf entry in linear map\n",
+ phys_start, phys_end);
+ return false;
+ }
+
+ /* PFN range's vmemmap edges are leaf entry aligned */
+ BUILD_BUG_ON(!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP));
+ start = (unsigned long)pfn_to_page(pfn);
+ end = (unsigned long)pfn_to_page(pfn + nr_pages);
+ if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) {
+ pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n",
+ phys_start, phys_end);
+ return false;
+ }
+ return true;
+}
+
/*
* This memory hotplug notifier helps prevent boot memory from being
* inadvertently removed as it blocks pfn range offlining process in
@@ -1385,8 +2142,11 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
* In future if and when boot memory could be removed, this notifier
* should be dropped and free_hotplug_page_range() should handle any
* reserved pages allocated during boot.
+ *
+ * This also blocks any memory remove that would have caused a split
+ * in leaf entry in kernel linear or vmemmap mapping.
*/
-static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
+static int prevent_memory_remove_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
struct mem_section *ms;
@@ -1432,11 +2192,15 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
return NOTIFY_DONE;
}
}
+
+ if (!can_unmap_without_split(pfn, arg->nr_pages))
+ return NOTIFY_BAD;
+
return NOTIFY_OK;
}
-static struct notifier_block prevent_bootmem_remove_nb = {
- .notifier_call = prevent_bootmem_remove_notifier,
+static struct notifier_block prevent_memory_remove_nb = {
+ .notifier_call = prevent_memory_remove_notifier,
};
/*
@@ -1486,7 +2250,7 @@ static void validate_bootmem_online(void)
}
}
-static int __init prevent_bootmem_remove_init(void)
+static int __init prevent_memory_remove_init(void)
{
int ret = 0;
@@ -1494,33 +2258,50 @@ static int __init prevent_bootmem_remove_init(void)
return ret;
validate_bootmem_online();
- ret = register_memory_notifier(&prevent_bootmem_remove_nb);
+ ret = register_memory_notifier(&prevent_memory_remove_nb);
if (ret)
pr_err("%s: Notifier registration failed %d\n", __func__, ret);
return ret;
}
-early_initcall(prevent_bootmem_remove_init);
+early_initcall(prevent_memory_remove_init);
#endif
-pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
{
+ pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr);
+
if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
/*
* Break-before-make (BBM) is required for all user space mappings
* when the permission changes from executable to non-executable
* in cases where cpu is affected with errata #2645198.
*/
- if (pte_user_exec(ptep_get(ptep)))
- return ptep_clear_flush(vma, addr, ptep);
+ if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte))
+ __flush_tlb_range(vma, addr, nr * PAGE_SIZE,
+ PAGE_SIZE, 3, TLBF_NOWALKCACHE);
}
- return ptep_get_and_clear(vma->vm_mm, addr, ptep);
+
+ return pte;
+}
+
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+{
+ return modify_prot_start_ptes(vma, addr, ptep, 1);
+}
+
+void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte,
+ unsigned int nr)
+{
+ set_ptes(vma->vm_mm, addr, ptep, pte, nr);
}
void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
pte_t old_pte, pte_t pte)
{
- set_pte_at(vma->vm_mm, addr, ptep, pte);
+ modify_prot_commit_ptes(vma, addr, ptep, old_pte, pte, 1);
}
/*
@@ -1538,7 +2319,7 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));
if (cnp)
- ttbr1 |= TTBR_CNP_BIT;
+ ttbr1 |= TTBRx_EL1_CnP;
replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
@@ -1556,11 +2337,10 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
}
#ifdef CONFIG_ARCH_HAS_PKEYS
-int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
+int arch_set_user_pkey_access(int pkey, unsigned long init_val)
{
- u64 new_por = POE_RXW;
+ u64 new_por;
u64 old_por;
- u64 pkey_shift;
if (!system_supports_poe())
return -ENOSPC;
@@ -1574,7 +2354,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i
return -EINVAL;
/* Set the bits we need in POR: */
- new_por = POE_RXW;
+ new_por = POE_RWX;
if (init_val & PKEY_DISABLE_WRITE)
new_por &= ~POE_W;
if (init_val & PKEY_DISABLE_ACCESS)
@@ -1585,12 +2365,11 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i
new_por &= ~POE_X;
/* Shift the bits in to the correct place in POR for pkey: */
- pkey_shift = pkey * POR_BITS_PER_PKEY;
- new_por <<= pkey_shift;
+ new_por = POR_ELx_PERM_PREP(pkey, new_por);
/* Get old POR and mask off any old bits in place: */
old_por = read_sysreg_s(SYS_POR_EL0);
- old_por &= ~(POE_MASK << pkey_shift);
+ old_por &= ~(POE_MASK << POR_ELx_PERM_SHIFT(pkey));
/* Write old part along with new part: */
write_sysreg_s(old_por | new_por, SYS_POR_EL0);
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 39fd1f7ff02a..ce035e1b4eaf 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -8,6 +8,7 @@
#include <linux/mem_encrypt.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
+#include <linux/pagewalk.h>
#include <asm/cacheflush.h>
#include <asm/pgtable-prot.h>
@@ -20,7 +21,71 @@ struct page_change_data {
pgprot_t clear_mask;
};
-bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED);
+static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk)
+{
+ struct page_change_data *masks = walk->private;
+
+ /*
+ * Some users clear and set bits which alias each other (e.g. PTE_NG and
+ * PTE_PRESENT_INVALID). It is therefore important that we always clear
+ * first then set.
+ */
+ val &= ~(pgprot_val(masks->clear_mask));
+ val |= (pgprot_val(masks->set_mask));
+
+ return val;
+}
+
+static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t val = pudp_get(pud);
+
+ if (pud_leaf(val)) {
+ if (WARN_ON_ONCE((next - addr) != PUD_SIZE))
+ return -EINVAL;
+ val = __pud(set_pageattr_masks(pud_val(val), walk));
+ set_pud(pud, val);
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t val = pmdp_get(pmd);
+
+ if (pmd_leaf(val)) {
+ if (WARN_ON_ONCE((next - addr) != PMD_SIZE))
+ return -EINVAL;
+ val = __pmd(set_pageattr_masks(pmd_val(val), walk));
+ set_pmd(pmd, val);
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t val = __ptep_get(pte);
+
+ val = __pte(set_pageattr_masks(pte_val(val), walk));
+ __set_pte(pte, val);
+
+ return 0;
+}
+
+static const struct mm_walk_ops pageattr_ops = {
+ .pud_entry = pageattr_pud_entry,
+ .pmd_entry = pageattr_pmd_entry,
+ .pte_entry = pageattr_pte_entry,
+};
+
+bool rodata_full __ro_after_init = true;
bool can_set_direct_map(void)
{
@@ -37,39 +102,47 @@ bool can_set_direct_map(void)
arm64_kfence_can_set_direct_map() || is_realm_world();
}
-static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
+static int update_range_prot(unsigned long start, unsigned long size,
+ pgprot_t set_mask, pgprot_t clear_mask)
{
- struct page_change_data *cdata = data;
- pte_t pte = __ptep_get(ptep);
+ struct page_change_data data;
+ int ret;
+
+ data.set_mask = set_mask;
+ data.clear_mask = clear_mask;
+
+ ret = split_kernel_leaf_mapping(start, start + size);
+ if (WARN_ON_ONCE(ret))
+ return ret;
- pte = clear_pte_bit(pte, cdata->clear_mask);
- pte = set_pte_bit(pte, cdata->set_mask);
+ lazy_mmu_mode_enable();
- __set_pte(ptep, pte);
- return 0;
+ /*
+ * The caller must ensure that the range we are operating on does not
+ * partially overlap a block mapping, or a cont mapping. Any such case
+ * must be eliminated by splitting the mapping.
+ */
+ ret = walk_kernel_page_table_range_lockless(start, start + size,
+ &pageattr_ops, NULL, &data);
+ lazy_mmu_mode_disable();
+
+ return ret;
}
-/*
- * This function assumes that the range is mapped with PAGE_SIZE pages.
- */
static int __change_memory_common(unsigned long start, unsigned long size,
- pgprot_t set_mask, pgprot_t clear_mask)
+ pgprot_t set_mask, pgprot_t clear_mask)
{
- struct page_change_data data;
int ret;
- data.set_mask = set_mask;
- data.clear_mask = clear_mask;
-
- ret = apply_to_page_range(&init_mm, start, size, change_page_range,
- &data);
+ ret = update_range_prot(start, size, set_mask, clear_mask);
/*
- * If the memory is being made valid without changing any other bits
- * then a TLBI isn't required as a non-valid entry cannot be cached in
- * the TLB.
+ * If the memory is being switched from present-invalid to valid without
+ * changing any other bits then a TLBI isn't required as a non-valid
+ * entry cannot be cached in the TLB.
*/
- if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask))
+ if (pgprot_val(set_mask) != PTE_PRESENT_VALID_KERNEL ||
+ pgprot_val(clear_mask) != PTE_PRESENT_INVALID)
flush_tlb_kernel_range(start, start + size);
return ret;
}
@@ -81,7 +154,7 @@ static int change_memory_common(unsigned long addr, int numpages,
unsigned long size = PAGE_SIZE * numpages;
unsigned long end = start + size;
struct vm_struct *area;
- int i;
+ int ret;
if (!PAGE_ALIGNED(addr)) {
start &= PAGE_MASK;
@@ -96,16 +169,17 @@ static int change_memory_common(unsigned long addr, int numpages,
* we are operating on does not result in such splitting.
*
* Let's restrict ourselves to mappings created by vmalloc (or vmap).
- * Those are guaranteed to consist entirely of page mappings, and
- * splitting is never needed.
+ * Disallow VM_ALLOW_HUGE_VMAP mappings to guarantee that only page
+ * mappings are updated and splitting is never needed.
*
* So check whether the [addr, addr + size) interval is entirely
* covered by precisely one VM area that has the VM_ALLOC flag set.
*/
area = find_vm_area((void *)addr);
if (!area ||
- end > (unsigned long)kasan_reset_tag(area->addr) + area->size ||
- !(area->flags & VM_ALLOC))
+ ((unsigned long)kasan_reset_tag((void *)end) >
+ (unsigned long)kasan_reset_tag(area->addr) + area->size) ||
+ ((area->flags & (VM_ALLOC | VM_ALLOW_HUGE_VMAP)) != VM_ALLOC))
return -EINVAL;
if (!numpages)
@@ -117,9 +191,14 @@ static int change_memory_common(unsigned long addr, int numpages,
*/
if (rodata_full && (pgprot_val(set_mask) == PTE_RDONLY ||
pgprot_val(clear_mask) == PTE_RDONLY)) {
- for (i = 0; i < area->nr_pages; i++) {
- __change_memory_common((u64)page_address(area->pages[i]),
- PAGE_SIZE, set_mask, clear_mask);
+ unsigned long idx = ((unsigned long)kasan_reset_tag((void *)start) -
+ (unsigned long)kasan_reset_tag(area->addr))
+ >> PAGE_SHIFT;
+ for (; numpages; idx++, numpages--) {
+ ret = __change_memory_common((u64)page_address(area->pages[idx]),
+ PAGE_SIZE, set_mask, clear_mask);
+ if (ret)
+ return ret;
}
}
@@ -164,42 +243,36 @@ int set_memory_valid(unsigned long addr, int numpages, int enable)
{
if (enable)
return __change_memory_common(addr, PAGE_SIZE * numpages,
- __pgprot(PTE_VALID),
- __pgprot(0));
+ __pgprot(PTE_PRESENT_VALID_KERNEL),
+ __pgprot(PTE_PRESENT_INVALID));
else
return __change_memory_common(addr, PAGE_SIZE * numpages,
- __pgprot(0),
- __pgprot(PTE_VALID));
+ __pgprot(PTE_PRESENT_INVALID),
+ __pgprot(PTE_PRESENT_VALID_KERNEL));
}
int set_direct_map_invalid_noflush(struct page *page)
{
- struct page_change_data data = {
- .set_mask = __pgprot(0),
- .clear_mask = __pgprot(PTE_VALID),
- };
+ pgprot_t clear_mask = __pgprot(PTE_PRESENT_VALID_KERNEL);
+ pgprot_t set_mask = __pgprot(PTE_PRESENT_INVALID);
if (!can_set_direct_map())
return 0;
- return apply_to_page_range(&init_mm,
- (unsigned long)page_address(page),
- PAGE_SIZE, change_page_range, &data);
+ return update_range_prot((unsigned long)page_address(page),
+ PAGE_SIZE, set_mask, clear_mask);
}
int set_direct_map_default_noflush(struct page *page)
{
- struct page_change_data data = {
- .set_mask = __pgprot(PTE_VALID | PTE_WRITE),
- .clear_mask = __pgprot(PTE_RDONLY),
- };
+ pgprot_t set_mask = __pgprot(PTE_PRESENT_VALID_KERNEL | PTE_WRITE);
+ pgprot_t clear_mask = __pgprot(PTE_PRESENT_INVALID | PTE_RDONLY);
if (!can_set_direct_map())
return 0;
- return apply_to_page_range(&init_mm,
- (unsigned long)page_address(page),
- PAGE_SIZE, change_page_range, &data);
+ return update_range_prot((unsigned long)page_address(page),
+ PAGE_SIZE, set_mask, clear_mask);
}
static int __set_memory_enc_dec(unsigned long addr,
@@ -229,8 +302,8 @@ static int __set_memory_enc_dec(unsigned long addr,
* entries or Synchronous External Aborts caused by RIPAS_EMPTY
*/
ret = __change_memory_common(addr, PAGE_SIZE * numpages,
- __pgprot(set_prot),
- __pgprot(clear_prot | PTE_VALID));
+ __pgprot(set_prot | PTE_PRESENT_INVALID),
+ __pgprot(clear_prot | PTE_PRESENT_VALID_KERNEL));
if (ret)
return ret;
@@ -244,8 +317,8 @@ static int __set_memory_enc_dec(unsigned long addr,
return ret;
return __change_memory_common(addr, PAGE_SIZE * numpages,
- __pgprot(PTE_VALID),
- __pgprot(0));
+ __pgprot(PTE_PRESENT_VALID_KERNEL),
+ __pgprot(PTE_PRESENT_INVALID));
}
static int realm_set_memory_encrypted(unsigned long addr, int numpages)
@@ -337,15 +410,15 @@ bool kernel_page_present(struct page *page)
pud = READ_ONCE(*pudp);
if (pud_none(pud))
return false;
- if (pud_sect(pud))
- return true;
+ if (pud_leaf(pud))
+ return pud_valid(pud);
pmdp = pmd_offset(pudp, addr);
pmd = READ_ONCE(*pmdp);
if (pmd_none(pmd))
return false;
- if (pmd_sect(pmd))
- return true;
+ if (pmd_leaf(pmd))
+ return pmd_valid(pmd);
ptep = pte_offset_kernel(pmdp, addr);
return pte_valid(__ptep_get(ptep));
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index 8160cff35089..bf5110b91e2f 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -56,7 +56,7 @@ void __init pgtable_cache_init(void)
* With 52-bit physical addresses, the architecture requires the
* top-level table to be aligned to at least 64 bytes.
*/
- BUILD_BUG_ON(PGD_SIZE < 64);
+ BUILD_BUG_ON(!IS_ALIGNED(PGD_SIZE, 64));
#endif
/*
diff --git a/arch/arm64/mm/physaddr.c b/arch/arm64/mm/physaddr.c
index cde44c13dda1..7d94e09b01b3 100644
--- a/arch/arm64/mm/physaddr.c
+++ b/arch/arm64/mm/physaddr.c
@@ -10,7 +10,7 @@
phys_addr_t __virt_to_phys(unsigned long x)
{
WARN(!__is_lm_address(__tag_reset(x)),
- "virt_to_phys used for non-linear address: %pK (%pS)\n",
+ "virt_to_phys used for non-linear address: %p (%pS)\n",
(void *)x,
(void *)x);
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index fb30c8804f87..22866b49be37 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -23,15 +23,18 @@
#include <asm/sysreg.h>
#ifdef CONFIG_ARM64_64K_PAGES
-#define TCR_TG_FLAGS TCR_TG0_64K | TCR_TG1_64K
+#define TCR_TG_FLAGS ((TCR_EL1_TG0_64K << TCR_EL1_TG0_SHIFT) |\
+ (TCR_EL1_TG1_64K << TCR_EL1_TG1_SHIFT))
#elif defined(CONFIG_ARM64_16K_PAGES)
-#define TCR_TG_FLAGS TCR_TG0_16K | TCR_TG1_16K
+#define TCR_TG_FLAGS ((TCR_EL1_TG0_16K << TCR_EL1_TG0_SHIFT) |\
+ (TCR_EL1_TG1_16K << TCR_EL1_TG1_SHIFT))
#else /* CONFIG_ARM64_4K_PAGES */
-#define TCR_TG_FLAGS TCR_TG0_4K | TCR_TG1_4K
+#define TCR_TG_FLAGS ((TCR_EL1_TG0_4K << TCR_EL1_TG0_SHIFT) |\
+ (TCR_EL1_TG1_4K << TCR_EL1_TG1_SHIFT))
#endif
#ifdef CONFIG_RANDOMIZE_BASE
-#define TCR_KASLR_FLAGS TCR_NFD1
+#define TCR_KASLR_FLAGS TCR_EL1_NFD1
#else
#define TCR_KASLR_FLAGS 0
#endif
@@ -40,23 +43,30 @@
#define TCR_CACHE_FLAGS TCR_IRGN_WBWA | TCR_ORGN_WBWA
#ifdef CONFIG_KASAN_SW_TAGS
-#define TCR_KASAN_SW_FLAGS TCR_TBI1 | TCR_TBID1
+#define TCR_KASAN_SW_FLAGS TCR_EL1_TBI1 | TCR_EL1_TBID1
#else
#define TCR_KASAN_SW_FLAGS 0
#endif
-#ifdef CONFIG_KASAN_HW_TAGS
-#define TCR_MTE_FLAGS TCR_TCMA1 | TCR_TBI1 | TCR_TBID1
-#elif defined(CONFIG_ARM64_MTE)
+#ifdef CONFIG_ARM64_MTE
/*
* The mte_zero_clear_page_tags() implementation uses DC GZVA, which relies on
- * TBI being enabled at EL1.
+ * TBI being enabled at EL1. TCMA1 is needed to treat accesses with the
+ * match-all tag (0xF) as Tag Unchecked, irrespective of the SCTLR_EL1.TCF
+ * setting.
*/
-#define TCR_MTE_FLAGS TCR_TBI1 | TCR_TBID1
+#define TCR_MTE_FLAGS TCR_EL1_TCMA1 | TCR_EL1_TBI1 | TCR_EL1_TBID1
#else
#define TCR_MTE_FLAGS 0
#endif
+#define TCR_IRGN_WBWA ((TCR_EL1_IRGN0_WBWA << TCR_EL1_IRGN0_SHIFT) |\
+ (TCR_EL1_IRGN1_WBWA << TCR_EL1_IRGN1_SHIFT))
+#define TCR_ORGN_WBWA ((TCR_EL1_ORGN0_WBWA << TCR_EL1_ORGN0_SHIFT) |\
+ (TCR_EL1_ORGN1_WBWA << TCR_EL1_ORGN1_SHIFT))
+#define TCR_SHARED ((TCR_EL1_SH0_INNER << TCR_EL1_SH0_SHIFT) |\
+ (TCR_EL1_SH1_INNER << TCR_EL1_SH1_SHIFT))
+
/*
* Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory and
* changed during mte_cpu_setup to Normal Tagged if the system supports MTE.
@@ -100,6 +110,10 @@ SYM_FUNC_START(cpu_do_suspend)
* call stack.
*/
str x18, [x0, #96]
+alternative_if ARM64_HAS_TCR2
+ mrs x2, REG_TCR2_EL1
+ str x2, [x0, #104]
+alternative_else_nop_endif
ret
SYM_FUNC_END(cpu_do_suspend)
@@ -129,11 +143,15 @@ SYM_FUNC_START(cpu_do_resume)
/* Don't change t0sz here, mask those bits when restoring */
mrs x7, tcr_el1
- bfi x8, x7, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
+ bfi x8, x7, TCR_EL1_T0SZ_SHIFT, TCR_EL1_T0SZ_WIDTH
msr tcr_el1, x8
msr vbar_el1, x9
msr mdscr_el1, x10
+alternative_if ARM64_HAS_TCR2
+ ldr x2, [x0, #104]
+ msr REG_TCR2_EL1, x2
+alternative_else_nop_endif
msr sctlr_el1, x12
set_this_cpu_offset x13
@@ -245,10 +263,6 @@ SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1)
*
* Called exactly once from stop_machine context by each CPU found during boot.
*/
- .pushsection ".data", "aw", %progbits
-SYM_DATA(__idmap_kpti_flag, .long 1)
- .popsection
-
SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings)
cpu .req w0
temp_pte .req x0
@@ -273,7 +287,7 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings)
mov x5, x3 // preserve temp_pte arg
mrs swapper_ttb, ttbr1_el1
- adr_l flag_ptr, __idmap_kpti_flag
+ adr_l flag_ptr, idmap_kpti_bbml2_flag
cbnz cpu, __idmap_kpti_secondary
@@ -416,7 +430,25 @@ alternative_else_nop_endif
__idmap_kpti_secondary:
/* Uninstall swapper before surgery begins */
__idmap_cpu_set_reserved_ttbr1 x16, x17
+ b scondary_cpu_wait
+
+ .unreq swapper_ttb
+ .unreq flag_ptr
+SYM_FUNC_END(idmap_kpti_install_ng_mappings)
+ .popsection
+#endif
+
+ .pushsection ".idmap.text", "a"
+SYM_TYPED_FUNC_START(wait_linear_map_split_to_ptes)
+ /* Must be same registers as in idmap_kpti_install_ng_mappings */
+ swapper_ttb .req x3
+ flag_ptr .req x4
+
+ mrs swapper_ttb, ttbr1_el1
+ adr_l flag_ptr, idmap_kpti_bbml2_flag
+ __idmap_cpu_set_reserved_ttbr1 x16, x17
+scondary_cpu_wait:
/* Increment the flag to let the boot CPU we're ready */
1: ldxr w16, [flag_ptr]
add w16, w16, #1
@@ -436,9 +468,8 @@ __idmap_kpti_secondary:
.unreq swapper_ttb
.unreq flag_ptr
-SYM_FUNC_END(idmap_kpti_install_ng_mappings)
+SYM_FUNC_END(wait_linear_map_split_to_ptes)
.popsection
-#endif
/*
* __cpu_setup
@@ -454,7 +485,7 @@ SYM_FUNC_START(__cpu_setup)
dsb nsh
msr cpacr_el1, xzr // Reset cpacr_el1
- mov x1, #1 << 12 // Reset mdscr_el1 and disable
+ mov x1, MDSCR_EL1_TDCC // Reset mdscr_el1 and disable
msr mdscr_el1, x1 // access to the DCC from EL0
reset_pmuserenr_el0 x1 // Disable PMU access from EL0
reset_amuserenr_el0 x1 // Disable AMU access from EL0
@@ -468,8 +499,8 @@ SYM_FUNC_START(__cpu_setup)
tcr2 .req x15
mov_q mair, MAIR_EL1_SET
mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \
- TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
- TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS
+ TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_EL1_AS | \
+ TCR_EL1_TBI0 | TCR_EL1_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS
mov tcr2, xzr
tcr_clear_errata_bits tcr, x9, x5
@@ -479,7 +510,7 @@ SYM_FUNC_START(__cpu_setup)
alternative_if ARM64_HAS_VA52
tcr_set_t1sz tcr, x9
#ifdef CONFIG_ARM64_LPA2
- orr tcr, tcr, #TCR_DS
+ orr tcr, tcr, #TCR_EL1_DS
#endif
alternative_else_nop_endif
#endif
@@ -487,7 +518,7 @@ alternative_else_nop_endif
/*
* Set the IPS bits in TCR_EL1.
*/
- tcr_compute_pa_size tcr, #TCR_IPS_SHIFT, x5, x6
+ tcr_compute_pa_size tcr, #TCR_EL1_IPS_SHIFT, x5, x6
#ifdef CONFIG_ARM64_HW_AFDBM
/*
* Enable hardware update of the Access Flags bit.
@@ -497,7 +528,7 @@ alternative_else_nop_endif
mrs x9, ID_AA64MMFR1_EL1
ubfx x9, x9, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, #4
cbz x9, 1f
- orr tcr, tcr, #TCR_HA // hardware Access flag update
+ orr tcr, tcr, #TCR_EL1_HA // hardware Access flag update
#ifdef CONFIG_ARM64_HAFT
cmp x9, ID_AA64MMFR1_EL1_HAFDBS_HAFT
b.lt 1f
@@ -512,28 +543,12 @@ alternative_else_nop_endif
ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4
cbz x1, .Lskip_indirection
- /*
- * The PROT_* macros describing the various memory types may resolve to
- * C expressions if they include the PTE_MAYBE_* macros, and so they
- * can only be used from C code. The PIE_E* constants below are also
- * defined in terms of those macros, but will mask out those
- * PTE_MAYBE_* constants, whether they are set or not. So #define them
- * as 0x0 here so we can evaluate the PIE_E* constants in asm context.
- */
-
-#define PTE_MAYBE_NG 0
-#define PTE_MAYBE_SHARED 0
-
- mov_q x0, PIE_E0
+ mov_q x0, PIE_E0_ASM
msr REG_PIRE0_EL1, x0
- mov_q x0, PIE_E1
+ mov_q x0, PIE_E1_ASM
msr REG_PIR_EL1, x0
-#undef PTE_MAYBE_NG
-#undef PTE_MAYBE_SHARED
-
orr tcr2, tcr2, TCR2_EL1_PIE
- msr REG_TCR2_EL1, x0
.Lskip_indirection:
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 688fbe0271ca..ab9899ca1e5f 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -80,8 +80,8 @@ static const struct ptdump_prot_bits pte_bits[] = {
.set = "CON",
.clear = " ",
}, {
- .mask = PTE_TABLE_BIT | PTE_VALID,
- .val = PTE_VALID,
+ .mask = PMD_TYPE_MASK,
+ .val = PMD_TYPE_SECT,
.set = "BLK",
.clear = " ",
}, {
@@ -189,12 +189,12 @@ static void note_prot_wx(struct ptdump_pg_state *st, unsigned long addr)
}
void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
- u64 val)
+ pteval_t val)
{
struct ptdump_pg_state *st = container_of(pt_st, struct ptdump_pg_state, ptdump);
struct ptdump_pg_level *pg_level = st->pg_level;
static const char units[] = "KMGTPE";
- u64 prot = 0;
+ ptdesc_t prot = 0;
/* check if the current level has been folded dynamically */
if (st->mm && ((level == 1 && mm_p4d_folded(st->mm)) ||
@@ -251,6 +251,45 @@ void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
}
+void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
+static void arm64_ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm)
+{
+ static_branch_inc(&arm64_ptdump_lock_key);
+ ptdump_walk_pgd(st, mm, NULL);
+ static_branch_dec(&arm64_ptdump_lock_key);
+}
+
void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
{
unsigned long end = ~0UL;
@@ -266,7 +305,12 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
.pg_level = &kernel_pg_levels[0],
.level = -1,
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]){
{info->base_addr, end},
{0, 0}
@@ -274,7 +318,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
}
};
- ptdump_walk_pgd(&st.ptdump, info->mm, NULL);
+ arm64_ptdump_walk_pgd(&st.ptdump, info->mm);
}
static void __init ptdump_initialize(void)
@@ -303,7 +347,12 @@ bool ptdump_check_wx(void)
.level = -1,
.check_wx = true,
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]) {
{_PAGE_OFFSET(vabits_actual), ~0UL},
{0, 0}
@@ -311,7 +360,7 @@ bool ptdump_check_wx(void)
}
};
- ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+ arm64_ptdump_walk_pgd(&st.ptdump, &init_mm);
if (st.wx_pages || st.uxn_pages) {
pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",
diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c
index 68bf1a125502..1e308328c079 100644
--- a/arch/arm64/mm/ptdump_debugfs.c
+++ b/arch/arm64/mm/ptdump_debugfs.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
-#include <linux/memory_hotplug.h>
#include <linux/seq_file.h>
#include <asm/ptdump.h>
@@ -9,9 +8,7 @@ static int ptdump_show(struct seq_file *m, void *v)
{
struct ptdump_info *info = m->private;
- get_online_mems();
ptdump_walk(m, info);
- put_online_mems();
return 0;
}
DEFINE_SHOW_ATTRIBUTE(ptdump);
diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c
index 18543b603c77..cca9706a875c 100644
--- a/arch/arm64/mm/trans_pgd.c
+++ b/arch/arm64/mm/trans_pgd.c
@@ -31,36 +31,6 @@ static void *trans_alloc(struct trans_pgd_info *info)
return info->trans_alloc_page(info->trans_alloc_arg);
}
-static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
-{
- pte_t pte = __ptep_get(src_ptep);
-
- if (pte_valid(pte)) {
- /*
- * Resume will overwrite areas that may be marked
- * read only (code, rodata). Clear the RDONLY bit from
- * the temporary mappings we use during restore.
- */
- __set_pte(dst_ptep, pte_mkwrite_novma(pte));
- } else if (!pte_none(pte)) {
- /*
- * debug_pagealloc will removed the PTE_VALID bit if
- * the page isn't in use by the resume kernel. It may have
- * been in use by the original kernel, in which case we need
- * to put it back in our copy to do the restore.
- *
- * Other cases include kfence / vmalloc / memfd_secret which
- * may call `set_direct_map_invalid_noflush()`.
- *
- * Before marking this entry valid, check the pfn should
- * be mapped.
- */
- BUG_ON(!pfn_valid(pte_pfn(pte)));
-
- __set_pte(dst_ptep, pte_mkvalid(pte_mkwrite_novma(pte)));
- }
-}
-
static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
pmd_t *src_pmdp, unsigned long start, unsigned long end)
{
@@ -76,7 +46,11 @@ static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
src_ptep = pte_offset_kernel(src_pmdp, start);
do {
- _copy_pte(dst_ptep, src_ptep, addr);
+ pte_t pte = __ptep_get(src_ptep);
+
+ if (pte_none(pte))
+ continue;
+ __set_pte(dst_ptep, pte_mkvalid_k(pte_mkwrite_novma(pte)));
} while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
return 0;
@@ -109,8 +83,7 @@ static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp,
if (copy_pte(info, dst_pmdp, src_pmdp, addr, next))
return -ENOMEM;
} else {
- set_pmd(dst_pmdp,
- __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
+ set_pmd(dst_pmdp, pmd_mkvalid_k(pmd_mkwrite_novma(pmd)));
}
} while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
@@ -145,8 +118,7 @@ static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp,
if (copy_pmd(info, dst_pudp, src_pudp, addr, next))
return -ENOMEM;
} else {
- set_pud(dst_pudp,
- __pud(pud_val(pud) & ~PUD_SECT_RDONLY));
+ set_pud(dst_pudp, pud_mkvalid_k(pud_mkwrite_novma(pud)));
}
} while (dst_pudp++, src_pudp++, addr = next, addr != end);