author     Will Deacon <will@kernel.org>    2021-02-12 15:03:53 +0000
committer  Will Deacon <will@kernel.org>    2021-02-12 15:03:53 +0000
commit     b374d0f981a79303d6079d7210c04af304fc6b9d (patch)
tree       c586b9feb00f938e0b418404e4c6c017598a5c0d
parent     6b76c3aedb07588ef558ba33896d6ae75229c7b7 (diff)
parent     d1bbc35fcab28668c8992c4d5777234b794d7306 (diff)
Merge branch 'for-next/kexec' into for-next/core
Significant steps along the road to leaving the MMU enabled during kexec
relocation.
* for-next/kexec:
arm64: hibernate: add __force attribute to gfp_t casting
arm64: kexec: arm64_relocate_new_kernel don't use x0 as temp
arm64: kexec: arm64_relocate_new_kernel clean-ups and optimizations
arm64: kexec: call kexec_image_info only once
arm64: kexec: move relocation function setup
arm64: trans_pgd: hibernate: idmap the single page that holds the copy page routines
arm64: mm: Always update TCR_EL1 from __cpu_set_tcr_t0sz()
arm64: trans_pgd: pass NULL instead of init_mm to *_populate functions
arm64: trans_pgd: pass allocator trans_pgd_create_copy
arm64: trans_pgd: make trans_pgd_map_page generic
arm64: hibernate: move page handling function to new trans_pgd.c
arm64: hibernate: variable pudp is used instead of pd4dp
arm64: kexec: make dtb_mem always enabled
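
To make the new interface concrete: a caller of the trans_pgd code supplies a "give me one zeroed page" callback through struct trans_pgd_info and then asks for a copy of the linear map. The following is a minimal sketch of that calling pattern, modelled on the hibernate.c changes in the diff below; hibernate_copy_linear_map() is an illustrative name for this note, not a function added by the series.

	#include <linux/gfp.h>
	#include <linux/suspend.h>	/* get_safe_page() */
	#include <asm/trans_pgd.h>

	/* Allocator callback: must hand back exactly one zeroed page. */
	static void *hibernate_page_alloc(void *arg)
	{
		return (void *)get_safe_page((__force gfp_t)(unsigned long)arg);
	}

	/* Illustrative helper: copy the linear map into throw-away tables. */
	static int hibernate_copy_linear_map(pgd_t **tmp_pg_dir)
	{
		struct trans_pgd_info trans_info = {
			.trans_alloc_page	= hibernate_page_alloc,
			.trans_alloc_arg	= (__force void *)GFP_ATOMIC,
		};

		return trans_pgd_create_copy(&trans_info, tmp_pg_dir,
					     PAGE_OFFSET, PAGE_END);
	}
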
-rw-r--r--  arch/arm64/Kconfig                    |   4
-rw-r--r--  arch/arm64/include/asm/kexec.h        |   5
-rw-r--r--  arch/arm64/include/asm/mmu_context.h  |   7
-rw-r--r--  arch/arm64/include/asm/trans_pgd.h    |  39
-rw-r--r--  arch/arm64/kernel/hibernate.c         | 271
-rw-r--r--  arch/arm64/kernel/machine_kexec.c     |  57
-rw-r--r--  arch/arm64/kernel/relocate_kernel.S   |  48
-rw-r--r--  arch/arm64/mm/Makefile                |   1
-rw-r--r--  arch/arm64/mm/trans_pgd.c             | 324
9 files changed, 434 insertions, 322 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3dfb25afa616..43a9867e0dee 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1132,6 +1132,10 @@ config CRASH_DUMP
 
 	  For more details see Documentation/admin-guide/kdump/kdump.rst
 
+config TRANS_TABLE
+	def_bool y
+	depends on HIBERNATION
+
 config XEN_DOM0
 	def_bool y
 	depends on XEN
diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index d24b527e8c00..9befcd87e9a8 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -90,18 +90,19 @@ static inline void crash_prepare_suspend(void) {}
 static inline void crash_post_resume(void) {}
 #endif
 
-#ifdef CONFIG_KEXEC_FILE
 #define ARCH_HAS_KIMAGE_ARCH
 
 struct kimage_arch {
 	void *dtb;
-	unsigned long dtb_mem;
+	phys_addr_t dtb_mem;
+	phys_addr_t kern_reloc;
 	/* Core ELF header buffer */
 	void *elf_headers;
 	unsigned long elf_headers_mem;
 	unsigned long elf_headers_sz;
 };
 
+#ifdef CONFIG_KEXEC_FILE
 extern const struct kexec_file_ops kexec_image_ops;
 
 struct kimage;
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 0b3079fd28eb..70ce8c1d2b07 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -81,16 +81,15 @@ static inline bool __cpu_uses_extended_idmap_level(void)
 }
 
 /*
- * Set TCR.T0SZ to its default value (based on VA_BITS)
+ * Ensure TCR.T0SZ is set to the provided value.
  */
 static inline void __cpu_set_tcr_t0sz(unsigned long t0sz)
 {
-	unsigned long tcr;
+	unsigned long tcr = read_sysreg(tcr_el1);
 
-	if (!__cpu_uses_extended_idmap())
+	if ((tcr & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET == t0sz)
 		return;
 
-	tcr = read_sysreg(tcr_el1);
 	tcr &= ~TCR_T0SZ_MASK;
 	tcr |= t0sz << TCR_T0SZ_OFFSET;
 	write_sysreg(tcr, tcr_el1);
diff --git a/arch/arm64/include/asm/trans_pgd.h b/arch/arm64/include/asm/trans_pgd.h
new file mode 100644
index 000000000000..5d08e5adf3d5
--- /dev/null
+++ b/arch/arm64/include/asm/trans_pgd.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2020, Microsoft Corporation.
+ * Pavel Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _ASM_TRANS_TABLE_H
+#define _ASM_TRANS_TABLE_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+#include <asm/pgtable-types.h>
+
+/*
+ * trans_alloc_page
+ *	- Allocator that should return exactly one zeroed page, if this
+ *	  allocator fails, trans_pgd_create_copy() and trans_pgd_map_page()
+ *	  return -ENOMEM error.
+ *
+ * trans_alloc_arg
+ *	- Passed to trans_alloc_page as an argument
+ */
+
+struct trans_pgd_info {
+	void * (*trans_alloc_page)(void *arg);
+	void *trans_alloc_arg;
+};
+
+int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **trans_pgd,
+			  unsigned long start, unsigned long end);
+
+int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd,
+		       void *page, unsigned long dst_addr, pgprot_t pgprot);
+
+int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0,
+			 unsigned long *t0sz, void *page);
+
+#endif /* _ASM_TRANS_TABLE_H */
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 9c9f47e9f7f4..b1cef371df2b 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -16,7 +16,6 @@
 #define pr_fmt(x) "hibernate: " x
 #include <linux/cpu.h>
 #include <linux/kvm_host.h>
-#include <linux/mm.h>
 #include <linux/pm.h>
 #include <linux/sched.h>
 #include <linux/suspend.h>
@@ -31,13 +30,12 @@
 #include <asm/memory.h>
 #include <asm/mmu_context.h>
 #include <asm/mte.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable-hwdef.h>
 #include <asm/sections.h>
 #include <asm/smp.h>
 #include <asm/smp_plat.h>
 #include <asm/suspend.h>
 #include <asm/sysreg.h>
+#include <asm/trans_pgd.h>
 #include <asm/virt.h>
 
 /*
@@ -178,52 +176,9 @@ int arch_hibernation_header_restore(void *addr)
 }
 EXPORT_SYMBOL(arch_hibernation_header_restore);
 
-static int trans_pgd_map_page(pgd_t *trans_pgd, void *page,
-			      unsigned long dst_addr,
-			      pgprot_t pgprot)
+static void *hibernate_page_alloc(void *arg)
 {
-	pgd_t *pgdp;
-	p4d_t *p4dp;
-	pud_t *pudp;
-	pmd_t *pmdp;
-	pte_t *ptep;
-
-	pgdp = pgd_offset_pgd(trans_pgd, dst_addr);
-	if (pgd_none(READ_ONCE(*pgdp))) {
-		pudp = (void *)get_safe_page(GFP_ATOMIC);
-		if (!pudp)
-			return -ENOMEM;
-		pgd_populate(&init_mm, pgdp, pudp);
-	}
-
-	p4dp = p4d_offset(pgdp, dst_addr);
-	if (p4d_none(READ_ONCE(*p4dp))) {
-		pudp = (void *)get_safe_page(GFP_ATOMIC);
-		if (!pudp)
-			return -ENOMEM;
-		p4d_populate(&init_mm, p4dp, pudp);
-	}
-
-	pudp = pud_offset(p4dp, dst_addr);
-	if (pud_none(READ_ONCE(*pudp))) {
-		pmdp = (void *)get_safe_page(GFP_ATOMIC);
-		if (!pmdp)
-			return -ENOMEM;
-		pud_populate(&init_mm, pudp, pmdp);
-	}
-
-	pmdp = pmd_offset(pudp, dst_addr);
-	if (pmd_none(READ_ONCE(*pmdp))) {
-		ptep = (void *)get_safe_page(GFP_ATOMIC);
-		if (!ptep)
-			return -ENOMEM;
-		pmd_populate_kernel(&init_mm, pmdp, ptep);
-	}
-
-	ptep = pte_offset_kernel(pmdp, dst_addr);
-	set_pte(ptep, pfn_pte(virt_to_pfn(page), PAGE_KERNEL_EXEC));
-
-	return 0;
+	return (void *)get_safe_page((__force gfp_t)(unsigned long)arg);
 }
 
 /*
@@ -239,11 +194,16 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page,
  * page system.
  */
 static int create_safe_exec_page(void *src_start, size_t length,
-				 unsigned long dst_addr,
 				 phys_addr_t *phys_dst_addr)
 {
+	struct trans_pgd_info trans_info = {
+		.trans_alloc_page	= hibernate_page_alloc,
+		.trans_alloc_arg	= (__force void *)GFP_ATOMIC,
+	};
+
 	void *page = (void *)get_safe_page(GFP_ATOMIC);
-	pgd_t *trans_pgd;
+	phys_addr_t trans_ttbr0;
+	unsigned long t0sz;
 	int rc;
 
 	if (!page)
@@ -251,13 +211,7 @@ static int create_safe_exec_page(void *src_start, size_t length,
 
 	memcpy(page, src_start, length);
 	__flush_icache_range((unsigned long)page, (unsigned long)page + length);
-
-	trans_pgd = (void *)get_safe_page(GFP_ATOMIC);
-	if (!trans_pgd)
-		return -ENOMEM;
-
-	rc = trans_pgd_map_page(trans_pgd, page, dst_addr,
-				PAGE_KERNEL_EXEC);
+	rc = trans_pgd_idmap_page(&trans_info, &trans_ttbr0, &t0sz, page);
 	if (rc)
 		return rc;
 
@@ -270,12 +224,15 @@ static int create_safe_exec_page(void *src_start, size_t length,
 	 * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI
 	 * runtime services), while for a userspace-driven test_resume cycle it
 	 * points to userspace page tables (and we must point it at a zero page
-	 * ourselves). Elsewhere we only (un)install the idmap with preemption
-	 * disabled, so T0SZ should be as required regardless.
+	 * ourselves).
+	 *
+	 * We change T0SZ as part of installing the idmap. This is undone by
+	 * cpu_uninstall_idmap() in __cpu_suspend_exit().
 	 */
 	cpu_set_reserved_ttbr0();
 	local_flush_tlb_all();
-	write_sysreg(phys_to_ttbr(virt_to_phys(trans_pgd)), ttbr0_el1);
+	__cpu_set_tcr_t0sz(t0sz);
+	write_sysreg(trans_ttbr0, ttbr0_el1);
 	isb();
 
 	*phys_dst_addr = virt_to_phys(page);
@@ -462,182 +419,6 @@ int swsusp_arch_suspend(void)
 	return ret;
 }
 
-static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
-{
-	pte_t pte = READ_ONCE(*src_ptep);
-
-	if (pte_valid(pte)) {
-		/*
-		 * Resume will overwrite areas that may be marked
-		 * read only (code, rodata). Clear the RDONLY bit from
-		 * the temporary mappings we use during restore.
-		 */
-		set_pte(dst_ptep, pte_mkwrite(pte));
-	} else if (debug_pagealloc_enabled() && !pte_none(pte)) {
-		/*
-		 * debug_pagealloc will removed the PTE_VALID bit if
-		 * the page isn't in use by the resume kernel. It may have
-		 * been in use by the original kernel, in which case we need
-		 * to put it back in our copy to do the restore.
-		 *
-		 * Before marking this entry valid, check the pfn should
-		 * be mapped.
-		 */
-		BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-		set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte)));
-	}
-}
-
-static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start,
-		    unsigned long end)
-{
-	pte_t *src_ptep;
-	pte_t *dst_ptep;
-	unsigned long addr = start;
-
-	dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC);
-	if (!dst_ptep)
-		return -ENOMEM;
-	pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep);
-	dst_ptep = pte_offset_kernel(dst_pmdp, start);
-
-	src_ptep = pte_offset_kernel(src_pmdp, start);
-	do {
-		_copy_pte(dst_ptep, src_ptep, addr);
-	} while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
-
-	return 0;
-}
-
-static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start,
-		    unsigned long end)
-{
-	pmd_t *src_pmdp;
-	pmd_t *dst_pmdp;
-	unsigned long next;
-	unsigned long addr = start;
-
-	if (pud_none(READ_ONCE(*dst_pudp))) {
-		dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC);
-		if (!dst_pmdp)
-			return -ENOMEM;
-		pud_populate(&init_mm, dst_pudp, dst_pmdp);
-	}
-	dst_pmdp = pmd_offset(dst_pudp, start);
-
-	src_pmdp = pmd_offset(src_pudp, start);
-	do {
-		pmd_t pmd = READ_ONCE(*src_pmdp);
-
-		next = pmd_addr_end(addr, end);
-		if (pmd_none(pmd))
-			continue;
-		if (pmd_table(pmd)) {
-			if (copy_pte(dst_pmdp, src_pmdp, addr, next))
-				return -ENOMEM;
-		} else {
-			set_pmd(dst_pmdp,
-				__pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
-		}
-	} while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
-
-	return 0;
-}
-
-static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start,
-		    unsigned long end)
-{
-	pud_t *dst_pudp;
-	pud_t *src_pudp;
-	unsigned long next;
-	unsigned long addr = start;
-
-	if (p4d_none(READ_ONCE(*dst_p4dp))) {
-		dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC);
-		if (!dst_pudp)
-			return -ENOMEM;
-		p4d_populate(&init_mm, dst_p4dp, dst_pudp);
-	}
-	dst_pudp = pud_offset(dst_p4dp, start);
-
-	src_pudp = pud_offset(src_p4dp, start);
-	do {
-		pud_t pud = READ_ONCE(*src_pudp);
-
-		next = pud_addr_end(addr, end);
-		if (pud_none(pud))
-			continue;
-		if (pud_table(pud)) {
-			if (copy_pmd(dst_pudp, src_pudp, addr, next))
-				return -ENOMEM;
-		} else {
-			set_pud(dst_pudp,
-				__pud(pud_val(pud) & ~PUD_SECT_RDONLY));
-		}
-	} while (dst_pudp++, src_pudp++, addr = next, addr != end);
-
-	return 0;
-}
-
-static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start,
-		    unsigned long end)
-{
-	p4d_t *dst_p4dp;
-	p4d_t *src_p4dp;
-	unsigned long next;
-	unsigned long addr = start;
-
-	dst_p4dp = p4d_offset(dst_pgdp, start);
-	src_p4dp = p4d_offset(src_pgdp, start);
-	do {
-		next = p4d_addr_end(addr, end);
-		if (p4d_none(READ_ONCE(*src_p4dp)))
-			continue;
-		if (copy_pud(dst_p4dp, src_p4dp, addr, next))
-			return -ENOMEM;
-	} while (dst_p4dp++, src_p4dp++, addr = next, addr != end);
-
-	return 0;
-}
-
-static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start,
-			    unsigned long end)
-{
-	unsigned long next;
-	unsigned long addr = start;
-	pgd_t *src_pgdp = pgd_offset_k(start);
-
-	dst_pgdp = pgd_offset_pgd(dst_pgdp, start);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(READ_ONCE(*src_pgdp)))
-			continue;
-		if (copy_p4d(dst_pgdp, src_pgdp, addr, next))
-			return -ENOMEM;
-	} while (dst_pgdp++, src_pgdp++, addr = next, addr != end);
-
-	return 0;
-}
-
-static int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start,
-				 unsigned long end)
-{
-	int rc;
-	pgd_t *trans_pgd = (pgd_t *)get_safe_page(GFP_ATOMIC);
-
-	if (!trans_pgd) {
-		pr_err("Failed to allocate memory for temporary page tables.\n");
-		return -ENOMEM;
-	}
-
-	rc = copy_page_tables(trans_pgd, start, end);
-	if (!rc)
-		*dst_pgdp = trans_pgd;
-
-	return rc;
-}
-
 /*
  * Setup then Resume from the hibernate image using swsusp_arch_suspend_exit().
  *
@@ -650,16 +431,20 @@ int swsusp_arch_resume(void)
 	void *zero_page;
 	size_t exit_size;
 	pgd_t *tmp_pg_dir;
-	phys_addr_t phys_hibernate_exit;
 	void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *,
 					  void *, phys_addr_t, phys_addr_t);
+	struct trans_pgd_info trans_info = {
+		.trans_alloc_page	= hibernate_page_alloc,
+		.trans_alloc_arg	= (void *)GFP_ATOMIC,
+	};
 
 	/*
 	 * Restoring the memory image will overwrite the ttbr1 page tables.
 	 * Create a second copy of just the linear map, and use this when
 	 * restoring.
 	 */
-	rc = trans_pgd_create_copy(&tmp_pg_dir, PAGE_OFFSET, PAGE_END);
+	rc = trans_pgd_create_copy(&trans_info, &tmp_pg_dir, PAGE_OFFSET,
+				   PAGE_END);
 	if (rc)
 		return rc;
 
@@ -673,19 +458,13 @@ int swsusp_arch_resume(void)
 		return -ENOMEM;
 	}
 
-	/*
-	 * Locate the exit code in the bottom-but-one page, so that *NULL
-	 * still has disastrous affects.
-	 */
-	hibernate_exit = (void *)PAGE_SIZE;
 	exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start;
 	/*
 	 * Copy swsusp_arch_suspend_exit() to a safe page. This will generate
 	 * a new set of ttbr0 page tables and load them.
 	 */
 	rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size,
-				   (unsigned long)hibernate_exit,
-				   &phys_hibernate_exit);
+				   (phys_addr_t *)&hibernate_exit);
 	if (rc) {
 		pr_err("Failed to create safe executable page for hibernate_exit code.\n");
 		return rc;
@@ -704,7 +483,7 @@ int swsusp_arch_resume(void)
 	 * We can skip this step if we booted at EL1, or are running with VHE.
 	 */
 	if (el2_reset_needed()) {
-		phys_addr_t el2_vectors = phys_hibernate_exit;	/* base */
+		phys_addr_t el2_vectors = (phys_addr_t)hibernate_exit;
 		el2_vectors += hibernate_el2_vectors -
 			       __hibernate_exit_text_start;	/* offset */
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index a0b144cfaea7..90a335c74442 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -42,6 +42,7 @@ static void _kexec_image_info(const char *func, int line,
 	pr_debug(" start: %lx\n", kimage->start);
 	pr_debug(" head: %lx\n", kimage->head);
 	pr_debug(" nr_segments: %lu\n", kimage->nr_segments);
+	pr_debug(" kern_reloc: %pa\n", &kimage->arch.kern_reloc);
 
 	for (i = 0; i < kimage->nr_segments; i++) {
 		pr_debug(" segment[%lu]: %016lx - %016lx, 0x%lx bytes, %lu pages\n",
@@ -58,6 +59,23 @@ void machine_kexec_cleanup(struct kimage *kimage)
 	/* Empty routine needed to avoid build errors. */
 }
 
+int machine_kexec_post_load(struct kimage *kimage)
+{
+	void *reloc_code = page_to_virt(kimage->control_code_page);
+
+	memcpy(reloc_code, arm64_relocate_new_kernel,
+	       arm64_relocate_new_kernel_size);
+	kimage->arch.kern_reloc = __pa(reloc_code);
+	kexec_image_info(kimage);
+
+	/* Flush the reloc_code in preparation for its execution. */
+	__flush_dcache_area(reloc_code, arm64_relocate_new_kernel_size);
+	flush_icache_range((uintptr_t)reloc_code, (uintptr_t)reloc_code +
+			   arm64_relocate_new_kernel_size);
+
+	return 0;
+}
+
 /**
  * machine_kexec_prepare - Prepare for a kexec reboot.
  *
@@ -67,8 +85,6 @@ void machine_kexec_cleanup(struct kimage *kimage)
  */
 int machine_kexec_prepare(struct kimage *kimage)
 {
-	kexec_image_info(kimage);
-
 	if (kimage->type != KEXEC_TYPE_CRASH && cpus_are_stuck_in_kernel()) {
 		pr_err("Can't kexec: CPUs are stuck in the kernel.\n");
 		return -EBUSY;
@@ -143,8 +159,6 @@ static void kexec_segment_flush(const struct kimage *kimage)
  */
 void machine_kexec(struct kimage *kimage)
 {
-	phys_addr_t reboot_code_buffer_phys;
-	void *reboot_code_buffer;
 	bool in_kexec_crash = (kimage == kexec_crash_image);
 	bool stuck_cpus = cpus_are_stuck_in_kernel();
 
@@ -155,31 +169,6 @@ void machine_kexec(struct kimage *kimage)
 	WARN(in_kexec_crash && (stuck_cpus || smp_crash_stop_failed()),
 		"Some CPUs may be stale, kdump will be unreliable.\n");
 
-	reboot_code_buffer_phys = page_to_phys(kimage->control_code_page);
-	reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
-
-	kexec_image_info(kimage);
-
-	/*
-	 * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use
-	 * after the kernel is shut down.
-	 */
-	memcpy(reboot_code_buffer, arm64_relocate_new_kernel,
-	       arm64_relocate_new_kernel_size);
-
-	/* Flush the reboot_code_buffer in preparation for its execution. */
-	__flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size);
-
-	/*
-	 * Although we've killed off the secondary CPUs, we don't update
-	 * the online mask if we're handling a crash kernel and consequently
-	 * need to avoid flush_icache_range(), which will attempt to IPI
-	 * the offline CPUs. Therefore, we must use the __* variant here.
-	 */
-	__flush_icache_range((uintptr_t)reboot_code_buffer,
-			     (uintptr_t)reboot_code_buffer +
-			     arm64_relocate_new_kernel_size);
-
 	/* Flush the kimage list and its buffers. */
 	kexec_list_flush(kimage);
 
@@ -193,7 +182,7 @@ void machine_kexec(struct kimage *kimage)
 
 	/*
 	 * cpu_soft_restart will shutdown the MMU, disable data caches, then
-	 * transfer control to the reboot_code_buffer which contains a copy of
+	 * transfer control to the kern_reloc which contains a copy of
 	 * the arm64_relocate_new_kernel routine. arm64_relocate_new_kernel
 	 * uses physical addressing to relocate the new image to its final
 	 * position and transfers control to the image entry point when the
@@ -203,12 +192,8 @@ void machine_kexec(struct kimage *kimage)
 	 * userspace (kexec-tools).
 	 * In kexec_file case, the kernel starts directly without purgatory.
 	 */
-	cpu_soft_restart(reboot_code_buffer_phys, kimage->head, kimage->start,
-#ifdef CONFIG_KEXEC_FILE
-			 kimage->arch.dtb_mem);
-#else
-			 0);
-#endif
+	cpu_soft_restart(kimage->arch.kern_reloc, kimage->head, kimage->start,
+			 kimage->arch.dtb_mem);
 
 	BUG(); /* Should never get here. */
 }
diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S
index 84eec95ec06c..b78ea5de97a4 100644
--- a/arch/arm64/kernel/relocate_kernel.S
+++ b/arch/arm64/kernel/relocate_kernel.S
@@ -17,28 +17,24 @@
 /*
  * arm64_relocate_new_kernel - Put a 2nd stage image in place and boot it.
  *
- * The memory that the old kernel occupies may be overwritten when coping the
+ * The memory that the old kernel occupies may be overwritten when copying the
  * new image to its final location. To assure that the
  * arm64_relocate_new_kernel routine which does that copy is not overwritten,
  * all code and data needed by arm64_relocate_new_kernel must be between the
  * symbols arm64_relocate_new_kernel and arm64_relocate_new_kernel_end. The
  * machine_kexec() routine will copy arm64_relocate_new_kernel to the kexec
- * control_code_page, a special page which has been set up to be preserved
- * during the copy operation.
+ * safe memory that has been set up to be preserved during the copy operation.
  */
 SYM_CODE_START(arm64_relocate_new_kernel)
-	/* Setup the list loop variables. */
 	mov	x18, x2				/* x18 = dtb address */
 	mov	x17, x1				/* x17 = kimage_start */
 	mov	x16, x0				/* x16 = kimage_head */
-	raw_dcache_line_size x15, x0		/* x15 = dcache line size */
 	mov	x14, xzr			/* x14 = entry ptr */
 	mov	x13, xzr			/* x13 = copy dest */
-	/* Check if the new image needs relocation. */
 	tbnz	x16, IND_DONE_BIT, .Ldone
-
+	raw_dcache_line_size x15, x1		/* x15 = dcache line size */
 .Lloop:
 	and	x12, x16, PAGE_MASK		/* x12 = addr */
 
@@ -47,44 +43,28 @@ SYM_CODE_START(arm64_relocate_new_kernel)
 	tbz	x16, IND_SOURCE_BIT, .Ltest_indirection
 
 	/* Invalidate dest page to PoC. */
-	mov	x0, x13
-	add	x20, x0, #PAGE_SIZE
+	mov	x2, x13
+	add	x20, x2, #PAGE_SIZE
 	sub	x1, x15, #1
-	bic	x0, x0, x1
-2:	dc	ivac, x0
-	add	x0, x0, x15
-	cmp	x0, x20
+	bic	x2, x2, x1
+2:	dc	ivac, x2
+	add	x2, x2, x15
+	cmp	x2, x20
 	b.lo	2b
 	dsb	sy
 
-	mov	x20, x13
-	mov	x21, x12
-	copy_page x20, x21, x0, x1, x2, x3, x4, x5, x6, x7
-
-	/* dest += PAGE_SIZE */
-	add	x13, x13, PAGE_SIZE
+	copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8
 	b	.Lnext
-
 .Ltest_indirection:
 	tbz	x16, IND_INDIRECTION_BIT, .Ltest_destination
-
-	/* ptr = addr */
-	mov	x14, x12
+	mov	x14, x12			/* ptr = addr */
 	b	.Lnext
-
 .Ltest_destination:
 	tbz	x16, IND_DESTINATION_BIT, .Lnext
-
-	/* dest = addr */
-	mov	x13, x12
-
+	mov	x13, x12			/* dest = addr */
 .Lnext:
-	/* entry = *ptr++ */
-	ldr	x16, [x14], #8
-
-	/* while (!(entry & DONE)) */
-	tbz	x16, IND_DONE_BIT, .Lloop
-
+	ldr	x16, [x14], #8			/* entry = *ptr++ */
+	tbz	x16, IND_DONE_BIT, .Lloop	/* while (!(entry & DONE)) */
 .Ldone:
 	/* wait for writes from copy_page to finish */
 	dsb	nsh
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 5ead3c3de3b6..77222d92667a 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -6,6 +6,7 @@ obj-y				:= dma-mapping.o extable.o fault.o init.o \
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_PTDUMP_CORE)	+= ptdump.o
 obj-$(CONFIG_PTDUMP_DEBUGFS)	+= ptdump_debugfs.o
+obj-$(CONFIG_TRANS_TABLE)	+= trans_pgd.o
 obj-$(CONFIG_NUMA)		+= numa.o
 obj-$(CONFIG_DEBUG_VIRTUAL)	+= physaddr.o
 obj-$(CONFIG_ARM64_MTE)		+= mteswap.o
diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c
new file mode 100644
index 000000000000..527f0a39c3da
--- /dev/null
+++ b/arch/arm64/mm/trans_pgd.c
@@ -0,0 +1,324 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Transitional page tables for kexec and hibernate
+ *
+ * This file derived from: arch/arm64/kernel/hibernate.c
+ *
+ * Copyright (c) 2020, Microsoft Corporation.
+ * Pavel Tatashin <pasha.tatashin@soleen.com>
+ *
+ */
+
+/*
+ * Transitional tables are used during system transferring from one world to
+ * another: such as during hibernate restore, and kexec reboots. During these
+ * phases one cannot rely on page table not being overwritten. This is because
+ * hibernate and kexec can overwrite the current page tables during transition.
+ */
+
+#include <asm/trans_pgd.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <linux/suspend.h>
+#include <linux/bug.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+
+static void *trans_alloc(struct trans_pgd_info *info)
+{
+	return info->trans_alloc_page(info->trans_alloc_arg);
+}
+
+static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
+{
+	pte_t pte = READ_ONCE(*src_ptep);
+
+	if (pte_valid(pte)) {
+		/*
+		 * Resume will overwrite areas that may be marked
+		 * read only (code, rodata). Clear the RDONLY bit from
+		 * the temporary mappings we use during restore.
+		 */
+		set_pte(dst_ptep, pte_mkwrite(pte));
+	} else if (debug_pagealloc_enabled() && !pte_none(pte)) {
+		/*
+		 * debug_pagealloc will removed the PTE_VALID bit if
+		 * the page isn't in use by the resume kernel. It may have
+		 * been in use by the original kernel, in which case we need
+		 * to put it back in our copy to do the restore.
+		 *
+		 * Before marking this entry valid, check the pfn should
+		 * be mapped.
+		 */
+		BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+		set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte)));
+	}
+}
+
+static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
+		    pmd_t *src_pmdp, unsigned long start, unsigned long end)
+{
+	pte_t *src_ptep;
+	pte_t *dst_ptep;
+	unsigned long addr = start;
+
+	dst_ptep = trans_alloc(info);
+	if (!dst_ptep)
+		return -ENOMEM;
+	pmd_populate_kernel(NULL, dst_pmdp, dst_ptep);
+	dst_ptep = pte_offset_kernel(dst_pmdp, start);
+
+	src_ptep = pte_offset_kernel(src_pmdp, start);
+	do {
+		_copy_pte(dst_ptep, src_ptep, addr);
+	} while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
+
+	return 0;
+}
+
+static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp,
+		    pud_t *src_pudp, unsigned long start, unsigned long end)
+{
+	pmd_t *src_pmdp;
+	pmd_t *dst_pmdp;
+	unsigned long next;
+	unsigned long addr = start;
+
+	if (pud_none(READ_ONCE(*dst_pudp))) {
+		dst_pmdp = trans_alloc(info);
+		if (!dst_pmdp)
+			return -ENOMEM;
+		pud_populate(NULL, dst_pudp, dst_pmdp);
+	}
+	dst_pmdp = pmd_offset(dst_pudp, start);
+
+	src_pmdp = pmd_offset(src_pudp, start);
+	do {
+		pmd_t pmd = READ_ONCE(*src_pmdp);
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			continue;
+		if (pmd_table(pmd)) {
+			if (copy_pte(info, dst_pmdp, src_pmdp, addr, next))
+				return -ENOMEM;
+		} else {
+			set_pmd(dst_pmdp,
+				__pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
+		}
+	} while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp,
+		    p4d_t *src_p4dp, unsigned long start,
+		    unsigned long end)
+{
+	pud_t *dst_pudp;
+	pud_t *src_pudp;
+	unsigned long next;
+	unsigned long addr = start;
+
+	if (p4d_none(READ_ONCE(*dst_p4dp))) {
+		dst_pudp = trans_alloc(info);
+		if (!dst_pudp)
+			return -ENOMEM;
+		p4d_populate(NULL, dst_p4dp, dst_pudp);
+	}
+	dst_pudp = pud_offset(dst_p4dp, start);
+
+	src_pudp = pud_offset(src_p4dp, start);
+	do {
+		pud_t pud = READ_ONCE(*src_pudp);
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			continue;
+		if (pud_table(pud)) {
+			if (copy_pmd(info, dst_pudp, src_pudp, addr, next))
+				return -ENOMEM;
+		} else {
+			set_pud(dst_pudp,
+				__pud(pud_val(pud) & ~PUD_SECT_RDONLY));
+		}
+	} while (dst_pudp++, src_pudp++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp,
+		    pgd_t *src_pgdp, unsigned long start,
+		    unsigned long end)
+{
+	p4d_t *dst_p4dp;
+	p4d_t *src_p4dp;
+	unsigned long next;
+	unsigned long addr = start;
+
+	dst_p4dp = p4d_offset(dst_pgdp, start);
+	src_p4dp = p4d_offset(src_pgdp, start);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none(READ_ONCE(*src_p4dp)))
+			continue;
+		if (copy_pud(info, dst_p4dp, src_p4dp, addr, next))
+			return -ENOMEM;
+	} while (dst_p4dp++, src_p4dp++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int copy_page_tables(struct trans_pgd_info *info, pgd_t *dst_pgdp,
+			    unsigned long start, unsigned long end)
+{
+	unsigned long next;
+	unsigned long addr = start;
+	pgd_t *src_pgdp = pgd_offset_k(start);
+
+	dst_pgdp = pgd_offset_pgd(dst_pgdp, start);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(READ_ONCE(*src_pgdp)))
+			continue;
+		if (copy_p4d(info, dst_pgdp, src_pgdp, addr, next))
+			return -ENOMEM;
+	} while (dst_pgdp++, src_pgdp++, addr = next, addr != end);
+
+	return 0;
+}
+
+/*
+ * Create trans_pgd and copy linear map.
+ * info:	contains allocator and its argument
+ * dst_pgdp:	new page table that is created, and to which map is copied.
+ * start:	Start of the interval (inclusive).
+ * end:		End of the interval (exclusive).
+ *
+ * Returns 0 on success, and -ENOMEM on failure.
+ */
+int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **dst_pgdp,
+			  unsigned long start, unsigned long end)
+{
+	int rc;
+	pgd_t *trans_pgd = trans_alloc(info);
+
+	if (!trans_pgd) {
+		pr_err("Failed to allocate memory for temporary page tables.\n");
+		return -ENOMEM;
+	}
+
+	rc = copy_page_tables(info, trans_pgd, start, end);
+	if (!rc)
+		*dst_pgdp = trans_pgd;
+
+	return rc;
+}
+
+/*
+ * Add map entry to trans_pgd for a base-size page at PTE level.
+ * info:	contains allocator and its argument
+ * trans_pgd:	page table in which new map is added.
+ * page:	page to be mapped.
+ * dst_addr:	new VA address for the page
+ * pgprot:	protection for the page.
+ *
+ * Returns 0 on success, and -ENOMEM on failure.
+ */
+int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd,
+		       void *page, unsigned long dst_addr, pgprot_t pgprot)
+{
+	pgd_t *pgdp;
+	p4d_t *p4dp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+
+	pgdp = pgd_offset_pgd(trans_pgd, dst_addr);
+	if (pgd_none(READ_ONCE(*pgdp))) {
+		p4dp = trans_alloc(info);
+		if (!pgdp)
+			return -ENOMEM;
+		pgd_populate(NULL, pgdp, p4dp);
+	}
+
+	p4dp = p4d_offset(pgdp, dst_addr);
+	if (p4d_none(READ_ONCE(*p4dp))) {
+		pudp = trans_alloc(info);
+		if (!pudp)
+			return -ENOMEM;
+		p4d_populate(NULL, p4dp, pudp);
+	}
+
+	pudp = pud_offset(p4dp, dst_addr);
+	if (pud_none(READ_ONCE(*pudp))) {
+		pmdp = trans_alloc(info);
+		if (!pmdp)
+			return -ENOMEM;
+		pud_populate(NULL, pudp, pmdp);
+	}
+
+	pmdp = pmd_offset(pudp, dst_addr);
+	if (pmd_none(READ_ONCE(*pmdp))) {
+		ptep = trans_alloc(info);
+		if (!ptep)
+			return -ENOMEM;
+		pmd_populate_kernel(NULL, pmdp, ptep);
+	}
+
+	ptep = pte_offset_kernel(pmdp, dst_addr);
+	set_pte(ptep, pfn_pte(virt_to_pfn(page), pgprot));
+
+	return 0;
+}
+
+/*
+ * The page we want to idmap may be outside the range covered by VA_BITS that
+ * can be built using the kernel's p?d_populate() helpers. As a one off, for a
+ * single page, we build these page tables bottom up and just assume that will
+ * need the maximum T0SZ.
+ *
+ * Returns 0 on success, and -ENOMEM on failure.
+ * On success trans_ttbr0 contains page table with idmapped page, t0sz is set to
+ * maximum T0SZ for this page.
+ */
+int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0,
+			 unsigned long *t0sz, void *page)
+{
+	phys_addr_t dst_addr = virt_to_phys(page);
+	unsigned long pfn = __phys_to_pfn(dst_addr);
+	int max_msb = (dst_addr & GENMASK(52, 48)) ? 51 : 47;
+	int bits_mapped = PAGE_SHIFT - 4;
+	unsigned long level_mask, prev_level_entry, *levels[4];
+	int this_level, index, level_lsb, level_msb;
+
+	dst_addr &= PAGE_MASK;
+	prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_EXEC));
+
+	for (this_level = 3; this_level >= 0; this_level--) {
+		levels[this_level] = trans_alloc(info);
+		if (!levels[this_level])
+			return -ENOMEM;
+
+		level_lsb = ARM64_HW_PGTABLE_LEVEL_SHIFT(this_level);
+		level_msb = min(level_lsb + bits_mapped, max_msb);
+		level_mask = GENMASK_ULL(level_msb, level_lsb);
+
+		index = (dst_addr & level_mask) >> level_lsb;
+		*(levels[this_level] + index) = prev_level_entry;
+
+		pfn = virt_to_pfn(levels[this_level]);
+		prev_level_entry = pte_val(pfn_pte(pfn,
+						   __pgprot(PMD_TYPE_TABLE)));
+
+		if (level_msb == max_msb)
+			break;
+	}
+
+	*trans_ttbr0 = phys_to_ttbr(__pfn_to_phys(pfn));
+	*t0sz = TCR_T0SZ(max_msb + 1);
+
+	return 0;
+}
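
As a usage note: trans_pgd_idmap_page() does not install anything itself; it hands back a TTBR0 value and the matching T0SZ, and the caller installs them. The sequence below is condensed from the create_safe_exec_page() hunk in hibernate.c above (surrounding declarations and error paths are omitted), and is shown only to make the calling convention explicit.

	rc = trans_pgd_idmap_page(&trans_info, &trans_ttbr0, &t0sz, page);
	if (rc)
		return rc;

	cpu_set_reserved_ttbr0();		/* park TTBR0 on the reserved tables */
	local_flush_tlb_all();
	__cpu_set_tcr_t0sz(t0sz);		/* widen T0SZ to cover the idmapped page */
	write_sysreg(trans_ttbr0, ttbr0_el1);	/* install the single-page idmap */
	isb();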