From c48b5a4cf3125adb679e28ef093f66ff81368d05 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 6 Aug 2024 20:48:43 +0200
Subject: x86/mm: Fix PTI for i386 some more

So it turns out that we have to do two passes of pti_clone_entry_text(),
once before initcalls, such that device and late initcalls can use
user-mode-helper / modprobe and once after free_initmem() / mark_readonly().

Now obviously mark_readonly() can cause PMD splits, and pti_clone_pgtable()
doesn't like that much.

Allow the late clone to split PMDs so that pagetables stay in sync.

[peterz: Changelog and comments]

Reported-by: Guenter Roeck
Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Guenter Roeck
Link: https://lkml.kernel.org/r/20240806184843.GX37996@noisy.programming.kicks-ass.net
---
 arch/x86/mm/pti.c | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index bfdf5f45b137..851ec8f1363a 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -241,7 +241,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
  *
  * Returns a pointer to a PTE on success, or NULL on failure.
  */
-static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text)
 {
 	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
 	pmd_t *pmd;
@@ -251,10 +251,15 @@ static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
 	if (!pmd)
 		return NULL;
 
-	/* We can't do anything sensible if we hit a large mapping. */
+	/* Large PMD mapping found */
 	if (pmd_leaf(*pmd)) {
-		WARN_ON(1);
-		return NULL;
+		/* Clear the PMD if we hit a large mapping from the first round */
+		if (late_text) {
+			set_pmd(pmd, __pmd(0));
+		} else {
+			WARN_ON_ONCE(1);
+			return NULL;
+		}
 	}
 
 	if (pmd_none(*pmd)) {
@@ -283,7 +288,7 @@ static void __init pti_setup_vsyscall(void)
 	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
 		return;
 
-	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false);
 	if (WARN_ON(!target_pte))
 		return;
 
@@ -301,7 +306,7 @@ enum pti_clone_level {
 
 static void
 pti_clone_pgtable(unsigned long start, unsigned long end,
-		  enum pti_clone_level level)
+		  enum pti_clone_level level, bool late_text)
 {
 	unsigned long addr;
 
@@ -390,7 +395,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
 				return;
 
 			/* Allocate PTE in the user page-table */
-			target_pte = pti_user_pagetable_walk_pte(addr);
+			target_pte = pti_user_pagetable_walk_pte(addr, late_text);
 			if (WARN_ON(!target_pte))
 				return;
 
@@ -452,7 +457,7 @@ static void __init pti_clone_user_shared(void)
 		phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
 		pte_t *target_pte;
 
-		target_pte = pti_user_pagetable_walk_pte(va);
+		target_pte = pti_user_pagetable_walk_pte(va, false);
 		if (WARN_ON(!target_pte))
 			return;
 
@@ -475,7 +480,7 @@ static void __init pti_clone_user_shared(void)
 	start = CPU_ENTRY_AREA_BASE;
 	end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
 
-	pti_clone_pgtable(start, end, PTI_CLONE_PMD);
+	pti_clone_pgtable(start, end, PTI_CLONE_PMD, false);
 }
 #endif /* CONFIG_X86_64 */
 
@@ -492,11 +497,11 @@ static void __init pti_setup_espfix64(void)
 /*
  * Clone the populated PMDs of the entry text and force it RO.
  */
-static void pti_clone_entry_text(void)
+static void pti_clone_entry_text(bool late)
 {
 	pti_clone_pgtable((unsigned long) __entry_text_start,
 			  (unsigned long) __entry_text_end,
-			  PTI_LEVEL_KERNEL_IMAGE);
+			  PTI_LEVEL_KERNEL_IMAGE, late);
 }
 
 /*
@@ -571,7 +576,7 @@ static void pti_clone_kernel_text(void)
 	 * pti_set_kernel_image_nonglobal() did to clear the
 	 * global bit.
 	 */
-	pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
+	pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false);
 
 	/*
 	 * pti_clone_pgtable() will set the global bit in any PMDs
@@ -638,8 +643,15 @@ void __init pti_init(void)
 
 	/* Undo all global bits from the init pagetables in head_64.S: */
 	pti_set_kernel_image_nonglobal();
+
 	/* Replace some of the global bits just for shared entry text: */
-	pti_clone_entry_text();
+	/*
+	 * This is very early in boot. Device and Late initcalls can do
+	 * modprobe before free_initmem() and mark_readonly(). This
+	 * pti_clone_entry_text() allows those user-mode-helpers to function,
+	 * but notably the text is still RW.
+	 */
+	pti_clone_entry_text(false);
 	pti_setup_espfix64();
 	pti_setup_vsyscall();
 }
@@ -656,10 +668,11 @@ void pti_finalize(void)
 	if (!boot_cpu_has(X86_FEATURE_PTI))
 		return;
 	/*
-	 * We need to clone everything (again) that maps parts of the
-	 * kernel image.
+	 * This is after free_initmem() (all initcalls are done) and we've done
+	 * mark_readonly(). Text is now NX which might've split some PMDs
+	 * relative to the early clone.
 	 */
-	pti_clone_entry_text();
+	pti_clone_entry_text(true);
 	pti_clone_kernel_text();
 
 	debug_checkwx_user();
--
cgit v1.2.3
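
For orientation, the resulting two-pass flow can be summarised as follows. This is a condensed paraphrase of the call sequence, not the literal functions; the other work done in pti_init() and pti_finalize() is omitted:

    start_kernel()
        pti_init()
            pti_clone_entry_text(false);   /* early: entry text still RW, a leaf PMD here is a bug */
        /* device and late initcalls run, may exec user-mode helpers / modprobe */
        free_initmem() / mark_readonly()   /* may split PMDs covering kernel text */

    pti_finalize()
        pti_clone_entry_text(true);        /* late: allowed to clear stale leaf PMDs and re-clone */
        pti_clone_kernel_text();
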
From ab84ba647f2c94ac4d0c3fc6951c49f08aa1fcf7 Mon Sep 17 00:00:00 2001
From: Zhiquan Li
Date: Mon, 5 Aug 2024 18:35:31 +0800
Subject: x86/acpi: Remove __ro_after_init from acpi_mp_wake_mailbox

On a platform using the "Multiprocessor Wakeup Structure"[1] to start up
secondary CPUs, the control processor needs to memremap() the physical
address of the MP Wakeup Structure mailbox into the variable
acpi_mp_wake_mailbox, which holds the virtual address of the mailbox.

To wake up an AP, the control processor writes the APIC ID of the AP, the
wakeup vector and the ACPI_MP_WAKE_COMMAND_WAKEUP command into the mailbox.

The current implementation doesn't consider the case where boot-time CPU
bringup is restricted to one CPU with the kernel parameter "maxcpus=1" and
the remaining CPUs are brought online later from user space, as it marks
acpi_mp_wake_mailbox read-only after init. So when the first AP is brought
online after init, the attempt to update the variable results in a kernel
panic.

The memremap() call that initializes the variable cannot be moved into
acpi_parse_mp_wake() because memremap() is not functional at that point in
the boot process. Also, as the APs might never be brought up, keep the
memremap() call in acpi_wakeup_cpu() so that the operation only takes place
when needed.

Fixes: 24dd05da8c79 ("x86/apic: Mark acpi_mp_wake_* variables as __ro_after_init")
Signed-off-by: Zhiquan Li
Signed-off-by: Thomas Gleixner
Reviewed-by: Kirill A. Shutemov
Link: https://lore.kernel.org/all/20240805103531.1230635-1-zhiquan1.li@intel.com
---
 arch/x86/kernel/acpi/madt_wakeup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index 6cfe762be28b..d5ef6215583b 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -19,7 +19,7 @@ static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
 
 /* Virtual address of the Multiprocessor Wakeup Structure mailbox */
-static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init;
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
 
 static u64 acpi_mp_pgd __ro_after_init;
 static u64 acpi_mp_reset_vector_paddr __ro_after_init;
 
--
cgit v1.2.3
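
To make the deferred-mapping pattern the changelog relies on concrete, here is a minimal sketch of an AP wakeup path that maps the mailbox on first use. The function and variable names follow the ones discussed above, but the body is an illustration of the pattern, not the code from madt_wakeup.c (the real function also polls for mailbox acknowledgement, among other things):

    static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
    {
    	if (!acpi_mp_wake_mailbox_paddr)
    		return -ENODEV;

    	/*
    	 * Map the mailbox lazily: memremap() is not usable when
    	 * acpi_parse_mp_wake() runs, and the mapping is only needed if an
    	 * AP is actually brought up. Because this can happen after init
    	 * (e.g. maxcpus=1 plus a later online from user space), the
    	 * pointer must stay writable, hence no __ro_after_init.
    	 */
    	if (!acpi_mp_wake_mailbox) {
    		acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
    						sizeof(*acpi_mp_wake_mailbox),
    						MEMREMAP_WB);
    		if (!acpi_mp_wake_mailbox)
    			return -ENOMEM;
    	}

    	/* Hand the AP its start vector and kick it via the mailbox. */
    	acpi_mp_wake_mailbox->apic_id = apicid;
    	acpi_mp_wake_mailbox->wakeup_vector = start_ip;
    	WRITE_ONCE(acpi_mp_wake_mailbox->command, ACPI_MP_WAKE_COMMAND_WAKEUP);

    	return 0;
    }
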
From e639222a51196c69c70b49b67098ce2f9919ed08 Mon Sep 17 00:00:00 2001
From: Chen Yu
Date: Tue, 6 Aug 2024 19:22:07 +0800
Subject: x86/paravirt: Fix incorrect virt spinlock setting on bare metal

The kernel can change spinlock behavior when running as a guest. But this
guest-friendly behavior causes performance problems on bare metal. The
kernel uses a static key to switch between the two modes.

In theory, the static key is enabled by default (run in guest mode) and
should be disabled on bare metal (and in guests that want native behavior
or the paravirt spinlock).

A performance drop is reported when running encode/decode workloads and the
BenchSEE cache sub-workload. Bisect points to commit ce0a1b608bfc
("x86/paravirt: Silence unused native_pv_lock_init() function warning").
When CONFIG_PARAVIRT_SPINLOCKS is disabled, virt_spin_lock_key is
incorrectly set to true on bare metal. The qspinlock then degenerates to a
test-and-set spinlock, which decreases performance on bare metal.

Set the default value of virt_spin_lock_key to false. If booting in a VM,
enable this key. Later during VM initialization, if another, more efficient
spinlock is preferred (e.g. the paravirt spinlock), or the user wants the
native qspinlock (via the "nopvspin" boot command line), virt_spin_lock_key
is disabled accordingly.

This results in the following decision matrix:

  X86_FEATURE_HYPERVISOR      Y    Y    Y    N
  CONFIG_PARAVIRT_SPINLOCKS   Y    Y    N    Y/N
  PV spinlock                 Y    N    N    Y/N
  virt_spin_lock_key          N    Y/N  Y    N

Fixes: ce0a1b608bfc ("x86/paravirt: Silence unused native_pv_lock_init() function warning")
Reported-by: Prem Nath Dey
Reported-by: Xiaoping Zhou
Suggested-by: Dave Hansen
Suggested-by: Qiuxu Zhuo
Suggested-by: Nikolay Borisov
Signed-off-by: Chen Yu
Signed-off-by: Thomas Gleixner
Reviewed-by: Nikolay Borisov
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/all/20240806112207.29792-1-yu.c.chen@intel.com
---
 arch/x86/include/asm/qspinlock.h | 12 +++++++-----
 arch/x86/kernel/paravirt.c       |  7 +++----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index a053c1293975..68da67df304d 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -66,13 +66,15 @@ static inline bool vcpu_is_preempted(long cpu)
 
 #ifdef CONFIG_PARAVIRT
 /*
- * virt_spin_lock_key - enables (by default) the virt_spin_lock() hijack.
+ * virt_spin_lock_key - disables by default the virt_spin_lock() hijack.
  *
- * Native (and PV wanting native due to vCPU pinning) should disable this key.
- * It is done in this backwards fashion to only have a single direction change,
- * which removes ordering between native_pv_spin_init() and HV setup.
+ * Native (and PV wanting native due to vCPU pinning) should keep this key
+ * disabled. Native does not touch the key.
+ *
+ * When in a guest then native_pv_lock_init() enables the key first and
+ * KVM/XEN might conditionally disable it later in the boot process again.
  */
-DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key);
+DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
 
 /*
  * Shortcut for the queued_spin_lock_slowpath() function that allows
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5358d43886ad..fec381533555 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -51,13 +51,12 @@ DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
 DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
 #endif
 
-DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
+DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
 
 void __init native_pv_lock_init(void)
 {
-	if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) &&
-	    !boot_cpu_has(X86_FEATURE_HYPERVISOR))
-		static_branch_disable(&virt_spin_lock_key);
+	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		static_branch_enable(&virt_spin_lock_key);
 }
 
 static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
--
cgit v1.2.3
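
For context on what the key actually gates, here is a simplified sketch of the consumer side, modeled on the virt_spin_lock() helper in the same header (details trimmed, so treat it as an illustration rather than the exact code). With the key disabled (bare metal), the function bails out and the regular fair queued slowpath runs; with the key enabled (guest without paravirt spinlocks), the lock degrades to an unfair test-and-set loop that tolerates lock-holder preemption:

    static inline bool virt_spin_lock(struct qspinlock *lock)
    {
    	int val;

    	/* Bare metal (key disabled): use the fair queued slowpath. */
    	if (!static_branch_likely(&virt_spin_lock_key))
    		return false;

    	/*
    	 * Guest without paravirt spinlocks: fall back to a test-and-set
    	 * lock, because a fair queued lock suffers badly when the lock
    	 * holder's vCPU gets preempted.
    	 */
    	do {
    		while ((val = atomic_read(&lock->val)))
    			cpu_relax();
    	} while (!atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL));

    	return true;
    }
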
From 919f18f961c03d6694aa726c514184f2311a4614 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 7 Aug 2024 17:02:44 -0700
Subject: x86/mtrr: Check if fixed MTRRs exist before saving them

MTRRs have an obsolete fixed variant for fine-grained caching control of
the 640K-1MB region that uses separate MSRs. This fixed variant has a
separate capability bit in the MTRR capability MSR.

So far all x86 CPUs which support MTRR have this separate bit set, so it
went unnoticed that mtrr_save_state() does not check the capability bit
before accessing the fixed MTRR MSRs. On a CPU that does not support the
fixed MTRR capability, however, this results in a #GP. The #GP itself is
harmless because the RDMSR fault is handled gracefully, but results in a
WARN_ON().

Add the missing capability check to prevent this.

Fixes: 2b1f6278d77c ("[PATCH] x86: Save the MTRRs of the BSP before booting an AP")
Signed-off-by: Andi Kleen
Signed-off-by: Thomas Gleixner
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/all/20240808000244.946864-1-ak@linux.intel.com
---
 arch/x86/kernel/cpu/mtrr/mtrr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 767bf1c71aad..2a2fc14955cd 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -609,7 +609,7 @@ void mtrr_save_state(void)
 {
 	int first_cpu;
 
-	if (!mtrr_enabled())
+	if (!mtrr_enabled() || !mtrr_state.have_fixed)
 		return;
 
 	first_cpu = cpumask_first(cpu_online_mask);
--
cgit v1.2.3
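
For reference, the mtrr_state.have_fixed flag consulted by the new check is derived from the fixed-range bit of the MTRR capability MSR when the MTRR state is first read. A rough sketch follows; the helper name is hypothetical and the body is a paraphrase of the generic MTRR setup code, not a copy of it:

    /* Sketch: how the have_fixed flag gets populated (paraphrased). */
    static void mtrr_read_capability(void)
    {
    	unsigned int lo, hi;

    	rdmsr(MSR_MTRRcap, lo, hi);
    	/* Bit 8 (FIX) advertises the fixed-range MTRR MSRs for 640K-1MB. */
    	mtrr_state.have_fixed = (lo >> 8) & 1;
    }

With that bit clear, mtrr_save_state() now returns early instead of issuing RDMSR on MSRs that do not exist and tripping the #GP/WARN_ON() path described above.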