Diffstat (limited to 'arch/x86')
53 files changed, 384 insertions, 323 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9395ec37bb64..4b9f378e05f6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,6 +27,7 @@ config X86_64 # Options that are inherently 64-bit kernel only: select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_PTDUMP + select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE @@ -889,6 +890,7 @@ config INTEL_TDX_GUEST depends on X86_64 && CPU_SUP_INTEL depends on X86_X2APIC depends on EFI_STUB + depends on PARAVIRT select ARCH_HAS_CC_PLATFORM select X86_MEM_ENCRYPT select X86_MCE @@ -2136,6 +2138,7 @@ config RANDOMIZE_BASE config X86_NEED_RELOCS def_bool y depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE) + select ARCH_VMLINUX_NEEDS_RELOCS config PHYSICAL_ALIGN hex "Alignment value to which kernel should be aligned" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 27efe2dc2aa8..594723005d95 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -245,12 +245,6 @@ endif KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE) -ifdef CONFIG_X86_NEED_RELOCS -LDFLAGS_vmlinux := --emit-relocs --discard-none -else -LDFLAGS_vmlinux := -endif - # # The 64-bit kernel must be aligned to 2MB. Pass -z max-page-size=0x200000 to # the linker to force 2MB page size regardless of the default page size used diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink deleted file mode 100644 index 8b8a68162c94..000000000000 --- a/arch/x86/Makefile.postlink +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# =========================================================================== -# Post-link x86 pass -# =========================================================================== -# -# 1. Separate relocations from vmlinux into vmlinux.relocs. -# 2. Strip relocations from vmlinux. - -PHONY := __archpost -__archpost: - --include include/config/auto.conf -include $(srctree)/scripts/Kbuild.include -include $(srctree)/scripts/Makefile.lib - -CMD_RELOCS = arch/x86/tools/relocs -OUT_RELOCS = arch/x86/boot/compressed -quiet_cmd_relocs = RELOCS $(OUT_RELOCS)/$@.relocs - cmd_relocs = \ - mkdir -p $(OUT_RELOCS); \ - $(CMD_RELOCS) $@ > $(OUT_RELOCS)/$@.relocs; \ - $(CMD_RELOCS) --abs-relocs $@ - -# `@true` prevents complaint when there is nothing to be done - -vmlinux: FORCE - @true -ifeq ($(CONFIG_X86_NEED_RELOCS),y) - $(call cmd,relocs) - $(call cmd,strip_relocs) -endif - -clean: - @rm -f $(OUT_RELOCS)/vmlinux.relocs - -PHONY += FORCE clean - -FORCE: - -.PHONY: $(PHONY) diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index a46b1397ad01..c86cbd9cbba3 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -7,12 +7,13 @@ core-y += arch/x86/crypto/ # GCC versions < 11. 
See: # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99652 # -ifeq ($(CONFIG_CC_IS_CLANG),y) -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json +ifeq ($(call gcc-min-version, 110000)$(CONFIG_CC_IS_CLANG),y) +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 endif +KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json + ifeq ($(CONFIG_X86_32),y) START := 0x8048000 diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 8589471b65a1..81f55da81967 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -53,7 +53,6 @@ targets += cpustr.h KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += $(CONFIG_CC_IMPLICIT_FALLTHROUGH) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 2eb63536c5d0..fdbce022db55 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -38,7 +38,6 @@ KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) KBUILD_CFLAGS += -Wno-pointer-sign -KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += -D__DISABLE_EXPORTS # Disable relocation relaxation in case the link is not PIE. @@ -117,9 +116,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs -# vmlinux.relocs is created by the vmlinux postlink step. -$(obj)/vmlinux.relocs: vmlinux - @true +CMD_RELOCS = arch/x86/tools/relocs +quiet_cmd_relocs = RELOCS $@ + cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< + +$(obj)/vmlinux.relocs: vmlinux.unstripped FORCE + $(call if_changed,relocs) vmlinux.bin.all-y := $(obj)/vmlinux.bin vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 7772b01ab738..edab6d6049be 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -14,6 +14,7 @@ #include <asm/ia32.h> #include <asm/insn.h> #include <asm/insn-eval.h> +#include <asm/paravirt_types.h> #include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/traps.h> @@ -392,13 +393,21 @@ static int handle_halt(struct ve_info *ve) { const bool irq_disabled = irqs_disabled(); + /* + * HLT with IRQs enabled is unsafe, as an IRQ that is intended to be a + * wake event may be consumed before requesting HLT emulation, leaving + * the vCPU blocking indefinitely. + */ + if (WARN_ONCE(!irq_disabled, "HLT emulation with IRQs enabled")) + return -EIO; + if (__halt(irq_disabled)) return -EIO; return ve_instr_len(ve); } -void __cpuidle tdx_safe_halt(void) +void __cpuidle tdx_halt(void) { const bool irq_disabled = false; @@ -409,6 +418,16 @@ void __cpuidle tdx_safe_halt(void) WARN_ONCE(1, "HLT instruction emulation failed\n"); } +static void __cpuidle tdx_safe_halt(void) +{ + tdx_halt(); + /* + * "__cpuidle" section doesn't support instrumentation, so stick + * with raw_* variant that avoids tracing hooks. 
+ */ + raw_local_irq_enable(); +} + static int read_msr(struct pt_regs *regs, struct ve_info *ve) { struct tdx_module_args args = { @@ -1110,6 +1129,19 @@ void __init tdx_early_init(void) x86_platform.guest.enc_kexec_finish = tdx_kexec_finish; /* + * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that + * will enable interrupts before HLT TDCALL invocation if executed + * in STI-shadow, possibly resulting in missed wakeup events. + * + * Modify all possible HLT execution paths to use TDX specific routines + * that directly execute TDCALL and toggle the interrupt state as + * needed after TDCALL completion. This also reduces HLT related #VEs + * in addition to having a reliable halt logic execution. + */ + pv_ops.irq.safe_halt = tdx_safe_halt; + pv_ops.irq.halt = tdx_halt; + + /* * TDX intercepts the RDMSR to read the X2APIC ID in the parallel * bringup low level code. That raises #VE which cannot be handled * there. diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index d3caa31240ed..175958b02f2b 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -17,19 +17,20 @@ .pushsection .noinstr.text, "ax" -SYM_FUNC_START(entry_ibpb) +/* Clobbers AX, CX, DX */ +SYM_FUNC_START(write_ibpb) ANNOTATE_NOENDBR movl $MSR_IA32_PRED_CMD, %ecx - movl $PRED_CMD_IBPB, %eax + movl _ASM_RIP(x86_pred_cmd), %eax xorl %edx, %edx wrmsr /* Make sure IBPB clears return stack preductions too. */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET RET -SYM_FUNC_END(entry_ibpb) +SYM_FUNC_END(write_ibpb) /* For KVM */ -EXPORT_SYMBOL_GPL(entry_ibpb); +EXPORT_SYMBOL_GPL(write_ibpb); .popsection diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 9518bf1ddf35..adb299d3b6a1 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -162,7 +162,8 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) text_start, image->size, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_SEALED_SYSMAP, &vdso_mapping); if (IS_ERR(vma)) { @@ -181,7 +182,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) VDSO_VCLOCK_PAGES_START(addr), VDSO_NR_VCLOCK_PAGES * PAGE_SIZE, VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| - VM_PFNMAP, + VM_PFNMAP|VM_SEALED_SYSMAP, &vvar_vclock_mapping); if (IS_ERR(vma)) { diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index b5982b94bdba..cbc6157f0b4b 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -16,7 +16,8 @@ static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res; - asm_inline (ALTERNATIVE("call __sw_hweight32", + asm_inline (ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE + "call __sw_hweight32", "popcntl %[val], %[cnt]", X86_FEATURE_POPCNT) : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT : [val] REG_IN (w)); @@ -45,7 +46,8 @@ static __always_inline unsigned long __arch_hweight64(__u64 w) { unsigned long res; - asm_inline (ALTERNATIVE("call __sw_hweight64", + asm_inline (ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE + "call __sw_hweight64", "popcntq %[val], %[cnt]", X86_FEATURE_POPCNT) : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT : [val] REG_IN (w)); diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index af7541c11821..8ace6559d399 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -168,13 +168,6 @@ void iosf_mbi_unblock_punit_i2c_access(void); int 
iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb); /** - * iosf_mbi_register_pmic_bus_access_notifier - Unregister PMIC bus notifier - * - * @nb: notifier_block to unregister - */ -int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb); - -/** * iosf_mbi_unregister_pmic_bus_access_notifier_unlocked - Unregister PMIC bus * notifier, unlocked * diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index abb8374c9ff7..9a9b21b78905 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -76,6 +76,28 @@ static __always_inline void native_local_irq_restore(unsigned long flags) #endif +#ifndef CONFIG_PARAVIRT +#ifndef __ASSEMBLY__ +/* + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +static __always_inline void arch_safe_halt(void) +{ + native_safe_halt(); +} + +/* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +static __always_inline void halt(void) +{ + native_halt(); +} +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_PARAVIRT */ + #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else @@ -98,24 +120,6 @@ static __always_inline void arch_local_irq_enable(void) } /* - * Used in the idle loop; sti takes one instruction cycle - * to complete: - */ -static __always_inline void arch_safe_halt(void) -{ - native_safe_halt(); -} - -/* - * Used when interrupts are already enabled or to - * shutdown the processor: - */ -static __always_inline void halt(void) -{ - native_halt(); -} - -/* * For spinlocks, etc: */ static __always_inline unsigned long arch_local_irq_save(void) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a884ab544335..3bdae454a959 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1472,8 +1472,13 @@ struct kvm_arch { struct once nx_once; #ifdef CONFIG_X86_64 - /* The number of TDP MMU pages across all roots. */ +#ifdef CONFIG_KVM_PROVE_MMU + /* + * The number of TDP MMU pages across all roots. Used only to sanity + * check that KVM isn't leaking TDP MMU pages. + */ atomic64_t tdp_mmu_pages; +#endif /* * List of struct kvm_mmu_pages being used as roots. diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 8a5cc8e70439..5c43f145454d 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -269,7 +269,7 @@ * typically has NO_MELTDOWN). * * While retbleed_untrain_ret() doesn't clobber anything but requires stack, - * entry_ibpb() will clobber AX, CX, DX. + * write_ibpb() will clobber AX, CX, DX. * * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point * where we have a stack but before any RET instruction. 
@@ -279,7 +279,7 @@ VALIDATE_UNRET_END CALL_UNTRAIN_RET ALTERNATIVE_2 "", \ - "call entry_ibpb", \ibpb_feature, \ + "call write_ibpb", \ibpb_feature, \ __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH #endif .endm @@ -368,7 +368,7 @@ extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); extern void entry_untrain_ret(void); -extern void entry_ibpb(void); +extern void write_ibpb(void); #ifdef CONFIG_X86_64 extern void clear_bhb_loop(void); @@ -514,11 +514,11 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) : "memory"); } -extern u64 x86_pred_cmd; - static inline void indirect_branch_prediction_barrier(void) { - alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_IBPB); + asm_inline volatile(ALTERNATIVE("", "call write_ibpb", X86_FEATURE_IBPB) + : ASM_CALL_CONSTRAINT + :: "rax", "rcx", "rdx", "memory"); } /* The Intel SPEC CTRL MSR base value cache */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index bed346bfac89..c4c23190925c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -102,6 +102,16 @@ static inline void notify_page_enc_status_changed(unsigned long pfn, PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc); } +static __always_inline void arch_safe_halt(void) +{ + PVOP_VCALL0(irq.safe_halt); +} + +static inline void halt(void) +{ + PVOP_VCALL0(irq.halt); +} + #ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { @@ -165,16 +175,6 @@ static inline void __write_cr4(unsigned long x) PVOP_VCALL1(cpu.write_cr4, x); } -static __always_inline void arch_safe_halt(void) -{ - PVOP_VCALL0(irq.safe_halt); -} - -static inline void halt(void) -{ - PVOP_VCALL0(irq.halt); -} - static inline u64 paravirt_read_msr(unsigned msr) { return PVOP_CALL1(u64, cpu.read_msr, msr); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 62912023b46f..631c306ce1ff 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -120,10 +120,9 @@ struct pv_irq_ops { struct paravirt_callee_save save_fl; struct paravirt_callee_save irq_disable; struct paravirt_callee_save irq_enable; - +#endif void (*safe_halt)(void); void (*halt)(void); -#endif } __no_randomize_layout; struct pv_mmu_ops { diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index daea94c2993c..4f84d421d1cf 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -40,7 +40,8 @@ static __always_inline unsigned long smap_save(void) unsigned long flags; asm volatile ("# smap_save\n\t" - ALTERNATIVE("", "pushf; pop %0; " "clac" "\n\t", + ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE + "", "pushf; pop %0; clac", X86_FEATURE_SMAP) : "=rm" (flags) : : "memory", "cc"); @@ -50,7 +51,8 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { asm volatile ("# smap_restore\n\t" - ALTERNATIVE("", "push %0; popf\n\t", + ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE + "", "push %0; popf", X86_FEATURE_SMAP) : : "g" (flags) : "memory", "cc"); } @@ -61,6 +63,11 @@ static __always_inline void smap_restore(unsigned long flags) #define ASM_STAC \ ALTERNATIVE("", "stac", X86_FEATURE_SMAP) +#define ASM_CLAC_UNSAFE \ + ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "clac", X86_FEATURE_SMAP) +#define ASM_STAC_UNSAFE \ + ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "stac", X86_FEATURE_SMAP) + #endif /* __ASSEMBLER__ */ #endif /* 
_ASM_X86_SMAP_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 65394aa9b49f..4a1922ec80cf 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -58,7 +58,7 @@ void tdx_get_ve_info(struct ve_info *ve); bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); -void tdx_safe_halt(void); +void tdx_halt(void); bool tdx_early_handle_ve(struct pt_regs *regs); @@ -72,7 +72,7 @@ void __init tdx_dump_td_ctls(u64 td_ctls); #else static inline void tdx_early_init(void) { }; -static inline void tdx_safe_halt(void) { }; +static inline void tdx_halt(void) { }; static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; } diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 97771b9d33af..59a62c3780a2 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -231,14 +231,12 @@ static __always_inline void __xen_stac(void) * Suppress objtool seeing the STAC/CLAC and getting confused about it * calling random code with AC=1. */ - asm volatile(ANNOTATE_IGNORE_ALTERNATIVE - ASM_STAC ::: "memory", "flags"); + asm volatile(ASM_STAC_UNSAFE ::: "memory", "flags"); } static __always_inline void __xen_clac(void) { - asm volatile(ANNOTATE_IGNORE_ALTERNATIVE - ASM_CLAC ::: "memory", "flags"); + asm volatile(ASM_CLAC_UNSAFE ::: "memory", "flags"); } static inline long diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index dae6a73be40e..9fa321a95eb3 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -23,6 +23,8 @@ #include <linux/serial_core.h> #include <linux/pgtable.h> +#include <xen/xen.h> + #include <asm/e820/api.h> #include <asm/irqdomain.h> #include <asm/pci_x86.h> @@ -1729,6 +1731,15 @@ int __init acpi_mps_check(void) { #if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) /* mptable code is not built-in*/ + + /* + * Xen disables ACPI in PV DomU guests but it still emulates APIC and + * supports SMP. Returning early here ensures that APIC is not disabled + * unnecessarily and the guest is not limited to a single vCPU. 
+ */ + if (xen_pv_domain() && !xen_initial_domain()) + return 0; + if (acpi_disabled || acpi_noirq) { pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n"); return 1; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 72fa4bb78f0a..fee42a73d64a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -799,7 +799,7 @@ int __init arch_early_irq_init(void) x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, NULL); BUG_ON(x86_vector_domain == NULL); - irq_set_default_host(x86_vector_domain); + irq_set_default_domain(x86_vector_domain); BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 79569f72b8ee..a839ff506f45 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -805,6 +805,7 @@ static void init_amd_bd(struct cpuinfo_x86 *c) static const struct x86_cpu_id erratum_1386_microcode[] = { X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e), X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052), + {} }; static void fix_erratum_1386(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 4386aa6c69e1..362602b705cc 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -59,7 +59,6 @@ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; -EXPORT_SYMBOL_GPL(x86_pred_cmd); static u64 __ro_after_init x86_arch_cap_msr; @@ -1142,7 +1141,7 @@ do_cmd_auto: setup_clear_cpu_cap(X86_FEATURE_RETHUNK); /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. */ @@ -1592,51 +1591,54 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) rrsba_disabled = true; } -static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode) { /* - * Similar to context switches, there are two types of RSB attacks - * after VM exit: + * WARNING! There are many subtleties to consider when changing *any* + * code related to RSB-related mitigations. Before doing so, carefully + * read the following document, and update if necessary: * - * 1) RSB underflow + * Documentation/admin-guide/hw-vuln/rsb.rst * - * 2) Poisoned RSB entry + * In an overly simplified nutshell: * - * When retpoline is enabled, both are mitigated by filling/clearing - * the RSB. + * - User->user RSB attacks are conditionally mitigated during + * context switches by cond_mitigation -> write_ibpb(). * - * When IBRS is enabled, while #1 would be mitigated by the IBRS branch - * prediction isolation protections, RSB still needs to be cleared - * because of #2. Note that SMEP provides no protection here, unlike - * user-space-poisoned RSB entries. + * - User->kernel and guest->host attacks are mitigated by eIBRS or + * RSB filling. * - * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB - * bug is present then a LITE version of RSB protection is required, - * just a single call needs to retire before a RET is executed. 
+ * Though, depending on config, note that other alternative + * mitigations may end up getting used instead, e.g., IBPB on + * entry/vmexit, call depth tracking, or return thunks. */ + switch (mode) { case SPECTRE_V2_NONE: - return; + break; - case SPECTRE_V2_EIBRS_LFENCE: case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS_RETPOLINE: if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); } - return; + break; - case SPECTRE_V2_EIBRS_RETPOLINE: case SPECTRE_V2_RETPOLINE: case SPECTRE_V2_LFENCE: case SPECTRE_V2_IBRS: + pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); - pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); - return; - } + break; - pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); - dump_stack(); + default: + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n"); + dump_stack(); + break; + } } /* @@ -1830,48 +1832,7 @@ static void __init spectre_v2_select_mitigation(void) spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); - /* - * If Spectre v2 protection has been enabled, fill the RSB during a - * context switch. In general there are two types of RSB attacks - * across context switches, for which the CALLs/RETs may be unbalanced. - * - * 1) RSB underflow - * - * Some Intel parts have "bottomless RSB". When the RSB is empty, - * speculated return targets may come from the branch predictor, - * which could have a user-poisoned BTB or BHB entry. - * - * AMD has it even worse: *all* returns are speculated from the BTB, - * regardless of the state of the RSB. - * - * When IBRS or eIBRS is enabled, the "user -> kernel" attack - * scenario is mitigated by the IBRS branch prediction isolation - * properties, so the RSB buffer filling wouldn't be necessary to - * protect against this type of attack. - * - * The "user -> user" attack scenario is mitigated by RSB filling. - * - * 2) Poisoned RSB entry - * - * If the 'next' in-kernel return stack is shorter than 'prev', - * 'next' could be tricked into speculating with a user-poisoned RSB - * entry. - * - * The "user -> kernel" attack scenario is mitigated by SMEP and - * eIBRS. - * - * The "user -> user" scenario, also known as SpectreBHB, requires - * RSB clearing. - * - * So to mitigate all cases, unconditionally fill RSB on context - * switches. - * - * FIXME: Is this pointless for retbleed-affected AMD? - */ - setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); - - spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + spectre_v2_select_rsb_mitigation(mode); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS @@ -2676,7 +2637,7 @@ static void __init srso_select_mitigation(void) setup_clear_cpu_cap(X86_FEATURE_RETHUNK); /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. 
*/ @@ -2701,7 +2662,7 @@ ibpb_on_vmexit: srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. */ diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 1f14c3308b6b..f6fd71b64b66 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1786,13 +1786,13 @@ void mce_timer_kick(bool storm) __this_cpu_write(mce_next_interval, check_interval * HZ); } -/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +/* Must not be called in IRQ context where timer_delete_sync() can deadlock */ static void mce_timer_delete_all(void) { int cpu; for_each_online_cpu(cpu) - del_timer_sync(&per_cpu(mce_timer, cpu)); + timer_delete_sync(&per_cpu(mce_timer, cpu)); } static void __mcheck_cpu_mce_banks_init(void) @@ -2820,7 +2820,7 @@ static int mce_cpu_pre_down(unsigned int cpu) struct timer_list *t = this_cpu_ptr(&mce_timer); mce_disable_cpu(); - del_timer_sync(t); + timer_delete_sync(t); mce_threshold_remove_device(cpu); mce_device_remove(cpu); return 0; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 93ec829015f1..cc4a54145c83 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3553,6 +3553,22 @@ static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) free_rmid(rgrp->closid, rgrp->mon.rmid); } +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(rdt_kn_name(kn), "mon_groups") && + strcmp(name, "mon_groups")); +} + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, const char *name, umode_t mode, enum rdt_group_type rtype, struct rdtgroup **r) @@ -3568,6 +3584,15 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_unlock; } + /* + * Check that the parent directory for a monitor group is a "mon_groups" + * directory. + */ + if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { + ret = -EPERM; + goto out_unlock; + } + if (rtype == RDTMON_GROUP && (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { @@ -3751,22 +3776,6 @@ out_unlock: return ret; } -/* - * We allow creating mon groups only with in a directory called "mon_groups" - * which is present in every ctrl_mon group. Check if this is a valid - * "mon_groups" directory. - * - * 1. The directory should be named "mon_groups". - * 2. The mon group itself should "not" be named "mon_groups". - * This makes sure "mon_groups" directory always has a ctrl_mon group - * as parent. 
- */ -static bool is_mon_groups(struct kernfs_node *kn, const char *name) -{ - return (!strcmp(rdt_kn_name(kn), "mon_groups") && - strcmp(name, "mon_groups")); -} - static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { @@ -3782,11 +3791,8 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); - /* - * If RDT monitoring is supported and the parent directory is a valid - * "mon_groups" directory, add a monitoring subdirectory. - */ - if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name)) + /* Else, attempt to add a monitoring subdirectory. */ + if (resctrl_arch_mon_capable()) return rdtgroup_mkdir_mon(parent_kn, name, mode); return -EPERM; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 57120f0749cc..9d8dd8deb2a7 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -753,22 +753,21 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) void __init e820__register_nosave_regions(unsigned long limit_pfn) { int i; - unsigned long pfn = 0; + u64 last_addr = 0; for (i = 0; i < e820_table->nr_entries; i++) { struct e820_entry *entry = &e820_table->entries[i]; - if (pfn < PFN_UP(entry->addr)) - register_nosave_region(pfn, PFN_UP(entry->addr)); - - pfn = PFN_DOWN(entry->addr + entry->size); - if (entry->type != E820_TYPE_RAM) - register_nosave_region(PFN_UP(entry->addr), pfn); + continue; - if (pfn >= limit_pfn) - break; + if (last_addr < entry->addr) + register_nosave_region(PFN_DOWN(last_addr), PFN_UP(entry->addr)); + + last_addr = entry->addr + entry->size; } + + register_nosave_region(PFN_DOWN(last_addr), limit_pfn); } #ifdef CONFIG_ACPI diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 611f27e3890c..3aad78bfcb26 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -389,10 +389,10 @@ static int __init setup_early_printk(char *buf) keep = (strstr(buf, "keep") != NULL); while (*buf != '\0') { - if (!strncmp(buf, "mmio", 4)) { - early_mmio_serial_init(buf + 4); + if (!strncmp(buf, "mmio32", 6)) { + buf += 6; + early_mmio_serial_init(buf); early_console_register(&early_serial_console, keep); - buf += 4; } if (!strncmp(buf, "serial", 6)) { buf += 6; @@ -407,9 +407,9 @@ static int __init setup_early_printk(char *buf) } #ifdef CONFIG_PCI if (!strncmp(buf, "pciserial", 9)) { - early_pci_serial_init(buf + 9); + buf += 9; /* Keep from match the above "pciserial" */ + early_pci_serial_init(buf); early_console_register(&early_serial_console, keep); - buf += 9; /* Keep from match the above "serial" */ } #endif if (!strncmp(buf, "vga", 3) && diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 97925632c28e..1ccd05d8999f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -75,6 +75,11 @@ void paravirt_set_sched_clock(u64 (*func)(void)) static_call_update(pv_sched_clock, func); } +static noinstr void pv_native_safe_halt(void) +{ + native_safe_halt(); +} + #ifdef CONFIG_PARAVIRT_XXL static noinstr void pv_native_write_cr2(unsigned long val) { @@ -100,11 +105,6 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val) { native_set_debugreg(regno, val); } - -static noinstr void pv_native_safe_halt(void) -{ - native_safe_halt(); -} #endif struct pv_info pv_info = { @@ -161,9 +161,11 @@ struct paravirt_patch_template pv_ops = { .irq.save_fl = 
__PV_IS_CALLEE_SAVE(pv_native_save_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), +#endif /* CONFIG_PARAVIRT_XXL */ + + /* Irq HLT ops. */ .irq.safe_halt = pv_native_safe_halt, .irq.halt = native_halt, -#endif /* CONFIG_PARAVIRT_XXL */ /* Mmu ops. */ .mmu.flush_tlb_user = native_flush_tlb_local, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 91f6ff618852..962c3ce39323 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -939,7 +939,7 @@ void __init select_idle_routine(void) static_call_update(x86_idle, mwait_idle); } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { pr_info("using TDX aware idle routine\n"); - static_call_update(x86_idle, tdx_safe_halt); + static_call_update(x86_idle, tdx_halt); } else { static_call_update(x86_idle, default_idle); } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5e4d4934c0d3..571c906ffcbf 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1427,8 +1427,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xa: { /* Architectural Performance Monitoring */ - union cpuid10_eax eax; - union cpuid10_edx edx; + union cpuid10_eax eax = { }; + union cpuid10_edx edx = { }; if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; @@ -1444,8 +1444,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; - edx.split.reserved1 = 0; - edx.split.reserved2 = 0; entry->eax = eax.full; entry->ebx = kvm_pmu_cap.events_mask; @@ -1763,7 +1761,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; /* AMD Extended Performance Monitoring and Debug */ case 0x80000022: { - union cpuid_0x80000022_ebx ebx; + union cpuid_0x80000022_ebx ebx = { }; entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7cc0564f5f97..21a3b8166242 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -40,7 +40,9 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS); kvm_tdp_mmu_zap_invalidated_roots(kvm, false); - WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#ifdef CONFIG_KVM_PROVE_MMU + KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#endif WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -325,13 +327,17 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_inc(&kvm->arch.tdp_mmu_pages); +#endif } static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, -1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_dec(&kvm->arch.tdp_mmu_pages); +#endif } /** diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index ec08fa3caf43..51116fe69a50 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -31,6 +31,8 @@ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); */ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); +#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING + static inline struct pi_desc *vcpu_to_pi_desc(struct 
kvm_vcpu *vcpu) { return &(to_vmx(vcpu)->pi_desc); @@ -89,9 +91,20 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * current pCPU if the task was migrated. */ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu); + + /* + * In addition to taking the wakeup lock for the regular/IRQ + * context, tell lockdep it is being taken for the "sched out" + * context as well. vCPU loads happens in task context, and + * this is taking the lock of the *previous* CPU, i.e. can race + * with both the scheduler and the wakeup handler. + */ + raw_spin_lock(spinlock); + spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_); list_del(&vmx->pi_wakeup_list); - raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + spin_release(&spinlock->dep_map, _RET_IP_); + raw_spin_unlock(spinlock); } dest = cpu_physical_id(cpu); @@ -148,11 +161,23 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; - unsigned long flags; - local_irq_save(flags); + lockdep_assert_irqs_disabled(); - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + /* + * Acquire the wakeup lock using the "sched out" context to workaround + * a lockdep false positive. When this is called, schedule() holds + * various per-CPU scheduler locks. When the wakeup handler runs, it + * holds this CPU's wakeup lock while calling try_to_wake_up(), which + * can eventually take the aforementioned scheduler locks, which causes + * lockdep to assume there is deadlock. + * + * Deadlock can't actually occur because IRQs are disabled for the + * entirety of the sched_out critical section, i.e. the wakeup handler + * can't run while the scheduler locks are held. 
+ */ + raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu), + PI_LOCK_SCHED_OUT); list_add_tail(&vmx->pi_wakeup_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); @@ -176,8 +201,6 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) */ if (pi_test_on(&new)) __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); - - local_irq_restore(flags); } static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c841817a914a..3712dde0bf9d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11786,6 +11786,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, if (kvm_mpx_supported()) kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + r = kvm_apic_accept_events(vcpu); if (r < 0) goto out; @@ -11799,6 +11801,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state = vcpu->arch.mp_state; out: + kvm_vcpu_srcu_read_unlock(vcpu); + if (kvm_mpx_supported()) kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index bd21e9c335ad..38b33cdd4232 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1553,7 +1553,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, kvm_vcpu_halt(vcpu); if (sched_poll.timeout) - del_timer(&vcpu->arch.xen.poll_timer); + timer_delete(&vcpu->arch.xen.poll_timer); kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } @@ -2308,7 +2308,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); - del_timer_sync(&vcpu->arch.xen.poll_timer); + timer_delete_sync(&vcpu->arch.xen.poll_timer); } void kvm_xen_init_vm(struct kvm *kvm) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index aa8c341b2441..06296eb69fd4 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -77,6 +77,24 @@ SYM_FUNC_START(rep_movs_alternative) _ASM_EXTABLE_UA( 0b, 1b) .Llarge_movsq: + /* Do the first possibly unaligned word */ +0: movq (%rsi),%rax +1: movq %rax,(%rdi) + + _ASM_EXTABLE_UA( 0b, .Lcopy_user_tail) + _ASM_EXTABLE_UA( 1b, .Lcopy_user_tail) + + /* What would be the offset to the aligned destination? */ + leaq 8(%rdi),%rax + andq $-8,%rax + subq %rdi,%rax + + /* .. and update pointers and count to match */ + addq %rax,%rdi + addq %rax,%rsi + subq %rax,%rcx + + /* make %rcx contain the number of words, %rax the remainder */ movq %rcx,%rax shrq $3,%rcx andl $7,%eax diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 36a017b4a30d..7c4f6f591f2b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -959,9 +959,18 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, ret = __add_pages(nid, start_pfn, nr_pages, params); WARN_ON_ONCE(ret); - /* update max_pfn, max_low_pfn and high_memory */ - update_end_of_memory_vars(start_pfn << PAGE_SHIFT, - nr_pages << PAGE_SHIFT); + /* + * Special case: add_pages() is called by memremap_pages() for adding device + * private pages. Do not bump up max_pfn in the device private path, + * because max_pfn changes affect dma_addressing_limited(). 
+ * + * dma_addressing_limited() returning true when max_pfn is the device's + * addressable memory can force device drivers to use bounce buffers + * and impact their performance negatively: + */ + if (!params->pgmap) + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT); return ret; } diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 72405d315b41..def3d9284254 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2274,6 +2274,7 @@ int set_mce_nospec(unsigned long pfn) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); return rc; } +EXPORT_SYMBOL_GPL(set_mce_nospec); /* Restore full speculative operation to the pfn. */ int clear_mce_nospec(unsigned long pfn) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index cec321fb74f2..a05fcddfc811 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -20,7 +20,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { paravirt_release_pte(page_to_pfn(pte)); - tlb_remove_table(tlb, page_ptdesc(pte)); + tlb_remove_ptdesc(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -34,21 +34,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - tlb_remove_table(tlb, virt_to_ptdesc(pmd)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - tlb_remove_table(tlb, virt_to_ptdesc(pud)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - tlb_remove_table(tlb, virt_to_ptdesc(p4d)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e459d97ef397..eb83348f9305 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -667,9 +667,9 @@ static void cond_mitigation(struct task_struct *next) prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec); /* - * Avoid user/user BTB poisoning by flushing the branch predictor - * when switching between processes. This stops one process from - * doing Spectre-v2 attacks on another. + * Avoid user->user BTB/RSB poisoning by flushing them when switching + * between processes. This stops one process from doing Spectre-v2 + * attacks on another. 
* * Both, the conditional and the always IBPB mode use the mm * pointer to avoid the IBPB when switching between tasks of the diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c index c81cea208c2c..40ae94db20d8 100644 --- a/arch/x86/platform/intel/iosf_mbi.c +++ b/arch/x86/platform/intel/iosf_mbi.c @@ -422,19 +422,6 @@ int iosf_mbi_unregister_pmic_bus_access_notifier_unlocked( } EXPORT_SYMBOL(iosf_mbi_unregister_pmic_bus_access_notifier_unlocked); -int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb) -{ - int ret; - - /* Wait for the bus to go inactive before unregistering */ - iosf_mbi_punit_acquire(); - ret = iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(nb); - iosf_mbi_punit_release(); - - return ret; -} -EXPORT_SYMBOL(iosf_mbi_unregister_pmic_bus_access_notifier); - void iosf_mbi_assert_punit_acquired(void) { WARN_ON(iosf_mbi_pmic_punit_access_count == 0); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 63230ff8cf4f..08e76a5ca155 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -27,6 +27,7 @@ #include <asm/mmu_context.h> #include <asm/cpu_device_id.h> #include <asm/microcode.h> +#include <asm/fred.h> #ifdef CONFIG_X86_32 __visible unsigned long saved_context_ebx; @@ -231,6 +232,19 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) */ #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); + + /* + * Reinitialize FRED to ensure the FRED MSRs contain the same values + * as before hibernation. + * + * Note, the setup of FRED RSPs requires access to percpu data + * structures. Therefore, FRED reinitialization can only occur after + * the percpu access pointer (i.e., MSR_GS_BASE) is restored. + */ + if (ctxt->cr4 & X86_CR4_FRED) { + cpu_init_fred_exceptions(); + cpu_init_fred_rsps(); + } #else loadsegment(fs, __KERNEL_PERCPU); #endif diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 8c534c36adfa..66f066b8feda 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -26,7 +26,7 @@ /* code below belongs to the image kernel */ .align PAGE_SIZE SYM_FUNC_START(restore_registers) - ANNOTATE_NOENDBR + ENDBR /* go back to the original page tables */ movq %r9, %cr3 @@ -120,7 +120,7 @@ SYM_FUNC_END(restore_image) /* code below has been relocated to a safe page */ SYM_FUNC_START(core_restore_code) - ANNOTATE_NOENDBR + ENDBR /* switch to temporary page tables */ movq %rax, %cr3 /* flush TLB */ diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c index 6c2986d2ad11..08cd913cbd4e 100644 --- a/arch/x86/tools/insn_decoder_test.c +++ b/arch/x86/tools/insn_decoder_test.c @@ -12,8 +12,6 @@ #include <stdarg.h> #include <linux/kallsyms.h> -#define unlikely(cond) (cond) - #include <asm/insn.h> #include <inat.c> #include <insn.c> diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 4da336965698..b51aefd6ec2b 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -12,9 +12,9 @@ */ #ifdef CONFIG_X86_32 -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#define mb() alternative("lock addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock addl $0,0(%%esp)", 
"sfence", X86_FEATURE_XMM) #else /* CONFIG_X86_32 */ diff --git a/arch/x86/um/asm/module.h b/arch/x86/um/asm/module.h deleted file mode 100644 index a3b061d66082..000000000000 --- a/arch/x86/um/asm/module.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __UM_MODULE_H -#define __UM_MODULE_H - -/* UML is simple */ -struct mod_arch_specific -{ -}; - -#ifdef CONFIG_X86_32 - -#define Elf_Shdr Elf32_Shdr -#define Elf_Sym Elf32_Sym -#define Elf_Ehdr Elf32_Ehdr - -#else - -#define Elf_Shdr Elf64_Shdr -#define Elf_Sym Elf64_Sym -#define Elf_Ehdr Elf64_Ehdr - -#endif - -#endif diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c index e80ab7d28117..37decaa74761 100644 --- a/arch/x86/um/os-Linux/mcontext.c +++ b/arch/x86/um/os-Linux/mcontext.c @@ -4,6 +4,7 @@ #include <asm/ptrace.h> #include <sysdep/ptrace.h> #include <sysdep/mcontext.h> +#include <arch.h> void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) { @@ -27,7 +28,17 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) COPY(RIP); COPY2(EFLAGS, EFL); COPY2(CS, CSGSFS); - regs->gp[CS / sizeof(unsigned long)] &= 0xffff; - regs->gp[CS / sizeof(unsigned long)] |= 3; + regs->gp[SS / sizeof(unsigned long)] = mc->gregs[REG_CSGSFS] >> 48; +#endif +} + +void mc_set_rip(void *_mc, void *target) +{ + mcontext_t *mc = _mc; + +#ifdef __i386__ + mc->gregs[REG_EIP] = (unsigned long)target; +#else + mc->gregs[REG_RIP] = (unsigned long)target; #endif } diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h index b6f2437ec29c..ab5c8e47049c 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_32.h +++ b/arch/x86/um/shared/sysdep/faultinfo_32.h @@ -29,4 +29,16 @@ struct faultinfo { #define PTRACE_FULL_FAULTINFO 0 +#define ___backtrack_faulted(_faulted) \ + asm volatile ( \ + "mov $0, %0\n" \ + "movl $__get_kernel_nofault_faulted_%=,%1\n" \ + "jmp _end_%=\n" \ + "__get_kernel_nofault_faulted_%=:\n" \ + "mov $1, %0;" \ + "_end_%=:" \ + : "=r" (_faulted), \ + "=m" (current->thread.segv_continue) :: \ + ) + #endif diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h index ee88f88974ea..26fb4835d3e9 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_64.h +++ b/arch/x86/um/shared/sysdep/faultinfo_64.h @@ -29,4 +29,16 @@ struct faultinfo { #define PTRACE_FULL_FAULTINFO 1 +#define ___backtrack_faulted(_faulted) \ + asm volatile ( \ + "mov $0, %0\n" \ + "movq $__get_kernel_nofault_faulted_%=,%1\n" \ + "jmp _end_%=\n" \ + "__get_kernel_nofault_faulted_%=:\n" \ + "mov $1, %0;" \ + "_end_%=:" \ + : "=r" (_faulted), \ + "=m" (current->thread.segv_continue) :: \ + ) + #endif diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index f238f7b33cdd..dc8dfb2abd80 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -12,33 +12,22 @@ static unsigned int __read_mostly vdso_enabled = 1; unsigned long um_vdso_addr; +static struct page *um_vdso; extern unsigned long task_size; extern char vdso_start[], vdso_end[]; -static struct page **vdsop; - static int __init init_vdso(void) { - struct page *um_vdso; - BUG_ON(vdso_end - vdso_start > PAGE_SIZE); um_vdso_addr = task_size - PAGE_SIZE; - vdsop = kmalloc(sizeof(struct page *), GFP_KERNEL); - if (!vdsop) - goto oom; - um_vdso = alloc_page(GFP_KERNEL); - if (!um_vdso) { - kfree(vdsop); - + if (!um_vdso) goto oom; - } copy_page(page_address(um_vdso), vdso_start); - *vdsop = um_vdso; return 0; @@ -56,6 +45,7 @@ int arch_setup_additional_pages(struct 
linux_binprm *bprm, int uses_interp) struct mm_struct *mm = current->mm; static struct vm_special_mapping vdso_mapping = { .name = "[vdso]", + .pages = &um_vdso, }; if (!vdso_enabled) @@ -64,7 +54,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (mmap_write_lock_killable(mm)) return -EINTR; - vdso_mapping.pages = vdsop; vma = _install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 43dcd8c7badc..1b7710bd0d05 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -70,6 +70,9 @@ EXPORT_SYMBOL(xen_start_flags); */ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; +/* Number of pages released from the initial allocation. */ +unsigned long xen_released_pages; + static __ref void xen_get_vendor(void) { init_cpu_devs(); @@ -466,6 +469,13 @@ int __init arch_xen_unpopulated_init(struct resource **res) xen_free_unpopulated_pages(1, &pg); } + /* + * Account for the region being in the physmap but unpopulated. + * The value in xen_released_pages is used by the balloon + * driver to know how much of the physmap is unpopulated and + * set an accurate initial memory target. + */ + xen_released_pages += xen_extra_mem[i].n_pfns; /* Zero so region is not also added to the balloon driver. */ xen_extra_mem[i].n_pfns = 0; } diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 0e3d930bcb89..9d25d9373945 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/acpi.h> +#include <linux/cpufreq.h> +#include <linux/cpuidle.h> #include <linux/export.h> #include <linux/mm.h> @@ -123,8 +125,23 @@ static void __init pvh_arch_setup(void) { pvh_reserve_extra_memory(); - if (xen_initial_domain()) + if (xen_initial_domain()) { xen_add_preferred_consoles(); + + /* + * Disable usage of CPU idle and frequency drivers: when + * running as hardware domain the exposed native ACPI tables + * causes idle and/or frequency drivers to attach and + * malfunction. It's Xen the entity that controls the idle and + * frequency states. + * + * For unprivileged domains the exposed ACPI tables are + * fabricated and don't contain such data. + */ + disable_cpuidle(); + disable_cpufreq(); + WARN_ON(xen_set_default_idle()); + } } void __init xen_pvh_init(struct boot_params *boot_params) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index c3db71d96c43..3823e52aef52 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -37,9 +37,6 @@ #define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) -/* Number of pages released from the initial allocation. */ -unsigned long xen_released_pages; - /* Memory map would allow PCI passthrough. */ bool xen_pv_pci_possible; diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 109af12f7647..461bb1526502 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -226,9 +226,7 @@ SYM_CODE_END(xen_early_idt_handler_array) push %rax mov $__HYPERVISOR_iret, %eax syscall /* Do the IRET. */ -#ifdef CONFIG_MITIGATION_SLS - int3 -#endif + ud2 /* The SYSCALL should never return. */ .endm SYM_CODE_START(xen_iret) |