Diffstat (limited to 'arch/x86')
70 files changed, 960 insertions, 582 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 66bfabae8814..9b2a598cce8d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,7 +28,6 @@ config X86_64 select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK - select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE @@ -118,6 +117,7 @@ config X86 select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS @@ -1534,6 +1534,7 @@ config NUMA depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP) default y if X86_BIGSMP select USE_PERCPU_NUMA_NODE_ID + select OF_NUMA if OF help Enable NUMA (Non-Uniform Memory Access) support. diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index dc8c876fbd8f..80d76aea1f7b 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -103,6 +103,16 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, return ES_OK; } +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + return ES_OK; +} + +static bool fault_in_kernel_space(unsigned long address) +{ + return false; +} + #undef __init #define __init diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 2d0b1bd866ea..31c48bc2c3d8 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -457,3 +457,6 @@ 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node 451 i386 cachestat sys_cachestat 452 i386 fchmodat2 sys_fchmodat2 +454 i386 futex_wake sys_futex_wake +455 i386 futex_wait sys_futex_wait +456 i386 futex_requeue sys_futex_requeue diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 1d6eee30eceb..a577bb27c16d 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -375,6 +375,9 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 453 64 map_shadow_stack sys_map_shadow_stack +454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 6a1821bd7d5e..83c0afb7c741 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -42,7 +42,8 @@ vdso_img-$(VDSO64-y) += 64 vdso_img-$(VDSOX32-y) += x32 vdso_img-$(VDSO32-y) += 32 -obj-$(VDSO32-y) += vdso32-setup.o +obj-$(VDSO32-y) += vdso32-setup.o +OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F) diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index 76b1f8bb0fd5..dab4ed199227 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <asm/insn.h> +#include <linux/mm.h> #include "perf_event.h" @@ -132,9 +133,9 @@ static int get_branch_type(unsigned long from, unsigned long to, int abort, * The LBR logs any address in the IP, even if the IP just * faulted. This means userspace can control the from address. * Ensure we don't blindly read any address by validating it is - * a known text address. 
+ * a known text address and not a vsyscall address. */ - if (kernel_text_address(from)) { + if (kernel_text_address(from) && !in_gate_area_no_mm(from)) { addr = (void *)from; /* * Assume we can get the maximum possible size diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 783ed339f341..21556ad87f4b 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -7,6 +7,8 @@ * Author : K. Y. Srinivasan <kys@microsoft.com> */ +#define pr_fmt(fmt) "Hyper-V: " fmt + #include <linux/efi.h> #include <linux/types.h> #include <linux/bitfield.h> @@ -191,7 +193,7 @@ void set_hv_tscchange_cb(void (*cb)(void)) struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; if (!hv_reenlightenment_available()) { - pr_warn("Hyper-V: reenlightenment support is unavailable\n"); + pr_warn("reenlightenment support is unavailable\n"); return; } @@ -394,6 +396,7 @@ static void __init hv_get_partition_id(void) local_irq_restore(flags); } +#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) static u8 __init get_vtl(void) { u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; @@ -416,13 +419,16 @@ static u8 __init get_vtl(void) if (hv_result_success(ret)) { ret = output->as64.low & HV_X64_VTL_MASK; } else { - pr_err("Failed to get VTL(%lld) and set VTL to zero by default.\n", ret); - ret = 0; + pr_err("Failed to get VTL(error: %lld) exiting...\n", ret); + BUG(); } local_irq_restore(flags); return ret; } +#else +static inline u8 get_vtl(void) { return 0; } +#endif /* * This function is to be invoked early in the boot sequence after the @@ -564,7 +570,7 @@ skip_hypercall_pg_init: if (cpu_feature_enabled(X86_FEATURE_IBT) && *(u32 *)hv_hypercall_pg != gen_endbr()) { setup_clear_cpu_cap(X86_FEATURE_IBT); - pr_warn("Hyper-V: Disabling IBT because of Hyper-V bug\n"); + pr_warn("Disabling IBT because of Hyper-V bug\n"); } #endif @@ -604,8 +610,10 @@ skip_hypercall_pg_init: hv_query_ext_cap(0); /* Find the VTL */ - if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) - ms_hyperv.vtl = get_vtl(); + ms_hyperv.vtl = get_vtl(); + + if (ms_hyperv.vtl > 0) /* non default VTL */ + hv_vtl_early_init(); return; diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 36a562218010..999f5ac82fe9 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -215,7 +215,7 @@ static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip) return hv_vtl_bringup_vcpu(vp_id, start_eip); } -static int __init hv_vtl_early_init(void) +int __init hv_vtl_early_init(void) { /* * `boot_cpu_has` returns the runtime feature support, @@ -230,4 +230,3 @@ static int __init hv_vtl_early_init(void) return 0; } -early_initcall(hv_vtl_early_init); diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 9c4da699e11a..65f79092c9d9 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -58,7 +58,7 @@ #define ANNOTATE_IGNORE_ALTERNATIVE \ "999:\n\t" \ ".pushsection .discard.ignore_alts\n\t" \ - ".long 999b - .\n\t" \ + ".long 999b\n\t" \ ".popsection\n\t" /* @@ -352,7 +352,7 @@ static inline int alternatives_text_reserved(void *start, void *end) .macro ANNOTATE_IGNORE_ALTERNATIVE .Lannotate_\@: .pushsection .discard.ignore_alts - .long .Lannotate_\@ - . 
+ .long .Lannotate_\@ .popsection .endm diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d53636506134..5612648b0202 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -221,12 +221,18 @@ extern void __add_wrong_size(void) #define __try_cmpxchg(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) +#define __sync_try_cmpxchg(ptr, pold, new, size) \ + __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock; ") + #define __try_cmpxchg_local(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), "") #define arch_try_cmpxchg(ptr, pold, new) \ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) +#define arch_sync_try_cmpxchg(ptr, pold, new) \ + __sync_try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) + #define arch_try_cmpxchg_local(ptr, pold, new) \ __try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr))) diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 3a233ebff712..25050d953eee 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -28,8 +28,6 @@ struct x86_cpu { }; #ifdef CONFIG_HOTPLUG_CPU -extern int arch_register_cpu(int num); -extern void arch_unregister_cpu(int); extern void soft_restart_cpu(void); #endif diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 31089b851c4f..a2be3aefff9f 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -157,7 +157,8 @@ static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { static inline void fpu_sync_guest_vmexit_xfd_state(void) { } #endif -extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); +extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u64 xfeatures, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); static inline void fpstate_set_confidential(struct fpu_guest *gfpu) diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 637fa1df3512..c715097e92fd 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -69,6 +69,8 @@ struct legacy_pic { void (*make_irq)(unsigned int irq); }; +void legacy_pic_pcat_compat(void); + extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 5fcd85fd64fd..197316121f04 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -27,6 +27,7 @@ * _X - regular server parts * _D - micro server parts * _N,_P - other mobile parts + * _H - premium mobile parts * _S - other client parts * * Historical OPTDIFFs: @@ -124,6 +125,7 @@ #define INTEL_FAM6_METEORLAKE 0xAC #define INTEL_FAM6_METEORLAKE_L 0xAA +#define INTEL_FAM6_ARROWLAKE_H 0xC5 #define INTEL_FAM6_ARROWLAKE 0xC6 #define INTEL_FAM6_LUNARLAKE_M 0xBD diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 17715cb8731d..70d139406bc8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -528,7 +528,6 @@ struct kvm_pmu { u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; - struct irq_work irq_work; /* * Overlay the bitmap with a 64-bit atomic so that all bits can be diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h 
index 635132a12778..73dba8b94443 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -135,28 +135,27 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new) #define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) /** - * local_add_unless - add unless the number is a given value + * local_add_unless - add unless the number is already a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * - * Atomically adds @a to @l, so long as it was not @u. - * Returns non-zero if @l was not @u, and zero otherwise. + * Atomically adds @a to @l, if @v was not already @u. + * Returns true if the addition was done. */ -#define local_add_unless(l, a, u) \ -({ \ - long c, old; \ - c = local_read((l)); \ - for (;;) { \ - if (unlikely(c == (u))) \ - break; \ - old = local_cmpxchg((l), c, c + (a)); \ - if (likely(old == c)) \ - break; \ - c = old; \ - } \ - c != (u); \ -}) +static __always_inline bool +local_add_unless(local_t *l, long a, long u) +{ + long c = local_read(l); + + do { + if (unlikely(c == u)) + return false; + } while (!local_try_cmpxchg(l, &c, c + a)); + + return true; +} + #define local_inc_not_zero(l) local_add_unless((l), 1, 0) /* On x86_32, these are no better than the atomic variants. diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 180b1cbfcc4e..6de6e1d95952 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -245,7 +245,7 @@ static inline void cmci_recheck(void) {} int mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); bool mce_is_correctable(struct mce *m); -int mce_usable_address(struct mce *m); +bool mce_usable_address(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 033b53f993c6..896445edc6a8 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -340,8 +340,10 @@ static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; } #ifdef CONFIG_HYPERV_VTL_MODE void __init hv_vtl_init_platform(void); +int __init hv_vtl_early_init(void); #else static inline void __init hv_vtl_init_platform(void) {} +static inline int __init hv_vtl_early_init(void) { return 0; } #endif #include <asm-generic/mshyperv.h> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f8b502867dd1..e3fa9cecd599 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1121,12 +1121,16 @@ #define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F -/* AMD-V MSRs */ +/* AMD-V MSRs */ #define MSR_VM_CR 0xc0010114 #define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_HSAVE_PA 0xc0010117 +#define SVM_VM_CR_VALID_MASK 0x001fULL +#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL +#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL + /* Hardware Feedback Interface */ #define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 #define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c55cc243592e..f93e9b96927a 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -196,7 +196,7 @@ .macro ANNOTATE_RETPOLINE_SAFE .Lhere_\@: .pushsection .discard.retpoline_safe - .long .Lhere_\@ - . 
+ .long .Lhere_\@ .popsection .endm @@ -271,7 +271,7 @@ .Lskip_rsb_\@: .endm -#ifdef CONFIG_CPU_UNRET_ENTRY +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) #define CALL_UNTRAIN_RET "call entry_untrain_ret" #else #define CALL_UNTRAIN_RET "" @@ -288,38 +288,24 @@ * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point * where we have a stack but before any RET instruction. */ -.macro UNTRAIN_RET -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO) +.macro __UNTRAIN_RET ibpb_feature, call_depth_insns +#if defined(CONFIG_RETHUNK) || defined(CONFIG_CPU_IBPB_ENTRY) VALIDATE_UNRET_END ALTERNATIVE_3 "", \ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ - __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH + "call entry_ibpb", \ibpb_feature, \ + __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH #endif .endm -.macro UNTRAIN_RET_VM -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO) - VALIDATE_UNRET_END - ALTERNATIVE_3 "", \ - CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT, \ - __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH -#endif -.endm +#define UNTRAIN_RET \ + __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH) -.macro UNTRAIN_RET_FROM_CALL -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) - VALIDATE_UNRET_END - ALTERNATIVE_3 "", \ - CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ - __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH -#endif -.endm +#define UNTRAIN_RET_VM \ + __UNTRAIN_RET X86_FEATURE_IBPB_ON_VMEXIT, __stringify(RESET_CALL_DEPTH) + +#define UNTRAIN_RET_FROM_CALL \ + __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH_FROM_CALL) .macro CALL_DEPTH_ACCOUNT @@ -334,7 +320,7 @@ #define ANNOTATE_RETPOLINE_SAFE \ "999:\n\t" \ ".pushsection .discard.retpoline_safe\n\t" \ - ".long 999b - .\n\t" \ + ".long 999b\n\t" \ ".popsection\n\t" typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; @@ -348,13 +334,23 @@ extern void __x86_return_thunk(void); static inline void __x86_return_thunk(void) {} #endif +#ifdef CONFIG_CPU_UNRET_ENTRY extern void retbleed_return_thunk(void); +#else +static inline void retbleed_return_thunk(void) {} +#endif + +#ifdef CONFIG_CPU_SRSO extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); +#else +static inline void srso_return_thunk(void) {} +static inline void srso_alias_return_thunk(void) {} +#endif -extern void retbleed_untrain_ret(void); -extern void srso_untrain_ret(void); -extern void srso_alias_untrain_ret(void); +extern void retbleed_return_thunk(void); +extern void srso_return_thunk(void); +extern void srso_alias_return_thunk(void); extern void entry_untrain_ret(void); extern void entry_ibpb(void); @@ -362,12 +358,7 @@ extern void entry_ibpb(void); extern void (*x86_return_thunk)(void); #ifdef CONFIG_CALL_DEPTH_TRACKING -extern void __x86_return_skl(void); - -static inline void x86_set_skl_return_thunk(void) -{ - x86_return_thunk = &__x86_return_skl; -} +extern void call_depth_return_thunk(void); #define CALL_DEPTH_ACCOUNT \ ALTERNATIVE("", \ @@ -380,12 +371,12 @@ DECLARE_PER_CPU(u64, __x86_ret_count); DECLARE_PER_CPU(u64, __x86_stuffs_count); DECLARE_PER_CPU(u64, __x86_ctxsw_count); 
#endif -#else -static inline void x86_set_skl_return_thunk(void) {} +#else /* !CONFIG_CALL_DEPTH_TRACKING */ +static inline void call_depth_return_thunk(void) {} #define CALL_DEPTH_ACCOUNT "" -#endif +#endif /* CONFIG_CALL_DEPTH_TRACKING */ #ifdef CONFIG_RETPOLINE diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index b716d291d0d4..65dee2420624 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -31,6 +31,11 @@ static inline void x86_dtb_init(void) { } #define of_ioapic 0 #endif +#ifdef CONFIG_OF_EARLY_FLATTREE +void x86_flattree_get_config(void); +#else +static inline void x86_flattree_get_config(void) { } +#endif extern char cmd_line[COMMAND_LINE_SIZE]; #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index ad98dd1d9cfb..c31c633419fe 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -129,7 +129,6 @@ void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); -bool smp_park_other_cpus_in_init(void); void smp_store_cpu_info(int id); asmlinkage __visible void smp_reboot_interrupt(void); diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index cb0386fc4dc3..c648502e4535 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -4,6 +4,7 @@ #include <linux/thread_info.h> #include <asm/nospec-branch.h> +#include <asm/msr.h> /* * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR @@ -76,6 +77,16 @@ static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; } +/* + * This can be used in noinstr functions & should only be called in bare + * metal context. 
+ */ +static __always_inline void __update_spec_ctrl(u64 val) +{ + __this_cpu_write(x86_spec_ctrl_current, val); + native_wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + #ifdef CONFIG_SMP extern void speculative_store_bypass_ht_init(void); #else diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 19bf955b67e0..87a7b917d30e 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -229,10 +229,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) -#define SVM_VM_CR_VALID_MASK 0x001fULL -#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL -#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL - #define SVM_NESTED_CTL_NP_ENABLE BIT(0) #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) #define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) @@ -268,6 +264,7 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, AVIC_IPI_FAILURE_INVALID_TARGET, AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, + AVIC_IPI_FAILURE_INVALID_IPI_VECTOR, }; #define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0) @@ -571,8 +568,6 @@ struct vmcb { #define SVM_CPUID_FUNC 0x8000000a -#define SVM_VM_CR_SVM_DISABLE 4 - #define SVM_SELECTOR_S_SHIFT 4 #define SVM_SELECTOR_DPL_SHIFT 5 #define SVM_SELECTOR_P_SHIFT 7 diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8bae40a66282..5c367c1290c3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -496,7 +496,7 @@ copy_mc_to_kernel(void *to, const void *from, unsigned len); #define copy_mc_to_kernel copy_mc_to_kernel unsigned long __must_check -copy_mc_to_user(void *to, const void *from, unsigned len); +copy_mc_to_user(void __user *to, const void *from, unsigned len); #endif /* diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2a0ea38955df..c55c0ef47a18 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -148,6 +148,9 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) pr_debug("Local APIC address 0x%08x\n", madt->address); } + if (madt->flags & ACPI_MADT_PCAT_COMPAT) + legacy_pic_pcat_compat(); + /* ACPI 6.3 and newer support the online capable bit. */ if (acpi_gbl_FADT.header.revision > 6 || (acpi_gbl_FADT.header.revision == 6 && diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 517ee01503be..73be3931e4f0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -403,6 +403,17 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, u8 insn_buff[MAX_PATCH_LEN]; DPRINTK(ALT, "alt table %px, -> %px", start, end); + + /* + * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using + * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. + * During the process, KASAN becomes confused seeing partial LA57 + * conversion and triggers a false-positive out-of-bound report. + * + * Disable KASAN until the patching is complete. + */ + kasan_disable_current(); + /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. 
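The apply_alternatives() hunk above, together with the follow-up hunk below, brackets the alternatives-patching loop with kasan_disable_current()/kasan_enable_current() so KASAN does not trip over a half-patched X86_FEATURE_LA57 check while KASAN_SHADOW_START is being rewritten. A minimal sketch of that bracketing pattern, assuming a kernel context where <linux/kasan.h> is available; the wrapper name and callback are illustrative and not part of the patch:

	#include <linux/kasan.h>

	/*
	 * Illustrative only: run a patching callback with KASAN reporting
	 * suppressed for the current task, mirroring how apply_alternatives()
	 * disables KASAN around the LA57-sensitive patching loop.
	 */
	static void patch_with_kasan_quiesced(void (*do_patch)(void))
	{
		kasan_disable_current();	/* suppress reports (generic KASAN) */
		do_patch();			/* partial LA57 patching no longer triggers false positives */
		kasan_enable_current();		/* restore normal reporting */
	}
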
@@ -452,6 +463,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, text_poke_early(instr, insn_buff, insn_buff_sz); } + + kasan_enable_current(); } static inline bool is_jcc32(struct insn *insn) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 356de955e78d..43238ac2e845 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -27,6 +27,7 @@ #define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a #define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 #define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb +#define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8 #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec @@ -43,6 +44,7 @@ #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 #define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 +#define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c /* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); @@ -62,6 +64,7 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) }, {} }; @@ -93,6 +96,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) }, {} }; @@ -112,9 +116,13 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) }, {} }; diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index faa9f2299848..e9ad518a5003 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -48,11 +48,6 @@ EXPORT_SYMBOL_GPL(__x86_call_count); extern s32 __call_sites[], __call_sites_end[]; -struct thunk_desc { - void *template; - unsigned int template_size; -}; - struct core_text { unsigned long base; unsigned long end; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ece2b5b7b0fe..d97d473545a6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1014,7 +1014,6 @@ static bool cpu_has_zenbleed_microcode(void) default: return false; - break; } if (boot_cpu_data.microcode < good_rev) @@ -1044,6 +1043,8 @@ static void zenbleed_check(struct cpuinfo_x86 *c) static void init_amd(struct cpuinfo_x86 *c) { + u64 vm_cr; + early_init_amd(c); /* @@ -1095,6 +1096,14 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); + if (cpu_has(c, X86_FEATURE_SVM)) { + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { + pr_notice_once("SVM 
disabled (by BIOS) in MSR_VM_CR\n"); + clear_cpu_cap(c, X86_FEATURE_SVM); + } + } + if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) { /* * Use LFENCE for execution serialization. On families which diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 10499bcd4e39..bb0ab8466b91 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(x86_pred_cmd); static DEFINE_MUTEX(spec_ctrl_mutex); -void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; +void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) @@ -717,7 +717,7 @@ void update_gds_msr(void) case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: return; - }; + } wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); @@ -1019,7 +1019,6 @@ static void __init retbleed_select_mitigation(void) do_cmd_auto: case RETBLEED_CMD_AUTO: - default: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) @@ -1042,8 +1041,7 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); - if (IS_ENABLED(CONFIG_RETHUNK)) - x86_return_thunk = retbleed_return_thunk; + x86_return_thunk = retbleed_return_thunk; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -1061,7 +1059,8 @@ do_cmd_auto: case RETBLEED_MITIGATION_STUFF: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); - x86_set_skl_return_thunk(); + + x86_return_thunk = call_depth_return_thunk; break; default: @@ -1290,6 +1289,8 @@ spectre_v2_user_select_mitigation(void) spectre_v2_user_ibpb = mode; switch (cmd) { + case SPECTRE_V2_USER_CMD_NONE: + break; case SPECTRE_V2_USER_CMD_FORCE: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: @@ -1301,8 +1302,6 @@ spectre_v2_user_select_mitigation(void) case SPECTRE_V2_USER_CMD_SECCOMP: static_branch_enable(&switch_mm_cond_ibpb); break; - default: - break; } pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", @@ -2160,6 +2159,10 @@ static int l1d_flush_prctl_get(struct task_struct *task) static int ssb_prctl_get(struct task_struct *task) { switch (ssb_mode) { + case SPEC_STORE_BYPASS_NONE: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + return PR_SPEC_ENABLE; + return PR_SPEC_NOT_AFFECTED; case SPEC_STORE_BYPASS_DISABLE: return PR_SPEC_DISABLE; case SPEC_STORE_BYPASS_SECCOMP: @@ -2171,11 +2174,8 @@ static int ssb_prctl_get(struct task_struct *task) if (task_spec_ssb_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - default: - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - return PR_SPEC_ENABLE; - return PR_SPEC_NOT_AFFECTED; } + BUG(); } static int ib_prctl_get(struct task_struct *task) @@ -2353,6 +2353,8 @@ early_param("l1tf", l1tf_cmdline); enum srso_mitigation { SRSO_MITIGATION_NONE, + SRSO_MITIGATION_UCODE_NEEDED, + SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, SRSO_MITIGATION_MICROCODE, SRSO_MITIGATION_SAFE_RET, SRSO_MITIGATION_IBPB, @@ -2368,11 +2370,13 @@ enum srso_mitigation_cmd { }; static const char * const srso_strings[] = { - [SRSO_MITIGATION_NONE] = "Vulnerable", - [SRSO_MITIGATION_MICROCODE] = "Mitigation: microcode", - [SRSO_MITIGATION_SAFE_RET] = "Mitigation: safe RET", - [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", - 
[SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" + [SRSO_MITIGATION_NONE] = "Vulnerable", + [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode", + [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET", + [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET", + [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", + [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" }; static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; @@ -2406,34 +2410,44 @@ static void __init srso_select_mitigation(void) { bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); - if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) - goto pred_cmd; + if (cpu_mitigations_off()) + return; - if (!has_microcode) { - pr_warn("IBPB-extending microcode not applied!\n"); - pr_warn(SRSO_NOTICE); - } else { + if (!boot_cpu_has_bug(X86_BUG_SRSO)) { + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; + } + + if (has_microcode) { /* * Zen1/2 with SMT off aren't vulnerable after the right * IBPB microcode has been applied. + * + * Zen1/2 don't have SBPB, no need to try to enable it here. */ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); return; } - } - if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - if (has_microcode) { - pr_err("Retbleed IBPB mitigation enabled, using same for SRSO\n"); + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { srso_mitigation = SRSO_MITIGATION_IBPB; - goto pred_cmd; + goto out; } + } else { + pr_warn("IBPB-extending microcode not applied!\n"); + pr_warn(SRSO_NOTICE); + + /* may be overwritten by SRSO_CMD_SAFE_RET below */ + srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; } switch (srso_cmd) { case SRSO_CMD_OFF: - goto pred_cmd; + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; case SRSO_CMD_MICROCODE: if (has_microcode) { @@ -2458,10 +2472,12 @@ static void __init srso_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_SRSO); x86_return_thunk = srso_return_thunk; } - srso_mitigation = SRSO_MITIGATION_SAFE_RET; + if (has_microcode) + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + else + srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; } else { pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); - goto pred_cmd; } break; @@ -2473,7 +2489,6 @@ static void __init srso_select_mitigation(void) } } else { pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); - goto pred_cmd; } break; @@ -2485,20 +2500,12 @@ static void __init srso_select_mitigation(void) } } else { pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); - goto pred_cmd; } break; - - default: - break; } - pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode")); - -pred_cmd: - if ((boot_cpu_has(X86_FEATURE_SRSO_NO) || srso_cmd == SRSO_CMD_OFF) && - boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; +out: + pr_info("%s\n", srso_strings[srso_mitigation]); } #undef pr_fmt @@ -2704,9 +2711,7 @@ static ssize_t srso_show_state(char *buf) if (boot_cpu_has(X86_FEATURE_SRSO_NO)) return sysfs_emit(buf, "Mitigation: SMT disabled\n"); - return sysfs_emit(buf, "%s%s\n", - srso_strings[srso_mitigation], - boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ? 
"" : ", no microcode"); + return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]); } static ssize_t gds_show_state(char *buf) diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index defdc594be14..16f34639ecf7 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -290,6 +290,8 @@ static void early_init_hygon(struct cpuinfo_x86 *c) static void init_hygon(struct cpuinfo_x86 *c) { + u64 vm_cr; + early_init_hygon(c); /* @@ -320,6 +322,14 @@ static void init_hygon(struct cpuinfo_x86 *c) init_hygon_cacheinfo(c); + if (cpu_has(c, X86_FEATURE_SVM)) { + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { + pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); + clear_cpu_cap(c, X86_FEATURE_SVM); + } + } + if (cpu_has(c, X86_FEATURE_XMM2)) { /* * Use LFENCE for execution serialization. On families which diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index c267f43de39e..f3517b8a8e91 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -713,17 +713,75 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } -bool amd_mce_is_memory_error(struct mce *m) +/* + * DRAM ECC errors are reported in the Northbridge (bank 4) with + * Extended Error Code 8. + */ +static bool legacy_mce_is_memory_error(struct mce *m) +{ + return m->bank == 4 && XEC(m->status, 0x1f) == 8; +} + +/* + * DRAM ECC errors are reported in Unified Memory Controllers with + * Extended Error Code 0. + */ +static bool smca_mce_is_memory_error(struct mce *m) { enum smca_bank_types bank_type; - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; + + if (XEC(m->status, 0x3f)) + return false; bank_type = smca_get_bank_type(m->extcpu, m->bank); + + return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2; +} + +bool amd_mce_is_memory_error(struct mce *m) +{ if (mce_flags.smca) - return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0; + return smca_mce_is_memory_error(m); + else + return legacy_mce_is_memory_error(m); +} + +/* + * AMD systems do not have an explicit indicator that the value in MCA_ADDR is + * a system physical address. Therefore, individual cases need to be detected. + * Future cases and checks will be added as needed. + * + * 1) General case + * a) Assume address is not usable. + * 2) Poison errors + * a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy + * northbridge (bank 4). + * b) Refers to poison consumption in the core. Does not include "no action", + * "action optional", or "deferred" error severities. + * c) Will include a usable address so that immediate action can be taken. + * 3) Northbridge DRAM ECC errors + * a) Reported in legacy bank 4 with extended error code (XEC) 8. + * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore, + * this bit should not be checked. + * + * NOTE: SMCA UMC memory errors fall into case #1. + */ +bool amd_mce_usable_address(struct mce *m) +{ + /* Check special northbridge case 3) first. */ + if (!mce_flags.smca) { + if (legacy_mce_is_memory_error(m)) + return true; + else if (m->bank == 4) + return false; + } - return m->bank == 4 && xec == 0x8; + /* Check poison bit for all other bank types. */ + if (m->status & MCI_STATUS_POISON) + return true; + + /* Assume address is not usable for all others. 
*/ + return false; } static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6f35f724cc14..0214d4232346 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -453,32 +453,22 @@ static void mce_irq_work_cb(struct irq_work *entry) mce_schedule_work(); } -/* - * Check if the address reported by the CPU is in a format we can parse. - * It would be possible to add code for most other cases, but all would - * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granularity for now. - */ -int mce_usable_address(struct mce *m) +bool mce_usable_address(struct mce *m) { if (!(m->status & MCI_STATUS_ADDRV)) - return 0; - - /* Checks after this one are Intel/Zhaoxin-specific: */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && - boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) - return 1; - - if (!(m->status & MCI_STATUS_MISCV)) - return 0; + return false; - if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) - return 0; + switch (m->cpuvendor) { + case X86_VENDOR_AMD: + return amd_mce_usable_address(m); - if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) - return 0; + case X86_VENDOR_INTEL: + case X86_VENDOR_ZHAOXIN: + return intel_mce_usable_address(m); - return 1; + default: + return true; + } } EXPORT_SYMBOL_GPL(mce_usable_address); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f5323551c1a9..52bce533ddcc 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -536,3 +536,23 @@ bool intel_filter_mce(struct mce *m) return false; } + +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses up to page granularity for now. 
+ */ +bool intel_mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_MISCV)) + return false; + + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) + return false; + + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) + return false; + + return true; +} diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index bcf1b3c66c9c..e13a26c9c0ac 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -49,6 +49,7 @@ void intel_init_cmci(void); void intel_init_lmce(void); void intel_clear_lmce(void); bool intel_filter_mce(struct mce *m); +bool intel_mce_usable_address(struct mce *m); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } @@ -58,6 +59,7 @@ static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } static inline void intel_clear_lmce(void) { } static inline bool intel_filter_mce(struct mce *m) { return false; } +static inline bool intel_mce_usable_address(struct mce *m) { return false; } #endif void mce_timer_kick(unsigned long interval); @@ -210,6 +212,7 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); +bool amd_mce_usable_address(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -237,6 +240,7 @@ static __always_inline void smca_extract_err_addr(struct mce *m) #else static inline bool amd_filter_mce(struct mce *m) { return false; } +static inline bool amd_mce_usable_address(struct mce *m) { return false; } static inline void smca_extract_err_addr(struct mce *m) { } #endif diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 030d3b409768..19e0681f0435 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -152,6 +152,7 @@ static inline void cache_alloc_hsw_probe(void) r->cache.cbm_len = 20; r->cache.shareable_bits = 0xc0000; r->cache.min_cbm_bits = 2; + r->cache.arch_has_sparse_bitmasks = false; r->alloc_capable = true; rdt_alloc_capable = true; @@ -267,15 +268,18 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_1_eax eax; + union cpuid_0x10_x_ecx ecx; union cpuid_0x10_x_edx edx; - u32 ebx, ecx; + u32 ebx; - cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); + cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + r->cache.arch_has_sparse_bitmasks = ecx.split.noncont; r->alloc_capable = true; } @@ -872,7 +876,6 @@ static __init void rdt_init_res_defs_intel(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { - r->cache.arch_has_sparse_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; r->cache.min_cbm_bits = 1; } else if (r->rid == RDT_RESOURCE_MBA) { @@ -892,7 +895,7 @@ static __init void rdt_init_res_defs_amd(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { - r->cache.arch_has_sparse_bitmaps = true; + r->cache.arch_has_sparse_bitmasks = true; r->cache.arch_has_per_cpu_cfg = true; r->cache.min_cbm_bits = 0; } else if (r->rid == RDT_RESOURCE_MBA) { diff --git 
a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index b44c487727d4..beccb0e87ba7 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -87,10 +87,12 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, /* * Check whether a cache bit mask is valid. - * For Intel the SDM says: - * Please note that all (and only) contiguous '1' combinations - * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). - * Additionally Haswell requires at least two bits set. + * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: + * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 + * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 + * + * Haswell does not support a non-contiguous 1s value and additionally + * requires at least two bits set. * AMD allows non-contiguous bitmasks. */ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) @@ -113,8 +115,8 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - /* Are non-contiguous bitmaps allowed? */ - if (!r->cache.arch_has_sparse_bitmaps && + /* Are non-contiguous bitmasks allowed? */ + if (!r->cache.arch_has_sparse_bitmasks && (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); return false; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 85ceaf9a31ac..a4f1aa15f0a2 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -59,6 +59,7 @@ struct rdt_fs_context { bool enable_cdpl2; bool enable_cdpl3; bool enable_mba_mbps; + bool enable_debug; }; static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) @@ -243,18 +244,17 @@ struct rdtgroup { */ #define RFTYPE_INFO BIT(0) #define RFTYPE_BASE BIT(1) -#define RF_CTRLSHIFT 4 -#define RF_MONSHIFT 5 -#define RF_TOPSHIFT 6 -#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) -#define RFTYPE_MON BIT(RF_MONSHIFT) -#define RFTYPE_TOP BIT(RF_TOPSHIFT) +#define RFTYPE_CTRL BIT(4) +#define RFTYPE_MON BIT(5) +#define RFTYPE_TOP BIT(6) #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) -#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) -#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) -#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) -#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) +#define RFTYPE_DEBUG BIT(10) +#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) +#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) +#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) /* List of all resource groups */ extern struct list_head rdt_all_groups; @@ -270,7 +270,7 @@ void __exit rdtgroup_exit(void); * @mode: Access mode * @kf_ops: File operations * @flags: File specific RFTYPE_FLAGS_* flags - * @fflags: File specific RF_* or RFTYPE_* flags + * @fflags: File specific RFTYPE_* flags * @seq_show: Show content of the file * @write: Write to the file */ @@ -492,6 +492,15 @@ union cpuid_0x10_3_eax { unsigned int full; }; +/* CPUID.(EAX=10H, ECX=ResID).ECX */ +union cpuid_0x10_x_ecx { + struct { + unsigned int reserved:3; + unsigned int noncont:1; + } split; + unsigned int full; +}; + /* CPUID.(EAX=10H, ECX=ResID).EDX */ union cpuid_0x10_x_edx { struct { diff --git 
a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 725344048f85..69a1de92384a 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -54,8 +54,13 @@ static struct kernfs_node *kn_mondata; static struct seq_buf last_cmd_status; static char last_cmd_status_buf[512]; +static int rdtgroup_setup_root(struct rdt_fs_context *ctx); +static void rdtgroup_destroy_root(void); + struct dentry *debugfs_resctrl; +static bool resctrl_debug; + void rdt_last_cmd_clear(void) { lockdep_assert_held(&rdtgroup_mutex); @@ -696,11 +701,10 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdtgroup *rdtgrp; + char *pid_str; int ret = 0; pid_t pid; - if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) - return -EINVAL; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { rdtgroup_kn_unlock(of->kn); @@ -715,7 +719,27 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, goto unlock; } - ret = rdtgroup_move_task(pid, rdtgrp, of); + while (buf && buf[0] != '\0' && buf[0] != '\n') { + pid_str = strim(strsep(&buf, ",")); + + if (kstrtoint(pid_str, 0, &pid)) { + rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); + ret = -EINVAL; + break; + } + + if (pid < 0) { + rdt_last_cmd_printf("Invalid pid %d\n", pid); + ret = -EINVAL; + break; + } + + ret = rdtgroup_move_task(pid, rdtgrp, of); + if (ret) { + rdt_last_cmd_printf("Error while processing task %d\n", pid); + break; + } + } unlock: rdtgroup_kn_unlock(of->kn); @@ -755,6 +779,38 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } +static int rdtgroup_closid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->closid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static int rdtgroup_rmid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->mon.rmid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + #ifdef CONFIG_PROC_CPU_RESCTRL /* @@ -895,7 +951,7 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of, return 0; } -/** +/* * rdt_bit_usage_show - Display current usage of resources * * A domain is a shared resource that can now be allocated differently. Here @@ -1117,12 +1173,24 @@ static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) } } +static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); + + return 0; +} + /** * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other * @r: Resource to which domain instance @d belongs. * @d: The domain instance for which @closid is being tested. * @cbm: Capacity bitmask being tested. * @closid: Intended closid for @cbm. + * @type: CDP type of @r. 
* @exclusive: Only check if overlaps with exclusive resource groups * * Checks if provided @cbm intended to be used for @closid on domain @@ -1209,6 +1277,7 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, /** * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive + * @rdtgrp: Resource group identified through its closid. * * An exclusive resource group implies that there should be no sharing of * its allocated resources. At the time this group is considered to be @@ -1251,9 +1320,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) return true; } -/** +/* * rdtgroup_mode_write - Modify the resource group's mode - * */ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) @@ -1357,12 +1425,11 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, return size; } -/** +/* * rdtgroup_size_show - Display size in bytes of allocated regions * * The "size" file mirrors the layout of the "schemata" file, printing the * size in bytes of each region instead of the capacity bitmask. - * */ static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) @@ -1686,77 +1753,77 @@ static struct rftype res_common_files[] = { .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_last_cmd_status_show, - .fflags = RF_TOP_INFO, + .fflags = RFTYPE_TOP_INFO, }, { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, - .fflags = RF_CTRL_INFO, + .fflags = RFTYPE_CTRL_INFO, }, { .name = "mon_features", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_mon_features_show, - .fflags = RF_MON_INFO, + .fflags = RFTYPE_MON_INFO, }, { .name = "num_rmids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_rmids_show, - .fflags = RF_MON_INFO, + .fflags = RFTYPE_MON_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "shareable_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_shareable_bits_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "bit_usage", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bit_usage_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, /* * Platform specific which (if any) capabilities are provided by @@ -1775,7 +1842,7 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .write = max_threshold_occ_write, .seq_show = 
max_threshold_occ_show, - .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, }, { .name = "mbm_total_bytes_config", @@ -1817,12 +1884,19 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_BASE, }, { + .name = "mon_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_rmid_show, + .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, + }, + { .name = "schemata", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, .write = rdtgroup_schemata_write, .seq_show = rdtgroup_schemata_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, }, { .name = "mode", @@ -1830,14 +1904,28 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .write = rdtgroup_mode_write, .seq_show = rdtgroup_mode_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, }, { .name = "size", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdtgroup_size_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "sparse_masks", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_has_sparse_bitmasks_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "ctrl_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_closid_show, + .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, }, }; @@ -1852,6 +1940,9 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) lockdep_assert_held(&rdtgroup_mutex); + if (resctrl_debug) + fflags |= RFTYPE_DEBUG; + for (rft = rfts; rft < rfts + len; rft++) { if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { ret = rdtgroup_add_file(kn, rft); @@ -1894,7 +1985,7 @@ void __init thread_throttle_mode_init(void) if (!rft) return; - rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; + rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB; } void __init mbm_config_rftype_init(const char *config) @@ -1903,7 +1994,7 @@ void __init mbm_config_rftype_init(const char *config) rft = rdtgroup_get_rftype_by_name(config); if (rft) - rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE; + rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE; } /** @@ -2038,21 +2129,21 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) if (IS_ERR(kn_info)) return PTR_ERR(kn_info); - ret = rdtgroup_add_files(kn_info, RF_TOP_INFO); + ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); if (ret) goto out_destroy; /* loop over enabled controls, these are all alloc_capable */ list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - fflags = r->fflags | RF_CTRL_INFO; + fflags = r->fflags | RFTYPE_CTRL_INFO; ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); if (ret) goto out_destroy; } for_each_mon_capable_rdt_resource(r) { - fflags = r->fflags | RF_MON_INFO; + fflags = r->fflags | RFTYPE_MON_INFO; sprintf(name, "%s_MON", r->name); ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) @@ -2271,14 +2362,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) return 0; } -static void cdp_disable_all(void) -{ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); -} - /* * We don't allow rdtgroup directories to be created anywhere * except the root directory. 
Thus when looking for the rdtgroup @@ -2358,19 +2441,47 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, struct kernfs_node **mon_data_kn); +static void rdt_disable_ctx(void) +{ + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); + set_mba_sc(false); + + resctrl_debug = false; +} + static int rdt_enable_ctx(struct rdt_fs_context *ctx) { int ret = 0; - if (ctx->enable_cdpl2) + if (ctx->enable_cdpl2) { ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); + if (ret) + goto out_done; + } - if (!ret && ctx->enable_cdpl3) + if (ctx->enable_cdpl3) { ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); + if (ret) + goto out_cdpl2; + } - if (!ret && ctx->enable_mba_mbps) + if (ctx->enable_mba_mbps) { ret = set_mba_sc(true); + if (ret) + goto out_cdpl3; + } + + if (ctx->enable_debug) + resctrl_debug = true; + return 0; + +out_cdpl3: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); +out_cdpl2: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); +out_done: return ret; } @@ -2463,6 +2574,7 @@ static void schemata_list_destroy(void) static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); + unsigned long flags = RFTYPE_CTRL_BASE; struct rdt_domain *dom; struct rdt_resource *r; int ret; @@ -2477,18 +2589,31 @@ static int rdt_get_tree(struct fs_context *fc) goto out; } + ret = rdtgroup_setup_root(ctx); + if (ret) + goto out; + ret = rdt_enable_ctx(ctx); - if (ret < 0) - goto out_cdp; + if (ret) + goto out_root; ret = schemata_list_create(); if (ret) { schemata_list_destroy(); - goto out_mba; + goto out_ctx; } closid_init(); + if (rdt_mon_capable) + flags |= RFTYPE_MON; + + ret = rdtgroup_add_files(rdtgroup_default.kn, flags); + if (ret) + goto out_schemata_free; + + kernfs_activate(rdtgroup_default.kn); + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); if (ret < 0) goto out_schemata_free; @@ -2543,11 +2668,10 @@ out_info: kernfs_remove(kn_info); out_schemata_free: schemata_list_destroy(); -out_mba: - if (ctx->enable_mba_mbps) - set_mba_sc(false); -out_cdp: - cdp_disable_all(); +out_ctx: + rdt_disable_ctx(); +out_root: + rdtgroup_destroy_root(); out: rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); @@ -2559,6 +2683,7 @@ enum rdt_param { Opt_cdp, Opt_cdpl2, Opt_mba_mbps, + Opt_debug, nr__rdt_params }; @@ -2566,6 +2691,7 @@ static const struct fs_parameter_spec rdt_fs_parameters[] = { fsparam_flag("cdp", Opt_cdp), fsparam_flag("cdpl2", Opt_cdpl2), fsparam_flag("mba_MBps", Opt_mba_mbps), + fsparam_flag("debug", Opt_debug), {} }; @@ -2591,6 +2717,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; ctx->enable_mba_mbps = true; return 0; + case Opt_debug: + ctx->enable_debug = true; + return 0; } return -EINVAL; @@ -2618,7 +2747,6 @@ static int rdt_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; - ctx->kfc.root = rdt_root; ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; fc->fs_private = &ctx->kfc; fc->ops = &rdt_fs_context_ops; @@ -2779,16 +2907,16 @@ static void rdt_kill_sb(struct super_block *sb) cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - set_mba_sc(false); + rdt_disable_ctx(); /*Put everything back to default values. 
*/ for_each_alloc_capable_rdt_resource(r) reset_all_ctrls(r); - cdp_disable_all(); rmdir_all_sub(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; schemata_list_destroy(); + rdtgroup_destroy_root(); static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); static_branch_disable_cpuslocked(&rdt_enable_key); @@ -3170,8 +3298,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, enum rdt_group_type rtype, struct rdtgroup **r) { struct rdtgroup *prdtgrp, *rdtgrp; + unsigned long files = 0; struct kernfs_node *kn; - uint files = 0; int ret; prdtgrp = rdtgroup_kn_lock_live(parent_kn); @@ -3223,7 +3351,14 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + if (rtype == RDTCTRL_GROUP) { + files = RFTYPE_BASE | RFTYPE_CTRL; + if (rdt_mon_capable) + files |= RFTYPE_MON; + } else { + files = RFTYPE_BASE | RFTYPE_MON; + } + ret = rdtgroup_add_files(kn, files); if (ret) { rdt_last_cmd_puts("kernfs fill error\n"); @@ -3656,6 +3791,9 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) seq_puts(seq, ",mba_MBps"); + if (resctrl_debug) + seq_puts(seq, ",debug"); + return 0; } @@ -3666,10 +3804,8 @@ static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { .show_options = rdtgroup_show_options, }; -static int __init rdtgroup_setup_root(void) +static int rdtgroup_setup_root(struct rdt_fs_context *ctx) { - int ret; - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, @@ -3677,6 +3813,20 @@ static int __init rdtgroup_setup_root(void) if (IS_ERR(rdt_root)) return PTR_ERR(rdt_root); + ctx->kfc.root = rdt_root; + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); + + return 0; +} + +static void rdtgroup_destroy_root(void) +{ + kernfs_destroy_root(rdt_root); + rdtgroup_default.kn = NULL; +} + +static void __init rdtgroup_setup_default(void) +{ mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; @@ -3686,19 +3836,7 @@ static int __init rdtgroup_setup_root(void) list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE); - if (ret) { - kernfs_destroy_root(rdt_root); - goto out; - } - - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); - kernfs_activate(rdtgroup_default.kn); - -out: mutex_unlock(&rdtgroup_mutex); - - return ret; } static void domain_destroy_mon_state(struct rdt_domain *d) @@ -3820,13 +3958,11 @@ int __init rdtgroup_init(void) seq_buf_init(&last_cmd_status, last_cmd_status_buf, sizeof(last_cmd_status_buf)); - ret = rdtgroup_setup_root(); - if (ret) - return ret; + rdtgroup_setup_default(); ret = sysfs_create_mount_point(fs_kobj, "resctrl"); if (ret) - goto cleanup_root; + return ret; ret = register_filesystem(&rdt_fs_type); if (ret) @@ -3859,8 +3995,6 @@ int __init rdtgroup_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); -cleanup_root: - kernfs_destroy_root(rdt_root); return ret; } @@ -3870,5 +4004,4 @@ void __exit rdtgroup_exit(void) debugfs_remove_recursive(debugfs_resctrl); unregister_filesystem(&rdt_fs_type); sysfs_remove_mount_point(fs_kobj, "resctrl"); - kernfs_destroy_root(rdt_root); } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 87d38f17ff5c..afd09924094e 100644 --- a/arch/x86/kernel/devicetree.c +++ 
b/arch/x86/kernel/devicetree.c @@ -278,7 +278,7 @@ static void __init dtb_apic_setup(void) } #ifdef CONFIG_OF_EARLY_FLATTREE -static void __init x86_flattree_get_config(void) +void __init x86_flattree_get_config(void) { u32 size, map_len; void *dt; @@ -300,14 +300,10 @@ static void __init x86_flattree_get_config(void) unflatten_and_copy_device_tree(); early_memunmap(dt, map_len); } -#else -static inline void x86_flattree_get_config(void) { } #endif void __init x86_dtb_init(void) { - x86_flattree_get_config(); - if (!of_have_populated_dt()) return; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a86d37052a64..a21a4d0ecc34 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -369,14 +369,15 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, - unsigned int size, u32 pkru) + unsigned int size, u64 xfeatures, u32 pkru) { struct fpstate *kstate = gfpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { - __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE); + __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru, + XSTATE_COPY_XSAVE); } else { memcpy(&ustate->fxsave, &kstate->regs.fxsave, sizeof(ustate->fxsave)); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index cadf68737e6b..117e74c44e75 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1049,6 +1049,7 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @fpstate: The fpstate buffer from which to copy + * @xfeatures: The mask of xfeatures to save (XSAVE mode only) * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * @@ -1059,7 +1060,8 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * It supports partial copy but @to.pos always starts from zero. 
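A side note on the new @xfeatures parameter documented above: in XSTATE_COPY_XSAVE mode, the header bits that reach the UABI buffer become the intersection of what the saved state contains, what userspace may see for this fpstate, and the caller-supplied mask. A standalone sketch of that masking, with illustrative bit positions:

#include <stdint.h>
#include <stdio.h>

#define XFEATURE_MASK_FP	(1ull << 0)
#define XFEATURE_MASK_SSE	(1ull << 1)
#define XFEATURE_MASK_YMM	(1ull << 2)
#define XFEATURE_MASK_PKRU	(1ull << 9)

int main(void)
{
	/* What the saved buffer claims to contain. */
	uint64_t header_xfeatures = XFEATURE_MASK_FP | XFEATURE_MASK_SSE |
				    XFEATURE_MASK_YMM | XFEATURE_MASK_PKRU;
	/* What userspace may see for this fpstate. */
	uint64_t user_xfeatures = XFEATURE_MASK_FP | XFEATURE_MASK_SSE |
				  XFEATURE_MASK_YMM | XFEATURE_MASK_PKRU;
	/* New caller-supplied mask, e.g. only the features exposed to a guest. */
	uint64_t xfeatures = XFEATURE_MASK_FP | XFEATURE_MASK_SSE;

	/* Same intersection as XSTATE_COPY_XSAVE mode after this change. */
	header_xfeatures &= user_xfeatures & xfeatures;

	printf("resulting header.xfeatures = %#llx\n",
	       (unsigned long long)header_xfeatures);
	return 0;
}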
*/ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, - u32 pkru_val, enum xstate_copy_mode copy_mode) + u64 xfeatures, u32 pkru_val, + enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; @@ -1083,7 +1085,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, break; case XSTATE_COPY_XSAVE: - header.xfeatures &= fpstate->user_xfeatures; + header.xfeatures &= fpstate->user_xfeatures & xfeatures; break; } @@ -1185,6 +1187,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, + tsk->thread.fpu.fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } @@ -1536,10 +1539,7 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; - - if (!guest_fpu) - newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; - + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; newfps->xfd = curfps->xfd & ~xfeatures; /* Do the final updates within the locked region */ @@ -1736,7 +1736,6 @@ EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); /** * fpu_xstate_prctl - xstate permission operations - * @tsk: Redundant pointer to current * @option: A subfunction of arch_prctl() * @arg2: option argument * Return: 0 if successful; otherwise, an error code diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index a4ecb04d8d64..3518fb26d06b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -43,7 +43,8 @@ enum xstate_copy_mode { struct membuf; extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, - u32 pkru_val, enum xstate_copy_mode copy_mode); + u64 xfeatures, u32 pkru_val, + enum xstate_copy_mode copy_mode); extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode mode); extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 30a55207c000..c20d1832c481 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -32,6 +32,7 @@ */ static void init_8259A(int auto_eoi); +static bool pcat_compat __ro_after_init; static int i8259A_auto_eoi; DEFINE_RAW_SPINLOCK(i8259A_lock); @@ -299,15 +300,32 @@ static void unmask_8259A(void) static int probe_8259A(void) { + unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR); unsigned long flags; - unsigned char probe_val = ~(1 << PIC_CASCADE_IR); - unsigned char new_val; + + /* + * If MADT has the PCAT_COMPAT flag set, then do not bother probing + * for the PIC. Some BIOSes leave the PIC uninitialized and probing + * fails. + * + * Right now this causes problems as quite some code depends on + * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly + * when the system has an IO/APIC because then PIC is not required + * at all, except for really old machines where the timer interrupt + * must be routed through the PIC. So just pretend that the PIC is + * there and let legacy_pic->init() initialize it for nothing. + * + * Alternatively this could just try to initialize the PIC and + * repeat the probe, but for cases where there is no PIC that's + * just pointless. + */ + if (pcat_compat) + return nr_legacy_irqs(); + /* - * Check to see if we have a PIC. 
- * Mask all except the cascade and read - * back the value we just wrote. If we don't - * have a PIC, we will read 0xff as opposed to the - * value we wrote. + * Check to see if we have a PIC. Mask all except the cascade and + * read back the value we just wrote. If we don't have a PIC, we + * will read 0xff as opposed to the value we wrote. */ raw_spin_lock_irqsave(&i8259A_lock, flags); @@ -429,5 +447,9 @@ static int __init i8259A_init_ops(void) return 0; } - device_initcall(i8259A_init_ops); + +void __init legacy_pic_pcat_compat(void) +{ + pcat_compat = true; +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b098b1fa2470..3e05ecf8cf8d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1217,6 +1217,8 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); + x86_flattree_get_config(); + initmem_init(); dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index dcf325b7b022..ccb0915e84e1 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -632,6 +632,23 @@ fail: sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } +static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt, + unsigned long address, + bool write) +{ + if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_USER; + ctxt->fi.cr2 = address; + if (write) + ctxt->fi.error_code |= X86_PF_WRITE; + + return ES_EXCEPTION; + } + + return ES_OK; +} + static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, void *src, char *buf, unsigned int data_size, @@ -639,7 +656,12 @@ static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, bool backwards) { int i, b = backwards ? -1 : 1; - enum es_result ret = ES_OK; + unsigned long address = (unsigned long)src; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, false); + if (ret != ES_OK) + return ret; for (i = 0; i < count; i++) { void *s = src + (i * data_size * b); @@ -660,7 +682,12 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, bool backwards) { int i, s = backwards ? 
-1 : 1; - enum es_result ret = ES_OK; + unsigned long address = (unsigned long)dst; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, true); + if (ret != ES_OK) + return ret; for (i = 0; i < count; i++) { void *d = dst + (i * data_size * s); @@ -696,6 +723,9 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) { struct insn *insn = &ctxt->insn; + size_t size; + u64 port; + *exitinfo = 0; switch (insn->opcode.bytes[0]) { @@ -704,7 +734,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0x6d: *exitinfo |= IOIO_TYPE_INS; *exitinfo |= IOIO_SEG_ES; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; /* OUTS opcodes */ @@ -712,41 +742,43 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0x6f: *exitinfo |= IOIO_TYPE_OUTS; *exitinfo |= IOIO_SEG_DS; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; /* IN immediate opcodes */ case 0xe4: case 0xe5: *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (u8)insn->immediate.value << 16; + port = (u8)insn->immediate.value & 0xffff; break; /* OUT immediate opcodes */ case 0xe6: case 0xe7: *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (u8)insn->immediate.value << 16; + port = (u8)insn->immediate.value & 0xffff; break; /* IN register opcodes */ case 0xec: case 0xed: *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; /* OUT register opcodes */ case 0xee: case 0xef: *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; default: return ES_DECODE_FAILED; } + *exitinfo |= port << 16; + switch (insn->opcode.bytes[0]) { case 0x6c: case 0x6e: @@ -756,12 +788,15 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0xee: /* Single byte opcodes */ *exitinfo |= IOIO_DATA_8; + size = 1; break; default: /* Length determined by instruction parsing */ *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 : IOIO_DATA_32; + size = (insn->opnd_bytes == 2) ? 
2 : 4; } + switch (insn->addr_bytes) { case 2: *exitinfo |= IOIO_ADDR_16; @@ -777,7 +812,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) if (insn_has_rep_prefix(insn)) *exitinfo |= IOIO_REP; - return ES_OK; + return vc_ioio_check(ctxt, (u16)port, size); } static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index d8c1e3be74c0..6395bfd87b68 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -524,6 +524,33 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt return ES_OK; } +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + BUG_ON(size > 4); + + if (user_mode(ctxt->regs)) { + struct thread_struct *t = &current->thread; + struct io_bitmap *iobm = t->io_bitmap; + size_t idx; + + if (!iobm) + goto fault; + + for (idx = port; idx < port + size; ++idx) { + if (test_bit(idx, iobm->bitmap)) + goto fault; + } + } + + return ES_OK; + +fault: + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + + return ES_EXCEPTION; +} + /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" @@ -1508,6 +1535,9 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ES_DECODE_FAILED; } + if (user_mode(ctxt->regs)) + return ES_UNSUPPORTED; + switch (mmio) { case INSN_MMIO_WRITE: memcpy(ghcb->shared_buffer, reg_data, bytes); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 6eb06d001bcc..96a771f9f930 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -131,7 +131,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) } /* - * Disable virtualization, APIC etc. and park the CPU in a HLT loop + * this function calls the 'stop' function on all other CPUs in the system. */ DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { @@ -172,17 +172,13 @@ static void native_stop_other_cpus(int wait) * 2) Wait for all other CPUs to report that they reached the * HLT loop in stop_this_cpu() * - * 3) If the system uses INIT/STARTUP for CPU bringup, then - * send all present CPUs an INIT vector, which brings them - * completely out of the way. + * 3) If #2 timed out send an NMI to the CPUs which did not + * yet report * - * 4) If #3 is not possible and #2 timed out send an NMI to the - * CPUs which did not yet report - * - * 5) Wait for all other CPUs to report that they reached the + * 4) Wait for all other CPUs to report that they reached the * HLT loop in stop_this_cpu() * - * #4 can obviously race against a CPU reaching the HLT loop late. + * #3 can obviously race against a CPU reaching the HLT loop late. * That CPU will have reported already and the "have all CPUs * reached HLT" condition will be true despite the fact that the * other CPU is still handling the NMI. Again, there is no @@ -198,7 +194,7 @@ static void native_stop_other_cpus(int wait) /* * Don't wait longer than a second for IPI completion. The * wait request is not checked here because that would - * prevent an NMI/INIT shutdown in case that not all + * prevent an NMI shutdown attempt in case that not all * CPUs reach shutdown state. */ timeout = USEC_PER_SEC; @@ -206,27 +202,7 @@ static void native_stop_other_cpus(int wait) udelay(1); } - /* - * Park all other CPUs in INIT including "offline" CPUs, if - * possible. That's a safe place where they can't resume execution - * of HLT and then execute the HLT loop from overwritten text or - * page tables. 
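For the native_stop_other_cpus() rework above, the control flow that remains is: fire the REBOOT vector, poll cpus_stop_mask for up to a second, and only escalate to NMI for the CPUs that never reported. A toy model of that wait-then-escalate loop (the helpers are stand-ins, not kernel APIs):

#include <stdbool.h>
#include <stdio.h>

#define USEC_PER_SEC 1000000UL

static unsigned long cpus_remaining = 3;	/* pretend 3 other CPUs must stop */

static bool cpus_stop_mask_empty(void)  { return cpus_remaining == 0; }
static void udelay(unsigned long usec)  { (void)usec; /* busy-wait stand-in */ }
static void fake_cpu_reports_in(void)   { if (cpus_remaining) cpus_remaining--; }

int main(void)
{
	unsigned long timeout = USEC_PER_SEC;

	/* Step 2: wait for all other CPUs to reach their HLT loop. */
	while (!cpus_stop_mask_empty() && timeout--) {
		fake_cpu_reports_in();
		udelay(1);
	}

	/* Step 3: escalate to NMI only for CPUs that did not report in time. */
	if (!cpus_stop_mask_empty())
		puts("REBOOT_VECTOR timed out: sending NMI to remaining CPUs");
	else
		puts("all CPUs reached HLT via REBOOT_VECTOR");

	return 0;
}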
- * - * The only downside is a broadcast MCE, but up to the point where - * the kexec() kernel brought all APs online again an MCE will just - * make HLT resume and handle the MCE. The machine crashes and burns - * due to overwritten text, page tables and data. So there is a - * choice between fire and frying pan. The result is pretty much - * the same. Chose frying pan until x86 provides a sane mechanism - * to park a CPU. - */ - if (smp_park_other_cpus_in_init()) - goto done; - - /* - * If park with INIT was not possible and the REBOOT_VECTOR didn't - * take all secondary CPUs offline, try with the NMI. - */ + /* if the REBOOT_VECTOR didn't work, try with the NMI */ if (!cpumask_empty(&cpus_stop_mask)) { /* * If NMI IPI is enabled, try to register the stop handler @@ -249,7 +225,6 @@ static void native_stop_other_cpus(int wait) udelay(1); } -done: local_irq_save(flags); disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 48e040618731..f7bcd42edaeb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -87,6 +87,7 @@ #include <asm/hw_irq.h> #include <asm/stackprotector.h> #include <asm/sev.h> +#include <asm/spec-ctrl.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -640,13 +641,13 @@ static void __init build_sched_topology(void) }; #endif /* - * When there is NUMA topology inside the package skip the DIE domain + * When there is NUMA topology inside the package skip the PKG domain * since the NUMA domains will auto-magically create the right spanning * domains based on the SLIT. */ if (!x86_has_numa_in_package) { x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE) + cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG) }; } @@ -1240,33 +1241,6 @@ void arch_thaw_secondary_cpus_end(void) cache_aps_init(); } -bool smp_park_other_cpus_in_init(void) -{ - unsigned int cpu, this_cpu = smp_processor_id(); - unsigned int apicid; - - if (apic->wakeup_secondary_cpu_64 || apic->wakeup_secondary_cpu) - return false; - - /* - * If this is a crash stop which does not execute on the boot CPU, - * then this cannot use the INIT mechanism because INIT to the boot - * CPU will reset the machine. - */ - if (this_cpu) - return false; - - for_each_cpu_and(cpu, &cpus_booted_once_mask, cpu_present_mask) { - if (cpu == this_cpu) - continue; - apicid = apic->cpu_present_to_apicid(cpu); - if (apicid == BAD_APICID) - continue; - send_init_sequence(apicid); - } - return true; -} - /* * Early setup to make printk work. */ @@ -1623,8 +1597,15 @@ void __noreturn hlt_play_dead(void) native_halt(); } +/* + * native_play_dead() is essentially a __noreturn function, but it can't + * be marked as such as the compiler may complain about it. 
+ */ void native_play_dead(void) { + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + __update_spec_ctrl(0); + play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index ca004e2e4469..0bab03130033 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -54,7 +54,7 @@ void arch_unregister_cpu(int num) EXPORT_SYMBOL(arch_unregister_cpu); #else /* CONFIG_HOTPLUG_CPU */ -static int __init arch_register_cpu(int num) +int __init arch_register_cpu(int num) { return register_cpu(&per_cpu(cpu_devices, num).cpu, num); } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bbc440c93e08..1123ef3ccf90 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -15,6 +15,7 @@ * ( The serial nature of the boot logic and the CPU hotplug lock * protects against more than 2 CPUs entering this code. ) */ +#include <linux/workqueue.h> #include <linux/topology.h> #include <linux/spinlock.h> #include <linux/kernel.h> @@ -342,6 +343,13 @@ static inline unsigned int loop_timeout(int cpu) return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20; } +static void tsc_sync_mark_tsc_unstable(struct work_struct *work) +{ + mark_tsc_unstable("check_tsc_sync_source failed"); +} + +static DECLARE_WORK(tsc_sync_work, tsc_sync_mark_tsc_unstable); + /* * The freshly booted CPU initiates this via an async SMP function call. */ @@ -395,7 +403,7 @@ retry: "turning off TSC clock.\n", max_warp); if (random_warps) pr_warn("TSC warped randomly between CPUs\n"); - mark_tsc_unstable("check_tsc_sync_source failed"); + schedule_work(&tsc_sync_work); } /* diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 7e574cf3bf8a..d00c28aaa5be 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -85,7 +85,7 @@ static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, { int *first = ip_table; int *last = ip_table + num_entries - 1; - int *mid = first, *found = first; + int *mid, *found = first; if (!num_entries) return NULL; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index f15fb71f280e..54a5596adaa6 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -139,10 +139,7 @@ SECTIONS STATIC_CALL_TEXT ALIGN_ENTRY_TEXT_BEGIN -#ifdef CONFIG_CPU_SRSO *(.text..__x86.rethunk_untrain) -#endif - ENTRY_TEXT #ifdef CONFIG_CPU_SRSO @@ -520,12 +517,12 @@ INIT_PER_CPU(irq_stack_backing_store); "fixed_percpu_data is not at start of per-cpu area"); #endif -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_CPU_UNRET_ENTRY . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); -. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); #endif #ifdef CONFIG_CPU_SRSO +. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); /* * GNU ld cannot do XOR until 2.41. 
* https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=f6f78318fca803c4907fb8d7f6ded8295f1947b1 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0544e30b4946..773132c3bf5a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -360,14 +360,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); - /* - * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if - * XSAVE/XCRO are not exposed to the guest, and even if XSAVE isn't - * supported by the host. - */ - vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0 | - XFEATURE_MASK_FPSSE; - kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index dcd60b39e794..3e977dbbf993 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2759,13 +2759,17 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { u32 reg = kvm_lapic_get_reg(apic, lvt_type); int vector, mode, trig_mode; + int r; if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; - return __apic_accept_irq(apic, mode, vector, 1, trig_mode, - NULL); + + r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); + if (r && lvt_type == APIC_LVTPC) + kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); + return r; } return 0; } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index edb89b51b383..9ae07db6f0f6 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -93,14 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -static void kvm_pmi_trigger_fn(struct irq_work *irq_work) -{ - struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); - struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - - kvm_pmu_deliver_pmi(vcpu); -} - static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -124,20 +116,7 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); } - if (!pmc->intr || skip_pmi) - return; - - /* - * Inject PMI. If vcpu was in a guest mode during NMI PMI - * can be ejected on a guest mode re-entry. Otherwise we can't - * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until - * woken up. So we should wake it, but this is impossible from - * NMI context. Do it from irq work instead. 
- */ - if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu)) - irq_work_queue(&pmc_to_pmu(pmc)->irq_work); - else + if (pmc->intr && !skip_pmi) kvm_make_request(KVM_REQ_PMI, pmc->vcpu); } @@ -675,9 +654,6 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) void kvm_pmu_reset(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - irq_work_sync(&pmu->irq_work); static_call(kvm_x86_pmu_reset)(vcpu); } @@ -687,7 +663,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) memset(pmu, 0, sizeof(*pmu)); static_call(kvm_x86_pmu_init)(vcpu); - init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); pmu->event_count = 0; pmu->need_cleanup = false; kvm_pmu_refresh(vcpu); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 7d9ba301c090..1d64113de488 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -74,6 +74,12 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) return counter & pmc_bitmask(pmc); } +static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val) +{ + pmc->counter += val - pmc_read_counter(pmc); + pmc->counter &= pmc_bitmask(pmc); +} + static inline void pmc_release_perf_event(struct kvm_pmc *pmc) { if (pmc->perf_event) { diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 2092db892d7d..4b74ea91f4e6 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -529,8 +529,11 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); break; + case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR: + /* Invalid IPI with vector < 16 */ + break; default: - pr_err("Unknown IPI interception\n"); + vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n"); } return 1; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index dd496c9e5f91..3fea8c47679e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1253,6 +1253,9 @@ void svm_leave_nested(struct kvm_vcpu *vcpu) nested_svm_uninit_mmu_context(vcpu); vmcb_mark_all_dirty(svm->vmcb); + + if (kvm_apicv_activated(vcpu->kvm)) + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index cef5a3d0abd0..373ff6a6687b 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -160,7 +160,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* MSR_PERFCTRn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); return 0; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9507df93f410..ded1d80d72cb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -531,8 +531,6 @@ static bool __kvm_is_svm_supported(void) int cpu = smp_processor_id(); struct cpuinfo_x86 *c = &cpu_data(cpu); - u64 vm_cr; - if (c->x86_vendor != X86_VENDOR_AMD && c->x86_vendor != X86_VENDOR_HYGON) { pr_err("CPU %d isn't AMD or Hygon\n", cpu); @@ -549,12 +547,6 @@ static bool __kvm_is_svm_supported(void) return false; } - rdmsrl(MSR_VM_CR, vm_cr); - if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) { - pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu); - return false; - } - return true; } @@ -691,7 +683,7 @@ static int svm_hardware_enable(void) */ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { struct sev_es_save_area *hostsa; - u32 msr_hi; + u32 __maybe_unused msr_hi; hostsa = (struct sev_es_save_area 
*)(page_address(sd->save_area) + 0x400); @@ -913,8 +905,7 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) if (intercept == svm->x2avic_msrs_intercepted) return; - if (!x2avic_enabled || - !apic_x2apic_mode(svm->vcpu.arch.apic)) + if (!x2avic_enabled) return; for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index f2efa0bf7ae8..820d3e1f6b4f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -436,11 +436,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f18b06bbda6..41cce5031126 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5382,26 +5382,37 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) -{ - if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) - return; - - fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, - guest_xsave->region, - sizeof(guest_xsave->region), - vcpu->arch.pkru); -} static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, u8 *state, unsigned int size) { + /* + * Only copy state for features that are enabled for the guest. The + * state itself isn't problematic, but setting bits in the header for + * features that are supported in *this* host but not exposed to the + * guest can result in KVM_SET_XSAVE failing when live migrating to a + * compatible host without the features that are NOT exposed to the + * guest. + * + * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if + * XSAVE/XCRO are not exposed to the guest, and even if XSAVE isn't + * supported by the host. 
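An aside on the pmc_write_counter() helper that the SVM and Intel PMU hunks above now call: it centralizes the old "counter += data - pmc_read_counter()" pattern and masks the result to the counter width. A simplified standalone sketch of the arithmetic, assuming a 48-bit counter (the real helper also has to cope with running perf events):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Assume a 48-bit general-purpose counter, as on many x86 PMUs. */
static const uint64_t pmc_bitmask = (1ull << 48) - 1;

struct kvm_pmc { uint64_t counter; };

static uint64_t pmc_read_counter(const struct kvm_pmc *pmc)
{
	return pmc->counter & pmc_bitmask;
}

/* Same shape as the new pmc_write_counter() helper. */
static void pmc_write_counter(struct kvm_pmc *pmc, uint64_t val)
{
	pmc->counter += val - pmc_read_counter(pmc);
	pmc->counter &= pmc_bitmask;
}

int main(void)
{
	struct kvm_pmc pmc = { .counter = 0x123456789abcull };

	pmc_write_counter(&pmc, 0xffffffffffffffffull);	/* wraps into 48 bits */
	printf("counter = %#" PRIx64 "\n", pmc_read_counter(&pmc));
	return 0;
}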
+ */ + u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 | + XFEATURE_MASK_FPSSE; + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return; - fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, - state, size, vcpu->arch.pkru); + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size, + supported_xcr0, vcpu->arch.pkru); +} + +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, + sizeof(guest_xsave->region)); } static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, @@ -12843,6 +12854,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return true; #endif + if (kvm_test_request(KVM_REQ_PMI, vcpu)) + return true; + if (kvm_arch_interrupt_allowed(vcpu) && (kvm_cpu_has_interrupt(vcpu) || kvm_guest_apic_has_interrupt(vcpu))) diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c index 80efd45a7761..6e8b7e600def 100644 --- a/arch/x86/lib/copy_mc.c +++ b/arch/x86/lib/copy_mc.c @@ -70,23 +70,23 @@ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigne } EXPORT_SYMBOL_GPL(copy_mc_to_kernel); -unsigned long __must_check copy_mc_to_user(void *dst, const void *src, unsigned len) +unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, unsigned len) { unsigned long ret; if (copy_mc_fragile_enabled) { __uaccess_begin(); - ret = copy_mc_fragile(dst, src, len); + ret = copy_mc_fragile((__force void *)dst, src, len); __uaccess_end(); return ret; } if (static_cpu_has(X86_FEATURE_ERMS)) { __uaccess_begin(); - ret = copy_mc_enhanced_fast_string(dst, src, len); + ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len); __uaccess_end(); return ret; } - return copy_user_generic(dst, src, len); + return copy_user_generic((__force void *)dst, src, len); } diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index cd86aeb5fdd3..a48077c5ca61 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -126,11 +126,19 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) #include <asm/GEN-for-each-reg.h> #undef GEN #endif + +#ifdef CONFIG_RETHUNK + /* - * This function name is magical and is used by -mfunction-return=thunk-extern - * for the compiler to generate JMPs to it. + * Be careful here: that label cannot really be removed because in + * some configurations and toolchains, the JMP __x86_return_thunk the + * compiler issues is either a short one or the compiler doesn't use + * relocations for same-section JMPs and that breaks the returns + * detection logic in apply_returns() and in objtool. */ -#ifdef CONFIG_RETHUNK + .section .text..__x86.return_thunk + +#ifdef CONFIG_CPU_SRSO /* * srso_alias_untrain_ret() and srso_alias_safe_ret() are placed at @@ -147,29 +155,18 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) * * As a result, srso_alias_safe_ret() becomes a safe return. 
*/ -#ifdef CONFIG_CPU_SRSO - .section .text..__x86.rethunk_untrain - -SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) + .pushsection .text..__x86.rethunk_untrain +SYM_CODE_START_NOALIGN(srso_alias_untrain_ret) UNWIND_HINT_FUNC ANNOTATE_NOENDBR ASM_NOP2 lfence jmp srso_alias_return_thunk SYM_FUNC_END(srso_alias_untrain_ret) -__EXPORT_THUNK(srso_alias_untrain_ret) - - .section .text..__x86.rethunk_safe -#else -/* dummy definition for alternatives */ -SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) - ANNOTATE_UNRET_SAFE - ret - int3 -SYM_FUNC_END(srso_alias_untrain_ret) -#endif + .popsection -SYM_START(srso_alias_safe_ret, SYM_L_GLOBAL, SYM_A_NONE) + .pushsection .text..__x86.rethunk_safe +SYM_CODE_START_NOALIGN(srso_alias_safe_ret) lea 8(%_ASM_SP), %_ASM_SP UNWIND_HINT_FUNC ANNOTATE_UNRET_SAFE @@ -177,14 +174,63 @@ SYM_START(srso_alias_safe_ret, SYM_L_GLOBAL, SYM_A_NONE) int3 SYM_FUNC_END(srso_alias_safe_ret) - .section .text..__x86.return_thunk - -SYM_CODE_START(srso_alias_return_thunk) +SYM_CODE_START_NOALIGN(srso_alias_return_thunk) UNWIND_HINT_FUNC ANNOTATE_NOENDBR call srso_alias_safe_ret ud2 SYM_CODE_END(srso_alias_return_thunk) + .popsection + +/* + * SRSO untraining sequence for Zen1/2, similar to retbleed_untrain_ret() + * above. On kernel entry, srso_untrain_ret() is executed which is a + * + * movabs $0xccccc30824648d48,%rax + * + * and when the return thunk executes the inner label srso_safe_ret() + * later, it is a stack manipulation and a RET which is mispredicted and + * thus a "safe" one to use. + */ + .align 64 + .skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc +SYM_CODE_START_LOCAL_NOALIGN(srso_untrain_ret) + ANNOTATE_NOENDBR + .byte 0x48, 0xb8 + +/* + * This forces the function return instruction to speculate into a trap + * (UD2 in srso_return_thunk() below). This RET will then mispredict + * and execution will continue at the return site read from the top of + * the stack. + */ +SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL) + lea 8(%_ASM_SP), %_ASM_SP + ret + int3 + int3 + /* end of movabs */ + lfence + call srso_safe_ret + ud2 +SYM_CODE_END(srso_safe_ret) +SYM_FUNC_END(srso_untrain_ret) + +SYM_CODE_START(srso_return_thunk) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + call srso_safe_ret + ud2 +SYM_CODE_END(srso_return_thunk) + +#define JMP_SRSO_UNTRAIN_RET "jmp srso_untrain_ret" +#define JMP_SRSO_ALIAS_UNTRAIN_RET "jmp srso_alias_untrain_ret" +#else /* !CONFIG_CPU_SRSO */ +#define JMP_SRSO_UNTRAIN_RET "ud2" +#define JMP_SRSO_ALIAS_UNTRAIN_RET "ud2" +#endif /* CONFIG_CPU_SRSO */ + +#ifdef CONFIG_CPU_UNRET_ENTRY /* * Some generic notes on the untraining sequences: @@ -216,7 +262,7 @@ SYM_CODE_END(srso_alias_return_thunk) */ .align 64 .skip 64 - (retbleed_return_thunk - retbleed_untrain_ret), 0xcc -SYM_START(retbleed_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) +SYM_CODE_START_LOCAL_NOALIGN(retbleed_untrain_ret) ANNOTATE_NOENDBR /* * As executed from retbleed_untrain_ret, this is: @@ -264,72 +310,27 @@ SYM_CODE_END(retbleed_return_thunk) jmp retbleed_return_thunk int3 SYM_FUNC_END(retbleed_untrain_ret) -__EXPORT_THUNK(retbleed_untrain_ret) -/* - * SRSO untraining sequence for Zen1/2, similar to retbleed_untrain_ret() - * above. On kernel entry, srso_untrain_ret() is executed which is a - * - * movabs $0xccccc30824648d48,%rax - * - * and when the return thunk executes the inner label srso_safe_ret() - * later, it is a stack manipulation and a RET which is mispredicted and - * thus a "safe" one to use. 
- */ - .align 64 - .skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc -SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) - ANNOTATE_NOENDBR - .byte 0x48, 0xb8 +#define JMP_RETBLEED_UNTRAIN_RET "jmp retbleed_untrain_ret" +#else /* !CONFIG_CPU_UNRET_ENTRY */ +#define JMP_RETBLEED_UNTRAIN_RET "ud2" +#endif /* CONFIG_CPU_UNRET_ENTRY */ -/* - * This forces the function return instruction to speculate into a trap - * (UD2 in srso_return_thunk() below). This RET will then mispredict - * and execution will continue at the return site read from the top of - * the stack. - */ -SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL) - lea 8(%_ASM_SP), %_ASM_SP - ret - int3 - int3 - /* end of movabs */ - lfence - call srso_safe_ret - ud2 -SYM_CODE_END(srso_safe_ret) -SYM_FUNC_END(srso_untrain_ret) -__EXPORT_THUNK(srso_untrain_ret) - -SYM_CODE_START(srso_return_thunk) - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - call srso_safe_ret - ud2 -SYM_CODE_END(srso_return_thunk) +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) SYM_FUNC_START(entry_untrain_ret) - ALTERNATIVE_2 "jmp retbleed_untrain_ret", \ - "jmp srso_untrain_ret", X86_FEATURE_SRSO, \ - "jmp srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS + ALTERNATIVE_2 JMP_RETBLEED_UNTRAIN_RET, \ + JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO, \ + JMP_SRSO_ALIAS_UNTRAIN_RET, X86_FEATURE_SRSO_ALIAS SYM_FUNC_END(entry_untrain_ret) __EXPORT_THUNK(entry_untrain_ret) -SYM_CODE_START(__x86_return_thunk) - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - ANNOTATE_UNRET_SAFE - ret - int3 -SYM_CODE_END(__x86_return_thunk) -EXPORT_SYMBOL(__x86_return_thunk) - -#endif /* CONFIG_RETHUNK */ +#endif /* CONFIG_CPU_UNRET_ENTRY || CONFIG_CPU_SRSO */ #ifdef CONFIG_CALL_DEPTH_TRACKING .align 64 -SYM_FUNC_START(__x86_return_skl) +SYM_FUNC_START(call_depth_return_thunk) ANNOTATE_NOENDBR /* * Keep the hotpath in a 16byte I-fetch for the non-debug @@ -356,6 +357,33 @@ SYM_FUNC_START(__x86_return_skl) ANNOTATE_UNRET_SAFE ret int3 -SYM_FUNC_END(__x86_return_skl) +SYM_FUNC_END(call_depth_return_thunk) #endif /* CONFIG_CALL_DEPTH_TRACKING */ + +/* + * This function name is magical and is used by -mfunction-return=thunk-extern + * for the compiler to generate JMPs to it. + * + * This code is only used during kernel boot or module init. All + * 'JMP __x86_return_thunk' sites are changed to something else by + * apply_returns(). + * + * This should be converted eventually to call a warning function which + * should scream loudly when the default return thunk is called after + * alternatives have been applied. + * + * That warning function cannot BUG() because the bug splat cannot be + * displayed in all possible configurations, leading to users not really + * knowing why the machine froze. 
+ */ +SYM_CODE_START(__x86_return_thunk) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(__x86_return_thunk) +EXPORT_SYMBOL(__x86_return_thunk) + +#endif /* CONFIG_RETHUNK */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 2aadb2019b4f..c79f12e449ea 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -3,6 +3,7 @@ #include <linux/acpi.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/of.h> #include <linux/string.h> #include <linux/init.h> #include <linux/memblock.h> @@ -733,6 +734,8 @@ void __init x86_numa_init(void) if (!numa_init(amd_numa_init)) return; #endif + if (acpi_disabled && !numa_init(of_numa_init)) + return; } numa_init(dummy_numa_init); diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 78414c6d1b5e..5dd733944629 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -69,6 +69,7 @@ static void __init pti_print_if_secure(const char *reason) pr_info("%s\n", reason); } +/* Assume mode is auto unless overridden via cmdline below. */ static enum pti_mode { PTI_AUTO = 0, PTI_FORCE_OFF, @@ -77,50 +78,49 @@ static enum pti_mode { void __init pti_check_boottime_disable(void) { - char arg[5]; - int ret; - - /* Assume mode is auto unless overridden. */ - pti_mode = PTI_AUTO; - if (hypervisor_is_type(X86_HYPER_XEN_PV)) { pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on XEN PV."); return; } - ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); - if (ret > 0) { - if (ret == 3 && !strncmp(arg, "off", 3)) { - pti_mode = PTI_FORCE_OFF; - pti_print_if_insecure("disabled on command line."); - return; - } - if (ret == 2 && !strncmp(arg, "on", 2)) { - pti_mode = PTI_FORCE_ON; - pti_print_if_secure("force enabled on command line."); - goto enable; - } - if (ret == 4 && !strncmp(arg, "auto", 4)) { - pti_mode = PTI_AUTO; - goto autosel; - } - } - - if (cmdline_find_option_bool(boot_command_line, "nopti") || - cpu_mitigations_off()) { + if (cpu_mitigations_off()) pti_mode = PTI_FORCE_OFF; + if (pti_mode == PTI_FORCE_OFF) { pti_print_if_insecure("disabled on command line."); return; } -autosel: - if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + if (pti_mode == PTI_FORCE_ON) + pti_print_if_secure("force enabled on command line."); + + if (pti_mode == PTI_AUTO && !boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) return; -enable: + setup_force_cpu_cap(X86_FEATURE_PTI); } +static int __init pti_parse_cmdline(char *arg) +{ + if (!strcmp(arg, "off")) + pti_mode = PTI_FORCE_OFF; + else if (!strcmp(arg, "on")) + pti_mode = PTI_FORCE_ON; + else if (!strcmp(arg, "auto")) + pti_mode = PTI_AUTO; + else + return -EINVAL; + return 0; +} +early_param("pti", pti_parse_cmdline); + +static int __init pti_parse_cmdline_nopti(char *arg) +{ + pti_mode = PTI_FORCE_OFF; + return 0; +} +early_param("nopti", pti_parse_cmdline_nopti); + pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { /* diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 45d0c17ce77c..e03207de2880 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/slab.h> +#include <linux/string.h> #include <linux/clocksource.h> #include <asm/apic.h> @@ -178,49 +179,56 @@ module_param_named(debug, uv_nmi_debug, int, 0644); } while (0) /* Valid NMI Actions */ -#define ACTION_LEN 16 -static struct nmi_action { - char *action; - char *desc; -} valid_acts[] = { - { "kdump", "do kernel crash dump" }, - { "dump", "dump process stack 
for each cpu" }, - { "ips", "dump Inst Ptr info for each cpu" }, - { "kdb", "enter KDB (needs kgdboc= assignment)" }, - { "kgdb", "enter KGDB (needs gdb target remote)" }, - { "health", "check if CPUs respond to NMI" }, +enum action_t { + nmi_act_kdump, + nmi_act_dump, + nmi_act_ips, + nmi_act_kdb, + nmi_act_kgdb, + nmi_act_health, + nmi_act_max }; -typedef char action_t[ACTION_LEN]; -static action_t uv_nmi_action = { "dump" }; + +static const char * const actions[nmi_act_max] = { + [nmi_act_kdump] = "kdump", + [nmi_act_dump] = "dump", + [nmi_act_ips] = "ips", + [nmi_act_kdb] = "kdb", + [nmi_act_kgdb] = "kgdb", + [nmi_act_health] = "health", +}; + +static const char * const actions_desc[nmi_act_max] = { + [nmi_act_kdump] = "do kernel crash dump", + [nmi_act_dump] = "dump process stack for each cpu", + [nmi_act_ips] = "dump Inst Ptr info for each cpu", + [nmi_act_kdb] = "enter KDB (needs kgdboc= assignment)", + [nmi_act_kgdb] = "enter KGDB (needs gdb target remote)", + [nmi_act_health] = "check if CPUs respond to NMI", +}; + +static enum action_t uv_nmi_action = nmi_act_dump; static int param_get_action(char *buffer, const struct kernel_param *kp) { - return sprintf(buffer, "%s\n", uv_nmi_action); + return sprintf(buffer, "%s\n", actions[uv_nmi_action]); } static int param_set_action(const char *val, const struct kernel_param *kp) { - int i; - int n = ARRAY_SIZE(valid_acts); - char arg[ACTION_LEN]; - - /* (remove possible '\n') */ - strscpy(arg, val, strnchrnul(val, sizeof(arg)-1, '\n') - val + 1); - - for (i = 0; i < n; i++) - if (!strcmp(arg, valid_acts[i].action)) - break; + int i, n = ARRAY_SIZE(actions); - if (i < n) { - strscpy(uv_nmi_action, arg, sizeof(uv_nmi_action)); - pr_info("UV: New NMI action:%s\n", uv_nmi_action); + i = sysfs_match_string(actions, val); + if (i >= 0) { + uv_nmi_action = i; + pr_info("UV: New NMI action:%s\n", actions[i]); return 0; } - pr_err("UV: Invalid NMI action:%s, valid actions are:\n", arg); + pr_err("UV: Invalid NMI action. Valid actions are:\n"); for (i = 0; i < n; i++) - pr_err("UV: %-8s - %s\n", - valid_acts[i].action, valid_acts[i].desc); + pr_err("UV: %-8s - %s\n", actions[i], actions_desc[i]); + return -EINVAL; } @@ -228,15 +236,10 @@ static const struct kernel_param_ops param_ops_action = { .get = param_get_action, .set = param_set_action, }; -#define param_check_action(name, p) __param_check(name, p, action_t) +#define param_check_action(name, p) __param_check(name, p, enum action_t) module_param_named(action, uv_nmi_action, action, 0644); -static inline bool uv_nmi_action_is(const char *action) -{ - return (strncmp(uv_nmi_action, action, strlen(action)) == 0); -} - /* Setup which NMI support is present in system */ static void uv_nmi_setup_mmrs(void) { @@ -727,10 +730,10 @@ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs) if (cpu == 0) uv_nmi_dump_cpu_ip_hdr(); - if (current->pid != 0 || !uv_nmi_action_is("ips")) + if (current->pid != 0 || uv_nmi_action != nmi_act_ips) uv_nmi_dump_cpu_ip(cpu, regs); - if (uv_nmi_action_is("dump")) { + if (uv_nmi_action == nmi_act_dump) { pr_info("UV:%sNMI process trace for CPU %d\n", dots, cpu); show_regs(regs); } @@ -798,7 +801,7 @@ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master) int saved_console_loglevel = console_loglevel; pr_alert("UV: tracing %s for %d CPUs from CPU %d\n", - uv_nmi_action_is("ips") ? "IPs" : "processes", + uv_nmi_action == nmi_act_ips ? 
"IPs" : "processes", atomic_read(&uv_nmi_cpus_in_nmi), cpu); console_loglevel = uv_nmi_loglevel; @@ -874,7 +877,7 @@ static inline int uv_nmi_kdb_reason(void) static inline int uv_nmi_kdb_reason(void) { /* Ensure user is expecting to attach gdb remote */ - if (uv_nmi_action_is("kgdb")) + if (uv_nmi_action == nmi_act_kgdb) return 0; pr_err("UV: NMI error: KDB is not enabled in this kernel\n"); @@ -950,28 +953,35 @@ static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) master = (atomic_read(&uv_nmi_cpu) == cpu); /* If NMI action is "kdump", then attempt to do it */ - if (uv_nmi_action_is("kdump")) { + if (uv_nmi_action == nmi_act_kdump) { uv_nmi_kdump(cpu, master, regs); /* Unexpected return, revert action to "dump" */ if (master) - strscpy(uv_nmi_action, "dump", sizeof(uv_nmi_action)); + uv_nmi_action = nmi_act_dump; } /* Pause as all CPU's enter the NMI handler */ uv_nmi_wait(master); /* Process actions other than "kdump": */ - if (uv_nmi_action_is("health")) { + switch (uv_nmi_action) { + case nmi_act_health: uv_nmi_action_health(cpu, regs, master); - } else if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) { + break; + case nmi_act_ips: + case nmi_act_dump: uv_nmi_dump_state(cpu, regs, master); - } else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb")) { + break; + case nmi_act_kdb: + case nmi_act_kgdb: uv_call_kgdb_kdb(cpu, regs, master); - } else { + break; + default: if (master) - pr_alert("UV: unknown NMI action: %s\n", uv_nmi_action); + pr_alert("UV: unknown NMI action: %d\n", uv_nmi_action); uv_nmi_sync_exit(master); + break; } /* Clear per_cpu "in_nmi" flag */ diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 54663f3e00cb..ff5afc8a5a41 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -53,7 +53,7 @@ struct uv_rtc_timer_head { struct { int lcpu; /* systemwide logical cpu number */ u64 expires; /* next timer expiration for this cpu */ - } cpu[]; + } cpu[] __counted_by(ncpus); }; /* |