diff options
Diffstat (limited to 'arch/x86')
189 files changed, 4714 insertions, 2927 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4e001bbbb425..5ddfbbe5c51e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -125,6 +125,7 @@ config X86 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG select DCACHE_WORD_ACCESS + select DYNAMIC_SIGFRAME select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) @@ -339,6 +340,11 @@ config NEED_PER_CPU_PAGE_FIRST_CHUNK config ARCH_HIBERNATION_POSSIBLE def_bool y +config ARCH_NR_GPIO + int + default 1024 if X86_64 + default 512 + config ARCH_SUSPEND_POSSIBLE def_bool y @@ -605,9 +611,7 @@ config X86_INTEL_MID depends on X86_IO_APIC select I2C select DW_APB_TIMER - select APB_TIMER select INTEL_SCU_PCI - select MFD_INTEL_MSIC help Select to build a kernel capable of supporting Intel MID (Mobile Internet Device) platform systems which do not have the PCI legacy @@ -996,6 +1000,17 @@ config NR_CPUS This is purely to save memory: each supported CPU adds about 8KB to the kernel image. +config SCHED_CLUSTER + bool "Cluster scheduler support" + depends on SMP + default y + help + Cluster scheduler support improves the CPU scheduler's decision + making when dealing with machines that have clusters of CPUs. + Cluster usually means a couple of CPUs which are placed closely + by sharing mid-level caches, last-level cache tags or internal + busses. + config SCHED_SMT def_bool y if SMP @@ -1251,7 +1266,8 @@ config TOSHIBA config I8K tristate "Dell i8k legacy laptop support" - select HWMON + depends on HWMON + depends on PROC_FS select SENSORS_DELL_SMM help This option enables legacy /proc/i8k userspace interface in hwmon @@ -1400,7 +1416,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 + depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6 select X86_PAE help Select this if you have a 32-bit processor and more than 4 @@ -1513,6 +1529,7 @@ config AMD_MEM_ENCRYPT select ARCH_HAS_FORCE_DMA_UNENCRYPTED select INSTRUCTION_DECODER select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS + select ARCH_HAS_CC_PLATFORM help Say yes to enable support for the encryption of system memory. This requires an AMD processor that supports Secure Memory @@ -1520,7 +1537,6 @@ config AMD_MEM_ENCRYPT config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT bool "Activate AMD Secure Memory Encryption (SME) by default" - default y depends on AMD_MEM_ENCRYPT help Say yes to have system memory encrypted by default if running on @@ -2384,6 +2400,22 @@ config MODIFY_LDT_SYSCALL Saying 'N' here may make sense for embedded or server kernels. +config STRICT_SIGALTSTACK_SIZE + bool "Enforce strict size checking for sigaltstack" + depends on DYNAMIC_SIGFRAME + help + For historical reasons MINSIGSTKSZ is a constant which became + already too small with AVX512 support. Add a mechanism to + enforce strict checking of the sigaltstack size against the + real size of the FPU frame. This option enables the check + by default. It can also be controlled via the kernel command + line option 'strict_sas_size' independent of this config + switch. Enabling it might break existing applications which + allocate a too small sigaltstack but 'work' because they + never get a signal delivered. + + Say 'N' unless you want to really enforce this check. + source "kernel/livepatch/Kconfig" endmenu @@ -2605,7 +2637,6 @@ config PCI_OLPC config PCI_XEN def_bool y depends on PCI && XEN - select SWIOTLB_XEN config MMCONF_FAM10H def_bool y @@ -2828,8 +2859,6 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 -source "drivers/firmware/Kconfig" - source "arch/x86/kvm/Kconfig" source "arch/x86/Kconfig.assembler" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 814fe0d349b0..eefc434351db 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -508,3 +508,16 @@ config CPU_SUP_ZHAOXIN CPU might render the kernel unbootable. If unsure, say N. + +config CPU_SUP_VORTEX_32 + default y + bool "Support Vortex processors" if PROCESSOR_SELECT + depends on X86_32 + help + This enables detection, tunings and quirks for Vortex processors + + You need this enabled if you want your kernel to run on a + Vortex CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. + + If unsure, say N. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 7488cfbbd2f6..aab70413ae7a 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -299,7 +299,7 @@ define archhelp echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)' echo ' bzdisk/fdimage*/hdimage/isoimage also accept:' echo ' FDARGS="..." arguments for the booted kernel' - echo ' FDINITRD=file initrd for the booted kernel' + echo ' FDINITRD=file initrd for the booted kernel' echo '' echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest' echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest' diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index e7355f8b51c2..94834c4b5e5e 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -4,6 +4,12 @@ tune = $(call cc-option,-mtune=$(1),$(2)) +ifdef CONFIG_CC_IS_CLANG +align := -falign-functions=0 $(call cc-option,-falign-jumps=0) $(call cc-option,-falign-loops=0) +else +align := -falign-functions=0 -falign-jumps=0 -falign-loops=0 +endif + cflags-$(CONFIG_M486SX) += -march=i486 cflags-$(CONFIG_M486) += -march=i486 cflags-$(CONFIG_M586) += -march=i586 @@ -19,11 +25,11 @@ cflags-$(CONFIG_MK6) += -march=k6 # They make zero difference whatsosever to performance at this time. cflags-$(CONFIG_MK7) += -march=athlon cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) -cflags-$(CONFIG_MCRUSOE) += -march=i686 -falign-functions=0 -falign-jumps=0 -falign-loops=0 -cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) -falign-functions=0 -falign-jumps=0 -falign-loops=0 +cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align) +cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align) cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) -cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) -falign-functions=0 -falign-jumps=0 -falign-loops=0 +cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align) cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 31139256859f..49bde196da9b 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -14,6 +14,8 @@ #undef CONFIG_KASAN #undef CONFIG_KASAN_GENERIC +#define __NO_FORTIFY + /* cpu_feature_enabled() cannot be used this early */ #define USE_EARLY_PGTABLE_L5 diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 2a78746f5a4c..a1733319a22a 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "misc.h" #include <linux/efi.h> #include <asm/e820/types.h> #include <asm/processor.h> diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index 0673fdfc1a11..c9299aeb7333 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -120,12 +120,13 @@ efiarch() { } # Get the combined sizes in bytes of the files given, counting sparse -# files as full length, and padding each file to a 4K block size +# files as full length, and padding each file to cluster size +cluster=16384 filesizes() { local t=0 local s for s in $(ls -lnL "$@" 2>/dev/null | awk '/^-/{ print $5; }'); do - t=$((t + ((s+4095)/4096)*4096)) + t=$((t + ((s+cluster-1)/cluster)*cluster)) done echo $t } @@ -230,14 +231,14 @@ genhdimage() { ptype='-T 0xef' # EFI system partition, no GPT fi sizes=$(filesizes "$FBZIMAGE" "${FDINITRDS[@]}" "$efishell") - # Allow 1% + 1 MiB for filesystem and partition table overhead, - # syslinux, and config files + # Allow 1% + 2 MiB for filesystem and partition table overhead, + # syslinux, and config files; this is probably excessive... megs=$(((sizes + sizes/100 + 2*1024*1024 - 1)/(1024*1024))) $dd if=/dev/zero of="$FIMAGE" bs=$((1024*1024)) count=$megs 2>/dev/null - mpartition -I -c -s 32 -h 64 -t $megs $ptype -b 512 -a h: + mpartition -I -c -s 32 -h 64 $ptype -b 64 -a p: $dd if="$mbr" of="$FIMAGE" bs=440 count=1 conv=notrunc 2>/dev/null - mformat -v 'LINUX_BOOT' -s 32 -h 64 -t $megs h: - syslinux --offset $((512*512)) "$FIMAGE" + mformat -v 'LINUX_BOOT' -s 32 -h 64 -c $((cluster/512)) -t $megs h: + syslinux --offset $((64*512)) "$FIMAGE" do_mcopy h: } diff --git a/arch/x86/boot/mtools.conf.in b/arch/x86/boot/mtools.conf.in index 9e2662d01364..174c60508766 100644 --- a/arch/x86/boot/mtools.conf.in +++ b/arch/x86/boot/mtools.conf.in @@ -14,7 +14,8 @@ drive v: drive w: file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter -# Hard disk +# Hard disk (h: for the filesystem, p: for format - old mtools bug?) drive h: + file="@OBJ@/hdimage" offset=32768 mformat_only +drive p: file="@OBJ@/hdimage" partition=1 mformat_only - diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index fa2c3f50aecb..1cc72b4804fa 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -78,7 +78,7 @@ vpxor tmp0, x, x; -.section .rodata.cst164, "aM", @progbits, 164 +.section .rodata.cst16, "aM", @progbits, 16 .align 16 /* @@ -133,6 +133,10 @@ .L0f0f0f0f: .long 0x0f0f0f0f +/* 12 bytes, only for padding */ +.Lpadding_deadbeef: + .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef + .text .align 16 @@ -367,10 +371,11 @@ SYM_FUNC_START(sm4_aesni_avx_crypt8) * %rdx: src (1..8 blocks) * %rcx: num blocks (1..8) */ - FRAME_BEGIN - cmpq $5, %rcx; jb sm4_aesni_avx_crypt4; + + FRAME_BEGIN + vmovdqu (0 * 16)(%rdx), RA0; vmovdqu (1 * 16)(%rdx), RA1; vmovdqu (2 * 16)(%rdx), RA2; diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S index d2ffd7f76ee2..9c5d3f3ad45a 100644 --- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S @@ -93,7 +93,7 @@ vpxor tmp0, x, x; -.section .rodata.cst164, "aM", @progbits, 164 +.section .rodata.cst16, "aM", @progbits, 16 .align 16 /* @@ -148,6 +148,10 @@ .L0f0f0f0f: .long 0x0f0f0f0f +/* 12 bytes, only for padding */ +.Lpadding_deadbeef: + .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef + .text .align 16 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 960a021d543e..7e25543693de 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -453,3 +453,4 @@ 446 i386 landlock_restrict_self sys_landlock_restrict_self 447 i386 memfd_secret sys_memfd_secret 448 i386 process_mrelease sys_process_mrelease +449 i386 futex_waitv sys_futex_waitv diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 18b5500ea8bf..fe8f8dd157b4 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -370,6 +370,7 @@ 446 common landlock_restrict_self sys_landlock_restrict_self 447 common memfd_secret sys_memfd_secret 448 common process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 2a57dbed4894..38b2c779146f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -66,6 +66,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable); DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable); +DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign); + DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); @@ -1215,6 +1217,8 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; + static_call_cond(x86_pmu_assign)(event, idx); + switch (hwc->idx) { case INTEL_PMC_IDX_FIXED_BTS: case INTEL_PMC_IDX_FIXED_VLBR: @@ -2005,6 +2009,8 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_enable, x86_pmu.enable); static_call_update(x86_pmu_disable, x86_pmu.disable); + static_call_update(x86_pmu_assign, x86_pmu.assign); + static_call_update(x86_pmu_add, x86_pmu.add); static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); @@ -2465,6 +2471,7 @@ static int x86_pmu_event_init(struct perf_event *event) if (err) { if (event->destroy) event->destroy(event); + event->destroy = NULL; } if (READ_ONCE(x86_pmu.attr_rdpmc) && diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 6320d2cfd9d3..974e917e65b2 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -209,6 +209,12 @@ static void bts_update(struct bts_ctx *bts) } else { local_set(&buf->data_size, head); } + + /* + * Since BTS is coherent, just add compiler barrier to ensure + * BTS updating is ordered against bts::handle::event. + */ + barrier(); } static int diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7011e87be6d0..aa3f5b527dc0 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -243,7 +243,8 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = { static struct event_constraint intel_icl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* old INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ @@ -263,6 +264,7 @@ static struct event_constraint intel_icl_event_constraints[] = { INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf), + INTEL_EVENT_CONSTRAINT(0xef, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf), EVENT_CONSTRAINT_END }; @@ -287,7 +289,7 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = { static struct event_constraint intel_spr_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ @@ -2402,6 +2404,12 @@ static void intel_pmu_disable_event(struct perf_event *event) intel_pmu_pebs_disable(event); } +static void intel_pmu_assign_event(struct perf_event *event, int idx) +{ + if (is_pebs_pt(event)) + perf_report_aux_output_id(event, idx); +} + static void intel_pmu_del_event(struct perf_event *event) { if (needs_branch_stack(event)) @@ -4494,8 +4502,16 @@ static int intel_pmu_check_period(struct perf_event *event, u64 value) return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0; } +static void intel_aux_output_init(void) +{ + /* Refer also intel_pmu_aux_output_match() */ + if (x86_pmu.intel_cap.pebs_output_pt_available) + x86_pmu.assign = intel_pmu_assign_event; +} + static int intel_pmu_aux_output_match(struct perf_event *event) { + /* intel_pmu_assign_event() is needed, refer intel_aux_output_init() */ if (!x86_pmu.intel_cap.pebs_output_pt_available) return 0; @@ -6301,6 +6317,8 @@ __init int intel_pmu_init(void) if (is_hybrid()) intel_pmu_check_hybrid_pmus((u64)fixed_mask); + intel_aux_output_init(); + return 0; } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8647713276a7..4dbb55a43dad 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -923,7 +923,8 @@ struct event_constraint intel_skl_pebs_event_constraints[] = { }; struct event_constraint intel_icl_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x100000000ULL), /* old INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), /* SLOTS */ INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ @@ -943,7 +944,7 @@ struct event_constraint intel_icl_pebs_event_constraints[] = { }; struct event_constraint intel_spr_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe), diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 7280c8a3c831..6d735611c281 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -30,7 +30,7 @@ #define uncore_discovery_invalid_unit(unit) \ - (!unit.table1 || !unit.ctl || !unit.table3 || \ + (!unit.table1 || !unit.ctl || \ unit.table1 == -1ULL || unit.ctl == -1ULL || \ unit.table3 == -1ULL) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 5ddc0f30db6f..eb2c6cea9d0d 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -452,7 +452,7 @@ #define ICX_M3UPI_PCI_PMON_BOX_CTL 0xa0 /* ICX IMC */ -#define ICX_NUMBER_IMC_CHN 2 +#define ICX_NUMBER_IMC_CHN 3 #define ICX_IMC_MEM_STRIDE 0x4 /* SPR */ @@ -5076,8 +5076,10 @@ static struct event_constraint icx_uncore_iio_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x02, 0x3), UNCORE_EVENT_CONSTRAINT(0x03, 0x3), UNCORE_EVENT_CONSTRAINT(0x83, 0x3), + UNCORE_EVENT_CONSTRAINT(0x88, 0xc), UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), EVENT_CONSTRAINT_END }; @@ -5463,7 +5465,7 @@ static struct intel_uncore_ops icx_uncore_mmio_ops = { static struct intel_uncore_type icx_uncore_imc = { .name = "imc", .num_counters = 4, - .num_boxes = 8, + .num_boxes = 12, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, @@ -5647,6 +5649,7 @@ static struct intel_uncore_type spr_uncore_chabox = { .event_mask = SPR_CHA_PMON_EVENT_MASK, .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, .num_shared_regs = 1, + .constraints = skx_uncore_chabox_constraints, .ops = &spr_uncore_chabox_ops, .format_group = &spr_uncore_chabox_format_group, .attr_update = uncore_alias_groups, @@ -5658,6 +5661,7 @@ static struct intel_uncore_type spr_uncore_iio = { .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, .format_group = &snr_uncore_iio_format_group, .attr_update = uncore_alias_groups, + .constraints = icx_uncore_iio_constraints, }; static struct attribute *spr_uncore_raw_formats_attr[] = { @@ -5686,9 +5690,16 @@ static struct intel_uncore_type spr_uncore_irp = { }; +static struct event_constraint spr_uncore_m2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x14, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + EVENT_CONSTRAINT_END +}; + static struct intel_uncore_type spr_uncore_m2pcie = { SPR_UNCORE_COMMON_FORMAT(), .name = "m2pcie", + .constraints = spr_uncore_m2pcie_constraints, }; static struct intel_uncore_type spr_uncore_pcu = { @@ -5765,6 +5776,7 @@ static struct intel_uncore_type spr_uncore_upi = { static struct intel_uncore_type spr_uncore_m3upi = { SPR_UNCORE_PCI_COMMON_FORMAT(), .name = "m3upi", + .constraints = icx_uncore_m3upi_constraints, }; static struct intel_uncore_type spr_uncore_mdf = { diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index c853b28efa33..96c775abe31f 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -68,6 +68,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_BROADWELL_D: case INTEL_FAM6_BROADWELL_G: case INTEL_FAM6_BROADWELL_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_ATOM_SILVERMONT: case INTEL_FAM6_ATOM_SILVERMONT_D: diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e3ac05c97b5e..7b5167db3e78 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -14,6 +14,7 @@ #include <linux/perf_event.h> +#include <asm/fpu/xstate.h> #include <asm/intel_ds.h> #include <asm/cpu.h> @@ -726,6 +727,7 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); + void (*assign)(struct perf_event *event, int idx); void (*add)(struct perf_event *); void (*del)(struct perf_event *); void (*read)(struct perf_event *event); diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 90e682a92820..db2d92fb44da 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -99,7 +99,8 @@ static void hv_apic_eoi_write(u32 reg, u32 val) /* * IPI implementation on Hyper-V. */ -static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) +static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, + bool exclude_self) { struct hv_send_ipi_ex **arg; struct hv_send_ipi_ex *ipi_arg; @@ -121,14 +122,27 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) ipi_arg->reserved = 0; ipi_arg->vp_set.valid_bank_mask = 0; - if (!cpumask_equal(mask, cpu_present_mask)) { + /* + * Use HV_GENERIC_SET_ALL and avoid converting cpumask to VP_SET + * when the IPI is sent to all currently present CPUs. + */ + if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask); - } - if (nr_bank < 0) - goto ipi_mask_ex_done; - if (!nr_bank) + if (exclude_self) + nr_bank = cpumask_to_vpset_noself(&(ipi_arg->vp_set), mask); + else + nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask); + + /* + * 'nr_bank <= 0' means some CPUs in cpumask can't be + * represented in VP_SET. Return an error and fall back to + * native (architectural) method of sending IPIs. + */ + if (nr_bank <= 0) + goto ipi_mask_ex_done; + } else { ipi_arg->vp_set.format = HV_GENERIC_SET_ALL; + } status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, ipi_arg, NULL); @@ -138,15 +152,25 @@ ipi_mask_ex_done: return hv_result_success(status); } -static bool __send_ipi_mask(const struct cpumask *mask, int vector) +static bool __send_ipi_mask(const struct cpumask *mask, int vector, + bool exclude_self) { - int cur_cpu, vcpu; + int cur_cpu, vcpu, this_cpu = smp_processor_id(); struct hv_send_ipi ipi_arg; u64 status; + unsigned int weight; trace_hyperv_send_ipi_mask(mask, vector); - if (cpumask_empty(mask)) + weight = cpumask_weight(mask); + + /* + * Do nothing if + * 1. the mask is empty + * 2. the mask only contains self when exclude_self is true + */ + if (weight == 0 || + (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask))) return true; if (!hv_hypercall_pg) @@ -172,6 +196,8 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector) ipi_arg.cpu_mask = 0; for_each_cpu(cur_cpu, mask) { + if (exclude_self && cur_cpu == this_cpu) + continue; vcpu = hv_cpu_number_to_vp_number(cur_cpu); if (vcpu == VP_INVAL) return false; @@ -191,7 +217,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector) return hv_result_success(status); do_ex_hypercall: - return __send_ipi_mask_ex(mask, vector); + return __send_ipi_mask_ex(mask, vector, exclude_self); } static bool __send_ipi_one(int cpu, int vector) @@ -208,7 +234,7 @@ static bool __send_ipi_one(int cpu, int vector) return false; if (vp >= 64) - return __send_ipi_mask_ex(cpumask_of(cpu), vector); + return __send_ipi_mask_ex(cpumask_of(cpu), vector, false); status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp)); return hv_result_success(status); @@ -222,20 +248,13 @@ static void hv_send_ipi(int cpu, int vector) static void hv_send_ipi_mask(const struct cpumask *mask, int vector) { - if (!__send_ipi_mask(mask, vector)) + if (!__send_ipi_mask(mask, vector, false)) orig_apic.send_IPI_mask(mask, vector); } static void hv_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned int this_cpu = smp_processor_id(); - struct cpumask new_mask; - const struct cpumask *local_mask; - - cpumask_copy(&new_mask, mask); - cpumask_clear_cpu(this_cpu, &new_mask); - local_mask = &new_mask; - if (!__send_ipi_mask(local_mask, vector)) + if (!__send_ipi_mask(mask, vector, true)) orig_apic.send_IPI_mask_allbutself(mask, vector); } @@ -246,7 +265,7 @@ static void hv_send_ipi_allbutself(int vector) static void hv_send_ipi_all(int vector) { - if (!__send_ipi_mask(cpu_online_mask, vector)) + if (!__send_ipi_mask(cpu_online_mask, vector, false)) orig_apic.send_IPI_all(vector); } diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 5e3d9b7fd5fb..c9c3859322fa 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -24,7 +24,6 @@ #include <linux/syscalls.h> #include <asm/ucontext.h> #include <linux/uaccess.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/ptrace.h> #include <asm/ia32_unistd.h> @@ -57,8 +56,8 @@ static inline void reload_segments(struct sigcontext_32 *sc) /* * Do a signal return; undo the signal stack. */ -static int ia32_restore_sigcontext(struct pt_regs *regs, - struct sigcontext_32 __user *usc) +static bool ia32_restore_sigcontext(struct pt_regs *regs, + struct sigcontext_32 __user *usc) { struct sigcontext_32 sc; @@ -66,7 +65,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; if (unlikely(copy_from_user(&sc, usc, sizeof(sc)))) - return -EFAULT; + return false; /* Get only the ia32 registers. */ regs->bx = sc.bx; @@ -111,7 +110,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->sc)) + if (!ia32_restore_sigcontext(regs, &frame->sc)) goto badframe; return regs->ax; @@ -135,7 +134,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) @@ -220,8 +219,8 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); *fpstate = (struct _fpstate_32 __user *) sp; - if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, - math_size) < 0) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, + math_size)) return (void __user *) -1L; sp -= frame_size; diff --git a/arch/x86/include/asm/GEN-for-each-reg.h b/arch/x86/include/asm/GEN-for-each-reg.h index 1b07fb102c4e..07949102a08d 100644 --- a/arch/x86/include/asm/GEN-for-each-reg.h +++ b/arch/x86/include/asm/GEN-for-each-reg.h @@ -1,11 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * These are in machine order; things rely on that. + */ #ifdef CONFIG_64BIT GEN(rax) -GEN(rbx) GEN(rcx) GEN(rdx) +GEN(rbx) +GEN(rsp) +GEN(rbp) GEN(rsi) GEN(rdi) -GEN(rbp) GEN(r8) GEN(r9) GEN(r10) @@ -16,10 +21,11 @@ GEN(r14) GEN(r15) #else GEN(eax) -GEN(ebx) GEN(ecx) GEN(edx) +GEN(ebx) +GEN(esp) +GEN(ebp) GEN(esi) GEN(edi) -GEN(ebp) #endif diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index a3c2315aca12..58eee6402832 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -75,6 +75,7 @@ extern int alternatives_patched; extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +extern void apply_retpolines(s32 *start, s32 *end); struct module; diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 4cb726c71ed8..8f80de627c60 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -17,21 +17,3 @@ extern void cmpxchg8b_emu(void); #endif -#ifdef CONFIG_RETPOLINE - -#undef GEN -#define GEN(reg) \ - extern asmlinkage void __x86_indirect_thunk_ ## reg (void); -#include <asm/GEN-for-each-reg.h> - -#undef GEN -#define GEN(reg) \ - extern asmlinkage void __x86_indirect_alt_call_ ## reg (void); -#include <asm/GEN-for-each-reg.h> - -#undef GEN -#define GEN(reg) \ - extern asmlinkage void __x86_indirect_alt_jmp_ ## reg (void); -#include <asm/GEN-for-each-reg.h> - -#endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 3ad3da9a7d97..3a168483bc8e 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -6,11 +6,13 @@ # define __ASM_FORM(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__, +# define __ASM_REGPFX % #else #include <linux/stringify.h> # define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " " # define __ASM_FORM_RAW(x, ...) __stringify(x,##__VA_ARGS__) # define __ASM_FORM_COMMA(x, ...) " " __stringify(x,##__VA_ARGS__) "," +# define __ASM_REGPFX %% #endif #define _ASM_BYTES(x, ...) __ASM_FORM(.byte x,##__VA_ARGS__ ;) @@ -49,6 +51,9 @@ #define _ASM_SI __ASM_REG(si) #define _ASM_DI __ASM_REG(di) +/* Adds a (%rip) suffix on 64 bits only; for immediate memory references */ +#define _ASM_RIP(x) __ASM_SEL_RAW(x, x (__ASM_REGPFX rip)) + #ifndef __x86_64__ /* 32 bit */ @@ -122,28 +127,19 @@ #ifdef __KERNEL__ +# include <asm/extable_fixup_types.h> + /* Exception table entry */ #ifdef __ASSEMBLY__ -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + +# define _ASM_EXTABLE_TYPE(from, to, type) \ .pushsection "__ex_table","a" ; \ .balign 4 ; \ .long (from) - . ; \ .long (to) - . ; \ - .long (handler) - . ; \ + .long type ; \ .popsection -# define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) - -# define _ASM_EXTABLE_UA(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) - -# define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) - -# define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) - # ifdef CONFIG_KPROBES # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ @@ -155,27 +151,15 @@ # endif #else /* ! __ASSEMBLY__ */ -# define _EXPAND_EXTABLE_HANDLE(x) #x -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + +# define _ASM_EXTABLE_TYPE(from, to, type) \ " .pushsection \"__ex_table\",\"a\"\n" \ " .balign 4\n" \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ - " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ + " .long " __stringify(type) " \n" \ " .popsection\n" -# define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) - -# define _ASM_EXTABLE_UA(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) - -# define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) - -# define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) - /* For C file, we already have NOKPROBE_SYMBOL macro */ /* @@ -188,6 +172,17 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ +#define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_DEFAULT) +#define _ASM_EXTABLE_UA(from, to) \ + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS) + +#define _ASM_EXTABLE_CPY(from, to) \ + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY) + +#define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT) + +#endif /* __KERNEL__ */ #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 16a51e7288d5..1261842d006c 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -173,20 +173,25 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); * means that the boot_cpu_has() variant is already fast enough for the * majority of cases and you should stick to using it as it is generally * only two instructions: a RIP-relative MOV and a TEST. + * + * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know + * that this is only used on a fallback path and will sometimes cause + * it to manifest the address of boot_cpu_data in a register, fouling + * the mainline (post-initialization) code. */ static __always_inline bool _static_cpu_has(u16 bit) { asm_volatile_goto( ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]") - ".section .altinstr_aux,\"ax\"\n" + ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" - " testb %[bitnum],%[cap_byte]\n" + " testb %[bitnum]," _ASM_RIP(%P[cap_byte]) "\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" - ".previous\n" + ".popsection\n" : : [feature] "i" (bit), [bitnum] "i" (1 << (bit & 7)), - [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) + [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); t_yes: return true; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index d0ce5cfd3ac1..d5b5f2ab87a0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -277,6 +277,7 @@ #define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ +#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -298,6 +299,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 14ebd2196569..43184640b579 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -25,7 +25,7 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) * For !SMAP hardware we patch out CLAC on entry. */ if (boot_cpu_has(X86_FEATURE_SMAP) || - (IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV))) + (IS_ENABLED(CONFIG_64BIT) && boot_cpu_has(X86_FEATURE_XENPV))) mask |= X86_EFLAGS_AC; WARN_ON_ONCE(flags & mask); diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index 1f0cbc52937c..93f400eb728f 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -1,12 +1,18 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_EXTABLE_H #define _ASM_X86_EXTABLE_H + +#include <asm/extable_fixup_types.h> + /* - * The exception table consists of triples of addresses relative to the - * exception table entry itself. The first address is of an instruction - * that is allowed to fault, the second is the target at which the program - * should continue. The third is a handler function to deal with the fault - * caused by the instruction in the first field. + * The exception table consists of two addresses relative to the + * exception table entry itself and a type selector field. + * + * The first address is of an instruction that is allowed to fault, the + * second is the target at which the program should continue. + * + * The type entry is used by fixup_exception() to select the handler to + * deal with the fault caused by the instruction in the first field. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -15,7 +21,7 @@ */ struct exception_table_entry { - int insn, fixup, handler; + int insn, fixup, type; }; struct pt_regs; @@ -25,21 +31,27 @@ struct pt_regs; do { \ (a)->fixup = (b)->fixup + (delta); \ (b)->fixup = (tmp).fixup - (delta); \ - (a)->handler = (b)->handler + (delta); \ - (b)->handler = (tmp).handler - (delta); \ + (a)->type = (b)->type; \ + (b)->type = (tmp).type; \ } while (0) -enum handler_type { - EX_HANDLER_NONE, - EX_HANDLER_FAULT, - EX_HANDLER_UACCESS, - EX_HANDLER_OTHER -}; - extern int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr); extern int fixup_bug(struct pt_regs *regs, int trapnr); -extern enum handler_type ex_get_fault_handler_type(unsigned long ip); +extern int ex_get_fixup_type(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); +#ifdef CONFIG_X86_MCE +extern void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr); +#else +static inline void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { } +#endif + +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_X86_64) +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs); +#else +static inline bool ex_handler_bpf(const struct exception_table_entry *x, + struct pt_regs *regs) { return false; } +#endif + #endif diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h new file mode 100644 index 000000000000..409524d5d2eb --- /dev/null +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_EXTABLE_FIXUP_TYPES_H +#define _ASM_X86_EXTABLE_FIXUP_TYPES_H + +#define EX_TYPE_NONE 0 +#define EX_TYPE_DEFAULT 1 +#define EX_TYPE_FAULT 2 +#define EX_TYPE_UACCESS 3 +#define EX_TYPE_COPY 4 +#define EX_TYPE_CLEAR_FS 5 +#define EX_TYPE_FPU_RESTORE 6 +#define EX_TYPE_WRMSR 7 +#define EX_TYPE_RDMSR 8 +#define EX_TYPE_BPF 9 + +#define EX_TYPE_WRMSR_IN_MCE 10 +#define EX_TYPE_RDMSR_IN_MCE 11 + +#define EX_TYPE_DEFAULT_MCE_SAFE 12 +#define EX_TYPE_FAULT_MCE_SAFE 13 + +#endif diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 23bef08a8388..b7267b9e452f 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -12,6 +12,8 @@ #define _ASM_X86_FPU_API_H #include <linux/bottom_half.h> +#include <asm/fpu/types.h> + /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It * disables preemption so be careful if you intend to use it for long periods @@ -48,9 +50,9 @@ static inline void kernel_fpu_begin(void) } /* - * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. + * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate. * A context switch will (and softirq might) save CPU's FPU registers to - * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in + * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in * a random state. * * local_bh_disable() protects against both preemption and soft interrupts @@ -108,4 +110,56 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); static inline void update_pasid(void) { } +/* Trap handling */ +extern int fpu__exception_code(struct fpu *fpu, int trap_nr); +extern void fpu_sync_fpstate(struct fpu *fpu); +extern void fpu_reset_from_exception_fixup(void); + +/* Boot, hotplug and resume */ +extern void fpu__init_cpu(void); +extern void fpu__init_system(struct cpuinfo_x86 *c); +extern void fpu__init_check_bugs(void); +extern void fpu__resume_cpu(void); + +#ifdef CONFIG_MATH_EMULATION +extern void fpstate_init_soft(struct swregs_state *soft); +#else +static inline void fpstate_init_soft(struct swregs_state *soft) {} +#endif + +/* State tracking */ +DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +/* Process cleanup */ +#ifdef CONFIG_X86_64 +extern void fpstate_free(struct fpu *fpu); +#else +static inline void fpstate_free(struct fpu *fpu) { } +#endif + +/* fpstate-related functions which are exported to KVM */ +extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); + +/* KVM specific functions */ +extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); +extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); +extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); + +extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); +extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); + +static inline void fpstate_set_confidential(struct fpu_guest *gfpu) +{ + gfpu->fpstate->is_confidential = true; +} + +static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) +{ + return gfpu->fpstate->is_confidential; +} + +/* prctl */ +struct task_struct; +extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2); + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 5a18694a89b2..e69de29bb2d1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -1,540 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _ASM_X86_FPU_INTERNAL_H -#define _ASM_X86_FPU_INTERNAL_H - -#include <linux/compat.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/mm.h> - -#include <asm/user.h> -#include <asm/fpu/api.h> -#include <asm/fpu/xstate.h> -#include <asm/fpu/xcr.h> -#include <asm/cpufeature.h> -#include <asm/trace/fpu.h> - -/* - * High level FPU state handling functions: - */ -extern int fpu__restore_sig(void __user *buf, int ia32_frame); -extern void fpu__drop(struct fpu *fpu); -extern void fpu__clear_user_states(struct fpu *fpu); -extern int fpu__exception_code(struct fpu *fpu, int trap_nr); - -extern void fpu_sync_fpstate(struct fpu *fpu); - -/* Clone and exit operations */ -extern int fpu_clone(struct task_struct *dst); -extern void fpu_flush_thread(void); - -/* - * Boot time FPU initialization functions: - */ -extern void fpu__init_cpu(void); -extern void fpu__init_system_xstate(void); -extern void fpu__init_cpu_xstate(void); -extern void fpu__init_system(struct cpuinfo_x86 *c); -extern void fpu__init_check_bugs(void); -extern void fpu__resume_cpu(void); - -/* - * Debugging facility: - */ -#ifdef CONFIG_X86_DEBUG_FPU -# define WARN_ON_FPU(x) WARN_ON_ONCE(x) -#else -# define WARN_ON_FPU(x) ({ (void)(x); 0; }) -#endif - -/* - * FPU related CPU feature flag helper routines: - */ -static __always_inline __pure bool use_xsaveopt(void) -{ - return static_cpu_has(X86_FEATURE_XSAVEOPT); -} - -static __always_inline __pure bool use_xsave(void) -{ - return static_cpu_has(X86_FEATURE_XSAVE); -} - -static __always_inline __pure bool use_fxsr(void) -{ - return static_cpu_has(X86_FEATURE_FXSR); -} - -/* - * fpstate handling functions: - */ - -extern union fpregs_state init_fpstate; - -extern void fpstate_init(union fpregs_state *state); -#ifdef CONFIG_MATH_EMULATION -extern void fpstate_init_soft(struct swregs_state *soft); -#else -static inline void fpstate_init_soft(struct swregs_state *soft) {} -#endif -extern void save_fpregs_to_fpstate(struct fpu *fpu); - -/* Returns 0 or the negated trap number, which results in -EFAULT for #PF */ -#define user_insn(insn, output, input...) \ -({ \ - int err; \ - \ - might_fault(); \ - \ - asm volatile(ASM_STAC "\n" \ - "1: " #insn "\n" \ - "2: " ASM_CLAC "\n" \ - ".section .fixup,\"ax\"\n" \ - "3: negl %%eax\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ - : [err] "=a" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define kernel_insn_err(insn, output, input...) \ -({ \ - int err; \ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define kernel_insn(insn, output, input...) \ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ - : output : input) - -static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) -{ - return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); -} - -static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); - else - return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); - -} - -static inline void fxrstor(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int fxrstor_safe(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void frstor(struct fregs_state *fx) -{ - kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int frstor_safe(struct fregs_state *fx) -{ - return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) -{ - return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void fxsave(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); - else - asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); -} - -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" - -/* - * After this @err contains 0 on success or the negated trap number when - * the operation raises an exception. For faults this results in -EFAULT. - */ -#define XSTATE_OP(op, st, lmask, hmask, err) \ - asm volatile("1:" op "\n\t" \ - "xor %[err], %[err]\n" \ - "2:\n\t" \ - ".pushsection .fixup,\"ax\"\n\t" \ - "3: negl %%eax\n\t" \ - "jmp 2b\n\t" \ - ".popsection\n\t" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ - : [err] "=a" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact - * format and supervisor states in addition to modified optimization in - * XSAVEOPT. - * - * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT - * supports modified optimization which is not supported by XSAVE. - * - * We use XSAVE as a fallback. - * - * The 661 label is defined in the ALTERNATIVE* macros as the address of the - * original instruction which gets replaced. We need to use it here as the - * address of the instruction where we might get an exception at. - */ -#define XSTATE_XSAVE(st, lmask, hmask, err) \ - asm volatile(ALTERNATIVE_2(XSAVE, \ - XSAVEOPT, X86_FEATURE_XSAVEOPT, \ - XSAVES, X86_FEATURE_XSAVES) \ - "\n" \ - "xor %[err], %[err]\n" \ - "3:\n" \ - ".pushsection .fixup,\"ax\"\n" \ - "4: movl $-2, %[err]\n" \ - "jmp 3b\n" \ - ".popsection\n" \ - _ASM_EXTABLE(661b, 4b) \ - : [err] "=r" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact - * XSAVE area format. - */ -#define XSTATE_XRESTORE(st, lmask, hmask) \ - asm volatile(ALTERNATIVE(XRSTOR, \ - XRSTORS, X86_FEATURE_XSAVES) \ - "\n" \ - "3:\n" \ - _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\ - : \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline void os_xrstor_booting(struct xregs_state *xstate) -{ - u64 mask = xfeatures_mask_fpstate(); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); - else - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - - /* - * We should never fault when copying from a kernel buffer, and the FPU - * state we set at boot time should be valid. - */ - WARN_ON_FPU(err); -} - -/* - * Save processor xstate to xsave area. - * - * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features - * and command line options. The choice is permanent until the next reboot. - */ -static inline void os_xsave(struct xregs_state *xstate) -{ - u64 mask = xfeatures_mask_all; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON_FPU(!alternatives_patched); - - XSTATE_XSAVE(xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - WARN_ON_FPU(err); -} - -/* - * Restore processor xstate from xsave area. - * - * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. - */ -static inline void os_xrstor(struct xregs_state *xstate, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - - XSTATE_XRESTORE(xstate, lmask, hmask); -} - -/* - * Save xstate to user space xsave area. - * - * We don't use modified optimization because xrstor/xrstors might track - * a different application. - * - * We don't use compacted format xsave area for - * backward compatibility for old applications which don't understand - * compacted format of xsave area. - */ -static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) -{ - /* - * Include the features which are not xsaved/rstored by the kernel - * internally, e.g. PKRU. That's user space ABI and also required - * to allow the signal handler to modify PKRU. - */ - u64 mask = xfeatures_mask_uabi(); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - /* - * Clear the xsave header first, so that reserved fields are - * initialized to zero. - */ - err = __clear_user(&buf->header, sizeof(buf->header)); - if (unlikely(err)) - return -EFAULT; - - stac(); - XSTATE_OP(XSAVE, buf, lmask, hmask, err); - clac(); - - return err; -} - -/* - * Restore xstate from user space xsave area. - */ -static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) -{ - struct xregs_state *xstate = ((__force struct xregs_state *)buf); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - stac(); - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - clac(); - - return err; -} - -/* - * Restore xstate from kernel space xsave area, return an error code instead of - * an exception. - */ -static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) - XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); - else - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - - return err; -} - -extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); - -static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) -{ - __restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate()); -} - -extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); - -/* - * FPU context switch related helper methods: - */ - -DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); - -/* - * The in-register FPU state for an FPU context on a CPU is assumed to be - * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx - * matches the FPU. - * - * If the FPU register state is valid, the kernel can skip restoring the - * FPU state from memory. - * - * Any code that clobbers the FPU registers or updates the in-memory - * FPU state for a task MUST let the rest of the kernel know that the - * FPU registers are no longer valid for this task. - * - * Either one of these invalidation functions is enough. Invalidate - * a resource you control: CPU if using the CPU for something else - * (with preemption disabled), FPU for the current task, or a task that - * is prevented from running by the current task. - */ -static inline void __cpu_invalidate_fpregs_state(void) -{ - __this_cpu_write(fpu_fpregs_owner_ctx, NULL); -} - -static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) -{ - fpu->last_cpu = -1; -} - -static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) -{ - return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; -} - -/* - * These generally need preemption protection to work, - * do try to avoid using these on their own: - */ -static inline void fpregs_deactivate(struct fpu *fpu) -{ - this_cpu_write(fpu_fpregs_owner_ctx, NULL); - trace_x86_fpu_regs_deactivated(fpu); -} - -static inline void fpregs_activate(struct fpu *fpu) -{ - this_cpu_write(fpu_fpregs_owner_ctx, fpu); - trace_x86_fpu_regs_activated(fpu); -} - -/* Internal helper for switch_fpu_return() and signal frame setup */ -static inline void fpregs_restore_userregs(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - int cpu = smp_processor_id(); - - if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) - return; - - if (!fpregs_state_valid(fpu, cpu)) { - u64 mask; - - /* - * This restores _all_ xstate which has not been - * established yet. - * - * If PKRU is enabled, then the PKRU value is already - * correct because it was either set in switch_to() or in - * flush_thread(). So it is excluded because it might be - * not up to date in current->thread.fpu.xsave state. - */ - mask = xfeatures_mask_restore_user() | - xfeatures_mask_supervisor(); - __restore_fpregs_from_fpstate(&fpu->state, mask); - - fpregs_activate(fpu); - fpu->last_cpu = cpu; - } - clear_thread_flag(TIF_NEED_FPU_LOAD); -} - -/* - * FPU state switching for scheduling. - * - * This is a two-stage process: - * - * - switch_fpu_prepare() saves the old state. - * This is done within the context of the old process. - * - * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state - * will get loaded on return to userspace, or when the kernel needs it. - * - * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers - * are saved in the current thread's FPU register state. - * - * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not - * hold current()'s FPU registers. It is required to load the - * registers before returning to userland or using the content - * otherwise. - * - * The FPU context is only stored/restored for a user task and - * PF_KTHREAD is used to distinguish between kernel and user threads. - */ -static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) -{ - if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) { - save_fpregs_to_fpstate(old_fpu); - /* - * The save operation preserved register state, so the - * fpu_fpregs_owner_ctx is still @old_fpu. Store the - * current CPU number in @old_fpu, so the next return - * to user space can avoid the FPU register restore - * when is returns on the same CPU and still owns the - * context. - */ - old_fpu->last_cpu = cpu; - - trace_x86_fpu_regs_deactivated(old_fpu); - } -} - -/* - * Misc helper functions: - */ - -/* - * Delay loading of the complete FPU state until the return to userland. - * PKRU is handled separately. - */ -static inline void switch_fpu_finish(struct fpu *new_fpu) -{ - if (cpu_feature_enabled(X86_FEATURE_FPU)) - set_thread_flag(TIF_NEED_FPU_LOAD); -} - -#endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h new file mode 100644 index 000000000000..99a8820e8cc4 --- /dev/null +++ b/arch/x86/include/asm/fpu/sched.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_FPU_SCHED_H +#define _ASM_X86_FPU_SCHED_H + +#include <linux/sched.h> + +#include <asm/cpufeature.h> +#include <asm/fpu/types.h> + +#include <asm/trace/fpu.h> + +extern void save_fpregs_to_fpstate(struct fpu *fpu); +extern void fpu__drop(struct fpu *fpu); +extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags); +extern void fpu_flush_thread(void); + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state. + * This is done within the context of the old process. + * + * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state + * will get loaded on return to userspace, or when the kernel needs it. + * + * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers + * are saved in the current thread's FPU register state. + * + * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not + * hold current()'s FPU registers. It is required to load the + * registers before returning to userland or using the content + * otherwise. + * + * The FPU context is only stored/restored for a user task and + * PF_KTHREAD is used to distinguish between kernel and user threads. + */ +static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) +{ + if (cpu_feature_enabled(X86_FEATURE_FPU) && + !(current->flags & PF_KTHREAD)) { + save_fpregs_to_fpstate(old_fpu); + /* + * The save operation preserved register state, so the + * fpu_fpregs_owner_ctx is still @old_fpu. Store the + * current CPU number in @old_fpu, so the next return + * to user space can avoid the FPU register restore + * when is returns on the same CPU and still owns the + * context. + */ + old_fpu->last_cpu = cpu; + + trace_x86_fpu_regs_deactivated(old_fpu); + } +} + +/* + * Delay loading of the complete FPU state until the return to userland. + * PKRU is handled separately. + */ +static inline void switch_fpu_finish(void) +{ + if (cpu_feature_enabled(X86_FEATURE_FPU)) + set_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif /* _ASM_X86_FPU_SCHED_H */ diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 8b6631dffefd..22b0273a8bf1 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -5,6 +5,11 @@ #ifndef _ASM_X86_FPU_SIGNAL_H #define _ASM_X86_FPU_SIGNAL_H +#include <linux/compat.h> +#include <linux/user.h> + +#include <asm/fpu/types.h> + #ifdef CONFIG_X86_64 # include <uapi/asm/sigcontext.h> # include <asm/user32.h> @@ -31,6 +36,12 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long fpu__get_fpstate_size(void); -extern void fpu__init_prepare_fx_sw_frame(void); +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); +extern void fpu__clear_user_states(struct fpu *fpu); +extern bool fpu__restore_sig(void __user *buf, int ia32_frame); + +extern void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask); + +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); #endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index f5a38a5f3ae1..3c06c82ab355 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -120,6 +120,9 @@ enum xfeature { XFEATURE_RSRVD_COMP_13, XFEATURE_RSRVD_COMP_14, XFEATURE_LBR, + XFEATURE_RSRVD_COMP_16, + XFEATURE_XTILE_CFG, + XFEATURE_XTILE_DATA, XFEATURE_MAX, }; @@ -136,12 +139,21 @@ enum xfeature { #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) +#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) +#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ | XFEATURE_MASK_ZMM_Hi256 \ | XFEATURE_MASK_Hi16_ZMM) +#ifdef CONFIG_X86_64 +# define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILE_DATA \ + | XFEATURE_MASK_XTILE_CFG) +#else +# define XFEATURE_MASK_XTILE (0) +#endif + #define FIRST_EXTENDED_XFEATURE XFEATURE_YMM struct reg_128_bit { @@ -153,6 +165,9 @@ struct reg_256_bit { struct reg_512_bit { u8 regbytes[512/8]; }; +struct reg_1024_byte { + u8 regbytes[1024]; +}; /* * State component 2: @@ -255,6 +270,23 @@ struct arch_lbr_state { u64 ler_to; u64 ler_info; struct lbr_entry entries[]; +}; + +/* + * State component 17: 64-byte tile configuration register. + */ +struct xtile_cfg { + u64 tcfg[8]; +} __packed; + +/* + * State component 18: 1KB tile data register. + * Each register represents 16 64-byte rows of the matrix + * data. But the number of registers depends on the actual + * implementation. + */ +struct xtile_data { + struct reg_1024_byte tmm; } __packed; /* @@ -309,6 +341,91 @@ union fpregs_state { u8 __padding[PAGE_SIZE]; }; +struct fpstate { + /* @kernel_size: The size of the kernel register image */ + unsigned int size; + + /* @user_size: The size in non-compacted UABI format */ + unsigned int user_size; + + /* @xfeatures: xfeatures for which the storage is sized */ + u64 xfeatures; + + /* @user_xfeatures: xfeatures valid in UABI buffers */ + u64 user_xfeatures; + + /* @xfd: xfeatures disabled to trap userspace use. */ + u64 xfd; + + /* @is_valloc: Indicator for dynamically allocated state */ + unsigned int is_valloc : 1; + + /* @is_guest: Indicator for guest state (KVM) */ + unsigned int is_guest : 1; + + /* + * @is_confidential: Indicator for KVM confidential mode. + * The FPU registers are restored by the + * vmentry firmware from encrypted guest + * memory. On vmexit the FPU registers are + * saved by firmware to encrypted guest memory + * and the registers are scrubbed before + * returning to the host. So there is no + * content which is worth saving and restoring. + * The fpstate has to be there so that + * preemption and softirq FPU usage works + * without special casing. + */ + unsigned int is_confidential : 1; + + /* @in_use: State is in use */ + unsigned int in_use : 1; + + /* @regs: The register state union for all supported formats */ + union fpregs_state regs; + + /* @regs is dynamically sized! Don't add anything after @regs! */ +} __aligned(64); + +struct fpu_state_perm { + /* + * @__state_perm: + * + * This bitmap indicates the permission for state components, which + * are available to a thread group. The permission prctl() sets the + * enabled state bits in thread_group_leader()->thread.fpu. + * + * All run time operations use the per thread information in the + * currently active fpu.fpstate which contains the xfeature masks + * and sizes for kernel and user space. + * + * This master permission field is only to be used when + * task.fpu.fpstate based checks fail to validate whether the task + * is allowed to expand it's xfeatures set which requires to + * allocate a larger sized fpstate buffer. + * + * Do not access this field directly. Use the provided helper + * function. Unlocked access is possible for quick checks. + */ + u64 __state_perm; + + /* + * @__state_size: + * + * The size required for @__state_perm. Only valid to access + * with sighand locked. + */ + unsigned int __state_size; + + /* + * @__user_state_size: + * + * The size required for @__state_perm user part. Only valid to + * access with sighand locked. + */ + unsigned int __user_state_size; +}; + /* * Highest level per task FPU state data structure that * contains the FPU register state plus various FPU @@ -337,19 +454,100 @@ struct fpu { unsigned long avx512_timestamp; /* - * @state: + * @fpstate: + * + * Pointer to the active struct fpstate. Initialized to + * point at @__fpstate below. + */ + struct fpstate *fpstate; + + /* + * @__task_fpstate: + * + * Pointer to an inactive struct fpstate. Initialized to NULL. Is + * used only for KVM support to swap out the regular task fpstate. + */ + struct fpstate *__task_fpstate; + + /* + * @perm: + * + * Permission related information + */ + struct fpu_state_perm perm; + + /* + * @__fpstate: * - * In-memory copy of all FPU registers that we save/restore - * over context switches. If the task is using the FPU then - * the registers in the FPU are more recent than this state - * copy. If the task context-switches away then they get - * saved here and represent the FPU state. + * Initial in-memory storage for FPU registers which are saved in + * context switch and when the kernel uses the FPU. The registers + * are restored from this storage on return to user space if they + * are not longer containing the tasks FPU register state. */ - union fpregs_state state; + struct fpstate __fpstate; /* - * WARNING: 'state' is dynamically-sized. Do not put + * WARNING: '__fpstate' is dynamically-sized. Do not put * anything after it here. */ }; +/* + * Guest pseudo FPU container + */ +struct fpu_guest { + /* + * @fpstate: Pointer to the allocated guest fpstate + */ + struct fpstate *fpstate; +}; + +/* + * FPU state configuration data. Initialized at boot time. Read only after init. + */ +struct fpu_state_config { + /* + * @max_size: + * + * The maximum size of the register state buffer. Includes all + * supported features except independent managed features. + */ + unsigned int max_size; + + /* + * @default_size: + * + * The default size of the register state buffer. Includes all + * supported features except independent managed features and + * features which have to be requested by user space before usage. + */ + unsigned int default_size; + + /* + * @max_features: + * + * The maximum supported features bitmap. Does not include + * independent managed features. + */ + u64 max_features; + + /* + * @default_features: + * + * The default supported features bitmap. Does not include + * independent managed features and features which have to + * be requested by user space before usage. + */ + u64 default_features; + /* + * @legacy_features: + * + * Features which can be reported back to user space + * even without XSAVE support, i.e. legacy features FP + SSE + */ + u64 legacy_features; +}; + +/* FPU state configuration information */ +extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg; + #endif /* _ASM_X86_FPU_H */ diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h index 1c7ab8d95da5..79f95d3787e2 100644 --- a/arch/x86/include/asm/fpu/xcr.h +++ b/arch/x86/include/asm/fpu/xcr.h @@ -2,17 +2,6 @@ #ifndef _ASM_X86_FPU_XCR_H #define _ASM_X86_FPU_XCR_H -/* - * MXCSR and XCR definitions: - */ - -static inline void ldmxcsr(u32 mxcsr) -{ - asm volatile("ldmxcsr %0" :: "m" (mxcsr)); -} - -extern unsigned int mxcsr_feature_mask; - #define XCR_XFEATURE_ENABLED_MASK 0x00000000 static inline u64 xgetbv(u32 index) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 109dfcc75299..0f8b90ab18c9 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -14,6 +14,8 @@ #define XSTATE_CPUID 0x0000000d +#define TILE_CPUID 0x0000001d + #define FXSAVE_SIZE 512 #define XSAVE_HDR_SIZE 64 @@ -33,7 +35,8 @@ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR) + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_XTILE) /* * Features which are restored when returning to user space. @@ -43,6 +46,9 @@ #define XFEATURE_MASK_USER_RESTORE \ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) +/* Features which are dynamically enabled for a process on request */ +#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA + /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) @@ -78,78 +84,42 @@ XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) -#ifdef CONFIG_X86_64 -#define REX_PREFIX "0x48, " -#else -#define REX_PREFIX -#endif - -extern u64 xfeatures_mask_all; - -static inline u64 xfeatures_mask_supervisor(void) -{ - return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED; -} - /* - * The xfeatures which are enabled in XCR0 and expected to be in ptrace - * buffers and signal frames. + * The feature mask required to restore FPU state: + * - All user states which are not eagerly switched in switch_to()/exec() + * - The suporvisor states */ -static inline u64 xfeatures_mask_uabi(void) -{ - return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; -} - -/* - * The xfeatures which are restored by the kernel when returning to user - * mode. This is not necessarily the same as xfeatures_mask_uabi() as the - * kernel does not manage all XCR0 enabled features via xsave/xrstor as - * some of them have to be switched eagerly on context switch and exec(). - */ -static inline u64 xfeatures_mask_restore_user(void) -{ - return xfeatures_mask_all & XFEATURE_MASK_USER_RESTORE; -} - -/* - * Like xfeatures_mask_restore_user() but additionally restors the - * supported supervisor states. - */ -static inline u64 xfeatures_mask_fpstate(void) -{ - return xfeatures_mask_all & \ - (XFEATURE_MASK_USER_RESTORE | XFEATURE_MASK_SUPERVISOR_SUPPORTED); -} - -static inline u64 xfeatures_mask_independent(void) -{ - if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) - return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; - - return XFEATURE_MASK_INDEPENDENT; -} +#define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ + XFEATURE_MASK_SUPERVISOR_SUPPORTED) extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); -void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); int xfeature_size(int xfeature_nr); -int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); -int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void xsaves(struct xregs_state *xsave, u64 mask); void xrstors(struct xregs_state *xsave, u64 mask); -enum xstate_copy_mode { - XSTATE_COPY_FP, - XSTATE_COPY_FX, - XSTATE_COPY_XSAVE, -}; +int xfd_enable_feature(u64 xfd_err); + +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +#endif + +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); -struct membuf; -void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, - enum xstate_copy_mode mode); +static __always_inline __pure bool fpu_state_size_dynamic(void) +{ + return static_branch_unlikely(&__fpu_state_size_dynamic); +} +#else +static __always_inline __pure bool fpu_state_size_dynamic(void) +{ + return false; +} +#endif #endif diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 2c5f7861d373..fada857f0a1e 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -68,6 +68,6 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm); #endif -#endif /* !CONFIG_IA32_SUPPORT */ +#endif /* CONFIG_IA32_EMULATION */ #endif /* _ASM_X86_IA32_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 841a5d104afa..5c6a4af0b911 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -391,6 +391,7 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif +#ifdef CONFIG_AMD_MEM_ENCRYPT extern bool arch_memremap_can_ram_remap(resource_size_t offset, unsigned long size, unsigned long flags); @@ -398,6 +399,13 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, extern bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size); +#else +static inline bool phys_mem_access_encrypted(unsigned long phys_addr, + unsigned long size) +{ + return true; +} +#endif /** * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 562854c60808..7dcc2b0a4f09 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -58,7 +58,7 @@ * the output constraints to make the compiler aware that R11 cannot be * reused after the asm() statement. * - * For builds with CONFIG_UNWIND_FRAME_POINTER ASM_CALL_CONSTRAINT is + * For builds with CONFIG_UNWINDER_FRAME_POINTER, ASM_CALL_CONSTRAINT is * required as well as this prevents certain creative GCC variants from * misplacing the ASM code. * @@ -185,6 +185,7 @@ IRQ_CONSTRAINTS, regs, vector); \ } +#ifndef CONFIG_PREEMPT_RT #define ASM_CALL_SOFTIRQ \ "call %P[__func] \n" @@ -201,6 +202,8 @@ __this_cpu_write(hardirq_stack_inuse, false); \ } +#endif + #else /* CONFIG_X86_64 */ /* System vector handlers always run on the stack they interrupted. */ #define run_sysvec_on_irqstack_cond(func, regs) \ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 0a6e34b07017..11b7c06e2828 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -129,7 +129,7 @@ relocate_kernel(unsigned long indirection_page, unsigned long page_list, unsigned long start_address, unsigned int preserve_context, - unsigned int sme_active); + unsigned int host_mem_enc_active); #endif #define ARCH_HAS_KIMAGE_ARCH diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f8f48a7ec577..32f300dade5e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -691,18 +691,18 @@ struct kvm_vcpu_arch { * * Note that while the PKRU state lives inside the fpu registers, * it is switched out separately at VMENTER and VMEXIT time. The - * "guest_fpu" state here contains the guest FPU context, with the + * "guest_fpstate" state here contains the guest FPU context, with the * host PRKU bits. */ - struct fpu *user_fpu; - struct fpu *guest_fpu; + struct fpu_guest guest_fpu; u64 xcr0; u64 guest_supported_xcr0; struct kvm_pio_request pio; void *pio_data; - void *guest_ins_data; + void *sev_pio_data; + unsigned sev_pio_count; u8 event_exit_inst_len; @@ -1097,7 +1097,7 @@ struct kvm_arch { u64 cur_tsc_generation; int nr_vcpus_matched_tsc; - spinlock_t pvclock_gtod_sync_lock; + raw_spinlock_t pvclock_gtod_sync_lock; bool use_master_clock; u64 master_kernel_ns; u64 master_cycle_now; @@ -1685,8 +1685,6 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); -void kvm_free_guest_fpu(struct kvm_vcpu *vcpu); - void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index 87bd6025d91d..6a5f3acf2b33 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -46,7 +46,7 @@ struct kvm_page_track_notifier_node { struct kvm_page_track_notifier_node *node); }; -void kvm_page_track_init(struct kvm *kvm); +int kvm_page_track_init(struct kvm *kvm); void kvm_page_track_cleanup(struct kvm *kvm); void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h index eceea9299097..6c5765192102 100644 --- a/arch/x86/include/asm/kvmclock.h +++ b/arch/x86/include/asm/kvmclock.h @@ -2,6 +2,20 @@ #ifndef _ASM_X86_KVM_CLOCK_H #define _ASM_X86_KVM_CLOCK_H +#include <linux/percpu.h> + extern struct clocksource kvm_clock; +DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); + +static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +{ + return &this_cpu_read(hv_clock_per_cpu)->pvti; +} + +static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) +{ + return this_cpu_read(hv_clock_per_cpu); +} + #endif /* _ASM_X86_KVM_CLOCK_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index da9321548f6f..813b4f5b0dd6 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -205,28 +205,16 @@ struct cper_ia_proc_ctx; int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); void mcheck_cpu_clear(struct cpuinfo_x86 *c); -void mcheck_vendor_init_severity(void); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id); #else static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} -static inline void mcheck_vendor_init_severity(void) {} static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { return -EINVAL; } #endif -#ifdef CONFIG_X86_ANCIENT_MCE -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); -void winchip_mcheck_init(struct cpuinfo_x86 *c); -static inline void enable_p5_mce(void) { mce_p5_enabled = 1; } -#else -static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} -static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} -static inline void enable_p5_mce(void) {} -#endif - void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct device *, mce_device); diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 9c80c68d75b5..2d4f5c17d79c 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -13,6 +13,7 @@ #ifndef __ASSEMBLY__ #include <linux/init.h> +#include <linux/cc_platform.h> #include <asm/bootparam.h> @@ -50,9 +51,6 @@ void __init mem_encrypt_free_decrypted_mem(void); void __init mem_encrypt_init(void); void __init sev_es_init_vc_handling(void); -bool sme_active(void); -bool sev_active(void); -bool sev_es_active(void); #define __bss_decrypted __section(".bss..decrypted") @@ -75,9 +73,6 @@ static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } static inline void sev_es_init_vc_handling(void) { } -static inline bool sme_active(void) { return false; } -static inline bool sev_active(void) { return false; } -static inline bool sev_es_active(void) { return false; } static inline int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; } @@ -101,11 +96,6 @@ static inline void mem_encrypt_free_decrypted_mem(void) { } extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; -static inline bool mem_encrypt_active(void) -{ - return sme_me_mask; -} - static inline u64 sme_get_me_mask(void) { return sme_me_mask; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a7c413432b33..01e2650b9585 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -625,6 +625,8 @@ #define MSR_IA32_BNDCFGS_RSVD 0x00000ffc +#define MSR_IA32_XFD 0x000001c4 +#define MSR_IA32_XFD_ERR 0x000001c5 #define MSR_IA32_XSS 0x00000da0 #define MSR_IA32_APICBASE 0x0000001b diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index a3f87f1015d3..6b52182e178a 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -92,7 +92,7 @@ static __always_inline unsigned long long __rdmsr(unsigned int msr) asm volatile("1: rdmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); @@ -102,7 +102,7 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) { asm volatile("1: wrmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a"(low), "d" (high) : "memory"); } diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index ec2d5c8c6694..cc74dc584836 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -5,12 +5,15 @@ #include <linux/static_key.h> #include <linux/objtool.h> +#include <linux/linkage.h> #include <asm/alternative.h> #include <asm/cpufeatures.h> #include <asm/msr-index.h> #include <asm/unwind_hints.h> +#define RETPOLINE_THUNK_SIZE 32 + /* * Fill the CPU return stack buffer. * @@ -118,6 +121,16 @@ ".popsection\n\t" #ifdef CONFIG_RETPOLINE + +typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; + +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_thunk_ ## reg; +#include <asm/GEN-for-each-reg.h> +#undef GEN + +extern retpoline_thunk_t __x86_indirect_thunk_array[]; + #ifdef CONFIG_X86_64 /* @@ -303,63 +316,4 @@ static inline void mds_idle_clear_cpu_buffers(void) #endif /* __ASSEMBLY__ */ -/* - * Below is used in the eBPF JIT compiler and emits the byte sequence - * for the following assembly: - * - * With retpolines configured: - * - * callq do_rop - * spec_trap: - * pause - * lfence - * jmp spec_trap - * do_rop: - * mov %rcx,(%rsp) for x86_64 - * mov %edx,(%esp) for x86_32 - * retq - * - * Without retpolines configured: - * - * jmp *%rcx for x86_64 - * jmp *%edx for x86_32 - */ -#ifdef CONFIG_RETPOLINE -# ifdef CONFIG_X86_64 -# define RETPOLINE_RCX_BPF_JIT_SIZE 17 -# define RETPOLINE_RCX_BPF_JIT() \ -do { \ - EMIT1_off32(0xE8, 7); /* callq do_rop */ \ - /* spec_trap: */ \ - EMIT2(0xF3, 0x90); /* pause */ \ - EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ - EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ - /* do_rop: */ \ - EMIT4(0x48, 0x89, 0x0C, 0x24); /* mov %rcx,(%rsp) */ \ - EMIT1(0xC3); /* retq */ \ -} while (0) -# else /* !CONFIG_X86_64 */ -# define RETPOLINE_EDX_BPF_JIT() \ -do { \ - EMIT1_off32(0xE8, 7); /* call do_rop */ \ - /* spec_trap: */ \ - EMIT2(0xF3, 0x90); /* pause */ \ - EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ - EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ - /* do_rop: */ \ - EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */ \ - EMIT1(0xC3); /* ret */ \ -} while (0) -# endif -#else /* !CONFIG_RETPOLINE */ -# ifdef CONFIG_X86_64 -# define RETPOLINE_RCX_BPF_JIT_SIZE 2 -# define RETPOLINE_RCX_BPF_JIT() \ - EMIT2(0xFF, 0xE1); /* jmp *%rcx */ -# else /* !CONFIG_X86_64 */ -# define RETPOLINE_EDX_BPF_JIT() \ - EMIT2(0xFF, 0xE2) /* jmp *%edx */ -# endif -#endif - #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index 94dbd51df58f..b13f8488ac85 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -43,7 +43,7 @@ static inline void copy_page(void *to, void *from) { memcpy(to, from, PAGE_SIZE); } -#endif /* CONFIG_X86_3DNOW */ +#endif /* CONFIG_X86_USE_3DNOW */ #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PAGE_32_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index da3a1ac82be5..cebec95a7124 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -52,11 +52,11 @@ void __init paravirt_set_cap(void); /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { - pv_ops.cpu.io_delay(); + PVOP_VCALL0(cpu.io_delay); #ifdef REALLY_SLOW_IO - pv_ops.cpu.io_delay(); - pv_ops.cpu.io_delay(); - pv_ops.cpu.io_delay(); + PVOP_VCALL0(cpu.io_delay); + PVOP_VCALL0(cpu.io_delay); + PVOP_VCALL0(cpu.io_delay); #endif } @@ -113,12 +113,12 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, /* * These special macros can be used to get or set a debugging register */ -static inline unsigned long paravirt_get_debugreg(int reg) +static __always_inline unsigned long paravirt_get_debugreg(int reg) { return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) -static inline void set_debugreg(unsigned long val, int reg) +static __always_inline void set_debugreg(unsigned long val, int reg) { PVOP_VCALL2(cpu.set_debugreg, reg, val); } @@ -133,14 +133,14 @@ static inline void write_cr0(unsigned long x) PVOP_VCALL1(cpu.write_cr0, x); } -static inline unsigned long read_cr2(void) +static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, "mov %%cr2, %%rax;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline void write_cr2(unsigned long x) +static __always_inline void write_cr2(unsigned long x) { PVOP_VCALL1(mmu.write_cr2, x); } @@ -653,10 +653,10 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); * functions. */ #define PV_THUNK_NAME(func) "__raw_callee_save_" #func -#define PV_CALLEE_SAVE_REGS_THUNK(func) \ +#define __PV_CALLEE_SAVE_REGS_THUNK(func, section) \ extern typeof(func) __raw_callee_save_##func; \ \ - asm(".pushsection .text;" \ + asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ PV_THUNK_NAME(func) ":" \ @@ -669,6 +669,9 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \ ".popsection") +#define PV_CALLEE_SAVE_REGS_THUNK(func) \ + __PV_CALLEE_SAVE_REGS_THUNK(func, ".text") + /* Get a reference to a callee-save function */ #define PV_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { __raw_callee_save_##func }) @@ -678,23 +681,23 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); ((struct paravirt_callee_save) { func }) #ifdef CONFIG_PARAVIRT_XXL -static inline notrace unsigned long arch_local_save_flags(void) +static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline notrace void arch_local_irq_disable(void) +static __always_inline void arch_local_irq_disable(void) { PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline notrace void arch_local_irq_enable(void) +static __always_inline void arch_local_irq_enable(void) { PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline notrace unsigned long arch_local_irq_save(void) +static __always_inline unsigned long arch_local_irq_save(void) { unsigned long f; diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 5c7bcaa79623..1d5f14aff5f6 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H -#define ARCH_DEFAULT_PKEY 0 - /* * If more than 16 keys are ever supported, a thorough audit * will be necessary to ensure that the types that store key diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index ccc539faa5bb..4cd49afa0ca4 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_PKRU_H #define _ASM_X86_PKRU_H -#include <asm/fpu/xstate.h> +#include <asm/cpufeature.h> #define PKRU_AD_BIT 0x1 #define PKRU_WD_BIT 0x2 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9ad2acaaae9b..887380fc9dfe 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -164,7 +164,8 @@ enum cpuid_regs_idx { #define X86_VENDOR_NSC 8 #define X86_VENDOR_HYGON 9 #define X86_VENDOR_ZHAOXIN 10 -#define X86_VENDOR_NUM 11 +#define X86_VENDOR_VORTEX 11 +#define X86_VENDOR_NUM 12 #define X86_VENDOR_UNKNOWN 0xff @@ -461,9 +462,6 @@ DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* !X86_64 */ -extern unsigned int fpu_kernel_xstate_size; -extern unsigned int fpu_user_xstate_size; - struct perf_event; struct thread_struct { @@ -537,12 +535,12 @@ struct thread_struct { */ }; -/* Whitelist the FPU state from the task_struct for hardened usercopy. */ +extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size); + static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - *offset = offsetof(struct thread_struct, fpu.state); - *size = fpu_kernel_xstate_size; + fpu_thread_struct_whitelist(offset, size); } static inline void @@ -589,7 +587,7 @@ static inline void load_sp0(unsigned long sp0) /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); /* * Generic CPUID function diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 8c5d1910a848..feed36d44d04 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -40,6 +40,6 @@ void x86_report_nx(void); extern int reboot_force; long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long cpuid_enabled); + unsigned long arg2); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index b94f615600d5..703663175a5a 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -181,7 +181,7 @@ static inline bool any_64bit_mode(struct pt_regs *regs) #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp -static inline bool ip_within_syscall_gap(struct pt_regs *regs) +static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs) { bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 72044026eb3c..8dd8e8ec9fa5 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -339,7 +339,7 @@ static inline void __loadsegment_fs(unsigned short value) "1: movw %0, %%fs \n" "2: \n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_clear_fs) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_CLEAR_FS) : : "rm" (value) : "memory"); } diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index fa5cd05d3b5b..ec060c433589 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -53,6 +53,7 @@ static inline u64 lower_bits(u64 val, unsigned int bits) struct real_mode_header; enum stack_type; +struct ghcb; /* Early IDT entry points for #VC handler */ extern void vc_no_ghcb(void); @@ -81,6 +82,11 @@ static __always_inline void sev_es_nmi_complete(void) __sev_es_nmi_complete(); } extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); +extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + bool set_ghcb_msr, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 630ff08532be..08b0e90623ad 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -16,7 +16,9 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); /* cpus sharing the last level cache: */ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); +DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); +DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); static inline struct cpumask *cpu_llc_shared_mask(int cpu) @@ -24,6 +26,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) return per_cpu(cpu_llc_shared_map, cpu); } +static inline struct cpumask *cpu_l2c_shared_mask(int cpu) +{ + return per_cpu(cpu_l2c_shared_map, cpu); +} + DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index f3fbb84ff8a7..68c257a3de0d 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -275,7 +275,7 @@ static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } __iomem *__dst = dst; - int zf; + bool zf; /* * ENQCMDS %(rdx), rax diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index cf132663c219..ebec69c35e95 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -57,6 +57,9 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ u32 status; /* thread synchronous flags */ +#ifdef CONFIG_SMP + u32 cpu; /* current CPU */ +#endif }; #define INIT_THREAD_INFO(tsk) \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 9239399e5491..cc164777e661 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -103,6 +103,7 @@ static inline void setup_node_to_cpumask_map(void) { } #include <asm-generic/topology.h> extern const struct cpumask *cpu_coregroup_mask(int cpu); +extern const struct cpumask *cpu_clustergroup_mask(int cpu); #define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) @@ -113,7 +114,9 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); extern unsigned int __max_die_per_package; #ifdef CONFIG_SMP +#define topology_cluster_id(cpu) (per_cpu(cpu_l2c_id, cpu)) #define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) +#define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 879b77792f94..4645a6334063 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -22,8 +22,8 @@ DECLARE_EVENT_CLASS(x86_fpu, __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { - __entry->xfeatures = fpu->state.xsave.header.xfeatures; - __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; + __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures; + __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index c9fa7be3df82..33a68407def3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -301,8 +301,8 @@ do { \ unsigned int __gu_low, __gu_high; \ const unsigned int __user *__gu_ptr; \ __gu_ptr = (const void __user *)(ptr); \ - __get_user_asm(__gu_low, ptr, "l", "=r", label); \ - __get_user_asm(__gu_high, ptr+1, "l", "=r", label); \ + __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label); \ + __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label); \ (x) = ((unsigned long long)__gu_high << 32) | __gu_low; \ } while (0) #else @@ -411,7 +411,7 @@ do { \ : [umem] "m" (__m(addr)), \ [efault] "i" (-EFAULT), "0" (err)) -#endif // CONFIG_CC_ASM_GOTO_OUTPUT +#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT /* FIXME: this hack is definitely wrong -AK */ struct __large_struct { unsigned long buf[100]; }; diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 454b20815f35..4a7ff8b0db20 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -308,13 +308,13 @@ HYPERVISOR_platform_op(struct xen_platform_op *op) return _hypercall1(int, platform_op, op); } -static inline int +static __always_inline int HYPERVISOR_set_debugreg(int reg, unsigned long value) { return _hypercall2(int, set_debugreg, reg, value); } -static inline unsigned long +static __always_inline unsigned long HYPERVISOR_get_debugreg(int reg) { return _hypercall1(unsigned long, get_debugreg, reg); @@ -358,7 +358,7 @@ HYPERVISOR_event_channel_op(int cmd, void *arg) return _hypercall2(int, event_channel_op, cmd, arg); } -static inline int +static __always_inline int HYPERVISOR_xen_version(int cmd, void *arg) { return _hypercall2(int, xen_version, cmd, arg); diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 3506d8c598c1..4557f7cb0fa6 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -14,16 +14,19 @@ static inline int pci_xen_hvm_init(void) return -1; } #endif -#if defined(CONFIG_XEN_DOM0) +#ifdef CONFIG_XEN_PV_DOM0 int __init pci_xen_initial_domain(void); -int xen_find_device_domain_owner(struct pci_dev *dev); -int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); -int xen_unregister_device_domain_owner(struct pci_dev *dev); #else static inline int __init pci_xen_initial_domain(void) { return -1; } +#endif +#ifdef CONFIG_XEN_DOM0 +int xen_find_device_domain_owner(struct pci_dev *dev); +int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); +int xen_unregister_device_domain_owner(struct pci_dev *dev); +#else static inline int xen_find_device_domain_owner(struct pci_dev *dev) { return -1; diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index 6b56d0d45d15..66b4ddde7743 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -3,14 +3,10 @@ #define _ASM_X86_SWIOTLB_XEN_H #ifdef CONFIG_SWIOTLB_XEN -extern int xen_swiotlb; extern int __init pci_xen_swiotlb_detect(void); -extern void __init pci_xen_swiotlb_init(void); extern int pci_xen_swiotlb_init_late(void); #else -#define xen_swiotlb (0) -static inline int __init pci_xen_swiotlb_detect(void) { return 0; } -static inline void __init pci_xen_swiotlb_init(void) { } +#define pci_xen_swiotlb_detect NULL static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } #endif diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 5a6aac9fa41f..754a07856817 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -10,6 +10,10 @@ #define ARCH_GET_CPUID 0x1011 #define ARCH_SET_CPUID 0x1012 +#define ARCH_GET_XCOMP_SUPP 0x1021 +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 + #define ARCH_MAP_VDSO_X32 0x2001 #define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_64 0x2003 diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index 9690d6899ad9..f4b81587e90b 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -27,6 +27,8 @@ enum sgx_page_flags { _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init) #define SGX_IOC_ENCLAVE_PROVISION \ _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) +#define SGX_IOC_VEPC_REMOVE_ALL \ + _IO(SGX_MAGIC, 0x04) /** * struct sgx_enclave_create - parameter structure for the diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8f4e8fa6ed75..2ff3e600f426 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -21,6 +21,7 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg CFLAGS_REMOVE_sev.o = -pg +CFLAGS_REMOVE_cc_platform.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n @@ -29,6 +30,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n KASAN_SANITIZE_sev.o := n +KASAN_SANITIZE_cc_platform.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. @@ -47,6 +49,7 @@ endif KCOV_INSTRUMENT := n CFLAGS_head$(BITS).o += -fno-stack-protector +CFLAGS_cc_platform.o += -fno-stack-protector CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace @@ -147,6 +150,9 @@ obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o + +obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e9da3dc71254..23fb4d51a5da 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -29,6 +29,7 @@ #include <asm/io.h> #include <asm/fixmap.h> #include <asm/paravirt.h> +#include <asm/asm-prototypes.h> int __read_mostly alternatives_patched; @@ -113,6 +114,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) } } +extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); @@ -221,7 +223,7 @@ static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ -static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) { struct insn insn; int i = 0; @@ -239,11 +241,11 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins * optimized. */ if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) - i += optimize_nops_range(instr, a->instrlen, i); + i += optimize_nops_range(instr, len, i); else i += insn.length; - if (i >= a->instrlen) + if (i >= len) return; } } @@ -331,10 +333,185 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, text_poke_early(instr, insn_buff, insn_buff_sz); next: - optimize_nops(a, instr); + optimize_nops(instr, a->instrlen); } } +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION) + +/* + * CALL/JMP *%\reg + */ +static int emit_indirect(int op, int reg, u8 *bytes) +{ + int i = 0; + u8 modrm; + + switch (op) { + case CALL_INSN_OPCODE: + modrm = 0x10; /* Reg = 2; CALL r/m */ + break; + + case JMP32_INSN_OPCODE: + modrm = 0x20; /* Reg = 4; JMP r/m */ + break; + + default: + WARN_ON_ONCE(1); + return -1; + } + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + + modrm |= 0xc0; /* Mod = 3 */ + modrm += reg; + + bytes[i++] = 0xff; /* opcode */ + bytes[i++] = modrm; + + return i; +} + +/* + * Rewrite the compiler generated retpoline thunk calls. + * + * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate + * indirect instructions, avoiding the extra indirection. + * + * For example, convert: + * + * CALL __x86_indirect_thunk_\reg + * + * into: + * + * CALL *%\reg + * + * It also tries to inline spectre_v2=retpoline,amd when size permits. + */ +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) +{ + retpoline_thunk_t *target; + int reg, ret, i = 0; + u8 op, cc; + + target = addr + insn->length + insn->immediate.value; + reg = target - __x86_indirect_thunk_array; + + if (WARN_ON_ONCE(reg & ~0xf)) + return -1; + + /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ + BUG_ON(reg == 4); + + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) + return -1; + + op = insn->opcode.bytes[0]; + + /* + * Convert: + * + * Jcc.d32 __x86_indirect_thunk_\reg + * + * into: + * + * Jncc.d8 1f + * [ LFENCE ] + * JMP *%\reg + * [ NOP ] + * 1: + */ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + cc = insn->opcode.bytes[1] & 0xf; + cc ^= 1; /* invert condition */ + + bytes[i++] = 0x70 + cc; /* Jcc.d8 */ + bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ + + /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ + op = JMP32_INSN_OPCODE; + } + + /* + * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE. + */ + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + bytes[i++] = 0x0f; + bytes[i++] = 0xae; + bytes[i++] = 0xe8; /* LFENCE */ + } + + ret = emit_indirect(op, reg, bytes + i); + if (ret < 0) + return ret; + i += ret; + + for (; i < insn->length;) + bytes[i++] = BYTES_NOP1; + + return i; +} + +/* + * Generated by 'objtool --retpoline'. + */ +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op1, op2; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op1 = insn.opcode.bytes[0]; + op2 = insn.opcode.bytes[1]; + + switch (op1) { + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + break; + + case 0x0f: /* escape */ + if (op2 >= 0x80 && op2 <= 0x8f) + break; + fallthrough; + default: + WARN_ON_ONCE(1); + continue; + } + + DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_retpoline(addr, &insn, bytes); + if (len == insn.length) { + optimize_nops(bytes, len); + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} + +#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */ + +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } + +#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */ + #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) @@ -643,6 +820,12 @@ void __init alternative_instructions(void) apply_paravirt(__parainstructions, __parainstructions_end); /* + * Rewrite the retpolines, must be done before alternatives since + * those can rewrite the retpoline thunks. + */ + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + + /* * Then patch alternatives, such that those paravirt calls that are in * alternatives can be overwritten by their immediate fragments. */ diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index f4da9bb69a88..e696e22d0531 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -15,9 +15,15 @@ struct cluster_mask { struct cpumask mask; }; -static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +/* + * __x2apic_send_IPI_mask() possibly needs to read + * x86_cpu_to_logical_apicid for all online cpus in a sequential way. + * Using per cpu variable would cost one cache line per cpu. + */ +static u32 *x86_cpu_to_logical_apicid __read_mostly; + static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); -static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks); +static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks); static struct cluster_mask *cluster_hotplug_mask; static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) @@ -27,7 +33,7 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) static void x2apic_send_IPI(int cpu, int vector) { - u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); + u32 dest = x86_cpu_to_logical_apicid[cpu]; /* x2apic MSRs are special and need a special fence: */ weak_wrmsr_fence(); @@ -58,7 +64,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) dest = 0; for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask) - dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu); + dest |= x86_cpu_to_logical_apicid[clustercpu]; if (!dest) continue; @@ -94,7 +100,7 @@ static void x2apic_send_IPI_all(int vector) static u32 x2apic_calc_apicid(unsigned int cpu) { - return per_cpu(x86_cpu_to_logical_apicid, cpu); + return x86_cpu_to_logical_apicid[cpu]; } static void init_x2apic_ldr(void) @@ -103,7 +109,7 @@ static void init_x2apic_ldr(void) u32 cluster, apicid = apic_read(APIC_LDR); unsigned int cpu; - this_cpu_write(x86_cpu_to_logical_apicid, apicid); + x86_cpu_to_logical_apicid[smp_processor_id()] = apicid; if (cmsk) goto update; @@ -166,12 +172,21 @@ static int x2apic_dead_cpu(unsigned int dead_cpu) static int x2apic_cluster_probe(void) { + u32 slots; + if (!x2apic_mode) return 0; + slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids); + x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL); + if (!x86_cpu_to_logical_apicid) + return 0; + if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", x2apic_prepare_cpu, x2apic_dead_cpu) < 0) { pr_err("Failed to register X2APIC_PREPARE\n"); + kfree(x86_cpu_to_logical_apicid); + x86_cpu_to_logical_apicid = NULL; return 0; } init_x2apic_ldr(); diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c new file mode 100644 index 000000000000..03bb2f343ddb --- /dev/null +++ b/arch/x86/kernel/cc_platform.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Confidential Computing Platform Capability checks + * + * Copyright (C) 2021 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#include <linux/export.h> +#include <linux/cc_platform.h> +#include <linux/mem_encrypt.h> + +#include <asm/processor.h> + +static bool __maybe_unused intel_cc_platform_has(enum cc_attr attr) +{ +#ifdef CONFIG_INTEL_TDX_GUEST + return false; +#else + return false; +#endif +} + +/* + * SME and SEV are very similar but they are not the same, so there are + * times that the kernel will need to distinguish between SME and SEV. The + * cc_platform_has() function is used for this. When a distinction isn't + * needed, the CC_ATTR_MEM_ENCRYPT attribute can be used. + * + * The trampoline code is a good example for this requirement. Before + * paging is activated, SME will access all memory as decrypted, but SEV + * will access all memory as encrypted. So, when APs are being brought + * up under SME the trampoline area cannot be encrypted, whereas under SEV + * the trampoline area must be encrypted. + */ +static bool amd_cc_platform_has(enum cc_attr attr) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return sme_me_mask; + + case CC_ATTR_HOST_MEM_ENCRYPT: + return sme_me_mask && !(sev_status & MSR_AMD64_SEV_ENABLED); + + case CC_ATTR_GUEST_MEM_ENCRYPT: + return sev_status & MSR_AMD64_SEV_ENABLED; + + case CC_ATTR_GUEST_STATE_ENCRYPT: + return sev_status & MSR_AMD64_SEV_ES_ENABLED; + + default: + return false; + } +#else + return false; +#endif +} + + +bool cc_platform_has(enum cc_attr attr) +{ + if (sme_me_mask) + return amd_cc_platform_has(attr); + + return false; +} +EXPORT_SYMBOL_GPL(cc_platform_has); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 637b499450d1..9661e3e802be 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o +obj-$(CONFIG_CPU_SUP_VORTEX_32) += vortex.o obj-$(CONFIG_X86_MCE) += mce/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2131af9f2fa2..4edb6f0f628c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -989,6 +989,8 @@ static void init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_IRPERF) && !cpu_has_amd_erratum(c, amd_erratum_1054)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); + + check_null_seg_clears_base(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ecfca3bbcd96..1c1f218a701d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -22,7 +22,7 @@ #include <asm/bugs.h> #include <asm/processor.h> #include <asm/processor-flags.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> @@ -758,11 +758,11 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) case SPECTRE_V2_USER_CMD_FORCE: mode = SPECTRE_V2_USER_STRICT; break; + case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_PRCTL: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: mode = SPECTRE_V2_USER_PRCTL; break; - case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_SECCOMP: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: if (IS_ENABLED(CONFIG_SECCOMP)) @@ -882,13 +882,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON && - boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); - return SPECTRE_V2_CMD_AUTO; - } - spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; @@ -1169,7 +1162,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) return mode; switch (cmd) { - case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_SECCOMP: /* * Choose prctl+seccomp as the default mode if seccomp is @@ -1183,6 +1175,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) case SPEC_STORE_BYPASS_CMD_ON: mode = SPEC_STORE_BYPASS_DISABLE; break; + case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_PRCTL: mode = SPEC_STORE_BYPASS_PRCTL; break; diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index b5e36bd0425b..fe98a1465be6 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -846,6 +846,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) l2 = new_l2; #ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l2_id; + per_cpu(cpu_l2c_id, cpu) = l2_id; #endif } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0f8885949e8c..0083464de5e3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -42,7 +42,7 @@ #include <asm/setup.h> #include <asm/apic.h> #include <asm/desc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/mtrr.h> #include <asm/hwcap2.h> #include <linux/numa.h> @@ -85,6 +85,9 @@ u16 get_llc_id(unsigned int cpu) } EXPORT_SYMBOL_GPL(get_llc_id); +/* L2 cache ID of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID; + /* correctly size the local cpu masks */ void __init setup_cpu_local_masks(void) { @@ -326,6 +329,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_SMAP cr4_set_bits(X86_CR4_SMAP); #else + clear_cpu_cap(c, X86_FEATURE_SMAP); cr4_clear_bits(X86_CR4_SMAP); #endif } @@ -1044,6 +1048,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION), VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(VORTEX, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), @@ -1395,9 +1401,8 @@ void __init early_cpu_init(void) early_identify_cpu(&boot_cpu_data); } -static void detect_null_seg_behavior(struct cpuinfo_x86 *c) +static bool detect_null_seg_behavior(void) { -#ifdef CONFIG_X86_64 /* * Empirically, writing zero to a segment selector on AMD does * not clear the base, whereas writing zero to a segment @@ -1418,10 +1423,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c) wrmsrl(MSR_FS_BASE, 1); loadsegment(fs, 0); rdmsrl(MSR_FS_BASE, tmp); - if (tmp != 0) - set_cpu_bug(c, X86_BUG_NULL_SEG); wrmsrl(MSR_FS_BASE, old_base); -#endif + return tmp == 0; +} + +void check_null_seg_clears_base(struct cpuinfo_x86 *c) +{ + /* BUG_NULL_SEG is only relevant with 64bit userspace */ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */ + if (c->extended_cpuid_level >= 0x80000021 && + cpuid_eax(0x80000021) & BIT(6)) + return; + + /* + * CPUID bit above wasn't set. If this kernel is still running + * as a HV guest, then the HV has decided not to advertize + * that CPUID bit for whatever reason. For example, one + * member of the migration pool might be vulnerable. Which + * means, the bug is present: set the BUG flag and return. + */ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) { + set_cpu_bug(c, X86_BUG_NULL_SEG); + return; + } + + /* + * Zen2 CPUs also have this behaviour, but no CPUID bit. + * 0x18 is the respective family for Hygon. + */ + if ((c->x86 == 0x17 || c->x86 == 0x18) && + detect_null_seg_behavior()) + return; + + /* All the remaining ones are affected */ + set_cpu_bug(c, X86_BUG_NULL_SEG); } static void generic_identify(struct cpuinfo_x86 *c) @@ -1457,8 +1495,6 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - detect_null_seg_behavior(c); - /* * ESPFIX is a strange bug. All real CPUs have it. Paravirt * systems that run Linux at CPL > 0 may or may not have the diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 95521302630d..ee6f23f7587d 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -75,6 +75,7 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c); extern int detect_extended_topology(struct cpuinfo_x86 *c); extern int detect_ht_early(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); +extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index defda61f372d..cb2fdd130aae 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -75,6 +75,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, + { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, {} }; diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 6d50136f7ab9..3fcdda4c1e11 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -335,6 +335,8 @@ static void init_hygon(struct cpuinfo_x86 *c) /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */ if (!cpu_has(c, X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + + check_null_seg_clears_base(c); } static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 08831acc1d03..27cacf504663 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -526,7 +526,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, /* Fall back to method we used for older processors: */ switch (block) { case 0: - addr = msr_ops.misc(bank); + addr = mca_msr_reg(bank, MCA_MISC); break; case 1: offset = ((low & MASK_BLKPTR_LO) >> 21); @@ -978,8 +978,8 @@ static void log_error_deferred(unsigned int bank) { bool defrd; - defrd = _log_error_bank(bank, msr_ops.status(bank), - msr_ops.addr(bank), 0); + defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), + mca_msr_reg(bank, MCA_ADDR), 0); if (!mce_flags.smca) return; @@ -1009,7 +1009,7 @@ static void amd_deferred_error_interrupt(void) static void log_error_thresholding(unsigned int bank, u64 misc) { - _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); + _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc); } static void log_and_reset_block(struct threshold_block *block) @@ -1397,7 +1397,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, } } - err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank)); + err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 8cb7816d03b4..6ed365337a3b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -121,8 +121,6 @@ mce_banks_t mce_banks_ce_disabled; static struct work_struct mce_work; static struct irq_work mce_irq_work; -static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); - /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. @@ -176,53 +174,27 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -static inline u32 ctl_reg(int bank) -{ - return MSR_IA32_MCx_CTL(bank); -} - -static inline u32 status_reg(int bank) -{ - return MSR_IA32_MCx_STATUS(bank); -} - -static inline u32 addr_reg(int bank) -{ - return MSR_IA32_MCx_ADDR(bank); -} - -static inline u32 misc_reg(int bank) +u32 mca_msr_reg(int bank, enum mca_msr reg) { - return MSR_IA32_MCx_MISC(bank); -} - -static inline u32 smca_ctl_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_CTL(bank); -} - -static inline u32 smca_status_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_STATUS(bank); -} + if (mce_flags.smca) { + switch (reg) { + case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank); + case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank); + case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank); + case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank); + } + } -static inline u32 smca_addr_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_ADDR(bank); -} + switch (reg) { + case MCA_CTL: return MSR_IA32_MCx_CTL(bank); + case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank); + case MCA_MISC: return MSR_IA32_MCx_MISC(bank); + case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank); + } -static inline u32 smca_misc_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_MISC(bank); + return 0; } -struct mca_msr_regs msr_ops = { - .ctl = ctl_reg, - .status = status_reg, - .addr = addr_reg, - .misc = misc_reg -}; - static void __print_mce(struct mce *m) { pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", @@ -362,24 +334,27 @@ static int msr_to_offset(u32 msr) if (msr == mca_cfg.rip_msr) return offsetof(struct mce, ip); - if (msr == msr_ops.status(bank)) + if (msr == mca_msr_reg(bank, MCA_STATUS)) return offsetof(struct mce, status); - if (msr == msr_ops.addr(bank)) + if (msr == mca_msr_reg(bank, MCA_ADDR)) return offsetof(struct mce, addr); - if (msr == msr_ops.misc(bank)) + if (msr == mca_msr_reg(bank, MCA_MISC)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); return -1; } -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { - pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + if (wrmsr) { + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + } else { + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + } show_stack_regs(regs); @@ -387,8 +362,6 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, while (true) cpu_relax(); - - return true; } /* MSR access wrappers used for error injection */ @@ -420,32 +393,13 @@ static noinstr u64 mce_rdmsrl(u32 msr) */ asm volatile("1: rdmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); } -__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, - regs->ip, (void *)regs->ip); - - show_stack_regs(regs); - - panic("MCA architectural violation!\n"); - - while (true) - cpu_relax(); - - return true; -} - static noinstr void mce_wrmsrl(u32 msr, u64 v) { u32 low, high; @@ -470,7 +424,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) /* See comment in mce_rdmsrl() */ asm volatile("1: wrmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) : : "c" (msr), "a"(low), "d" (high) : "memory"); } @@ -685,10 +639,10 @@ static struct notifier_block mce_default_nb = { static void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(msr_ops.misc(i)); + m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(msr_ops.addr(i)); + m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR)); /* * Mask the reported address by the reported granularity. @@ -758,7 +712,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.bank = i; barrier(); - m.status = mce_rdmsrl(msr_ops.status(i)); + m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); /* If this entry is not valid, ignore it */ if (!(m.status & MCI_STATUS_VAL)) @@ -826,7 +780,7 @@ clear_it: /* * Clear state for this bank. */ - mce_wrmsrl(msr_ops.status(i), 0); + mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } /* @@ -841,6 +795,34 @@ clear_it: EXPORT_SYMBOL_GPL(machine_check_poll); /* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. + */ +static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + +/* * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ @@ -851,13 +833,13 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - m->status = mce_rdmsrl(msr_ops.status(i)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; __set_bit(i, validp); - if (quirk_no_way_out) - quirk_no_way_out(i, m, regs); + if (mce_flags.snb_ifu_quirk) + quirk_sandybridge_ifu(i, m, regs); m->bank = i; if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { @@ -1144,7 +1126,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (test_bit(i, toclear)) - mce_wrmsrl(msr_ops.status(i), 0); + mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1203,7 +1185,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin m->addr = 0; m->bank = i; - m->status = mce_rdmsrl(msr_ops.status(i)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1253,6 +1235,9 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin static void kill_me_now(struct callback_head *ch) { + struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me); + + p->mce_count = 0; force_sig(SIGBUS); } @@ -1262,13 +1247,14 @@ static void kill_me_maybe(struct callback_head *cb) int flags = MF_ACTION_REQUIRED; int ret; + p->mce_count = 0; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); if (!p->mce_ripv) flags |= MF_MUST_KILL; ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); - if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + if (!ret) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; @@ -1282,29 +1268,57 @@ static void kill_me_maybe(struct callback_head *cb) if (ret == -EHWPOISON) return; - if (p->mce_vaddr != (void __user *)-1l) { - force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); - } else { - pr_err("Memory error not recovered"); - kill_me_now(cb); - } + pr_err("Memory error not recovered"); + kill_me_now(cb); } -static void queue_task_work(struct mce *m, int kill_current_task) +static void kill_me_never(struct callback_head *cb) { - current->mce_addr = m->addr; - current->mce_kflags = m->kflags; - current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); - current->mce_whole_page = whole_page(m); + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); - if (kill_current_task) - current->mce_kill_me.func = kill_me_now; - else - current->mce_kill_me.func = kill_me_maybe; + p->mce_count = 0; + pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) + set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); +} + +static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) +{ + int count = ++current->mce_count; + + /* First call, save all the details */ + if (count == 1) { + current->mce_addr = m->addr; + current->mce_kflags = m->kflags; + current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(m); + current->mce_kill_me.func = func; + } + + /* Ten is likely overkill. Don't expect more than two faults before task_work() */ + if (count > 10) + mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + + /* Second or later call, make sure page address matches the one from first call */ + if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) + mce_panic("Consecutive machine checks to different user pages", m, msg); + + /* Do not call task_work_add() more than once */ + if (count > 1) + return; task_work_add(current, ¤t->mce_kill_me, TWA_RESUME); } +/* Handle unconfigured int18 (should never happen) */ +static noinstr void unexpected_machine_check(struct pt_regs *regs) +{ + instrumentation_begin(); + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", + smp_processor_id()); + instrumentation_end(); +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1325,36 +1339,43 @@ static void queue_task_work(struct mce *m, int kill_current_task) */ noinstr void do_machine_check(struct pt_regs *regs) { + int worst = 0, order, no_way_out, kill_current_task, lmce; DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); struct mca_config *cfg = &mca_cfg; struct mce m, *final; char *msg = NULL; - int worst = 0; + + if (unlikely(mce_flags.p5)) + return pentium_machine_check(regs); + else if (unlikely(mce_flags.winchip)) + return winchip_machine_check(regs); + else if (unlikely(!mca_cfg.initialized)) + return unexpected_machine_check(regs); /* * Establish sequential order between the CPUs entering the machine * check handler. */ - int order = -1; + order = -1; /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. */ - int no_way_out = 0; + no_way_out = 0; /* * If kill_current_task is not set, there might be a way to recover from this * error. */ - int kill_current_task = 0; + kill_current_task = 0; /* * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES * on Intel. */ - int lmce = 1; + lmce = 1; this_cpu_inc(mce_exception_count); @@ -1438,7 +1459,10 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, kill_current_task); + if (kill_current_task) + queue_task_work(&m, msg, kill_me_now); + else + queue_task_work(&m, msg, kill_me_maybe); } else { /* @@ -1456,7 +1480,7 @@ noinstr void do_machine_check(struct pt_regs *regs) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, kill_current_task); + queue_task_work(&m, msg, kill_me_never); } out: mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); @@ -1666,8 +1690,8 @@ static void __mcheck_cpu_init_clear_banks(void) if (!b->init) continue; - wrmsrl(msr_ops.ctl(i), b->ctl); - wrmsrl(msr_ops.status(i), 0); + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1693,39 +1717,11 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; - rdmsrl(msr_ops.ctl(i), msrval); + rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } -/* - * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and - * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM - * Vol 3B Table 15-20). But this confuses both the code that determines - * whether the machine check occurred in kernel or user mode, and also - * the severity assessment code. Pretend that EIPV was set, and take the - * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. - */ -static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) -{ - if (bank != 0) - return; - if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) - return; - if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| - MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| - MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| - MCACOD)) != - (MCI_STATUS_UC|MCI_STATUS_EN| - MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| - MCI_STATUS_AR|MCACOD_INSTR)) - return; - - m->mcgstatus |= MCG_STATUS_EIPV; - m->ip = regs->ip; - m->cs = regs->cs; -} - /* Add per CPU specific workarounds here */ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { @@ -1799,7 +1795,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) cfg->bootlog = 0; if (c->x86 == 6 && c->x86_model == 45) - quirk_no_way_out = quirk_sandybridge_ifu; + mce_flags.snb_ifu_quirk = 1; } if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { @@ -1829,9 +1825,11 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); + mce_flags.p5 = 1; return 1; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); + mce_flags.winchip = 1; return 1; default: return 0; @@ -1850,13 +1848,6 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); mce_flags.amd_threshold = 1; - - if (mce_flags.smca) { - msr_ops.ctl = smca_ctl_reg; - msr_ops.status = smca_status_reg; - msr_ops.addr = smca_addr_reg; - msr_ops.misc = smca_misc_reg; - } } } @@ -1986,18 +1977,6 @@ bool filter_mce(struct mce *m) return false; } -/* Handle unconfigured int18 (should never happen) */ -static noinstr void unexpected_machine_check(struct pt_regs *regs) -{ - instrumentation_begin(); - pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", - smp_processor_id()); - instrumentation_end(); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; - static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { irqentry_state_t irq_state; @@ -2008,31 +1987,22 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) * Only required when from kernel mode. See * mce_check_crashing_cpu() for details. */ - if (machine_check_vector == do_machine_check && - mce_check_crashing_cpu()) + if (mca_cfg.initialized && mce_check_crashing_cpu()) return; irq_state = irqentry_nmi_enter(regs); - /* - * The call targets are marked noinstr, but objtool can't figure - * that out because it's an indirect call. Annotate it. - */ - instrumentation_begin(); - machine_check_vector(regs); + do_machine_check(regs); - instrumentation_end(); irqentry_nmi_exit(regs, irq_state); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) { irqentry_enter_from_user_mode(regs); - instrumentation_begin(); - machine_check_vector(regs); + do_machine_check(regs); - instrumentation_end(); irqentry_exit_to_user_mode(regs); } @@ -2099,7 +2069,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) return; } - machine_check_vector = do_machine_check; + mca_cfg.initialized = 1; __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); @@ -2207,7 +2177,6 @@ int __init mcheck_init(void) mce_register_decode_chain(&early_nb); mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); - mcheck_vendor_init_severity(); INIT_WORK(&mce_work, mce_gen_pool_process); init_irq_work(&mce_irq_work, mce_irq_work_cb); @@ -2232,7 +2201,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(msr_ops.ctl(i), 0); + wrmsrl(mca_msr_reg(i, MCA_CTL), 0); } return; } @@ -2584,7 +2553,7 @@ static void mce_reenable_cpu(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(msr_ops.ctl(i), b->ctl); + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 88dcc79cfb07..acd61c41846c 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -8,9 +8,6 @@ #include <linux/device.h> #include <asm/mce.h> -/* Pointer to the installed machine check handler for this CPU setup. */ -extern void (*machine_check_vector)(struct pt_regs *); - enum severity_level { MCE_NO_SEVERITY, MCE_DEFERRED_SEVERITY, @@ -38,8 +35,7 @@ int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); -extern int (*mce_severity)(struct mce *a, struct pt_regs *regs, - int tolerant, char **msg, bool is_excp); +int mce_severity(struct mce *a, struct pt_regs *regs, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; @@ -61,7 +57,7 @@ static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } static inline void intel_clear_lmce(void) { } -static inline bool intel_filter_mce(struct mce *m) { return false; }; +static inline bool intel_filter_mce(struct mce *m) { return false; } #endif void mce_timer_kick(unsigned long interval); @@ -117,23 +113,25 @@ static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif struct mca_config { - bool dont_log_ce; - bool cmci_disabled; - bool ignore_ce; - bool print_all; - __u64 lmce_disabled : 1, disabled : 1, ser : 1, recovery : 1, bios_cmci_threshold : 1, - __reserved : 59; + /* Proper #MC exception handler is set */ + initialized : 1, + __reserved : 58; + + bool dont_log_ce; + bool cmci_disabled; + bool ignore_ce; + bool print_all; - s8 bootlog; int tolerant; int monarch_timeout; int panic_timeout; u32 rip_msr; + s8 bootlog; }; extern struct mca_config mca_cfg; @@ -163,19 +161,28 @@ struct mce_vendor_flags { /* AMD-style error thresholding banks present. */ amd_threshold : 1, - __reserved_0 : 60; + /* Pentium, family 5-style MCA */ + p5 : 1, + + /* Centaur Winchip C6-style MCA */ + winchip : 1, + + /* SandyBridge IFU quirk */ + snb_ifu_quirk : 1, + + __reserved_0 : 57; }; extern struct mce_vendor_flags mce_flags; -struct mca_msr_regs { - u32 (*ctl) (int bank); - u32 (*status) (int bank); - u32 (*addr) (int bank); - u32 (*misc) (int bank); +enum mca_msr { + MCA_CTL, + MCA_STATUS, + MCA_ADDR, + MCA_MISC, }; -extern struct mca_msr_regs msr_ops; +u32 mca_msr_reg(int bank, enum mca_msr reg); /* Decide whether to add MCE record to MCE event pool or filter it out. */ extern bool filter_mce(struct mce *m); @@ -183,17 +190,21 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); #else -static inline bool amd_filter_mce(struct mce *m) { return false; }; +static inline bool amd_filter_mce(struct mce *m) { return false; } #endif -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr); - -__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr); +#ifdef CONFIG_X86_ANCIENT_MCE +void intel_p5_mcheck_init(struct cpuinfo_x86 *c); +void winchip_mcheck_init(struct cpuinfo_x86 *c); +noinstr void pentium_machine_check(struct pt_regs *regs); +noinstr void winchip_machine_check(struct pt_regs *regs); +static inline void enable_p5_mce(void) { mce_p5_enabled = 1; } +#else +static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void enable_p5_mce(void) {} +static inline void pentium_machine_check(struct pt_regs *regs) {} +static inline void winchip_machine_check(struct pt_regs *regs) {} +#endif #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index 19e90cae8e97..2272ad53fc33 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -21,7 +21,7 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ -static noinstr void pentium_machine_check(struct pt_regs *regs) +noinstr void pentium_machine_check(struct pt_regs *regs) { u32 loaddr, hi, lotype; @@ -54,10 +54,6 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_MCE)) return; - machine_check_vector = pentium_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - /* Read registers before enabling: */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); rdmsr(MSR_IA32_P5_MC_TYPE, l, h); diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 17e631443116..bb019a594a2c 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -265,25 +265,25 @@ static bool is_copy_from_user(struct pt_regs *regs) */ static int error_context(struct mce *m, struct pt_regs *regs) { - enum handler_type t; - if ((m->cs & 3) == 3) return IN_USER; if (!mc_recoverable(m->mcgstatus)) return IN_KERNEL; - t = ex_get_fault_handler_type(m->ip); - if (t == EX_HANDLER_FAULT) { - m->kflags |= MCE_IN_KERNEL_RECOV; - return IN_KERNEL_RECOV; - } - if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) { - m->kflags |= MCE_IN_KERNEL_RECOV; + switch (ex_get_fixup_type(m->ip)) { + case EX_TYPE_UACCESS: + case EX_TYPE_COPY: + if (!regs || !is_copy_from_user(regs)) + return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; + fallthrough; + case EX_TYPE_FAULT_MCE_SAFE: + case EX_TYPE_DEFAULT_MCE_SAFE: + m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; + default: + return IN_KERNEL; } - - return IN_KERNEL; } static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) @@ -407,15 +407,14 @@ static int mce_severity_intel(struct mce *m, struct pt_regs *regs, } } -/* Default to mce_severity_intel */ -int (*mce_severity)(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, bool is_excp) = - mce_severity_intel; - -void __init mcheck_vendor_init_severity(void) +int mce_severity(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, + bool is_excp) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - mce_severity = mce_severity_amd; + return mce_severity_amd(m, regs, tolerant, msg, is_excp); + else + return mce_severity_intel(m, regs, tolerant, msg, is_excp); } #ifdef CONFIG_DEBUG_FS diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index 9c9f0abd2d7f..6c99f2941909 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -17,7 +17,7 @@ #include "internal.h" /* Machine check handler for WinChip C6: */ -static noinstr void winchip_machine_check(struct pt_regs *regs) +noinstr void winchip_machine_check(struct pt_regs *regs) { instrumentation_begin(); pr_emerg("CPU0: Machine Check Exception.\n"); @@ -30,10 +30,6 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) { u32 lo, hi; - machine_check_vector = winchip_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - rdmsr(MSR_IDT_FCR1, lo, hi); lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ lo &= ~(1<<4); /* Enable MCE */ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 4b8813bafffd..bb1c3f5f60c8 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -527,12 +527,14 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) rdt_domain_reconfigure_cdp(r); if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - kfree(d); + kfree(hw_dom); return; } if (r->mon_capable && domain_setup_mon_state(r, d)) { - kfree(d); + kfree(hw_dom->ctrl_val); + kfree(hw_dom->mbps_val); + kfree(hw_dom); return; } diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 64511c4a5200..6a77a14eee38 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -111,10 +111,8 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page) { - int ret; - /* * Take a previously guest-owned EPC page and return it to the * general EPC page pool. @@ -124,7 +122,12 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) * case that a guest properly EREMOVE'd this page, a superfluous * EREMOVE is harmless. */ - ret = __eremove(sgx_get_epc_virt_addr(epc_page)); + return __eremove(sgx_get_epc_virt_addr(epc_page)); +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret = sgx_vepc_remove_page(epc_page); if (ret) { /* * Only SGX_CHILD_PRESENT is expected, which is because of @@ -144,10 +147,44 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) } sgx_free_epc_page(epc_page); - return 0; } +static long sgx_vepc_remove_all(struct sgx_vepc *vepc) +{ + struct sgx_epc_page *entry; + unsigned long index; + long failures = 0; + + xa_for_each(&vepc->page_array, index, entry) { + int ret = sgx_vepc_remove_page(entry); + if (ret) { + if (ret == SGX_CHILD_PRESENT) { + /* The page is a SECS, userspace will retry. */ + failures++; + } else { + /* + * Report errors due to #GP or SGX_ENCLAVE_ACT; do not + * WARN, as userspace can induce said failures by + * calling the ioctl concurrently on multiple vEPCs or + * while one or more CPUs is running the enclave. Only + * a #PF on EREMOVE indicates a kernel/hardware issue. + */ + WARN_ON_ONCE(encls_faulted(ret) && + ENCLS_TRAPNR(ret) != X86_TRAP_GP); + return -EBUSY; + } + } + cond_resched(); + } + + /* + * Return the number of SECS pages that failed to be removed, so + * userspace knows that it has to retry. + */ + return failures; +} + static int sgx_vepc_release(struct inode *inode, struct file *file) { struct sgx_vepc *vepc = file->private_data; @@ -233,9 +270,27 @@ static int sgx_vepc_open(struct inode *inode, struct file *file) return 0; } +static long sgx_vepc_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct sgx_vepc *vepc = file->private_data; + + switch (cmd) { + case SGX_IOC_VEPC_REMOVE_ALL: + if (arg) + return -EINVAL; + return sgx_vepc_remove_all(vepc); + + default: + return -ENOTTY; + } +} + static const struct file_operations sgx_vepc_fops = { .owner = THIS_MODULE, .open = sgx_vepc_open, + .unlocked_ioctl = sgx_vepc_ioctl, + .compat_ioctl = sgx_vepc_ioctl, .release = sgx_vepc_release, .mmap = sgx_vepc_mmap, }; diff --git a/arch/x86/kernel/cpu/vortex.c b/arch/x86/kernel/cpu/vortex.c new file mode 100644 index 000000000000..e2685470ba94 --- /dev/null +++ b/arch/x86/kernel/cpu/vortex.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <asm/processor.h> +#include "cpu.h" + +/* + * No special init required for Vortex processors. + */ + +static const struct cpu_dev vortex_cpu_dev = { + .c_vendor = "Vortex", + .c_ident = { "Vortex86 SoC" }, + .legacy_models = { + { + .family = 5, + .model_names = { + [2] = "Vortex86DX", + [8] = "Vortex86MX", + }, + }, + { + .family = 6, + .model_names = { + /* + * Both the Vortex86EX and the Vortex86EX2 + * have the same family and model id. + * + * However, the -EX2 supports the product name + * CPUID call, so this name will only be used + * for the -EX, which does not. + */ + [0] = "Vortex86EX", + }, + }, + }, + .c_x86_vendor = X86_VENDOR_VORTEX, +}; + +cpu_dev_register(vortex_cpu_dev); diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 045e82e8945b..a7f617a3981d 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,6 +10,7 @@ #include <linux/crash_dump.h> #include <linux/uaccess.h> #include <linux/io.h> +#include <linux/cc_platform.h> static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, unsigned long offset, int userbuf, @@ -73,5 +74,6 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, sev_active()); + return read_from_oldmem(buf, count, ppos, 0, + cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)); } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 6a4cb71c2498..78b2311b3b8b 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -31,11 +31,6 @@ char __initdata cmd_line[COMMAND_LINE_SIZE]; int __initdata of_ioapic; -void __init early_init_dt_scan_chosen_arch(unsigned long node) -{ - BUG(); -} - void __init early_init_dt_add_memory_arch(u64 base, u64 size) { BUG(); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 38837dad46e6..391a4e2b8604 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -714,12 +714,6 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3e20, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3ec4, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x8a12, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index 2954fab15e51..794e70151203 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -2,7 +2,7 @@ /* * x86 FPU bug checks: */ -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> /* * Boot time CPU/FPU FDIV bug detection code: diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h new file mode 100644 index 000000000000..958accf2ccf0 --- /dev/null +++ b/arch/x86/kernel/fpu/context.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_CONTEXT_H +#define __X86_KERNEL_FPU_CONTEXT_H + +#include <asm/fpu/xstate.h> +#include <asm/trace/fpu.h> + +/* Functions related to FPU context tracking */ + +/* + * The in-register FPU state for an FPU context on a CPU is assumed to be + * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx + * matches the FPU. + * + * If the FPU register state is valid, the kernel can skip restoring the + * FPU state from memory. + * + * Any code that clobbers the FPU registers or updates the in-memory + * FPU state for a task MUST let the rest of the kernel know that the + * FPU registers are no longer valid for this task. + * + * Either one of these invalidation functions is enough. Invalidate + * a resource you control: CPU if using the CPU for something else + * (with preemption disabled), FPU for the current task, or a task that + * is prevented from running by the current task. + */ +static inline void __cpu_invalidate_fpregs_state(void) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); +} + +static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) +{ + fpu->last_cpu = -1; +} + +static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + +static inline void fpregs_deactivate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); + trace_x86_fpu_regs_deactivated(fpu); +} + +static inline void fpregs_activate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, fpu); + trace_x86_fpu_regs_activated(fpu); +} + +/* Internal helper for switch_fpu_return() and signal frame setup */ +static inline void fpregs_restore_userregs(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + int cpu = smp_processor_id(); + + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + return; + + if (!fpregs_state_valid(fpu, cpu)) { + /* + * This restores _all_ xstate which has not been + * established yet. + * + * If PKRU is enabled, then the PKRU value is already + * correct because it was either set in switch_to() or in + * flush_thread(). So it is excluded because it might be + * not up to date in current->thread.fpu.xsave state. + * + * XFD state is handled in restore_fpregs_from_fpstate(). + */ + restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); + + fpregs_activate(fpu); + fpu->last_cpu = cpu; + } + clear_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7ada7bd03a32..8ea306b1bf8e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -6,8 +6,9 @@ * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 */ -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/fpu/regset.h> +#include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> @@ -15,15 +16,30 @@ #include <linux/hardirq.h> #include <linux/pkeys.h> +#include <linux/vmalloc.h> + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" #define CREATE_TRACE_POINTS #include <asm/trace/fpu.h> +#ifdef CONFIG_X86_64 +DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +DEFINE_PER_CPU(u64, xfd_state); +#endif + +/* The FPU state configuration data for kernel and user space */ +struct fpu_state_config fpu_kernel_cfg __ro_after_init; +struct fpu_state_config fpu_user_cfg __ro_after_init; + /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ -union fpregs_state init_fpstate __ro_after_init; +struct fpstate init_fpstate __ro_after_init; /* * Track whether the kernel is using the FPU state @@ -83,7 +99,7 @@ bool irq_fpu_usable(void) EXPORT_SYMBOL(irq_fpu_usable); /* - * Save the FPU register state in fpu->state. The register state is + * Save the FPU register state in fpu->fpstate->regs. The register state is * preserved. * * Must be called with fpregs_lock() held. @@ -99,19 +115,19 @@ EXPORT_SYMBOL(irq_fpu_usable); void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { - os_xsave(&fpu->state.xsave); + os_xsave(fpu->fpstate); /* * AVX512 state is tracked here because its use is * known to slow the max clock speed of the core. */ - if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) + if (fpu->fpstate->regs.xsave.header.xfeatures & XFEATURE_MASK_AVX512) fpu->avx512_timestamp = jiffies; return; } if (likely(use_fxsr())) { - fxsave(&fpu->state.fxsave); + fxsave(&fpu->fpstate->regs.fxsave); return; } @@ -119,12 +135,11 @@ void save_fpregs_to_fpstate(struct fpu *fpu) * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to reload them from the memory state. */ - asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); - frstor(&fpu->state.fsave); + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); + frstor(&fpu->fpstate->regs.fsave); } -EXPORT_SYMBOL(save_fpregs_to_fpstate); -void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) +void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore @@ -141,15 +156,181 @@ void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) } if (use_xsave()) { - os_xrstor(&fpstate->xsave, mask); + /* + * Dynamically enabled features are enabled in XCR0, but + * usage requires also that the corresponding bits in XFD + * are cleared. If the bits are set then using a related + * instruction will raise #NM. This allows to do the + * allocation of the larger FPU buffer lazy from #NM or if + * the task has no permission to kill it which would happen + * via #UD if the feature is disabled in XCR0. + * + * XFD state is following the same life time rules as + * XSTATE and to restore state correctly XFD has to be + * updated before XRSTORS otherwise the component would + * stay in or go into init state even if the bits are set + * in fpstate::regs::xsave::xfeatures. + */ + xfd_update_state(fpstate); + + /* + * Restoring state always needs to modify all features + * which are in @mask even if the current task cannot use + * extended features. + * + * So fpstate->xfeatures cannot be used here, because then + * a feature for which the task has no permission but was + * used by the previous task would not go into init state. + */ + mask = fpu_kernel_cfg.max_features & mask; + + os_xrstor(fpstate, mask); } else { if (use_fxsr()) - fxrstor(&fpstate->fxsave); + fxrstor(&fpstate->regs.fxsave); else - frstor(&fpstate->fsave); + frstor(&fpstate->regs.fsave); } } -EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate); + +void fpu_reset_from_exception_fixup(void) +{ + restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); +} + +#if IS_ENABLED(CONFIG_KVM) +static void __fpstate_reset(struct fpstate *fpstate); + +bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fpstate; + unsigned int size; + + size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + fpstate = vzalloc(size); + if (!fpstate) + return false; + + __fpstate_reset(fpstate); + fpstate_init_user(fpstate); + fpstate->is_valloc = true; + fpstate->is_guest = true; + + gfpu->fpstate = fpstate; + return true; +} +EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); + +void fpu_free_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fps = gfpu->fpstate; + + if (!fps) + return; + + if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) + return; + + gfpu->fpstate = NULL; + vfree(fps); +} +EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); + +int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) +{ + struct fpstate *guest_fps = guest_fpu->fpstate; + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *cur_fps = fpu->fpstate; + + fpregs_lock(); + if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); + + /* Swap fpstate */ + if (enter_guest) { + fpu->__task_fpstate = cur_fps; + fpu->fpstate = guest_fps; + guest_fps->in_use = true; + } else { + guest_fps->in_use = false; + fpu->fpstate = fpu->__task_fpstate; + fpu->__task_fpstate = NULL; + } + + cur_fps = fpu->fpstate; + + if (!cur_fps->is_confidential) { + /* Includes XFD update */ + restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + } else { + /* + * XSTATE is restored by firmware from encrypted + * memory. Make sure XFD state is correct while + * running with guest fpstate + */ + xfd_update_state(cur_fps); + } + + fpregs_mark_activate(); + fpregs_unlock(); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); + +void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u32 pkru) +{ + struct fpstate *kstate = gfpu->fpstate; + union fpregs_state *ustate = buf; + struct membuf mb = { .p = buf, .left = size }; + + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { + __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE); + } else { + memcpy(&ustate->fxsave, &kstate->regs.fxsave, + sizeof(ustate->fxsave)); + /* Make it restorable on a XSAVE enabled host */ + ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; + } +} +EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); + +int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, + u64 xcr0, u32 *vpkru) +{ + struct fpstate *kstate = gfpu->fpstate; + const union fpregs_state *ustate = buf; + struct pkru_state *xpkru; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { + if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) + return -EINVAL; + if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; + memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); + return 0; + } + + if (ustate->xsave.header.xfeatures & ~xcr0) + return -EINVAL; + + ret = copy_uabi_from_kernel_to_xstate(kstate, ustate); + if (ret) + return ret; + + /* Retrieve PKRU if not in init state */ + if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) { + xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU); + *vpkru = xpkru->pkru; + } + + /* Ensure that XCOMP_BV is set up for XSAVES */ + xstate_init_xcomp_bv(&kstate->regs.xsave, kstate->xfeatures); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); +#endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { @@ -203,52 +384,88 @@ void fpu_sync_fpstate(struct fpu *fpu) fpregs_unlock(); } -static inline void fpstate_init_xstate(struct xregs_state *xsave) +static inline unsigned int init_fpstate_copy_size(void) { - /* - * XRSTORS requires these bits set in xcomp_bv, or it will - * trigger #GP: - */ - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; + if (!use_xsave()) + return fpu_kernel_cfg.default_size; + + /* XSAVE(S) just needs the legacy and the xstate header part */ + return sizeof(init_fpstate.regs.xsave); } -static inline void fpstate_init_fxstate(struct fxregs_state *fx) +static inline void fpstate_init_fxstate(struct fpstate *fpstate) { - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; + fpstate->regs.fxsave.cwd = 0x37f; + fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; } /* * Legacy x87 fpstate state init: */ -static inline void fpstate_init_fstate(struct fregs_state *fp) +static inline void fpstate_init_fstate(struct fpstate *fpstate) { - fp->cwd = 0xffff037fu; - fp->swd = 0xffff0000u; - fp->twd = 0xffffffffu; - fp->fos = 0xffff0000u; + fpstate->regs.fsave.cwd = 0xffff037fu; + fpstate->regs.fsave.swd = 0xffff0000u; + fpstate->regs.fsave.twd = 0xffffffffu; + fpstate->regs.fsave.fos = 0xffff0000u; } -void fpstate_init(union fpregs_state *state) +/* + * Used in two places: + * 1) Early boot to setup init_fpstate for non XSAVE systems + * 2) fpu_init_fpstate_user() which is invoked from KVM + */ +void fpstate_init_user(struct fpstate *fpstate) { - if (!static_cpu_has(X86_FEATURE_FPU)) { - fpstate_init_soft(&state->soft); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + fpstate_init_soft(&fpstate->regs.soft); return; } - memset(state, 0, fpu_kernel_xstate_size); + xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); - if (static_cpu_has(X86_FEATURE_XSAVES)) - fpstate_init_xstate(&state->xsave); - if (static_cpu_has(X86_FEATURE_FXSR)) - fpstate_init_fxstate(&state->fxsave); + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + fpstate_init_fxstate(fpstate); else - fpstate_init_fstate(&state->fsave); + fpstate_init_fstate(fpstate); +} + +static void __fpstate_reset(struct fpstate *fpstate) +{ + /* Initialize sizes and feature masks */ + fpstate->size = fpu_kernel_cfg.default_size; + fpstate->user_size = fpu_user_cfg.default_size; + fpstate->xfeatures = fpu_kernel_cfg.default_features; + fpstate->user_xfeatures = fpu_user_cfg.default_features; + fpstate->xfd = init_fpstate.xfd; +} + +void fpstate_reset(struct fpu *fpu) +{ + /* Set the fpstate pointer to the default fpstate */ + fpu->fpstate = &fpu->__fpstate; + __fpstate_reset(fpu->fpstate); + + /* Initialize the permission related info in fpu */ + fpu->perm.__state_perm = fpu_kernel_cfg.default_features; + fpu->perm.__state_size = fpu_kernel_cfg.default_size; + fpu->perm.__user_state_size = fpu_user_cfg.default_size; +} + +static inline void fpu_inherit_perms(struct fpu *dst_fpu) +{ + if (fpu_state_size_dynamic()) { + struct fpu *src_fpu = ¤t->group_leader->thread.fpu; + + spin_lock_irq(¤t->sighand->siglock); + /* Fork also inherits the permissions of the parent */ + dst_fpu->perm = src_fpu->perm; + spin_unlock_irq(¤t->sighand->siglock); + } } -EXPORT_SYMBOL_GPL(fpstate_init); /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -256,30 +473,51 @@ int fpu_clone(struct task_struct *dst) /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; + fpstate_reset(dst_fpu); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; /* - * Don't let 'init optimized' areas of the XSAVE area - * leak into the child task: + * Enforce reload for user space tasks and prevent kernel threads + * from trying to save the FPU registers on context switch. + */ + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); + + /* + * No FPU state inheritance for kernel threads and IO + * worker threads. + */ + if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) { + /* Clear out the minimal state */ + memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, + init_fpstate_copy_size()); + return 0; + } + + /* + * If a new feature is added, ensure all dynamic features are + * caller-saved from here! */ - memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); + BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); /* - * If the FPU registers are not owned by current just memcpy() the - * state. Otherwise save the FPU registers directly into the - * child's FPU context, without any memory-to-memory copying. + * Save the default portion of the current FPU state into the + * clone. Assume all dynamic features to be defined as caller- + * saved, which enables skipping both the expansion of fpstate + * and the copying of any dynamic state. + * + * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because + * copying is not valid when current uses non-default states. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); - - else - save_fpregs_to_fpstate(dst_fpu); + fpregs_restore_userregs(); + save_fpregs_to_fpstate(dst_fpu); + if (!(clone_flags & CLONE_THREAD)) + fpu_inherit_perms(dst_fpu); fpregs_unlock(); - set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); - trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -287,6 +525,16 @@ int fpu_clone(struct task_struct *dst) } /* + * Whitelist the FPU register state embedded into task_struct for hardened + * usercopy. + */ +void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) +{ + *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); + *size = fpu_kernel_cfg.default_size; +} + +/* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. @@ -319,28 +567,19 @@ void fpu__drop(struct fpu *fpu) static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) - os_xrstor(&init_fpstate.xsave, features_mask); + os_xrstor(&init_fpstate, features_mask); else if (use_fxsr()) - fxrstor(&init_fpstate.fxsave); + fxrstor(&init_fpstate.regs.fxsave); else - frstor(&init_fpstate.fsave); + frstor(&init_fpstate.regs.fsave); pkru_write_default(); } -static inline unsigned int init_fpstate_copy_size(void) -{ - if (!use_xsave()) - return fpu_kernel_xstate_size; - - /* XSAVE(S) just needs the legacy and the xstate header part */ - return sizeof(init_fpstate.xsave); -} - /* * Reset current->fpu memory state to the init values. */ -static void fpu_reset_fpstate(void) +static void fpu_reset_fpregs(void) { struct fpu *fpu = ¤t->thread.fpu; @@ -359,7 +598,7 @@ static void fpu_reset_fpstate(void) * user space as PKRU is eagerly written in switch_to() and * flush_thread(). */ - memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size()); + memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } @@ -375,7 +614,7 @@ void fpu__clear_user_states(struct fpu *fpu) fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { - fpu_reset_fpstate(); + fpu_reset_fpregs(); fpregs_unlock(); return; } @@ -385,12 +624,11 @@ void fpu__clear_user_states(struct fpu *fpu) * corresponding registers. */ if (xfeatures_mask_supervisor() && - !fpregs_state_valid(fpu, smp_processor_id())) { - os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); - } + !fpregs_state_valid(fpu, smp_processor_id())) + os_xrstor_supervisor(fpu->fpstate); /* Reset user states in registers. */ - restore_fpregs_from_init_fpstate(xfeatures_mask_restore_user()); + restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); /* * Now all FPU registers have their desired values. Inform the FPU @@ -405,7 +643,8 @@ void fpu__clear_user_states(struct fpu *fpu) void fpu_flush_thread(void) { - fpu_reset_fpstate(); + fpstate_reset(¤t->thread.fpu); + fpu_reset_fpregs(); } /* * Load FPU context before returning to userspace. @@ -445,7 +684,6 @@ void fpregs_mark_activate(void) fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } -EXPORT_SYMBOL_GPL(fpregs_mark_activate); /* * x87 math exception handling: @@ -468,11 +706,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { - cwd = fpu->state.fxsave.cwd; - swd = fpu->state.fxsave.swd; + cwd = fpu->fpstate->regs.fxsave.cwd; + swd = fpu->fpstate->regs.fxsave.swd; } else { - cwd = (unsigned short)fpu->state.fsave.cwd; - swd = (unsigned short)fpu->state.fsave.swd; + cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; + swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; @@ -486,7 +724,7 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) - mxcsr = fpu->state.fxsave.mxcsr; + mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 64e29927cc32..621f4b6cac4a 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -2,7 +2,7 @@ /* * x86 FPU boot time init code: */ -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/tlbflush.h> #include <asm/setup.h> @@ -10,6 +10,10 @@ #include <linux/sched/task.h> #include <linux/init.h> +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + /* * Initialize the registers found in all CPUs, CR0 and CR4: */ @@ -34,7 +38,7 @@ static void fpu__init_cpu_generic(void) /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU)) - fpstate_init_soft(¤t->thread.fpu.state.soft); + fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); else #endif asm volatile ("fninit"); @@ -121,23 +125,14 @@ static void __init fpu__init_system_mxcsr(void) static void __init fpu__init_system_generic(void) { /* - * Set up the legacy init FPU context. (xstate init might overwrite this - * with a more modern format, if the CPU supports it.) + * Set up the legacy init FPU context. Will be updated when the + * CPU supports XSAVE[S]. */ - fpstate_init(&init_fpstate); + fpstate_init_user(&init_fpstate); fpu__init_system_mxcsr(); } -/* - * Size of the FPU context state. All tasks in the system use the - * same context size, regardless of what portion they use. - * This is inherent to the XSAVE architecture which puts all state - * components into a single, continuous memory block: - */ -unsigned int fpu_kernel_xstate_size __ro_after_init; -EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); - /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) @@ -162,13 +157,13 @@ static void __init fpu__init_task_struct_size(void) * Subtract off the static size of the register state. * It potentially has a bunch of padding. */ - task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); + task_size -= sizeof(current->thread.fpu.__fpstate.regs); /* * Add back the dynamically-calculated register state * size. */ - task_size += fpu_kernel_xstate_size; + task_size += fpu_kernel_cfg.default_size; /* * We dynamically size 'struct fpu', so we require that @@ -177,7 +172,7 @@ static void __init fpu__init_task_struct_size(void) * you hit a compile error here, check the structure to * see if something got added to the end. */ - CHECK_MEMBER_AT_END_OF(struct fpu, state); + CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); CHECK_MEMBER_AT_END_OF(struct task_struct, thread); @@ -192,37 +187,34 @@ static void __init fpu__init_task_struct_size(void) */ static void __init fpu__init_system_xstate_size_legacy(void) { - static int on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; + unsigned int size; /* - * Note that xstate sizes might be overwritten later during - * fpu__init_system_xstate(). + * Note that the size configuration might be overwritten later + * during fpu__init_system_xstate(). */ - - if (!boot_cpu_has(X86_FEATURE_FPU)) { - fpu_kernel_xstate_size = sizeof(struct swregs_state); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + size = sizeof(struct swregs_state); + } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) { + size = sizeof(struct fxregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE; } else { - if (boot_cpu_has(X86_FEATURE_FXSR)) - fpu_kernel_xstate_size = - sizeof(struct fxregs_state); - else - fpu_kernel_xstate_size = - sizeof(struct fregs_state); + size = sizeof(struct fregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FP; } - fpu_user_xstate_size = fpu_kernel_xstate_size; + fpu_kernel_cfg.max_size = size; + fpu_kernel_cfg.default_size = size; + fpu_user_cfg.max_size = size; + fpu_user_cfg.default_size = size; + fpstate_reset(¤t->thread.fpu); } -/* Legacy code to initialize eager fpu mode. */ -static void __init fpu__init_system_ctx_switch(void) +static void __init fpu__init_init_fpstate(void) { - static bool on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; + /* Bring init_fpstate size and features up to date */ + init_fpstate.size = fpu_kernel_cfg.max_size; + init_fpstate.xfeatures = fpu_kernel_cfg.max_features; } /* @@ -231,6 +223,7 @@ static void __init fpu__init_system_ctx_switch(void) */ void __init fpu__init_system(struct cpuinfo_x86 *c) { + fpstate_reset(¤t->thread.fpu); fpu__init_system_early_generic(c); /* @@ -241,8 +234,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); - fpu__init_system_xstate(); + fpu__init_system_xstate(fpu_kernel_cfg.max_size); fpu__init_task_struct_size(); - - fpu__init_system_ctx_switch(); + fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h new file mode 100644 index 000000000000..dbdb31f55fc7 --- /dev/null +++ b/arch/x86/kernel/fpu/internal.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_INTERNAL_H +#define __X86_KERNEL_FPU_INTERNAL_H + +extern struct fpstate init_fpstate; + +/* CPU feature check wrappers */ +static __always_inline __pure bool use_xsave(void) +{ + return cpu_feature_enabled(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return cpu_feature_enabled(X86_FEATURE_FXSR); +} + +#ifdef CONFIG_X86_DEBUG_FPU +# define WARN_ON_FPU(x) WARN_ON_ONCE(x) +#else +# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +#endif + +/* Used in init.c */ +extern void fpstate_init_user(struct fpstate *fpstate); +extern void fpstate_reset(struct fpu *fpu); + +#endif diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h new file mode 100644 index 000000000000..17c26b164c63 --- /dev/null +++ b/arch/x86/kernel/fpu/legacy.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_LEGACY_H +#define __X86_KERNEL_FPU_LEGACY_H + +#include <asm/fpu/types.h> + +extern unsigned int mxcsr_feature_mask; + +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + +/* + * Returns 0 on success or the trap number when the operation raises an + * exception. + */ +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + \ + might_fault(); \ + \ + asm volatile(ASM_STAC "\n" \ + "1: " #insn "\n" \ + "2: " ASM_CLAC "\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn_err(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn(insn, output, input...) \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ + : output : input) + +static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); +} + +static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + +} + +static inline void fxrstor(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_safe(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void frstor(struct fregs_state *fx) +{ + kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_safe(struct fregs_state *fx) +{ + return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + +#endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 66ed317ebc0d..437d7c930c0b 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -5,10 +5,14 @@ #include <linux/sched/task_stack.h> #include <linux/vmalloc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> -#include <asm/fpu/xstate.h> + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -74,8 +78,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, sync_fpstate(fpu); if (!use_xsave()) { - return membuf_write(&to, &fpu->state.fxsave, - sizeof(fpu->state.fxsave)); + return membuf_write(&to, &fpu->fpstate->regs.fxsave, + sizeof(fpu->fpstate->regs.fxsave)); } copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX); @@ -110,15 +114,15 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, fpu_force_restore(fpu); /* Copy the state */ - memcpy(&fpu->state.fxsave, &newstate, sizeof(newstate)); + memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate)); /* Clear xmm8..15 */ - BUILD_BUG_ON(sizeof(fpu->state.fxsave.xmm_space) != 16 * 16); - memset(&fpu->state.fxsave.xmm_space[8], 0, 8 * 16); + BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16); + memset(&fpu->fpstate->regs.fxsave.xmm_space[8], 0, 8 * 16); /* Mark FP and SSE as in use when XSAVE is enabled */ if (use_xsave()) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; return 0; } @@ -149,7 +153,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, /* * A whole standard-format XSAVE buffer is needed: */ - if (pos != 0 || count != fpu_user_xstate_size) + if (pos != 0 || count != fpu_user_cfg.max_size) return -EFAULT; if (!kbuf) { @@ -164,7 +168,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } fpu_force_restore(fpu); - ret = copy_uabi_from_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); + ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf); out: vfree(tmpbuf); @@ -283,7 +287,7 @@ static void __convert_from_fxsr(struct user_i387_ia32_struct *env, void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - __convert_from_fxsr(env, tsk, &tsk->thread.fpu.state.fxsave); + __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave); } void convert_to_fxsr(struct fxregs_state *fxsave, @@ -326,7 +330,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, return fpregs_soft_get(target, regset, to); if (!cpu_feature_enabled(X86_FEATURE_FXSR)) { - return membuf_write(&to, &fpu->state.fsave, + return membuf_write(&to, &fpu->fpstate->regs.fsave, sizeof(struct fregs_state)); } @@ -337,7 +341,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP); fx = &fxsave; } else { - fx = &fpu->state.fxsave; + fx = &fpu->fpstate->regs.fxsave; } __convert_from_fxsr(&env, target, fx); @@ -366,16 +370,16 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, fpu_force_restore(fpu); if (cpu_feature_enabled(X86_FEATURE_FXSR)) - convert_to_fxsr(&fpu->state.fxsave, &env); + convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env); else - memcpy(&fpu->state.fsave, &env, sizeof(env)); + memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env)); /* * Update the header bit in the xsave header, indicating the * presence of FP. */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP; return 0; } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 445c57c9c539..cc977da6e128 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -7,23 +7,25 @@ #include <linux/cpu.h> #include <linux/pagemap.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> #include <asm/fpu/xstate.h> #include <asm/sigframe.h> +#include <asm/trapnr.h> #include <asm/trace/fpu.h> -static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; -static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ -static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, - struct _fpx_sw_bytes *fx_sw) +static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, + struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct fxregs_state) + sizeof(struct xstate_header); @@ -31,12 +33,12 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, unsigned int magic2; if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) - return -EFAULT; + return false; /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > fpu_user_xstate_size || + fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || fx_sw->xstate_size > fx_sw->extended_size) goto setfx; @@ -47,10 +49,10 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, * in the memory layout. */ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) - return -EFAULT; + return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) - return 0; + return true; setfx: trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); @@ -58,22 +60,22 @@ setfx: fx_sw->magic1 = 0; fx_sw->xstate_size = sizeof(struct fxregs_state); fx_sw->xfeatures = XFEATURE_MASK_FPSSE; - return 0; + return true; } /* * Signal frame handlers. */ -static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - fxsave(&tsk->thread.fpu.state.fxsave); + fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -81,33 +83,54 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) if (__copy_to_user(buf, &env, sizeof(env)) || __put_user(xsave->i387.swd, &fp->status) || __put_user(X86_FXSR_MAGIC, &fp->magic)) - return -1; + return false; } else { struct fregs_state __user *fp = buf; u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) - return -1; + return false; } - return 0; + return true; +} + +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed to by the fpstate pointer in the sigcontext. + * This is saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame, + struct fpstate *fpstate) +{ + sw_bytes->magic1 = FP_XSTATE_MAGIC1; + sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE; + sw_bytes->xfeatures = fpstate->user_xfeatures; + sw_bytes->xstate_size = fpstate->user_size; + + if (ia32_frame) + sw_bytes->extended_size += sizeof(struct fregs_state); } -static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, + struct fpstate *fpstate) { struct xregs_state __user *x = buf; - struct _fpx_sw_bytes *sw_bytes; + struct _fpx_sw_bytes sw_bytes; u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ - sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; - err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + save_sw_bytes(&sw_bytes, ia32_frame, fpstate); + err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes)); if (!use_xsave()) - return err; + return !err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *)(buf + fpu_user_xstate_size)); + (__u32 __user *)(buf + fpstate->user_size)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -130,23 +153,17 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); - return err; + return !err; } static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) { - int err; - if (use_xsave()) - err = xsave_to_user_sigframe(buf); - else if (use_fxsr()) - err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf); + return xsave_to_user_sigframe(buf); + if (use_fxsr()) + return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else - err = fnsave_to_user_sigframe((struct fregs_state __user *) buf); - - if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) - err = -EFAULT; - return err; + return fnsave_to_user_sigframe((struct fregs_state __user *) buf); } /* @@ -159,10 +176,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * Try to save it directly to the user frame with disabled page fault handler. - * If this fails then do the slow path where the FPU state is first saved to - * task's fpu->state and then copy it to the user frame pointed to by the - * aligned pointer 'buf_fx'. + * Save it directly to the user frame with disabled page fault handler. If + * that faults, try to clear the frame which handles the page fault. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -170,10 +185,11 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ -int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct task_struct *tsk = current; - int ia32_fxstate = (buf != buf_fx); + struct fpstate *fpstate = tsk->thread.fpu.fpstate; + bool ia32_fxstate = (buf != buf_fx); int ret; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || @@ -181,13 +197,25 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (!static_cpu_has(X86_FEATURE_FPU)) { struct user_i387_ia32_struct fp; + fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, .left = sizeof(fp)}); - return copy_to_user(buf, &fp, sizeof(fp)) ? -EFAULT : 0; + return !copy_to_user(buf, &fp, sizeof(fp)); } if (!access_ok(buf, size)) - return -EACCES; + return false; + + if (use_xsave()) { + struct xregs_state __user *xbuf = buf_fx; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + if (__clear_user(&xbuf->header, sizeof(xbuf->header))) + return false; + } retry: /* * Load the FPU registers if they are not valid for the current task. @@ -205,26 +233,26 @@ retry: fpregs_unlock(); if (ret) { - if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size)) + if (!__clear_user(buf_fx, fpstate->user_size)) goto retry; - return -EFAULT; + return false; } /* Save the fsave header for the 32-bit frames. */ - if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) - return -1; + if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) + return false; - if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) - return -1; + if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate)) + return false; - return 0; + return true; } -static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only) +static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, + u64 xrestore, bool fx_only) { if (use_xsave()) { - u64 init_bv = xfeatures_mask_uabi() & ~xrestore; + u64 init_bv = ufeatures & ~xrestore; int ret; if (likely(!fx_only)) @@ -233,7 +261,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, ret = fxrstor_from_user_sigframe(buf); if (!ret && unlikely(init_bv)) - os_xrstor(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate, init_bv); return ret; } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); @@ -246,16 +274,19 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, * Attempt to restore the FPU registers directly from user memory. * Pagefaults are handled and any errors returned are fatal. */ -static int restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only, unsigned int size) +static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, + bool fx_only, unsigned int size) { struct fpu *fpu = ¤t->thread.fpu; int ret; retry: fpregs_lock(); + /* Ensure that XFD is up to date */ + xfd_update_state(fpu->fpstate); pagefault_disable(); - ret = __restore_fpregs_from_user(buf, xrestore, fx_only); + ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures, + xrestore, fx_only); pagefault_enable(); if (unlikely(ret)) { @@ -275,13 +306,12 @@ retry: fpregs_unlock(); /* Try to handle #PF, but anything else is fatal. */ - if (ret != -EFAULT) - return -EINVAL; + if (ret != X86_TRAP_PF) + return false; - ret = fault_in_pages_readable(buf, size); - if (!ret) + if (!fault_in_pages_readable(buf, size)) goto retry; - return ret; + return false; } /* @@ -294,45 +324,40 @@ retry: * been restored from a user buffer directly. */ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) - os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); + os_xrstor_supervisor(fpu->fpstate); fpregs_mark_activate(); fpregs_unlock(); - return 0; + return true; } -static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, - bool ia32_fxstate) +static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, + bool ia32_fxstate) { - int state_size = fpu_kernel_xstate_size; struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; + bool success, fx_only = false; + union fpregs_state *fpregs; + unsigned int state_size; u64 user_xfeatures = 0; - bool fx_only = false; - int ret; if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; - ret = check_xstate_in_sigframe(buf_fx, &fx_sw_user); - if (unlikely(ret)) - return ret; + if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user)) + return false; fx_only = !fx_sw_user.magic1; state_size = fx_sw_user.xstate_size; user_xfeatures = fx_sw_user.xfeatures; } else { user_xfeatures = XFEATURE_MASK_FPSSE; + state_size = fpu->fpstate->user_size; } if (likely(!ia32_fxstate)) { - /* - * Attempt to restore the FPU registers directly from user - * memory. For that to succeed, the user access cannot cause page - * faults. If it does, fall back to the slow path below, going - * through the kernel buffer with the enabled pagefault handler. - */ + /* Restore the FPU registers directly from user memory. */ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, state_size); } @@ -342,9 +367,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * to be ignored for histerical raisins. The legacy state is folded * in once the larger state has been copied. */ - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - return ret; + if (__copy_from_user(&env, buf, sizeof(env))) + return false; /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is @@ -363,33 +387,38 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * the right place in memory. It's ia32 mode. Shrug. */ if (xfeatures_mask_supervisor()) - os_xsave(&fpu->state.xsave); + os_xsave(fpu->fpstate); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); __cpu_invalidate_fpregs_state(); fpregs_unlock(); + fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { - ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); - if (ret) - return ret; + if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx)) + return false; } else { - if (__copy_from_user(&fpu->state.fxsave, buf_fx, - sizeof(fpu->state.fxsave))) - return -EFAULT; - - /* Reject invalid MXCSR values. */ - if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) - return -EINVAL; + if (__copy_from_user(&fpregs->fxsave, buf_fx, + sizeof(fpregs->fxsave))) + return false; + + if (IS_ENABLED(CONFIG_X86_64)) { + /* Reject invalid MXCSR values. */ + if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask) + return false; + } else { + /* Mask invalid bits out for historical reasons (broken hardware). */ + fpregs->fxsave.mxcsr &= mxcsr_feature_mask; + } /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ if (use_xsave()) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; } /* Fold the legacy FP storage */ - convert_to_fxsr(&fpu->state.fxsave, &env); + convert_to_fxsr(&fpregs->fxsave, &env); fpregs_lock(); if (use_xsave()) { @@ -404,40 +433,45 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, */ u64 mask = user_xfeatures | xfeatures_mask_supervisor(); - fpu->state.xsave.header.xfeatures &= mask; - ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all); + fpregs->xsave.header.xfeatures &= mask; + success = !os_xrstor_safe(fpu->fpstate, + fpu_kernel_cfg.max_features); } else { - ret = fxrstor_safe(&fpu->state.fxsave); + success = !fxrstor_safe(&fpregs->fxsave); } - if (likely(!ret)) + if (likely(success)) fpregs_mark_activate(); fpregs_unlock(); - return ret; + return success; } -static inline int xstate_sigframe_size(void) + +static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) { - return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : - fpu_user_xstate_size; + unsigned int size = fpstate->user_size; + + return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size; } /* * Restore FPU state from a sigframe: */ -int fpu__restore_sig(void __user *buf, int ia32_frame) +bool fpu__restore_sig(void __user *buf, int ia32_frame) { - unsigned int size = xstate_sigframe_size(); struct fpu *fpu = ¤t->thread.fpu; void __user *buf_fx = buf; bool ia32_fxstate = false; - int ret; + bool success = false; + unsigned int size; if (unlikely(!buf)) { fpu__clear_user_states(fpu); - return 0; + return true; } + size = xstate_sigframe_size(fpu->fpstate); + ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -451,30 +485,28 @@ int fpu__restore_sig(void __user *buf, int ia32_frame) ia32_fxstate = true; } - if (!access_ok(buf, size)) { - ret = -EACCES; + if (!access_ok(buf, size)) goto out; - } if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) { - ret = fpregs_soft_set(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), - NULL, buf); + success = !fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); } else { - ret = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); + success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); } out: - if (unlikely(ret)) + if (unlikely(!success)) fpu__clear_user_states(fpu); - return ret; + return success; } unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { - unsigned long frame_size = xstate_sigframe_size(); + unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { @@ -487,9 +519,12 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, return sp; } -unsigned long fpu__get_fpstate_size(void) +unsigned long __init fpu__get_fpstate_size(void) { - unsigned long ret = xstate_sigframe_size(); + unsigned long ret = fpu_user_cfg.max_size; + + if (use_xsave()) + ret += FP_XSTATE_MAGIC2_SIZE; /* * This space is needed on (most) 32-bit kernels, or when a 32-bit @@ -505,28 +540,3 @@ unsigned long fpu__get_fpstate_size(void) return ret; } -/* - * Prepare the SW reserved portion of the fxsave memory layout, indicating - * the presence of the extended state information in the memory layout - * pointed by the fpstate pointer in the sigcontext. - * This will be saved when ever the FP and extended state context is - * saved on the user stack during the signal handler delivery to the user. - */ -void fpu__init_prepare_fx_sw_frame(void) -{ - int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; - - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask_uabi(); - fx_sw_reserved.xstate_size = fpu_user_xstate_size; - - if (IS_ENABLED(CONFIG_IA32_EMULATION) || - IS_ENABLED(CONFIG_X86_32)) { - int fsave_header_size = sizeof(struct fregs_state); - - fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size = size + fsave_header_size; - } -} - diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c8def1b7f8fb..d28829403ed0 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -4,21 +4,33 @@ * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ +#include <linux/bitops.h> #include <linux/compat.h> #include <linux/cpu.h> #include <linux/mman.h> +#include <linux/nospec.h> #include <linux/pkeys.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> +#include <linux/vmalloc.h> #include <asm/fpu/api.h> -#include <asm/fpu/internal.h> -#include <asm/fpu/signal.h> #include <asm/fpu/regset.h> -#include <asm/fpu/xstate.h> +#include <asm/fpu/signal.h> +#include <asm/fpu/xcr.h> #include <asm/tlbflush.h> -#include <asm/cpufeature.h> +#include <asm/prctl.h> +#include <asm/elf.h> + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + +#define for_each_extended_xfeature(bit, mask) \ + (bit) = FIRST_EXTENDED_XFEATURE; \ + for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) /* * Although we spell it out in here, the Processor Trace @@ -39,29 +51,32 @@ static const char *xfeature_names[] = "Protection Keys User registers", "PASID state", "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "AMX Tile config" , + "AMX Tile data" , + "unknown xstate feature" , }; -static short xsave_cpuid_features[] __initdata = { - X86_FEATURE_FPU, - X86_FEATURE_XMM, - X86_FEATURE_AVX, - X86_FEATURE_MPX, - X86_FEATURE_MPX, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_INTEL_PT, - X86_FEATURE_PKU, - X86_FEATURE_ENQCMD, +static unsigned short xsave_cpuid_features[] __initdata = { + [XFEATURE_FP] = X86_FEATURE_FPU, + [XFEATURE_SSE] = X86_FEATURE_XMM, + [XFEATURE_YMM] = X86_FEATURE_AVX, + [XFEATURE_BNDREGS] = X86_FEATURE_MPX, + [XFEATURE_BNDCSR] = X86_FEATURE_MPX, + [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, + [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, + [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, + [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, + [XFEATURE_PKRU] = X86_FEATURE_PKU, + [XFEATURE_PASID] = X86_FEATURE_ENQCMD, + [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, + [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; -/* - * This represents the full set of bits that should ever be set in a kernel - * XSAVE buffer, both supervisor and user xstates. - */ -u64 xfeatures_mask_all __ro_after_init; -EXPORT_SYMBOL_GPL(xfeatures_mask_all); - static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = @@ -72,20 +87,13 @@ static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init { [ 0 ... XFEATURE_MAX - 1] = -1}; /* - * The XSAVE area of kernel can be in standard or compacted format; - * it is always in standard format for user mode. This is the user - * mode standard format size used for signal and ptrace frames. - */ -unsigned int fpu_user_xstate_size __ro_after_init; - -/* * Return whether the system supports a given xfeature. * * Also return the name of the (most advanced) feature that the caller requested: */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { - u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all; + u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; @@ -135,17 +143,26 @@ static bool xfeature_is_supervisor(int xfeature_nr) */ void fpu__init_cpu_xstate(void) { - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; cr4_set_bits(X86_CR4_OSXSAVE); /* + * Must happen after CR4 setup and before xsetbv() to allow KVM + * lazy passthrough. Write independent of the dynamic state static + * key as that does not work on the boot CPU. This also ensures + * that any stale state is wiped out from XFD. + */ + if (cpu_feature_enabled(X86_FEATURE_XFD)) + wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); + + /* * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. @@ -158,7 +175,7 @@ void fpu__init_cpu_xstate(void) static bool xfeature_enabled(enum xfeature xfeature) { - return xfeatures_mask_all & BIT_ULL(xfeature); + return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } /* @@ -184,10 +201,7 @@ static void __init setup_xstate_features(void) xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; @@ -236,6 +250,8 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); print_xstate_feature(XFEATURE_MASK_PASID); + print_xstate_feature(XFEATURE_MASK_XTILE_CFG); + print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } /* @@ -291,20 +307,15 @@ static void __init setup_xstate_comp_offsets(void) xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, xmm_space); - if (!boot_cpu_has(X86_FEATURE_XSAVES)) { - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) - xstate_comp_offsets[i] = xstate_offsets[i]; - } + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) + xstate_comp_offsets[i] = xstate_offsets[i]; return; } next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (xfeature_is_aligned(i)) next_offset = ALIGN(next_offset, 64); @@ -328,8 +339,8 @@ static void __init setup_supervisor_only_offsets(void) next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i) || !xfeature_is_supervisor(i)) + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + if (!xfeature_is_supervisor(i)) continue; if (xfeature_is_aligned(i)) @@ -347,15 +358,36 @@ static void __init print_xstate_offset_size(void) { int i; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xstate_comp_offsets[i], i, xstate_sizes[i]); } } /* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static __init void os_xrstor_booting(struct xregs_state *xstate) +{ + u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + /* + * We should never fault when copying from a kernel buffer, and the FPU + * state we set at boot time should be valid. + */ + WARN_ON_FPU(err); +} + +/* * All supported features have either init state all zeros or are * handled in setup_init_fpu() individually. This is an explicit * feature list and does not use XFEATURE_MASK*SUPPORTED to catch @@ -372,36 +404,30 @@ static void __init print_xstate_offset_size(void) XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ - XFEATURE_MASK_PASID) + XFEATURE_MASK_PASID | \ + XFEATURE_MASK_XTILE) /* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { - static int on_boot_cpu __initdata = 1; - BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED) != XFEATURES_INIT_FPSTATE_HANDLED); - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; setup_xstate_features(); print_xstate_features(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask_all; + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); /* * Init all the features state with header.xfeatures being 0x0 */ - os_xrstor_booting(&init_fpstate.xsave); + os_xrstor_booting(&init_fpstate.regs.xsave); /* * All components are now in init state. Read the state back so @@ -419,7 +445,7 @@ static void __init setup_init_fpu_buf(void) * state is all zeroes or if not to add the necessary handling * here. */ - fxsave(&init_fpstate.fxsave); + fxsave(&init_fpstate.regs.fxsave); } static int xfeature_uncompacted_offset(int xfeature_nr) @@ -451,10 +477,11 @@ int xfeature_size(int xfeature_nr) } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -static int validate_user_xstate_header(const struct xstate_header *hdr) +static int validate_user_xstate_header(const struct xstate_header *hdr, + struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & ~xfeatures_mask_uabi()) + if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -474,7 +501,7 @@ static int validate_user_xstate_header(const struct xstate_header *hdr) return 0; } -static void __xstate_dump_leaves(void) +static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; @@ -509,12 +536,73 @@ static void __xstate_dump_leaves(void) } \ } while (0) +/** + * check_xtile_data_against_struct - Check tile data state size. + * + * Calculate the state size by multiplying the single tile size which is + * recorded in a C struct, and the number of tiles that the CPU informs. + * Compare the provided size with the calculation. + * + * @size: The tile data state size + * + * Returns: 0 on success, -EINVAL on mismatch. + */ +static int __init check_xtile_data_against_struct(int size) +{ + u32 max_palid, palid, state_size; + u32 eax, ebx, ecx, edx; + u16 max_tile; + + /* + * Check the maximum palette id: + * eax: the highest numbered palette subleaf. + */ + cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); + + /* + * Cross-check each tile size and find the maximum number of + * supported tiles. + */ + for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { + u16 tile_size, max; + + /* + * Check the tile size info: + * eax[31:16]: bytes per title + * ebx[31:16]: the max names (or max number of tiles) + */ + cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); + tile_size = eax >> 16; + max = ebx >> 16; + + if (tile_size != sizeof(struct xtile_data)) { + pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), + sizeof(struct xtile_data), tile_size); + __xstate_dump_leaves(); + return -EINVAL; + } + + if (max > max_tile) + max_tile = max; + } + + state_size = sizeof(struct xtile_data) * max_tile; + if (size != state_size) { + pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), state_size, size); + __xstate_dump_leaves(); + return -EINVAL; + } + return 0; +} + /* * We have a C struct for each 'xstate'. We need to ensure * that our software representation matches what the CPU * tells us about the state's size. */ -static void check_xstate_against_struct(int nr) +static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. @@ -532,6 +620,11 @@ static void check_xstate_against_struct(int nr) XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); + XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); + + /* The tile data size varies between implementations. */ + if (nr == XFEATURE_XTILE_DATA) + check_xtile_data_against_struct(sz); /* * Make *SURE* to add any feature numbers in below if @@ -541,10 +634,39 @@ static void check_xstate_against_struct(int nr) if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) { + ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); + return false; + } + return true; +} + +static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) +{ + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + int i; + + for_each_extended_xfeature(i, xfeatures) { + /* Align from the end of the previous feature */ + if (xfeature_is_aligned(i)) + size = ALIGN(size, 64); + /* + * In compacted format the enabled features are packed, + * i.e. disabled features do not occupy space. + * + * In non-compacted format the offsets are fixed and + * disabled states still occupy space in the memory buffer. + */ + if (!compacted) + size = xfeature_uncompacted_offset(i); + /* + * Add the feature size even for non-compacted format + * to make the end result correct + */ + size += xfeature_size(i); } + return size; } /* @@ -556,44 +678,29 @@ static void check_xstate_against_struct(int nr) * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ -static void do_extra_xstate_size_checks(void) +static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { - int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - - check_xstate_against_struct(i); + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + if (!check_xstate_against_struct(i)) + return false; /* * Supervisor state components can be managed only by * XSAVES. */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) - XSTATE_WARN_ON(xfeature_is_supervisor(i)); - - /* Align from the end of the previous feature */ - if (xfeature_is_aligned(i)) - paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64); - /* - * The offset of a given state in the non-compacted - * format is given to us in a CPUID leaf. We check - * them for being ordered (increasing offsets) in - * setup_xstate_features(). XSAVES uses compacted format. - */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) - paranoid_xstate_size = xfeature_uncompacted_offset(i); - /* - * The compacted-format offset always depends on where - * the previous state ended. - */ - paranoid_xstate_size += xfeature_size(i); + if (!compacted && xfeature_is_supervisor(i)) { + XSTATE_WARN_ON(1); + return false; + } } - XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); + size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); + XSTATE_WARN_ON(size != kernel_size); + return size == kernel_size; } - /* * Get total size of enabled xstates in XCR0 | IA32_XSS. * @@ -644,7 +751,7 @@ static unsigned int __init get_xsaves_size_no_independent(void) return size; } -static unsigned int __init get_xsave_size(void) +static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* @@ -662,44 +769,54 @@ static unsigned int __init get_xsave_size(void) * Will the runtime-enumerated 'xstate_size' fit in the init * task's statically-allocated buffer? */ -static bool is_supported_xstate_size(unsigned int test_xstate_size) +static bool __init is_supported_xstate_size(unsigned int test_xstate_size) { - if (test_xstate_size <= sizeof(union fpregs_state)) + if (test_xstate_size <= sizeof(init_fpstate.regs)) return true; pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", - sizeof(union fpregs_state), test_xstate_size); + sizeof(init_fpstate.regs), test_xstate_size); return false; } static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int possible_xstate_size; - unsigned int xsave_size; + unsigned int user_size, kernel_size, kernel_default_size; + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); - xsave_size = get_xsave_size(); + /* Uncompacted user space size */ + user_size = get_xsave_size_user(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - possible_xstate_size = get_xsaves_size_no_independent(); + /* + * XSAVES kernel size includes supervisor states and + * uses compacted format when available. + * + * XSAVE does not support supervisor states so + * kernel and user size is identical. + */ + if (compacted) + kernel_size = get_xsaves_size_no_independent(); else - possible_xstate_size = xsave_size; + kernel_size = user_size; + + kernel_default_size = + xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); - /* Ensure we have the space to store all enabled: */ - if (!is_supported_xstate_size(possible_xstate_size)) + /* Ensure we have the space to store all default enabled features. */ + if (!is_supported_xstate_size(kernel_default_size)) return -EINVAL; - /* - * The size is OK, we are definitely going to use xsave, - * make it known to the world that we need more space. - */ - fpu_kernel_xstate_size = possible_xstate_size; - do_extra_xstate_size_checks(); + if (!paranoid_xstate_size_valid(kernel_size)) + return -EINVAL; + + fpu_kernel_cfg.max_size = kernel_size; + fpu_user_cfg.max_size = user_size; + + fpu_kernel_cfg.default_size = kernel_default_size; + fpu_user_cfg.default_size = + xstate_calculate_size(fpu_user_cfg.default_features, false); - /* - * User space is always in standard format. - */ - fpu_user_xstate_size = xsave_size; return 0; } @@ -707,28 +824,38 @@ static int __init init_xstate_size(void) * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ -static void fpu__init_disable_system_xstate(void) +static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { - xfeatures_mask_all = 0; + fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + /* Restore the legacy size.*/ + fpu_kernel_cfg.max_size = legacy_size; + fpu_kernel_cfg.default_size = legacy_size; + fpu_user_cfg.max_size = legacy_size; + fpu_user_cfg.default_size = legacy_size; + + /* + * Prevent enabling the static branch which enables writes to the + * XFD MSR. + */ + init_fpstate.xfd = 0; + + fpstate_reset(¤t->thread.fpu); } /* * Enable and initialize the xsave feature. * Called once per system bootup. */ -void __init fpu__init_system_xstate(void) +void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; - static int on_boot_cpu __initdata = 1; u64 xfeatures; int err; int i; - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; @@ -749,22 +876,22 @@ void __init fpu__init_system_xstate(void) * Find user xstates supported by the processor. */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask_all = eax + ((u64)edx << 32); + fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); - xfeatures_mask_all |= ecx + ((u64)edx << 32); + fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); - if ((xfeatures_mask_uabi() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", - xfeatures_mask_all); + fpu_kernel_cfg.max_features); goto out_disable; } @@ -772,15 +899,39 @@ void __init fpu__init_system_xstate(void) * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { - if (!boot_cpu_has(xsave_cpuid_features[i])) - xfeatures_mask_all &= ~BIT_ULL(i); + unsigned short cid = xsave_cpuid_features[i]; + + /* Careful: X86_FEATURE_FPU is 0! */ + if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) + fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } - xfeatures_mask_all &= XFEATURE_MASK_USER_SUPPORTED | + if (!cpu_feature_enabled(X86_FEATURE_XFD)) + fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; + fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; + fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + + /* Clean out dynamic features from default */ + fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; + fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + fpu_user_cfg.default_features = fpu_user_cfg.max_features; + fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + /* Store it for paranoia check at the end */ - xfeatures = xfeatures_mask_all; + xfeatures = fpu_kernel_cfg.max_features; + + /* + * Initialize the default XFD state in initfp_state and enable the + * dynamic sizing mechanism if dynamic states are available. The + * static key cannot be enabled here because this runs before + * jump_label_init(). This is delayed to an initcall. + */ + init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -788,13 +939,16 @@ void __init fpu__init_system_xstate(void) if (err) goto out_disable; + /* Reset the state for the current task */ + fpstate_reset(¤t->thread.fpu); + /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_uabi()); + update_regset_xstate_info(fpu_user_cfg.max_size, + fpu_user_cfg.max_features); - fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp_offsets(); setup_supervisor_only_offsets(); @@ -803,22 +957,22 @@ void __init fpu__init_system_xstate(void) * Paranoia check whether something in the setup modified the * xfeatures mask. */ - if (xfeatures != xfeatures_mask_all) { + if (xfeatures != fpu_kernel_cfg.max_features) { pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", - xfeatures, xfeatures_mask_all); + xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", - xfeatures_mask_all, - fpu_kernel_xstate_size, + fpu_kernel_cfg.max_features, + fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ - fpu__init_disable_system_xstate(); + fpu__init_disable_system_xstate(legacy_size); } /* @@ -830,7 +984,7 @@ void fpu__resume_cpu(void) * Restore XCR0 on xsave capable CPUs: */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * Restore IA32_XSS. The same CPUID bit enumerates support @@ -840,6 +994,9 @@ void fpu__resume_cpu(void) wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } + + if (fpu_state_size_dynamic()) + wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); } /* @@ -886,7 +1043,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) * We should not ever be requesting features that we * have not enabled. */ - WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)), + WARN_ONCE(!(fpu_kernel_cfg.max_features & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to @@ -904,7 +1061,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) return __raw_xsave_addr(xsave, xfeature_nr); } -EXPORT_SYMBOL_GPL(get_xsave_addr); #ifdef CONFIG_ARCH_HAS_PKEYS @@ -961,9 +1117,10 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, } /** - * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor - * @tsk: The task from which to copy the saved xstate + * @fpstate: The fpstate buffer from which to copy + * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming @@ -972,14 +1129,15 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * * It supports partial copy but @to.pos always starts from zero. */ -void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, - enum xstate_copy_mode copy_mode) +void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, + u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; - struct xregs_state *xinit = &init_fpstate.xsave; + struct xregs_state *xinit = &init_fpstate.regs.xsave; + struct xregs_state *xsave = &fpstate->regs.xsave; struct xstate_header header; unsigned int zerofrom; + u64 mask; int i; memset(&header, 0, sizeof(header)); @@ -996,7 +1154,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, break; case XSTATE_COPY_XSAVE: - header.xfeatures &= xfeatures_mask_uabi(); + header.xfeatures &= fpstate->user_xfeatures; break; } @@ -1033,17 +1191,15 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, zerofrom = offsetof(struct xregs_state, extended_state_area); - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - /* - * The ptrace buffer is in non-compacted XSAVE format. - * In non-compacted format disabled features still occupy - * state space, but there is no state to copy from in the - * compacted init_fpstate. The gap tracking will zero this - * later. - */ - if (!(xfeatures_mask_uabi() & BIT_ULL(i))) - continue; + /* + * The ptrace buffer is in non-compacted XSAVE format. In + * non-compacted format disabled features still occupy state space, + * but there is no state to copy from in the compacted + * init_fpstate. The gap tracking will zero these states. + */ + mask = fpstate->user_xfeatures; + for_each_extended_xfeature(i, mask) { /* * If there was a feature or alignment gap, zero the space * in the destination buffer. @@ -1055,10 +1211,9 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the - * thread's XSAVE buffer. Fill this part from the - * per-thread storage. + * XSAVE buffer. Use the provided value. */ - pkru.pkru = tsk->thread.pkru; + pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else { copy_feature(header.xfeatures & BIT_ULL(i), &to, @@ -1078,6 +1233,25 @@ out: membuf_zero(&to, to.left); } +/** + * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @tsk: The task from which to copy the saved xstate + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! + * + * It supports partial copy but @to.pos always starts from zero. + */ +void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode copy_mode) +{ + __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, + tsk->thread.pkru, copy_mode); +} + static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, const void *kbuf, const void __user *ubuf) { @@ -1091,9 +1265,10 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, } -static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, +static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, const void __user *ubuf) { + struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; struct xstate_header hdr; u64 mask; @@ -1103,7 +1278,7 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) return -EFAULT; - if (validate_user_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr, fpstate)) return -EINVAL; /* Validate MXCSR when any of the related features is in use */ @@ -1156,12 +1331,11 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] - * format and copy to the target thread. This is called from - * xstateregs_set(). + * format and copy to the target thread. Used by ptrace and KVM. */ -int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) +int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) { - return copy_uabi_to_xstate(xsave, kbuf, NULL); + return copy_uabi_to_xstate(fpstate, kbuf, NULL); } /* @@ -1169,26 +1343,20 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ -int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, +int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf) { - return copy_uabi_to_xstate(xsave, NULL, ubuf); + return copy_uabi_to_xstate(fpstate, NULL, ubuf); } -static bool validate_xsaves_xrstors(u64 mask) +static bool validate_independent_components(u64 mask) { u64 xchk; if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) return false; - /* - * Validate that this is either a task->fpstate related component - * subset or an independent one. - */ - if (mask & xfeatures_mask_independent()) - xchk = ~xfeatures_mask_independent(); - else - xchk = ~xfeatures_mask_all; + + xchk = ~xfeatures_mask_independent(); if (WARN_ON_ONCE(!mask || mask & xchk)) return false; @@ -1206,14 +1374,13 @@ static bool validate_xsaves_xrstors(u64 mask) * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer * can #GP. * - * The feature mask must either be a subset of the independent features or - * a subset of the task->fpstate related features. + * The feature mask must be a subset of the independent features. */ void xsaves(struct xregs_state *xstate, u64 mask) { int err; - if (!validate_xsaves_xrstors(mask)) + if (!validate_independent_components(mask)) return; XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); @@ -1231,20 +1398,379 @@ void xsaves(struct xregs_state *xstate, u64 mask) * Proper usage is to restore the state which was saved with * xsaves() into @xstate. * - * The feature mask must either be a subset of the independent features or - * a subset of the task->fpstate related features. + * The feature mask must be a subset of the independent features. */ void xrstors(struct xregs_state *xstate, u64 mask) { int err; - if (!validate_xsaves_xrstors(mask)) + if (!validate_independent_components(mask)) return; XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } +#if IS_ENABLED(CONFIG_KVM) +void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +{ + void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + + if (addr) + memset(addr, 0, xstate_sizes[xfeature]); +} +EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); +#endif + +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_X86_DEBUG_FPU +/* + * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask + * can safely operate on the @fpstate buffer. + */ +static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) +{ + u64 xfd = __this_cpu_read(xfd_state); + + if (fpstate->xfd == xfd) + return true; + + /* + * The XFD MSR does not match fpstate->xfd. That's invalid when + * the passed in fpstate is current's fpstate. + */ + if (fpstate->xfd == current->thread.fpu.fpstate->xfd) + return false; + + /* + * XRSTOR(S) from init_fpstate are always correct as it will just + * bring all components into init state and not read from the + * buffer. XSAVE(S) raises #PF after init. + */ + if (fpstate == &init_fpstate) + return rstor; + + /* + * XSAVE(S): clone(), fpu_swap_kvm_fpu() + * XRSTORS(S): fpu_swap_kvm_fpu() + */ + + /* + * No XSAVE/XRSTOR instructions (except XSAVE itself) touch + * the buffer area for XFD-disabled state components. + */ + mask &= ~xfd; + + /* + * Remove features which are valid in fpstate. They + * have space allocated in fpstate. + */ + mask &= ~fpstate->xfeatures; + + /* + * Any remaining state components in 'mask' might be written + * by XSAVE/XRSTOR. Fail validation it found. + */ + return !mask; +} + +void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) +{ + WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); +} +#endif /* CONFIG_X86_DEBUG_FPU */ + +static int __init xfd_update_static_branch(void) +{ + /* + * If init_fpstate.xfd has bits set then dynamic features are + * available and the dynamic sizing must be enabled. + */ + if (init_fpstate.xfd) + static_branch_enable(&__fpu_state_size_dynamic); + return 0; +} +arch_initcall(xfd_update_static_branch) + +void fpstate_free(struct fpu *fpu) +{ + if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) + vfree(fpu->fpstate); +} + +/** + * fpu_install_fpstate - Update the active fpstate in the FPU + * + * @fpu: A struct fpu * pointer + * @newfps: A struct fpstate * pointer + * + * Returns: A null pointer if the last active fpstate is the embedded + * one or the new fpstate is already installed; + * otherwise, a pointer to the old fpstate which has to + * be freed by the caller. + */ +static struct fpstate *fpu_install_fpstate(struct fpu *fpu, + struct fpstate *newfps) +{ + struct fpstate *oldfps = fpu->fpstate; + + if (fpu->fpstate == newfps) + return NULL; + + fpu->fpstate = newfps; + return oldfps != &fpu->__fpstate ? oldfps : NULL; +} + +/** + * fpstate_realloc - Reallocate struct fpstate for the requested new features + * + * @xfeatures: A bitmap of xstate features which extend the enabled features + * of that task + * @ksize: The required size for the kernel buffer + * @usize: The required size for user space buffers + * + * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer + * terminates quickly, vfree()-induced IPIs may be a concern, but tasks + * with large states are likely to live longer. + * + * Returns: 0 on success, -ENOMEM on allocation error. + */ +static int fpstate_realloc(u64 xfeatures, unsigned int ksize, + unsigned int usize) +{ + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *curfps, *newfps = NULL; + unsigned int fpsize; + + curfps = fpu->fpstate; + fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); + + newfps = vzalloc(fpsize); + if (!newfps) + return -ENOMEM; + newfps->size = ksize; + newfps->user_size = usize; + newfps->is_valloc = true; + + fpregs_lock(); + /* + * Ensure that the current state is in the registers before + * swapping fpstate as that might invalidate it due to layout + * changes. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); + + newfps->xfeatures = curfps->xfeatures | xfeatures; + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; + newfps->xfd = curfps->xfd & ~xfeatures; + + curfps = fpu_install_fpstate(fpu, newfps); + + /* Do the final updates within the locked region */ + xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); + xfd_update_state(newfps); + + fpregs_unlock(); + + vfree(curfps); + return 0; +} + +static int validate_sigaltstack(unsigned int usize) +{ + struct task_struct *thread, *leader = current->group_leader; + unsigned long framesize = get_sigframe_size(); + + lockdep_assert_held(¤t->sighand->siglock); + + /* get_sigframe_size() is based on fpu_user_cfg.max_size */ + framesize -= fpu_user_cfg.max_size; + framesize += usize; + for_each_thread(leader, thread) { + if (thread->sas_ss_size && thread->sas_ss_size < framesize) + return -ENOSPC; + } + return 0; +} + +static int __xstate_request_perm(u64 permitted, u64 requested) +{ + /* + * This deliberately does not exclude !XSAVES as we still might + * decide to optionally context switch XCR0 or talk the silicon + * vendors into extending XFD for the pre AMX states, especially + * AVX512. + */ + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + struct fpu *fpu = ¤t->group_leader->thread.fpu; + unsigned int ksize, usize; + u64 mask; + int ret; + + /* Check whether fully enabled */ + if ((permitted & requested) == requested) + return 0; + + /* Calculate the resulting kernel state size */ + mask = permitted | requested; + ksize = xstate_calculate_size(mask, compacted); + + /* Calculate the resulting user state size */ + mask &= XFEATURE_MASK_USER_SUPPORTED; + usize = xstate_calculate_size(mask, false); + + ret = validate_sigaltstack(usize); + if (ret) + return ret; + + /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ + WRITE_ONCE(fpu->perm.__state_perm, requested); + /* Protected by sighand lock */ + fpu->perm.__state_size = ksize; + fpu->perm.__user_state_size = usize; + return ret; +} + +/* + * Permissions array to map facilities with more than one component + */ +static const u64 xstate_prctl_req[XFEATURE_MAX] = { + [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, +}; + +static int xstate_request_perm(unsigned long idx) +{ + u64 permitted, requested; + int ret; + + if (idx >= XFEATURE_MAX) + return -EINVAL; + + /* + * Look up the facility mask which can require more than + * one xstate component. + */ + idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); + requested = xstate_prctl_req[idx]; + if (!requested) + return -EOPNOTSUPP; + + if ((fpu_user_cfg.max_features & requested) != requested) + return -EOPNOTSUPP; + + /* Lockless quick check */ + permitted = xstate_get_host_group_perm(); + if ((permitted & requested) == requested) + return 0; + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + permitted = xstate_get_host_group_perm(); + ret = __xstate_request_perm(permitted, requested); + spin_unlock_irq(¤t->sighand->siglock); + return ret; +} + +int xfd_enable_feature(u64 xfd_err) +{ + u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; + unsigned int ksize, usize; + struct fpu *fpu; + + if (!xfd_event) { + pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); + return 0; + } + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + + /* If not permitted let it die */ + if ((xstate_get_host_group_perm() & xfd_event) != xfd_event) { + spin_unlock_irq(¤t->sighand->siglock); + return -EPERM; + } + + fpu = ¤t->group_leader->thread.fpu; + ksize = fpu->perm.__state_size; + usize = fpu->perm.__user_state_size; + /* + * The feature is permitted. State size is sufficient. Dropping + * the lock is safe here even if more features are added from + * another task, the retrieved buffer sizes are valid for the + * currently requested feature(s). + */ + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Try to allocate a new fpstate. If that fails there is no way + * out. + */ + if (fpstate_realloc(xfd_event, ksize, usize)) + return -EFAULT; + return 0; +} +#else /* CONFIG_X86_64 */ +static inline int xstate_request_perm(unsigned long idx) +{ + return -EPERM; +} +#endif /* !CONFIG_X86_64 */ + +/** + * fpu_xstate_prctl - xstate permission operations + * @tsk: Redundant pointer to current + * @option: A subfunction of arch_prctl() + * @arg2: option argument + * Return: 0 if successful; otherwise, an error code + * + * Option arguments: + * + * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info + * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info + * ARCH_REQ_XCOMP_PERM: Facility number requested + * + * For facilities which require more than one XSTATE component, the request + * must be the highest state component number related to that facility, + * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and + * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). + */ +long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) +{ + u64 __user *uptr = (u64 __user *)arg2; + u64 permitted, supported; + unsigned long idx = arg2; + + if (tsk != current) + return -EPERM; + + switch (option) { + case ARCH_GET_XCOMP_SUPP: + supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; + return put_user(supported, uptr); + + case ARCH_GET_XCOMP_PERM: + /* + * Lockless snapshot as it can also change right after the + * dropping the lock. + */ + permitted = xstate_get_host_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); + + case ARCH_REQ_XCOMP_PERM: + if (!IS_ENABLED(CONFIG_X86_64)) + return -EOPNOTSUPP; + + return xstate_request_perm(idx); + + default: + return -EINVAL; + } +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h new file mode 100644 index 000000000000..e18210dff88c --- /dev/null +++ b/arch/x86/kernel/fpu/xstate.h @@ -0,0 +1,278 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_XSTATE_H +#define __X86_KERNEL_FPU_XSTATE_H + +#include <asm/cpufeature.h> +#include <asm/fpu/xstate.h> + +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(u64, xfd_state); +#endif + +static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; +} + +static inline u64 xstate_get_host_group_perm(void) +{ + /* Pairs with WRITE_ONCE() in xstate_request_perm() */ + return READ_ONCE(current->group_leader->thread.fpu.perm.__state_perm); +} + +enum xstate_copy_mode { + XSTATE_COPY_FP, + XSTATE_COPY_FX, + XSTATE_COPY_XSAVE, +}; + +struct membuf; +extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, + u32 pkru_val, enum xstate_copy_mode copy_mode); +extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode mode); +extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf); +extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf); + + +extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system_xstate(unsigned int legacy_size); + +extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); + +static inline u64 xfeatures_mask_supervisor(void) +{ + return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +static inline u64 xfeatures_mask_independent(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) + return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + + return XFEATURE_MASK_INDEPENDENT; +} + +/* XSAVE/XRSTOR wrapper functions */ + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +/* + * After this @err contains 0 on success or the trap number when the + * operation raises an exception. + */ +#define XSTATE_OP(op, st, lmask, hmask, err) \ + asm volatile("1:" op "\n\t" \ + "xor %[err], %[err]\n" \ + "2:\n\t" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact + * format and supervisor states in addition to modified optimization in + * XSAVEOPT. + * + * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT + * supports modified optimization which is not supported by XSAVE. + * + * We use XSAVE as a fallback. + * + * The 661 label is defined in the ALTERNATIVE* macros as the address of the + * original instruction which gets replaced. We need to use it here as the + * address of the instruction where we might get an exception at. + */ +#define XSTATE_XSAVE(st, lmask, hmask, err) \ + asm volatile(ALTERNATIVE_2(XSAVE, \ + XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVES, X86_FEATURE_XSAVES) \ + "\n" \ + "xor %[err], %[err]\n" \ + "3:\n" \ + ".pushsection .fixup,\"ax\"\n" \ + "4: movl $-2, %[err]\n" \ + "jmp 3b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(661b, 4b) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact + * XSAVE area format. + */ +#define XSTATE_XRESTORE(st, lmask, hmask) \ + asm volatile(ALTERNATIVE(XRSTOR, \ + XRSTORS, X86_FEATURE_XSAVES) \ + "\n" \ + "3:\n" \ + _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ + : \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) +extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); +#else +static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } +#endif + +#ifdef CONFIG_X86_64 +static inline void xfd_update_state(struct fpstate *fpstate) +{ + if (fpu_state_size_dynamic()) { + u64 xfd = fpstate->xfd; + + if (__this_cpu_read(xfd_state) != xfd) { + wrmsrl(MSR_IA32_XFD, xfd); + __this_cpu_write(xfd_state, xfd); + } + } +} +#else +static inline void xfd_update_state(struct fpstate *fpstate) { } +#endif + +/* + * Save processor xstate to xsave area. + * + * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features + * and command line options. The choice is permanent until the next reboot. + */ +static inline void os_xsave(struct fpstate *fpstate) +{ + u64 mask = fpstate->xfeatures; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + WARN_ON_FPU(!alternatives_patched); + xfd_validate_state(fpstate, mask, false); + + XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Restore processor xstate from xsave area. + * + * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. + */ +static inline void os_xrstor(struct fpstate *fpstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + xfd_validate_state(fpstate, mask, true); + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* Restore of supervisor state. Does not require XFD */ +static inline void os_xrstor_supervisor(struct fpstate *fpstate) +{ + u64 mask = xfeatures_mask_supervisor(); + u32 lmask = mask; + u32 hmask = mask >> 32; + + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for backward compatibility for + * old applications which don't understand the compacted format of the + * xsave area. + * + * The caller has to zero buf::header before calling this because XSAVE* + * does not touch the reserved fields in the header. + */ +static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) +{ + /* + * Include the features which are not xsaved/rstored by the kernel + * internally, e.g. PKRU. That's user space ABI and also required + * to allow the signal handler to modify PKRU. + */ + struct fpstate *fpstate = current->thread.fpu.fpstate; + u64 mask = fpstate->user_xfeatures; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + xfd_validate_state(fpstate, mask, false); + + stac(); + XSTATE_OP(XSAVE, buf, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) +{ + struct xregs_state *xstate = ((__force struct xregs_state *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + xfd_validate_state(current->thread.fpu.fpstate, mask, true); + + stac(); + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from kernel space xsave area, return an error code instead of + * an exception. + */ +static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) +{ + struct xregs_state *xstate = &fpstate->regs.xsave; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + /* Ensure that XFD is up to date */ + xfd_update_state(fpstate); + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + return err; +} + + +#endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index de01903c3735..fc5371a7e9d1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -19,7 +19,7 @@ #include <linux/start_kernel.h> #include <linux/io.h> #include <linux/memblock.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <linux/pgtable.h> #include <asm/processor.h> @@ -284,8 +284,13 @@ unsigned long __head __startup_64(unsigned long physaddr, * The bss section will be memset to zero later in the initialization so * there is no need to zero it after changing the memory encryption * attribute. + * + * This is early code, use an open coded check for SME instead of + * using cc_platform_has(). This eliminates worries about removing + * instrumentation or checking boot_cpu_data in the cc_platform_has() + * function. */ - if (mem_encrypt_active()) { + if (sme_get_me_mask()) { vaddr = (unsigned long)__start_bss_decrypted; vaddr_end = (unsigned long)__end_bss_decrypted; for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 42fc41dd0e1f..882213df3713 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -10,6 +10,7 @@ #include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/time.h> +#include <asm/mwait.h> #undef pr_fmt #define pr_fmt(fmt) "hpet: " fmt @@ -916,6 +917,83 @@ static bool __init hpet_counting(void) return false; } +static bool __init mwait_pc10_supported(void) +{ + unsigned int eax, ebx, ecx, mwait_substates; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) + return false; + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + + return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && + (ecx & CPUID5_ECX_INTERRUPT_BREAK) && + (mwait_substates & (0xF << 28)); +} + +/* + * Check whether the system supports PC10. If so force disable HPET as that + * stops counting in PC10. This check is overbroad as it does not take any + * of the following into account: + * + * - ACPI tables + * - Enablement of intel_idle + * - Command line arguments which limit intel_idle C-state support + * + * That's perfectly fine. HPET is a piece of hardware designed by committee + * and the only reasons why it is still in use on modern systems is the + * fact that it is impossible to reliably query TSC and CPU frequency via + * CPUID or firmware. + * + * If HPET is functional it is useful for calibrating TSC, but this can be + * done via PMTIMER as well which seems to be the last remaining timer on + * X86/INTEL platforms that has not been completely wreckaged by feature + * creep. + * + * In theory HPET support should be removed altogether, but there are older + * systems out there which depend on it because TSC and APIC timer are + * dysfunctional in deeper C-states. + * + * It's only 20 years now that hardware people have been asked to provide + * reliable and discoverable facilities which can be used for timekeeping + * and per CPU timer interrupts. + * + * The probability that this problem is going to be solved in the + * forseeable future is close to zero, so the kernel has to be cluttered + * with heuristics to keep up with the ever growing amount of hardware and + * firmware trainwrecks. Hopefully some day hardware people will understand + * that the approach of "This can be fixed in software" is not sustainable. + * Hope dies last... + */ +static bool __init hpet_is_pc10_damaged(void) +{ + unsigned long long pcfg; + + /* Check whether PC10 substates are supported */ + if (!mwait_pc10_supported()) + return false; + + /* Check whether PC10 is enabled in PKG C-state limit */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); + if ((pcfg & 0xF) < 8) + return false; + + if (hpet_force_user) { + pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n"); + return false; + } + + pr_info("HPET dysfunctional in PC10. Force disabled.\n"); + boot_hpet_disable = true; + return true; +} + /** * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ @@ -929,6 +1007,9 @@ int __init hpet_enable(void) if (!is_hpet_capable()) return 0; + if (hpet_is_pc10_damaged()) + return 0; + hpet_set_mapping(); if (!hpet_virt_address) return 0; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 044902d5a3c4..e5dd6da78713 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -132,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { struct irq_stack *irqstk; @@ -148,6 +149,7 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } +#endif void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 8ef35063964b..760e1f293093 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -7,9 +7,11 @@ /* * unsigned long native_save_fl(void) */ +.pushsection .noinstr.text, "ax" SYM_FUNC_START(native_save_fl) pushf pop %_ASM_AX ret SYM_FUNC_END(native_save_fl) +.popsection EXPORT_SYMBOL(native_save_fl) diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 1afbdd1dd777..9ff480e94511 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -198,7 +198,7 @@ void sched_set_itmt_core_prio(int prio, int core_cpu) * of the priority chain and only used when * all other high priority cpus are out of capacity. */ - smt_prio = prio * smp_num_siblings / i; + smt_prio = prio * smp_num_siblings / (i * i); per_cpu(sched_core_priority, cpu) = smt_prio; i++; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b656456c3a94..8863d1941f1b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -27,6 +27,7 @@ #include <linux/nmi.h> #include <linux/swait.h> #include <linux/syscore_ops.h> +#include <linux/cc_platform.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -418,7 +419,7 @@ static void __init sev_map_percpu_data(void) { int cpu; - if (!sev_active()) + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; for_each_possible_cpu(cpu) { diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ad273e5861c1..462dd8e9b03d 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -16,9 +16,9 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/set_memory.h> +#include <linux/cc_platform.h> #include <asm/hypervisor.h> -#include <asm/mem_encrypt.h> #include <asm/x86_init.h> #include <asm/kvmclock.h> @@ -49,18 +49,9 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); static struct pvclock_vsyscall_time_info hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); static struct pvclock_wall_clock wall_clock __bss_decrypted; -static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); static struct pvclock_vsyscall_time_info *hvclock_mem; - -static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) -{ - return &this_cpu_read(hv_clock_per_cpu)->pvti; -} - -static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) -{ - return this_cpu_read(hv_clock_per_cpu); -} +DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu); /* * The wallclock is the time of day when we booted. Since then, some time may @@ -232,7 +223,7 @@ static void __init kvmclock_init_mem(void) * hvclock is shared between the guest and the hypervisor, must * be mapped decrypted. */ - if (sev_active()) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { r = set_memory_decrypted((unsigned long) hvclock_mem, 1UL << order); if (r) { diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 131f30fdcfbd..f5da4a18070a 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -17,6 +17,7 @@ #include <linux/suspend.h> #include <linux/vmalloc.h> #include <linux/efi.h> +#include <linux/cc_platform.h> #include <asm/init.h> #include <asm/tlbflush.h> @@ -166,7 +167,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) } pte = pte_offset_kernel(pmd, vaddr); - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) prot = PAGE_KERNEL_EXEC; set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); @@ -206,7 +207,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); - if (sev_active()) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { info.page_flag |= _PAGE_ENC; info.kernpg_flag |= _PAGE_ENC; } @@ -358,7 +359,7 @@ void machine_kexec(struct kimage *image) (unsigned long)page_list, image->start, image->preserve_context, - sme_active()); + cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -569,12 +570,12 @@ void arch_kexec_unprotect_crashkres(void) */ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { - if (sev_active()) + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return 0; /* - * If SME is active we need to be sure that kexec pages are - * not encrypted because when we boot to the new kernel the + * If host memory encryption is active we need to be sure that kexec + * pages are not encrypted because when we boot to the new kernel the * pages won't be accessed encrypted (initially). */ return set_memory_decrypted((unsigned long)vaddr, pages); @@ -582,12 +583,12 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { - if (sev_active()) + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return; /* - * If SME is active we need to reset the pages back to being - * an encrypted mapping before freeing them. + * If host memory encryption is active we need to reset the pages back + * to being an encrypted mapping before freeing them. */ set_memory_encrypted((unsigned long)vaddr, pages); } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 5e9a34b5bd74..169fb6f4cd2e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -251,7 +251,8 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL, *orc = NULL, *orc_ip = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL, + *retpolines = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -267,8 +268,14 @@ int module_finalize(const Elf_Ehdr *hdr, orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) orc_ip = s; + if (!strcmp(".retpoline_sites", secstrings + s->sh_name)) + retpolines = s; } + if (retpolines) { + void *rseg = (void *)retpolines->sh_addr; + apply_retpolines(rseg, rseg + retpolines->sh_size); + } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 04cafc057bed..ebc45360ffd4 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -218,6 +218,36 @@ void paravirt_end_context_switch(struct task_struct *next) if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) arch_enter_lazy_mmu_mode(); } + +static noinstr unsigned long pv_native_read_cr2(void) +{ + return native_read_cr2(); +} + +static noinstr void pv_native_write_cr2(unsigned long val) +{ + native_write_cr2(val); +} + +static noinstr unsigned long pv_native_get_debugreg(int regno) +{ + return native_get_debugreg(regno); +} + +static noinstr void pv_native_set_debugreg(int regno, unsigned long val) +{ + native_set_debugreg(regno, val); +} + +static noinstr void pv_native_irq_enable(void) +{ + native_irq_enable(); +} + +static noinstr void pv_native_irq_disable(void) +{ + native_irq_disable(); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -244,8 +274,8 @@ struct paravirt_patch_template pv_ops = { #ifdef CONFIG_PARAVIRT_XXL .cpu.cpuid = native_cpuid, - .cpu.get_debugreg = native_get_debugreg, - .cpu.set_debugreg = native_set_debugreg, + .cpu.get_debugreg = pv_native_get_debugreg, + .cpu.set_debugreg = pv_native_set_debugreg, .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, @@ -281,8 +311,8 @@ struct paravirt_patch_template pv_ops = { /* Irq ops. */ .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), - .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), + .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), + .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), .irq.safe_halt = native_safe_halt, .irq.halt = native_halt, #endif /* CONFIG_PARAVIRT_XXL */ @@ -298,8 +328,8 @@ struct paravirt_patch_template pv_ops = { .mmu.exit_mmap = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL - .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2), - .mmu.write_cr2 = native_write_cr2, + .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), + .mmu.write_cr2 = pv_native_write_cr2, .mmu.read_cr3 = __native_read_cr3, .mmu.write_cr3 = native_write_cr3, @@ -371,9 +401,6 @@ struct paravirt_patch_template pv_ops = { }; #ifdef CONFIG_PARAVIRT_XXL -/* At this point, native_get/set_debugreg has real function entries */ -NOKPROBE_SYMBOL(native_get_debugreg); -NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); void (*paravirt_iret)(void) = native_iret; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index c2cfa5e7c152..814ab46a0dad 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -6,7 +6,7 @@ #include <linux/swiotlb.h> #include <linux/memblock.h> #include <linux/dma-direct.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <asm/iommu.h> #include <asm/swiotlb.h> @@ -45,11 +45,10 @@ int __init pci_swiotlb_detect_4gb(void) swiotlb = 1; /* - * If SME is active then swiotlb will be set to 1 so that bounce - * buffers are allocated and used for devices that do not support - * the addressing range required for the encryption mask. + * Set swiotlb to 1 so that bounce buffers are allocated and used for + * devices that can't support DMA to encrypted memory. */ - if (sme_active()) + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) swiotlb = 1; return swiotlb; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1d9463e3096b..eb470be0e5ae 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -30,7 +30,9 @@ #include <asm/apic.h> #include <linux/uaccess.h> #include <asm/mwait.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> +#include <asm/fpu/sched.h> +#include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/nmi.h> #include <asm/tlbflush.h> @@ -43,6 +45,7 @@ #include <asm/io_bitmap.h> #include <asm/proto.h> #include <asm/frame.h> +#include <asm/unwind.h> #include "process.h" @@ -87,9 +90,20 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif - return fpu_clone(dst); + /* Drop the copied pointer to current's fpstate */ + dst->thread.fpu.fpstate = NULL; + + return 0; } +#ifdef CONFIG_X86_64 +void arch_release_task_struct(struct task_struct *tsk) +{ + if (fpu_state_size_dynamic()) + fpstate_free(&tsk->thread.fpu); +} +#endif + /* * Free thread data structures etc.. */ @@ -154,6 +168,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, frame->flags = X86_EFLAGS_FIXED; #endif + fpu_clone(p, clone_flags); + /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); @@ -942,70 +958,36 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * because the task might wake up and we might look at a stack * changing under us. */ -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { - unsigned long start, bottom, top, sp, fp, ip, ret = 0; - int count = 0; - - if (p == current || task_is_running(p)) - return 0; - - if (!try_get_task_stack(p)) - return 0; - - start = (unsigned long)task_stack_page(p); - if (!start) - goto out; + struct unwind_state state; + unsigned long addr = 0; - /* - * Layout of the stack page: - * - * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) - * PADDING - * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING - * stack - * ----------- bottom = start - * - * The tasks stack pointer points at the location where the - * framepointer is stored. The data on the stack is: - * ... IP FP ... IP FP - * - * We need to read FP and IP, so we need to adjust the upper - * bound by another unsigned long. - */ - top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; - top -= 2 * sizeof(unsigned long); - bottom = start; - - sp = READ_ONCE(p->thread.sp); - if (sp < bottom || sp > top) - goto out; - - fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); - do { - if (fp < bottom || fp > top) - goto out; - ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); - if (!in_sched_functions(ip)) { - ret = ip; - goto out; - } - fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); - } while (count++ < 16 && !task_is_running(p)); + for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + if (in_sched_functions(addr)) + continue; + break; + } -out: - put_task_stack(p); - return ret; + return addr; } long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long cpuid_enabled) + unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, cpuid_enabled); + return set_cpuid_mode(task, arg2); + case ARCH_GET_XCOMP_SUPP: + case ARCH_GET_XCOMP_PERM: + case ARCH_REQ_XCOMP_PERM: + return fpu_xstate_prctl(task, option, arg2); } return -EINVAL; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4f2f54e1281c..26edb1cd07a4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -41,7 +41,7 @@ #include <asm/ldt.h> #include <asm/processor.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/sched.h> #include <asm/desc.h> #include <linux/err.h> @@ -160,7 +160,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ @@ -213,7 +212,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ec0d836a13b1..3402edec236c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -42,7 +42,7 @@ #include <asm/processor.h> #include <asm/pkru.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/sched.h> #include <asm/mmu_context.h> #include <asm/prctl.h> #include <asm/desc.h> @@ -559,7 +559,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && @@ -620,7 +619,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 4c208ea3bd9f..6d2244c94799 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -29,9 +29,9 @@ #include <linux/uaccess.h> #include <asm/processor.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/ldt.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index c53271aebb64..c8fe74a28143 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -47,7 +47,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel) * %rsi page_list * %rdx start address * %rcx preserve_context - * %r8 sme_active + * %r8 host_mem_enc_active */ /* Save the CPU context, used for jumping back */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 79f164141116..40ed44ead063 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -830,6 +830,20 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + /* + * Do some memory reservations *before* memory is added to memblock, so + * memblock allocations won't overwrite it. + * + * After this point, everything still needed from the boot loader or + * firmware or kernel text should be early reserved or marked not RAM in + * e820. All other memory is free game. + * + * This call needs to happen before e820__memory_setup() which calls the + * xen_memory_setup() on Xen dom0 which relies on the fact that those + * early reservations have happened already. + */ + early_reserve_memory(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; e820__memory_setup(); parse_setup_data(); @@ -876,18 +890,6 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - /* - * Do some memory reservations *before* memory is added to - * memblock, so memblock allocations won't overwrite it. - * Do it after early param, so we could get (unlikely) panic from - * serial. - * - * After this point everything still needed from the boot loader or - * firmware or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - early_reserve_memory(); - #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 78a32b956e81..5afd98559193 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free(__pa(ptr), size); + memblock_free_ptr(ptr, size); } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 9f90f460a28c..787dc5f568b5 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -64,7 +64,7 @@ static bool sev_es_negotiate_protocol(void) static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { ghcb->save.sw_exit_code = 0; - memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); + __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } static bool vc_decoding_needed(unsigned long exit_code) @@ -94,25 +94,15 @@ static void vc_finish_insn(struct es_em_ctxt *ctxt) ctxt->regs->ip += ctxt->insn.length; } -static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) +static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { - enum es_result ret; - - /* Fill in protocol and format specifiers */ - ghcb->protocol_version = GHCB_PROTOCOL_MAX; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, exit_code); - ghcb_set_sw_exit_info_1(ghcb, exit_info_1); - ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + u32 ret; - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); + ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); + if (!ret) + return ES_OK; - if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) { + if (ret == 1) { u64 info = ghcb->save.sw_exit_info_2; unsigned long v; @@ -124,17 +114,40 @@ static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { ctxt->fi.vector = v; + if (info & SVM_EVTINJ_VALID_ERR) ctxt->fi.error_code = info >> 32; - ret = ES_EXCEPTION; - } else { - ret = ES_VMM_ERROR; + + return ES_EXCEPTION; } - } else { - ret = ES_OK; } - return ret; + return ES_VMM_ERROR; +} + +enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, + struct es_em_ctxt *ctxt, u64 exit_code, + u64 exit_info_1, u64 exit_info_2) +{ + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = GHCB_PROTOCOL_MAX; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + /* + * Hyper-V unenlightened guests use a paravisor for communicating and + * GHCB pages are being allocated and set up by that paravisor. Linux + * should not change the GHCB page's physical address. + */ + if (set_ghcb_msr) + sev_es_wr_ghcb_msr(__pa(ghcb)); + + VMGEXIT(); + + return verify_exception_info(ghcb, ctxt); } /* @@ -411,7 +424,7 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) */ sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); ghcb_set_sw_scratch(ghcb, sw_scratch); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_IOIO, exit_info_1, exit_info_2); if (ret != ES_OK) return ret; @@ -453,7 +466,8 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) ghcb_set_rax(ghcb, rax); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, + SVM_EXIT_IOIO, exit_info_1, 0); if (ret != ES_OK) return ret; @@ -484,7 +498,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb, /* xgetbv will cause #GP - use reset value for xcr0 */ ghcb_set_xcr0(ghcb, 1); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_CPUID, 0, 0); if (ret != ES_OK) return ret; @@ -509,7 +523,7 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); enum es_result ret; - ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, 0, 0); if (ret != ES_OK) return ret; diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index a6895e440bc3..c195ffe58049 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -11,7 +11,7 @@ #include <linux/sched/debug.h> /* For show_regs() */ #include <linux/percpu-defs.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <linux/printk.h> #include <linux/mm_types.h> #include <linux/set_memory.h> @@ -23,7 +23,7 @@ #include <asm/stacktrace.h> #include <asm/sev.h> #include <asm/insn-eval.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/xcr.h> #include <asm/processor.h> #include <asm/realmode.h> #include <asm/traps.h> @@ -615,7 +615,7 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd) int cpu; u64 pfn; - if (!sev_es_active()) + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) return 0; pflags = _PAGE_NX | _PAGE_RW; @@ -648,7 +648,8 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) ghcb_set_rdx(ghcb, regs->dx); } - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_MSR, + exit_info_1, 0); if ((ret == ES_OK) && (!exit_info_1)) { regs->ax = ghcb->save.rax; @@ -774,7 +775,7 @@ void __init sev_es_init_vc_handling(void) BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); - if (!sev_es_active()) + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) return; if (!sev_es_check_cpu_features()) @@ -867,7 +868,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); - return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); + return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2); } static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb, @@ -1117,7 +1118,7 @@ static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, /* Using a value of 0 for ExitInfo1 means RAX holds the value */ ghcb_set_rax(ghcb, val); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); if (ret != ES_OK) return ret; @@ -1147,7 +1148,7 @@ static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { - return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); + return sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WBINVD, 0, 0); } static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) @@ -1156,7 +1157,7 @@ static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt ghcb_set_rcx(ghcb, ctxt->regs->cx); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_RDPMC, 0, 0); if (ret != ES_OK) return ret; @@ -1197,7 +1198,7 @@ static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, if (x86_platform.hyper.sev_es_hcall_prepare) x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_VMMCALL, 0, 0); if (ret != ES_OK) return ret; @@ -1319,13 +1320,26 @@ static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) } } -static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) +static __always_inline bool is_vc2_stack(unsigned long sp) { - unsigned long sp = (unsigned long)regs; - return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); } +static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) +{ + unsigned long sp, prev_sp; + + sp = (unsigned long)regs; + prev_sp = regs->sp; + + /* + * If the code was already executing on the VC2 stack when the #VC + * happened, let it proceed to the normal handling routine. This way the + * code executing on the VC2 stack can cause #VC exceptions to get handled. + */ + return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); +} + static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) { struct ghcb_state state; @@ -1406,7 +1420,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) * But keep this here in case the noinstr annotations are violated due * to bug elsewhere. */ - if (unlikely(on_vc_fallback_stack(regs))) { + if (unlikely(vc_from_invalid_context(regs))) { instrumentation_begin(); panic("Can't handle #VC exception from unsupported context\n"); instrumentation_end(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index f4d21e470083..ec71e06ae364 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/errno.h> #include <linux/wait.h> #include <linux/tracehook.h> @@ -30,8 +31,8 @@ #include <asm/processor.h> #include <asm/ucontext.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> +#include <asm/fpu/xstate.h> #include <asm/vdso.h> #include <asm/mce.h> #include <asm/sighandling.h> @@ -41,6 +42,7 @@ #include <linux/compat.h> #include <asm/proto.h> #include <asm/ia32_unistd.h> +#include <asm/fpu/xstate.h> #endif /* CONFIG_X86_64 */ #include <asm/syscall.h> @@ -79,9 +81,9 @@ static void force_valid_ss(struct pt_regs *regs) # define CONTEXT_COPY_SIZE sizeof(struct sigcontext) #endif -static int restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *usc, - unsigned long uc_flags) +static bool restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *usc, + unsigned long uc_flags) { struct sigcontext sc; @@ -89,7 +91,7 @@ static int restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) - return -EFAULT; + return false; #ifdef CONFIG_X86_32 set_user_gs(regs, sc.gs); @@ -244,7 +246,6 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; - int ret; /* redzone */ if (IS_ENABLED(CONFIG_X86_64)) @@ -292,8 +293,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } /* save i387 and extended state */ - ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); - if (ret < 0) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) return (void __user *)-1L; return (void __user *)sp; @@ -643,7 +643,7 @@ SYSCALL_DEFINE0(sigreturn) * x86_32 has no uc_flags bits relevant to restore_sigcontext. * Save a few cycles by skipping the __get_user. */ - if (restore_sigcontext(regs, &frame->sc, 0)) + if (!restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -671,7 +671,7 @@ SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -721,12 +721,15 @@ badframe: /* max_frame_size tells userspace the worst case signal stack size. */ static unsigned long __ro_after_init max_frame_size; +static unsigned int __ro_after_init fpu_default_state_size; void __init init_sigframe_size(void) { + fpu_default_state_size = fpu__get_fpstate_size(); + max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; - max_frame_size += fpu__get_fpstate_size() + MAX_XSAVE_PADDING; + max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING; /* Userspace expects an aligned size. */ max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); @@ -910,6 +913,62 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) force_sig(SIGSEGV); } +#ifdef CONFIG_DYNAMIC_SIGFRAME +#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE +static bool strict_sigaltstack_size __ro_after_init = true; +#else +static bool strict_sigaltstack_size __ro_after_init = false; +#endif + +static int __init strict_sas_size(char *arg) +{ + return kstrtobool(arg, &strict_sigaltstack_size); +} +__setup("strict_sas_size", strict_sas_size); + +/* + * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512 + * exceeds that size already. As such programs might never use the + * sigaltstack they just continued to work. While always checking against + * the real size would be correct, this might be considered a regression. + * + * Therefore avoid the sanity check, unless enforced by kernel + * configuration or command line option. + * + * When dynamic FPU features are supported, the check is also enforced when + * the task has permissions to use dynamic features. Tasks which have no + * permission are checked against the size of the non-dynamic feature set + * if strict checking is enabled. This avoids forcing all tasks on the + * system to allocate large sigaltstacks even if they are never going + * to use a dynamic feature. As this is serialized via sighand::siglock + * any permission request for a dynamic feature either happened already + * or will see the newly install sigaltstack size in the permission checks. + */ +bool sigaltstack_size_valid(size_t ss_size) +{ + unsigned long fsize = max_frame_size - fpu_default_state_size; + u64 mask; + + lockdep_assert_held(¤t->sighand->siglock); + + if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) + return true; + + fsize += current->group_leader->thread.fpu.perm.__user_state_size; + if (likely(ss_size > fsize)) + return true; + + if (strict_sigaltstack_size) + return ss_size > fsize; + + mask = current->group_leader->thread.fpu.perm.__state_perm; + if (mask & XFEATURE_MASK_USER_DYNAMIC) + return ss_size > fsize; + + return true; +} +#endif /* CONFIG_DYNAMIC_SIGFRAME */ + #ifdef CONFIG_X86_X32_ABI COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) { @@ -929,7 +988,7 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85f6e242b6b4..8241927addff 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -70,7 +70,7 @@ #include <asm/mwait.h> #include <asm/apic.h> #include <asm/io_apic.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> @@ -101,6 +101,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); + /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); @@ -464,6 +466,21 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + /* If the arch didn't set up l2c_id, fall back to SMT */ + if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID) + return match_smt(c, o); + + /* Do not match if L2 cache id does not match: */ + if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2)) + return false; + + return topology_sane(c, o, "l2c"); +} + /* * Unlike the other levels, we do not enforce keeping a * multicore group inside a NUMA node. If this happens, we will @@ -523,7 +540,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) } -#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) { return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; @@ -541,12 +558,21 @@ static int x86_smt_flags(void) return cpu_smt_flags() | x86_sched_itmt_flags(); } #endif +#ifdef CONFIG_SCHED_CLUSTER +static int x86_cluster_flags(void) +{ + return cpu_cluster_flags() | x86_sched_itmt_flags(); +} +#endif #endif static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, +#endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif @@ -557,6 +583,9 @@ static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, +#endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif @@ -584,6 +613,7 @@ void set_cpu_sibling_map(int cpu) if (!has_mp) { cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu)); cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); cpumask_set_cpu(cpu, topology_die_cpumask(cpu)); c->booted_cores = 1; @@ -602,6 +632,9 @@ void set_cpu_sibling_map(int cpu) if ((i == cpu) || (has_mp && match_llc(c, o))) link_mask(cpu_llc_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_l2c(c, o))) + link_mask(cpu_l2c_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_die(c, o))) link_mask(topology_die_cpumask, cpu, i); } @@ -652,6 +685,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return cpu_llc_shared_mask(cpu); } +const struct cpumask *cpu_clustergroup_mask(int cpu) +{ + return cpu_l2c_shared_mask(cpu); +} + static void impress_friends(void) { int cpu; @@ -1335,6 +1373,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); } /* @@ -1564,7 +1603,10 @@ static void remove_siblinginfo(int cpu) for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); + for_each_cpu(sibling, cpu_l2c_shared_mask(cpu)) + cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling)); cpumask_clear(cpu_llc_shared_mask(cpu)); + cpumask_clear(cpu_l2c_shared_mask(cpu)); cpumask_clear(topology_sibling_cpumask(cpu)); cpumask_clear(topology_core_cpumask(cpu)); cpumask_clear(topology_die_cpumask(cpu)); @@ -2166,7 +2208,7 @@ DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; void arch_scale_freq_tick(void) { - u64 freq_scale = SCHED_CAPACITY_SCALE; + u64 freq_scale; u64 aperf, mperf; u64 acnt, mcnt; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a58800973aed..43f70bc01762 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -48,7 +48,7 @@ #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/cpu.h> #include <asm/cpu_entry_area.h> #include <asm/mce.h> @@ -709,7 +709,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r stack = (unsigned long *)sp; if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || - info.type >= STACK_TYPE_EXCEPTION_LAST) + info.type > STACK_TYPE_EXCEPTION_LAST) sp = __this_cpu_ist_top_va(VC2); sync: @@ -1108,10 +1108,48 @@ DEFINE_IDTENTRY(exc_spurious_interrupt_bug) */ } +static bool handle_xfd_event(struct pt_regs *regs) +{ + u64 xfd_err; + int err; + + if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) + return false; + + rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + if (!xfd_err) + return false; + + wrmsrl(MSR_IA32_XFD_ERR, 0); + + /* Die if that happens in kernel space */ + if (WARN_ON(!user_mode(regs))) + return false; + + local_irq_enable(); + + err = xfd_enable_feature(xfd_err); + + switch (err) { + case -EPERM: + force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); + break; + case -EFAULT: + force_sig(SIGSEGV); + break; + } + + local_irq_disable(); + return true; +} + DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); + if (handle_xfd_event(regs)) + return; + #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 576b47e7523d..5a4b21389b1d 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -92,8 +92,8 @@ static const char * const umip_insns[5] = { #define umip_pr_err(regs, fmt, ...) \ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__) -#define umip_pr_warn(regs, fmt, ...) \ - umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__) +#define umip_pr_debug(regs, fmt, ...) \ + umip_printk(regs, KERN_DEBUG, fmt, ##__VA_ARGS__) /** * umip_printk() - Print a rate-limited message @@ -361,10 +361,10 @@ bool fixup_umip_exception(struct pt_regs *regs) if (umip_inst < 0) return false; - umip_pr_warn(regs, "%s instruction cannot be used by applications.\n", + umip_pr_debug(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); - umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n"); + umip_pr_debug(regs, "For now, expensive software emulation returns the result.\n"); if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, user_64bit_mode(regs))) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index efd9e9ea17f2..3d6dc12d198f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -272,6 +272,20 @@ SECTIONS __parainstructions_end = .; } +#ifdef CONFIG_RETPOLINE + /* + * List of instructions that call/jmp/jcc to retpoline thunks + * __x86_indirect_thunk_*(). These instructions can be patched along + * with alternatives, after which the section can be freed. + */ + . = ALIGN(8); + .retpoline_sites : AT(ADDR(.retpoline_sites) - LOAD_OFFSET) { + __retpoline_sites = .; + *(.retpoline_sites) + __retpoline_sites_end = .; + } +#endif + /* * struct alt_inst entries. From the header (alternative.h): * "Alternative instructions for different CPU types or capabilities" diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index fe03bd978761..751aa85a3001 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -65,8 +65,8 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( for (i = 0; i < nent; i++) { e = &entries[i]; - if (e->function == function && (e->index == index || - !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX))) + if (e->function == function && + (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)) return e; } diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2837110e66ed..9a144ca8e146 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -435,7 +435,6 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); __FOP_RET(#op) asm(".pushsection .fixup, \"ax\"\n" - ".global kvm_fastop_exception \n" "kvm_fastop_exception: xor %esi, %esi; ret\n" ".popsection"); @@ -4206,7 +4205,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) u64 cr4 = ctxt->ops->get_cr(ctxt, 4); if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt)) - return emulate_ud(ctxt); + return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 232a86a6faaf..d5124b520f76 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -939,7 +939,7 @@ static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_init(&hv_vcpu->stimer[i], i); - hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); + hv_vcpu->vp_index = vcpu->vcpu_idx; return 0; } @@ -1444,7 +1444,6 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) switch (msr) { case HV_X64_MSR_VP_INDEX: { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); - int vcpu_idx = kvm_vcpu_get_idx(vcpu); u32 new_vp_index = (u32)data; if (!host || new_vp_index >= KVM_MAX_VCPUS) @@ -1459,9 +1458,9 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) * VP index is changing, adjust num_mismatched_vp_indexes if * it now matches or no longer matches vcpu_idx. */ - if (hv_vcpu->vp_index == vcpu_idx) + if (hv_vcpu->vp_index == vcpu->vcpu_idx) atomic_inc(&hv->num_mismatched_vp_indexes); - else if (new_vp_index == vcpu_idx) + else if (new_vp_index == vcpu->vcpu_idx) atomic_dec(&hv->num_mismatched_vp_indexes); hv_vcpu->vp_index = new_vp_index; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 730da8537d05..ed1c4e546d04 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -83,7 +83,7 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); - return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu); + return hv_vcpu ? hv_vcpu->vp_index : vcpu->vcpu_idx; } int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index ff005fe738a4..8c065da73f8e 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -319,8 +319,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) unsigned index; bool mask_before, mask_after; union kvm_ioapic_redirect_entry *e; - unsigned long vcpu_bitmap; int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode; + DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -384,9 +384,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) irq.shorthand = APIC_DEST_NOSHORT; irq.dest_id = e->fields.dest_id; irq.msi_redir_hint = false; - bitmap_zero(&vcpu_bitmap, 16); + bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, - &vcpu_bitmap); + vcpu_bitmap); if (old_dest_mode != e->fields.dest_mode || old_dest_id != e->fields.dest_id) { /* @@ -399,10 +399,10 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) kvm_lapic_irq_dest_mode( !!e->fields.dest_mode); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, - &vcpu_bitmap); + vcpu_bitmap); } kvm_make_scan_ioapic_request_mask(ioapic->kvm, - &vcpu_bitmap); + vcpu_bitmap); } else { kvm_make_scan_ioapic_request(ioapic->kvm); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 76fb00921203..d6ac32f3f650 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2321,13 +2321,14 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_apicv); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; + u64 msr_val; int i; if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; + msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + msr_val |= MSR_IA32_APICBASE_BSP; + kvm_lapic_set_base(vcpu, msr_val); } if (!apic) @@ -2336,11 +2337,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - if (!init_event) { - apic->base_address = APIC_DEFAULT_PHYS_BASE; - + /* The xAPIC ID is set at RESET even if the APIC was already enabled. */ + if (!init_event) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); - } kvm_apic_set_version(apic->vcpu); for (i = 0; i < KVM_APIC_LVT_NUM; i++) @@ -2481,6 +2480,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) lapic_timer_advance_dynamic = false; } + /* + * Stuff the APIC ENABLE bit in lieu of temporarily incrementing + * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset(). + */ + vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -2942,5 +2946,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) void kvm_lapic_exit(void) { static_key_deferred_flush(&apic_hw_disabled); + WARN_ON(static_branch_unlikely(&apic_hw_disabled.key)); static_key_deferred_flush(&apic_sw_disabled); + WARN_ON(static_branch_unlikely(&apic_sw_disabled.key)); } diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2d7e61122af8..0cc58901bf7a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2027,8 +2027,8 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) } while (!sp->unsync_children); } -static void mmu_sync_children(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *parent) +static int mmu_sync_children(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *parent, bool can_yield) { int i; struct kvm_mmu_page *sp; @@ -2055,12 +2055,18 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, } if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + if (!can_yield) { + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + return -EINTR; + } + cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); flush = false; } } kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + return 0; } static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) @@ -2146,9 +2152,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } - if (sp->unsync_children) - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - __clear_sp_write_flooding_count(sp); trace_get_page: @@ -3684,7 +3687,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) write_lock(&vcpu->kvm->mmu_lock); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); - mmu_sync_children(vcpu, sp); + mmu_sync_children(vcpu, sp, true); kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); write_unlock(&vcpu->kvm->mmu_lock); @@ -3700,7 +3703,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (IS_VALID_PAE_ROOT(root)) { root &= PT64_BASE_ADDR_MASK; sp = to_shadow_page(root); - mmu_sync_children(vcpu, sp); + mmu_sync_children(vcpu, sp, true); } } @@ -4593,10 +4596,10 @@ static void update_pkru_bitmask(struct kvm_mmu *mmu) unsigned bit; bool wp; - if (!is_cr4_pke(mmu)) { - mmu->pkru_mask = 0; + mmu->pkru_mask = 0; + + if (!is_cr4_pke(mmu)) return; - } wp = is_cr0_wp(mmu); diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 269f11f92fd0..21427e84a82e 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -164,13 +164,13 @@ void kvm_page_track_cleanup(struct kvm *kvm) cleanup_srcu_struct(&head->track_srcu); } -void kvm_page_track_init(struct kvm *kvm) +int kvm_page_track_init(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; head = &kvm->arch.track_notifier_head; - init_srcu_struct(&head->track_srcu); INIT_HLIST_HEAD(&head->track_notifier_list); + return init_srcu_struct(&head->track_srcu); } /* diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 7d03e9b7ccfa..913d52a7923e 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -707,8 +707,27 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, - false, access); + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, + it.level-1, false, access); + /* + * We must synchronize the pagetable before linking it + * because the guest doesn't need to flush tlb when + * the gpte is changed from non-present to present. + * Otherwise, the guest may use the wrong mapping. + * + * For PG_LEVEL_4K, kvm_mmu_get_page() has already + * synchronized it transiently via kvm_sync_page(). + * + * For higher level pagetable, we synchronize it via + * the slower mmu_sync_children(). If it needs to + * break, some progress has been made; return + * RET_PF_RETRY and retry on the next #PF. + * KVM_REQ_MMU_SYNC is not necessary but it + * expedites the process. + */ + if (sp->unsync_children && + mmu_sync_children(vcpu, sp, false)) + return RET_PF_RETRY; } /* @@ -1047,14 +1066,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. - * - * Note: - * We should flush all tlbs if spte is dropped even though guest is - * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page - * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't - * used by guest then tlbs are not flushed, so guest is allowed to access the - * freed pages. - * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1107,13 +1118,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - /* - * Update spte before increasing tlbs_dirty to make - * sure no tlb flush is lost after spte is zapped; see - * the comments in kvm_flush_remote_tlbs(). - */ - smp_wmb(); - vcpu->kvm->tlbs_dirty++; + set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; continue; } @@ -1128,12 +1133,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - /* - * The same as above where we are doing - * prefetch_invalid_gpte(). - */ - smp_wmb(); - vcpu->kvm->tlbs_dirty++; + set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; continue; } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 2545d0c61985..510b833cbd39 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -545,7 +545,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); - svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; svm->vmcb->control.int_state = svm->nested.ctl.int_state; svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; @@ -579,7 +578,7 @@ static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to } int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, - struct vmcb *vmcb12) + struct vmcb *vmcb12, bool from_vmrun) { struct vcpu_svm *svm = to_svm(vcpu); int ret; @@ -609,13 +608,16 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, - nested_npt_enabled(svm), true); + nested_npt_enabled(svm), from_vmrun); if (ret) return ret; if (!npt_enabled) vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested; + if (!from_vmrun) + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + svm_set_gif(svm, true); return 0; @@ -681,7 +683,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) svm->nested.nested_run_pending = 1; - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12)) + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true)) goto out_exit_err; if (nested_svm_vmrun_msrpm(svm)) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 75e0b21ad07c..5847b05d29da 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -17,10 +17,10 @@ #include <linux/misc_cgroup.h> #include <linux/processor.h> #include <linux/trace_events.h> -#include <asm/fpu/internal.h> #include <asm/pkru.h> #include <asm/trapnr.h> +#include <asm/fpu/xcr.h> #include "x86.h" #include "svm.h" @@ -595,43 +595,55 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) return 0; } -static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, + int *error) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_update_vmsa vmsa; + struct vcpu_svm *svm = to_svm(vcpu); + int ret; + + /* Perform some pre-encryption checks against the VMSA */ + ret = sev_es_sync_vmsa(svm); + if (ret) + return ret; + + /* + * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of + * the VMSA memory content (i.e it will write the same memory region + * with the guest's key), so invalidate it first. + */ + clflush_cache_range(svm->vmsa, PAGE_SIZE); + + vmsa.reserved = 0; + vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; + vmsa.address = __sme_pa(svm->vmsa); + vmsa.len = PAGE_SIZE; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); + if (ret) + return ret; + + vcpu->arch.guest_state_protected = true; + return 0; +} + +static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ struct kvm_vcpu *vcpu; int i, ret; if (!sev_es_guest(kvm)) return -ENOTTY; - vmsa.reserved = 0; - kvm_for_each_vcpu(i, vcpu, kvm) { - struct vcpu_svm *svm = to_svm(vcpu); - - /* Perform some pre-encryption checks against the VMSA */ - ret = sev_es_sync_vmsa(svm); + ret = mutex_lock_killable(&vcpu->mutex); if (ret) return ret; - /* - * The LAUNCH_UPDATE_VMSA command will perform in-place - * encryption of the VMSA memory content (i.e it will write - * the same memory region with the guest's key), so invalidate - * it first. - */ - clflush_cache_range(svm->vmsa, PAGE_SIZE); + ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error); - vmsa.handle = sev->handle; - vmsa.address = __sme_pa(svm->vmsa); - vmsa.len = PAGE_SIZE; - ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, - &argp->error); + mutex_unlock(&vcpu->mutex); if (ret) return ret; - - svm->vcpu.arch.guest_state_protected = true; } return 0; @@ -1397,8 +1409,10 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Bind ASID to this guest */ ret = sev_bind_asid(kvm, start.handle, error); - if (ret) + if (ret) { + sev_decommission(start.handle); goto e_free_session; + } params.handle = start.handle; if (copy_to_user((void __user *)(uintptr_t)argp->data, @@ -1464,12 +1478,19 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Pin guest memory */ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, - PAGE_SIZE, &n, 0); + PAGE_SIZE, &n, 1); if (IS_ERR(guest_page)) { ret = PTR_ERR(guest_page); goto e_free_trans; } + /* + * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP + * encrypts the written data with the guest's key, and the cache may + * contain dirty, unencrypted data. + */ + sev_clflush_pages(guest_page, n); + /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; @@ -1501,6 +1522,20 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } +static bool cmd_allowed_from_miror(u32 cmd_id) +{ + /* + * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES + * active mirror VMs. Also allow the debugging and status commands. + */ + if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA || + cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT || + cmd_id == KVM_SEV_DBG_ENCRYPT) + return true; + + return false; +} + int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -1517,8 +1552,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) mutex_lock(&kvm->lock); - /* enc_context_owner handles all memory enc operations */ - if (is_mirroring_enc_context(kvm)) { + /* Only the enc_context_owner handles some memory enc operations. */ + if (is_mirroring_enc_context(kvm) && + !cmd_allowed_from_miror(sev_cmd.id)) { r = -EINVAL; goto out; } @@ -1715,8 +1751,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) { struct file *source_kvm_file; struct kvm *source_kvm; - struct kvm_sev_info *mirror_sev; - unsigned int asid; + struct kvm_sev_info source_sev, *mirror_sev; int ret; source_kvm_file = fget(source_fd); @@ -1739,7 +1774,8 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) goto e_source_unlock; } - asid = to_kvm_svm(source_kvm)->sev_info.asid; + memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info, + sizeof(source_sev)); /* * The mirror kvm holds an enc_context_owner ref so its asid can't @@ -1759,8 +1795,16 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) /* Set enc_context_owner and copy its encryption context over */ mirror_sev = &to_kvm_svm(kvm)->sev_info; mirror_sev->enc_context_owner = source_kvm; - mirror_sev->asid = asid; mirror_sev->active = true; + mirror_sev->asid = source_sev.asid; + mirror_sev->fd = source_sev.fd; + mirror_sev->es_active = source_sev.es_active; + mirror_sev->handle = source_sev.handle; + /* + * Do not copy ap_jump_table. Since the mirror does not share the same + * KVM contexts as the original, and they may have different + * memory-views. + */ mutex_unlock(&kvm->lock); return 0; @@ -2547,11 +2591,20 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) { - if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2)) + int count; + int bytes; + + if (svm->vmcb->control.exit_info_2 > INT_MAX) + return -EINVAL; + + count = svm->vmcb->control.exit_info_2; + if (unlikely(check_mul_overflow(count, size, &bytes))) + return -EINVAL; + + if (!setup_vmgexit_scratch(svm, in, bytes)) return -EINVAL; - return kvm_sev_es_string_io(&svm->vcpu, size, port, - svm->ghcb_sa, svm->ghcb_sa_len, in); + return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->ghcb_sa, count, in); } void sev_es_init_vmcb(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 05e8d4d27969..226482daa6eb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -25,6 +25,7 @@ #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/rwsem.h> +#include <linux/cc_platform.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -36,6 +37,7 @@ #include <asm/spec-ctrl.h> #include <asm/cpu_device_id.h> #include <asm/traps.h> +#include <asm/fpu/api.h> #include <asm/virtext.h> #include "trace.h" @@ -455,7 +457,7 @@ static int has_svm(void) return 0; } - if (sev_active()) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { pr_info("KVM is unsupported when running as an SEV guest\n"); return 0; } @@ -1346,10 +1348,10 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) /* * SEV-ES guests maintain an encrypted version of their FPU * state which is restored and saved on VMRUN and VMEXIT. - * Free the fpu structure to prevent KVM from attempting to - * access the FPU state. + * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't + * do xsave/xrstor on it. */ - kvm_free_guest_fpu(vcpu); + fpstate_set_confidential(&vcpu->arch.guest_fpu); } err = avic_init_vcpu(svm); @@ -1566,6 +1568,8 @@ static void svm_clear_vintr(struct vcpu_svm *svm) svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & V_IRQ_INJECTION_BITS_MASK; + + svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; } vmcb_mark_dirty(svm->vmcb, VMCB_INTR); @@ -2222,6 +2226,10 @@ static int gp_interception(struct kvm_vcpu *vcpu) if (error_code) goto reinject; + /* All SVM instructions expect page aligned RAX */ + if (svm->vmcb->save.rax & ~PAGE_MASK) + goto reinject; + /* Decode the instruction for usage later */ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) goto reinject; @@ -4285,43 +4293,44 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) struct kvm_host_map map_save; int ret; - if (is_guest_mode(vcpu)) { - /* FED8h - SVM Guest */ - put_smstate(u64, smstate, 0x7ed8, 1); - /* FEE0h - SVM Guest VMCB Physical Address */ - put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); + if (!is_guest_mode(vcpu)) + return 0; - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + /* FED8h - SVM Guest */ + put_smstate(u64, smstate, 0x7ed8, 1); + /* FEE0h - SVM Guest VMCB Physical Address */ + put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); - ret = nested_svm_vmexit(svm); - if (ret) - return ret; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; - /* - * KVM uses VMCB01 to store L1 host state while L2 runs but - * VMCB01 is going to be used during SMM and thus the state will - * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save - * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the - * format of the area is identical to guest save area offsetted - * by 0x400 (matches the offset of 'struct vmcb_save_area' - * within 'struct vmcb'). Note: HSAVE area may also be used by - * L1 hypervisor to save additional host context (e.g. KVM does - * that, see svm_prepare_guest_switch()) which must be - * preserved. - */ - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), - &map_save) == -EINVAL) - return 1; + ret = nested_svm_vmexit(svm); + if (ret) + return ret; - BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); + /* + * KVM uses VMCB01 to store L1 host state while L2 runs but + * VMCB01 is going to be used during SMM and thus the state will + * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save + * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the + * format of the area is identical to guest save area offsetted + * by 0x400 (matches the offset of 'struct vmcb_save_area' + * within 'struct vmcb'). Note: HSAVE area may also be used by + * L1 hypervisor to save additional host context (e.g. KVM does + * that, see svm_prepare_guest_switch()) which must be + * preserved. + */ + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), + &map_save) == -EINVAL) + return 1; - svm_copy_vmrun_state(map_save.hva + 0x400, - &svm->vmcb01.ptr->save); + BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); - kvm_vcpu_unmap(vcpu, &map_save, true); - } + svm_copy_vmrun_state(map_save.hva + 0x400, + &svm->vmcb01.ptr->save); + + kvm_vcpu_unmap(vcpu, &map_save, true); return 0; } @@ -4329,50 +4338,54 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map, map_save; - int ret = 0; + u64 saved_efer, vmcb12_gpa; + struct vmcb *vmcb12; + int ret; - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { - u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); - u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8); - u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); - struct vmcb *vmcb12; + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return 0; - if (guest) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - return 1; + /* Non-zero if SMI arrived while vCPU was in guest mode. */ + if (!GET_SMSTATE(u64, smstate, 0x7ed8)) + return 0; - if (!(saved_efer & EFER_SVME)) - return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + return 1; - if (kvm_vcpu_map(vcpu, - gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) - return 1; + saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); + if (!(saved_efer & EFER_SVME)) + return 1; - if (svm_allocate_nested(svm)) - return 1; + vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) + return 1; - vmcb12 = map.hva; + ret = 1; + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL) + goto unmap_map; - nested_load_control_from_vmcb12(svm, &vmcb12->control); + if (svm_allocate_nested(svm)) + goto unmap_save; - ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12); - kvm_vcpu_unmap(vcpu, &map, true); + /* + * Restore L1 host state from L1 HSAVE area as VMCB01 was + * used during SMM (see svm_enter_smm()) + */ - /* - * Restore L1 host state from L1 HSAVE area as VMCB01 was - * used during SMM (see svm_enter_smm()) - */ - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), - &map_save) == -EINVAL) - return 1; + svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400); - svm_copy_vmrun_state(&svm->vmcb01.ptr->save, - map_save.hva + 0x400); + /* + * Enter the nested guest now + */ - kvm_vcpu_unmap(vcpu, &map_save, true); - } - } + vmcb12 = map.hva; + nested_load_control_from_vmcb12(svm, &vmcb12->control); + ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); +unmap_save: + kvm_vcpu_unmap(vcpu, &map_save, true); +unmap_map: + kvm_vcpu_unmap(vcpu, &map, true); return ret; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 524d943f3efc..e63ac08115cf 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -191,7 +191,7 @@ struct vcpu_svm { /* SEV-ES scratch area support */ void *ghcb_sa; - u64 ghcb_sa_len; + u32 ghcb_sa_len; bool ghcb_sa_sync; bool ghcb_sa_free; @@ -218,12 +218,12 @@ DECLARE_PER_CPU(struct svm_cpu_data *, svm_data); void recalc_intercepts(struct vcpu_svm *svm); -static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) +static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) { return container_of(kvm, struct kvm_svm, kvm); } -static inline bool sev_guest(struct kvm *kvm) +static __always_inline bool sev_guest(struct kvm *kvm) { #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -234,7 +234,7 @@ static inline bool sev_guest(struct kvm *kvm) #endif } -static inline bool sev_es_guest(struct kvm *kvm) +static __always_inline bool sev_es_guest(struct kvm *kvm) { #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -271,7 +271,7 @@ static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit) return !test_bit(bit, (unsigned long *)&vmcb->control.clean); } -static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) +static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_svm, vcpu); } @@ -459,7 +459,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } -int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12); +int enter_svm_guest_mode(struct kvm_vcpu *vcpu, + u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); void svm_leave_nested(struct vcpu_svm *svm); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 22e2b019de37..9430d6437c9f 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -56,12 +56,12 @@ static inline void invlpga(unsigned long addr, u32 asid) * VMSAVE, VMLOAD, etc... is still controlled by the effective address size, * hence 'unsigned long' instead of 'hpa_t'. */ -static inline void vmsave(unsigned long pa) +static __always_inline void vmsave(unsigned long pa) { svm_asm1(vmsave, "a" (pa), "memory"); } -static inline void vmload(unsigned long pa) +static __always_inline void vmload(unsigned long pa) { svm_asm1(vmload, "a" (pa), "memory"); } diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 0dab1b7b529f..ba6f99f584ac 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -353,14 +353,20 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_VMX_EXIT_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: - ctl_high &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; break; case MSR_IA32_VMX_ENTRY_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - ctl_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; break; case MSR_IA32_VMX_PROCBASED_CTLS2: - ctl_high &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + break; + case MSR_IA32_VMX_PINBASED_CTLS: + ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; + break; + case MSR_IA32_VMX_VMFUNC: + ctl_low &= ~EVMCS1_UNSUPPORTED_VMFUNC; break; } diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 152ab0aa82cf..16731d2cf231 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -93,7 +93,7 @@ static __always_inline int get_evmcs_offset(unsigned long field, return evmcs_field->offset; } -static inline void evmcs_write64(unsigned long field, u64 value) +static __always_inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; int offset = get_evmcs_offset(field, &clean_field); @@ -183,7 +183,7 @@ static inline void evmcs_load(u64 phys_addr) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ -static inline void evmcs_write64(unsigned long field, u64 value) {} +static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} static inline void evmcs_write16(unsigned long field, u16 value) {} static inline u64 evmcs_read64(unsigned long field) { return 0; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ccb03d69546c..eedcebf58004 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2583,8 +2583,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * Guest state is invalid and unrestricted guest is disabled, * which means L1 attempted VMEntry to L2 with invalid state. * Fail the VMEntry. + * + * However when force loading the guest state (SMM exit or + * loading nested state after migration, it is possible to + * have invalid guest state now, which will be later fixed by + * restoring L2 register state */ - if (CC(!vmx_guest_state_valid(vcpu))) { + if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -4351,6 +4356,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); + + to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) @@ -4899,14 +4906,7 @@ out_vmcs02: return -ENOMEM; } -/* - * Emulate the VMXON instruction. - * Currently, we just remember that VMX is active, and do not save or even - * inspect the argument to VMXON (the so-called "VMXON pointer") because we - * do not currently need to store anything in that guest-allocated memory - * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their - * argument is different from the VMXON pointer (which the spec says they do). - */ +/* Emulate the VMXON instruction. */ static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; @@ -5903,6 +5903,12 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ return true; + case EXIT_REASON_BUS_LOCK: + /* + * At present, bus lock VM exit is never exposed to L1. + * Handle L2's bus locks in L0 directly. + */ + return true; default: break; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0c2c0d5ae873..fb9e4ac3df22 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -35,7 +35,7 @@ #include <asm/cpu_device_id.h> #include <asm/debugreg.h> #include <asm/desc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/idtentry.h> #include <asm/io.h> #include <asm/irq_remapping.h> @@ -1323,7 +1323,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) vmx_prepare_switch_to_host(to_vmx(vcpu)); } -static bool emulation_required(struct kvm_vcpu *vcpu) +bool vmx_emulation_required(struct kvm_vcpu *vcpu) { return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); } @@ -1367,7 +1367,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) - vmx->emulation_required = emulation_required(vcpu); + vmx->emulation_required = vmx_emulation_required(vcpu); } u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) @@ -1837,10 +1837,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) &msr_info->data)) return 1; /* - * Enlightened VMCS v1 doesn't have certain fields, but buggy - * Hyper-V versions are still trying to use corresponding - * features when they are exposed. Filter out the essential - * minimum. + * Enlightened VMCS v1 doesn't have certain VMCS fields but + * instead of just ignoring the features, different Hyper-V + * versions are either trying to use them and fail or do some + * sanity checking and refuse to boot. Filter all unsupported + * features out. */ if (!msr_info->host_initiated && vmx->nested.enlightened_vmcs_enabled) @@ -3077,7 +3078,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } /* depends on vcpu->arch.cr0 to be set to a new value */ - vmx->emulation_required = emulation_required(vcpu); + vmx->emulation_required = vmx_emulation_required(vcpu); } static int vmx_get_max_tdp_level(void) @@ -3330,7 +3331,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int { __vmx_set_segment(vcpu, var, seg); - to_vmx(vcpu)->emulation_required = emulation_required(vcpu); + to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -5561,9 +5562,13 @@ static int handle_encls(struct kvm_vcpu *vcpu) static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) { - vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; - vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; - return 0; + /* + * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK + * VM-Exits. Unconditionally set the flag here and leave the handling to + * vmx_handle_exit(). + */ + to_vmx(vcpu)->exit_reason.bus_lock_detected = true; + return 1; } /* @@ -6050,9 +6055,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) int ret = __vmx_handle_exit(vcpu, exit_fastpath); /* - * Even when current exit reason is handled by KVM internally, we - * still need to exit to user space when bus lock detected to inform - * that there is a bus lock in guest. + * Exit to user space when bus lock detected to inform that there is + * a bus lock in guest. */ if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { if (ret > 0) @@ -6301,18 +6305,13 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) /* * If we are running L2 and L1 has a new pending interrupt - * which can be injected, we should re-evaluate - * what should be done with this new L1 interrupt. - * If L1 intercepts external-interrupts, we should - * exit from L2 to L1. Otherwise, interrupt should be - * delivered directly to L2. + * which can be injected, this may cause a vmexit or it may + * be injected into L2. Either way, this interrupt will be + * processed via KVM_REQ_EVENT, not RVI, because we do not use + * virtual interrupt delivery to inject L1 interrupts into L2. */ - if (is_guest_mode(vcpu) && max_irr_updated) { - if (nested_exit_on_intr(vcpu)) - kvm_vcpu_exiting_guest_mode(vcpu); - else - kvm_make_request(KVM_REQ_EVENT, vcpu); - } + if (is_guest_mode(vcpu) && max_irr_updated) + kvm_make_request(KVM_REQ_EVENT, vcpu); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); } @@ -6621,10 +6620,24 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->soft_vnmi_blocked)) vmx->loaded_vmcs->entry_time = ktime_get(); - /* Don't enter VMX if guest state is invalid, let the exit handler - start emulation until we arrive back to a valid state */ - if (vmx->emulation_required) + /* + * Don't enter VMX if guest state is invalid, let the exit handler + * start emulation until we arrive back to a valid state. Synthesize a + * consistency check VM-Exit due to invalid guest state and bail. + */ + if (unlikely(vmx->emulation_required)) { + + /* We don't emulate invalid state of a nested guest */ + vmx->fail = is_guest_mode(vcpu); + + vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; + vmx->exit_reason.failed_vmentry = 1; + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); + vmx->exit_qualification = ENTRY_FAIL_DEFAULT; + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); + vmx->exit_intr_info = 0; return EXIT_FASTPATH_NONE; + } trace_kvm_entry(vcpu); @@ -6833,7 +6846,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) */ tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (tsx_ctrl) - vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR; } err = alloc_loaded_vmcs(&vmx->vmcs01); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 4858c5fd95f2..592217fd7d92 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -248,12 +248,8 @@ struct vcpu_vmx { * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to * be loaded into hardware if those conditions aren't met. - * nr_active_uret_msrs tracks the number of MSRs that need to be loaded - * into hardware when running the guest. guest_uret_msrs[] is resorted - * whenever the number of "active" uret MSRs is modified. */ struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; - int nr_active_uret_msrs; bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; @@ -359,6 +355,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); +bool vmx_emulation_required(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 28ef14155726..2686f2edb47c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -68,7 +68,9 @@ #include <asm/mce.h> #include <asm/pkru.h> #include <linux/kernel_stat.h> -#include <asm/fpu/internal.h> /* Ugh! */ +#include <asm/fpu/api.h> +#include <asm/fpu/xcr.h> +#include <asm/fpu/xstate.h> #include <asm/pvclock.h> #include <asm/div64.h> #include <asm/irq_remapping.h> @@ -293,8 +295,6 @@ u64 __read_mostly host_xcr0; u64 __read_mostly supported_xcr0; EXPORT_SYMBOL_GPL(supported_xcr0); -static struct kmem_cache *x86_fpu_cache; - static struct kmem_cache *x86_emulator_cache; /* @@ -1332,6 +1332,13 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; @@ -2535,7 +2542,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); + raw_spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); if (!matched) { kvm->arch.nr_vcpus_matched_tsc = 0; } else if (!already_matched) { @@ -2543,7 +2550,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) } kvm_track_tsc_matching(vcpu); - spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); + raw_spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); } static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, @@ -2773,9 +2780,9 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) kvm_make_mclock_inprogress_request(kvm); /* no guest entries from this point */ - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -2793,15 +2800,15 @@ u64 get_kvmclock_ns(struct kvm *kvm) unsigned long flags; u64 ret; - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); if (!ka->use_master_clock) { - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); return get_kvmclock_base_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); @@ -2895,13 +2902,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); use_master_clock = ka->use_master_clock; if (use_master_clock) { host_tsc = ka->master_cycle_now; kernel_ns = ka->master_kernel_ns; } - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -2969,7 +2976,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_set) kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); - if (v == kvm_get_vcpu(v->kvm, 0)) + if (!v->vcpu_idx) kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } @@ -4693,144 +4700,27 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } -#define XSTATE_COMPACTION_ENABLED (1ULL << 63) - -static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) -{ - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = xsave->header.xfeatures; - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(dest, xsave, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV */ - xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; - *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; - - /* - * Copy each region from the possibly compacted offset to the - * non-compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u32 size, offset, ecx, edx; - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - void *src; - - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - - if (xfeature_nr == XFEATURE_PKRU) { - memcpy(dest + offset, &vcpu->arch.pkru, - sizeof(vcpu->arch.pkru)); - } else { - src = get_xsave_addr(xsave, xfeature_nr); - if (src) - memcpy(dest + offset, src, size); - } - - valid -= xfeature_mask; - } -} - -static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) -{ - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(xsave, src, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV and possibly XCOMP_BV. */ - xsave->header.xfeatures = xstate_bv; - if (boot_cpu_has(X86_FEATURE_XSAVES)) - xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Copy each region from the non-compacted offset to the - * possibly compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u32 size, offset, ecx, edx; - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - - if (xfeature_nr == XFEATURE_PKRU) { - memcpy(&vcpu->arch.pkru, src + offset, - sizeof(vcpu->arch.pkru)); - } else { - void *dest = get_xsave_addr(xsave, xfeature_nr); - - if (dest) - memcpy(dest, src + offset, size); - } - - valid -= xfeature_mask; - } -} - static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return; - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - memset(guest_xsave, 0, sizeof(struct kvm_xsave)); - fill_xsave((u8 *) guest_xsave->region, vcpu); - } else { - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu->state.fxsave, - sizeof(struct fxregs_state)); - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XFEATURE_MASK_FPSSE; - } + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + guest_xsave->region, + sizeof(guest_xsave->region), + vcpu->arch.pkru); } -#define XSAVE_MXCSR_OFFSET 24 - static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - u64 xstate_bv; - u32 mxcsr; - - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; - xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; - - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - /* - * Here we allow setting states that are not present in - * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility - * with old userspace. - */ - if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - load_xsave(vcpu, (u8 *)guest_xsave->region); - } else { - if (xstate_bv & ~XFEATURE_MASK_FPSSE || - mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - memcpy(&vcpu->arch.guest_fpu->state.fxsave, - guest_xsave->region, sizeof(struct fxregs_state)); - } - return 0; + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, + guest_xsave->region, + supported_xcr0, &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -6093,13 +5983,13 @@ set_pit2_out: * is slightly ahead) here we risk going negative on unsigned * 'system_time' when 'user_ns.clock' is very small. */ - spin_lock_irq(&ka->pvclock_gtod_sync_lock); + raw_spin_lock_irq(&ka->pvclock_gtod_sync_lock); if (kvm->arch.use_master_clock) now_ns = ka->master_kernel_ns; else now_ns = get_kvmclock_base_ns(); ka->kvmclock_offset = user_ns.clock - now_ns; - spin_unlock_irq(&ka->pvclock_gtod_sync_lock); + raw_spin_unlock_irq(&ka->pvclock_gtod_sync_lock); kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); break; @@ -6899,7 +6789,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) } static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, + unsigned short port, unsigned int count, bool in) { vcpu->arch.pio.port = port; @@ -6907,10 +6797,8 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - vcpu->arch.pio.count = 0; + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) return 1; - } vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -6922,26 +6810,39 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, return 0; } -static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) +static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, unsigned int count) { - int ret; + WARN_ON(vcpu->arch.pio.count); + memset(vcpu->arch.pio_data, 0, size * count); + return emulator_pio_in_out(vcpu, size, port, count, true); +} - if (vcpu->arch.pio.count) - goto data_avail; +static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) +{ + int size = vcpu->arch.pio.size; + unsigned count = vcpu->arch.pio.count; + memcpy(val, vcpu->arch.pio_data, size * count); + trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data); + vcpu->arch.pio.count = 0; +} - memset(vcpu->arch.pio_data, 0, size * count); +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) +{ + if (vcpu->arch.pio.count) { + /* Complete previous iteration. */ + } else { + int r = __emulator_pio_in(vcpu, size, port, count); + if (!r) + return r; - ret = emulator_pio_in_out(vcpu, size, port, val, count, true); - if (ret) { -data_avail: - memcpy(val, vcpu->arch.pio_data, size * count); - trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); - vcpu->arch.pio.count = 0; - return 1; + /* Results already available, fall through. */ } - return 0; + WARN_ON(count != vcpu->arch.pio.count); + complete_emulator_pio_in(vcpu, val); + return 1; } static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, @@ -6956,9 +6857,15 @@ static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port, const void *val, unsigned int count) { + int ret; + memcpy(vcpu->arch.pio_data, val, size * count); trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); - return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); + ret = emulator_pio_in_out(vcpu, size, port, count, false); + if (ret) + vcpu->arch.pio.count = 0; + + return ret; } static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, @@ -7658,6 +7565,13 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) /* Process a latched INIT or SMI, if any. */ kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * Even if KVM_SET_SREGS2 loaded PDPTRs out of band, + * on SMM exit we still need to reload them from + * guest memory + */ + vcpu->arch.pdptrs_from_userspace = false; } kvm_mmu_reset_context(vcpu); @@ -8125,9 +8039,9 @@ static void kvm_hyperv_tsc_notifier(void) list_for_each_entry(kvm, &vm_list, vm_list) { struct kvm_arch *ka = &kvm->arch; - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -8403,18 +8317,11 @@ int kvm_arch_init(void *opaque) } r = -ENOMEM; - x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), - __alignof__(struct fpu), SLAB_ACCOUNT, - NULL); - if (!x86_fpu_cache) { - printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n"); - goto out; - } x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { pr_err("kvm: failed to allocate cache for x86 emulator\n"); - goto out_free_x86_fpu_cache; + goto out; } user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); @@ -8452,8 +8359,6 @@ out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); -out_free_x86_fpu_cache: - kmem_cache_destroy(x86_fpu_cache); out: return r; } @@ -8480,7 +8385,6 @@ void kvm_arch_exit(void) kvm_mmu_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); - kmem_cache_destroy(x86_fpu_cache); #ifdef CONFIG_KVM_XEN static_key_deferred_flush(&kvm_xen_enabled); WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); @@ -8769,9 +8673,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); + + /* + * The call to kvm_ready_for_interrupt_injection() may end up in + * kvm_xen_has_interrupt() which may require the srcu lock to be + * held, to protect against changes in the vcpu_info address. + */ + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_run->ready_for_interrupt_injection = pic_in_kernel(vcpu->kvm) || kvm_vcpu_ready_for_interrupt_injection(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (is_smm(vcpu)) kvm_run->flags |= KVM_RUN_X86_SMM; @@ -9629,14 +9541,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; - if (unlikely(kvm_vcpu_exit_request(vcpu))) { + if (vcpu->arch.apicv_active) + static_call(kvm_x86_sync_pir_to_irr)(vcpu); + + if (unlikely(kvm_vcpu_exit_request(vcpu))) { exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; break; } - - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); - } + } /* * Do this here before restoring debug registers on the host. And @@ -9899,58 +9811,21 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } -static void kvm_save_current_fpu(struct fpu *fpu) -{ - /* - * If the target FPU state is not resident in the CPU registers, just - * memcpy() from current, else save CPU state directly to the target. - */ - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&fpu->state, ¤t->thread.fpu.state, - fpu_kernel_xstate_size); - else - save_fpregs_to_fpstate(fpu); -} - /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - kvm_save_current_fpu(vcpu->arch.user_fpu); - /* - * Guests with protected state can't have it set by the hypervisor, - * so skip trying to set it. + * Exclude PKRU from restore as restored separately in + * kvm_x86_ops.run(). */ - if (vcpu->arch.guest_fpu) - /* PKRU is separately restored in kvm_x86_ops.run. */ - __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state, - ~XFEATURE_MASK_PKRU); - - fpregs_mark_activate(); - fpregs_unlock(); - + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - /* - * Guests with protected state can't have it read by the hypervisor, - * so skip trying to save it. - */ - if (vcpu->arch.guest_fpu) - kvm_save_current_fpu(vcpu->arch.guest_fpu); - - restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state); - - fpregs_mark_activate(); - fpregs_unlock(); - + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } @@ -10531,12 +10406,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -10554,12 +10429,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -10612,14 +10487,6 @@ static int sync_regs(struct kvm_vcpu *vcpu) static void fx_init(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.guest_fpu) - return; - - fpstate_init(&vcpu->arch.guest_fpu->state); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = - host_xcr0 | XSTATE_COMPACTION_ENABLED; - /* * Ensure guest xcr0 is valid for loading */ @@ -10628,15 +10495,6 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } -void kvm_free_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.guest_fpu) { - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - vcpu->arch.guest_fpu = NULL; - } -} -EXPORT_SYMBOL_GPL(kvm_free_guest_fpu); - int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) @@ -10652,6 +10510,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) int r; vcpu->arch.last_vmentry_cpu = -1; + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -10691,19 +10551,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (!alloc_emulate_ctxt(vcpu)) goto free_wbinvd_dirty_mask; - vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.user_fpu) { - pr_err("kvm: failed to allocate userspace's fpu\n"); + if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) { + pr_err("kvm: failed to allocate vcpu's fpu\n"); goto free_emulate_ctxt; } - vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.guest_fpu) { - pr_err("kvm: failed to allocate vcpu's fpu\n"); - goto free_user_fpu; - } fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); @@ -10736,9 +10588,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return 0; free_guest_fpu: - kvm_free_guest_fpu(vcpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); free_emulate_ctxt: kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_wbinvd_dirty_mask: @@ -10787,8 +10637,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kvm_free_guest_fpu(vcpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); @@ -10840,8 +10689,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { - void *mpx_state_buffer; + if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; /* * To avoid have the INIT path from kvm_apic_has_events() that be @@ -10849,14 +10698,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (init_event) kvm_put_guest_fpu(vcpu); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDREGS); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDCSR); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); + + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); + if (init_event) kvm_load_guest_fpu(vcpu); } @@ -10893,6 +10738,9 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0xfff0); + vcpu->arch.cr3 = 0; + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + /* * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions * of Intel's SDM list CD/NW as being set on INIT, but they contradict @@ -11139,9 +10987,15 @@ void kvm_arch_free_vm(struct kvm *kvm) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + int ret; + if (type) return -EINVAL; + ret = kvm_page_track_init(kvm); + if (ret) + return ret; + INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); @@ -11157,7 +11011,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); - spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); + raw_spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); pvclock_update_vm_gtod_copy(kvm); @@ -11174,7 +11028,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); - kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); kvm_xen_init_vm(kvm); @@ -11368,7 +11221,8 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, int level = i + 1; int lpages = __kvm_mmu_slot_lpages(slot, npages, level); - WARN_ON(slot->arch.rmap[i]); + if (slot->arch.rmap[i]) + continue; slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); if (!slot->arch.rmap[i]) { @@ -12343,44 +12197,81 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, } EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); -static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port); + +static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu) { - memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data, - vcpu->arch.pio.count * vcpu->arch.pio.size); - vcpu->arch.pio.count = 0; + int size = vcpu->arch.pio.size; + int port = vcpu->arch.pio.port; + vcpu->arch.pio.count = 0; + if (vcpu->arch.sev_pio_count) + return kvm_sev_es_outs(vcpu, size, port); return 1; } static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, - unsigned int port, void *data, unsigned int count) + unsigned int port) { - int ret; - - ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port, - data, count); - if (ret) - return ret; + for (;;) { + unsigned int count = + min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); + int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count); + + /* memcpy done already by emulator_pio_out. */ + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; + if (!ret) + break; - vcpu->arch.pio.count = 0; + /* Emulation done by the kernel. */ + if (!vcpu->arch.sev_pio_count) + return 1; + } + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs; return 0; } static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, - unsigned int port, void *data, unsigned int count) + unsigned int port); + +static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu) { - int ret; + unsigned count = vcpu->arch.pio.count; + complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; +} - ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port, - data, count); - if (ret) { - vcpu->arch.pio.count = 0; - } else { - vcpu->arch.guest_ins_data = data; - vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; +static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +{ + int size = vcpu->arch.pio.size; + int port = vcpu->arch.pio.port; + + advance_sev_es_emulated_ins(vcpu); + if (vcpu->arch.sev_pio_count) + return kvm_sev_es_ins(vcpu, size, port); + return 1; +} + +static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port) +{ + for (;;) { + unsigned int count = + min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); + if (!__emulator_pio_in(vcpu, size, port, count)) + break; + + /* Emulation done by the kernel. */ + advance_sev_es_emulated_ins(vcpu); + if (!vcpu->arch.sev_pio_count) + return 1; } + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; return 0; } @@ -12388,8 +12279,10 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port, void *data, unsigned int count, int in) { - return in ? kvm_sev_es_ins(vcpu, size, port, data, count) - : kvm_sev_es_outs(vcpu, size, port, data, count); + vcpu->arch.sev_pio_data = data; + vcpu->arch.sev_pio_count = count; + return in ? kvm_sev_es_ins(vcpu, size, port) + : kvm_sev_es_outs(vcpu, size, port); } EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 9ea9c3dabe37..8f62baebd028 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -190,6 +190,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) int __kvm_xen_has_interrupt(struct kvm_vcpu *v) { + int err; u8 rc = 0; /* @@ -216,13 +217,29 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) if (likely(slots->generation == ghc->generation && !kvm_is_error_hva(ghc->hva) && ghc->memslot)) { /* Fast path */ - __get_user(rc, (u8 __user *)ghc->hva + offset); - } else { - /* Slow path */ - kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, - sizeof(rc)); + pagefault_disable(); + err = __get_user(rc, (u8 __user *)ghc->hva + offset); + pagefault_enable(); + if (!err) + return rc; } + /* Slow path */ + + /* + * This function gets called from kvm_vcpu_block() after setting the + * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately + * from a HLT. So we really mustn't sleep. If the page ended up absent + * at that point, just return 1 in order to trigger an immediate wake, + * and we'll end up getting called again from a context where we *can* + * fault in the page and wait for it. + */ + if (in_atomic() || !task_is_running(current)) + return 1; + + kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, + sizeof(rc)); + return rc; } diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S index e5f77e293034..7334055157ba 100644 --- a/arch/x86/lib/copy_mc_64.S +++ b/arch/x86/lib/copy_mc_64.S @@ -107,9 +107,9 @@ SYM_FUNC_END(copy_mc_fragile) .previous - _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes) - _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words) - _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes) + _ASM_EXTABLE_TYPE(.L_read_leading_bytes, .E_leading_bytes, EX_TYPE_DEFAULT_MCE_SAFE) + _ASM_EXTABLE_TYPE(.L_read_words, .E_read_words, EX_TYPE_DEFAULT_MCE_SAFE) + _ASM_EXTABLE_TYPE(.L_read_trailing_bytes, .E_trailing_bytes, EX_TYPE_DEFAULT_MCE_SAFE) _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes) _ASM_EXTABLE(.L_write_words, .E_write_words) _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes) @@ -149,5 +149,5 @@ SYM_FUNC_END(copy_mc_enhanced_fast_string) .previous - _ASM_EXTABLE_FAULT(.L_copy, .E_copy) + _ASM_EXTABLE_TYPE(.L_copy, .E_copy, EX_TYPE_DEFAULT_MCE_SAFE) #endif /* !CONFIG_UML */ diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 57b79c577496..2797e630b9b1 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -234,24 +234,11 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) */ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) movl %edx,%ecx - cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */ - je 3f 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret - /* - * Return zero to pretend that this copy succeeded. This - * is counter-intuitive, but needed to prevent the code - * in lib/iov_iter.c from retrying and running back into - * the poison cache line again. The machine check handler - * will ensure that a SIGBUS is sent to the task. - */ -3: xorl %eax,%eax - ASM_CLAC - ret - _ASM_EXTABLE_CPY(1b, 2b) SYM_CODE_END(.Lcopy_user_handle_tail) diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 058f19b20465..55e371cc69fd 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -13,6 +13,7 @@ #endif #include <asm/inat.h> /*__ignore_sync_check__ */ #include <asm/insn.h> /* __ignore_sync_check__ */ +#include <asm/unaligned.h> /* __ignore_sync_check__ */ #include <linux/errno.h> #include <linux/kconfig.h> @@ -37,10 +38,10 @@ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ - ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); leXX_to_cpu(t, r); }) + ({ t r = get_unaligned((t *)(insn)->next_byte); (insn)->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ - ({ t r = *(t*)((insn)->next_byte + n); leXX_to_cpu(t, r); }) + ({ t r = get_unaligned((t *)(insn)->next_byte + n); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index ec9922cba30a..cf0b39f97adc 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -28,46 +28,14 @@ .macro THUNK reg - .align 32 - -SYM_FUNC_START(__x86_indirect_thunk_\reg) + .align RETPOLINE_THUNK_SIZE +SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD -SYM_FUNC_END(__x86_indirect_thunk_\reg) - -.endm - -/* - * This generates .altinstr_replacement symbols for use by objtool. They, - * however, must not actually live in .altinstr_replacement since that will be - * discarded after init, but module alternatives will also reference these - * symbols. - * - * Their names matches the "__x86_indirect_" prefix to mark them as retpolines. - */ -.macro ALT_THUNK reg - - .align 1 - -SYM_FUNC_START_NOALIGN(__x86_indirect_alt_call_\reg) - ANNOTATE_RETPOLINE_SAFE -1: call *%\reg -2: .skip 5-(2b-1b), 0x90 -SYM_FUNC_END(__x86_indirect_alt_call_\reg) - -STACK_FRAME_NON_STANDARD(__x86_indirect_alt_call_\reg) - -SYM_FUNC_START_NOALIGN(__x86_indirect_alt_jmp_\reg) - ANNOTATE_RETPOLINE_SAFE -1: jmp *%\reg -2: .skip 5-(2b-1b), 0x90 -SYM_FUNC_END(__x86_indirect_alt_jmp_\reg) - -STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg) - .endm /* @@ -85,22 +53,16 @@ STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg) #define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) #define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) -#undef GEN + .align RETPOLINE_THUNK_SIZE +SYM_CODE_START(__x86_indirect_thunk_array) + #define GEN(reg) THUNK reg #include <asm/GEN-for-each-reg.h> - #undef GEN -#define GEN(reg) EXPORT_THUNK(reg) -#include <asm/GEN-for-each-reg.h> -#undef GEN -#define GEN(reg) ALT_THUNK reg -#include <asm/GEN-for-each-reg.h> + .align RETPOLINE_THUNK_SIZE +SYM_CODE_END(__x86_indirect_thunk_array) -#undef GEN -#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_call_ ## reg) +#define GEN(reg) EXPORT_THUNK(reg) #include <asm/GEN-for-each-reg.h> - #undef GEN -#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_jmp_ ## reg) -#include <asm/GEN-for-each-reg.h> diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index d15fdae9656e..53b3f202267c 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -11,6 +11,7 @@ * strings. */ +#define __NO_FORTIFY #include <linux/string.h> #include <linux/export.h> diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 034748459482..d62662bdd460 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -53,7 +53,7 @@ void fpstate_init_soft(struct swregs_state *soft) void finit(void) { - fpstate_init_soft(¤t->thread.fpu.state.soft); + fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); } /* diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 8679a9d6c47f..7fe56c594aa6 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -31,7 +31,7 @@ #include <linux/uaccess.h> #include <asm/traps.h> #include <asm/user.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include "fpu_system.h" #include "fpu_emu.h" @@ -640,7 +640,7 @@ int fpregs_soft_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct swregs_state *s387 = &target->thread.fpu.state.soft; + struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; void *space = s387->st_space; int ret; int offset, other, i, tags, regnr, tag, newtop; @@ -691,7 +691,7 @@ int fpregs_soft_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct swregs_state *s387 = &target->thread.fpu.state.soft; + struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; const void *space = s387->st_space; int offset = (S387->ftop & 7) * 10, other = 80 - offset; diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 9b41391867dc..eec3e4805c75 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -73,7 +73,7 @@ static inline bool seg_writable(struct desc_struct *d) return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; } -#define I387 (¤t->thread.fpu.state) +#define I387 (¤t->thread.fpu.fpstate->regs) #define FPU_info (I387->soft.info) #define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index e1664e9f969c..5cd2a88930a9 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -4,46 +4,30 @@ #include <linux/sched/debug.h> #include <xen/xen.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/sev.h> #include <asm/traps.h> #include <asm/kdebug.h> -typedef bool (*ex_handler_t)(const struct exception_table_entry *, - struct pt_regs *, int, unsigned long, - unsigned long); - static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } -static inline ex_handler_t -ex_fixup_handler(const struct exception_table_entry *x) -{ - return (ex_handler_t)((unsigned long)&x->handler + x->handler); -} -__visible bool ex_handler_default(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_default(const struct exception_table_entry *fixup, + struct pt_regs *regs) { regs->ip = ex_fixup_addr(fixup); return true; } -EXPORT_SYMBOL(ex_handler_default); -__visible bool ex_handler_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { - regs->ip = ex_fixup_addr(fixup); regs->ax = trapnr; - return true; + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL_GPL(ex_handler_fault); /* * Handler for when we fail to restore a task's FPU state. We should never get @@ -55,65 +39,47 @@ EXPORT_SYMBOL_GPL(ex_handler_fault); * of vulnerability by restoring from the initial state (essentially, zeroing * out all the FPU registers) if we can't restore from the task's FPU state. */ -__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs) { regs->ip = ex_fixup_addr(fixup); WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); - __restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); + fpu_reset_from_exception_fixup(); return true; } -EXPORT_SYMBOL_GPL(ex_handler_fprestore); -__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_uaccess(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - regs->ip = ex_fixup_addr(fixup); - return true; + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_uaccess); -__visible bool ex_handler_copy(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_copy(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - regs->ip = ex_fixup_addr(fixup); - regs->ax = trapnr; - return true; + return ex_handler_fault(fixup, regs, trapnr); } -EXPORT_SYMBOL(ex_handler_copy); -__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) show_stack_regs(regs); /* Pretend that the read succeeded and returned 0. */ - regs->ip = ex_fixup_addr(fixup); regs->ax = 0; regs->dx = 0; - return true; + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); -__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, @@ -121,45 +87,29 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup show_stack_regs(regs); /* Pretend that the write succeeded. */ - regs->ip = ex_fixup_addr(fixup); - return true; + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); -__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_clear_fs(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (static_cpu_has(X86_BUG_NULL_SEG)) asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); asm volatile ("mov %0, %%fs" : : "rm" (0)); - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_clear_fs); -enum handler_type ex_get_fault_handler_type(unsigned long ip) +int ex_get_fixup_type(unsigned long ip) { - const struct exception_table_entry *e; - ex_handler_t handler; + const struct exception_table_entry *e = search_exception_tables(ip); - e = search_exception_tables(ip); - if (!e) - return EX_HANDLER_NONE; - handler = ex_fixup_handler(e); - if (handler == ex_handler_fault) - return EX_HANDLER_FAULT; - else if (handler == ex_handler_uaccess || handler == ex_handler_copy) - return EX_HANDLER_UACCESS; - else - return EX_HANDLER_OTHER; + return e ? e->type : EX_TYPE_NONE; } int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) { const struct exception_table_entry *e; - ex_handler_t handler; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -179,8 +129,35 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, if (!e) return 0; - handler = ex_fixup_handler(e); - return handler(e, regs, trapnr, error_code, fault_addr); + switch (e->type) { + case EX_TYPE_DEFAULT: + case EX_TYPE_DEFAULT_MCE_SAFE: + return ex_handler_default(e, regs); + case EX_TYPE_FAULT: + case EX_TYPE_FAULT_MCE_SAFE: + return ex_handler_fault(e, regs, trapnr); + case EX_TYPE_UACCESS: + return ex_handler_uaccess(e, regs, trapnr); + case EX_TYPE_COPY: + return ex_handler_copy(e, regs, trapnr); + case EX_TYPE_CLEAR_FS: + return ex_handler_clear_fs(e, regs); + case EX_TYPE_FPU_RESTORE: + return ex_handler_fprestore(e, regs); + case EX_TYPE_RDMSR: + return ex_handler_rdmsr_unsafe(e, regs); + case EX_TYPE_WRMSR: + return ex_handler_wrmsr_unsafe(e, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(e, regs); + case EX_TYPE_RDMSR_IN_MCE: + ex_handler_msr_mce(regs, false); + break; + case EX_TYPE_WRMSR_IN_MCE: + ex_handler_msr_mce(regs, true); + break; + } + BUG(); } extern unsigned int early_recursion_flag; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b2eefdefc108..84a2c8c4af73 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -710,7 +710,8 @@ oops: static noinline void kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int signal, int si_code) + unsigned long address, int signal, int si_code, + u32 pkey) { WARN_ON_ONCE(user_mode(regs)); @@ -735,8 +736,12 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, set_signal_archinfo(address, error_code); - /* XXX: hwpoison faults will set the wrong code. */ - force_sig_fault(signal, si_code, (void __user *)address); + if (si_code == SEGV_PKUERR) { + force_sig_pkuerr((void __user *)address, pkey); + } else { + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_fault(signal, si_code, (void __user *)address); + } } /* @@ -798,7 +803,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; if (!user_mode(regs)) { - kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGSEGV, si_code, pkey); return; } @@ -930,7 +936,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { - kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } @@ -1396,7 +1403,8 @@ good_area: */ if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, - SIGBUS, BUS_ADRERR); + SIGBUS, BUS_ADRERR, + ARCH_DEFAULT_PKEY); return; } @@ -1416,7 +1424,8 @@ good_area: return; if (fatal_signal_pending(current) && !user_mode(regs)) { - kernelmode_fixup_or_oops(regs, error_code, address, 0, 0); + kernelmode_fixup_or_oops(regs, error_code, address, + 0, 0, ARCH_DEFAULT_PKEY); return; } @@ -1424,7 +1433,8 @@ good_area: /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, - SIGSEGV, SEGV_MAPERR); + SIGSEGV, SEGV_MAPERR, + ARCH_DEFAULT_PKEY); return; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a6e11763763f..36098226a957 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1432,18 +1432,18 @@ int kern_addr_valid(unsigned long addr) return 0; p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) + if (!p4d_present(*p4d)) return 0; pud = pud_offset(p4d, addr); - if (pud_none(*pud)) + if (!pud_present(*pud)) return 0; if (pud_large(*pud)) return pfn_valid(pud_pfn(*pud)); pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) return 0; if (pmd_large(*pmd)) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 60ade7dd71bd..026031b3b782 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -14,7 +14,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mmiotrace.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <linux/efi.h> #include <linux/pgtable.h> @@ -92,7 +92,7 @@ static unsigned int __ioremap_check_ram(struct resource *res) */ static unsigned int __ioremap_check_encrypted(struct resource *res) { - if (!sev_active()) + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return 0; switch (res->desc) { @@ -112,7 +112,7 @@ static unsigned int __ioremap_check_encrypted(struct resource *res) */ static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *desc) { - if (!sev_active()) + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; if (!IS_ENABLED(CONFIG_EFI)) @@ -508,6 +508,7 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) memunmap((void *)((unsigned long)addr & PAGE_MASK)); } +#ifdef CONFIG_AMD_MEM_ENCRYPT /* * Examine the physical address to determine if it is an area of memory * that should be mapped decrypted. If the memory is not part of the @@ -555,7 +556,7 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr, case E820_TYPE_NVS: case E820_TYPE_UNUSABLE: /* For SEV, these areas are encrypted */ - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) break; fallthrough; @@ -693,7 +694,7 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, unsigned long flags) { - if (!mem_encrypt_active()) + if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) return true; if (flags & MEMREMAP_ENC) @@ -702,7 +703,7 @@ bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, if (flags & MEMREMAP_DEC) return false; - if (sme_active()) { + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { if (memremap_is_setup_data(phys_addr, size) || memremap_is_efi_data(phys_addr, size)) return false; @@ -723,12 +724,12 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, { bool encrypted_prot; - if (!mem_encrypt_active()) + if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) return prot; encrypted_prot = true; - if (sme_active()) { + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { if (early_memremap_is_setup_data(phys_addr, size) || memremap_is_efi_data(phys_addr, size)) encrypted_prot = false; @@ -746,7 +747,6 @@ bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) return arch_memremap_can_ram_remap(phys_addr, size, 0); } -#ifdef CONFIG_AMD_MEM_ENCRYPT /* Remap memory with encryption */ void __init *early_memremap_encrypted(resource_size_t phys_addr, unsigned long size) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 1a50434c8a4d..ef885370719a 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -49,8 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, p = early_alloc(PMD_SIZE, nid, false); if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) return; - else if (p) - memblock_free(__pa(p), PMD_SIZE); + memblock_free_ptr(p, PMD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); @@ -86,8 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, p = early_alloc(PUD_SIZE, nid, false); if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) return; - else if (p) - memblock_free(__pa(p), PUD_SIZE); + memblock_free_ptr(p, PUD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index ff08dc463634..23d54b810f08 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -20,6 +20,7 @@ #include <linux/bitops.h> #include <linux/dma-mapping.h> #include <linux/virtio_config.h> +#include <linux/cc_platform.h> #include <asm/tlbflush.h> #include <asm/fixmap.h> @@ -143,7 +144,7 @@ void __init sme_unmap_bootdata(char *real_mode_data) struct boot_params *boot_data; unsigned long cmdline_paddr; - if (!sme_active()) + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return; /* Get the command line address before unmapping the real_mode_data */ @@ -163,7 +164,7 @@ void __init sme_map_bootdata(char *real_mode_data) struct boot_params *boot_data; unsigned long cmdline_paddr; - if (!sme_active()) + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return; __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); @@ -193,7 +194,7 @@ void __init sme_early_init(void) for (i = 0; i < ARRAY_SIZE(protection_map); i++) protection_map[i] = pgprot_encrypted(protection_map[i]); - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) swiotlb_force = SWIOTLB_FORCE; } @@ -202,7 +203,7 @@ void __init sev_setup_arch(void) phys_addr_t total_mem = memblock_phys_mem_size(); unsigned long size; - if (!sev_active()) + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; /* @@ -360,42 +361,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) return early_set_memory_enc_dec(vaddr, size, true); } -/* - * SME and SEV are very similar but they are not the same, so there are - * times that the kernel will need to distinguish between SME and SEV. The - * sme_active() and sev_active() functions are used for this. When a - * distinction isn't needed, the mem_encrypt_active() function can be used. - * - * The trampoline code is a good example for this requirement. Before - * paging is activated, SME will access all memory as decrypted, but SEV - * will access all memory as encrypted. So, when APs are being brought - * up under SME the trampoline area cannot be encrypted, whereas under SEV - * the trampoline area must be encrypted. - */ -bool sev_active(void) -{ - return sev_status & MSR_AMD64_SEV_ENABLED; -} - -bool sme_active(void) -{ - return sme_me_mask && !sev_active(); -} -EXPORT_SYMBOL_GPL(sev_active); - -/* Needs to be called from non-instrumentable code */ -bool noinstr sev_es_active(void) -{ - return sev_status & MSR_AMD64_SEV_ES_ENABLED; -} - /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) { /* * For SEV, all DMA must be to unencrypted addresses. */ - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return true; /* @@ -403,7 +375,7 @@ bool force_dma_unencrypted(struct device *dev) * device does not support DMA to addresses that include the * encryption mask. */ - if (sme_active()) { + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask)); u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); @@ -428,7 +400,7 @@ void __init mem_encrypt_free_decrypted_mem(void) * The unused memory range was mapped decrypted, change the encryption * attribute from decrypted to encrypted before freeing it. */ - if (mem_encrypt_active()) { + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { r = set_memory_encrypted(vaddr, npages); if (r) { pr_warn("failed to free unused decrypted pages\n"); @@ -444,7 +416,7 @@ static void print_mem_encrypt_feature_info(void) pr_info("AMD Memory Encryption Features active:"); /* Secure Memory Encryption */ - if (sme_active()) { + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { /* * SME is mutually exclusive with any of the SEV * features below. @@ -454,11 +426,11 @@ static void print_mem_encrypt_feature_info(void) } /* Secure Encrypted Virtualization */ - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) pr_cont(" SEV"); /* Encrypted Register State */ - if (sev_es_active()) + if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) pr_cont(" SEV-ES"); pr_cont("\n"); @@ -477,7 +449,8 @@ void __init mem_encrypt_init(void) * With SEV, we need to unroll the rep string I/O instructions, * but SEV-ES supports them through the #VC handler. */ - if (sev_active() && !sev_es_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && + !cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) static_branch_enable(&sev_enable_key); print_mem_encrypt_feature_info(); @@ -485,6 +458,6 @@ void __init mem_encrypt_init(void) int arch_has_restricted_virtio_memory_access(void) { - return sev_active(); + return cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT); } EXPORT_SYMBOL_GPL(arch_has_restricted_virtio_memory_access); diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 470b20208430..3f0abb403340 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -27,9 +27,19 @@ #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS +/* + * This code runs before CPU feature bits are set. By default, the + * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if + * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5 + * is provided to handle this situation and, instead, use a variable that + * has been set by the early boot code. + */ +#define USE_EARLY_PGTABLE_L5 + #include <linux/kernel.h> #include <linux/mm.h> #include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <asm/setup.h> #include <asm/sections.h> @@ -287,7 +297,13 @@ void __init sme_encrypt_kernel(struct boot_params *bp) unsigned long pgtable_area_len; unsigned long decrypted_base; - if (!sme_active()) + /* + * This is early code, use an open coded check for SME instead of + * using cc_platform_has(). This eliminates worries about removing + * instrumentation or checking boot_cpu_data in the cc_platform_has() + * function. + */ + if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED) return; /* diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index a1b5c71099e6..1e9b93b088db 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -355,7 +355,7 @@ void __init numa_reset_distance(void) /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) - memblock_free(__pa(numa_distance), size); + memblock_free_ptr(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; /* enable table creation */ } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 737491b13728..e801e30089c4 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -517,8 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) } /* free the copied physical distance table */ - if (phys_dist) - memblock_free(__pa(phys_dist), phys_size); + memblock_free_ptr(phys_dist, phys_size); return; no_emu: diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 3112ca7786ed..4ba2a3ee4bce 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -583,7 +583,12 @@ int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type, int err = 0; start = sanitize_phys(start); - end = sanitize_phys(end); + + /* + * The end address passed into this function is exclusive, but + * sanitize_phys() expects an inclusive address. + */ + end = sanitize_phys(end - 1) + 1; if (start >= end) { WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__, start, end - 1, cattr_name(req_type)); diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index ad8a5c586a35..527957586f3c 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -18,6 +18,7 @@ #include <linux/libnvdimm.h> #include <linux/vmstat.h> #include <linux/kernel.h> +#include <linux/cc_platform.h> #include <asm/e820/api.h> #include <asm/processor.h> @@ -1986,7 +1987,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) int ret; /* Nothing to do if memory encryption is not active */ - if (!mem_encrypt_active()) + if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) return 0; /* Should not be working on unaligned addresses */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0fe6aacef3db..8796559f62a4 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -15,7 +15,6 @@ #include <asm/set_memory.h> #include <asm/nospec-branch.h> #include <asm/text-patching.h> -#include <asm/asm-prototypes.h> static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -225,6 +224,14 @@ static void jit_fill_hole(void *area, unsigned int size) struct jit_context { int cleanup_addr; /* Epilogue code offset */ + + /* + * Program specific offsets of labels in the code; these rely on the + * JIT doing at least 2 passes, recording the position on the first + * pass, only to generate the correct offset on the second pass. + */ + int tail_call_direct_label; + int tail_call_indirect_label; }; /* Maximum number of bytes emitted while JITing one eBPF insn */ @@ -380,20 +387,23 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); } -static int get_pop_bytes(bool *callee_regs_used) +#define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8) + +static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) { - int bytes = 0; + u8 *prog = *pprog; - if (callee_regs_used[3]) - bytes += 2; - if (callee_regs_used[2]) - bytes += 2; - if (callee_regs_used[1]) - bytes += 2; - if (callee_regs_used[0]) - bytes += 1; +#ifdef CONFIG_RETPOLINE + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + EMIT_LFENCE(); + EMIT2(0xFF, 0xE0 + reg); + } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { + emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); + } else +#endif + EMIT2(0xFF, 0xE0 + reg); - return bytes; + *pprog = prog; } /* @@ -411,29 +421,12 @@ static int get_pop_bytes(bool *callee_regs_used) * out: */ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, - u32 stack_depth) + u32 stack_depth, u8 *ip, + struct jit_context *ctx) { int tcc_off = -4 - round_up(stack_depth, 8); - u8 *prog = *pprog; - int pop_bytes = 0; - int off1 = 42; - int off2 = 31; - int off3 = 9; - - /* count the additional bytes used for popping callee regs from stack - * that need to be taken into account for each of the offsets that - * are used for bailing out of the tail call - */ - pop_bytes = get_pop_bytes(callee_regs_used); - off1 += pop_bytes; - off2 += pop_bytes; - off3 += pop_bytes; - - if (stack_depth) { - off1 += 7; - off2 += 7; - off3 += 7; - } + u8 *prog = *pprog, *start = *pprog; + int offset; /* * rdi - pointer to ctx @@ -448,8 +441,9 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 (off1 + RETPOLINE_RCX_BPF_JIT_SIZE) /* Number of bytes to jump */ - EMIT2(X86_JBE, OFFSET1); /* jbe out */ + + offset = ctx->tail_call_indirect_label - (prog + 2 - start); + EMIT2(X86_JBE, offset); /* jbe out */ /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) @@ -457,8 +451,9 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, */ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 (off2 + RETPOLINE_RCX_BPF_JIT_SIZE) - EMIT2(X86_JA, OFFSET2); /* ja out */ + + offset = ctx->tail_call_indirect_label - (prog + 2 - start); + EMIT2(X86_JA, offset); /* ja out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ @@ -471,12 +466,11 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, * goto out; */ EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */ -#define OFFSET3 (off3 + RETPOLINE_RCX_BPF_JIT_SIZE) - EMIT2(X86_JE, OFFSET3); /* je out */ - *pprog = prog; - pop_callee_regs(pprog, callee_regs_used); - prog = *pprog; + offset = ctx->tail_call_indirect_label - (prog + 2 - start); + EMIT2(X86_JE, offset); /* je out */ + + pop_callee_regs(&prog, callee_regs_used); EMIT1(0x58); /* pop rax */ if (stack_depth) @@ -493,41 +487,21 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, * rdi == ctx (1st arg) * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET */ - RETPOLINE_RCX_BPF_JIT(); + emit_indirect_jump(&prog, 1 /* rcx */, ip + (prog - start)); /* out: */ + ctx->tail_call_indirect_label = prog - start; *pprog = prog; } static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, - u8 **pprog, int addr, u8 *image, - bool *callee_regs_used, u32 stack_depth) + u8 **pprog, u8 *ip, + bool *callee_regs_used, u32 stack_depth, + struct jit_context *ctx) { int tcc_off = -4 - round_up(stack_depth, 8); - u8 *prog = *pprog; - int pop_bytes = 0; - int off1 = 20; - int poke_off; - - /* count the additional bytes used for popping callee regs to stack - * that need to be taken into account for jump offset that is used for - * bailing out from of the tail call when limit is reached - */ - pop_bytes = get_pop_bytes(callee_regs_used); - off1 += pop_bytes; - - /* - * total bytes for: - * - nop5/ jmpq $off - * - pop callee regs - * - sub rsp, $val if depth > 0 - * - pop rax - */ - poke_off = X86_PATCH_SIZE + pop_bytes + 1; - if (stack_depth) { - poke_off += 7; - off1 += 7; - } + u8 *prog = *pprog, *start = *pprog; + int offset; /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) @@ -535,28 +509,30 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, */ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ - EMIT2(X86_JA, off1); /* ja out */ + + offset = ctx->tail_call_direct_label - (prog + 2 - start); + EMIT2(X86_JA, offset); /* ja out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ - poke->tailcall_bypass = image + (addr - poke_off - X86_PATCH_SIZE); + poke->tailcall_bypass = ip + (prog - start); poke->adj_off = X86_TAIL_CALL_OFFSET; - poke->tailcall_target = image + (addr - X86_PATCH_SIZE); + poke->tailcall_target = ip + ctx->tail_call_direct_label - X86_PATCH_SIZE; poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE; emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, poke->tailcall_bypass); - *pprog = prog; - pop_callee_regs(pprog, callee_regs_used); - prog = *pprog; + pop_callee_regs(&prog, callee_regs_used); EMIT1(0x58); /* pop rax */ if (stack_depth) EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); memcpy(prog, x86_nops[5], X86_PATCH_SIZE); prog += X86_PATCH_SIZE; + /* out: */ + ctx->tail_call_direct_label = prog - start; *pprog = prog; } @@ -827,9 +803,7 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, return 0; } -static bool ex_handler_bpf(const struct exception_table_entry *x, - struct pt_regs *regs, int trapnr, - unsigned long error_code, unsigned long fault_addr) +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { u32 reg = x->fixup >> 8; @@ -1222,8 +1196,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, /* speculation barrier */ case BPF_ST | BPF_NOSPEC: if (boot_cpu_has(X86_FEATURE_XMM2)) - /* Emit 'lfence' */ - EMIT3(0x0F, 0xAE, 0xE8); + EMIT_LFENCE(); break; /* ST: *(u8*)(dst_reg + off) = imm */ @@ -1313,12 +1286,7 @@ st: if (is_imm8(insn->off)) } ex->insn = delta; - delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler; - if (!is_simm32(delta)) { - pr_err("extable->handler doesn't fit into 32-bit\n"); - return -EFAULT; - } - ex->handler = delta; + ex->type = EX_TYPE_BPF; if (dst_reg > BPF_REG_9) { pr_err("verifier error\n"); @@ -1341,9 +1309,10 @@ st: if (is_imm8(insn->off)) if (insn->imm == (BPF_AND | BPF_FETCH) || insn->imm == (BPF_OR | BPF_FETCH) || insn->imm == (BPF_XOR | BPF_FETCH)) { - u8 *branch_target; bool is64 = BPF_SIZE(insn->code) == BPF_DW; u32 real_src_reg = src_reg; + u32 real_dst_reg = dst_reg; + u8 *branch_target; /* * Can't be implemented with a single x86 insn. @@ -1354,11 +1323,13 @@ st: if (is_imm8(insn->off)) emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); if (src_reg == BPF_REG_0) real_src_reg = BPF_REG_AX; + if (dst_reg == BPF_REG_0) + real_dst_reg = BPF_REG_AX; branch_target = prog; /* Load old value */ emit_ldx(&prog, BPF_SIZE(insn->code), - BPF_REG_0, dst_reg, insn->off); + BPF_REG_0, real_dst_reg, insn->off); /* * Perform the (commutative) operation locally, * put the result in the AUX_REG. @@ -1369,7 +1340,8 @@ st: if (is_imm8(insn->off)) add_2reg(0xC0, AUX_REG, real_src_reg)); /* Attempt to swap in new value */ err = emit_atomic(&prog, BPF_CMPXCHG, - dst_reg, AUX_REG, insn->off, + real_dst_reg, AUX_REG, + insn->off, BPF_SIZE(insn->code)); if (WARN_ON(err)) return err; @@ -1383,11 +1355,10 @@ st: if (is_imm8(insn->off)) /* Restore R0 after clobbering RAX */ emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); break; - } err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, - insn->off, BPF_SIZE(insn->code)); + insn->off, BPF_SIZE(insn->code)); if (err) return err; break; @@ -1409,13 +1380,16 @@ st: if (is_imm8(insn->off)) case BPF_JMP | BPF_TAIL_CALL: if (imm32) emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], - &prog, addrs[i], image, + &prog, image + addrs[i - 1], callee_regs_used, - bpf_prog->aux->stack_depth); + bpf_prog->aux->stack_depth, + ctx); else emit_bpf_tail_call_indirect(&prog, callee_regs_used, - bpf_prog->aux->stack_depth); + bpf_prog->aux->stack_depth, + image + addrs[i - 1], + ctx); break; /* cond jump */ @@ -1744,7 +1718,7 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size, bool mod_ret) + struct bpf_prog *p, int stack_size, bool save_ret) { u8 *prog = *pprog; u8 *jmp_insn; @@ -1777,11 +1751,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (emit_call(&prog, p->bpf_func, prog)) return -EINVAL; - /* BPF_TRAMP_MODIFY_RETURN trampolines can modify the return + /* + * BPF_TRAMP_MODIFY_RETURN trampolines can modify the return * of the previous call which is then passed on the stack to * the next BPF program. + * + * BPF_TRAMP_FENTRY trampoline may need to return the return + * value of BPF_PROG_TYPE_STRUCT_OPS prog. */ - if (mod_ret) + if (save_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); /* replace 2 nops with JE insn, since jmp target is known */ @@ -1828,13 +1806,15 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size) + struct bpf_tramp_progs *tp, int stack_size, + bool save_ret) { int i; u8 *prog = *pprog; for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, false)) + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, + save_ret)) return -EINVAL; } *pprog = prog; @@ -1877,6 +1857,23 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, return 0; } +static bool is_valid_bpf_tramp_flags(unsigned int flags) +{ + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && + (flags & BPF_TRAMP_F_SKIP_FRAME)) + return false; + + /* + * BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, + * and it must be used alone. + */ + if ((flags & BPF_TRAMP_F_RET_FENTRY_RET) && + (flags & ~BPF_TRAMP_F_RET_FENTRY_RET)) + return false; + + return true; +} + /* Example: * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); * its 'struct btf_func_model' will be nr_args=2 @@ -1949,17 +1946,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; u8 **branches = NULL; u8 *prog; + bool save_ret; /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ if (nr_args > 6) return -ENOTSUPP; - if ((flags & BPF_TRAMP_F_RESTORE_REGS) && - (flags & BPF_TRAMP_F_SKIP_FRAME)) + if (!is_valid_bpf_tramp_flags(flags)) return -EINVAL; - if (flags & BPF_TRAMP_F_CALL_ORIG) - stack_size += 8; /* room for return value of orig_call */ + /* room for return value of orig_call or fentry prog */ + save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); + if (save_ret) + stack_size += 8; if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; /* room for IP address argument */ @@ -2005,7 +2004,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fentry->nr_progs) - if (invoke_bpf(m, &prog, fentry, stack_size)) + if (invoke_bpf(m, &prog, fentry, stack_size, + flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; if (fmod_ret->nr_progs) { @@ -2052,7 +2052,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fexit->nr_progs) - if (invoke_bpf(m, &prog, fexit, stack_size)) { + if (invoke_bpf(m, &prog, fexit, stack_size, false)) { ret = -EINVAL; goto cleanup; } @@ -2072,9 +2072,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i ret = -EINVAL; goto cleanup; } - /* restore original return value back into RAX */ - emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); } + /* restore return value of orig_call or fentry prog back into RAX */ + if (save_ret) + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ @@ -2094,24 +2095,6 @@ cleanup: return ret; } -static int emit_fallback_jump(u8 **pprog) -{ - u8 *prog = *pprog; - int err = 0; - -#ifdef CONFIG_RETPOLINE - /* Note that this assumes the the compiler uses external - * thunks for indirect calls. Both clang and GCC use the same - * naming convention for external thunks. - */ - err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog); -#else - EMIT2(0xFF, 0xE2); /* jmp rdx */ -#endif - *pprog = prog; - return err; -} - static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) { u8 *jg_reloc, *prog = *pprog; @@ -2133,9 +2116,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) if (err) return err; - err = emit_fallback_jump(&prog); /* jmp thunk/indirect */ - if (err) - return err; + emit_indirect_jump(&prog, 2 /* rdx */, prog); *pprog = prog; return 0; diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 3bfda5f502cb..da9b7cfa4632 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -15,6 +15,7 @@ #include <asm/cacheflush.h> #include <asm/set_memory.h> #include <asm/nospec-branch.h> +#include <asm/asm-prototypes.h> #include <linux/bpf.h> /* @@ -1267,6 +1268,21 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth) *pprog = prog; } +static int emit_jmp_edx(u8 **pprog, u8 *ip) +{ + u8 *prog = *pprog; + int cnt = 0; + +#ifdef CONFIG_RETPOLINE + EMIT1_off32(0xE9, (u8 *)__x86_indirect_thunk_edx - (ip + 5)); +#else + EMIT2(0xFF, 0xE2); +#endif + *pprog = prog; + + return cnt; +} + /* * Generate the following code: * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... @@ -1280,7 +1296,7 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth) * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call(u8 **pprog) +static void emit_bpf_tail_call(u8 **pprog, u8 *ip) { u8 *prog = *pprog; int cnt = 0; @@ -1362,7 +1378,7 @@ static void emit_bpf_tail_call(u8 **pprog) * eax == ctx (1st arg) * edx == prog->bpf_func + prologue_size */ - RETPOLINE_EDX_BPF_JIT(); + cnt += emit_jmp_edx(&prog, ip + cnt); if (jmp_label1 == -1) jmp_label1 = cnt; @@ -2122,7 +2138,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; } case BPF_JMP | BPF_TAIL_CALL: - emit_bpf_tail_call(&prog); + emit_bpf_tail_call(&prog, image + addrs[i - 1]); break; /* cond jump */ diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 3d41a09c2c14..5debe4ac6f81 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -113,7 +113,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, false /* no mapping of GSI to PIRQ */); } -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 static int xen_register_gsi(u32 gsi, int triggering, int polarity) { int rc, irq; @@ -261,7 +261,7 @@ error: return irq; } -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 static bool __read_mostly pci_seg_supported = true; static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) @@ -375,10 +375,10 @@ static void xen_initdom_restore_msi_irqs(struct pci_dev *dev) WARN(ret && ret != -ENOSYS, "restore_msi -> %d\n", ret); } } -#else /* CONFIG_XEN_DOM0 */ +#else /* CONFIG_XEN_PV_DOM0 */ #define xen_initdom_setup_msi_irqs NULL #define xen_initdom_restore_msi_irqs NULL -#endif /* !CONFIG_XEN_DOM0 */ +#endif /* !CONFIG_XEN_PV_DOM0 */ static void xen_teardown_msi_irqs(struct pci_dev *dev) { @@ -555,7 +555,7 @@ int __init pci_xen_hvm_init(void) return 0; } -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 int __init pci_xen_initial_domain(void) { int irq; @@ -583,6 +583,9 @@ int __init pci_xen_initial_domain(void) } return 0; } +#endif + +#ifdef CONFIG_XEN_DOM0 struct xen_device_domain_owner { domid_t domain; @@ -656,4 +659,4 @@ int xen_unregister_device_domain_owner(struct pci_dev *dev) return 0; } EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner); -#endif +#endif /* CONFIG_XEN_DOM0 */ diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 7515e78ef898..1f3675453a57 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -33,7 +33,7 @@ #include <linux/reboot.h> #include <linux/slab.h> #include <linux/ucs2_string.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <linux/sched/task.h> #include <asm/setup.h> @@ -284,7 +284,8 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va) if (!(md->attribute & EFI_MEMORY_WB)) flags |= _PAGE_PCD; - if (sev_active() && md->type != EFI_MEMORY_MAPPED_IO) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && + md->type != EFI_MEMORY_MAPPED_IO) flags |= _PAGE_ENC; pfn = md->phys_addr >> PAGE_SHIFT; @@ -390,7 +391,7 @@ static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *m if (!(md->attribute & EFI_MEMORY_RO)) pf |= _PAGE_RW; - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) pf |= _PAGE_ENC; return efi_update_mappings(md, pf); @@ -438,7 +439,7 @@ void __init efi_runtime_update_mappings(void) (md->type != EFI_RUNTIME_SERVICES_CODE)) pf |= _PAGE_RW; - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) pf |= _PAGE_ENC; efi_update_mappings(md, pf); diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index ee2beda590d0..1d4a00e767ec 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -274,7 +274,7 @@ static struct olpc_ec_driver ec_xo1_driver = { static struct olpc_ec_driver ec_xo1_5_driver = { .ec_cmd = olpc_xo1_ec_cmd, -#ifdef CONFIG_OLPC_XO1_5_SCI +#ifdef CONFIG_OLPC_XO15_SCI /* * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is * compiled in diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c index 9ac7457f52a3..ed0442e35434 100644 --- a/arch/x86/platform/pvh/enlighten.c +++ b/arch/x86/platform/pvh/enlighten.c @@ -16,15 +16,15 @@ /* * PVH variables. * - * pvh_bootparams and pvh_start_info need to live in the data segment since + * pvh_bootparams and pvh_start_info need to live in a data segment since * they are used after startup_{32|64}, which clear .bss, are invoked. */ -struct boot_params pvh_bootparams __section(".data"); -struct hvm_start_info pvh_start_info __section(".data"); +struct boot_params __initdata pvh_bootparams; +struct hvm_start_info __initdata pvh_start_info; -unsigned int pvh_start_info_sz = sizeof(pvh_start_info); +const unsigned int __initconst pvh_start_info_sz = sizeof(pvh_start_info); -static u64 pvh_get_root_pointer(void) +static u64 __init pvh_get_root_pointer(void) { return pvh_start_info.rsdp_paddr; } @@ -107,7 +107,7 @@ void __init __weak xen_pvh_init(struct boot_params *boot_params) BUG(); } -static void hypervisor_specific_init(bool xen_guest) +static void __init hypervisor_specific_init(bool xen_guest) { if (xen_guest) xen_pvh_init(&pvh_bootparams); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 6665f8802098..9f2b251e83c5 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -20,7 +20,7 @@ #include <asm/page.h> #include <asm/mce.h> #include <asm/suspend.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/debugreg.h> #include <asm/cpu.h> #include <asm/mmu_context.h> diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 31b5856010cb..4a3da7592b99 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -2,7 +2,7 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/memblock.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <linux/pgtable.h> #include <asm/set_memory.h> @@ -44,10 +44,10 @@ void __init reserve_real_mode(void) static void sme_sev_setup_real_mode(struct trampoline_header *th) { #ifdef CONFIG_AMD_MEM_ENCRYPT - if (sme_active()) + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) th->flags |= TH_FLAGS_SME_ACTIVE; - if (sev_es_active()) { + if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) { /* * Skip the call to verify_cpu() in secondary_startup_64 as it * will cause #VC exceptions when the AP can't handle them yet. @@ -81,7 +81,7 @@ static void __init setup_real_mode(void) * decrypted memory in order to bring up other processors * successfully. This is not needed for SEV. */ - if (sme_active()) + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); memcpy(base, real_mode_blob, size); diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index afc1da68b06d..6bcd3d8ca6ac 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -43,13 +43,9 @@ config XEN_PV_SMP def_bool y depends on XEN_PV && SMP -config XEN_DOM0 - bool "Xen PV Dom0 support" - default y - depends on XEN_PV && PCI_XEN && SWIOTLB_XEN - depends on X86_IO_APIC && ACPI && PCI - help - Support running as a Xen PV Dom0 guest. +config XEN_PV_DOM0 + def_bool y + depends on XEN_PV && XEN_DOM0 config XEN_PVHVM def_bool y @@ -86,3 +82,12 @@ config XEN_PVH def_bool n help Support for running as a Xen PVH guest. + +config XEN_DOM0 + bool "Xen Dom0 support" + default XEN_PV + depends on (XEN_PV && SWIOTLB_XEN) || (XEN_PVH && X86_64) + depends on X86_IO_APIC && ACPI && PCI + select X86_X2APIC if XEN_PVH && X86_64 + help + Support running as a Xen Dom0 guest. diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 40b5779fce21..4953260e281c 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -45,7 +45,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o -obj-$(CONFIG_XEN_DOM0) += vga.o +obj-$(CONFIG_XEN_PV_DOM0) += vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c79bd0af2e8c..95d970359e17 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -3,6 +3,7 @@ #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG #include <linux/memblock.h> #endif +#include <linux/console.h> #include <linux/cpu.h> #include <linux/kexec.h> #include <linux/slab.h> @@ -10,12 +11,15 @@ #include <xen/xen.h> #include <xen/features.h> +#include <xen/interface/sched.h> +#include <xen/interface/version.h> #include <xen/page.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> #include <asm/cpu.h> #include <asm/e820/api.h> +#include <asm/setup.h> #include "xen-ops.h" #include "smp.h" @@ -52,9 +56,6 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); DEFINE_PER_CPU(uint32_t, xen_vcpu_id); EXPORT_PER_CPU_SYMBOL(xen_vcpu_id); -enum xen_domain_type xen_domain_type = XEN_NATIVE; -EXPORT_SYMBOL_GPL(xen_domain_type); - unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; EXPORT_SYMBOL(machine_to_phys_mapping); unsigned long machine_to_phys_nr; @@ -69,10 +70,12 @@ __read_mostly int xen_have_vector_callback; EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* - * NB: needs to live in .data because it's used by xen_prepare_pvh which runs - * before clearing the bss. + * NB: These need to live in .data or alike because they're used by + * xen_prepare_pvh() which runs before clearing the bss. */ -uint32_t xen_start_flags __section(".data") = 0; +enum xen_domain_type __ro_after_init xen_domain_type = XEN_NATIVE; +EXPORT_SYMBOL_GPL(xen_domain_type); +uint32_t __ro_after_init xen_start_flags; EXPORT_SYMBOL(xen_start_flags); /* @@ -258,6 +261,45 @@ int xen_vcpu_setup(int cpu) return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0); } +void __init xen_banner(void) +{ + unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); + struct xen_extraversion extra; + + HYPERVISOR_xen_version(XENVER_extraversion, &extra); + + pr_info("Booting kernel on %s\n", pv_info.name); + pr_info("Xen version: %u.%u%s%s\n", + version >> 16, version & 0xffff, extra.extraversion, + xen_feature(XENFEAT_mmu_pt_update_preserve_ad) + ? " (preserve-AD)" : ""); +} + +/* Check if running on Xen version (major, minor) or later */ +bool xen_running_on_version_or_later(unsigned int major, unsigned int minor) +{ + unsigned int version; + + if (!xen_domain()) + return false; + + version = HYPERVISOR_xen_version(XENVER_version, NULL); + if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || + ((version >> 16) > major)) + return true; + return false; +} + +void __init xen_add_preferred_consoles(void) +{ + add_preferred_console("xenboot", 0, NULL); + if (!boot_params.screen_info.orig_video_isVGA) + add_preferred_console("tty", 0, NULL); + add_preferred_console("hvc", 0, NULL); + if (boot_params.screen_info.orig_video_isVGA) + add_preferred_console("tty", 0, NULL); +} + void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 753f63734c13..4f63117f09bb 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -28,7 +28,6 @@ #include <linux/mm.h> #include <linux/page-flags.h> #include <linux/highmem.h> -#include <linux/console.h> #include <linux/pci.h> #include <linux/gfp.h> #include <linux/edd.h> @@ -109,17 +108,6 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); -static void __init xen_banner(void) -{ - unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); - struct xen_extraversion extra; - HYPERVISOR_xen_version(XENVER_extraversion, &extra); - - pr_info("Booting paravirtualized kernel on %s\n", pv_info.name); - pr_info("Xen version: %d.%d%s (preserve-AD)\n", - version >> 16, version & 0xffff, extra.extraversion); -} - static void __init xen_pv_init_platform(void) { populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); @@ -142,22 +130,6 @@ static void __init xen_pv_guest_late_init(void) #endif } -/* Check if running on Xen version (major, minor) or later */ -bool -xen_running_on_version_or_later(unsigned int major, unsigned int minor) -{ - unsigned int version; - - if (!xen_domain()) - return false; - - version = HYPERVISOR_xen_version(XENVER_version, NULL); - if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || - ((version >> 16) > major)) - return true; - return false; -} - static __read_mostly unsigned int cpuid_leaf5_ecx_val; static __read_mostly unsigned int cpuid_leaf5_edx_val; @@ -311,12 +283,12 @@ static void __init xen_init_capabilities(void) } } -static void xen_set_debugreg(int reg, unsigned long val) +static noinstr void xen_set_debugreg(int reg, unsigned long val) { HYPERVISOR_set_debugreg(reg, val); } -static unsigned long xen_get_debugreg(int reg) +static noinstr unsigned long xen_get_debugreg(int reg) { return HYPERVISOR_get_debugreg(reg); } @@ -755,8 +727,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) preempt_enable(); } -static void xen_convert_trap_info(const struct desc_ptr *desc, - struct trap_info *traps) +static unsigned xen_convert_trap_info(const struct desc_ptr *desc, + struct trap_info *traps, bool full) { unsigned in, out, count; @@ -766,17 +738,18 @@ static void xen_convert_trap_info(const struct desc_ptr *desc, for (in = out = 0; in < count; in++) { gate_desc *entry = (gate_desc *)(desc->address) + in; - if (cvt_gate_to_trap(in, entry, &traps[out])) + if (cvt_gate_to_trap(in, entry, &traps[out]) || full) out++; } - traps[out].address = 0; + + return out; } void xen_copy_trap_info(struct trap_info *traps) { const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); - xen_convert_trap_info(desc, traps); + xen_convert_trap_info(desc, traps, true); } /* Load a new IDT into Xen. In principle this can be per-CPU, so we @@ -786,6 +759,7 @@ static void xen_load_idt(const struct desc_ptr *desc) { static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; + unsigned out; trace_xen_cpu_load_idt(desc); @@ -793,7 +767,8 @@ static void xen_load_idt(const struct desc_ptr *desc) memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); - xen_convert_trap_info(desc, traps); + out = xen_convert_trap_info(desc, traps, false); + memset(&traps[out], 0, sizeof(traps[0])); xen_mc_flush(); if (HYPERVISOR_set_trap_table(traps)) @@ -1050,52 +1025,54 @@ static const struct pv_info xen_info __initconst = { .name = "Xen", }; -static const struct pv_cpu_ops xen_cpu_ops __initconst = { - .cpuid = xen_cpuid, +static const typeof(pv_ops) xen_cpu_ops __initconst = { + .cpu = { + .cpuid = xen_cpuid, - .set_debugreg = xen_set_debugreg, - .get_debugreg = xen_get_debugreg, + .set_debugreg = xen_set_debugreg, + .get_debugreg = xen_get_debugreg, - .read_cr0 = xen_read_cr0, - .write_cr0 = xen_write_cr0, + .read_cr0 = xen_read_cr0, + .write_cr0 = xen_write_cr0, - .write_cr4 = xen_write_cr4, + .write_cr4 = xen_write_cr4, - .wbinvd = native_wbinvd, + .wbinvd = native_wbinvd, - .read_msr = xen_read_msr, - .write_msr = xen_write_msr, + .read_msr = xen_read_msr, + .write_msr = xen_write_msr, - .read_msr_safe = xen_read_msr_safe, - .write_msr_safe = xen_write_msr_safe, + .read_msr_safe = xen_read_msr_safe, + .write_msr_safe = xen_write_msr_safe, - .read_pmc = xen_read_pmc, + .read_pmc = xen_read_pmc, - .load_tr_desc = paravirt_nop, - .set_ldt = xen_set_ldt, - .load_gdt = xen_load_gdt, - .load_idt = xen_load_idt, - .load_tls = xen_load_tls, - .load_gs_index = xen_load_gs_index, + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, + .load_gs_index = xen_load_gs_index, - .alloc_ldt = xen_alloc_ldt, - .free_ldt = xen_free_ldt, + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, - .store_tr = xen_store_tr, + .store_tr = xen_store_tr, - .write_ldt_entry = xen_write_ldt_entry, - .write_gdt_entry = xen_write_gdt_entry, - .write_idt_entry = xen_write_idt_entry, - .load_sp0 = xen_load_sp0, + .write_ldt_entry = xen_write_ldt_entry, + .write_gdt_entry = xen_write_gdt_entry, + .write_idt_entry = xen_write_idt_entry, + .load_sp0 = xen_load_sp0, #ifdef CONFIG_X86_IOPL_IOPERM - .invalidate_io_bitmap = xen_invalidate_io_bitmap, - .update_io_bitmap = xen_update_io_bitmap, + .invalidate_io_bitmap = xen_invalidate_io_bitmap, + .update_io_bitmap = xen_update_io_bitmap, #endif - .io_delay = xen_io_delay, + .io_delay = xen_io_delay, - .start_context_switch = paravirt_start_context_switch, - .end_context_switch = xen_end_context_switch, + .start_context_switch = paravirt_start_context_switch, + .end_context_switch = xen_end_context_switch, + }, }; static void xen_restart(char *msg) @@ -1214,6 +1191,11 @@ static void __init xen_dom0_set_legacy_features(void) x86_platform.legacy.rtc = 1; } +static void __init xen_domu_set_legacy_features(void) +{ + x86_platform.legacy.rtc = 0; +} + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { @@ -1231,7 +1213,7 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_ops.cpu = xen_cpu_ops; + pv_ops.cpu = xen_cpu_ops.cpu; paravirt_iret = xen_iret; xen_init_irq_ops(); @@ -1356,9 +1338,10 @@ asmlinkage __visible void __init xen_start_kernel(void) boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; if (!xen_initial_domain()) { - add_preferred_console("xenboot", 0, NULL); if (pci_xen) x86_init.pci.arch_init = pci_xen_init; + x86_platform.set_legacy_features = + xen_domu_set_legacy_features; } else { const struct dom0_vga_console_info *info = (void *)((char *)xen_start_info + @@ -1399,11 +1382,7 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif } - if (!boot_params.screen_info.orig_video_isVGA) - add_preferred_console("tty", 0, NULL); - add_preferred_console("hvc", 0, NULL); - if (boot_params.screen_info.orig_video_isVGA) - add_preferred_console("tty", 0, NULL); + xen_add_preferred_consoles(); #ifdef CONFIG_PCI /* PCI BIOS service won't work from a PV guest. */ diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 0d5e34b9e6f9..bcae606bbc5c 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/acpi.h> +#include <linux/export.h> #include <xen/hvc-console.h> @@ -18,10 +19,11 @@ /* * PVH variables. * - * The variable xen_pvh needs to live in the data segment since it is used + * The variable xen_pvh needs to live in a data segment since it is used * after startup_{32|64} is invoked, which will clear the .bss segment. */ -bool xen_pvh __section(".data") = 0; +bool __ro_after_init xen_pvh; +EXPORT_SYMBOL_GPL(xen_pvh); void __init xen_pvh_init(struct boot_params *boot_params) { @@ -36,6 +38,10 @@ void __init xen_pvh_init(struct boot_params *boot_params) pfn = __pa(hypercall_page); wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + if (xen_initial_domain()) + x86_init.oem.arch_setup = xen_add_preferred_consoles; + x86_init.oem.banner = xen_banner; + xen_efi_init(boot_params); } diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index dfa091d79c2e..4fe387e520af 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -19,12 +19,12 @@ * callback mask. We do this in a very simple manner, by making a call * down into Xen. The pending flag will be checked by Xen on return. */ -void xen_force_evtchn_callback(void) +noinstr void xen_force_evtchn_callback(void) { (void)HYPERVISOR_xen_version(0, NULL); } -asmlinkage __visible unsigned long xen_save_fl(void) +asmlinkage __visible noinstr unsigned long xen_save_fl(void) { struct vcpu_info *vcpu; unsigned long flags; @@ -40,9 +40,9 @@ asmlinkage __visible unsigned long xen_save_fl(void) */ return (-flags) & X86_EFLAGS_IF; } -PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); +__PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl, ".noinstr.text"); -asmlinkage __visible void xen_irq_disable(void) +asmlinkage __visible noinstr void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to make sure we're don't switch CPUs between getting the vcpu @@ -51,9 +51,9 @@ asmlinkage __visible void xen_irq_disable(void) this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1; preempt_enable_no_resched(); } -PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); +__PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable, ".noinstr.text"); -asmlinkage __visible void xen_irq_enable(void) +asmlinkage __visible noinstr void xen_irq_enable(void) { struct vcpu_info *vcpu; @@ -76,7 +76,7 @@ asmlinkage __visible void xen_irq_enable(void) preempt_enable(); } -PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); +__PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable, ".noinstr.text"); static void xen_safe_halt(void) { @@ -94,17 +94,20 @@ static void xen_halt(void) xen_safe_halt(); } -static const struct pv_irq_ops xen_irq_ops __initconst = { - .save_fl = PV_CALLEE_SAVE(xen_save_fl), - .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), - .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), +static const typeof(pv_ops) xen_irq_ops __initconst = { + .irq = { - .safe_halt = xen_safe_halt, - .halt = xen_halt, + .save_fl = PV_CALLEE_SAVE(xen_save_fl), + .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), + .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), + + .safe_halt = xen_safe_halt, + .halt = xen_halt, + }, }; void __init xen_init_irq_ops(void) { - pv_ops.irq = xen_irq_ops; + pv_ops.irq = xen_irq_ops.irq; x86_init.irqs.intr_init = xen_init_IRQ; } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1df5f01529e5..1ce436eeda15 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1204,7 +1204,8 @@ static void __init xen_pagetable_init(void) xen_remap_memory(); xen_setup_mfn_list_list(); } -static void xen_write_cr2(unsigned long cr2) + +static noinstr void xen_write_cr2(unsigned long cr2) { this_cpu_read(xen_vcpu)->arch.cr2 = cr2; } @@ -1518,14 +1519,17 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, if (pinned) { struct page *page = pfn_to_page(pfn); - if (static_branch_likely(&xen_struct_pages_ready)) + pinned = false; + if (static_branch_likely(&xen_struct_pages_ready)) { + pinned = PagePinned(page); SetPagePinned(page); + } xen_mc_batch(); __set_pfn_prot(pfn, PAGE_KERNEL_RO); - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned) __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -2075,67 +2079,69 @@ static void xen_leave_lazy_mmu(void) preempt_enable(); } -static const struct pv_mmu_ops xen_mmu_ops __initconst = { - .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2), - .write_cr2 = xen_write_cr2, +static const typeof(pv_ops) xen_mmu_ops __initconst = { + .mmu = { + .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2), + .write_cr2 = xen_write_cr2, - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3_init, + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3_init, - .flush_tlb_user = xen_flush_tlb, - .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_one_user = xen_flush_tlb_one_user, - .flush_tlb_multi = xen_flush_tlb_multi, - .tlb_remove_table = tlb_remove_table, + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_one_user = xen_flush_tlb_one_user, + .flush_tlb_multi = xen_flush_tlb_multi, + .tlb_remove_table = tlb_remove_table, - .pgd_alloc = xen_pgd_alloc, - .pgd_free = xen_pgd_free, + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, - .alloc_pte = xen_alloc_pte_init, - .release_pte = xen_release_pte_init, - .alloc_pmd = xen_alloc_pmd_init, - .release_pmd = xen_release_pmd_init, + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pmd_init, + .release_pmd = xen_release_pmd_init, - .set_pte = xen_set_pte_init, - .set_pmd = xen_set_pmd_hyper, + .set_pte = xen_set_pte_init, + .set_pmd = xen_set_pmd_hyper, - .ptep_modify_prot_start = xen_ptep_modify_prot_start, - .ptep_modify_prot_commit = xen_ptep_modify_prot_commit, + .ptep_modify_prot_start = xen_ptep_modify_prot_start, + .ptep_modify_prot_commit = xen_ptep_modify_prot_commit, - .pte_val = PV_CALLEE_SAVE(xen_pte_val), - .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), + .pte_val = PV_CALLEE_SAVE(xen_pte_val), + .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), - .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), - .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), + .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), + .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), - .set_pud = xen_set_pud_hyper, + .set_pud = xen_set_pud_hyper, - .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), - .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), + .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), + .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), - .pud_val = PV_CALLEE_SAVE(xen_pud_val), - .make_pud = PV_CALLEE_SAVE(xen_make_pud), - .set_p4d = xen_set_p4d_hyper, + .pud_val = PV_CALLEE_SAVE(xen_pud_val), + .make_pud = PV_CALLEE_SAVE(xen_make_pud), + .set_p4d = xen_set_p4d_hyper, - .alloc_pud = xen_alloc_pmd_init, - .release_pud = xen_release_pmd_init, + .alloc_pud = xen_alloc_pmd_init, + .release_pud = xen_release_pmd_init, #if CONFIG_PGTABLE_LEVELS >= 5 - .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), - .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), + .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), + .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), #endif - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, - .exit_mmap = xen_exit_mmap, + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, - .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy_mmu, - .flush = paravirt_flush_lazy_mmu, - }, + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy_mmu, + .flush = paravirt_flush_lazy_mmu, + }, - .set_fixmap = xen_set_fixmap, + .set_fixmap = xen_set_fixmap, + }, }; void __init xen_init_mmu_ops(void) @@ -2143,7 +2149,7 @@ void __init xen_init_mmu_ops(void) x86_init.paging.pagetable_init = xen_pagetable_init; x86_init.hyper.init_after_bootmem = xen_after_bootmem; - pv_ops.mmu = xen_mmu_ops; + pv_ops.mmu = xen_mmu_ops.mmu; memset(dummy_mapping, 0xff, PAGE_SIZE); } @@ -2395,7 +2401,7 @@ static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data) int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot, - unsigned int domid, bool no_translate, struct page **pages) + unsigned int domid, bool no_translate) { int err = 0; struct remap_data rmd; diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 54f9aa7e8457..46df59aeaa06 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -18,7 +18,7 @@ #endif #include <linux/export.h> -int xen_swiotlb __read_mostly; +static int xen_swiotlb __read_mostly; /* * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary @@ -56,7 +56,7 @@ int __init pci_xen_swiotlb_detect(void) return xen_swiotlb; } -void __init pci_xen_swiotlb_init(void) +static void __init pci_xen_swiotlb_init(void) { if (xen_swiotlb) { xen_swiotlb_init_early(); diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 96afadf9878e..7ed56c6075b0 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -290,8 +290,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) gdt = get_cpu_gdt_rw(cpu); - memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - /* * Bring up the CPU in cpu_bringup_and_idle() with the stack * pointing just below where pt_regs would be if it were a normal @@ -308,8 +306,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) xen_copy_trap_info(ctxt->trap_ctxt); - ctxt->ldt_ents = 0; - BUG_ON((unsigned long)gdt & ~PAGE_MASK); gdt_mfn = arbitrary_virt_to_mfn(gdt); diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 1e626444712b..220dd9678494 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -21,6 +21,45 @@ #include <linux/init.h> #include <linux/linkage.h> +.pushsection .noinstr.text, "ax" +/* + * Disabling events is simply a matter of making the event mask + * non-zero. + */ +SYM_FUNC_START(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + ret +SYM_FUNC_END(xen_irq_disable_direct) + +/* + * Force an event check by making a hypercall, but preserve regs + * before making the call. + */ +SYM_FUNC_START(check_events) + FRAME_BEGIN + push %rax + push %rcx + push %rdx + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + call xen_force_evtchn_callback + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rdx + pop %rcx + pop %rax + FRAME_END + ret +SYM_FUNC_END(check_events) + /* * Enable events. This clears the event mask and tests the pending * event status with one and operation. If there are pending events, @@ -47,16 +86,6 @@ SYM_FUNC_START(xen_irq_enable_direct) ret SYM_FUNC_END(xen_irq_enable_direct) - -/* - * Disabling events is simply a matter of making the event mask - * non-zero. - */ -SYM_FUNC_START(xen_irq_disable_direct) - movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - ret -SYM_FUNC_END(xen_irq_disable_direct) - /* * (xen_)save_fl is used to get the current interrupt enable status. * Callers expect the status to be in X86_EFLAGS_IF, and other bits @@ -73,35 +102,6 @@ SYM_FUNC_START(xen_save_fl_direct) ret SYM_FUNC_END(xen_save_fl_direct) -/* - * Force an event check by making a hypercall, but preserve regs - * before making the call. - */ -SYM_FUNC_START(check_events) - FRAME_BEGIN - push %rax - push %rcx - push %rdx - push %rsi - push %rdi - push %r8 - push %r9 - push %r10 - push %r11 - call xen_force_evtchn_callback - pop %r11 - pop %r10 - pop %r9 - pop %r8 - pop %rdi - pop %rsi - pop %rdx - pop %rcx - pop %rax - FRAME_END - ret -SYM_FUNC_END(check_events) - SYM_FUNC_START(xen_read_cr2) FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX @@ -116,6 +116,7 @@ SYM_FUNC_START(xen_read_cr2_direct) FRAME_END ret SYM_FUNC_END(xen_read_cr2_direct); +.popsection .macro xen_pv_trap name SYM_CODE_START(xen_\name) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index cb6538ae2fe0..9e27b86a0c31 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -20,6 +20,23 @@ #include <xen/interface/xen-mca.h> #include <asm/xen/interface.h> +.pushsection .noinstr.text, "ax" + .balign PAGE_SIZE +SYM_CODE_START(hypercall_page) + .rept (PAGE_SIZE / 32) + UNWIND_HINT_FUNC + .skip 31, 0x90 + ret + .endr + +#define HYPERCALL(n) \ + .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ + .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 +#include <asm/xen-hypercalls.h> +#undef HYPERCALL +SYM_CODE_END(hypercall_page) +.popsection + #ifdef CONFIG_XEN_PV __INIT SYM_CODE_START(startup_xen) @@ -64,23 +81,6 @@ SYM_CODE_END(asm_cpu_bringup_and_idle) #endif #endif -.pushsection .text - .balign PAGE_SIZE -SYM_CODE_START(hypercall_page) - .rept (PAGE_SIZE / 32) - UNWIND_HINT_FUNC - .skip 31, 0x90 - ret - .endr - -#define HYPERCALL(n) \ - .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ - .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 -#include <asm/xen-hypercalls.h> -#undef HYPERCALL -SYM_CODE_END(hypercall_page) -.popsection - ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8d7ec49a35fb..8bc8b72a205d 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -51,6 +51,7 @@ void __init xen_remap_memory(void); phys_addr_t __init xen_find_free_area(phys_addr_t size); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); +void xen_banner(void); void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); @@ -109,7 +110,7 @@ static inline void xen_uninit_lock_cpu(int cpu) struct dom0_vga_console_info; -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); #else static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, @@ -118,6 +119,8 @@ static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, } #endif +void xen_add_preferred_consoles(void); + void __init xen_init_apic(void); #ifdef CONFIG_XEN_EFI |