summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/Kconfig4
-rw-r--r--arch/alpha/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/arm/boot/dts/broadcom/bcm2711-rpi.dtsi5
-rw-r--r--arch/arm/boot/dts/broadcom/bcm2711.dtsi12
-rw-r--r--arch/arm/boot/dts/broadcom/bcm4709-asus-rt-ac3200.dts8
-rw-r--r--arch/arm/boot/dts/broadcom/bcm47094-asus-rt-ac5300.dts8
-rw-r--r--arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi10
-rw-r--r--arch/arm/include/asm/io.h2
-rw-r--r--arch/arm/include/asm/vdso.h2
-rw-r--r--arch/arm/include/asm/vdso/gettimeofday.h7
-rw-r--r--arch/arm/include/asm/vdso/vsyscall.h12
-rw-r--r--arch/arm/kernel/asm-offsets.c4
-rw-r--r--arch/arm/kernel/traps.c11
-rw-r--r--arch/arm/kernel/vdso.c34
-rw-r--r--arch/arm/mach-davinci/Kconfig1
-rw-r--r--arch/arm/mach-davinci/da830.c1
-rw-r--r--arch/arm/mach-imx/mmdc.c5
-rw-r--r--arch/arm/mach-omap1/Kconfig1
-rw-r--r--arch/arm/mach-shmobile/headsmp.S1
-rw-r--r--arch/arm/mm/Kconfig1
-rw-r--r--arch/arm/mm/cache-l2x0-pmu.c3
-rw-r--r--arch/arm/mm/ioremap.c2
-rw-r--r--arch/arm/mm/nommu.c2
-rw-r--r--arch/arm/tools/syscall.tbl1
-rw-r--r--arch/arm/vdso/Makefile2
-rw-r--r--arch/arm/vdso/vdso.lds.S4
-rw-r--r--arch/arm64/Kconfig13
-rw-r--r--arch/arm64/boot/dts/broadcom/bcm2712.dtsi2
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mm-verdin-dahlia.dtsi6
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi16
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-verdin-dahlia.dtsi6
-rw-r--r--arch/arm64/boot/dts/qcom/sdm845.dtsi1
-rw-r--r--arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts12
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dtsi2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi14
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3566-lubancat-1.dts1
-rw-r--r--arch/arm64/boot/dts/rockchip/rk356x-base.dtsi14
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts1
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588-rock-5-itx.dts3
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi1
-rw-r--r--arch/arm64/hyperv/hv_core.c17
-rw-r--r--arch/arm64/hyperv/mshyperv.c6
-rw-r--r--arch/arm64/include/asm/apple_m1_pmu.h1
-rw-r--r--arch/arm64/include/asm/asm-extable.h10
-rw-r--r--arch/arm64/include/asm/asm-uaccess.h4
-rw-r--r--arch/arm64/include/asm/cache.h4
-rw-r--r--arch/arm64/include/asm/cpucaps.h2
-rw-r--r--arch/arm64/include/asm/cpufeature.h28
-rw-r--r--arch/arm64/include/asm/cputype.h54
-rw-r--r--arch/arm64/include/asm/el2_setup.h25
-rw-r--r--arch/arm64/include/asm/extable.h4
-rw-r--r--arch/arm64/include/asm/fpsimd.h1
-rw-r--r--arch/arm64/include/asm/hypervisor.h1
-rw-r--r--arch/arm64/include/asm/kernel-pgtable.h8
-rw-r--r--arch/arm64/include/asm/kvm_arm.h4
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h37
-rw-r--r--arch/arm64/include/asm/kvm_host.h67
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h2
-rw-r--r--arch/arm64/include/asm/kvm_nested.h1
-rw-r--r--arch/arm64/include/asm/kvm_pkvm.h1
-rw-r--r--arch/arm64/include/asm/mem_encrypt.h11
-rw-r--r--arch/arm64/include/asm/mmu.h3
-rw-r--r--arch/arm64/include/asm/mshyperv.h13
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h35
-rw-r--r--arch/arm64/include/asm/pgtable-prot.h36
-rw-r--r--arch/arm64/include/asm/pgtable.h80
-rw-r--r--arch/arm64/include/asm/por.h11
-rw-r--r--arch/arm64/include/asm/spectre.h1
-rw-r--r--arch/arm64/include/asm/sysreg.h45
-rw-r--r--arch/arm64/include/asm/vdso.h2
-rw-r--r--arch/arm64/include/asm/vdso/compat_gettimeofday.h38
-rw-r--r--arch/arm64/include/asm/vdso/getrandom.h12
-rw-r--r--arch/arm64/include/asm/vdso/gettimeofday.h16
-rw-r--r--arch/arm64/include/asm/vdso/vsyscall.h29
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h14
-rw-r--r--arch/arm64/kernel/cpu_errata.c117
-rw-r--r--arch/arm64/kernel/cpufeature.c53
-rw-r--r--arch/arm64/kernel/elfcore.c3
-rw-r--r--arch/arm64/kernel/image-vars.h6
-rw-r--r--arch/arm64/kernel/pi/map_range.c6
-rw-r--r--arch/arm64/kernel/proton-pack.c224
-rw-r--r--arch/arm64/kernel/signal.c2
-rw-r--r--arch/arm64/kernel/topology.c182
-rw-r--r--arch/arm64/kernel/traps.c10
-rw-r--r--arch/arm64/kernel/vdso.c90
-rw-r--r--arch/arm64/kernel/vdso/Makefile2
-rw-r--r--arch/arm64/kernel/vdso/vdso.lds.S7
-rw-r--r--arch/arm64/kernel/vdso32/Makefile2
-rw-r--r--arch/arm64/kernel/vdso32/vdso.lds.S7
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/arch_timer.c7
-rw-r--r--arch/arm64/kvm/arm.c76
-rw-r--r--arch/arm64/kvm/at.c8
-rw-r--r--arch/arm64/kvm/emulate-nested.c24
-rw-r--r--arch/arm64/kvm/handle_exit.c6
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h4
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h14
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/mem_protect.h2
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/pkvm.h6
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/pkvm.c79
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sysreg-sr.c4
-rw-r--r--arch/arm64/kvm/hyp/vgic-v3-sr.c16
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c22
-rw-r--r--arch/arm64/kvm/hyp/vhe/sysreg-sr.c28
-rw-r--r--arch/arm64/kvm/hypercalls.c13
-rw-r--r--arch/arm64/kvm/mmu.c22
-rw-r--r--arch/arm64/kvm/nested.c298
-rw-r--r--arch/arm64/kvm/pkvm.c75
-rw-r--r--arch/arm64/kvm/pmu-emul.c194
-rw-r--r--arch/arm64/kvm/pmu.c10
-rw-r--r--arch/arm64/kvm/ptdump.c4
-rw-r--r--arch/arm64/kvm/reset.c3
-rw-r--r--arch/arm64/kvm/sys_regs.c478
-rw-r--r--arch/arm64/kvm/sys_regs.h10
-rw-r--r--arch/arm64/kvm/vgic-sys-reg-v3.c8
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c29
-rw-r--r--arch/arm64/kvm/vgic/vgic-kvm-device.c29
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3-nested.c409
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c46
-rw-r--r--arch/arm64/kvm/vgic/vgic-v4.c35
-rw-r--r--arch/arm64/kvm/vgic/vgic.c38
-rw-r--r--arch/arm64/kvm/vgic/vgic.h6
-rw-r--r--arch/arm64/lib/clear_user.S25
-rw-r--r--arch/arm64/lib/copy_from_user.S10
-rw-r--r--arch/arm64/lib/copy_template.S10
-rw-r--r--arch/arm64/lib/copy_to_user.S10
-rw-r--r--arch/arm64/mm/extable.c40
-rw-r--r--arch/arm64/mm/fault.c4
-rw-r--r--arch/arm64/mm/hugetlbpage.c20
-rw-r--r--arch/arm64/mm/kasan_init.c6
-rw-r--r--arch/arm64/mm/mmu.c10
-rw-r--r--arch/arm64/mm/physaddr.c2
-rw-r--r--arch/arm64/mm/ptdump.c4
-rw-r--r--arch/arm64/tools/cpucaps2
-rwxr-xr-xarch/arm64/tools/gen-sysreg.awk31
-rw-r--r--arch/arm64/tools/syscall_32.tbl1
-rw-r--r--arch/arm64/tools/sysreg153
-rw-r--r--arch/csky/kernel/vdso/Makefile2
-rw-r--r--arch/loongarch/Kconfig5
-rw-r--r--arch/loongarch/Makefile6
-rw-r--r--arch/loongarch/configs/loongson3_defconfig1
-rw-r--r--arch/loongarch/include/asm/kvm_host.h7
-rw-r--r--arch/loongarch/include/asm/vdso.h1
-rw-r--r--arch/loongarch/include/asm/vdso/arch_data.h25
-rw-r--r--arch/loongarch/include/asm/vdso/getrandom.h5
-rw-r--r--arch/loongarch/include/asm/vdso/gettimeofday.h14
-rw-r--r--arch/loongarch/include/asm/vdso/vdso.h38
-rw-r--r--arch/loongarch/include/asm/vdso/vsyscall.h17
-rw-r--r--arch/loongarch/kernel/asm-offsets.c3
-rw-r--r--arch/loongarch/kernel/vdso.c92
-rw-r--r--arch/loongarch/kvm/Kconfig1
-rw-r--r--arch/loongarch/kvm/Makefile2
-rw-r--r--arch/loongarch/kvm/main.c3
-rw-r--r--arch/loongarch/kvm/switch.S12
-rw-r--r--arch/loongarch/kvm/vcpu.c41
-rw-r--r--arch/loongarch/vdso/Makefile2
-rw-r--r--arch/loongarch/vdso/vdso.lds.S8
-rw-r--r--arch/loongarch/vdso/vgetcpu.c12
-rw-r--r--arch/m68k/configs/amiga_defconfig3
-rw-r--r--arch/m68k/configs/apollo_defconfig3
-rw-r--r--arch/m68k/configs/atari_defconfig3
-rw-r--r--arch/m68k/configs/bvme6000_defconfig3
-rw-r--r--arch/m68k/configs/hp300_defconfig3
-rw-r--r--arch/m68k/configs/mac_defconfig3
-rw-r--r--arch/m68k/configs/multi_defconfig3
-rw-r--r--arch/m68k/configs/mvme147_defconfig3
-rw-r--r--arch/m68k/configs/mvme16x_defconfig3
-rw-r--r--arch/m68k/configs/q40_defconfig3
-rw-r--r--arch/m68k/configs/sun3_defconfig3
-rw-r--r--arch/m68k/configs/sun3x_defconfig3
-rw-r--r--arch/m68k/include/asm/processor.h14
-rw-r--r--arch/m68k/kernel/setup_mm.c3
-rw-r--r--arch/m68k/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/m68k/sun3/mmu_emu.c7
-rw-r--r--arch/microblaze/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/configs/malta_defconfig1
-rw-r--r--arch/mips/configs/malta_kvm_defconfig1
-rw-r--r--arch/mips/configs/maltaup_xpa_defconfig1
-rw-r--r--arch/mips/configs/rm200_defconfig1
-rw-r--r--arch/mips/include/asm/kvm_host.h1
-rw-r--r--arch/mips/include/asm/vdso/gettimeofday.h9
-rw-r--r--arch/mips/include/asm/vdso/vdso.h19
-rw-r--r--arch/mips/include/asm/vdso/vsyscall.h14
-rw-r--r--arch/mips/kernel/ptrace.c20
-rw-r--r--arch/mips/kernel/syscalls/syscall_n32.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n64.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_o32.tbl1
-rw-r--r--arch/mips/kernel/vdso.c47
-rw-r--r--arch/mips/kvm/mips.c5
-rw-r--r--arch/mips/vdso/Makefile2
-rw-r--r--arch/mips/vdso/vdso.lds.S5
-rw-r--r--arch/parisc/configs/generic-64bit_defconfig1
-rw-r--r--arch/parisc/include/asm/vdso.h2
-rw-r--r--arch/parisc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/parisc/kernel/vdso32/Makefile2
-rw-r--r--arch/parisc/kernel/vdso64/Makefile2
-rw-r--r--arch/powerpc/Kconfig2
-rw-r--r--arch/powerpc/configs/fsl-emb-nonhw.config1
-rw-r--r--arch/powerpc/configs/ppc6xx_defconfig1
-rw-r--r--arch/powerpc/include/asm/kvm_host.h1
-rw-r--r--arch/powerpc/include/asm/topology.h1
-rw-r--r--arch/powerpc/include/asm/vdso.h1
-rw-r--r--arch/powerpc/include/asm/vdso/arch_data.h37
-rw-r--r--arch/powerpc/include/asm/vdso/getrandom.h11
-rw-r--r--arch/powerpc/include/asm/vdso/gettimeofday.h29
-rw-r--r--arch/powerpc/include/asm/vdso/vsyscall.h13
-rw-r--r--arch/powerpc/include/asm/vdso_datapage.h44
-rw-r--r--arch/powerpc/kernel/asm-offsets.c1
-rw-r--r--arch/powerpc/kernel/fadump.c2
-rw-r--r--arch/powerpc/kernel/ptrace/ptrace.c2
-rw-r--r--arch/powerpc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/powerpc/kernel/time.c2
-rw-r--r--arch/powerpc/kernel/traps.c3
-rw-r--r--arch/powerpc/kernel/vdso.c115
-rw-r--r--arch/powerpc/kernel/vdso/Makefile2
-rw-r--r--arch/powerpc/kernel/vdso/cacheflush.S2
-rw-r--r--arch/powerpc/kernel/vdso/datapage.S4
-rw-r--r--arch/powerpc/kernel/vdso/gettimeofday.S4
-rw-r--r--arch/powerpc/kernel/vdso/vdso32.lds.S4
-rw-r--r--arch/powerpc/kernel/vdso/vdso64.lds.S4
-rw-r--r--arch/powerpc/kernel/vdso/vgettimeofday.c14
-rw-r--r--arch/powerpc/kernel/watchdog.c3
-rw-r--r--arch/powerpc/kvm/powerpc.c4
-rw-r--r--arch/powerpc/perf/core-book3s.c8
-rw-r--r--arch/powerpc/platforms/powernv/opal-core.c8
-rw-r--r--arch/powerpc/platforms/pseries/papr-vpd.c7
-rw-r--r--arch/powerpc/xmon/xmon.c6
-rw-r--r--arch/riscv/Kconfig4
-rw-r--r--arch/riscv/boot/dts/sophgo/sg2042.dtsi10
-rw-r--r--arch/riscv/boot/dts/starfive/jh7110-pinfunc.h2
-rw-r--r--arch/riscv/include/asm/bitops.h4
-rw-r--r--arch/riscv/include/asm/io.h2
-rw-r--r--arch/riscv/include/asm/kvm_host.h2
-rw-r--r--arch/riscv/include/asm/vdso.h2
-rw-r--r--arch/riscv/include/asm/vdso/arch_data.h (renamed from arch/riscv/include/asm/vdso/time_data.h)8
-rw-r--r--arch/riscv/include/asm/vdso/gettimeofday.h14
-rw-r--r--arch/riscv/include/asm/vdso/vsyscall.h9
-rw-r--r--arch/riscv/kernel/sys_hwprobe.c3
-rw-r--r--arch/riscv/kernel/vdso.c90
-rw-r--r--arch/riscv/kernel/vdso/Makefile2
-rw-r--r--arch/riscv/kernel/vdso/hwprobe.c6
-rw-r--r--arch/riscv/kernel/vdso/vdso.lds.S7
-rw-r--r--arch/riscv/kvm/main.c4
-rw-r--r--arch/riscv/kvm/vcpu_onereg.c2
-rw-r--r--arch/riscv/kvm/vcpu_pmu.c1
-rw-r--r--arch/riscv/kvm/vcpu_timer.c7
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/include/asm/gmap.h1
-rw-r--r--arch/s390/include/asm/kvm_host.h1
-rw-r--r--arch/s390/include/asm/uv.h2
-rw-r--r--arch/s390/include/asm/vdso.h4
-rw-r--r--arch/s390/include/asm/vdso/getrandom.h12
-rw-r--r--arch/s390/include/asm/vdso/gettimeofday.h15
-rw-r--r--arch/s390/include/asm/vdso/vsyscall.h20
-rw-r--r--arch/s390/kernel/crash_dump.c62
-rw-r--r--arch/s390/kernel/dumpstack.c7
-rw-r--r--arch/s390/kernel/perf_pai_crypto.c3
-rw-r--r--arch/s390/kernel/perf_pai_ext.c3
-rw-r--r--arch/s390/kernel/processor.c2
-rw-r--r--arch/s390/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/s390/kernel/time.c11
-rw-r--r--arch/s390/kernel/uv.c136
-rw-r--r--arch/s390/kernel/vdso.c97
-rw-r--r--arch/s390/kernel/vdso32/Makefile2
-rw-r--r--arch/s390/kernel/vdso32/vdso32.lds.S7
-rw-r--r--arch/s390/kernel/vdso64/Makefile2
-rw-r--r--arch/s390/kernel/vdso64/vdso64.lds.S8
-rw-r--r--arch/s390/kvm/gmap.c103
-rw-r--r--arch/s390/kvm/interrupt.c3
-rw-r--r--arch/s390/kvm/kvm-s390.c29
-rw-r--r--arch/s390/mm/gmap.c28
-rw-r--r--arch/sh/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sparc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sparc/vdso/Makefile7
-rw-r--r--arch/sparc/vdso/checkundef.sh10
-rw-r--r--arch/um/kernel/um_arch.c11
-rw-r--r--arch/x86/Kbuild4
-rw-r--r--arch/x86/Kconfig210
-rw-r--r--arch/x86/Kconfig.cpu105
-rw-r--r--arch/x86/Kconfig.cpufeatures201
-rw-r--r--arch/x86/Makefile55
-rw-r--r--arch/x86/Makefile_32.cpu5
-rw-r--r--arch/x86/boot/.gitignore1
-rw-r--r--arch/x86/boot/Makefile7
-rw-r--r--arch/x86/boot/boot.h4
-rw-r--r--arch/x86/boot/compressed/Makefile1
-rw-r--r--arch/x86/boot/compressed/head_64.S103
-rw-r--r--arch/x86/boot/compressed/la57toggle.S112
-rw-r--r--arch/x86/boot/compressed/misc.c14
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S2
-rw-r--r--arch/x86/boot/cpucheck.c3
-rw-r--r--arch/x86/boot/cpuflags.c27
-rw-r--r--arch/x86/boot/cpuflags.h7
-rw-r--r--arch/x86/boot/genimage.sh5
-rw-r--r--arch/x86/boot/mkcpustr.c3
-rw-r--r--arch/x86/boot/setup.ld2
-rw-r--r--arch/x86/boot/tools/.gitignore2
-rw-r--r--arch/x86/boot/tools/build.c247
-rw-r--r--arch/x86/coco/sev/core.c5
-rw-r--r--arch/x86/coco/tdx/tdx.c4
-rw-r--r--arch/x86/configs/xen.config2
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S2
-rw-r--r--arch/x86/crypto/camellia-aesni-avx-asm_64.S7
-rw-r--r--arch/x86/crypto/camellia-aesni-avx2-asm_64.S1
-rw-r--r--arch/x86/crypto/camellia-x86_64-asm_64.S9
-rw-r--r--arch/x86/crypto/serpent-avx-x86_64-asm_64.S7
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64-3way.S5
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S5
-rw-r--r--arch/x86/entry/Makefile8
-rw-r--r--arch/x86/entry/calling.h1
-rw-r--r--arch/x86/entry/common.c524
-rw-r--r--arch/x86/entry/entry.S6
-rw-r--r--arch/x86/entry/entry_32.S4
-rw-r--r--arch/x86/entry/entry_64.S11
-rw-r--r--arch/x86/entry/entry_64_compat.S4
-rw-r--r--arch/x86/entry/entry_64_fred.S1
-rw-r--r--arch/x86/entry/syscall_32.c332
-rw-r--r--arch/x86/entry/syscall_64.c111
-rw-r--r--arch/x86/entry/syscall_x32.c25
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl3
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/entry/vdso/Makefile10
-rwxr-xr-xarch/x86/entry/vdso/checkundef.sh10
-rw-r--r--arch/x86/entry/vdso/extable.h2
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S10
-rw-r--r--arch/x86/entry/vdso/vdso2c.c21
-rw-r--r--arch/x86/entry/vdso/vdso2c.h20
-rw-r--r--arch/x86/entry/vdso/vma.c128
-rw-r--r--arch/x86/events/amd/brs.c3
-rw-r--r--arch/x86/events/amd/ibs.c282
-rw-r--r--arch/x86/events/amd/iommu.c2
-rw-r--r--arch/x86/events/amd/lbr.c3
-rw-r--r--arch/x86/events/core.c29
-rw-r--r--arch/x86/events/intel/bts.c45
-rw-r--r--arch/x86/events/intel/core.c160
-rw-r--r--arch/x86/events/intel/ds.c204
-rw-r--r--arch/x86/events/intel/lbr.c73
-rw-r--r--arch/x86/events/intel/p4.c9
-rw-r--r--arch/x86/events/intel/p6.c26
-rw-r--r--arch/x86/events/intel/uncore.c3
-rw-r--r--arch/x86/events/perf_event.h61
-rw-r--r--arch/x86/events/perf_event_flags.h2
-rw-r--r--arch/x86/events/rapl.c10
-rw-r--r--arch/x86/hyperv/Makefile2
-rw-r--r--arch/x86/hyperv/hv_apic.c5
-rw-r--r--arch/x86/hyperv/hv_init.c35
-rw-r--r--arch/x86/hyperv/hv_proc.c198
-rw-r--r--arch/x86/hyperv/hv_vtl.c34
-rw-r--r--arch/x86/hyperv/irqdomain.c6
-rw-r--r--arch/x86/hyperv/ivm.c2
-rw-r--r--arch/x86/hyperv/mmu.c5
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/alternative.h47
-rw-r--r--arch/x86/include/asm/amd-ibs.h3
-rw-r--r--arch/x86/include/asm/amd_nb.h1
-rw-r--r--arch/x86/include/asm/amd_node.h24
-rw-r--r--arch/x86/include/asm/apic.h4
-rw-r--r--arch/x86/include/asm/arch_hweight.h14
-rw-r--r--arch/x86/include/asm/asm-prototypes.h4
-rw-r--r--arch/x86/include/asm/asm.h21
-rw-r--r--arch/x86/include/asm/atomic.h14
-rw-r--r--arch/x86/include/asm/atomic64_32.h98
-rw-r--r--arch/x86/include/asm/atomic64_64.h14
-rw-r--r--arch/x86/include/asm/barrier.h8
-rw-r--r--arch/x86/include/asm/bitops.h14
-rw-r--r--arch/x86/include/asm/boot.h2
-rw-r--r--arch/x86/include/asm/bug.h8
-rw-r--r--arch/x86/include/asm/cfi.h26
-rw-r--r--arch/x86/include/asm/cmpxchg.h28
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h42
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h4
-rw-r--r--arch/x86/include/asm/coco.h10
-rw-r--r--arch/x86/include/asm/cpu.h15
-rw-r--r--arch/x86/include/asm/cpu_device_id.h130
-rw-r--r--arch/x86/include/asm/cpufeature.h81
-rw-r--r--arch/x86/include/asm/cpufeatures.h16
-rw-r--r--arch/x86/include/asm/cpuid.h216
-rw-r--r--arch/x86/include/asm/cpuid/api.h210
-rw-r--r--arch/x86/include/asm/cpuid/types.h32
-rw-r--r--arch/x86/include/asm/cpumask.h4
-rw-r--r--arch/x86/include/asm/current.h40
-rw-r--r--arch/x86/include/asm/desc.h1
-rw-r--r--arch/x86/include/asm/desc_defs.h4
-rw-r--r--arch/x86/include/asm/disabled-features.h161
-rw-r--r--arch/x86/include/asm/dwarf2.h2
-rw-r--r--arch/x86/include/asm/e820/api.h1
-rw-r--r--arch/x86/include/asm/e820/types.h9
-rw-r--r--arch/x86/include/asm/edac.h2
-rw-r--r--arch/x86/include/asm/elf.h5
-rw-r--r--arch/x86/include/asm/fixmap.h4
-rw-r--r--arch/x86/include/asm/fpu/api.h17
-rw-r--r--arch/x86/include/asm/frame.h10
-rw-r--r--arch/x86/include/asm/fred.h4
-rw-r--r--arch/x86/include/asm/fsgsbase.h4
-rw-r--r--arch/x86/include/asm/ftrace.h24
-rw-r--r--arch/x86/include/asm/hardirq.h4
-rw-r--r--arch/x86/include/asm/hw_irq.h4
-rw-r--r--arch/x86/include/asm/ibt.h25
-rw-r--r--arch/x86/include/asm/idtentry.h6
-rw-r--r--arch/x86/include/asm/init.h4
-rw-r--r--arch/x86/include/asm/inst.h2
-rw-r--r--arch/x86/include/asm/intel-family.h50
-rw-r--r--arch/x86/include/asm/io.h3
-rw-r--r--arch/x86/include/asm/irq_stack.h12
-rw-r--r--arch/x86/include/asm/irqflags.h10
-rw-r--r--arch/x86/include/asm/jump_label.h4
-rw-r--r--arch/x86/include/asm/kasan.h2
-rw-r--r--arch/x86/include/asm/kexec.h62
-rw-r--r--arch/x86/include/asm/kvm_host.h21
-rw-r--r--arch/x86/include/asm/linkage.h24
-rw-r--r--arch/x86/include/asm/mce.h2
-rw-r--r--arch/x86/include/asm/mem_encrypt.h4
-rw-r--r--arch/x86/include/asm/mmu.h12
-rw-r--r--arch/x86/include/asm/mmu_context.h10
-rw-r--r--arch/x86/include/asm/mshyperv.h26
-rw-r--r--arch/x86/include/asm/msr-index.h21
-rw-r--r--arch/x86/include/asm/msr.h4
-rw-r--r--arch/x86/include/asm/nmi.h2
-rw-r--r--arch/x86/include/asm/nops.h2
-rw-r--r--arch/x86/include/asm/nospec-branch.h21
-rw-r--r--arch/x86/include/asm/orc_types.h4
-rw-r--r--arch/x86/include/asm/page.h4
-rw-r--r--arch/x86/include/asm/page_32.h4
-rw-r--r--arch/x86/include/asm/page_32_types.h8
-rw-r--r--arch/x86/include/asm/page_64.h9
-rw-r--r--arch/x86/include/asm/page_64_types.h2
-rw-r--r--arch/x86/include/asm/page_types.h4
-rw-r--r--arch/x86/include/asm/paravirt.h19
-rw-r--r--arch/x86/include/asm/paravirt_types.h20
-rw-r--r--arch/x86/include/asm/percpu.h153
-rw-r--r--arch/x86/include/asm/perf_event.h20
-rw-r--r--arch/x86/include/asm/pgalloc.h5
-rw-r--r--arch/x86/include/asm/pgtable-2level_types.h4
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h4
-rw-r--r--arch/x86/include/asm/pgtable-invert.h4
-rw-r--r--arch/x86/include/asm/pgtable.h12
-rw-r--r--arch/x86/include/asm/pgtable_32.h4
-rw-r--r--arch/x86/include/asm/pgtable_32_areas.h2
-rw-r--r--arch/x86/include/asm/pgtable_64.h6
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h4
-rw-r--r--arch/x86/include/asm/pgtable_types.h14
-rw-r--r--arch/x86/include/asm/preempt.h25
-rw-r--r--arch/x86/include/asm/processor.h67
-rw-r--r--arch/x86/include/asm/prom.h4
-rw-r--r--arch/x86/include/asm/proto.h3
-rw-r--r--arch/x86/include/asm/pti.h4
-rw-r--r--arch/x86/include/asm/ptrace.h4
-rw-r--r--arch/x86/include/asm/purgatory.h4
-rw-r--r--arch/x86/include/asm/pvclock-abi.h4
-rw-r--r--arch/x86/include/asm/realmode.h4
-rw-r--r--arch/x86/include/asm/required-features.h105
-rw-r--r--arch/x86/include/asm/resctrl.h36
-rw-r--r--arch/x86/include/asm/rmwcc.h2
-rw-r--r--arch/x86/include/asm/runtime-const.h13
-rw-r--r--arch/x86/include/asm/segment.h8
-rw-r--r--arch/x86/include/asm/set_memory.h2
-rw-r--r--arch/x86/include/asm/setup.h7
-rw-r--r--arch/x86/include/asm/setup_data.h4
-rw-r--r--arch/x86/include/asm/sev-common.h12
-rw-r--r--arch/x86/include/asm/shared/tdx.h4
-rw-r--r--arch/x86/include/asm/shstk.h4
-rw-r--r--arch/x86/include/asm/signal.h8
-rw-r--r--arch/x86/include/asm/smap.h6
-rw-r--r--arch/x86/include/asm/smp.h24
-rw-r--r--arch/x86/include/asm/special_insns.h22
-rw-r--r--arch/x86/include/asm/sta2x11.h13
-rw-r--r--arch/x86/include/asm/stackprotector.h36
-rw-r--r--arch/x86/include/asm/string_64.h2
-rw-r--r--arch/x86/include/asm/svm.h5
-rw-r--r--arch/x86/include/asm/sync_bitops.h12
-rw-r--r--arch/x86/include/asm/tdx.h4
-rw-r--r--arch/x86/include/asm/thread_info.h12
-rw-r--r--arch/x86/include/asm/tlb.h138
-rw-r--r--arch/x86/include/asm/tlbbatch.h5
-rw-r--r--arch/x86/include/asm/tlbflush.h72
-rw-r--r--arch/x86/include/asm/topology.h2
-rw-r--r--arch/x86/include/asm/traps.h2
-rw-r--r--arch/x86/include/asm/unwind_hints.h4
-rw-r--r--arch/x86/include/asm/vdso.h6
-rw-r--r--arch/x86/include/asm/vdso/getrandom.h14
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h41
-rw-r--r--arch/x86/include/asm/vdso/processor.h4
-rw-r--r--arch/x86/include/asm/vdso/vsyscall.h27
-rw-r--r--arch/x86/include/asm/vermagic.h4
-rw-r--r--arch/x86/include/asm/vmx.h28
-rw-r--r--arch/x86/include/asm/xen/interface.h10
-rw-r--r--arch/x86/include/asm/xen/interface_32.h4
-rw-r--r--arch/x86/include/asm/xen/interface_64.h4
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h4
-rw-r--r--arch/x86/include/uapi/asm/e820.h4
-rw-r--r--arch/x86/include/uapi/asm/kvm.h3
-rw-r--r--arch/x86/include/uapi/asm/ldt.h4
-rw-r--r--arch/x86/include/uapi/asm/msr.h4
-rw-r--r--arch/x86/include/uapi/asm/ptrace-abi.h6
-rw-r--r--arch/x86/include/uapi/asm/ptrace.h4
-rw-r--r--arch/x86/include/uapi/asm/setup_data.h4
-rw-r--r--arch/x86/include/uapi/asm/signal.h8
-rw-r--r--arch/x86/include/uapi/asm/svm.h2
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/cppc.c4
-rw-r--r--arch/x86/kernel/acpi/cstate.c19
-rw-r--r--arch/x86/kernel/acpi/madt_playdead.S1
-rw-r--r--arch/x86/kernel/acpi/madt_wakeup.c73
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S1
-rw-r--r--arch/x86/kernel/alternative.c645
-rw-r--r--arch/x86/kernel/amd_nb.c1
-rw-r--r--arch/x86/kernel/amd_node.c149
-rw-r--r--arch/x86/kernel/apic/Makefile3
-rw-r--r--arch/x86/kernel/apic/apic.c7
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c105
-rw-r--r--arch/x86/kernel/apic/ipi.c33
-rw-r--r--arch/x86/kernel/apic/local.h13
-rw-r--r--arch/x86/kernel/apic/probe_32.c29
-rw-r--r--arch/x86/kernel/apic/vector.c231
-rw-r--r--arch/x86/kernel/asm-offsets.c5
-rw-r--r--arch/x86/kernel/asm-offsets_64.c6
-rw-r--r--arch/x86/kernel/bootflag.c29
-rw-r--r--arch/x86/kernel/callthunks.c13
-rw-r--r--arch/x86/kernel/cfi.c26
-rw-r--r--arch/x86/kernel/cpu/amd.c30
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c2
-rw-r--r--arch/x86/kernel/cpu/bugs.c121
-rw-r--r--arch/x86/kernel/cpu/bus_lock.c20
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c31
-rw-r--r--arch/x86/kernel/cpu/common.c241
-rw-r--r--arch/x86/kernel/cpu/cpu.h8
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c35
-rw-r--r--arch/x86/kernel/cpu/debugfs.c4
-rw-r--r--arch/x86/kernel/cpu/hygon.c16
-rw-r--r--arch/x86/kernel/cpu/intel.c297
-rw-r--r--arch/x86/kernel/cpu/match.c30
-rw-r--r--arch/x86/kernel/cpu/mce/core.c44
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c40
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c17
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c6
-rw-r--r--arch/x86/kernel/cpu/proc.c7
-rw-r--r--arch/x86/kernel/cpu/resctrl/Makefile5
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c181
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c93
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h201
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c119
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c55
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c284
-rw-r--r--arch/x86/kernel/crash.c2
-rw-r--r--arch/x86/kernel/dumpstack.c9
-rw-r--r--arch/x86/kernel/dumpstack_32.c4
-rw-r--r--arch/x86/kernel/dumpstack_64.c2
-rw-r--r--arch/x86/kernel/e820.c115
-rw-r--r--arch/x86/kernel/early_printk.c49
-rw-r--r--arch/x86/kernel/fpu/core.c23
-rw-r--r--arch/x86/kernel/fpu/internal.h2
-rw-r--r--arch/x86/kernel/fpu/signal.c11
-rw-r--r--arch/x86/kernel/fpu/xstate.c30
-rw-r--r--arch/x86/kernel/fpu/xstate.h31
-rw-r--r--arch/x86/kernel/ftrace.c30
-rw-r--r--arch/x86/kernel/ftrace_64.S5
-rw-r--r--arch/x86/kernel/head64.c4
-rw-r--r--arch/x86/kernel/head_64.S24
-rw-r--r--arch/x86/kernel/i8259.c1
-rw-r--r--arch/x86/kernel/ioport.c2
-rw-r--r--arch/x86/kernel/irq.c5
-rw-r--r--arch/x86/kernel/irq_32.c51
-rw-r--r--arch/x86/kernel/irq_64.c8
-rw-r--r--arch/x86/kernel/irqflags.S1
-rw-r--r--arch/x86/kernel/kprobes/core.c11
-rw-r--r--arch/x86/kernel/kvm.c1
-rw-r--r--arch/x86/kernel/module.c81
-rw-r--r--arch/x86/kernel/nmi.c42
-rw-r--r--arch/x86/kernel/paravirt.c48
-rw-r--r--arch/x86/kernel/process.c17
-rw-r--r--arch/x86/kernel/process_32.c11
-rw-r--r--arch/x86/kernel/process_64.c31
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c12
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S23
-rw-r--r--arch/x86/kernel/setup.c200
-rw-r--r--arch/x86/kernel/setup_percpu.c15
-rw-r--r--arch/x86/kernel/signal_32.c62
-rw-r--r--arch/x86/kernel/smpboot.c92
-rw-r--r--arch/x86/kernel/tboot.c3
-rw-r--r--arch/x86/kernel/traps.c150
-rw-r--r--arch/x86/kernel/tsc.c4
-rw-r--r--arch/x86/kernel/tsc_msr.c2
-rw-r--r--arch/x86/kernel/uprobes.c14
-rw-r--r--arch/x86/kernel/verify_cpu.S4
-rw-r--r--arch/x86/kernel/vmlinux.lds.S43
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.c70
-rw-r--r--arch/x86/kvm/cpuid.h9
-rw-r--r--arch/x86/kvm/emulate.c5
-rw-r--r--arch/x86/kvm/hyperv.c3
-rw-r--r--arch/x86/kvm/i8254.c3
-rw-r--r--arch/x86/kvm/i8259.c2
-rw-r--r--arch/x86/kvm/kvm_emulate.h7
-rw-r--r--arch/x86/kvm/lapic.c22
-rw-r--r--arch/x86/kvm/mmu/mmu.c365
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h3
-rw-r--r--arch/x86/kvm/mmu/spte.c31
-rw-r--r--arch/x86/kvm/mmu/spte.h2
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h34
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c45
-rw-r--r--arch/x86/kvm/smm.c2
-rw-r--r--arch/x86/kvm/svm/nested.c2
-rw-r--r--arch/x86/kvm/svm/sev.c373
-rw-r--r--arch/x86/kvm/svm/svm.c65
-rw-r--r--arch/x86/kvm/svm/svm.h39
-rw-r--r--arch/x86/kvm/trace.h14
-rw-r--r--arch/x86/kvm/vmx/nested.c23
-rw-r--r--arch/x86/kvm/vmx/nested.h22
-rw-r--r--arch/x86/kvm/vmx/vmx.c227
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h18
-rw-r--r--arch/x86/kvm/x86.c342
-rw-r--r--arch/x86/kvm/x86.h8
-rw-r--r--arch/x86/kvm/xen.c125
-rw-r--r--arch/x86/kvm/xen.h30
-rw-r--r--arch/x86/lib/Makefile5
-rw-r--r--arch/x86/lib/bhi.S147
-rw-r--r--arch/x86/lib/clear_page_64.S9
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S2
-rw-r--r--arch/x86/lib/copy_page_64.S3
-rw-r--r--arch/x86/lib/copy_user_64.S3
-rw-r--r--arch/x86/lib/copy_user_uncached_64.S2
-rw-r--r--arch/x86/lib/delay.c2
-rw-r--r--arch/x86/lib/getuser.S16
-rw-r--r--arch/x86/lib/hweight.S3
-rw-r--r--arch/x86/lib/memmove_64.S3
-rw-r--r--arch/x86/lib/memset_64.S3
-rw-r--r--arch/x86/lib/msr-reg.S3
-rw-r--r--arch/x86/lib/msr.c2
-rw-r--r--arch/x86/lib/putuser.S9
-rw-r--r--arch/x86/lib/retpoline.S3
-rw-r--r--arch/x86/lib/usercopy_64.c2
-rw-r--r--arch/x86/math-emu/control_w.h2
-rw-r--r--arch/x86/math-emu/exception.h6
-rw-r--r--arch/x86/math-emu/fpu_emu.h6
-rw-r--r--arch/x86/math-emu/status_w.h6
-rw-r--r--arch/x86/mm/ident_map.c14
-rw-r--r--arch/x86/mm/init.c9
-rw-r--r--arch/x86/mm/init_32.c9
-rw-r--r--arch/x86/mm/init_64.c8
-rw-r--r--arch/x86/mm/ioremap.c8
-rw-r--r--arch/x86/mm/kasan_init_64.c1
-rw-r--r--arch/x86/mm/kaslr.c10
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c2
-rw-r--r--arch/x86/mm/mem_encrypt_boot.S1
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c6
-rw-r--r--arch/x86/mm/mmap.c9
-rw-r--r--arch/x86/mm/pat/cpa-test.c2
-rw-r--r--arch/x86/mm/pat/memtype.c6
-rw-r--r--arch/x86/mm/pat/set_memory.c243
-rw-r--r--arch/x86/mm/pgtable.c54
-rw-r--r--arch/x86/mm/tlb.c432
-rw-r--r--arch/x86/net/bpf_jit_comp.c34
-rw-r--r--arch/x86/pci/Makefile2
-rw-r--r--arch/x86/pci/sta2x11-fixup.c233
-rw-r--r--arch/x86/pci/xen.c8
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-sci.c1
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c3
-rw-r--r--arch/x86/platform/pvh/head.S14
-rw-r--r--arch/x86/power/hibernate_asm_64.S2
-rw-r--r--arch/x86/realmode/rm/realmode.h4
-rw-r--r--arch/x86/realmode/rm/wakeup.h2
-rwxr-xr-xarch/x86/tools/cpufeaturemasks.awk88
-rw-r--r--arch/x86/tools/relocs.c147
-rw-r--r--arch/x86/virt/svm/sev.c1
-rw-r--r--arch/x86/xen/Kconfig2
-rw-r--r--arch/x86/xen/enlighten_pv.c69
-rw-r--r--arch/x86/xen/mmu_pv.c1
-rw-r--r--arch/x86/xen/smp_pv.c2
-rw-r--r--arch/x86/xen/xen-asm.S5
-rw-r--r--arch/x86/xen/xen-head.S12
-rw-r--r--arch/xtensa/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/xtensa/kernel/traps.c6
677 files changed, 11054 insertions, 9084 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index b8a4ff365582..9f6eb09ef12d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1584,6 +1584,10 @@ config HAVE_SPARSE_SYSCALL_NR
entries at 4000, 5000 and 6000 locations. This option turns on syscall
related optimizations for a given architecture.
+config ARCH_HAS_VDSO_ARCH_DATA
+ depends on GENERIC_VDSO_DATA_STORE
+ bool
+
config ARCH_HAS_VDSO_TIME_DATA
bool
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index c59d53d6d3f3..2dd6340de6b4 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -506,3 +506,4 @@
574 common getxattrat sys_getxattrat
575 common listxattrat sys_listxattrat
576 common removexattrat sys_removexattrat
+577 common open_tree_attr sys_open_tree_attr
diff --git a/arch/arm/boot/dts/broadcom/bcm2711-rpi.dtsi b/arch/arm/boot/dts/broadcom/bcm2711-rpi.dtsi
index 6bf4241fe3b7..c78ed064d166 100644
--- a/arch/arm/boot/dts/broadcom/bcm2711-rpi.dtsi
+++ b/arch/arm/boot/dts/broadcom/bcm2711-rpi.dtsi
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcm2835-rpi.dtsi"
-#include <dt-bindings/power/raspberrypi-power.h>
#include <dt-bindings/reset/raspberrypi,firmware-reset.h>
/ {
@@ -101,7 +100,3 @@
&vchiq {
interrupts = <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>;
};
-
-&xhci {
- power-domains = <&power RPI_POWER_DOMAIN_USB>;
-};
diff --git a/arch/arm/boot/dts/broadcom/bcm2711.dtsi b/arch/arm/boot/dts/broadcom/bcm2711.dtsi
index e4e42af21ef3..c06d9f5e53c8 100644
--- a/arch/arm/boot/dts/broadcom/bcm2711.dtsi
+++ b/arch/arm/boot/dts/broadcom/bcm2711.dtsi
@@ -134,7 +134,7 @@
clocks = <&clocks BCM2835_CLOCK_UART>,
<&clocks BCM2835_CLOCK_VPU>;
clock-names = "uartclk", "apb_pclk";
- arm,primecell-periphid = <0x00241011>;
+ arm,primecell-periphid = <0x00341011>;
status = "disabled";
};
@@ -145,7 +145,7 @@
clocks = <&clocks BCM2835_CLOCK_UART>,
<&clocks BCM2835_CLOCK_VPU>;
clock-names = "uartclk", "apb_pclk";
- arm,primecell-periphid = <0x00241011>;
+ arm,primecell-periphid = <0x00341011>;
status = "disabled";
};
@@ -156,7 +156,7 @@
clocks = <&clocks BCM2835_CLOCK_UART>,
<&clocks BCM2835_CLOCK_VPU>;
clock-names = "uartclk", "apb_pclk";
- arm,primecell-periphid = <0x00241011>;
+ arm,primecell-periphid = <0x00341011>;
status = "disabled";
};
@@ -167,7 +167,7 @@
clocks = <&clocks BCM2835_CLOCK_UART>,
<&clocks BCM2835_CLOCK_VPU>;
clock-names = "uartclk", "apb_pclk";
- arm,primecell-periphid = <0x00241011>;
+ arm,primecell-periphid = <0x00341011>;
status = "disabled";
};
@@ -451,8 +451,6 @@
IRQ_TYPE_LEVEL_LOW)>,
<GIC_PPI 10 (GIC_CPU_MASK_SIMPLE(4) |
IRQ_TYPE_LEVEL_LOW)>;
- /* This only applies to the ARMv7 stub */
- arm,cpu-registers-not-fw-configured;
};
cpus: cpus {
@@ -610,6 +608,7 @@
#address-cells = <1>;
#size-cells = <0>;
interrupts = <GIC_SPI 176 IRQ_TYPE_LEVEL_HIGH>;
+ power-domains = <&pm BCM2835_POWER_DOMAIN_USB>;
/* DWC2 and this IP block share the same USB PHY,
* enabling both at the same time results in lockups.
* So keep this node disabled and let the bootloader
@@ -1177,6 +1176,7 @@
};
&uart0 {
+ arm,primecell-periphid = <0x00341011>;
interrupts = <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>;
};
diff --git a/arch/arm/boot/dts/broadcom/bcm4709-asus-rt-ac3200.dts b/arch/arm/boot/dts/broadcom/bcm4709-asus-rt-ac3200.dts
index 53cb0c58f6d0..3da2daee0c84 100644
--- a/arch/arm/boot/dts/broadcom/bcm4709-asus-rt-ac3200.dts
+++ b/arch/arm/boot/dts/broadcom/bcm4709-asus-rt-ac3200.dts
@@ -124,19 +124,19 @@
};
port@1 {
- label = "lan1";
+ label = "lan4";
};
port@2 {
- label = "lan2";
+ label = "lan3";
};
port@3 {
- label = "lan3";
+ label = "lan2";
};
port@4 {
- label = "lan4";
+ label = "lan1";
};
};
};
diff --git a/arch/arm/boot/dts/broadcom/bcm47094-asus-rt-ac5300.dts b/arch/arm/boot/dts/broadcom/bcm47094-asus-rt-ac5300.dts
index 6c666dc7ad23..01ec8c03686a 100644
--- a/arch/arm/boot/dts/broadcom/bcm47094-asus-rt-ac5300.dts
+++ b/arch/arm/boot/dts/broadcom/bcm47094-asus-rt-ac5300.dts
@@ -126,11 +126,11 @@
ports {
port@0 {
- label = "lan4";
+ label = "wan";
};
port@1 {
- label = "lan3";
+ label = "lan1";
};
port@2 {
@@ -138,11 +138,11 @@
};
port@3 {
- label = "lan1";
+ label = "lan3";
};
port@4 {
- label = "wan";
+ label = "lan4";
};
};
};
diff --git a/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi b/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi
index dffab5aa8b9c..88be29166c1a 100644
--- a/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi
+++ b/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi
@@ -108,6 +108,11 @@
};
};
+ poweroff {
+ compatible = "regulator-poweroff";
+ cpu-supply = <&vgen2_reg>;
+ };
+
reg_module_3v3: regulator-module-3v3 {
compatible = "regulator-fixed";
regulator-always-on;
@@ -236,10 +241,6 @@
status = "disabled";
};
-&clks {
- fsl,pmic-stby-poweroff;
-};
-
/* Apalis SPI1 */
&ecspi1 {
cs-gpios = <&gpio5 25 GPIO_ACTIVE_LOW>;
@@ -527,7 +528,6 @@
pmic: pmic@8 {
compatible = "fsl,pfuze100";
- fsl,pmic-stby-poweroff;
reg = <0x08>;
regulators {
diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h
index 1815748f5d2a..bae5edf348ef 100644
--- a/arch/arm/include/asm/io.h
+++ b/arch/arm/include/asm/io.h
@@ -381,7 +381,7 @@ void __iomem *ioremap_wc(resource_size_t res_cookie, size_t size);
void iounmap(volatile void __iomem *io_addr);
#define iounmap iounmap
-void *arch_memremap_wb(phys_addr_t phys_addr, size_t size);
+void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags);
#define arch_memremap_wb arch_memremap_wb
/*
diff --git a/arch/arm/include/asm/vdso.h b/arch/arm/include/asm/vdso.h
index 5b85889f82ee..88364a6727ff 100644
--- a/arch/arm/include/asm/vdso.h
+++ b/arch/arm/include/asm/vdso.h
@@ -4,6 +4,8 @@
#ifdef __KERNEL__
+#define __VDSO_PAGES 4
+
#ifndef __ASSEMBLY__
struct mm_struct;
diff --git a/arch/arm/include/asm/vdso/gettimeofday.h b/arch/arm/include/asm/vdso/gettimeofday.h
index 592d3d015ca7..1e9f81639c88 100644
--- a/arch/arm/include/asm/vdso/gettimeofday.h
+++ b/arch/arm/include/asm/vdso/gettimeofday.h
@@ -112,7 +112,7 @@ static inline bool arm_vdso_hres_capable(void)
#define __arch_vdso_hres_capable arm_vdso_hres_capable
static __always_inline u64 __arch_get_hw_counter(int clock_mode,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
#ifdef CONFIG_ARM_ARCH_TIMER
u64 cycle_now;
@@ -135,11 +135,6 @@ static __always_inline u64 __arch_get_hw_counter(int clock_mode,
#endif
}
-static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
-{
- return _vdso_data;
-}
-
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/arm/include/asm/vdso/vsyscall.h b/arch/arm/include/asm/vdso/vsyscall.h
index 705414710dcd..4e7226ad02ec 100644
--- a/arch/arm/include/asm/vdso/vsyscall.h
+++ b/arch/arm/include/asm/vdso/vsyscall.h
@@ -7,22 +7,14 @@
#include <vdso/datapage.h>
#include <asm/cacheflush.h>
-extern struct vdso_data *vdso_data;
extern bool cntvct_ok;
static __always_inline
-struct vdso_data *__arm_get_k_vdso_data(void)
-{
- return vdso_data;
-}
-#define __arch_get_k_vdso_data __arm_get_k_vdso_data
-
-static __always_inline
-void __arm_sync_vdso_data(struct vdso_data *vdata)
+void __arch_sync_vdso_time_data(struct vdso_time_data *vdata)
{
flush_dcache_page(virt_to_page(vdata));
}
-#define __arch_sync_vdso_data __arm_sync_vdso_data
+#define __arch_sync_vdso_time_data __arch_sync_vdso_time_data
/* The asm-generic header needs to be included after the definitions above */
#include <asm-generic/vdso/vsyscall.h>
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 4853875740d0..123f4a8ef446 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -153,10 +153,6 @@ int main(void)
DEFINE(CACHE_WRITEBACK_ORDER, __CACHE_WRITEBACK_ORDER);
DEFINE(CACHE_WRITEBACK_GRANULE, __CACHE_WRITEBACK_GRANULE);
BLANK();
-#ifdef CONFIG_VDSO
- DEFINE(VDSO_DATA_SIZE, sizeof(union vdso_data_store));
-#endif
- BLANK();
#ifdef CONFIG_ARM_MPU
DEFINE(MPU_RNG_INFO_RNGS, offsetof(struct mpu_rgn_info, rgns));
DEFINE(MPU_RNG_INFO_USED, offsetof(struct mpu_rgn_info, used));
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 6ea645939573..afbd2ebe5c39 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -258,13 +258,6 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl)
barrier();
}
-#ifdef CONFIG_PREEMPT
-#define S_PREEMPT " PREEMPT"
-#elif defined(CONFIG_PREEMPT_RT)
-#define S_PREEMPT " PREEMPT_RT"
-#else
-#define S_PREEMPT ""
-#endif
#ifdef CONFIG_SMP
#define S_SMP " SMP"
#else
@@ -282,8 +275,8 @@ static int __die(const char *str, int err, struct pt_regs *regs)
static int die_counter;
int ret;
- pr_emerg("Internal error: %s: %x [#%d]" S_PREEMPT S_SMP S_ISA "\n",
- str, err, ++die_counter);
+ pr_emerg("Internal error: %s: %x [#%d]" S_SMP S_ISA "\n",
+ str, err, ++die_counter);
/* trap and error numbers are mostly meaningless on ARM */
ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, SIGSEGV);
diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c
index 29dd2f3c62fe..325448ffbba0 100644
--- a/arch/arm/kernel/vdso.c
+++ b/arch/arm/kernel/vdso.c
@@ -7,6 +7,7 @@
*/
#include <linux/cache.h>
+#include <linux/vdso_datastore.h>
#include <linux/elf.h>
#include <linux/err.h>
#include <linux/kernel.h>
@@ -33,15 +34,6 @@ extern char vdso_start[], vdso_end[];
/* Total number of pages needed for the data and text portions of the VDSO. */
unsigned int vdso_total_pages __ro_after_init;
-static union vdso_data_store vdso_data_store __page_aligned_data;
-struct vdso_data *vdso_data = vdso_data_store.data;
-
-static struct page *vdso_data_page __ro_after_init;
-static const struct vm_special_mapping vdso_data_mapping = {
- .name = "[vvar]",
- .pages = &vdso_data_page,
-};
-
static int vdso_mremap(const struct vm_special_mapping *sm,
struct vm_area_struct *new_vma)
{
@@ -192,9 +184,6 @@ static int __init vdso_init(void)
if (vdso_text_pagelist == NULL)
return -ENOMEM;
- /* Grab the VDSO data page. */
- vdso_data_page = virt_to_page(vdso_data);
-
/* Grab the VDSO text pages. */
for (i = 0; i < text_pages; i++) {
struct page *page;
@@ -205,7 +194,7 @@ static int __init vdso_init(void)
vdso_text_mapping.pages = vdso_text_pagelist;
- vdso_total_pages = 1; /* for the data/vvar page */
+ vdso_total_pages = VDSO_NR_PAGES; /* for the data/vvar pages */
vdso_total_pages += text_pages;
cntvct_ok = cntvct_functional();
@@ -216,16 +205,7 @@ static int __init vdso_init(void)
}
arch_initcall(vdso_init);
-static int install_vvar(struct mm_struct *mm, unsigned long addr)
-{
- struct vm_area_struct *vma;
-
- vma = _install_special_mapping(mm, addr, PAGE_SIZE,
- VM_READ | VM_MAYREAD,
- &vdso_data_mapping);
-
- return PTR_ERR_OR_ZERO(vma);
-}
+static_assert(__VDSO_PAGES == VDSO_NR_PAGES);
/* assumes mmap_lock is write-locked */
void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
@@ -238,12 +218,12 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
if (vdso_text_pagelist == NULL)
return;
- if (install_vvar(mm, addr))
+ if (IS_ERR(vdso_install_vvar_mapping(mm, addr)))
return;
- /* Account for vvar page. */
- addr += PAGE_SIZE;
- len = (vdso_total_pages - 1) << PAGE_SHIFT;
+ /* Account for vvar pages. */
+ addr += VDSO_NR_PAGES * PAGE_SIZE;
+ len = (vdso_total_pages - VDSO_NR_PAGES) << PAGE_SHIFT;
vma = _install_special_mapping(mm, addr, len,
VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
diff --git a/arch/arm/mach-davinci/Kconfig b/arch/arm/mach-davinci/Kconfig
index 2a8a9fe46586..3fa15f342240 100644
--- a/arch/arm/mach-davinci/Kconfig
+++ b/arch/arm/mach-davinci/Kconfig
@@ -27,6 +27,7 @@ config ARCH_DAVINCI_DA830
config ARCH_DAVINCI_DA850
bool "DA850/OMAP-L138/AM18x based system"
+ select ARCH_DAVINCI_DA8XX
select DAVINCI_CP_INTC
config ARCH_DAVINCI_DA8XX
diff --git a/arch/arm/mach-davinci/da830.c b/arch/arm/mach-davinci/da830.c
index 2e497745b624..a044ea5cb4f1 100644
--- a/arch/arm/mach-davinci/da830.c
+++ b/arch/arm/mach-davinci/da830.c
@@ -11,7 +11,6 @@
#include <linux/gpio.h>
#include <linux/init.h>
#include <linux/io.h>
-#include <linux/irqchip/irq-davinci-cp-intc.h>
#include <clocksource/timer-davinci.h>
diff --git a/arch/arm/mach-imx/mmdc.c b/arch/arm/mach-imx/mmdc.c
index e898f7c2733e..94e4f4a2f73f 100644
--- a/arch/arm/mach-imx/mmdc.c
+++ b/arch/arm/mach-imx/mmdc.c
@@ -509,9 +509,8 @@ static int imx_mmdc_perf_init(struct platform_device *pdev, void __iomem *mmdc_b
pmu_mmdc->mmdc_ipg_clk = mmdc_ipg_clk;
pmu_mmdc->devtype_data = device_get_match_data(&pdev->dev);
- hrtimer_init(&pmu_mmdc->hrtimer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
- pmu_mmdc->hrtimer.function = mmdc_pmu_timer_handler;
+ hrtimer_setup(&pmu_mmdc->hrtimer, mmdc_pmu_timer_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
cpumask_set_cpu(raw_smp_processor_id(), &pmu_mmdc->cpu);
diff --git a/arch/arm/mach-omap1/Kconfig b/arch/arm/mach-omap1/Kconfig
index a643b71e30a3..08ec6bd84ada 100644
--- a/arch/arm/mach-omap1/Kconfig
+++ b/arch/arm/mach-omap1/Kconfig
@@ -8,6 +8,7 @@ menuconfig ARCH_OMAP1
select ARCH_OMAP
select CLKSRC_MMIO
select FORCE_PCI if PCCARD
+ select GENERIC_IRQ_CHIP
select GPIOLIB
help
Support for older TI OMAP1 (omap7xx, omap15xx or omap16xx)
diff --git a/arch/arm/mach-shmobile/headsmp.S b/arch/arm/mach-shmobile/headsmp.S
index a956b489b6ea..2bc7e73a8582 100644
--- a/arch/arm/mach-shmobile/headsmp.S
+++ b/arch/arm/mach-shmobile/headsmp.S
@@ -136,6 +136,7 @@ ENDPROC(shmobile_smp_sleep)
.long shmobile_smp_arg - 1b
.bss
+ .align 2
.globl shmobile_smp_mpidr
shmobile_smp_mpidr:
.space NR_CPUS * 4
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 2b6f50dd5478..5c1023a6d78c 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -928,6 +928,7 @@ config VDSO
select GENERIC_TIME_VSYSCALL
select GENERIC_VDSO_32
select GENERIC_GETTIMEOFDAY
+ select GENERIC_VDSO_DATA_STORE
help
Place in the process address space an ELF shared object
providing fast implementations of gettimeofday and
diff --git a/arch/arm/mm/cache-l2x0-pmu.c b/arch/arm/mm/cache-l2x0-pmu.c
index 993fefdc167a..93ef0502b7ff 100644
--- a/arch/arm/mm/cache-l2x0-pmu.c
+++ b/arch/arm/mm/cache-l2x0-pmu.c
@@ -539,8 +539,7 @@ static __init int l2x0_pmu_init(void)
* at higher frequencies.
*/
l2x0_pmu_poll_period = ms_to_ktime(1000);
- hrtimer_init(&l2x0_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- l2x0_pmu_hrtimer.function = l2x0_pmu_poll;
+ hrtimer_setup(&l2x0_pmu_hrtimer, l2x0_pmu_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cpumask_set_cpu(0, &pmu_cpu);
ret = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_ARM_L2X0_ONLINE,
diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 89f1c97f3079..748698e91a4b 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -436,7 +436,7 @@ void __arm_iomem_set_ro(void __iomem *ptr, size_t size)
set_memory_ro((unsigned long)ptr, PAGE_ALIGN(size) / PAGE_SIZE);
}
-void *arch_memremap_wb(phys_addr_t phys_addr, size_t size)
+void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags)
{
return (__force void *)arch_ioremap_caller(phys_addr, size,
MT_MEMORY_RW,
diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c
index 1a8f6914ee59..d638cc87807e 100644
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@ -248,7 +248,7 @@ void __iomem *pci_remap_cfgspace(resource_size_t res_cookie, size_t size)
EXPORT_SYMBOL_GPL(pci_remap_cfgspace);
#endif
-void *arch_memremap_wb(phys_addr_t phys_addr, size_t size)
+void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags)
{
return (void *)phys_addr;
}
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 49eeb2ad8dbd..27c1d5ebcd91 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -481,3 +481,4 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
index 8a306bbec4a0..cb044bfd145d 100644
--- a/arch/arm/vdso/Makefile
+++ b/arch/arm/vdso/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
# Include the generic Makefile to check the built vdso.
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
hostprogs := vdsomunge
diff --git a/arch/arm/vdso/vdso.lds.S b/arch/arm/vdso/vdso.lds.S
index 9bfa0f52923c..7c08371f4400 100644
--- a/arch/arm/vdso/vdso.lds.S
+++ b/arch/arm/vdso/vdso.lds.S
@@ -11,16 +11,16 @@
*/
#include <linux/const.h>
-#include <asm/asm-offsets.h>
#include <asm/page.h>
#include <asm/vdso.h>
+#include <vdso/datapage.h>
OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm")
OUTPUT_ARCH(arm)
SECTIONS
{
- PROVIDE(_vdso_data = . - VDSO_DATA_SIZE);
+ VDSO_VVAR_SYMS
. = SIZEOF_HEADERS;
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 940343beb3d4..f2d724c1dfd2 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -162,6 +162,7 @@ config ARM64
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
+ select GENERIC_VDSO_DATA_STORE
select GENERIC_VDSO_TIME_NS
select HARDIRQS_SW_RESEND
select HAS_IOPORT
@@ -250,6 +251,7 @@ config ARM64
select HAVE_KRETPROBES
select HAVE_GENERIC_VDSO
select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU
+ select HOTPLUG_SMT if HOTPLUG_CPU
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
select KASAN_VMALLOC if KASAN
@@ -323,7 +325,7 @@ config ARCH_MMAP_RND_BITS_MIN
default 18
# max bits determined by the following formula:
-# VA_BITS - PAGE_SHIFT - 3
+# VA_BITS - PTDESC_TABLE_SHIFT
config ARCH_MMAP_RND_BITS_MAX
default 19 if ARM64_VA_BITS=36
default 24 if ARM64_VA_BITS=39
@@ -1302,6 +1304,15 @@ config NVIDIA_CARMEL_CNP_ERRATUM
If unsure, say Y.
+config ROCKCHIP_ERRATUM_3568002
+ bool "Rockchip 3568002: GIC600 can not access physical addresses higher than 4GB"
+ default y
+ help
+ The Rockchip RK3566 and RK3568 GIC600 SoC integrations have AXI
+ addressing limited to the first 32bit of physical address space.
+
+ If unsure, say Y.
+
config ROCKCHIP_ERRATUM_3588001
bool "Rockchip 3588001: GIC600 can not support shareability attributes"
default y
diff --git a/arch/arm64/boot/dts/broadcom/bcm2712.dtsi b/arch/arm64/boot/dts/broadcom/bcm2712.dtsi
index 689c82b7f596..9e610a89a337 100644
--- a/arch/arm64/boot/dts/broadcom/bcm2712.dtsi
+++ b/arch/arm64/boot/dts/broadcom/bcm2712.dtsi
@@ -227,7 +227,7 @@
interrupts = <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk_uart>, <&clk_vpu>;
clock-names = "uartclk", "apb_pclk";
- arm,primecell-periphid = <0x00241011>;
+ arm,primecell-periphid = <0x00341011>;
status = "disabled";
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin-dahlia.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin-dahlia.dtsi
index ce20de259805..3d0b14968131 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm-verdin-dahlia.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin-dahlia.dtsi
@@ -16,10 +16,10 @@
"Headphone Jack", "HPOUTR",
"IN2L", "Line In Jack",
"IN2R", "Line In Jack",
- "Headphone Jack", "MICBIAS",
- "IN1L", "Headphone Jack";
+ "Microphone Jack", "MICBIAS",
+ "IN1L", "Microphone Jack";
simple-audio-card,widgets =
- "Microphone", "Headphone Jack",
+ "Microphone", "Microphone Jack",
"Headphone", "Headphone Jack",
"Line", "Line In Jack";
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi
index 336785a9fba8..3ddc5aaa7c5f 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi
@@ -1,7 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-or-later OR MIT
/*
- * Copyright 2021-2022 TQ-Systems GmbH
- * Author: Alexander Stein <alexander.stein@tq-group.com>
+ * Copyright 2021-2025 TQ-Systems GmbH <linux@ew.tq-group.com>,
+ * D-82229 Seefeld, Germany.
+ * Author: Alexander Stein
*/
#include "imx8mp.dtsi"
@@ -23,15 +24,6 @@
regulator-max-microvolt = <3300000>;
regulator-always-on;
};
-
- /* e-MMC IO, needed for HS modes */
- reg_vcc1v8: regulator-vcc1v8 {
- compatible = "regulator-fixed";
- regulator-name = "VCC1V8";
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <1800000>;
- regulator-always-on;
- };
};
&A53_0 {
@@ -197,7 +189,7 @@
no-sd;
no-sdio;
vmmc-supply = <&reg_vcc3v3>;
- vqmmc-supply = <&reg_vcc1v8>;
+ vqmmc-supply = <&buck5_reg>;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-verdin-dahlia.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-verdin-dahlia.dtsi
index da8902c5f7e5..1493319aa748 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-verdin-dahlia.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-verdin-dahlia.dtsi
@@ -28,10 +28,10 @@
"Headphone Jack", "HPOUTR",
"IN2L", "Line In Jack",
"IN2R", "Line In Jack",
- "Headphone Jack", "MICBIAS",
- "IN1L", "Headphone Jack";
+ "Microphone Jack", "MICBIAS",
+ "IN1L", "Microphone Jack";
simple-audio-card,widgets =
- "Microphone", "Headphone Jack",
+ "Microphone", "Microphone Jack",
"Headphone", "Headphone Jack",
"Line", "Line In Jack";
diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi
index e0ce804bb1a3..d0314cdf0b92 100644
--- a/arch/arm64/boot/dts/qcom/sdm845.dtsi
+++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi
@@ -5163,7 +5163,6 @@
<GIC_SPI 341 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 342 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 343 IRQ_TYPE_LEVEL_HIGH>;
- dma-coherent;
};
anoc_1_tbu: tbu@150c5000 {
diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts
index eb9470a00e54..1a59e8b1dc46 100644
--- a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts
+++ b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts
@@ -194,6 +194,13 @@
<3 RK_PB3 RK_FUNC_GPIO &pcfg_pull_none>;
};
};
+
+ uart {
+ uart5_rts_pin: uart5-rts-pin {
+ rockchip,pins =
+ <0 RK_PB5 RK_FUNC_GPIO &pcfg_pull_none>;
+ };
+ };
};
&pwm0 {
@@ -222,10 +229,15 @@
};
&uart0 {
+ pinctrl-names = "default";
+ pinctrl-0 = <&uart0_xfer>;
status = "okay";
};
&uart5 {
+ /* Add pinmux for rts-gpios (uart5_rts_pin) */
+ pinctrl-names = "default";
+ pinctrl-0 = <&uart5_xfer &uart5_rts_pin>;
rts-gpios = <&gpio0 RK_PB5 GPIO_ACTIVE_HIGH>;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dtsi
index b1c9bd0e63ef..8d94d9f91a5c 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dtsi
@@ -115,7 +115,7 @@
};
&u2phy1_host {
- status = "disabled";
+ phy-supply = <&vdd_5v>;
};
&uart0 {
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi
index 69a9d6170649..51c6aa26d828 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi
@@ -227,6 +227,16 @@
vin-supply = <&vcc12v_dcin>;
};
+ vcca_0v9: regulator-vcca-0v9 {
+ compatible = "regulator-fixed";
+ regulator-name = "vcca_0v9";
+ regulator-always-on;
+ regulator-boot-on;
+ regulator-min-microvolt = <900000>;
+ regulator-max-microvolt = <900000>;
+ vin-supply = <&vcc3v3_sys>;
+ };
+
vdd_log: regulator-vdd-log {
compatible = "pwm-regulator";
pwms = <&pwm2 0 25000 1>;
@@ -312,6 +322,8 @@
};
&hdmi {
+ avdd-0v9-supply = <&vcca_0v9>;
+ avdd-1v8-supply = <&vcc1v8_dvp>;
ddc-i2c-bus = <&i2c3>;
pinctrl-names = "default";
pinctrl-0 = <&hdmi_cec>;
@@ -661,6 +673,8 @@
num-lanes = <4>;
pinctrl-names = "default";
pinctrl-0 = <&pcie_perst>;
+ vpcie0v9-supply = <&vcca_0v9>;
+ vpcie1v8-supply = <&vcca_1v8>;
vpcie12v-supply = <&vcc12v_dcin>;
vpcie3v3-supply = <&vcc3v3_pcie>;
status = "okay";
diff --git a/arch/arm64/boot/dts/rockchip/rk3566-lubancat-1.dts b/arch/arm64/boot/dts/rockchip/rk3566-lubancat-1.dts
index 61dd71c259aa..ddf84c2a19cf 100644
--- a/arch/arm64/boot/dts/rockchip/rk3566-lubancat-1.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3566-lubancat-1.dts
@@ -512,7 +512,6 @@
&sdmmc0 {
max-frequency = <150000000>;
- supports-sd;
bus-width = <4>;
cap-mmc-highspeed;
cap-sd-highspeed;
diff --git a/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi b/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi
index e55390629114..4e730aecf84d 100644
--- a/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi
@@ -284,6 +284,18 @@
mbi-alias = <0x0 0xfd410000>;
mbi-ranges = <296 24>;
msi-controller;
+ ranges;
+ #address-cells = <2>;
+ #size-cells = <2>;
+ dma-noncoherent;
+
+ its: msi-controller@fd440000 {
+ compatible = "arm,gic-v3-its";
+ reg = <0x0 0xfd440000 0 0x20000>;
+ dma-noncoherent;
+ msi-controller;
+ #msi-cells = <1>;
+ };
};
usb_host0_ehci: usb@fd800000 {
@@ -957,7 +969,7 @@
num-ib-windows = <6>;
num-ob-windows = <2>;
max-link-speed = <2>;
- msi-map = <0x0 &gic 0x0 0x1000>;
+ msi-map = <0x0 &its 0x0 0x1000>;
num-lanes = <1>;
phys = <&combphy2 PHY_TYPE_PCIE>;
phy-names = "pcie-phy";
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts b/arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts
index 90f823b2c219..7f457ab78015 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts
@@ -503,7 +503,6 @@
non-removable;
pinctrl-names = "default";
pinctrl-0 = <&emmc_bus8 &emmc_cmd &emmc_clk &emmc_data_strobe>;
- supports-cqe;
vmmc-supply = <&vcc_3v3_s3>;
vqmmc-supply = <&vcc_1v8_s3>;
status = "okay";
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-rock-5-itx.dts b/arch/arm64/boot/dts/rockchip/rk3588-rock-5-itx.dts
index 6d68f70284e4..2a0590209462 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-rock-5-itx.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3588-rock-5-itx.dts
@@ -690,10 +690,9 @@
&sdhci {
bus-width = <8>;
- max-frequency = <200000000>;
+ max-frequency = <150000000>;
mmc-hs400-1_8v;
mmc-hs400-enhanced-strobe;
- mmc-hs200-1_8v;
no-sdio;
no-sd;
non-removable;
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi
index 81a6a05ce13b..e8fa449517c2 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi
@@ -386,7 +386,6 @@
non-removable;
pinctrl-names = "default";
pinctrl-0 = <&emmc_bus8 &emmc_cmd &emmc_clk &emmc_data_strobe>;
- supports-cqe;
vmmc-supply = <&vcc_3v3_s3>;
vqmmc-supply = <&vcc_1v8_s3>;
status = "okay";
diff --git a/arch/arm64/hyperv/hv_core.c b/arch/arm64/hyperv/hv_core.c
index 69004f619c57..e33a9e3c366a 100644
--- a/arch/arm64/hyperv/hv_core.c
+++ b/arch/arm64/hyperv/hv_core.c
@@ -54,6 +54,23 @@ u64 hv_do_fast_hypercall8(u16 code, u64 input)
EXPORT_SYMBOL_GPL(hv_do_fast_hypercall8);
/*
+ * hv_do_fast_hypercall16 -- Invoke the specified hypercall
+ * with arguments in registers instead of physical memory.
+ * Avoids the overhead of virt_to_phys for simple hypercalls.
+ */
+u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
+{
+ struct arm_smccc_res res;
+ u64 control;
+
+ control = (u64)code | HV_HYPERCALL_FAST_BIT;
+
+ arm_smccc_1_1_hvc(HV_FUNC_ID, control, input1, input2, &res);
+ return res.a0;
+}
+EXPORT_SYMBOL_GPL(hv_do_fast_hypercall16);
+
+/*
* Set a single VP register to a 64-bit value.
*/
void hv_set_vpreg(u32 msr, u64 value)
diff --git a/arch/arm64/hyperv/mshyperv.c b/arch/arm64/hyperv/mshyperv.c
index fc49949b7df6..4e27cc29c79e 100644
--- a/arch/arm64/hyperv/mshyperv.c
+++ b/arch/arm64/hyperv/mshyperv.c
@@ -26,6 +26,7 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
return 0;
}
+EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
static int __init hyperv_init(void)
{
@@ -61,6 +62,8 @@ static int __init hyperv_init(void)
ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
ms_hyperv.misc_features);
+ hv_identify_partition_type();
+
ret = hv_common_init();
if (ret)
return ret;
@@ -72,6 +75,9 @@ static int __init hyperv_init(void)
return ret;
}
+ if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID)
+ hv_get_partition_id();
+
ms_hyperv_late_init();
hyperv_initialized = true;
diff --git a/arch/arm64/include/asm/apple_m1_pmu.h b/arch/arm64/include/asm/apple_m1_pmu.h
index 99483b19b99f..02e05d05851f 100644
--- a/arch/arm64/include/asm/apple_m1_pmu.h
+++ b/arch/arm64/include/asm/apple_m1_pmu.h
@@ -37,6 +37,7 @@
#define PMCR0_PMI_ENABLE_8_9 GENMASK(45, 44)
#define SYS_IMP_APL_PMCR1_EL1 sys_reg(3, 1, 15, 1, 0)
+#define SYS_IMP_APL_PMCR1_EL12 sys_reg(3, 1, 15, 7, 2)
#define PMCR1_COUNT_A64_EL0_0_7 GENMASK(15, 8)
#define PMCR1_COUNT_A64_EL1_0_7 GENMASK(23, 16)
#define PMCR1_COUNT_A64_EL0_8_9 GENMASK(41, 40)
diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h
index b8a5861dc7b7..292f2687a12e 100644
--- a/arch/arm64/include/asm/asm-extable.h
+++ b/arch/arm64/include/asm/asm-extable.h
@@ -9,7 +9,8 @@
#define EX_TYPE_BPF 1
#define EX_TYPE_UACCESS_ERR_ZERO 2
#define EX_TYPE_KACCESS_ERR_ZERO 3
-#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD 4
+#define EX_TYPE_UACCESS_CPY 4
+#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD 5
/* Data fields for EX_TYPE_UACCESS_ERR_ZERO */
#define EX_DATA_REG_ERR_SHIFT 0
@@ -23,6 +24,9 @@
#define EX_DATA_REG_ADDR_SHIFT 5
#define EX_DATA_REG_ADDR GENMASK(9, 5)
+/* Data fields for EX_TYPE_UACCESS_CPY */
+#define EX_DATA_UACCESS_WRITE BIT(0)
+
#ifdef __ASSEMBLY__
#define __ASM_EXTABLE_RAW(insn, fixup, type, data) \
@@ -69,6 +73,10 @@
.endif
.endm
+ .macro _asm_extable_uaccess_cpy, insn, fixup, uaccess_is_write
+ __ASM_EXTABLE_RAW(\insn, \fixup, EX_TYPE_UACCESS_CPY, \uaccess_is_write)
+ .endm
+
#else /* __ASSEMBLY__ */
#include <linux/stringify.h>
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 5b6efe8abeeb..9148f5a31968 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -61,6 +61,10 @@ alternative_else_nop_endif
9999: x; \
_asm_extable_uaccess 9999b, l
+#define USER_CPY(l, uaccess_is_write, x...) \
+9999: x; \
+ _asm_extable_uaccess_cpy 9999b, l, uaccess_is_write
+
/*
* Generate the assembly for LDTR/STTR with exception table entries.
* This is complicated as there is no post-increment or pair versions of the
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 06a4670bdb0b..99cd6546e72e 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -35,7 +35,7 @@
#define ARCH_DMA_MINALIGN (128)
#define ARCH_KMALLOC_MINALIGN (8)
-#ifndef __ASSEMBLY__
+#if !defined(__ASSEMBLY__) && !defined(BUILD_VDSO)
#include <linux/bitops.h>
#include <linux/kasan-enabled.h>
@@ -118,6 +118,6 @@ static inline u32 __attribute_const__ read_cpuid_effective_cachetype(void)
return ctr;
}
-#endif /* __ASSEMBLY__ */
+#endif /* !defined(__ASSEMBLY__) && !defined(BUILD_VDSO) */
#endif
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 0b5ca6e0eb09..9d769291a306 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -71,6 +71,8 @@ cpucap_is_possible(const unsigned int cap)
* KVM MPAM support doesn't rely on the host kernel supporting MPAM.
*/
return true;
+ case ARM64_HAS_PMUV3:
+ return IS_ENABLED(CONFIG_HW_PERF_EVENTS);
}
return true;
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index e0e4478f5fb5..c4326f1cb917 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -525,29 +525,6 @@ cpuid_feature_extract_unsigned_field(u64 features, int field)
return cpuid_feature_extract_unsigned_field_width(features, field, 4);
}
-/*
- * Fields that identify the version of the Performance Monitors Extension do
- * not follow the standard ID scheme. See ARM DDI 0487E.a page D13-2825,
- * "Alternative ID scheme used for the Performance Monitors Extension version".
- */
-static inline u64 __attribute_const__
-cpuid_feature_cap_perfmon_field(u64 features, int field, u64 cap)
-{
- u64 val = cpuid_feature_extract_unsigned_field(features, field);
- u64 mask = GENMASK_ULL(field + 3, field);
-
- /* Treat IMPLEMENTATION DEFINED functionality as unimplemented */
- if (val == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
- val = 0;
-
- if (val > cap) {
- features &= ~mask;
- features |= (cap << field) & mask;
- }
-
- return features;
-}
-
static inline u64 arm64_ftr_mask(const struct arm64_ftr_bits *ftrp)
{
return (u64)GENMASK(ftrp->shift + ftrp->width - 1, ftrp->shift);
@@ -866,6 +843,11 @@ static __always_inline bool system_supports_mpam_hcr(void)
return alternative_has_cap_unlikely(ARM64_MPAM_HCR);
}
+static inline bool system_supports_pmuv3(void)
+{
+ return cpus_have_final_cap(ARM64_HAS_PMUV3);
+}
+
int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
bool try_emulate_mrs(struct pt_regs *regs, u32 isn);
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 6f3f4142e214..c607e0bf5e0b 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -75,6 +75,7 @@
#define ARM_CPU_PART_CORTEX_A76 0xD0B
#define ARM_CPU_PART_NEOVERSE_N1 0xD0C
#define ARM_CPU_PART_CORTEX_A77 0xD0D
+#define ARM_CPU_PART_CORTEX_A76AE 0xD0E
#define ARM_CPU_PART_NEOVERSE_V1 0xD40
#define ARM_CPU_PART_CORTEX_A78 0xD41
#define ARM_CPU_PART_CORTEX_A78AE 0xD42
@@ -119,6 +120,7 @@
#define QCOM_CPU_PART_KRYO 0x200
#define QCOM_CPU_PART_KRYO_2XX_GOLD 0x800
#define QCOM_CPU_PART_KRYO_2XX_SILVER 0x801
+#define QCOM_CPU_PART_KRYO_3XX_GOLD 0x802
#define QCOM_CPU_PART_KRYO_3XX_SILVER 0x803
#define QCOM_CPU_PART_KRYO_4XX_GOLD 0x804
#define QCOM_CPU_PART_KRYO_4XX_SILVER 0x805
@@ -159,6 +161,7 @@
#define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76)
#define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1)
#define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77)
+#define MIDR_CORTEX_A76AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76AE)
#define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1)
#define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78)
#define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE)
@@ -196,10 +199,21 @@
#define MIDR_QCOM_KRYO MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO)
#define MIDR_QCOM_KRYO_2XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_GOLD)
#define MIDR_QCOM_KRYO_2XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_SILVER)
+#define MIDR_QCOM_KRYO_3XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_GOLD)
#define MIDR_QCOM_KRYO_3XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_SILVER)
#define MIDR_QCOM_KRYO_4XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_GOLD)
#define MIDR_QCOM_KRYO_4XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_SILVER)
#define MIDR_QCOM_ORYON_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_ORYON_X1)
+
+/*
+ * NOTES:
+ * - Qualcomm Kryo 5XX Prime / Gold ID themselves as MIDR_CORTEX_A77
+ * - Qualcomm Kryo 5XX Silver IDs itself as MIDR_QCOM_KRYO_4XX_SILVER
+ * - Qualcomm Kryo 6XX Prime IDs itself as MIDR_CORTEX_X1
+ * - Qualcomm Kryo 6XX Gold IDs itself as ARM_CPU_PART_CORTEX_A78
+ * - Qualcomm Kryo 6XX Silver IDs itself as MIDR_CORTEX_A55
+ */
+
#define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER)
#define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL)
#define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX)
@@ -232,6 +246,16 @@
#define read_cpuid(reg) read_sysreg_s(SYS_ ## reg)
/*
+ * The CPU ID never changes at run time, so we might as well tell the
+ * compiler that it's constant. Use this function to read the CPU ID
+ * rather than directly reading processor_id or read_cpuid() directly.
+ */
+static inline u32 __attribute_const__ read_cpuid_id(void)
+{
+ return read_cpuid(MIDR_EL1);
+}
+
+/*
* Represent a range of MIDR values for a given CPU model and a
* range of variant/revision values.
*
@@ -266,30 +290,14 @@ static inline bool midr_is_cpu_model_range(u32 midr, u32 model, u32 rv_min,
return _model == model && rv >= rv_min && rv <= rv_max;
}
-static inline bool is_midr_in_range(u32 midr, struct midr_range const *range)
-{
- return midr_is_cpu_model_range(midr, range->model,
- range->rv_min, range->rv_max);
-}
-
-static inline bool
-is_midr_in_range_list(u32 midr, struct midr_range const *ranges)
-{
- while (ranges->model)
- if (is_midr_in_range(midr, ranges++))
- return true;
- return false;
-}
+struct target_impl_cpu {
+ u64 midr;
+ u64 revidr;
+ u64 aidr;
+};
-/*
- * The CPU ID never changes at run time, so we might as well tell the
- * compiler that it's constant. Use this function to read the CPU ID
- * rather than directly reading processor_id or read_cpuid() directly.
- */
-static inline u32 __attribute_const__ read_cpuid_id(void)
-{
- return read_cpuid(MIDR_EL1);
-}
+bool cpu_errata_set_target_impl(u64 num, void *impl_cpus);
+bool is_midr_in_range_list(struct midr_range const *ranges);
static inline u64 __attribute_const__ read_cpuid_mpidr(void)
{
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 555c613fd232..ebceaae3c749 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -259,6 +259,30 @@
.Lskip_fgt_\@:
.endm
+.macro __init_el2_fgt2
+ mrs x1, id_aa64mmfr0_el1
+ ubfx x1, x1, #ID_AA64MMFR0_EL1_FGT_SHIFT, #4
+ cmp x1, #ID_AA64MMFR0_EL1_FGT_FGT2
+ b.lt .Lskip_fgt2_\@
+
+ mov x0, xzr
+ mrs x1, id_aa64dfr0_el1
+ ubfx x1, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4
+ cmp x1, #ID_AA64DFR0_EL1_PMUVer_V3P9
+ b.lt .Lskip_pmuv3p9_\@
+
+ orr x0, x0, #HDFGRTR2_EL2_nPMICNTR_EL0
+ orr x0, x0, #HDFGRTR2_EL2_nPMICFILTR_EL0
+ orr x0, x0, #HDFGRTR2_EL2_nPMUACR_EL1
+.Lskip_pmuv3p9_\@:
+ msr_s SYS_HDFGRTR2_EL2, x0
+ msr_s SYS_HDFGWTR2_EL2, x0
+ msr_s SYS_HFGRTR2_EL2, xzr
+ msr_s SYS_HFGWTR2_EL2, xzr
+ msr_s SYS_HFGITR2_EL2, xzr
+.Lskip_fgt2_\@:
+.endm
+
.macro __init_el2_gcs
mrs_s x1, SYS_ID_AA64PFR1_EL1
ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
@@ -304,6 +328,7 @@
__init_el2_nvhe_idregs
__init_el2_cptr
__init_el2_fgt
+ __init_el2_fgt2
__init_el2_gcs
.endm
diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h
index 72b0e71cc3de..9dc39612bdf5 100644
--- a/arch/arm64/include/asm/extable.h
+++ b/arch/arm64/include/asm/extable.h
@@ -33,6 +33,8 @@ do { \
(b)->data = (tmp).data; \
} while (0)
+bool insn_may_access_user(unsigned long addr, unsigned long esr);
+
#ifdef CONFIG_BPF_JIT
bool ex_handler_bpf(const struct exception_table_entry *ex,
struct pt_regs *regs);
@@ -45,5 +47,5 @@ bool ex_handler_bpf(const struct exception_table_entry *ex,
}
#endif /* !CONFIG_BPF_JIT */
-bool fixup_exception(struct pt_regs *regs);
+bool fixup_exception(struct pt_regs *regs, unsigned long esr);
#endif
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index f2a84efc3618..564bc09b3e06 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -80,7 +80,6 @@ extern void fpsimd_signal_preserve_current_state(void);
extern void fpsimd_preserve_current_state(void);
extern void fpsimd_restore_current_state(void);
extern void fpsimd_update_current_state(struct user_fpsimd_state const *state);
-extern void fpsimd_kvm_prepare(void);
struct cpu_fp_state {
struct user_fpsimd_state *st;
diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h
index 409e239834d1..a12fd897c877 100644
--- a/arch/arm64/include/asm/hypervisor.h
+++ b/arch/arm64/include/asm/hypervisor.h
@@ -6,6 +6,7 @@
void kvm_init_hyp_services(void);
bool kvm_arm_hyp_service_available(u32 func_id);
+void kvm_arm_target_impl_cpu_init(void);
#ifdef CONFIG_ARM_PKVM_GUEST
void pkvm_init_hyp_services(void);
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index fd5a08450b12..9e93733523f6 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -45,11 +45,11 @@
#define SPAN_NR_ENTRIES(vstart, vend, shift) \
((((vend) - 1) >> (shift)) - ((vstart) >> (shift)) + 1)
-#define EARLY_ENTRIES(vstart, vend, shift, add) \
- (SPAN_NR_ENTRIES(vstart, vend, shift) + (add))
+#define EARLY_ENTRIES(lvl, vstart, vend) \
+ SPAN_NR_ENTRIES(vstart, vend, SWAPPER_BLOCK_SHIFT + lvl * PTDESC_TABLE_SHIFT)
-#define EARLY_LEVEL(lvl, lvls, vstart, vend, add) \
- (lvls > lvl ? EARLY_ENTRIES(vstart, vend, SWAPPER_BLOCK_SHIFT + lvl * (PAGE_SHIFT - 3), add) : 0)
+#define EARLY_LEVEL(lvl, lvls, vstart, vend, add) \
+ ((lvls) > (lvl) ? EARLY_ENTRIES(lvl, vstart, vend) + (add) : 0)
#define EARLY_PAGES(lvls, vstart, vend, add) (1 /* PGDIR page */ \
+ EARLY_LEVEL(3, (lvls), (vstart), (vend), add) /* each entry needs a next level page table */ \
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index c2417a424b98..974d72b5905b 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -92,12 +92,12 @@
* SWIO: Turn set/way invalidates into set/way clean+invalidate
* PTW: Take a stage2 fault if a stage1 walk steps in device memory
* TID3: Trap EL1 reads of group 3 ID registers
- * TID2: Trap CTR_EL0, CCSIDR2_EL1, CLIDR_EL1, and CSSELR_EL1
+ * TID1: Trap REVIDR_EL1, AIDR_EL1, and SMIDR_EL1
*/
#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
HCR_BSU_IS | HCR_FB | HCR_TACR | \
HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
- HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3)
+ HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID1)
#define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA)
#define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC)
#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 78ec1ef2cfe8..d7cf66573aca 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -275,6 +275,19 @@ static __always_inline u64 kvm_vcpu_get_esr(const struct kvm_vcpu *vcpu)
return vcpu->arch.fault.esr_el2;
}
+static inline bool guest_hyp_wfx_traps_enabled(const struct kvm_vcpu *vcpu)
+{
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ bool is_wfe = !!(esr & ESR_ELx_WFx_ISS_WFE);
+ u64 hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2);
+
+ if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu))
+ return false;
+
+ return ((is_wfe && (hcr_el2 & HCR_TWE)) ||
+ (!is_wfe && (hcr_el2 & HCR_TWI)));
+}
+
static __always_inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu)
{
u64 esr = kvm_vcpu_get_esr(vcpu);
@@ -649,4 +662,28 @@ static inline bool guest_hyp_sve_traps_enabled(const struct kvm_vcpu *vcpu)
{
return __guest_hyp_cptr_xen_trap_enabled(vcpu, ZEN);
}
+
+static inline void vcpu_set_hcrx(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ if (cpus_have_final_cap(ARM64_HAS_HCX)) {
+ /*
+ * In general, all HCRX_EL2 bits are gated by a feature.
+ * The only reason we can set SMPME without checking any
+ * feature is that its effects are not directly observable
+ * from the guest.
+ */
+ vcpu->arch.hcrx_el2 = HCRX_EL2_SMPME;
+
+ if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP))
+ vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2);
+
+ if (kvm_has_tcr2(kvm))
+ vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En;
+
+ if (kvm_has_fpmr(kvm))
+ vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM;
+ }
+}
#endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index d919557af5e5..e98cfe7855a6 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -44,14 +44,15 @@
#define KVM_REQ_SLEEP \
KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1)
-#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2)
-#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3)
-#define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4)
-#define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5)
-#define KVM_REQ_SUSPEND KVM_ARCH_REQ(6)
-#define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7)
-#define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8)
+#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1)
+#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2)
+#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3)
+#define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4)
+#define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5)
+#define KVM_REQ_SUSPEND KVM_ARCH_REQ(6)
+#define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7)
+#define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8)
+#define KVM_REQ_GUEST_HYP_IRQ_PENDING KVM_ARCH_REQ(9)
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
KVM_DIRTY_LOG_INITIALLY_SET)
@@ -86,6 +87,9 @@ struct kvm_hyp_memcache {
phys_addr_t head;
unsigned long nr_pages;
struct pkvm_mapping *mapping; /* only used from EL1 */
+
+#define HYP_MEMCACHE_ACCOUNT_STAGE2 BIT(1)
+ unsigned long flags;
};
static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc,
@@ -237,7 +241,8 @@ struct kvm_arch_memory_slot {
struct kvm_smccc_features {
unsigned long std_bmap;
unsigned long std_hyp_bmap;
- unsigned long vendor_hyp_bmap;
+ unsigned long vendor_hyp_bmap; /* Function numbers 0-63 */
+ unsigned long vendor_hyp_bmap_2; /* Function numbers 64-127 */
};
typedef unsigned int pkvm_handle_t;
@@ -245,6 +250,7 @@ typedef unsigned int pkvm_handle_t;
struct kvm_protected_vm {
pkvm_handle_t handle;
struct kvm_hyp_memcache teardown_mc;
+ struct kvm_hyp_memcache stage2_teardown_mc;
bool enabled;
};
@@ -334,6 +340,8 @@ struct kvm_arch {
#define KVM_ARCH_FLAG_FGU_INITIALIZED 8
/* SVE exposed to guest */
#define KVM_ARCH_FLAG_GUEST_HAS_SVE 9
+ /* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */
+#define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS 10
unsigned long flags;
/* VM-wide vCPU feature set */
@@ -373,6 +381,9 @@ struct kvm_arch {
#define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1)
u64 id_regs[KVM_ARM_ID_REG_NUM];
+ u64 midr_el1;
+ u64 revidr_el1;
+ u64 aidr_el1;
u64 ctr_el0;
/* Masks for VNCR-backed and general EL2 sysregs */
@@ -557,7 +568,33 @@ enum vcpu_sysreg {
VNCR(CNTP_CVAL_EL0),
VNCR(CNTP_CTL_EL0),
+ VNCR(ICH_LR0_EL2),
+ VNCR(ICH_LR1_EL2),
+ VNCR(ICH_LR2_EL2),
+ VNCR(ICH_LR3_EL2),
+ VNCR(ICH_LR4_EL2),
+ VNCR(ICH_LR5_EL2),
+ VNCR(ICH_LR6_EL2),
+ VNCR(ICH_LR7_EL2),
+ VNCR(ICH_LR8_EL2),
+ VNCR(ICH_LR9_EL2),
+ VNCR(ICH_LR10_EL2),
+ VNCR(ICH_LR11_EL2),
+ VNCR(ICH_LR12_EL2),
+ VNCR(ICH_LR13_EL2),
+ VNCR(ICH_LR14_EL2),
+ VNCR(ICH_LR15_EL2),
+
+ VNCR(ICH_AP0R0_EL2),
+ VNCR(ICH_AP0R1_EL2),
+ VNCR(ICH_AP0R2_EL2),
+ VNCR(ICH_AP0R3_EL2),
+ VNCR(ICH_AP1R0_EL2),
+ VNCR(ICH_AP1R1_EL2),
+ VNCR(ICH_AP1R2_EL2),
+ VNCR(ICH_AP1R3_EL2),
VNCR(ICH_HCR_EL2),
+ VNCR(ICH_VMCR_EL2),
NR_SYS_REGS /* Nothing after this line! */
};
@@ -869,6 +906,8 @@ struct kvm_vcpu_arch {
#define VCPU_INITIALIZED __vcpu_single_flag(cflags, BIT(0))
/* SVE config completed */
#define VCPU_SVE_FINALIZED __vcpu_single_flag(cflags, BIT(1))
+/* pKVM VCPU setup completed */
+#define VCPU_PKVM_FINALIZED __vcpu_single_flag(cflags, BIT(2))
/* Exception pending */
#define PENDING_EXCEPTION __vcpu_single_flag(iflags, BIT(0))
@@ -919,6 +958,8 @@ struct kvm_vcpu_arch {
#define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(5))
/* WFI instruction trapped */
#define IN_WFI __vcpu_single_flag(sflags, BIT(6))
+/* KVM is currently emulating a nested ERET */
+#define IN_NESTED_ERET __vcpu_single_flag(sflags, BIT(7))
/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
@@ -1334,8 +1375,6 @@ static inline bool kvm_system_needs_idmapped_vectors(void)
return cpus_have_final_cap(ARM64_SPECTRE_V3A);
}
-static inline void kvm_arch_sync_events(struct kvm *kvm) {}
-
void kvm_init_host_debug_data(void);
void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu);
@@ -1459,6 +1498,12 @@ static inline u64 *__vm_id_reg(struct kvm_arch *ka, u32 reg)
return &ka->id_regs[IDREG_IDX(reg)];
case SYS_CTR_EL0:
return &ka->ctr_el0;
+ case SYS_MIDR_EL1:
+ return &ka->midr_el1;
+ case SYS_REVIDR_EL1:
+ return &ka->revidr_el1;
+ case SYS_AIDR_EL1:
+ return &ka->aidr_el1;
default:
WARN_ON_ONCE(1);
return NULL;
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index c838309e4ec4..e6be1f5d0967 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -76,6 +76,8 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu);
+u64 __gic_v3_get_lr(unsigned int lr);
+
void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if);
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 56c4bcd35e2e..692f403c1896 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -188,6 +188,7 @@ static inline bool kvm_supported_tlbi_s1e2_op(struct kvm_vcpu *vpcu, u32 instr)
}
int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu);
+u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val);
#ifdef CONFIG_ARM64_PTR_AUTH
bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr);
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
index eb65f12e81d9..abd693ce5b93 100644
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -19,6 +19,7 @@
int pkvm_init_host_vm(struct kvm *kvm);
int pkvm_create_hyp_vm(struct kvm *kvm);
void pkvm_destroy_hyp_vm(struct kvm *kvm);
+int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu);
/*
* This functions as an allow-list of protected VM capabilities.
diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h
index f8f78f622dd2..a2a1eeb36d4b 100644
--- a/arch/arm64/include/asm/mem_encrypt.h
+++ b/arch/arm64/include/asm/mem_encrypt.h
@@ -21,4 +21,15 @@ static inline bool force_dma_unencrypted(struct device *dev)
return is_realm_world();
}
+/*
+ * For Arm CCA guests, canonical addresses are "encrypted", so no changes
+ * required for dma_addr_encrypted().
+ * The unencrypted DMA buffers must be accessed via the unprotected IPA,
+ * "top IPA bit" set.
+ */
+#define dma_addr_unencrypted(x) ((x) | PROT_NS_SHARED)
+
+/* Clear the "top" IPA bit while converting back */
+#define dma_addr_canonical(x) ((x) & ~PROT_NS_SHARED)
+
#endif /* __ASM_MEM_ENCRYPT_H */
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 662471cfc536..30a29e88994b 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -101,8 +101,7 @@ static inline bool kaslr_requires_kpti(void)
if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) {
extern const struct midr_range cavium_erratum_27456_cpus[];
- if (is_midr_in_range_list(read_cpuid_id(),
- cavium_erratum_27456_cpus))
+ if (is_midr_in_range_list(cavium_erratum_27456_cpus))
return false;
}
diff --git a/arch/arm64/include/asm/mshyperv.h b/arch/arm64/include/asm/mshyperv.h
index 2e2f83bafcfb..b721d3134ab6 100644
--- a/arch/arm64/include/asm/mshyperv.h
+++ b/arch/arm64/include/asm/mshyperv.h
@@ -40,6 +40,19 @@ static inline u64 hv_get_msr(unsigned int reg)
return hv_get_vpreg(reg);
}
+/*
+ * Nested is not supported on arm64
+ */
+static inline void hv_set_non_nested_msr(unsigned int reg, u64 value)
+{
+ hv_set_msr(reg, value);
+}
+
+static inline u64 hv_get_non_nested_msr(unsigned int reg)
+{
+ return hv_get_msr(reg);
+}
+
/* SMCCC hypercall parameters */
#define HV_SMCCC_FUNC_NUMBER 1
#define HV_FUNC_ID ARM_SMCCC_CALL_VAL( \
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index a9136cc551cc..f3b77deedfa2 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -7,40 +7,46 @@
#include <asm/memory.h>
+#define PTDESC_ORDER 3
+
+/* Number of VA bits resolved by a single translation table level */
+#define PTDESC_TABLE_SHIFT (PAGE_SHIFT - PTDESC_ORDER)
+
/*
* Number of page-table levels required to address 'va_bits' wide
* address, without section mapping. We resolve the top (va_bits - PAGE_SHIFT)
- * bits with (PAGE_SHIFT - 3) bits at each page table level. Hence:
+ * bits with PTDESC_TABLE_SHIFT bits at each page table level. Hence:
*
- * levels = DIV_ROUND_UP((va_bits - PAGE_SHIFT), (PAGE_SHIFT - 3))
+ * levels = DIV_ROUND_UP((va_bits - PAGE_SHIFT), PTDESC_TABLE_SHIFT)
*
* where DIV_ROUND_UP(n, d) => (((n) + (d) - 1) / (d))
*
* We cannot include linux/kernel.h which defines DIV_ROUND_UP here
* due to build issues. So we open code DIV_ROUND_UP here:
*
- * ((((va_bits) - PAGE_SHIFT) + (PAGE_SHIFT - 3) - 1) / (PAGE_SHIFT - 3))
+ * ((((va_bits) - PAGE_SHIFT) + PTDESC_TABLE_SHIFT - 1) / PTDESC_TABLE_SHIFT)
*
* which gets simplified as :
*/
-#define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (PAGE_SHIFT - 3))
+#define ARM64_HW_PGTABLE_LEVELS(va_bits) \
+ (((va_bits) - PTDESC_ORDER - 1) / PTDESC_TABLE_SHIFT)
/*
* Size mapped by an entry at level n ( -1 <= n <= 3)
- * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits
+ * We map PTDESC_TABLE_SHIFT at all translation levels and PAGE_SHIFT bits
* in the final page. The maximum number of translation levels supported by
* the architecture is 5. Hence, starting at level n, we have further
* ((4 - n) - 1) levels of translation excluding the offset within the page.
* So, the total number of bits mapped by an entry at level n is :
*
- * ((4 - n) - 1) * (PAGE_SHIFT - 3) + PAGE_SHIFT
+ * ((4 - n) - 1) * PTDESC_TABLE_SHIFT + PAGE_SHIFT
*
* Rearranging it a bit we get :
- * (4 - n) * (PAGE_SHIFT - 3) + 3
+ * (4 - n) * PTDESC_TABLE_SHIFT + PTDESC_ORDER
*/
-#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n) ((PAGE_SHIFT - 3) * (4 - (n)) + 3)
+#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n) (PTDESC_TABLE_SHIFT * (4 - (n)) + PTDESC_ORDER)
-#define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3))
+#define PTRS_PER_PTE (1 << PTDESC_TABLE_SHIFT)
/*
* PMD_SHIFT determines the size a level 2 page table entry can map.
@@ -49,7 +55,7 @@
#define PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE-1))
-#define PTRS_PER_PMD (1 << (PAGE_SHIFT - 3))
+#define PTRS_PER_PMD (1 << PTDESC_TABLE_SHIFT)
#endif
/*
@@ -59,14 +65,14 @@
#define PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
#define PUD_MASK (~(PUD_SIZE-1))
-#define PTRS_PER_PUD (1 << (PAGE_SHIFT - 3))
+#define PTRS_PER_PUD (1 << PTDESC_TABLE_SHIFT)
#endif
#if CONFIG_PGTABLE_LEVELS > 4
#define P4D_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(0)
#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
#define P4D_MASK (~(P4D_SIZE-1))
-#define PTRS_PER_P4D (1 << (PAGE_SHIFT - 3))
+#define PTRS_PER_P4D (1 << PTDESC_TABLE_SHIFT)
#endif
/*
@@ -97,7 +103,6 @@
* Level -1 descriptor (PGD).
*/
#define PGD_TYPE_TABLE (_AT(pgdval_t, 3) << 0)
-#define PGD_TABLE_BIT (_AT(pgdval_t, 1) << 1)
#define PGD_TYPE_MASK (_AT(pgdval_t, 3) << 0)
#define PGD_TABLE_AF (_AT(pgdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */
#define PGD_TABLE_PXN (_AT(pgdval_t, 1) << 59)
@@ -107,7 +112,6 @@
* Level 0 descriptor (P4D).
*/
#define P4D_TYPE_TABLE (_AT(p4dval_t, 3) << 0)
-#define P4D_TABLE_BIT (_AT(p4dval_t, 1) << 1)
#define P4D_TYPE_MASK (_AT(p4dval_t, 3) << 0)
#define P4D_TYPE_SECT (_AT(p4dval_t, 1) << 0)
#define P4D_SECT_RDONLY (_AT(p4dval_t, 1) << 7) /* AP[2] */
@@ -119,7 +123,6 @@
* Level 1 descriptor (PUD).
*/
#define PUD_TYPE_TABLE (_AT(pudval_t, 3) << 0)
-#define PUD_TABLE_BIT (_AT(pudval_t, 1) << 1)
#define PUD_TYPE_MASK (_AT(pudval_t, 3) << 0)
#define PUD_TYPE_SECT (_AT(pudval_t, 1) << 0)
#define PUD_SECT_RDONLY (_AT(pudval_t, 1) << 7) /* AP[2] */
@@ -133,7 +136,6 @@
#define PMD_TYPE_MASK (_AT(pmdval_t, 3) << 0)
#define PMD_TYPE_TABLE (_AT(pmdval_t, 3) << 0)
#define PMD_TYPE_SECT (_AT(pmdval_t, 1) << 0)
-#define PMD_TABLE_BIT (_AT(pmdval_t, 1) << 1)
#define PMD_TABLE_AF (_AT(pmdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */
/*
@@ -162,7 +164,6 @@
#define PTE_VALID (_AT(pteval_t, 1) << 0)
#define PTE_TYPE_MASK (_AT(pteval_t, 3) << 0)
#define PTE_TYPE_PAGE (_AT(pteval_t, 3) << 0)
-#define PTE_TABLE_BIT (_AT(pteval_t, 1) << 1)
#define PTE_USER (_AT(pteval_t, 1) << 6) /* AP[1] */
#define PTE_RDONLY (_AT(pteval_t, 1) << 7) /* AP[2] */
#define PTE_SHARED (_AT(pteval_t, 3) << 8) /* SH[1:0], inner shareable */
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index a95f1f77bb39..7830d031742e 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -169,25 +169,25 @@ static inline bool __pure lpa2_is_enabled(void)
#define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO)
#define PIE_E0 ( \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_GCS) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW_O))
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_GCS) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW_O))
#define PIE_E1 ( \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \
- PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL), PIE_RW))
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL), PIE_RW))
#endif /* __ASM_PGTABLE_PROT_H */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 0b2a2ad1b9e8..84f05f781a70 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -68,10 +68,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
#define pte_ERROR(e) \
pr_err("%s:%d: bad pte %016llx.\n", __FILE__, __LINE__, pte_val(e))
-/*
- * Macros to convert between a physical address and its placement in a
- * page table entry, taking care of 52-bit addresses.
- */
#ifdef CONFIG_ARM64_PA_BITS_52
static inline phys_addr_t __pte_to_phys(pte_t pte)
{
@@ -84,8 +80,15 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PHYS_TO_PTE_ADDR_MASK;
}
#else
-#define __pte_to_phys(pte) (pte_val(pte) & PTE_ADDR_LOW)
-#define __phys_to_pte_val(phys) (phys)
+static inline phys_addr_t __pte_to_phys(pte_t pte)
+{
+ return pte_val(pte) & PTE_ADDR_LOW;
+}
+
+static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
+{
+ return phys;
+}
#endif
#define pte_pfn(pte) (__pte_to_phys(pte) >> PAGE_SHIFT)
@@ -483,12 +486,12 @@ static inline pmd_t pte_pmd(pte_t pte)
static inline pgprot_t mk_pud_sect_prot(pgprot_t prot)
{
- return __pgprot((pgprot_val(prot) & ~PUD_TABLE_BIT) | PUD_TYPE_SECT);
+ return __pgprot((pgprot_val(prot) & ~PUD_TYPE_MASK) | PUD_TYPE_SECT);
}
static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot)
{
- return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT);
+ return __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT);
}
static inline pte_t pte_swp_mkexclusive(pte_t pte)
@@ -548,18 +551,6 @@ static inline int pmd_protnone(pmd_t pmd)
#endif
#define pmd_present(pmd) pte_present(pmd_pte(pmd))
-
-/*
- * THP definitions.
- */
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int pmd_trans_huge(pmd_t pmd)
-{
- return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
#define pmd_valid(pmd) pte_valid(pmd_pte(pmd))
@@ -585,7 +576,18 @@ static inline int pmd_trans_huge(pmd_t pmd)
#define pmd_write(pmd) pte_write(pmd_pte(pmd))
-#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+ /*
+ * It's possible that the pmd is present-invalid on entry
+ * and in that case it needs to remain present-invalid on
+ * exit. So ensure the VALID bit does not get modified.
+ */
+ pmdval_t mask = PMD_TYPE_MASK & ~PTE_VALID;
+ pmdval_t val = PMD_TYPE_SECT & ~PTE_VALID;
+
+ return __pmd((pmd_val(pmd) & ~mask) | val);
+}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_devmap(pmd) pte_devmap(pmd_pte(pmd))
@@ -613,7 +615,18 @@ static inline pmd_t pmd_mkspecial(pmd_t pmd)
#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud)))
#define pud_write(pud) pte_write(pud_pte(pud))
-#define pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT))
+static inline pud_t pud_mkhuge(pud_t pud)
+{
+ /*
+ * It's possible that the pud is present-invalid on entry
+ * and in that case it needs to remain present-invalid on
+ * exit. So ensure the VALID bit does not get modified.
+ */
+ pudval_t mask = PUD_TYPE_MASK & ~PTE_VALID;
+ pudval_t val = PUD_TYPE_SECT & ~PTE_VALID;
+
+ return __pud((pud_val(pud) & ~mask) | val);
+}
#define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud))
#define __phys_to_pud_val(phys) __phys_to_pte_val(phys)
@@ -724,6 +737,18 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
#define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
#define pte_leaf_size(pte) (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ /*
+ * If pmd is present-invalid, pmd_table() won't detect it
+ * as a table, so force the valid bit for the comparison.
+ */
+ return pmd_val(pmd) && pmd_present(pmd) &&
+ !pmd_table(__pmd(pmd_val(pmd) | PTE_VALID));
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
static inline bool pud_sect(pud_t pud) { return false; }
static inline bool pud_table(pud_t pud) { return true; }
@@ -805,7 +830,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e))
#define pud_none(pud) (!pud_val(pud))
-#define pud_bad(pud) (!pud_table(pud))
+#define pud_bad(pud) ((pud_val(pud) & PUD_TYPE_MASK) != \
+ PUD_TYPE_TABLE)
#define pud_present(pud) pte_present(pud_pte(pud))
#ifndef __PAGETABLE_PMD_FOLDED
#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud))
@@ -896,7 +922,9 @@ static inline bool mm_pud_folded(const struct mm_struct *mm)
pr_err("%s:%d: bad pud %016llx.\n", __FILE__, __LINE__, pud_val(e))
#define p4d_none(p4d) (pgtable_l4_enabled() && !p4d_val(p4d))
-#define p4d_bad(p4d) (pgtable_l4_enabled() && !(p4d_val(p4d) & P4D_TABLE_BIT))
+#define p4d_bad(p4d) (pgtable_l4_enabled() && \
+ ((p4d_val(p4d) & P4D_TYPE_MASK) != \
+ P4D_TYPE_TABLE))
#define p4d_present(p4d) (!p4d_none(p4d))
static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
@@ -1023,7 +1051,9 @@ static inline bool mm_p4d_folded(const struct mm_struct *mm)
pr_err("%s:%d: bad p4d %016llx.\n", __FILE__, __LINE__, p4d_val(e))
#define pgd_none(pgd) (pgtable_l5_enabled() && !pgd_val(pgd))
-#define pgd_bad(pgd) (pgtable_l5_enabled() && !(pgd_val(pgd) & PGD_TABLE_BIT))
+#define pgd_bad(pgd) (pgtable_l5_enabled() && \
+ ((pgd_val(pgd) & PGD_TYPE_MASK) != \
+ PGD_TYPE_TABLE))
#define pgd_present(pgd) (!pgd_none(pgd))
static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h
index e06e9f473675..d913d5b529e4 100644
--- a/arch/arm64/include/asm/por.h
+++ b/arch/arm64/include/asm/por.h
@@ -6,26 +6,27 @@
#ifndef _ASM_ARM64_POR_H
#define _ASM_ARM64_POR_H
-#define POR_BITS_PER_PKEY 4
-#define POR_ELx_IDX(por_elx, idx) (((por_elx) >> ((idx) * POR_BITS_PER_PKEY)) & 0xf)
+#include <asm/sysreg.h>
+
+#define POR_EL0_INIT POR_ELx_PERM_PREP(0, POE_RWX)
static inline bool por_elx_allows_read(u64 por, u8 pkey)
{
- u8 perm = POR_ELx_IDX(por, pkey);
+ u8 perm = POR_ELx_PERM_GET(pkey, por);
return perm & POE_R;
}
static inline bool por_elx_allows_write(u64 por, u8 pkey)
{
- u8 perm = POR_ELx_IDX(por, pkey);
+ u8 perm = POR_ELx_PERM_GET(pkey, por);
return perm & POE_W;
}
static inline bool por_elx_allows_exec(u64 por, u8 pkey)
{
- u8 perm = POR_ELx_IDX(por, pkey);
+ u8 perm = POR_ELx_PERM_GET(pkey, por);
return perm & POE_X;
}
diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h
index 0c4d9045c31f..f1524cdeacf1 100644
--- a/arch/arm64/include/asm/spectre.h
+++ b/arch/arm64/include/asm/spectre.h
@@ -97,7 +97,6 @@ enum mitigation_state arm64_get_meltdown_state(void);
enum mitigation_state arm64_get_spectre_bhb_state(void);
bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope);
-u8 spectre_bhb_loop_affected(int scope);
void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
bool try_emulate_el1_ssbs(struct pt_regs *regs, u32 instr);
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 05ea5223d2d5..2639d3633073 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -562,9 +562,6 @@
#define SYS_ICH_VSEIR_EL2 sys_reg(3, 4, 12, 9, 4)
#define SYS_ICC_SRE_EL2 sys_reg(3, 4, 12, 9, 5)
-#define SYS_ICH_HCR_EL2 sys_reg(3, 4, 12, 11, 0)
-#define SYS_ICH_VTR_EL2 sys_reg(3, 4, 12, 11, 1)
-#define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2)
#define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3)
#define SYS_ICH_ELRSR_EL2 sys_reg(3, 4, 12, 11, 5)
#define SYS_ICH_VMCR_EL2 sys_reg(3, 4, 12, 11, 7)
@@ -985,10 +982,6 @@
#define SYS_MPIDR_SAFE_VAL (BIT(31))
/* GIC Hypervisor interface registers */
-/* ICH_MISR_EL2 bit definitions */
-#define ICH_MISR_EOI (1 << 0)
-#define ICH_MISR_U (1 << 1)
-
/* ICH_LR*_EL2 bit definitions */
#define ICH_LR_VIRTUAL_ID_MASK ((1ULL << 32) - 1)
@@ -1003,17 +996,6 @@
#define ICH_LR_PRIORITY_SHIFT 48
#define ICH_LR_PRIORITY_MASK (0xffULL << ICH_LR_PRIORITY_SHIFT)
-/* ICH_HCR_EL2 bit definitions */
-#define ICH_HCR_EN (1 << 0)
-#define ICH_HCR_UIE (1 << 1)
-#define ICH_HCR_NPIE (1 << 3)
-#define ICH_HCR_TC (1 << 10)
-#define ICH_HCR_TALL0 (1 << 11)
-#define ICH_HCR_TALL1 (1 << 12)
-#define ICH_HCR_TDIR (1 << 14)
-#define ICH_HCR_EOIcount_SHIFT 27
-#define ICH_HCR_EOIcount_MASK (0x1f << ICH_HCR_EOIcount_SHIFT)
-
/* ICH_VMCR_EL2 bit definitions */
#define ICH_VMCR_ACK_CTL_SHIFT 2
#define ICH_VMCR_ACK_CTL_MASK (1 << ICH_VMCR_ACK_CTL_SHIFT)
@@ -1034,18 +1016,6 @@
#define ICH_VMCR_ENG1_SHIFT 1
#define ICH_VMCR_ENG1_MASK (1 << ICH_VMCR_ENG1_SHIFT)
-/* ICH_VTR_EL2 bit definitions */
-#define ICH_VTR_PRI_BITS_SHIFT 29
-#define ICH_VTR_PRI_BITS_MASK (7 << ICH_VTR_PRI_BITS_SHIFT)
-#define ICH_VTR_ID_BITS_SHIFT 23
-#define ICH_VTR_ID_BITS_MASK (7 << ICH_VTR_ID_BITS_SHIFT)
-#define ICH_VTR_SEIS_SHIFT 22
-#define ICH_VTR_SEIS_MASK (1 << ICH_VTR_SEIS_SHIFT)
-#define ICH_VTR_A3V_SHIFT 21
-#define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT)
-#define ICH_VTR_TDS_SHIFT 19
-#define ICH_VTR_TDS_MASK (1 << ICH_VTR_TDS_SHIFT)
-
/*
* Permission Indirection Extension (PIE) permission encodings.
* Encodings with the _O suffix, have overlays applied (Permission Overlay Extension).
@@ -1062,8 +1032,11 @@
#define PIE_RX UL(0xa)
#define PIE_RW UL(0xc)
#define PIE_RWX UL(0xe)
+#define PIE_MASK UL(0xf)
-#define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4))
+#define PIRx_ELx_BITS_PER_IDX 4
+#define PIRx_ELx_PERM_SHIFT(idx) ((idx) * PIRx_ELx_BITS_PER_IDX)
+#define PIRx_ELx_PERM_PREP(idx, perm) (((perm) & PIE_MASK) << PIRx_ELx_PERM_SHIFT(idx))
/*
* Permission Overlay Extension (POE) permission encodings.
@@ -1074,12 +1047,14 @@
#define POE_RX UL(0x3)
#define POE_W UL(0x4)
#define POE_RW UL(0x5)
-#define POE_XW UL(0x6)
-#define POE_RXW UL(0x7)
+#define POE_WX UL(0x6)
+#define POE_RWX UL(0x7)
#define POE_MASK UL(0xf)
-/* Initial value for Permission Overlay Extension for EL0 */
-#define POR_EL0_INIT POE_RXW
+#define POR_ELx_BITS_PER_IDX 4
+#define POR_ELx_PERM_SHIFT(idx) ((idx) * POR_ELx_BITS_PER_IDX)
+#define POR_ELx_PERM_GET(idx, reg) (((reg) >> POR_ELx_PERM_SHIFT(idx)) & POE_MASK)
+#define POR_ELx_PERM_PREP(idx, perm) (((perm) & POE_MASK) << POR_ELx_PERM_SHIFT(idx))
/*
* Definitions for Guarded Control Stack
diff --git a/arch/arm64/include/asm/vdso.h b/arch/arm64/include/asm/vdso.h
index 3e3c3fdb1842..61679070f595 100644
--- a/arch/arm64/include/asm/vdso.h
+++ b/arch/arm64/include/asm/vdso.h
@@ -5,7 +5,7 @@
#ifndef __ASM_VDSO_H
#define __ASM_VDSO_H
-#define __VVAR_PAGES 2
+#define __VDSO_PAGES 4
#ifndef __ASSEMBLY__
diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h
index 778c1202bbbf..d60ea7a72a9c 100644
--- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h
@@ -104,7 +104,7 @@ int clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
}
static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
u64 res;
@@ -131,45 +131,33 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
return res;
}
-static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
+static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void)
{
- const struct vdso_data *ret;
+ const struct vdso_time_data *ret;
/*
- * This simply puts &_vdso_data into ret. The reason why we don't use
- * `ret = _vdso_data` is that the compiler tends to optimise this in a
- * very suboptimal way: instead of keeping &_vdso_data in a register,
- * it goes through a relocation almost every time _vdso_data must be
+ * This simply puts &_vdso_time_data into ret. The reason why we don't use
+ * `ret = _vdso_time_data` is that the compiler tends to optimise this in a
+ * very suboptimal way: instead of keeping &_vdso_time_data in a register,
+ * it goes through a relocation almost every time _vdso_time_data must be
* accessed (even in subfunctions). This is both time and space
* consuming: each relocation uses a word in the code section, and it
* has to be loaded at runtime.
*
* This trick hides the assignment from the compiler. Since it cannot
* track where the pointer comes from, it will only use one relocation
- * where __arch_get_vdso_data() is called, and then keep the result in
- * a register.
+ * where __aarch64_get_vdso_u_time_data() is called, and then keep the
+ * result in a register.
*/
- asm volatile("mov %0, %1" : "=r"(ret) : "r"(_vdso_data));
+ asm volatile("mov %0, %1" : "=r"(ret) : "r"(&vdso_u_time_data));
return ret;
}
+#define __arch_get_vdso_u_time_data __arch_get_vdso_u_time_data
-#ifdef CONFIG_TIME_NS
-static __always_inline
-const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
-{
- const struct vdso_data *ret;
-
- /* See __arch_get_vdso_data(). */
- asm volatile("mov %0, %1" : "=r"(ret) : "r"(_timens_data));
-
- return ret;
-}
-#endif
-
-static inline bool vdso_clocksource_ok(const struct vdso_data *vd)
+static inline bool vdso_clocksource_ok(const struct vdso_clock *vc)
{
- return vd->clock_mode == VDSO_CLOCKMODE_ARCHTIMER;
+ return vc->clock_mode == VDSO_CLOCKMODE_ARCHTIMER;
}
#define vdso_clocksource_ok vdso_clocksource_ok
diff --git a/arch/arm64/include/asm/vdso/getrandom.h b/arch/arm64/include/asm/vdso/getrandom.h
index 342f807e2044..a2197da1951b 100644
--- a/arch/arm64/include/asm/vdso/getrandom.h
+++ b/arch/arm64/include/asm/vdso/getrandom.h
@@ -33,18 +33,6 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns
return ret;
}
-static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
-{
- /*
- * The RNG data is in the real VVAR data page, but if a task belongs to a time namespace
- * then VVAR_DATA_PAGE_OFFSET points to the namespace-specific VVAR page and VVAR_TIMENS_
- * PAGE_OFFSET points to the real VVAR page.
- */
- if (IS_ENABLED(CONFIG_TIME_NS) && _vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
- return (void *)&_vdso_rng_data + VVAR_TIMENS_PAGE_OFFSET * (1UL << CONFIG_PAGE_SHIFT);
- return &_vdso_rng_data;
-}
-
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h
index 764d13e2916c..92a2b59a9f3d 100644
--- a/arch/arm64/include/asm/vdso/gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/gettimeofday.h
@@ -67,7 +67,7 @@ int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
}
static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
u64 res;
@@ -99,20 +99,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
return res;
}
-static __always_inline
-const struct vdso_data *__arch_get_vdso_data(void)
-{
- return _vdso_data;
-}
-
-#ifdef CONFIG_TIME_NS
-static __always_inline
-const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
-{
- return _timens_data;
-}
-#endif
-
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/arm64/include/asm/vdso/vsyscall.h b/arch/arm64/include/asm/vdso/vsyscall.h
index eea51946d45a..de58951b8df6 100644
--- a/arch/arm64/include/asm/vdso/vsyscall.h
+++ b/arch/arm64/include/asm/vdso/vsyscall.h
@@ -2,44 +2,21 @@
#ifndef __ASM_VDSO_VSYSCALL_H
#define __ASM_VDSO_VSYSCALL_H
-#define __VDSO_RND_DATA_OFFSET 480
-
#ifndef __ASSEMBLY__
#include <vdso/datapage.h>
-enum vvar_pages {
- VVAR_DATA_PAGE_OFFSET,
- VVAR_TIMENS_PAGE_OFFSET,
- VVAR_NR_PAGES,
-};
-
#define VDSO_PRECISION_MASK ~(0xFF00ULL<<48)
-extern struct vdso_data *vdso_data;
/*
* Update the vDSO data page to keep in sync with kernel timekeeping.
*/
static __always_inline
-struct vdso_data *__arm64_get_k_vdso_data(void)
-{
- return vdso_data;
-}
-#define __arch_get_k_vdso_data __arm64_get_k_vdso_data
-
-static __always_inline
-struct vdso_rng_data *__arm64_get_k_vdso_rnd_data(void)
-{
- return (void *)vdso_data + __VDSO_RND_DATA_OFFSET;
-}
-#define __arch_get_k_vdso_rng_data __arm64_get_k_vdso_rnd_data
-
-static __always_inline
-void __arm64_update_vsyscall(struct vdso_data *vdata)
+void __arm64_update_vsyscall(struct vdso_time_data *vdata)
{
- vdata[CS_HRES_COARSE].mask = VDSO_PRECISION_MASK;
- vdata[CS_RAW].mask = VDSO_PRECISION_MASK;
+ vdata->clock_data[CS_HRES_COARSE].mask = VDSO_PRECISION_MASK;
+ vdata->clock_data[CS_RAW].mask = VDSO_PRECISION_MASK;
}
#define __arch_update_vsyscall __arm64_update_vsyscall
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 568bf858f319..af9d9acaf997 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -105,6 +105,7 @@ struct kvm_regs {
#define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */
#define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */
#define KVM_ARM_VCPU_HAS_EL2 7 /* Support nested virtualization */
+#define KVM_ARM_VCPU_HAS_EL2_E2H0 8 /* Limit NV support to E2H RES0 */
struct kvm_vcpu_init {
__u32 target;
@@ -371,6 +372,7 @@ enum {
#endif
};
+/* Vendor hyper call function numbers 0-63 */
#define KVM_REG_ARM_VENDOR_HYP_BMAP KVM_REG_ARM_FW_FEAT_BMAP_REG(2)
enum {
@@ -381,6 +383,17 @@ enum {
#endif
};
+/* Vendor hyper call function numbers 64-127 */
+#define KVM_REG_ARM_VENDOR_HYP_BMAP_2 KVM_REG_ARM_FW_FEAT_BMAP_REG(3)
+
+enum {
+ KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_VER = 0,
+ KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_CPUS = 1,
+#ifdef __KERNEL__
+ KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT,
+#endif
+};
+
/* Device Control API on vm fd */
#define KVM_ARM_VM_SMCCC_CTRL 0
#define KVM_ARM_VM_SMCCC_FILTER 0
@@ -403,6 +416,7 @@ enum {
#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7
#define KVM_DEV_ARM_VGIC_GRP_ITS_REGS 8
+#define KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ 9
#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10
#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
(0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 7ce555862895..b55f5f705750 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -14,31 +14,85 @@
#include <asm/kvm_asm.h>
#include <asm/smp_plat.h>
+static u64 target_impl_cpu_num;
+static struct target_impl_cpu *target_impl_cpus;
+
+bool cpu_errata_set_target_impl(u64 num, void *impl_cpus)
+{
+ if (target_impl_cpu_num || !num || !impl_cpus)
+ return false;
+
+ target_impl_cpu_num = num;
+ target_impl_cpus = impl_cpus;
+ return true;
+}
+
+static inline bool is_midr_in_range(struct midr_range const *range)
+{
+ int i;
+
+ if (!target_impl_cpu_num)
+ return midr_is_cpu_model_range(read_cpuid_id(), range->model,
+ range->rv_min, range->rv_max);
+
+ for (i = 0; i < target_impl_cpu_num; i++) {
+ if (midr_is_cpu_model_range(target_impl_cpus[i].midr,
+ range->model,
+ range->rv_min, range->rv_max))
+ return true;
+ }
+ return false;
+}
+
+bool is_midr_in_range_list(struct midr_range const *ranges)
+{
+ while (ranges->model)
+ if (is_midr_in_range(ranges++))
+ return true;
+ return false;
+}
+EXPORT_SYMBOL_GPL(is_midr_in_range_list);
+
static bool __maybe_unused
-is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope)
+__is_affected_midr_range(const struct arm64_cpu_capabilities *entry,
+ u32 midr, u32 revidr)
{
const struct arm64_midr_revidr *fix;
- u32 midr = read_cpuid_id(), revidr;
-
- WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
- if (!is_midr_in_range(midr, &entry->midr_range))
+ if (!is_midr_in_range(&entry->midr_range))
return false;
midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK;
- revidr = read_cpuid(REVIDR_EL1);
for (fix = entry->fixed_revs; fix && fix->revidr_mask; fix++)
if (midr == fix->midr_rv && (revidr & fix->revidr_mask))
return false;
-
return true;
}
static bool __maybe_unused
+is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope)
+{
+ int i;
+
+ if (!target_impl_cpu_num) {
+ WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
+ return __is_affected_midr_range(entry, read_cpuid_id(),
+ read_cpuid(REVIDR_EL1));
+ }
+
+ for (i = 0; i < target_impl_cpu_num; i++) {
+ if (__is_affected_midr_range(entry, target_impl_cpus[i].midr,
+ target_impl_cpus[i].midr))
+ return true;
+ }
+ return false;
+}
+
+static bool __maybe_unused
is_affected_midr_range_list(const struct arm64_cpu_capabilities *entry,
int scope)
{
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
- return is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list);
+ return is_midr_in_range_list(entry->midr_range_list);
}
static bool __maybe_unused
@@ -186,12 +240,48 @@ static bool __maybe_unused
has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry,
int scope)
{
- u32 midr = read_cpuid_id();
bool has_dic = read_cpuid_cachetype() & BIT(CTR_EL0_DIC_SHIFT);
const struct midr_range range = MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1);
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
- return is_midr_in_range(midr, &range) && has_dic;
+ return is_midr_in_range(&range) && has_dic;
+}
+
+static const struct midr_range impdef_pmuv3_cpus[] = {
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX),
+ {},
+};
+
+static bool has_impdef_pmuv3(const struct arm64_cpu_capabilities *entry, int scope)
+{
+ u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);
+ unsigned int pmuver;
+
+ if (!is_kernel_in_hyp_mode())
+ return false;
+
+ pmuver = cpuid_feature_extract_unsigned_field(dfr0,
+ ID_AA64DFR0_EL1_PMUVer_SHIFT);
+ if (pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
+ return false;
+
+ return is_midr_in_range_list(impdef_pmuv3_cpus);
+}
+
+static void cpu_enable_impdef_pmuv3_traps(const struct arm64_cpu_capabilities *__unused)
+{
+ sysreg_clear_set_s(SYS_HACR_EL2, 0, BIT(56));
}
#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
@@ -795,5 +885,12 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
})),
},
{
+ .desc = "Apple IMPDEF PMUv3 Traps",
+ .capability = ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS,
+ .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
+ .matches = has_impdef_pmuv3,
+ .cpu_enable = cpu_enable_impdef_pmuv3_traps,
+ },
+ {
}
};
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index d561cf3b8ac7..9c4d6d552b25 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -86,6 +86,7 @@
#include <asm/kvm_host.h>
#include <asm/mmu_context.h>
#include <asm/mte.h>
+#include <asm/hypervisor.h>
#include <asm/processor.h>
#include <asm/smp.h>
#include <asm/sysreg.h>
@@ -497,6 +498,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = {
static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = {
S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_NV_frac_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -1792,7 +1794,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
char const *str = "kpti command line option";
bool meltdown_safe;
- meltdown_safe = is_midr_in_range_list(read_cpuid_id(), kpti_safe_list);
+ meltdown_safe = is_midr_in_range_list(kpti_safe_list);
/* Defer to CPU feature registers */
if (has_cpuid_feature(entry, scope))
@@ -1862,7 +1864,7 @@ static bool has_nv1(const struct arm64_cpu_capabilities *entry, int scope)
return (__system_matches_cap(ARM64_HAS_NESTED_VIRT) &&
!(has_cpuid_feature(entry, scope) ||
- is_midr_in_range_list(read_cpuid_id(), nv1_ni_list)));
+ is_midr_in_range_list(nv1_ni_list)));
}
#if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2)
@@ -1898,6 +1900,28 @@ static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope)
}
#endif
+#ifdef CONFIG_HW_PERF_EVENTS
+static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope)
+{
+ u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);
+ unsigned int pmuver;
+
+ /*
+ * PMUVer follows the standard ID scheme for an unsigned field with the
+ * exception of 0xF (IMP_DEF) which is treated specially and implies
+ * FEAT_PMUv3 is not implemented.
+ *
+ * See DDI0487L.a D24.1.3.2 for more details.
+ */
+ pmuver = cpuid_feature_extract_unsigned_field(dfr0,
+ ID_AA64DFR0_EL1_PMUVer_SHIFT);
+ if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
+ return false;
+
+ return pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP;
+}
+#endif
+
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
#define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT))
@@ -2045,7 +2069,7 @@ static bool cpu_has_broken_dbm(void)
{},
};
- return is_midr_in_range_list(read_cpuid_id(), cpus);
+ return is_midr_in_range_list(cpus);
}
static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap)
@@ -2162,7 +2186,7 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap,
if (kvm_get_mode() != KVM_MODE_NV)
return false;
- if (!has_cpuid_feature(cap, scope)) {
+ if (!cpucap_multi_entry_cap_matches(cap, scope)) {
pr_warn("unavailable: %s\n", cap->desc);
return false;
}
@@ -2519,7 +2543,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.capability = ARM64_HAS_NESTED_VIRT,
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.matches = has_nested_virt_support,
- ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2)
+ .match_list = (const struct arm64_cpu_capabilities []){
+ {
+ .matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2)
+ },
+ {
+ .matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)
+ },
+ { /* Sentinel */ }
+ },
},
{
.capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE,
@@ -2999,6 +3033,14 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP)
},
#endif
+#ifdef CONFIG_HW_PERF_EVENTS
+ {
+ .desc = "PMUv3",
+ .capability = ARM64_HAS_PMUV3,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_pmuv3,
+ },
+#endif
{},
};
@@ -3680,6 +3722,7 @@ unsigned long cpu_get_elf_hwcap3(void)
static void __init setup_boot_cpu_capabilities(void)
{
+ kvm_arm_target_impl_cpu_init();
/*
* The boot CPU's feature register values have been recorded. Detect
* boot cpucaps and local cpucaps for the boot CPU, then enable and
diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c
index 2e94d20c4ac7..b735f4c2fe5e 100644
--- a/arch/arm64/kernel/elfcore.c
+++ b/arch/arm64/kernel/elfcore.c
@@ -27,9 +27,10 @@ static int mte_dump_tag_range(struct coredump_params *cprm,
int ret = 1;
unsigned long addr;
void *tags = NULL;
+ int locked = 0;
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
- struct page *page = get_dump_page(addr);
+ struct page *page = get_dump_page(addr, &locked);
/*
* get_dump_page() returns NULL when encountering an empty
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index ef3a69cc398e..5e3c4b58f279 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -49,6 +49,7 @@ PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override);
PROVIDE(__pi_arm64_use_ng_mappings = arm64_use_ng_mappings);
#ifdef CONFIG_CAVIUM_ERRATUM_27456
PROVIDE(__pi_cavium_erratum_27456_cpus = cavium_erratum_27456_cpus);
+PROVIDE(__pi_is_midr_in_range_list = is_midr_in_range_list);
#endif
PROVIDE(__pi__ctype = _ctype);
PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed);
@@ -112,11 +113,6 @@ KVM_NVHE_ALIAS(broken_cntvoff_key);
KVM_NVHE_ALIAS(__start___kvm_ex_table);
KVM_NVHE_ALIAS(__stop___kvm_ex_table);
-/* PMU available static key */
-#ifdef CONFIG_HW_PERF_EVENTS
-KVM_NVHE_ALIAS(kvm_arm_pmu_available);
-#endif
-
/* Position-independent library routines */
KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page);
KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page);
diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c
index 2b69e3beeef8..81345f68f9fc 100644
--- a/arch/arm64/kernel/pi/map_range.c
+++ b/arch/arm64/kernel/pi/map_range.c
@@ -31,7 +31,7 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot,
{
u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX;
pteval_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK;
- int lshift = (3 - level) * (PAGE_SHIFT - 3);
+ int lshift = (3 - level) * PTDESC_TABLE_SHIFT;
u64 lmask = (PAGE_SIZE << lshift) - 1;
start &= PAGE_MASK;
@@ -45,12 +45,12 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot,
* clearing the mapping
*/
if (protval)
- protval |= (level < 3) ? PMD_TYPE_SECT : PTE_TYPE_PAGE;
+ protval |= (level == 2) ? PMD_TYPE_SECT : PTE_TYPE_PAGE;
while (start < end) {
u64 next = min((start | lmask) + 1, PAGE_ALIGN(end));
- if (level < 3 && (start | next | pa) & lmask) {
+ if (level < 2 || (level == 2 && (start | next | pa) & lmask)) {
/*
* This chunk needs a finer grained mapping. Create a
* table mapping if necessary and recurse.
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index da53722f95d4..d5d11fd11549 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -172,7 +172,7 @@ static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void)
return SPECTRE_UNAFFECTED;
/* Alternatively, we have a list of unaffected CPUs */
- if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list))
+ if (is_midr_in_range_list(spectre_v2_safe_list))
return SPECTRE_UNAFFECTED;
return SPECTRE_VULNERABLE;
@@ -331,7 +331,7 @@ bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope)
};
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
- return is_midr_in_range_list(read_cpuid_id(), spectre_v3a_unsafe_list);
+ return is_midr_in_range_list(spectre_v3a_unsafe_list);
}
void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
@@ -475,7 +475,7 @@ static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void)
{ /* sentinel */ },
};
- if (is_midr_in_range_list(read_cpuid_id(), spectre_v4_safe_list))
+ if (is_midr_in_range_list(spectre_v4_safe_list))
return SPECTRE_UNAFFECTED;
/* CPU features are detected first */
@@ -845,52 +845,86 @@ static unsigned long system_bhb_mitigations;
* This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any
* SCOPE_SYSTEM call will give the right answer.
*/
-u8 spectre_bhb_loop_affected(int scope)
+static bool is_spectre_bhb_safe(int scope)
+{
+ static const struct midr_range spectre_bhb_safe_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A510),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A520),
+ MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
+ MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER),
+ MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
+ MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
+ {},
+ };
+ static bool all_safe = true;
+
+ if (scope != SCOPE_LOCAL_CPU)
+ return all_safe;
+
+ if (is_midr_in_range_list(spectre_bhb_safe_list))
+ return true;
+
+ all_safe = false;
+
+ return false;
+}
+
+static u8 spectre_bhb_loop_affected(void)
{
u8 k = 0;
- static u8 max_bhb_k;
-
- if (scope == SCOPE_LOCAL_CPU) {
- static const struct midr_range spectre_bhb_k32_list[] = {
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_X2),
- MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
- MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
- {},
- };
- static const struct midr_range spectre_bhb_k24_list[] = {
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
- MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
- {},
- };
- static const struct midr_range spectre_bhb_k11_list[] = {
- MIDR_ALL_VERSIONS(MIDR_AMPERE1),
- {},
- };
- static const struct midr_range spectre_bhb_k8_list[] = {
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
- {},
- };
-
- if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
- k = 32;
- else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list))
- k = 24;
- else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k11_list))
- k = 11;
- else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k8_list))
- k = 8;
-
- max_bhb_k = max(max_bhb_k, k);
- } else {
- k = max_bhb_k;
- }
+
+ static const struct midr_range spectre_bhb_k132_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
+ };
+ static const struct midr_range spectre_bhb_k38_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
+ };
+ static const struct midr_range spectre_bhb_k32_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X2),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
+ {},
+ };
+ static const struct midr_range spectre_bhb_k24_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
+ MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD),
+ {},
+ };
+ static const struct midr_range spectre_bhb_k11_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_AMPERE1),
+ {},
+ };
+ static const struct midr_range spectre_bhb_k8_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
+ {},
+ };
+
+ if (is_midr_in_range_list(spectre_bhb_k132_list))
+ k = 132;
+ else if (is_midr_in_range_list(spectre_bhb_k38_list))
+ k = 38;
+ else if (is_midr_in_range_list(spectre_bhb_k32_list))
+ k = 32;
+ else if (is_midr_in_range_list(spectre_bhb_k24_list))
+ k = 24;
+ else if (is_midr_in_range_list(spectre_bhb_k11_list))
+ k = 11;
+ else if (is_midr_in_range_list(spectre_bhb_k8_list))
+ k = 8;
return k;
}
@@ -916,29 +950,13 @@ static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void)
}
}
-static bool is_spectre_bhb_fw_affected(int scope)
+static bool has_spectre_bhb_fw_mitigation(void)
{
- static bool system_affected;
enum mitigation_state fw_state;
bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE;
- static const struct midr_range spectre_bhb_firmware_mitigated_list[] = {
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A75),
- {},
- };
- bool cpu_in_list = is_midr_in_range_list(read_cpuid_id(),
- spectre_bhb_firmware_mitigated_list);
-
- if (scope != SCOPE_LOCAL_CPU)
- return system_affected;
fw_state = spectre_bhb_get_cpu_fw_mitigation_state();
- if (cpu_in_list || (has_smccc && fw_state == SPECTRE_MITIGATED)) {
- system_affected = true;
- return true;
- }
-
- return false;
+ return has_smccc && fw_state == SPECTRE_MITIGATED;
}
static bool supports_ecbhb(int scope)
@@ -954,6 +972,8 @@ static bool supports_ecbhb(int scope)
ID_AA64MMFR1_EL1_ECBHB_SHIFT);
}
+static u8 max_bhb_k;
+
bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry,
int scope)
{
@@ -962,16 +982,18 @@ bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry,
if (supports_csv2p3(scope))
return false;
- if (supports_clearbhb(scope))
- return true;
-
- if (spectre_bhb_loop_affected(scope))
- return true;
+ if (is_spectre_bhb_safe(scope))
+ return false;
- if (is_spectre_bhb_fw_affected(scope))
- return true;
+ /*
+ * At this point the core isn't known to be "safe" so we're going to
+ * assume it's vulnerable. We still need to update `max_bhb_k` though,
+ * but only if we aren't mitigating with clearbhb though.
+ */
+ if (scope == SCOPE_LOCAL_CPU && !supports_clearbhb(SCOPE_LOCAL_CPU))
+ max_bhb_k = max(max_bhb_k, spectre_bhb_loop_affected());
- return false;
+ return true;
}
static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot)
@@ -1002,7 +1024,7 @@ early_param("nospectre_bhb", parse_spectre_bhb_param);
void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry)
{
bp_hardening_cb_t cpu_cb;
- enum mitigation_state fw_state, state = SPECTRE_VULNERABLE;
+ enum mitigation_state state = SPECTRE_VULNERABLE;
struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU))
@@ -1028,7 +1050,7 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry)
this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN);
state = SPECTRE_MITIGATED;
set_bit(BHB_INSN, &system_bhb_mitigations);
- } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) {
+ } else if (spectre_bhb_loop_affected()) {
/*
* Ensure KVM uses the indirect vector which will have the
* branchy-loop added. A57/A72-r0 will already have selected
@@ -1041,32 +1063,29 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry)
this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP);
state = SPECTRE_MITIGATED;
set_bit(BHB_LOOP, &system_bhb_mitigations);
- } else if (is_spectre_bhb_fw_affected(SCOPE_LOCAL_CPU)) {
- fw_state = spectre_bhb_get_cpu_fw_mitigation_state();
- if (fw_state == SPECTRE_MITIGATED) {
- /*
- * Ensure KVM uses one of the spectre bp_hardening
- * vectors. The indirect vector doesn't include the EL3
- * call, so needs upgrading to
- * HYP_VECTOR_SPECTRE_INDIRECT.
- */
- if (!data->slot || data->slot == HYP_VECTOR_INDIRECT)
- data->slot += 1;
-
- this_cpu_set_vectors(EL1_VECTOR_BHB_FW);
-
- /*
- * The WA3 call in the vectors supersedes the WA1 call
- * made during context-switch. Uninstall any firmware
- * bp_hardening callback.
- */
- cpu_cb = spectre_v2_get_sw_mitigation_cb();
- if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb)
- __this_cpu_write(bp_hardening_data.fn, NULL);
-
- state = SPECTRE_MITIGATED;
- set_bit(BHB_FW, &system_bhb_mitigations);
- }
+ } else if (has_spectre_bhb_fw_mitigation()) {
+ /*
+ * Ensure KVM uses one of the spectre bp_hardening
+ * vectors. The indirect vector doesn't include the EL3
+ * call, so needs upgrading to
+ * HYP_VECTOR_SPECTRE_INDIRECT.
+ */
+ if (!data->slot || data->slot == HYP_VECTOR_INDIRECT)
+ data->slot += 1;
+
+ this_cpu_set_vectors(EL1_VECTOR_BHB_FW);
+
+ /*
+ * The WA3 call in the vectors supersedes the WA1 call
+ * made during context-switch. Uninstall any firmware
+ * bp_hardening callback.
+ */
+ cpu_cb = spectre_v2_get_sw_mitigation_cb();
+ if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb)
+ __this_cpu_write(bp_hardening_data.fn, NULL);
+
+ state = SPECTRE_MITIGATED;
+ set_bit(BHB_FW, &system_bhb_mitigations);
}
update_mitigation_state(&spectre_bhb_state, state);
@@ -1100,7 +1119,6 @@ void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt,
{
u8 rd;
u32 insn;
- u16 loop_count = spectre_bhb_loop_affected(SCOPE_SYSTEM);
BUG_ON(nr_inst != 1); /* MOV -> MOV */
@@ -1109,7 +1127,7 @@ void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt,
insn = le32_to_cpu(*origptr);
rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn);
- insn = aarch64_insn_gen_movewide(rd, loop_count, 0,
+ insn = aarch64_insn_gen_movewide(rd, max_bhb_k, 0,
AARCH64_INSN_VARIANT_64BIT,
AARCH64_INSN_MOVEWIDE_ZERO);
*updptr++ = cpu_to_le32(insn);
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 99ea26d400ff..a7c37afb4ebe 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -91,7 +91,7 @@ static void save_reset_user_access_state(struct user_access_state *ua_state)
u64 por_enable_all = 0;
for (int pkey = 0; pkey < arch_max_pkey(); pkey++)
- por_enable_all |= POE_RXW << (pkey * POR_BITS_PER_PKEY);
+ por_enable_all |= POR_ELx_PERM_PREP(pkey, POE_RWX);
ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0);
write_sysreg_s(por_enable_all, SYS_POR_EL0);
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index cb180684d10d..5d07ee85bdae 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -15,8 +15,11 @@
#include <linux/arch_topology.h>
#include <linux/cacheinfo.h>
#include <linux/cpufreq.h>
+#include <linux/cpu_smt.h>
#include <linux/init.h>
#include <linux/percpu.h>
+#include <linux/sched/isolation.h>
+#include <linux/xarray.h>
#include <asm/cpu.h>
#include <asm/cputype.h>
@@ -37,17 +40,28 @@ static bool __init acpi_cpu_is_threaded(int cpu)
return !!is_threaded;
}
+struct cpu_smt_info {
+ unsigned int thread_num;
+ int core_id;
+};
+
/*
* Propagate the topology information of the processor_topology_node tree to the
* cpu_topology array.
*/
int __init parse_acpi_topology(void)
{
+ unsigned int max_smt_thread_num = 1;
+ struct cpu_smt_info *entry;
+ struct xarray hetero_cpu;
+ unsigned long hetero_id;
int cpu, topology_id;
if (acpi_disabled)
return 0;
+ xa_init(&hetero_cpu);
+
for_each_possible_cpu(cpu) {
topology_id = find_acpi_cpu_topology(cpu, 0);
if (topology_id < 0)
@@ -57,6 +71,34 @@ int __init parse_acpi_topology(void)
cpu_topology[cpu].thread_id = topology_id;
topology_id = find_acpi_cpu_topology(cpu, 1);
cpu_topology[cpu].core_id = topology_id;
+
+ /*
+ * In the PPTT, CPUs below a node with the 'identical
+ * implementation' flag have the same number of threads.
+ * Count the number of threads for only one CPU (i.e.
+ * one core_id) among those with the same hetero_id.
+ * See the comment of find_acpi_cpu_topology_hetero_id()
+ * for more details.
+ *
+ * One entry is created for each node having:
+ * - the 'identical implementation' flag
+ * - its parent not having the flag
+ */
+ hetero_id = find_acpi_cpu_topology_hetero_id(cpu);
+ entry = xa_load(&hetero_cpu, hetero_id);
+ if (!entry) {
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ WARN_ON_ONCE(!entry);
+
+ if (entry) {
+ entry->core_id = topology_id;
+ entry->thread_num = 1;
+ xa_store(&hetero_cpu, hetero_id,
+ entry, GFP_KERNEL);
+ }
+ } else if (entry->core_id == topology_id) {
+ entry->thread_num++;
+ }
} else {
cpu_topology[cpu].thread_id = -1;
cpu_topology[cpu].core_id = topology_id;
@@ -67,6 +109,19 @@ int __init parse_acpi_topology(void)
cpu_topology[cpu].package_id = topology_id;
}
+ /*
+ * This is a short loop since the number of XArray elements is the
+ * number of heterogeneous CPU clusters. On a homogeneous system
+ * there's only one entry in the XArray.
+ */
+ xa_for_each(&hetero_cpu, hetero_id, entry) {
+ max_smt_thread_num = max(max_smt_thread_num, entry->thread_num);
+ xa_erase(&hetero_cpu, hetero_id);
+ kfree(entry);
+ }
+
+ cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
+ xa_destroy(&hetero_cpu);
return 0;
}
#endif
@@ -88,18 +143,28 @@ int __init parse_acpi_topology(void)
* initialized.
*/
static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT);
-static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
-static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
static cpumask_var_t amu_fie_cpus;
+struct amu_cntr_sample {
+ u64 arch_const_cycles_prev;
+ u64 arch_core_cycles_prev;
+ unsigned long last_scale_update;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples);
+
void update_freq_counters_refs(void)
{
- this_cpu_write(arch_core_cycles_prev, read_corecnt());
- this_cpu_write(arch_const_cycles_prev, read_constcnt());
+ struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
+
+ amu_sample->arch_core_cycles_prev = read_corecnt();
+ amu_sample->arch_const_cycles_prev = read_constcnt();
}
static inline bool freq_counters_valid(int cpu)
{
+ struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
+
if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask))
return false;
@@ -108,8 +173,8 @@ static inline bool freq_counters_valid(int cpu)
return false;
}
- if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) ||
- !per_cpu(arch_core_cycles_prev, cpu))) {
+ if (unlikely(!amu_sample->arch_const_cycles_prev ||
+ !amu_sample->arch_core_cycles_prev)) {
pr_debug("CPU%d: cycle counters are not enabled.\n", cpu);
return false;
}
@@ -152,17 +217,22 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate)
static void amu_scale_freq_tick(void)
{
+ struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
u64 prev_core_cnt, prev_const_cnt;
u64 core_cnt, const_cnt, scale;
- prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
- prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
+ prev_const_cnt = amu_sample->arch_const_cycles_prev;
+ prev_core_cnt = amu_sample->arch_core_cycles_prev;
update_freq_counters_refs();
- const_cnt = this_cpu_read(arch_const_cycles_prev);
- core_cnt = this_cpu_read(arch_core_cycles_prev);
+ const_cnt = amu_sample->arch_const_cycles_prev;
+ core_cnt = amu_sample->arch_core_cycles_prev;
+ /*
+ * This should not happen unless the AMUs have been reset and the
+ * counter values have not been restored - unlikely
+ */
if (unlikely(core_cnt <= prev_core_cnt ||
const_cnt <= prev_const_cnt))
return;
@@ -182,6 +252,8 @@ static void amu_scale_freq_tick(void)
scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
this_cpu_write(arch_freq_scale, (unsigned long)scale);
+
+ amu_sample->last_scale_update = jiffies;
}
static struct scale_freq_data amu_sfd = {
@@ -189,6 +261,96 @@ static struct scale_freq_data amu_sfd = {
.set_freq_scale = amu_scale_freq_tick,
};
+static __always_inline bool amu_fie_cpu_supported(unsigned int cpu)
+{
+ return cpumask_available(amu_fie_cpus) &&
+ cpumask_test_cpu(cpu, amu_fie_cpus);
+}
+
+void arch_cpu_idle_enter(void)
+{
+ unsigned int cpu = smp_processor_id();
+
+ if (!amu_fie_cpu_supported(cpu))
+ return;
+
+ /* Kick in AMU update but only if one has not happened already */
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK) &&
+ time_is_before_jiffies(per_cpu(cpu_amu_samples.last_scale_update, cpu)))
+ amu_scale_freq_tick();
+}
+
+#define AMU_SAMPLE_EXP_MS 20
+
+int arch_freq_get_on_cpu(int cpu)
+{
+ struct amu_cntr_sample *amu_sample;
+ unsigned int start_cpu = cpu;
+ unsigned long last_update;
+ unsigned int freq = 0;
+ u64 scale;
+
+ if (!amu_fie_cpu_supported(cpu) || !arch_scale_freq_ref(cpu))
+ return -EOPNOTSUPP;
+
+ while (1) {
+
+ amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
+
+ last_update = amu_sample->last_scale_update;
+
+ /*
+ * For those CPUs that are in full dynticks mode, or those that have
+ * not seen tick for a while, try an alternative source for the counters
+ * (and thus freq scale), if available, for given policy: this boils
+ * down to identifying an active cpu within the same freq domain, if any.
+ */
+ if (!housekeeping_cpu(cpu, HK_TYPE_TICK) ||
+ time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
+ struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+ int ref_cpu;
+
+ if (!policy)
+ return -EINVAL;
+
+ if (!cpumask_intersects(policy->related_cpus,
+ housekeeping_cpumask(HK_TYPE_TICK))) {
+ cpufreq_cpu_put(policy);
+ return -EOPNOTSUPP;
+ }
+
+ for_each_cpu_wrap(ref_cpu, policy->cpus, cpu + 1) {
+ if (ref_cpu == start_cpu) {
+ /* Prevent verifying same CPU twice */
+ ref_cpu = nr_cpu_ids;
+ break;
+ }
+ if (!idle_cpu(ref_cpu))
+ break;
+ }
+
+ cpufreq_cpu_put(policy);
+
+ if (ref_cpu >= nr_cpu_ids)
+ /* No alternative to pull info from */
+ return -EAGAIN;
+
+ cpu = ref_cpu;
+ } else {
+ break;
+ }
+ }
+ /*
+ * Reversed computation to the one used to determine
+ * the arch_freq_scale value
+ * (see amu_scale_freq_tick for details)
+ */
+ scale = arch_scale_freq_capacity(cpu);
+ freq = scale * arch_scale_freq_ref(cpu);
+ freq >>= SCHED_CAPACITY_SHIFT;
+ return freq;
+}
+
static void amu_fie_setup(const struct cpumask *cpus)
{
int cpu;
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 4e26bd356a48..529cff825531 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -172,14 +172,6 @@ static void dump_kernel_instr(const char *lvl, struct pt_regs *regs)
printk("%sCode: %s\n", lvl, str);
}
-#ifdef CONFIG_PREEMPT
-#define S_PREEMPT " PREEMPT"
-#elif defined(CONFIG_PREEMPT_RT)
-#define S_PREEMPT " PREEMPT_RT"
-#else
-#define S_PREEMPT ""
-#endif
-
#define S_SMP " SMP"
static int __die(const char *str, long err, struct pt_regs *regs)
@@ -187,7 +179,7 @@ static int __die(const char *str, long err, struct pt_regs *regs)
static int die_counter;
int ret;
- pr_emerg("Internal error: %s: %016lx [#%d]" S_PREEMPT S_SMP "\n",
+ pr_emerg("Internal error: %s: %016lx [#%d] " S_SMP "\n",
str, err, ++die_counter);
/* trap and error numbers are mostly meaningless on ARM */
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index e8ed8e5b713b..887ac0b05961 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -18,7 +18,7 @@
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/slab.h>
-#include <linux/time_namespace.h>
+#include <linux/vdso_datastore.h>
#include <linux/vmalloc.h>
#include <vdso/datapage.h>
#include <vdso/helpers.h>
@@ -57,12 +57,6 @@ static struct vdso_abi_info vdso_info[] __ro_after_init = {
#endif /* CONFIG_COMPAT_VDSO */
};
-/*
- * The vDSO data page.
- */
-static union vdso_data_store vdso_data_store __page_aligned_data;
-struct vdso_data *vdso_data = vdso_data_store.data;
-
static int vdso_mremap(const struct vm_special_mapping *sm,
struct vm_area_struct *new_vma)
{
@@ -104,78 +98,6 @@ static int __init __vdso_init(enum vdso_abi abi)
return 0;
}
-#ifdef CONFIG_TIME_NS
-struct vdso_data *arch_get_vdso_data(void *vvar_page)
-{
- return (struct vdso_data *)(vvar_page);
-}
-
-static const struct vm_special_mapping vvar_map;
-
-/*
- * The vvar mapping contains data for a specific time namespace, so when a task
- * changes namespace we must unmap its vvar data for the old namespace.
- * Subsequent faults will map in data for the new namespace.
- *
- * For more details see timens_setup_vdso_data().
- */
-int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
-{
- struct mm_struct *mm = task->mm;
- struct vm_area_struct *vma;
- VMA_ITERATOR(vmi, mm, 0);
-
- mmap_read_lock(mm);
-
- for_each_vma(vmi, vma) {
- if (vma_is_special_mapping(vma, &vvar_map))
- zap_vma_pages(vma);
- }
-
- mmap_read_unlock(mm);
- return 0;
-}
-#endif
-
-static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct page *timens_page = find_timens_vvar_page(vma);
- unsigned long pfn;
-
- switch (vmf->pgoff) {
- case VVAR_DATA_PAGE_OFFSET:
- if (timens_page)
- pfn = page_to_pfn(timens_page);
- else
- pfn = sym_to_pfn(vdso_data);
- break;
-#ifdef CONFIG_TIME_NS
- case VVAR_TIMENS_PAGE_OFFSET:
- /*
- * If a task belongs to a time namespace then a namespace
- * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
- * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
- * offset.
- * See also the comment near timens_setup_vdso_data().
- */
- if (!timens_page)
- return VM_FAULT_SIGBUS;
- pfn = sym_to_pfn(vdso_data);
- break;
-#endif /* CONFIG_TIME_NS */
- default:
- return VM_FAULT_SIGBUS;
- }
-
- return vmf_insert_pfn(vma, vmf->address, pfn);
-}
-
-static const struct vm_special_mapping vvar_map = {
- .name = "[vvar]",
- .fault = vvar_fault,
-};
-
static int __setup_additional_pages(enum vdso_abi abi,
struct mm_struct *mm,
struct linux_binprm *bprm,
@@ -185,11 +107,11 @@ static int __setup_additional_pages(enum vdso_abi abi,
unsigned long gp_flags = 0;
void *ret;
- BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
+ BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES);
vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT;
/* Be sure to map the data page */
- vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE;
+ vdso_mapping_len = vdso_text_len + VDSO_NR_PAGES * PAGE_SIZE;
vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0);
if (IS_ERR_VALUE(vdso_base)) {
@@ -197,16 +119,14 @@ static int __setup_additional_pages(enum vdso_abi abi,
goto up_fail;
}
- ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE,
- VM_READ|VM_MAYREAD|VM_PFNMAP,
- &vvar_map);
+ ret = vdso_install_vvar_mapping(mm, vdso_base);
if (IS_ERR(ret))
goto up_fail;
if (system_supports_bti_kernel())
gp_flags = VM_ARM64_BTI;
- vdso_base += VVAR_NR_PAGES * PAGE_SIZE;
+ vdso_base += VDSO_NR_PAGES * PAGE_SIZE;
mm->context.vdso = (void *)vdso_base;
ret = _install_special_mapping(mm, vdso_base, vdso_text_len,
VM_READ|VM_EXEC|gp_flags|
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 35685c036044..5e27e46aa496 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -7,7 +7,7 @@
#
# Include the generic Makefile to check the built vdso.
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
obj-vdso := vgettimeofday.o note.o sigreturn.o vgetrandom.o vgetrandom-chacha.o
diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S
index 47ad6944f9f0..52314be29191 100644
--- a/arch/arm64/kernel/vdso/vdso.lds.S
+++ b/arch/arm64/kernel/vdso/vdso.lds.S
@@ -20,11 +20,8 @@ OUTPUT_ARCH(aarch64)
SECTIONS
{
- PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
- PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET);
-#ifdef CONFIG_TIME_NS
- PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
-#endif
+ VDSO_VVAR_SYMS
+
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile
index 25a2cb6317f3..f2dfdc7dc818 100644
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -3,7 +3,7 @@
# Makefile for vdso32
#
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
# Same as cc-*option, but using CC_COMPAT instead of CC
ifeq ($(CONFIG_CC_IS_CLANG), y)
diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S
index 732702a187e9..e02b27487ce8 100644
--- a/arch/arm64/kernel/vdso32/vdso.lds.S
+++ b/arch/arm64/kernel/vdso32/vdso.lds.S
@@ -12,16 +12,15 @@
#include <asm/page.h>
#include <asm/vdso.h>
#include <asm-generic/vmlinux.lds.h>
+#include <vdso/datapage.h>
OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm")
OUTPUT_ARCH(arm)
SECTIONS
{
- PROVIDE_HIDDEN(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
-#ifdef CONFIG_TIME_NS
- PROVIDE_HIDDEN(_timens_data = _vdso_data + PAGE_SIZE);
-#endif
+ VDSO_VVAR_SYMS
+
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 3cf7adb2b503..209bc76263f1 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -23,7 +23,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
vgic/vgic-v3.o vgic/vgic-v4.o \
vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \
vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
- vgic/vgic-its.o vgic/vgic-debug.o
+ vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o
kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 70802e4c91cf..5133dcbfe9f7 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -1070,8 +1070,7 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid)
else
ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset;
- hrtimer_init(&ctxt->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- ctxt->hrtimer.function = kvm_hrtimer_expire;
+ hrtimer_setup(&ctxt->hrtimer, kvm_hrtimer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
switch (timerid) {
case TIMER_PTIMER:
@@ -1098,8 +1097,8 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
timer_set_offset(vcpu_ptimer(vcpu), 0);
}
- hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- timer->bg_timer.function = kvm_bg_timer_expire;
+ hrtimer_setup(&timer->bg_timer, kvm_bg_timer_expire, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
}
void kvm_timer_init_vm(struct kvm *kvm)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 0160b4924351..68fec8c95fee 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -125,6 +125,14 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
}
mutex_unlock(&kvm->slots_lock);
break;
+ case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ r = 0;
+ set_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags);
+ }
+ mutex_unlock(&kvm->lock);
+ break;
default:
break;
}
@@ -313,6 +321,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_SYSTEM_SUSPEND:
case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_COUNTER_OFFSET:
+ case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
@@ -366,7 +375,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = get_num_wrps();
break;
case KVM_CAP_ARM_PMU_V3:
- r = kvm_arm_support_pmu_v3();
+ r = kvm_supports_guest_pmuv3();
break;
case KVM_CAP_ARM_INJECT_SERROR_ESR:
r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
@@ -466,7 +475,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (err)
return err;
- return kvm_share_hyp(vcpu, vcpu + 1);
+ err = kvm_share_hyp(vcpu, vcpu + 1);
+ if (err)
+ kvm_vgic_vcpu_destroy(vcpu);
+
+ return err;
}
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
@@ -586,8 +599,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
nommu:
vcpu->cpu = cpu;
- kvm_vgic_load(vcpu);
+ /*
+ * The timer must be loaded before the vgic to correctly set up physical
+ * interrupt deactivation in nested state (e.g. timer interrupt).
+ */
kvm_timer_vcpu_load(vcpu);
+ kvm_vgic_load(vcpu);
kvm_vcpu_load_debug(vcpu);
if (has_vhe())
kvm_vcpu_load_vhe(vcpu);
@@ -825,6 +842,12 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
if (ret)
return ret;
+ if (vcpu_has_nv(vcpu)) {
+ ret = kvm_vgic_vcpu_nv_init(vcpu);
+ if (ret)
+ return ret;
+ }
+
/*
* This needs to happen after any restriction has been applied
* to the feature set.
@@ -835,14 +858,20 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
if (ret)
return ret;
- ret = kvm_arm_pmu_v3_enable(vcpu);
- if (ret)
- return ret;
+ if (kvm_vcpu_has_pmu(vcpu)) {
+ ret = kvm_arm_pmu_v3_enable(vcpu);
+ if (ret)
+ return ret;
+ }
if (is_protected_kvm_enabled()) {
ret = pkvm_create_hyp_vm(kvm);
if (ret)
return ret;
+
+ ret = pkvm_create_hyp_vcpu(vcpu);
+ if (ret)
+ return ret;
}
mutex_lock(&kvm->arch.config_lock);
@@ -1148,7 +1177,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
preempt_disable();
- kvm_pmu_flush_hwstate(vcpu);
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_flush_hwstate(vcpu);
local_irq_disable();
@@ -1167,7 +1197,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
isb(); /* Ensure work in x_flush_hwstate is committed */
- kvm_pmu_sync_hwstate(vcpu);
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_sync_hwstate(vcpu);
if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
kvm_timer_sync_user(vcpu);
kvm_vgic_sync_hwstate(vcpu);
@@ -1197,7 +1228,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
* that the vgic can properly sample the updated state of the
* interrupt line.
*/
- kvm_pmu_sync_hwstate(vcpu);
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_sync_hwstate(vcpu);
/*
* Sync the vgic state before syncing the timer state because
@@ -1386,7 +1418,7 @@ static unsigned long system_supported_vcpu_features(void)
if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1))
clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features);
- if (!kvm_arm_support_pmu_v3())
+ if (!kvm_supports_guest_pmuv3())
clear_bit(KVM_ARM_VCPU_PMU_V3, &features);
if (!system_supports_sve())
@@ -2307,6 +2339,13 @@ static int __init init_subsystems(void)
goto out;
}
+ if (kvm_mode == KVM_MODE_NV &&
+ !(vgic_present && kvm_vgic_global_state.type == VGIC_V3)) {
+ kvm_err("NV support requires GICv3, giving up\n");
+ err = -EINVAL;
+ goto out;
+ }
+
/*
* Init HYP architected timer support
*/
@@ -2714,6 +2753,14 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry;
+
+ /*
+ * The only thing we have a chance of directly-injecting is LPIs. Maybe
+ * one day...
+ */
+ if (irq_entry->type != KVM_IRQ_ROUTING_MSI)
+ return 0;
return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
&irqfd->irq_entry);
@@ -2723,6 +2770,10 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry;
+
+ if (irq_entry->type != KVM_IRQ_ROUTING_MSI)
+ return;
kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
&irqfd->irq_entry);
@@ -2803,11 +2854,12 @@ static __init int kvm_arm_init(void)
if (err)
goto out_hyp;
- kvm_info("%s%sVHE mode initialized successfully\n",
+ kvm_info("%s%sVHE%s mode initialized successfully\n",
in_hyp_mode ? "" : (is_protected_kvm_enabled() ?
"Protected " : "Hyp "),
in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ?
- "h" : "n"));
+ "h" : "n"),
+ cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) ? "+NV2": "");
/*
* FIXME: Do something reasonable if kvm_init() fails after pKVM
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 3a96c96816e9..f74a66ce3064 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -1090,22 +1090,22 @@ static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
break;
}
- if (pov_perms & ~POE_RXW)
+ if (pov_perms & ~POE_RWX)
pov_perms = POE_NONE;
if (wi->poe && wr->pov) {
wr->pr &= pov_perms & POE_R;
- wr->px &= pov_perms & POE_X;
wr->pw &= pov_perms & POE_W;
+ wr->px &= pov_perms & POE_X;
}
- if (uov_perms & ~POE_RXW)
+ if (uov_perms & ~POE_RWX)
uov_perms = POE_NONE;
if (wi->e0poe && wr->uov) {
wr->ur &= uov_perms & POE_R;
- wr->ux &= uov_perms & POE_X;
wr->uw &= uov_perms & POE_W;
+ wr->ux &= uov_perms & POE_X;
}
}
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
index 607d37bab70b..0fcfcc0478f9 100644
--- a/arch/arm64/kvm/emulate-nested.c
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -412,26 +412,26 @@ static const struct trap_bits coarse_trap_bits[] = {
},
[CGT_ICH_HCR_TC] = {
.index = ICH_HCR_EL2,
- .value = ICH_HCR_TC,
- .mask = ICH_HCR_TC,
+ .value = ICH_HCR_EL2_TC,
+ .mask = ICH_HCR_EL2_TC,
.behaviour = BEHAVE_FORWARD_RW,
},
[CGT_ICH_HCR_TALL0] = {
.index = ICH_HCR_EL2,
- .value = ICH_HCR_TALL0,
- .mask = ICH_HCR_TALL0,
+ .value = ICH_HCR_EL2_TALL0,
+ .mask = ICH_HCR_EL2_TALL0,
.behaviour = BEHAVE_FORWARD_RW,
},
[CGT_ICH_HCR_TALL1] = {
.index = ICH_HCR_EL2,
- .value = ICH_HCR_TALL1,
- .mask = ICH_HCR_TALL1,
+ .value = ICH_HCR_EL2_TALL1,
+ .mask = ICH_HCR_EL2_TALL1,
.behaviour = BEHAVE_FORWARD_RW,
},
[CGT_ICH_HCR_TDIR] = {
.index = ICH_HCR_EL2,
- .value = ICH_HCR_TDIR,
- .mask = ICH_HCR_TDIR,
+ .value = ICH_HCR_EL2_TDIR,
+ .mask = ICH_HCR_EL2_TDIR,
.behaviour = BEHAVE_FORWARD_RW,
},
};
@@ -2503,6 +2503,7 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu)
}
preempt_disable();
+ vcpu_set_flag(vcpu, IN_NESTED_ERET);
kvm_arch_vcpu_put(vcpu);
if (!esr_iss_is_eretax(esr))
@@ -2514,9 +2515,11 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu)
*vcpu_cpsr(vcpu) = spsr;
kvm_arch_vcpu_load(vcpu, smp_processor_id());
+ vcpu_clear_flag(vcpu, IN_NESTED_ERET);
preempt_enable();
- kvm_pmu_nested_transition(vcpu);
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_nested_transition(vcpu);
}
static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2,
@@ -2599,7 +2602,8 @@ static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2,
kvm_arch_vcpu_load(vcpu, smp_processor_id());
preempt_enable();
- kvm_pmu_nested_transition(vcpu);
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_nested_transition(vcpu);
return 1;
}
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 512d152233ff..b73dc26bc44b 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -129,8 +129,12 @@ static int kvm_handle_fpasimd(struct kvm_vcpu *vcpu)
static int kvm_handle_wfx(struct kvm_vcpu *vcpu)
{
u64 esr = kvm_vcpu_get_esr(vcpu);
+ bool is_wfe = !!(esr & ESR_ELx_WFx_ISS_WFE);
- if (esr & ESR_ELx_WFx_ISS_WFE) {
+ if (guest_hyp_wfx_traps_enabled(vcpu))
+ return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+
+ if (is_wfe) {
trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true);
vcpu->stat.wfe_exit_stat++;
} else {
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 23bbe28eaaf9..b741ea6aefa5 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -244,7 +244,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
* counter, which could make a PMXEVCNTR_EL0 access UNDEF at
* EL1 instead of being trapped to EL2.
*/
- if (kvm_arm_support_pmu_v3()) {
+ if (system_supports_pmuv3()) {
struct kvm_cpu_context *hctxt;
write_sysreg(0, pmselr_el0);
@@ -281,7 +281,7 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
write_sysreg(0, hstr_el2);
- if (kvm_arm_support_pmu_v3()) {
+ if (system_supports_pmuv3()) {
struct kvm_cpu_context *hctxt;
hctxt = host_data_ptr(host_ctxt);
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index 76ff095c6b6e..b9cff893bbe0 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -43,6 +43,17 @@ static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt)
return &ctxt_sys_reg(ctxt, MDSCR_EL1);
}
+static inline u64 ctxt_midr_el1(struct kvm_cpu_context *ctxt)
+{
+ struct kvm *kvm = kern_hyp_va(ctxt_to_vcpu(ctxt)->kvm);
+
+ if (!(ctxt_is_guest(ctxt) &&
+ test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags)))
+ return read_cpuid_id();
+
+ return kvm_read_vm_id_reg(kvm, SYS_MIDR_EL1);
+}
+
static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
{
*ctxt_mdscr_el1(ctxt) = read_sysreg(mdscr_el1);
@@ -168,8 +179,9 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
}
static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt,
- u64 mpidr)
+ u64 midr, u64 mpidr)
{
+ write_sysreg(midr, vpidr_el2);
write_sysreg(mpidr, vmpidr_el2);
if (has_vhe() ||
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 978f38c386ee..ea0a704da9b8 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -56,7 +56,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
int hyp_pin_shared_mem(void *from, void *to);
void hyp_unpin_shared_mem(void *from, void *to);
-void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc);
+void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc);
int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
struct kvm_hyp_memcache *host_mc);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
index e42bf68c8848..ce31d3b73603 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
@@ -43,12 +43,6 @@ struct pkvm_hyp_vm {
struct hyp_pool pool;
hyp_spinlock_t lock;
- /*
- * The number of vcpus initialized and ready to run.
- * Modifying this is protected by 'vm_table_lock'.
- */
- unsigned int nr_vcpus;
-
/* Array of the hyp vCPU structures for this VM. */
struct pkvm_hyp_vcpu *vcpus[];
};
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 19c3c631708c..f34f11c720d7 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -266,7 +266,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
return 0;
}
-void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
+void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
{
struct hyp_page *page;
void *addr;
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index 3927fe52a3dd..5a335a51deca 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -46,7 +46,8 @@ static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu)
vcpu->arch.hcr_el2 |= HCR_FWB;
if (cpus_have_final_cap(ARM64_HAS_EVT) &&
- !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE))
+ !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) &&
+ kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0) == read_cpuid(CTR_EL0))
vcpu->arch.hcr_el2 |= HCR_TID4;
else
vcpu->arch.hcr_el2 |= HCR_TID2;
@@ -166,8 +167,13 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu)
pkvm_vcpu_reset_hcr(vcpu);
- if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu)))
+ if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) {
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+ /* Trust the host for non-protected vcpu features. */
+ vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2;
return 0;
+ }
ret = pkvm_check_pvm_cpu_features(vcpu);
if (ret)
@@ -175,6 +181,7 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu)
pvm_init_traps_hcr(vcpu);
pvm_init_traps_mdcr(vcpu);
+ vcpu_set_hcrx(vcpu);
return 0;
}
@@ -239,10 +246,12 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
hyp_spin_lock(&vm_table_lock);
hyp_vm = get_vm_by_handle(handle);
- if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx)
+ if (!hyp_vm || hyp_vm->kvm.created_vcpus <= vcpu_idx)
goto unlock;
hyp_vcpu = hyp_vm->vcpus[vcpu_idx];
+ if (!hyp_vcpu)
+ goto unlock;
/* Ensure vcpu isn't loaded on more than one cpu simultaneously. */
if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) {
@@ -315,6 +324,9 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc
unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags);
DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES);
+ /* CTR_EL0 is always under host control, even for protected VMs. */
+ hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0;
+
if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags))
set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
@@ -325,6 +337,10 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc
bitmap_copy(kvm->arch.vcpu_features,
host_kvm->arch.vcpu_features,
KVM_VCPU_MAX_FEATURES);
+
+ if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &host_arch_flags))
+ hyp_vm->kvm.arch.midr_el1 = host_kvm->arch.midr_el1;
+
return;
}
@@ -361,8 +377,14 @@ static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[],
{
int i;
- for (i = 0; i < nr_vcpus; i++)
- unpin_host_vcpu(hyp_vcpus[i]->host_vcpu);
+ for (i = 0; i < nr_vcpus; i++) {
+ struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i];
+
+ if (!hyp_vcpu)
+ continue;
+
+ unpin_host_vcpu(hyp_vcpu->host_vcpu);
+ }
}
static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
@@ -386,24 +408,18 @@ static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *
static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
struct pkvm_hyp_vm *hyp_vm,
- struct kvm_vcpu *host_vcpu,
- unsigned int vcpu_idx)
+ struct kvm_vcpu *host_vcpu)
{
int ret = 0;
if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1))
return -EBUSY;
- if (host_vcpu->vcpu_idx != vcpu_idx) {
- ret = -EINVAL;
- goto done;
- }
-
hyp_vcpu->host_vcpu = host_vcpu;
hyp_vcpu->vcpu.kvm = &hyp_vm->kvm;
hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id);
- hyp_vcpu->vcpu.vcpu_idx = vcpu_idx;
+ hyp_vcpu->vcpu.vcpu_idx = READ_ONCE(host_vcpu->vcpu_idx);
hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu;
hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags);
@@ -641,27 +657,28 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
goto unlock;
}
- idx = hyp_vm->nr_vcpus;
+ ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu);
+ if (ret)
+ goto unlock;
+
+ idx = hyp_vcpu->vcpu.vcpu_idx;
if (idx >= hyp_vm->kvm.created_vcpus) {
ret = -EINVAL;
goto unlock;
}
- ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx);
- if (ret)
+ if (hyp_vm->vcpus[idx]) {
+ ret = -EINVAL;
goto unlock;
+ }
hyp_vm->vcpus[idx] = hyp_vcpu;
- hyp_vm->nr_vcpus++;
unlock:
hyp_spin_unlock(&vm_table_lock);
- if (ret) {
+ if (ret)
unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu));
- return ret;
- }
-
- return 0;
+ return ret;
}
static void
@@ -678,7 +695,7 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size)
int __pkvm_teardown_vm(pkvm_handle_t handle)
{
- struct kvm_hyp_memcache *mc;
+ struct kvm_hyp_memcache *mc, *stage2_mc;
struct pkvm_hyp_vm *hyp_vm;
struct kvm *host_kvm;
unsigned int idx;
@@ -706,18 +723,24 @@ int __pkvm_teardown_vm(pkvm_handle_t handle)
/* Reclaim guest pages (including page-table pages) */
mc = &host_kvm->arch.pkvm.teardown_mc;
- reclaim_guest_pages(hyp_vm, mc);
- unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus);
+ stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc;
+ reclaim_pgtable_pages(hyp_vm, stage2_mc);
+ unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->kvm.created_vcpus);
/* Push the metadata pages to the teardown memcache */
- for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) {
+ for (idx = 0; idx < hyp_vm->kvm.created_vcpus; ++idx) {
struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx];
- struct kvm_hyp_memcache *vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache;
+ struct kvm_hyp_memcache *vcpu_mc;
+
+ if (!hyp_vcpu)
+ continue;
+
+ vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache;
while (vcpu_mc->nr_pages) {
void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt);
- push_hyp_memcache(mc, addr, hyp_virt_to_phys);
+ push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys);
unmap_donated_memory_noclear(addr, PAGE_SIZE);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
index dba101565de3..3cc613cce5f5 100644
--- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
@@ -28,7 +28,9 @@ void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt)
void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt)
{
- __sysreg_restore_el1_state(ctxt, ctxt_sys_reg(ctxt, MPIDR_EL1));
+ u64 midr = ctxt_midr_el1(ctxt);
+
+ __sysreg_restore_el1_state(ctxt, midr, ctxt_sys_reg(ctxt, MPIDR_EL1));
__sysreg_restore_common_state(ctxt);
__sysreg_restore_user_state(ctxt);
__sysreg_restore_el2_return_state(ctxt);
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 3f9741e51d41..ed363aa3027e 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -18,7 +18,7 @@
#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1)
#define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5))
-static u64 __gic_v3_get_lr(unsigned int lr)
+u64 __gic_v3_get_lr(unsigned int lr)
{
switch (lr & 0xf) {
case 0:
@@ -218,7 +218,7 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
elrsr = read_gicreg(ICH_ELRSR_EL2);
- write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
+ write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EL2_En, ICH_HCR_EL2);
for (i = 0; i < used_lrs; i++) {
if (elrsr & (1 << i))
@@ -274,7 +274,7 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
* system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1
* so that the trap bits can take effect. Yes, we *loves* the GIC.
*/
- if (!(cpu_if->vgic_hcr & ICH_HCR_EN)) {
+ if (!(cpu_if->vgic_hcr & ICH_HCR_EL2_En)) {
write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1);
isb();
} else if (!cpu_if->vgic_sre) {
@@ -752,7 +752,7 @@ static void __vgic_v3_bump_eoicount(void)
u32 hcr;
hcr = read_gicreg(ICH_HCR_EL2);
- hcr += 1 << ICH_HCR_EOIcount_SHIFT;
+ hcr += 1 << ICH_HCR_EL2_EOIcount_SHIFT;
write_gicreg(hcr, ICH_HCR_EL2);
}
@@ -1069,7 +1069,7 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu,
case SYS_ICC_EOIR0_EL1:
case SYS_ICC_HPPIR0_EL1:
case SYS_ICC_IAR0_EL1:
- return ich_hcr & ICH_HCR_TALL0;
+ return ich_hcr & ICH_HCR_EL2_TALL0;
case SYS_ICC_IGRPEN1_EL1:
if (is_read &&
@@ -1090,10 +1090,10 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu,
case SYS_ICC_EOIR1_EL1:
case SYS_ICC_HPPIR1_EL1:
case SYS_ICC_IAR1_EL1:
- return ich_hcr & ICH_HCR_TALL1;
+ return ich_hcr & ICH_HCR_EL2_TALL1;
case SYS_ICC_DIR_EL1:
- if (ich_hcr & ICH_HCR_TDIR)
+ if (ich_hcr & ICH_HCR_EL2_TDIR)
return true;
fallthrough;
@@ -1101,7 +1101,7 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu,
case SYS_ICC_RPR_EL1:
case SYS_ICC_CTLR_EL1:
case SYS_ICC_PMR_EL1:
- return ich_hcr & ICH_HCR_TC;
+ return ich_hcr & ICH_HCR_EL2_TC;
default:
return false;
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 647737d6e8d0..731a0378ed13 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -527,6 +527,25 @@ static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code)
return kvm_hyp_handle_sysreg(vcpu, exit_code);
}
+static bool kvm_hyp_handle_impdef(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 iss;
+
+ if (!cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS))
+ return false;
+
+ /*
+ * Compute a synthetic ESR for a sysreg trap. Conveniently, AFSR1_EL2
+ * is populated with a correct ISS for a sysreg trap. These fruity
+ * parts are 64bit only, so unconditionally set IL.
+ */
+ iss = ESR_ELx_ISS(read_sysreg_s(SYS_AFSR1_EL2));
+ vcpu->arch.fault.esr_el2 = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SYS64) |
+ FIELD_PREP(ESR_ELx_ISS_MASK, iss) |
+ ESR_ELx_IL;
+ return false;
+}
+
static const exit_handler_fn hyp_exit_handlers[] = {
[0 ... ESR_ELx_EC_MAX] = NULL,
[ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32,
@@ -538,6 +557,9 @@ static const exit_handler_fn hyp_exit_handlers[] = {
[ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low,
[ESR_ELx_EC_ERET] = kvm_hyp_handle_eret,
[ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops,
+
+ /* Apple shenanigans */
+ [0x3F] = kvm_hyp_handle_impdef,
};
static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 90b018e06f2c..3814b0b2c937 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -87,11 +87,12 @@ static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu)
write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1);
write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1);
- write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2);
- write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR);
- write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR);
- write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR);
- write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR);
+ write_sysreg(ctxt_midr_el1(&vcpu->arch.ctxt), vpidr_el2);
+ write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR);
if (vcpu_el2_e2h_is_set(vcpu)) {
/*
@@ -191,7 +192,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
- u64 mpidr;
+ u64 midr, mpidr;
host_ctxt = host_data_ptr(host_ctxt);
__sysreg_save_user_state(host_ctxt);
@@ -221,22 +222,17 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu)
} else {
if (vcpu_has_nv(vcpu)) {
/*
- * Use the guest hypervisor's VPIDR_EL2 when in a
- * nested state. The hardware value of MIDR_EL1 gets
- * restored on put.
- */
- write_sysreg(ctxt_sys_reg(guest_ctxt, VPIDR_EL2), vpidr_el2);
-
- /*
* As we're restoring a nested guest, set the value
* provided by the guest hypervisor.
*/
+ midr = ctxt_sys_reg(guest_ctxt, VPIDR_EL2);
mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2);
} else {
+ midr = ctxt_midr_el1(guest_ctxt);
mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1);
}
- __sysreg_restore_el1_state(guest_ctxt, mpidr);
+ __sysreg_restore_el1_state(guest_ctxt, midr, mpidr);
}
vcpu_set_flag(vcpu, SYSREGS_ON_CPU);
@@ -271,9 +267,5 @@ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu)
/* Restore host user state */
__sysreg_restore_user_state(host_ctxt);
- /* If leaving a nesting guest, restore MIDR_EL1 default view */
- if (vcpu_has_nv(vcpu))
- write_sysreg(read_cpuid_id(), vpidr_el2);
-
vcpu_clear_flag(vcpu, SYSREGS_ON_CPU);
}
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 27ce4cb44904..569941eeb3fe 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -15,6 +15,8 @@
GENMASK(KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT - 1, 0)
#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES \
GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0)
+#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2 \
+ GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT - 1, 0)
static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
{
@@ -360,6 +362,8 @@ int kvm_smccc_call_handler(struct kvm_vcpu *vcpu)
break;
case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
val[0] = smccc_feat->vendor_hyp_bmap;
+ /* Function numbers 2-63 are reserved for pKVM for now */
+ val[2] = smccc_feat->vendor_hyp_bmap_2;
break;
case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
kvm_ptp_get_time(vcpu, val);
@@ -387,6 +391,7 @@ static const u64 kvm_arm_fw_reg_ids[] = {
KVM_REG_ARM_STD_BMAP,
KVM_REG_ARM_STD_HYP_BMAP,
KVM_REG_ARM_VENDOR_HYP_BMAP,
+ KVM_REG_ARM_VENDOR_HYP_BMAP_2,
};
void kvm_arm_init_hypercalls(struct kvm *kvm)
@@ -497,6 +502,9 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
case KVM_REG_ARM_VENDOR_HYP_BMAP:
val = READ_ONCE(smccc_feat->vendor_hyp_bmap);
break;
+ case KVM_REG_ARM_VENDOR_HYP_BMAP_2:
+ val = READ_ONCE(smccc_feat->vendor_hyp_bmap_2);
+ break;
default:
return -ENOENT;
}
@@ -527,6 +535,10 @@ static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val)
fw_reg_bmap = &smccc_feat->vendor_hyp_bmap;
fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES;
break;
+ case KVM_REG_ARM_VENDOR_HYP_BMAP_2:
+ fw_reg_bmap = &smccc_feat->vendor_hyp_bmap_2;
+ fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2;
+ break;
default:
return -ENOENT;
}
@@ -633,6 +645,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
case KVM_REG_ARM_STD_BMAP:
case KVM_REG_ARM_STD_HYP_BMAP:
case KVM_REG_ARM_VENDOR_HYP_BMAP:
+ case KVM_REG_ARM_VENDOR_HYP_BMAP_2:
return kvm_arm_set_fw_reg_bmap(vcpu, reg->id, val);
default:
return -ENOENT;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1f55b0c7b11d..2feb6c6b63af 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1086,14 +1086,26 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
}
}
-static void hyp_mc_free_fn(void *addr, void *unused)
+static void hyp_mc_free_fn(void *addr, void *mc)
{
+ struct kvm_hyp_memcache *memcache = mc;
+
+ if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
+ kvm_account_pgtable_pages(addr, -1);
+
free_page((unsigned long)addr);
}
-static void *hyp_mc_alloc_fn(void *unused)
+static void *hyp_mc_alloc_fn(void *mc)
{
- return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ struct kvm_hyp_memcache *memcache = mc;
+ void *addr;
+
+ addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
+ kvm_account_pgtable_pages(addr, 1);
+
+ return addr;
}
void free_hyp_memcache(struct kvm_hyp_memcache *mc)
@@ -1102,7 +1114,7 @@ void free_hyp_memcache(struct kvm_hyp_memcache *mc)
return;
kfree(mc->mapping);
- __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, NULL);
+ __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc);
}
int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
@@ -1117,7 +1129,7 @@ int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
}
return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
- kvm_host_pa, NULL);
+ kvm_host_pa, mc);
}
/**
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 0c9387d2f507..4a3fc11f7ecf 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -16,9 +16,6 @@
#include "sys_regs.h"
-/* Protection against the sysreg repainting madness... */
-#define NV_FTR(r, f) ID_AA64##r##_EL1_##f
-
/*
* Ratio of live shadow S2 MMU per vcpu. This is a trade-off between
* memory usage and potential number of different sets of S2 PTs in
@@ -54,6 +51,10 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
struct kvm_s2_mmu *tmp;
int num_mmus, ret = 0;
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features) &&
+ !cpus_have_final_cap(ARM64_HAS_HCR_NV1))
+ return -EINVAL;
+
/*
* Let's treat memory allocation failures as benign: If we fail to
* allocate anything, return an error and keep the allocated array
@@ -807,134 +808,151 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
* This list should get updated as new features get added to the NV
* support, and new extension to the architecture.
*/
-static void limit_nv_id_regs(struct kvm *kvm)
+u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
{
- u64 val, tmp;
-
- /* Support everything but TME */
- val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64ISAR0_EL1);
- val &= ~NV_FTR(ISAR0, TME);
- kvm_set_vm_id_reg(kvm, SYS_ID_AA64ISAR0_EL1, val);
-
- /* Support everything but Spec Invalidation and LS64 */
- val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64ISAR1_EL1);
- val &= ~(NV_FTR(ISAR1, LS64) |
- NV_FTR(ISAR1, SPECRES));
- kvm_set_vm_id_reg(kvm, SYS_ID_AA64ISAR1_EL1, val);
-
- /* No AMU, MPAM, S-EL2, or RAS */
- val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1);
- val &= ~(GENMASK_ULL(55, 52) |
- NV_FTR(PFR0, AMU) |
- NV_FTR(PFR0, MPAM) |
- NV_FTR(PFR0, SEL2) |
- NV_FTR(PFR0, RAS) |
- NV_FTR(PFR0, EL3) |
- NV_FTR(PFR0, EL2) |
- NV_FTR(PFR0, EL1) |
- NV_FTR(PFR0, EL0));
- /* 64bit only at any EL */
- val |= FIELD_PREP(NV_FTR(PFR0, EL0), 0b0001);
- val |= FIELD_PREP(NV_FTR(PFR0, EL1), 0b0001);
- val |= FIELD_PREP(NV_FTR(PFR0, EL2), 0b0001);
- val |= FIELD_PREP(NV_FTR(PFR0, EL3), 0b0001);
- kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val);
-
- /* Only support BTI, SSBS, CSV2_frac */
- val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR1_EL1);
- val &= (NV_FTR(PFR1, BT) |
- NV_FTR(PFR1, SSBS) |
- NV_FTR(PFR1, CSV2_frac));
- kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR1_EL1, val);
-
- /* Hide ECV, ExS, Secure Memory */
- val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR0_EL1);
- val &= ~(NV_FTR(MMFR0, ECV) |
- NV_FTR(MMFR0, EXS) |
- NV_FTR(MMFR0, TGRAN4_2) |
- NV_FTR(MMFR0, TGRAN16_2) |
- NV_FTR(MMFR0, TGRAN64_2) |
- NV_FTR(MMFR0, SNSMEM));
-
- /* Disallow unsupported S2 page sizes */
- switch (PAGE_SIZE) {
- case SZ_64K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0001);
- fallthrough;
- case SZ_16K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0001);
- fallthrough;
- case SZ_4K:
- /* Support everything */
+ switch (reg) {
+ case SYS_ID_AA64ISAR0_EL1:
+ /* Support everything but TME */
+ val &= ~ID_AA64ISAR0_EL1_TME;
break;
- }
- /*
- * Since we can't support a guest S2 page size smaller than
- * the host's own page size (due to KVM only populating its
- * own S2 using the kernel's page size), advertise the
- * limitation using FEAT_GTG.
- */
- switch (PAGE_SIZE) {
- case SZ_4K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0010);
- fallthrough;
- case SZ_16K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0010);
- fallthrough;
- case SZ_64K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN64_2), 0b0010);
+
+ case SYS_ID_AA64ISAR1_EL1:
+ /* Support everything but LS64 and Spec Invalidation */
+ val &= ~(ID_AA64ISAR1_EL1_LS64 |
+ ID_AA64ISAR1_EL1_SPECRES);
+ break;
+
+ case SYS_ID_AA64PFR0_EL1:
+ /* No RME, AMU, MPAM, S-EL2, or RAS */
+ val &= ~(ID_AA64PFR0_EL1_RME |
+ ID_AA64PFR0_EL1_AMU |
+ ID_AA64PFR0_EL1_MPAM |
+ ID_AA64PFR0_EL1_SEL2 |
+ ID_AA64PFR0_EL1_RAS |
+ ID_AA64PFR0_EL1_EL3 |
+ ID_AA64PFR0_EL1_EL2 |
+ ID_AA64PFR0_EL1_EL1 |
+ ID_AA64PFR0_EL1_EL0);
+ /* 64bit only at any EL */
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP);
+ break;
+
+ case SYS_ID_AA64PFR1_EL1:
+ /* Only support BTI, SSBS, CSV2_frac */
+ val &= (ID_AA64PFR1_EL1_BT |
+ ID_AA64PFR1_EL1_SSBS |
+ ID_AA64PFR1_EL1_CSV2_frac);
+ break;
+
+ case SYS_ID_AA64MMFR0_EL1:
+ /* Hide ExS, Secure Memory */
+ val &= ~(ID_AA64MMFR0_EL1_EXS |
+ ID_AA64MMFR0_EL1_TGRAN4_2 |
+ ID_AA64MMFR0_EL1_TGRAN16_2 |
+ ID_AA64MMFR0_EL1_TGRAN64_2 |
+ ID_AA64MMFR0_EL1_SNSMEM);
+
+ /* Hide CNTPOFF if present */
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, ECV, IMP);
+
+ /* Disallow unsupported S2 page sizes */
+ switch (PAGE_SIZE) {
+ case SZ_64K:
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, NI);
+ fallthrough;
+ case SZ_16K:
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, NI);
+ fallthrough;
+ case SZ_4K:
+ /* Support everything */
+ break;
+ }
+
+ /*
+ * Since we can't support a guest S2 page size smaller
+ * than the host's own page size (due to KVM only
+ * populating its own S2 using the kernel's page
+ * size), advertise the limitation using FEAT_GTG.
+ */
+ switch (PAGE_SIZE) {
+ case SZ_4K:
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP);
+ fallthrough;
+ case SZ_16K:
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP);
+ fallthrough;
+ case SZ_64K:
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP);
+ break;
+ }
+
+ /* Cap PARange to 48bits */
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, PARANGE, 48);
+ break;
+
+ case SYS_ID_AA64MMFR1_EL1:
+ val &= (ID_AA64MMFR1_EL1_HCX |
+ ID_AA64MMFR1_EL1_PAN |
+ ID_AA64MMFR1_EL1_LO |
+ ID_AA64MMFR1_EL1_HPDS |
+ ID_AA64MMFR1_EL1_VH |
+ ID_AA64MMFR1_EL1_VMIDBits);
+ /* FEAT_E2H0 implies no VHE */
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
+ val &= ~ID_AA64MMFR1_EL1_VH;
+ break;
+
+ case SYS_ID_AA64MMFR2_EL1:
+ val &= ~(ID_AA64MMFR2_EL1_BBM |
+ ID_AA64MMFR2_EL1_TTL |
+ GENMASK_ULL(47, 44) |
+ ID_AA64MMFR2_EL1_ST |
+ ID_AA64MMFR2_EL1_CCIDX |
+ ID_AA64MMFR2_EL1_VARange);
+
+ /* Force TTL support */
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR2_EL1, TTL, IMP);
+ break;
+
+ case SYS_ID_AA64MMFR4_EL1:
+ /*
+ * You get EITHER
+ *
+ * - FEAT_VHE without FEAT_E2H0
+ * - FEAT_NV limited to FEAT_NV2
+ * - HCR_EL2.NV1 being RES0
+ *
+ * OR
+ *
+ * - FEAT_E2H0 without FEAT_VHE nor FEAT_NV
+ *
+ * Life is too short for anything else.
+ */
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) {
+ val = 0;
+ } else {
+ val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1);
+ }
+ break;
+
+ case SYS_ID_AA64DFR0_EL1:
+ /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */
+ val &= (ID_AA64DFR0_EL1_PMUVer |
+ ID_AA64DFR0_EL1_WRPs |
+ ID_AA64DFR0_EL1_BRPs |
+ ID_AA64DFR0_EL1_DebugVer|
+ ID_AA64DFR0_EL1_HPMN0);
+
+ /* Cap Debug to ARMv8.1 */
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, VHE);
break;
}
- /* Cap PARange to 48bits */
- tmp = FIELD_GET(NV_FTR(MMFR0, PARANGE), val);
- if (tmp > 0b0101) {
- val &= ~NV_FTR(MMFR0, PARANGE);
- val |= FIELD_PREP(NV_FTR(MMFR0, PARANGE), 0b0101);
- }
- kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR0_EL1, val);
-
- val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR1_EL1);
- val &= (NV_FTR(MMFR1, HCX) |
- NV_FTR(MMFR1, PAN) |
- NV_FTR(MMFR1, LO) |
- NV_FTR(MMFR1, HPDS) |
- NV_FTR(MMFR1, VH) |
- NV_FTR(MMFR1, VMIDBits));
- kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR1_EL1, val);
-
- val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR2_EL1);
- val &= ~(NV_FTR(MMFR2, BBM) |
- NV_FTR(MMFR2, TTL) |
- GENMASK_ULL(47, 44) |
- NV_FTR(MMFR2, ST) |
- NV_FTR(MMFR2, CCIDX) |
- NV_FTR(MMFR2, VARange));
-
- /* Force TTL support */
- val |= FIELD_PREP(NV_FTR(MMFR2, TTL), 0b0001);
- kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR2_EL1, val);
-
- val = 0;
- if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1))
- val |= FIELD_PREP(NV_FTR(MMFR4, E2H0),
- ID_AA64MMFR4_EL1_E2H0_NI_NV1);
- kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR4_EL1, val);
-
- /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */
- val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1);
- val &= (NV_FTR(DFR0, PMUVer) |
- NV_FTR(DFR0, WRPs) |
- NV_FTR(DFR0, BRPs) |
- NV_FTR(DFR0, DebugVer) |
- NV_FTR(DFR0, HPMN0));
-
- /* Cap Debug to ARMv8.1 */
- tmp = FIELD_GET(NV_FTR(DFR0, DebugVer), val);
- if (tmp > 0b0111) {
- val &= ~NV_FTR(DFR0, DebugVer);
- val |= FIELD_PREP(NV_FTR(DFR0, DebugVer), 0b0111);
- }
- kvm_set_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1, val);
+
+ return val;
}
u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu,
@@ -981,8 +999,6 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
if (!kvm->arch.sysreg_masks)
return -ENOMEM;
- limit_nv_id_regs(kvm);
-
/* VTTBR_EL2 */
res0 = res1 = 0;
if (!kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16))
@@ -1021,10 +1037,11 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
res0 |= HCR_FIEN;
if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, FWB, IMP))
res0 |= HCR_FWB;
- if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, NV2))
- res0 |= HCR_NV2;
- if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, IMP))
- res0 |= (HCR_AT | HCR_NV1 | HCR_NV);
+ /* Implementation choice: NV2 is the only supported config */
+ if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
+ res0 |= (HCR_NV2 | HCR_NV | HCR_AT);
+ if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, NI))
+ res0 |= HCR_NV1;
if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) &&
kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC)))
res0 |= (HCR_API | HCR_APK);
@@ -1034,6 +1051,8 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
res0 |= (HCR_TEA | HCR_TERR);
if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP))
res0 |= HCR_TLOR;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
+ res0 |= HCR_E2H;
if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, IMP))
res1 |= HCR_E2H;
set_sysreg_masks(kvm, HCR_EL2, res0, res1);
@@ -1290,6 +1309,15 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
res0 |= GENMASK(11, 8);
set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1);
+ /* ICH_HCR_EL2 */
+ res0 = ICH_HCR_EL2_RES0;
+ res1 = ICH_HCR_EL2_RES1;
+ if (!(kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_TDS))
+ res0 |= ICH_HCR_EL2_TDIR;
+ /* No GICv4 is presented to the guest */
+ res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount;
+ set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1);
+
out:
for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++)
(void)__vcpu_sys_reg(vcpu, sr);
@@ -1309,4 +1337,8 @@ void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
}
write_unlock(&vcpu->kvm->mmu_lock);
}
+
+ /* Must be last, as may switch context! */
+ if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu))
+ kvm_inject_nested_irq(vcpu);
}
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 930b677eb9b0..0f89157d31fd 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -111,6 +111,29 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
host_kvm->arch.pkvm.handle = 0;
free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
+ free_hyp_memcache(&host_kvm->arch.pkvm.stage2_teardown_mc);
+}
+
+static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
+{
+ size_t hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
+ pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle;
+ void *hyp_vcpu;
+ int ret;
+
+ vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
+
+ hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
+ if (!hyp_vcpu)
+ return -ENOMEM;
+
+ ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu);
+ if (!ret)
+ vcpu_set_flag(vcpu, VCPU_PKVM_FINALIZED);
+ else
+ free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
+
+ return ret;
}
/*
@@ -125,11 +148,8 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
*/
static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
{
- size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
- struct kvm_vcpu *host_vcpu;
- pkvm_handle_t handle;
+ size_t pgd_sz, hyp_vm_sz;
void *pgd, *hyp_vm;
- unsigned long idx;
int ret;
if (host_kvm->created_vcpus < 1)
@@ -161,40 +181,11 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
if (ret < 0)
goto free_vm;
- handle = ret;
-
- host_kvm->arch.pkvm.handle = handle;
-
- /* Donate memory for the vcpus at hyp and initialize it. */
- hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
- kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
- void *hyp_vcpu;
-
- /* Indexing of the vcpus to be sequential starting at 0. */
- if (WARN_ON(host_vcpu->vcpu_idx != idx)) {
- ret = -EINVAL;
- goto destroy_vm;
- }
-
- hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
- if (!hyp_vcpu) {
- ret = -ENOMEM;
- goto destroy_vm;
- }
-
- ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu,
- hyp_vcpu);
- if (ret) {
- free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
- goto destroy_vm;
- }
- }
+ host_kvm->arch.pkvm.handle = ret;
+ host_kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
+ kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE);
return 0;
-
-destroy_vm:
- __pkvm_destroy_hyp_vm(host_kvm);
- return ret;
free_vm:
free_pages_exact(hyp_vm, hyp_vm_sz);
free_pgd:
@@ -214,6 +205,18 @@ int pkvm_create_hyp_vm(struct kvm *host_kvm)
return ret;
}
+int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
+{
+ int ret = 0;
+
+ mutex_lock(&vcpu->kvm->arch.config_lock);
+ if (!vcpu_get_flag(vcpu, VCPU_PKVM_FINALIZED))
+ ret = __pkvm_create_hyp_vcpu(vcpu);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
+ return ret;
+}
+
void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
{
mutex_lock(&host_kvm->arch.config_lock);
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 6c5950b9ceac..a1bc10d7116a 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -17,8 +17,6 @@
#define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0)
-DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available);
-
static LIST_HEAD(arm_pmus);
static DEFINE_MUTEX(arm_pmus_lock);
@@ -26,6 +24,12 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc);
static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc);
static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc);
+bool kvm_supports_guest_pmuv3(void)
+{
+ guard(mutex)(&arm_pmus_lock);
+ return !list_empty(&arm_pmus);
+}
+
static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc)
{
return container_of(pmc, struct kvm_vcpu, arch.pmu.pmc[pmc->idx]);
@@ -150,9 +154,6 @@ static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc)
*/
u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
{
- if (!kvm_vcpu_has_pmu(vcpu))
- return 0;
-
return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx));
}
@@ -191,13 +192,23 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force)
*/
void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
{
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false);
}
/**
+ * kvm_pmu_set_counter_value_user - set PMU counter value from user
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ * @val: The counter value
+ */
+void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
+{
+ kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, select_idx));
+ __vcpu_sys_reg(vcpu, counter_index_to_reg(select_idx)) = val;
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+}
+
+/**
* kvm_pmu_release_perf_event - remove the perf event
* @pmc: The PMU counter pointer
*/
@@ -248,20 +259,6 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
}
/**
- * kvm_pmu_vcpu_reset - reset pmu state for cpu
- * @vcpu: The vcpu pointer
- *
- */
-void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
-{
- unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu);
- int i;
-
- for_each_set_bit(i, &mask, 32)
- kvm_pmu_stop_counter(kvm_vcpu_idx_to_pmc(vcpu, i));
-}
-
-/**
* kvm_pmu_vcpu_destroy - free perf event of PMU for cpu
* @vcpu: The vcpu pointer
*
@@ -350,7 +347,7 @@ void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val)
{
int i;
- if (!kvm_vcpu_has_pmu(vcpu) || !val)
+ if (!val)
return;
for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) {
@@ -401,9 +398,6 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = &vcpu->arch.pmu;
bool overflow;
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
overflow = kvm_pmu_overflow_status(vcpu);
if (pmu->irq_level == overflow)
return;
@@ -599,9 +593,6 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
{
int i;
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
/* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */
if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5))
val &= ~ARMV8_PMU_PMCR_LP;
@@ -673,6 +664,20 @@ static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc)
return kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2;
}
+static int kvm_map_pmu_event(struct kvm *kvm, unsigned int eventsel)
+{
+ struct arm_pmu *pmu = kvm->arch.arm_pmu;
+
+ /*
+ * The CPU PMU likely isn't PMUv3; let the driver provide a mapping
+ * for the guest's PMUv3 event ID.
+ */
+ if (unlikely(pmu->map_pmuv3_event))
+ return pmu->map_pmuv3_event(eventsel);
+
+ return eventsel;
+}
+
/**
* kvm_pmu_create_perf_event - create a perf event for a counter
* @pmc: Counter context
@@ -683,7 +688,8 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu;
struct perf_event *event;
struct perf_event_attr attr;
- u64 eventsel, evtreg;
+ int eventsel;
+ u64 evtreg;
evtreg = kvm_pmc_read_evtreg(pmc);
@@ -709,6 +715,14 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
!test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
return;
+ /*
+ * Don't create an event if we're running on hardware that requires
+ * PMUv3 event translation and we couldn't find a valid mapping.
+ */
+ eventsel = kvm_map_pmu_event(vcpu->kvm, eventsel);
+ if (eventsel < 0)
+ return;
+
memset(&attr, 0, sizeof(struct perf_event_attr));
attr.type = arm_pmu->pmu.type;
attr.size = sizeof(attr);
@@ -766,9 +780,6 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx);
u64 reg;
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
reg = counter_index_to_evtreg(pmc->idx);
__vcpu_sys_reg(vcpu, reg) = data & kvm_pmu_evtyper_mask(vcpu->kvm);
@@ -786,29 +797,23 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit()))
return;
- mutex_lock(&arm_pmus_lock);
+ guard(mutex)(&arm_pmus_lock);
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
- goto out_unlock;
+ return;
entry->arm_pmu = pmu;
list_add_tail(&entry->entry, &arm_pmus);
-
- if (list_is_singular(&arm_pmus))
- static_branch_enable(&kvm_arm_pmu_available);
-
-out_unlock:
- mutex_unlock(&arm_pmus_lock);
}
static struct arm_pmu *kvm_pmu_probe_armpmu(void)
{
- struct arm_pmu *tmp, *pmu = NULL;
struct arm_pmu_entry *entry;
+ struct arm_pmu *pmu;
int cpu;
- mutex_lock(&arm_pmus_lock);
+ guard(mutex)(&arm_pmus_lock);
/*
* It is safe to use a stale cpu to iterate the list of PMUs so long as
@@ -829,42 +834,62 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void)
*/
cpu = raw_smp_processor_id();
list_for_each_entry(entry, &arm_pmus, entry) {
- tmp = entry->arm_pmu;
+ pmu = entry->arm_pmu;
- if (cpumask_test_cpu(cpu, &tmp->supported_cpus)) {
- pmu = tmp;
- break;
- }
+ if (cpumask_test_cpu(cpu, &pmu->supported_cpus))
+ return pmu;
}
- mutex_unlock(&arm_pmus_lock);
+ return NULL;
+}
+
+static u64 __compute_pmceid(struct arm_pmu *pmu, bool pmceid1)
+{
+ u32 hi[2], lo[2];
+
+ bitmap_to_arr32(lo, pmu->pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
+ bitmap_to_arr32(hi, pmu->pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
- return pmu;
+ return ((u64)hi[pmceid1] << 32) | lo[pmceid1];
+}
+
+static u64 compute_pmceid0(struct arm_pmu *pmu)
+{
+ u64 val = __compute_pmceid(pmu, 0);
+
+ /* always support SW_INCR */
+ val |= BIT(ARMV8_PMUV3_PERFCTR_SW_INCR);
+ /* always support CHAIN */
+ val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN);
+ return val;
+}
+
+static u64 compute_pmceid1(struct arm_pmu *pmu)
+{
+ u64 val = __compute_pmceid(pmu, 1);
+
+ /*
+ * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled
+ * as RAZ
+ */
+ val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) |
+ BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) |
+ BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32));
+ return val;
}
u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
{
+ struct arm_pmu *cpu_pmu = vcpu->kvm->arch.arm_pmu;
unsigned long *bmap = vcpu->kvm->arch.pmu_filter;
u64 val, mask = 0;
int base, i, nr_events;
- if (!kvm_vcpu_has_pmu(vcpu))
- return 0;
-
if (!pmceid1) {
- val = read_sysreg(pmceid0_el0);
- /* always support CHAIN */
- val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN);
+ val = compute_pmceid0(cpu_pmu);
base = 0;
} else {
- val = read_sysreg(pmceid1_el0);
- /*
- * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled
- * as RAZ
- */
- val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) |
- BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) |
- BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32));
+ val = compute_pmceid1(cpu_pmu);
base = 32;
}
@@ -900,9 +925,6 @@ void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu)
int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
{
- if (!kvm_vcpu_has_pmu(vcpu))
- return 0;
-
if (!vcpu->arch.pmu.created)
return -EINVAL;
@@ -925,9 +947,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
return -EINVAL;
}
- /* One-off reload of the PMU on first run */
- kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
-
return 0;
}
@@ -995,6 +1014,13 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm)
struct arm_pmu *arm_pmu = kvm->arch.arm_pmu;
/*
+ * PMUv3 requires that all event counters are capable of counting any
+ * event, though the same may not be true of non-PMUv3 hardware.
+ */
+ if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS))
+ return 1;
+
+ /*
* The arm_pmu->cntr_mask considers the fixed counter(s) as well.
* Ignore those and return only the general-purpose counters.
*/
@@ -1205,13 +1231,26 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
u8 kvm_arm_pmu_get_pmuver_limit(void)
{
- u64 tmp;
+ unsigned int pmuver;
+
+ pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer,
+ read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1));
+
+ /*
+ * Spoof a barebones PMUv3 implementation if the system supports IMPDEF
+ * traps of the PMUv3 sysregs
+ */
+ if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS))
+ return ID_AA64DFR0_EL1_PMUVer_IMP;
+
+ /*
+ * Otherwise, treat IMPLEMENTATION DEFINED functionality as
+ * unimplemented
+ */
+ if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
+ return 0;
- tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);
- tmp = cpuid_feature_cap_perfmon_field(tmp,
- ID_AA64DFR0_EL1_PMUVer_SHIFT,
- ID_AA64DFR0_EL1_PMUVer_V3P5);
- return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp);
+ return min(pmuver, ID_AA64DFR0_EL1_PMUVer_V3P5);
}
/**
@@ -1231,9 +1270,6 @@ void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu)
unsigned long mask;
int i;
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
mask = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
for_each_set_bit(i, &mask, 32) {
struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 0b3adf3e17b4..6b48a3d16d0d 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -41,7 +41,7 @@ void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr)
{
struct kvm_pmu_events *pmu = kvm_get_pmu_events();
- if (!kvm_arm_support_pmu_v3() || !kvm_pmu_switch_needed(attr))
+ if (!system_supports_pmuv3() || !kvm_pmu_switch_needed(attr))
return;
if (!attr->exclude_host)
@@ -57,7 +57,7 @@ void kvm_clr_pmu_events(u64 clr)
{
struct kvm_pmu_events *pmu = kvm_get_pmu_events();
- if (!kvm_arm_support_pmu_v3())
+ if (!system_supports_pmuv3())
return;
pmu->events_host &= ~clr;
@@ -133,7 +133,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
struct kvm_pmu_events *pmu;
u64 events_guest, events_host;
- if (!kvm_arm_support_pmu_v3() || !has_vhe())
+ if (!system_supports_pmuv3() || !has_vhe())
return;
preempt_disable();
@@ -154,7 +154,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
struct kvm_pmu_events *pmu;
u64 events_guest, events_host;
- if (!kvm_arm_support_pmu_v3() || !has_vhe())
+ if (!system_supports_pmuv3() || !has_vhe())
return;
pmu = kvm_get_pmu_events();
@@ -180,7 +180,7 @@ bool kvm_set_pmuserenr(u64 val)
struct kvm_cpu_context *hctxt;
struct kvm_vcpu *vcpu;
- if (!kvm_arm_support_pmu_v3() || !has_vhe())
+ if (!system_supports_pmuv3() || !has_vhe())
return false;
vcpu = kvm_get_running_vcpu();
diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
index e4a342e903e2..098416d7e5c2 100644
--- a/arch/arm64/kvm/ptdump.c
+++ b/arch/arm64/kvm/ptdump.c
@@ -52,8 +52,8 @@ static const struct ptdump_prot_bits stage2_pte_bits[] = {
.set = "AF",
.clear = " ",
}, {
- .mask = PTE_TABLE_BIT | PTE_VALID,
- .val = PTE_VALID,
+ .mask = PMD_TYPE_MASK,
+ .val = PMD_TYPE_SECT,
.set = "BLK",
.clear = " ",
},
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 803e11b0dc8f..f82fcc614e13 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -196,9 +196,6 @@ void kvm_reset_vcpu(struct kvm_vcpu *vcpu)
vcpu->arch.reset_state.reset = false;
spin_unlock(&vcpu->arch.mp_state_lock);
- /* Reset PMU outside of the non-preemptible section */
- kvm_pmu_vcpu_reset(vcpu);
-
preempt_disable();
loaded = (vcpu->cpu != -1);
if (loaded)
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 82430c1e1dd0..005ad28f7306 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -17,6 +17,7 @@
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/uaccess.h>
+#include <linux/irqchip/arm-gic-v3.h>
#include <asm/arm_pmuv3.h>
#include <asm/cacheflush.h>
@@ -531,7 +532,13 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu,
if (p->is_write)
return ignore_write(vcpu, p);
- p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
+ if (p->Op1 == 4) { /* ICC_SRE_EL2 */
+ p->regval = (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE |
+ ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB);
+ } else { /* ICC_SRE_EL1 */
+ p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
+ }
+
return true;
}
@@ -960,6 +967,22 @@ static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
return 0;
}
+static int set_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ u64 idx;
+
+ if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0)
+ /* PMCCNTR_EL0 */
+ idx = ARMV8_PMU_CYCLE_IDX;
+ else
+ /* PMEVCNTRn_EL0 */
+ idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
+
+ kvm_pmu_set_counter_value_user(vcpu, idx, val);
+ return 0;
+}
+
static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -1051,25 +1074,10 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val)
{
- bool set;
-
- val &= kvm_pmu_accessible_counter_mask(vcpu);
-
- switch (r->reg) {
- case PMOVSSET_EL0:
- /* CRm[1] being set indicates a SET register, and CLR otherwise */
- set = r->CRm & 2;
- break;
- default:
- /* Op2[0] being set indicates a SET register, and CLR otherwise */
- set = r->Op2 & 1;
- break;
- }
+ u64 mask = kvm_pmu_accessible_counter_mask(vcpu);
- if (set)
- __vcpu_sys_reg(vcpu, r->reg) |= val;
- else
- __vcpu_sys_reg(vcpu, r->reg) &= ~val;
+ __vcpu_sys_reg(vcpu, r->reg) = val & mask;
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
return 0;
}
@@ -1229,6 +1237,8 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
val |= ARMV8_PMU_PMCR_LC;
__vcpu_sys_reg(vcpu, r->reg) = val;
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+
return 0;
}
@@ -1255,6 +1265,7 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
#define PMU_PMEVCNTR_EL0(n) \
{ PMU_SYS_REG(PMEVCNTRn_EL0(n)), \
.reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \
+ .set_user = set_pmu_evcntr, \
.access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), }
/* Macro to expand the PMEVTYPERn_EL0 register */
@@ -1627,6 +1638,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
break;
case SYS_ID_AA64MMFR2_EL1:
val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK;
+ val &= ~ID_AA64MMFR2_EL1_NV;
break;
case SYS_ID_AA64MMFR3_EL1:
val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE |
@@ -1637,6 +1649,9 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
break;
}
+ if (vcpu_has_nv(vcpu))
+ val = limit_nv_id_reg(vcpu->kvm, id, val);
+
return val;
}
@@ -1663,15 +1678,24 @@ static bool is_feature_id_reg(u32 encoding)
* Return true if the register's (Op0, Op1, CRn, CRm, Op2) is
* (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8, which is the range of ID
* registers KVM maintains on a per-VM basis.
+ *
+ * Additionally, the implementation ID registers and CTR_EL0 are handled as
+ * per-VM registers.
*/
static inline bool is_vm_ftr_id_reg(u32 id)
{
- if (id == SYS_CTR_EL0)
+ switch (id) {
+ case SYS_CTR_EL0:
+ case SYS_MIDR_EL1:
+ case SYS_REVIDR_EL1:
+ case SYS_AIDR_EL1:
return true;
+ default:
+ return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 &&
+ sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 &&
+ sys_reg_CRm(id) < 8);
- return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 &&
- sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 &&
- sys_reg_CRm(id) < 8);
+ }
}
static inline bool is_vcpu_ftr_id_reg(u32 id)
@@ -1802,16 +1826,6 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
return val;
}
-#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \
-({ \
- u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \
- (val) &= ~reg##_##field##_MASK; \
- (val) |= FIELD_PREP(reg##_##field##_MASK, \
- min(__f_val, \
- (u64)SYS_FIELD_VALUE(reg, field, limit))); \
- (val); \
-})
-
static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
{
val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
@@ -1870,12 +1884,14 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
- u8 perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());
+ u8 perfmon;
u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1);
val &= ~ID_DFR0_EL1_PerfMon_MASK;
- if (kvm_vcpu_has_pmu(vcpu))
+ if (kvm_vcpu_has_pmu(vcpu)) {
+ perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());
val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon);
+ }
val = ID_REG_LIMIT_FIELD_ENUM(val, ID_DFR0_EL1, CopDbg, Debugv8p8);
@@ -1945,6 +1961,37 @@ static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu,
return set_id_reg(vcpu, rd, user_val);
}
+static int set_id_aa64mmfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u64 sanitized_val = kvm_read_sanitised_id_reg(vcpu, rd);
+ u64 tgran2_mask = ID_AA64MMFR0_EL1_TGRAN4_2_MASK |
+ ID_AA64MMFR0_EL1_TGRAN16_2_MASK |
+ ID_AA64MMFR0_EL1_TGRAN64_2_MASK;
+
+ if (vcpu_has_nv(vcpu) &&
+ ((sanitized_val & tgran2_mask) != (user_val & tgran2_mask)))
+ return -EINVAL;
+
+ return set_id_reg(vcpu, rd, user_val);
+}
+
+static int set_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
+ u64 nv_mask = ID_AA64MMFR2_EL1_NV_MASK;
+
+ /*
+ * We made the mistake to expose the now deprecated NV field,
+ * so allow userspace to write it, but silently ignore it.
+ */
+ if ((hw_val & nv_mask) == (user_val & nv_mask))
+ user_val &= ~nv_mask;
+
+ return set_id_reg(vcpu, rd, user_val);
+}
+
static int set_ctr_el0(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd, u64 user_val)
{
@@ -2266,35 +2313,33 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
* from userspace.
*/
+#define ID_DESC_DEFAULT_CALLBACKS \
+ .access = access_id_reg, \
+ .get_user = get_id_reg, \
+ .set_user = set_id_reg, \
+ .visibility = id_visibility, \
+ .reset = kvm_read_sanitised_id_reg
+
#define ID_DESC(name) \
SYS_DESC(SYS_##name), \
- .access = access_id_reg, \
- .get_user = get_id_reg \
+ ID_DESC_DEFAULT_CALLBACKS
/* sys_reg_desc initialiser for known cpufeature ID registers */
#define ID_SANITISED(name) { \
ID_DESC(name), \
- .set_user = set_id_reg, \
- .visibility = id_visibility, \
- .reset = kvm_read_sanitised_id_reg, \
.val = 0, \
}
/* sys_reg_desc initialiser for known cpufeature ID registers */
#define AA32_ID_SANITISED(name) { \
ID_DESC(name), \
- .set_user = set_id_reg, \
.visibility = aa32_id_visibility, \
- .reset = kvm_read_sanitised_id_reg, \
.val = 0, \
}
/* sys_reg_desc initialiser for writable ID registers */
#define ID_WRITABLE(name, mask) { \
ID_DESC(name), \
- .set_user = set_id_reg, \
- .visibility = id_visibility, \
- .reset = kvm_read_sanitised_id_reg, \
.val = mask, \
}
@@ -2302,8 +2347,6 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
#define ID_FILTERED(sysreg, name, mask) { \
ID_DESC(sysreg), \
.set_user = set_##name, \
- .visibility = id_visibility, \
- .reset = kvm_read_sanitised_id_reg, \
.val = (mask), \
}
@@ -2313,12 +2356,10 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
* (1 <= crm < 8, 0 <= Op2 < 8).
*/
#define ID_UNALLOCATED(crm, op2) { \
+ .name = "S3_0_0_" #crm "_" #op2, \
Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \
- .access = access_id_reg, \
- .get_user = get_id_reg, \
- .set_user = set_id_reg, \
+ ID_DESC_DEFAULT_CALLBACKS, \
.visibility = raz_visibility, \
- .reset = kvm_read_sanitised_id_reg, \
.val = 0, \
}
@@ -2329,9 +2370,7 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
*/
#define ID_HIDDEN(name) { \
ID_DESC(name), \
- .set_user = set_id_reg, \
.visibility = raz_visibility, \
- .reset = kvm_read_sanitised_id_reg, \
.val = 0, \
}
@@ -2426,6 +2465,59 @@ static bool access_zcr_el2(struct kvm_vcpu *vcpu,
vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1;
vq = min(vq, vcpu_sve_max_vq(vcpu));
vcpu_write_sys_reg(vcpu, vq - 1, ZCR_EL2);
+
+ return true;
+}
+
+static bool access_gic_vtr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = kvm_vgic_global_state.ich_vtr_el2;
+ p->regval &= ~(ICH_VTR_EL2_DVIM |
+ ICH_VTR_EL2_A3V |
+ ICH_VTR_EL2_IDbits);
+ p->regval |= ICH_VTR_EL2_nV4;
+
+ return true;
+}
+
+static bool access_gic_misr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = vgic_v3_get_misr(vcpu);
+
+ return true;
+}
+
+static bool access_gic_eisr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = vgic_v3_get_eisr(vcpu);
+
+ return true;
+}
+
+static bool access_gic_elrsr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = vgic_v3_get_elrsr(vcpu);
+
return true;
}
@@ -2493,6 +2585,120 @@ static bool access_mdcr(struct kvm_vcpu *vcpu,
return true;
}
+/*
+ * For historical (ahem ABI) reasons, KVM treated MIDR_EL1, REVIDR_EL1, and
+ * AIDR_EL1 as "invariant" registers, meaning userspace cannot change them.
+ * The values made visible to userspace were the register values of the boot
+ * CPU.
+ *
+ * At the same time, reads from these registers at EL1 previously were not
+ * trapped, allowing the guest to read the actual hardware value. On big-little
+ * machines, this means the VM can see different values depending on where a
+ * given vCPU got scheduled.
+ *
+ * These registers are now trapped as collateral damage from SME, and what
+ * follows attempts to give a user / guest view consistent with the existing
+ * ABI.
+ */
+static bool access_imp_id_reg(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ /*
+ * Return the VM-scoped implementation ID register values if userspace
+ * has made them writable.
+ */
+ if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &vcpu->kvm->arch.flags))
+ return access_id_reg(vcpu, p, r);
+
+ /*
+ * Otherwise, fall back to the old behavior of returning the value of
+ * the current CPU.
+ */
+ switch (reg_to_encoding(r)) {
+ case SYS_REVIDR_EL1:
+ p->regval = read_sysreg(revidr_el1);
+ break;
+ case SYS_AIDR_EL1:
+ p->regval = read_sysreg(aidr_el1);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ return true;
+}
+
+static u64 __ro_after_init boot_cpu_midr_val;
+static u64 __ro_after_init boot_cpu_revidr_val;
+static u64 __ro_after_init boot_cpu_aidr_val;
+
+static void init_imp_id_regs(void)
+{
+ boot_cpu_midr_val = read_sysreg(midr_el1);
+ boot_cpu_revidr_val = read_sysreg(revidr_el1);
+ boot_cpu_aidr_val = read_sysreg(aidr_el1);
+}
+
+static u64 reset_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ switch (reg_to_encoding(r)) {
+ case SYS_MIDR_EL1:
+ return boot_cpu_midr_val;
+ case SYS_REVIDR_EL1:
+ return boot_cpu_revidr_val;
+ case SYS_AIDR_EL1:
+ return boot_cpu_aidr_val;
+ default:
+ KVM_BUG_ON(1, vcpu->kvm);
+ return 0;
+ }
+}
+
+static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 expected;
+
+ guard(mutex)(&kvm->arch.config_lock);
+
+ expected = read_id_reg(vcpu, r);
+ if (expected == val)
+ return 0;
+
+ if (!test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags))
+ return -EINVAL;
+
+ /*
+ * Once the VM has started the ID registers are immutable. Reject the
+ * write if userspace tries to change it.
+ */
+ if (kvm_vm_has_ran_once(kvm))
+ return -EBUSY;
+
+ /*
+ * Any value is allowed for the implementation ID registers so long as
+ * it is within the writable mask.
+ */
+ if ((val & r->val) != val)
+ return -EINVAL;
+
+ kvm_set_vm_id_reg(kvm, reg_to_encoding(r), val);
+ return 0;
+}
+
+#define IMPLEMENTATION_ID(reg, mask) { \
+ SYS_DESC(SYS_##reg), \
+ .access = access_imp_id_reg, \
+ .get_user = get_id_reg, \
+ .set_user = set_imp_id_reg, \
+ .reset = reset_imp_id_reg, \
+ .val = mask, \
+}
/*
* Architected system registers.
@@ -2542,7 +2748,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 },
+ IMPLEMENTATION_ID(MIDR_EL1, GENMASK_ULL(31, 0)),
{ SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 },
+ IMPLEMENTATION_ID(REVIDR_EL1, GENMASK_ULL(63, 0)),
/*
* ID regs: all ID_SANITISED() entries here must have corresponding
@@ -2660,10 +2868,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_UNALLOCATED(6,7),
/* CRm=7 */
- ID_WRITABLE(ID_AA64MMFR0_EL1, ~(ID_AA64MMFR0_EL1_RES0 |
- ID_AA64MMFR0_EL1_TGRAN4_2 |
- ID_AA64MMFR0_EL1_TGRAN64_2 |
- ID_AA64MMFR0_EL1_TGRAN16_2 |
+ ID_FILTERED(ID_AA64MMFR0_EL1, id_aa64mmfr0_el1,
+ ~(ID_AA64MMFR0_EL1_RES0 |
ID_AA64MMFR0_EL1_ASIDBITS)),
ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 |
ID_AA64MMFR1_EL1_HCX |
@@ -2671,7 +2877,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64MMFR1_EL1_XNX |
ID_AA64MMFR1_EL1_VH |
ID_AA64MMFR1_EL1_VMIDBits)),
- ID_WRITABLE(ID_AA64MMFR2_EL1, ~(ID_AA64MMFR2_EL1_RES0 |
+ ID_FILTERED(ID_AA64MMFR2_EL1,
+ id_aa64mmfr2_el1, ~(ID_AA64MMFR2_EL1_RES0 |
ID_AA64MMFR2_EL1_EVT |
ID_AA64MMFR2_EL1_FWB |
ID_AA64MMFR2_EL1_IDS |
@@ -2680,7 +2887,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX |
ID_AA64MMFR3_EL1_S1PIE |
ID_AA64MMFR3_EL1_S1POE)),
- ID_SANITISED(ID_AA64MMFR4_EL1),
+ ID_WRITABLE(ID_AA64MMFR4_EL1, ID_AA64MMFR4_EL1_NV_frac),
ID_UNALLOCATED(7,5),
ID_UNALLOCATED(7,6),
ID_UNALLOCATED(7,7),
@@ -2814,6 +3021,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
.set_user = set_clidr, .val = ~CLIDR_EL1_RES0 },
{ SYS_DESC(SYS_CCSIDR2_EL1), undef_access },
{ SYS_DESC(SYS_SMIDR_EL1), undef_access },
+ IMPLEMENTATION_ID(AIDR_EL1, GENMASK_ULL(63, 0)),
{ SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 },
ID_FILTERED(CTR_EL0, ctr_el0,
CTR_EL0_DIC_MASK |
@@ -2850,7 +3058,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
.access = access_pmceid, .reset = NULL },
{ PMU_SYS_REG(PMCCNTR_EL0),
.access = access_pmu_evcntr, .reset = reset_unknown,
- .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr},
+ .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr,
+ .set_user = set_pmu_evcntr },
{ PMU_SYS_REG(PMXEVTYPER_EL0),
.access = access_pmu_evtyper, .reset = NULL },
{ PMU_SYS_REG(PMXEVCNTR_EL0),
@@ -3102,7 +3311,40 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG(RVBAR_EL2, access_rw, reset_val, 0),
{ SYS_DESC(SYS_RMR_EL2), undef_access },
+ EL2_REG_VNCR(ICH_AP0R0_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP0R1_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP0R2_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP0R3_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP1R0_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP1R1_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP1R2_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP1R3_EL2, reset_val, 0),
+
+ { SYS_DESC(SYS_ICC_SRE_EL2), access_gic_sre },
+
EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0),
+ { SYS_DESC(SYS_ICH_VTR_EL2), access_gic_vtr },
+ { SYS_DESC(SYS_ICH_MISR_EL2), access_gic_misr },
+ { SYS_DESC(SYS_ICH_EISR_EL2), access_gic_eisr },
+ { SYS_DESC(SYS_ICH_ELRSR_EL2), access_gic_elrsr },
+ EL2_REG_VNCR(ICH_VMCR_EL2, reset_val, 0),
+
+ EL2_REG_VNCR(ICH_LR0_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR1_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR2_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR3_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR4_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR5_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR6_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR7_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR8_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR9_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR10_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR11_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR12_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR13_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR14_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR15_EL2, reset_val, 0),
EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0),
EL2_REG(TPIDR_EL2, access_rw, reset_val, 0),
@@ -4272,9 +4514,13 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu)
* Certain AArch32 ID registers are handled by rerouting to the AArch64
* system register table. Registers in the ID range where CRm=0 are
* excluded from this scheme as they do not trivially map into AArch64
- * system register encodings.
+ * system register encodings, except for AIDR/REVIDR.
*/
- if (params.Op1 == 0 && params.CRn == 0 && params.CRm)
+ if (params.Op1 == 0 && params.CRn == 0 &&
+ (params.CRm || params.Op2 == 6 /* REVIDR */))
+ return kvm_emulate_cp15_id_reg(vcpu, &params);
+ if (params.Op1 == 1 && params.CRn == 0 &&
+ params.CRm == 0 && params.Op2 == 7 /* AIDR */)
return kvm_emulate_cp15_id_reg(vcpu, &params);
return kvm_handle_cp_32(vcpu, &params, cp15_regs, ARRAY_SIZE(cp15_regs));
@@ -4473,6 +4719,9 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
}
set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags);
+
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
}
/**
@@ -4578,65 +4827,6 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id,
return r;
}
-/*
- * These are the invariant sys_reg registers: we let the guest see the
- * host versions of these, so they're part of the guest state.
- *
- * A future CPU may provide a mechanism to present different values to
- * the guest, or a future kvm may trap them.
- */
-
-#define FUNCTION_INVARIANT(reg) \
- static u64 reset_##reg(struct kvm_vcpu *v, \
- const struct sys_reg_desc *r) \
- { \
- ((struct sys_reg_desc *)r)->val = read_sysreg(reg); \
- return ((struct sys_reg_desc *)r)->val; \
- }
-
-FUNCTION_INVARIANT(midr_el1)
-FUNCTION_INVARIANT(revidr_el1)
-FUNCTION_INVARIANT(aidr_el1)
-
-/* ->val is filled in by kvm_sys_reg_table_init() */
-static struct sys_reg_desc invariant_sys_regs[] __ro_after_init = {
- { SYS_DESC(SYS_MIDR_EL1), NULL, reset_midr_el1 },
- { SYS_DESC(SYS_REVIDR_EL1), NULL, reset_revidr_el1 },
- { SYS_DESC(SYS_AIDR_EL1), NULL, reset_aidr_el1 },
-};
-
-static int get_invariant_sys_reg(u64 id, u64 __user *uaddr)
-{
- const struct sys_reg_desc *r;
-
- r = get_reg_by_id(id, invariant_sys_regs,
- ARRAY_SIZE(invariant_sys_regs));
- if (!r)
- return -ENOENT;
-
- return put_user(r->val, uaddr);
-}
-
-static int set_invariant_sys_reg(u64 id, u64 __user *uaddr)
-{
- const struct sys_reg_desc *r;
- u64 val;
-
- r = get_reg_by_id(id, invariant_sys_regs,
- ARRAY_SIZE(invariant_sys_regs));
- if (!r)
- return -ENOENT;
-
- if (get_user(val, uaddr))
- return -EFAULT;
-
- /* This is what we mean by invariant: you can't change it. */
- if (r->val != val)
- return -EINVAL;
-
- return 0;
-}
-
static int demux_c15_get(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr)
{
u32 val;
@@ -4718,15 +4908,10 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
void __user *uaddr = (void __user *)(unsigned long)reg->addr;
- int err;
if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX)
return demux_c15_get(vcpu, reg->id, uaddr);
- err = get_invariant_sys_reg(reg->id, uaddr);
- if (err != -ENOENT)
- return err;
-
return kvm_sys_reg_get_user(vcpu, reg,
sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
}
@@ -4762,15 +4947,10 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
void __user *uaddr = (void __user *)(unsigned long)reg->addr;
- int err;
if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX)
return demux_c15_set(vcpu, reg->id, uaddr);
- err = set_invariant_sys_reg(reg->id, uaddr);
- if (err != -ENOENT)
- return err;
-
return kvm_sys_reg_set_user(vcpu, reg,
sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
}
@@ -4859,23 +5039,14 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind)
unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu)
{
- return ARRAY_SIZE(invariant_sys_regs)
- + num_demux_regs()
+ return num_demux_regs()
+ walk_sys_regs(vcpu, (u64 __user *)NULL);
}
int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
- unsigned int i;
int err;
- /* Then give them all the invariant registers' indices. */
- for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) {
- if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices))
- return -EFAULT;
- uindices++;
- }
-
err = walk_sys_regs(vcpu, uindices);
if (err < 0)
return err;
@@ -4971,25 +5142,7 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu)
mutex_lock(&kvm->arch.config_lock);
vcpu_set_hcr(vcpu);
vcpu_set_ich_hcr(vcpu);
-
- if (cpus_have_final_cap(ARM64_HAS_HCX)) {
- /*
- * In general, all HCRX_EL2 bits are gated by a feature.
- * The only reason we can set SMPME without checking any
- * feature is that its effects are not directly observable
- * from the guest.
- */
- vcpu->arch.hcrx_el2 = HCRX_EL2_SMPME;
-
- if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP))
- vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2);
-
- if (kvm_has_tcr2(kvm))
- vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En;
-
- if (kvm_has_fpmr(kvm))
- vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM;
- }
+ vcpu_set_hcrx(vcpu);
if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags))
goto out;
@@ -5101,15 +5254,12 @@ int __init kvm_sys_reg_table_init(void)
valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true);
valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true);
valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true);
- valid &= check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false);
valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false);
if (!valid)
return -EINVAL;
- /* We abuse the reset function to overwrite the table itself. */
- for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++)
- invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]);
+ init_imp_id_regs();
ret = populate_nv_trap_config();
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 1d94ed6efad2..cc6338d38766 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -247,4 +247,14 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu);
CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \
Op2(sys_reg_Op2(reg))
+#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \
+({ \
+ u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \
+ (val) &= ~reg##_##field##_MASK; \
+ (val) |= FIELD_PREP(reg##_##field##_MASK, \
+ min(__f_val, \
+ (u64)SYS_FIELD_VALUE(reg, field, limit))); \
+ (val); \
+})
+
#endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
index 9e7c486b48c2..5eacb4b3250a 100644
--- a/arch/arm64/kvm/vgic-sys-reg-v3.c
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -35,12 +35,12 @@ static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
vgic_v3_cpu->num_id_bits = host_id_bits;
- host_seis = FIELD_GET(ICH_VTR_SEIS_MASK, kvm_vgic_global_state.ich_vtr_el2);
+ host_seis = FIELD_GET(ICH_VTR_EL2_SEIS, kvm_vgic_global_state.ich_vtr_el2);
seis = FIELD_GET(ICC_CTLR_EL1_SEIS_MASK, val);
if (host_seis != seis)
return -EINVAL;
- host_a3v = FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2);
+ host_a3v = FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2);
a3v = FIELD_GET(ICC_CTLR_EL1_A3V_MASK, val);
if (host_a3v != a3v)
return -EINVAL;
@@ -68,10 +68,10 @@ static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
val |= FIELD_PREP(ICC_CTLR_EL1_PRI_BITS_MASK, vgic_v3_cpu->num_pri_bits - 1);
val |= FIELD_PREP(ICC_CTLR_EL1_ID_BITS_MASK, vgic_v3_cpu->num_id_bits);
val |= FIELD_PREP(ICC_CTLR_EL1_SEIS_MASK,
- FIELD_GET(ICH_VTR_SEIS_MASK,
+ FIELD_GET(ICH_VTR_EL2_SEIS,
kvm_vgic_global_state.ich_vtr_el2));
val |= FIELD_PREP(ICC_CTLR_EL1_A3V_MASK,
- FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2));
+ FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2));
/*
* The VMCR.CTLR value is in ICC_CTLR_EL1 layout.
* Extract it directly using ICC_CTLR_EL1 reg definitions.
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 775461cf2d2d..1f33e71c2a73 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -198,6 +198,27 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
return 0;
}
+/* Default GICv3 Maintenance Interrupt INTID, as per SBSA */
+#define DEFAULT_MI_INTID 25
+
+int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ guard(mutex)(&vcpu->kvm->arch.config_lock);
+
+ /*
+ * Matching the tradition established with the timers, provide
+ * a default PPI for the maintenance interrupt. It makes
+ * things easier to reason about.
+ */
+ if (vcpu->kvm->arch.vgic.mi_intid == 0)
+ vcpu->kvm->arch.vgic.mi_intid = DEFAULT_MI_INTID;
+ ret = kvm_vgic_set_owner(vcpu, vcpu->kvm->arch.vgic.mi_intid, vcpu);
+
+ return ret;
+}
+
static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
@@ -588,12 +609,20 @@ void kvm_vgic_cpu_down(void)
static irqreturn_t vgic_maintenance_handler(int irq, void *data)
{
+ struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)data;
+
/*
* We cannot rely on the vgic maintenance interrupt to be
* delivered synchronously. This means we can only use it to
* exit the VM, and we perform the handling of EOIed
* interrupts on the exit path (see vgic_fold_lr_state).
+ *
+ * Of course, NV throws a wrench in this plan, and needs
+ * something special.
*/
+ if (vcpu && vgic_state_is_nested(vcpu))
+ vgic_v3_handle_nested_maint_irq(vcpu);
+
return IRQ_HANDLED;
}
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index 5f4f57aaa23e..359094f68c23 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -303,6 +303,12 @@ static int vgic_get_common_attr(struct kvm_device *dev,
VGIC_NR_PRIVATE_IRQS, uaddr);
break;
}
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+
+ r = put_user(dev->kvm->arch.vgic.mi_intid, uaddr);
+ break;
+ }
}
return r;
@@ -517,7 +523,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
struct vgic_reg_attr reg_attr;
gpa_t addr;
struct kvm_vcpu *vcpu;
- bool uaccess;
+ bool uaccess, post_init = true;
u32 val;
int ret;
@@ -533,6 +539,9 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
/* Sysregs uaccess is performed by the sysreg handling code */
uaccess = false;
break;
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
+ post_init = false;
+ fallthrough;
default:
uaccess = true;
}
@@ -552,7 +561,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->arch.config_lock);
- if (unlikely(!vgic_initialized(dev->kvm))) {
+ if (post_init != vgic_initialized(dev->kvm)) {
ret = -EBUSY;
goto out;
}
@@ -582,6 +591,19 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
}
break;
}
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
+ if (!is_write) {
+ val = dev->kvm->arch.vgic.mi_intid;
+ ret = 0;
+ break;
+ }
+
+ ret = -EINVAL;
+ if ((val < VGIC_NR_PRIVATE_IRQS) && (val >= VGIC_NR_SGIS)) {
+ dev->kvm->arch.vgic.mi_intid = val;
+ ret = 0;
+ }
+ break;
default:
ret = -EINVAL;
break;
@@ -608,6 +630,7 @@ static int vgic_v3_set_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
return vgic_v3_attr_regs_access(dev, attr, true);
default:
return vgic_set_common_attr(dev, attr);
@@ -622,6 +645,7 @@ static int vgic_v3_get_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
return vgic_v3_attr_regs_access(dev, attr, false);
default:
return vgic_get_common_attr(dev, attr);
@@ -645,6 +669,7 @@ static int vgic_v3_has_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
return vgic_v3_has_attr_regs(dev, attr);
case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
return 0;
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
new file mode 100644
index 000000000000..bfa5bde1f106
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -0,0 +1,409 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+
+#include <kvm/arm_vgic.h>
+
+#include <asm/kvm_arm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_nested.h>
+
+#include "vgic.h"
+
+#define ICH_LRN(n) (ICH_LR0_EL2 + (n))
+#define ICH_AP0RN(n) (ICH_AP0R0_EL2 + (n))
+#define ICH_AP1RN(n) (ICH_AP1R0_EL2 + (n))
+
+struct mi_state {
+ u16 eisr;
+ u16 elrsr;
+ bool pend;
+};
+
+/*
+ * The shadow registers loaded to the hardware when running a L2 guest
+ * with the virtual IMO/FMO bits set.
+ */
+struct shadow_if {
+ struct vgic_v3_cpu_if cpuif;
+ unsigned long lr_map;
+};
+
+static DEFINE_PER_CPU(struct shadow_if, shadow_if);
+
+/*
+ * Nesting GICv3 support
+ *
+ * On a non-nesting VM (only running at EL0/EL1), the host hypervisor
+ * completely controls the interrupts injected via the list registers.
+ * Consequently, most of the state that is modified by the guest (by ACK-ing
+ * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we
+ * keep a semi-consistent view of the interrupts.
+ *
+ * This still applies for a NV guest, but only while "InHost" (either
+ * running at EL2, or at EL0 with HCR_EL2.{E2H.TGE}=={1,1}.
+ *
+ * When running a L2 guest ("not InHost"), things are radically different,
+ * as the L1 guest is in charge of provisioning the interrupts via its own
+ * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR
+ * page. This means that the flow described above does work (there is no
+ * state to rebuild in the L0 hypervisor), and that most things happed on L2
+ * load/put:
+ *
+ * - on L2 load: move the in-memory L1 vGIC configuration into a shadow,
+ * per-CPU data structure that is used to populate the actual LRs. This is
+ * an extra copy that we could avoid, but life is short. In the process,
+ * we remap any interrupt that has the HW bit set to the mapped interrupt
+ * on the host, should the host consider it a HW one. This allows the HW
+ * deactivation to take its course, such as for the timer.
+ *
+ * - on L2 put: perform the inverse transformation, so that the result of L2
+ * running becomes visible to L1 in the VNCR-accessible registers.
+ *
+ * - there is nothing to do on L2 entry, as everything will have happened
+ * on load. However, this is the point where we detect that an interrupt
+ * targeting L1 and prepare the grand switcheroo.
+ *
+ * - on L2 exit: emulate the HW bit, and deactivate corresponding the L1
+ * interrupt. The L0 active state will be cleared by the HW if the L1
+ * interrupt was itself backed by a HW interrupt.
+ *
+ * Maintenance Interrupt (MI) management:
+ *
+ * Since the L2 guest runs the vgic in its full glory, MIs get delivered and
+ * used as a handover point between L2 and L1.
+ *
+ * - on delivery of a MI to L0 while L2 is running: make the L1 MI pending,
+ * and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to
+ * run and process the MI.
+ *
+ * - L1 MI is a fully virtual interrupt, not linked to the host's MI. Its
+ * state must be computed at each entry/exit of the guest, much like we do
+ * it for the PMU interrupt.
+ *
+ * - because most of the ICH_*_EL2 registers live in the VNCR page, the
+ * quality of emulation is poor: L1 can setup the vgic so that an MI would
+ * immediately fire, and not observe anything until the next exit. Trying
+ * to read ICH_MISR_EL2 would do the trick, for example.
+ *
+ * System register emulation:
+ *
+ * We get two classes of registers:
+ *
+ * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access
+ * them, and L0 doesn't see a thing.
+ *
+ * - those that always trap (ELRSR, EISR, MISR): these are status registers
+ * that are built on the fly based on the in-memory state.
+ *
+ * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot,
+ * and a NV L2 would either access the VNCR page provided by L1 (memory
+ * based registers), or see the access redirected to L1 (registers that
+ * trap) thanks to NV being set by L1.
+ */
+
+bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
+{
+ u64 xmo;
+
+ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+ xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
+ WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
+ "Separate virtual IRQ/FIQ settings not supported\n");
+
+ return !!xmo;
+ }
+
+ return false;
+}
+
+static struct shadow_if *get_shadow_if(void)
+{
+ return this_cpu_ptr(&shadow_if);
+}
+
+static bool lr_triggers_eoi(u64 lr)
+{
+ return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI);
+}
+
+static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state)
+{
+ u16 eisr = 0, elrsr = 0;
+ bool pend = false;
+
+ for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
+ u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+
+ if (lr_triggers_eoi(lr))
+ eisr |= BIT(i);
+ if (!(lr & ICH_LR_STATE))
+ elrsr |= BIT(i);
+ pend |= (lr & ICH_LR_PENDING_BIT);
+ }
+
+ mi_state->eisr = eisr;
+ mi_state->elrsr = elrsr;
+ mi_state->pend = pend;
+}
+
+u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu)
+{
+ struct mi_state mi_state;
+
+ vgic_compute_mi_state(vcpu, &mi_state);
+ return mi_state.eisr;
+}
+
+u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu)
+{
+ struct mi_state mi_state;
+
+ vgic_compute_mi_state(vcpu, &mi_state);
+ return mi_state.elrsr;
+}
+
+u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
+{
+ struct mi_state mi_state;
+ u64 reg = 0, hcr, vmcr;
+
+ hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
+ vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
+
+ vgic_compute_mi_state(vcpu, &mi_state);
+
+ if (mi_state.eisr)
+ reg |= ICH_MISR_EL2_EOI;
+
+ if (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_UIE) {
+ int used_lrs = kvm_vgic_global_state.nr_lr;
+
+ used_lrs -= hweight16(mi_state.elrsr);
+ reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0;
+ }
+
+ if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr))
+ reg |= ICH_MISR_EL2_LRENP;
+
+ if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend)
+ reg |= ICH_MISR_EL2_NP;
+
+ if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK))
+ reg |= ICH_MISR_EL2_VGrp0E;
+
+ if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK))
+ reg |= ICH_MISR_EL2_VGrp0D;
+
+ if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK))
+ reg |= ICH_MISR_EL2_VGrp1E;
+
+ if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK))
+ reg |= ICH_MISR_EL2_VGrp1D;
+
+ return reg;
+}
+
+/*
+ * For LRs which have HW bit set such as timer interrupts, we modify them to
+ * have the host hardware interrupt number instead of the virtual one programmed
+ * by the guest hypervisor.
+ */
+static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
+ struct vgic_v3_cpu_if *s_cpu_if)
+{
+ unsigned long lr_map = 0;
+ int index = 0;
+
+ for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
+ u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+ struct vgic_irq *irq;
+
+ if (!(lr & ICH_LR_STATE))
+ lr = 0;
+
+ if (!(lr & ICH_LR_HW))
+ goto next;
+
+ /* We have the HW bit set, check for validity of pINTID */
+ irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+ if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI ) {
+ /* There was no real mapping, so nuke the HW bit */
+ lr &= ~ICH_LR_HW;
+ if (irq)
+ vgic_put_irq(vcpu->kvm, irq);
+ goto next;
+ }
+
+ /* It is illegal to have the EOI bit set with HW */
+ lr &= ~ICH_LR_EOI;
+
+ /* Translate the virtual mapping to the real one */
+ lr &= ~ICH_LR_PHYS_ID_MASK;
+ lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
+
+ vgic_put_irq(vcpu->kvm, irq);
+
+next:
+ s_cpu_if->vgic_lr[index] = lr;
+ if (lr) {
+ lr_map |= BIT(i);
+ index++;
+ }
+ }
+
+ container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map;
+ s_cpu_if->used_lrs = index;
+}
+
+void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
+{
+ struct shadow_if *shadow_if = get_shadow_if();
+ int i, index = 0;
+
+ for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
+ u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+ struct vgic_irq *irq;
+
+ if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
+ goto next;
+
+ /*
+ * If we had a HW lr programmed by the guest hypervisor, we
+ * need to emulate the HW effect between the guest hypervisor
+ * and the nested guest.
+ */
+ irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+ if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
+ goto next;
+
+ lr = __gic_v3_get_lr(index);
+ if (!(lr & ICH_LR_STATE))
+ irq->active = false;
+
+ vgic_put_irq(vcpu->kvm, irq);
+ next:
+ index++;
+ }
+}
+
+static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
+ struct vgic_v3_cpu_if *s_cpu_if)
+{
+ struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
+ u64 val = 0;
+ int i;
+
+ /*
+ * If we're on a system with a broken vgic that requires
+ * trapping, propagate the trapping requirements.
+ *
+ * Ah, the smell of rotten fruits...
+ */
+ if (static_branch_unlikely(&vgic_v3_cpuif_trap))
+ val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
+ ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR);
+ s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val;
+ s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
+ s_cpu_if->vgic_sre = host_if->vgic_sre;
+
+ for (i = 0; i < 4; i++) {
+ s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i));
+ s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i));
+ }
+
+ vgic_v3_create_shadow_lr(vcpu, s_cpu_if);
+}
+
+void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
+{
+ struct shadow_if *shadow_if = get_shadow_if();
+ struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif;
+
+ BUG_ON(!vgic_state_is_nested(vcpu));
+
+ vgic_v3_create_shadow_state(vcpu, cpu_if);
+
+ __vgic_v3_restore_vmcr_aprs(cpu_if);
+ __vgic_v3_activate_traps(cpu_if);
+
+ __vgic_v3_restore_state(cpu_if);
+
+ /*
+ * Propagate the number of used LRs for the benefit of the HYP
+ * GICv3 emulation code. Yes, this is a pretty sorry hack.
+ */
+ vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs;
+}
+
+void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
+{
+ struct shadow_if *shadow_if = get_shadow_if();
+ struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
+ u64 val;
+ int i;
+
+ __vgic_v3_save_vmcr_aprs(s_cpu_if);
+ __vgic_v3_deactivate_traps(s_cpu_if);
+ __vgic_v3_save_state(s_cpu_if);
+
+ /*
+ * Translate the shadow state HW fields back to the virtual ones
+ * before copying the shadow struct back to the nested one.
+ */
+ val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
+ val &= ~ICH_HCR_EL2_EOIcount_MASK;
+ val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK);
+ __vcpu_sys_reg(vcpu, ICH_HCR_EL2) = val;
+ __vcpu_sys_reg(vcpu, ICH_VMCR_EL2) = s_cpu_if->vgic_vmcr;
+
+ for (i = 0; i < 4; i++) {
+ __vcpu_sys_reg(vcpu, ICH_AP0RN(i)) = s_cpu_if->vgic_ap0r[i];
+ __vcpu_sys_reg(vcpu, ICH_AP1RN(i)) = s_cpu_if->vgic_ap1r[i];
+ }
+
+ for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
+ val = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+
+ val &= ~ICH_LR_STATE;
+ val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE;
+
+ __vcpu_sys_reg(vcpu, ICH_LRN(i)) = val;
+ s_cpu_if->vgic_lr[i] = 0;
+ }
+
+ shadow_if->lr_map = 0;
+ vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
+}
+
+/*
+ * If we exit a L2 VM with a pending maintenance interrupt from the GIC,
+ * then we need to forward this to L1 so that it can re-sync the appropriate
+ * LRs and sample level triggered interrupts again.
+ */
+void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu)
+{
+ bool state = read_sysreg_s(SYS_ICH_MISR_EL2);
+
+ /* This will force a switch back to L1 if the level is high */
+ kvm_vgic_inject_irq(vcpu->kvm, vcpu,
+ vcpu->kvm->arch.vgic.mi_intid, state, vcpu);
+
+ sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0);
+}
+
+void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
+{
+ bool level;
+
+ level = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En;
+ if (level)
+ level &= vgic_v3_get_misr(vcpu);
+ kvm_vgic_inject_irq(vcpu->kvm, vcpu,
+ vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index d7233ab982d0..b9ad7c42c5b0 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -24,7 +24,7 @@ void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
- cpuif->vgic_hcr |= ICH_HCR_UIE;
+ cpuif->vgic_hcr |= ICH_HCR_EL2_UIE;
}
static bool lr_signals_eoi_mi(u64 lr_val)
@@ -42,7 +42,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
- cpuif->vgic_hcr &= ~ICH_HCR_UIE;
+ cpuif->vgic_hcr &= ~ICH_HCR_EL2_UIE;
for (lr = 0; lr < cpuif->used_lrs; lr++) {
u64 val = cpuif->vgic_lr[lr];
@@ -284,15 +284,13 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
vgic_v3->vgic_sre = 0;
}
- vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 &
- ICH_VTR_ID_BITS_MASK) >>
- ICH_VTR_ID_BITS_SHIFT;
- vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 &
- ICH_VTR_PRI_BITS_MASK) >>
- ICH_VTR_PRI_BITS_SHIFT) + 1;
+ vcpu->arch.vgic_cpu.num_id_bits = FIELD_GET(ICH_VTR_EL2_IDbits,
+ kvm_vgic_global_state.ich_vtr_el2);
+ vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits,
+ kvm_vgic_global_state.ich_vtr_el2) + 1;
/* Get the show on the road... */
- vgic_v3->vgic_hcr = ICH_HCR_EN;
+ vgic_v3->vgic_hcr = ICH_HCR_EL2_En;
}
void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
@@ -301,18 +299,19 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
/* Hide GICv3 sysreg if necessary */
if (!kvm_has_gicv3(vcpu->kvm)) {
- vgic_v3->vgic_hcr |= ICH_HCR_TALL0 | ICH_HCR_TALL1 | ICH_HCR_TC;
+ vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
+ ICH_HCR_EL2_TC);
return;
}
if (group0_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_TALL0;
+ vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL0;
if (group1_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_TALL1;
+ vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL1;
if (common_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_TC;
+ vgic_v3->vgic_hcr |= ICH_HCR_EL2_TC;
if (dir_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_TDIR;
+ vgic_v3->vgic_hcr |= ICH_HCR_EL2_TDIR;
}
int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
@@ -632,8 +631,8 @@ static const struct midr_range broken_seis[] = {
static bool vgic_v3_broken_seis(void)
{
- return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) &&
- is_midr_in_range_list(read_cpuid_id(), broken_seis));
+ return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_SEIS) &&
+ is_midr_in_range_list(broken_seis));
}
/**
@@ -706,10 +705,10 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
if (vgic_v3_broken_seis()) {
kvm_info("GICv3 with broken locally generated SEI\n");
- kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK;
+ kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS;
group0_trap = true;
group1_trap = true;
- if (ich_vtr_el2 & ICH_VTR_TDS_MASK)
+ if (ich_vtr_el2 & ICH_VTR_EL2_TDS)
dir_trap = true;
else
common_trap = true;
@@ -735,6 +734,12 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+ /* If the vgic is nested, perform the full state loading */
+ if (vgic_state_is_nested(vcpu)) {
+ vgic_v3_load_nested(vcpu);
+ return;
+ }
+
if (likely(!is_protected_kvm_enabled()))
kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if);
@@ -748,6 +753,11 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+ if (vgic_state_is_nested(vcpu)) {
+ vgic_v3_put_nested(vcpu);
+ return;
+ }
+
if (likely(!is_protected_kvm_enabled()))
kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if);
WARN_ON(vgic_v4_put(vcpu));
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index eedecbbbcf31..c7de6154627c 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -336,6 +336,22 @@ void vgic_v4_teardown(struct kvm *kvm)
its_vm->vpes = NULL;
}
+static inline bool vgic_v4_want_doorbell(struct kvm_vcpu *vcpu)
+{
+ if (vcpu_get_flag(vcpu, IN_WFI))
+ return true;
+
+ if (likely(!vcpu_has_nv(vcpu)))
+ return false;
+
+ /*
+ * GICv4 hardware is only ever used for the L1. Mark the vPE (i.e. the
+ * L1 context) nonresident and request a doorbell to kick us out of the
+ * L2 when an IRQ becomes pending.
+ */
+ return vcpu_get_flag(vcpu, IN_NESTED_ERET);
+}
+
int vgic_v4_put(struct kvm_vcpu *vcpu)
{
struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
@@ -343,7 +359,7 @@ int vgic_v4_put(struct kvm_vcpu *vcpu)
if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
return 0;
- return its_make_vpe_non_resident(vpe, !!vcpu_get_flag(vcpu, IN_WFI));
+ return its_make_vpe_non_resident(vpe, vgic_v4_want_doorbell(vcpu));
}
int vgic_v4_load(struct kvm_vcpu *vcpu)
@@ -415,7 +431,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
struct vgic_irq *irq;
struct its_vlpi_map map;
unsigned long flags;
- int ret;
+ int ret = 0;
if (!vgic_supports_direct_msis(kvm))
return 0;
@@ -430,10 +446,15 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
mutex_lock(&its->its_lock);
- /* Perform the actual DevID/EventID -> LPI translation. */
- ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
- irq_entry->msi.data, &irq);
- if (ret)
+ /*
+ * Perform the actual DevID/EventID -> LPI translation.
+ *
+ * Silently exit if translation fails as the guest (or userspace!) has
+ * managed to do something stupid. Emulated LPI injection will still
+ * work if the guest figures itself out at a later time.
+ */
+ if (vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
+ irq_entry->msi.data, &irq))
goto out;
/* Silently exit if the vLPI is already mapped */
@@ -512,7 +533,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq,
if (ret)
goto out;
- WARN_ON(!(irq->hw && irq->host_irq == virq));
+ WARN_ON(irq->hw && irq->host_irq != virq);
if (irq->hw) {
atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
irq->hw = false;
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index cc8c6b9b5dd8..8f8096d48925 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -872,6 +872,15 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
{
int used_lrs;
+ /* If nesting, emulate the HW effect from L0 to L1 */
+ if (vgic_state_is_nested(vcpu)) {
+ vgic_v3_sync_nested(vcpu);
+ return;
+ }
+
+ if (vcpu_has_nv(vcpu))
+ vgic_v3_nested_update_mi(vcpu);
+
/* An empty ap_list_head implies used_lrs == 0 */
if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
return;
@@ -901,6 +910,35 @@ static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
{
/*
+ * If in a nested state, we must return early. Two possibilities:
+ *
+ * - If we have any pending IRQ for the guest and the guest
+ * expects IRQs to be handled in its virtual EL2 mode (the
+ * virtual IMO bit is set) and it is not already running in
+ * virtual EL2 mode, then we have to emulate an IRQ
+ * exception to virtual EL2.
+ *
+ * We do that by placing a request to ourselves which will
+ * abort the entry procedure and inject the exception at the
+ * beginning of the run loop.
+ *
+ * - Otherwise, do exactly *NOTHING*. The guest state is
+ * already loaded, and we can carry on with running it.
+ *
+ * If we have NV, but are not in a nested state, compute the
+ * maintenance interrupt state, as it may fire.
+ */
+ if (vgic_state_is_nested(vcpu)) {
+ if (kvm_vgic_vcpu_pending_irq(vcpu))
+ kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu);
+
+ return;
+ }
+
+ if (vcpu_has_nv(vcpu))
+ vgic_v3_nested_update_mi(vcpu);
+
+ /*
* If there are no virtual interrupts active or pending for this
* VCPU, then there is no work to do and we can bail out without
* taking any lock. There is a potential race with someone injecting
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 122d95b4e284..0c5a63712702 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -353,4 +353,10 @@ static inline bool kvm_has_gicv3(struct kvm *kvm)
return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP);
}
+void vgic_v3_sync_nested(struct kvm_vcpu *vcpu);
+void vgic_v3_load_nested(struct kvm_vcpu *vcpu);
+void vgic_v3_put_nested(struct kvm_vcpu *vcpu);
+void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu);
+void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu);
+
#endif
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index a5a5f5b97b17..de9a303b6ad0 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -17,14 +17,27 @@
* Alignment fixed up by hardware.
*/
- .p2align 4
- // Alignment is for the loop, but since the prologue (including BTI)
- // is also 16 bytes we can keep any padding outside the function
SYM_FUNC_START(__arch_clear_user)
add x2, x0, x1
+
+#ifdef CONFIG_AS_HAS_MOPS
+ .arch_extension mops
+alternative_if_not ARM64_HAS_MOPS
+ b .Lno_mops
+alternative_else_nop_endif
+
+USER(9f, setpt [x0]!, x1!, xzr)
+USER(6f, setmt [x0]!, x1!, xzr)
+USER(6f, setet [x0]!, x1!, xzr)
+ mov x0, #0
+ ret
+.Lno_mops:
+#endif
+
subs x1, x1, #8
b.mi 2f
-1:
+
+1: .p2align 4
USER(9f, sttr xzr, [x0])
add x0, x0, #8
subs x1, x1, #8
@@ -47,6 +60,10 @@ USER(7f, sttrb wzr, [x2, #-1])
ret
// Exception fixups
+6: b.cs 9f
+ // Registers are in Option A format
+ add x0, x0, x1
+ b 9f
7: sub x0, x2, #5 // Adjust for faulting on the final byte...
8: add x0, x0, #4 // ...or the second word of the 4-7 byte case
9: sub x0, x2, x0
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 34e317907524..400057d607ec 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -52,6 +52,13 @@
stp \reg1, \reg2, [\ptr], \val
.endm
+ .macro cpy1 dst, src, count
+ .arch_extension mops
+ USER_CPY(9997f, 0, cpyfprt [\dst]!, [\src]!, \count!)
+ USER_CPY(9996f, 0, cpyfmrt [\dst]!, [\src]!, \count!)
+ USER_CPY(9996f, 0, cpyfert [\dst]!, [\src]!, \count!)
+ .endm
+
end .req x5
srcin .req x15
SYM_FUNC_START(__arch_copy_from_user)
@@ -62,6 +69,9 @@ SYM_FUNC_START(__arch_copy_from_user)
ret
// Exception fixups
+9996: b.cs 9997f
+ // Registers are in Option A format
+ add dst, dst, count
9997: cmp dst, dstin
b.ne 9998f
// Before being absolutely sure we couldn't copy anything, try harder
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 488df234c49a..7f2f5a0e2fb9 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -40,6 +40,16 @@ D_l .req x13
D_h .req x14
mov dst, dstin
+
+#ifdef CONFIG_AS_HAS_MOPS
+alternative_if_not ARM64_HAS_MOPS
+ b .Lno_mops
+alternative_else_nop_endif
+ cpy1 dst, src, count
+ b .Lexitfunc
+.Lno_mops:
+#endif
+
cmp count, #16
/*When memory length is less than 16, the accessed are not aligned.*/
b.lo .Ltiny15
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 802231772608..819f2e3fc7a9 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -51,6 +51,13 @@
user_stp 9997f, \reg1, \reg2, \ptr, \val
.endm
+ .macro cpy1 dst, src, count
+ .arch_extension mops
+ USER_CPY(9997f, 1, cpyfpwt [\dst]!, [\src]!, \count!)
+ USER_CPY(9996f, 1, cpyfmwt [\dst]!, [\src]!, \count!)
+ USER_CPY(9996f, 1, cpyfewt [\dst]!, [\src]!, \count!)
+ .endm
+
end .req x5
srcin .req x15
SYM_FUNC_START(__arch_copy_to_user)
@@ -61,6 +68,9 @@ SYM_FUNC_START(__arch_copy_to_user)
ret
// Exception fixups
+9996: b.cs 9997f
+ // Registers are in Option A format
+ add dst, dst, count
9997: cmp dst, dstin
b.ne 9998f
// Before being absolutely sure we couldn't copy anything, try harder
diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
index 228d681a8715..6e0528831cd3 100644
--- a/arch/arm64/mm/extable.c
+++ b/arch/arm64/mm/extable.c
@@ -8,8 +8,33 @@
#include <linux/uaccess.h>
#include <asm/asm-extable.h>
+#include <asm/esr.h>
#include <asm/ptrace.h>
+static bool cpy_faulted_on_uaccess(const struct exception_table_entry *ex,
+ unsigned long esr)
+{
+ bool uaccess_is_write = FIELD_GET(EX_DATA_UACCESS_WRITE, ex->data);
+ bool fault_on_write = esr & ESR_ELx_WNR;
+
+ return uaccess_is_write == fault_on_write;
+}
+
+bool insn_may_access_user(unsigned long addr, unsigned long esr)
+{
+ const struct exception_table_entry *ex = search_exception_tables(addr);
+
+ if (!ex)
+ return false;
+
+ switch (ex->type) {
+ case EX_TYPE_UACCESS_CPY:
+ return cpy_faulted_on_uaccess(ex, esr);
+ default:
+ return true;
+ }
+}
+
static inline unsigned long
get_ex_fixup(const struct exception_table_entry *ex)
{
@@ -29,6 +54,17 @@ static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex,
return true;
}
+static bool ex_handler_uaccess_cpy(const struct exception_table_entry *ex,
+ struct pt_regs *regs, unsigned long esr)
+{
+ /* Do not fix up faults on kernel memory accesses */
+ if (!cpy_faulted_on_uaccess(ex, esr))
+ return false;
+
+ regs->pc = get_ex_fixup(ex);
+ return true;
+}
+
static bool
ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex,
struct pt_regs *regs)
@@ -56,7 +92,7 @@ ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex,
return true;
}
-bool fixup_exception(struct pt_regs *regs)
+bool fixup_exception(struct pt_regs *regs, unsigned long esr)
{
const struct exception_table_entry *ex;
@@ -70,6 +106,8 @@ bool fixup_exception(struct pt_regs *regs)
case EX_TYPE_UACCESS_ERR_ZERO:
case EX_TYPE_KACCESS_ERR_ZERO:
return ex_handler_uaccess_err_zero(ex, regs);
+ case EX_TYPE_UACCESS_CPY:
+ return ex_handler_uaccess_cpy(ex, regs, esr);
case EX_TYPE_LOAD_UNALIGNED_ZEROPAD:
return ex_handler_load_unaligned_zeropad(ex, regs);
}
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index ef63651099a9..ec0a337891dd 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -375,7 +375,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
* Are we prepared to handle this kernel fault?
* We are almost certainly not prepared to handle instruction faults.
*/
- if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
+ if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr))
return;
if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
@@ -606,7 +606,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
die_kernel_fault("execution of user memory",
addr, esr, regs);
- if (!search_exception_tables(regs->pc))
+ if (!insn_may_access_user(regs->pc, esr))
die_kernel_fault("access to user memory outside uaccess routines",
addr, esr, regs);
}
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index b3a7fafe8892..cfe8cb8ba1cc 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -334,7 +334,9 @@ unsigned long hugetlb_mask_last_page(struct hstate *h)
switch (hp_size) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SIZE:
- return PGDIR_SIZE - PUD_SIZE;
+ if (pud_sect_supported())
+ return PGDIR_SIZE - PUD_SIZE;
+ break;
#endif
case CONT_PMD_SIZE:
return PUD_SIZE - CONT_PMD_SIZE;
@@ -356,23 +358,21 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
switch (pagesize) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SIZE:
- entry = pud_pte(pud_mkhuge(pte_pud(entry)));
+ if (pud_sect_supported())
+ return pud_pte(pud_mkhuge(pte_pud(entry)));
break;
#endif
case CONT_PMD_SIZE:
- entry = pmd_pte(pmd_mkcont(pte_pmd(entry)));
- fallthrough;
+ return pmd_pte(pmd_mkhuge(pmd_mkcont(pte_pmd(entry))));
case PMD_SIZE:
- entry = pmd_pte(pmd_mkhuge(pte_pmd(entry)));
- break;
+ return pmd_pte(pmd_mkhuge(pte_pmd(entry)));
case CONT_PTE_SIZE:
- entry = pte_mkcont(entry);
- break;
+ return pte_mkcont(entry);
default:
- pr_warn("%s: unrecognized huge page size 0x%lx\n",
- __func__, pagesize);
break;
}
+ pr_warn("%s: unrecognized huge page size 0x%lx\n",
+ __func__, pagesize);
return entry;
}
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index b65a29440a0c..d541ce45daeb 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -190,7 +190,7 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
*/
static bool __init root_level_aligned(u64 addr)
{
- int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * (PAGE_SHIFT - 3);
+ int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * PTDESC_TABLE_SHIFT;
return (addr % (PAGE_SIZE << shift)) == 0;
}
@@ -245,7 +245,7 @@ static int __init root_level_idx(u64 addr)
*/
u64 vabits = IS_ENABLED(CONFIG_ARM64_64K_PAGES) ? VA_BITS
: vabits_actual;
- int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * (PAGE_SHIFT - 3);
+ int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * PTDESC_TABLE_SHIFT;
return (addr & ~_PAGE_OFFSET(vabits)) >> (shift + PAGE_SHIFT);
}
@@ -269,7 +269,7 @@ static void __init clone_next_level(u64 addr, pgd_t *tmp_pg_dir, pud_t *pud)
*/
static int __init next_level_idx(u64 addr)
{
- int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * (PAGE_SHIFT - 3);
+ int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * PTDESC_TABLE_SHIFT;
return (addr >> (shift + PAGE_SHIFT)) % PTRS_PER_PTE;
}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 1dfe1a8efdbe..b98f89420713 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1558,9 +1558,8 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
#ifdef CONFIG_ARCH_HAS_PKEYS
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
{
- u64 new_por = POE_RXW;
+ u64 new_por;
u64 old_por;
- u64 pkey_shift;
if (!system_supports_poe())
return -ENOSPC;
@@ -1574,7 +1573,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i
return -EINVAL;
/* Set the bits we need in POR: */
- new_por = POE_RXW;
+ new_por = POE_RWX;
if (init_val & PKEY_DISABLE_WRITE)
new_por &= ~POE_W;
if (init_val & PKEY_DISABLE_ACCESS)
@@ -1585,12 +1584,11 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i
new_por &= ~POE_X;
/* Shift the bits in to the correct place in POR for pkey: */
- pkey_shift = pkey * POR_BITS_PER_PKEY;
- new_por <<= pkey_shift;
+ new_por = POR_ELx_PERM_PREP(pkey, new_por);
/* Get old POR and mask off any old bits in place: */
old_por = read_sysreg_s(SYS_POR_EL0);
- old_por &= ~(POE_MASK << pkey_shift);
+ old_por &= ~(POE_MASK << POR_ELx_PERM_SHIFT(pkey));
/* Write old part along with new part: */
write_sysreg_s(old_por | new_por, SYS_POR_EL0);
diff --git a/arch/arm64/mm/physaddr.c b/arch/arm64/mm/physaddr.c
index cde44c13dda1..7d94e09b01b3 100644
--- a/arch/arm64/mm/physaddr.c
+++ b/arch/arm64/mm/physaddr.c
@@ -10,7 +10,7 @@
phys_addr_t __virt_to_phys(unsigned long x)
{
WARN(!__is_lm_address(__tag_reset(x)),
- "virt_to_phys used for non-linear address: %pK (%pS)\n",
+ "virt_to_phys used for non-linear address: %p (%pS)\n",
(void *)x,
(void *)x);
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 688fbe0271ca..8cec0da4cff2 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -80,8 +80,8 @@ static const struct ptdump_prot_bits pte_bits[] = {
.set = "CON",
.clear = " ",
}, {
- .mask = PTE_TABLE_BIT | PTE_VALID,
- .val = PTE_VALID,
+ .mask = PMD_TYPE_MASK,
+ .val = PMD_TYPE_SECT,
.set = "BLK",
.clear = " ",
}, {
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 1e65f2fb45bd..772c1b008e43 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -45,6 +45,7 @@ HAS_LSE_ATOMICS
HAS_MOPS
HAS_NESTED_VIRT
HAS_PAN
+HAS_PMUV3
HAS_S1PIE
HAS_S1POE
HAS_RAS_EXTN
@@ -104,6 +105,7 @@ WORKAROUND_CAVIUM_TX2_219_TVM
WORKAROUND_CLEAN_CACHE
WORKAROUND_DEVICE_LOAD_ACQUIRE
WORKAROUND_NVIDIA_CARMEL_CNP
+WORKAROUND_PMUV3_IMPDEF_TRAPS
WORKAROUND_QCOM_FALKOR_E1003
WORKAROUND_QCOM_ORYON_CNTVOFF
WORKAROUND_REPEAT_TLBI
diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk
index 1a2afc9fdd42..f2a1732cb1f6 100755
--- a/arch/arm64/tools/gen-sysreg.awk
+++ b/arch/arm64/tools/gen-sysreg.awk
@@ -111,7 +111,7 @@ END {
/^$/ { next }
/^[\t ]*#/ { next }
-/^SysregFields/ && block_current() == "Root" {
+$1 == "SysregFields" && block_current() == "Root" {
block_push("SysregFields")
expect_fields(2)
@@ -127,7 +127,8 @@ END {
next
}
-/^EndSysregFields/ && block_current() == "SysregFields" {
+$1 == "EndSysregFields" && block_current() == "SysregFields" {
+ expect_fields(1)
if (next_bit > 0)
fatal("Unspecified bits in " reg)
@@ -145,7 +146,7 @@ END {
next
}
-/^Sysreg/ && block_current() == "Root" {
+$1 == "Sysreg" && block_current() == "Root" {
block_push("Sysreg")
expect_fields(7)
@@ -177,7 +178,8 @@ END {
next
}
-/^EndSysreg/ && block_current() == "Sysreg" {
+$1 == "EndSysreg" && block_current() == "Sysreg" {
+ expect_fields(1)
if (next_bit > 0)
fatal("Unspecified bits in " reg)
@@ -206,7 +208,7 @@ END {
# Currently this is effectivey a comment, in future we may want to emit
# defines for the fields.
-(/^Fields/ || /^Mapping/) && block_current() == "Sysreg" {
+($1 == "Fields" || $1 == "Mapping") && block_current() == "Sysreg" {
expect_fields(2)
if (next_bit != 63)
@@ -224,7 +226,7 @@ END {
}
-/^Res0/ && (block_current() == "Sysreg" || block_current() == "SysregFields") {
+$1 == "Res0" && (block_current() == "Sysreg" || block_current() == "SysregFields") {
expect_fields(2)
parse_bitdef(reg, "RES0", $2)
field = "RES0_" msb "_" lsb
@@ -234,7 +236,7 @@ END {
next
}
-/^Res1/ && (block_current() == "Sysreg" || block_current() == "SysregFields") {
+$1 == "Res1" && (block_current() == "Sysreg" || block_current() == "SysregFields") {
expect_fields(2)
parse_bitdef(reg, "RES1", $2)
field = "RES1_" msb "_" lsb
@@ -244,7 +246,7 @@ END {
next
}
-/^Unkn/ && (block_current() == "Sysreg" || block_current() == "SysregFields") {
+$1 == "Unkn" && (block_current() == "Sysreg" || block_current() == "SysregFields") {
expect_fields(2)
parse_bitdef(reg, "UNKN", $2)
field = "UNKN_" msb "_" lsb
@@ -254,7 +256,7 @@ END {
next
}
-/^Field/ && (block_current() == "Sysreg" || block_current() == "SysregFields") {
+$1 == "Field" && (block_current() == "Sysreg" || block_current() == "SysregFields") {
expect_fields(3)
field = $3
parse_bitdef(reg, field, $2)
@@ -265,14 +267,14 @@ END {
next
}
-/^Raz/ && (block_current() == "Sysreg" || block_current() == "SysregFields") {
+$1 == "Raz" && (block_current() == "Sysreg" || block_current() == "SysregFields") {
expect_fields(2)
parse_bitdef(reg, field, $2)
next
}
-/^SignedEnum/ && (block_current() == "Sysreg" || block_current() == "SysregFields") {
+$1 == "SignedEnum" && (block_current() == "Sysreg" || block_current() == "SysregFields") {
block_push("Enum")
expect_fields(3)
@@ -285,7 +287,7 @@ END {
next
}
-/^UnsignedEnum/ && (block_current() == "Sysreg" || block_current() == "SysregFields") {
+$1 == "UnsignedEnum" && (block_current() == "Sysreg" || block_current() == "SysregFields") {
block_push("Enum")
expect_fields(3)
@@ -298,7 +300,7 @@ END {
next
}
-/^Enum/ && (block_current() == "Sysreg" || block_current() == "SysregFields") {
+$1 == "Enum" && (block_current() == "Sysreg" || block_current() == "SysregFields") {
block_push("Enum")
expect_fields(3)
@@ -310,7 +312,8 @@ END {
next
}
-/^EndEnum/ && block_current() == "Enum" {
+$1 == "EndEnum" && block_current() == "Enum" {
+ expect_fields(1)
field = null
msb = null
diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
index 69a829912a05..0765b3a8d6d6 100644
--- a/arch/arm64/tools/syscall_32.tbl
+++ b/arch/arm64/tools/syscall_32.tbl
@@ -478,3 +478,4 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 762ee084b37c..f9476848a2ed 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1664,6 +1664,7 @@ EndEnum
UnsignedEnum 59:56 FGT
0b0000 NI
0b0001 IMP
+ 0b0010 FGT2
EndEnum
Res0 55:48
UnsignedEnum 47:44 EXS
@@ -1725,6 +1726,7 @@ Enum 3:0 PARANGE
0b0100 44
0b0101 48
0b0110 52
+ 0b0111 56
EndEnum
EndSysreg
@@ -2074,7 +2076,7 @@ EndEnum
Res0 4:2
Field 1 ExTRE
Field 0 E0TRE
-EndSysregFields
+EndSysreg
Sysreg SMPRI_EL1 3 0 1 2 4
Res0 63:4
@@ -2641,6 +2643,101 @@ Field 0 E0HTRE
EndSysreg
+Sysreg HDFGRTR2_EL2 3 4 3 1 0
+Res0 63:25
+Field 24 nPMBMAR_EL1
+Field 23 nMDSTEPOP_EL1
+Field 22 nTRBMPAM_EL1
+Res0 21
+Field 20 nTRCITECR_EL1
+Field 19 nPMSDSFR_EL1
+Field 18 nSPMDEVAFF_EL1
+Field 17 nSPMID
+Field 16 nSPMSCR_EL1
+Field 15 nSPMACCESSR_EL1
+Field 14 nSPMCR_EL0
+Field 13 nSPMOVS
+Field 12 nSPMINTEN
+Field 11 nSPMCNTEN
+Field 10 nSPMSELR_EL0
+Field 9 nSPMEVTYPERn_EL0
+Field 8 nSPMEVCNTRn_EL0
+Field 7 nPMSSCR_EL1
+Field 6 nPMSSDATA
+Field 5 nMDSELR_EL1
+Field 4 nPMUACR_EL1
+Field 3 nPMICFILTR_EL0
+Field 2 nPMICNTR_EL0
+Field 1 nPMIAR_EL1
+Field 0 nPMECR_EL1
+EndSysreg
+
+Sysreg HDFGWTR2_EL2 3 4 3 1 1
+Res0 63:25
+Field 24 nPMBMAR_EL1
+Field 23 nMDSTEPOP_EL1
+Field 22 nTRBMPAM_EL1
+Field 21 nPMZR_EL0
+Field 20 nTRCITECR_EL1
+Field 19 nPMSDSFR_EL1
+Res0 18:17
+Field 16 nSPMSCR_EL1
+Field 15 nSPMACCESSR_EL1
+Field 14 nSPMCR_EL0
+Field 13 nSPMOVS
+Field 12 nSPMINTEN
+Field 11 nSPMCNTEN
+Field 10 nSPMSELR_EL0
+Field 9 nSPMEVTYPERn_EL0
+Field 8 nSPMEVCNTRn_EL0
+Field 7 nPMSSCR_EL1
+Res0 6
+Field 5 nMDSELR_EL1
+Field 4 nPMUACR_EL1
+Field 3 nPMICFILTR_EL0
+Field 2 nPMICNTR_EL0
+Field 1 nPMIAR_EL1
+Field 0 nPMECR_EL1
+EndSysreg
+
+Sysreg HFGRTR2_EL2 3 4 3 1 2
+Res0 63:15
+Field 14 nACTLRALIAS_EL1
+Field 13 nACTLRMASK_EL1
+Field 12 nTCR2ALIAS_EL1
+Field 11 nTCRALIAS_EL1
+Field 10 nSCTLRALIAS2_EL1
+Field 9 nSCTLRALIAS_EL1
+Field 8 nCPACRALIAS_EL1
+Field 7 nTCR2MASK_EL1
+Field 6 nTCRMASK_EL1
+Field 5 nSCTLR2MASK_EL1
+Field 4 nSCTLRMASK_EL1
+Field 3 nCPACRMASK_EL1
+Field 2 nRCWSMASK_EL1
+Field 1 nERXGSR_EL1
+Field 0 nPFAR_EL1
+EndSysreg
+
+Sysreg HFGWTR2_EL2 3 4 3 1 3
+Res0 63:15
+Field 14 nACTLRALIAS_EL1
+Field 13 nACTLRMASK_EL1
+Field 12 nTCR2ALIAS_EL1
+Field 11 nTCRALIAS_EL1
+Field 10 nSCTLRALIAS2_EL1
+Field 9 nSCTLRALIAS_EL1
+Field 8 nCPACRALIAS_EL1
+Field 7 nTCR2MASK_EL1
+Field 6 nTCRMASK_EL1
+Field 5 nSCTLR2MASK_EL1
+Field 4 nSCTLRMASK_EL1
+Field 3 nCPACRMASK_EL1
+Field 2 nRCWSMASK_EL1
+Res0 1
+Field 0 nPFAR_EL1
+EndSysreg
+
Sysreg HDFGRTR_EL2 3 4 3 1 4
Field 63 PMBIDR_EL1
Field 62 nPMSNEVFR_EL1
@@ -2813,6 +2910,12 @@ Field 1 AMEVCNTR00_EL0
Field 0 AMCNTEN0
EndSysreg
+Sysreg HFGITR2_EL2 3 4 3 1 7
+Res0 63:2
+Field 1 nDCCIVAPS
+Field 0 TSBCSYNC
+EndSysreg
+
Sysreg ZCR_EL2 3 4 1 2 0
Fields ZCR_ELx
EndSysreg
@@ -3035,6 +3138,54 @@ Field 31:16 PhyPARTID29
Field 15:0 PhyPARTID28
EndSysreg
+Sysreg ICH_HCR_EL2 3 4 12 11 0
+Res0 63:32
+Field 31:27 EOIcount
+Res0 26:16
+Field 15 DVIM
+Field 14 TDIR
+Field 13 TSEI
+Field 12 TALL1
+Field 11 TALL0
+Field 10 TC
+Res0 9
+Field 8 vSGIEOICount
+Field 7 VGrp1DIE
+Field 6 VGrp1EIE
+Field 5 VGrp0DIE
+Field 4 VGrp0EIE
+Field 3 NPIE
+Field 2 LRENPIE
+Field 1 UIE
+Field 0 En
+EndSysreg
+
+Sysreg ICH_VTR_EL2 3 4 12 11 1
+Res0 63:32
+Field 31:29 PRIbits
+Field 28:26 PREbits
+Field 25:23 IDbits
+Field 22 SEIS
+Field 21 A3V
+Field 20 nV4
+Field 19 TDS
+Field 18 DVIM
+Res0 17:5
+Field 4:0 ListRegs
+EndSysreg
+
+Sysreg ICH_MISR_EL2 3 4 12 11 2
+Res0 63:8
+Field 7 VGrp1D
+Field 6 VGrp1E
+Field 5 VGrp0D
+Field 4 VGrp0E
+Field 3 NP
+Field 2 LRENP
+Field 1 U
+Field 0 EOI
+EndSysreg
+
Sysreg CONTEXTIDR_EL2 3 4 13 0 1
Fields CONTEXTIDR_ELx
EndSysreg
diff --git a/arch/csky/kernel/vdso/Makefile b/arch/csky/kernel/vdso/Makefile
index 069ef0b17fe5..a3042287a070 100644
--- a/arch/csky/kernel/vdso/Makefile
+++ b/arch/csky/kernel/vdso/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
# Include the generic Makefile to check the built vdso.
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
# Symbols present in the vdso
vdso-syms += rt_sigreturn
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 2b8bd27a852f..687502917ae2 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -30,6 +30,7 @@ config LOONGARCH
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SET_DIRECT_MAP
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
+ select ARCH_HAS_VDSO_ARCH_DATA
select ARCH_INLINE_READ_LOCK if !PREEMPTION
select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION
select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION
@@ -106,6 +107,7 @@ config LOONGARCH
select GENERIC_SCHED_CLOCK
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
+ select GENERIC_VDSO_DATA_STORE
select GENERIC_VDSO_TIME_NS
select GPIOLIB
select HAS_IOPORT
@@ -291,6 +293,9 @@ config AS_HAS_LBT_EXTENSION
config AS_HAS_LVZ_EXTENSION
def_bool $(as-instr,hvcl 0)
+config CC_HAS_ANNOTATE_TABLEJUMP
+ def_bool $(cc-option,-mannotate-tablejump)
+
menu "Kernel type and options"
source "kernel/Kconfig.hz"
diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index 567bd122a9ee..0304eabbe606 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -101,7 +101,11 @@ KBUILD_AFLAGS += $(call cc-option,-mthin-add-sub) $(call cc-option,-Wa$(comma)
KBUILD_CFLAGS += $(call cc-option,-mthin-add-sub) $(call cc-option,-Wa$(comma)-mthin-add-sub)
ifdef CONFIG_OBJTOOL
-KBUILD_CFLAGS += -fno-jump-tables
+ifdef CONFIG_CC_HAS_ANNOTATE_TABLEJUMP
+KBUILD_CFLAGS += -mannotate-tablejump
+else
+KBUILD_CFLAGS += -fno-jump-tables # keep compatibility with older compilers
+endif
endif
KBUILD_RUSTFLAGS += --target=loongarch64-unknown-none-softfloat -Ccode-model=small
diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig
index 73c77500ac46..3c240afe5aed 100644
--- a/arch/loongarch/configs/loongson3_defconfig
+++ b/arch/loongarch/configs/loongson3_defconfig
@@ -981,7 +981,6 @@ CONFIG_MINIX_FS=m
CONFIG_ROMFS_FS=m
CONFIG_PSTORE=m
CONFIG_PSTORE_COMPRESS=y
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_EROFS_FS_ZIP_LZMA=y
diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index 590982cd986e..f457c2662e2f 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -12,6 +12,7 @@
#include <linux/kvm.h>
#include <linux/kvm_types.h>
#include <linux/mutex.h>
+#include <linux/perf_event.h>
#include <linux/spinlock.h>
#include <linux/threads.h>
#include <linux/types.h>
@@ -176,6 +177,9 @@ struct kvm_vcpu_arch {
/* Pointers stored here for easy accessing from assembly code */
int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ /* GPA (=HVA) of PGD for secondary mmu */
+ unsigned long kvm_pgd;
+
/* Host registers preserved across guest mode execution */
unsigned long host_sp;
unsigned long host_tp;
@@ -289,6 +293,8 @@ static inline int kvm_get_pmu_num(struct kvm_vcpu_arch *arch)
return (arch->cpucfg[6] & CPUCFG6_PMNUM) >> CPUCFG6_PMNUM_SHIFT;
}
+bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu);
+
/* Debug: dump vcpu state */
int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
@@ -320,7 +326,6 @@ static inline bool kvm_is_ifetch_fault(struct kvm_vcpu_arch *arch)
/* Misc */
static inline void kvm_arch_hardware_unsetup(void) {}
-static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
diff --git a/arch/loongarch/include/asm/vdso.h b/arch/loongarch/include/asm/vdso.h
index d3ba35eb23e7..f72ec79e2dde 100644
--- a/arch/loongarch/include/asm/vdso.h
+++ b/arch/loongarch/include/asm/vdso.h
@@ -31,7 +31,6 @@ struct loongarch_vdso_info {
unsigned long size;
unsigned long offset_sigreturn;
struct vm_special_mapping code_mapping;
- struct vm_special_mapping data_mapping;
};
extern struct loongarch_vdso_info vdso_info;
diff --git a/arch/loongarch/include/asm/vdso/arch_data.h b/arch/loongarch/include/asm/vdso/arch_data.h
new file mode 100644
index 000000000000..322d0a5f1c84
--- /dev/null
+++ b/arch/loongarch/include/asm/vdso/arch_data.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Author: Huacai Chen <chenhuacai@loongson.cn>
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ */
+
+#ifndef _VDSO_ARCH_DATA_H
+#define _VDSO_ARCH_DATA_H
+
+#ifndef __ASSEMBLY__
+
+#include <asm/asm.h>
+#include <asm/vdso.h>
+
+struct vdso_pcpu_data {
+ u32 node;
+} ____cacheline_aligned_in_smp;
+
+struct vdso_arch_data {
+ struct vdso_pcpu_data pdata[NR_CPUS];
+};
+
+#endif /* __ASSEMBLY__ */
+
+#endif
diff --git a/arch/loongarch/include/asm/vdso/getrandom.h b/arch/loongarch/include/asm/vdso/getrandom.h
index e80f3c4ac748..48c43f55b039 100644
--- a/arch/loongarch/include/asm/vdso/getrandom.h
+++ b/arch/loongarch/include/asm/vdso/getrandom.h
@@ -28,11 +28,6 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns
return ret;
}
-static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
-{
- return &_loongarch_data.rng_data;
-}
-
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/loongarch/include/asm/vdso/gettimeofday.h b/arch/loongarch/include/asm/vdso/gettimeofday.h
index 7eb3f041af76..88cfcf133116 100644
--- a/arch/loongarch/include/asm/vdso/gettimeofday.h
+++ b/arch/loongarch/include/asm/vdso/gettimeofday.h
@@ -72,7 +72,7 @@ static __always_inline int clock_getres_fallback(
}
static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
uint64_t count;
@@ -89,18 +89,6 @@ static inline bool loongarch_vdso_hres_capable(void)
}
#define __arch_vdso_hres_capable loongarch_vdso_hres_capable
-static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
-{
- return _vdso_data;
-}
-
-#ifdef CONFIG_TIME_NS
-static __always_inline
-const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
-{
- return _timens_data;
-}
-#endif
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/loongarch/include/asm/vdso/vdso.h b/arch/loongarch/include/asm/vdso/vdso.h
index 1c183a9b2115..50c65fb29daf 100644
--- a/arch/loongarch/include/asm/vdso/vdso.h
+++ b/arch/loongarch/include/asm/vdso/vdso.h
@@ -12,43 +12,9 @@
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/vdso.h>
+#include <vdso/datapage.h>
-struct vdso_pcpu_data {
- u32 node;
-} ____cacheline_aligned_in_smp;
-
-struct loongarch_vdso_data {
- struct vdso_pcpu_data pdata[NR_CPUS];
- struct vdso_rng_data rng_data;
-};
-
-/*
- * The layout of vvar:
- *
- * high
- * +---------------------+--------------------------+
- * | loongarch vdso data | LOONGARCH_VDSO_DATA_SIZE |
- * +---------------------+--------------------------+
- * | time-ns vdso data | PAGE_SIZE |
- * +---------------------+--------------------------+
- * | generic vdso data | PAGE_SIZE |
- * +---------------------+--------------------------+
- * low
- */
-#define LOONGARCH_VDSO_DATA_SIZE PAGE_ALIGN(sizeof(struct loongarch_vdso_data))
-#define LOONGARCH_VDSO_DATA_PAGES (LOONGARCH_VDSO_DATA_SIZE >> PAGE_SHIFT)
-
-enum vvar_pages {
- VVAR_GENERIC_PAGE_OFFSET,
- VVAR_TIMENS_PAGE_OFFSET,
- VVAR_LOONGARCH_PAGES_START,
- VVAR_LOONGARCH_PAGES_END = VVAR_LOONGARCH_PAGES_START + LOONGARCH_VDSO_DATA_PAGES - 1,
- VVAR_NR_PAGES,
-};
-
-#define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT)
-
-extern struct loongarch_vdso_data _loongarch_data __attribute__((visibility("hidden")));
+#define VVAR_SIZE (VDSO_NR_PAGES << PAGE_SHIFT)
#endif /* __ASSEMBLY__ */
diff --git a/arch/loongarch/include/asm/vdso/vsyscall.h b/arch/loongarch/include/asm/vdso/vsyscall.h
index 8987e951d0a9..1140b54b4bc8 100644
--- a/arch/loongarch/include/asm/vdso/vsyscall.h
+++ b/arch/loongarch/include/asm/vdso/vsyscall.h
@@ -6,23 +6,6 @@
#include <vdso/datapage.h>
-extern struct vdso_data *vdso_data;
-extern struct vdso_rng_data *vdso_rng_data;
-
-static __always_inline
-struct vdso_data *__loongarch_get_k_vdso_data(void)
-{
- return vdso_data;
-}
-#define __arch_get_k_vdso_data __loongarch_get_k_vdso_data
-
-static __always_inline
-struct vdso_rng_data *__loongarch_get_k_vdso_rng_data(void)
-{
- return vdso_rng_data;
-}
-#define __arch_get_k_vdso_rng_data __loongarch_get_k_vdso_rng_data
-
/* The asm-generic header needs to be included after the definitions above */
#include <asm-generic/vdso/vsyscall.h>
diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c
index 8be1c38ad8eb..db1e4bb26b6a 100644
--- a/arch/loongarch/kernel/asm-offsets.c
+++ b/arch/loongarch/kernel/asm-offsets.c
@@ -296,6 +296,7 @@ static void __used output_kvm_defines(void)
OFFSET(KVM_ARCH_HSP, kvm_vcpu_arch, host_sp);
OFFSET(KVM_ARCH_HTP, kvm_vcpu_arch, host_tp);
OFFSET(KVM_ARCH_HPGD, kvm_vcpu_arch, host_pgd);
+ OFFSET(KVM_ARCH_KVMPGD, kvm_vcpu_arch, kvm_pgd);
OFFSET(KVM_ARCH_HANDLE_EXIT, kvm_vcpu_arch, handle_exit);
OFFSET(KVM_ARCH_HEENTRY, kvm_vcpu_arch, host_eentry);
OFFSET(KVM_ARCH_GEENTRY, kvm_vcpu_arch, guest_eentry);
@@ -315,6 +316,6 @@ static void __used output_vdso_defines(void)
{
COMMENT("LoongArch vDSO offsets.");
- DEFINE(__VVAR_PAGES, VVAR_NR_PAGES);
+ DEFINE(__VDSO_PAGES, VDSO_NR_PAGES);
BLANK();
}
diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c
index 05e5fbac102a..10cf1608c7b3 100644
--- a/arch/loongarch/kernel/vdso.c
+++ b/arch/loongarch/kernel/vdso.c
@@ -14,7 +14,7 @@
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
-#include <linux/time_namespace.h>
+#include <linux/vdso_datastore.h>
#include <asm/page.h>
#include <asm/vdso.h>
@@ -25,18 +25,6 @@
extern char vdso_start[], vdso_end[];
-/* Kernel-provided data used by the VDSO. */
-static union vdso_data_store generic_vdso_data __page_aligned_data;
-
-static union {
- u8 page[LOONGARCH_VDSO_DATA_SIZE];
- struct loongarch_vdso_data vdata;
-} loongarch_vdso_data __page_aligned_data;
-
-struct vdso_data *vdso_data = generic_vdso_data.data;
-struct vdso_pcpu_data *vdso_pdata = loongarch_vdso_data.vdata.pdata;
-struct vdso_rng_data *vdso_rng_data = &loongarch_vdso_data.vdata.rng_data;
-
static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
{
current->mm->context.vdso = (void *)(new_vma->vm_start);
@@ -44,53 +32,12 @@ static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struc
return 0;
}
-static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- unsigned long pfn;
- struct page *timens_page = find_timens_vvar_page(vma);
-
- switch (vmf->pgoff) {
- case VVAR_GENERIC_PAGE_OFFSET:
- if (!timens_page)
- pfn = sym_to_pfn(vdso_data);
- else
- pfn = page_to_pfn(timens_page);
- break;
-#ifdef CONFIG_TIME_NS
- case VVAR_TIMENS_PAGE_OFFSET:
- /*
- * If a task belongs to a time namespace then a namespace specific
- * VVAR is mapped with the VVAR_GENERIC_PAGE_OFFSET and the real
- * VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET offset.
- * See also the comment near timens_setup_vdso_data().
- */
- if (!timens_page)
- return VM_FAULT_SIGBUS;
- else
- pfn = sym_to_pfn(vdso_data);
- break;
-#endif /* CONFIG_TIME_NS */
- case VVAR_LOONGARCH_PAGES_START ... VVAR_LOONGARCH_PAGES_END:
- pfn = sym_to_pfn(&loongarch_vdso_data) + vmf->pgoff - VVAR_LOONGARCH_PAGES_START;
- break;
- default:
- return VM_FAULT_SIGBUS;
- }
-
- return vmf_insert_pfn(vma, vmf->address, pfn);
-}
-
struct loongarch_vdso_info vdso_info = {
.vdso = vdso_start,
.code_mapping = {
.name = "[vdso]",
.mremap = vdso_mremap,
},
- .data_mapping = {
- .name = "[vvar]",
- .fault = vvar_fault,
- },
.offset_sigreturn = vdso_offset_sigreturn,
};
@@ -101,7 +48,7 @@ static int __init init_vdso(void)
BUG_ON(!PAGE_ALIGNED(vdso_info.vdso));
for_each_possible_cpu(cpu)
- vdso_pdata[cpu].node = cpu_to_node(cpu);
+ vdso_k_arch_data->pdata[cpu].node = cpu_to_node(cpu);
vdso_info.size = PAGE_ALIGN(vdso_end - vdso_start);
vdso_info.code_mapping.pages =
@@ -115,37 +62,6 @@ static int __init init_vdso(void)
}
subsys_initcall(init_vdso);
-#ifdef CONFIG_TIME_NS
-struct vdso_data *arch_get_vdso_data(void *vvar_page)
-{
- return (struct vdso_data *)(vvar_page);
-}
-
-/*
- * The vvar mapping contains data for a specific time namespace, so when a
- * task changes namespace we must unmap its vvar data for the old namespace.
- * Subsequent faults will map in data for the new namespace.
- *
- * For more details see timens_setup_vdso_data().
- */
-int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
-{
- struct mm_struct *mm = task->mm;
- struct vm_area_struct *vma;
-
- VMA_ITERATOR(vmi, mm, 0);
-
- mmap_read_lock(mm);
- for_each_vma(vmi, vma) {
- if (vma_is_special_mapping(vma, &vdso_info.data_mapping))
- zap_vma_pages(vma);
- }
- mmap_read_unlock(mm);
-
- return 0;
-}
-#endif
-
static unsigned long vdso_base(void)
{
unsigned long base = STACK_TOP;
@@ -181,9 +97,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
goto out;
}
- vma = _install_special_mapping(mm, data_addr, VVAR_SIZE,
- VM_READ | VM_MAYREAD | VM_PFNMAP,
- &info->data_mapping);
+ vma = vdso_install_vvar_mapping(mm, data_addr);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig
index 97a811077ac3..40eea6da7c25 100644
--- a/arch/loongarch/kvm/Kconfig
+++ b/arch/loongarch/kvm/Kconfig
@@ -33,6 +33,7 @@ config KVM
select KVM_MMIO
select KVM_XFER_TO_GUEST_WORK
select SCHED_INFO
+ select GUEST_PERF_EVENTS if PERF_EVENTS
help
Support hosting virtualized guest machines using
hardware virtualization extensions. You will need
diff --git a/arch/loongarch/kvm/Makefile b/arch/loongarch/kvm/Makefile
index 3a01292f71cc..f4c8e35c216a 100644
--- a/arch/loongarch/kvm/Makefile
+++ b/arch/loongarch/kvm/Makefile
@@ -3,8 +3,6 @@
# Makefile for LoongArch KVM support
#
-ccflags-y += -I $(src)
-
include $(srctree)/virt/kvm/Makefile.kvm
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c
index b6864d6e5ec8..d165cd38c6bb 100644
--- a/arch/loongarch/kvm/main.c
+++ b/arch/loongarch/kvm/main.c
@@ -394,6 +394,7 @@ static int kvm_loongarch_env_init(void)
}
kvm_init_gcsr_flag();
+ kvm_register_perf_callbacks(NULL);
/* Register LoongArch IPI interrupt controller interface. */
ret = kvm_loongarch_register_ipi_device();
@@ -425,6 +426,8 @@ static void kvm_loongarch_env_exit(void)
}
kfree(kvm_loongarch_ops);
}
+
+ kvm_unregister_perf_callbacks();
}
static int kvm_loongarch_init(void)
diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S
index 1be185e94807..f1768b7a6194 100644
--- a/arch/loongarch/kvm/switch.S
+++ b/arch/loongarch/kvm/switch.S
@@ -60,16 +60,8 @@
ld.d t0, a2, KVM_ARCH_GPC
csrwr t0, LOONGARCH_CSR_ERA
- /* Save host PGDL */
- csrrd t0, LOONGARCH_CSR_PGDL
- st.d t0, a2, KVM_ARCH_HPGD
-
- /* Switch to kvm */
- ld.d t1, a2, KVM_VCPU_KVM - KVM_VCPU_ARCH
-
- /* Load guest PGDL */
- li.w t0, KVM_GPGD
- ldx.d t0, t1, t0
+ /* Load PGD for KVM hypervisor */
+ ld.d t0, a2, KVM_ARCH_KVMPGD
csrwr t0, LOONGARCH_CSR_PGDL
/* Mix GID and RID */
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 9e1a9b4aa4c6..8e427b379661 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -361,6 +361,34 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
+ unsigned long val;
+
+ preempt_disable();
+ val = gcsr_read(LOONGARCH_CSR_CRMD);
+ preempt_enable();
+
+ return (val & CSR_PRMD_PPLV) == PLV_KERN;
+}
+
+#ifdef CONFIG_GUEST_PERF_EVENTS
+unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.pc;
+}
+
+/*
+ * Returns true if a Performance Monitoring Interrupt (PMI), a.k.a. perf event,
+ * arrived in guest context. For LoongArch64, if PMU is not passthrough to VM,
+ * any event that arrives while a vCPU is loaded is considered to be "in guest".
+ */
+bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu)
+{
+ return (vcpu && !(vcpu->arch.aux_inuse & KVM_LARCH_PMU));
+}
+#endif
+
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
return false;
}
@@ -1459,8 +1487,17 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.vpid = 0;
vcpu->arch.flush_gpa = INVALID_GPA;
- hrtimer_init(&vcpu->arch.swtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- vcpu->arch.swtimer.function = kvm_swtimer_wakeup;
+ hrtimer_setup(&vcpu->arch.swtimer, kvm_swtimer_wakeup, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED_HARD);
+
+ /* Get GPA (=HVA) of PGD for kvm hypervisor */
+ vcpu->arch.kvm_pgd = __pa(vcpu->kvm->arch.pgd);
+
+ /*
+ * Get PGD for primary mmu, virtual address is used since there is
+ * memory access after loading from CSR_PGD in tlb exception fast path.
+ */
+ vcpu->arch.host_pgd = (unsigned long)vcpu->kvm->mm->pgd;
vcpu->arch.handle_exit = kvm_handle_exit;
vcpu->arch.guest_eentry = (unsigned long)kvm_loongarch_ops->exc_entry;
diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile
index fdde1bcd4e26..1c26147aff70 100644
--- a/arch/loongarch/vdso/Makefile
+++ b/arch/loongarch/vdso/Makefile
@@ -2,7 +2,7 @@
# Objects to go into the VDSO.
# Include the generic Makefile to check the built vdso.
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o vgetrandom.o \
vgetrandom-chacha.o sigreturn.o
diff --git a/arch/loongarch/vdso/vdso.lds.S b/arch/loongarch/vdso/vdso.lds.S
index 160cfaef2de4..8ff986499947 100644
--- a/arch/loongarch/vdso/vdso.lds.S
+++ b/arch/loongarch/vdso/vdso.lds.S
@@ -5,6 +5,7 @@
*/
#include <asm/page.h>
#include <generated/asm-offsets.h>
+#include <vdso/datapage.h>
OUTPUT_FORMAT("elf64-loongarch", "elf64-loongarch", "elf64-loongarch")
@@ -12,11 +13,8 @@ OUTPUT_ARCH(loongarch)
SECTIONS
{
- PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
-#ifdef CONFIG_TIME_NS
- PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
-#endif
- PROVIDE(_loongarch_data = _vdso_data + 2 * PAGE_SIZE);
+ VDSO_VVAR_SYMS
+
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
diff --git a/arch/loongarch/vdso/vgetcpu.c b/arch/loongarch/vdso/vgetcpu.c
index 0db51258b2a7..5301cd9d0f83 100644
--- a/arch/loongarch/vdso/vgetcpu.c
+++ b/arch/loongarch/vdso/vgetcpu.c
@@ -19,27 +19,19 @@ static __always_inline int read_cpu_id(void)
return cpu_id;
}
-static __always_inline const struct vdso_pcpu_data *get_pcpu_data(void)
-{
- return _loongarch_data.pdata;
-}
-
extern
int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused);
int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused)
{
int cpu_id;
- const struct vdso_pcpu_data *data;
cpu_id = read_cpu_id();
if (cpu)
*cpu = cpu_id;
- if (node) {
- data = get_pcpu_data();
- *node = data[cpu_id].node;
- }
+ if (node)
+ *node = vdso_u_arch_data.pdata[cpu_id].node;
return 0;
}
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index dbf2ea561c85..31ecb8b7b9f1 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -299,7 +299,6 @@ CONFIG_NET_IFE=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
-CONFIG_DM_KUNIT_TEST=m
CONFIG_CONNECTOR=m
CONFIG_PARPORT=m
CONFIG_PARPORT_AMIGA=m
@@ -336,6 +335,7 @@ CONFIG_ATA=y
CONFIG_PATA_GAYLE=y
CONFIG_PATA_BUDDHA=y
CONFIG_MD=y
+CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -486,7 +486,6 @@ CONFIG_OMFS_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_QNX6FS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_NFS_FS=y
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index b0fd199cc0a4..1f57514624d5 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -295,7 +295,6 @@ CONFIG_NET_IFE=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
-CONFIG_DM_KUNIT_TEST=m
CONFIG_CONNECTOR=m
CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
@@ -316,6 +315,7 @@ CONFIG_SCSI_SAS_ATTRS=m
CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_MD=y
+CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -443,7 +443,6 @@ CONFIG_OMFS_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_QNX6FS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_NFS_FS=y
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index bb5b2d3b6c10..02db7a48e57e 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -302,7 +302,6 @@ CONFIG_NET_IFE=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
-CONFIG_DM_KUNIT_TEST=m
CONFIG_CONNECTOR=m
CONFIG_PARPORT=m
CONFIG_PARPORT_ATARI=m
@@ -331,6 +330,7 @@ CONFIG_ATA=y
# CONFIG_ATA_BMDMA is not set
CONFIG_PATA_FALCON=y
CONFIG_MD=y
+CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -463,7 +463,6 @@ CONFIG_OMFS_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_QNX6FS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_NFS_FS=y
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 8315a13bab73..f0e673cb17eb 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -292,7 +292,6 @@ CONFIG_NET_IFE=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
-CONFIG_DM_KUNIT_TEST=m
CONFIG_CONNECTOR=m
CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
@@ -314,6 +313,7 @@ CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_BVME6000_SCSI=y
CONFIG_MD=y
+CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -435,7 +435,6 @@ CONFIG_OMFS_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_QNX6FS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_NFS_FS=y
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 350370657e5f..e8ca5a50b86d 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -294,7 +294,6 @@ CONFIG_NET_IFE=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
-CONFIG_DM_KUNIT_TEST=m
CONFIG_CONNECTOR=m
CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
@@ -315,6 +314,7 @@ CONFIG_SCSI_SAS_ATTRS=m
CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_MD=y
+CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -445,7 +445,6 @@ CONFIG_OMFS_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_QNX6FS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_NFS_FS=y
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index f942b4755702..b3a270441bb1 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -293,7 +293,6 @@ CONFIG_NET_IFE=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
-CONFIG_DM_KUNIT_TEST=m
CONFIG_CONNECTOR=m
CONFIG_BLK_DEV_SWIM=m
CONFIG_ZRAM=m
@@ -320,6 +319,7 @@ CONFIG_ATA=y
# CONFIG_ATA_BMDMA is not set
CONFIG_PATA_PLATFORM=y
CONFIG_MD=y
+CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -462,7 +462,6 @@ CONFIG_OMFS_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_QNX6FS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_NFS_FS=y
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index b1eaad02efab..d215dba006ce 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -313,7 +313,6 @@ CONFIG_NET_IFE=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
-CONFIG_DM_KUNIT_TEST=m
CONFIG_CONNECTOR=m
CONFIG_PARPORT=m
CONFIG_PARPORT_PC=m
@@ -363,6 +362,7 @@ CONFIG_PATA_GAYLE=y
CONFIG_PATA_BUDDHA=y
CONFIG_PATA_PLATFORM=y
CONFIG_MD=y
+CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -549,7 +549,6 @@ CONFIG_OMFS_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_QNX6FS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_NFS_FS=y
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index 6309a4442bb3..a888ed93ff82 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -291,7 +291,6 @@ CONFIG_NET_IFE=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
-CONFIG_DM_KUNIT_TEST=m
CONFIG_CONNECTOR=m
CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
@@ -313,6 +312,7 @@ CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_MVME147_SCSI=y
CONFIG_MD=y
+CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -435,7 +435,6 @@ CONFIG_OMFS_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_QNX6FS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_NFS_FS=y
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 3feb0731f814..b481782375f6 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -292,7 +292,6 @@ CONFIG_NET_IFE=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
-CONFIG_DM_KUNIT_TEST=m
CONFIG_CONNECTOR=m
CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
@@ -314,6 +313,7 @@ CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_MVME16x_SCSI=y
CONFIG_MD=y
+CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -436,7 +436,6 @@ CONFIG_OMFS_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_QNX6FS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_NFS_FS=y
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index ea04b1b0da7d..6eba743d8eb5 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -293,7 +293,6 @@ CONFIG_NET_IFE=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
-CONFIG_DM_KUNIT_TEST=m
CONFIG_CONNECTOR=m
CONFIG_PARPORT=m
CONFIG_PARPORT_PC=m
@@ -320,6 +319,7 @@ CONFIG_ATA=y
# CONFIG_ATA_BMDMA is not set
CONFIG_PATA_FALCON=y
CONFIG_MD=y
+CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -452,7 +452,6 @@ CONFIG_OMFS_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_QNX6FS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_NFS_FS=y
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index f52d9af92153..9bdbb418ffa8 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -288,7 +288,6 @@ CONFIG_NET_IFE=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
-CONFIG_DM_KUNIT_TEST=m
CONFIG_CONNECTOR=m
CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
@@ -310,6 +309,7 @@ CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_SUN3_SCSI=y
CONFIG_MD=y
+CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -433,7 +433,6 @@ CONFIG_OMFS_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_QNX6FS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_NFS_FS=y
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index f348447824da..e1cf20fa5343 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -289,7 +289,6 @@ CONFIG_NET_IFE=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
-CONFIG_DM_KUNIT_TEST=m
CONFIG_CONNECTOR=m
CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
@@ -311,6 +310,7 @@ CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_SUN3X_ESP=y
CONFIG_MD=y
+CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -433,7 +433,6 @@ CONFIG_OMFS_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_QNX6FS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_EROFS_FS=m
CONFIG_NFS_FS=y
diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h
index 8f2676c3a988..3c43c09d4489 100644
--- a/arch/m68k/include/asm/processor.h
+++ b/arch/m68k/include/asm/processor.h
@@ -95,10 +95,24 @@ static inline void set_fc(unsigned long val)
"movec %0,%/dfc\n\t"
: /* no outputs */ : "r" (val) : "memory");
}
+
+static inline unsigned long get_fc(void)
+{
+ unsigned long val;
+
+ __asm__ ("movec %/dfc,%0" : "=r" (val) : );
+
+ return val;
+}
#else
static inline void set_fc(unsigned long val)
{
}
+
+static inline unsigned long get_fc(void)
+{
+ return USER_DATA;
+}
#endif /* CONFIG_CPU_HAS_ADDRESS_SPACES */
struct thread_struct {
diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c
index 15c1a595a1de..3bc28bc512ac 100644
--- a/arch/m68k/kernel/setup_mm.c
+++ b/arch/m68k/kernel/setup_mm.c
@@ -147,8 +147,7 @@ static void __init m68k_parse_bootinfo(const struct bi_record *record)
break;
case BI_COMMAND_LINE:
- strscpy(m68k_command_line, data,
- sizeof(m68k_command_line));
+ strscpy(m68k_command_line, data);
break;
case BI_RNG_SEED: {
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index f5ed71f1910d..9fe47112c586 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -466,3 +466,4 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr
diff --git a/arch/m68k/sun3/mmu_emu.c b/arch/m68k/sun3/mmu_emu.c
index 119bd32efcfb..b39fc3717d8e 100644
--- a/arch/m68k/sun3/mmu_emu.c
+++ b/arch/m68k/sun3/mmu_emu.c
@@ -17,6 +17,7 @@
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/sched/mm.h>
+#include <linux/string_choices.h>
#include <asm/setup.h>
#include <asm/traps.h>
@@ -370,8 +371,8 @@ int mmu_emu_handle_fault (unsigned long vaddr, int read_flag, int kernel_fault)
}
#ifdef DEBUG_MMU_EMU
- pr_info("%s: vaddr=%lx type=%s crp=%p\n", __func__, vaddr,
- read_flag ? "read" : "write", crp);
+ pr_info("%s: vaddr=%lx type=%s crp=%px\n", __func__, vaddr,
+ str_read_write(read_flag), crp);
#endif
segment = (vaddr >> SUN3_PMEG_SIZE_BITS) & 0x7FF;
@@ -417,7 +418,7 @@ int mmu_emu_handle_fault (unsigned long vaddr, int read_flag, int kernel_fault)
pte_val (*pte) |= SUN3_PAGE_ACCESSED;
#ifdef DEBUG_MMU_EMU
- pr_info("seg:%ld crp:%p ->", get_fs().seg, crp);
+ pr_info("seg:%ld crp:%px ->", get_fc(), crp);
print_pte_vaddr (vaddr);
pr_cont("\n");
#endif
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 680f568b77f2..7b6e97828e55 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -472,3 +472,4 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 1924f2d83932..b51168e319ea 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -51,6 +51,7 @@ config MIPS
select GENERIC_SMP_IDLE_THREAD
select GENERIC_IDLE_POLL_SETUP
select GENERIC_TIME_VSYSCALL
+ select GENERIC_VDSO_DATA_STORE
select GUP_GET_PXX_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT
select HAS_IOPORT if !NO_IOPORT_MAP || ISA
select HAVE_ARCH_COMPILER_H
diff --git a/arch/mips/configs/malta_defconfig b/arch/mips/configs/malta_defconfig
index 4390d30206d9..e308b82c094a 100644
--- a/arch/mips/configs/malta_defconfig
+++ b/arch/mips/configs/malta_defconfig
@@ -347,7 +347,6 @@ CONFIG_CRAMFS=m
CONFIG_VXFS_FS=m
CONFIG_MINIX_FS=m
CONFIG_ROMFS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_NFS_FS=y
CONFIG_ROOT_NFS=y
diff --git a/arch/mips/configs/malta_kvm_defconfig b/arch/mips/configs/malta_kvm_defconfig
index d63d8be8cb50..fa5b04063ddb 100644
--- a/arch/mips/configs/malta_kvm_defconfig
+++ b/arch/mips/configs/malta_kvm_defconfig
@@ -354,7 +354,6 @@ CONFIG_CRAMFS=m
CONFIG_VXFS_FS=m
CONFIG_MINIX_FS=m
CONFIG_ROMFS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_NFS_FS=y
CONFIG_ROOT_NFS=y
diff --git a/arch/mips/configs/maltaup_xpa_defconfig b/arch/mips/configs/maltaup_xpa_defconfig
index 338bb6544a93..40283171af68 100644
--- a/arch/mips/configs/maltaup_xpa_defconfig
+++ b/arch/mips/configs/maltaup_xpa_defconfig
@@ -353,7 +353,6 @@ CONFIG_CRAMFS=m
CONFIG_VXFS_FS=m
CONFIG_MINIX_FS=m
CONFIG_ROMFS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_NFS_FS=y
CONFIG_ROOT_NFS=y
diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig
index 08e1c1f2f4de..3a6d0384b774 100644
--- a/arch/mips/configs/rm200_defconfig
+++ b/arch/mips/configs/rm200_defconfig
@@ -336,7 +336,6 @@ CONFIG_MINIX_FS=m
CONFIG_HPFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_ROMFS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_NFS_FS=m
CONFIG_NFSD=m
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index f7222eb594ea..c14b10821817 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -886,7 +886,6 @@ extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm);
extern int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
struct kvm_mips_interrupt *irq);
-static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_free_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
diff --git a/arch/mips/include/asm/vdso/gettimeofday.h b/arch/mips/include/asm/vdso/gettimeofday.h
index 44a45f3fa4b0..fd32baa30e17 100644
--- a/arch/mips/include/asm/vdso/gettimeofday.h
+++ b/arch/mips/include/asm/vdso/gettimeofday.h
@@ -167,7 +167,7 @@ static __always_inline u64 read_r4k_count(void)
#ifdef CONFIG_CLKSRC_MIPS_GIC
-static __always_inline u64 read_gic_count(const struct vdso_data *data)
+static __always_inline u64 read_gic_count(const struct vdso_time_data *data)
{
void __iomem *gic = get_gic(data);
u32 hi, hi2, lo;
@@ -184,7 +184,7 @@ static __always_inline u64 read_gic_count(const struct vdso_data *data)
#endif
static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
#ifdef CONFIG_CSRC_R4K
if (clock_mode == VDSO_CLOCKMODE_R4K)
@@ -209,10 +209,11 @@ static inline bool mips_vdso_hres_capable(void)
}
#define __arch_vdso_hres_capable mips_vdso_hres_capable
-static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
+static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void)
{
- return get_vdso_data();
+ return get_vdso_time_data();
}
+#define __arch_get_vdso_u_time_data __arch_get_vdso_u_time_data
#endif /* !__ASSEMBLY__ */
diff --git a/arch/mips/include/asm/vdso/vdso.h b/arch/mips/include/asm/vdso/vdso.h
index 6cd88191fefa..acd0efcd3d93 100644
--- a/arch/mips/include/asm/vdso/vdso.h
+++ b/arch/mips/include/asm/vdso/vdso.h
@@ -5,16 +5,18 @@
*/
#include <asm/sgidefs.h>
+#include <vdso/page.h>
+
+#define __VDSO_PAGES 4
#ifndef __ASSEMBLY__
#include <asm/asm.h>
-#include <asm/page.h>
#include <asm/vdso.h>
-static inline unsigned long get_vdso_base(void)
+static inline const struct vdso_time_data *get_vdso_time_data(void)
{
- unsigned long addr;
+ const struct vdso_time_data *addr;
/*
* We can't use cpu_has_mips_r6 since it needs the cpu_data[]
@@ -27,7 +29,7 @@ static inline unsigned long get_vdso_base(void)
* We can't use addiupc because there is no label-label
* support for the addiupc reloc
*/
- __asm__("lapc %0, _start \n"
+ __asm__("lapc %0, vdso_u_time_data \n"
: "=r" (addr) : :);
#else
/*
@@ -46,7 +48,7 @@ static inline unsigned long get_vdso_base(void)
" .set noreorder \n"
" bal 1f \n"
" nop \n"
- " .word _start - . \n"
+ " .word vdso_u_time_data - . \n"
"1: lw %0, 0($31) \n"
" " STR(PTR_ADDU) " %0, $31, %0 \n"
" .set pop \n"
@@ -58,14 +60,9 @@ static inline unsigned long get_vdso_base(void)
return addr;
}
-static inline const struct vdso_data *get_vdso_data(void)
-{
- return (const struct vdso_data *)(get_vdso_base() - PAGE_SIZE);
-}
-
#ifdef CONFIG_CLKSRC_MIPS_GIC
-static inline void __iomem *get_gic(const struct vdso_data *data)
+static inline void __iomem *get_gic(const struct vdso_time_data *data)
{
return (void __iomem *)((unsigned long)data & PAGE_MASK) - PAGE_SIZE;
}
diff --git a/arch/mips/include/asm/vdso/vsyscall.h b/arch/mips/include/asm/vdso/vsyscall.h
index a4582870aaea..2b1debb62dee 100644
--- a/arch/mips/include/asm/vdso/vsyscall.h
+++ b/arch/mips/include/asm/vdso/vsyscall.h
@@ -2,22 +2,12 @@
#ifndef __ASM_VDSO_VSYSCALL_H
#define __ASM_VDSO_VSYSCALL_H
+#include <asm/page.h>
+
#ifndef __ASSEMBLY__
#include <vdso/datapage.h>
-extern struct vdso_data *vdso_data;
-
-/*
- * Update the vDSO data page to keep in sync with kernel timekeeping.
- */
-static __always_inline
-struct vdso_data *__mips_get_k_vdso_data(void)
-{
- return vdso_data;
-}
-#define __arch_get_k_vdso_data __mips_get_k_vdso_data
-
/* The asm-generic header needs to be included after the definitions above */
#include <asm-generic/vdso/vsyscall.h>
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 61503a36067e..f7107479c7fa 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -1326,24 +1326,8 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs)
return -1;
}
-#ifdef CONFIG_SECCOMP
- if (unlikely(test_thread_flag(TIF_SECCOMP))) {
- int ret, i;
- struct seccomp_data sd;
- unsigned long args[6];
-
- sd.nr = current_thread_info()->syscall;
- sd.arch = syscall_get_arch(current);
- syscall_get_arguments(current, regs, args);
- for (i = 0; i < 6; i++)
- sd.args[i] = args[i];
- sd.instruction_pointer = KSTK_EIP(current);
-
- ret = __secure_computing(&sd);
- if (ret == -1)
- return ret;
- }
-#endif
+ if (secure_computing())
+ return -1;
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_enter(regs, regs->regs[2]);
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 0b9b7e25b69a..aa70e371bb54 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -405,3 +405,4 @@
464 n32 getxattrat sys_getxattrat
465 n32 listxattrat sys_listxattrat
466 n32 removexattrat sys_removexattrat
+467 n32 open_tree_attr sys_open_tree_attr
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index c844cd5cda62..1e8c44c7b614 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -381,3 +381,4 @@
464 n64 getxattrat sys_getxattrat
465 n64 listxattrat sys_listxattrat
466 n64 removexattrat sys_removexattrat
+467 n64 open_tree_attr sys_open_tree_attr
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 349b8aad1159..114a5a1a6230 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -454,3 +454,4 @@
464 o32 getxattrat sys_getxattrat
465 o32 listxattrat sys_listxattrat
466 o32 removexattrat sys_removexattrat
+467 o32 open_tree_attr sys_open_tree_attr
diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c
index 75c9d3618f58..de096777172f 100644
--- a/arch/mips/kernel/vdso.c
+++ b/arch/mips/kernel/vdso.c
@@ -15,6 +15,7 @@
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/vdso_datastore.h>
#include <asm/abi.h>
#include <asm/mips-cps.h>
@@ -23,20 +24,7 @@
#include <vdso/helpers.h>
#include <vdso/vsyscall.h>
-/* Kernel-provided data used by the VDSO. */
-static union vdso_data_store mips_vdso_data __page_aligned_data;
-struct vdso_data *vdso_data = mips_vdso_data.data;
-
-/*
- * Mapping for the VDSO data/GIC pages. The real pages are mapped manually, as
- * what we map and where within the area they are mapped is determined at
- * runtime.
- */
-static struct page *no_pages[] = { NULL };
-static struct vm_special_mapping vdso_vvar_mapping = {
- .name = "[vvar]",
- .pages = no_pages,
-};
+static_assert(VDSO_NR_PAGES == __VDSO_PAGES);
static void __init init_vdso_image(struct mips_vdso_image *image)
{
@@ -90,7 +78,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
struct mips_vdso_image *image = current->thread.abi->vdso;
struct mm_struct *mm = current->mm;
- unsigned long gic_size, vvar_size, size, base, data_addr, vdso_addr, gic_pfn, gic_base;
+ unsigned long gic_size, size, base, data_addr, vdso_addr, gic_pfn, gic_base;
struct vm_area_struct *vma;
int ret;
@@ -119,8 +107,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
* the counter registers at the start.
*/
gic_size = mips_gic_present() ? PAGE_SIZE : 0;
- vvar_size = gic_size + PAGE_SIZE;
- size = vvar_size + image->size;
+ size = gic_size + VDSO_NR_PAGES * PAGE_SIZE + image->size;
/*
* Find a region that's large enough for us to perform the
@@ -143,15 +130,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
*/
if (cpu_has_dc_aliases) {
base = __ALIGN_MASK(base, shm_align_mask);
- base += ((unsigned long)vdso_data - gic_size) & shm_align_mask;
+ base += ((unsigned long)vdso_k_time_data - gic_size) & shm_align_mask;
}
data_addr = base + gic_size;
- vdso_addr = data_addr + PAGE_SIZE;
+ vdso_addr = data_addr + VDSO_NR_PAGES * PAGE_SIZE;
- vma = _install_special_mapping(mm, base, vvar_size,
- VM_READ | VM_MAYREAD,
- &vdso_vvar_mapping);
+ vma = vdso_install_vvar_mapping(mm, data_addr);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
@@ -161,6 +146,17 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
if (gic_size) {
gic_base = (unsigned long)mips_gic_base + MIPS_GIC_USER_OFS;
gic_pfn = PFN_DOWN(__pa(gic_base));
+ static const struct vm_special_mapping gic_mapping = {
+ .name = "[gic]",
+ .pages = (struct page **) { NULL },
+ };
+
+ vma = _install_special_mapping(mm, base, gic_size, VM_READ | VM_MAYREAD,
+ &gic_mapping);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto out;
+ }
ret = io_remap_pfn_range(vma, base, gic_pfn, gic_size,
pgprot_noncached(vma->vm_page_prot));
@@ -168,13 +164,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
goto out;
}
- /* Map data page. */
- ret = remap_pfn_range(vma, data_addr,
- virt_to_phys(vdso_data) >> PAGE_SHIFT,
- PAGE_SIZE, vma->vm_page_prot);
- if (ret)
- goto out;
-
/* Map VDSO image. */
vma = _install_special_mapping(mm, vdso_addr, image->size,
VM_READ | VM_EXEC |
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 60b43ea85c12..cef3c423a41a 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -288,9 +288,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (err)
return err;
- hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
- vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup;
+ hrtimer_setup(&vcpu->arch.comparecount_timer, kvm_mips_comparecount_wakeup, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
/*
* Allocate space for host mode exception handlers that handle
diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile
index b289b2c1b294..fb4c493aaffa 100644
--- a/arch/mips/vdso/Makefile
+++ b/arch/mips/vdso/Makefile
@@ -2,7 +2,7 @@
# Objects to go into the VDSO.
# Include the generic Makefile to check the built vdso.
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
obj-vdso-y := elf.o vgettimeofday.o sigreturn.o
diff --git a/arch/mips/vdso/vdso.lds.S b/arch/mips/vdso/vdso.lds.S
index 836465e3bcb8..c8bbe56d89cb 100644
--- a/arch/mips/vdso/vdso.lds.S
+++ b/arch/mips/vdso/vdso.lds.S
@@ -5,6 +5,8 @@
*/
#include <asm/sgidefs.h>
+#include <asm/vdso/vdso.h>
+#include <vdso/datapage.h>
#if _MIPS_SIM == _MIPS_SIM_ABI64
OUTPUT_FORMAT("elf64-tradlittlemips", "elf64-tradbigmips", "elf64-tradlittlemips")
@@ -18,7 +20,8 @@ OUTPUT_ARCH(mips)
SECTIONS
{
- PROVIDE(_start = .);
+ VDSO_VVAR_SYMS
+
. = SIZEOF_HEADERS;
/*
diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig
index 19a804860ed5..ea623f910748 100644
--- a/arch/parisc/configs/generic-64bit_defconfig
+++ b/arch/parisc/configs/generic-64bit_defconfig
@@ -268,7 +268,6 @@ CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_TMPFS_XATTR=y
CONFIG_CONFIGFS_FS=y
-CONFIG_SYSV_FS=y
CONFIG_NFS_FS=m
CONFIG_NFS_V4=m
CONFIG_NFS_V4_1=y
diff --git a/arch/parisc/include/asm/vdso.h b/arch/parisc/include/asm/vdso.h
index 2a2dc11b5545..5f581c1d6460 100644
--- a/arch/parisc/include/asm/vdso.h
+++ b/arch/parisc/include/asm/vdso.h
@@ -12,8 +12,6 @@
#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name))
#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name))
-extern struct vdso_data *vdso_data;
-
#endif /* __ASSEMBLY __ */
/* Default link addresses for the vDSOs */
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index d9fc94c86965..94df3cb957e9 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -465,3 +465,4 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr
diff --git a/arch/parisc/kernel/vdso32/Makefile b/arch/parisc/kernel/vdso32/Makefile
index 288f8b85978f..4ee8d17da229 100644
--- a/arch/parisc/kernel/vdso32/Makefile
+++ b/arch/parisc/kernel/vdso32/Makefile
@@ -1,5 +1,5 @@
# Include the generic Makefile to check the built vdso.
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
KCOV_INSTRUMENT := n
diff --git a/arch/parisc/kernel/vdso64/Makefile b/arch/parisc/kernel/vdso64/Makefile
index bc5d9553f311..c63f4069170f 100644
--- a/arch/parisc/kernel/vdso64/Makefile
+++ b/arch/parisc/kernel/vdso64/Makefile
@@ -1,5 +1,5 @@
# Include the generic Makefile to check the built vdso.
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
KCOV_INSTRUMENT := n
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 424f188e62d9..8a2a6e403eb1 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -159,6 +159,7 @@ config PPC
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE
select ARCH_HAS_UBSAN
+ select ARCH_HAS_VDSO_ARCH_DATA
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_HAVE_EXTRA_ELF_NOTES if SPU_BASE
select ARCH_KEEP_MEMBLOCK
@@ -209,6 +210,7 @@ config PPC
select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
+ select GENERIC_VDSO_DATA_STORE
select GENERIC_VDSO_TIME_NS
select HAS_IOPORT if PCI
select HAVE_ARCH_AUDITSYSCALL
diff --git a/arch/powerpc/configs/fsl-emb-nonhw.config b/arch/powerpc/configs/fsl-emb-nonhw.config
index 3009b0efaf34..d6d2a458847b 100644
--- a/arch/powerpc/configs/fsl-emb-nonhw.config
+++ b/arch/powerpc/configs/fsl-emb-nonhw.config
@@ -112,7 +112,6 @@ CONFIG_QNX4FS_FS=m
CONFIG_RCU_TRACE=y
CONFIG_RESET_CONTROLLER=y
CONFIG_ROOT_NFS=y
-CONFIG_SYSV_FS=m
CONFIG_SYSVIPC=y
CONFIG_TMPFS=y
CONFIG_UBIFS_FS=y
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index ca0c90e95837..364d1a78bc12 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -986,7 +986,6 @@ CONFIG_MINIX_FS=m
CONFIG_OMFS_FS=m
CONFIG_QNX4FS_FS=m
CONFIG_ROMFS_FS=m
-CONFIG_SYSV_FS=m
CONFIG_UFS_FS=m
CONFIG_NFS_FS=m
CONFIG_NFS_V3_ACL=y
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 6e1108f8fce6..2d139c807577 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -902,7 +902,6 @@ struct kvm_vcpu_arch {
#define __KVM_HAVE_ARCH_WQP
#define __KVM_HAVE_CREATE_DEVICE
-static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 16bacfe8c7a2..da15b5efe807 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -152,6 +152,7 @@ static inline bool topology_is_primary_thread(unsigned int cpu)
{
return cpu == cpu_first_thread_sibling(cpu);
}
+#define topology_is_primary_thread topology_is_primary_thread
static inline bool topology_smt_thread_allowed(unsigned int cpu)
{
diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h
index 8d972bc98b55..1ca23fbfe087 100644
--- a/arch/powerpc/include/asm/vdso.h
+++ b/arch/powerpc/include/asm/vdso.h
@@ -3,6 +3,7 @@
#define _ASM_POWERPC_VDSO_H
#define VDSO_VERSION_STRING LINUX_2.6.15
+#define __VDSO_PAGES 4
#ifndef __ASSEMBLY__
diff --git a/arch/powerpc/include/asm/vdso/arch_data.h b/arch/powerpc/include/asm/vdso/arch_data.h
new file mode 100644
index 000000000000..c240a6b87518
--- /dev/null
+++ b/arch/powerpc/include/asm/vdso/arch_data.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2002 Peter Bergner <bergner@vnet.ibm.com>, IBM
+ * Copyright (C) 2005 Benjamin Herrenschmidy <benh@kernel.crashing.org>,
+ * IBM Corp.
+ */
+#ifndef _ASM_POWERPC_VDSO_ARCH_DATA_H
+#define _ASM_POWERPC_VDSO_ARCH_DATA_H
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+#define SYSCALL_MAP_SIZE ((NR_syscalls + 31) / 32)
+
+#ifdef CONFIG_PPC64
+
+struct vdso_arch_data {
+ __u64 tb_ticks_per_sec; /* Timebase tics / sec */
+ __u32 dcache_block_size; /* L1 d-cache block size */
+ __u32 icache_block_size; /* L1 i-cache block size */
+ __u32 dcache_log_block_size; /* L1 d-cache log block size */
+ __u32 icache_log_block_size; /* L1 i-cache log block size */
+ __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */
+ __u32 compat_syscall_map[SYSCALL_MAP_SIZE]; /* Map of compat syscalls */
+};
+
+#else /* CONFIG_PPC64 */
+
+struct vdso_arch_data {
+ __u64 tb_ticks_per_sec; /* Timebase tics / sec */
+ __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */
+ __u32 compat_syscall_map[0]; /* No compat syscalls on PPC32 */
+};
+
+#endif /* CONFIG_PPC64 */
+
+#endif /* _ASM_POWERPC_VDSO_ARCH_DATA_H */
diff --git a/arch/powerpc/include/asm/vdso/getrandom.h b/arch/powerpc/include/asm/vdso/getrandom.h
index 80ce0709725e..067a5396aac6 100644
--- a/arch/powerpc/include/asm/vdso/getrandom.h
+++ b/arch/powerpc/include/asm/vdso/getrandom.h
@@ -43,20 +43,21 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsig
(unsigned long)len, (unsigned long)flags);
}
-static __always_inline struct vdso_rng_data *__arch_get_vdso_rng_data(void)
+static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(void)
{
- struct vdso_arch_data *data;
+ struct vdso_rng_data *data;
asm (
" bcl 20, 31, .+4 ;"
"0: mflr %0 ;"
- " addis %0, %0, (_vdso_datapage - 0b)@ha ;"
- " addi %0, %0, (_vdso_datapage - 0b)@l ;"
+ " addis %0, %0, (vdso_u_rng_data - 0b)@ha ;"
+ " addi %0, %0, (vdso_u_rng_data - 0b)@l ;"
: "=r" (data) : : "lr"
);
- return &data->rng_data;
+ return data;
}
+#define __arch_get_vdso_u_rng_data __arch_get_vdso_u_rng_data
ssize_t __c_kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state,
size_t opaque_len);
diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h
index c6390890a60c..99c9d6f43fde 100644
--- a/arch/powerpc/include/asm/vdso/gettimeofday.h
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -94,22 +94,12 @@ int clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
#endif
static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
return get_tb();
}
-const struct vdso_data *__arch_get_vdso_data(void);
-
-#ifdef CONFIG_TIME_NS
-static __always_inline
-const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
-{
- return (void *)vd + (1U << CONFIG_PAGE_SHIFT);
-}
-#endif
-
-static inline bool vdso_clocksource_ok(const struct vdso_data *vd)
+static inline bool vdso_clocksource_ok(const struct vdso_clock *vc)
{
return true;
}
@@ -135,21 +125,22 @@ static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
#ifdef __powerpc64__
int __c_kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts,
- const struct vdso_data *vd);
+ const struct vdso_time_data *vd);
int __c_kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res,
- const struct vdso_data *vd);
+ const struct vdso_time_data *vd);
#else
int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts,
- const struct vdso_data *vd);
+ const struct vdso_time_data *vd);
int __c_kernel_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts,
- const struct vdso_data *vd);
+ const struct vdso_time_data *vd);
int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res,
- const struct vdso_data *vd);
+ const struct vdso_time_data *vd);
#endif
int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz,
- const struct vdso_data *vd);
+ const struct vdso_time_data *vd);
__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time,
- const struct vdso_data *vd);
+ const struct vdso_time_data *vd);
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/powerpc/include/asm/vdso/vsyscall.h b/arch/powerpc/include/asm/vdso/vsyscall.h
index 48560a119559..c2c9ae1b22e7 100644
--- a/arch/powerpc/include/asm/vdso/vsyscall.h
+++ b/arch/powerpc/include/asm/vdso/vsyscall.h
@@ -6,19 +6,6 @@
#include <asm/vdso_datapage.h>
-static __always_inline
-struct vdso_data *__arch_get_k_vdso_data(void)
-{
- return vdso_data->data;
-}
-#define __arch_get_k_vdso_data __arch_get_k_vdso_data
-
-static __always_inline
-struct vdso_rng_data *__arch_get_k_vdso_rng_data(void)
-{
- return &vdso_data->rng_data;
-}
-
/* The asm-generic header needs to be included after the definitions above */
#include <asm-generic/vdso/vsyscall.h>
diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
index a202f5b63479..95d45a50355d 100644
--- a/arch/powerpc/include/asm/vdso_datapage.h
+++ b/arch/powerpc/include/asm/vdso_datapage.h
@@ -11,56 +11,18 @@
#ifndef __ASSEMBLY__
-#include <linux/unistd.h>
-#include <linux/time.h>
#include <vdso/datapage.h>
-#define SYSCALL_MAP_SIZE ((NR_syscalls + 31) / 32)
-
-#ifdef CONFIG_PPC64
-
-struct vdso_arch_data {
- __u64 tb_ticks_per_sec; /* Timebase tics / sec */
- __u32 dcache_block_size; /* L1 d-cache block size */
- __u32 icache_block_size; /* L1 i-cache block size */
- __u32 dcache_log_block_size; /* L1 d-cache log block size */
- __u32 icache_log_block_size; /* L1 i-cache log block size */
- __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */
- __u32 compat_syscall_map[SYSCALL_MAP_SIZE]; /* Map of compat syscalls */
-
- struct vdso_rng_data rng_data;
-
- struct vdso_data data[CS_BASES] __aligned(1 << CONFIG_PAGE_SHIFT);
-};
-
-#else /* CONFIG_PPC64 */
-
-struct vdso_arch_data {
- __u64 tb_ticks_per_sec; /* Timebase tics / sec */
- __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */
- __u32 compat_syscall_map[0]; /* No compat syscalls on PPC32 */
- struct vdso_rng_data rng_data;
-
- struct vdso_data data[CS_BASES] __aligned(1 << CONFIG_PAGE_SHIFT);
-};
-
-#endif /* CONFIG_PPC64 */
-
-extern struct vdso_arch_data *vdso_data;
-
#else /* __ASSEMBLY__ */
-.macro get_datapage ptr offset=0
+.macro get_datapage ptr symbol
bcl 20, 31, .+4
999:
mflr \ptr
- addis \ptr, \ptr, (_vdso_datapage - 999b + \offset)@ha
- addi \ptr, \ptr, (_vdso_datapage - 999b + \offset)@l
+ addis \ptr, \ptr, (\symbol - 999b)@ha
+ addi \ptr, \ptr, (\symbol - 999b)@l
.endm
-#include <asm/asm-offsets.h>
-#include <asm/page.h>
-
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 7a390bd4f4af..b3048f6d3822 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -334,7 +334,6 @@ int main(void)
#endif /* ! CONFIG_PPC64 */
/* datapage offsets for use by vdso */
- OFFSET(VDSO_DATA_OFFSET, vdso_arch_data, data);
OFFSET(CFG_TB_TICKS_PER_SEC, vdso_arch_data, tb_ticks_per_sec);
#ifdef CONFIG_PPC64
OFFSET(CFG_ICACHE_BLOCKSZ, vdso_arch_data, icache_block_size);
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 4b371c738213..d44349fe8e2b 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -751,7 +751,7 @@ u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
* prstatus.pr_pid = ????
*/
elf_core_copy_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
+ buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS,
&prstatus, sizeof(prstatus));
return buf;
}
diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c
index 727ed4a14545..c6997df63287 100644
--- a/arch/powerpc/kernel/ptrace/ptrace.c
+++ b/arch/powerpc/kernel/ptrace/ptrace.c
@@ -215,7 +215,7 @@ static int do_seccomp(struct pt_regs *regs)
* have already loaded -ENOSYS into r3, or seccomp has put
* something else in r3 (via SECCOMP_RET_ERRNO/TRACE).
*/
- if (__secure_computing(NULL))
+ if (__secure_computing())
return -1;
/*
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index d8b4ab78bef0..9a084bdb8926 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -557,3 +557,4 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 0727332ad86f..15784c5c95c7 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -950,7 +950,7 @@ void __init time_init(void)
sys_tz.tz_dsttime = 0;
}
- vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
+ vdso_k_arch_data->tb_ticks_per_sec = tb_ticks_per_sec;
#ifdef CONFIG_PPC64_PROC_SYSTEMCFG
systemcfg->tb_ticks_per_sec = tb_ticks_per_sec;
#endif
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index edf5cabe5dfd..cb8e9357383e 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -263,10 +263,9 @@ static int __die(const char *str, struct pt_regs *regs, long err)
{
printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
- printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n",
+ printk("%s PAGE_SIZE=%luK%s %s%s%s%s %s\n",
IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE",
PAGE_SIZE / 1024, get_mmu_str(),
- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 43379365ce1b..219d67bcf747 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -17,7 +17,7 @@
#include <linux/elf.h>
#include <linux/security.h>
#include <linux/syscalls.h>
-#include <linux/time_namespace.h>
+#include <linux/vdso_datastore.h>
#include <vdso/datapage.h>
#include <asm/syscall.h>
@@ -32,6 +32,8 @@
#include <asm/vdso_datapage.h>
#include <asm/setup.h>
+static_assert(__VDSO_PAGES == VDSO_NR_PAGES);
+
/* The alignment of the vDSO */
#define VDSO_ALIGNMENT (1 << 16)
@@ -40,24 +42,6 @@ extern char vdso64_start, vdso64_end;
long sys_ni_syscall(void);
-/*
- * The vdso data page (aka. systemcfg for old ppc64 fans) is here.
- * Once the early boot kernel code no longer needs to muck around
- * with it, it will become dynamically allocated
- */
-static union {
- struct vdso_arch_data data;
- u8 page[2 * PAGE_SIZE];
-} vdso_data_store __page_aligned_data;
-struct vdso_arch_data *vdso_data = &vdso_data_store.data;
-
-enum vvar_pages {
- VVAR_BASE_PAGE_OFFSET,
- VVAR_TIME_PAGE_OFFSET,
- VVAR_TIMENS_PAGE_OFFSET,
- VVAR_NR_PAGES,
-};
-
static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma,
unsigned long text_size)
{
@@ -96,14 +80,6 @@ static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struc
mm->context.vdso = NULL;
}
-static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma, struct vm_fault *vmf);
-
-static struct vm_special_mapping vvar_spec __ro_after_init = {
- .name = "[vvar]",
- .fault = vvar_fault,
-};
-
static struct vm_special_mapping vdso32_spec __ro_after_init = {
.name = "[vdso]",
.mremap = vdso32_mremap,
@@ -116,73 +92,6 @@ static struct vm_special_mapping vdso64_spec __ro_after_init = {
.close = vdso_close,
};
-#ifdef CONFIG_TIME_NS
-struct vdso_data *arch_get_vdso_data(void *vvar_page)
-{
- return vvar_page;
-}
-
-/*
- * The vvar mapping contains data for a specific time namespace, so when a task
- * changes namespace we must unmap its vvar data for the old namespace.
- * Subsequent faults will map in data for the new namespace.
- *
- * For more details see timens_setup_vdso_data().
- */
-int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
-{
- struct mm_struct *mm = task->mm;
- VMA_ITERATOR(vmi, mm, 0);
- struct vm_area_struct *vma;
-
- mmap_read_lock(mm);
- for_each_vma(vmi, vma) {
- if (vma_is_special_mapping(vma, &vvar_spec))
- zap_vma_pages(vma);
- }
- mmap_read_unlock(mm);
-
- return 0;
-}
-#endif
-
-static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct page *timens_page = find_timens_vvar_page(vma);
- unsigned long pfn;
-
- switch (vmf->pgoff) {
- case VVAR_BASE_PAGE_OFFSET:
- pfn = virt_to_pfn(vdso_data);
- break;
- case VVAR_TIME_PAGE_OFFSET:
- if (timens_page)
- pfn = page_to_pfn(timens_page);
- else
- pfn = virt_to_pfn(vdso_data->data);
- break;
-#ifdef CONFIG_TIME_NS
- case VVAR_TIMENS_PAGE_OFFSET:
- /*
- * If a task belongs to a time namespace then a namespace
- * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
- * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
- * offset.
- * See also the comment near timens_setup_vdso_data().
- */
- if (!timens_page)
- return VM_FAULT_SIGBUS;
- pfn = virt_to_pfn(vdso_data->data);
- break;
-#endif /* CONFIG_TIME_NS */
- default:
- return VM_FAULT_SIGBUS;
- }
-
- return vmf_insert_pfn(vma, vmf->address, pfn);
-}
-
/*
* This is called from binfmt_elf, we create the special vma for the
* vDSO and insert it into the mm struct tree
@@ -191,7 +100,7 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int
{
unsigned long vdso_size, vdso_base, mappings_size;
struct vm_special_mapping *vdso_spec;
- unsigned long vvar_size = VVAR_NR_PAGES * PAGE_SIZE;
+ unsigned long vvar_size = VDSO_NR_PAGES * PAGE_SIZE;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
@@ -217,9 +126,7 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int
/* Add required alignment. */
vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT);
- vma = _install_special_mapping(mm, vdso_base, vvar_size,
- VM_READ | VM_MAYREAD | VM_IO |
- VM_DONTDUMP | VM_PFNMAP, &vvar_spec);
+ vma = vdso_install_vvar_mapping(mm, vdso_base);
if (IS_ERR(vma))
return PTR_ERR(vma);
@@ -299,10 +206,10 @@ static void __init vdso_setup_syscall_map(void)
for (i = 0; i < NR_syscalls; i++) {
if (sys_call_table[i] != (void *)&sys_ni_syscall)
- vdso_data->syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f);
+ vdso_k_arch_data->syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f);
if (IS_ENABLED(CONFIG_COMPAT) &&
compat_sys_call_table[i] != (void *)&sys_ni_syscall)
- vdso_data->compat_syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f);
+ vdso_k_arch_data->compat_syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f);
}
}
@@ -352,10 +259,10 @@ static struct page ** __init vdso_setup_pages(void *start, void *end)
static int __init vdso_init(void)
{
#ifdef CONFIG_PPC64
- vdso_data->dcache_block_size = ppc64_caches.l1d.block_size;
- vdso_data->icache_block_size = ppc64_caches.l1i.block_size;
- vdso_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size;
- vdso_data->icache_log_block_size = ppc64_caches.l1i.log_block_size;
+ vdso_k_arch_data->dcache_block_size = ppc64_caches.l1d.block_size;
+ vdso_k_arch_data->icache_block_size = ppc64_caches.l1i.block_size;
+ vdso_k_arch_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size;
+ vdso_k_arch_data->icache_log_block_size = ppc64_caches.l1i.log_block_size;
#endif /* CONFIG_PPC64 */
vdso_setup_syscall_map();
diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile
index 0e3ed6fb199f..e8824f933326 100644
--- a/arch/powerpc/kernel/vdso/Makefile
+++ b/arch/powerpc/kernel/vdso/Makefile
@@ -3,7 +3,7 @@
# List of files in the vdso, has to be asm only for now
# Include the generic Makefile to check the built vdso.
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
obj-vdso32 = sigtramp32-32.o gettimeofday-32.o datapage-32.o cacheflush-32.o note-32.o getcpu-32.o
obj-vdso64 = sigtramp64-64.o gettimeofday-64.o datapage-64.o cacheflush-64.o note-64.o getcpu-64.o
diff --git a/arch/powerpc/kernel/vdso/cacheflush.S b/arch/powerpc/kernel/vdso/cacheflush.S
index 0085ae464dac..488d3ade11e6 100644
--- a/arch/powerpc/kernel/vdso/cacheflush.S
+++ b/arch/powerpc/kernel/vdso/cacheflush.S
@@ -30,7 +30,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
#ifdef CONFIG_PPC64
mflr r12
.cfi_register lr,r12
- get_datapage r10
+ get_datapage r10 vdso_u_arch_data
mtlr r12
.cfi_restore lr
#endif
diff --git a/arch/powerpc/kernel/vdso/datapage.S b/arch/powerpc/kernel/vdso/datapage.S
index db8e167f0166..d23b2e8e2a34 100644
--- a/arch/powerpc/kernel/vdso/datapage.S
+++ b/arch/powerpc/kernel/vdso/datapage.S
@@ -28,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
mflr r12
.cfi_register lr,r12
mr. r4,r3
- get_datapage r3
+ get_datapage r3 vdso_u_arch_data
mtlr r12
#ifdef __powerpc64__
addi r3,r3,CFG_SYSCALL_MAP64
@@ -52,7 +52,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
.cfi_startproc
mflr r12
.cfi_register lr,r12
- get_datapage r3
+ get_datapage r3 vdso_u_arch_data
#ifndef __powerpc64__
lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3)
#endif
diff --git a/arch/powerpc/kernel/vdso/gettimeofday.S b/arch/powerpc/kernel/vdso/gettimeofday.S
index 5333848322ca..79c967212444 100644
--- a/arch/powerpc/kernel/vdso/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso/gettimeofday.S
@@ -33,9 +33,9 @@
.cfi_rel_offset r2, PPC_MIN_STKFRM + STK_GOT
#endif
.ifeq \call_time
- get_datapage r5 VDSO_DATA_OFFSET
+ get_datapage r5 vdso_u_time_data
.else
- get_datapage r4 VDSO_DATA_OFFSET
+ get_datapage r4 vdso_u_time_data
.endif
bl CFUNC(DOTSYM(\funct))
PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1)
diff --git a/arch/powerpc/kernel/vdso/vdso32.lds.S b/arch/powerpc/kernel/vdso/vdso32.lds.S
index 1a1b0b6d681a..72a1012b8a20 100644
--- a/arch/powerpc/kernel/vdso/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso/vdso32.lds.S
@@ -6,6 +6,7 @@
#include <asm/vdso.h>
#include <asm/page.h>
#include <asm-generic/vmlinux.lds.h>
+#include <vdso/datapage.h>
#ifdef __LITTLE_ENDIAN__
OUTPUT_FORMAT("elf32-powerpcle", "elf32-powerpcle", "elf32-powerpcle")
@@ -16,7 +17,8 @@ OUTPUT_ARCH(powerpc:common)
SECTIONS
{
- PROVIDE(_vdso_datapage = . - 3 * PAGE_SIZE);
+ VDSO_VVAR_SYMS
+
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
diff --git a/arch/powerpc/kernel/vdso/vdso64.lds.S b/arch/powerpc/kernel/vdso/vdso64.lds.S
index e21b5506cad6..32102a05eaa7 100644
--- a/arch/powerpc/kernel/vdso/vdso64.lds.S
+++ b/arch/powerpc/kernel/vdso/vdso64.lds.S
@@ -6,6 +6,7 @@
#include <asm/vdso.h>
#include <asm/page.h>
#include <asm-generic/vmlinux.lds.h>
+#include <vdso/datapage.h>
#ifdef __LITTLE_ENDIAN__
OUTPUT_FORMAT("elf64-powerpcle", "elf64-powerpcle", "elf64-powerpcle")
@@ -16,7 +17,8 @@ OUTPUT_ARCH(powerpc:common64)
SECTIONS
{
- PROVIDE(_vdso_datapage = . - 3 * PAGE_SIZE);
+ VDSO_VVAR_SYMS
+
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
diff --git a/arch/powerpc/kernel/vdso/vgettimeofday.c b/arch/powerpc/kernel/vdso/vgettimeofday.c
index 55a287c9a736..6f5167d81af5 100644
--- a/arch/powerpc/kernel/vdso/vgettimeofday.c
+++ b/arch/powerpc/kernel/vdso/vgettimeofday.c
@@ -7,43 +7,43 @@
#ifdef __powerpc64__
int __c_kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
return __cvdso_clock_gettime_data(vd, clock, ts);
}
int __c_kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
return __cvdso_clock_getres_data(vd, clock_id, res);
}
#else
int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
return __cvdso_clock_gettime32_data(vd, clock, ts);
}
int __c_kernel_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
return __cvdso_clock_gettime_data(vd, clock, ts);
}
int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
return __cvdso_clock_getres_time32_data(vd, clock_id, res);
}
#endif
int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
return __cvdso_gettimeofday_data(vd, tv, tz);
}
-__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_data *vd)
+__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_time_data *vd)
{
return __cvdso_time_data(vd, time);
}
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 8c464a5d8246..2429cb1c7baa 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -495,8 +495,7 @@ static void start_watchdog(void *arg)
*this_cpu_ptr(&wd_timer_tb) = get_tb();
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtimer->function = watchdog_timer_fn;
+ hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer_start(hrtimer, ms_to_ktime(wd_timer_period_ms),
HRTIMER_MODE_REL_PINNED);
}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index ce1d91eed231..61f2b7e007fa 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -766,8 +766,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
int err;
- hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
- vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
+ hrtimer_setup(&vcpu->arch.dec_timer, kvmppc_decrementer_wakeup, CLOCK_REALTIME,
+ HRTIMER_MODE_ABS);
#ifdef CONFIG_KVM_EXIT_TIMING
mutex_init(&vcpu->arch.exit_timing_lock);
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 2b79171ee185..f4e03aaabb4c 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -132,7 +132,10 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
-static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) {}
+static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
+{
+}
static inline void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) {}
static void pmao_restore_workaround(bool ebb) { }
#endif /* CONFIG_PPC32 */
@@ -444,7 +447,8 @@ static void power_pmu_bhrb_disable(struct perf_event *event)
/* Called from ctxsw to prevent one process's branch entries to
* mingle with the other process's entries during context switch.
*/
-static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
{
if (!ppmu->bhrb_nr)
return;
diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
index c9a9b759cc92..a379ff86c120 100644
--- a/arch/powerpc/platforms/powernv/opal-core.c
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -149,7 +149,7 @@ static Elf64_Word *__init auxv_to_elf64_notes(Elf64_Word *buf,
/* end of vector */
bufp[idx++] = cpu_to_be64(AT_NULL);
- buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV,
+ buf = append_elf64_note(buf, NN_AUXV, NT_AUXV,
oc_conf->auxv_buf, AUXV_DESC_SZ);
return buf;
}
@@ -252,7 +252,7 @@ static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf)
* crashing CPU's prstatus.
*/
first_cpu_note = buf;
- buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
+ buf = append_elf64_note(buf, NN_PRSTATUS, NT_PRSTATUS,
&prstatus, sizeof(prstatus));
for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) {
@@ -279,7 +279,7 @@ static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf)
fill_prstatus(&prstatus, thread_pir, &regs);
if (thread_pir != oc_conf->crashing_cpu) {
- buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME,
+ buf = append_elf64_note(buf, NN_PRSTATUS,
NT_PRSTATUS, &prstatus,
sizeof(prstatus));
} else {
@@ -287,7 +287,7 @@ static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf)
* Add crashing CPU as the first NT_PRSTATUS note for
* GDB to process the core file appropriately.
*/
- append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME,
+ append_elf64_note(first_cpu_note, NN_PRSTATUS,
NT_PRSTATUS, &prstatus,
sizeof(prstatus));
}
diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c
index 1574176e3ffc..c86950d7105a 100644
--- a/arch/powerpc/platforms/pseries/papr-vpd.c
+++ b/arch/powerpc/platforms/pseries/papr-vpd.c
@@ -482,14 +482,13 @@ static long papr_vpd_create_handle(struct papr_location_code __user *ulc)
goto free_blob;
}
- file = anon_inode_getfile("[papr-vpd]", &papr_vpd_handle_ops,
- (void *)blob, O_RDONLY);
+ file = anon_inode_getfile_fmode("[papr-vpd]", &papr_vpd_handle_ops,
+ (void *)blob, O_RDONLY,
+ FMODE_LSEEK | FMODE_PREAD);
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto put_fd;
}
-
- file->f_mode |= FMODE_LSEEK | FMODE_PREAD;
fd_install(fd, file);
return fd;
put_fd:
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 268859e4df87..1acb53aab252 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1271,11 +1271,7 @@ static int xmon_batch_next_cpu(void)
{
unsigned long cpu;
- while (!cpumask_empty(&xmon_batch_cpus)) {
- cpu = cpumask_next_wrap(smp_processor_id(), &xmon_batch_cpus,
- xmon_batch_start_cpu, true);
- if (cpu >= nr_cpu_ids)
- break;
+ for_each_cpu_wrap(cpu, &xmon_batch_cpus, xmon_batch_start_cpu) {
if (xmon_batch_start_cpu == -1)
xmon_batch_start_cpu = cpu;
if (xmon_switch_cpu(cpu))
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 7612c52e9b1e..0409be768666 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -53,7 +53,7 @@ config RISCV
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UBSAN
- select ARCH_HAS_VDSO_TIME_DATA
+ select ARCH_HAS_VDSO_ARCH_DATA if GENERIC_VDSO_DATA_STORE
select ARCH_KEEP_MEMBLOCK if ACPI
select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if 64BIT && MMU
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
@@ -111,11 +111,13 @@ config RISCV
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL
select GENERIC_LIB_DEVMEM_IS_ALLOWED
+ select GENERIC_PENDING_IRQ if SMP
select GENERIC_PCI_IOMAP
select GENERIC_PTDUMP if MMU
select GENERIC_SCHED_CLOCK
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL if MMU && 64BIT
+ select GENERIC_VDSO_DATA_STORE if MMU
select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO
select HARDIRQS_SW_RESEND
select HAS_IOPORT if MMU
diff --git a/arch/riscv/boot/dts/sophgo/sg2042.dtsi b/arch/riscv/boot/dts/sophgo/sg2042.dtsi
index e62ac51ac55a..fef2a0e0f7a3 100644
--- a/arch/riscv/boot/dts/sophgo/sg2042.dtsi
+++ b/arch/riscv/boot/dts/sophgo/sg2042.dtsi
@@ -173,6 +173,16 @@
#clock-cells = <1>;
};
+ msi: msi-controller@7030010304 {
+ compatible = "sophgo,sg2042-msi";
+ reg = <0x70 0x30010304 0x0 0x4>,
+ <0x70 0x30010300 0x0 0x4>;
+ reg-names = "clr", "doorbell";
+ msi-controller;
+ #msi-cells = <0>;
+ msi-ranges = <&intc 64 IRQ_TYPE_LEVEL_HIGH 32>;
+ };
+
rpgate: clock-controller@7030010368 {
compatible = "sophgo,sg2042-rpgate";
reg = <0x70 0x30010368 0x0 0x98>;
diff --git a/arch/riscv/boot/dts/starfive/jh7110-pinfunc.h b/arch/riscv/boot/dts/starfive/jh7110-pinfunc.h
index 256de17f5261..ae49c908e7fb 100644
--- a/arch/riscv/boot/dts/starfive/jh7110-pinfunc.h
+++ b/arch/riscv/boot/dts/starfive/jh7110-pinfunc.h
@@ -89,7 +89,7 @@
#define GPOUT_SYS_SDIO1_DATA1 59
#define GPOUT_SYS_SDIO1_DATA2 60
#define GPOUT_SYS_SDIO1_DATA3 61
-#define GPOUT_SYS_SDIO1_DATA4 63
+#define GPOUT_SYS_SDIO1_DATA4 62
#define GPOUT_SYS_SDIO1_DATA5 63
#define GPOUT_SYS_SDIO1_DATA6 64
#define GPOUT_SYS_SDIO1_DATA7 65
diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
index c6bd3d8354a9..49a0f48d93df 100644
--- a/arch/riscv/include/asm/bitops.h
+++ b/arch/riscv/include/asm/bitops.h
@@ -226,7 +226,7 @@ legacy:
* @nr: Bit to set
* @addr: Address to count from
*
- * This operation may be reordered on other architectures than x86.
+ * This is an atomic fully-ordered operation (implied full memory barrier).
*/
static __always_inline int arch_test_and_set_bit(int nr, volatile unsigned long *addr)
{
@@ -238,7 +238,7 @@ static __always_inline int arch_test_and_set_bit(int nr, volatile unsigned long
* @nr: Bit to clear
* @addr: Address to count from
*
- * This operation can be reordered on other architectures other than x86.
+ * This is an atomic fully-ordered operation (implied full memory barrier).
*/
static __always_inline int arch_test_and_clear_bit(int nr, volatile unsigned long *addr)
{
diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h
index 1c5c641075d2..0257f4aa7ff4 100644
--- a/arch/riscv/include/asm/io.h
+++ b/arch/riscv/include/asm/io.h
@@ -136,7 +136,7 @@ __io_writes_outs(outs, u64, q, __io_pbr(), __io_paw())
#include <asm-generic/io.h>
#ifdef CONFIG_MMU
-#define arch_memremap_wb(addr, size) \
+#define arch_memremap_wb(addr, size, flags) \
((__force void *)ioremap_prot((addr), (size), _PAGE_KERNEL))
#endif
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index cc33e35cd628..0e9c2fab6378 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -301,8 +301,6 @@ static inline bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu)
return IS_ENABLED(CONFIG_GUEST_PERF_EVENTS) && !!vcpu;
}
-static inline void kvm_arch_sync_events(struct kvm *kvm) {}
-
#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12
void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid,
diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h
index f891478829a5..c130d8100232 100644
--- a/arch/riscv/include/asm/vdso.h
+++ b/arch/riscv/include/asm/vdso.h
@@ -14,7 +14,7 @@
*/
#ifdef CONFIG_MMU
-#define __VVAR_PAGES 2
+#define __VDSO_PAGES 4
#ifndef __ASSEMBLY__
#include <generated/vdso-offsets.h>
diff --git a/arch/riscv/include/asm/vdso/time_data.h b/arch/riscv/include/asm/vdso/arch_data.h
index dfa65228999b..da57a3786f7a 100644
--- a/arch/riscv/include/asm/vdso/time_data.h
+++ b/arch/riscv/include/asm/vdso/arch_data.h
@@ -1,12 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __RISCV_ASM_VDSO_TIME_DATA_H
-#define __RISCV_ASM_VDSO_TIME_DATA_H
+#ifndef __RISCV_ASM_VDSO_ARCH_DATA_H
+#define __RISCV_ASM_VDSO_ARCH_DATA_H
#include <linux/types.h>
#include <vdso/datapage.h>
#include <asm/hwprobe.h>
-struct arch_vdso_time_data {
+struct vdso_arch_data {
/* Stash static answers to the hwprobe queries when all CPUs are selected. */
__u64 all_cpu_hwprobe_values[RISCV_HWPROBE_MAX_KEY + 1];
@@ -14,4 +14,4 @@ struct arch_vdso_time_data {
__u8 homogeneous_cpus;
};
-#endif /* __RISCV_ASM_VDSO_TIME_DATA_H */
+#endif /* __RISCV_ASM_VDSO_ARCH_DATA_H */
diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h
index ba3283cf7acc..29164f84f93c 100644
--- a/arch/riscv/include/asm/vdso/gettimeofday.h
+++ b/arch/riscv/include/asm/vdso/gettimeofday.h
@@ -69,7 +69,7 @@ int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
#endif /* CONFIG_GENERIC_TIME_VSYSCALL */
static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
/*
* The purpose of csr_read(CSR_TIME) is to trap the system into
@@ -79,18 +79,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
return csr_read(CSR_TIME);
}
-static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
-{
- return _vdso_data;
-}
-
-#ifdef CONFIG_TIME_NS
-static __always_inline
-const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
-{
- return _timens_data;
-}
-#endif
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/riscv/include/asm/vdso/vsyscall.h b/arch/riscv/include/asm/vdso/vsyscall.h
index e8a9c4b53c0c..1140b54b4bc8 100644
--- a/arch/riscv/include/asm/vdso/vsyscall.h
+++ b/arch/riscv/include/asm/vdso/vsyscall.h
@@ -6,15 +6,6 @@
#include <vdso/datapage.h>
-extern struct vdso_data *vdso_data;
-
-static __always_inline struct vdso_data *__riscv_get_k_vdso_data(void)
-{
- return vdso_data;
-}
-
-#define __arch_get_k_vdso_data __riscv_get_k_vdso_data
-
/* The asm-generic header needs to be included after the definitions above */
#include <asm-generic/vdso/vsyscall.h>
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index bcd3b816306c..04a4e5495512 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -450,8 +450,7 @@ static int do_riscv_hwprobe(struct riscv_hwprobe __user *pairs,
static int __init init_hwprobe_vdso_data(void)
{
- struct vdso_data *vd = __arch_get_k_vdso_data();
- struct arch_vdso_time_data *avd = &vd->arch_data;
+ struct vdso_arch_data *avd = vdso_k_arch_data;
u64 id_bitsmash = 0;
struct riscv_hwprobe pair;
int key;
diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c
index 3ca3ae4277e1..cc2895d1fbc2 100644
--- a/arch/riscv/kernel/vdso.c
+++ b/arch/riscv/kernel/vdso.c
@@ -13,20 +13,11 @@
#include <linux/err.h>
#include <asm/page.h>
#include <asm/vdso.h>
-#include <linux/time_namespace.h>
+#include <linux/vdso_datastore.h>
#include <vdso/datapage.h>
#include <vdso/vsyscall.h>
-enum vvar_pages {
- VVAR_DATA_PAGE_OFFSET,
- VVAR_TIMENS_PAGE_OFFSET,
- VVAR_NR_PAGES,
-};
-
-#define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT)
-
-static union vdso_data_store vdso_data_store __page_aligned_data;
-struct vdso_data *vdso_data = vdso_data_store.data;
+#define VVAR_SIZE (VDSO_NR_PAGES << PAGE_SHIFT)
struct __vdso_info {
const char *name;
@@ -79,78 +70,6 @@ static void __init __vdso_init(struct __vdso_info *vdso_info)
vdso_info->cm->pages = vdso_pagelist;
}
-#ifdef CONFIG_TIME_NS
-struct vdso_data *arch_get_vdso_data(void *vvar_page)
-{
- return (struct vdso_data *)(vvar_page);
-}
-
-static const struct vm_special_mapping rv_vvar_map;
-
-/*
- * The vvar mapping contains data for a specific time namespace, so when a task
- * changes namespace we must unmap its vvar data for the old namespace.
- * Subsequent faults will map in data for the new namespace.
- *
- * For more details see timens_setup_vdso_data().
- */
-int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
-{
- struct mm_struct *mm = task->mm;
- struct vm_area_struct *vma;
- VMA_ITERATOR(vmi, mm, 0);
-
- mmap_read_lock(mm);
-
- for_each_vma(vmi, vma) {
- if (vma_is_special_mapping(vma, &rv_vvar_map))
- zap_vma_pages(vma);
- }
-
- mmap_read_unlock(mm);
- return 0;
-}
-#endif
-
-static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct page *timens_page = find_timens_vvar_page(vma);
- unsigned long pfn;
-
- switch (vmf->pgoff) {
- case VVAR_DATA_PAGE_OFFSET:
- if (timens_page)
- pfn = page_to_pfn(timens_page);
- else
- pfn = sym_to_pfn(vdso_data);
- break;
-#ifdef CONFIG_TIME_NS
- case VVAR_TIMENS_PAGE_OFFSET:
- /*
- * If a task belongs to a time namespace then a namespace
- * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
- * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
- * offset.
- * See also the comment near timens_setup_vdso_data().
- */
- if (!timens_page)
- return VM_FAULT_SIGBUS;
- pfn = sym_to_pfn(vdso_data);
- break;
-#endif /* CONFIG_TIME_NS */
- default:
- return VM_FAULT_SIGBUS;
- }
-
- return vmf_insert_pfn(vma, vmf->address, pfn);
-}
-
-static const struct vm_special_mapping rv_vvar_map = {
- .name = "[vvar]",
- .fault = vvar_fault,
-};
-
static struct vm_special_mapping rv_vdso_map __ro_after_init = {
.name = "[vdso]",
.mremap = vdso_mremap,
@@ -196,7 +115,7 @@ static int __setup_additional_pages(struct mm_struct *mm,
unsigned long vdso_base, vdso_text_len, vdso_mapping_len;
void *ret;
- BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
+ BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES);
vdso_text_len = vdso_info->vdso_pages << PAGE_SHIFT;
/* Be sure to map the data page */
@@ -208,8 +127,7 @@ static int __setup_additional_pages(struct mm_struct *mm,
goto up_fail;
}
- ret = _install_special_mapping(mm, vdso_base, VVAR_SIZE,
- (VM_READ | VM_MAYREAD | VM_PFNMAP), &rv_vvar_map);
+ ret = vdso_install_vvar_mapping(mm, vdso_base);
if (IS_ERR(ret))
goto up_fail;
diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile
index 9a1b555e8733..ad73607abc28 100644
--- a/arch/riscv/kernel/vdso/Makefile
+++ b/arch/riscv/kernel/vdso/Makefile
@@ -2,7 +2,7 @@
# Copied from arch/tile/kernel/vdso/Makefile
# Include the generic Makefile to check the built vdso.
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
# Symbols present in the vdso
vdso-syms = rt_sigreturn
ifdef CONFIG_64BIT
diff --git a/arch/riscv/kernel/vdso/hwprobe.c b/arch/riscv/kernel/vdso/hwprobe.c
index a158c029344f..2ddeba6c68dd 100644
--- a/arch/riscv/kernel/vdso/hwprobe.c
+++ b/arch/riscv/kernel/vdso/hwprobe.c
@@ -16,8 +16,7 @@ static int riscv_vdso_get_values(struct riscv_hwprobe *pairs, size_t pair_count,
size_t cpusetsize, unsigned long *cpus,
unsigned int flags)
{
- const struct vdso_data *vd = __arch_get_vdso_data();
- const struct arch_vdso_time_data *avd = &vd->arch_data;
+ const struct vdso_arch_data *avd = &vdso_u_arch_data;
bool all_cpus = !cpusetsize && !cpus;
struct riscv_hwprobe *p = pairs;
struct riscv_hwprobe *end = pairs + pair_count;
@@ -51,8 +50,7 @@ static int riscv_vdso_get_cpus(struct riscv_hwprobe *pairs, size_t pair_count,
size_t cpusetsize, unsigned long *cpus,
unsigned int flags)
{
- const struct vdso_data *vd = __arch_get_vdso_data();
- const struct arch_vdso_time_data *avd = &vd->arch_data;
+ const struct vdso_arch_data *avd = &vdso_u_arch_data;
struct riscv_hwprobe *p = pairs;
struct riscv_hwprobe *end = pairs + pair_count;
unsigned char *c = (unsigned char *)cpus;
diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S
index cbe2a179331d..8e86965a8aae 100644
--- a/arch/riscv/kernel/vdso/vdso.lds.S
+++ b/arch/riscv/kernel/vdso/vdso.lds.S
@@ -4,15 +4,14 @@
*/
#include <asm/page.h>
#include <asm/vdso.h>
+#include <vdso/datapage.h>
OUTPUT_ARCH(riscv)
SECTIONS
{
- PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
-#ifdef CONFIG_TIME_NS
- PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
-#endif
+ VDSO_VVAR_SYMS
+
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
index 1fa8be5ee509..4b24705dc63a 100644
--- a/arch/riscv/kvm/main.c
+++ b/arch/riscv/kvm/main.c
@@ -172,8 +172,8 @@ module_init(riscv_kvm_init);
static void __exit riscv_kvm_exit(void)
{
- kvm_riscv_teardown();
-
kvm_exit();
+
+ kvm_riscv_teardown();
}
module_exit(riscv_kvm_exit);
diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c
index f6d27b59c641..43ee8e33ba23 100644
--- a/arch/riscv/kvm/vcpu_onereg.c
+++ b/arch/riscv/kvm/vcpu_onereg.c
@@ -203,7 +203,7 @@ static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext)
case KVM_RISCV_ISA_EXT_SVADE:
/*
* The henvcfg.ADUE is read-only zero if menvcfg.ADUE is zero.
- * Svade is not allowed to disable when the platform use Svade.
+ * Svade can't be disabled unless we support Svadu.
*/
return arch_has_hw_pte_young();
default:
diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c
index 2707a51b082c..78ac3216a54d 100644
--- a/arch/riscv/kvm/vcpu_pmu.c
+++ b/arch/riscv/kvm/vcpu_pmu.c
@@ -666,6 +666,7 @@ int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_ba
.type = etype,
.size = sizeof(struct perf_event_attr),
.pinned = true,
+ .disabled = true,
/*
* It should never reach here if the platform doesn't support the sscofpmf
* extension as mode filtering won't work without it.
diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c
index 96e7a4e463f7..ff672fa71fcc 100644
--- a/arch/riscv/kvm/vcpu_timer.c
+++ b/arch/riscv/kvm/vcpu_timer.c
@@ -248,18 +248,19 @@ int kvm_riscv_vcpu_timer_init(struct kvm_vcpu *vcpu)
if (t->init_done)
return -EINVAL;
- hrtimer_init(&t->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
t->init_done = true;
t->next_set = false;
/* Enable sstc for every vcpu if available in hardware */
if (riscv_isa_extension_available(NULL, SSTC)) {
t->sstc_enabled = true;
- t->hrt.function = kvm_riscv_vcpu_vstimer_expired;
+ hrtimer_setup(&t->hrt, kvm_riscv_vcpu_vstimer_expired, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
t->timer_next_event = kvm_riscv_vcpu_update_vstimecmp;
} else {
t->sstc_enabled = false;
- t->hrt.function = kvm_riscv_vcpu_hrtimer_expired;
+ hrtimer_setup(&t->hrt, kvm_riscv_vcpu_hrtimer_expired, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
t->timer_next_event = kvm_riscv_vcpu_update_hrtimer;
}
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 9c9ec08d78c7..7ed6f229250c 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -166,6 +166,7 @@ config S390
select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
+ select GENERIC_VDSO_DATA_STORE
select GENERIC_VDSO_TIME_NS
select GENERIC_IOREMAP if PCI
select HAVE_ALIGNED_STRUCT_PAGE
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 4e73ef46d4b2..9f2814d0e1e9 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -139,7 +139,6 @@ int s390_replace_asce(struct gmap *gmap);
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
unsigned long end, bool interruptible);
-int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split);
unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level);
/**
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 9a367866cab0..424f899d8163 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -1056,7 +1056,6 @@ bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu);
extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc);
-static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_free_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index b11f5b6d0bd1..46fb0ef6f984 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -631,7 +631,7 @@ int uv_pin_shared(unsigned long paddr);
int uv_destroy_folio(struct folio *folio);
int uv_destroy_pte(pte_t pte);
int uv_convert_from_secure_pte(pte_t pte);
-int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb);
+int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb);
int uv_convert_from_secure(unsigned long paddr);
int uv_convert_from_secure_folio(struct folio *folio);
diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h
index 92c73e4d97a9..420a073fdde5 100644
--- a/arch/s390/include/asm/vdso.h
+++ b/arch/s390/include/asm/vdso.h
@@ -6,13 +6,11 @@
#ifndef __ASSEMBLY__
-extern struct vdso_data *vdso_data;
-
int vdso_getcpu_init(void);
#endif /* __ASSEMBLY__ */
-#define __VVAR_PAGES 2
+#define __VDSO_PAGES 4
#define VDSO_VERSION_STRING LINUX_2.6.29
diff --git a/arch/s390/include/asm/vdso/getrandom.h b/arch/s390/include/asm/vdso/getrandom.h
index 36355af7160b..f8713ce39bb2 100644
--- a/arch/s390/include/asm/vdso/getrandom.h
+++ b/arch/s390/include/asm/vdso/getrandom.h
@@ -23,18 +23,6 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsig
return syscall3(__NR_getrandom, (long)buffer, (long)len, (long)flags);
}
-static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
-{
- /*
- * The RNG data is in the real VVAR data page, but if a task belongs to a time namespace
- * then VVAR_DATA_PAGE_OFFSET points to the namespace-specific VVAR page and VVAR_TIMENS_
- * PAGE_OFFSET points to the real VVAR page.
- */
- if (IS_ENABLED(CONFIG_TIME_NS) && _vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
- return (void *)&_vdso_rng_data + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE;
- return &_vdso_rng_data;
-}
-
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h
index 7937765ccfa5..fb4564308e9d 100644
--- a/arch/s390/include/asm/vdso/gettimeofday.h
+++ b/arch/s390/include/asm/vdso/gettimeofday.h
@@ -14,12 +14,7 @@
#include <linux/compiler.h>
-static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
-{
- return _vdso_data;
-}
-
-static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd)
+static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_time_data *vd)
{
u64 adj, now;
@@ -49,12 +44,4 @@ long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts)
return syscall2(__NR_clock_getres, (long)clkid, (long)ts);
}
-#ifdef CONFIG_TIME_NS
-static __always_inline
-const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
-{
- return _timens_data;
-}
-#endif
-
#endif
diff --git a/arch/s390/include/asm/vdso/vsyscall.h b/arch/s390/include/asm/vdso/vsyscall.h
index 3eb576ecd3bd..d346ebe51301 100644
--- a/arch/s390/include/asm/vdso/vsyscall.h
+++ b/arch/s390/include/asm/vdso/vsyscall.h
@@ -2,32 +2,12 @@
#ifndef __ASM_VDSO_VSYSCALL_H
#define __ASM_VDSO_VSYSCALL_H
-#define __VDSO_RND_DATA_OFFSET 768
-
#ifndef __ASSEMBLY__
#include <linux/hrtimer.h>
#include <vdso/datapage.h>
#include <asm/vdso.h>
-enum vvar_pages {
- VVAR_DATA_PAGE_OFFSET,
- VVAR_TIMENS_PAGE_OFFSET,
- VVAR_NR_PAGES
-};
-
-static __always_inline struct vdso_data *__s390_get_k_vdso_data(void)
-{
- return vdso_data;
-}
-#define __arch_get_k_vdso_data __s390_get_k_vdso_data
-
-static __always_inline struct vdso_rng_data *__s390_get_k_vdso_rnd_data(void)
-{
- return (void *)vdso_data + __VDSO_RND_DATA_OFFSET;
-}
-#define __arch_get_k_vdso_rng_data __s390_get_k_vdso_rnd_data
-
/* The asm-generic header needs to be included after the definitions above */
#include <asm-generic/vdso/vsyscall.h>
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index 276cb4c1e11b..4a981266b483 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -246,15 +246,6 @@ bool is_kdump_kernel(void)
}
EXPORT_SYMBOL_GPL(is_kdump_kernel);
-static const char *nt_name(Elf64_Word type)
-{
- const char *name = "LINUX";
-
- if (type == NT_PRPSINFO || type == NT_PRSTATUS || type == NT_PRFPREG)
- name = KEXEC_CORE_NOTE_NAME;
- return name;
-}
-
/*
* Initialize ELF note
*/
@@ -279,10 +270,8 @@ static void *nt_init_name(void *buf, Elf64_Word type, void *desc, int d_len,
return PTR_ADD(buf, len);
}
-static inline void *nt_init(void *buf, Elf64_Word type, void *desc, int d_len)
-{
- return nt_init_name(buf, type, desc, d_len, nt_name(type));
-}
+#define nt_init(buf, type, desc) \
+ nt_init_name(buf, NT_ ## type, &(desc), sizeof(desc), NN_ ## type)
/*
* Calculate the size of ELF note
@@ -298,10 +287,7 @@ static size_t nt_size_name(int d_len, const char *name)
return size;
}
-static inline size_t nt_size(Elf64_Word type, int d_len)
-{
- return nt_size_name(d_len, nt_name(type));
-}
+#define nt_size(type, desc) nt_size_name(sizeof(desc), NN_ ## type)
/*
* Fill ELF notes for one CPU with save area registers
@@ -322,18 +308,16 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa)
memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc));
memcpy(&nt_fpregset.fprs, &sa->fprs, sizeof(sa->fprs));
/* Create ELF notes for the CPU */
- ptr = nt_init(ptr, NT_PRSTATUS, &nt_prstatus, sizeof(nt_prstatus));
- ptr = nt_init(ptr, NT_PRFPREG, &nt_fpregset, sizeof(nt_fpregset));
- ptr = nt_init(ptr, NT_S390_TIMER, &sa->timer, sizeof(sa->timer));
- ptr = nt_init(ptr, NT_S390_TODCMP, &sa->todcmp, sizeof(sa->todcmp));
- ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg));
- ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs));
- ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix));
+ ptr = nt_init(ptr, PRSTATUS, nt_prstatus);
+ ptr = nt_init(ptr, PRFPREG, nt_fpregset);
+ ptr = nt_init(ptr, S390_TIMER, sa->timer);
+ ptr = nt_init(ptr, S390_TODCMP, sa->todcmp);
+ ptr = nt_init(ptr, S390_TODPREG, sa->todpreg);
+ ptr = nt_init(ptr, S390_CTRS, sa->ctrs);
+ ptr = nt_init(ptr, S390_PREFIX, sa->prefix);
if (cpu_has_vx()) {
- ptr = nt_init(ptr, NT_S390_VXRS_HIGH,
- &sa->vxrs_high, sizeof(sa->vxrs_high));
- ptr = nt_init(ptr, NT_S390_VXRS_LOW,
- &sa->vxrs_low, sizeof(sa->vxrs_low));
+ ptr = nt_init(ptr, S390_VXRS_HIGH, sa->vxrs_high);
+ ptr = nt_init(ptr, S390_VXRS_LOW, sa->vxrs_low);
}
return ptr;
}
@@ -346,16 +330,16 @@ static size_t get_cpu_elf_notes_size(void)
struct save_area *sa = NULL;
size_t size;
- size = nt_size(NT_PRSTATUS, sizeof(struct elf_prstatus));
- size += nt_size(NT_PRFPREG, sizeof(elf_fpregset_t));
- size += nt_size(NT_S390_TIMER, sizeof(sa->timer));
- size += nt_size(NT_S390_TODCMP, sizeof(sa->todcmp));
- size += nt_size(NT_S390_TODPREG, sizeof(sa->todpreg));
- size += nt_size(NT_S390_CTRS, sizeof(sa->ctrs));
- size += nt_size(NT_S390_PREFIX, sizeof(sa->prefix));
+ size = nt_size(PRSTATUS, struct elf_prstatus);
+ size += nt_size(PRFPREG, elf_fpregset_t);
+ size += nt_size(S390_TIMER, sa->timer);
+ size += nt_size(S390_TODCMP, sa->todcmp);
+ size += nt_size(S390_TODPREG, sa->todpreg);
+ size += nt_size(S390_CTRS, sa->ctrs);
+ size += nt_size(S390_PREFIX, sa->prefix);
if (cpu_has_vx()) {
- size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high));
- size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low));
+ size += nt_size(S390_VXRS_HIGH, sa->vxrs_high);
+ size += nt_size(S390_VXRS_LOW, sa->vxrs_low);
}
return size;
@@ -371,7 +355,7 @@ static void *nt_prpsinfo(void *ptr)
memset(&prpsinfo, 0, sizeof(prpsinfo));
prpsinfo.pr_sname = 'R';
strcpy(prpsinfo.pr_fname, "vmlinux");
- return nt_init(ptr, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo));
+ return nt_init(ptr, PRPSINFO, prpsinfo);
}
/*
@@ -610,7 +594,7 @@ static size_t get_elfcorehdr_size(int phdr_count)
/* PT_NOTES */
size += sizeof(Elf64_Phdr);
/* nt_prpsinfo */
- size += nt_size(NT_PRPSINFO, sizeof(struct elf_prpsinfo));
+ size += nt_size(PRPSINFO, struct elf_prpsinfo);
/* regsets */
size += get_cpu_cnt() * get_cpu_elf_notes_size();
/* nt_vmcoreinfo */
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index 1ecd0580561f..911b95cd57e5 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -198,13 +198,8 @@ void __noreturn die(struct pt_regs *regs, const char *str)
console_verbose();
spin_lock_irq(&die_lock);
bust_spinlocks(1);
- printk("%s: %04x ilc:%d [#%d] ", str, regs->int_code & 0xffff,
+ printk("%s: %04x ilc:%d [#%d]", str, regs->int_code & 0xffff,
regs->int_code >> 17, ++die_counter);
-#ifdef CONFIG_PREEMPT
- pr_cont("PREEMPT ");
-#elif defined(CONFIG_PREEMPT_RT)
- pr_cont("PREEMPT_RT ");
-#endif
pr_cont("SMP ");
if (debug_pagealloc_enabled())
pr_cont("DEBUG_PAGEALLOC");
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
index 10725f5a6f0f..63875270941b 100644
--- a/arch/s390/kernel/perf_pai_crypto.c
+++ b/arch/s390/kernel/perf_pai_crypto.c
@@ -518,7 +518,8 @@ static void paicrypt_have_samples(void)
/* Called on schedule-in and schedule-out. No access to event structure,
* but for sampling only event CRYPTO_ALL is allowed.
*/
-static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
{
/* We started with a clean page on event installation. So read out
* results on schedule_out and if page was dirty, save old values.
diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
index a8f0bad99cf0..fd14d5ebccbc 100644
--- a/arch/s390/kernel/perf_pai_ext.c
+++ b/arch/s390/kernel/perf_pai_ext.c
@@ -542,7 +542,8 @@ static void paiext_have_samples(void)
/* Called on schedule-in and schedule-out. No access to event structure,
* but for sampling only event NNPA_ALL is allowed.
*/
-static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
{
/* We started with a clean page on event installation. So read out
* results on schedule_out and if page was dirty, save old values.
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 5ce9a795a0fe..745649ad9779 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -72,7 +72,7 @@ void notrace stop_machine_yield(const struct cpumask *cpumask)
this_cpu = smp_processor_id();
if (__this_cpu_inc_return(cpu_relax_retry) >= spin_retry) {
__this_cpu_write(cpu_relax_retry, 0);
- cpu = cpumask_next_wrap(this_cpu, cpumask, this_cpu, false);
+ cpu = cpumask_next_wrap(this_cpu, cpumask);
if (cpu >= nr_cpu_ids)
return;
if (arch_vcpu_is_preempted(cpu))
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index e9115b4d8b63..a4569b96ef06 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -469,3 +469,4 @@
464 common getxattrat sys_getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index e9f47c3a6197..699a18f1c54e 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -79,12 +79,10 @@ void __init time_early_init(void)
{
struct ptff_qto qto;
struct ptff_qui qui;
- int cs;
/* Initialize TOD steering parameters */
tod_steering_end = tod_clock_base.tod;
- for (cs = 0; cs < CS_BASES; cs++)
- vdso_data[cs].arch_data.tod_steering_end = tod_steering_end;
+ vdso_k_time_data->arch_data.tod_steering_end = tod_steering_end;
if (!test_facility(28))
return;
@@ -373,7 +371,6 @@ static void clock_sync_global(long delta)
{
unsigned long now, adj;
struct ptff_qto qto;
- int cs;
/* Fixup the monotonic sched clock. */
tod_clock_base.eitod += delta;
@@ -389,10 +386,8 @@ static void clock_sync_global(long delta)
panic("TOD clock sync offset %li is too large to drift\n",
tod_steering_delta);
tod_steering_end = now + (abs(tod_steering_delta) << 15);
- for (cs = 0; cs < CS_BASES; cs++) {
- vdso_data[cs].arch_data.tod_steering_end = tod_steering_end;
- vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta;
- }
+ vdso_k_time_data->arch_data.tod_steering_end = tod_steering_end;
+ vdso_k_time_data->arch_data.tod_steering_delta = tod_steering_delta;
/* Update LPAR offset. */
if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0)
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 9f05df2da2f7..9a5d5be8acf4 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -206,6 +206,39 @@ int uv_convert_from_secure_pte(pte_t pte)
return uv_convert_from_secure_folio(pfn_folio(pte_pfn(pte)));
}
+/**
+ * should_export_before_import - Determine whether an export is needed
+ * before an import-like operation
+ * @uvcb: the Ultravisor control block of the UVC to be performed
+ * @mm: the mm of the process
+ *
+ * Returns whether an export is needed before every import-like operation.
+ * This is needed for shared pages, which don't trigger a secure storage
+ * exception when accessed from a different guest.
+ *
+ * Although considered as one, the Unpin Page UVC is not an actual import,
+ * so it is not affected.
+ *
+ * No export is needed also when there is only one protected VM, because the
+ * page cannot belong to the wrong VM in that case (there is no "other VM"
+ * it can belong to).
+ *
+ * Return: true if an export is needed before every import, otherwise false.
+ */
+static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
+{
+ /*
+ * The misc feature indicates, among other things, that importing a
+ * shared page from a different protected VM will automatically also
+ * transfer its ownership.
+ */
+ if (uv_has_feature(BIT_UV_FEAT_MISC))
+ return false;
+ if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
+ return false;
+ return atomic_read(&mm->context.protected_count) > 1;
+}
+
/*
* Calculate the expected ref_count for a folio that would otherwise have no
* further pins. This was cribbed from similar functions in other places in
@@ -228,7 +261,7 @@ static int expected_folio_refs(struct folio *folio)
}
/**
- * make_folio_secure() - make a folio secure
+ * __make_folio_secure() - make a folio secure
* @folio: the folio to make secure
* @uvcb: the uvcb that describes the UVC to be used
*
@@ -237,20 +270,18 @@ static int expected_folio_refs(struct folio *folio)
*
* Return: 0 on success;
* -EBUSY if the folio is in writeback or has too many references;
- * -E2BIG if the folio is large;
* -EAGAIN if the UVC needs to be attempted again;
* -ENXIO if the address is not mapped;
* -EINVAL if the UVC failed for other reasons.
*
* Context: The caller must hold exactly one extra reference on the folio
- * (it's the same logic as split_folio())
+ * (it's the same logic as split_folio()), and the folio must be
+ * locked.
*/
-int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
+static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
{
int expected, cc = 0;
- if (folio_test_large(folio))
- return -E2BIG;
if (folio_test_writeback(folio))
return -EBUSY;
expected = expected_folio_refs(folio) + 1;
@@ -277,7 +308,98 @@ int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
return -EAGAIN;
return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
}
-EXPORT_SYMBOL_GPL(make_folio_secure);
+
+static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct uv_cb_header *uvcb)
+{
+ int rc;
+
+ if (!folio_trylock(folio))
+ return -EAGAIN;
+ if (should_export_before_import(uvcb, mm))
+ uv_convert_from_secure(folio_to_phys(folio));
+ rc = __make_folio_secure(folio, uvcb);
+ folio_unlock(folio);
+
+ return rc;
+}
+
+/**
+ * s390_wiggle_split_folio() - try to drain extra references to a folio and optionally split.
+ * @mm: the mm containing the folio to work on
+ * @folio: the folio
+ * @split: whether to split a large folio
+ *
+ * Context: Must be called while holding an extra reference to the folio;
+ * the mm lock should not be held.
+ * Return: 0 if the folio was split successfully;
+ * -EAGAIN if the folio was not split successfully but another attempt
+ * can be made, or if @split was set to false;
+ * -EINVAL in case of other errors. See split_folio().
+ */
+static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split)
+{
+ int rc;
+
+ lockdep_assert_not_held(&mm->mmap_lock);
+ folio_wait_writeback(folio);
+ lru_add_drain_all();
+ if (split) {
+ folio_lock(folio);
+ rc = split_folio(folio);
+ folio_unlock(folio);
+
+ if (rc != -EBUSY)
+ return rc;
+ }
+ return -EAGAIN;
+}
+
+int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb)
+{
+ struct vm_area_struct *vma;
+ struct folio_walk fw;
+ struct folio *folio;
+ int rc;
+
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, hva);
+ if (!vma) {
+ mmap_read_unlock(mm);
+ return -EFAULT;
+ }
+ folio = folio_walk_start(&fw, vma, hva, 0);
+ if (!folio) {
+ mmap_read_unlock(mm);
+ return -ENXIO;
+ }
+
+ folio_get(folio);
+ /*
+ * Secure pages cannot be huge and userspace should not combine both.
+ * In case userspace does it anyway this will result in an -EFAULT for
+ * the unpack. The guest is thus never reaching secure mode.
+ * If userspace plays dirty tricks and decides to map huge pages at a
+ * later point in time, it will receive a segmentation fault or
+ * KVM_RUN will return -EFAULT.
+ */
+ if (folio_test_hugetlb(folio))
+ rc = -EFAULT;
+ else if (folio_test_large(folio))
+ rc = -E2BIG;
+ else if (!pte_write(fw.pte) || (pte_val(fw.pte) & _PAGE_INVALID))
+ rc = -ENXIO;
+ else
+ rc = make_folio_secure(mm, folio, uvcb);
+ folio_walk_end(&fw, vma);
+ mmap_read_unlock(mm);
+
+ if (rc == -E2BIG || rc == -EBUSY)
+ rc = s390_wiggle_split_folio(mm, folio, rc == -E2BIG);
+ folio_put(folio);
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(make_hva_secure);
/*
* To be called with the folio locked or with an extra reference! This will
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 598b512cde01..70c8f9ad13cd 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -16,8 +16,8 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/smp.h>
-#include <linux/time_namespace.h>
#include <linux/random.h>
+#include <linux/vdso_datastore.h>
#include <vdso/datapage.h>
#include <asm/vdso/vsyscall.h>
#include <asm/alternative.h>
@@ -26,85 +26,6 @@
extern char vdso64_start[], vdso64_end[];
extern char vdso32_start[], vdso32_end[];
-static struct vm_special_mapping vvar_mapping;
-
-static union vdso_data_store vdso_data_store __page_aligned_data;
-
-struct vdso_data *vdso_data = vdso_data_store.data;
-
-#ifdef CONFIG_TIME_NS
-struct vdso_data *arch_get_vdso_data(void *vvar_page)
-{
- return (struct vdso_data *)(vvar_page);
-}
-
-/*
- * The VVAR page layout depends on whether a task belongs to the root or
- * non-root time namespace. Whenever a task changes its namespace, the VVAR
- * page tables are cleared and then they will be re-faulted with a
- * corresponding layout.
- * See also the comment near timens_setup_vdso_data() for details.
- */
-int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
-{
- struct mm_struct *mm = task->mm;
- VMA_ITERATOR(vmi, mm, 0);
- struct vm_area_struct *vma;
-
- mmap_read_lock(mm);
- for_each_vma(vmi, vma) {
- if (!vma_is_special_mapping(vma, &vvar_mapping))
- continue;
- zap_vma_pages(vma);
- break;
- }
- mmap_read_unlock(mm);
- return 0;
-}
-#endif
-
-static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct page *timens_page = find_timens_vvar_page(vma);
- unsigned long addr, pfn;
- vm_fault_t err;
-
- switch (vmf->pgoff) {
- case VVAR_DATA_PAGE_OFFSET:
- pfn = virt_to_pfn(vdso_data);
- if (timens_page) {
- /*
- * Fault in VVAR page too, since it will be accessed
- * to get clock data anyway.
- */
- addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE;
- err = vmf_insert_pfn(vma, addr, pfn);
- if (unlikely(err & VM_FAULT_ERROR))
- return err;
- pfn = page_to_pfn(timens_page);
- }
- break;
-#ifdef CONFIG_TIME_NS
- case VVAR_TIMENS_PAGE_OFFSET:
- /*
- * If a task belongs to a time namespace then a namespace
- * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
- * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
- * offset.
- * See also the comment near timens_setup_vdso_data().
- */
- if (!timens_page)
- return VM_FAULT_SIGBUS;
- pfn = virt_to_pfn(vdso_data);
- break;
-#endif /* CONFIG_TIME_NS */
- default:
- return VM_FAULT_SIGBUS;
- }
- return vmf_insert_pfn(vma, vmf->address, pfn);
-}
-
static int vdso_mremap(const struct vm_special_mapping *sm,
struct vm_area_struct *vma)
{
@@ -112,11 +33,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
return 0;
}
-static struct vm_special_mapping vvar_mapping = {
- .name = "[vvar]",
- .fault = vvar_fault,
-};
-
static struct vm_special_mapping vdso64_mapping = {
.name = "[vdso]",
.mremap = vdso_mremap,
@@ -142,7 +58,7 @@ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len)
struct vm_area_struct *vma;
int rc;
- BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
+ BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES);
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -157,14 +73,11 @@ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len)
rc = vvar_start;
if (IS_ERR_VALUE(vvar_start))
goto out;
- vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE,
- VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
- VM_PFNMAP,
- &vvar_mapping);
+ vma = vdso_install_vvar_mapping(mm, vvar_start);
rc = PTR_ERR(vma);
if (IS_ERR(vma))
goto out;
- vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE;
+ vdso_text_start = vvar_start + VDSO_NR_PAGES * PAGE_SIZE;
/* VM_MAYWRITE for COW so gdb can set breakpoints */
vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len,
VM_READ|VM_EXEC|
@@ -220,7 +133,7 @@ unsigned long vdso_text_size(void)
unsigned long vdso_size(void)
{
- return vdso_text_size() + VVAR_NR_PAGES * PAGE_SIZE;
+ return vdso_text_size() + VDSO_NR_PAGES * PAGE_SIZE;
}
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile
index 2c5afb88d298..1e4ddd1a683f 100644
--- a/arch/s390/kernel/vdso32/Makefile
+++ b/arch/s390/kernel/vdso32/Makefile
@@ -2,7 +2,7 @@
# List of files in the vdso
# Include the generic Makefile to check the built vdso.
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
obj-vdso32 = vdso_user_wrapper-32.o note-32.o
# Build rules
diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S
index c916c4f73f76..9630d58c2080 100644
--- a/arch/s390/kernel/vdso32/vdso32.lds.S
+++ b/arch/s390/kernel/vdso32/vdso32.lds.S
@@ -6,16 +6,15 @@
#include <asm/page.h>
#include <asm/vdso.h>
+#include <vdso/datapage.h>
OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390")
OUTPUT_ARCH(s390:31-bit)
SECTIONS
{
- PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
-#ifdef CONFIG_TIME_NS
- PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
-#endif
+ VDSO_VVAR_SYMS
+
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index ad206f2068d8..d8f0df742809 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -2,7 +2,7 @@
# List of files in the vdso
# Include the generic Makefile to check the built vdso.
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
obj-vdso64 = vdso_user_wrapper.o note.o vgetrandom-chacha.o
obj-cvdso64 = vdso64_generic.o getcpu.o vgetrandom.o
VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE)
diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S
index ec42b7d9cb53..e4f6551ae898 100644
--- a/arch/s390/kernel/vdso64/vdso64.lds.S
+++ b/arch/s390/kernel/vdso64/vdso64.lds.S
@@ -7,17 +7,15 @@
#include <asm/vdso/vsyscall.h>
#include <asm/page.h>
#include <asm/vdso.h>
+#include <vdso/datapage.h>
OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390")
OUTPUT_ARCH(s390:64-bit)
SECTIONS
{
- PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
- PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET);
-#ifdef CONFIG_TIME_NS
- PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
-#endif
+ VDSO_VVAR_SYMS
+
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
index 02adf151d4de..6d8944d1b4a0 100644
--- a/arch/s390/kvm/gmap.c
+++ b/arch/s390/kvm/gmap.c
@@ -23,116 +23,25 @@
#include "gmap.h"
/**
- * should_export_before_import - Determine whether an export is needed
- * before an import-like operation
- * @uvcb: the Ultravisor control block of the UVC to be performed
- * @mm: the mm of the process
- *
- * Returns whether an export is needed before every import-like operation.
- * This is needed for shared pages, which don't trigger a secure storage
- * exception when accessed from a different guest.
- *
- * Although considered as one, the Unpin Page UVC is not an actual import,
- * so it is not affected.
- *
- * No export is needed also when there is only one protected VM, because the
- * page cannot belong to the wrong VM in that case (there is no "other VM"
- * it can belong to).
- *
- * Return: true if an export is needed before every import, otherwise false.
- */
-static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
-{
- /*
- * The misc feature indicates, among other things, that importing a
- * shared page from a different protected VM will automatically also
- * transfer its ownership.
- */
- if (uv_has_feature(BIT_UV_FEAT_MISC))
- return false;
- if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
- return false;
- return atomic_read(&mm->context.protected_count) > 1;
-}
-
-static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb)
-{
- struct folio *folio = page_folio(page);
- int rc;
-
- /*
- * Secure pages cannot be huge and userspace should not combine both.
- * In case userspace does it anyway this will result in an -EFAULT for
- * the unpack. The guest is thus never reaching secure mode.
- * If userspace plays dirty tricks and decides to map huge pages at a
- * later point in time, it will receive a segmentation fault or
- * KVM_RUN will return -EFAULT.
- */
- if (folio_test_hugetlb(folio))
- return -EFAULT;
- if (folio_test_large(folio)) {
- mmap_read_unlock(gmap->mm);
- rc = kvm_s390_wiggle_split_folio(gmap->mm, folio, true);
- mmap_read_lock(gmap->mm);
- if (rc)
- return rc;
- folio = page_folio(page);
- }
-
- if (!folio_trylock(folio))
- return -EAGAIN;
- if (should_export_before_import(uvcb, gmap->mm))
- uv_convert_from_secure(folio_to_phys(folio));
- rc = make_folio_secure(folio, uvcb);
- folio_unlock(folio);
-
- /*
- * In theory a race is possible and the folio might have become
- * large again before the folio_trylock() above. In that case, no
- * action is performed and -EAGAIN is returned; the callers will
- * have to try again later.
- * In most cases this implies running the VM again, getting the same
- * exception again, and make another attempt in this function.
- * This is expected to happen extremely rarely.
- */
- if (rc == -E2BIG)
- return -EAGAIN;
- /* The folio has too many references, try to shake some off */
- if (rc == -EBUSY) {
- mmap_read_unlock(gmap->mm);
- kvm_s390_wiggle_split_folio(gmap->mm, folio, false);
- mmap_read_lock(gmap->mm);
- return -EAGAIN;
- }
-
- return rc;
-}
-
-/**
* gmap_make_secure() - make one guest page secure
* @gmap: the guest gmap
* @gaddr: the guest address that needs to be made secure
* @uvcb: the UVCB specifying which operation needs to be performed
*
* Context: needs to be called with kvm->srcu held.
- * Return: 0 on success, < 0 in case of error (see __gmap_make_secure()).
+ * Return: 0 on success, < 0 in case of error.
*/
int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
{
struct kvm *kvm = gmap->private;
- struct page *page;
- int rc = 0;
+ unsigned long vmaddr;
lockdep_assert_held(&kvm->srcu);
- page = gfn_to_page(kvm, gpa_to_gfn(gaddr));
- mmap_read_lock(gmap->mm);
- if (page)
- rc = __gmap_make_secure(gmap, page, uvcb);
- kvm_release_page_clean(page);
- mmap_read_unlock(gmap->mm);
-
- return rc;
+ vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr));
+ if (kvm_is_error_hva(vmaddr))
+ return -EFAULT;
+ return make_hva_secure(gmap->mm, vmaddr, uvcb);
}
int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 07ff0e10cb7f..0f00f8e85fee 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -3174,8 +3174,7 @@ void kvm_s390_gisa_init(struct kvm *kvm)
gi->alert.mask = 0;
spin_lock_init(&gi->alert.ref_lock);
gi->expires = 50 * 1000; /* 50 usec */
- hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- gi->timer.function = gisa_vcpu_kicker;
+ hrtimer_setup(&gi->timer, gisa_vcpu_kicker, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
memset(gi->origin, 0, sizeof(struct kvm_s390_gisa));
gi->origin->next_alert = (u32)virt_to_phys(gi->origin);
VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ebecb96bacce..48104e59648b 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -3943,8 +3943,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
if (rc)
return rc;
}
- hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
+ hrtimer_setup(&vcpu->arch.ckc_timer, kvm_s390_idle_wakeup, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
vcpu->arch.sie_block->hpid = HPID_KVM;
@@ -4952,6 +4952,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
{
unsigned int flags = 0;
unsigned long gaddr;
+ int rc;
gaddr = current->thread.gmap_teid.addr * PAGE_SIZE;
if (kvm_s390_cur_gmap_fault_is_write())
@@ -4961,16 +4962,6 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
case 0:
vcpu->stat.exit_null++;
break;
- case PGM_NON_SECURE_STORAGE_ACCESS:
- kvm_s390_assert_primary_as(vcpu);
- /*
- * This is normal operation; a page belonging to a protected
- * guest has not been imported yet. Try to import the page into
- * the protected guest.
- */
- if (gmap_convert_to_secure(vcpu->arch.gmap, gaddr) == -EINVAL)
- send_sig(SIGSEGV, current, 0);
- break;
case PGM_SECURE_STORAGE_ACCESS:
case PGM_SECURE_STORAGE_VIOLATION:
kvm_s390_assert_primary_as(vcpu);
@@ -4995,6 +4986,20 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
send_sig(SIGSEGV, current, 0);
}
break;
+ case PGM_NON_SECURE_STORAGE_ACCESS:
+ kvm_s390_assert_primary_as(vcpu);
+ /*
+ * This is normal operation; a page belonging to a protected
+ * guest has not been imported yet. Try to import the page into
+ * the protected guest.
+ */
+ rc = gmap_convert_to_secure(vcpu->arch.gmap, gaddr);
+ if (rc == -EINVAL)
+ send_sig(SIGSEGV, current, 0);
+ if (rc != -ENXIO)
+ break;
+ flags = FAULT_FLAG_WRITE;
+ fallthrough;
case PGM_PROTECTION:
case PGM_SEGMENT_TRANSLATION:
case PGM_PAGE_TRANSLATION:
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 94d927785800..d14b488e7a1f 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2626,31 +2626,3 @@ int s390_replace_asce(struct gmap *gmap)
return 0;
}
EXPORT_SYMBOL_GPL(s390_replace_asce);
-
-/**
- * kvm_s390_wiggle_split_folio() - try to drain extra references to a folio and optionally split
- * @mm: the mm containing the folio to work on
- * @folio: the folio
- * @split: whether to split a large folio
- *
- * Context: Must be called while holding an extra reference to the folio;
- * the mm lock should not be held.
- */
-int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split)
-{
- int rc;
-
- lockdep_assert_not_held(&mm->mmap_lock);
- folio_wait_writeback(folio);
- lru_add_drain_all();
- if (split) {
- folio_lock(folio);
- rc = split_folio(folio);
- folio_unlock(folio);
-
- if (rc != -EBUSY)
- return rc;
- }
- return -EAGAIN;
-}
-EXPORT_SYMBOL_GPL(kvm_s390_wiggle_split_folio);
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index c8cad33bf250..52a7652fcff6 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -470,3 +470,4 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 727f99d333b3..83e45eb6c095 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -512,3 +512,4 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr
diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile
index 50ec2978cda5..fdc4a8f5a49c 100644
--- a/arch/sparc/vdso/Makefile
+++ b/arch/sparc/vdso/Makefile
@@ -22,7 +22,7 @@ targets += $(foreach x, 32 64, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg)
CPPFLAGS_vdso.lds += -P -C
-VDSO_LDFLAGS_vdso.lds = -m elf64_sparc -soname linux-vdso.so.1 --no-undefined \
+VDSO_LDFLAGS_vdso.lds = -m elf64_sparc -soname linux-vdso.so.1 \
-z max-page-size=8192
$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
@@ -101,7 +101,6 @@ $(obj)/vdso32.so.dbg: FORCE \
quiet_cmd_vdso = VDSO $@
cmd_vdso = $(LD) -nostdlib -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
- -T $(filter %.lds,$^) $(filter %.o,$^) && \
- sh $(src)/checkundef.sh '$(OBJDUMP)' '$@'
+ -T $(filter %.lds,$^) $(filter %.o,$^)
-VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 -Bsymbolic
+VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 -Bsymbolic --no-undefined
diff --git a/arch/sparc/vdso/checkundef.sh b/arch/sparc/vdso/checkundef.sh
deleted file mode 100644
index 2d85876ffc32..000000000000
--- a/arch/sparc/vdso/checkundef.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/sh
-objdump="$1"
-file="$2"
-$objdump -t "$file" | grep '*UUND*' | grep -v '#scratch' > /dev/null 2>&1
-if [ $? -eq 1 ]; then
- exit 0
-else
- echo "$file: undefined symbols found" >&2
- exit 1
-fi
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 79ea97d4797e..8be91974e786 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -440,25 +440,24 @@ void __init arch_cpu_finalize_init(void)
os_check_bugs();
}
-void apply_seal_endbr(s32 *start, s32 *end, struct module *mod)
+void apply_seal_endbr(s32 *start, s32 *end)
{
}
-void apply_retpolines(s32 *start, s32 *end, struct module *mod)
+void apply_retpolines(s32 *start, s32 *end)
{
}
-void apply_returns(s32 *start, s32 *end, struct module *mod)
+void apply_returns(s32 *start, s32 *end)
{
}
void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi, struct module *mod)
+ s32 *start_cfi, s32 *end_cfi)
{
}
-void apply_alternatives(struct alt_instr *start, struct alt_instr *end,
- struct module *mod)
+void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
{
}
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index cf0ad89f5639..f7fb3d88c57b 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -1,4 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
+
+# Branch profiling isn't noinstr-safe. Disable it for arch/x86/*
+subdir-ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING
+
obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += coco/
obj-y += entry/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0e27ebd7e36a..05b4eca156cf 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -85,6 +85,7 @@ config X86
select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN
select ARCH_HAS_EARLY_DEBUG if KGDB
select ARCH_HAS_ELF_RANDOMIZE
+ select ARCH_HAS_EXECMEM_ROX if X86_64
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
@@ -132,7 +133,7 @@ config X86
select ARCH_SUPPORTS_AUTOFDO_CLANG
select ARCH_SUPPORTS_PROPELLER_CLANG if X86_64
select ARCH_USE_BUILTIN_BSWAP
- select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64
+ select ARCH_USE_CMPXCHG_LOCKREF if X86_CX8
select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
@@ -178,6 +179,7 @@ config X86
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
+ select GENERIC_VDSO_DATA_STORE
select GENERIC_VDSO_TIME_NS
select GENERIC_VDSO_OVERFLOW_PROTECT
select GUP_GET_PXX_LOW_HIGH if X86_PAE
@@ -232,7 +234,7 @@ config X86
select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64
select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
- select HAVE_EISA
+ select HAVE_EISA if X86_32
select HAVE_EXIT_THREAD
select HAVE_GUP_FAST
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
@@ -277,7 +279,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
+ select MMU_GATHER_RCU_TABLE_FREE
select MMU_GATHER_MERGE_VMAS
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_REGS_AND_STACK_ACCESS_API
@@ -285,7 +287,7 @@ config X86
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_SETUP_PER_CPU_AREA
select HAVE_SOFTIRQ_ON_OWN_STACK
- select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR
+ select HAVE_STACKPROTECTOR
select HAVE_STACK_VALIDATION if HAVE_OBJTOOL
select HAVE_STATIC_CALL
select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL
@@ -426,15 +428,6 @@ config PGTABLE_LEVELS
default 3 if X86_PAE
default 2
-config CC_HAS_SANE_STACKPROTECTOR
- bool
- default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) $(CLANG_FLAGS)) if 64BIT
- default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC) $(CLANG_FLAGS))
- help
- We have to make sure stack protector is unconditionally disabled if
- the compiler produces broken code or if it does not let us control
- the segment on 32-bit kernels.
-
menu "Processor type and features"
config SMP
@@ -505,6 +498,7 @@ config X86_CPU_RESCTRL
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
select KERNFS
select PROC_CPU_RESCTRL if PROC_FS
+ select RESCTRL_FS_PSEUDO_LOCK
help
Enable x86 CPU resource control support.
@@ -521,6 +515,12 @@ config X86_CPU_RESCTRL
Say N if unsure.
+config RESCTRL_FS_PSEUDO_LOCK
+ bool
+ help
+ Software mechanism to pin data in a cache portion using
+ micro-architecture specific knowledge.
+
config X86_FRED
bool "Flexible Return and Event Delivery"
depends on X86_64
@@ -530,12 +530,6 @@ config X86_FRED
ring transitions and exception/interrupt handling if the
system supports it.
-config X86_BIGSMP
- bool "Support for big SMP systems with more than 8 CPUs"
- depends on SMP && X86_32
- help
- This option is needed for the systems that have more than 8 CPUs.
-
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
default y
@@ -553,13 +547,12 @@ config X86_EXTENDED_PLATFORM
AMD Elan
RDC R-321x SoC
SGI 320/540 (Visual Workstation)
- STA2X11-based (e.g. Northville)
- Moorestown MID devices
64-bit platforms (CONFIG_64BIT=y):
Numascale NumaChip
ScaleMP vSMP
SGI Ultraviolet
+ Merrifield/Moorefield MID devices
If you have one of these systems, or if you want to build a
generic distribution kernel, say Y here - otherwise say N.
@@ -604,8 +597,31 @@ config X86_UV
This option is needed in order to support SGI Ultraviolet systems.
If you don't have one of these, you should say N here.
-# Following is an alphabetically sorted list of 32 bit extended platforms
-# Please maintain the alphabetic order if and when there are additions
+config X86_INTEL_MID
+ bool "Intel Z34xx/Z35xx MID platform support"
+ depends on X86_EXTENDED_PLATFORM
+ depends on X86_PLATFORM_DEVICES
+ depends on PCI
+ depends on X86_64 || (EXPERT && PCI_GOANY)
+ depends on X86_IO_APIC
+ select I2C
+ select DW_APB_TIMER
+ select INTEL_SCU_PCI
+ help
+ Select to build a kernel capable of supporting 64-bit Intel MID
+ (Mobile Internet Device) platform systems which do not have
+ the PCI legacy interfaces.
+
+ The only supported devices are the 22nm Merrified (Z34xx)
+ and Moorefield (Z35xx) SoC used in the Intel Edison board and
+ a small number of Android devices such as the Asus Zenfone 2,
+ Asus FonePad 8 and Dell Venue 7.
+
+ If you are building for a PC class system or non-MID tablet
+ SoCs like Bay Trail (Z36xx/Z37xx), say N here.
+
+ Intel MID platforms are based on an Intel processor and chipset which
+ consume less power than most of the x86 derivatives.
config X86_GOLDFISH
bool "Goldfish (Virtual Platform)"
@@ -615,6 +631,9 @@ config X86_GOLDFISH
for Android development. Unless you are building for the Android
Goldfish emulator say N here.
+# Following is an alphabetically sorted list of 32 bit extended platforms
+# Please maintain the alphabetic order if and when there are additions
+
config X86_INTEL_CE
bool "CE4100 TV platform"
depends on PCI
@@ -630,24 +649,6 @@ config X86_INTEL_CE
This option compiles in support for the CE4100 SOC for settop
boxes and media devices.
-config X86_INTEL_MID
- bool "Intel MID platform support"
- depends on X86_EXTENDED_PLATFORM
- depends on X86_PLATFORM_DEVICES
- depends on PCI
- depends on X86_64 || (PCI_GOANY && X86_32)
- depends on X86_IO_APIC
- select I2C
- select DW_APB_TIMER
- select INTEL_SCU_PCI
- help
- Select to build a kernel capable of supporting Intel MID (Mobile
- Internet Device) platform systems which do not have the PCI legacy
- interfaces. If you are building for a PC class system say N here.
-
- Intel MID platforms are based on an Intel processor and chipset which
- consume less power than most of the x86 derivatives.
-
config X86_INTEL_QUARK
bool "Intel Quark platform support"
depends on X86_32
@@ -729,18 +730,6 @@ config X86_RDC321X
as R-8610-(G).
If you don't have one of these chips, you should say N here.
-config X86_32_NON_STANDARD
- bool "Support non-standard 32-bit SMP architectures"
- depends on X86_32 && SMP
- depends on X86_EXTENDED_PLATFORM
- help
- This option compiles in the bigsmp and STA2X11 default
- subarchitectures. It is intended for a generic binary
- kernel. If you select them all, kernel will probe it one by
- one and will fallback to default.
-
-# Alphabetically sorted list of Non standard 32 bit platforms
-
config X86_SUPPORTS_MEMORY_FAILURE
def_bool y
# MCE code calls memory_failure():
@@ -750,19 +739,6 @@ config X86_SUPPORTS_MEMORY_FAILURE
depends on X86_64 || !SPARSEMEM
select ARCH_SUPPORTS_MEMORY_FAILURE
-config STA2X11
- bool "STA2X11 Companion Chip Support"
- depends on X86_32_NON_STANDARD && PCI
- select SWIOTLB
- select MFD_STA2X11
- select GPIOLIB
- help
- This adds support for boards based on the STA2X11 IO-Hub,
- a.k.a. "ConneXt". The chip is used in place of the standard
- PC chipset, so all "standard" peripherals are missing. If this
- option is selected the kernel will still be able to boot on
- standard PC machines.
-
config X86_32_IRIS
tristate "Eurobraille/Iris poweroff module"
depends on X86_32
@@ -1012,8 +988,7 @@ config NR_CPUS_RANGE_BEGIN
config NR_CPUS_RANGE_END
int
depends on X86_32
- default 64 if SMP && X86_BIGSMP
- default 8 if SMP && !X86_BIGSMP
+ default 8 if SMP
default 1 if !SMP
config NR_CPUS_RANGE_END
@@ -1026,7 +1001,6 @@ config NR_CPUS_RANGE_END
config NR_CPUS_DEFAULT
int
depends on X86_32
- default 32 if X86_BIGSMP
default 8 if SMP
default 1 if !SMP
@@ -1102,7 +1076,7 @@ config UP_LATE_INIT
config X86_UP_APIC
bool "Local APIC support on uniprocessors" if !PCI_MSI
default PCI_MSI
- depends on X86_32 && !SMP && !X86_32_NON_STANDARD
+ depends on X86_32 && !SMP
help
A local APIC (Advanced Programmable Interrupt Controller) is an
integrated interrupt controller in the CPU. If you have a single-CPU
@@ -1127,7 +1101,7 @@ config X86_UP_IOAPIC
config X86_LOCAL_APIC
def_bool y
- depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
+ depends on X86_64 || SMP || X86_UP_APIC || PCI_MSI
select IRQ_DOMAIN_HIERARCHY
config ACPI_MADT_WAKEUP
@@ -1396,15 +1370,11 @@ config X86_CPUID
with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
/dev/cpu/31/cpuid.
-choice
- prompt "High Memory Support"
- default HIGHMEM4G
+config HIGHMEM4G
+ bool "High Memory Support"
depends on X86_32
-
-config NOHIGHMEM
- bool "off"
help
- Linux can use up to 64 Gigabytes of physical memory on x86 systems.
+ Linux can use up to 4 Gigabytes of physical memory on x86 systems.
However, the address space of 32-bit x86 processors is only 4
Gigabytes large. That means that, if you have a large amount of
physical memory, not all of it can be "permanently mapped" by the
@@ -1420,38 +1390,9 @@ config NOHIGHMEM
possible.
If the machine has between 1 and 4 Gigabytes physical RAM, then
- answer "4GB" here.
-
- If more than 4 Gigabytes is used then answer "64GB" here. This
- selection turns Intel PAE (Physical Address Extension) mode on.
- PAE implements 3-level paging on IA32 processors. PAE is fully
- supported by Linux, PAE mode is implemented on all recent Intel
- processors (Pentium Pro and better). NOTE: If you say "64GB" here,
- then the kernel will not boot on CPUs that don't support PAE!
-
- The actual amount of total physical memory will either be
- auto detected or can be forced by using a kernel command line option
- such as "mem=256M". (Try "man bootparam" or see the documentation of
- your boot loader (lilo or loadlin) about how to pass options to the
- kernel at boot time.)
-
- If unsure, say "off".
-
-config HIGHMEM4G
- bool "4GB"
- help
- Select this if you have a 32-bit processor and between 1 and 4
- gigabytes of physical RAM.
-
-config HIGHMEM64G
- bool "64GB"
- depends on X86_HAVE_PAE
- select X86_PAE
- help
- Select this if you have a 32-bit processor and more than 4
- gigabytes of physical RAM.
+ answer "Y" here.
-endchoice
+ If unsure, say N.
choice
prompt "Memory split" if EXPERT
@@ -1497,14 +1438,12 @@ config PAGE_OFFSET
depends on X86_32
config HIGHMEM
- def_bool y
- depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
+ def_bool HIGHMEM4G
config X86_PAE
bool "PAE (Physical Address Extension) Support"
depends on X86_32 && X86_HAVE_PAE
select PHYS_ADDR_T_64BIT
- select SWIOTLB
help
PAE is required for NX support, and furthermore enables
larger swapspace support for non-overcommit purposes. It
@@ -1574,8 +1513,7 @@ config AMD_MEM_ENCRYPT
config NUMA
bool "NUMA Memory Allocation and Scheduler Support"
depends on SMP
- depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP)
- default y if X86_BIGSMP
+ depends on X86_64
select USE_PERCPU_NUMA_NODE_ID
select OF_NUMA if OF
help
@@ -1588,9 +1526,6 @@ config NUMA
For 64-bit this is recommended if the system is Intel Core i7
(or later), AMD Opteron, or EM64T NUMA.
- For 32-bit this is only needed if you boot a 32-bit
- kernel on a 64-bit NUMA platform.
-
Otherwise, you should say N.
config AMD_NUMA
@@ -1629,7 +1564,7 @@ config ARCH_FLATMEM_ENABLE
config ARCH_SPARSEMEM_ENABLE
def_bool y
- depends on X86_64 || NUMA || X86_32 || X86_32_NON_STANDARD
+ depends on X86_64 || NUMA || X86_32
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
@@ -1675,15 +1610,6 @@ config X86_PMEM_LEGACY
Say Y if unsure.
-config HIGHPTE
- bool "Allocate 3rd-level pagetables from highmem"
- depends on HIGHMEM
- help
- The VM uses one page table entry for each page of physical memory.
- For systems with a lot of RAM, this can be wasteful of precious
- low memory. Setting this option will put user-space page table
- entries in high memory.
-
config X86_CHECK_BIOS_CORRUPTION
bool "Check for low memory corruption"
help
@@ -2451,18 +2377,20 @@ config CC_HAS_NAMED_AS
def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC) -x c - -S -o /dev/null)
depends on CC_IS_GCC
+#
+# -fsanitize=kernel-address (KASAN) and -fsanitize=thread (KCSAN)
+# are incompatible with named address spaces with GCC < 13.3
+# (see GCC PR sanitizer/111736 and also PR sanitizer/115172).
+#
+
config CC_HAS_NAMED_AS_FIXED_SANITIZERS
- def_bool CC_IS_GCC && GCC_VERSION >= 130300
+ def_bool y
+ depends on !(KASAN || KCSAN) || GCC_VERSION >= 130300
+ depends on !(UBSAN_BOOL && KASAN) || GCC_VERSION >= 140200
config USE_X86_SEG_SUPPORT
- def_bool y
- depends on CC_HAS_NAMED_AS
- #
- # -fsanitize=kernel-address (KASAN) and -fsanitize=thread
- # (KCSAN) are incompatible with named address spaces with
- # GCC < 13.3 - see GCC PR sanitizer/111736.
- #
- depends on !(KASAN || KCSAN) || CC_HAS_NAMED_AS_FIXED_SANITIZERS
+ def_bool CC_HAS_NAMED_AS
+ depends on CC_HAS_NAMED_AS_FIXED_SANITIZERS
config CC_HAS_SLS
def_bool $(cc-option,-mharden-sls=all)
@@ -2473,6 +2401,10 @@ config CC_HAS_RETURN_THUNK
config CC_HAS_ENTRY_PADDING
def_bool $(cc-option,-fpatchable-function-entry=16,16)
+config CC_HAS_KCFI_ARITY
+ def_bool $(cc-option,-fsanitize=kcfi -fsanitize-kcfi-arity)
+ depends on CC_IS_CLANG && !RUST
+
config FUNCTION_PADDING_CFI
int
default 59 if FUNCTION_ALIGNMENT_64B
@@ -2498,6 +2430,10 @@ config FINEIBT
depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE
select CALL_PADDING
+config FINEIBT_BHI
+ def_bool y
+ depends on FINEIBT && CC_HAS_KCFI_ARITY
+
config HAVE_CALL_THUNKS
def_bool y
depends on CC_HAS_ENTRY_PADDING && MITIGATION_RETHUNK && OBJTOOL
@@ -3202,4 +3138,6 @@ config HAVE_ATOMIC_IOMAP
source "arch/x86/kvm/Kconfig"
+source "arch/x86/Kconfig.cpufeatures"
+
source "arch/x86/Kconfig.assembler"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2a7279d80460..753b8763abae 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -1,9 +1,9 @@
# SPDX-License-Identifier: GPL-2.0
# Put here option for CPU selection and depending optimization
choice
- prompt "Processor family"
- default M686 if X86_32
- default GENERIC_CPU if X86_64
+ prompt "x86-32 Processor family"
+ depends on X86_32
+ default M686
help
This is the processor type of your CPU. This information is
used for optimizing purposes. In order to compile a kernel
@@ -31,7 +31,6 @@ choice
- "Pentium-4" for the Intel Pentium 4 or P4-based Celeron.
- "K6" for the AMD K6, K6-II and K6-III (aka K6-3D).
- "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird).
- - "Opteron/Athlon64/Hammer/K8" for all K8 and newer AMD CPUs.
- "Crusoe" for the Transmeta Crusoe series.
- "Efficeon" for the Transmeta Efficeon series.
- "Winchip-C6" for original IDT Winchip.
@@ -42,13 +41,10 @@ choice
- "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
- "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above).
- "VIA C7" for VIA C7.
- - "Intel P4" for the Pentium 4/Netburst microarchitecture.
- - "Core 2/newer Xeon" for all core2 and newer Intel CPUs.
- "Intel Atom" for the Atom-microarchitecture CPUs.
- - "Generic-x86-64" for a kernel which runs on any x86-64 CPU.
See each option's help text for additional details. If you don't know
- what to do, choose "486".
+ what to do, choose "Pentium-Pro".
config M486SX
bool "486SX"
@@ -114,11 +110,11 @@ config MPENTIUMIII
extensions.
config MPENTIUMM
- bool "Pentium M"
+ bool "Pentium M/Pentium Dual Core/Core Solo/Core Duo"
depends on X86_32
help
Select this for Intel Pentium M (not Pentium-4 M)
- notebook chips.
+ "Merom" Core Solo/Duo notebook chips
config MPENTIUM4
bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon"
@@ -139,22 +135,10 @@ config MPENTIUM4
-Mobile Pentium 4
-Mobile Pentium 4 M
-Extreme Edition (Gallatin)
- -Prescott
- -Prescott 2M
- -Cedar Mill
- -Presler
- -Smithfiled
Xeons (Intel Xeon, Xeon MP, Xeon LV, Xeon MV) corename:
-Foster
-Prestonia
-Gallatin
- -Nocona
- -Irwindale
- -Cranford
- -Potomac
- -Paxville
- -Dempsey
-
config MK6
bool "K6/K6-II/K6-III"
@@ -172,13 +156,6 @@ config MK7
some extended instructions, and passes appropriate optimization
flags to GCC.
-config MK8
- bool "Opteron/Athlon64/Hammer/K8"
- help
- Select this for an AMD Opteron or Athlon64 Hammer-family processor.
- Enables use of some extended instructions, and passes appropriate
- optimization flags to GCC.
-
config MCRUSOE
bool "Crusoe"
depends on X86_32
@@ -258,42 +235,14 @@ config MVIAC7
Select this for a VIA C7. Selecting this uses the correct cache
shift and tells gcc to treat the CPU as a 686.
-config MPSC
- bool "Intel P4 / older Netburst based Xeon"
- depends on X86_64
- help
- Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey
- Xeon CPUs with Intel 64bit which is compatible with x86-64.
- Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the
- Netburst core and shouldn't use this option. You can distinguish them
- using the cpu family field
- in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
-
-config MCORE2
- bool "Core 2/newer Xeon"
- help
-
- Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
- 53xx) CPUs. You can distinguish newer from older Xeons by the CPU
- family in /proc/cpuinfo. Newer ones have 6 and older ones 15
- (not a typo)
-
config MATOM
bool "Intel Atom"
help
-
Select this for the Intel Atom platform. Intel Atom CPUs have an
in-order pipelining architecture and thus can benefit from
accordingly optimized code. Use a recent GCC with specific Atom
support in order to fully benefit from selecting this option.
-config GENERIC_CPU
- bool "Generic-x86-64"
- depends on X86_64
- help
- Generic x86-64 CPU.
- Run equally well on all x86-64 CPUs.
-
endchoice
config X86_GENERIC
@@ -317,8 +266,8 @@ config X86_INTERNODE_CACHE_SHIFT
config X86_L1_CACHE_SHIFT
int
- default "7" if MPENTIUM4 || MPSC
- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
+ default "7" if MPENTIUM4
+ default "6" if MK7 || MPENTIUMM || MATOM || MVIAC7 || X86_GENERIC || X86_64
default "4" if MELAN || M486SX || M486 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
@@ -336,51 +285,35 @@ config X86_ALIGNMENT_16
config X86_INTEL_USERCOPY
def_bool y
- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK7 || MEFFICEON
config X86_USE_PPRO_CHECKSUM
def_bool y
- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
-
-#
-# P6_NOPs are a relatively minor optimization that require a family >=
-# 6 processor, except that it is broken on certain VIA chips.
-# Furthermore, AMD chips prefer a totally different sequence of NOPs
-# (which work on all CPUs). In addition, it looks like Virtual PC
-# does not understand them.
-#
-# As a result, disallow these if we're not compiling for X86_64 (these
-# NOPs do work on all x86-64 capable chips); the list of processors in
-# the right-hand clause are the cores that benefit from this optimization.
-#
-config X86_P6_NOP
- def_bool y
- depends on X86_64
- depends on (MCORE2 || MPENTIUM4 || MPSC)
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MATOM
config X86_TSC
def_bool y
- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
+ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MATOM) || X86_64
config X86_HAVE_PAE
def_bool y
- depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC7 || MCORE2 || MATOM || X86_64
+ depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC7 || MATOM || X86_64
-config X86_CMPXCHG64
+config X86_CX8
def_bool y
- depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7
+ depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 || MGEODEGX1 || MGEODE_LX
# this should be set for all -march=.. options where the compiler
# generates cmov.
config X86_CMOV
def_bool y
- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
+ depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || MATOM || MGEODE_LX || X86_64)
config X86_MINIMUM_CPU_FAMILY
int
default "64" if X86_64
- default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8)
- default "5" if X86_32 && X86_CMPXCHG64
+ default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MK7)
+ default "5" if X86_32 && X86_CX8
default "4"
config X86_DEBUGCTLMSR
@@ -401,6 +334,10 @@ menuconfig PROCESSOR_SELECT
This lets you choose what x86 vendor support code your kernel
will include.
+config BROADCAST_TLB_FLUSH
+ def_bool y
+ depends on CPU_SUP_AMD && 64BIT
+
config CPU_SUP_INTEL
default y
bool "Support Intel processors" if PROCESSOR_SELECT
diff --git a/arch/x86/Kconfig.cpufeatures b/arch/x86/Kconfig.cpufeatures
new file mode 100644
index 000000000000..e12d5b7e39a2
--- /dev/null
+++ b/arch/x86/Kconfig.cpufeatures
@@ -0,0 +1,201 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# x86 feature bits (see arch/x86/include/asm/cpufeatures.h) that are
+# either REQUIRED to be enabled, or DISABLED (always ignored) for this
+# particular compile-time configuration. The tests for these features
+# are turned into compile-time constants via the generated
+# <asm/cpufeaturemasks.h>.
+#
+# The naming of these variables *must* match asm/cpufeatures.h, e.g.,
+# X86_FEATURE_ALWAYS <==> X86_REQUIRED_FEATURE_ALWAYS
+# X86_FEATURE_FRED <==> X86_DISABLED_FEATURE_FRED
+#
+# And these REQUIRED and DISABLED config options are manipulated in an
+# AWK script as the following example:
+#
+# +----------------------+
+# | X86_FRED = y ? |
+# +----------------------+
+# / \
+# Y / \ N
+# +-------------------------------------+ +-------------------------------+
+# | X86_DISABLED_FEATURE_FRED undefined | | X86_DISABLED_FEATURE_FRED = y |
+# +-------------------------------------+ +-------------------------------+
+# |
+# |
+# +-------------------------------------------+ |
+# | X86_FEATURE_FRED: feature word 12, bit 17 | ---->|
+# +-------------------------------------------+ |
+# |
+# |
+# +-------------------------------+
+# | set bit 17 of DISABLED_MASK12 |
+# +-------------------------------+
+#
+
+config X86_REQUIRED_FEATURE_ALWAYS
+ def_bool y
+
+config X86_REQUIRED_FEATURE_NOPL
+ def_bool y
+ depends on X86_64 || X86_P6_NOP
+
+config X86_REQUIRED_FEATURE_CX8
+ def_bool y
+ depends on X86_CX8
+
+# this should be set for all -march=.. options where the compiler
+# generates cmov.
+config X86_REQUIRED_FEATURE_CMOV
+ def_bool y
+ depends on X86_CMOV
+
+# this should be set for all -march= options where the compiler
+# generates movbe.
+config X86_REQUIRED_FEATURE_MOVBE
+ def_bool y
+ depends on MATOM
+
+config X86_REQUIRED_FEATURE_CPUID
+ def_bool y
+ depends on X86_64
+
+config X86_REQUIRED_FEATURE_UP
+ def_bool y
+ depends on !SMP
+
+config X86_REQUIRED_FEATURE_FPU
+ def_bool y
+ depends on !MATH_EMULATION
+
+config X86_REQUIRED_FEATURE_PAE
+ def_bool y
+ depends on X86_64 || X86_PAE
+
+config X86_REQUIRED_FEATURE_PSE
+ def_bool y
+ depends on X86_64 && !PARAVIRT_XXL
+
+config X86_REQUIRED_FEATURE_PGE
+ def_bool y
+ depends on X86_64 && !PARAVIRT_XXL
+
+config X86_REQUIRED_FEATURE_MSR
+ def_bool y
+ depends on X86_64
+
+config X86_REQUIRED_FEATURE_FXSR
+ def_bool y
+ depends on X86_64
+
+config X86_REQUIRED_FEATURE_XMM
+ def_bool y
+ depends on X86_64
+
+config X86_REQUIRED_FEATURE_XMM2
+ def_bool y
+ depends on X86_64
+
+config X86_REQUIRED_FEATURE_LM
+ def_bool y
+ depends on X86_64
+
+config X86_DISABLED_FEATURE_UMIP
+ def_bool y
+ depends on !X86_UMIP
+
+config X86_DISABLED_FEATURE_VME
+ def_bool y
+ depends on X86_64
+
+config X86_DISABLED_FEATURE_K6_MTRR
+ def_bool y
+ depends on X86_64
+
+config X86_DISABLED_FEATURE_CYRIX_ARR
+ def_bool y
+ depends on X86_64
+
+config X86_DISABLED_FEATURE_CENTAUR_MCR
+ def_bool y
+ depends on X86_64
+
+config X86_DISABLED_FEATURE_PCID
+ def_bool y
+ depends on !X86_64
+
+config X86_DISABLED_FEATURE_PKU
+ def_bool y
+ depends on !X86_INTEL_MEMORY_PROTECTION_KEYS
+
+config X86_DISABLED_FEATURE_OSPKE
+ def_bool y
+ depends on !X86_INTEL_MEMORY_PROTECTION_KEYS
+
+config X86_DISABLED_FEATURE_LA57
+ def_bool y
+ depends on !X86_5LEVEL
+
+config X86_DISABLED_FEATURE_PTI
+ def_bool y
+ depends on !MITIGATION_PAGE_TABLE_ISOLATION
+
+config X86_DISABLED_FEATURE_RETPOLINE
+ def_bool y
+ depends on !MITIGATION_RETPOLINE
+
+config X86_DISABLED_FEATURE_RETPOLINE_LFENCE
+ def_bool y
+ depends on !MITIGATION_RETPOLINE
+
+config X86_DISABLED_FEATURE_RETHUNK
+ def_bool y
+ depends on !MITIGATION_RETHUNK
+
+config X86_DISABLED_FEATURE_UNRET
+ def_bool y
+ depends on !MITIGATION_UNRET_ENTRY
+
+config X86_DISABLED_FEATURE_CALL_DEPTH
+ def_bool y
+ depends on !MITIGATION_CALL_DEPTH_TRACKING
+
+config X86_DISABLED_FEATURE_LAM
+ def_bool y
+ depends on !ADDRESS_MASKING
+
+config X86_DISABLED_FEATURE_ENQCMD
+ def_bool y
+ depends on !INTEL_IOMMU_SVM
+
+config X86_DISABLED_FEATURE_SGX
+ def_bool y
+ depends on !X86_SGX
+
+config X86_DISABLED_FEATURE_XENPV
+ def_bool y
+ depends on !XEN_PV
+
+config X86_DISABLED_FEATURE_TDX_GUEST
+ def_bool y
+ depends on !INTEL_TDX_GUEST
+
+config X86_DISABLED_FEATURE_USER_SHSTK
+ def_bool y
+ depends on !X86_USER_SHADOW_STACK
+
+config X86_DISABLED_FEATURE_IBT
+ def_bool y
+ depends on !X86_KERNEL_IBT
+
+config X86_DISABLED_FEATURE_FRED
+ def_bool y
+ depends on !X86_FRED
+
+config X86_DISABLED_FEATURE_SEV_SNP
+ def_bool y
+ depends on !KVM_AMD_SEV
+
+config X86_DISABLED_FEATURE_INVLPGB
+ def_bool y
+ depends on !BROADCAST_TLB_FLUSH
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5b773b34768d..0fc7e8fd1a2e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -137,17 +137,12 @@ ifeq ($(CONFIG_X86_32),y)
include $(srctree)/arch/x86/Makefile_32.cpu
KBUILD_CFLAGS += $(cflags-y)
- # temporary until string.h is fixed
+ ifneq ($(call clang-min-version, 160000),y)
+ # https://github.com/llvm/llvm-project/issues/53645
KBUILD_CFLAGS += -ffreestanding
-
- ifeq ($(CONFIG_STACKPROTECTOR),y)
- ifeq ($(CONFIG_SMP),y)
- KBUILD_CFLAGS += -mstack-protector-guard-reg=fs \
- -mstack-protector-guard-symbol=__ref_stack_chk_guard
- else
- KBUILD_CFLAGS += -mstack-protector-guard=global
- endif
endif
+
+ percpu_seg := fs
else
BITS := 64
UTS_MACHINE := x86_64
@@ -178,25 +173,24 @@ else
# Use -mskip-rax-setup if supported.
KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
- # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
- cflags-$(CONFIG_MK8) += -march=k8
- cflags-$(CONFIG_MPSC) += -march=nocona
- cflags-$(CONFIG_MCORE2) += -march=core2
- cflags-$(CONFIG_MATOM) += -march=atom
- cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic
- KBUILD_CFLAGS += $(cflags-y)
-
- rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8
- rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona
- rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2
- rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom
- rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic
- KBUILD_RUSTFLAGS += $(rustflags-y)
+ KBUILD_CFLAGS += -march=x86-64 -mtune=generic
+ KBUILD_RUSTFLAGS += -Ctarget-cpu=x86-64 -Ztune-cpu=generic
KBUILD_CFLAGS += -mno-red-zone
KBUILD_CFLAGS += -mcmodel=kernel
KBUILD_RUSTFLAGS += -Cno-redzone=y
KBUILD_RUSTFLAGS += -Ccode-model=kernel
+
+ percpu_seg := gs
+endif
+
+ifeq ($(CONFIG_STACKPROTECTOR),y)
+ ifeq ($(CONFIG_SMP),y)
+ KBUILD_CFLAGS += -mstack-protector-guard-reg=$(percpu_seg)
+ KBUILD_CFLAGS += -mstack-protector-guard-symbol=__ref_stack_chk_guard
+ else
+ KBUILD_CFLAGS += -mstack-protector-guard=global
+ endif
endif
#
@@ -277,6 +271,21 @@ archheaders:
$(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all
###
+# <asm/cpufeaturemasks.h> header generation
+
+cpufeaturemasks.hdr := arch/x86/include/generated/asm/cpufeaturemasks.h
+cpufeaturemasks.awk := $(srctree)/arch/x86/tools/cpufeaturemasks.awk
+cpufeatures_hdr := $(srctree)/arch/x86/include/asm/cpufeatures.h
+targets += $(cpufeaturemasks.hdr)
+quiet_cmd_gen_featuremasks = GEN $@
+ cmd_gen_featuremasks = $(AWK) -f $(cpufeaturemasks.awk) $(cpufeatures_hdr) $(KCONFIG_CONFIG) > $@
+
+$(cpufeaturemasks.hdr): $(cpufeaturemasks.awk) $(cpufeatures_hdr) $(KCONFIG_CONFIG) FORCE
+ $(shell mkdir -p $(dir $@))
+ $(call if_changed,gen_featuremasks)
+archprepare: $(cpufeaturemasks.hdr)
+
+###
# Kernel objects
libs-y += arch/x86/lib/
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 94834c4b5e5e..af7de9a42752 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -24,7 +24,6 @@ cflags-$(CONFIG_MK6) += -march=k6
# Please note, that patches that add -march=athlon-xp and friends are pointless.
# They make zero difference whatsosever to performance at this time.
cflags-$(CONFIG_MK7) += -march=athlon
-cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)
cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)
cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
@@ -32,9 +31,7 @@ cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)
cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
cflags-$(CONFIG_MVIAC7) += -march=i686
-cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2)
-cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \
- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
+cflags-$(CONFIG_MATOM) += -march=atom
# AMD Elan support
cflags-$(CONFIG_MELAN) += -march=i486
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 1189be057ebd..070ef534c915 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -12,3 +12,4 @@ fdimage
mtools.conf
image.iso
hdimage
+tools/
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 9cc0ff6e9067..8589471b65a1 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -35,7 +35,6 @@ setup-y += video-vesa.o
setup-y += video-bios.o
targets += $(setup-y)
-hostprogs := tools/build
hostprogs += mkcpustr
HOST_EXTRACFLAGS += -I$(srctree)/tools/include \
@@ -61,11 +60,9 @@ KBUILD_CFLAGS += $(CONFIG_CC_IMPLICIT_FALLTHROUGH)
$(obj)/bzImage: asflags-y := $(SVGA_MODE)
quiet_cmd_image = BUILD $@
-silent_redirect_image = >/dev/null
-cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
- $(obj)/zoffset.h $@ $($(quiet)redirect_image)
+ cmd_image = cp $< $@; truncate -s %4K $@; cat $(obj)/vmlinux.bin >>$@
-$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
+$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin FORCE
$(call if_changed,image)
@$(kecho) 'Kernel: $@ is ready' ' (#'$(or $(KBUILD_BUILD_VERSION),`cat .version`)')'
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 0f24f7ebec9b..38f17a1e1e36 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -16,7 +16,7 @@
#define STACK_SIZE 1024 /* Minimum number of bytes for stack */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/stdarg.h>
#include <linux/types.h>
@@ -327,6 +327,6 @@ void probe_cards(int unsafe);
/* video-vesa.c */
void vesa_store_edid(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* BOOT_BOOT_H */
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 606c74f27459..0e0b238e8363 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -98,6 +98,7 @@ ifdef CONFIG_X86_64
vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o
vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o
+ vmlinux-objs-y += $(obj)/la57toggle.o
endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 1dcb794c5479..3dc86352cdbe 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -483,110 +483,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
jmp *%rax
SYM_FUNC_END(.Lrelocated)
-/*
- * This is the 32-bit trampoline that will be copied over to low memory. It
- * will be called using the ordinary 64-bit calling convention from code
- * running in 64-bit mode.
- *
- * Return address is at the top of the stack (might be above 4G).
- * The first argument (EDI) contains the address of the temporary PGD level
- * page table in 32-bit addressable memory which will be programmed into
- * register CR3.
- */
- .section ".rodata", "a", @progbits
-SYM_CODE_START(trampoline_32bit_src)
- /*
- * Preserve callee save 64-bit registers on the stack: this is
- * necessary because the architecture does not guarantee that GPRs will
- * retain their full 64-bit values across a 32-bit mode switch.
- */
- pushq %r15
- pushq %r14
- pushq %r13
- pushq %r12
- pushq %rbp
- pushq %rbx
-
- /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */
- movq %rsp, %rbx
- shrq $32, %rbx
-
- /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
- pushq $__KERNEL32_CS
- leaq 0f(%rip), %rax
- pushq %rax
- lretq
-
- /*
- * The 32-bit code below will do a far jump back to long mode and end
- * up here after reconfiguring the number of paging levels. First, the
- * stack pointer needs to be restored to its full 64-bit value before
- * the callee save register contents can be popped from the stack.
- */
-.Lret:
- shlq $32, %rbx
- orq %rbx, %rsp
-
- /* Restore the preserved 64-bit registers */
- popq %rbx
- popq %rbp
- popq %r12
- popq %r13
- popq %r14
- popq %r15
- retq
-
.code32
-0:
- /* Disable paging */
- movl %cr0, %eax
- btrl $X86_CR0_PG_BIT, %eax
- movl %eax, %cr0
-
- /* Point CR3 to the trampoline's new top level page table */
- movl %edi, %cr3
-
- /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */
- movl $MSR_EFER, %ecx
- rdmsr
- btsl $_EFER_LME, %eax
- /* Avoid writing EFER if no change was made (for TDX guest) */
- jc 1f
- wrmsr
-1:
- /* Toggle CR4.LA57 */
- movl %cr4, %eax
- btcl $X86_CR4_LA57_BIT, %eax
- movl %eax, %cr4
-
- /* Enable paging again. */
- movl %cr0, %eax
- btsl $X86_CR0_PG_BIT, %eax
- movl %eax, %cr0
-
- /*
- * Return to the 64-bit calling code using LJMP rather than LRET, to
- * avoid the need for a 32-bit addressable stack. The destination
- * address will be adjusted after the template code is copied into a
- * 32-bit addressable buffer.
- */
-.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src)
-SYM_CODE_END(trampoline_32bit_src)
-
-/*
- * This symbol is placed right after trampoline_32bit_src() so its address can
- * be used to infer the size of the trampoline code.
- */
-SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src)
-
- /*
- * The trampoline code has a size limit.
- * Make sure we fail to compile if the trampoline code grows
- * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes.
- */
- .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
-
- .text
SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode)
/* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
1:
diff --git a/arch/x86/boot/compressed/la57toggle.S b/arch/x86/boot/compressed/la57toggle.S
new file mode 100644
index 000000000000..9ee002387eb1
--- /dev/null
+++ b/arch/x86/boot/compressed/la57toggle.S
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/boot.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+#include "pgtable.h"
+
+/*
+ * This is the 32-bit trampoline that will be copied over to low memory. It
+ * will be called using the ordinary 64-bit calling convention from code
+ * running in 64-bit mode.
+ *
+ * Return address is at the top of the stack (might be above 4G).
+ * The first argument (EDI) contains the address of the temporary PGD level
+ * page table in 32-bit addressable memory which will be programmed into
+ * register CR3.
+ */
+
+ .section ".rodata", "a", @progbits
+SYM_CODE_START(trampoline_32bit_src)
+ /*
+ * Preserve callee save 64-bit registers on the stack: this is
+ * necessary because the architecture does not guarantee that GPRs will
+ * retain their full 64-bit values across a 32-bit mode switch.
+ */
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbp
+ pushq %rbx
+
+ /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */
+ movq %rsp, %rbx
+ shrq $32, %rbx
+
+ /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
+ pushq $__KERNEL32_CS
+ leaq 0f(%rip), %rax
+ pushq %rax
+ lretq
+
+ /*
+ * The 32-bit code below will do a far jump back to long mode and end
+ * up here after reconfiguring the number of paging levels. First, the
+ * stack pointer needs to be restored to its full 64-bit value before
+ * the callee save register contents can be popped from the stack.
+ */
+.Lret:
+ shlq $32, %rbx
+ orq %rbx, %rsp
+
+ /* Restore the preserved 64-bit registers */
+ popq %rbx
+ popq %rbp
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ retq
+
+ .code32
+0:
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Point CR3 to the trampoline's new top level page table */
+ movl %edi, %cr3
+
+ /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ /* Avoid writing EFER if no change was made (for TDX guest) */
+ jc 1f
+ wrmsr
+1:
+ /* Toggle CR4.LA57 */
+ movl %cr4, %eax
+ btcl $X86_CR4_LA57_BIT, %eax
+ movl %eax, %cr4
+
+ /* Enable paging again. */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /*
+ * Return to the 64-bit calling code using LJMP rather than LRET, to
+ * avoid the need for a 32-bit addressable stack. The destination
+ * address will be adjusted after the template code is copied into a
+ * 32-bit addressable buffer.
+ */
+.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src)
+SYM_CODE_END(trampoline_32bit_src)
+
+/*
+ * This symbol is placed right after trampoline_32bit_src() so its address can
+ * be used to infer the size of the trampoline code.
+ */
+SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src)
+
+ /*
+ * The trampoline code has a size limit.
+ * Make sure we fail to compile if the trampoline code grows
+ * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes.
+ */
+ .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 0d37420cad02..1cdcd4aaf395 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -235,7 +235,7 @@ static void handle_relocations(void *output, unsigned long output_len,
/*
* Process relocations: 32 bit relocations first then 64 bit after.
- * Three sets of binary relocations are added to the end of the kernel
+ * Two sets of binary relocations are added to the end of the kernel
* before compression. Each relocation table entry is the kernel
* address of the location which needs to be updated stored as a
* 32-bit value which is sign extended to 64 bits.
@@ -245,8 +245,6 @@ static void handle_relocations(void *output, unsigned long output_len,
* kernel bits...
* 0 - zero terminator for 64 bit relocations
* 64 bit relocation repeated
- * 0 - zero terminator for inverse 32 bit relocations
- * 32 bit inverse relocation repeated
* 0 - zero terminator for 32 bit relocations
* 32 bit relocation repeated
*
@@ -263,16 +261,6 @@ static void handle_relocations(void *output, unsigned long output_len,
*(uint32_t *)ptr += delta;
}
#ifdef CONFIG_X86_64
- while (*--reloc) {
- long extended = *reloc;
- extended += map;
-
- ptr = (unsigned long)extended;
- if (ptr < min_addr || ptr > max_addr)
- error("inverse 32-bit relocation outside of kernel!\n");
-
- *(int32_t *)ptr -= delta;
- }
for (reloc--; *reloc; reloc--) {
long extended = *reloc;
extended += map;
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 083ec6d7722a..3b2bc61c9408 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -48,7 +48,7 @@ SECTIONS
*(.data)
*(.data.*)
- /* Add 4 bytes of extra space for a CRC-32 checksum */
+ /* Add 4 bytes of extra space for the obsolete CRC-32 checksum */
. = ALIGN(. + 4, 0x200);
_edata = . ;
}
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 0aae4d4ed615..f82de8de5dc6 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -22,10 +22,11 @@
# include "boot.h"
#endif
#include <linux/types.h>
+#include <asm/cpufeaturemasks.h>
#include <asm/intel-family.h>
#include <asm/processor-flags.h>
-#include <asm/required-features.h>
#include <asm/msr-index.h>
+
#include "string.h"
#include "msr.h"
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index d75237ba7ce9..916bac09b464 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -3,7 +3,6 @@
#include "bitops.h"
#include <asm/processor-flags.h>
-#include <asm/required-features.h>
#include <asm/msr-index.h>
#include "cpuflags.h"
@@ -29,40 +28,32 @@ static int has_fpu(void)
return fsw == 0 && (fcw & 0x103f) == 0x003f;
}
+#ifdef CONFIG_X86_32
/*
* For building the 16-bit code we want to explicitly specify 32-bit
* push/pop operations, rather than just saying 'pushf' or 'popf' and
- * letting the compiler choose. But this is also included from the
- * compressed/ directory where it may be 64-bit code, and thus needs
- * to be 'pushfq' or 'popfq' in that case.
+ * letting the compiler choose.
*/
-#ifdef __x86_64__
-#define PUSHF "pushfq"
-#define POPF "popfq"
-#else
-#define PUSHF "pushfl"
-#define POPF "popfl"
-#endif
-
-int has_eflag(unsigned long mask)
+bool has_eflag(unsigned long mask)
{
unsigned long f0, f1;
- asm volatile(PUSHF " \n\t"
- PUSHF " \n\t"
+ asm volatile("pushfl \n\t"
+ "pushfl \n\t"
"pop %0 \n\t"
"mov %0,%1 \n\t"
"xor %2,%1 \n\t"
"push %1 \n\t"
- POPF " \n\t"
- PUSHF " \n\t"
+ "popfl \n\t"
+ "pushfl \n\t"
"pop %1 \n\t"
- POPF
+ "popfl"
: "=&r" (f0), "=&r" (f1)
: "ri" (mask));
return !!((f0^f1) & mask);
}
+#endif
void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d)
{
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
index 475b8fde90f7..a398d9204ad0 100644
--- a/arch/x86/boot/cpuflags.h
+++ b/arch/x86/boot/cpuflags.h
@@ -15,8 +15,13 @@ struct cpu_features {
extern struct cpu_features cpu;
extern u32 cpu_vendor[3];
-int has_eflag(unsigned long mask);
+#ifdef CONFIG_X86_32
+bool has_eflag(unsigned long mask);
+#else
+static inline bool has_eflag(unsigned long mask) { return true; }
+#endif
void get_cpuflags(void);
void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d);
+bool has_cpuflag(int flag);
#endif
diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
index c9299aeb7333..3882ead513f7 100644
--- a/arch/x86/boot/genimage.sh
+++ b/arch/x86/boot/genimage.sh
@@ -22,6 +22,7 @@
# This script requires:
# bash
# syslinux
+# genisoimage
# mtools (for fdimage* and hdimage)
# edk2/OVMF (for hdimage)
#
@@ -251,7 +252,9 @@ geniso() {
cp "$isolinux" "$ldlinux" "$tmp_dir"
cp "$FBZIMAGE" "$tmp_dir"/linux
echo default linux "$KCMDLINE" > "$tmp_dir"/isolinux.cfg
- cp "${FDINITRDS[@]}" "$tmp_dir"/
+ if [ ${#FDINITRDS[@]} -gt 0 ]; then
+ cp "${FDINITRDS[@]}" "$tmp_dir"/
+ fi
genisoimage -J -r -appid 'LINUX_BOOT' -input-charset=utf-8 \
-quiet -o "$FIMAGE" -b isolinux.bin \
-c boot.cat -no-emul-boot -boot-load-size 4 \
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index da0ccc5de538..22d730b227e3 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -12,8 +12,6 @@
#include <stdio.h>
-#include "../include/asm/required-features.h"
-#include "../include/asm/disabled-features.h"
#include "../include/asm/cpufeatures.h"
#include "../include/asm/vmxfeatures.h"
#include "../kernel/cpu/capflags.c"
@@ -23,6 +21,7 @@ int main(void)
int i, j;
const char *str;
+ printf("#include <asm/cpufeaturemasks.h>\n\n");
printf("static const char x86_cap_strs[] =\n");
for (i = 0; i < NCAPINTS; i++) {
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index 3a2d1360abb0..e1d594a60204 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -45,6 +45,8 @@ SECTIONS
setup_size = ALIGN(ABSOLUTE(.), 4096);
setup_sects = ABSOLUTE(setup_size / 512);
+ ASSERT(setup_sects >= 5, "The setup must be at least 5 sectors in size");
+ ASSERT(setup_sects <= 64, "The setup must be at most 64 sectors in size");
}
. = ALIGN(16);
diff --git a/arch/x86/boot/tools/.gitignore b/arch/x86/boot/tools/.gitignore
deleted file mode 100644
index ae91f4d0d78b..000000000000
--- a/arch/x86/boot/tools/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-build
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
deleted file mode 100644
index 10311d77c67f..000000000000
--- a/arch/x86/boot/tools/build.c
+++ /dev/null
@@ -1,247 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 1997 Martin Mares
- * Copyright (C) 2007 H. Peter Anvin
- */
-
-/*
- * This file builds a disk-image from three different files:
- *
- * - setup: 8086 machine code, sets up system parm
- * - system: 80386 code for actual system
- * - zoffset.h: header with ZO_* defines
- *
- * It does some checking that all files are of the correct type, and writes
- * the result to the specified destination, removing headers and padding to
- * the right amount. It also writes some system data to stdout.
- */
-
-/*
- * Changes by tytso to allow root device specification
- * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
- * Cross compiling fixes by Gertjan van Wingerde, July 1996
- * Rewritten by Martin Mares, April 1997
- * Substantially overhauled by H. Peter Anvin, April 2007
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <tools/le_byteshift.h>
-
-typedef unsigned char u8;
-typedef unsigned short u16;
-typedef unsigned int u32;
-
-/* Minimal number of setup sectors */
-#define SETUP_SECT_MIN 5
-#define SETUP_SECT_MAX 64
-
-/* This must be large enough to hold the entire setup */
-u8 buf[SETUP_SECT_MAX*512];
-
-static unsigned long _edata;
-
-/*----------------------------------------------------------------------*/
-
-static const u32 crctab32[] = {
- 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419,
- 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4,
- 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07,
- 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
- 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856,
- 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
- 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4,
- 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
- 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3,
- 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a,
- 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599,
- 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
- 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190,
- 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
- 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e,
- 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
- 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed,
- 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
- 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3,
- 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
- 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a,
- 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
- 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010,
- 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
- 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17,
- 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6,
- 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615,
- 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
- 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344,
- 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
- 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a,
- 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
- 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1,
- 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c,
- 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef,
- 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
- 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe,
- 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
- 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c,
- 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
- 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b,
- 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
- 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1,
- 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
- 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278,
- 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
- 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66,
- 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
- 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605,
- 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8,
- 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b,
- 0x2d02ef8d
-};
-
-static u32 partial_crc32_one(u8 c, u32 crc)
-{
- return crctab32[(crc ^ c) & 0xff] ^ (crc >> 8);
-}
-
-static u32 partial_crc32(const u8 *s, int len, u32 crc)
-{
- while (len--)
- crc = partial_crc32_one(*s++, crc);
- return crc;
-}
-
-static void die(const char * str, ...)
-{
- va_list args;
- va_start(args, str);
- vfprintf(stderr, str, args);
- va_end(args);
- fputc('\n', stderr);
- exit(1);
-}
-
-static void usage(void)
-{
- die("Usage: build setup system zoffset.h image");
-}
-
-/*
- * Parse zoffset.h and find the entry points. We could just #include zoffset.h
- * but that would mean tools/build would have to be rebuilt every time. It's
- * not as if parsing it is hard...
- */
-#define PARSE_ZOFS(p, sym) do { \
- if (!strncmp(p, "#define ZO_" #sym " ", 11+sizeof(#sym))) \
- sym = strtoul(p + 11 + sizeof(#sym), NULL, 16); \
-} while (0)
-
-static void parse_zoffset(char *fname)
-{
- FILE *file;
- char *p;
- int c;
-
- file = fopen(fname, "r");
- if (!file)
- die("Unable to open `%s': %m", fname);
- c = fread(buf, 1, sizeof(buf) - 1, file);
- if (ferror(file))
- die("read-error on `zoffset.h'");
- fclose(file);
- buf[c] = 0;
-
- p = (char *)buf;
-
- while (p && *p) {
- PARSE_ZOFS(p, _edata);
-
- p = strchr(p, '\n');
- while (p && (*p == '\r' || *p == '\n'))
- p++;
- }
-}
-
-int main(int argc, char ** argv)
-{
- unsigned int i, sz, setup_sectors;
- int c;
- struct stat sb;
- FILE *file, *dest;
- int fd;
- void *kernel;
- u32 crc = 0xffffffffUL;
-
- if (argc != 5)
- usage();
- parse_zoffset(argv[3]);
-
- dest = fopen(argv[4], "w");
- if (!dest)
- die("Unable to write `%s': %m", argv[4]);
-
- /* Copy the setup code */
- file = fopen(argv[1], "r");
- if (!file)
- die("Unable to open `%s': %m", argv[1]);
- c = fread(buf, 1, sizeof(buf), file);
- if (ferror(file))
- die("read-error on `setup'");
- if (c < 1024)
- die("The setup must be at least 1024 bytes");
- if (get_unaligned_le16(&buf[510]) != 0xAA55)
- die("Boot block hasn't got boot flag (0xAA55)");
- fclose(file);
-
- /* Pad unused space with zeros */
- setup_sectors = (c + 4095) / 4096;
- setup_sectors *= 8;
- if (setup_sectors < SETUP_SECT_MIN)
- setup_sectors = SETUP_SECT_MIN;
- i = setup_sectors*512;
- memset(buf+c, 0, i-c);
-
- /* Open and stat the kernel file */
- fd = open(argv[2], O_RDONLY);
- if (fd < 0)
- die("Unable to open `%s': %m", argv[2]);
- if (fstat(fd, &sb))
- die("Unable to stat `%s': %m", argv[2]);
- if (_edata != sb.st_size)
- die("Unexpected file size `%s': %u != %u", argv[2], _edata,
- sb.st_size);
- sz = _edata - 4;
- kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
- if (kernel == MAP_FAILED)
- die("Unable to mmap '%s': %m", argv[2]);
-
- crc = partial_crc32(buf, i, crc);
- if (fwrite(buf, 1, i, dest) != i)
- die("Writing setup failed");
-
- /* Copy the kernel code */
- crc = partial_crc32(kernel, sz, crc);
- if (fwrite(kernel, 1, sz, dest) != sz)
- die("Writing kernel failed");
-
- /* Write the CRC */
- put_unaligned_le32(crc, buf);
- if (fwrite(buf, 1, 4, dest) != 4)
- die("Writing CRC failed");
-
- /* Catch any delayed write failures */
- if (fclose(dest))
- die("Writing image failed");
-
- close(fd);
-
- /* Everything is OK */
- return 0;
-}
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 96c7bc698e6b..b0c1a7a57497 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -9,8 +9,6 @@
#define pr_fmt(fmt) "SEV: " fmt
-#define DISABLE_BRANCH_PROFILING
-
#include <linux/sched/debug.h> /* For show_regs() */
#include <linux/percpu-defs.h>
#include <linux/cc_platform.h>
@@ -1482,8 +1480,7 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
case MSR_AMD64_GUEST_TSC_FREQ:
if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
return __vc_handle_secure_tsc_msrs(regs, write);
- else
- break;
+ break;
default:
break;
}
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 32809a06dab4..7772b01ab738 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -167,11 +167,11 @@ static void __noreturn tdx_panic(const char *msg)
/* Define register order according to the GHCI */
struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
- char str[64];
+ char bytes[64] __nonstring;
} message;
/* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
- strtomem_pad(message.str, msg, '\0');
+ strtomem_pad(message.bytes, msg, '\0');
args.r8 = message.r8;
args.r9 = message.r9;
diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config
index 581296255b39..d5d091e03bd3 100644
--- a/arch/x86/configs/xen.config
+++ b/arch/x86/configs/xen.config
@@ -1,6 +1,4 @@
# global x86 required specific stuff
-# On 32-bit HIGHMEM4G is not allowed
-CONFIG_HIGHMEM64G=y
CONFIG_64BIT=y
# These enable us to allow some of the
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index eb153eff9331..b37881bb9f15 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -17,6 +17,7 @@
*/
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/frame.h>
#define STATE1 %xmm0
@@ -1071,6 +1072,7 @@ SYM_FUNC_END(_aesni_inc)
* size_t len, u8 *iv)
*/
SYM_FUNC_START(aesni_ctr_enc)
+ ANNOTATE_NOENDBR
FRAME_BEGIN
cmp $16, LEN
jb .Lctr_enc_just_ret
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index 646477a13e11..1dfef28c1266 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -16,6 +16,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/frame.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
@@ -882,7 +883,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk16)
-SYM_FUNC_START(camellia_ecb_enc_16way)
+SYM_TYPED_FUNC_START(camellia_ecb_enc_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst (16 blocks)
@@ -907,7 +908,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way)
RET;
SYM_FUNC_END(camellia_ecb_enc_16way)
-SYM_FUNC_START(camellia_ecb_dec_16way)
+SYM_TYPED_FUNC_START(camellia_ecb_dec_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst (16 blocks)
@@ -937,7 +938,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way)
RET;
SYM_FUNC_END(camellia_ecb_dec_16way)
-SYM_FUNC_START(camellia_cbc_dec_16way)
+SYM_TYPED_FUNC_START(camellia_cbc_dec_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst (16 blocks)
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index a0eb94e53b1b..b1c9b9450555 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -6,6 +6,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/frame.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 816b6bb8bded..824cb94de6c2 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -6,6 +6,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
.file "camellia-x86_64-asm_64.S"
.text
@@ -177,7 +178,7 @@
bswapq RAB0; \
movq RAB0, 4*2(RIO);
-SYM_FUNC_START(__camellia_enc_blk)
+SYM_TYPED_FUNC_START(__camellia_enc_blk)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -224,7 +225,7 @@ SYM_FUNC_START(__camellia_enc_blk)
RET;
SYM_FUNC_END(__camellia_enc_blk)
-SYM_FUNC_START(camellia_dec_blk)
+SYM_TYPED_FUNC_START(camellia_dec_blk)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -411,7 +412,7 @@ SYM_FUNC_END(camellia_dec_blk)
bswapq RAB1; \
movq RAB1, 12*2(RIO);
-SYM_FUNC_START(__camellia_enc_blk_2way)
+SYM_TYPED_FUNC_START(__camellia_enc_blk_2way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -460,7 +461,7 @@ SYM_FUNC_START(__camellia_enc_blk_2way)
RET;
SYM_FUNC_END(__camellia_enc_blk_2way)
-SYM_FUNC_START(camellia_dec_blk_2way)
+SYM_TYPED_FUNC_START(camellia_dec_blk_2way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 97e283621851..84e47f7f6188 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -9,6 +9,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"
@@ -656,7 +657,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
RET;
SYM_FUNC_END(__serpent_dec_blk8_avx)
-SYM_FUNC_START(serpent_ecb_enc_8way_avx)
+SYM_TYPED_FUNC_START(serpent_ecb_enc_8way_avx)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -674,7 +675,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx)
RET;
SYM_FUNC_END(serpent_ecb_enc_8way_avx)
-SYM_FUNC_START(serpent_ecb_dec_8way_avx)
+SYM_TYPED_FUNC_START(serpent_ecb_dec_8way_avx)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -692,7 +693,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx)
RET;
SYM_FUNC_END(serpent_ecb_dec_8way_avx)
-SYM_FUNC_START(serpent_cbc_dec_8way_avx)
+SYM_TYPED_FUNC_START(serpent_cbc_dec_8way_avx)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
index d2288bf38a8a..071e90e7f0d8 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -6,6 +6,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
.file "twofish-x86_64-asm-3way.S"
.text
@@ -220,7 +221,7 @@
rorq $32, RAB2; \
outunpack3(mov, RIO, 2, RAB, 2);
-SYM_FUNC_START(__twofish_enc_blk_3way)
+SYM_TYPED_FUNC_START(__twofish_enc_blk_3way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -269,7 +270,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way)
RET;
SYM_FUNC_END(__twofish_enc_blk_3way)
-SYM_FUNC_START(twofish_dec_blk_3way)
+SYM_TYPED_FUNC_START(twofish_dec_blk_3way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index 775af290cd19..e08b4ba07b93 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -8,6 +8,7 @@
.text
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>
#define a_offset 0
@@ -202,7 +203,7 @@
xor %r8d, d ## D;\
ror $1, d ## D;
-SYM_FUNC_START(twofish_enc_blk)
+SYM_TYPED_FUNC_START(twofish_enc_blk)
pushq R1
/* %rdi contains the ctx address */
@@ -255,7 +256,7 @@ SYM_FUNC_START(twofish_enc_blk)
RET
SYM_FUNC_END(twofish_enc_blk)
-SYM_FUNC_START(twofish_dec_blk)
+SYM_TYPED_FUNC_START(twofish_dec_blk)
pushq R1
/* %rdi contains the ctx address */
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index ce1cc1622385..72cae8e0ce85 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -7,12 +7,13 @@ KASAN_SANITIZE := n
UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
-CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE)
-CFLAGS_common.o += -fno-stack-protector
+CFLAGS_syscall_32.o += -fno-stack-protector
+CFLAGS_syscall_64.o += -fno-stack-protector
obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
-obj-y += common.o
obj-y += vdso/
obj-y += vsyscall/
@@ -23,4 +24,3 @@ CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE)
obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o
obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
-obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index ea81770629ee..cb0911c5dc5d 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -431,6 +431,7 @@ For 32-bit we have the following conventions - kernel is built with
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
.macro THUNK name, func
SYM_FUNC_START(\name)
+ ANNOTATE_NOENDBR
pushq %rbp
movq %rsp, %rbp
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
deleted file mode 100644
index 14db5b85114c..000000000000
--- a/arch/x86/entry/common.c
+++ /dev/null
@@ -1,524 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * common.c - C code for kernel entry and exit
- * Copyright (c) 2015 Andrew Lutomirski
- *
- * Based on asm and ptrace code by many authors. The code here originated
- * in ptrace.c and signal.c.
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/entry-common.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/errno.h>
-#include <linux/ptrace.h>
-#include <linux/export.h>
-#include <linux/nospec.h>
-#include <linux/syscalls.h>
-#include <linux/uaccess.h>
-#include <linux/init.h>
-
-#ifdef CONFIG_XEN_PV
-#include <xen/xen-ops.h>
-#include <xen/events.h>
-#endif
-
-#include <asm/apic.h>
-#include <asm/desc.h>
-#include <asm/traps.h>
-#include <asm/vdso.h>
-#include <asm/cpufeature.h>
-#include <asm/fpu/api.h>
-#include <asm/nospec-branch.h>
-#include <asm/io_bitmap.h>
-#include <asm/syscall.h>
-#include <asm/irq_stack.h>
-
-#ifdef CONFIG_X86_64
-
-static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
-{
- /*
- * Convert negative numbers to very high and thus out of range
- * numbers for comparisons.
- */
- unsigned int unr = nr;
-
- if (likely(unr < NR_syscalls)) {
- unr = array_index_nospec(unr, NR_syscalls);
- regs->ax = x64_sys_call(regs, unr);
- return true;
- }
- return false;
-}
-
-static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
-{
- /*
- * Adjust the starting offset of the table, and convert numbers
- * < __X32_SYSCALL_BIT to very high and thus out of range
- * numbers for comparisons.
- */
- unsigned int xnr = nr - __X32_SYSCALL_BIT;
-
- if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
- xnr = array_index_nospec(xnr, X32_NR_syscalls);
- regs->ax = x32_sys_call(regs, xnr);
- return true;
- }
- return false;
-}
-
-/* Returns true to return using SYSRET, or false to use IRET */
-__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
-{
- add_random_kstack_offset();
- nr = syscall_enter_from_user_mode(regs, nr);
-
- instrumentation_begin();
-
- if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
- /* Invalid system call, but still a system call. */
- regs->ax = __x64_sys_ni_syscall(regs);
- }
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-
- /*
- * Check that the register state is valid for using SYSRET to exit
- * to userspace. Otherwise use the slower but fully capable IRET
- * exit path.
- */
-
- /* XEN PV guests always use the IRET path */
- if (cpu_feature_enabled(X86_FEATURE_XENPV))
- return false;
-
- /* SYSRET requires RCX == RIP and R11 == EFLAGS */
- if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
- return false;
-
- /* CS and SS must match the values set in MSR_STAR */
- if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
- return false;
-
- /*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP.
- *
- * TASK_SIZE_MAX covers all user-accessible addresses other than
- * the deprecated vsyscall page.
- */
- if (unlikely(regs->ip >= TASK_SIZE_MAX))
- return false;
-
- /*
- * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET.
- */
- if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
- return false;
-
- /* Use SYSRET to exit to userspace */
- return true;
-}
-#endif
-
-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
-static __always_inline int syscall_32_enter(struct pt_regs *regs)
-{
- if (IS_ENABLED(CONFIG_IA32_EMULATION))
- current_thread_info()->status |= TS_COMPAT;
-
- return (int)regs->orig_ax;
-}
-
-#ifdef CONFIG_IA32_EMULATION
-bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
-
-static int ia32_emulation_override_cmdline(char *arg)
-{
- return kstrtobool(arg, &__ia32_enabled);
-}
-early_param("ia32_emulation", ia32_emulation_override_cmdline);
-#endif
-
-/*
- * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL.
- */
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
-{
- /*
- * Convert negative numbers to very high and thus out of range
- * numbers for comparisons.
- */
- unsigned int unr = nr;
-
- if (likely(unr < IA32_NR_syscalls)) {
- unr = array_index_nospec(unr, IA32_NR_syscalls);
- regs->ax = ia32_sys_call(regs, unr);
- } else if (nr != -1) {
- regs->ax = __ia32_sys_ni_syscall(regs);
- }
-}
-
-#ifdef CONFIG_IA32_EMULATION
-static __always_inline bool int80_is_external(void)
-{
- const unsigned int offs = (0x80 / 32) * 0x10;
- const u32 bit = BIT(0x80 % 32);
-
- /* The local APIC on XENPV guests is fake */
- if (cpu_feature_enabled(X86_FEATURE_XENPV))
- return false;
-
- /*
- * If vector 0x80 is set in the APIC ISR then this is an external
- * interrupt. Either from broken hardware or injected by a VMM.
- *
- * Note: In guest mode this is only valid for secure guests where
- * the secure module fully controls the vAPIC exposed to the guest.
- */
- return apic_read(APIC_ISR + offs) & bit;
-}
-
-/**
- * do_int80_emulation - 32-bit legacy syscall C entry from asm
- * @regs: syscall arguments in struct pt_args on the stack.
- *
- * This entry point can be used by 32-bit and 64-bit programs to perform
- * 32-bit system calls. Instances of INT $0x80 can be found inline in
- * various programs and libraries. It is also used by the vDSO's
- * __kernel_vsyscall fallback for hardware that doesn't support a faster
- * entry method. Restarted 32-bit system calls also fall back to INT
- * $0x80 regardless of what instruction was originally used to do the
- * system call.
- *
- * This is considered a slow path. It is not used by most libc
- * implementations on modern hardware except during process startup.
- *
- * The arguments for the INT $0x80 based syscall are on stack in the
- * pt_regs structure:
- * eax: system call number
- * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
- */
-__visible noinstr void do_int80_emulation(struct pt_regs *regs)
-{
- int nr;
-
- /* Kernel does not use INT $0x80! */
- if (unlikely(!user_mode(regs))) {
- irqentry_enter(regs);
- instrumentation_begin();
- panic("Unexpected external interrupt 0x80\n");
- }
-
- /*
- * Establish kernel context for instrumentation, including for
- * int80_is_external() below which calls into the APIC driver.
- * Identical for soft and external interrupts.
- */
- enter_from_user_mode(regs);
-
- instrumentation_begin();
- add_random_kstack_offset();
-
- /* Validate that this is a soft interrupt to the extent possible */
- if (unlikely(int80_is_external()))
- panic("Unexpected external interrupt 0x80\n");
-
- /*
- * The low level idtentry code pushed -1 into regs::orig_ax
- * and regs::ax contains the syscall number.
- *
- * User tracing code (ptrace or signal handlers) might assume
- * that the regs::orig_ax contains a 32-bit number on invoking
- * a 32-bit syscall.
- *
- * Establish the syscall convention by saving the 32bit truncated
- * syscall number in regs::orig_ax and by invalidating regs::ax.
- */
- regs->orig_ax = regs->ax & GENMASK(31, 0);
- regs->ax = -ENOSYS;
-
- nr = syscall_32_enter(regs);
-
- local_irq_enable();
- nr = syscall_enter_from_user_mode_work(regs, nr);
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-}
-
-#ifdef CONFIG_X86_FRED
-/*
- * A FRED-specific INT80 handler is warranted for the follwing reasons:
- *
- * 1) As INT instructions and hardware interrupts are separate event
- * types, FRED does not preclude the use of vector 0x80 for external
- * interrupts. As a result, the FRED setup code does not reserve
- * vector 0x80 and calling int80_is_external() is not merely
- * suboptimal but actively incorrect: it could cause a system call
- * to be incorrectly ignored.
- *
- * 2) It is called only for handling vector 0x80 of event type
- * EVENT_TYPE_SWINT and will never be called to handle any external
- * interrupt (event type EVENT_TYPE_EXTINT).
- *
- * 3) FRED has separate entry flows depending on if the event came from
- * user space or kernel space, and because the kernel does not use
- * INT insns, the FRED kernel entry handler fred_entry_from_kernel()
- * falls through to fred_bad_type() if the event type is
- * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
- * an INT insn, it can only be from a user level.
- *
- * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will
- * likely take a different approach if it is ever needed: it
- * probably belongs in either fred_intx()/ fred_other() or
- * asm_fred_entrypoint_user(), depending on if this ought to be done
- * for all entries from userspace or only system
- * calls.
- *
- * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
- */
-DEFINE_FREDENTRY_RAW(int80_emulation)
-{
- int nr;
-
- enter_from_user_mode(regs);
-
- instrumentation_begin();
- add_random_kstack_offset();
-
- /*
- * FRED pushed 0 into regs::orig_ax and regs::ax contains the
- * syscall number.
- *
- * User tracing code (ptrace or signal handlers) might assume
- * that the regs::orig_ax contains a 32-bit number on invoking
- * a 32-bit syscall.
- *
- * Establish the syscall convention by saving the 32bit truncated
- * syscall number in regs::orig_ax and by invalidating regs::ax.
- */
- regs->orig_ax = regs->ax & GENMASK(31, 0);
- regs->ax = -ENOSYS;
-
- nr = syscall_32_enter(regs);
-
- local_irq_enable();
- nr = syscall_enter_from_user_mode_work(regs, nr);
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-}
-#endif
-#else /* CONFIG_IA32_EMULATION */
-
-/* Handles int $0x80 on a 32bit kernel */
-__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
-{
- int nr = syscall_32_enter(regs);
-
- add_random_kstack_offset();
- /*
- * Subtlety here: if ptrace pokes something larger than 2^31-1 into
- * orig_ax, the int return value truncates it. This matches
- * the semantics of syscall_get_nr().
- */
- nr = syscall_enter_from_user_mode(regs, nr);
- instrumentation_begin();
-
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-}
-#endif /* !CONFIG_IA32_EMULATION */
-
-static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
-{
- int nr = syscall_32_enter(regs);
- int res;
-
- add_random_kstack_offset();
- /*
- * This cannot use syscall_enter_from_user_mode() as it has to
- * fetch EBP before invoking any of the syscall entry work
- * functions.
- */
- syscall_enter_from_user_mode_prepare(regs);
-
- instrumentation_begin();
- /* Fetch EBP from where the vDSO stashed it. */
- if (IS_ENABLED(CONFIG_X86_64)) {
- /*
- * Micro-optimization: the pointer we're following is
- * explicitly 32 bits, so it can't be out of range.
- */
- res = __get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp);
- } else {
- res = get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp);
- }
-
- if (res) {
- /* User code screwed up. */
- regs->ax = -EFAULT;
-
- local_irq_disable();
- instrumentation_end();
- irqentry_exit_to_user_mode(regs);
- return false;
- }
-
- nr = syscall_enter_from_user_mode_work(regs, nr);
-
- /* Now this is just like a normal syscall. */
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
- return true;
-}
-
-/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
-__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
-{
- /*
- * Called using the internal vDSO SYSENTER/SYSCALL32 calling
- * convention. Adjust regs so it looks like we entered using int80.
- */
- unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
- vdso_image_32.sym_int80_landing_pad;
-
- /*
- * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
- * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
- * Fix it up.
- */
- regs->ip = landing_pad;
-
- /* Invoke the syscall. If it failed, keep it simple: use IRET. */
- if (!__do_fast_syscall_32(regs))
- return false;
-
- /*
- * Check that the register state is valid for using SYSRETL/SYSEXIT
- * to exit to userspace. Otherwise use the slower but fully capable
- * IRET exit path.
- */
-
- /* XEN PV guests always use the IRET path */
- if (cpu_feature_enabled(X86_FEATURE_XENPV))
- return false;
-
- /* EIP must point to the VDSO landing pad */
- if (unlikely(regs->ip != landing_pad))
- return false;
-
- /* CS and SS must match the values set in MSR_STAR */
- if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
- return false;
-
- /* If the TF, RF, or VM flags are set, use IRET */
- if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
- return false;
-
- /* Use SYSRETL/SYSEXIT to exit to userspace */
- return true;
-}
-
-/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
-__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
-{
- /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
- regs->sp = regs->bp;
-
- /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
- regs->flags |= X86_EFLAGS_IF;
-
- return do_fast_syscall_32(regs);
-}
-#endif
-
-SYSCALL_DEFINE0(ni_syscall)
-{
- return -ENOSYS;
-}
-
-#ifdef CONFIG_XEN_PV
-#ifndef CONFIG_PREEMPTION
-/*
- * Some hypercalls issued by the toolstack can take many 10s of
- * seconds. Allow tasks running hypercalls via the privcmd driver to
- * be voluntarily preempted even if full kernel preemption is
- * disabled.
- *
- * Such preemptible hypercalls are bracketed by
- * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
- * calls.
- */
-DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
-EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
-
-/*
- * In case of scheduling the flag must be cleared and restored after
- * returning from schedule as the task might move to a different CPU.
- */
-static __always_inline bool get_and_clear_inhcall(void)
-{
- bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
-
- __this_cpu_write(xen_in_preemptible_hcall, false);
- return inhcall;
-}
-
-static __always_inline void restore_inhcall(bool inhcall)
-{
- __this_cpu_write(xen_in_preemptible_hcall, inhcall);
-}
-#else
-static __always_inline bool get_and_clear_inhcall(void) { return false; }
-static __always_inline void restore_inhcall(bool inhcall) { }
-#endif
-
-static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- inc_irq_stat(irq_hv_callback_count);
-
- xen_evtchn_do_upcall();
-
- set_irq_regs(old_regs);
-}
-
-__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
-{
- irqentry_state_t state = irqentry_enter(regs);
- bool inhcall;
-
- instrumentation_begin();
- run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
-
- inhcall = get_and_clear_inhcall();
- if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
- irqentry_exit_cond_resched();
- instrumentation_end();
- restore_inhcall(inhcall);
- } else {
- instrumentation_end();
- irqentry_exit(regs, state);
- }
-}
-#endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index b7ea3e8e9ecc..d3caa31240ed 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -5,6 +5,7 @@
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/msr-index.h>
#include <asm/unwind_hints.h>
#include <asm/segment.h>
@@ -17,6 +18,7 @@
.pushsection .noinstr.text, "ax"
SYM_FUNC_START(entry_ibpb)
+ ANNOTATE_NOENDBR
movl $MSR_IA32_PRED_CMD, %ecx
movl $PRED_CMD_IBPB, %eax
xorl %edx, %edx
@@ -52,7 +54,6 @@ EXPORT_SYMBOL_GPL(mds_verw_sel);
THUNK warn_thunk_thunk, __warn_thunk
-#ifndef CONFIG_X86_64
/*
* Clang's implementation of TLS stack cookies requires the variable in
* question to be a TLS variable. If the variable happens to be defined as an
@@ -63,7 +64,6 @@ THUNK warn_thunk_thunk, __warn_thunk
* entirely in the C code, and use an alias emitted by the linker script
* instead.
*/
-#ifdef CONFIG_STACKPROTECTOR
+#if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP)
EXPORT_SYMBOL(__ref_stack_chk_guard);
#endif
-#endif
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 20be5758c2d2..92c0b4a94e0a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1153,7 +1153,7 @@ SYM_CODE_START(asm_exc_nmi)
* is using the thread stack right now, so it's safe for us to use it.
*/
movl %esp, %ebx
- movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esp
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
call exc_nmi
movl %ebx, %esp
@@ -1217,7 +1217,7 @@ SYM_CODE_START(rewind_stack_and_make_dead)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
- movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esi
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esi
leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
call make_task_dead
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f52dbe0ad93c..f40bdf97d390 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -92,7 +92,7 @@ SYM_CODE_START(entry_SYSCALL_64)
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
@@ -175,6 +175,7 @@ SYM_CODE_END(entry_SYSCALL_64)
*/
.pushsection .text, "ax"
SYM_FUNC_START(__switch_to_asm)
+ ANNOTATE_NOENDBR
/*
* Save callee-saved registers
* This must match the order in inactive_task_frame
@@ -192,7 +193,7 @@ SYM_FUNC_START(__switch_to_asm)
#ifdef CONFIG_STACKPROTECTOR
movq TASK_stack_canary(%rsi), %rbx
- movq %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary)
+ movq %rbx, PER_CPU_VAR(__stack_chk_guard)
#endif
/*
@@ -742,6 +743,7 @@ _ASM_NOKPROBE(common_interrupt_return)
* Is in entry.text as it shouldn't be instrumented.
*/
SYM_FUNC_START(asm_load_gs_index)
+ ANNOTATE_NOENDBR
FRAME_BEGIN
swapgs
.Lgs_change:
@@ -1166,7 +1168,7 @@ SYM_CODE_START(asm_exc_nmi)
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
pushq 5*8(%rdx) /* pt_regs->ss */
pushq 4*8(%rdx) /* pt_regs->rsp */
@@ -1484,7 +1486,7 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
leaq -PTREGS_SIZE(%rax), %rsp
UNWIND_HINT_REGS
@@ -1526,6 +1528,7 @@ SYM_CODE_END(rewind_stack_and_make_dead)
* refactored in the future if needed.
*/
SYM_FUNC_START(clear_bhb_loop)
+ ANNOTATE_NOENDBR
push %rbp
mov %rsp, %rbp
movl $5, %ecx
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index ed0a5f2dc129..a45e1125fc6c 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -57,7 +57,7 @@ SYM_CODE_START(entry_SYSENTER_compat)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
popq %rax
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
@@ -193,7 +193,7 @@ SYM_CODE_START(entry_SYSCALL_compat)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
/* Switch to the kernel stack */
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index a02bc6f3d2e6..29c5c32c16c3 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -58,6 +58,7 @@ SYM_CODE_END(asm_fred_entrypoint_kernel)
#if IS_ENABLED(CONFIG_KVM_INTEL)
SYM_FUNC_START(asm_fred_entry_from_kvm)
+ ANNOTATE_NOENDBR
push %rbp
mov %rsp, %rbp
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 8cc9950d7104..2b15ea17bb7c 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -1,10 +1,16 @@
-// SPDX-License-Identifier: GPL-2.0
-/* System call table for i386. */
+// SPDX-License-Identifier: GPL-2.0-only
+/* 32-bit system call dispatch */
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
+#include <linux/entry-common.h>
+#include <linux/nospec.h>
+#include <linux/uaccess.h>
+#include <asm/apic.h>
+#include <asm/traps.h>
+#include <asm/cpufeature.h>
#include <asm/syscall.h>
#ifdef CONFIG_IA32_EMULATION
@@ -41,4 +47,324 @@ long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
#include <asm/syscalls_32.h>
default: return __ia32_sys_ni_syscall(regs);
}
-};
+}
+
+static __always_inline int syscall_32_enter(struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_IA32_EMULATION))
+ current_thread_info()->status |= TS_COMPAT;
+
+ return (int)regs->orig_ax;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
+
+static int __init ia32_emulation_override_cmdline(char *arg)
+{
+ return kstrtobool(arg, &__ia32_enabled);
+}
+early_param("ia32_emulation", ia32_emulation_override_cmdline);
+#endif
+
+/*
+ * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL.
+ */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < IA32_NR_syscalls)) {
+ unr = array_index_nospec(unr, IA32_NR_syscalls);
+ regs->ax = ia32_sys_call(regs, unr);
+ } else if (nr != -1) {
+ regs->ax = __ia32_sys_ni_syscall(regs);
+ }
+}
+
+#ifdef CONFIG_IA32_EMULATION
+static __always_inline bool int80_is_external(void)
+{
+ const unsigned int offs = (0x80 / 32) * 0x10;
+ const u32 bit = BIT(0x80 % 32);
+
+ /* The local APIC on XENPV guests is fake */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /*
+ * If vector 0x80 is set in the APIC ISR then this is an external
+ * interrupt. Either from broken hardware or injected by a VMM.
+ *
+ * Note: In guest mode this is only valid for secure guests where
+ * the secure module fully controls the vAPIC exposed to the guest.
+ */
+ return apic_read(APIC_ISR + offs) & bit;
+}
+
+/**
+ * do_int80_emulation - 32-bit legacy syscall C entry from asm
+ * @regs: syscall arguments in struct pt_args on the stack.
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls. Instances of INT $0x80 can be found inline in
+ * various programs and libraries. It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method. Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * The arguments for the INT $0x80 based syscall are on stack in the
+ * pt_regs structure:
+ * eax: system call number
+ * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
+ */
+__visible noinstr void do_int80_emulation(struct pt_regs *regs)
+{
+ int nr;
+
+ /* Kernel does not use INT $0x80! */
+ if (unlikely(!user_mode(regs))) {
+ irqentry_enter(regs);
+ instrumentation_begin();
+ panic("Unexpected external interrupt 0x80\n");
+ }
+
+ /*
+ * Establish kernel context for instrumentation, including for
+ * int80_is_external() below which calls into the APIC driver.
+ * Identical for soft and external interrupts.
+ */
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /* Validate that this is a soft interrupt to the extent possible */
+ if (unlikely(int80_is_external()))
+ panic("Unexpected external interrupt 0x80\n");
+
+ /*
+ * The low level idtentry code pushed -1 into regs::orig_ax
+ * and regs::ax contains the syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+
+#ifdef CONFIG_X86_FRED
+/*
+ * A FRED-specific INT80 handler is warranted for the follwing reasons:
+ *
+ * 1) As INT instructions and hardware interrupts are separate event
+ * types, FRED does not preclude the use of vector 0x80 for external
+ * interrupts. As a result, the FRED setup code does not reserve
+ * vector 0x80 and calling int80_is_external() is not merely
+ * suboptimal but actively incorrect: it could cause a system call
+ * to be incorrectly ignored.
+ *
+ * 2) It is called only for handling vector 0x80 of event type
+ * EVENT_TYPE_SWINT and will never be called to handle any external
+ * interrupt (event type EVENT_TYPE_EXTINT).
+ *
+ * 3) FRED has separate entry flows depending on if the event came from
+ * user space or kernel space, and because the kernel does not use
+ * INT insns, the FRED kernel entry handler fred_entry_from_kernel()
+ * falls through to fred_bad_type() if the event type is
+ * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
+ * an INT insn, it can only be from a user level.
+ *
+ * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will
+ * likely take a different approach if it is ever needed: it
+ * probably belongs in either fred_intx()/ fred_other() or
+ * asm_fred_entrypoint_user(), depending on if this ought to be done
+ * for all entries from userspace or only system
+ * calls.
+ *
+ * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
+ */
+DEFINE_FREDENTRY_RAW(int80_emulation)
+{
+ int nr;
+
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /*
+ * FRED pushed 0 into regs::orig_ax and regs::ax contains the
+ * syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif /* CONFIG_X86_FRED */
+
+#else /* CONFIG_IA32_EMULATION */
+
+/* Handles int $0x80 on a 32bit kernel */
+__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
+{
+ int nr = syscall_32_enter(regs);
+
+ add_random_kstack_offset();
+ /*
+ * Subtlety here: if ptrace pokes something larger than 2^31-1 into
+ * orig_ax, the int return value truncates it. This matches
+ * the semantics of syscall_get_nr().
+ */
+ nr = syscall_enter_from_user_mode(regs, nr);
+ instrumentation_begin();
+
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif /* !CONFIG_IA32_EMULATION */
+
+static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
+{
+ int nr = syscall_32_enter(regs);
+ int res;
+
+ add_random_kstack_offset();
+ /*
+ * This cannot use syscall_enter_from_user_mode() as it has to
+ * fetch EBP before invoking any of the syscall entry work
+ * functions.
+ */
+ syscall_enter_from_user_mode_prepare(regs);
+
+ instrumentation_begin();
+ /* Fetch EBP from where the vDSO stashed it. */
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /*
+ * Micro-optimization: the pointer we're following is
+ * explicitly 32 bits, so it can't be out of range.
+ */
+ res = __get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ } else {
+ res = get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ }
+
+ if (res) {
+ /* User code screwed up. */
+ regs->ax = -EFAULT;
+
+ local_irq_disable();
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+ return false;
+ }
+
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+
+ /* Now this is just like a normal syscall. */
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+ return true;
+}
+
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
+{
+ /*
+ * Called using the internal vDSO SYSENTER/SYSCALL32 calling
+ * convention. Adjust regs so it looks like we entered using int80.
+ */
+ unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
+ vdso_image_32.sym_int80_landing_pad;
+
+ /*
+ * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
+ * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
+ * Fix it up.
+ */
+ regs->ip = landing_pad;
+
+ /* Invoke the syscall. If it failed, keep it simple: use IRET. */
+ if (!__do_fast_syscall_32(regs))
+ return false;
+
+ /*
+ * Check that the register state is valid for using SYSRETL/SYSEXIT
+ * to exit to userspace. Otherwise use the slower but fully capable
+ * IRET exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* EIP must point to the VDSO landing pad */
+ if (unlikely(regs->ip != landing_pad))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+ return false;
+
+ /* If the TF, RF, or VM flags are set, use IRET */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+ return false;
+
+ /* Use SYSRETL/SYSEXIT to exit to userspace */
+ return true;
+}
+
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
+{
+ /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
+ regs->sp = regs->bp;
+
+ /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
+ regs->flags |= X86_EFLAGS_IF;
+
+ return do_fast_syscall_32(regs);
+}
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index ba8354424860..b6e68ea98b83 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -1,15 +1,20 @@
-// SPDX-License-Identifier: GPL-2.0
-/* System call table for x86-64. */
+// SPDX-License-Identifier: GPL-2.0-only
+/* 64-bit system call dispatch */
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
+#include <linux/entry-common.h>
+#include <linux/nospec.h>
#include <asm/syscall.h>
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
+#ifdef CONFIG_X86_X32_ABI
+#include <asm/syscalls_x32.h>
+#endif
#undef __SYSCALL
#undef __SYSCALL_NORETURN
@@ -33,4 +38,104 @@ long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
#include <asm/syscalls_64.h>
default: return __x64_sys_ni_syscall(regs);
}
-};
+}
+
+#ifdef CONFIG_X86_X32_ABI
+long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_x32.h>
+ default: return __x64_sys_ni_syscall(regs);
+ }
+}
+#endif
+
+static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < NR_syscalls)) {
+ unr = array_index_nospec(unr, NR_syscalls);
+ regs->ax = x64_sys_call(regs, unr);
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
+{
+ /*
+ * Adjust the starting offset of the table, and convert numbers
+ * < __X32_SYSCALL_BIT to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int xnr = nr - __X32_SYSCALL_BIT;
+
+ if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
+ xnr = array_index_nospec(xnr, X32_NR_syscalls);
+ regs->ax = x32_sys_call(regs, xnr);
+ return true;
+ }
+ return false;
+}
+
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
+{
+ add_random_kstack_offset();
+ nr = syscall_enter_from_user_mode(regs, nr);
+
+ instrumentation_begin();
+
+ if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
+ /* Invalid system call, but still a system call. */
+ regs->ax = __x64_sys_ni_syscall(regs);
+ }
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+
+ /*
+ * Check that the register state is valid for using SYSRET to exit
+ * to userspace. Otherwise use the slower but fully capable IRET
+ * exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* SYSRET requires RCX == RIP and R11 == EFLAGS */
+ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+ return false;
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * TASK_SIZE_MAX covers all user-accessible addresses other than
+ * the deprecated vsyscall page.
+ */
+ if (unlikely(regs->ip >= TASK_SIZE_MAX))
+ return false;
+
+ /*
+ * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET.
+ */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+ return false;
+
+ /* Use SYSRET to exit to userspace */
+ return true;
+}
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
deleted file mode 100644
index fb77908f44f3..000000000000
--- a/arch/x86/entry/syscall_x32.c
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* System call table for x32 ABI. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <linux/syscalls.h>
-#include <asm/syscall.h>
-
-#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
-#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
-#include <asm/syscalls_x32.h>
-#undef __SYSCALL
-
-#undef __SYSCALL_NORETURN
-#define __SYSCALL_NORETURN __SYSCALL
-
-#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
-{
- switch (nr) {
- #include <asm/syscalls_x32.h>
- default: return __x64_sys_ni_syscall(regs);
- }
-};
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 4d0fb2fba7e2..ac007ea00979 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -396,7 +396,7 @@
381 i386 pkey_alloc sys_pkey_alloc
382 i386 pkey_free sys_pkey_free
383 i386 statx sys_statx
-384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
+384 i386 arch_prctl sys_arch_prctl
385 i386 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents
386 i386 rseq sys_rseq
393 i386 semget sys_semget
@@ -472,3 +472,4 @@
464 i386 getxattrat sys_getxattrat
465 i386 listxattrat sys_listxattrat
466 i386 removexattrat sys_removexattrat
+467 i386 open_tree_attr sys_open_tree_attr
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5eb708bff1c7..cfb5ca41e30d 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -390,6 +390,7 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index c9216ac4fb1e..54d3e9774d62 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -4,7 +4,7 @@
#
# Include the generic Makefile to check the built vDSO:
-include $(srctree)/lib/vdso/Makefile
+include $(srctree)/lib/vdso/Makefile.include
# Files to link into the vDSO:
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o
@@ -32,7 +32,7 @@ targets += $(foreach x, 64 x32 32, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg
CPPFLAGS_vdso.lds += -P -C
-VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \
+VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 \
-z max-page-size=4096
$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
@@ -133,6 +133,7 @@ KBUILD_CFLAGS_32 += -fno-stack-protector
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
+KBUILD_CFLAGS_32 += -DBUILD_VDSO
ifdef CONFIG_MITIGATION_RETPOLINE
ifneq ($(RETPOLINE_VDSO_CFLAGS),)
@@ -151,10 +152,9 @@ $(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE
quiet_cmd_vdso = VDSO $@
cmd_vdso = $(LD) -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
- -T $(filter %.lds,$^) $(filter %.o,$^) && \
- sh $(src)/checkundef.sh '$(NM)' '$@'
+ -T $(filter %.lds,$^) $(filter %.o,$^)
-VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 \
+VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 --no-undefined \
$(call ld-option, --eh-frame-hdr) -Bsymbolic -z noexecstack
quiet_cmd_vdso_and_check = VDSO $@
diff --git a/arch/x86/entry/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh
deleted file mode 100755
index 7ee90a9b549d..000000000000
--- a/arch/x86/entry/vdso/checkundef.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/sh
-nm="$1"
-file="$2"
-$nm "$file" | grep '^ *U' > /dev/null 2>&1
-if [ $? -eq 1 ]; then
- exit 0
-else
- echo "$file: undefined symbols found" >&2
- exit 1
-fi
diff --git a/arch/x86/entry/vdso/extable.h b/arch/x86/entry/vdso/extable.h
index b56f6b012941..baba612b832c 100644
--- a/arch/x86/entry/vdso/extable.h
+++ b/arch/x86/entry/vdso/extable.h
@@ -7,7 +7,7 @@
* vDSO uses a dedicated handler the addresses are relative to the overall
* exception table, not each individual entry.
*/
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \
ASM_VDSO_EXTABLE_HANDLE from to
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index 872947c1004c..ec1ac191a057 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/vdso.h>
#include <asm/vdso/vsyscall.h>
+#include <vdso/datapage.h>
/*
* Linker script for vDSO. This is an ELF shared object prelinked to
@@ -17,14 +18,9 @@ SECTIONS
* segment.
*/
- vvar_start = . - __VVAR_PAGES * PAGE_SIZE;
- vvar_page = vvar_start;
+ VDSO_VVAR_SYMS
- vdso_rng_data = vvar_page + __VDSO_RND_DATA_OFFSET;
-
- timens_page = vvar_start + PAGE_SIZE;
-
- vclock_pages = vvar_start + VDSO_NR_VCLOCK_PAGES * PAGE_SIZE;
+ vclock_pages = VDSO_VCLOCK_PAGES_START(vdso_u_data);
pvclock_page = vclock_pages + VDSO_PAGE_PVCLOCK_OFFSET * PAGE_SIZE;
hvclock_page = vclock_pages + VDSO_PAGE_HVCLOCK_OFFSET * PAGE_SIZE;
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 90d15f2a7205..f84e8f8fa5fe 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -69,33 +69,12 @@
const char *outfilename;
-/* Symbols that we need in vdso2c. */
-enum {
- sym_vvar_start,
- sym_vvar_page,
- sym_pvclock_page,
- sym_hvclock_page,
- sym_timens_page,
-};
-
-const int special_pages[] = {
- sym_vvar_page,
- sym_pvclock_page,
- sym_hvclock_page,
- sym_timens_page,
-};
-
struct vdso_sym {
const char *name;
bool export;
};
struct vdso_sym required_syms[] = {
- [sym_vvar_start] = {"vvar_start", true},
- [sym_vvar_page] = {"vvar_page", true},
- [sym_pvclock_page] = {"pvclock_page", true},
- [sym_hvclock_page] = {"hvclock_page", true},
- [sym_timens_page] = {"timens_page", true},
{"VDSO32_NOTE_MASK", true},
{"__kernel_vsyscall", true},
{"__kernel_sigreturn", true},
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 67b3e37576a6..78ed1c1f28b9 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -150,26 +150,6 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
}
}
- /* Validate mapping addresses. */
- for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
- INT_BITS symval = syms[special_pages[i]];
-
- if (!symval)
- continue; /* The mapping isn't used; ignore it. */
-
- if (symval % 4096)
- fail("%s must be a multiple of 4096\n",
- required_syms[i].name);
- if (symval + 4096 < syms[sym_vvar_start])
- fail("%s underruns vvar_start\n",
- required_syms[i].name);
- if (symval + 4096 > 0)
- fail("%s is on the wrong side of the vdso text\n",
- required_syms[i].name);
- }
- if (syms[sym_vvar_start] % 4096)
- fail("vvar_begin must be a multiple of 4096\n");
-
if (!image_name) {
fwrite(stripped_addr, stripped_len, 1, outfile);
return;
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 39e6efc1a9ca..9518bf1ddf35 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -14,7 +14,7 @@
#include <linux/elf.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
-#include <linux/time_namespace.h>
+#include <linux/vdso_datastore.h>
#include <asm/pvclock.h>
#include <asm/vgtod.h>
@@ -27,13 +27,7 @@
#include <asm/vdso/vsyscall.h>
#include <clocksource/hyperv_timer.h>
-struct vdso_data *arch_get_vdso_data(void *vvar_page)
-{
- return (struct vdso_data *)vvar_page;
-}
-
-static union vdso_data_store vdso_data_store __page_aligned_data;
-struct vdso_data *vdso_data = vdso_data_store.data;
+static_assert(VDSO_NR_PAGES + VDSO_NR_VCLOCK_PAGES == __VDSO_PAGES);
unsigned int vclocks_used __read_mostly;
@@ -48,13 +42,11 @@ int __init init_vdso_image(const struct vdso_image *image)
apply_alternatives((struct alt_instr *)(image->data + image->alt),
(struct alt_instr *)(image->data + image->alt +
- image->alt_len),
- NULL);
+ image->alt_len));
return 0;
}
-static const struct vm_special_mapping vvar_mapping;
struct linux_binprm;
static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
@@ -98,99 +90,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
return 0;
}
-#ifdef CONFIG_TIME_NS
-/*
- * The vvar page layout depends on whether a task belongs to the root or
- * non-root time namespace. Whenever a task changes its namespace, the VVAR
- * page tables are cleared and then they will re-faulted with a
- * corresponding layout.
- * See also the comment near timens_setup_vdso_data() for details.
- */
-int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
-{
- struct mm_struct *mm = task->mm;
- struct vm_area_struct *vma;
- VMA_ITERATOR(vmi, mm, 0);
-
- mmap_read_lock(mm);
- for_each_vma(vmi, vma) {
- if (vma_is_special_mapping(vma, &vvar_mapping))
- zap_vma_pages(vma);
- }
- mmap_read_unlock(mm);
-
- return 0;
-}
-#endif
-
-static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- const struct vdso_image *image = vma->vm_mm->context.vdso_image;
- unsigned long pfn;
- long sym_offset;
-
- if (!image)
- return VM_FAULT_SIGBUS;
-
- sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
- image->sym_vvar_start;
-
- /*
- * Sanity check: a symbol offset of zero means that the page
- * does not exist for this vdso image, not that the page is at
- * offset zero relative to the text mapping. This should be
- * impossible here, because sym_offset should only be zero for
- * the page past the end of the vvar mapping.
- */
- if (sym_offset == 0)
- return VM_FAULT_SIGBUS;
-
- if (sym_offset == image->sym_vvar_page) {
- struct page *timens_page = find_timens_vvar_page(vma);
-
- pfn = __pa_symbol(vdso_data) >> PAGE_SHIFT;
-
- /*
- * If a task belongs to a time namespace then a namespace
- * specific VVAR is mapped with the sym_vvar_page offset and
- * the real VVAR page is mapped with the sym_timens_page
- * offset.
- * See also the comment near timens_setup_vdso_data().
- */
- if (timens_page) {
- unsigned long addr;
- vm_fault_t err;
-
- /*
- * Optimization: inside time namespace pre-fault
- * VVAR page too. As on timens page there are only
- * offsets for clocks on VVAR, it'll be faulted
- * shortly by VDSO code.
- */
- addr = vmf->address + (image->sym_timens_page - sym_offset);
- err = vmf_insert_pfn(vma, addr, pfn);
- if (unlikely(err & VM_FAULT_ERROR))
- return err;
-
- pfn = page_to_pfn(timens_page);
- }
-
- return vmf_insert_pfn(vma, vmf->address, pfn);
-
- } else if (sym_offset == image->sym_timens_page) {
- struct page *timens_page = find_timens_vvar_page(vma);
-
- if (!timens_page)
- return VM_FAULT_SIGBUS;
-
- pfn = __pa_symbol(vdso_data) >> PAGE_SHIFT;
- return vmf_insert_pfn(vma, vmf->address, pfn);
- }
-
- return VM_FAULT_SIGBUS;
-}
-
static vm_fault_t vvar_vclock_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
@@ -212,7 +111,6 @@ static vm_fault_t vvar_vclock_fault(const struct vm_special_mapping *sm,
case VDSO_PAGE_HVCLOCK_OFFSET:
{
unsigned long pfn = hv_get_tsc_pfn();
-
if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
return vmf_insert_pfn(vma, vmf->address, pfn);
break;
@@ -228,10 +126,6 @@ static const struct vm_special_mapping vdso_mapping = {
.fault = vdso_fault,
.mremap = vdso_mremap,
};
-static const struct vm_special_mapping vvar_mapping = {
- .name = "[vvar]",
- .fault = vvar_fault,
-};
static const struct vm_special_mapping vvar_vclock_mapping = {
.name = "[vvar_vclock]",
.fault = vvar_vclock_fault,
@@ -253,13 +147,13 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
return -EINTR;
addr = get_unmapped_area(NULL, addr,
- image->size - image->sym_vvar_start, 0, 0);
+ image->size + __VDSO_PAGES * PAGE_SIZE, 0, 0);
if (IS_ERR_VALUE(addr)) {
ret = addr;
goto up_fail;
}
- text_start = addr - image->sym_vvar_start;
+ text_start = addr + __VDSO_PAGES * PAGE_SIZE;
/*
* MAYWRITE to allow gdb to COW and set breakpoints
@@ -276,13 +170,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
goto up_fail;
}
- vma = _install_special_mapping(mm,
- addr,
- (__VVAR_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE,
- VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
- VM_PFNMAP,
- &vvar_mapping);
-
+ vma = vdso_install_vvar_mapping(mm, addr);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
do_munmap(mm, text_start, image->size, NULL);
@@ -290,7 +178,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
}
vma = _install_special_mapping(mm,
- addr + (__VVAR_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE,
+ VDSO_VCLOCK_PAGES_START(addr),
VDSO_NR_VCLOCK_PAGES * PAGE_SIZE,
VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
VM_PFNMAP,
@@ -327,7 +215,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
*/
for_each_vma(vmi, vma) {
if (vma_is_special_mapping(vma, &vdso_mapping) ||
- vma_is_special_mapping(vma, &vvar_mapping) ||
+ vma_is_special_mapping(vma, &vdso_vvar_mapping) ||
vma_is_special_mapping(vma, &vvar_vclock_mapping)) {
mmap_write_unlock(mm);
return -EEXIST;
diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c
index 780acd3dff22..ec3427463382 100644
--- a/arch/x86/events/amd/brs.c
+++ b/arch/x86/events/amd/brs.c
@@ -381,7 +381,8 @@ static void amd_brs_poison_buffer(void)
* On ctxswin, sched_in = true, called after the PMU has started
* On ctxswout, sched_in = false, called before the PMU is stopped
*/
-void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index e7a8b8758e08..0252b7ea8bca 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -28,9 +28,6 @@ static u32 ibs_caps;
#include <asm/nmi.h>
#include <asm/amd-ibs.h>
-#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
-#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
-
/* attr.config2 */
#define IBS_SW_FILTER_MASK 1
@@ -89,6 +86,7 @@ struct perf_ibs {
u64 cnt_mask;
u64 enable_mask;
u64 valid_mask;
+ u16 min_period;
u64 max_period;
unsigned long offset_mask[1];
int offset_max;
@@ -270,11 +268,19 @@ static int validate_group(struct perf_event *event)
return 0;
}
+static bool perf_ibs_ldlat_event(struct perf_ibs *perf_ibs,
+ struct perf_event *event)
+{
+ return perf_ibs == &perf_ibs_op &&
+ (ibs_caps & IBS_CAPS_OPLDLAT) &&
+ (event->attr.config1 & 0xFFF);
+}
+
static int perf_ibs_init(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs;
- u64 max_cnt, config;
+ u64 config;
int ret;
perf_ibs = get_ibs_pmu(event->attr.type);
@@ -310,25 +316,47 @@ static int perf_ibs_init(struct perf_event *event)
if (config & perf_ibs->cnt_mask)
/* raw max_cnt may not be set */
return -EINVAL;
- if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
- /*
- * lower 4 bits can not be set in ibs max cnt,
- * but allowing it in case we adjust the
- * sample period to set a frequency.
- */
- return -EINVAL;
- hwc->sample_period &= ~0x0FULL;
- if (!hwc->sample_period)
- hwc->sample_period = 0x10;
+
+ if (event->attr.freq) {
+ hwc->sample_period = perf_ibs->min_period;
+ } else {
+ /* Silently mask off lower nibble. IBS hw mandates it. */
+ hwc->sample_period &= ~0x0FULL;
+ if (hwc->sample_period < perf_ibs->min_period)
+ return -EINVAL;
+ }
} else {
- max_cnt = config & perf_ibs->cnt_mask;
+ u64 period = 0;
+
+ if (event->attr.freq)
+ return -EINVAL;
+
+ if (perf_ibs == &perf_ibs_op) {
+ period = (config & IBS_OP_MAX_CNT) << 4;
+ if (ibs_caps & IBS_CAPS_OPCNTEXT)
+ period |= config & IBS_OP_MAX_CNT_EXT_MASK;
+ } else {
+ period = (config & IBS_FETCH_MAX_CNT) << 4;
+ }
+
config &= ~perf_ibs->cnt_mask;
- event->attr.sample_period = max_cnt << 4;
- hwc->sample_period = event->attr.sample_period;
+ event->attr.sample_period = period;
+ hwc->sample_period = period;
+
+ if (hwc->sample_period < perf_ibs->min_period)
+ return -EINVAL;
}
- if (!hwc->sample_period)
- return -EINVAL;
+ if (perf_ibs_ldlat_event(perf_ibs, event)) {
+ u64 ldlat = event->attr.config1 & 0xFFF;
+
+ if (ldlat < 128 || ldlat > 2048)
+ return -EINVAL;
+ ldlat >>= 7;
+
+ config |= (ldlat - 1) << 59;
+ config |= IBS_OP_L3MISSONLY | IBS_OP_LDLAT_EN;
+ }
/*
* If we modify hwc->sample_period, we also need to update
@@ -349,7 +377,8 @@ static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
int overflow;
/* ignore lower 4 bits in min count: */
- overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+ overflow = perf_event_set_period(hwc, perf_ibs->min_period,
+ perf_ibs->max_period, period);
local64_set(&hwc->prev_count, 0);
return overflow;
@@ -447,6 +476,9 @@ static void perf_ibs_start(struct perf_event *event, int flags)
WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
hwc->state = 0;
+ if (event->attr.freq && hwc->sample_period < perf_ibs->min_period)
+ hwc->sample_period = perf_ibs->min_period;
+
perf_ibs_set_period(perf_ibs, hwc, &period);
if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) {
config |= period & IBS_OP_MAX_CNT_EXT_MASK;
@@ -554,6 +586,28 @@ static void perf_ibs_del(struct perf_event *event, int flags)
static void perf_ibs_read(struct perf_event *event) { }
+static int perf_ibs_check_period(struct perf_event *event, u64 value)
+{
+ struct perf_ibs *perf_ibs;
+ u64 low_nibble;
+
+ if (event->attr.freq)
+ return 0;
+
+ perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ low_nibble = value & 0xFULL;
+
+ /*
+ * This contradicts with perf_ibs_init() which allows sample period
+ * with lower nibble bits set but silently masks them off. Whereas
+ * this returns error.
+ */
+ if (low_nibble || value < perf_ibs->min_period)
+ return -EINVAL;
+
+ return 0;
+}
+
/*
* We need to initialize with empty group if all attributes in the
* group are dynamic.
@@ -572,7 +626,10 @@ PMU_FORMAT_ATTR(cnt_ctl, "config:19");
PMU_FORMAT_ATTR(swfilt, "config2:0");
PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
+PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11");
PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
+PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1");
+PMU_EVENT_ATTR_STRING(dtlb_pgsize, ibs_op_dtlb_pgsize_cap, "1");
static umode_t
zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i)
@@ -580,6 +637,18 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int
return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
}
+static umode_t
+ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return ibs_caps & IBS_CAPS_OPLDLAT ? attr->mode : 0;
+}
+
+static umode_t
+ibs_op_dtlb_pgsize_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return ibs_caps & IBS_CAPS_OPDTLBPGSIZE ? attr->mode : 0;
+}
+
static struct attribute *fetch_attrs[] = {
&format_attr_rand_en.attr,
&format_attr_swfilt.attr,
@@ -596,6 +665,16 @@ static struct attribute *zen4_ibs_extensions_attrs[] = {
NULL,
};
+static struct attribute *ibs_op_ldlat_cap_attrs[] = {
+ &ibs_op_ldlat_cap.attr.attr,
+ NULL,
+};
+
+static struct attribute *ibs_op_dtlb_pgsize_cap_attrs[] = {
+ &ibs_op_dtlb_pgsize_cap.attr.attr,
+ NULL,
+};
+
static struct attribute_group group_fetch_formats = {
.name = "format",
.attrs = fetch_attrs,
@@ -613,6 +692,18 @@ static struct attribute_group group_zen4_ibs_extensions = {
.is_visible = zen4_ibs_extensions_is_visible,
};
+static struct attribute_group group_ibs_op_ldlat_cap = {
+ .name = "caps",
+ .attrs = ibs_op_ldlat_cap_attrs,
+ .is_visible = ibs_op_ldlat_is_visible,
+};
+
+static struct attribute_group group_ibs_op_dtlb_pgsize_cap = {
+ .name = "caps",
+ .attrs = ibs_op_dtlb_pgsize_cap_attrs,
+ .is_visible = ibs_op_dtlb_pgsize_is_visible,
+};
+
static const struct attribute_group *fetch_attr_groups[] = {
&group_fetch_formats,
&empty_caps_group,
@@ -651,6 +742,11 @@ static struct attribute_group group_op_formats = {
.attrs = op_attrs,
};
+static struct attribute *ibs_op_ldlat_format_attrs[] = {
+ &ibs_op_ldlat_format.attr.attr,
+ NULL,
+};
+
static struct attribute_group group_cnt_ctl = {
.name = "format",
.attrs = cnt_ctl_attrs,
@@ -669,10 +765,19 @@ static const struct attribute_group *op_attr_groups[] = {
NULL,
};
+static struct attribute_group group_ibs_op_ldlat_format = {
+ .name = "format",
+ .attrs = ibs_op_ldlat_format_attrs,
+ .is_visible = ibs_op_ldlat_is_visible,
+};
+
static const struct attribute_group *op_attr_update[] = {
&group_cnt_ctl,
&group_op_l3missonly,
&group_zen4_ibs_extensions,
+ &group_ibs_op_ldlat_cap,
+ &group_ibs_op_ldlat_format,
+ &group_ibs_op_dtlb_pgsize_cap,
NULL,
};
@@ -686,12 +791,14 @@ static struct perf_ibs perf_ibs_fetch = {
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
+ .check_period = perf_ibs_check_period,
},
.msr = MSR_AMD64_IBSFETCHCTL,
- .config_mask = IBS_FETCH_CONFIG_MASK,
+ .config_mask = IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN,
.cnt_mask = IBS_FETCH_MAX_CNT,
.enable_mask = IBS_FETCH_ENABLE,
.valid_mask = IBS_FETCH_VAL,
+ .min_period = 0x10,
.max_period = IBS_FETCH_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
.offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
@@ -709,13 +816,15 @@ static struct perf_ibs perf_ibs_op = {
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
+ .check_period = perf_ibs_check_period,
},
.msr = MSR_AMD64_IBSOPCTL,
- .config_mask = IBS_OP_CONFIG_MASK,
+ .config_mask = IBS_OP_MAX_CNT,
.cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
IBS_OP_CUR_CNT_RAND,
.enable_mask = IBS_OP_ENABLE,
.valid_mask = IBS_OP_VAL,
+ .min_period = 0x90,
.max_period = IBS_OP_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
.offset_max = MSR_AMD64_IBSOP_REG_COUNT,
@@ -917,6 +1026,10 @@ static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3,
if (!op_data3->dc_lin_addr_valid)
return;
+ if ((ibs_caps & IBS_CAPS_OPDTLBPGSIZE) &&
+ !op_data3->dc_phy_addr_valid)
+ return;
+
if (!op_data3->dc_l1tlb_miss) {
data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT;
return;
@@ -941,6 +1054,8 @@ static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3,
data_src->mem_lock = PERF_MEM_LOCK_LOCKED;
}
+/* Be careful. Works only for contiguous MSRs. */
+#define ibs_fetch_msr_idx(msr) (msr - MSR_AMD64_IBSFETCHCTL)
#define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL)
static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
@@ -1021,21 +1136,92 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type,
}
}
-static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type,
+static bool perf_ibs_is_mem_sample_type(struct perf_ibs *perf_ibs,
+ struct perf_event *event)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ return perf_ibs == &perf_ibs_op &&
+ sample_type & (PERF_SAMPLE_DATA_SRC |
+ PERF_SAMPLE_WEIGHT_TYPE |
+ PERF_SAMPLE_ADDR |
+ PERF_SAMPLE_PHYS_ADDR);
+}
+
+static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs,
+ struct perf_event *event,
int check_rip)
{
- if (sample_type & PERF_SAMPLE_RAW ||
- (perf_ibs == &perf_ibs_op &&
- (sample_type & PERF_SAMPLE_DATA_SRC ||
- sample_type & PERF_SAMPLE_WEIGHT_TYPE ||
- sample_type & PERF_SAMPLE_ADDR ||
- sample_type & PERF_SAMPLE_PHYS_ADDR)))
+ if (event->attr.sample_type & PERF_SAMPLE_RAW ||
+ perf_ibs_is_mem_sample_type(perf_ibs, event) ||
+ perf_ibs_ldlat_event(perf_ibs, event))
return perf_ibs->offset_max;
else if (check_rip)
return 3;
return 1;
}
+static bool perf_ibs_is_kernel_data_addr(struct perf_event *event,
+ struct perf_ibs_data *ibs_data)
+{
+ u64 sample_type_mask = PERF_SAMPLE_ADDR | PERF_SAMPLE_RAW;
+ union ibs_op_data3 op_data3;
+ u64 dc_lin_addr;
+
+ op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
+ dc_lin_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)];
+
+ return unlikely((event->attr.sample_type & sample_type_mask) &&
+ op_data3.dc_lin_addr_valid && kernel_ip(dc_lin_addr));
+}
+
+static bool perf_ibs_is_kernel_br_target(struct perf_event *event,
+ struct perf_ibs_data *ibs_data,
+ int br_target_idx)
+{
+ union ibs_op_data op_data;
+ u64 br_target;
+
+ op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)];
+ br_target = ibs_data->regs[br_target_idx];
+
+ return unlikely((event->attr.sample_type & PERF_SAMPLE_RAW) &&
+ op_data.op_brn_ret && kernel_ip(br_target));
+}
+
+static bool perf_ibs_swfilt_discard(struct perf_ibs *perf_ibs, struct perf_event *event,
+ struct pt_regs *regs, struct perf_ibs_data *ibs_data,
+ int br_target_idx)
+{
+ if (perf_exclude_event(event, regs))
+ return true;
+
+ if (perf_ibs != &perf_ibs_op || !event->attr.exclude_kernel)
+ return false;
+
+ if (perf_ibs_is_kernel_data_addr(event, ibs_data))
+ return true;
+
+ if (br_target_idx != -1 &&
+ perf_ibs_is_kernel_br_target(event, ibs_data, br_target_idx))
+ return true;
+
+ return false;
+}
+
+static void perf_ibs_phyaddr_clear(struct perf_ibs *perf_ibs,
+ struct perf_ibs_data *ibs_data)
+{
+ if (perf_ibs == &perf_ibs_op) {
+ ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)] &= ~(1ULL << 18);
+ ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)] = 0;
+ return;
+ }
+
+ ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHCTL)] &= ~(1ULL << 52);
+ ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHPHYSAD)] = 0;
+}
+
static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
{
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
@@ -1048,6 +1234,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
int offset, size, check_rip, offset_max, throttle = 0;
unsigned int msr;
u64 *buf, *config, period, new_config = 0;
+ int br_target_idx = -1;
if (!test_bit(IBS_STARTED, pcpu->state)) {
fail:
@@ -1084,7 +1271,7 @@ fail:
offset = 1;
check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
- offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip);
+ offset_max = perf_ibs_get_offset_max(perf_ibs, event, check_rip);
do {
rdmsrl(msr + offset, *buf++);
@@ -1093,6 +1280,22 @@ fail:
perf_ibs->offset_max,
offset + 1);
} while (offset < offset_max);
+
+ if (perf_ibs_ldlat_event(perf_ibs, event)) {
+ union ibs_op_data3 op_data3;
+
+ op_data3.val = ibs_data.regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
+ /*
+ * Opening event is errored out if load latency threshold is
+ * outside of [128, 2048] range. Since the event has reached
+ * interrupt handler, we can safely assume the threshold is
+ * within [128, 2048] range.
+ */
+ if (!op_data3.ld_op || !op_data3.dc_miss ||
+ op_data3.dc_miss_lat <= (event->attr.config1 & 0xFFF))
+ goto out;
+ }
+
/*
* Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately
* depending on their availability.
@@ -1102,6 +1305,7 @@ fail:
if (perf_ibs == &perf_ibs_op) {
if (ibs_caps & IBS_CAPS_BRNTRGT) {
rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
+ br_target_idx = size;
size++;
}
if (ibs_caps & IBS_CAPS_OPDATA4) {
@@ -1129,10 +1333,19 @@ fail:
}
if ((event->attr.config2 & IBS_SW_FILTER_MASK) &&
- perf_exclude_event(event, &regs)) {
+ perf_ibs_swfilt_discard(perf_ibs, event, &regs, &ibs_data, br_target_idx)) {
throttle = perf_event_account_interrupt(event);
goto out;
}
+ /*
+ * Prevent leaking physical addresses to unprivileged users. Skip
+ * PERF_SAMPLE_PHYS_ADDR check since generic code prevents it for
+ * unprivileged users.
+ */
+ if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
+ perf_allow_kernel()) {
+ perf_ibs_phyaddr_clear(perf_ibs, &ibs_data);
+ }
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw = (struct perf_raw_record){
@@ -1155,6 +1368,10 @@ fail:
perf_sample_save_callchain(&data, event, iregs);
throttle = perf_event_overflow(event, &data, &regs);
+
+ if (event->attr.freq && hwc->sample_period < perf_ibs->min_period)
+ hwc->sample_period = perf_ibs->min_period;
+
out:
if (throttle) {
perf_ibs_stop(event, 0);
@@ -1244,7 +1461,8 @@ static __init int perf_ibs_op_init(void)
if (ibs_caps & IBS_CAPS_OPCNTEXT) {
perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK;
perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK;
- perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK;
+ perf_ibs_op.cnt_mask |= (IBS_OP_MAX_CNT_EXT_MASK |
+ IBS_OP_CUR_CNT_EXT_MASK);
}
if (ibs_caps & IBS_CAPS_ZEN4)
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index b15f7b950d2e..f8228d8243f7 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -30,7 +30,7 @@
#define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL)
#define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL)
-#define IOMMU_NAME_SIZE 16
+#define IOMMU_NAME_SIZE 24
struct perf_amd_iommu {
struct list_head list;
diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index 19c7b76e21bc..c06ccca96851 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -371,7 +371,8 @@ void amd_pmu_lbr_del(struct perf_event *event)
perf_sched_cb_dec(event->pmu);
}
-void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 2092d615333d..6866cc5acb0b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -87,13 +87,14 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task);
-DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
+DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup);
+
/*
* This one is magic, it will get called even when PMU init fails (because
* there is no PMU), in which case it should simply return NULL.
@@ -1298,6 +1299,15 @@ static void x86_pmu_enable(struct pmu *pmu)
if (cpuc->n_added) {
int n_running = cpuc->n_events - cpuc->n_added;
+
+ /*
+ * The late setup (after counters are scheduled)
+ * is required for some cases, e.g., PEBS counters
+ * snapshotting. Because an accurate counter index
+ * is needed.
+ */
+ static_call_cond(x86_pmu_late_setup)();
+
/*
* apply assignment obtained either from
* hw_perf_group_sched_in() or x86_pmu_enable()
@@ -2028,13 +2038,14 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);
static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
- static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx);
static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
static_call_update(x86_pmu_filter, x86_pmu.filter);
+
+ static_call_update(x86_pmu_late_setup, x86_pmu.late_setup);
}
static void _x86_pmu_read(struct perf_event *event)
@@ -2625,15 +2636,10 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};
-static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
-{
- static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in);
-}
-
-static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
- struct perf_event_pmu_context *next_epc)
+static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
{
- static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc);
+ static_call_cond(x86_pmu_sched_task)(pmu_ctx, task, sched_in);
}
void perf_check_microcode(void)
@@ -2700,7 +2706,6 @@ static struct pmu pmu = {
.event_idx = x86_pmu_event_idx,
.sched_task = x86_pmu_sched_task,
- .swap_task_ctx = x86_pmu_swap_task_ctx,
.check_period = x86_pmu_check_period,
.aux_output_match = x86_pmu_aux_output_match,
@@ -2844,7 +2849,7 @@ static bool is_uprobe_at_func_entry(struct pt_regs *regs)
return true;
/* endbr64 (64-bit only) */
- if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn))
+ if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
return true;
return false;
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 8f78b0c900ef..a95e6c91c4d7 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -36,7 +36,7 @@ enum {
BTS_STATE_ACTIVE,
};
-static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
+static struct bts_ctx __percpu *bts_ctx;
#define BTS_RECORD_SIZE 24
#define BTS_SAFETY_MARGIN 4080
@@ -58,7 +58,7 @@ struct bts_buffer {
local_t head;
unsigned long end;
void **data_pages;
- struct bts_phys buf[];
+ struct bts_phys buf[] __counted_by(nr_bufs);
};
static struct pmu bts_pmu;
@@ -231,7 +231,7 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle);
static void __bts_event_start(struct perf_event *event)
{
- struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
struct bts_buffer *buf = perf_get_aux(&bts->handle);
u64 config = 0;
@@ -260,7 +260,7 @@ static void __bts_event_start(struct perf_event *event)
static void bts_event_start(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
struct bts_buffer *buf;
buf = perf_aux_output_begin(&bts->handle, event);
@@ -290,7 +290,7 @@ fail_stop:
static void __bts_event_stop(struct perf_event *event, int state)
{
- struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
/* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
WRITE_ONCE(bts->state, state);
@@ -305,7 +305,7 @@ static void __bts_event_stop(struct perf_event *event, int state)
static void bts_event_stop(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
struct bts_buffer *buf = NULL;
int state = READ_ONCE(bts->state);
@@ -338,9 +338,14 @@ static void bts_event_stop(struct perf_event *event, int flags)
void intel_bts_enable_local(void)
{
- struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
- int state = READ_ONCE(bts->state);
+ struct bts_ctx *bts;
+ int state;
+
+ if (!bts_ctx)
+ return;
+ bts = this_cpu_ptr(bts_ctx);
+ state = READ_ONCE(bts->state);
/*
* Here we transition from INACTIVE to ACTIVE;
* if we instead are STOPPED from the interrupt handler,
@@ -358,7 +363,12 @@ void intel_bts_enable_local(void)
void intel_bts_disable_local(void)
{
- struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_ctx *bts;
+
+ if (!bts_ctx)
+ return;
+
+ bts = this_cpu_ptr(bts_ctx);
/*
* Here we transition from ACTIVE to INACTIVE;
@@ -450,12 +460,17 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
int intel_bts_interrupt(void)
{
struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
- struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
- struct perf_event *event = bts->handle.event;
+ struct bts_ctx *bts;
+ struct perf_event *event;
struct bts_buffer *buf;
s64 old_head;
int err = -ENOSPC, handled = 0;
+ if (!bts_ctx)
+ return 0;
+
+ bts = this_cpu_ptr(bts_ctx);
+ event = bts->handle.event;
/*
* The only surefire way of knowing if this NMI is ours is by checking
* the write ptr against the PMI threshold.
@@ -518,7 +533,7 @@ static void bts_event_del(struct perf_event *event, int mode)
static int bts_event_add(struct perf_event *event, int mode)
{
- struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
@@ -559,7 +574,7 @@ static int bts_event_init(struct perf_event *event)
* to the user in a zero-copy fashion.
*/
if (event->attr.exclude_kernel) {
- ret = perf_allow_kernel(&event->attr);
+ ret = perf_allow_kernel();
if (ret)
return ret;
}
@@ -605,6 +620,10 @@ static __init int bts_init(void)
return -ENODEV;
}
+ bts_ctx = alloc_percpu(struct bts_ctx);
+ if (!bts_ctx)
+ return -ENOMEM;
+
bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
PERF_PMU_CAP_EXCLUSIVE;
bts_pmu.task_ctx_nr = perf_sw_context;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index cdb19e3ba3aa..09d2d66c9f21 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2714,7 +2714,7 @@ static void update_saved_topdown_regs(struct perf_event *event, u64 slots,
* modify by a NMI. PMU has to be disabled before calling this function.
*/
-static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
+static u64 intel_update_topdown_event(struct perf_event *event, int metric_end, u64 *val)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *other;
@@ -2722,13 +2722,24 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
bool reset = true;
int idx;
- /* read Fixed counter 3 */
- rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots);
- if (!slots)
- return 0;
+ if (!val) {
+ /* read Fixed counter 3 */
+ rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots);
+ if (!slots)
+ return 0;
- /* read PERF_METRICS */
- rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics);
+ /* read PERF_METRICS */
+ rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics);
+ } else {
+ slots = val[0];
+ metrics = val[1];
+ /*
+ * Don't reset the PERF_METRICS and Fixed counter 3
+ * for each PEBS record read. Utilize the RDPMC metrics
+ * clear mode.
+ */
+ reset = false;
+ }
for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
if (!is_topdown_idx(idx))
@@ -2771,36 +2782,47 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
return slots;
}
-static u64 icl_update_topdown_event(struct perf_event *event)
+static u64 icl_update_topdown_event(struct perf_event *event, u64 *val)
{
return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE +
- x86_pmu.num_topdown_events - 1);
+ x86_pmu.num_topdown_events - 1,
+ val);
}
-DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update);
+DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update);
-static void intel_pmu_read_topdown_event(struct perf_event *event)
+static void intel_pmu_read_event(struct perf_event *event)
{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN) ||
+ is_pebs_counter_event_group(event)) {
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ bool pmu_enabled = cpuc->enabled;
- /* Only need to call update_topdown_event() once for group read. */
- if ((cpuc->txn_flags & PERF_PMU_TXN_READ) &&
- !is_slots_event(event))
- return;
+ /* Only need to call update_topdown_event() once for group read. */
+ if (is_metric_event(event) && (cpuc->txn_flags & PERF_PMU_TXN_READ))
+ return;
- perf_pmu_disable(event->pmu);
- static_call(intel_pmu_update_topdown_event)(event);
- perf_pmu_enable(event->pmu);
-}
+ cpuc->enabled = 0;
+ if (pmu_enabled)
+ intel_pmu_disable_all();
-static void intel_pmu_read_event(struct perf_event *event)
-{
- if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
- intel_pmu_auto_reload_read(event);
- else if (is_topdown_count(event))
- intel_pmu_read_topdown_event(event);
- else
- x86_perf_event_update(event);
+ /*
+ * If the PEBS counters snapshotting is enabled,
+ * the topdown event is available in PEBS records.
+ */
+ if (is_topdown_event(event) && !is_pebs_counter_event_group(event))
+ static_call(intel_pmu_update_topdown_event)(event, NULL);
+ else
+ intel_pmu_drain_pebs_buffer();
+
+ cpuc->enabled = pmu_enabled;
+ if (pmu_enabled)
+ intel_pmu_enable_all(0);
+
+ return;
+ }
+
+ x86_perf_event_update(event);
}
static void intel_pmu_enable_fixed(struct perf_event *event)
@@ -2932,7 +2954,7 @@ static int intel_pmu_set_period(struct perf_event *event)
static u64 intel_pmu_update(struct perf_event *event)
{
if (unlikely(is_topdown_count(event)))
- return static_call(intel_pmu_update_topdown_event)(event);
+ return static_call(intel_pmu_update_topdown_event)(event, NULL);
return x86_perf_event_update(event);
}
@@ -3070,7 +3092,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
handled++;
x86_pmu_handle_guest_pebs(regs, &data);
- x86_pmu.drain_pebs(regs, &data);
+ static_call(x86_pmu_drain_pebs)(regs, &data);
status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
/*
@@ -3098,7 +3120,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
*/
if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) {
handled++;
- static_call(intel_pmu_update_topdown_event)(NULL);
+ static_call(intel_pmu_update_topdown_event)(NULL, NULL);
}
/*
@@ -3116,6 +3138,27 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
if (!test_bit(bit, cpuc->active_mask))
continue;
+ /*
+ * There may be unprocessed PEBS records in the PEBS buffer,
+ * which still stores the previous values.
+ * Process those records first before handling the latest value.
+ * For example,
+ * A is a regular counter
+ * B is a PEBS event which reads A
+ * C is a PEBS event
+ *
+ * The following can happen:
+ * B-assist A=1
+ * C A=2
+ * B-assist A=3
+ * A-overflow-PMI A=4
+ * C-assist-PMI (PEBS buffer) A=5
+ *
+ * The PEBS buffer has to be drained before handling the A-PMI
+ */
+ if (is_pebs_counter_event_group(event))
+ x86_pmu.drain_pebs(regs, &data);
+
if (!intel_pmu_save_and_restart(event))
continue;
@@ -4148,6 +4191,13 @@ static int intel_pmu_hw_config(struct perf_event *event)
event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT;
}
+ if ((event->attr.sample_type & PERF_SAMPLE_READ) &&
+ (x86_pmu.intel_cap.pebs_format >= 6) &&
+ x86_pmu.intel_cap.pebs_baseline &&
+ is_sampling_event(event) &&
+ event->attr.precise_ip)
+ event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
+
if ((event->attr.type == PERF_TYPE_HARDWARE) ||
(event->attr.type == PERF_TYPE_HW_CACHE))
return 0;
@@ -4247,7 +4297,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (x86_pmu.version < 3)
return -EINVAL;
- ret = perf_allow_cpu(&event->attr);
+ ret = perf_allow_cpu();
if (ret)
return ret;
@@ -4685,9 +4735,9 @@ static int adl_hw_config(struct perf_event *event)
return -EOPNOTSUPP;
}
-static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
+static enum intel_cpu_type adl_get_hybrid_cpu_type(void)
{
- return HYBRID_INTEL_CORE;
+ return INTEL_CPU_TYPE_CORE;
}
static inline bool erratum_hsw11(struct perf_event *event)
@@ -5032,7 +5082,8 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void)
{
- u8 cpu_type = get_this_hybrid_cpu_type();
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+ enum intel_cpu_type cpu_type = c->topo.intel_type;
int i;
/*
@@ -5041,7 +5092,7 @@ static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void)
* on it. There should be a fixup function provided for these
* troublesome CPUs (->get_hybrid_cpu_type).
*/
- if (cpu_type == HYBRID_INTEL_NONE) {
+ if (cpu_type == INTEL_CPU_TYPE_UNKNOWN) {
if (x86_pmu.get_hybrid_cpu_type)
cpu_type = x86_pmu.get_hybrid_cpu_type();
else
@@ -5058,16 +5109,16 @@ static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void)
enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type;
u32 native_id;
- if (cpu_type == HYBRID_INTEL_CORE && pmu_type == hybrid_big)
+ if (cpu_type == INTEL_CPU_TYPE_CORE && pmu_type == hybrid_big)
return &x86_pmu.hybrid_pmu[i];
- if (cpu_type == HYBRID_INTEL_ATOM) {
+ if (cpu_type == INTEL_CPU_TYPE_ATOM) {
if (x86_pmu.num_hybrid_pmus == 2 && pmu_type == hybrid_small)
return &x86_pmu.hybrid_pmu[i];
- native_id = get_this_hybrid_cpu_native_id();
- if (native_id == skt_native_id && pmu_type == hybrid_small)
+ native_id = c->topo.intel_native_model_id;
+ if (native_id == INTEL_ATOM_SKT_NATIVE_ID && pmu_type == hybrid_small)
return &x86_pmu.hybrid_pmu[i];
- if (native_id == cmt_native_id && pmu_type == hybrid_tiny)
+ if (native_id == INTEL_ATOM_CMT_NATIVE_ID && pmu_type == hybrid_tiny)
return &x86_pmu.hybrid_pmu[i];
}
}
@@ -5244,16 +5295,10 @@ static void intel_pmu_cpu_dead(int cpu)
}
static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
- bool sched_in)
+ struct task_struct *task, bool sched_in)
{
intel_pmu_pebs_sched_task(pmu_ctx, sched_in);
- intel_pmu_lbr_sched_task(pmu_ctx, sched_in);
-}
-
-static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
- struct perf_event_pmu_context *next_epc)
-{
- intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc);
+ intel_pmu_lbr_sched_task(pmu_ctx, task, sched_in);
}
static int intel_pmu_check_period(struct perf_event *event, u64 value)
@@ -5424,7 +5469,6 @@ static __initconst const struct x86_pmu intel_pmu = {
.guest_get_msrs = intel_guest_get_msrs,
.sched_task = intel_pmu_sched_task,
- .swap_task_ctx = intel_pmu_swap_task_ctx,
.check_period = intel_pmu_check_period,
@@ -6540,15 +6584,21 @@ __init int intel_pmu_init(void)
char *name;
struct x86_hybrid_pmu *pmu;
+ /* Architectural Perfmon was introduced starting with Core "Yonah" */
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
- case 0x6:
- return p6_pmu_init();
- case 0xb:
+ case 6:
+ if (boot_cpu_data.x86_vfm < INTEL_CORE_YONAH)
+ return p6_pmu_init();
+ break;
+ case 11:
return knc_pmu_init();
- case 0xf:
+ case 15:
return p4_pmu_init();
}
+
+ pr_cont("unsupported CPU family %d model %d ",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
return -ENODEV;
}
@@ -6696,7 +6746,7 @@ __init int intel_pmu_init(void)
case INTEL_ATOM_SILVERMONT_D:
case INTEL_ATOM_SILVERMONT_MID:
case INTEL_ATOM_AIRMONT:
- case INTEL_ATOM_AIRMONT_MID:
+ case INTEL_ATOM_SILVERMONT_MID2:
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index f122882ef278..1f7e1a692a7a 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -953,11 +953,11 @@ unlock:
return 1;
}
-static inline void intel_pmu_drain_pebs_buffer(void)
+void intel_pmu_drain_pebs_buffer(void)
{
struct perf_sample_data data;
- x86_pmu.drain_pebs(NULL, &data);
+ static_call(x86_pmu_drain_pebs)(NULL, &data);
}
/*
@@ -1294,6 +1294,19 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
ds->pebs_interrupt_threshold = threshold;
}
+#define PEBS_DATACFG_CNTRS(x) \
+ ((x >> PEBS_DATACFG_CNTR_SHIFT) & PEBS_DATACFG_CNTR_MASK)
+
+#define PEBS_DATACFG_CNTR_BIT(x) \
+ (((1ULL << x) & PEBS_DATACFG_CNTR_MASK) << PEBS_DATACFG_CNTR_SHIFT)
+
+#define PEBS_DATACFG_FIX(x) \
+ ((x >> PEBS_DATACFG_FIX_SHIFT) & PEBS_DATACFG_FIX_MASK)
+
+#define PEBS_DATACFG_FIX_BIT(x) \
+ (((1ULL << (x)) & PEBS_DATACFG_FIX_MASK) \
+ << PEBS_DATACFG_FIX_SHIFT)
+
static void adaptive_pebs_record_size_update(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1308,10 +1321,58 @@ static void adaptive_pebs_record_size_update(void)
sz += sizeof(struct pebs_xmm);
if (pebs_data_cfg & PEBS_DATACFG_LBRS)
sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry);
+ if (pebs_data_cfg & (PEBS_DATACFG_METRICS | PEBS_DATACFG_CNTR)) {
+ sz += sizeof(struct pebs_cntr_header);
+
+ /* Metrics base and Metrics Data */
+ if (pebs_data_cfg & PEBS_DATACFG_METRICS)
+ sz += 2 * sizeof(u64);
+
+ if (pebs_data_cfg & PEBS_DATACFG_CNTR) {
+ sz += (hweight64(PEBS_DATACFG_CNTRS(pebs_data_cfg)) +
+ hweight64(PEBS_DATACFG_FIX(pebs_data_cfg))) *
+ sizeof(u64);
+ }
+ }
cpuc->pebs_record_size = sz;
}
+static void __intel_pmu_pebs_update_cfg(struct perf_event *event,
+ int idx, u64 *pebs_data_cfg)
+{
+ if (is_metric_event(event)) {
+ *pebs_data_cfg |= PEBS_DATACFG_METRICS;
+ return;
+ }
+
+ *pebs_data_cfg |= PEBS_DATACFG_CNTR;
+
+ if (idx >= INTEL_PMC_IDX_FIXED)
+ *pebs_data_cfg |= PEBS_DATACFG_FIX_BIT(idx - INTEL_PMC_IDX_FIXED);
+ else
+ *pebs_data_cfg |= PEBS_DATACFG_CNTR_BIT(idx);
+}
+
+
+static void intel_pmu_late_setup(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_event *event;
+ u64 pebs_data_cfg = 0;
+ int i;
+
+ for (i = 0; i < cpuc->n_events; i++) {
+ event = cpuc->event_list[i];
+ if (!is_pebs_counter_event_group(event))
+ continue;
+ __intel_pmu_pebs_update_cfg(event, cpuc->assign[i], &pebs_data_cfg);
+ }
+
+ if (pebs_data_cfg & ~cpuc->pebs_data_cfg)
+ cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW;
+}
+
#define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \
PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_WEIGHT_TYPE | \
@@ -1914,12 +1975,89 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
#endif
}
+static void intel_perf_event_update_pmc(struct perf_event *event, u64 pmc)
+{
+ int shift = 64 - x86_pmu.cntval_bits;
+ struct hw_perf_event *hwc;
+ u64 delta, prev_pmc;
+
+ /*
+ * A recorded counter may not have an assigned event in the
+ * following cases. The value should be dropped.
+ * - An event is deleted. There is still an active PEBS event.
+ * The PEBS record doesn't shrink on pmu::del().
+ * If the counter of the deleted event once occurred in a PEBS
+ * record, PEBS still records the counter until the counter is
+ * reassigned.
+ * - An event is stopped for some reason, e.g., throttled.
+ * During this period, another event is added and takes the
+ * counter of the stopped event. The stopped event is assigned
+ * to another new and uninitialized counter, since the
+ * x86_pmu_start(RELOAD) is not invoked for a stopped event.
+ * The PEBS__DATA_CFG is updated regardless of the event state.
+ * The uninitialized counter can be recorded in a PEBS record.
+ * But the cpuc->events[uninitialized_counter] is always NULL,
+ * because the event is stopped. The uninitialized value is
+ * safely dropped.
+ */
+ if (!event)
+ return;
+
+ hwc = &event->hw;
+ prev_pmc = local64_read(&hwc->prev_count);
+
+ /* Only update the count when the PMU is disabled */
+ WARN_ON(this_cpu_read(cpu_hw_events.enabled));
+ local64_set(&hwc->prev_count, pmc);
+
+ delta = (pmc << shift) - (prev_pmc << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
+}
+
+static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc,
+ struct perf_event *event,
+ struct pebs_cntr_header *cntr,
+ void *next_record)
+{
+ int bit;
+
+ for_each_set_bit(bit, (unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) {
+ intel_perf_event_update_pmc(cpuc->events[bit], *(u64 *)next_record);
+ next_record += sizeof(u64);
+ }
+
+ for_each_set_bit(bit, (unsigned long *)&cntr->fixed, INTEL_PMC_MAX_FIXED) {
+ /* The slots event will be handled with perf_metric later */
+ if ((cntr->metrics == INTEL_CNTR_METRICS) &&
+ (bit + INTEL_PMC_IDX_FIXED == INTEL_PMC_IDX_FIXED_SLOTS)) {
+ next_record += sizeof(u64);
+ continue;
+ }
+ intel_perf_event_update_pmc(cpuc->events[bit + INTEL_PMC_IDX_FIXED],
+ *(u64 *)next_record);
+ next_record += sizeof(u64);
+ }
+
+ /* HW will reload the value right after the overflow. */
+ if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
+ local64_set(&event->hw.prev_count, (u64)-event->hw.sample_period);
+
+ if (cntr->metrics == INTEL_CNTR_METRICS) {
+ static_call(intel_pmu_update_topdown_event)
+ (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS],
+ (u64 *)next_record);
+ next_record += 2 * sizeof(u64);
+ }
+}
+
#define PEBS_LATENCY_MASK 0xffff
/*
* With adaptive PEBS the layout depends on what fields are configured.
*/
-
static void setup_pebs_adaptive_sample_data(struct perf_event *event,
struct pt_regs *iregs, void *__pebs,
struct perf_sample_data *data,
@@ -2049,6 +2187,28 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
}
+ if (format_group & (PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS)) {
+ struct pebs_cntr_header *cntr = next_record;
+ unsigned int nr;
+
+ next_record += sizeof(struct pebs_cntr_header);
+ /*
+ * The PEBS_DATA_CFG is a global register, which is the
+ * superset configuration for all PEBS events.
+ * For the PEBS record of non-sample-read group, ignore
+ * the counter snapshot fields.
+ */
+ if (is_pebs_counter_event_group(event)) {
+ __setup_pebs_counter_group(cpuc, event, cntr, next_record);
+ data->sample_flags |= PERF_SAMPLE_READ;
+ }
+
+ nr = hweight32(cntr->cntr) + hweight32(cntr->fixed);
+ if (cntr->metrics == INTEL_CNTR_METRICS)
+ nr += 2;
+ next_record += nr * sizeof(u64);
+ }
+
WARN_ONCE(next_record != __pebs + basic->format_size,
"PEBS record size %u, expected %llu, config %llx\n",
basic->format_size,
@@ -2094,15 +2254,6 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
return NULL;
}
-void intel_pmu_auto_reload_read(struct perf_event *event)
-{
- WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
-
- perf_pmu_disable(event->pmu);
- intel_pmu_drain_pebs_buffer();
- perf_pmu_enable(event->pmu);
-}
-
/*
* Special variant of intel_pmu_save_and_restart() for auto-reload.
*/
@@ -2211,13 +2362,21 @@ __intel_pmu_pebs_last_event(struct perf_event *event,
}
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
- /*
- * Now, auto-reload is only enabled in fixed period mode.
- * The reload value is always hwc->sample_period.
- * May need to change it, if auto-reload is enabled in
- * freq mode later.
- */
- intel_pmu_save_and_restart_reload(event, count);
+ if ((is_pebs_counter_event_group(event))) {
+ /*
+ * The value of each sample has been updated when setup
+ * the corresponding sample data.
+ */
+ perf_event_update_userpage(event);
+ } else {
+ /*
+ * Now, auto-reload is only enabled in fixed period mode.
+ * The reload value is always hwc->sample_period.
+ * May need to change it, if auto-reload is enabled in
+ * freq mode later.
+ */
+ intel_pmu_save_and_restart_reload(event, count);
+ }
} else
intel_pmu_save_and_restart(event);
}
@@ -2552,6 +2711,11 @@ void __init intel_ds_init(void)
break;
case 6:
+ if (x86_pmu.intel_cap.pebs_baseline) {
+ x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ;
+ x86_pmu.late_setup = intel_pmu_late_setup;
+ }
+ fallthrough;
case 5:
x86_pmu.pebs_ept = 1;
fallthrough;
@@ -2576,7 +2740,7 @@ void __init intel_ds_init(void)
PERF_SAMPLE_REGS_USER |
PERF_SAMPLE_REGS_INTR);
}
- pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual);
+ pr_cont("PEBS fmt%d%c%s, ", format, pebs_type, pebs_qual);
/*
* The PEBS-via-PT is not supported on hybrid platforms,
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index dc641b50814e..f44c3d866f24 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -422,11 +422,17 @@ static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL);
}
+static inline bool has_lbr_callstack_users(void *ctx)
+{
+ return task_context_opt(ctx)->lbr_callstack_users ||
+ x86_pmu.lbr_callstack_users;
+}
+
static void __intel_pmu_lbr_restore(void *ctx)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- if (task_context_opt(ctx)->lbr_callstack_users == 0 ||
+ if (!has_lbr_callstack_users(ctx) ||
task_context_opt(ctx)->lbr_stack_state == LBR_NONE) {
intel_pmu_lbr_reset();
return;
@@ -503,7 +509,7 @@ static void __intel_pmu_lbr_save(void *ctx)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- if (task_context_opt(ctx)->lbr_callstack_users == 0) {
+ if (!has_lbr_callstack_users(ctx)) {
task_context_opt(ctx)->lbr_stack_state = LBR_NONE;
return;
}
@@ -516,32 +522,11 @@ static void __intel_pmu_lbr_save(void *ctx)
cpuc->last_log_id = ++task_context_opt(ctx)->log_id;
}
-void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
- struct perf_event_pmu_context *next_epc)
-{
- void *prev_ctx_data, *next_ctx_data;
-
- swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
-
- /*
- * Architecture specific synchronization makes sense in case
- * both prev_epc->task_ctx_data and next_epc->task_ctx_data
- * pointers are allocated.
- */
-
- prev_ctx_data = next_epc->task_ctx_data;
- next_ctx_data = prev_epc->task_ctx_data;
-
- if (!prev_ctx_data || !next_ctx_data)
- return;
-
- swap(task_context_opt(prev_ctx_data)->lbr_callstack_users,
- task_context_opt(next_ctx_data)->lbr_callstack_users);
-}
-
-void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_ctx_data *ctx_data;
void *task_ctx;
if (!cpuc->lbr_users)
@@ -552,14 +537,18 @@ void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched
* the task was scheduled out, restore the stack. Otherwise flush
* the LBR stack.
*/
- task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
+ rcu_read_lock();
+ ctx_data = rcu_dereference(task->perf_ctx_data);
+ task_ctx = ctx_data ? ctx_data->data : NULL;
if (task_ctx) {
if (sched_in)
__intel_pmu_lbr_restore(task_ctx);
else
__intel_pmu_lbr_save(task_ctx);
+ rcu_read_unlock();
return;
}
+ rcu_read_unlock();
/*
* Since a context switch can flip the address space and LBR entries
@@ -588,9 +577,19 @@ void intel_pmu_lbr_add(struct perf_event *event)
cpuc->br_sel = event->hw.branch_reg.reg;
- if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data)
- task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users++;
+ if (branch_user_callstack(cpuc->br_sel)) {
+ if (event->attach_state & PERF_ATTACH_TASK) {
+ struct task_struct *task = event->hw.target;
+ struct perf_ctx_data *ctx_data;
+ rcu_read_lock();
+ ctx_data = rcu_dereference(task->perf_ctx_data);
+ if (ctx_data)
+ task_context_opt(ctx_data->data)->lbr_callstack_users++;
+ rcu_read_unlock();
+ } else
+ x86_pmu.lbr_callstack_users++;
+ }
/*
* Request pmu::sched_task() callback, which will fire inside the
* regular perf event scheduling, so that call will:
@@ -664,9 +663,19 @@ void intel_pmu_lbr_del(struct perf_event *event)
if (!x86_pmu.lbr_nr)
return;
- if (branch_user_callstack(cpuc->br_sel) &&
- event->pmu_ctx->task_ctx_data)
- task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users--;
+ if (branch_user_callstack(cpuc->br_sel)) {
+ if (event->attach_state & PERF_ATTACH_TASK) {
+ struct task_struct *task = event->hw.target;
+ struct perf_ctx_data *ctx_data;
+
+ rcu_read_lock();
+ ctx_data = rcu_dereference(task->perf_ctx_data);
+ if (ctx_data)
+ task_context_opt(ctx_data->data)->lbr_callstack_users--;
+ rcu_read_unlock();
+ } else
+ x86_pmu.lbr_callstack_users--;
+ }
if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
cpuc->lbr_select = 0;
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index 844bc4fc4724..c85a9fc44355 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -10,6 +10,7 @@
#include <linux/perf_event.h>
#include <asm/perf_event_p4.h>
+#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>
#include <asm/apic.h>
@@ -732,9 +733,9 @@ static bool p4_event_match_cpu_model(unsigned int event_idx)
{
/* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */
if (event_idx == P4_EVENT_INSTR_COMPLETED) {
- if (boot_cpu_data.x86_model != 3 &&
- boot_cpu_data.x86_model != 4 &&
- boot_cpu_data.x86_model != 6)
+ if (boot_cpu_data.x86_vfm != INTEL_P4_PRESCOTT &&
+ boot_cpu_data.x86_vfm != INTEL_P4_PRESCOTT_2M &&
+ boot_cpu_data.x86_vfm != INTEL_P4_CEDARMILL)
return false;
}
@@ -776,7 +777,7 @@ static int p4_validate_raw_event(struct perf_event *event)
* the user needs special permissions to be able to use it
*/
if (p4_ht_active() && p4_event_bind_map[v].shared) {
- v = perf_allow_cpu(&event->attr);
+ v = perf_allow_cpu();
if (v)
return v;
}
diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c
index a6cffb4f4ef5..65b45e9d7016 100644
--- a/arch/x86/events/intel/p6.c
+++ b/arch/x86/events/intel/p6.c
@@ -2,6 +2,8 @@
#include <linux/perf_event.h>
#include <linux/types.h>
+#include <asm/cpu_device_id.h>
+
#include "../perf_event.h"
/*
@@ -248,30 +250,8 @@ __init int p6_pmu_init(void)
{
x86_pmu = p6_pmu;
- switch (boot_cpu_data.x86_model) {
- case 1: /* Pentium Pro */
+ if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO)
x86_add_quirk(p6_pmu_rdpmc_quirk);
- break;
-
- case 3: /* Pentium II - Klamath */
- case 5: /* Pentium II - Deschutes */
- case 6: /* Pentium II - Mendocino */
- break;
-
- case 7: /* Pentium III - Katmai */
- case 8: /* Pentium III - Coppermine */
- case 10: /* Pentium III Xeon */
- case 11: /* Pentium III - Tualatin */
- break;
-
- case 9: /* Pentium M - Banias */
- case 13: /* Pentium M - Dothan */
- break;
-
- default:
- pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
- return -ENODEV;
- }
memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 60b3078b7502..a34e50fc4a8f 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -347,8 +347,7 @@ void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
{
- hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- box->hrtimer.function = uncore_pmu_hrtimer;
+ hrtimer_setup(&box->hrtimer, uncore_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
}
static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 31c2771545a6..2c0ce0e9545e 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -115,6 +115,11 @@ static inline bool is_branch_counters_group(struct perf_event *event)
return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS;
}
+static inline bool is_pebs_counter_event_group(struct perf_event *event)
+{
+ return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR;
+}
+
struct amd_nb {
int nb_id; /* NorthBridge id */
int refcnt; /* reference count */
@@ -669,18 +674,6 @@ enum {
#define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10
#define PERF_PEBS_DATA_SOURCE_GRT_MASK (PERF_PEBS_DATA_SOURCE_GRT_MAX - 1)
-/*
- * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture
- * of the core. Bits 31-24 indicates its core type (Core or Atom)
- * and Bits [23:0] indicates the native model ID of the core.
- * Core type and native model ID are defined in below enumerations.
- */
-enum hybrid_cpu_type {
- HYBRID_INTEL_NONE,
- HYBRID_INTEL_ATOM = 0x20,
- HYBRID_INTEL_CORE = 0x40,
-};
-
#define X86_HYBRID_PMU_ATOM_IDX 0
#define X86_HYBRID_PMU_CORE_IDX 1
#define X86_HYBRID_PMU_TINY_IDX 2
@@ -697,11 +690,6 @@ enum hybrid_pmu_type {
hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny,
};
-enum atom_native_id {
- cmt_native_id = 0x2, /* Crestmont */
- skt_native_id = 0x3, /* Skymont */
-};
-
struct x86_hybrid_pmu {
struct pmu pmu;
const char *name;
@@ -800,6 +788,7 @@ struct x86_pmu {
u64 (*update)(struct perf_event *event);
int (*hw_config)(struct perf_event *event);
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
+ void (*late_setup)(void);
unsigned eventsel;
unsigned perfctr;
unsigned fixedctr;
@@ -869,7 +858,7 @@ struct x86_pmu {
void (*check_microcode)(void);
void (*sched_task)(struct perf_event_pmu_context *pmu_ctx,
- bool sched_in);
+ struct task_struct *task, bool sched_in);
/*
* Intel Arch Perfmon v2+
@@ -914,6 +903,7 @@ struct x86_pmu {
const int *lbr_sel_map; /* lbr_select mappings */
int *lbr_ctl_map; /* LBR_CTL mappings */
};
+ u64 lbr_callstack_users; /* lbr callstack system wide users */
bool lbr_double_abort; /* duplicated lbr aborts */
bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */
@@ -952,14 +942,6 @@ struct x86_pmu {
int num_topdown_events;
/*
- * perf task context (i.e. struct perf_event_pmu_context::task_ctx_data)
- * switch helper to bridge calls from perf/core to perf/x86.
- * See struct pmu::swap_task_ctx() usage for examples;
- */
- void (*swap_task_ctx)(struct perf_event_pmu_context *prev_epc,
- struct perf_event_pmu_context *next_epc);
-
- /*
* AMD bits
*/
unsigned int amd_nb_constraints : 1;
@@ -994,7 +976,7 @@ struct x86_pmu {
*/
int num_hybrid_pmus;
struct x86_hybrid_pmu *hybrid_pmu;
- enum hybrid_cpu_type (*get_hybrid_cpu_type) (void);
+ enum intel_cpu_type (*get_hybrid_cpu_type) (void);
};
struct x86_perf_task_context_opt {
@@ -1107,6 +1089,8 @@ extern struct x86_pmu x86_pmu __read_mostly;
DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period);
DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update);
+DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
+DECLARE_STATIC_CALL(x86_pmu_late_setup, *x86_pmu.late_setup);
static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx)
{
@@ -1148,6 +1132,12 @@ extern u64 __read_mostly hw_cache_extra_regs
u64 x86_perf_event_update(struct perf_event *event);
+static inline u64 intel_pmu_topdown_event_update(struct perf_event *event, u64 *val)
+{
+ return x86_perf_event_update(event);
+}
+DECLARE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update);
+
static inline unsigned int x86_pmu_config_addr(int index)
{
return x86_pmu.eventsel + (x86_pmu.addr_offset ?
@@ -1394,7 +1384,8 @@ void amd_pmu_lbr_reset(void);
void amd_pmu_lbr_read(void);
void amd_pmu_lbr_add(struct perf_event *event);
void amd_pmu_lbr_del(struct perf_event *event);
-void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
+void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in);
void amd_pmu_lbr_enable_all(void);
void amd_pmu_lbr_disable_all(void);
int amd_pmu_lbr_hw_config(struct perf_event *event);
@@ -1448,7 +1439,8 @@ static inline void amd_pmu_brs_del(struct perf_event *event)
perf_sched_cb_dec(event->pmu);
}
-void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
+void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in);
#else
static inline int amd_brs_init(void)
{
@@ -1473,7 +1465,8 @@ static inline void amd_pmu_brs_del(struct perf_event *event)
{
}
-static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
{
}
@@ -1643,7 +1636,7 @@ void intel_pmu_pebs_disable_all(void);
void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
-void intel_pmu_auto_reload_read(struct perf_event *event);
+void intel_pmu_drain_pebs_buffer(void);
void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
@@ -1653,10 +1646,8 @@ void intel_pmu_lbr_save_brstack(struct perf_sample_data *data,
struct cpu_hw_events *cpuc,
struct perf_event *event);
-void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
- struct perf_event_pmu_context *next_epc);
-
-void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
+void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in);
u64 lbr_from_signext_quirk_wr(u64 val);
diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h
index 6c977c19f2cd..1d9e385649b5 100644
--- a/arch/x86/events/perf_event_flags.h
+++ b/arch/x86/events/perf_event_flags.h
@@ -9,7 +9,7 @@ PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */
PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */
PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */
PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */
- /* 0x00080 */
+PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */
PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */
PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */
PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 6941f4811bec..8ddace8cea96 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -274,8 +274,7 @@ static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
{
struct hrtimer *hr = &rapl_pmu->hrtimer;
- hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hr->function = rapl_hrtimer_handle;
+ hrtimer_setup(hr, rapl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
}
static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
@@ -730,6 +729,7 @@ static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_
{
int nr_rapl_pmu = topology_max_packages();
struct rapl_pmus *rapl_pmus;
+ int ret;
/*
* rapl_pmu_scope must be either PKG, DIE or CORE
@@ -761,7 +761,11 @@ static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_
rapl_pmus->pmu.module = THIS_MODULE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
- return init_rapl_pmu(rapl_pmus);
+ ret = init_rapl_pmu(rapl_pmus);
+ if (ret)
+ kfree(rapl_pmus);
+
+ return ret;
}
static struct rapl_model model_snb = {
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index 3a1548054b48..d55f494f471d 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o
-obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o
+obj-$(CONFIG_X86_64) += hv_apic.o
obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o
ifdef CONFIG_X86_64
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index f022d5f64fb6..6d91ac5f9836 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -145,6 +145,11 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
ipi_arg->vp_set.format = HV_GENERIC_SET_ALL;
}
+ /*
+ * For this hypercall, Hyper-V treats the valid_bank_mask field
+ * of ipi_arg->vp_set as part of the fixed size input header.
+ * So the variable input header size is equal to nr_bank.
+ */
status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
ipi_arg, NULL);
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 173005e6a95d..ddeb40930bc8 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -34,9 +34,6 @@
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
-u64 hv_current_partition_id = ~0ull;
-EXPORT_SYMBOL_GPL(hv_current_partition_id);
-
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
@@ -93,7 +90,7 @@ static int hv_cpu_init(unsigned int cpu)
return 0;
hvp = &hv_vp_assist_page[cpu];
- if (hv_root_partition) {
+ if (hv_root_partition()) {
/*
* For root partition we get the hypervisor provided VP assist
* page, instead of allocating a new page.
@@ -245,7 +242,7 @@ static int hv_cpu_die(unsigned int cpu)
if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
union hv_vp_assist_msr_contents msr = { 0 };
- if (hv_root_partition) {
+ if (hv_root_partition()) {
/*
* For root partition the VP assist page is mapped to
* hypervisor provided page, and thus we unmap the
@@ -320,7 +317,7 @@ static int hv_suspend(void)
union hv_x64_msr_hypercall_contents hypercall_msr;
int ret;
- if (hv_root_partition)
+ if (hv_root_partition())
return -EPERM;
/*
@@ -393,24 +390,6 @@ static void __init hv_stimer_setup_percpu_clockev(void)
old_setup_percpu_clockev();
}
-static void __init hv_get_partition_id(void)
-{
- struct hv_get_partition_id *output_page;
- u64 status;
- unsigned long flags;
-
- local_irq_save(flags);
- output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
- status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page);
- if (!hv_result_success(status)) {
- /* No point in proceeding if this failed */
- pr_err("Failed to get partition ID: %lld\n", status);
- BUG();
- }
- hv_current_partition_id = output_page->partition_id;
- local_irq_restore(flags);
-}
-
#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE)
static u8 __init get_vtl(void)
{
@@ -539,7 +518,7 @@ void __init hyperv_init(void)
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
hypercall_msr.enable = 1;
- if (hv_root_partition) {
+ if (hv_root_partition()) {
struct page *pg;
void *src;
@@ -605,17 +584,15 @@ skip_hypercall_pg_init:
register_syscore_ops(&hv_syscore_ops);
- if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID)
+ if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID)
hv_get_partition_id();
- BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull);
-
#ifdef CONFIG_PCI_MSI
/*
* If we're running as root, we want to create our own PCI MSI domain.
* We can't set this in hv_pci_init because that would be too late.
*/
- if (hv_root_partition)
+ if (hv_root_partition())
x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
#endif
diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c
deleted file mode 100644
index ac4c834d4435..000000000000
--- a/arch/x86/hyperv/hv_proc.c
+++ /dev/null
@@ -1,198 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/types.h>
-#include <linux/vmalloc.h>
-#include <linux/mm.h>
-#include <linux/clockchips.h>
-#include <linux/slab.h>
-#include <linux/cpuhotplug.h>
-#include <linux/minmax.h>
-#include <asm/hypervisor.h>
-#include <asm/mshyperv.h>
-#include <asm/apic.h>
-
-#include <asm/trace/hyperv.h>
-
-/*
- * See struct hv_deposit_memory. The first u64 is partition ID, the rest
- * are GPAs.
- */
-#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
-
-/* Deposits exact number of pages. Must be called with interrupts enabled. */
-int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
-{
- struct page **pages, *page;
- int *counts;
- int num_allocations;
- int i, j, page_count;
- int order;
- u64 status;
- int ret;
- u64 base_pfn;
- struct hv_deposit_memory *input_page;
- unsigned long flags;
-
- if (num_pages > HV_DEPOSIT_MAX)
- return -E2BIG;
- if (!num_pages)
- return 0;
-
- /* One buffer for page pointers and counts */
- page = alloc_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- pages = page_address(page);
-
- counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL);
- if (!counts) {
- free_page((unsigned long)pages);
- return -ENOMEM;
- }
-
- /* Allocate all the pages before disabling interrupts */
- i = 0;
-
- while (num_pages) {
- /* Find highest order we can actually allocate */
- order = 31 - __builtin_clz(num_pages);
-
- while (1) {
- pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
- if (pages[i])
- break;
- if (!order) {
- ret = -ENOMEM;
- num_allocations = i;
- goto err_free_allocations;
- }
- --order;
- }
-
- split_page(pages[i], order);
- counts[i] = 1 << order;
- num_pages -= counts[i];
- i++;
- }
- num_allocations = i;
-
- local_irq_save(flags);
-
- input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
-
- input_page->partition_id = partition_id;
-
- /* Populate gpa_page_list - these will fit on the input page */
- for (i = 0, page_count = 0; i < num_allocations; ++i) {
- base_pfn = page_to_pfn(pages[i]);
- for (j = 0; j < counts[i]; ++j, ++page_count)
- input_page->gpa_page_list[page_count] = base_pfn + j;
- }
- status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
- page_count, 0, input_page, NULL);
- local_irq_restore(flags);
- if (!hv_result_success(status)) {
- pr_err("Failed to deposit pages: %lld\n", status);
- ret = hv_result(status);
- goto err_free_allocations;
- }
-
- ret = 0;
- goto free_buf;
-
-err_free_allocations:
- for (i = 0; i < num_allocations; ++i) {
- base_pfn = page_to_pfn(pages[i]);
- for (j = 0; j < counts[i]; ++j)
- __free_page(pfn_to_page(base_pfn + j));
- }
-
-free_buf:
- free_page((unsigned long)pages);
- kfree(counts);
- return ret;
-}
-
-int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
-{
- struct hv_input_add_logical_processor *input;
- struct hv_output_add_logical_processor *output;
- u64 status;
- unsigned long flags;
- int ret = HV_STATUS_SUCCESS;
-
- /*
- * When adding a logical processor, the hypervisor may return
- * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more
- * pages and retry.
- */
- do {
- local_irq_save(flags);
-
- input = *this_cpu_ptr(hyperv_pcpu_input_arg);
- /* We don't do anything with the output right now */
- output = *this_cpu_ptr(hyperv_pcpu_output_arg);
-
- input->lp_index = lp_index;
- input->apic_id = apic_id;
- input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
- status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR,
- input, output);
- local_irq_restore(flags);
-
- if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
- if (!hv_result_success(status)) {
- pr_err("%s: cpu %u apic ID %u, %lld\n", __func__,
- lp_index, apic_id, status);
- ret = hv_result(status);
- }
- break;
- }
- ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
- } while (!ret);
-
- return ret;
-}
-
-int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
-{
- struct hv_create_vp *input;
- u64 status;
- unsigned long irq_flags;
- int ret = HV_STATUS_SUCCESS;
-
- /* Root VPs don't seem to need pages deposited */
- if (partition_id != hv_current_partition_id) {
- /* The value 90 is empirically determined. It may change. */
- ret = hv_call_deposit_pages(node, partition_id, 90);
- if (ret)
- return ret;
- }
-
- do {
- local_irq_save(irq_flags);
-
- input = *this_cpu_ptr(hyperv_pcpu_input_arg);
-
- input->partition_id = partition_id;
- input->vp_index = vp_index;
- input->flags = flags;
- input->subnode_type = HV_SUBNODE_ANY;
- input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
- status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
- local_irq_restore(irq_flags);
-
- if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
- if (!hv_result_success(status)) {
- pr_err("%s: vcpu %u, lp %u, %lld\n", __func__,
- vp_index, flags, status);
- ret = hv_result(status);
- }
- break;
- }
- ret = hv_call_deposit_pages(node, partition_id, 1);
-
- } while (!ret);
-
- return ret;
-}
-
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 3f4e20d7b724..13242ed8ff16 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -12,6 +12,7 @@
#include <asm/i8259.h>
#include <asm/mshyperv.h>
#include <asm/realmode.h>
+#include <asm/reboot.h>
#include <../kernel/smpboot.h>
extern struct boot_params boot_params;
@@ -22,6 +23,36 @@ static bool __init hv_vtl_msi_ext_dest_id(void)
return true;
}
+/*
+ * The `native_machine_emergency_restart` function from `reboot.c` writes
+ * to the physical address 0x472 to indicate the type of reboot for the
+ * firmware. We cannot have that in VSM as the memory composition might
+ * be more generic, and such write effectively corrupts the memory thus
+ * making diagnostics harder at the very least.
+ */
+static void __noreturn hv_vtl_emergency_restart(void)
+{
+ /*
+ * Cause a triple fault and the immediate reset. Here the code does not run
+ * on the top of any firmware, whereby cannot reach out to its services.
+ * The inifinite loop is for the improbable case that the triple fault does
+ * not work and have to preserve the state intact for debugging.
+ */
+ for (;;) {
+ idt_invalidate();
+ __asm__ __volatile__("int3");
+ }
+}
+
+/*
+ * The only way to restart in the VTL mode is to triple fault as the kernel runs
+ * as firmware.
+ */
+static void __noreturn hv_vtl_restart(char __maybe_unused *cmd)
+{
+ hv_vtl_emergency_restart();
+}
+
void __init hv_vtl_init_platform(void)
{
pr_info("Linux runs in Hyper-V Virtual Trust Level\n");
@@ -236,6 +267,9 @@ static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
int __init hv_vtl_early_init(void)
{
+ machine_ops.emergency_restart = hv_vtl_emergency_restart;
+ machine_ops.restart = hv_vtl_restart;
+
/*
* `boot_cpu_has` returns the runtime feature support,
* and here is the earliest it can be used.
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 64b921360b0f..31f0d29cbc5e 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -64,7 +64,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
local_irq_restore(flags);
if (!hv_result_success(status))
- pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+ hv_status_err(status, "\n");
return hv_result(status);
}
@@ -224,7 +224,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
kfree(stored_entry);
if (status != HV_STATUS_SUCCESS) {
- pr_debug("%s: failed to unmap, status %lld", __func__, status);
+ hv_status_debug(status, "failed to unmap\n");
return;
}
}
@@ -273,7 +273,7 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
status = hv_unmap_msi_interrupt(dev, &old_entry);
if (status != HV_STATUS_SUCCESS)
- pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+ hv_status_err(status, "\n");
}
static void hv_msi_free_irq(struct irq_domain *domain,
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index ec7880271cf9..77bf05f06b9e 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -338,7 +338,7 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
vmsa->sev_features = sev_status >> 2;
ret = snp_set_vmsa(vmsa, true);
- if (!ret) {
+ if (ret) {
pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret);
free_page((u64)vmsa);
return ret;
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index cc8c3bd0e7c2..cfcb60468b01 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -205,6 +205,10 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
/*
* We can flush not more than max_gvas with one hypercall. Flush the
* whole address space if we were asked to do more.
+ *
+ * For these hypercalls, Hyper-V treats the valid_bank_mask field
+ * of flush->hv_vp_set as part of the fixed size input header.
+ * So the variable input header size is equal to nr_bank.
*/
max_gvas =
(PAGE_SIZE - sizeof(*flush) - nr_bank *
@@ -239,5 +243,4 @@ void hyperv_setup_mmu_ops(void)
pr_info("Using hypercall for remote TLB flush\n");
pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
}
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 58f4ddecc5fa..4566000e15c4 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -8,6 +8,7 @@ generated-y += syscalls_x32.h
generated-y += unistd_32_ia32.h
generated-y += unistd_64_x32.h
generated-y += xen-hypercalls.h
+generated-y += cpufeaturemasks.h
generic-y += early_ioremap.h
generic-y += fprobe.h
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index e3903b731305..4a37a8bd87fd 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -15,7 +15,7 @@
#define ALT_DIRECT_CALL(feature) ((ALT_FLAG_DIRECT_CALL << ALT_FLAGS_SHIFT) | (feature))
#define ALT_CALL_ALWAYS ALT_DIRECT_CALL(X86_FEATURE_ALWAYS)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/stddef.h>
@@ -48,7 +48,7 @@
".popsection\n" \
"671:"
-#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
+#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock "
#else /* ! CONFIG_SMP */
#define LOCK_PREFIX_HERE ""
@@ -87,20 +87,19 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
* instructions were patched in already:
*/
extern int alternatives_patched;
-struct module;
extern void alternative_instructions(void);
-extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end,
- struct module *mod);
-extern void apply_retpolines(s32 *start, s32 *end, struct module *mod);
-extern void apply_returns(s32 *start, s32 *end, struct module *mod);
-extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod);
+extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+extern void apply_retpolines(s32 *start, s32 *end);
+extern void apply_returns(s32 *start, s32 *end);
+extern void apply_seal_endbr(s32 *start, s32 *end);
extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine,
- s32 *start_cfi, s32 *end_cfi, struct module *mod);
+ s32 *start_cfi, s32 *end_cfi);
+
+struct module;
struct callthunk_sites {
s32 *call_start, *call_end;
- struct alt_instr *alt_start, *alt_end;
};
#ifdef CONFIG_CALL_THUNKS
@@ -237,10 +236,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
* references: i.e., if used for a function, it would add the PLT
* suffix.
*/
-#define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \
+#define alternative_call(oldfunc, newfunc, ft_flags, output, input, clobbers...) \
asm_inline volatile(ALTERNATIVE("call %c[old]", "call %c[new]", ft_flags) \
: ALT_OUTPUT_SP(output) \
- : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
+ : [old] "i" (oldfunc), [new] "i" (newfunc) \
+ COMMA(input) \
+ : clobbers)
/*
* Like alternative_call, but there are two features and respective functions.
@@ -249,24 +250,14 @@ static inline int alternatives_text_reserved(void *start, void *end)
* Otherwise, old function is used.
*/
#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \
- output, input...) \
+ output, input, clobbers...) \
asm_inline volatile(ALTERNATIVE_2("call %c[old]", "call %c[new1]", ft_flags1, \
"call %c[new2]", ft_flags2) \
: ALT_OUTPUT_SP(output) \
: [old] "i" (oldfunc), [new1] "i" (newfunc1), \
- [new2] "i" (newfunc2), ## input)
-
-/*
- * use this macro(s) if you need more than one output parameter
- * in alternative_io
- */
-#define ASM_OUTPUT2(a...) a
-
-/*
- * use this macro if you need clobbers but no inputs in
- * alternative_{input,io,call}()
- */
-#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr
+ [new2] "i" (newfunc2) \
+ COMMA(input) \
+ : clobbers)
#define ALT_OUTPUT_SP(...) ASM_CALL_CONSTRAINT, ## __VA_ARGS__
@@ -286,7 +277,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
void BUG_func(void);
void nop_func(void);
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
#ifdef CONFIG_SMP
.macro LOCK_PREFIX
@@ -369,6 +360,6 @@ void nop_func(void);
ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
newinstr_yes, ft_flags
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h
index cb2a5e113daa..77f3a589a99a 100644
--- a/arch/x86/include/asm/amd-ibs.h
+++ b/arch/x86/include/asm/amd-ibs.h
@@ -64,7 +64,8 @@ union ibs_op_ctl {
opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */
reserved0:5, /* 27-31: reserved */
opcurcnt:27, /* 32-58: periodic op counter current count */
- reserved1:5; /* 59-63: reserved */
+ ldlat_thrsh:4, /* 59-62: Load Latency threshold */
+ ldlat_en:1; /* 63: Load Latency enabled */
};
};
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 4c4efb93045e..adfa0854cf2d 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -27,7 +27,6 @@ struct amd_l3_cache {
};
struct amd_northbridge {
- struct pci_dev *root;
struct pci_dev *misc;
struct pci_dev *link;
struct amd_l3_cache l3_cache;
diff --git a/arch/x86/include/asm/amd_node.h b/arch/x86/include/asm/amd_node.h
index 113ad3e8ee40..23fe617898a8 100644
--- a/arch/x86/include/asm/amd_node.h
+++ b/arch/x86/include/asm/amd_node.h
@@ -30,7 +30,31 @@ static inline u16 amd_num_nodes(void)
return topology_amd_nodes_per_pkg() * topology_max_packages();
}
+#ifdef CONFIG_AMD_NODE
int __must_check amd_smn_read(u16 node, u32 address, u32 *value);
int __must_check amd_smn_write(u16 node, u32 address, u32 value);
+/* Should only be used by the HSMP driver. */
+int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write);
+#else
+static inline int __must_check amd_smn_read(u16 node, u32 address, u32 *value) { return -ENODEV; }
+static inline int __must_check amd_smn_write(u16 node, u32 address, u32 value) { return -ENODEV; }
+
+static inline int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_AMD_NODE */
+
+/* helper for use with read_poll_timeout */
+static inline int smn_read_register(u32 reg)
+{
+ int data, rc;
+
+ rc = amd_smn_read(0, reg, &data);
+ if (rc)
+ return rc;
+
+ return data;
+}
#endif /*_ASM_X86_AMD_NODE_H_*/
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index f21ff1932699..c903d358405d 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -99,8 +99,8 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
- ASM_OUTPUT2("=r" (v), "=m" (*addr)),
- ASM_OUTPUT2("0" (v), "m" (*addr)));
+ ASM_OUTPUT("=r" (v), "=m" (*addr)),
+ ASM_INPUT("0" (v), "m" (*addr)));
}
static inline u32 native_apic_mem_read(u32 reg)
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index ba88edd0d58b..b5982b94bdba 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -16,9 +16,10 @@ static __always_inline unsigned int __arch_hweight32(unsigned int w)
{
unsigned int res;
- asm (ALTERNATIVE("call __sw_hweight32", "popcntl %1, %0", X86_FEATURE_POPCNT)
- : "="REG_OUT (res)
- : REG_IN (w));
+ asm_inline (ALTERNATIVE("call __sw_hweight32",
+ "popcntl %[val], %[cnt]", X86_FEATURE_POPCNT)
+ : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT
+ : [val] REG_IN (w));
return res;
}
@@ -44,9 +45,10 @@ static __always_inline unsigned long __arch_hweight64(__u64 w)
{
unsigned long res;
- asm (ALTERNATIVE("call __sw_hweight64", "popcntq %1, %0", X86_FEATURE_POPCNT)
- : "="REG_OUT (res)
- : REG_IN (w));
+ asm_inline (ALTERNATIVE("call __sw_hweight64",
+ "popcntq %[val], %[cnt]", X86_FEATURE_POPCNT)
+ : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT
+ : [val] REG_IN (w));
return res;
}
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index 3674006e3974..11c6fecc3ad7 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -16,10 +16,10 @@
#include <asm/gsseg.h>
#include <asm/nospec-branch.h>
-#ifndef CONFIG_X86_CMPXCHG64
+#ifndef CONFIG_X86_CX8
extern void cmpxchg8b_emu(void);
#endif
-#if defined(__GENKSYMS__) && defined(CONFIG_STACKPROTECTOR)
+#ifdef CONFIG_STACKPROTECTOR
extern unsigned long __ref_stack_chk_guard;
#endif
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 2bec0c89a95c..cc2881576c2c 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_ASM_H
#define _ASM_X86_ASM_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
# define __ASM_FORM(x, ...) x,## __VA_ARGS__
# define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__
# define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__,
@@ -113,7 +113,7 @@
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifndef __pic__
static __always_inline __pure void *rip_rel_ptr(void *p)
{
@@ -144,7 +144,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
# include <asm/extable_fixup_types.h>
/* Exception table entry */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
# define _ASM_EXTABLE_TYPE(from, to, type) \
.pushsection "__ex_table","a" ; \
@@ -164,7 +164,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
# define _ASM_NOKPROBE(entry)
# endif
-#else /* ! __ASSEMBLY__ */
+#else /* ! __ASSEMBLER__ */
# define DEFINE_EXTABLE_TYPE_REG \
".macro extable_type_reg type:req reg:req\n" \
@@ -213,6 +213,17 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
/* For C file, we already have NOKPROBE_SYMBOL macro */
+/* Insert a comma if args are non-empty */
+#define COMMA(x...) __COMMA(x)
+#define __COMMA(...) , ##__VA_ARGS__
+
+/*
+ * Combine multiple asm inline constraint args into a single arg for passing to
+ * another macro.
+ */
+#define ASM_OUTPUT(x...) x
+#define ASM_INPUT(x...) x
+
/*
* This output constraint should be used for any inline asm which has a "call"
* instruction. Otherwise the asm may be inserted before the frame pointer
@@ -221,7 +232,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
*/
register unsigned long current_stack_pointer asm(_ASM_SP);
#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define _ASM_EXTABLE(from, to) \
_ASM_EXTABLE_TYPE(from, to, EX_TYPE_DEFAULT)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 55b4d24356ea..75743f1dfd4e 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -30,14 +30,14 @@ static __always_inline void arch_atomic_set(atomic_t *v, int i)
static __always_inline void arch_atomic_add(int i, atomic_t *v)
{
- asm volatile(LOCK_PREFIX "addl %1,%0"
+ asm_inline volatile(LOCK_PREFIX "addl %1, %0"
: "+m" (v->counter)
: "ir" (i) : "memory");
}
static __always_inline void arch_atomic_sub(int i, atomic_t *v)
{
- asm volatile(LOCK_PREFIX "subl %1,%0"
+ asm_inline volatile(LOCK_PREFIX "subl %1, %0"
: "+m" (v->counter)
: "ir" (i) : "memory");
}
@@ -50,14 +50,14 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
static __always_inline void arch_atomic_inc(atomic_t *v)
{
- asm volatile(LOCK_PREFIX "incl %0"
+ asm_inline volatile(LOCK_PREFIX "incl %0"
: "+m" (v->counter) :: "memory");
}
#define arch_atomic_inc arch_atomic_inc
static __always_inline void arch_atomic_dec(atomic_t *v)
{
- asm volatile(LOCK_PREFIX "decl %0"
+ asm_inline volatile(LOCK_PREFIX "decl %0"
: "+m" (v->counter) :: "memory");
}
#define arch_atomic_dec arch_atomic_dec
@@ -116,7 +116,7 @@ static __always_inline int arch_atomic_xchg(atomic_t *v, int new)
static __always_inline void arch_atomic_and(int i, atomic_t *v)
{
- asm volatile(LOCK_PREFIX "andl %1,%0"
+ asm_inline volatile(LOCK_PREFIX "andl %1, %0"
: "+m" (v->counter)
: "ir" (i)
: "memory");
@@ -134,7 +134,7 @@ static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v)
static __always_inline void arch_atomic_or(int i, atomic_t *v)
{
- asm volatile(LOCK_PREFIX "orl %1,%0"
+ asm_inline volatile(LOCK_PREFIX "orl %1, %0"
: "+m" (v->counter)
: "ir" (i)
: "memory");
@@ -152,7 +152,7 @@ static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v)
static __always_inline void arch_atomic_xor(int i, atomic_t *v)
{
- asm volatile(LOCK_PREFIX "xorl %1,%0"
+ asm_inline volatile(LOCK_PREFIX "xorl %1, %0"
: "+m" (v->counter)
: "ir" (i)
: "memory");
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 6c6e9b9f98a4..ab838205c1c6 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -48,17 +48,20 @@ static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v)
ATOMIC64_EXPORT(atomic64_##sym)
#endif
-#ifdef CONFIG_X86_CMPXCHG64
-#define __alternative_atomic64(f, g, out, in...) \
- asm volatile("call %c[func]" \
+#ifdef CONFIG_X86_CX8
+#define __alternative_atomic64(f, g, out, in, clobbers...) \
+ asm volatile("call %c[func]" \
: ALT_OUTPUT_SP(out) \
- : [func] "i" (atomic64_##g##_cx8), ## in)
+ : [func] "i" (atomic64_##g##_cx8) \
+ COMMA(in) \
+ : clobbers)
#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8)
#else
-#define __alternative_atomic64(f, g, out, in...) \
- alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \
- X86_FEATURE_CX8, ASM_OUTPUT2(out), ## in)
+#define __alternative_atomic64(f, g, out, in, clobbers...) \
+ alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \
+ X86_FEATURE_CX8, ASM_OUTPUT(out), \
+ ASM_INPUT(in), clobbers)
#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8); \
ATOMIC64_DECL_ONE(sym##_386)
@@ -69,8 +72,8 @@ ATOMIC64_DECL_ONE(inc_386);
ATOMIC64_DECL_ONE(dec_386);
#endif
-#define alternative_atomic64(f, out, in...) \
- __alternative_atomic64(f, f, ASM_OUTPUT2(out), ## in)
+#define alternative_atomic64(f, out, in, clobbers...) \
+ __alternative_atomic64(f, f, ASM_OUTPUT(out), ASM_INPUT(in), clobbers)
ATOMIC64_DECL(read);
ATOMIC64_DECL(set);
@@ -105,9 +108,10 @@ static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
s64 o;
unsigned high = (unsigned)(n >> 32);
unsigned low = (unsigned)n;
- alternative_atomic64(xchg, "=&A" (o),
- "S" (v), "b" (low), "c" (high)
- : "memory");
+ alternative_atomic64(xchg,
+ "=&A" (o),
+ ASM_INPUT("S" (v), "b" (low), "c" (high)),
+ "memory");
return o;
}
#define arch_atomic64_xchg arch_atomic64_xchg
@@ -116,23 +120,25 @@ static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
{
unsigned high = (unsigned)(i >> 32);
unsigned low = (unsigned)i;
- alternative_atomic64(set, /* no output */,
- "S" (v), "b" (low), "c" (high)
- : "eax", "edx", "memory");
+ alternative_atomic64(set,
+ /* no output */,
+ ASM_INPUT("S" (v), "b" (low), "c" (high)),
+ "eax", "edx", "memory");
}
static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
{
s64 r;
- alternative_atomic64(read, "=&A" (r), "c" (v) : "memory");
+ alternative_atomic64(read, "=&A" (r), "c" (v), "memory");
return r;
}
static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
{
alternative_atomic64(add_return,
- ASM_OUTPUT2("+A" (i), "+c" (v)),
- ASM_NO_INPUT_CLOBBER("memory"));
+ ASM_OUTPUT("+A" (i), "+c" (v)),
+ /* no input */,
+ "memory");
return i;
}
#define arch_atomic64_add_return arch_atomic64_add_return
@@ -140,8 +146,9 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
{
alternative_atomic64(sub_return,
- ASM_OUTPUT2("+A" (i), "+c" (v)),
- ASM_NO_INPUT_CLOBBER("memory"));
+ ASM_OUTPUT("+A" (i), "+c" (v)),
+ /* no input */,
+ "memory");
return i;
}
#define arch_atomic64_sub_return arch_atomic64_sub_return
@@ -149,8 +156,10 @@ static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v)
{
s64 a;
- alternative_atomic64(inc_return, "=&A" (a),
- "S" (v) : "memory", "ecx");
+ alternative_atomic64(inc_return,
+ "=&A" (a),
+ "S" (v),
+ "memory", "ecx");
return a;
}
#define arch_atomic64_inc_return arch_atomic64_inc_return
@@ -158,8 +167,10 @@ static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v)
static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v)
{
s64 a;
- alternative_atomic64(dec_return, "=&A" (a),
- "S" (v) : "memory", "ecx");
+ alternative_atomic64(dec_return,
+ "=&A" (a),
+ "S" (v),
+ "memory", "ecx");
return a;
}
#define arch_atomic64_dec_return arch_atomic64_dec_return
@@ -167,28 +178,34 @@ static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v)
static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
{
__alternative_atomic64(add, add_return,
- ASM_OUTPUT2("+A" (i), "+c" (v)),
- ASM_NO_INPUT_CLOBBER("memory"));
+ ASM_OUTPUT("+A" (i), "+c" (v)),
+ /* no input */,
+ "memory");
}
static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
{
__alternative_atomic64(sub, sub_return,
- ASM_OUTPUT2("+A" (i), "+c" (v)),
- ASM_NO_INPUT_CLOBBER("memory"));
+ ASM_OUTPUT("+A" (i), "+c" (v)),
+ /* no input */,
+ "memory");
}
static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
- __alternative_atomic64(inc, inc_return, /* no output */,
- "S" (v) : "memory", "eax", "ecx", "edx");
+ __alternative_atomic64(inc, inc_return,
+ /* no output */,
+ "S" (v),
+ "memory", "eax", "ecx", "edx");
}
#define arch_atomic64_inc arch_atomic64_inc
static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
- __alternative_atomic64(dec, dec_return, /* no output */,
- "S" (v) : "memory", "eax", "ecx", "edx");
+ __alternative_atomic64(dec, dec_return,
+ /* no output */,
+ "S" (v),
+ "memory", "eax", "ecx", "edx");
}
#define arch_atomic64_dec arch_atomic64_dec
@@ -197,8 +214,9 @@ static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
unsigned low = (unsigned)u;
unsigned high = (unsigned)(u >> 32);
alternative_atomic64(add_unless,
- ASM_OUTPUT2("+A" (a), "+c" (low), "+D" (high)),
- "S" (v) : "memory");
+ ASM_OUTPUT("+A" (a), "+c" (low), "+D" (high)),
+ "S" (v),
+ "memory");
return (int)a;
}
#define arch_atomic64_add_unless arch_atomic64_add_unless
@@ -206,8 +224,10 @@ static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v)
{
int r;
- alternative_atomic64(inc_not_zero, "=&a" (r),
- "S" (v) : "ecx", "edx", "memory");
+ alternative_atomic64(inc_not_zero,
+ "=&a" (r),
+ "S" (v),
+ "ecx", "edx", "memory");
return r;
}
#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
@@ -215,8 +235,10 @@ static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v)
static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
{
s64 r;
- alternative_atomic64(dec_if_positive, "=&A" (r),
- "S" (v) : "ecx", "memory");
+ alternative_atomic64(dec_if_positive,
+ "=&A" (r),
+ "S" (v),
+ "ecx", "memory");
return r;
}
#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index ae12acae5b06..87b496325b5b 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -22,14 +22,14 @@ static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "addq %1,%0"
+ asm_inline volatile(LOCK_PREFIX "addq %1, %0"
: "=m" (v->counter)
: "er" (i), "m" (v->counter) : "memory");
}
static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "subq %1,%0"
+ asm_inline volatile(LOCK_PREFIX "subq %1, %0"
: "=m" (v->counter)
: "er" (i), "m" (v->counter) : "memory");
}
@@ -42,7 +42,7 @@ static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "incq %0"
+ asm_inline volatile(LOCK_PREFIX "incq %0"
: "=m" (v->counter)
: "m" (v->counter) : "memory");
}
@@ -50,7 +50,7 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v)
static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "decq %0"
+ asm_inline volatile(LOCK_PREFIX "decq %0"
: "=m" (v->counter)
: "m" (v->counter) : "memory");
}
@@ -110,7 +110,7 @@ static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "andq %1,%0"
+ asm_inline volatile(LOCK_PREFIX "andq %1, %0"
: "+m" (v->counter)
: "er" (i)
: "memory");
@@ -128,7 +128,7 @@ static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "orq %1,%0"
+ asm_inline volatile(LOCK_PREFIX "orq %1, %0"
: "+m" (v->counter)
: "er" (i)
: "memory");
@@ -146,7 +146,7 @@ static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "xorq %1,%0"
+ asm_inline volatile(LOCK_PREFIX "xorq %1, %0"
: "+m" (v->counter)
: "er" (i)
: "memory");
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 7b44b3c4cce1..db70832232d4 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -12,11 +12,11 @@
*/
#ifdef CONFIG_X86_32
-#define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \
+#define mb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "mfence", \
X86_FEATURE_XMM2) ::: "memory", "cc")
-#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \
+#define rmb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "lfence", \
X86_FEATURE_XMM2) ::: "memory", "cc")
-#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \
+#define wmb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "sfence", \
X86_FEATURE_XMM2) ::: "memory", "cc")
#else
#define __mb() asm volatile("mfence":::"memory")
@@ -50,7 +50,7 @@
#define __dma_rmb() barrier()
#define __dma_wmb() barrier()
-#define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc")
+#define __smp_mb() asm volatile("lock addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc")
#define __smp_rmb() dma_rmb()
#define __smp_wmb() barrier()
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index b96d45944c59..100413aff640 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -52,12 +52,12 @@ static __always_inline void
arch_set_bit(long nr, volatile unsigned long *addr)
{
if (__builtin_constant_p(nr)) {
- asm volatile(LOCK_PREFIX "orb %b1,%0"
+ asm_inline volatile(LOCK_PREFIX "orb %b1,%0"
: CONST_MASK_ADDR(nr, addr)
: "iq" (CONST_MASK(nr))
: "memory");
} else {
- asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
+ asm_inline volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
@@ -72,11 +72,11 @@ static __always_inline void
arch_clear_bit(long nr, volatile unsigned long *addr)
{
if (__builtin_constant_p(nr)) {
- asm volatile(LOCK_PREFIX "andb %b1,%0"
+ asm_inline volatile(LOCK_PREFIX "andb %b1,%0"
: CONST_MASK_ADDR(nr, addr)
: "iq" (~CONST_MASK(nr)));
} else {
- asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
+ asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
@@ -98,7 +98,7 @@ static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
volatile unsigned long *addr)
{
bool negative;
- asm volatile(LOCK_PREFIX "xorb %2,%1"
+ asm_inline volatile(LOCK_PREFIX "xorb %2,%1"
CC_SET(s)
: CC_OUT(s) (negative), WBYTE_ADDR(addr)
: "iq" ((char)mask) : "memory");
@@ -122,11 +122,11 @@ static __always_inline void
arch_change_bit(long nr, volatile unsigned long *addr)
{
if (__builtin_constant_p(nr)) {
- asm volatile(LOCK_PREFIX "xorb %b1,%0"
+ asm_inline volatile(LOCK_PREFIX "xorb %b1,%0"
: CONST_MASK_ADDR(nr, addr)
: "iq" (CONST_MASK(nr)));
} else {
- asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
+ asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 3e5b111e619d..3f02ff6d333d 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -74,7 +74,7 @@
# define BOOT_STACK_SIZE 0x1000
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern unsigned int output_len;
extern const unsigned long kernel_text_size;
extern const unsigned long kernel_total_size;
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index e85ac0c7c039..f0e9acf72547 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -17,13 +17,17 @@
* In clang we have UD1s reporting UBSAN failures on X86, 64 and 32bit.
*/
#define INSN_ASOP 0x67
+#define INSN_LOCK 0xf0
#define OPCODE_ESCAPE 0x0f
#define SECOND_BYTE_OPCODE_UD1 0xb9
#define SECOND_BYTE_OPCODE_UD2 0x0b
#define BUG_NONE 0xffff
-#define BUG_UD1 0xfffe
-#define BUG_UD2 0xfffd
+#define BUG_UD2 0xfffe
+#define BUG_UD1 0xfffd
+#define BUG_UD1_UBSAN 0xfffc
+#define BUG_EA 0xffea
+#define BUG_LOCK 0xfff0
#ifdef CONFIG_GENERIC_BUG
diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 31d19c815f99..3e51ba459154 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -101,6 +101,16 @@ enum cfi_mode {
extern enum cfi_mode cfi_mode;
+#ifdef CONFIG_FINEIBT_BHI
+extern bool cfi_bhi;
+#else
+#define cfi_bhi (0)
+#endif
+
+typedef u8 bhi_thunk[32];
+extern bhi_thunk __bhi_args[];
+extern bhi_thunk __bhi_args_end[];
+
struct pt_regs;
#ifdef CONFIG_CFI_CLANG
@@ -125,6 +135,18 @@ static inline int cfi_get_offset(void)
#define cfi_get_offset cfi_get_offset
extern u32 cfi_get_func_hash(void *func);
+extern int cfi_get_func_arity(void *func);
+
+#ifdef CONFIG_FINEIBT
+extern bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type);
+#else
+static inline bool
+decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ return false;
+}
+
+#endif
#else
static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
@@ -137,6 +159,10 @@ static inline u32 cfi_get_func_hash(void *func)
{
return 0;
}
+static inline int cfi_get_func_arity(void *func)
+{
+ return 0;
+}
#endif /* CONFIG_CFI_CLANG */
#if HAS_KERNEL_IBT == 1
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 5612648b0202..b61f32c3459f 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -44,22 +44,22 @@ extern void __add_wrong_size(void)
__typeof__ (*(ptr)) __ret = (arg); \
switch (sizeof(*(ptr))) { \
case __X86_CASE_B: \
- asm volatile (lock #op "b %b0, %1\n" \
+ asm_inline volatile (lock #op "b %b0, %1" \
: "+q" (__ret), "+m" (*(ptr)) \
: : "memory", "cc"); \
break; \
case __X86_CASE_W: \
- asm volatile (lock #op "w %w0, %1\n" \
+ asm_inline volatile (lock #op "w %w0, %1" \
: "+r" (__ret), "+m" (*(ptr)) \
: : "memory", "cc"); \
break; \
case __X86_CASE_L: \
- asm volatile (lock #op "l %0, %1\n" \
+ asm_inline volatile (lock #op "l %0, %1" \
: "+r" (__ret), "+m" (*(ptr)) \
: : "memory", "cc"); \
break; \
case __X86_CASE_Q: \
- asm volatile (lock #op "q %q0, %1\n" \
+ asm_inline volatile (lock #op "q %q0, %1" \
: "+r" (__ret), "+m" (*(ptr)) \
: : "memory", "cc"); \
break; \
@@ -91,7 +91,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_B: \
{ \
volatile u8 *__ptr = (volatile u8 *)(ptr); \
- asm volatile(lock "cmpxchgb %2,%1" \
+ asm_inline volatile(lock "cmpxchgb %2, %1" \
: "=a" (__ret), "+m" (*__ptr) \
: "q" (__new), "0" (__old) \
: "memory"); \
@@ -100,7 +100,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_W: \
{ \
volatile u16 *__ptr = (volatile u16 *)(ptr); \
- asm volatile(lock "cmpxchgw %2,%1" \
+ asm_inline volatile(lock "cmpxchgw %2, %1" \
: "=a" (__ret), "+m" (*__ptr) \
: "r" (__new), "0" (__old) \
: "memory"); \
@@ -109,7 +109,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_L: \
{ \
volatile u32 *__ptr = (volatile u32 *)(ptr); \
- asm volatile(lock "cmpxchgl %2,%1" \
+ asm_inline volatile(lock "cmpxchgl %2, %1" \
: "=a" (__ret), "+m" (*__ptr) \
: "r" (__new), "0" (__old) \
: "memory"); \
@@ -118,7 +118,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_Q: \
{ \
volatile u64 *__ptr = (volatile u64 *)(ptr); \
- asm volatile(lock "cmpxchgq %2,%1" \
+ asm_inline volatile(lock "cmpxchgq %2, %1" \
: "=a" (__ret), "+m" (*__ptr) \
: "r" (__new), "0" (__old) \
: "memory"); \
@@ -134,7 +134,7 @@ extern void __add_wrong_size(void)
__raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
#define __sync_cmpxchg(ptr, old, new, size) \
- __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
+ __raw_cmpxchg((ptr), (old), (new), (size), "lock ")
#define __cmpxchg_local(ptr, old, new, size) \
__raw_cmpxchg((ptr), (old), (new), (size), "")
@@ -165,7 +165,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_B: \
{ \
volatile u8 *__ptr = (volatile u8 *)(_ptr); \
- asm volatile(lock "cmpxchgb %[new], %[ptr]" \
+ asm_inline volatile(lock "cmpxchgb %[new], %[ptr]" \
CC_SET(z) \
: CC_OUT(z) (success), \
[ptr] "+m" (*__ptr), \
@@ -177,7 +177,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_W: \
{ \
volatile u16 *__ptr = (volatile u16 *)(_ptr); \
- asm volatile(lock "cmpxchgw %[new], %[ptr]" \
+ asm_inline volatile(lock "cmpxchgw %[new], %[ptr]" \
CC_SET(z) \
: CC_OUT(z) (success), \
[ptr] "+m" (*__ptr), \
@@ -189,7 +189,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_L: \
{ \
volatile u32 *__ptr = (volatile u32 *)(_ptr); \
- asm volatile(lock "cmpxchgl %[new], %[ptr]" \
+ asm_inline volatile(lock "cmpxchgl %[new], %[ptr]" \
CC_SET(z) \
: CC_OUT(z) (success), \
[ptr] "+m" (*__ptr), \
@@ -201,7 +201,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_Q: \
{ \
volatile u64 *__ptr = (volatile u64 *)(_ptr); \
- asm volatile(lock "cmpxchgq %[new], %[ptr]" \
+ asm_inline volatile(lock "cmpxchgq %[new], %[ptr]" \
CC_SET(z) \
: CC_OUT(z) (success), \
[ptr] "+m" (*__ptr), \
@@ -222,7 +222,7 @@ extern void __add_wrong_size(void)
__raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
#define __sync_try_cmpxchg(ptr, pold, new, size) \
- __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock; ")
+ __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock ")
#define __try_cmpxchg_local(ptr, pold, new, size) \
__raw_try_cmpxchg((ptr), (pold), (new), (size), "")
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index fd1282a783dd..371f7906019e 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -19,7 +19,7 @@ union __u64_halves {
union __u64_halves o = { .full = (_old), }, \
n = { .full = (_new), }; \
\
- asm volatile(_lock "cmpxchg8b %[ptr]" \
+ asm_inline volatile(_lock "cmpxchg8b %[ptr]" \
: [ptr] "+m" (*(_ptr)), \
"+a" (o.low), "+d" (o.high) \
: "b" (n.low), "c" (n.high) \
@@ -45,7 +45,7 @@ static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new
n = { .full = (_new), }; \
bool ret; \
\
- asm volatile(_lock "cmpxchg8b %[ptr]" \
+ asm_inline volatile(_lock "cmpxchg8b %[ptr]" \
CC_SET(e) \
: CC_OUT(e) (ret), \
[ptr] "+m" (*(_ptr)), \
@@ -69,7 +69,7 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp,
return __arch_try_cmpxchg64(ptr, oldp, new,);
}
-#ifdef CONFIG_X86_CMPXCHG64
+#ifdef CONFIG_X86_CX8
#define arch_cmpxchg64 __cmpxchg64
@@ -91,19 +91,21 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp,
union __u64_halves o = { .full = (_old), }, \
n = { .full = (_new), }; \
\
- asm volatile(ALTERNATIVE(_lock_loc \
- "call cmpxchg8b_emu", \
- _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \
- : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \
- : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \
- : "memory"); \
+ asm_inline volatile( \
+ ALTERNATIVE(_lock_loc \
+ "call cmpxchg8b_emu", \
+ _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \
+ : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \
+ : "b" (n.low), "c" (n.high), \
+ [ptr] "S" (_ptr) \
+ : "memory"); \
\
o.full; \
})
static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
{
- return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock; ");
+ return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock ");
}
#define arch_cmpxchg64 arch_cmpxchg64
@@ -119,14 +121,16 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64
n = { .full = (_new), }; \
bool ret; \
\
- asm volatile(ALTERNATIVE(_lock_loc \
- "call cmpxchg8b_emu", \
- _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \
- CC_SET(e) \
- : ALT_OUTPUT_SP(CC_OUT(e) (ret), \
- "+a" (o.low), "+d" (o.high)) \
- : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \
- : "memory"); \
+ asm_inline volatile( \
+ ALTERNATIVE(_lock_loc \
+ "call cmpxchg8b_emu", \
+ _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \
+ CC_SET(e) \
+ : ALT_OUTPUT_SP(CC_OUT(e) (ret), \
+ "+a" (o.low), "+d" (o.high)) \
+ : "b" (n.low), "c" (n.high), \
+ [ptr] "S" (_ptr) \
+ : "memory"); \
\
if (unlikely(!ret)) \
*(_oldp) = o.full; \
@@ -136,7 +140,7 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64
static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new)
{
- return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock; ");
+ return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock ");
}
#define arch_try_cmpxchg64 arch_try_cmpxchg64
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 5e241306db26..71d1e72ed879 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -38,7 +38,7 @@ union __u128_halves {
union __u128_halves o = { .full = (_old), }, \
n = { .full = (_new), }; \
\
- asm volatile(_lock "cmpxchg16b %[ptr]" \
+ asm_inline volatile(_lock "cmpxchg16b %[ptr]" \
: [ptr] "+m" (*(_ptr)), \
"+a" (o.low), "+d" (o.high) \
: "b" (n.low), "c" (n.high) \
@@ -65,7 +65,7 @@ static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old,
n = { .full = (_new), }; \
bool ret; \
\
- asm volatile(_lock "cmpxchg16b %[ptr]" \
+ asm_inline volatile(_lock "cmpxchg16b %[ptr]" \
CC_SET(e) \
: CC_OUT(e) (ret), \
[ptr] "+m" (*(_ptr)), \
diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h
index aa6c8f8ca958..e7225452963f 100644
--- a/arch/x86/include/asm/coco.h
+++ b/arch/x86/include/asm/coco.h
@@ -15,6 +15,11 @@ enum cc_vendor {
extern enum cc_vendor cc_vendor;
extern u64 cc_mask;
+static inline u64 cc_get_mask(void)
+{
+ return cc_mask;
+}
+
static inline void cc_set_mask(u64 mask)
{
RIP_REL_REF(cc_mask) = mask;
@@ -25,7 +30,10 @@ u64 cc_mkdec(u64 val);
void cc_random_init(void);
#else
#define cc_vendor (CC_VENDOR_NONE)
-static const u64 cc_mask = 0;
+static inline u64 cc_get_mask(void)
+{
+ return 0;
+}
static inline u64 cc_mkenc(u64 val)
{
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 98eced5084ca..ad235dda1ded 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -12,7 +12,6 @@
#ifndef CONFIG_SMP
#define cpu_physical_id(cpu) boot_cpu_physical_apicid
#define cpu_acpi_id(cpu) 0
-#define safe_smp_processor_id() 0
#endif /* CONFIG_SMP */
#ifdef CONFIG_HOTPLUG_CPU
@@ -50,20 +49,6 @@ static inline void split_lock_init(void) {}
static inline void bus_lock_init(void) {}
#endif
-#ifdef CONFIG_CPU_SUP_INTEL
-u8 get_this_hybrid_cpu_type(void);
-u32 get_this_hybrid_cpu_native_id(void);
-#else
-static inline u8 get_this_hybrid_cpu_type(void)
-{
- return 0;
-}
-
-static inline u32 get_this_hybrid_cpu_native_id(void)
-{
- return 0;
-}
-#endif
#ifdef CONFIG_IA32_FEAT_CTL
void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
#else
diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
index ba32e0f44cba..6be777a06944 100644
--- a/arch/x86/include/asm/cpu_device_id.h
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -57,7 +57,7 @@
#define X86_CPU_ID_FLAG_ENTRY_VALID BIT(0)
/**
- * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching
+ * X86_MATCH_CPU - Base macro for CPU matching
* @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
* The name is expanded to X86_VENDOR_@_vendor
* @_family: The family number or X86_FAMILY_ANY
@@ -74,47 +74,18 @@
* into another macro at the usage site for good reasons, then please
* start this local macro with X86_MATCH to allow easy grepping.
*/
-#define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \
- _steppings, _feature, _data) { \
- .vendor = X86_VENDOR_##_vendor, \
- .family = _family, \
- .model = _model, \
- .steppings = _steppings, \
- .feature = _feature, \
- .flags = X86_CPU_ID_FLAG_ENTRY_VALID, \
- .driver_data = (unsigned long) _data \
-}
-
-#define X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \
- _steppings, _feature, _data) { \
+#define X86_MATCH_CPU(_vendor, _family, _model, _steppings, _feature, _type, _data) { \
.vendor = _vendor, \
.family = _family, \
.model = _model, \
.steppings = _steppings, \
.feature = _feature, \
.flags = X86_CPU_ID_FLAG_ENTRY_VALID, \
+ .type = _type, \
.driver_data = (unsigned long) _data \
}
/**
- * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching
- * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
- * The name is expanded to X86_VENDOR_@_vendor
- * @_family: The family number or X86_FAMILY_ANY
- * @_model: The model number, model constant or X86_MODEL_ANY
- * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY
- * @_data: Driver specific data or NULL. The internal storage
- * format is unsigned long. The supplied value, pointer
- * etc. is casted to unsigned long internally.
- *
- * The steppings arguments of X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() is
- * set to wildcards.
- */
-#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, feature, data) \
- X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(vendor, family, model, \
- X86_STEPPING_ANY, feature, data)
-
-/**
* X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature
* @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
* The name is expanded to X86_VENDOR_@vendor
@@ -123,13 +94,10 @@
* @data: Driver specific data or NULL. The internal storage
* format is unsigned long. The supplied value, pointer
* etc. is casted to unsigned long internally.
- *
- * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
- * set to wildcards.
*/
-#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data) \
- X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, \
- X86_MODEL_ANY, feature, data)
+#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data) \
+ X86_MATCH_CPU(X86_VENDOR_##vendor, family, X86_MODEL_ANY, \
+ X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data)
/**
* X86_MATCH_VENDOR_FEATURE - Macro for matching vendor and CPU feature
@@ -139,12 +107,10 @@
* @data: Driver specific data or NULL. The internal storage
* format is unsigned long. The supplied value, pointer
* etc. is casted to unsigned long internally.
- *
- * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
- * set to wildcards.
*/
-#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data) \
- X86_MATCH_VENDOR_FAM_FEATURE(vendor, X86_FAMILY_ANY, feature, data)
+#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data) \
+ X86_MATCH_CPU(X86_VENDOR_##vendor, X86_FAMILY_ANY, X86_MODEL_ANY, \
+ X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data)
/**
* X86_MATCH_FEATURE - Macro for matching a CPU feature
@@ -152,12 +118,10 @@
* @data: Driver specific data or NULL. The internal storage
* format is unsigned long. The supplied value, pointer
* etc. is casted to unsigned long internally.
- *
- * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
- * set to wildcards.
*/
-#define X86_MATCH_FEATURE(feature, data) \
- X86_MATCH_VENDOR_FEATURE(ANY, feature, data)
+#define X86_MATCH_FEATURE(feature, data) \
+ X86_MATCH_CPU(X86_VENDOR_ANY, X86_FAMILY_ANY, X86_MODEL_ANY, \
+ X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data)
/**
* X86_MATCH_VENDOR_FAM_MODEL - Match vendor, family and model
@@ -168,13 +132,10 @@
* @data: Driver specific data or NULL. The internal storage
* format is unsigned long. The supplied value, pointer
* etc. is casted to unsigned long internally.
- *
- * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
- * set to wildcards.
*/
-#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data) \
- X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, \
- X86_FEATURE_ANY, data)
+#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data) \
+ X86_MATCH_CPU(X86_VENDOR_##vendor, family, model, X86_STEPPING_ANY, \
+ X86_FEATURE_ANY, X86_CPU_TYPE_ANY, data)
/**
* X86_MATCH_VENDOR_FAM - Match vendor and family
@@ -184,12 +145,10 @@
* @data: Driver specific data or NULL. The internal storage
* format is unsigned long. The supplied value, pointer
* etc. is casted to unsigned long internally.
- *
- * All other missing arguments to X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
- * set of wildcards.
*/
-#define X86_MATCH_VENDOR_FAM(vendor, family, data) \
- X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data)
+#define X86_MATCH_VENDOR_FAM(vendor, family, data) \
+ X86_MATCH_CPU(X86_VENDOR_##vendor, family, X86_MODEL_ANY, \
+ X86_STEPPING_ANY, X86_FEATURE_ANY, X86_CPU_TYPE_ANY, data)
/**
* X86_MATCH_VFM - Match encoded vendor/family/model
@@ -197,34 +156,26 @@
* @data: Driver specific data or NULL. The internal storage
* format is unsigned long. The supplied value, pointer
* etc. is cast to unsigned long internally.
- *
- * Stepping and feature are set to wildcards
*/
-#define X86_MATCH_VFM(vfm, data) \
- X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
- VFM_VENDOR(vfm), \
- VFM_FAMILY(vfm), \
- VFM_MODEL(vfm), \
- X86_STEPPING_ANY, X86_FEATURE_ANY, data)
+#define X86_MATCH_VFM(vfm, data) \
+ X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \
+ X86_STEPPING_ANY, X86_FEATURE_ANY, X86_CPU_TYPE_ANY, data)
#define __X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
/**
- * X86_MATCH_VFM_STEPPINGS - Match encoded vendor/family/model/stepping
+ * X86_MATCH_VFM_STEPS - Match encoded vendor/family/model and steppings
+ * range.
* @vfm: Encoded 8-bits each for vendor, family, model
- * @steppings: Bitmask of steppings to match
+ * @min_step: Lowest stepping number to match
+ * @max_step: Highest stepping number to match
* @data: Driver specific data or NULL. The internal storage
* format is unsigned long. The supplied value, pointer
* etc. is cast to unsigned long internally.
- *
- * feature is set to wildcard
*/
-#define X86_MATCH_VFM_STEPS(vfm, min_step, max_step, data) \
- X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
- VFM_VENDOR(vfm), \
- VFM_FAMILY(vfm), \
- VFM_MODEL(vfm), \
- __X86_STEPPINGS(min_step, max_step), \
- X86_FEATURE_ANY, data)
+#define X86_MATCH_VFM_STEPS(vfm, min_step, max_step, data) \
+ X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \
+ __X86_STEPPINGS(min_step, max_step), X86_FEATURE_ANY, \
+ X86_CPU_TYPE_ANY, data)
/**
* X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature
@@ -233,15 +184,22 @@
* @data: Driver specific data or NULL. The internal storage
* format is unsigned long. The supplied value, pointer
* etc. is cast to unsigned long internally.
- *
- * Steppings is set to wildcard
*/
-#define X86_MATCH_VFM_FEATURE(vfm, feature, data) \
- X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
- VFM_VENDOR(vfm), \
- VFM_FAMILY(vfm), \
- VFM_MODEL(vfm), \
- X86_STEPPING_ANY, feature, data)
+#define X86_MATCH_VFM_FEATURE(vfm, feature, data) \
+ X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \
+ X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data)
+
+/**
+ * X86_MATCH_VFM_CPU_TYPE - Match encoded vendor/family/model/type
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @type: CPU type e.g. P-core, E-core
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ */
+#define X86_MATCH_VFM_CPU_TYPE(vfm, type, data) \
+ X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \
+ X86_STEPPING_ANY, X86_FEATURE_ANY, type, data)
extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
extern bool x86_match_min_microcode_rev(const struct x86_cpu_id *table);
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index de1ad09fe8d7..893cbca37fe9 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -4,11 +4,12 @@
#include <asm/processor.h>
-#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+#if defined(__KERNEL__) && !defined(__ASSEMBLER__)
#include <asm/asm.h>
#include <linux/bitops.h>
#include <asm/alternative.h>
+#include <asm/cpufeaturemasks.h>
enum cpuid_leafs
{
@@ -37,92 +38,19 @@ enum cpuid_leafs
NR_CPUID_WORDS,
};
-#define X86_CAP_FMT_NUM "%d:%d"
-#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31)
-
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
-#define X86_CAP_FMT "%s"
-#define x86_cap_flag(flag) x86_cap_flags[flag]
/*
* In order to save room, we index into this array by doing
* X86_BUG_<name> - NCAPINTS*32.
*/
extern const char * const x86_bug_flags[NBUGINTS*32];
+#define x86_bug_flag(flag) x86_bug_flags[flag]
#define test_cpu_cap(c, bit) \
arch_test_bit(bit, (unsigned long *)((c)->x86_capability))
-/*
- * There are 32 bits/features in each mask word. The high bits
- * (selected with (bit>>5) give us the word number and the low 5
- * bits give us the bit/feature number inside the word.
- * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can
- * see if it is set in the mask word.
- */
-#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \
- (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word ))
-
-/*
- * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the
- * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all
- * header macros which use NCAPINTS need to be changed. The duplicated macro
- * use causes the compiler to issue errors for all headers so that all usage
- * sites can be corrected.
- */
-#define REQUIRED_MASK_BIT_SET(feature_bit) \
- ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \
- REQUIRED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 22))
-
-#define DISABLED_MASK_BIT_SET(feature_bit) \
- ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \
- CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \
- DISABLED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 22))
-
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
test_cpu_cap(c, bit))
@@ -149,6 +77,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
extern void setup_clear_cpu_cap(unsigned int bit);
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
+void check_cpufeature_deps(struct cpuinfo_x86 *c);
#define setup_force_cpu_cap(bit) do { \
\
@@ -208,5 +137,5 @@ t_no:
#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
boot_cpu_data.x86_model
-#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
+#endif /* defined(__KERNEL__) && !defined(__ASSEMBLER__) */
#endif /* _ASM_X86_CPUFEATURE_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 508c0dad116b..70e1516b15da 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -2,14 +2,6 @@
#ifndef _ASM_X86_CPUFEATURES_H
#define _ASM_X86_CPUFEATURES_H
-#ifndef _ASM_X86_REQUIRED_FEATURES_H
-#include <asm/required-features.h>
-#endif
-
-#ifndef _ASM_X86_DISABLED_FEATURES_H
-#include <asm/disabled-features.h>
-#endif
-
/*
* Defines x86 CPU feature bits
*/
@@ -210,7 +202,6 @@
#define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */
#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */
-#define X86_FEATURE_USE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled */
#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */
#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */
#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */
@@ -338,6 +329,7 @@
#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
+#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */
#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
@@ -386,6 +378,7 @@
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */
#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */
+#define X86_FEATURE_IDLE_HLT (15*32+30) /* IDLE HLT intercept */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* "avx512vbmi" AVX512 Vector Bit Manipulation instructions*/
@@ -468,6 +461,10 @@
#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */
#define X86_FEATURE_SRSO_USER_KERNEL_NO (20*32+30) /* CPU is not affected by SRSO across user/kernel boundaries */
+#define X86_FEATURE_SRSO_BP_SPEC_REDUCE (20*32+31) /*
+ * BP_CFG[BpSpecReduce] can be used to mitigate SRSO for VMs.
+ * (SRSO_MSR_FIX in the official doc).
+ */
/*
* Extended auxiliary flags: Linux defined - for features scattered in various
@@ -534,4 +531,5 @@
#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */
#define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */
#define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */
+#define X86_BUG_SPECTRE_V2_USER X86_BUG(1*32 + 5) /* "spectre_v2_user" CPU is affected by Spectre variant 2 attack between user processes */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h
index b2b9b4ef3dae..d5749b25fa10 100644
--- a/arch/x86/include/asm/cpuid.h
+++ b/arch/x86/include/asm/cpuid.h
@@ -1,222 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * CPUID-related helpers/definitions
- */
#ifndef _ASM_X86_CPUID_H
#define _ASM_X86_CPUID_H
-#include <linux/types.h>
-
-#include <asm/string.h>
-
-struct cpuid_regs {
- u32 eax, ebx, ecx, edx;
-};
-
-enum cpuid_regs_idx {
- CPUID_EAX = 0,
- CPUID_EBX,
- CPUID_ECX,
- CPUID_EDX,
-};
-
-#define CPUID_LEAF_MWAIT 0x5
-#define CPUID_LEAF_DCA 0x9
-#define CPUID_LEAF_XSTATE 0x0d
-#define CPUID_LEAF_TSC 0x15
-#define CPUID_LEAF_FREQ 0x16
-#define CPUID_LEAF_TILE 0x1d
-
-#ifdef CONFIG_X86_32
-bool have_cpuid_p(void);
-#else
-static inline bool have_cpuid_p(void)
-{
- return true;
-}
-#endif
-static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- /* ecx is often an input as well as an output. */
- asm volatile("cpuid"
- : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
- : "0" (*eax), "2" (*ecx)
- : "memory");
-}
-
-#define native_cpuid_reg(reg) \
-static inline unsigned int native_cpuid_##reg(unsigned int op) \
-{ \
- unsigned int eax = op, ebx, ecx = 0, edx; \
- \
- native_cpuid(&eax, &ebx, &ecx, &edx); \
- \
- return reg; \
-}
-
-/*
- * Native CPUID functions returning a single datum.
- */
-native_cpuid_reg(eax)
-native_cpuid_reg(ebx)
-native_cpuid_reg(ecx)
-native_cpuid_reg(edx)
-
-#ifdef CONFIG_PARAVIRT_XXL
-#include <asm/paravirt.h>
-#else
-#define __cpuid native_cpuid
-#endif
-
-/*
- * Generic CPUID function
- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
- * resulting in stale register contents being returned.
- */
-static inline void cpuid(unsigned int op,
- unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- *eax = op;
- *ecx = 0;
- __cpuid(eax, ebx, ecx, edx);
-}
-
-/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(unsigned int op, int count,
- unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- *eax = op;
- *ecx = count;
- __cpuid(eax, ebx, ecx, edx);
-}
-
-/*
- * CPUID functions returning a single datum
- */
-static inline unsigned int cpuid_eax(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return eax;
-}
-
-static inline unsigned int cpuid_ebx(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return ebx;
-}
-
-static inline unsigned int cpuid_ecx(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return ecx;
-}
-
-static inline unsigned int cpuid_edx(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return edx;
-}
-
-static inline void __cpuid_read(unsigned int leaf, unsigned int subleaf, u32 *regs)
-{
- regs[CPUID_EAX] = leaf;
- regs[CPUID_ECX] = subleaf;
- __cpuid(regs + CPUID_EAX, regs + CPUID_EBX, regs + CPUID_ECX, regs + CPUID_EDX);
-}
-
-#define cpuid_subleaf(leaf, subleaf, regs) { \
- static_assert(sizeof(*(regs)) == 16); \
- __cpuid_read(leaf, subleaf, (u32 *)(regs)); \
-}
-
-#define cpuid_leaf(leaf, regs) { \
- static_assert(sizeof(*(regs)) == 16); \
- __cpuid_read(leaf, 0, (u32 *)(regs)); \
-}
-
-static inline void __cpuid_read_reg(unsigned int leaf, unsigned int subleaf,
- enum cpuid_regs_idx regidx, u32 *reg)
-{
- u32 regs[4];
-
- __cpuid_read(leaf, subleaf, regs);
- *reg = regs[regidx];
-}
-
-#define cpuid_subleaf_reg(leaf, subleaf, regidx, reg) { \
- static_assert(sizeof(*(reg)) == 4); \
- __cpuid_read_reg(leaf, subleaf, regidx, (u32 *)(reg)); \
-}
-
-#define cpuid_leaf_reg(leaf, regidx, reg) { \
- static_assert(sizeof(*(reg)) == 4); \
- __cpuid_read_reg(leaf, 0, regidx, (u32 *)(reg)); \
-}
-
-static __always_inline bool cpuid_function_is_indexed(u32 function)
-{
- switch (function) {
- case 4:
- case 7:
- case 0xb:
- case 0xd:
- case 0xf:
- case 0x10:
- case 0x12:
- case 0x14:
- case 0x17:
- case 0x18:
- case 0x1d:
- case 0x1e:
- case 0x1f:
- case 0x24:
- case 0x8000001d:
- return true;
- }
-
- return false;
-}
-
-#define for_each_possible_hypervisor_cpuid_base(function) \
- for (function = 0x40000000; function < 0x40010000; function += 0x100)
-
-static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
-{
- uint32_t base, eax, signature[3];
-
- for_each_possible_hypervisor_cpuid_base(base) {
- cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
-
- /*
- * This must not compile to "call memcmp" because it's called
- * from PVH early boot code before instrumentation is set up
- * and memcmp() itself may be instrumented.
- */
- if (!__builtin_memcmp(sig, signature, 12) &&
- (leaves == 0 || ((eax - base) >= leaves)))
- return base;
- }
-
- return 0;
-}
+#include <asm/cpuid/api.h>
#endif /* _ASM_X86_CPUID_H */
diff --git a/arch/x86/include/asm/cpuid/api.h b/arch/x86/include/asm/cpuid/api.h
new file mode 100644
index 000000000000..9c180c9cc58e
--- /dev/null
+++ b/arch/x86/include/asm/cpuid/api.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CPUID_API_H
+#define _ASM_X86_CPUID_API_H
+
+#include <asm/cpuid/types.h>
+
+#include <linux/build_bug.h>
+#include <linux/types.h>
+
+#include <asm/string.h>
+
+/*
+ * Raw CPUID accessors:
+ */
+
+#ifdef CONFIG_X86_32
+bool have_cpuid_p(void);
+#else
+static inline bool have_cpuid_p(void)
+{
+ return true;
+}
+#endif
+
+static inline void native_cpuid(u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx)
+ : "memory");
+}
+
+#define NATIVE_CPUID_REG(reg) \
+static inline u32 native_cpuid_##reg(u32 op) \
+{ \
+ u32 eax = op, ebx, ecx = 0, edx; \
+ \
+ native_cpuid(&eax, &ebx, &ecx, &edx); \
+ \
+ return reg; \
+}
+
+/*
+ * Native CPUID functions returning a single datum:
+ */
+NATIVE_CPUID_REG(eax)
+NATIVE_CPUID_REG(ebx)
+NATIVE_CPUID_REG(ecx)
+NATIVE_CPUID_REG(edx)
+
+#ifdef CONFIG_PARAVIRT_XXL
+# include <asm/paravirt.h>
+#else
+# define __cpuid native_cpuid
+#endif
+
+/*
+ * Generic CPUID function
+ *
+ * Clear ECX since some CPUs (Cyrix MII) do not set or clear ECX
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(u32 op,
+ u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx)
+{
+ *eax = op;
+ *ecx = 0;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ECX */
+static inline void cpuid_count(u32 op, int count,
+ u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx)
+{
+ *eax = op;
+ *ecx = count;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum:
+ */
+
+static inline u32 cpuid_eax(u32 op)
+{
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return eax;
+}
+
+static inline u32 cpuid_ebx(u32 op)
+{
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ebx;
+}
+
+static inline u32 cpuid_ecx(u32 op)
+{
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ecx;
+}
+
+static inline u32 cpuid_edx(u32 op)
+{
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return edx;
+}
+
+static inline void __cpuid_read(u32 leaf, u32 subleaf, u32 *regs)
+{
+ regs[CPUID_EAX] = leaf;
+ regs[CPUID_ECX] = subleaf;
+ __cpuid(regs + CPUID_EAX, regs + CPUID_EBX, regs + CPUID_ECX, regs + CPUID_EDX);
+}
+
+#define cpuid_subleaf(leaf, subleaf, regs) { \
+ static_assert(sizeof(*(regs)) == 16); \
+ __cpuid_read(leaf, subleaf, (u32 *)(regs)); \
+}
+
+#define cpuid_leaf(leaf, regs) { \
+ static_assert(sizeof(*(regs)) == 16); \
+ __cpuid_read(leaf, 0, (u32 *)(regs)); \
+}
+
+static inline void __cpuid_read_reg(u32 leaf, u32 subleaf,
+ enum cpuid_regs_idx regidx, u32 *reg)
+{
+ u32 regs[4];
+
+ __cpuid_read(leaf, subleaf, regs);
+ *reg = regs[regidx];
+}
+
+#define cpuid_subleaf_reg(leaf, subleaf, regidx, reg) { \
+ static_assert(sizeof(*(reg)) == 4); \
+ __cpuid_read_reg(leaf, subleaf, regidx, (u32 *)(reg)); \
+}
+
+#define cpuid_leaf_reg(leaf, regidx, reg) { \
+ static_assert(sizeof(*(reg)) == 4); \
+ __cpuid_read_reg(leaf, 0, regidx, (u32 *)(reg)); \
+}
+
+static __always_inline bool cpuid_function_is_indexed(u32 function)
+{
+ switch (function) {
+ case 4:
+ case 7:
+ case 0xb:
+ case 0xd:
+ case 0xf:
+ case 0x10:
+ case 0x12:
+ case 0x14:
+ case 0x17:
+ case 0x18:
+ case 0x1d:
+ case 0x1e:
+ case 0x1f:
+ case 0x24:
+ case 0x8000001d:
+ return true;
+ }
+
+ return false;
+}
+
+#define for_each_possible_hypervisor_cpuid_base(function) \
+ for (function = 0x40000000; function < 0x40010000; function += 0x100)
+
+static inline u32 hypervisor_cpuid_base(const char *sig, u32 leaves)
+{
+ u32 base, eax, signature[3];
+
+ for_each_possible_hypervisor_cpuid_base(base) {
+ cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
+
+ /*
+ * This must not compile to "call memcmp" because it's called
+ * from PVH early boot code before instrumentation is set up
+ * and memcmp() itself may be instrumented.
+ */
+ if (!__builtin_memcmp(sig, signature, 12) &&
+ (leaves == 0 || ((eax - base) >= leaves)))
+ return base;
+ }
+
+ return 0;
+}
+
+#endif /* _ASM_X86_CPUID_API_H */
diff --git a/arch/x86/include/asm/cpuid/types.h b/arch/x86/include/asm/cpuid/types.h
new file mode 100644
index 000000000000..8582e27e836d
--- /dev/null
+++ b/arch/x86/include/asm/cpuid/types.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CPUID_TYPES_H
+#define _ASM_X86_CPUID_TYPES_H
+
+#include <linux/types.h>
+
+/*
+ * Types for raw CPUID access:
+ */
+
+struct cpuid_regs {
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+};
+
+enum cpuid_regs_idx {
+ CPUID_EAX = 0,
+ CPUID_EBX,
+ CPUID_ECX,
+ CPUID_EDX,
+};
+
+#define CPUID_LEAF_MWAIT 0x05
+#define CPUID_LEAF_DCA 0x09
+#define CPUID_LEAF_XSTATE 0x0d
+#define CPUID_LEAF_TSC 0x15
+#define CPUID_LEAF_FREQ 0x16
+#define CPUID_LEAF_TILE 0x1d
+
+#endif /* _ASM_X86_CPUID_TYPES_H */
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
index 4acfd57de8f1..70f6b60ad67b 100644
--- a/arch/x86/include/asm/cpumask.h
+++ b/arch/x86/include/asm/cpumask.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CPUMASK_H
#define _ASM_X86_CPUMASK_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/cpumask.h>
extern void setup_cpu_local_masks(void);
@@ -34,5 +34,5 @@ static __always_inline void arch_cpumask_clear_cpu(int cpu, struct cpumask *dstp
#define arch_cpu_is_offline(cpu) unlikely(!arch_cpu_online(cpu))
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_CPUMASK_H */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index bf5953883ec3..cc4a3f725b37 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -5,52 +5,28 @@
#include <linux/build_bug.h>
#include <linux/compiler.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/cache.h>
#include <asm/percpu.h>
struct task_struct;
-struct pcpu_hot {
- union {
- struct {
- struct task_struct *current_task;
- int preempt_count;
- int cpu_number;
-#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
- u64 call_depth;
-#endif
- unsigned long top_of_stack;
- void *hardirq_stack_ptr;
- u16 softirq_pending;
-#ifdef CONFIG_X86_64
- bool hardirq_stack_inuse;
-#else
- void *softirq_stack_ptr;
-#endif
- };
- u8 pad[64];
- };
-};
-static_assert(sizeof(struct pcpu_hot) == 64);
-
-DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);
-
-/* const-qualified alias to pcpu_hot, aliased by linker. */
-DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
- const_pcpu_hot);
+DECLARE_PER_CPU_CACHE_HOT(struct task_struct *, current_task);
+/* const-qualified alias provided by the linker. */
+DECLARE_PER_CPU_CACHE_HOT(struct task_struct * const __percpu_seg_override,
+ const_current_task);
static __always_inline struct task_struct *get_current(void)
{
if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
- return this_cpu_read_const(const_pcpu_hot.current_task);
+ return this_cpu_read_const(const_current_task);
- return this_cpu_read_stable(pcpu_hot.current_task);
+ return this_cpu_read_stable(current_task);
}
#define current get_current()
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_CURRENT_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 62dc9f59ea76..ec95fe44fa3a 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -46,7 +46,6 @@ struct gdt_page {
} __attribute__((aligned(PAGE_SIZE)));
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
-DECLARE_INIT_PER_CPU(gdt_page);
/* Provide the original GDT */
static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu)
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index d440a65af8f3..7e6b9314758a 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -58,7 +58,7 @@
#define DESC_USER (_DESC_DPL(3))
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
@@ -166,7 +166,7 @@ struct desc_ptr {
unsigned long address;
} __attribute__((packed)) ;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/* Boot IDT definitions */
#define BOOT_IDT_ENTRIES 32
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
deleted file mode 100644
index c492bdc97b05..000000000000
--- a/arch/x86/include/asm/disabled-features.h
+++ /dev/null
@@ -1,161 +0,0 @@
-#ifndef _ASM_X86_DISABLED_FEATURES_H
-#define _ASM_X86_DISABLED_FEATURES_H
-
-/* These features, although they might be available in a CPU
- * will not be used because the compile options to support
- * them are not present.
- *
- * This code allows them to be checked and disabled at
- * compile time without an explicit #ifdef. Use
- * cpu_feature_enabled().
- */
-
-#ifdef CONFIG_X86_UMIP
-# define DISABLE_UMIP 0
-#else
-# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31))
-#endif
-
-#ifdef CONFIG_X86_64
-# define DISABLE_VME (1<<(X86_FEATURE_VME & 31))
-# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31))
-# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31))
-# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31))
-# define DISABLE_PCID 0
-#else
-# define DISABLE_VME 0
-# define DISABLE_K6_MTRR 0
-# define DISABLE_CYRIX_ARR 0
-# define DISABLE_CENTAUR_MCR 0
-# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31))
-#endif /* CONFIG_X86_64 */
-
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-# define DISABLE_PKU 0
-# define DISABLE_OSPKE 0
-#else
-# define DISABLE_PKU (1<<(X86_FEATURE_PKU & 31))
-# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31))
-#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
-
-#ifdef CONFIG_X86_5LEVEL
-# define DISABLE_LA57 0
-#else
-# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
-#endif
-
-#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
-# define DISABLE_PTI 0
-#else
-# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
-#endif
-
-#ifdef CONFIG_MITIGATION_RETPOLINE
-# define DISABLE_RETPOLINE 0
-#else
-# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \
- (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31)))
-#endif
-
-#ifdef CONFIG_MITIGATION_RETHUNK
-# define DISABLE_RETHUNK 0
-#else
-# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31))
-#endif
-
-#ifdef CONFIG_MITIGATION_UNRET_ENTRY
-# define DISABLE_UNRET 0
-#else
-# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31))
-#endif
-
-#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
-# define DISABLE_CALL_DEPTH_TRACKING 0
-#else
-# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31))
-#endif
-
-#ifdef CONFIG_ADDRESS_MASKING
-# define DISABLE_LAM 0
-#else
-# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31))
-#endif
-
-#ifdef CONFIG_INTEL_IOMMU_SVM
-# define DISABLE_ENQCMD 0
-#else
-# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
-#endif
-
-#ifdef CONFIG_X86_SGX
-# define DISABLE_SGX 0
-#else
-# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31))
-#endif
-
-#ifdef CONFIG_XEN_PV
-# define DISABLE_XENPV 0
-#else
-# define DISABLE_XENPV (1 << (X86_FEATURE_XENPV & 31))
-#endif
-
-#ifdef CONFIG_INTEL_TDX_GUEST
-# define DISABLE_TDX_GUEST 0
-#else
-# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31))
-#endif
-
-#ifdef CONFIG_X86_USER_SHADOW_STACK
-#define DISABLE_USER_SHSTK 0
-#else
-#define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31))
-#endif
-
-#ifdef CONFIG_X86_KERNEL_IBT
-#define DISABLE_IBT 0
-#else
-#define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31))
-#endif
-
-#ifdef CONFIG_X86_FRED
-# define DISABLE_FRED 0
-#else
-# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31))
-#endif
-
-#ifdef CONFIG_KVM_AMD_SEV
-#define DISABLE_SEV_SNP 0
-#else
-#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31))
-#endif
-
-/*
- * Make sure to add features to the correct mask
- */
-#define DISABLED_MASK0 (DISABLE_VME)
-#define DISABLED_MASK1 0
-#define DISABLED_MASK2 0
-#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
-#define DISABLED_MASK4 (DISABLE_PCID)
-#define DISABLED_MASK5 0
-#define DISABLED_MASK6 0
-#define DISABLED_MASK7 (DISABLE_PTI)
-#define DISABLED_MASK8 (DISABLE_XENPV|DISABLE_TDX_GUEST)
-#define DISABLED_MASK9 (DISABLE_SGX)
-#define DISABLED_MASK10 0
-#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
- DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK)
-#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM)
-#define DISABLED_MASK13 0
-#define DISABLED_MASK14 0
-#define DISABLED_MASK15 0
-#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
- DISABLE_ENQCMD)
-#define DISABLED_MASK17 0
-#define DISABLED_MASK18 (DISABLE_IBT)
-#define DISABLED_MASK19 (DISABLE_SEV_SNP)
-#define DISABLED_MASK20 0
-#define DISABLED_MASK21 0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22)
-
-#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 430fca13bb56..302e11b15da8 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_DWARF2_H
#define _ASM_X86_DWARF2_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#warning "asm/dwarf2.h should be only included in pure assembly files"
#endif
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 2e74a7f0e935..c83645d5b2a8 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -29,7 +29,6 @@ extern unsigned long e820__end_of_low_ram_pfn(void);
extern u64 e820__memblock_alloc_reserved(u64 size, u64 align);
extern void e820__memblock_setup(void);
-extern void e820__reserve_setup_data(void);
extern void e820__finish_early_params(void);
extern void e820__reserve_resources(void);
extern void e820__reserve_resources_late(void);
diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h
index 314f75d886d0..80c4a7266629 100644
--- a/arch/x86/include/asm/e820/types.h
+++ b/arch/x86/include/asm/e820/types.h
@@ -35,15 +35,6 @@ enum e820_type {
* marking it with the IORES_DESC_SOFT_RESERVED designation.
*/
E820_TYPE_SOFT_RESERVED = 0xefffffff,
-
- /*
- * Reserved RAM used by the kernel itself if
- * CONFIG_INTEL_TXT=y is enabled, memory of this type
- * will be included in the S3 integrity calculation
- * and so should not include any memory that the BIOS
- * might alter over the S3 transition:
- */
- E820_TYPE_RESERVED_KERN = 128,
};
/*
diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h
index 426fc53ff803..dfbd1ebb9f10 100644
--- a/arch/x86/include/asm/edac.h
+++ b/arch/x86/include/asm/edac.h
@@ -13,7 +13,7 @@ static inline void edac_atomic_scrub(void *va, u32 size)
* are interrupt, DMA and SMP safe.
*/
for (i = 0; i < size / 4; i++, virt_addr++)
- asm volatile("lock; addl $0, %0"::"m" (*virt_addr));
+ asm volatile("lock addl $0, %0"::"m" (*virt_addr));
}
#endif /* _ASM_X86_EDAC_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 1fb83d47711f..128602612eca 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -54,8 +54,9 @@ typedef struct user_i387_struct elf_fpregset_t;
#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */
#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */
#define R_X86_64_RELATIVE 8 /* Adjust by program base */
-#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative
- offset to GOT */
+#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative offset to GOT */
+#define R_X86_64_GOTPCRELX 41
+#define R_X86_64_REX_GOTPCRELX 42
#define R_X86_64_32 10 /* Direct 32 bit zero extended */
#define R_X86_64_32S 11 /* Direct 32 bit sign extended */
#define R_X86_64_16 12 /* Direct 16 bit zero extended */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index d0dcefb5cc59..4519c9f35ba0 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -31,7 +31,7 @@
/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */
#define FIXMAP_PMD_TOP 507
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/kernel.h>
#include <asm/apicdef.h>
#include <asm/page.h>
@@ -196,5 +196,5 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
void __early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index f86ad3335529..f42de5f05e7e 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -16,10 +16,9 @@
/*
* Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
- * disables preemption so be careful if you intend to use it for long periods
- * of time.
- * If you intend to use the FPU in irq/softirq you need to check first with
- * irq_fpu_usable() if it is possible.
+ * disables preemption and softirq processing, so be careful if you intend to
+ * use it for long periods of time. Kernel-mode FPU cannot be used in all
+ * contexts -- see irq_fpu_usable() for details.
*/
/* Kernel FPU states to initialize in kernel_fpu_begin_mask() */
@@ -50,10 +49,10 @@ static inline void kernel_fpu_begin(void)
}
/*
- * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate.
- * A context switch will (and softirq might) save CPU's FPU registers to
- * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in
- * a random state.
+ * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate, or while
+ * using the FPU in kernel mode. A context switch will (and softirq might) save
+ * CPU's FPU registers to fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving
+ * CPU's FPU registers in a random state.
*
* local_bh_disable() protects against both preemption and soft interrupts
* on !RT kernels.
@@ -63,8 +62,6 @@ static inline void kernel_fpu_begin(void)
* preemptible. Disabling preemption is the right choice here as bottom
* half processing is always in thread context on RT kernels so it
* implicitly prevents bottom half processing as well.
- *
- * Disabling preemption also serializes against kernel_fpu_begin().
*/
static inline void fpregs_lock(void)
{
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index fb42659f6e98..0ab65073c1cc 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -11,7 +11,7 @@
#ifdef CONFIG_FRAME_POINTER
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
.macro FRAME_BEGIN
push %_ASM_BP
@@ -51,7 +51,7 @@
.endm
#endif /* CONFIG_X86_64 */
-#else /* !__ASSEMBLY__ */
+#else /* !__ASSEMBLER__ */
#define FRAME_BEGIN \
"push %" _ASM_BP "\n" \
@@ -82,18 +82,18 @@ static inline unsigned long encode_frame_pointer(struct pt_regs *regs)
#endif /* CONFIG_X86_64 */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define FRAME_OFFSET __ASM_SEL(4, 8)
#else /* !CONFIG_FRAME_POINTER */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
.macro ENCODE_FRAME_POINTER ptregs_offset=0
.endm
-#else /* !__ASSEMBLY */
+#else /* !__ASSEMBLER__ */
#define ENCODE_FRAME_POINTER
diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index 25ca00bd70e8..2a29e5216881 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -32,7 +32,7 @@
#define FRED_CONFIG_INT_STKLVL(l) (_AT(unsigned long, l) << 9)
#define FRED_CONFIG_ENTRYPOINT(p) _AT(unsigned long, (p))
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_X86_FRED
#include <linux/kernel.h>
@@ -113,6 +113,6 @@ static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) {
static inline void fred_sync_rsp0(unsigned long rsp0) { }
static inline void fred_update_rsp0(void) { }
#endif /* CONFIG_X86_FRED */
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* ASM_X86_FRED_H */
diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
index 9e7e8ca8e299..02f239569b93 100644
--- a/arch/x86/include/asm/fsgsbase.h
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -2,7 +2,7 @@
#ifndef _ASM_FSGSBASE_H
#define _ASM_FSGSBASE_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_X86_64
@@ -80,6 +80,6 @@ extern unsigned long x86_fsgsbase_read_task(struct task_struct *task,
#endif /* CONFIG_X86_64 */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_FSGSBASE_H */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index f9cb4d07df58..93156ac4ffe0 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -22,7 +22,7 @@
#define ARCH_SUPPORTS_FTRACE_OPS 1
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern void __fentry__(void);
static inline unsigned long ftrace_call_adjust(unsigned long addr)
@@ -36,21 +36,9 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
static inline unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip)
{
-#ifdef CONFIG_X86_KERNEL_IBT
- u32 instr;
-
- /* We want to be extra safe in case entry ip is on the page edge,
- * but otherwise we need to avoid get_kernel_nofault()'s overhead.
- */
- if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) {
- if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE)))
- return fentry_ip;
- } else {
- instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE);
- }
- if (is_endbr(instr))
+ if (is_endbr((void*)(fentry_ip - ENDBR_INSN_SIZE)))
fentry_ip -= ENDBR_INSN_SIZE;
-#endif
+
return fentry_ip;
}
#define ftrace_get_symaddr(fentry_ip) arch_ftrace_get_symaddr(fentry_ip)
@@ -118,11 +106,11 @@ struct dyn_arch_ftrace {
};
#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* CONFIG_FUNCTION_TRACER */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
unsigned long frame_pointer);
@@ -166,6 +154,6 @@ static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
}
#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */
#endif /* !COMPILE_OFFSETS */
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 6ffa8b75f4cd..f00c09ffe6a9 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -3,7 +3,6 @@
#define _ASM_X86_HARDIRQ_H
#include <linux/threads.h>
-#include <asm/current.h>
typedef struct {
#if IS_ENABLED(CONFIG_KVM_INTEL)
@@ -66,7 +65,8 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu);
extern u64 arch_irq_stat(void);
#define arch_irq_stat arch_irq_stat
-#define local_softirq_pending_ref pcpu_hot.softirq_pending
+DECLARE_PER_CPU_CACHE_HOT(u16, __softirq_pending);
+#define local_softirq_pending_ref __softirq_pending
#if IS_ENABLED(CONFIG_KVM_INTEL)
/*
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index edebf1020e04..162ebd73a698 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -16,7 +16,7 @@
#include <asm/irq_vectors.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/percpu.h>
#include <linux/profile.h>
@@ -128,6 +128,6 @@ extern char spurious_entries_start[];
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
-#endif /* !ASSEMBLY_ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_HW_IRQ_H */
diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h
index 1e59581d500c..28d845257303 100644
--- a/arch/x86/include/asm/ibt.h
+++ b/arch/x86/include/asm/ibt.h
@@ -21,7 +21,7 @@
#define HAS_KERNEL_IBT 1
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_X86_64
#define ASM_ENDBR "endbr64\n\t"
@@ -41,7 +41,7 @@
_ASM_PTR fname "\n\t" \
".popsection\n\t"
-static inline __attribute_const__ u32 gen_endbr(void)
+static __always_inline __attribute_const__ u32 gen_endbr(void)
{
u32 endbr;
@@ -56,7 +56,7 @@ static inline __attribute_const__ u32 gen_endbr(void)
return endbr;
}
-static inline __attribute_const__ u32 gen_endbr_poison(void)
+static __always_inline __attribute_const__ u32 gen_endbr_poison(void)
{
/*
* 4 byte NOP that isn't NOP4 (in fact it is OSP NOP3), such that it
@@ -65,19 +65,24 @@ static inline __attribute_const__ u32 gen_endbr_poison(void)
return 0x001f0f66; /* osp nopl (%rax) */
}
-static inline bool is_endbr(u32 val)
+static inline bool __is_endbr(u32 val)
{
if (val == gen_endbr_poison())
return true;
+ /* See cfi_fineibt_bhi_preamble() */
+ if (IS_ENABLED(CONFIG_FINEIBT_BHI) && val == 0x001f0ff5)
+ return true;
+
val &= ~0x01000000U; /* ENDBR32 -> ENDBR64 */
return val == gen_endbr();
}
+extern __noendbr bool is_endbr(u32 *val);
extern __noendbr u64 ibt_save(bool disable);
extern __noendbr void ibt_restore(u64 save);
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
#ifdef CONFIG_X86_64
#define ENDBR endbr64
@@ -85,29 +90,29 @@ extern __noendbr void ibt_restore(u64 save);
#define ENDBR endbr32
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#else /* !IBT */
#define HAS_KERNEL_IBT 0
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define ASM_ENDBR
#define IBT_NOSEAL(name)
#define __noendbr
-static inline bool is_endbr(u32 val) { return false; }
+static inline bool is_endbr(u32 *val) { return false; }
static inline u64 ibt_save(bool disable) { return 0; }
static inline void ibt_restore(u64 save) { }
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
#define ENDBR
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* CONFIG_X86_KERNEL_IBT */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index ad5c68f0509d..a4ec27c67988 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -7,7 +7,7 @@
#define IDT_ALIGN (8 * (1 + HAS_KERNEL_IBT))
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/entry-common.h>
#include <linux/hardirq.h>
@@ -474,7 +474,7 @@ static inline void fred_install_sysvec(unsigned int vector, const idtentry_t fun
idt_install_sysvec(vector, asm_##function); \
}
-#else /* !__ASSEMBLY__ */
+#else /* !__ASSEMBLER__ */
/*
* The ASM variants for DECLARE_IDTENTRY*() which emit the ASM entry stubs.
@@ -579,7 +579,7 @@ SYM_CODE_START(spurious_entries_start)
SYM_CODE_END(spurious_entries_start)
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
* The actual entry points. Note that DECLARE_IDTENTRY*() serves two
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 0e82ebc5d1e1..8b1b1abcef15 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -2,7 +2,11 @@
#ifndef _ASM_X86_INIT_H
#define _ASM_X86_INIT_H
+#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000
+#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector
+#else
#define __head __section(".head.text") __no_sanitize_undefined
+#endif
struct x86_mapping_info {
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index 438ccd4f3cc4..e48a00b3311d 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -6,7 +6,7 @@
#ifndef X86_ASM_INST_H
#define X86_ASM_INST_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define REG_NUM_INVALID 100
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 6d7b04ffc5fd..3a97a7eefb51 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -45,7 +45,18 @@
/* Wildcard match so X86_MATCH_VFM(ANY) works */
#define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY)
+/* Family 5 */
+#define INTEL_FAM5_START IFM(5, 0x00) /* Notational marker, also P5 A-step */
+#define INTEL_PENTIUM_75 IFM(5, 0x02) /* P54C */
+#define INTEL_PENTIUM_MMX IFM(5, 0x04) /* P55C */
+#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */
+
+/* Family 6 */
#define INTEL_PENTIUM_PRO IFM(6, 0x01)
+#define INTEL_PENTIUM_II_KLAMATH IFM(6, 0x03)
+#define INTEL_PENTIUM_III_DESCHUTES IFM(6, 0x05)
+#define INTEL_PENTIUM_III_TUALATIN IFM(6, 0x0B)
+#define INTEL_PENTIUM_M_DOTHAN IFM(6, 0x0D)
#define INTEL_CORE_YONAH IFM(6, 0x0E)
@@ -110,9 +121,9 @@
#define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */
-#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF)
+#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) /* Raptor Cove */
-#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD)
+#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */
#define INTEL_GRANITERAPIDS_D IFM(6, 0xAE)
/* "Hybrid" Processors (P-Core/E-Core) */
@@ -126,16 +137,16 @@
#define INTEL_RAPTORLAKE_P IFM(6, 0xBA)
#define INTEL_RAPTORLAKE_S IFM(6, 0xBF)
-#define INTEL_METEORLAKE IFM(6, 0xAC)
+#define INTEL_METEORLAKE IFM(6, 0xAC) /* Redwood Cove / Crestmont */
#define INTEL_METEORLAKE_L IFM(6, 0xAA)
-#define INTEL_ARROWLAKE_H IFM(6, 0xC5)
+#define INTEL_ARROWLAKE_H IFM(6, 0xC5) /* Lion Cove / Skymont */
#define INTEL_ARROWLAKE IFM(6, 0xC6)
#define INTEL_ARROWLAKE_U IFM(6, 0xB5)
-#define INTEL_LUNARLAKE_M IFM(6, 0xBD)
+#define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* Lion Cove / Skymont */
-#define INTEL_PANTHERLAKE_L IFM(6, 0xCC)
+#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) /* Cougar Cove / Crestmont */
/* "Small Core" Processors (Atom/E-Core) */
@@ -149,9 +160,9 @@
#define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */
#define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */
#define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */
+#define INTEL_ATOM_SILVERMONT_MID2 IFM(6, 0x5A) /* Anniedale */
#define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */
-#define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */
#define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */
#define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */
@@ -176,16 +187,35 @@
#define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */
#define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */
-/* Family 5 */
-#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */
+/* Notational marker denoting the last Family 6 model */
+#define INTEL_FAM6_LAST IFM(6, 0xFF)
+
+/* Family 15 - NetBurst */
+#define INTEL_P4_WILLAMETTE IFM(15, 0x01) /* Also Xeon Foster */
+#define INTEL_P4_PRESCOTT IFM(15, 0x03)
+#define INTEL_P4_PRESCOTT_2M IFM(15, 0x04)
+#define INTEL_P4_CEDARMILL IFM(15, 0x06) /* Also Xeon Dempsey */
/* Family 19 */
#define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */
-/* CPU core types */
+/*
+ * Intel CPU core types
+ *
+ * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture
+ * of the core. Bits 31-24 indicates its core type (Core or Atom)
+ * and Bits [23:0] indicates the native model ID of the core.
+ * Core type and native model ID are defined in below enumerations.
+ */
enum intel_cpu_type {
+ INTEL_CPU_TYPE_UNKNOWN,
INTEL_CPU_TYPE_ATOM = 0x20,
INTEL_CPU_TYPE_CORE = 0x40,
};
+enum intel_native_id {
+ INTEL_ATOM_CMT_NATIVE_ID = 0x2, /* Crestmont */
+ INTEL_ATOM_SKT_NATIVE_ID = 0x3, /* Skymont */
+};
+
#endif /* _ASM_X86_INTEL_FAMILY_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index ed580c7f9d0a..1a0dc2b2bf5b 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -175,6 +175,9 @@ extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, un
extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
#define ioremap_encrypted ioremap_encrypted
+void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags);
+#define arch_memremap_wb arch_memremap_wb
+
/**
* ioremap - map bus memory into CPU space
* @offset: bus address of the memory
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 562a547c29a5..735c3a491f60 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -116,7 +116,7 @@
ASM_CALL_ARG2
#define call_on_irqstack(func, asm_call, argconstr...) \
- call_on_stack(__this_cpu_read(pcpu_hot.hardirq_stack_ptr), \
+ call_on_stack(__this_cpu_read(hardirq_stack_ptr), \
func, asm_call, argconstr)
/* Macros to assert type correctness for run_*_on_irqstack macros */
@@ -135,7 +135,7 @@
* User mode entry and interrupt on the irq stack do not \
* switch stacks. If from user mode the task stack is empty. \
*/ \
- if (user_mode(regs) || __this_cpu_read(pcpu_hot.hardirq_stack_inuse)) { \
+ if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \
irq_enter_rcu(); \
func(c_args); \
irq_exit_rcu(); \
@@ -146,9 +146,9 @@
* places. Invoke the stack switch macro with the call \
* sequence which matches the above direct invocation. \
*/ \
- __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \
+ __this_cpu_write(hardirq_stack_inuse, true); \
call_on_irqstack(func, asm_call, constr); \
- __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \
+ __this_cpu_write(hardirq_stack_inuse, false); \
} \
}
@@ -212,9 +212,9 @@
*/
#define do_softirq_own_stack() \
{ \
- __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \
+ __this_cpu_write(hardirq_stack_inuse, true); \
call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \
- __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \
+ __this_cpu_write(hardirq_stack_inuse, false); \
}
#endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index cf7fc2b8e3ce..abb8374c9ff7 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -4,7 +4,7 @@
#include <asm/processor-flags.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/nospec-branch.h>
@@ -79,7 +79,7 @@ static __always_inline void native_local_irq_restore(unsigned long flags)
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
static __always_inline unsigned long arch_local_save_flags(void)
@@ -133,10 +133,10 @@ static __always_inline unsigned long arch_local_irq_save(void)
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* CONFIG_PARAVIRT_XXL */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static __always_inline int arch_irqs_disabled_flags(unsigned long flags)
{
return !(flags & X86_EFLAGS_IF);
@@ -154,6 +154,6 @@ static __always_inline void arch_local_irq_restore(unsigned long flags)
if (!arch_irqs_disabled_flags(flags))
arch_local_irq_enable();
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 3f1c1d6c0da1..61dd1dee7812 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -7,7 +7,7 @@
#include <asm/asm.h>
#include <asm/nops.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/stringify.h>
#include <linux/types.h>
@@ -55,6 +55,6 @@ l_yes:
extern int arch_jump_entry_size(struct jump_entry *entry);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index de75306b932e..d7e33c7f096b 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -23,7 +23,7 @@
(1ULL << (__VIRTUAL_MASK_SHIFT - \
KASAN_SHADOW_SCALE_SHIFT)))
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_KASAN
void __init kasan_early_init(void);
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 8ad187462b68..5432457d2338 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -13,11 +13,12 @@
# define KEXEC_CONTROL_PAGE_SIZE 4096
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/string.h>
#include <linux/kernel.h>
+#include <asm/asm.h>
#include <asm/page.h>
#include <asm/ptrace.h>
@@ -71,41 +72,32 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
if (oldregs) {
memcpy(newregs, oldregs, sizeof(*newregs));
} else {
+ asm volatile("mov %%" _ASM_BX ",%0" : "=m"(newregs->bx));
+ asm volatile("mov %%" _ASM_CX ",%0" : "=m"(newregs->cx));
+ asm volatile("mov %%" _ASM_DX ",%0" : "=m"(newregs->dx));
+ asm volatile("mov %%" _ASM_SI ",%0" : "=m"(newregs->si));
+ asm volatile("mov %%" _ASM_DI ",%0" : "=m"(newregs->di));
+ asm volatile("mov %%" _ASM_BP ",%0" : "=m"(newregs->bp));
+ asm volatile("mov %%" _ASM_AX ",%0" : "=m"(newregs->ax));
+ asm volatile("mov %%" _ASM_SP ",%0" : "=m"(newregs->sp));
+#ifdef CONFIG_X86_64
+ asm volatile("mov %%r8,%0" : "=m"(newregs->r8));
+ asm volatile("mov %%r9,%0" : "=m"(newregs->r9));
+ asm volatile("mov %%r10,%0" : "=m"(newregs->r10));
+ asm volatile("mov %%r11,%0" : "=m"(newregs->r11));
+ asm volatile("mov %%r12,%0" : "=m"(newregs->r12));
+ asm volatile("mov %%r13,%0" : "=m"(newregs->r13));
+ asm volatile("mov %%r14,%0" : "=m"(newregs->r14));
+ asm volatile("mov %%r15,%0" : "=m"(newregs->r15));
+#endif
+ asm volatile("mov %%ss,%k0" : "=a"(newregs->ss));
+ asm volatile("mov %%cs,%k0" : "=a"(newregs->cs));
#ifdef CONFIG_X86_32
- asm volatile("movl %%ebx,%0" : "=m"(newregs->bx));
- asm volatile("movl %%ecx,%0" : "=m"(newregs->cx));
- asm volatile("movl %%edx,%0" : "=m"(newregs->dx));
- asm volatile("movl %%esi,%0" : "=m"(newregs->si));
- asm volatile("movl %%edi,%0" : "=m"(newregs->di));
- asm volatile("movl %%ebp,%0" : "=m"(newregs->bp));
- asm volatile("movl %%eax,%0" : "=m"(newregs->ax));
- asm volatile("movl %%esp,%0" : "=m"(newregs->sp));
- asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
- asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
- asm volatile("movl %%ds, %%eax;" :"=a"(newregs->ds));
- asm volatile("movl %%es, %%eax;" :"=a"(newregs->es));
- asm volatile("pushfl; popl %0" :"=m"(newregs->flags));
-#else
- asm volatile("movq %%rbx,%0" : "=m"(newregs->bx));
- asm volatile("movq %%rcx,%0" : "=m"(newregs->cx));
- asm volatile("movq %%rdx,%0" : "=m"(newregs->dx));
- asm volatile("movq %%rsi,%0" : "=m"(newregs->si));
- asm volatile("movq %%rdi,%0" : "=m"(newregs->di));
- asm volatile("movq %%rbp,%0" : "=m"(newregs->bp));
- asm volatile("movq %%rax,%0" : "=m"(newregs->ax));
- asm volatile("movq %%rsp,%0" : "=m"(newregs->sp));
- asm volatile("movq %%r8,%0" : "=m"(newregs->r8));
- asm volatile("movq %%r9,%0" : "=m"(newregs->r9));
- asm volatile("movq %%r10,%0" : "=m"(newregs->r10));
- asm volatile("movq %%r11,%0" : "=m"(newregs->r11));
- asm volatile("movq %%r12,%0" : "=m"(newregs->r12));
- asm volatile("movq %%r13,%0" : "=m"(newregs->r13));
- asm volatile("movq %%r14,%0" : "=m"(newregs->r14));
- asm volatile("movq %%r15,%0" : "=m"(newregs->r15));
- asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
- asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
- asm volatile("pushfq; popq %0" :"=m"(newregs->flags));
+ asm volatile("mov %%ds,%k0" : "=a"(newregs->ds));
+ asm volatile("mov %%es,%k0" : "=a"(newregs->es));
#endif
+ asm volatile("pushf\n\t"
+ "pop %0" : "=m"(newregs->flags));
newregs->ip = _THIS_IP_;
}
}
@@ -225,6 +217,6 @@ unsigned int arch_crash_get_elfcorehdr_size(void);
#define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 32ae3aa50c7e..a884ab544335 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -27,6 +27,7 @@
#include <linux/kfifo.h>
#include <linux/sched/vhost_task.h>
#include <linux/call_once.h>
+#include <linux/atomic.h>
#include <asm/apic.h>
#include <asm/pvclock-abi.h>
@@ -405,7 +406,7 @@ union kvm_cpu_role {
};
struct kvm_rmap_head {
- unsigned long val;
+ atomic_long_t val;
};
struct kvm_pio_request {
@@ -880,6 +881,7 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
+ bool cpuid_dynamic_bits_dirty;
bool is_amd_compatible;
/*
@@ -909,7 +911,8 @@ struct kvm_vcpu_arch {
int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
gpa_t time;
- struct pvclock_vcpu_time_info hv_clock;
+ s8 pvclock_tsc_shift;
+ u32 pvclock_tsc_mul;
unsigned int hw_tsc_khz;
struct gfn_to_pfn_cache pv_time;
/* set guest stopped flag in pvclock flags field */
@@ -997,8 +1000,8 @@ struct kvm_vcpu_arch {
u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */
u16 vec;
u32 id;
- bool send_user_only;
u32 host_apf_flags;
+ bool send_always;
bool delivery_as_pf_vmexit;
bool pageready_pending;
} apf;
@@ -1053,6 +1056,7 @@ struct kvm_vcpu_arch {
/* Protected Guests */
bool guest_state_protected;
+ bool guest_tsc_protected;
/*
* Set when PDPTS were loaded directly by the userspace without
@@ -1189,6 +1193,8 @@ struct kvm_xen {
struct gfn_to_pfn_cache shinfo_cache;
struct idr evtchn_ports;
unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+
+ struct kvm_xen_hvm_config hvm_config;
};
#endif
@@ -1354,8 +1360,6 @@ struct kvm_arch {
u64 shadow_mmio_value;
- struct iommu_domain *iommu_domain;
- bool iommu_noncoherent;
#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
atomic_t noncoherent_dma_count;
#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
@@ -1411,8 +1415,6 @@ struct kvm_arch {
struct delayed_work kvmclock_update_work;
struct delayed_work kvmclock_sync_work;
- struct kvm_xen_hvm_config xen_hvm_config;
-
/* reads protected by irq_srcu, writes by irq_lock */
struct hlist_head mask_notifier_list;
@@ -1479,6 +1481,7 @@ struct kvm_arch {
* tdp_mmu_page set.
*
* For reads, this list is protected by:
+ * RCU alone or
* the MMU lock in read mode + RCU or
* the MMU lock in write mode
*
@@ -2164,8 +2167,8 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload);
-void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
-void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
+ bool has_error_code, u32 error_code);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault);
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index dc31b13b87a0..b51d8a4673f5 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -38,7 +38,7 @@
#define ASM_FUNC_ALIGN __stringify(__FUNC_ALIGN)
#define SYM_F_ALIGN __FUNC_ALIGN
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
#define RET jmp __x86_return_thunk
@@ -50,7 +50,7 @@
#endif
#endif /* CONFIG_MITIGATION_RETPOLINE */
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
#define ASM_RET "jmp __x86_return_thunk\n\t"
@@ -62,7 +62,7 @@
#endif
#endif /* CONFIG_MITIGATION_RETPOLINE */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
* Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the
@@ -119,33 +119,27 @@
/* SYM_FUNC_START -- use for global functions */
#define SYM_FUNC_START(name) \
- SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \
- ENDBR
+ SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN)
/* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */
#define SYM_FUNC_START_NOALIGN(name) \
- SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \
- ENDBR
+ SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE)
/* SYM_FUNC_START_LOCAL -- use for local functions */
#define SYM_FUNC_START_LOCAL(name) \
- SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) \
- ENDBR
+ SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN)
/* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */
#define SYM_FUNC_START_LOCAL_NOALIGN(name) \
- SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \
- ENDBR
+ SYM_START(name, SYM_L_LOCAL, SYM_A_NONE)
/* SYM_FUNC_START_WEAK -- use for weak functions */
#define SYM_FUNC_START_WEAK(name) \
- SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) \
- ENDBR
+ SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN)
/* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */
#define SYM_FUNC_START_WEAK_NOALIGN(name) \
- SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \
- ENDBR
+ SYM_START(name, SYM_L_WEAK, SYM_A_NONE)
#endif /* _ASM_X86_LINKAGE_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index eb2db07ef39c..6c77c03139f7 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -296,8 +296,6 @@ enum mcp_flags {
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
-bool mce_notify_irq(void);
-
DECLARE_PER_CPU(struct mce, injectm);
/* Disable CMCI/polling for MCA bank claimed by firmware */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index f922b682b9b4..1530ee301dfe 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -10,7 +10,7 @@
#ifndef __X86_MEM_ENCRYPT_H__
#define __X86_MEM_ENCRYPT_H__
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/init.h>
#include <linux/cc_platform.h>
@@ -114,6 +114,6 @@ void add_encrypt_protection_map(void);
extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 3b496cdcb74b..8b8055a8eb9e 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -69,6 +69,18 @@ typedef struct {
u16 pkey_allocation_map;
s16 execute_only_pkey;
#endif
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+ /*
+ * The global ASID will be a non-zero value when the process has
+ * the same ASID across all CPUs, allowing it to make use of
+ * hardware-assisted remote TLB invalidation like AMD INVLPGB.
+ */
+ u16 global_asid;
+
+ /* The process is transitioning to a new global ASID number. */
+ bool asid_transition;
+#endif
} mm_context_t;
#define INIT_MM_CONTEXT(mm) \
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 795fdd53bd0a..2398058b6e83 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -2,7 +2,6 @@
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H
-#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>
@@ -13,6 +12,7 @@
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/gsseg.h>
+#include <asm/desc.h>
extern atomic64_t last_mm_ctx_id;
@@ -139,6 +139,11 @@ static inline void mm_reset_untag_mask(struct mm_struct *mm)
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+#define mm_init_global_asid mm_init_global_asid
+extern void mm_init_global_asid(struct mm_struct *mm);
+
+extern void mm_free_global_asid(struct mm_struct *mm);
+
/*
* Init a new mm. Used on mm copies, like at fork()
* and on mm's that are brand-new, like at execve().
@@ -161,6 +166,8 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.execute_only_pkey = -1;
}
#endif
+
+ mm_init_global_asid(mm);
mm_reset_untag_mask(mm);
init_new_context_ldt(mm);
return 0;
@@ -170,6 +177,7 @@ static inline int init_new_context(struct task_struct *tsk,
static inline void destroy_context(struct mm_struct *mm)
{
destroy_context_ldt(mm);
+ mm_free_global_asid(mm);
}
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index f91ab1e75f9f..bab5ccfc60a7 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -43,8 +43,6 @@ extern bool hyperv_paravisor_present;
extern void *hv_hypercall_pg;
-extern u64 hv_current_partition_id;
-
extern union hv_ghcb * __percpu *hv_ghcb_pg;
bool hv_isolation_type_snp(void);
@@ -58,10 +56,6 @@ u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2);
#define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
#define HV_AP_SEGMENT_LIMIT 0xffffffff
-int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
-int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
-int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
-
/*
* If the hypercall involves no input or output parameters, the hypervisor
* ignores the corresponding GPA pointer.
@@ -77,11 +71,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
return hv_tdx_hypercall(control, input_address, output_address);
if (hv_isolation_type_snp() && !hyperv_paravisor_present) {
- __asm__ __volatile__("mov %4, %%r8\n"
+ __asm__ __volatile__("mov %[output_address], %%r8\n"
"vmmcall"
: "=a" (hv_status), ASM_CALL_CONSTRAINT,
"+c" (control), "+d" (input_address)
- : "r" (output_address)
+ : [output_address] "r" (output_address)
: "cc", "memory", "r8", "r9", "r10", "r11");
return hv_status;
}
@@ -89,12 +83,12 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
if (!hv_hypercall_pg)
return U64_MAX;
- __asm__ __volatile__("mov %4, %%r8\n"
+ __asm__ __volatile__("mov %[output_address], %%r8\n"
CALL_NOSPEC
: "=a" (hv_status), ASM_CALL_CONSTRAINT,
"+c" (control), "+d" (input_address)
- : "r" (output_address),
- THUNK_TARGET(hv_hypercall_pg)
+ : [output_address] "r" (output_address),
+ THUNK_TARGET(hv_hypercall_pg)
: "cc", "memory", "r8", "r9", "r10", "r11");
#else
u32 input_address_hi = upper_32_bits(input_address);
@@ -160,7 +154,7 @@ static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1)
: "cc", "edi", "esi");
}
#endif
- return hv_status;
+ return hv_status;
}
static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
@@ -187,18 +181,18 @@ static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2)
return hv_tdx_hypercall(control, input1, input2);
if (hv_isolation_type_snp() && !hyperv_paravisor_present) {
- __asm__ __volatile__("mov %4, %%r8\n"
+ __asm__ __volatile__("mov %[input2], %%r8\n"
"vmmcall"
: "=a" (hv_status), ASM_CALL_CONSTRAINT,
"+c" (control), "+d" (input1)
- : "r" (input2)
+ : [input2] "r" (input2)
: "cc", "r8", "r9", "r10", "r11");
} else {
- __asm__ __volatile__("mov %4, %%r8\n"
+ __asm__ __volatile__("mov %[input2], %%r8\n"
CALL_NOSPEC
: "=a" (hv_status), ASM_CALL_CONSTRAINT,
"+c" (control), "+d" (input1)
- : "r" (input2),
+ : [input2] "r" (input2),
THUNK_TARGET(hv_hypercall_pg)
: "cc", "r8", "r9", "r10", "r11");
}
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 72765b2fe0d8..e6134ef2263d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -25,6 +25,7 @@
#define _EFER_SVME 12 /* Enable virtualization */
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
#define EFER_SCE (1<<_EFER_SCE)
@@ -34,6 +35,7 @@
#define EFER_SVME (1<<_EFER_SVME)
#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
+#define EFER_TCE (1<<_EFER_TCE)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
/*
@@ -701,15 +703,17 @@
#define MSR_AMD_CPPC_REQ 0xc00102b3
#define MSR_AMD_CPPC_STATUS 0xc00102b4
-#define AMD_CPPC_LOWEST_PERF(x) (((x) >> 0) & 0xff)
-#define AMD_CPPC_LOWNONLIN_PERF(x) (((x) >> 8) & 0xff)
-#define AMD_CPPC_NOMINAL_PERF(x) (((x) >> 16) & 0xff)
-#define AMD_CPPC_HIGHEST_PERF(x) (((x) >> 24) & 0xff)
+/* Masks for use with MSR_AMD_CPPC_CAP1 */
+#define AMD_CPPC_LOWEST_PERF_MASK GENMASK(7, 0)
+#define AMD_CPPC_LOWNONLIN_PERF_MASK GENMASK(15, 8)
+#define AMD_CPPC_NOMINAL_PERF_MASK GENMASK(23, 16)
+#define AMD_CPPC_HIGHEST_PERF_MASK GENMASK(31, 24)
-#define AMD_CPPC_MAX_PERF(x) (((x) & 0xff) << 0)
-#define AMD_CPPC_MIN_PERF(x) (((x) & 0xff) << 8)
-#define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16)
-#define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24)
+/* Masks for use with MSR_AMD_CPPC_REQ */
+#define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0)
+#define AMD_CPPC_MIN_PERF_MASK GENMASK(15, 8)
+#define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16)
+#define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24)
/* AMD Performance Counter Global Status and Control MSRs */
#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300
@@ -721,6 +725,7 @@
/* Zen4 */
#define MSR_ZEN4_BP_CFG 0xc001102e
+#define MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT 4
#define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5
/* Fam 19h MSRs */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 001853541f1e..9397a319d165 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -4,7 +4,7 @@
#include "msr-index.h"
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/asm.h>
#include <asm/errno.h>
@@ -397,5 +397,5 @@ static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
return wrmsr_safe_regs(regs);
}
#endif /* CONFIG_SMP */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_MSR_H */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 41a0ebb699ec..f677382093f3 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -56,6 +56,8 @@ int __register_nmi_handler(unsigned int, struct nmiaction *);
void unregister_nmi_handler(unsigned int, const char *);
+void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler);
+
void stop_nmi(void);
void restart_nmi(void);
void local_touch_nmi(void);
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index 1c1b7550fa55..cd94221d8335 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -82,7 +82,7 @@
#define ASM_NOP7 _ASM_BYTES(BYTES_NOP7)
#define ASM_NOP8 _ASM_BYTES(BYTES_NOP8)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern const unsigned char * const x86_nops[];
#endif
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index aee26bb8230f..e4d11e3318f0 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -12,7 +12,6 @@
#include <asm/msr-index.h>
#include <asm/unwind_hints.h>
#include <asm/percpu.h>
-#include <asm/current.h>
/*
* Call depth tracking for Intel SKL CPUs to address the RSB underflow
@@ -78,21 +77,21 @@
#include <asm/asm-offsets.h>
#define CREDIT_CALL_DEPTH \
- movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+ movq $-1, PER_CPU_VAR(__x86_call_depth);
#define RESET_CALL_DEPTH \
xor %eax, %eax; \
bts $63, %rax; \
- movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+ movq %rax, PER_CPU_VAR(__x86_call_depth);
#define RESET_CALL_DEPTH_FROM_CALL \
movb $0xfc, %al; \
shl $56, %rax; \
- movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \
+ movq %rax, PER_CPU_VAR(__x86_call_depth); \
CALL_THUNKS_DEBUG_INC_CALLS
#define INCREMENT_CALL_DEPTH \
- sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \
+ sarq $5, PER_CPU_VAR(__x86_call_depth); \
CALL_THUNKS_DEBUG_INC_CALLS
#else
@@ -177,7 +176,7 @@
add $(BITS_PER_LONG/8), %_ASM_SP; \
lfence;
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
/*
* (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions
@@ -335,7 +334,7 @@
#define CLEAR_BRANCH_HISTORY_VMEXIT
#endif
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
extern retpoline_thunk_t __x86_indirect_thunk_array[];
@@ -387,6 +386,8 @@ extern void call_depth_return_thunk(void);
__stringify(INCREMENT_CALL_DEPTH), \
X86_FEATURE_CALL_DEPTH)
+DECLARE_PER_CPU_CACHE_HOT(u64, __x86_call_depth);
+
#ifdef CONFIG_CALL_THUNKS_DEBUG
DECLARE_PER_CPU(u64, __x86_call_count);
DECLARE_PER_CPU(u64, __x86_ret_count);
@@ -521,7 +522,7 @@ extern u64 x86_pred_cmd;
static inline void indirect_branch_prediction_barrier(void)
{
- alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB);
+ alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_IBPB);
}
/* The Intel SPEC CTRL MSR base value cache */
@@ -558,6 +559,8 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+DECLARE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
+
DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
@@ -602,6 +605,6 @@ static __always_inline void mds_idle_clear_cpu_buffers(void)
mds_clear_cpu_buffers();
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
index 46d7e06763c9..e0125afa53fb 100644
--- a/arch/x86/include/asm/orc_types.h
+++ b/arch/x86/include/asm/orc_types.h
@@ -45,7 +45,7 @@
#define ORC_TYPE_REGS 3
#define ORC_TYPE_REGS_PARTIAL 4
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/byteorder.h>
/*
@@ -73,6 +73,6 @@ struct orc_entry {
#endif
} __packed;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ORC_TYPES_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index c9fe207916f4..9265f2fca99a 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -14,7 +14,7 @@
#include <asm/page_32.h>
#endif /* CONFIG_X86_64 */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct page;
@@ -84,7 +84,7 @@ static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits)
return __canonical_address(vaddr, vaddr_bits) == vaddr;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index 580d71aca65a..0c623706cb7e 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -4,7 +4,7 @@
#include <asm/page_32_types.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET)
#ifdef CONFIG_DEBUG_VIRTUAL
@@ -26,6 +26,6 @@ static inline void copy_page(void *to, void *from)
{
memcpy(to, from, PAGE_SIZE);
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_PAGE_32_H */
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index faf9cc1c14bb..a9b62e0e6f79 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -11,8 +11,8 @@
* a virtual address space of one gigabyte, which limits the
* amount of physical memory you can use to about 950MB.
*
- * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
- * and CONFIG_HIGHMEM64G options in the kernel configuration.
+ * If you want more physical memory than this then see the CONFIG_VMSPLIT_2G
+ * and CONFIG_HIGHMEM4G options in the kernel configuration.
*/
#define __PAGE_OFFSET_BASE _AC(CONFIG_PAGE_OFFSET, UL)
#define __PAGE_OFFSET __PAGE_OFFSET_BASE
@@ -63,7 +63,7 @@
*/
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* This much address space is reserved for vmalloc() and iomap()
@@ -75,6 +75,6 @@ extern int sysctl_legacy_va_layout;
extern void find_low_pfn_range(void);
extern void setup_bootmem_allocator(void);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_PAGE_32_DEFS_H */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index d63576608ce7..d3aab6f4e59a 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -4,7 +4,7 @@
#include <asm/page_64_types.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
@@ -55,11 +55,12 @@ static inline void clear_page(void *page)
clear_page_rep, X86_FEATURE_REP_GOOD,
clear_page_erms, X86_FEATURE_ERMS,
"=D" (page),
- "D" (page)
- : "cc", "memory", "rax", "rcx");
+ "D" (page),
+ "cc", "memory", "rax", "rcx");
}
void copy_page(void *to, void *from);
+KCFI_REFERENCE(copy_page);
#ifdef CONFIG_X86_5LEVEL
/*
@@ -94,7 +95,7 @@ static __always_inline unsigned long task_size_max(void)
}
#endif /* CONFIG_X86_5LEVEL */
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#ifdef CONFIG_X86_VSYSCALL_EMULATION
# define __HAVE_ARCH_GATE_AREA 1
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 06ef25411d62..1faa8f88850a 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_PAGE_64_DEFS_H
#define _ASM_X86_PAGE_64_DEFS_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/kaslr.h>
#endif
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 974688973cf6..9f77bf03d747 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -43,7 +43,7 @@
#define IOREMAP_MAX_ORDER (PMD_SHIFT)
#endif /* CONFIG_X86_64 */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
extern phys_addr_t physical_mask;
@@ -66,6 +66,6 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
extern void initmem_init(void);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 041aff51eb50..bed346bfac89 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -6,7 +6,7 @@
#include <asm/paravirt_types.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct mm_struct;
#endif
@@ -15,7 +15,7 @@ struct mm_struct;
#include <asm/asm.h>
#include <asm/nospec-branch.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/cpumask.h>
@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(const struct cpumask *cpumask,
PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
}
-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
-}
-
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
PVOP_VCALL1(mmu.exit_mmap, mm);
@@ -720,7 +715,7 @@ static __always_inline unsigned long arch_local_irq_save(void)
extern void default_banner(void);
void native_pv_lock_init(void) __init;
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
@@ -740,18 +735,18 @@ void native_pv_lock_init(void) __init;
#endif /* CONFIG_PARAVIRT_XXL */
#endif /* CONFIG_X86_64 */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#else /* CONFIG_PARAVIRT */
# define default_banner x86_init_noop
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline void native_pv_lock_init(void)
{
}
#endif
#endif /* !CONFIG_PARAVIRT */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifndef CONFIG_PARAVIRT_XXL
static inline void paravirt_enter_mmap(struct mm_struct *mm)
{
@@ -769,5 +764,5 @@ static inline void paravirt_set_cap(void)
{
}
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index fea56b04f436..62912023b46f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -4,7 +4,7 @@
#ifdef CONFIG_PARAVIRT
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <asm/desc_defs.h>
@@ -134,8 +134,6 @@ struct pv_mmu_ops {
void (*flush_tlb_multi)(const struct cpumask *cpus,
const struct flush_tlb_info *info);
- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
-
/* Hook for intercepting the destruction of an mm_struct. */
void (*exit_mmap)(struct mm_struct *mm);
void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
@@ -242,9 +240,17 @@ extern struct paravirt_patch_template pv_ops;
#define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op)
-int paravirt_disable_iospace(void);
-
-/* This generates an indirect call based on the operation type number. */
+/*
+ * This generates an indirect call based on the operation type number.
+ *
+ * Since alternatives run after enabling CET/IBT -- the latter setting/clearing
+ * capabilities and the former requiring all capabilities being finalized --
+ * these indirect calls are subject to IBT and the paravirt stubs should have
+ * ENDBR on.
+ *
+ * OTOH since this is effectively a __nocfi indirect call, the paravirt stubs
+ * don't need to bother with CFI prefixes.
+ */
#define PARAVIRT_CALL \
ANNOTATE_RETPOLINE_SAFE \
"call *%[paravirt_opptr];"
@@ -519,7 +525,7 @@ unsigned long pv_native_read_cr2(void);
#define paravirt_nop ((void *)nop_func)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index e525cd85f999..105db2d33c7b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -10,7 +10,7 @@
# define __percpu_rel
#endif
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#ifdef CONFIG_SMP
# define __percpu %__percpu_seg:
@@ -20,14 +20,9 @@
#define PER_CPU_VAR(var) __percpu(var)__percpu_rel
-#ifdef CONFIG_X86_64_SMP
-# define INIT_PER_CPU_VAR(var) init_per_cpu__##var
-#else
-# define INIT_PER_CPU_VAR(var) var
-#endif
-
#else /* !__ASSEMBLY__: */
+#include <linux/args.h>
#include <linux/build_bug.h>
#include <linux/stringify.h>
#include <asm/asm.h>
@@ -41,12 +36,7 @@
# define __seg_fs __attribute__((address_space(__seg_fs)))
#endif
-#ifdef CONFIG_X86_64
-# define __percpu_seg_override __seg_gs
-#else
-# define __percpu_seg_override __seg_fs
-#endif
-
+#define __percpu_seg_override CONCATENATE(__seg_, __percpu_seg)
#define __percpu_prefix ""
#else /* !CONFIG_CC_HAS_NAMED_AS: */
@@ -98,22 +88,6 @@
#define __force_percpu_arg(x) __force_percpu_prefix "%" #x
/*
- * Initialized pointers to per-CPU variables needed for the boot
- * processor need to use these macros to get the proper address
- * offset from __per_cpu_load on SMP.
- *
- * There also must be an entry in vmlinux_64.lds.S
- */
-#define DECLARE_INIT_PER_CPU(var) \
- extern typeof(var) init_per_cpu_var(var)
-
-#ifdef CONFIG_X86_64_SMP
-# define init_per_cpu_var(var) init_per_cpu__##var
-#else
-# define init_per_cpu_var(var) var
-#endif
-
-/*
* For arch-specific code, we can use direct single-insn ops (they
* don't give an lvalue though).
*/
@@ -128,15 +102,10 @@
#define __pcpu_cast_4(val) ((u32)(((unsigned long) val) & 0xffffffff))
#define __pcpu_cast_8(val) ((u64)(val))
-#define __pcpu_op1_1(op, dst) op "b " dst
-#define __pcpu_op1_2(op, dst) op "w " dst
-#define __pcpu_op1_4(op, dst) op "l " dst
-#define __pcpu_op1_8(op, dst) op "q " dst
-
-#define __pcpu_op2_1(op, src, dst) op "b " src ", " dst
-#define __pcpu_op2_2(op, src, dst) op "w " src ", " dst
-#define __pcpu_op2_4(op, src, dst) op "l " src ", " dst
-#define __pcpu_op2_8(op, src, dst) op "q " src ", " dst
+#define __pcpu_op_1(op) op "b "
+#define __pcpu_op_2(op) op "w "
+#define __pcpu_op_4(op) op "l "
+#define __pcpu_op_8(op) op "q "
#define __pcpu_reg_1(mod, x) mod "q" (x)
#define __pcpu_reg_2(mod, x) mod "r" (x)
@@ -168,7 +137,8 @@ do { \
({ \
__pcpu_type_##size pfo_val__; \
\
- asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), "%[val]") \
+ asm qual (__pcpu_op_##size("mov") \
+ __percpu_arg([var]) ", %[val]" \
: [val] __pcpu_reg_##size("=", pfo_val__) \
: [var] "m" (__my_cpu_var(_var))); \
\
@@ -184,7 +154,8 @@ do { \
pto_tmp__ = (_val); \
(void)pto_tmp__; \
} \
- asm qual(__pcpu_op2_##size("mov", "%[val]", __percpu_arg([var])) \
+ asm qual (__pcpu_op_##size("mov") "%[val], " \
+ __percpu_arg([var]) \
: [var] "=m" (__my_cpu_var(_var)) \
: [val] __pcpu_reg_imm_##size(pto_val__)); \
} while (0)
@@ -201,7 +172,8 @@ do { \
({ \
__pcpu_type_##size pfo_val__; \
\
- asm(__pcpu_op2_##size("mov", __force_percpu_arg(a[var]), "%[val]") \
+ asm(__pcpu_op_##size("mov") \
+ __force_percpu_arg(a[var]) ", %[val]" \
: [val] __pcpu_reg_##size("=", pfo_val__) \
: [var] "i" (&(_var))); \
\
@@ -210,7 +182,7 @@ do { \
#define percpu_unary_op(size, qual, op, _var) \
({ \
- asm qual (__pcpu_op1_##size(op, __percpu_arg([var])) \
+ asm qual (__pcpu_op_##size(op) __percpu_arg([var]) \
: [var] "+m" (__my_cpu_var(_var))); \
})
@@ -223,7 +195,7 @@ do { \
pto_tmp__ = (_val); \
(void)pto_tmp__; \
} \
- asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \
+ asm qual (__pcpu_op_##size(op) "%[val], " __percpu_arg([var]) \
: [var] "+m" (__my_cpu_var(_var)) \
: [val] __pcpu_reg_imm_##size(pto_val__)); \
} while (0)
@@ -259,8 +231,8 @@ do { \
({ \
__pcpu_type_##size paro_tmp__ = __pcpu_cast_##size(_val); \
\
- asm qual (__pcpu_op2_##size("xadd", "%[tmp]", \
- __percpu_arg([var])) \
+ asm qual (__pcpu_op_##size("xadd") "%[tmp], " \
+ __percpu_arg([var]) \
: [tmp] __pcpu_reg_##size("+", paro_tmp__), \
[var] "+m" (__my_cpu_var(_var)) \
: : "memory"); \
@@ -303,8 +275,8 @@ do { \
__pcpu_type_##size pco_old__ = __pcpu_cast_##size(_oval); \
__pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \
\
- asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \
- __percpu_arg([var])) \
+ asm qual (__pcpu_op_##size("cmpxchg") "%[nval], " \
+ __percpu_arg([var]) \
: [oval] "+a" (pco_old__), \
[var] "+m" (__my_cpu_var(_var)) \
: [nval] __pcpu_reg_##size(, pco_new__) \
@@ -320,8 +292,8 @@ do { \
__pcpu_type_##size pco_old__ = *pco_oval__; \
__pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \
\
- asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \
- __percpu_arg([var])) \
+ asm qual (__pcpu_op_##size("cmpxchg") "%[nval], " \
+ __percpu_arg([var]) \
CC_SET(z) \
: CC_OUT(z) (success), \
[oval] "+a" (pco_old__), \
@@ -348,15 +320,14 @@ do { \
old__.var = _oval; \
new__.var = _nval; \
\
- asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
- "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
- : [var] "+m" (__my_cpu_var(_var)), \
- "+a" (old__.low), \
- "+d" (old__.high) \
- : "b" (new__.low), \
- "c" (new__.high), \
- "S" (&(_var)) \
- : "memory"); \
+ asm_inline qual ( \
+ ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
+ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
+ : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \
+ "+a" (old__.low), "+d" (old__.high)) \
+ : "b" (new__.low), "c" (new__.high), \
+ "S" (&(_var)) \
+ : "memory"); \
\
old__.var; \
})
@@ -378,17 +349,16 @@ do { \
old__.var = *_oval; \
new__.var = _nval; \
\
- asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
- "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
- CC_SET(z) \
- : CC_OUT(z) (success), \
- [var] "+m" (__my_cpu_var(_var)), \
- "+a" (old__.low), \
- "+d" (old__.high) \
- : "b" (new__.low), \
- "c" (new__.high), \
- "S" (&(_var)) \
- : "memory"); \
+ asm_inline qual ( \
+ ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
+ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
+ CC_SET(z) \
+ : ALT_OUTPUT_SP(CC_OUT(z) (success), \
+ [var] "+m" (__my_cpu_var(_var)), \
+ "+a" (old__.low), "+d" (old__.high)) \
+ : "b" (new__.low), "c" (new__.high), \
+ "S" (&(_var)) \
+ : "memory"); \
if (unlikely(!success)) \
*_oval = old__.var; \
\
@@ -419,15 +389,14 @@ do { \
old__.var = _oval; \
new__.var = _nval; \
\
- asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
- "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
- : [var] "+m" (__my_cpu_var(_var)), \
- "+a" (old__.low), \
- "+d" (old__.high) \
- : "b" (new__.low), \
- "c" (new__.high), \
- "S" (&(_var)) \
- : "memory"); \
+ asm_inline qual ( \
+ ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
+ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
+ : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \
+ "+a" (old__.low), "+d" (old__.high)) \
+ : "b" (new__.low), "c" (new__.high), \
+ "S" (&(_var)) \
+ : "memory"); \
\
old__.var; \
})
@@ -449,19 +418,19 @@ do { \
old__.var = *_oval; \
new__.var = _nval; \
\
- asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
- "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
- CC_SET(z) \
- : CC_OUT(z) (success), \
- [var] "+m" (__my_cpu_var(_var)), \
- "+a" (old__.low), \
- "+d" (old__.high) \
- : "b" (new__.low), \
- "c" (new__.high), \
- "S" (&(_var)) \
- : "memory"); \
+ asm_inline qual ( \
+ ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
+ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
+ CC_SET(z) \
+ : ALT_OUTPUT_SP(CC_OUT(z) (success), \
+ [var] "+m" (__my_cpu_var(_var)), \
+ "+a" (old__.low), "+d" (old__.high)) \
+ : "b" (new__.low), "c" (new__.high), \
+ "S" (&(_var)) \
+ : "memory"); \
if (unlikely(!success)) \
*_oval = old__.var; \
+ \
likely(success); \
})
@@ -582,7 +551,7 @@ do { \
* it is accessed while this_cpu_read_stable() allows the value to be cached.
* this_cpu_read_stable() is more efficient and can be used if its value
* is guaranteed to be valid across CPUs. The current users include
- * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are
+ * current_task and cpu_current_top_of_stack, both of which are
* actually per-thread variables implemented as per-CPU variables and
* thus stable for the duration of the respective task.
*/
@@ -617,9 +586,9 @@ do { \
#include <asm-generic/percpu.h>
/* We can use this directly for local CPU (faster). */
-DECLARE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off);
+DECLARE_PER_CPU_CACHE_HOT(unsigned long, this_cpu_off);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 0ba8d20f2d1d..812dac3f79f0 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -141,6 +141,12 @@
#define PEBS_DATACFG_XMMS BIT_ULL(2)
#define PEBS_DATACFG_LBRS BIT_ULL(3)
#define PEBS_DATACFG_LBR_SHIFT 24
+#define PEBS_DATACFG_CNTR BIT_ULL(4)
+#define PEBS_DATACFG_CNTR_SHIFT 32
+#define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0)
+#define PEBS_DATACFG_FIX_SHIFT 48
+#define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0)
+#define PEBS_DATACFG_METRICS BIT_ULL(5)
/* Steal the highest bit of pebs_data_cfg for SW usage */
#define PEBS_UPDATE_DS_SW BIT_ULL(63)
@@ -482,6 +488,15 @@ struct pebs_xmm {
u64 xmm[16*2]; /* two entries for each register */
};
+struct pebs_cntr_header {
+ u32 cntr;
+ u32 fixed;
+ u32 metrics;
+ u32 reserved;
+};
+
+#define INTEL_CNTR_METRICS 0x3
+
/*
* AMD Extended Performance Monitoring and Debug cpuid feature detection
*/
@@ -509,6 +524,8 @@ struct pebs_xmm {
#define IBS_CAPS_FETCHCTLEXTD (1U<<9)
#define IBS_CAPS_OPDATA4 (1U<<10)
#define IBS_CAPS_ZEN4 (1U<<11)
+#define IBS_CAPS_OPLDLAT (1U<<12)
+#define IBS_CAPS_OPDTLBPGSIZE (1U<<19)
#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \
| IBS_CAPS_FETCHSAM \
@@ -534,8 +551,11 @@ struct pebs_xmm {
* The lower 7 bits of the current count are random bits
* preloaded by hardware and ignored in software
*/
+#define IBS_OP_LDLAT_EN (1ULL<<63)
+#define IBS_OP_LDLAT_THRSH (0xFULL<<59)
#define IBS_OP_CUR_CNT (0xFFF80ULL<<32)
#define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32)
+#define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52)
#define IBS_OP_CNT_CTL (1ULL<<19)
#define IBS_OP_VAL (1ULL<<18)
#define IBS_OP_ENABLE (1ULL<<17)
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index dd4841231bb9..a33147520044 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -29,11 +29,6 @@ static inline void paravirt_release_pud(unsigned long pfn) {}
static inline void paravirt_release_p4d(unsigned long pfn) {}
#endif
-/*
- * Flags to use when allocating a user page table page.
- */
-extern gfp_t __userpte_alloc_gfp;
-
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
* Instead of one PGD, we acquire two PGDs. Being order-1, it is
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index 4a12c276b181..66425424ce91 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H
#define _ASM_X86_PGTABLE_2LEVEL_DEFS_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
typedef unsigned long pteval_t;
@@ -16,7 +16,7 @@ typedef union {
pteval_t pte;
pteval_t pte_low;
} pte_t;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#define SHARED_KERNEL_PMD 0
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 80911349519e..9d5b257d44e3 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
typedef u64 pteval_t;
@@ -25,7 +25,7 @@ typedef union {
};
pmdval_t pmd;
} pmd_t;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI))
diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h
index a0c1525f1b6f..e12e52ae8083 100644
--- a/arch/x86/include/asm/pgtable-invert.h
+++ b/arch/x86/include/asm/pgtable-invert.h
@@ -2,7 +2,7 @@
#ifndef _ASM_PGTABLE_INVERT_H
#define _ASM_PGTABLE_INVERT_H 1
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* A clear pte value is special, and doesn't get inverted.
@@ -36,6 +36,6 @@ static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
return val;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 593f10aabd45..7bd6bd6df4a1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -15,7 +15,7 @@
cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \
: (prot))
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/spinlock.h>
#include <asm/x86_init.h>
#include <asm/pkru.h>
@@ -973,7 +973,7 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
}
#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#ifdef CONFIG_X86_32
@@ -982,7 +982,7 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
# include <asm/pgtable_64.h>
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/log2.h>
@@ -1233,12 +1233,12 @@ static inline int pgd_none(pgd_t pgd)
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern int direct_gbpages;
void init_mem_mapping(void);
@@ -1812,6 +1812,6 @@ bool arch_is_platform_page(u64 paddr);
WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
set_pgd(pgdp, pgd); \
})
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 7d4ad8907297..b612cc57a4d3 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -13,7 +13,7 @@
* This file contains the functions and defines necessary to modify and use
* the i386 page table tree.
*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/processor.h>
#include <linux/threads.h>
#include <asm/paravirt.h>
@@ -45,7 +45,7 @@ do { \
flush_tlb_one_kernel((vaddr)); \
} while (0)
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/*
* This is used to calculate the .brk reservation for initial pagetables.
diff --git a/arch/x86/include/asm/pgtable_32_areas.h b/arch/x86/include/asm/pgtable_32_areas.h
index b6355416a15a..921148b42967 100644
--- a/arch/x86/include/asm/pgtable_32_areas.h
+++ b/arch/x86/include/asm/pgtable_32_areas.h
@@ -13,7 +13,7 @@
*/
#define VMALLOC_OFFSET (8 * 1024 * 1024)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern bool __vmalloc_start_set; /* set once high_memory is set */
#endif
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index d1426b64c1b9..b89f8f1194a9 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -5,7 +5,7 @@
#include <linux/const.h>
#include <asm/pgtable_64_types.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* This file contains the functions and defines necessary to modify and use
@@ -270,7 +270,7 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
#include <asm/pgtable-invert.h>
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
#define l4_index(x) (((x) >> 39) & 511)
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
@@ -291,5 +291,5 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
i = i + 1 ; \
.endr
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index ec68f8369bdc..5bb782d856f2 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -4,7 +4,7 @@
#include <asm/sparsemem.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <asm/kaslr.h>
@@ -44,7 +44,7 @@ static inline bool pgtable_l5_enabled(void)
extern unsigned int pgdir_shift;
extern unsigned int ptrs_per_p4d;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#define SHARED_KERNEL_PMD 0
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4b804531b03c..b74ec5c3643b 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -33,6 +33,7 @@
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
#define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
+#define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
#ifdef CONFIG_X86_64
@@ -64,6 +65,7 @@
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K)
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
@@ -164,7 +166,7 @@
* to have the WB mode at index 0 (all bits clear). This is the default
* right now and likely would break too much if changed.
*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
enum page_cache_mode {
_PAGE_CACHE_MODE_WB = 0,
_PAGE_CACHE_MODE_WC = 1,
@@ -177,7 +179,7 @@ enum page_cache_mode {
};
#endif
-#define _PAGE_CC (_AT(pteval_t, cc_mask))
+#define _PAGE_CC (_AT(pteval_t, cc_get_mask()))
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
#define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
@@ -239,7 +241,7 @@ enum page_cache_mode {
#define __PAGE_KERNEL_IO_NOCACHE __PAGE_KERNEL_NOCACHE
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _ENC)
#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _ENC)
@@ -262,7 +264,7 @@ enum page_cache_mode {
#define PAGE_KERNEL_IO __pgprot_mask(__PAGE_KERNEL_IO)
#define PAGE_KERNEL_IO_NOCACHE __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
* early identity mapping pte attrib macros.
@@ -281,7 +283,7 @@ enum page_cache_mode {
# include <asm/pgtable_64_types.h>
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
@@ -580,6 +582,6 @@ extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
unsigned long page_flags);
extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
unsigned long numpages);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 919909d8cb77..578441db09f0 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -4,10 +4,11 @@
#include <asm/rmwcc.h>
#include <asm/percpu.h>
-#include <asm/current.h>
#include <linux/static_call_types.h>
+DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count);
+
/* We use the MSB mostly because its available */
#define PREEMPT_NEED_RESCHED 0x80000000
@@ -23,18 +24,18 @@
*/
static __always_inline int preempt_count(void)
{
- return raw_cpu_read_4(pcpu_hot.preempt_count) & ~PREEMPT_NEED_RESCHED;
+ return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
}
static __always_inline void preempt_count_set(int pc)
{
int old, new;
- old = raw_cpu_read_4(pcpu_hot.preempt_count);
+ old = raw_cpu_read_4(__preempt_count);
do {
new = (old & PREEMPT_NEED_RESCHED) |
(pc & ~PREEMPT_NEED_RESCHED);
- } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
+ } while (!raw_cpu_try_cmpxchg_4(__preempt_count, &old, new));
}
/*
@@ -43,7 +44,7 @@ static __always_inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
- per_cpu(pcpu_hot.preempt_count, (cpu)) = PREEMPT_DISABLED; \
+ per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)
/*
@@ -57,17 +58,17 @@ static __always_inline void preempt_count_set(int pc)
static __always_inline void set_preempt_need_resched(void)
{
- raw_cpu_and_4(pcpu_hot.preempt_count, ~PREEMPT_NEED_RESCHED);
+ raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
}
static __always_inline void clear_preempt_need_resched(void)
{
- raw_cpu_or_4(pcpu_hot.preempt_count, PREEMPT_NEED_RESCHED);
+ raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
}
static __always_inline bool test_preempt_need_resched(void)
{
- return !(raw_cpu_read_4(pcpu_hot.preempt_count) & PREEMPT_NEED_RESCHED);
+ return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
}
/*
@@ -76,12 +77,12 @@ static __always_inline bool test_preempt_need_resched(void)
static __always_inline void __preempt_count_add(int val)
{
- raw_cpu_add_4(pcpu_hot.preempt_count, val);
+ raw_cpu_add_4(__preempt_count, val);
}
static __always_inline void __preempt_count_sub(int val)
{
- raw_cpu_add_4(pcpu_hot.preempt_count, -val);
+ raw_cpu_add_4(__preempt_count, -val);
}
/*
@@ -91,7 +92,7 @@ static __always_inline void __preempt_count_sub(int val)
*/
static __always_inline bool __preempt_count_dec_and_test(void)
{
- return GEN_UNARY_RMWcc("decl", __my_cpu_var(pcpu_hot.preempt_count), e,
+ return GEN_UNARY_RMWcc("decl", __my_cpu_var(__preempt_count), e,
__percpu_arg([var]));
}
@@ -100,7 +101,7 @@ static __always_inline bool __preempt_count_dec_and_test(void)
*/
static __always_inline bool should_resched(int preempt_offset)
{
- return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset);
+ return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
}
#ifdef CONFIG_PREEMPTION
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c0cd10182e90..5d2f7e5aff26 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -60,18 +60,13 @@ struct vm86;
# define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
-enum tlb_infos {
- ENTRIES,
- NR_INFO
-};
-
-extern u16 __read_mostly tlb_lli_4k[NR_INFO];
-extern u16 __read_mostly tlb_lli_2m[NR_INFO];
-extern u16 __read_mostly tlb_lli_4m[NR_INFO];
-extern u16 __read_mostly tlb_lld_4k[NR_INFO];
-extern u16 __read_mostly tlb_lld_2m[NR_INFO];
-extern u16 __read_mostly tlb_lld_4m[NR_INFO];
-extern u16 __read_mostly tlb_lld_1g[NR_INFO];
+extern u16 __read_mostly tlb_lli_4k;
+extern u16 __read_mostly tlb_lli_2m;
+extern u16 __read_mostly tlb_lli_4m;
+extern u16 __read_mostly tlb_lld_4k;
+extern u16 __read_mostly tlb_lld_2m;
+extern u16 __read_mostly tlb_lld_4m;
+extern u16 __read_mostly tlb_lld_1g;
/*
* CPU type and hardware bug flags. Kept separately for each CPU.
@@ -234,7 +229,7 @@ static inline unsigned long long l1tf_pfn_limit(void)
void init_cpu_devs(void);
void get_cpu_vendor(struct cpuinfo_x86 *c);
extern void early_cpu_init(void);
-extern void identify_secondary_cpu(struct cpuinfo_x86 *);
+extern void identify_secondary_cpu(unsigned int cpu);
extern void print_cpu_info(struct cpuinfo_x86 *);
void print_cpu_msr(struct cpuinfo_x86 *);
@@ -420,37 +415,33 @@ struct irq_stack {
char stack[IRQ_STACK_SIZE];
} __aligned(IRQ_STACK_SIZE);
+DECLARE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr);
#ifdef CONFIG_X86_64
-struct fixed_percpu_data {
- /*
- * GCC hardcodes the stack canary as %gs:40. Since the
- * irq_stack is the object at %gs:0, we reserve the bottom
- * 48 bytes of the irq stack for the canary.
- *
- * Once we are willing to require -mstack-protector-guard-symbol=
- * support for x86_64 stackprotector, we can get rid of this.
- */
- char gs_base[40];
- unsigned long stack_canary;
-};
+DECLARE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse);
+#else
+DECLARE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr);
+#endif
-DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible;
-DECLARE_INIT_PER_CPU(fixed_percpu_data);
+DECLARE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack);
+/* const-qualified alias provided by the linker. */
+DECLARE_PER_CPU_CACHE_HOT(const unsigned long __percpu_seg_override,
+ const_cpu_current_top_of_stack);
+#ifdef CONFIG_X86_64
static inline unsigned long cpu_kernelmode_gs_base(int cpu)
{
- return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
+#ifdef CONFIG_SMP
+ return per_cpu_offset(cpu);
+#else
+ return 0;
+#endif
}
extern asmlinkage void entry_SYSCALL32_ignore(void);
/* Save actual FS/GS selectors and bases to current->thread */
void current_save_fsgs(void);
-#else /* X86_64 */
-#ifdef CONFIG_STACKPROTECTOR
-DECLARE_PER_CPU(unsigned long, __stack_chk_guard);
-#endif
-#endif /* !X86_64 */
+#endif /* X86_64 */
struct perf_event;
@@ -561,9 +552,9 @@ static __always_inline unsigned long current_top_of_stack(void)
* entry trampoline.
*/
if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
- return this_cpu_read_const(const_pcpu_hot.top_of_stack);
+ return this_cpu_read_const(const_cpu_current_top_of_stack);
- return this_cpu_read_stable(pcpu_hot.top_of_stack);
+ return this_cpu_read_stable(cpu_current_top_of_stack);
}
static __always_inline bool on_thread_stack(void)
@@ -668,8 +659,6 @@ static __always_inline void prefetchw(const void *x)
.sysenter_cs = __KERNEL_CS, \
}
-#define KSTK_ESP(task) (task_pt_regs(task)->sp)
-
#else
extern unsigned long __top_init_kernel_stack[];
@@ -677,8 +666,6 @@ extern unsigned long __top_init_kernel_stack[];
.sp = (unsigned long)&__top_init_kernel_stack, \
}
-extern unsigned long KSTK_ESP(struct task_struct *task);
-
#endif /* CONFIG_X86_64 */
extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
@@ -692,6 +679,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW)
#define KSTK_EIP(task) (task_pt_regs(task)->ip)
+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
/* Get/set a process' ability to use the timestamp counter instruction */
#define GET_TSC_CTL(adr) get_tsc_mode((adr))
@@ -757,6 +745,7 @@ extern enum l1tf_mitigations l1tf_mitigation;
enum mds_mitigations {
MDS_MITIGATION_OFF,
+ MDS_MITIGATION_AUTO,
MDS_MITIGATION_FULL,
MDS_MITIGATION_VMWERV,
};
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 365798cb4408..5d0dbab85264 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -8,7 +8,7 @@
#ifndef _ASM_X86_PROM_H
#define _ASM_X86_PROM_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/of.h>
#include <linux/types.h>
@@ -33,5 +33,5 @@ static inline void x86_flattree_get_config(void) { }
extern char cmd_line[COMMAND_LINE_SIZE];
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 484f4f0131a5..05224a695872 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -15,7 +15,6 @@ void entry_SYSCALL_64(void);
void entry_SYSCALL_64_safe_stack(void);
void entry_SYSRETQ_unsafe_stack(void);
void entry_SYSRETQ_end(void);
-long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
#endif
#ifdef CONFIG_X86_32
@@ -41,6 +40,6 @@ void x86_configure_nx(void);
extern int reboot_force;
-long do_arch_prctl_common(int option, unsigned long arg2);
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
index ab167c96b9ab..88d0a1ab1f77 100644
--- a/arch/x86/include/asm/pti.h
+++ b/arch/x86/include/asm/pti.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PTI_H
#define _ASM_X86_PTI_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
extern void pti_init(void);
@@ -11,5 +11,5 @@ extern void pti_finalize(void);
static inline void pti_check_boottime_disable(void) { }
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_PTI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5a83fbd9bc0b..50f75467f73d 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -6,7 +6,7 @@
#include <asm/page_types.h>
#include <uapi/asm/ptrace.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef __i386__
struct pt_regs {
@@ -469,5 +469,5 @@ extern int do_set_thread_area(struct task_struct *p, int idx,
# define do_set_thread_area_64(p, s, t) (0)
#endif
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_PTRACE_H */
diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h
index 5528e9325049..2fee5e9f1ccc 100644
--- a/arch/x86/include/asm/purgatory.h
+++ b/arch/x86/include/asm/purgatory.h
@@ -2,10 +2,10 @@
#ifndef _ASM_X86_PURGATORY_H
#define _ASM_X86_PURGATORY_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/purgatory.h>
extern void purgatory(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_PURGATORY_H */
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 1436226efe3e..b9fece5fc96d 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PVCLOCK_ABI_H
#define _ASM_X86_PVCLOCK_ABI_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* These structs MUST NOT be changed.
@@ -44,5 +44,5 @@ struct pvclock_wall_clock {
#define PVCLOCK_GUEST_STOPPED (1 << 1)
/* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. */
#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 87e5482acd0d..f607081a022a 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -9,7 +9,7 @@
#define TH_FLAGS_SME_ACTIVE_BIT 0
#define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <asm/io.h>
@@ -95,6 +95,6 @@ void reserve_real_mode(void);
void load_trampoline_pgtable(void);
void init_real_mode(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
deleted file mode 100644
index e9187ddd3d1f..000000000000
--- a/arch/x86/include/asm/required-features.h
+++ /dev/null
@@ -1,105 +0,0 @@
-#ifndef _ASM_X86_REQUIRED_FEATURES_H
-#define _ASM_X86_REQUIRED_FEATURES_H
-
-/* Define minimum CPUID feature set for kernel These bits are checked
- really early to actually display a visible error message before the
- kernel dies. Make sure to assign features to the proper mask!
-
- Some requirements that are not in CPUID yet are also in the
- CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too.
-
- The real information is in arch/x86/Kconfig.cpu, this just converts
- the CONFIGs into a bitmask */
-
-#ifndef CONFIG_MATH_EMULATION
-# define NEED_FPU (1<<(X86_FEATURE_FPU & 31))
-#else
-# define NEED_FPU 0
-#endif
-
-#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
-# define NEED_PAE (1<<(X86_FEATURE_PAE & 31))
-#else
-# define NEED_PAE 0
-#endif
-
-#ifdef CONFIG_X86_CMPXCHG64
-# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31))
-#else
-# define NEED_CX8 0
-#endif
-
-#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64)
-# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31))
-#else
-# define NEED_CMOV 0
-#endif
-
-# define NEED_3DNOW 0
-
-#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
-# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31))
-#else
-# define NEED_NOPL 0
-#endif
-
-#ifdef CONFIG_MATOM
-# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31))
-#else
-# define NEED_MOVBE 0
-#endif
-
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_PARAVIRT_XXL
-/* Paravirtualized systems may not have PSE or PGE available */
-#define NEED_PSE 0
-#define NEED_PGE 0
-#else
-#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31)
-#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31)
-#endif
-#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
-#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
-#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
-#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
-#define NEED_LM (1<<(X86_FEATURE_LM & 31))
-#else
-#define NEED_PSE 0
-#define NEED_MSR 0
-#define NEED_PGE 0
-#define NEED_FXSR 0
-#define NEED_XMM 0
-#define NEED_XMM2 0
-#define NEED_LM 0
-#endif
-
-#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\
- NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\
- NEED_XMM|NEED_XMM2)
-#define SSE_MASK (NEED_XMM|NEED_XMM2)
-
-#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW)
-
-#define REQUIRED_MASK2 0
-#define REQUIRED_MASK3 (NEED_NOPL)
-#define REQUIRED_MASK4 (NEED_MOVBE)
-#define REQUIRED_MASK5 0
-#define REQUIRED_MASK6 0
-#define REQUIRED_MASK7 0
-#define REQUIRED_MASK8 0
-#define REQUIRED_MASK9 0
-#define REQUIRED_MASK10 0
-#define REQUIRED_MASK11 0
-#define REQUIRED_MASK12 0
-#define REQUIRED_MASK13 0
-#define REQUIRED_MASK14 0
-#define REQUIRED_MASK15 0
-#define REQUIRED_MASK16 0
-#define REQUIRED_MASK17 0
-#define REQUIRED_MASK18 0
-#define REQUIRED_MASK19 0
-#define REQUIRED_MASK20 0
-#define REQUIRED_MASK21 0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22)
-
-#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 8b1b6ce1e51b..011bf67a1866 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -4,8 +4,10 @@
#ifdef CONFIG_X86_CPU_RESCTRL
-#include <linux/sched.h>
#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/resctrl_types.h>
+#include <linux/sched.h>
/*
* This value can never be a valid CLOSID, and is used when mapping a
@@ -40,6 +42,7 @@ DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state);
extern bool rdt_alloc_capable;
extern bool rdt_mon_capable;
+extern unsigned int rdt_mon_features;
DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
@@ -79,6 +82,21 @@ static inline void resctrl_arch_disable_mon(void)
static_branch_dec_cpuslocked(&rdt_enable_key);
}
+static inline bool resctrl_arch_is_llc_occupancy_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
+}
+
+static inline bool resctrl_arch_is_mbm_total_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
+}
+
+static inline bool resctrl_arch_is_mbm_local_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
+}
+
/*
* __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
*
@@ -96,8 +114,8 @@ static inline void resctrl_arch_disable_mon(void)
static inline void __resctrl_sched_in(struct task_struct *tsk)
{
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
- u32 closid = state->default_closid;
- u32 rmid = state->default_rmid;
+ u32 closid = READ_ONCE(state->default_closid);
+ u32 rmid = READ_ONCE(state->default_rmid);
u32 tmp;
/*
@@ -132,6 +150,13 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
return val * scale;
}
+static inline void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid,
+ u32 rmid)
+{
+ WRITE_ONCE(per_cpu(pqr_state.default_closid, cpu), closid);
+ WRITE_ONCE(per_cpu(pqr_state.default_rmid, cpu), rmid);
+}
+
static inline void resctrl_arch_set_closid_rmid(struct task_struct *tsk,
u32 closid, u32 rmid)
{
@@ -178,6 +203,11 @@ static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid
static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid,
void *ctx) { };
+u64 resctrl_arch_get_prefetch_disable_bits(void);
+int resctrl_arch_pseudo_lock_fn(void *_plr);
+int resctrl_arch_measure_cycles_lat_fn(void *_plr);
+int resctrl_arch_measure_l2_residency(void *_plr);
+int resctrl_arch_measure_l3_residency(void *_plr);
void resctrl_cpu_detect(struct cpuinfo_x86 *c);
#else
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 363266cbcada..3821ee3fae35 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -29,7 +29,7 @@ cc_label: c = true; \
#define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \
({ \
bool c; \
- asm volatile (fullop CC_SET(cc) \
+ asm_inline volatile (fullop CC_SET(cc) \
: [var] "+m" (_var), CC_OUT(cc) (c) \
: __VA_ARGS__ : clobbers); \
c; \
diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h
index 6652ebddfd02..8d983cfd06ea 100644
--- a/arch/x86/include/asm/runtime-const.h
+++ b/arch/x86/include/asm/runtime-const.h
@@ -2,6 +2,18 @@
#ifndef _ASM_RUNTIME_CONST_H
#define _ASM_RUNTIME_CONST_H
+#ifdef __ASSEMBLY__
+
+.macro RUNTIME_CONST_PTR sym reg
+ movq $0x0123456789abcdef, %\reg
+ 1:
+ .pushsection runtime_ptr_\sym, "a"
+ .long 1b - 8 - .
+ .popsection
+.endm
+
+#else /* __ASSEMBLY__ */
+
#define runtime_const_ptr(sym) ({ \
typeof(sym) __ret; \
asm_inline("mov %1,%0\n1:\n" \
@@ -58,4 +70,5 @@ static inline void runtime_const_fixup(void (*fn)(void *, unsigned long),
}
}
+#endif /* __ASSEMBLY__ */
#endif
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 9d6411c65920..77d8f49b92bd 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -233,7 +233,7 @@
#define VDSO_CPUNODE_BITS 12
#define VDSO_CPUNODE_MASK 0xfff
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* Helper functions to store/load CPU and node numbers */
@@ -265,7 +265,7 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node)
*node = (p >> VDSO_CPUNODE_BITS);
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#ifdef __KERNEL__
@@ -286,7 +286,7 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node)
*/
#define XEN_EARLY_IDT_HANDLER_SIZE (8 + ENDBR_INSN_SIZE)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
extern void early_ignore_irq(void);
@@ -350,7 +350,7 @@ static inline void __loadsegment_fs(unsigned short value)
#define savesegment(seg, value) \
asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_X86_SEGMENT_H */
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index cc62ef70ccc0..8d9f1c9aaa4c 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -2,7 +2,6 @@
#ifndef _ASM_X86_SET_MEMORY_H
#define _ASM_X86_SET_MEMORY_H
-#include <linux/mm.h>
#include <asm/page.h>
#include <asm-generic/set_memory.h>
@@ -38,7 +37,6 @@ int set_memory_rox(unsigned long addr, int numpages);
* The caller is required to take care of these.
*/
-int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot);
int _set_memory_uc(unsigned long addr, int numpages);
int _set_memory_wc(unsigned long addr, int numpages);
int _set_memory_wt(unsigned long addr, int numpages);
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 85f4fde3515c..ad9212df0ec0 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -27,7 +27,7 @@
#define OLD_CL_ADDRESS 0x020 /* Relative to real mode data */
#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/cache.h>
#include <asm/bootparam.h>
@@ -46,6 +46,7 @@ void setup_bios_corruption_check(void);
void early_platform_quirks(void);
extern unsigned long saved_video_mode;
+extern unsigned long acpi_realmode_flags;
extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
@@ -141,7 +142,7 @@ extern bool builtin_cmdline_added __ro_after_init;
#define builtin_cmdline_added 0
#endif
-#else /* __ASSEMBLY */
+#else /* __ASSEMBLER__ */
.macro __RESERVE_BRK name, size
.pushsection .bss..brk, "aw"
@@ -153,6 +154,6 @@ SYM_DATA_END(__brk_\name)
#define RESERVE_BRK(name, size) __RESERVE_BRK name, size
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_SETUP_H */
diff --git a/arch/x86/include/asm/setup_data.h b/arch/x86/include/asm/setup_data.h
index 77c51111a893..7bb16f843c93 100644
--- a/arch/x86/include/asm/setup_data.h
+++ b/arch/x86/include/asm/setup_data.h
@@ -4,7 +4,7 @@
#include <uapi/asm/setup_data.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct pci_setup_rom {
struct setup_data data;
@@ -27,6 +27,6 @@ struct efi_setup_data {
u64 reserved[8];
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_SETUP_DATA_H */
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index dcbccdb280f9..acb85b9346d8 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -212,8 +212,16 @@ struct snp_psc_desc {
#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
/*
- * Error codes related to GHCB input that can be communicated back to the guest
- * by setting the lower 32-bits of the GHCB SW_EXITINFO1 field to 2.
+ * GHCB-defined return codes that are communicated back to the guest via
+ * SW_EXITINFO1.
+ */
+#define GHCB_HV_RESP_NO_ACTION 0
+#define GHCB_HV_RESP_ISSUE_EXCEPTION 1
+#define GHCB_HV_RESP_MALFORMED_INPUT 2
+
+/*
+ * GHCB-defined sub-error codes for malformed input (see above) that are
+ * communicated back to the guest via SW_EXITINFO2[31:0].
*/
#define GHCB_ERR_NOT_REGISTERED 1
#define GHCB_ERR_INVALID_USAGE 2
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index fcbbef484a78..a28ff6b14145 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -106,7 +106,7 @@
#define TDX_PS_1G 2
#define TDX_PS_NR (TDX_PS_1G + 1)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/compiler_attributes.h>
@@ -177,5 +177,5 @@ static __always_inline u64 hcall_func(u64 exit_reason)
return exit_reason;
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_SHARED_TDX_H */
diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
index 4cb77e004615..ba6f2fe43848 100644
--- a/arch/x86/include/asm/shstk.h
+++ b/arch/x86/include/asm/shstk.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_SHSTK_H
#define _ASM_X86_SHSTK_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
struct task_struct;
@@ -37,6 +37,6 @@ static inline int shstk_update_last_frame(unsigned long val) { return 0; }
static inline bool shstk_is_enabled(void) { return false; }
#endif /* CONFIG_X86_USER_SHADOW_STACK */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_SHSTK_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 4a4043ca6493..c72d46175374 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_SIGNAL_H
#define _ASM_X86_SIGNAL_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/linkage.h>
/* Most things should be clean enough to redefine this at will, if care
@@ -28,9 +28,9 @@ typedef struct {
#define SA_IA32_ABI 0x02000000u
#define SA_X32_ABI 0x01000000u
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#include <uapi/asm/signal.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define __ARCH_HAS_SA_RESTORER
@@ -101,5 +101,5 @@ struct pt_regs;
#endif /* !__i386__ */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_SIGNAL_H */
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 2de1e5a75c57..daea94c2993c 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -13,7 +13,7 @@
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define ASM_CLAC \
ALTERNATIVE "", "clac", X86_FEATURE_SMAP
@@ -21,7 +21,7 @@
#define ASM_STAC \
ALTERNATIVE "", "stac", X86_FEATURE_SMAP
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
static __always_inline void clac(void)
{
@@ -61,6 +61,6 @@ static __always_inline void smap_restore(unsigned long flags)
#define ASM_STAC \
ALTERNATIVE("", "stac", X86_FEATURE_SMAP)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_SMAP_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index ca073f40698f..0c1c68039d6f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -1,12 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SMP_H
#define _ASM_X86_SMP_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/cpumask.h>
+#include <linux/thread_info.h>
#include <asm/cpumask.h>
-#include <asm/current.h>
-#include <asm/thread_info.h>
+
+DECLARE_PER_CPU_CACHE_HOT(int, cpu_number);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
@@ -114,13 +115,12 @@ void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);
void smp_kick_mwait_play_dead(void);
+void __noreturn mwait_play_dead(unsigned int eax_hint);
void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
-void smp_store_cpu_info(int id);
-
asmlinkage __visible void smp_reboot_interrupt(void);
__visible void smp_reschedule_interrupt(struct pt_regs *regs);
__visible void smp_call_function_interrupt(struct pt_regs *regs);
@@ -133,14 +133,8 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r);
* This function is needed by all SMP systems. It must _always_ be valid
* from the initial startup.
*/
-#define raw_smp_processor_id() this_cpu_read(pcpu_hot.cpu_number)
-#define __smp_processor_id() __this_cpu_read(pcpu_hot.cpu_number)
-
-#ifdef CONFIG_X86_32
-extern int safe_smp_processor_id(void);
-#else
-# define safe_smp_processor_id() smp_processor_id()
-#endif
+#define raw_smp_processor_id() this_cpu_read(cpu_number)
+#define __smp_processor_id() __this_cpu_read(cpu_number)
static inline struct cpumask *cpu_llc_shared_mask(int cpu)
{
@@ -164,6 +158,8 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
{
return (struct cpumask *)cpumask_of(0);
}
+
+static inline void __noreturn mwait_play_dead(unsigned int eax_hint) { BUG(); }
#endif /* CONFIG_SMP */
#ifdef CONFIG_DEBUG_NMI_SELFTEST
@@ -175,7 +171,7 @@ extern void nmi_selftest(void);
extern unsigned int smpboot_control;
extern unsigned long apic_mmio_base;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/* Control bits for startup_64 */
#define STARTUP_READ_APICID 0x80000000
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 03e7c2d49559..6266d6b9e0b8 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -42,14 +42,14 @@ static __always_inline void native_write_cr2(unsigned long val)
asm volatile("mov %0,%%cr2": : "r" (val) : "memory");
}
-static inline unsigned long __native_read_cr3(void)
+static __always_inline unsigned long __native_read_cr3(void)
{
unsigned long val;
asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER);
return val;
}
-static inline void native_write_cr3(unsigned long val)
+static __always_inline void native_write_cr3(unsigned long val)
{
asm volatile("mov %0,%%cr3": : "r" (val) : "memory");
}
@@ -176,9 +176,8 @@ static __always_inline void clflush(volatile void *__p)
static inline void clflushopt(volatile void *__p)
{
- alternative_io(".byte 0x3e; clflush %0",
- ".byte 0x66; clflush %0",
- X86_FEATURE_CLFLUSHOPT,
+ alternative_io("ds clflush %0",
+ "clflushopt %0", X86_FEATURE_CLFLUSHOPT,
"+m" (*(volatile char __force *)__p));
}
@@ -186,14 +185,11 @@ static inline void clwb(volatile void *__p)
{
volatile struct { char x[64]; } *p = __p;
- asm volatile(ALTERNATIVE_2(
- ".byte 0x3e; clflush (%[pax])",
- ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
- X86_FEATURE_CLFLUSHOPT,
- ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */
- X86_FEATURE_CLWB)
- : [p] "+m" (*p)
- : [pax] "a" (p));
+ asm_inline volatile(ALTERNATIVE_2(
+ "ds clflush %0",
+ "clflushopt %0", X86_FEATURE_CLFLUSHOPT,
+ "clwb %0", X86_FEATURE_CLWB)
+ : "+m" (*p));
}
#ifdef CONFIG_X86_USER_SHADOW_STACK
diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h
deleted file mode 100644
index e0975e9c4f47..000000000000
--- a/arch/x86/include/asm/sta2x11.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Header file for STMicroelectronics ConneXt (STA2X11) IOHub
- */
-#ifndef __ASM_STA2X11_H
-#define __ASM_STA2X11_H
-
-#include <linux/pci.h>
-
-/* This needs to be called from the MFD to configure its sub-devices */
-struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev);
-
-#endif /* __ASM_STA2X11_H */
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 00473a650f51..cd761b14eb02 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -2,26 +2,10 @@
/*
* GCC stack protector support.
*
- * Stack protector works by putting predefined pattern at the start of
+ * Stack protector works by putting a predefined pattern at the start of
* the stack frame and verifying that it hasn't been overwritten when
- * returning from the function. The pattern is called stack canary
- * and unfortunately gcc historically required it to be at a fixed offset
- * from the percpu segment base. On x86_64, the offset is 40 bytes.
- *
- * The same segment is shared by percpu area and stack canary. On
- * x86_64, percpu symbols are zero based and %gs (64-bit) points to the
- * base of percpu area. The first occupant of the percpu area is always
- * fixed_percpu_data which contains stack_canary at the appropriate
- * offset. On x86_32, the stack canary is just a regular percpu
- * variable.
- *
- * Putting percpu data in %fs on 32-bit is a minor optimization compared to
- * using %gs. Since 32-bit userspace normally has %fs == 0, we are likely
- * to load 0 into %fs on exit to usermode, whereas with percpu data in
- * %gs, we are likely to load a non-null %gs on return to user mode.
- *
- * Once we are willing to require GCC 8.1 or better for 64-bit stackprotector
- * support, we can remove some of this complexity.
+ * returning from the function. The pattern is called the stack canary
+ * and is a unique value for each task.
*/
#ifndef _ASM_STACKPROTECTOR_H
@@ -36,6 +20,8 @@
#include <linux/sched.h>
+DECLARE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard);
+
/*
* Initialize the stackprotector canary value.
*
@@ -51,25 +37,13 @@ static __always_inline void boot_init_stack_canary(void)
{
unsigned long canary = get_random_canary();
-#ifdef CONFIG_X86_64
- BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40);
-#endif
-
current->stack_canary = canary;
-#ifdef CONFIG_X86_64
- this_cpu_write(fixed_percpu_data.stack_canary, canary);
-#else
this_cpu_write(__stack_chk_guard, canary);
-#endif
}
static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle)
{
-#ifdef CONFIG_X86_64
- per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary;
-#else
per_cpu(__stack_chk_guard, cpu) = idle->stack_canary;
-#endif
}
#else /* STACKPROTECTOR */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 9d0b324eab21..79e9695dc13e 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -21,6 +21,7 @@ extern void *__memcpy(void *to, const void *from, size_t len);
#define __HAVE_ARCH_MEMSET
void *memset(void *s, int c, size_t n);
void *__memset(void *s, int c, size_t n);
+KCFI_REFERENCE(__memset);
/*
* KMSAN needs to instrument as much code as possible. Use C versions of
@@ -70,6 +71,7 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
#define __HAVE_ARCH_MEMMOVE
void *memmove(void *dest, const void *src, size_t count);
void *__memmove(void *dest, const void *src, size_t count);
+KCFI_REFERENCE(__memmove);
int memcmp(const void *cs, const void *ct, size_t count);
size_t strlen(const char *s);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index e2fac21471f5..9b7fa99ae951 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -116,6 +116,7 @@ enum {
INTERCEPT_INVPCID,
INTERCEPT_MCOMMIT,
INTERCEPT_TLBSYNC,
+ INTERCEPT_IDLE_HLT = 166,
};
@@ -290,10 +291,6 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_
#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
-#define SVM_SEV_FEAT_INT_INJ_MODES \
- (SVM_SEV_FEAT_RESTRICTED_INJECTION | \
- SVM_SEV_FEAT_ALTERNATE_INJECTION)
-
struct vmcb_seg {
u16 selector;
u16 attrib;
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
index 6d8d6bc183b7..cd21a0405ac5 100644
--- a/arch/x86/include/asm/sync_bitops.h
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -31,7 +31,7 @@
*/
static inline void sync_set_bit(long nr, volatile unsigned long *addr)
{
- asm volatile("lock; " __ASM_SIZE(bts) " %1,%0"
+ asm volatile("lock " __ASM_SIZE(bts) " %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
@@ -49,7 +49,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr)
*/
static inline void sync_clear_bit(long nr, volatile unsigned long *addr)
{
- asm volatile("lock; " __ASM_SIZE(btr) " %1,%0"
+ asm volatile("lock " __ASM_SIZE(btr) " %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
@@ -66,7 +66,7 @@ static inline void sync_clear_bit(long nr, volatile unsigned long *addr)
*/
static inline void sync_change_bit(long nr, volatile unsigned long *addr)
{
- asm volatile("lock; " __ASM_SIZE(btc) " %1,%0"
+ asm volatile("lock " __ASM_SIZE(btc) " %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
@@ -82,7 +82,7 @@ static inline void sync_change_bit(long nr, volatile unsigned long *addr)
*/
static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr)
{
- return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(bts), *addr, c, "Ir", nr);
+ return GEN_BINARY_RMWcc("lock " __ASM_SIZE(bts), *addr, c, "Ir", nr);
}
/**
@@ -95,7 +95,7 @@ static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr)
*/
static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr)
{
- return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btr), *addr, c, "Ir", nr);
+ return GEN_BINARY_RMWcc("lock " __ASM_SIZE(btr), *addr, c, "Ir", nr);
}
/**
@@ -108,7 +108,7 @@ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr)
*/
static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr)
{
- return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btc), *addr, c, "Ir", nr);
+ return GEN_BINARY_RMWcc("lock " __ASM_SIZE(btc), *addr, c, "Ir", nr);
}
#define sync_test_bit(nr, addr) test_bit(nr, addr)
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index b4b16dafd55e..65394aa9b49f 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -30,7 +30,7 @@
#define TDX_SUCCESS 0ULL
#define TDX_RND_NO_ENTROPY 0x8000020300000000ULL
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <uapi/asm/mce.h>
@@ -126,5 +126,5 @@ static inline int tdx_enable(void) { return -ENODEV; }
static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
#endif /* CONFIG_INTEL_TDX_HOST */
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_TDX_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a55c214f3ba6..9282465eea21 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -54,7 +54,7 @@
* - this struct should fit entirely inside of one cache line
* - this struct shares the supervisor stack pages
*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct task_struct;
#include <asm/cpufeature.h>
#include <linux/atomic.h>
@@ -73,7 +73,7 @@ struct thread_info {
.flags = 0, \
}
-#else /* !__ASSEMBLY__ */
+#else /* !__ASSEMBLER__ */
#include <asm/asm-offsets.h>
@@ -161,7 +161,7 @@ struct thread_info {
*
* preempt_count needs to be 1 initially, until the scheduler is functional.
*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* Walks up the stack frames to make sure that the specified object is
@@ -213,7 +213,7 @@ static inline int arch_within_stack_frames(const void * const stack,
#endif
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/*
* Thread-synchronous status.
@@ -224,7 +224,7 @@ static inline int arch_within_stack_frames(const void * const stack,
*/
#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_COMPAT
#define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */
@@ -242,6 +242,6 @@ static inline int arch_within_stack_frames(const void * const stack,
extern void arch_setup_new_exec(void);
#define arch_setup_new_exec arch_setup_new_exec
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 77f52bc1578a..866ea78ba156 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -6,6 +6,9 @@
static inline void tlb_flush(struct mmu_gather *tlb);
#include <asm-generic/tlb.h>
+#include <linux/kernel.h>
+#include <vdso/bits.h>
+#include <vdso/page.h>
static inline void tlb_flush(struct mmu_gather *tlb)
{
@@ -25,4 +28,139 @@ static inline void invlpg(unsigned long addr)
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
}
+enum addr_stride {
+ PTE_STRIDE = 0,
+ PMD_STRIDE = 1
+};
+
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - FLAG_PCID: invalidate all TLB entries matching the PCID
+ *
+ * The first is used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The latter invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_FLAG_VA BIT(0)
+#define INVLPGB_FLAG_PCID BIT(1)
+#define INVLPGB_FLAG_ASID BIT(2)
+#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
+#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
+#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
+
+/* The implied mode when all bits are clear: */
+#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+/*
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
+ *
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
+ * be done in a parallel fashion.
+ *
+ * The instruction takes the number of extra pages to invalidate, beyond the
+ * first page, while __invlpgb gets the more human readable number of pages to
+ * invalidate.
+ *
+ * The bits in rax[0:2] determine respectively which components of the address
+ * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any*
+ * address in the specified range matches.
+ *
+ * Since it is desired to only flush TLB entries for the ASID that is executing
+ * the instruction (a host/hypervisor or a guest), the ASID valid bit should
+ * always be set. On a host/hypervisor, the hardware will use the ASID value
+ * specified in EDX[15:0] (which should be 0). On a guest, the hardware will
+ * use the actual ASID value of the guest.
+ *
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
+ * this CPU have completed.
+ */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+ unsigned long addr, u16 nr_pages,
+ enum addr_stride stride, u8 flags)
+{
+ u64 rax = addr | flags | INVLPGB_FLAG_ASID;
+ u32 ecx = (stride << 31) | (nr_pages - 1);
+ u32 edx = (pcid << 16) | asid;
+
+ /* The low bits in rax are for flags. Verify addr is clean. */
+ VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
+
+ /* INVLPGB; supported in binutils >= 2.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx));
+}
+
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags)
+{
+ __invlpgb(asid, pcid, 0, 1, 0, flags);
+}
+
+static inline void __tlbsync(void)
+{
+ /*
+ * TLBSYNC waits for INVLPGB instructions originating on the same CPU
+ * to have completed. Print a warning if the task has been migrated,
+ * and might not be waiting on all the INVLPGBs issued during this TLB
+ * invalidation sequence.
+ */
+ cant_migrate();
+
+ /* TLBSYNC: supported in binutils >= 0.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
+}
+#else
+/* Some compilers (I'm looking at you clang!) simply can't do DCE */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+ unsigned long addr, u16 nr_pages,
+ enum addr_stride s, u8 flags) { }
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { }
+static inline void __tlbsync(void) { }
+#endif
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr, bool stride)
+{
+ enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE;
+ u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA;
+
+ __invlpgb(0, pcid, addr, nr, str, flags);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+ __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invlpgb_flush_all(void)
+{
+ /*
+ * TLBSYNC at the end needs to make sure all flushes done on the
+ * current CPU have been executed system-wide. Therefore, make
+ * sure nothing gets migrated in-between but disable preemption
+ * as it is cheaper.
+ */
+ guard(preempt)();
+ __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL);
+ __tlbsync();
+}
+
+/* Flush addr, including globals, for all PCIDs. */
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+ __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invlpgb_flush_all_nonglobals(void)
+{
+ guard(preempt)();
+ __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS);
+ __tlbsync();
+}
#endif /* _ASM_X86_TLB_H */
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
index 1ad56eb3e8a8..80aaf64ff25f 100644
--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -10,6 +10,11 @@ struct arch_tlbflush_unmap_batch {
* the PFNs being flushed..
*/
struct cpumask cpumask;
+ /*
+ * Set if pages were unmapped from any MM, even one that does not
+ * have active CPUs in its cpumask.
+ */
+ bool unmapped_pages;
};
#endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 02fc2aa06e9e..a9af8759de34 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -6,6 +6,7 @@
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
+#include <asm/barrier.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
@@ -183,6 +184,9 @@ static inline void cr4_init_shadow(void)
extern unsigned long mmu_cr4_features;
extern u32 *trampoline_cr4_features;
+/* How many pages can be invalidated with one INVLPGB. */
+extern u16 invlpgb_count_max;
+
extern void initialize_tlbstate_and_flush(void);
/*
@@ -231,6 +235,71 @@ void flush_tlb_one_kernel(unsigned long addr);
void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
+static inline bool is_dyn_asid(u16 asid)
+{
+ return asid < TLB_NR_DYN_ASIDS;
+}
+
+static inline bool is_global_asid(u16 asid)
+{
+ return !is_dyn_asid(asid);
+}
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+static inline u16 mm_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return 0;
+
+ asid = smp_load_acquire(&mm->context.global_asid);
+
+ /* mm->context.global_asid is either 0, or a global ASID */
+ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid));
+
+ return asid;
+}
+
+static inline void mm_init_global_asid(struct mm_struct *mm)
+{
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ mm->context.global_asid = 0;
+ mm->context.asid_transition = false;
+ }
+}
+
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
+{
+ /*
+ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
+ * finish_asid_transition() needs to observe asid_transition = true
+ * once it observes global_asid.
+ */
+ mm->context.asid_transition = true;
+ smp_store_release(&mm->context.global_asid, asid);
+}
+
+static inline void mm_clear_asid_transition(struct mm_struct *mm)
+{
+ WRITE_ONCE(mm->context.asid_transition, false);
+}
+
+static inline bool mm_in_asid_transition(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ return mm && READ_ONCE(mm->context.asid_transition);
+}
+#else
+static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
+static inline void mm_init_global_asid(struct mm_struct *mm) { }
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+static inline void mm_clear_asid_transition(struct mm_struct *mm) { }
+static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
+#endif /* CONFIG_BROADCAST_TLB_FLUSH */
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
@@ -242,7 +311,7 @@ void flush_tlb_multi(const struct cpumask *cpumask,
flush_tlb_mm_range((vma)->vm_mm, start, end, \
((vma)->vm_flags & VM_HUGETLB) \
? huge_page_shift(hstate_vma(vma)) \
- : PAGE_SHIFT, false)
+ : PAGE_SHIFT, true)
extern void flush_tlb_all(void);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
@@ -284,6 +353,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b
{
inc_mm_tlb_gen(mm);
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ batch->unmapped_pages = true;
mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index ec134b719144..6c79ee7c0957 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -229,11 +229,11 @@ static inline bool topology_is_primary_thread(unsigned int cpu)
{
return cpumask_test_cpu(cpu, cpu_primary_thread_mask);
}
+#define topology_is_primary_thread topology_is_primary_thread
#else /* CONFIG_SMP */
static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
static inline int topology_max_smt_threads(void) { return 1; }
-static inline bool topology_is_primary_thread(unsigned int cpu) { return true; }
static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; }
#endif /* !CONFIG_SMP */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 1f1deaecd364..869b88061801 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -35,8 +35,6 @@ static inline int get_si_code(unsigned long condition)
return TRAP_BRKPT;
}
-extern int panic_on_unrecovered_nmi;
-
void math_emulate(struct math_emu_info *);
bool fault_in_kernel_space(unsigned long address);
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 85cc57cb6539..8f4579c5a6f8 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -5,7 +5,7 @@
#include "orc_types.h"
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
.macro UNWIND_HINT_END_OF_STACK
UNWIND_HINT type=UNWIND_HINT_TYPE_END_OF_STACK
@@ -88,6 +88,6 @@
#define UNWIND_HINT_RESTORE \
UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_UNWIND_HINTS_H */
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index d7f6592b74a9..80be0da733df 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -18,12 +18,6 @@ struct vdso_image {
unsigned long extable_base, extable_len;
const void *extable;
- long sym_vvar_start; /* Negative offset to the vvar area */
-
- long sym_vvar_page;
- long sym_pvclock_page;
- long sym_hvclock_page;
- long sym_timens_page;
long sym_VDSO32_NOTE_MASK;
long sym___kernel_sigreturn;
long sym___kernel_rt_sigreturn;
diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h
index 2bf9c0e970c3..ff1c11b9fa27 100644
--- a/arch/x86/include/asm/vdso/getrandom.h
+++ b/arch/x86/include/asm/vdso/getrandom.h
@@ -5,7 +5,7 @@
#ifndef __ASM_VDSO_GETRANDOM_H
#define __ASM_VDSO_GETRANDOM_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/unistd.h>
@@ -27,16 +27,6 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsig
return ret;
}
-extern struct vdso_rng_data vdso_rng_data
- __attribute__((visibility("hidden")));
-
-static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
-{
- if (IS_ENABLED(CONFIG_TIME_NS) && __arch_get_vdso_data()->clock_mode == VDSO_CLOCKMODE_TIMENS)
- return (void *)&vdso_rng_data + ((void *)&timens_page - (void *)__arch_get_vdso_data());
- return &vdso_rng_data;
-}
-
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index 375a34b0f365..73b2e7ee8f0f 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -10,7 +10,7 @@
#ifndef __ASM_VDSO_GETTIMEOFDAY_H
#define __ASM_VDSO_GETTIMEOFDAY_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <uapi/linux/time.h>
#include <asm/vgtod.h>
@@ -19,12 +19,6 @@
#include <asm/pvclock.h>
#include <clocksource/hyperv_timer.h>
-extern struct vdso_data vvar_page
- __attribute__((visibility("hidden")));
-
-extern struct vdso_data timens_page
- __attribute__((visibility("hidden")));
-
#define VDSO_HAS_TIME 1
#define VDSO_HAS_CLOCK_GETRES 1
@@ -59,14 +53,6 @@ extern struct ms_hyperv_tsc_page hvclock_page
__attribute__((visibility("hidden")));
#endif
-#ifdef CONFIG_TIME_NS
-static __always_inline
-const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
-{
- return &timens_page;
-}
-#endif
-
#ifndef BUILD_VDSO32
static __always_inline
@@ -250,7 +236,7 @@ static u64 vread_hvclock(void)
#endif
static inline u64 __arch_get_hw_counter(s32 clock_mode,
- const struct vdso_data *vd)
+ const struct vdso_time_data *vd)
{
if (likely(clock_mode == VDSO_CLOCKMODE_TSC))
return (u64)rdtsc_ordered() & S64_MAX;
@@ -275,12 +261,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode,
return U64_MAX;
}
-static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
-{
- return &vvar_page;
-}
-
-static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd)
+static inline bool arch_vdso_clocksource_ok(const struct vdso_clock *vc)
{
return true;
}
@@ -319,37 +300,37 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
* declares everything with the MSB/Sign-bit set as invalid. Therefore the
* effective mask is S64_MAX.
*/
-static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 cycles, u64 base)
+static __always_inline u64 vdso_calc_ns(const struct vdso_clock *vc, u64 cycles, u64 base)
{
- u64 delta = cycles - vd->cycle_last;
+ u64 delta = cycles - vc->cycle_last;
/*
* Negative motion and deltas which can cause multiplication
* overflow require special treatment. This check covers both as
- * negative motion is guaranteed to be greater than @vd::max_cycles
+ * negative motion is guaranteed to be greater than @vc::max_cycles
* due to unsigned comparison.
*
* Due to the MSB/Sign-bit being used as invalid marker (see
* arch_vdso_cycles_ok() above), the effective mask is S64_MAX, but that
* case is also unlikely and will also take the unlikely path here.
*/
- if (unlikely(delta > vd->max_cycles)) {
+ if (unlikely(delta > vc->max_cycles)) {
/*
* Due to the above mentioned TSC wobbles, filter out
* negative motion. Per the above masking, the effective
* sign bit is now bit 62.
*/
if (delta & (1ULL << 62))
- return base >> vd->shift;
+ return base >> vc->shift;
/* Handle multiplication overflow gracefully */
- return mul_u64_u32_add_u64_shr(delta & S64_MAX, vd->mult, base, vd->shift);
+ return mul_u64_u32_add_u64_shr(delta & S64_MAX, vc->mult, base, vc->shift);
}
- return ((delta * vd->mult) + base) >> vd->shift;
+ return ((delta * vc->mult) + base) >> vc->shift;
}
#define vdso_calc_ns vdso_calc_ns
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h
index 2cbce97d29ea..c9b2ba7a9ec4 100644
--- a/arch/x86/include/asm/vdso/processor.h
+++ b/arch/x86/include/asm/vdso/processor.h
@@ -5,7 +5,7 @@
#ifndef __ASM_VDSO_PROCESSOR_H
#define __ASM_VDSO_PROCESSOR_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
static __always_inline void rep_nop(void)
@@ -22,6 +22,6 @@ struct getcpu_cache;
notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_VDSO_PROCESSOR_H */
diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h
index 37b4a70559a8..4aa311a923f2 100644
--- a/arch/x86/include/asm/vdso/vsyscall.h
+++ b/arch/x86/include/asm/vdso/vsyscall.h
@@ -2,40 +2,21 @@
#ifndef __ASM_VDSO_VSYSCALL_H
#define __ASM_VDSO_VSYSCALL_H
-#define __VDSO_RND_DATA_OFFSET 640
-#define __VVAR_PAGES 4
+#define __VDSO_PAGES 6
#define VDSO_NR_VCLOCK_PAGES 2
+#define VDSO_VCLOCK_PAGES_START(_b) ((_b) + (__VDSO_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE)
#define VDSO_PAGE_PVCLOCK_OFFSET 0
#define VDSO_PAGE_HVCLOCK_OFFSET 1
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <vdso/datapage.h>
#include <asm/vgtod.h>
-extern struct vdso_data *vdso_data;
-
-/*
- * Update the vDSO data page to keep in sync with kernel timekeeping.
- */
-static __always_inline
-struct vdso_data *__x86_get_k_vdso_data(void)
-{
- return vdso_data;
-}
-#define __arch_get_k_vdso_data __x86_get_k_vdso_data
-
-static __always_inline
-struct vdso_rng_data *__x86_get_k_vdso_rng_data(void)
-{
- return (void *)vdso_data + __VDSO_RND_DATA_OFFSET;
-}
-#define __arch_get_k_vdso_rng_data __x86_get_k_vdso_rng_data
-
/* The asm-generic header needs to be included after the definitions above */
#include <asm-generic/vdso/vsyscall.h>
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __ASM_VDSO_VSYSCALL_H */
diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h
index 75884d2cdec3..5d471253c755 100644
--- a/arch/x86/include/asm/vermagic.h
+++ b/arch/x86/include/asm/vermagic.h
@@ -15,8 +15,6 @@
#define MODULE_PROC_FAMILY "586TSC "
#elif defined CONFIG_M586MMX
#define MODULE_PROC_FAMILY "586MMX "
-#elif defined CONFIG_MCORE2
-#define MODULE_PROC_FAMILY "CORE2 "
#elif defined CONFIG_MATOM
#define MODULE_PROC_FAMILY "ATOM "
#elif defined CONFIG_M686
@@ -33,8 +31,6 @@
#define MODULE_PROC_FAMILY "K6 "
#elif defined CONFIG_MK7
#define MODULE_PROC_FAMILY "K7 "
-#elif defined CONFIG_MK8
-#define MODULE_PROC_FAMILY "K8 "
#elif defined CONFIG_MELAN
#define MODULE_PROC_FAMILY "ELAN "
#elif defined CONFIG_MCRUSOE
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f7fd4369b821..8707361b24da 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -580,18 +580,22 @@ enum vm_entry_failure_code {
/*
* Exit Qualifications for EPT Violations
*/
-#define EPT_VIOLATION_ACC_READ_BIT 0
-#define EPT_VIOLATION_ACC_WRITE_BIT 1
-#define EPT_VIOLATION_ACC_INSTR_BIT 2
-#define EPT_VIOLATION_RWX_SHIFT 3
-#define EPT_VIOLATION_GVA_IS_VALID_BIT 7
-#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8
-#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT)
-#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT)
-#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT)
-#define EPT_VIOLATION_RWX_MASK (VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT)
-#define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT)
-#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
+#define EPT_VIOLATION_ACC_READ BIT(0)
+#define EPT_VIOLATION_ACC_WRITE BIT(1)
+#define EPT_VIOLATION_ACC_INSTR BIT(2)
+#define EPT_VIOLATION_PROT_READ BIT(3)
+#define EPT_VIOLATION_PROT_WRITE BIT(4)
+#define EPT_VIOLATION_PROT_EXEC BIT(5)
+#define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \
+ EPT_VIOLATION_PROT_WRITE | \
+ EPT_VIOLATION_PROT_EXEC)
+#define EPT_VIOLATION_GVA_IS_VALID BIT(7)
+#define EPT_VIOLATION_GVA_TRANSLATED BIT(8)
+
+#define EPT_VIOLATION_RWX_TO_PROT(__epte) (((__epte) & VMX_EPT_RWX_MASK) << 3)
+
+static_assert(EPT_VIOLATION_RWX_TO_PROT(VMX_EPT_RWX_MASK) ==
+ (EPT_VIOLATION_PROT_READ | EPT_VIOLATION_PROT_WRITE | EPT_VIOLATION_PROT_EXEC));
/*
* Exit Qualifications for NOTIFY VM EXIT
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index baca0b00ef76..a078a2b0f032 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -72,7 +72,7 @@
#endif
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* Explicitly size integers that represent pfns in the public interface
* with Xen so that on ARM we can have one ABI that works for 32 and 64
* bit guests. */
@@ -137,7 +137,7 @@ DEFINE_GUEST_HANDLE(xen_ulong_t);
#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct trap_info {
uint8_t vector; /* exception vector */
uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
@@ -186,7 +186,7 @@ struct arch_shared_info {
uint32_t wc_sec_hi;
#endif
};
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#ifdef CONFIG_X86_32
#include <asm/xen/interface_32.h>
@@ -196,7 +196,7 @@ struct arch_shared_info {
#include <asm/pvclock-abi.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* The following is all CPU context. Note that the fpu_ctxt block is filled
* in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
@@ -376,7 +376,7 @@ struct xen_pmu_arch {
} c;
};
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/*
* Prefix forces emulation of some non-trapping instructions.
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
index dc40578abded..74d9768a9cf7 100644
--- a/arch/x86/include/asm/xen/interface_32.h
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -44,7 +44,7 @@
*/
#define __HYPERVISOR_VIRT_START 0xF5800000
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct cpu_user_regs {
uint32_t ebx;
@@ -85,7 +85,7 @@ typedef struct xen_callback xen_callback_t;
#define XEN_CALLBACK(__cs, __eip) \
((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) })
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/*
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
index c10f279aae93..38a19edb81a3 100644
--- a/arch/x86/include/asm/xen/interface_64.h
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -77,7 +77,7 @@
#define VGCF_in_syscall (1<<_VGCF_in_syscall)
#define VGCF_IN_SYSCALL VGCF_in_syscall
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct iret_context {
/* Top of stack (%rsp at point of hypercall). */
@@ -143,7 +143,7 @@ typedef unsigned long xen_callback_t;
#define XEN_CALLBACK(__cs, __rip) \
((unsigned long)(__rip))
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_XEN_INTERFACE_64_H */
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 9b82eebd7add..dafbf581c515 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -26,7 +26,7 @@
#define XLF_5LEVEL_ENABLED (1<<6)
#define XLF_MEM_ENCRYPTION (1<<7)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/screen_info.h>
@@ -210,6 +210,6 @@ enum x86_hardware_subarch {
X86_NR_SUBARCHS,
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_BOOTPARAM_H */
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index 2f491efe3a12..55bc66867156 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -54,7 +54,7 @@
*/
#define E820_RESERVED_KERN 128
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
struct e820entry {
__u64 addr; /* start of memory segment */
@@ -76,7 +76,7 @@ struct e820map {
#define BIOS_ROM_BASE 0xffe00000
#define BIOS_ROM_END 0xffffffff
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_E820_H */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 9e75da97bce0..460306b35a4b 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -559,6 +559,9 @@ struct kvm_x86_mce {
#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7)
#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8)
+#define KVM_XEN_MSR_MIN_INDEX 0x40000000u
+#define KVM_XEN_MSR_MAX_INDEX 0x4fffffffu
+
struct kvm_xen_hvm_config {
__u32 flags;
__u32 msr;
diff --git a/arch/x86/include/uapi/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h
index d62ac5db093b..a82c039d8e6a 100644
--- a/arch/x86/include/uapi/asm/ldt.h
+++ b/arch/x86/include/uapi/asm/ldt.h
@@ -12,7 +12,7 @@
/* The size of each LDT entry. */
#define LDT_ENTRY_SIZE 8
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* Note on 64bit base and limit is ignored and you cannot set DS/ES/CS
* not to the default values if you still want to do syscalls. This
@@ -44,5 +44,5 @@ struct user_desc {
#define MODIFY_LDT_CONTENTS_STACK 1
#define MODIFY_LDT_CONTENTS_CODE 2
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_LDT_H */
diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h
index e7516b402a00..4b8917ca28fe 100644
--- a/arch/x86/include/uapi/asm/msr.h
+++ b/arch/x86/include/uapi/asm/msr.h
@@ -2,7 +2,7 @@
#ifndef _UAPI_ASM_X86_MSR_H
#define _UAPI_ASM_X86_MSR_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/ioctl.h>
@@ -10,5 +10,5 @@
#define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8])
#define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8])
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_MSR_H */
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
index 16074b9c93bb..5823584dea13 100644
--- a/arch/x86/include/uapi/asm/ptrace-abi.h
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -25,7 +25,7 @@
#else /* __i386__ */
-#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+#if defined(__ASSEMBLER__) || defined(__FRAME_OFFSETS)
/*
* C ABI says these regs are callee-preserved. They aren't saved on kernel entry
* unless syscall needs a complete, fully filled "struct pt_regs".
@@ -57,7 +57,7 @@
#define EFLAGS 144
#define RSP 152
#define SS 160
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/* top of stack page */
#define FRAME_SIZE 168
@@ -87,7 +87,7 @@
#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#endif
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
index 85165c0edafc..e0b5b4f6226b 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -7,7 +7,7 @@
#include <asm/processor-flags.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef __i386__
/* this struct defines the way the registers are stored on the
@@ -81,6 +81,6 @@ struct pt_regs {
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_PTRACE_H */
diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h
index b111b0c18544..50c45ead4e7c 100644
--- a/arch/x86/include/uapi/asm/setup_data.h
+++ b/arch/x86/include/uapi/asm/setup_data.h
@@ -18,7 +18,7 @@
#define SETUP_INDIRECT (1<<31)
#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
@@ -78,6 +78,6 @@ struct ima_setup_data {
__u64 size;
} __attribute__((packed));
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_SETUP_DATA_H */
diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h
index f777346450ec..1067efabf18b 100644
--- a/arch/x86/include/uapi/asm/signal.h
+++ b/arch/x86/include/uapi/asm/signal.h
@@ -2,7 +2,7 @@
#ifndef _UAPI_ASM_X86_SIGNAL_H
#define _UAPI_ASM_X86_SIGNAL_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/compiler.h>
@@ -16,7 +16,7 @@ struct siginfo;
typedef unsigned long sigset_t;
#endif /* __KERNEL__ */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define SIGHUP 1
@@ -68,7 +68,7 @@ typedef unsigned long sigset_t;
#include <asm-generic/signal-defs.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
# ifndef __KERNEL__
@@ -106,6 +106,6 @@ typedef struct sigaltstack {
__kernel_size_t ss_size;
} stack_t;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_SIGNAL_H */
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 1814b413fd57..ec1321248dac 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -95,6 +95,7 @@
#define SVM_EXIT_CR14_WRITE_TRAP 0x09e
#define SVM_EXIT_CR15_WRITE_TRAP 0x09f
#define SVM_EXIT_INVPCID 0x0a2
+#define SVM_EXIT_IDLE_HLT 0x0a6
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
@@ -224,6 +225,7 @@
{ SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \
{ SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \
{ SVM_EXIT_INVPCID, "invpcid" }, \
+ { SVM_EXIT_IDLE_HLT, "idle-halt" }, \
{ SVM_EXIT_NPF, "npf" }, \
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
{ SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b43eb7e384eb..84cfa179802c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -44,6 +44,8 @@ KCOV_INSTRUMENT_unwind_orc.o := n
KCOV_INSTRUMENT_unwind_frame.o := n
KCOV_INSTRUMENT_unwind_guess.o := n
+CFLAGS_head32.o := -fno-stack-protector
+CFLAGS_head64.o := -fno-stack-protector
CFLAGS_irq.o := -I $(src)/../include/asm/trace
obj-y += head_$(BITS).o
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index d745dd586303..77bfb846490c 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -4,6 +4,8 @@
* Copyright (c) 2016, Intel Corporation.
*/
+#include <linux/bitfield.h>
+
#include <acpi/cppc_acpi.h>
#include <asm/msr.h>
#include <asm/processor.h>
@@ -149,7 +151,7 @@ int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
if (ret)
goto out;
- val = AMD_CPPC_HIGHEST_PERF(val);
+ val = FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, val);
} else {
ret = cppc_get_highest_perf(cpu, &val);
if (ret)
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 5854f0b8f0f1..d5ac34186555 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,9 +13,11 @@
#include <linux/sched.h>
#include <acpi/processor.h>
+#include <asm/cpu_device_id.h>
#include <asm/cpuid.h>
#include <asm/mwait.h>
#include <asm/special_insns.h>
+#include <asm/smp.h>
/*
* Initialize bm_flags based on the CPU cache properties
@@ -47,12 +49,11 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
/*
* On all recent Intel platforms, ARB_DISABLE is a nop.
* So, set bm_control to zero to indicate that ARB_DISABLE
- * is not required while entering C3 type state on
- * P4, Core and beyond CPUs
+ * is not required while entering C3 type state.
*/
if (c->x86_vendor == X86_VENDOR_INTEL &&
- (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
- flags->bm_control = 0;
+ (c->x86 > 15 || (c->x86_vfm >= INTEL_CORE2_MEROM && c->x86_vfm <= INTEL_FAM6_LAST)))
+ flags->bm_control = 0;
if (c->x86_vendor == X86_VENDOR_CENTAUR) {
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f &&
@@ -205,6 +206,16 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
+void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct cstate_entry *percpu_entry;
+
+ percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
+ mwait_play_dead(percpu_entry->states[cx->index].eax);
+}
+EXPORT_SYMBOL_GPL(acpi_processor_ffh_play_dead);
+
void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S
index 4e498d28cdc8..aefb9cb583ad 100644
--- a/arch/x86/kernel/acpi/madt_playdead.S
+++ b/arch/x86/kernel/acpi/madt_playdead.S
@@ -14,6 +14,7 @@
* rsi: PGD of the identity mapping
*/
SYM_FUNC_START(asm_acpi_mp_play_dead)
+ ANNOTATE_NOENDBR
/* Turn off global entries. Following CR3 write will flush them. */
movq %cr4, %rdx
andq $~(X86_CR4_PGE), %rdx
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index d5ef6215583b..f36f28405dcc 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -70,58 +70,6 @@ static void __init free_pgt_page(void *pgt, void *dummy)
return memblock_free(pgt, PAGE_SIZE);
}
-/*
- * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
- * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
- * to the identity mapping and the function has be present at the same spot in
- * the virtual address space before and after switching page tables.
- */
-static int __init init_transition_pgtable(pgd_t *pgd)
-{
- pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
- unsigned long vaddr, paddr;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- vaddr = (unsigned long)asm_acpi_mp_play_dead;
- pgd += pgd_index(vaddr);
- if (!pgd_present(*pgd)) {
- p4d = (p4d_t *)alloc_pgt_page(NULL);
- if (!p4d)
- return -ENOMEM;
- set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
- }
- p4d = p4d_offset(pgd, vaddr);
- if (!p4d_present(*p4d)) {
- pud = (pud_t *)alloc_pgt_page(NULL);
- if (!pud)
- return -ENOMEM;
- set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
- }
- pud = pud_offset(p4d, vaddr);
- if (!pud_present(*pud)) {
- pmd = (pmd_t *)alloc_pgt_page(NULL);
- if (!pmd)
- return -ENOMEM;
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
- }
- pmd = pmd_offset(pud, vaddr);
- if (!pmd_present(*pmd)) {
- pte = (pte_t *)alloc_pgt_page(NULL);
- if (!pte)
- return -ENOMEM;
- set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
- }
- pte = pte_offset_kernel(pmd, vaddr);
-
- paddr = __pa(vaddr);
- set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
-
- return 0;
-}
-
static int __init acpi_mp_setup_reset(u64 reset_vector)
{
struct x86_mapping_info info = {
@@ -130,6 +78,7 @@ static int __init acpi_mp_setup_reset(u64 reset_vector)
.page_flag = __PAGE_KERNEL_LARGE_EXEC,
.kernpg_flag = _KERNPG_TABLE_NOENC,
};
+ unsigned long mstart, mend;
pgd_t *pgd;
pgd = alloc_pgt_page(NULL);
@@ -137,8 +86,6 @@ static int __init acpi_mp_setup_reset(u64 reset_vector)
return -ENOMEM;
for (int i = 0; i < nr_pfn_mapped; i++) {
- unsigned long mstart, mend;
-
mstart = pfn_mapped[i].start << PAGE_SHIFT;
mend = pfn_mapped[i].end << PAGE_SHIFT;
if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
@@ -147,14 +94,24 @@ static int __init acpi_mp_setup_reset(u64 reset_vector)
}
}
- if (kernel_ident_mapping_init(&info, pgd,
- PAGE_ALIGN_DOWN(reset_vector),
- PAGE_ALIGN(reset_vector + 1))) {
+ mstart = PAGE_ALIGN_DOWN(reset_vector);
+ mend = mstart + PAGE_SIZE;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
kernel_ident_mapping_free(&info, pgd);
return -ENOMEM;
}
- if (init_transition_pgtable(pgd)) {
+ /*
+ * Make sure asm_acpi_mp_play_dead() is present in the identity mapping
+ * at the same place as in the kernel page tables.
+ * asm_acpi_mp_play_dead() switches to the identity mapping and the
+ * function must be present at the same spot in the virtual address space
+ * before and after switching page tables.
+ */
+ info.offset = __START_KERNEL_map - phys_base;
+ mstart = PAGE_ALIGN_DOWN(__pa(asm_acpi_mp_play_dead));
+ mend = mstart + PAGE_SIZE;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
kernel_ident_mapping_free(&info, pgd);
return -ENOMEM;
}
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index b200a193beeb..04f561f75e99 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -17,6 +17,7 @@
* Hooray, we are in Long 64-bit mode (but still running in low memory)
*/
SYM_FUNC_START(wakeup_long64)
+ ANNOTATE_NOENDBR
movq saved_magic(%rip), %rax
movq $0x123456789abcdef0, %rdx
cmpq %rdx, %rax
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c71b575bf229..bf82c6f7d690 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -392,10 +392,8 @@ EXPORT_SYMBOL(BUG_func);
* Rewrite the "call BUG_func" replacement to point to the target of the
* indirect pv_ops call "call *disp(%ip)".
*/
-static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a,
- struct module *mod)
+static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
{
- u8 *wr_instr = module_writable_address(mod, instr);
void *target, *bug = &BUG_func;
s32 disp;
@@ -405,14 +403,14 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a,
}
if (a->instrlen != 6 ||
- wr_instr[0] != CALL_RIP_REL_OPCODE ||
- wr_instr[1] != CALL_RIP_REL_MODRM) {
+ instr[0] != CALL_RIP_REL_OPCODE ||
+ instr[1] != CALL_RIP_REL_MODRM) {
pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
BUG();
}
/* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
- disp = *(s32 *)(wr_instr + 2);
+ disp = *(s32 *)(instr + 2);
#ifdef CONFIG_X86_64
/* ff 15 00 00 00 00 call *0x0(%rip) */
/* target address is stored at "next instruction + disp". */
@@ -450,8 +448,7 @@ static inline u8 * instr_va(struct alt_instr *i)
* to refetch changed I$ lines.
*/
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
- struct alt_instr *end,
- struct module *mod)
+ struct alt_instr *end)
{
u8 insn_buff[MAX_PATCH_LEN];
u8 *instr, *replacement;
@@ -480,7 +477,6 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
*/
for (a = start; a < end; a++) {
int insn_buff_sz = 0;
- u8 *wr_instr, *wr_replacement;
/*
* In case of nested ALTERNATIVE()s the outer alternative might
@@ -494,11 +490,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
}
instr = instr_va(a);
- wr_instr = module_writable_address(mod, instr);
-
replacement = (u8 *)&a->repl_offset + a->repl_offset;
- wr_replacement = module_writable_address(mod, replacement);
-
BUG_ON(a->instrlen > sizeof(insn_buff));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
@@ -509,9 +501,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* patch if feature is *NOT* present.
*/
if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
- memcpy(insn_buff, wr_instr, a->instrlen);
+ memcpy(insn_buff, instr, a->instrlen);
optimize_nops(instr, insn_buff, a->instrlen);
- text_poke_early(wr_instr, insn_buff, a->instrlen);
+ text_poke_early(instr, insn_buff, a->instrlen);
continue;
}
@@ -521,12 +513,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
instr, instr, a->instrlen,
replacement, a->replacementlen, a->flags);
- memcpy(insn_buff, wr_replacement, a->replacementlen);
+ memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
if (a->flags & ALT_FLAG_DIRECT_CALL) {
- insn_buff_sz = alt_replace_call(instr, insn_buff, a,
- mod);
+ insn_buff_sz = alt_replace_call(instr, insn_buff, a);
if (insn_buff_sz < 0)
continue;
}
@@ -536,11 +527,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
- DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
- text_poke_early(wr_instr, insn_buff, insn_buff_sz);
+ text_poke_early(instr, insn_buff, insn_buff_sz);
}
kasan_enable_current();
@@ -731,20 +722,18 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
/*
* Generated by 'objtool --retpoline'.
*/
-void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
- struct module *mod)
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr = module_writable_address(mod, addr);
struct insn insn;
int len, ret;
u8 bytes[16];
u8 op1, op2;
- ret = insn_decode_kernel(&insn, wr_addr);
+ ret = insn_decode_kernel(&insn, addr);
if (WARN_ON_ONCE(ret < 0))
continue;
@@ -752,6 +741,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
op2 = insn.opcode.bytes[1];
switch (op1) {
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ /* See cfi_paranoid. */
+ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
+ continue;
+
case CALL_INSN_OPCODE:
case JMP32_INSN_OPCODE:
break;
@@ -772,9 +766,9 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
len = patch_retpoline(addr, &insn, bytes);
if (len == insn.length) {
optimize_nops(addr, bytes, len);
- DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
- text_poke_early(wr_addr, bytes, len);
+ text_poke_early(addr, bytes, len);
}
}
}
@@ -810,8 +804,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
return i;
}
-void __init_or_module noinline apply_returns(s32 *start, s32 *end,
- struct module *mod)
+void __init_or_module noinline apply_returns(s32 *start, s32 *end)
{
s32 *s;
@@ -820,13 +813,12 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end,
for (s = start; s < end; s++) {
void *dest = NULL, *addr = (void *)s + *s;
- void *wr_addr = module_writable_address(mod, addr);
struct insn insn;
int len, ret;
u8 bytes[16];
u8 op;
- ret = insn_decode_kernel(&insn, wr_addr);
+ ret = insn_decode_kernel(&insn, addr);
if (WARN_ON_ONCE(ret < 0))
continue;
@@ -846,41 +838,59 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end,
len = patch_return(addr, &insn, bytes);
if (len == insn.length) {
- DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr);
DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
- text_poke_early(wr_addr, bytes, len);
+ text_poke_early(addr, bytes, len);
}
}
}
-#else
-void __init_or_module noinline apply_returns(s32 *start, s32 *end,
- struct module *mod) { }
-#endif /* CONFIG_MITIGATION_RETHUNK */
+#else /* !CONFIG_MITIGATION_RETHUNK: */
+void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
+#endif /* !CONFIG_MITIGATION_RETHUNK */
#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
-void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
- struct module *mod) { }
-void __init_or_module noinline apply_returns(s32 *start, s32 *end,
- struct module *mod) { }
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
+void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
-#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */
+#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
#ifdef CONFIG_X86_KERNEL_IBT
-static void poison_cfi(void *addr, void *wr_addr);
+__noendbr bool is_endbr(u32 *val)
+{
+ u32 endbr;
+
+ __get_kernel_nofault(&endbr, val, u32, Efault);
+ return __is_endbr(endbr);
+
+Efault:
+ return false;
+}
-static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn)
+#ifdef CONFIG_FINEIBT
+
+static __noendbr bool exact_endbr(u32 *val)
{
- u32 endbr, poison = gen_endbr_poison();
+ u32 endbr;
- if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr)))
- return;
+ __get_kernel_nofault(&endbr, val, u32, Efault);
+ return endbr == gen_endbr();
+
+Efault:
+ return false;
+}
- if (!is_endbr(endbr)) {
- WARN_ON_ONCE(warn);
+#endif
+
+static void poison_cfi(void *addr);
+
+static void __init_or_module poison_endbr(void *addr)
+{
+ u32 poison = gen_endbr_poison();
+
+ if (WARN_ON_ONCE(!is_endbr(addr)))
return;
- }
DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
@@ -889,7 +899,7 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn)
*/
DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
- text_poke_early(wr_addr, &poison, 4);
+ text_poke_early(addr, &poison, 4);
}
/*
@@ -898,36 +908,39 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn)
* Seal the functions for indirect calls by clobbering the ENDBR instructions
* and the kCFI hash value.
*/
-void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod)
+void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr = module_writable_address(mod, addr);
- poison_endbr(addr, wr_addr, true);
+ poison_endbr(addr);
if (IS_ENABLED(CONFIG_FINEIBT))
- poison_cfi(addr - 16, wr_addr - 16);
+ poison_cfi(addr - 16);
}
}
-#else
+#else /* !CONFIG_X86_KERNEL_IBT: */
-void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { }
+void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
-#endif /* CONFIG_X86_KERNEL_IBT */
+#endif /* !CONFIG_X86_KERNEL_IBT */
#ifdef CONFIG_CFI_AUTO_DEFAULT
-#define __CFI_DEFAULT CFI_AUTO
+# define __CFI_DEFAULT CFI_AUTO
#elif defined(CONFIG_CFI_CLANG)
-#define __CFI_DEFAULT CFI_KCFI
+# define __CFI_DEFAULT CFI_KCFI
#else
-#define __CFI_DEFAULT CFI_OFF
+# define __CFI_DEFAULT CFI_OFF
#endif
enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+#ifdef CONFIG_FINEIBT_BHI
+bool cfi_bhi __ro_after_init = false;
+#endif
+
#ifdef CONFIG_CFI_CLANG
struct bpf_insn;
@@ -935,11 +948,7 @@ struct bpf_insn;
extern unsigned int __bpf_prog_runX(const void *ctx,
const struct bpf_insn *insn);
-/*
- * Force a reference to the external symbol so the compiler generates
- * __kcfi_typid.
- */
-__ADDRESSABLE(__bpf_prog_runX);
+KCFI_REFERENCE(__bpf_prog_runX);
/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
asm (
@@ -956,7 +965,7 @@ asm (
/* Must match bpf_callback_t */
extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
-__ADDRESSABLE(__bpf_callback_fn);
+KCFI_REFERENCE(__bpf_callback_fn);
/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
asm (
@@ -991,6 +1000,21 @@ u32 cfi_get_func_hash(void *func)
return hash;
}
+
+int cfi_get_func_arity(void *func)
+{
+ bhi_thunk *target;
+ s32 disp;
+
+ if (cfi_mode != CFI_FINEIBT && !cfi_bhi)
+ return 0;
+
+ if (get_kernel_nofault(disp, func - 4))
+ return 0;
+
+ target = func + disp;
+ return target - __bhi_args;
+}
#endif
#ifdef CONFIG_FINEIBT
@@ -998,6 +1022,8 @@ u32 cfi_get_func_hash(void *func)
static bool cfi_rand __ro_after_init = true;
static u32 cfi_seed __ro_after_init;
+static bool cfi_paranoid __ro_after_init = false;
+
/*
* Re-hash the CFI hash with a boot-time seed while making sure the result is
* not a valid ENDBR instruction.
@@ -1005,7 +1031,7 @@ static u32 cfi_seed __ro_after_init;
static u32 cfi_rehash(u32 hash)
{
hash ^= cfi_seed;
- while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
+ while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) {
bool lsb = hash & 1;
hash >>= 1;
if (lsb)
@@ -1037,6 +1063,25 @@ static __init int cfi_parse_cmdline(char *str)
cfi_mode = CFI_FINEIBT;
} else if (!strcmp(str, "norand")) {
cfi_rand = false;
+ } else if (!strcmp(str, "warn")) {
+ pr_alert("CFI mismatch non-fatal!\n");
+ cfi_warn = true;
+ } else if (!strcmp(str, "paranoid")) {
+ if (cfi_mode == CFI_FINEIBT) {
+ cfi_paranoid = true;
+ } else {
+ pr_err("Ignoring paranoid; depends on fineibt.\n");
+ }
+ } else if (!strcmp(str, "bhi")) {
+#ifdef CONFIG_FINEIBT_BHI
+ if (cfi_mode == CFI_FINEIBT) {
+ cfi_bhi = true;
+ } else {
+ pr_err("Ignoring bhi; depends on fineibt.\n");
+ }
+#else
+ pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n");
+#endif
} else {
pr_err("Ignoring unknown cfi option (%s).", str);
}
@@ -1054,9 +1099,9 @@ early_param("cfi", cfi_parse_cmdline);
* __cfi_\func: __cfi_\func:
* movl $0x12345678,%eax // 5 endbr64 // 4
* nop subl $0x12345678,%r10d // 7
- * nop jz 1f // 2
- * nop ud2 // 2
- * nop 1: nop // 1
+ * nop jne __cfi_\func+6 // 2
+ * nop nop3 // 3
+ * nop
* nop
* nop
* nop
@@ -1068,34 +1113,53 @@ early_param("cfi", cfi_parse_cmdline);
*
* caller: caller:
* movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6
- * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4
+ * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4
* je 1f // 2 nop4 // 4
* ud2 // 2
- * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5
+ * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6
*
*/
-asm( ".pushsection .rodata \n"
- "fineibt_preamble_start: \n"
- " endbr64 \n"
- " subl $0x12345678, %r10d \n"
- " je fineibt_preamble_end \n"
- " ud2 \n"
- " nop \n"
- "fineibt_preamble_end: \n"
+/*
+ * <fineibt_preamble_start>:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d
+ * b: 75 f9 jne 6 <fineibt_preamble_start+0x6>
+ * d: 0f 1f 00 nopl (%rax)
+ *
+ * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as
+ * (bad) on x86_64 and raises #UD.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_preamble_start: \n"
+ " endbr64 \n"
+ " subl $0x12345678, %r10d \n"
+ "fineibt_preamble_bhi: \n"
+ " jne fineibt_preamble_start+6 \n"
+ ASM_NOP3
+ "fineibt_preamble_end: \n"
".popsection\n"
);
extern u8 fineibt_preamble_start[];
+extern u8 fineibt_preamble_bhi[];
extern u8 fineibt_preamble_end[];
#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
+#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start)
+#define fineibt_preamble_ud 6
#define fineibt_preamble_hash 7
+/*
+ * <fineibt_caller_start>:
+ * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
+ * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11
+ * a: 0f 1f 40 00 nopl 0x0(%rax)
+ */
asm( ".pushsection .rodata \n"
"fineibt_caller_start: \n"
" movl $0x12345678, %r10d \n"
- " sub $16, %r11 \n"
+ " lea -0x10(%r11), %r11 \n"
ASM_NOP4
"fineibt_caller_end: \n"
".popsection \n"
@@ -1109,13 +1173,62 @@ extern u8 fineibt_caller_end[];
#define fineibt_caller_jmp (fineibt_caller_size - 2)
-static u32 decode_preamble_hash(void *addr)
+/*
+ * Since FineIBT does hash validation on the callee side it is prone to
+ * circumvention attacks where a 'naked' ENDBR instruction exists that
+ * is not part of the fineibt_preamble sequence.
+ *
+ * Notably the x86 entry points must be ENDBR and equally cannot be
+ * fineibt_preamble.
+ *
+ * The fineibt_paranoid caller sequence adds additional caller side
+ * hash validation. This stops such circumvention attacks dead, but at the cost
+ * of adding a load.
+ *
+ * <fineibt_paranoid_start>:
+ * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
+ * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d
+ * a: 4d 8d 5b <f0> lea -0x10(%r11), %r11
+ * e: 75 fd jne d <fineibt_paranoid_start+0xd>
+ * 10: 41 ff d3 call *%r11
+ * 13: 90 nop
+ *
+ * Notably LEA does not modify flags and can be reordered with the CMP,
+ * avoiding a dependency. Again, using a non-taken (backwards) branch
+ * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the
+ * Jcc.d8, causing #UD.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_paranoid_start: \n"
+ " movl $0x12345678, %r10d \n"
+ " cmpl -9(%r11), %r10d \n"
+ " lea -0x10(%r11), %r11 \n"
+ " jne fineibt_paranoid_start+0xd \n"
+ "fineibt_paranoid_ind: \n"
+ " call *%r11 \n"
+ " nop \n"
+ "fineibt_paranoid_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_paranoid_start[];
+extern u8 fineibt_paranoid_ind[];
+extern u8 fineibt_paranoid_end[];
+
+#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start)
+#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start)
+#define fineibt_paranoid_ud 0xd
+
+static u32 decode_preamble_hash(void *addr, int *reg)
{
u8 *p = addr;
- /* b8 78 56 34 12 mov $0x12345678,%eax */
- if (p[0] == 0xb8)
+ /* b8+reg 78 56 34 12 movl $0x12345678,\reg */
+ if (p[0] >= 0xb8 && p[0] < 0xc0) {
+ if (reg)
+ *reg = p[0] - 0xb8;
return *(u32 *)(addr + 1);
+ }
return 0; /* invalid hash value */
}
@@ -1124,11 +1237,11 @@ static u32 decode_caller_hash(void *addr)
{
u8 *p = addr;
- /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */
+ /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */
if (p[0] == 0x41 && p[1] == 0xba)
return -*(u32 *)(addr + 2);
- /* e8 0c 78 56 34 12 jmp.d8 +12 */
+ /* e8 0c 88 a9 cb ed jmp.d8 +12 */
if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
return -*(u32 *)(addr + 2);
@@ -1136,7 +1249,7 @@ static u32 decode_caller_hash(void *addr)
}
/* .retpoline_sites */
-static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod)
+static int cfi_disable_callers(s32 *start, s32 *end)
{
/*
* Disable kCFI by patching in a JMP.d8, this leaves the hash immediate
@@ -1148,23 +1261,20 @@ static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod)
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- wr_addr = module_writable_address(mod, addr);
- hash = decode_caller_hash(wr_addr);
-
+ hash = decode_caller_hash(addr);
if (!hash) /* nocfi callers */
continue;
- text_poke_early(wr_addr, jmp, 2);
+ text_poke_early(addr, jmp, 2);
}
return 0;
}
-static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod)
+static int cfi_enable_callers(s32 *start, s32 *end)
{
/*
* Re-enable kCFI, undo what cfi_disable_callers() did.
@@ -1174,126 +1284,212 @@ static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod)
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- wr_addr = module_writable_address(mod, addr);
- hash = decode_caller_hash(wr_addr);
+ hash = decode_caller_hash(addr);
if (!hash) /* nocfi callers */
continue;
- text_poke_early(wr_addr, mov, 2);
+ text_poke_early(addr, mov, 2);
}
return 0;
}
/* .cfi_sites */
-static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod)
+static int cfi_rand_preamble(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr = module_writable_address(mod, addr);
u32 hash;
- hash = decode_preamble_hash(wr_addr);
+ hash = decode_preamble_hash(addr, NULL);
if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
addr, addr, 5, addr))
return -EINVAL;
hash = cfi_rehash(hash);
- text_poke_early(wr_addr + 1, &hash, 4);
+ text_poke_early(addr + 1, &hash, 4);
}
return 0;
}
-static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod)
+static void cfi_fineibt_bhi_preamble(void *addr, int arity)
+{
+ if (!arity)
+ return;
+
+ if (!cfi_warn && arity == 1) {
+ /*
+ * Crazy scheme to allow arity-1 inline:
+ *
+ * __cfi_foo:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 41 81 <ea> 78 56 34 12 sub 0x12345678, %r10d
+ * b: 49 0f 45 fa cmovne %r10, %rdi
+ * f: 75 f5 jne __cfi_foo+6
+ * 11: 0f 1f 00 nopl (%rax)
+ *
+ * Code that direct calls to foo()+0, decodes the tail end as:
+ *
+ * foo:
+ * 0: f5 cmc
+ * 1: 0f 1f 00 nopl (%rax)
+ *
+ * which clobbers CF, but does not affect anything ABI
+ * wise.
+ *
+ * Notably, this scheme is incompatible with permissive CFI
+ * because the CMOVcc is unconditional and RDI will have been
+ * clobbered.
+ */
+ const u8 magic[9] = {
+ 0x49, 0x0f, 0x45, 0xfa,
+ 0x75, 0xf5,
+ BYTES_NOP3,
+ };
+
+ text_poke_early(addr + fineibt_preamble_bhi, magic, 9);
+
+ return;
+ }
+
+ text_poke_early(addr + fineibt_preamble_bhi,
+ text_gen_insn(CALL_INSN_OPCODE,
+ addr + fineibt_preamble_bhi,
+ __bhi_args[arity]),
+ CALL_INSN_SIZE);
+}
+
+static int cfi_rewrite_preamble(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr = module_writable_address(mod, addr);
+ int arity;
u32 hash;
- hash = decode_preamble_hash(wr_addr);
+ /*
+ * When the function doesn't start with ENDBR the compiler will
+ * have determined there are no indirect calls to it and we
+ * don't need no CFI either.
+ */
+ if (!is_endbr(addr + 16))
+ continue;
+
+ hash = decode_preamble_hash(addr, &arity);
if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
addr, addr, 5, addr))
return -EINVAL;
- text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size);
- WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678);
- text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4);
+ text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
+ WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
+
+ WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity,
+ "kCFI preamble has wrong register at: %pS %*ph\n",
+ addr, 5, addr);
+
+ if (cfi_bhi)
+ cfi_fineibt_bhi_preamble(addr, arity);
}
return 0;
}
-static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod)
+static void cfi_rewrite_endbr(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr = module_writable_address(mod, addr);
- poison_endbr(addr + 16, wr_addr + 16, false);
+ if (!exact_endbr(addr + 16))
+ continue;
+
+ poison_endbr(addr + 16);
}
}
/* .retpoline_sites */
-static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod)
+static int cfi_rand_callers(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- wr_addr = module_writable_address(mod, addr);
- hash = decode_caller_hash(wr_addr);
+ hash = decode_caller_hash(addr);
if (hash) {
hash = -cfi_rehash(hash);
- text_poke_early(wr_addr + 2, &hash, 4);
+ text_poke_early(addr + 2, &hash, 4);
}
}
return 0;
}
-static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod)
+static int cfi_rewrite_callers(s32 *start, s32 *end)
{
s32 *s;
+ BUG_ON(fineibt_paranoid_size != 20);
+
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr;
+ struct insn insn;
+ u8 bytes[20];
u32 hash;
+ int ret;
+ u8 op;
addr -= fineibt_caller_size;
- wr_addr = module_writable_address(mod, addr);
- hash = decode_caller_hash(wr_addr);
- if (hash) {
- text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size);
- WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678);
- text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4);
+ hash = decode_caller_hash(addr);
+ if (!hash)
+ continue;
+
+ if (!cfi_paranoid) {
+ text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
+ WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_caller_hash, &hash, 4);
+ /* rely on apply_retpolines() */
+ continue;
+ }
+
+ /* cfi_paranoid */
+ ret = insn_decode_kernel(&insn, addr + fineibt_caller_size);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op = insn.opcode.bytes[0];
+ if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) {
+ WARN_ON_ONCE(1);
+ continue;
}
- /* rely on apply_retpolines() */
+
+ memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size);
+ memcpy(bytes + fineibt_caller_hash, &hash, 4);
+
+ ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind);
+ if (WARN_ON_ONCE(ret != 3))
+ continue;
+
+ text_poke_early(addr, bytes, fineibt_paranoid_size);
}
return 0;
}
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi, struct module *mod)
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
{
- bool builtin = mod ? false : true;
int ret;
if (WARN_ONCE(fineibt_preamble_size != 16,
@@ -1302,8 +1498,15 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
if (cfi_mode == CFI_AUTO) {
cfi_mode = CFI_KCFI;
- if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
+ if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) {
+ /*
+ * FRED has much saner context on exception entry and
+ * is less easy to take advantage of.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ cfi_paranoid = true;
cfi_mode = CFI_FINEIBT;
+ }
}
/*
@@ -1311,7 +1514,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
* rewrite them. This disables all CFI. If this succeeds but any of the
* later stages fails, we're without CFI.
*/
- ret = cfi_disable_callers(start_retpoline, end_retpoline, mod);
+ ret = cfi_disable_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
@@ -1322,11 +1525,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
}
- ret = cfi_rand_preamble(start_cfi, end_cfi, mod);
+ ret = cfi_rand_preamble(start_cfi, end_cfi);
if (ret)
goto err;
- ret = cfi_rand_callers(start_retpoline, end_retpoline, mod);
+ ret = cfi_rand_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
}
@@ -1338,7 +1541,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
return;
case CFI_KCFI:
- ret = cfi_enable_callers(start_retpoline, end_retpoline, mod);
+ ret = cfi_enable_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
@@ -1348,20 +1551,23 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
case CFI_FINEIBT:
/* place the FineIBT preamble at func()-16 */
- ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod);
+ ret = cfi_rewrite_preamble(start_cfi, end_cfi);
if (ret)
goto err;
/* rewrite the callers to target func()-16 */
- ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod);
+ ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
/* now that nobody targets func()+0, remove ENDBR there */
- cfi_rewrite_endbr(start_cfi, end_cfi, mod);
+ cfi_rewrite_endbr(start_cfi, end_cfi);
- if (builtin)
- pr_info("Using FineIBT CFI\n");
+ if (builtin) {
+ pr_info("Using %sFineIBT%s CFI\n",
+ cfi_paranoid ? "paranoid " : "",
+ cfi_bhi ? "+BHI" : "");
+ }
return;
default:
@@ -1377,11 +1583,25 @@ static inline void poison_hash(void *addr)
*(u32 *)addr = 0;
}
-static void poison_cfi(void *addr, void *wr_addr)
+static void poison_cfi(void *addr)
{
+ /*
+ * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes,
+ * some (static) functions for which they can determine the address
+ * is never taken do not get a __cfi prefix, but *DO* get an ENDBR.
+ *
+ * As such, these functions will get sealed, but we need to be careful
+ * to not unconditionally scribble the previous function.
+ */
switch (cfi_mode) {
case CFI_FINEIBT:
/*
+ * FineIBT prefix should start with an ENDBR.
+ */
+ if (!is_endbr(addr))
+ break;
+
+ /*
* __cfi_\func:
* osp nopl (%rax)
* subl $0, %r10d
@@ -1389,17 +1609,23 @@ static void poison_cfi(void *addr, void *wr_addr)
* ud2
* 1: nop
*/
- poison_endbr(addr, wr_addr, false);
- poison_hash(wr_addr + fineibt_preamble_hash);
+ poison_endbr(addr);
+ poison_hash(addr + fineibt_preamble_hash);
break;
case CFI_KCFI:
/*
+ * kCFI prefix should start with a valid hash.
+ */
+ if (!decode_preamble_hash(addr, NULL))
+ break;
+
+ /*
* __cfi_\func:
* movl $0, %eax
* .skip 11, 0x90
*/
- poison_hash(wr_addr + 1);
+ poison_hash(addr + 1);
break;
default:
@@ -1407,24 +1633,135 @@ static void poison_cfi(void *addr, void *wr_addr)
}
}
-#else
+/*
+ * When regs->ip points to a 0xEA byte in the FineIBT preamble,
+ * return true and fill out target and type.
+ *
+ * We check the preamble by checking for the ENDBR instruction relative to the
+ * 0xEA instruction.
+ */
+static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_preamble_ud;
+ u32 hash;
+
+ if (!exact_endbr((void *)addr))
+ return false;
+
+ *target = addr + fineibt_preamble_size;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->r10 + hash;
+
+ /*
+ * Since regs->ip points to the middle of an instruction; it cannot
+ * continue with the normal fixup.
+ */
+ regs->ip = *target;
+
+ return true;
+
+Efault:
+ return false;
+}
+
+/*
+ * regs->ip points to one of the UD2 in __bhi_args[].
+ */
+static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr;
+ u32 hash;
+
+ if (!cfi_bhi)
+ return false;
+
+ if (regs->ip < (unsigned long)__bhi_args ||
+ regs->ip >= (unsigned long)__bhi_args_end)
+ return false;
+
+ /*
+ * Fetch the return address from the stack, this points to the
+ * FineIBT preamble. Since the CALL instruction is in the 5 last
+ * bytes of the preamble, the return address is in fact the target
+ * address.
+ */
+ __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault);
+ *target = addr;
+
+ addr -= fineibt_preamble_size;
+ if (!exact_endbr((void *)addr))
+ return false;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->r10 + hash;
+
+ /*
+ * The UD2 sites are constructed with a RET immediately following,
+ * as such the non-fatal case can use the regular fixup.
+ */
+ return true;
+
+Efault:
+ return false;
+}
+
+/*
+ * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[]
+ * sequence.
+ */
+static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_paranoid_ud;
+ u32 hash;
+
+ if (!cfi_paranoid || !is_cfi_trap(addr + fineibt_caller_size - LEN_UD2))
+ return false;
+
+ __get_kernel_nofault(&hash, addr + fineibt_caller_hash, u32, Efault);
+ *target = regs->r11 + fineibt_preamble_size;
+ *type = regs->r10;
+
+ /*
+ * Since the trapping instruction is the exact, but LOCK prefixed,
+ * Jcc.d8 that got us here, the normal fixup will work.
+ */
+ return true;
+
+Efault:
+ return false;
+}
+
+bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ if (decode_fineibt_paranoid(regs, target, type))
+ return true;
+
+ if (decode_fineibt_bhi(regs, target, type))
+ return true;
+
+ return decode_fineibt_preamble(regs, target, type);
+}
+
+#else /* !CONFIG_FINEIBT: */
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi, struct module *mod)
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
{
}
#ifdef CONFIG_X86_KERNEL_IBT
-static void poison_cfi(void *addr, void *wr_addr) { }
+static void poison_cfi(void *addr) { }
#endif
-#endif
+#endif /* !CONFIG_FINEIBT */
void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi, struct module *mod)
+ s32 *start_cfi, s32 *end_cfi)
{
return __apply_fineibt(start_retpoline, end_retpoline,
- start_cfi, end_cfi, mod);
+ start_cfi, end_cfi,
+ /* .builtin = */ false);
}
#ifdef CONFIG_SMP
@@ -1721,27 +2058,27 @@ void __init alternative_instructions(void)
paravirt_set_cap();
__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
- __cfi_sites, __cfi_sites_end, NULL);
+ __cfi_sites, __cfi_sites_end, true);
/*
* Rewrite the retpolines, must be done before alternatives since
* those can rewrite the retpoline thunks.
*/
- apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL);
- apply_returns(__return_sites, __return_sites_end, NULL);
-
- apply_alternatives(__alt_instructions, __alt_instructions_end, NULL);
+ apply_retpolines(__retpoline_sites, __retpoline_sites_end);
+ apply_returns(__return_sites, __return_sites_end);
/*
- * Now all calls are established. Apply the call thunks if
- * required.
+ * Adjust all CALL instructions to point to func()-10, including
+ * those in .altinstr_replacement.
*/
callthunks_patch_builtin_calls();
+ apply_alternatives(__alt_instructions, __alt_instructions_end);
+
/*
* Seal all functions that do not have their address taken.
*/
- apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL);
+ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
#ifdef CONFIG_SMP
/* Patch to UP if other cpus not imminent. */
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 67e773744edb..6d12a9b69432 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -73,7 +73,6 @@ static int amd_cache_northbridges(void)
amd_northbridges.nb = nb;
for (i = 0; i < amd_northbridges.num; i++) {
- node_to_amd_nb(i)->root = amd_node_get_root(i);
node_to_amd_nb(i)->misc = amd_node_get_func(i, 3);
/*
diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c
index d2ec7fd555c5..b670fa85c61b 100644
--- a/arch/x86/kernel/amd_node.c
+++ b/arch/x86/kernel/amd_node.c
@@ -8,6 +8,7 @@
* Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
*/
+#include <linux/debugfs.h>
#include <asm/amd_node.h>
/*
@@ -93,10 +94,14 @@ static struct pci_dev **amd_roots;
/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);
+static bool smn_exclusive;
#define SMN_INDEX_OFFSET 0x60
#define SMN_DATA_OFFSET 0x64
+#define HSMP_INDEX_OFFSET 0xc4
+#define HSMP_DATA_OFFSET 0xc8
+
/*
* SMN accesses may fail in ways that are difficult to detect here in the called
* functions amd_smn_read() and amd_smn_write(). Therefore, callers must do
@@ -146,6 +151,9 @@ static int __amd_smn_rw(u8 i_off, u8 d_off, u16 node, u32 address, u32 *value, b
if (!root)
return err;
+ if (!smn_exclusive)
+ return err;
+
guard(mutex)(&smn_mutex);
err = pci_write_config_dword(root, i_off, address);
@@ -179,6 +187,93 @@ int __must_check amd_smn_write(u16 node, u32 address, u32 value)
}
EXPORT_SYMBOL_GPL(amd_smn_write);
+int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write)
+{
+ return __amd_smn_rw(HSMP_INDEX_OFFSET, HSMP_DATA_OFFSET, node, address, value, write);
+}
+EXPORT_SYMBOL_GPL(amd_smn_hsmp_rdwr);
+
+static struct dentry *debugfs_dir;
+static u16 debug_node;
+static u32 debug_address;
+
+static ssize_t smn_node_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ u16 node;
+ int ret;
+
+ ret = kstrtou16_from_user(userbuf, count, 0, &node);
+ if (ret)
+ return ret;
+
+ if (node >= amd_num_nodes())
+ return -ENODEV;
+
+ debug_node = node;
+ return count;
+}
+
+static int smn_node_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "0x%08x\n", debug_node);
+ return 0;
+}
+
+static ssize_t smn_address_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+
+ ret = kstrtouint_from_user(userbuf, count, 0, &debug_address);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static int smn_address_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "0x%08x\n", debug_address);
+ return 0;
+}
+
+static int smn_value_show(struct seq_file *m, void *v)
+{
+ u32 val;
+ int ret;
+
+ ret = amd_smn_read(debug_node, debug_address, &val);
+ if (ret)
+ return ret;
+
+ seq_printf(m, "0x%08x\n", val);
+ return 0;
+}
+
+static ssize_t smn_value_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ u32 val;
+ int ret;
+
+ ret = kstrtouint_from_user(userbuf, count, 0, &val);
+ if (ret)
+ return ret;
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
+ ret = amd_smn_write(debug_node, debug_address, val);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+DEFINE_SHOW_STORE_ATTRIBUTE(smn_node);
+DEFINE_SHOW_STORE_ATTRIBUTE(smn_address);
+DEFINE_SHOW_STORE_ATTRIBUTE(smn_value);
+
static int amd_cache_roots(void)
{
u16 node, num_nodes = amd_num_nodes();
@@ -193,6 +288,48 @@ static int amd_cache_roots(void)
return 0;
}
+static int reserve_root_config_spaces(void)
+{
+ struct pci_dev *root = NULL;
+ struct pci_bus *bus = NULL;
+
+ while ((bus = pci_find_next_bus(bus))) {
+ /* Root device is Device 0 Function 0 on each Primary Bus. */
+ root = pci_get_slot(bus, 0);
+ if (!root)
+ continue;
+
+ if (root->vendor != PCI_VENDOR_ID_AMD &&
+ root->vendor != PCI_VENDOR_ID_HYGON)
+ continue;
+
+ pci_dbg(root, "Reserving PCI config space\n");
+
+ /*
+ * There are a few SMN index/data pairs and other registers
+ * that shouldn't be accessed by user space.
+ * So reserve the entire PCI config space for simplicity rather
+ * than covering specific registers piecemeal.
+ */
+ if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) {
+ pci_err(root, "Failed to reserve config space\n");
+ return -EEXIST;
+ }
+ }
+
+ smn_exclusive = true;
+ return 0;
+}
+
+static bool enable_dfs;
+
+static int __init amd_smn_enable_dfs(char *str)
+{
+ enable_dfs = true;
+ return 1;
+}
+__setup("amd_smn_debugfs_enable", amd_smn_enable_dfs);
+
static int __init amd_smn_init(void)
{
int err;
@@ -209,6 +346,18 @@ static int __init amd_smn_init(void)
if (err)
return err;
+ err = reserve_root_config_spaces();
+ if (err)
+ return err;
+
+ if (enable_dfs) {
+ debugfs_dir = debugfs_create_dir("amd_smn", arch_debugfs_dir);
+
+ debugfs_create_file("node", 0600, debugfs_dir, NULL, &smn_node_fops);
+ debugfs_create_file("address", 0600, debugfs_dir, NULL, &smn_address_fops);
+ debugfs_create_file("value", 0600, debugfs_dir, NULL, &smn_value_fops);
+ }
+
return 0;
}
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 3bf0487cf3b7..52d1808ee360 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -23,8 +23,5 @@ obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
obj-y += apic_flat_64.o
endif
-# APIC probe will depend on the listing order here
-obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
-
# For 32bit, probe_32 need to be listed last
obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e893dc6f11c1..62584a347931 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1371,8 +1371,6 @@ void __init apic_intr_mode_init(void)
x86_64_probe_apic();
- x86_32_install_bigsmp();
-
if (x86_platform.apic_post_init)
x86_platform.apic_post_init();
@@ -1674,7 +1672,6 @@ static __init void apic_read_boot_cpu_id(bool x2apic)
boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
topology_register_boot_apic(boot_cpu_physical_apicid);
- x86_32_probe_bigsmp_early();
}
#ifdef CONFIG_X86_X2APIC
@@ -2014,8 +2011,8 @@ static bool __init detect_init_APIC(void)
case X86_VENDOR_HYGON:
break;
case X86_VENDOR_INTEL:
- if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
- (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)))
+ if ((boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)) ||
+ boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO)
break;
goto no_apic;
default:
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
deleted file mode 100644
index 9285d500d5b4..000000000000
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ /dev/null
@@ -1,105 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs.
- *
- * Drives the local APIC in "clustered mode".
- */
-#include <linux/cpumask.h>
-#include <linux/dmi.h>
-#include <linux/smp.h>
-
-#include <asm/apic.h>
-#include <asm/io_apic.h>
-
-#include "local.h"
-
-static u32 bigsmp_get_apic_id(u32 x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static void bigsmp_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void bigsmp_send_IPI_all(int vector)
-{
- default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
-}
-
-static int dmi_bigsmp; /* can be set by dmi scanners */
-
-static int hp_ht_bigsmp(const struct dmi_system_id *d)
-{
- printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
- dmi_bigsmp = 1;
-
- return 0;
-}
-
-
-static const struct dmi_system_id bigsmp_dmi_table[] = {
- { hp_ht_bigsmp, "HP ProLiant DL760 G2",
- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
- }
- },
-
- { hp_ht_bigsmp, "HP ProLiant DL740",
- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
- }
- },
- { } /* NULL entry stops DMI scanning */
-};
-
-static int probe_bigsmp(void)
-{
- return dmi_check_system(bigsmp_dmi_table);
-}
-
-static struct apic apic_bigsmp __ro_after_init = {
-
- .name = "bigsmp",
- .probe = probe_bigsmp,
-
- .dest_mode_logical = false,
-
- .disable_esr = 1,
-
- .cpu_present_to_apicid = default_cpu_present_to_apicid,
-
- .max_apic_id = 0xFE,
- .get_apic_id = bigsmp_get_apic_id,
-
- .calc_dest_apicid = apic_default_calc_apicid,
-
- .send_IPI = default_send_IPI_single_phys,
- .send_IPI_mask = default_send_IPI_mask_sequence_phys,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
- .send_IPI_all = bigsmp_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .eoi = native_apic_mem_eoi,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = apic_mem_wait_icr_idle,
- .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
-};
-
-bool __init apic_bigsmp_possible(bool cmdline_override)
-{
- return apic == &apic_bigsmp || !cmdline_override;
-}
-
-void __init apic_bigsmp_force(void)
-{
- if (apic != &apic_bigsmp)
- apic_install_driver(&apic_bigsmp);
-}
-
-apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 5da693d633b7..98a57cb4aa86 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -3,6 +3,7 @@
#include <linux/cpumask.h>
#include <linux/delay.h>
#include <linux/smp.h>
+#include <linux/string_choices.h>
#include <asm/io_apic.h>
@@ -23,7 +24,7 @@ __setup("no_ipi_broadcast=", apic_ipi_shorthand);
static int __init print_ipi_mode(void)
{
pr_info("IPI shorthand broadcast: %s\n",
- apic_ipi_shorthand_off ? "disabled" : "enabled");
+ str_disabled_enabled(apic_ipi_shorthand_off));
return 0;
}
late_initcall(print_ipi_mode);
@@ -287,34 +288,4 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
__default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
-
-#ifdef CONFIG_SMP
-static int convert_apicid_to_cpu(u32 apic_id)
-{
- int i;
-
- for_each_possible_cpu(i) {
- if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
- return i;
- }
- return -1;
-}
-
-int safe_smp_processor_id(void)
-{
- u32 apicid;
- int cpuid;
-
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return 0;
-
- apicid = read_apic_id();
- if (apicid == BAD_APICID)
- return 0;
-
- cpuid = convert_apicid_to_cpu(apicid);
-
- return cpuid >= 0 ? cpuid : 0;
-}
-#endif
#endif
diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
index 842fe28496be..bdcf609eb283 100644
--- a/arch/x86/kernel/apic/local.h
+++ b/arch/x86/kernel/apic/local.h
@@ -65,17 +65,4 @@ void default_send_IPI_self(int vector);
void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector);
void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector);
void default_send_IPI_mask_logical(const struct cpumask *mask, int vector);
-void x86_32_probe_bigsmp_early(void);
-void x86_32_install_bigsmp(void);
-#else
-static inline void x86_32_probe_bigsmp_early(void) { }
-static inline void x86_32_install_bigsmp(void) { }
-#endif
-
-#ifdef CONFIG_X86_BIGSMP
-bool apic_bigsmp_possible(bool cmdline_selected);
-void apic_bigsmp_force(void);
-#else
-static inline bool apic_bigsmp_possible(bool cmdline_selected) { return false; };
-static inline void apic_bigsmp_force(void) { }
#endif
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index f75ee345c02d..87bc9e7ca5d6 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -93,35 +93,6 @@ static int __init parse_apic(char *arg)
}
early_param("apic", parse_apic);
-void __init x86_32_probe_bigsmp_early(void)
-{
- if (nr_cpu_ids <= 8 || xen_pv_domain())
- return;
-
- if (IS_ENABLED(CONFIG_X86_BIGSMP)) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(boot_cpu_apic_version))
- break;
- /* P4 and above */
- fallthrough;
- case X86_VENDOR_HYGON:
- case X86_VENDOR_AMD:
- if (apic_bigsmp_possible(cmdline_apic))
- return;
- break;
- }
- }
- pr_info("Limiting to 8 possible CPUs\n");
- set_nr_cpu_ids(8);
-}
-
-void __init x86_32_install_bigsmp(void)
-{
- if (nr_cpu_ids > 8 && !xen_pv_domain())
- apic_bigsmp_force();
-}
-
void __init x86_32_probe_apic(void)
{
if (!cmdline_apic) {
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 736f62812f5c..72fa4bb78f0a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -888,8 +888,109 @@ static int apic_set_affinity(struct irq_data *irqd,
return err ? err : IRQ_SET_MASK_OK;
}
+static void free_moved_vector(struct apic_chip_data *apicd)
+{
+ unsigned int vector = apicd->prev_vector;
+ unsigned int cpu = apicd->prev_cpu;
+ bool managed = apicd->is_managed;
+
+ /*
+ * Managed interrupts are usually not migrated away
+ * from an online CPU, but CPU isolation 'managed_irq'
+ * can make that happen.
+ * 1) Activation does not take the isolation into account
+ * to keep the code simple
+ * 2) Migration away from an isolated CPU can happen when
+ * a non-isolated CPU which is in the calculated
+ * affinity mask comes online.
+ */
+ trace_vector_free_moved(apicd->irq, cpu, vector, managed);
+ irq_matrix_free(vector_matrix, cpu, vector, managed);
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+ hlist_del_init(&apicd->clist);
+ apicd->prev_vector = 0;
+ apicd->move_in_progress = 0;
+}
+
+/*
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
+ */
+static void apic_force_complete_move(struct irq_data *irqd)
+{
+ unsigned int cpu = smp_processor_id();
+ struct apic_chip_data *apicd;
+ unsigned int vector;
+
+ guard(raw_spinlock)(&vector_lock);
+ apicd = apic_chip_data(irqd);
+ if (!apicd)
+ return;
+
+ /*
+ * If prev_vector is empty or the descriptor is neither currently
+ * nor previously on the outgoing CPU no action required.
+ */
+ vector = apicd->prev_vector;
+ if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
+ return;
+
+ /*
+ * This is tricky. If the cleanup of the old vector has not been
+ * done yet, then the following setaffinity call will fail with
+ * -EBUSY. This can leave the interrupt in a stale state.
+ *
+ * All CPUs are stuck in stop machine with interrupts disabled so
+ * calling __irq_complete_move() would be completely pointless.
+ *
+ * 1) The interrupt is in move_in_progress state. That means that we
+ * have not seen an interrupt since the io_apic was reprogrammed to
+ * the new vector.
+ *
+ * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+ * have not been processed yet.
+ */
+ if (apicd->move_in_progress) {
+ /*
+ * In theory there is a race:
+ *
+ * set_ioapic(new_vector) <-- Interrupt is raised before update
+ * is effective, i.e. it's raised on
+ * the old vector.
+ *
+ * So if the target cpu cannot handle that interrupt before
+ * the old vector is cleaned up, we get a spurious interrupt
+ * and in the worst case the ioapic irq line becomes stale.
+ *
+ * But in case of cpu hotplug this should be a non issue
+ * because if the affinity update happens right before all
+ * cpus rendezvous in stop machine, there is no way that the
+ * interrupt can be blocked on the target cpu because all cpus
+ * loops first with interrupts enabled in stop machine, so the
+ * old vector is not yet cleaned up when the interrupt fires.
+ *
+ * So the only way to run into this issue is if the delivery
+ * of the interrupt on the apic/system bus would be delayed
+ * beyond the point where the target cpu disables interrupts
+ * in stop machine. I doubt that it can happen, but at least
+ * there is a theoretical chance. Virtualization might be
+ * able to expose this, but AFAICT the IOAPIC emulation is not
+ * as stupid as the real hardware.
+ *
+ * Anyway, there is nothing we can do about that at this point
+ * w/o refactoring the whole fixup_irq() business completely.
+ * We print at least the irq number and the old vector number,
+ * so we have the necessary information when a problem in that
+ * area arises.
+ */
+ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+ irqd->irq, vector);
+ }
+ free_moved_vector(apicd);
+}
+
#else
-# define apic_set_affinity NULL
+# define apic_set_affinity NULL
+# define apic_force_complete_move NULL
#endif
static int apic_retrigger_irq(struct irq_data *irqd)
@@ -923,39 +1024,16 @@ static void x86_vector_msi_compose_msg(struct irq_data *data,
}
static struct irq_chip lapic_controller = {
- .name = "APIC",
- .irq_ack = apic_ack_edge,
- .irq_set_affinity = apic_set_affinity,
- .irq_compose_msi_msg = x86_vector_msi_compose_msg,
- .irq_retrigger = apic_retrigger_irq,
+ .name = "APIC",
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = apic_set_affinity,
+ .irq_compose_msi_msg = x86_vector_msi_compose_msg,
+ .irq_force_complete_move = apic_force_complete_move,
+ .irq_retrigger = apic_retrigger_irq,
};
#ifdef CONFIG_SMP
-static void free_moved_vector(struct apic_chip_data *apicd)
-{
- unsigned int vector = apicd->prev_vector;
- unsigned int cpu = apicd->prev_cpu;
- bool managed = apicd->is_managed;
-
- /*
- * Managed interrupts are usually not migrated away
- * from an online CPU, but CPU isolation 'managed_irq'
- * can make that happen.
- * 1) Activation does not take the isolation into account
- * to keep the code simple
- * 2) Migration away from an isolated CPU can happen when
- * a non-isolated CPU which is in the calculated
- * affinity mask comes online.
- */
- trace_vector_free_moved(apicd->irq, cpu, vector, managed);
- irq_matrix_free(vector_matrix, cpu, vector, managed);
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
- hlist_del_init(&apicd->clist);
- apicd->prev_vector = 0;
- apicd->move_in_progress = 0;
-}
-
static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
{
struct apic_chip_data *apicd;
@@ -1068,99 +1146,6 @@ void irq_complete_move(struct irq_cfg *cfg)
__vector_schedule_cleanup(apicd);
}
-/*
- * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
- */
-void irq_force_complete_move(struct irq_desc *desc)
-{
- unsigned int cpu = smp_processor_id();
- struct apic_chip_data *apicd;
- struct irq_data *irqd;
- unsigned int vector;
-
- /*
- * The function is called for all descriptors regardless of which
- * irqdomain they belong to. For example if an IRQ is provided by
- * an irq_chip as part of a GPIO driver, the chip data for that
- * descriptor is specific to the irq_chip in question.
- *
- * Check first that the chip_data is what we expect
- * (apic_chip_data) before touching it any further.
- */
- irqd = irq_domain_get_irq_data(x86_vector_domain,
- irq_desc_get_irq(desc));
- if (!irqd)
- return;
-
- raw_spin_lock(&vector_lock);
- apicd = apic_chip_data(irqd);
- if (!apicd)
- goto unlock;
-
- /*
- * If prev_vector is empty or the descriptor is neither currently
- * nor previously on the outgoing CPU no action required.
- */
- vector = apicd->prev_vector;
- if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
- goto unlock;
-
- /*
- * This is tricky. If the cleanup of the old vector has not been
- * done yet, then the following setaffinity call will fail with
- * -EBUSY. This can leave the interrupt in a stale state.
- *
- * All CPUs are stuck in stop machine with interrupts disabled so
- * calling __irq_complete_move() would be completely pointless.
- *
- * 1) The interrupt is in move_in_progress state. That means that we
- * have not seen an interrupt since the io_apic was reprogrammed to
- * the new vector.
- *
- * 2) The interrupt has fired on the new vector, but the cleanup IPIs
- * have not been processed yet.
- */
- if (apicd->move_in_progress) {
- /*
- * In theory there is a race:
- *
- * set_ioapic(new_vector) <-- Interrupt is raised before update
- * is effective, i.e. it's raised on
- * the old vector.
- *
- * So if the target cpu cannot handle that interrupt before
- * the old vector is cleaned up, we get a spurious interrupt
- * and in the worst case the ioapic irq line becomes stale.
- *
- * But in case of cpu hotplug this should be a non issue
- * because if the affinity update happens right before all
- * cpus rendezvous in stop machine, there is no way that the
- * interrupt can be blocked on the target cpu because all cpus
- * loops first with interrupts enabled in stop machine, so the
- * old vector is not yet cleaned up when the interrupt fires.
- *
- * So the only way to run into this issue is if the delivery
- * of the interrupt on the apic/system bus would be delayed
- * beyond the point where the target cpu disables interrupts
- * in stop machine. I doubt that it can happen, but at least
- * there is a theoretical chance. Virtualization might be
- * able to expose this, but AFAICT the IOAPIC emulation is not
- * as stupid as the real hardware.
- *
- * Anyway, there is nothing we can do about that at this point
- * w/o refactoring the whole fixup_irq() business completely.
- * We print at least the irq number and the old vector number,
- * so we have the necessary information when a problem in that
- * area arises.
- */
- pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
- irqd->irq, vector);
- }
- free_moved_vector(apicd);
-unlock:
- raw_spin_unlock(&vector_lock);
-}
-
#ifdef CONFIG_HOTPLUG_CPU
/*
* Note, this is not accurate accounting, but at least good enough to
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index a98020bf31bb..ad4ea6fb3b6c 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -107,11 +107,6 @@ static void __used common(void)
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
- OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
- OFFSET(X86_current_task, pcpu_hot, current_task);
-#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
- OFFSET(X86_call_depth, pcpu_hot, call_depth);
-#endif
#if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64)
/* Offset for fields in aria_ctx */
BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bb65371ea9df..590b6cd0eac0 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -54,11 +54,5 @@ int main(void)
BLANK();
#undef ENTRY
- BLANK();
-
-#ifdef CONFIG_STACKPROTECTOR
- OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary);
- BLANK();
-#endif
return 0;
}
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 3fed7ae58b60..73274d76ce16 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -8,6 +8,7 @@
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/acpi.h>
+#include <linux/bitops.h>
#include <asm/io.h>
#include <linux/mc146818rtc.h>
@@ -20,27 +21,13 @@
int sbf_port __initdata = -1; /* set via acpi_boot_init() */
-static int __init parity(u8 v)
-{
- int x = 0;
- int i;
-
- for (i = 0; i < 8; i++) {
- x ^= (v & 1);
- v >>= 1;
- }
-
- return x;
-}
-
static void __init sbf_write(u8 v)
{
unsigned long flags;
if (sbf_port != -1) {
- v &= ~SBF_PARITY;
- if (!parity(v))
- v |= SBF_PARITY;
+ if (!parity8(v))
+ v ^= SBF_PARITY;
printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
sbf_port, v);
@@ -66,14 +53,14 @@ static u8 __init sbf_read(void)
return v;
}
-static int __init sbf_value_valid(u8 v)
+static bool __init sbf_value_valid(u8 v)
{
if (v & SBF_RESERVED) /* Reserved bits */
- return 0;
- if (!parity(v))
- return 0;
+ return false;
+ if (!parity8(v))
+ return false;
- return 1;
+ return true;
}
static int __init sbf_init(void)
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index 8418a892d195..25ae54250112 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -240,21 +240,10 @@ patch_call_sites(s32 *start, s32 *end, const struct core_text *ct)
}
static __init_or_module void
-patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end,
- const struct core_text *ct)
-{
- struct alt_instr *a;
-
- for (a = start; a < end; a++)
- patch_call((void *)&a->instr_offset + a->instr_offset, ct);
-}
-
-static __init_or_module void
callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct)
{
prdbg("Patching call sites %s\n", ct->name);
patch_call_sites(cs->call_start, cs->call_end, ct);
- patch_alt_call_sites(cs->alt_start, cs->alt_end, ct);
prdbg("Patching call sites done%s\n", ct->name);
}
@@ -263,8 +252,6 @@ void __init callthunks_patch_builtin_calls(void)
struct callthunk_sites cs = {
.call_start = __call_sites,
.call_end = __call_sites_end,
- .alt_start = __alt_instructions,
- .alt_end = __alt_instructions_end
};
if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c
index e6bf78fac146..77086cf565ec 100644
--- a/arch/x86/kernel/cfi.c
+++ b/arch/x86/kernel/cfi.c
@@ -67,16 +67,30 @@ static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target,
*/
enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
{
- unsigned long target;
+ unsigned long target, addr = regs->ip;
u32 type;
- if (!is_cfi_trap(regs->ip))
- return BUG_TRAP_TYPE_NONE;
+ switch (cfi_mode) {
+ case CFI_KCFI:
+ if (!is_cfi_trap(addr))
+ return BUG_TRAP_TYPE_NONE;
+
+ if (!decode_cfi_insn(regs, &target, &type))
+ return report_cfi_failure_noaddr(regs, addr);
+
+ break;
- if (!decode_cfi_insn(regs, &target, &type))
- return report_cfi_failure_noaddr(regs, regs->ip);
+ case CFI_FINEIBT:
+ if (!decode_fineibt_insn(regs, &target, &type))
+ return BUG_TRAP_TYPE_NONE;
+
+ break;
+
+ default:
+ return BUG_TRAP_TYPE_NONE;
+ }
- return report_cfi_failure(regs, regs->ip, &target, type);
+ return report_cfi_failure(regs, addr, &target, type);
}
/*
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 54194f5995de..79569f72b8ee 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -29,6 +29,8 @@
#include "cpu.h"
+u16 invlpgb_count_max __ro_after_init;
+
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -632,7 +634,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c)
* (model = 0x14) and later actually support it.
* (AMD Erratum #110, docId: 25759).
*/
- if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) {
clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
if (!rdmsrl_amd_safe(0xc001100d, &value)) {
value &= ~BIT_64(32);
@@ -1073,6 +1075,10 @@ static void init_amd(struct cpuinfo_x86 *c)
/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
+ /* Enable Translation Cache Extension */
+ if (cpu_has(c, X86_FEATURE_TCE))
+ msr_set_bit(MSR_EFER, _EFER_TCE);
}
#ifdef CONFIG_X86_32
@@ -1105,8 +1111,8 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
- tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
- tlb_lli_4k[ENTRIES] = ebx & mask;
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
/*
* K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
@@ -1119,26 +1125,30 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!((eax >> 16) & mask))
- tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
else
- tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+ tlb_lld_2m = (eax >> 16) & mask;
/* a 4M entry uses two 2M entries */
- tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+ tlb_lld_4m = tlb_lld_2m >> 1;
/* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!(eax & mask)) {
/* Erratum 658 */
if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
- tlb_lli_2m[ENTRIES] = 1024;
+ tlb_lli_2m = 1024;
} else {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
- tlb_lli_2m[ENTRIES] = eax & 0xff;
+ tlb_lli_2m = eax & 0xff;
}
} else
- tlb_lli_2m[ENTRIES] = eax & mask;
+ tlb_lli_2m = eax & mask;
+
+ tlb_lli_4m = tlb_lli_2m >> 1;
- tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ if (cpu_has(c, X86_FEATURE_INVLPGB))
+ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1;
}
static const struct cpu_dev amd_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index f642de2ebdac..6cf31a1649c4 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -498,7 +498,7 @@ void arch_scale_freq_tick(void)
*/
#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
-unsigned int arch_freq_get_on_cpu(int cpu)
+int arch_freq_get_on_cpu(int cpu)
{
struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
unsigned int seq, freq;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index a5d0998d7604..4386aa6c69e1 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -113,6 +113,10 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
/* Control unconditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+/* Control IBPB on vCPU load */
+DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
+EXPORT_SYMBOL_GPL(switch_vcpu_ibpb);
+
/* Control MDS CPU buffer clear before idling (halt, mwait) */
DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
EXPORT_SYMBOL_GPL(mds_idle_clear);
@@ -234,7 +238,7 @@ static void x86_amd_ssb_disable(void)
/* Default mitigation for MDS-affected CPUs */
static enum mds_mitigations mds_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_FULL : MDS_MITIGATION_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF;
static bool mds_nosmt __ro_after_init = false;
static const char * const mds_strings[] = {
@@ -243,6 +247,40 @@ static const char * const mds_strings[] = {
[MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
};
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_AUTO,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_AUTO : TAA_MITIGATION_OFF;
+
+enum mmio_mitigations {
+ MMIO_MITIGATION_OFF,
+ MMIO_MITIGATION_AUTO,
+ MMIO_MITIGATION_UCODE_NEEDED,
+ MMIO_MITIGATION_VERW,
+};
+
+/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
+static enum mmio_mitigations mmio_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_AUTO : MMIO_MITIGATION_OFF;
+
+enum rfds_mitigations {
+ RFDS_MITIGATION_OFF,
+ RFDS_MITIGATION_AUTO,
+ RFDS_MITIGATION_VERW,
+ RFDS_MITIGATION_UCODE_NEEDED,
+};
+
+/* Default mitigation for Register File Data Sampling */
+static enum rfds_mitigations rfds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF;
+
static void __init mds_select_mitigation(void)
{
if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
@@ -250,6 +288,9 @@ static void __init mds_select_mitigation(void)
return;
}
+ if (mds_mitigation == MDS_MITIGATION_AUTO)
+ mds_mitigation = MDS_MITIGATION_FULL;
+
if (mds_mitigation == MDS_MITIGATION_FULL) {
if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
mds_mitigation = MDS_MITIGATION_VMWERV;
@@ -286,16 +327,6 @@ early_param("mds", mds_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "TAA: " fmt
-enum taa_mitigations {
- TAA_MITIGATION_OFF,
- TAA_MITIGATION_UCODE_NEEDED,
- TAA_MITIGATION_VERW,
- TAA_MITIGATION_TSX_DISABLED,
-};
-
-/* Default mitigation for TAA-affected CPUs */
-static enum taa_mitigations taa_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_VERW : TAA_MITIGATION_OFF;
static bool taa_nosmt __ro_after_init;
static const char * const taa_strings[] = {
@@ -386,15 +417,6 @@ early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "MMIO Stale Data: " fmt
-enum mmio_mitigations {
- MMIO_MITIGATION_OFF,
- MMIO_MITIGATION_UCODE_NEEDED,
- MMIO_MITIGATION_VERW,
-};
-
-/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
-static enum mmio_mitigations mmio_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_VERW : MMIO_MITIGATION_OFF;
static bool mmio_nosmt __ro_after_init = false;
static const char * const mmio_strings[] = {
@@ -483,16 +505,6 @@ early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "Register File Data Sampling: " fmt
-enum rfds_mitigations {
- RFDS_MITIGATION_OFF,
- RFDS_MITIGATION_VERW,
- RFDS_MITIGATION_UCODE_NEEDED,
-};
-
-/* Default mitigation for Register File Data Sampling */
-static enum rfds_mitigations rfds_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF;
-
static const char * const rfds_strings[] = {
[RFDS_MITIGATION_OFF] = "Vulnerable",
[RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File",
@@ -508,6 +520,9 @@ static void __init rfds_select_mitigation(void)
if (rfds_mitigation == RFDS_MITIGATION_OFF)
return;
+ if (rfds_mitigation == RFDS_MITIGATION_AUTO)
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
else
@@ -1293,9 +1308,13 @@ static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd;
static enum spectre_v2_user_cmd __init
spectre_v2_parse_user_cmdline(void)
{
+ enum spectre_v2_user_cmd mode;
char arg[20];
int ret, i;
+ mode = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ?
+ SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE;
+
switch (spectre_v2_cmd) {
case SPECTRE_V2_CMD_NONE:
return SPECTRE_V2_USER_CMD_NONE;
@@ -1308,7 +1327,7 @@ spectre_v2_parse_user_cmdline(void)
ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
arg, sizeof(arg));
if (ret < 0)
- return SPECTRE_V2_USER_CMD_AUTO;
+ return mode;
for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) {
if (match_option(arg, ret, v2_user_options[i].option)) {
@@ -1318,8 +1337,8 @@ spectre_v2_parse_user_cmdline(void)
}
}
- pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg);
- return SPECTRE_V2_USER_CMD_AUTO;
+ pr_err("Unknown user space protection option (%s). Switching to default\n", arg);
+ return mode;
}
static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
@@ -1331,16 +1350,11 @@ static void __init
spectre_v2_user_select_mitigation(void)
{
enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
- bool smt_possible = IS_ENABLED(CONFIG_SMP);
enum spectre_v2_user_cmd cmd;
if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
return;
- if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
- cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
- smt_possible = false;
-
cmd = spectre_v2_parse_user_cmdline();
switch (cmd) {
case SPECTRE_V2_USER_CMD_NONE:
@@ -1364,7 +1378,7 @@ spectre_v2_user_select_mitigation(void)
/* Initialize Indirect Branch Prediction Barrier */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
- setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+ static_branch_enable(&switch_vcpu_ibpb);
spectre_v2_user_ibpb = mode;
switch (cmd) {
@@ -1401,7 +1415,7 @@ spectre_v2_user_select_mitigation(void)
* so allow for STIBP to be selected in those cases.
*/
if (!boot_cpu_has(X86_FEATURE_STIBP) ||
- !smt_possible ||
+ !cpu_smt_possible() ||
(spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
!boot_cpu_has(X86_FEATURE_AUTOIBRS)))
return;
@@ -1973,6 +1987,7 @@ void cpu_bugs_smt_update(void)
switch (mds_mitigation) {
case MDS_MITIGATION_FULL:
+ case MDS_MITIGATION_AUTO:
case MDS_MITIGATION_VMWERV:
if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
pr_warn_once(MDS_MSG_SMT);
@@ -1984,6 +1999,7 @@ void cpu_bugs_smt_update(void)
switch (taa_mitigation) {
case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_AUTO:
case TAA_MITIGATION_UCODE_NEEDED:
if (sched_smt_active())
pr_warn_once(TAA_MSG_SMT);
@@ -1995,6 +2011,7 @@ void cpu_bugs_smt_update(void)
switch (mmio_mitigation) {
case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_AUTO:
case MMIO_MITIGATION_UCODE_NEEDED:
if (sched_smt_active())
pr_warn_once(MMIO_MSG_SMT);
@@ -2522,6 +2539,7 @@ enum srso_mitigation {
SRSO_MITIGATION_SAFE_RET,
SRSO_MITIGATION_IBPB,
SRSO_MITIGATION_IBPB_ON_VMEXIT,
+ SRSO_MITIGATION_BP_SPEC_REDUCE,
};
enum srso_mitigation_cmd {
@@ -2539,7 +2557,8 @@ static const char * const srso_strings[] = {
[SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
[SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
[SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
- [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only"
+ [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only",
+ [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
};
static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE;
@@ -2578,7 +2597,7 @@ static void __init srso_select_mitigation(void)
srso_cmd == SRSO_CMD_OFF) {
if (boot_cpu_has(X86_FEATURE_SBPB))
x86_pred_cmd = PRED_CMD_SBPB;
- return;
+ goto out;
}
if (has_microcode) {
@@ -2590,7 +2609,7 @@ static void __init srso_select_mitigation(void)
*/
if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
- return;
+ goto out;
}
if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
@@ -2670,6 +2689,12 @@ static void __init srso_select_mitigation(void)
ibpb_on_vmexit:
case SRSO_CMD_IBPB_ON_VMEXIT:
+ if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) {
+ pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n");
+ srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE;
+ break;
+ }
+
if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
if (has_microcode) {
setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
@@ -2691,7 +2716,15 @@ ibpb_on_vmexit:
}
out:
- pr_info("%s\n", srso_strings[srso_mitigation]);
+ /*
+ * Clear the feature flag if this mitigation is not selected as that
+ * feature flag controls the BpSpecReduce MSR bit toggling in KVM.
+ */
+ if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE)
+ setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE);
+
+ if (srso_mitigation != SRSO_MITIGATION_NONE)
+ pr_info("%s\n", srso_strings[srso_mitigation]);
}
#undef pr_fmt
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
index 6cba85c79d42..97222efb4d2a 100644
--- a/arch/x86/kernel/cpu/bus_lock.c
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -192,7 +192,13 @@ static void __split_lock_reenable(struct work_struct *work)
{
sld_update_msr(true);
}
-static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable);
+/*
+ * In order for each CPU to schedule its delayed work independently of the
+ * others, delayed work struct must be per-CPU. This is not required when
+ * sysctl_sld_mitigate is enabled because of the semaphore that limits
+ * the number of simultaneously scheduled delayed works to 1.
+ */
+static DEFINE_PER_CPU(struct delayed_work, sl_reenable);
/*
* If a CPU goes offline with pending delayed work to re-enable split lock
@@ -213,7 +219,7 @@ static int splitlock_cpu_offline(unsigned int cpu)
static void split_lock_warn(unsigned long ip)
{
- struct delayed_work *work;
+ struct delayed_work *work = NULL;
int cpu;
if (!current->reported_split_lock)
@@ -235,11 +241,17 @@ static void split_lock_warn(unsigned long ip)
if (down_interruptible(&buslock_sem) == -EINTR)
return;
work = &sl_reenable_unlock;
- } else {
- work = &sl_reenable;
}
cpu = get_cpu();
+
+ if (!work) {
+ work = this_cpu_ptr(&sl_reenable);
+ /* Deferred initialization of per-CPU struct */
+ if (!work->work.func)
+ INIT_DELAYED_WORK(work, __split_lock_reenable);
+ }
+
schedule_delayed_work_on(cpu, work, 2);
/* Disable split lock detection on this CPU to make progress */
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index a6c6bccfa8b8..b3a520959b51 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -8,21 +8,19 @@
* Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
*/
-#include <linux/slab.h>
#include <linux/cacheinfo.h>
+#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cpuhotplug.h>
-#include <linux/sched.h>
-#include <linux/capability.h>
-#include <linux/sysfs.h>
#include <linux/pci.h>
#include <linux/stop_machine.h>
+#include <linux/sysfs.h>
-#include <asm/cpufeature.h>
-#include <asm/cacheinfo.h>
#include <asm/amd_nb.h>
-#include <asm/smp.h>
+#include <asm/cacheinfo.h>
+#include <asm/cpufeature.h>
#include <asm/mtrr.h>
+#include <asm/smp.h>
#include <asm/tlbflush.h>
#include "cpu.h"
@@ -31,7 +29,6 @@
#define LVL_1_DATA 2
#define LVL_2 3
#define LVL_3 4
-#define LVL_TRACE 5
/* Shared last level cache maps */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
@@ -96,10 +93,6 @@ static const struct _cache_table cache_table[] =
{ 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
{ 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
{ 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */
- { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
- { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
- { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
{ 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
{ 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
{ 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
@@ -787,19 +780,13 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
}
}
}
- /*
- * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
- * trace cache
- */
- if ((!ci->num_leaves || c->x86 == 15) && c->cpuid_level > 1) {
+
+ /* Don't use CPUID(2) if CPUID(4) is supported. */
+ if (!ci->num_leaves && c->cpuid_level > 1) {
/* supports eax=2 call */
int j, n;
unsigned int regs[4];
unsigned char *dp = (unsigned char *)regs;
- int only_trace = 0;
-
- if (ci->num_leaves && c->x86 == 15)
- only_trace = 1;
/* Number of times to iterate */
n = cpuid_eax(2) & 0xFF;
@@ -820,8 +807,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
/* look up this descriptor in the table */
while (cache_table[k].descriptor != 0) {
if (cache_table[k].descriptor == des) {
- if (only_trace && cache_table[k].cache_type != LVL_TRACE)
- break;
switch (cache_table[k].cache_type) {
case LVL_1_INST:
l1i += cache_table[k].size;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7cce91b19fb2..12126adbc3a9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -667,8 +667,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
if (!warn)
continue;
- pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
- x86_cap_flag(df->feature), df->level);
+ pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+ x86_cap_flags[df->feature], df->level);
}
}
@@ -846,13 +846,13 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
c->x86_cache_size = l2size;
}
-u16 __read_mostly tlb_lli_4k[NR_INFO];
-u16 __read_mostly tlb_lli_2m[NR_INFO];
-u16 __read_mostly tlb_lli_4m[NR_INFO];
-u16 __read_mostly tlb_lld_4k[NR_INFO];
-u16 __read_mostly tlb_lld_2m[NR_INFO];
-u16 __read_mostly tlb_lld_4m[NR_INFO];
-u16 __read_mostly tlb_lld_1g[NR_INFO];
+u16 __read_mostly tlb_lli_4k;
+u16 __read_mostly tlb_lli_2m;
+u16 __read_mostly tlb_lli_4m;
+u16 __read_mostly tlb_lld_4k;
+u16 __read_mostly tlb_lld_2m;
+u16 __read_mostly tlb_lld_4m;
+u16 __read_mostly tlb_lld_1g;
static void cpu_detect_tlb(struct cpuinfo_x86 *c)
{
@@ -860,12 +860,10 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
this_cpu->c_detect_tlb(c);
pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
- tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
- tlb_lli_4m[ENTRIES]);
+ tlb_lli_4k, tlb_lli_2m, tlb_lli_4m);
pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
- tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
- tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
+ tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g);
}
void get_cpu_vendor(struct cpuinfo_x86 *c)
@@ -1164,7 +1162,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB),
- VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2,NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY),
VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
@@ -1205,6 +1203,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \
X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues)
+#define VULNBL_INTEL_TYPE(vfm, cpu_type, issues) \
+ X86_MATCH_VFM_CPU_TYPE(vfm, INTEL_CPU_TYPE_##cpu_type, issues)
+
#define VULNBL_AMD(family, blacklist) \
VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
@@ -1253,9 +1254,9 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS),
VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS),
VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS),
- VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS),
@@ -1331,8 +1332,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
- if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) {
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER);
+ }
if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
!(x86_arch_cap_msr & ARCH_CAP_SSB_NO) &&
@@ -1479,15 +1482,96 @@ static void detect_nopl(void)
#endif
}
+static inline bool parse_set_clear_cpuid(char *arg, bool set)
+{
+ char *opt;
+ int taint = 0;
+
+ while (arg) {
+ bool found __maybe_unused = false;
+ unsigned int bit;
+
+ opt = strsep(&arg, ",");
+
+ /*
+ * Handle naked numbers first for feature flags which don't
+ * have names. It doesn't make sense for a bug not to have a
+ * name so don't handle bug flags here.
+ */
+ if (!kstrtouint(opt, 10, &bit)) {
+ if (bit < NCAPINTS * 32) {
+
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU feature flag:");
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU feature flag:");
+ setup_clear_cpu_cap(bit);
+ }
+ /* empty-string, i.e., ""-defined feature flags */
+ if (!x86_cap_flags[bit])
+ pr_cont(" %d:%d\n", bit >> 5, bit & 31);
+ else
+ pr_cont(" %s\n", x86_cap_flags[bit]);
+
+ taint++;
+ }
+ /*
+ * The assumption is that there are no feature names with only
+ * numbers in the name thus go to the next argument.
+ */
+ continue;
+ }
+
+ for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) {
+ const char *flag;
+ const char *kind;
+
+ if (bit < 32 * NCAPINTS) {
+ flag = x86_cap_flags[bit];
+ kind = "feature";
+ } else {
+ kind = "bug";
+ flag = x86_bug_flags[bit - (32 * NCAPINTS)];
+ }
+
+ if (!flag)
+ continue;
+
+ if (strcmp(flag, opt))
+ continue;
+
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_clear_cpu_cap(bit);
+ }
+ taint++;
+ found = true;
+ break;
+ }
+
+ if (!found)
+ pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt);
+ }
+
+ return taint;
+}
+
+
/*
* We parse cpu parameters early because fpu__init_system() is executed
* before parse_early_param().
*/
static void __init cpu_parse_early_param(void)
{
+ bool cpuid_taint = false;
char arg[128];
- char *argptr = arg, *opt;
- int arglen, taint = 0;
+ int arglen;
#ifdef CONFIG_X86_32
if (cmdline_find_option_bool(boot_command_line, "no387"))
@@ -1519,61 +1603,17 @@ static void __init cpu_parse_early_param(void)
setup_clear_cpu_cap(X86_FEATURE_FRED);
arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
- if (arglen <= 0)
- return;
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, false);
- pr_info("Clearing CPUID bits:");
-
- while (argptr) {
- bool found __maybe_unused = false;
- unsigned int bit;
+ arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg));
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, true);
- opt = strsep(&argptr, ",");
-
- /*
- * Handle naked numbers first for feature flags which don't
- * have names.
- */
- if (!kstrtouint(opt, 10, &bit)) {
- if (bit < NCAPINTS * 32) {
-
- /* empty-string, i.e., ""-defined feature flags */
- if (!x86_cap_flags[bit])
- pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit));
- else
- pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
-
- setup_clear_cpu_cap(bit);
- taint++;
- }
- /*
- * The assumption is that there are no feature names with only
- * numbers in the name thus go to the next argument.
- */
- continue;
- }
-
- for (bit = 0; bit < 32 * NCAPINTS; bit++) {
- if (!x86_cap_flag(bit))
- continue;
-
- if (strcmp(x86_cap_flag(bit), opt))
- continue;
-
- pr_cont(" %s", opt);
- setup_clear_cpu_cap(bit);
- taint++;
- found = true;
- break;
- }
-
- if (!found)
- pr_cont(" (unknown: %s)", opt);
- }
- pr_cont("\n");
-
- if (taint)
+ if (cpuid_taint) {
+ pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n");
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
}
/*
@@ -1610,6 +1650,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
c->cpu_index = 0;
filter_cpuid_features(c, false);
+ check_cpufeature_deps(c);
if (this_cpu->c_bsp_init)
this_cpu->c_bsp_init(c);
@@ -1870,6 +1911,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Filter out anything that depends on CPUID levels we don't have */
filter_cpuid_features(c, true);
+ /* Check for unmet dependencies based on the CPUID dependency table */
+ check_cpufeature_deps(c);
+
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
const char *p;
@@ -1962,9 +2006,15 @@ static __init void identify_boot_cpu(void)
lkgs_init();
}
-void identify_secondary_cpu(struct cpuinfo_x86 *c)
+void identify_secondary_cpu(unsigned int cpu)
{
- BUG_ON(c == &boot_cpu_data);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /* Copy boot_cpu_data only on the first bringup */
+ if (!c->initialized)
+ *c = boot_cpu_data;
+ c->cpu_index = cpu;
+
identify_cpu(c);
#ifdef CONFIG_X86_32
enable_sep_cpu();
@@ -1975,6 +2025,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
update_gds_msr();
tsx_ap_init();
+ c->initialized = true;
}
void print_cpu_info(struct cpuinfo_x86 *c)
@@ -2005,27 +2056,40 @@ void print_cpu_info(struct cpuinfo_x86 *c)
}
/*
- * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy
- * function prevents it from becoming an environment variable for init.
+ * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param().
+ * These dummy functions prevent them from becoming an environment variable for
+ * init.
*/
+
static __init int setup_clearcpuid(char *arg)
{
return 1;
}
__setup("clearcpuid=", setup_clearcpuid);
-DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = {
- .current_task = &init_task,
- .preempt_count = INIT_PREEMPT_COUNT,
- .top_of_stack = TOP_OF_INIT_STACK,
-};
-EXPORT_PER_CPU_SYMBOL(pcpu_hot);
-EXPORT_PER_CPU_SYMBOL(const_pcpu_hot);
+static __init int setup_setcpuid(char *arg)
+{
+ return 1;
+}
+__setup("setcpuid=", setup_setcpuid);
+
+DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+EXPORT_PER_CPU_SYMBOL(const_current_task);
+
+DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
#ifdef CONFIG_X86_64
-DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
- fixed_percpu_data) __aligned(PAGE_SIZE) __visible;
-EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);
+/*
+ * Note: Do not make this dependant on CONFIG_MITIGATION_CALL_DEPTH_TRACKING
+ * so that this space is reserved in the hot cache section even when the
+ * mitigation is disabled.
+ */
+DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth);
+EXPORT_PER_CPU_SYMBOL(__x86_call_depth);
static void wrmsrl_cstar(unsigned long val)
{
@@ -2089,18 +2153,15 @@ void syscall_init(void)
if (!cpu_feature_enabled(X86_FEATURE_FRED))
idt_syscall_init();
}
-
-#else /* CONFIG_X86_64 */
+#endif /* CONFIG_X86_64 */
#ifdef CONFIG_STACKPROTECTOR
-DEFINE_PER_CPU(unsigned long, __stack_chk_guard);
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard);
#ifndef CONFIG_SMP
EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
#endif
#endif
-#endif /* CONFIG_X86_64 */
-
/*
* Clear all 6 debug registers:
*/
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 1beccefbaff9..51deb60a9d26 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -33,14 +33,6 @@ struct cpu_dev {
#endif
};
-struct _tlb_table {
- unsigned char descriptor;
- char tlb_type;
- unsigned int entries;
- /* unsigned int ways; */
- char info[128];
-};
-
#define cpu_dev_register(cpu_devX) \
static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
__section(".x86_cpu_dev.init") = \
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index df838e3bdbe0..a2fbea0be535 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -147,3 +147,38 @@ void setup_clear_cpu_cap(unsigned int feature)
{
do_clear_cpu_cap(NULL, feature);
}
+
+/*
+ * Return the feature "name" if available, otherwise return
+ * the X86_FEATURE_* numerals to make it easier to identify
+ * the feature.
+ */
+static const char *x86_feature_name(unsigned int feature, char *buf)
+{
+ if (x86_cap_flags[feature])
+ return x86_cap_flags[feature];
+
+ snprintf(buf, 16, "%d*32+%2d", feature / 32, feature % 32);
+
+ return buf;
+}
+
+void check_cpufeature_deps(struct cpuinfo_x86 *c)
+{
+ char feature_buf[16], depends_buf[16];
+ const struct cpuid_dep *d;
+
+ for (d = cpuid_deps; d->feature; d++) {
+ if (cpu_has(c, d->feature) && !cpu_has(c, d->depends)) {
+ /*
+ * Only warn about the first unmet dependency on the
+ * first CPU where it is encountered to avoid spamming
+ * the kernel log.
+ */
+ pr_warn_once("x86 CPU feature dependency check failure: CPU%d has '%s' enabled but '%s' disabled. Kernel might be fine, but no guarantees.\n",
+ smp_processor_id(),
+ x86_feature_name(d->feature, feature_buf),
+ x86_feature_name(d->depends, depends_buf));
+ }
+ }
+}
diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c
index cacfd3f6abef..1976fef2dfe5 100644
--- a/arch/x86/kernel/cpu/debugfs.c
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -16,8 +16,8 @@ static int cpu_debug_show(struct seq_file *m, void *p)
if (!c->initialized)
return 0;
- seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid);
- seq_printf(m, "apicid: %x\n", c->topo.apicid);
+ seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid);
+ seq_printf(m, "apicid: 0x%x\n", c->topo.apicid);
seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id);
seq_printf(m, "die_id: %u\n", c->topo.die_id);
seq_printf(m, "cu_id: %u\n", c->topo.cu_id);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index c5191b06f9f2..6af4a4a90a52 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -240,26 +240,26 @@ static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
- tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
- tlb_lli_4k[ENTRIES] = ebx & mask;
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!((eax >> 16) & mask))
- tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
else
- tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+ tlb_lld_2m = (eax >> 16) & mask;
/* a 4M entry uses two 2M entries */
- tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+ tlb_lld_4m = tlb_lld_2m >> 1;
/* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!(eax & mask)) {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
- tlb_lli_2m[ENTRIES] = eax & 0xff;
+ tlb_lli_2m = eax & 0xff;
} else
- tlb_lli_2m[ENTRIES] = eax & mask;
+ tlb_lli_2m = eax & mask;
- tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+ tlb_lli_4m = tlb_lli_2m >> 1;
}
static const struct cpu_dev hygon_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 134368a3f4b1..4cbb2e69bea1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,40 +1,31 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/pgtable.h>
-#include <linux/string.h>
#include <linux/bitops.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/thread_info.h>
#include <linux/init.h>
-#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/minmax.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+
+#ifdef CONFIG_X86_64
+#include <linux/topology.h>
+#endif
-#include <asm/cpufeature.h>
-#include <asm/msr.h>
#include <asm/bugs.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
#include <asm/cpu.h>
+#include <asm/hwcap2.h>
#include <asm/intel-family.h>
#include <asm/microcode.h>
-#include <asm/hwcap2.h>
-#include <asm/elf.h>
-#include <asm/cpu_device_id.h>
-#include <asm/resctrl.h>
+#include <asm/msr.h>
#include <asm/numa.h>
+#include <asm/resctrl.h>
#include <asm/thermal.h>
-
-#ifdef CONFIG_X86_64
-#include <linux/topology.h>
-#endif
+#include <asm/uaccess.h>
#include "cpu.h"
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#endif
-
/*
* Processors which have self-snooping capability can handle conflicting
* memory type across CPUs by snooping its own cache. However, there exists
@@ -195,7 +186,7 @@ void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;
- if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd))
+ if (c->x86_vfm < INTEL_PENTIUM_M_DOTHAN)
return;
/*
@@ -210,10 +201,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
c->microcode = intel_get_microcode_revision();
@@ -256,8 +243,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
#endif
/* CPUID workaround for 0F33/0F34 CPU */
- if (c->x86 == 0xF && c->x86_model == 0x3
- && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
+ if (c->x86_vfm == INTEL_P4_PRESCOTT &&
+ (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
c->x86_phys_bits = 36;
/*
@@ -266,10 +253,16 @@ static void early_init_intel(struct cpuinfo_x86 *c)
*
* It is also reliable across cores and sockets. (but not across
* cabinets - we turn it off in that case explicitly.)
+ *
+ * Use a model-specific check for some older CPUs that have invariant
+ * TSC but may not report it architecturally via 8000_0007.
*/
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_WILLAMETTE) ||
+ (c->x86_vfm >= INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
@@ -298,12 +291,19 @@ static void early_init_intel(struct cpuinfo_x86 *c)
clear_cpu_cap(c, X86_FEATURE_PAT);
/*
- * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
- * clear the fast string and enhanced fast string CPU capabilities.
+ * Modern CPUs are generally expected to have a sane fast string
+ * implementation. However, BIOSes typically have a knob to tweak
+ * the architectural MISC_ENABLE.FAST_STRING enable bit.
+ *
+ * Adhere to the preference and program the Linux-defined fast
+ * string flag and enhanced fast string capabilities accordingly.
*/
- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+ if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) {
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
+ /* X86_FEATURE_ERMS is set based on CPUID */
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ } else {
pr_info("Disabled fast string operations\n");
setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
setup_clear_cpu_cap(X86_FEATURE_ERMS);
@@ -350,9 +350,7 @@ static void bsp_init_intel(struct cpuinfo_x86 *c)
int ppro_with_ram_bug(void)
{
/* Uses data from early_cpu_detect now */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
+ if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
boot_cpu_data.x86_stepping < 8) {
pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
return 1;
@@ -369,9 +367,8 @@ static void intel_smp_check(struct cpuinfo_x86 *c)
/*
* Mask B, Pentium, but not Pentium MMX
*/
- if (c->x86 == 5 &&
- c->x86_stepping >= 1 && c->x86_stepping <= 4 &&
- c->x86_model <= 3) {
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_PENTIUM_MMX &&
+ c->x86_stepping >= 1 && c->x86_stepping <= 4) {
/*
* Remember we have B step Pentia with bugs
*/
@@ -398,7 +395,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* The Quark is also family 5, but does not have the same bug.
*/
clear_cpu_bug(c, X86_BUG_F00F);
- if (c->x86 == 5 && c->x86_model < 9) {
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_QUARK_X1000) {
static int f00f_workaround_enabled;
set_cpu_bug(c, X86_BUG_F00F);
@@ -413,7 +410,8 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
* model 3 mask 3
*/
- if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633)
+ if ((c->x86_vfm == INTEL_PENTIUM_II_KLAMATH && c->x86_stepping < 3) ||
+ c->x86_vfm < INTEL_PENTIUM_II_KLAMATH)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
@@ -431,7 +429,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* P4 Xeon erratum 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
- if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) {
+ if (c->x86_vfm == INTEL_P4_WILLAMETTE && c->x86_stepping == 1) {
if (msr_set_bit(MSR_IA32_MISC_ENABLE,
MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
pr_info("CPU: C0 stepping P4 Xeon detected.\n");
@@ -445,27 +443,20 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* integrated APIC (see 11AP erratum in "Pentium Processor
* Specification Update").
*/
- if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+ if (boot_cpu_has(X86_FEATURE_APIC) && c->x86_vfm == INTEL_PENTIUM_75 &&
(c->x86_stepping < 0x6 || c->x86_stepping == 0xb))
set_cpu_bug(c, X86_BUG_11AP);
-
#ifdef CONFIG_X86_INTEL_USERCOPY
/*
- * Set up the preferred alignment for movsl bulk memory moves
+ * MOVSL bulk memory moves can be slow when source and dest are not
+ * both 8-byte aligned. PII/PIII only like MOVSL with 8-byte alignment.
+ *
+ * Set the preferred alignment for Pentium Pro and newer processors, as
+ * it has only been tested on these.
*/
- switch (c->x86) {
- case 4: /* 486: untested */
- break;
- case 5: /* Old Pentia: untested */
- break;
- case 6: /* PII/PIII only like movsl with 8-byte alignment */
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO)
movsl_mask.mask = 7;
- break;
- case 15: /* P4 is OK down to 8-byte alignment */
- movsl_mask.mask = 7;
- break;
- }
#endif
intel_smp_check(c);
@@ -563,8 +554,6 @@ static void init_intel(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_64
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
#else
/*
* Names for the Pentium II/Celeron processors
@@ -622,14 +611,14 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
* to determine which, so we use a boottime override
* for the 512kb model, and assume 256 otherwise.
*/
- if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
+ if (c->x86_vfm == INTEL_PENTIUM_III_TUALATIN && size == 0)
size = 256;
/*
* Intel Quark SoC X1000 contains a 4-way set associative
* 16K cache with a 16 byte cache line and 256 lines per tag
*/
- if ((c->x86 == 5) && (c->x86_model == 9))
+ if (c->x86_vfm == INTEL_QUARK_X1000)
size = 16;
return size;
}
@@ -667,50 +656,58 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
*/
#define TLB_0x63_2M_4M_ENTRIES 32
+struct _tlb_table {
+ unsigned char descriptor;
+ char tlb_type;
+ unsigned int entries;
+};
+
static const struct _tlb_table intel_tlb_table[] = {
- { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
- { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
- { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
- { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
- { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
- { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
- { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" },
- { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
- { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
- { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
- { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
- { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
- { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
- { 0x63, TLB_DATA_1G_2M_4M, 4, " TLB_DATA 1 GByte pages, 4-way set associative"
- " (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here)" },
- { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" },
- { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" },
- { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" },
- { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
- { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
- { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
- { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
- { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
- { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
- { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" },
- { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" },
- { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
- { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
- { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
- { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" },
- { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
+ { 0x01, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages, 4-way set associative */
+ { 0x02, TLB_INST_4M, 2}, /* TLB_INST 4 MByte pages, full associative */
+ { 0x03, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way set associative */
+ { 0x04, TLB_DATA_4M, 8}, /* TLB_DATA 4 MByte pages, 4-way set associative */
+ { 0x05, TLB_DATA_4M, 32}, /* TLB_DATA 4 MByte pages, 4-way set associative */
+ { 0x0b, TLB_INST_4M, 4}, /* TLB_INST 4 MByte pages, 4-way set associative */
+ { 0x4f, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages */
+ { 0x50, TLB_INST_ALL, 64}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ { 0x51, TLB_INST_ALL, 128}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ { 0x52, TLB_INST_ALL, 256}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ { 0x55, TLB_INST_2M_4M, 7}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ { 0x56, TLB_DATA0_4M, 16}, /* TLB_DATA0 4 MByte pages, 4-way set associative */
+ { 0x57, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, 4-way associative */
+ { 0x59, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, fully associative */
+ { 0x5a, TLB_DATA0_2M_4M, 32}, /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */
+ { 0x5b, TLB_DATA_4K_4M, 64}, /* TLB_DATA 4 KByte and 4 MByte pages */
+ { 0x5c, TLB_DATA_4K_4M, 128}, /* TLB_DATA 4 KByte and 4 MByte pages */
+ { 0x5d, TLB_DATA_4K_4M, 256}, /* TLB_DATA 4 KByte and 4 MByte pages */
+ { 0x61, TLB_INST_4K, 48}, /* TLB_INST 4 KByte pages, full associative */
+ { 0x63, TLB_DATA_1G_2M_4M, 4}, /* TLB_DATA 1 GByte pages, 4-way set associative
+ * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */
+ { 0x6b, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 8-way associative */
+ { 0x6c, TLB_DATA_2M_4M, 128}, /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */
+ { 0x6d, TLB_DATA_1G, 16}, /* TLB_DATA 1 GByte pages, fully associative */
+ { 0x76, TLB_INST_2M_4M, 8}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ { 0xb0, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 4-way set associative */
+ { 0xb1, TLB_INST_2M_4M, 4}, /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */
+ { 0xb2, TLB_INST_4K, 64}, /* TLB_INST 4KByte pages, 4-way set associative */
+ { 0xb3, TLB_DATA_4K, 128}, /* TLB_DATA 4 KByte pages, 4-way set associative */
+ { 0xb4, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 4-way associative */
+ { 0xb5, TLB_INST_4K, 64}, /* TLB_INST 4 KByte pages, 8-way set associative */
+ { 0xb6, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 8-way set associative */
+ { 0xba, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way associative */
+ { 0xc0, TLB_DATA_4K_4M, 8}, /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */
+ { 0xc1, STLB_4K_2M, 1024}, /* STLB 4 KByte and 2 MByte pages, 8-way associative */
+ { 0xc2, TLB_DATA_2M_4M, 16}, /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */
+ { 0xca, STLB_4K, 512}, /* STLB 4 KByte pages, 4-way associative */
{ 0x00, 0, 0 }
};
static void intel_tlb_lookup(const unsigned char desc)
{
+ unsigned int entries;
unsigned char k;
+
if (desc == 0)
return;
@@ -722,81 +719,58 @@ static void intel_tlb_lookup(const unsigned char desc)
if (intel_tlb_table[k].tlb_type == 0)
return;
+ entries = intel_tlb_table[k].entries;
switch (intel_tlb_table[k].tlb_type) {
case STLB_4K:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
break;
case STLB_4K_2M:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_INST_ALL:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_INST_4K:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
break;
case TLB_INST_4M:
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_INST_2M_4M:
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_DATA_4K:
case TLB_DATA0_4K:
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4k = max(tlb_lld_4k, entries);
break;
case TLB_DATA_4M:
case TLB_DATA0_4M:
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_DATA_2M_4M:
case TLB_DATA0_2M_4M:
- if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_DATA_4K_4M:
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_DATA_1G_2M_4M:
- if (tlb_lld_2m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES)
- tlb_lld_2m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES;
- if (tlb_lld_4m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES)
- tlb_lld_4m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES;
+ tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES);
+ tlb_lld_4m = max(tlb_lld_4m, TLB_0x63_2M_4M_ENTRIES);
fallthrough;
case TLB_DATA_1G:
- if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_1g = max(tlb_lld_1g, entries);
break;
}
}
@@ -891,34 +865,3 @@ static const struct cpu_dev intel_cpu_dev = {
};
cpu_dev_register(intel_cpu_dev);
-
-#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24
-
-/**
- * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU
- *
- * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in
- * a hybrid processor. If the processor is not hybrid, returns 0.
- */
-u8 get_this_hybrid_cpu_type(void)
-{
- if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
- return 0;
-
- return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT;
-}
-
-/**
- * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU
- *
- * Returns the uarch native ID [23:0] of a CPU in a hybrid processor.
- * If the processor is not hybrid, returns 0.
- */
-u32 get_this_hybrid_cpu_native_id(void)
-{
- if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
- return 0;
-
- return cpuid_eax(0x0000001a) &
- (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1);
-}
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 4f3c65429f82..6af1e8baeb0f 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -6,6 +6,34 @@
#include <linux/slab.h>
/**
+ * x86_match_vendor_cpu_type - helper function to match the hardware defined
+ * cpu-type for a single entry in the x86_cpu_id
+ * table. Note, this function does not match the
+ * generic cpu-types TOPO_CPU_TYPE_EFFICIENCY and
+ * TOPO_CPU_TYPE_PERFORMANCE.
+ * @c: Pointer to the cpuinfo_x86 structure of the CPU to match.
+ * @m: Pointer to the x86_cpu_id entry to match against.
+ *
+ * Return: true if the cpu-type matches, false otherwise.
+ */
+static bool x86_match_vendor_cpu_type(struct cpuinfo_x86 *c, const struct x86_cpu_id *m)
+{
+ if (m->type == X86_CPU_TYPE_ANY)
+ return true;
+
+ /* Hybrid CPUs are special, they are assumed to match all cpu-types */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return true;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ return m->type == c->topo.intel_type;
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ return m->type == c->topo.amd_type;
+
+ return false;
+}
+
+/**
* x86_match_cpu - match current CPU against an array of x86_cpu_ids
* @match: Pointer to array of x86_cpu_ids. Last entry terminated with
* {}.
@@ -50,6 +78,8 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
continue;
if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
continue;
+ if (!x86_match_vendor_cpu_type(c, m))
+ continue;
return m;
}
return NULL;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 0dc00c9894c7..1f14c3308b6b 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -584,6 +584,28 @@ bool mce_is_correctable(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_correctable);
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */
+static bool mce_notify_irq(void)
+{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+ if (test_and_clear_bit(0, &mce_need_notify)) {
+ mce_work_trigger();
+
+ if (__ratelimit(&ratelimit))
+ pr_info(HW_ERR "Machine check events logged\n");
+
+ return true;
+ }
+
+ return false;
+}
+
static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -1773,28 +1795,6 @@ static void mce_timer_delete_all(void)
del_timer_sync(&per_cpu(mce_timer, cpu));
}
-/*
- * Notify the user(s) about new machine check events.
- * Can be called from interrupt context, but not from machine check/NMI
- * context.
- */
-bool mce_notify_irq(void)
-{
- /* Not more than two messages every minute */
- static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
- if (test_and_clear_bit(0, &mce_need_notify)) {
- mce_work_trigger();
-
- if (__ratelimit(&ratelimit))
- pr_info(HW_ERR "Machine check events logged\n");
-
- return true;
- }
- return false;
-}
-EXPORT_SYMBOL_GPL(mce_notify_irq);
-
static void __mcheck_cpu_mce_banks_init(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 313fe682db33..06e3cf7229ce 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -229,7 +229,6 @@ static int raise_local(void)
} else if (m->status) {
pr_info("Starting machine check poll CPU %d\n", cpu);
raise_poll(m);
- mce_notify_irq();
pr_info("Machine check poll done on CPU %d\n", cpu);
} else
m->finished = 0;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index f3d534807d91..819199bc0119 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -74,7 +74,7 @@ void intel_collect_cpu_info(struct cpu_signature *sig)
sig->pf = 0;
sig->rev = intel_get_microcode_revision();
- if (x86_model(sig->sig) >= 5 || x86_family(sig->sig) > 6) {
+ if (IFM(x86_family(sig->sig), x86_model(sig->sig)) >= INTEL_PENTIUM_III_DESCHUTES) {
unsigned int val[2];
/* get processor flags from MSR 0x17 */
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index f285757618fc..3e2533954675 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -33,8 +33,6 @@
#include <asm/numa.h>
#include <asm/svm.h>
-/* Is Linux running as the root partition? */
-bool hv_root_partition;
/* Is Linux running on nested Microsoft Hypervisor */
bool hv_nested;
struct ms_hyperv_info ms_hyperv;
@@ -109,6 +107,7 @@ void hv_set_msr(unsigned int reg, u64 value)
}
EXPORT_SYMBOL_GPL(hv_set_msr);
+static void (*mshv_handler)(void);
static void (*vmbus_handler)(void);
static void (*hv_stimer0_handler)(void);
static void (*hv_kexec_handler)(void);
@@ -119,6 +118,9 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
struct pt_regs *old_regs = set_irq_regs(regs);
inc_irq_stat(irq_hv_callback_count);
+ if (mshv_handler)
+ mshv_handler();
+
if (vmbus_handler)
vmbus_handler();
@@ -128,6 +130,11 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
set_irq_regs(old_regs);
}
+void hv_setup_mshv_handler(void (*handler)(void))
+{
+ mshv_handler = handler;
+}
+
void hv_setup_vmbus_handler(void (*handler)(void))
{
vmbus_handler = handler;
@@ -422,6 +429,7 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
return 0;
}
+EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
static void __init ms_hyperv_init_platform(void)
{
@@ -436,13 +444,15 @@ static void __init ms_hyperv_init_platform(void)
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
- pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
- ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
+ pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n",
+ ms_hyperv.features, ms_hyperv.priv_high,
+ ms_hyperv.ext_features, ms_hyperv.hints,
ms_hyperv.misc_features);
ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
@@ -451,25 +461,7 @@ static void __init ms_hyperv_init_platform(void)
pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
- /*
- * Check CPU management privilege.
- *
- * To mirror what Windows does we should extract CPU management
- * features and use the ReservedIdentityBit to detect if Linux is the
- * root partition. But that requires negotiating CPU management
- * interface (a process to be finalized). For now, use the privilege
- * flag as the indicator for running as root.
- *
- * Hyper-V should never specify running as root and as a Confidential
- * VM. But to protect against a compromised/malicious Hyper-V trying
- * to exploit root behavior to expose Confidential VM memory, ignore
- * the root partition setting if also a Confidential VM.
- */
- if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) &&
- !(ms_hyperv.priv_high & HV_ISOLATION)) {
- hv_root_partition = true;
- pr_info("Hyper-V: running as root partition\n");
- }
+ hv_identify_partition_type();
if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
hv_nested = true;
@@ -618,7 +610,7 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
- if (hv_root_partition ||
+ if (hv_root_partition() ||
(!ms_hyperv.paravisor_present && hv_isolation_type_snp()))
smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 2fdfda2b60e4..e2c6b471d230 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -9,9 +9,11 @@
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/cc_platform.h>
+#include <linux/string_choices.h>
#include <asm/processor-flags.h>
#include <asm/cacheinfo.h>
#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
#include <asm/hypervisor.h>
#include <asm/mshyperv.h>
#include <asm/tlbflush.h>
@@ -646,10 +648,10 @@ static void __init print_mtrr_state(void)
pr_info("MTRR default type: %s\n",
mtrr_attrib_to_str(mtrr_state.def_type));
if (mtrr_state.have_fixed) {
- pr_info("MTRR fixed ranges %sabled:\n",
- ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
- (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ?
- "en" : "dis");
+ pr_info("MTRR fixed ranges %s:\n",
+ str_enabled_disabled(
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)));
print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
for (i = 0; i < 2; ++i)
print_fixed(0x80000 + i * 0x20000, 0x04000,
@@ -661,8 +663,8 @@ static void __init print_mtrr_state(void)
/* tail */
print_fixed_last();
}
- pr_info("MTRR variable ranges %sabled:\n",
- mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis");
+ pr_info("MTRR variable ranges %s:\n",
+ str_enabled_disabled(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED));
high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
@@ -1025,8 +1027,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
* For Intel PPro stepping <= 7
* must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
*/
- if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
+ if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
boot_cpu_data.x86_stepping <= 7) {
if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a5c506f6da7f..4049235b1bfe 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -99,7 +99,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
char *ptr;
char line[LINE_SIZE];
int length;
- size_t linelen;
memset(line, 0, LINE_SIZE);
@@ -108,9 +107,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
if (length < 0)
return length;
- linelen = strlen(line);
- ptr = line + linelen - 1;
- if (linelen && *ptr == '\n')
+ ptr = line + length - 1;
+ if (length && *ptr == '\n')
*ptr = '\0';
if (!strncmp(line, "disable=", 8)) {
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 41ed01f46bd9..6571d432cbe3 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -86,9 +86,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = arch_freq_get_on_cpu(cpu);
+ int freq = arch_freq_get_on_cpu(cpu);
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
+ if (freq < 0)
+ seq_puts(m, "cpu MHz\t\t: Unknown\n");
+ else
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
}
/* Cache size */
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
index 4a06c37b9cf1..0c13b0befd8a 100644
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
-obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o
+obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o
CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 3d1735ed8d1f..cf29681d01e0 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -44,12 +44,6 @@ static DEFINE_MUTEX(domain_list_lock);
DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state);
/*
- * Used to store the max resource name width and max resource data width
- * to display the schemata in a tabular format
- */
-int max_name_width, max_data_width;
-
-/*
* Global boolean for rdt_alloc which is true if any
* resource allocation is enabled.
*/
@@ -62,7 +56,7 @@ static void mba_wrmsr_amd(struct msr_param *m);
#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
-struct rdt_hw_resource rdt_resources_all[] = {
+struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
[RDT_RESOURCE_L3] =
{
.r_resctrl = {
@@ -72,9 +66,7 @@ struct rdt_hw_resource rdt_resources_all[] = {
.mon_scope = RESCTRL_L3_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3),
.mon_domains = mon_domain_init(RDT_RESOURCE_L3),
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
},
.msr_base = MSR_IA32_L3_CBM_BASE,
.msr_update = cat_wrmsr,
@@ -86,9 +78,7 @@ struct rdt_hw_resource rdt_resources_all[] = {
.name = "L2",
.ctrl_scope = RESCTRL_L2_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2),
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
},
.msr_base = MSR_IA32_L2_CBM_BASE,
.msr_update = cat_wrmsr,
@@ -100,9 +90,7 @@ struct rdt_hw_resource rdt_resources_all[] = {
.name = "MB",
.ctrl_scope = RESCTRL_L3_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA),
- .parse_ctrlval = parse_bw,
- .format_str = "%d=%*u",
- .fflags = RFTYPE_RES_MB,
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
},
},
[RDT_RESOURCE_SMBA] =
@@ -112,9 +100,7 @@ struct rdt_hw_resource rdt_resources_all[] = {
.name = "SMBA",
.ctrl_scope = RESCTRL_L3_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA),
- .parse_ctrlval = parse_bw,
- .format_str = "%d=%*u",
- .fflags = RFTYPE_RES_MB,
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
},
},
};
@@ -127,6 +113,14 @@ u32 resctrl_arch_system_num_rmid_idx(void)
return r->num_rmid;
}
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+ if (l >= RDT_NUM_RESOURCES)
+ return NULL;
+
+ return &rdt_resources_all[l].r_resctrl;
+}
+
/*
* cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
* as they do not have CPUID enumeration support for Cache allocation.
@@ -161,7 +155,6 @@ static inline void cache_alloc_hsw_probe(void)
return;
hw_res->num_closid = 4;
- r->default_ctrl = max_cbm;
r->cache.cbm_len = 20;
r->cache.shareable_bits = 0xc0000;
r->cache.min_cbm_bits = 2;
@@ -174,7 +167,7 @@ static inline void cache_alloc_hsw_probe(void)
bool is_mba_sc(struct rdt_resource *r)
{
if (!r)
- return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc;
+ r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
/*
* The software controller support is only applicable to MBA resource.
@@ -217,7 +210,7 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r)
cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
max_delay = eax.split.max_delay + 1;
- r->default_ctrl = MAX_MBA_BW;
+ r->membw.max_bw = MAX_MBA_BW;
r->membw.arch_needs_linear = true;
if (ecx & MBA_IS_LINEAR) {
r->membw.delay_linear = true;
@@ -228,16 +221,12 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r)
return false;
r->membw.arch_needs_linear = false;
}
- r->data_width = 3;
if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
else
r->membw.throttle_mode = THREAD_THROTTLE_MAX;
- resctrl_file_fflags_init("thread_throttle_mode",
- RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
-
r->alloc_capable = true;
return true;
@@ -256,7 +245,7 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
hw_res->num_closid = edx + 1;
- r->default_ctrl = 1 << eax;
+ r->membw.max_bw = 1 << eax;
/* AMD does not use delay */
r->membw.delay_linear = false;
@@ -269,8 +258,6 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
r->membw.min_bw = 0;
r->membw.bw_gran = 1;
- /* Max value is 2048, Data width should be 4 in decimal */
- r->data_width = 4;
r->alloc_capable = true;
@@ -283,14 +270,13 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
union cpuid_0x10_1_eax eax;
union cpuid_0x10_x_ecx ecx;
union cpuid_0x10_x_edx edx;
- u32 ebx;
+ u32 ebx, default_ctrl;
cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
r->cache.cbm_len = eax.split.cbm_len + 1;
- r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
- r->cache.shareable_bits = ebx & r->default_ctrl;
- r->data_width = (r->cache.cbm_len + 3) / 4;
+ default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.shareable_bits = ebx & default_ctrl;
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
r->alloc_capable = true;
@@ -337,7 +323,7 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
return MAX_MBA_BW - bw;
pr_warn_once("Non Linear delay-bw map not supported but queried\n");
- return r->default_ctrl;
+ return MAX_MBA_BW;
}
static void mba_wrmsr_intel(struct msr_param *m)
@@ -361,36 +347,6 @@ static void cat_wrmsr(struct msr_param *m)
wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
}
-struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r)
-{
- struct rdt_ctrl_domain *d;
-
- lockdep_assert_cpus_held();
-
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- /* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
- return d;
- }
-
- return NULL;
-}
-
-struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r)
-{
- struct rdt_mon_domain *d;
-
- lockdep_assert_cpus_held();
-
- list_for_each_entry(d, &r->mon_domains, hdr.list) {
- /* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
- return d;
- }
-
- return NULL;
-}
-
u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
{
return resctrl_to_arch_res(r)->num_closid;
@@ -405,36 +361,6 @@ void rdt_ctrl_update(void *arg)
hw_res->msr_update(m);
}
-/*
- * rdt_find_domain - Search for a domain id in a resource domain list.
- *
- * Search the domain list to find the domain id. If the domain id is
- * found, return the domain. NULL otherwise. If the domain id is not
- * found (and NULL returned) then the first domain with id bigger than
- * the input id can be returned to the caller via @pos.
- */
-struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id,
- struct list_head **pos)
-{
- struct rdt_domain_hdr *d;
- struct list_head *l;
-
- list_for_each(l, h) {
- d = list_entry(l, struct rdt_domain_hdr, list);
- /* When id is found, return its domain. */
- if (id == d->id)
- return d;
- /* Stop searching when finding id's position in sorted list. */
- if (id < d->id)
- break;
- }
-
- if (pos)
- *pos = l;
-
- return NULL;
-}
-
static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
@@ -446,7 +372,7 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
* For Memory Allocation: Set b/w requested to 100%
*/
for (i = 0; i < hw_res->num_closid; i++, dc++)
- *dc = r->default_ctrl;
+ *dc = resctrl_get_default_ctrl(r);
}
static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
@@ -494,13 +420,13 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
{
size_t tsize;
- if (is_mbm_total_enabled()) {
+ if (resctrl_arch_is_mbm_total_enabled()) {
tsize = sizeof(*hw_dom->arch_mbm_total);
hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL);
if (!hw_dom->arch_mbm_total)
return -ENOMEM;
}
- if (is_mbm_local_enabled()) {
+ if (resctrl_arch_is_mbm_local_enabled()) {
tsize = sizeof(*hw_dom->arch_mbm_local);
hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL);
if (!hw_dom->arch_mbm_local) {
@@ -545,7 +471,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
return;
}
- hdr = rdt_find_domain(&r->ctrl_domains, id, &add_pos);
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
if (hdr) {
if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
return;
@@ -600,7 +526,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
return;
}
- hdr = rdt_find_domain(&r->mon_domains, id, &add_pos);
+ hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
if (hdr) {
if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
return;
@@ -665,7 +591,7 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
return;
}
- hdr = rdt_find_domain(&r->ctrl_domains, id, NULL);
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL);
if (!hdr) {
pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
id, cpu, r->name);
@@ -711,7 +637,7 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
return;
}
- hdr = rdt_find_domain(&r->mon_domains, id, NULL);
+ hdr = resctrl_find_domain(&r->mon_domains, id, NULL);
if (!hdr) {
pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
id, cpu, r->name);
@@ -786,20 +712,6 @@ static int resctrl_arch_offline_cpu(unsigned int cpu)
return 0;
}
-/*
- * Choose a width for the resource name and resource data based on the
- * resource that has widest name and cbm.
- */
-static __init void rdt_init_padding(void)
-{
- struct rdt_resource *r;
-
- for_each_alloc_capable_rdt_resource(r) {
- if (r->data_width > max_data_width)
- max_data_width = r->data_width;
- }
-}
-
enum {
RDT_FLAG_CMT,
RDT_FLAG_MBM_TOTAL,
@@ -885,6 +797,21 @@ bool __init rdt_cpu_has(int flag)
return ret;
}
+__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+ if (!rdt_cpu_has(X86_FEATURE_BMEC))
+ return false;
+
+ switch (evt) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL);
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL);
+ default:
+ return false;
+ }
+}
+
static __init bool get_mem_config(void)
{
struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
@@ -963,11 +890,6 @@ static __init bool get_rdt_mon_resources(void)
if (!rdt_mon_features)
return false;
- if (is_mbm_local_enabled())
- mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
- else if (is_mbm_total_enabled())
- mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
-
return !rdt_get_mon_l3_config(r);
}
@@ -1086,7 +1008,7 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c)
}
}
-static int __init resctrl_late_init(void)
+static int __init resctrl_arch_late_init(void)
{
struct rdt_resource *r;
int state, ret;
@@ -1102,8 +1024,6 @@ static int __init resctrl_late_init(void)
if (!get_rdt_resources())
return -ENODEV;
- rdt_init_padding();
-
state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"x86/resctrl/cat:online:",
resctrl_arch_online_cpu,
@@ -1111,7 +1031,7 @@ static int __init resctrl_late_init(void)
if (state < 0)
return state;
- ret = rdtgroup_init();
+ ret = resctrl_init();
if (ret) {
cpuhp_remove_state(state);
return ret;
@@ -1127,18 +1047,13 @@ static int __init resctrl_late_init(void)
return 0;
}
-late_initcall(resctrl_late_init);
+late_initcall(resctrl_arch_late_init);
-static void __exit resctrl_exit(void)
+static void __exit resctrl_arch_exit(void)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-
cpuhp_remove_state(rdt_online);
- rdtgroup_exit();
-
- if (r->mon_capable)
- rdt_put_mon_l3_config();
+ resctrl_exit();
}
-__exitcall(resctrl_exit);
+__exitcall(resctrl_arch_exit);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 536351159cc2..0a0ac5f6112e 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -23,6 +23,15 @@
#include "internal.h"
+struct rdt_parse_data {
+ struct rdtgroup *rdtgrp;
+ char *buf;
+};
+
+typedef int (ctrlval_parser_t)(struct rdt_parse_data *data,
+ struct resctrl_schema *s,
+ struct rdt_ctrl_domain *d);
+
/*
* Check whether MBA bandwidth percentage value is correct. The value is
* checked against the minimum and max bandwidth values specified by the
@@ -54,9 +63,9 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r)
return true;
}
- if (bw < r->membw.min_bw || bw > r->default_ctrl) {
+ if (bw < r->membw.min_bw || bw > r->membw.max_bw) {
rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n",
- bw, r->membw.min_bw, r->default_ctrl);
+ bw, r->membw.min_bw, r->membw.max_bw);
return false;
}
@@ -64,8 +73,8 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r)
return true;
}
-int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d)
+static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
+ struct rdt_ctrl_domain *d)
{
struct resctrl_staged_config *cfg;
u32 closid = data->rdtgrp->closid;
@@ -104,8 +113,9 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
*/
static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
{
- unsigned long first_bit, zero_bit, val;
+ u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1;
unsigned int cbm_len = r->cache.cbm_len;
+ unsigned long first_bit, zero_bit, val;
int ret;
ret = kstrtoul(buf, 16, &val);
@@ -114,7 +124,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
return false;
}
- if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) {
+ if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) {
rdt_last_cmd_puts("Mask out of range\n");
return false;
}
@@ -143,8 +153,8 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
* Read one cache bit mask (hex). Check that it is valid for the current
* resource type.
*/
-int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d)
+static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
+ struct rdt_ctrl_domain *d)
{
struct rdtgroup *rdtgrp = data->rdtgrp;
struct resctrl_staged_config *cfg;
@@ -210,6 +220,7 @@ static int parse_line(char *line, struct resctrl_schema *s,
struct rdtgroup *rdtgrp)
{
enum resctrl_conf_type t = s->conf_type;
+ ctrlval_parser_t *parse_ctrlval = NULL;
struct resctrl_staged_config *cfg;
struct rdt_resource *r = s->res;
struct rdt_parse_data data;
@@ -220,6 +231,18 @@ static int parse_line(char *line, struct resctrl_schema *s,
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
+ switch (r->schema_fmt) {
+ case RESCTRL_SCHEMA_BITMAP:
+ parse_ctrlval = &parse_cbm;
+ break;
+ case RESCTRL_SCHEMA_RANGE:
+ parse_ctrlval = &parse_bw;
+ break;
+ }
+
+ if (WARN_ON_ONCE(!parse_ctrlval))
+ return -EINVAL;
+
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
(r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
@@ -240,7 +263,7 @@ next:
if (d->hdr.id == dom_id) {
data.buf = dom;
data.rdtgrp = rdtgrp;
- if (r->parse_ctrlval(&data, s, d))
+ if (parse_ctrlval(&data, s, d))
return -EINVAL;
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
cfg = &d->staged_config[t];
@@ -264,25 +287,12 @@ next:
return -EINVAL;
}
-static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
-{
- switch (type) {
- default:
- case CDP_NONE:
- return closid;
- case CDP_CODE:
- return closid * 2 + 1;
- case CDP_DATA:
- return closid * 2;
- }
-}
-
int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type t, u32 cfg_val)
{
struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- u32 idx = get_config_index(closid, t);
+ u32 idx = resctrl_get_config_index(closid, t);
struct msr_param msr_param;
if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask))
@@ -319,7 +329,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
if (!cfg->have_new_ctrl)
continue;
- idx = get_config_index(closid, t);
+ idx = resctrl_get_config_index(closid, t);
if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
continue;
hw_dom->ctrl_val[idx] = cfg->new_ctrl;
@@ -439,7 +449,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type type)
{
struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
- u32 idx = get_config_index(closid, type);
+ u32 idx = resctrl_get_config_index(closid, type);
return hw_dom->ctrl_val[idx];
}
@@ -465,8 +475,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo
ctrl_val = resctrl_arch_get_config(r, dom, closid,
schema->conf_type);
- seq_printf(s, r->format_str, dom->hdr.id, max_data_width,
- ctrl_val);
+ seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val);
sep = true;
}
seq_puts(s, "\n");
@@ -537,12 +546,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
rdt_last_cmd_clear();
if (!strcmp(buf, "mbm_local_bytes")) {
- if (is_mbm_local_enabled())
+ if (resctrl_arch_is_mbm_local_enabled())
rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
else
ret = -EINVAL;
} else if (!strcmp(buf, "mbm_total_bytes")) {
- if (is_mbm_total_enabled())
+ if (resctrl_arch_is_mbm_total_enabled())
rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
else
ret = -EINVAL;
@@ -588,6 +597,28 @@ int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
return ret;
}
+struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id,
+ struct list_head **pos)
+{
+ struct rdt_domain_hdr *d;
+ struct list_head *l;
+
+ list_for_each(l, h) {
+ d = list_entry(l, struct rdt_domain_hdr, list);
+ /* When id is found, return its domain. */
+ if (id == d->id)
+ return d;
+ /* Stop searching when finding id's position in sorted list. */
+ if (id < d->id)
+ break;
+ }
+
+ if (pos)
+ *pos = l;
+
+ return NULL;
+}
+
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
cpumask_t *cpumask, int evtid, int first)
@@ -649,7 +680,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
resid = md.u.rid;
domid = md.u.domid;
evtid = md.u.evtid;
- r = &rdt_resources_all[resid].r_resctrl;
+ r = resctrl_arch_get_resource(resid);
if (md.u.sum) {
/*
@@ -673,7 +704,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
* This file provides data from a single domain. Search
* the resource to find the domain with "domid".
*/
- hdr = rdt_find_domain(&r->mon_domains, domid, NULL);
+ hdr = resctrl_find_domain(&r->mon_domains, domid, NULL);
if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) {
ret = -ENOENT;
goto out;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 20c898f09b7e..c44c5b496355 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -32,30 +32,6 @@
*/
#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
-/* Reads to Local DRAM Memory */
-#define READS_TO_LOCAL_MEM BIT(0)
-
-/* Reads to Remote DRAM Memory */
-#define READS_TO_REMOTE_MEM BIT(1)
-
-/* Non-Temporal Writes to Local Memory */
-#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2)
-
-/* Non-Temporal Writes to Remote Memory */
-#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3)
-
-/* Reads to Local Memory the system identifies as "Slow Memory" */
-#define READS_TO_LOCAL_S_MEM BIT(4)
-
-/* Reads to Remote Memory the system identifies as "Slow Memory" */
-#define READS_TO_REMOTE_S_MEM BIT(5)
-
-/* Dirty Victims to All Types of Memory */
-#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6)
-
-/* Max event bits supported */
-#define MAX_EVT_CONFIG_BITS GENMASK(6, 0)
-
/**
* cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
* aren't marked nohz_full
@@ -180,7 +156,6 @@ struct rmid_read {
void *arch_mon_ctx;
};
-extern unsigned int rdt_mon_features;
extern struct list_head resctrl_schema_all;
extern bool resctrl_mounted;
@@ -234,43 +209,6 @@ struct mongroup {
};
/**
- * struct pseudo_lock_region - pseudo-lock region information
- * @s: Resctrl schema for the resource to which this
- * pseudo-locked region belongs
- * @d: RDT domain to which this pseudo-locked region
- * belongs
- * @cbm: bitmask of the pseudo-locked region
- * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread
- * completion
- * @thread_done: variable used by waitqueue to test if pseudo-locking
- * thread completed
- * @cpu: core associated with the cache on which the setup code
- * will be run
- * @line_size: size of the cache lines
- * @size: size of pseudo-locked region in bytes
- * @kmem: the kernel memory associated with pseudo-locked region
- * @minor: minor number of character device associated with this
- * region
- * @debugfs_dir: pointer to this region's directory in the debugfs
- * filesystem
- * @pm_reqs: Power management QoS requests related to this region
- */
-struct pseudo_lock_region {
- struct resctrl_schema *s;
- struct rdt_ctrl_domain *d;
- u32 cbm;
- wait_queue_head_t lock_thread_wq;
- int thread_done;
- int cpu;
- unsigned int line_size;
- unsigned int size;
- void *kmem;
- unsigned int minor;
- struct dentry *debugfs_dir;
- struct list_head pm_reqs;
-};
-
-/**
* struct rdtgroup - store rdtgroup's data in resctrl file system.
* @kn: kernfs node
* @rdtgroup_list: linked list for all rdtgroups
@@ -326,10 +264,7 @@ struct rdtgroup {
/* List of all resource groups */
extern struct list_head rdt_all_groups;
-extern int max_name_width, max_data_width;
-
-int __init rdtgroup_init(void);
-void __exit rdtgroup_exit(void);
+extern int max_name_width;
/**
* struct rftype - describe each file in the resctrl file system
@@ -433,37 +368,6 @@ struct msr_param {
u32 high;
};
-static inline bool is_llc_occupancy_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
-}
-
-static inline bool is_mbm_total_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
-}
-
-static inline bool is_mbm_local_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
-}
-
-static inline bool is_mbm_enabled(void)
-{
- return (is_mbm_total_enabled() || is_mbm_local_enabled());
-}
-
-static inline bool is_mbm_event(int e)
-{
- return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
- e <= QOS_L3_MBM_LOCAL_EVENT_ID);
-}
-
-struct rdt_parse_data {
- struct rdtgroup *rdtgrp;
- char *buf;
-};
-
/**
* struct rdt_hw_resource - arch private attributes of a resctrl resource
* @r_resctrl: Attributes of the resource used directly by resctrl.
@@ -476,8 +380,6 @@ struct rdt_parse_data {
* @msr_update: Function pointer to update QOS MSRs
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
* @mbm_width: Monitor width, to detect and correct for overflow.
- * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth
- * Monitoring Event Configuration (BMEC) is supported.
* @cdp_enabled: CDP state of this resource
*
* Members of this structure are either private to the architecture
@@ -491,7 +393,6 @@ struct rdt_hw_resource {
void (*msr_update)(struct msr_param *m);
unsigned int mon_scale;
unsigned int mbm_width;
- unsigned int mbm_cfg_mask;
bool cdp_enabled;
};
@@ -500,11 +401,6 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r
return container_of(r, struct rdt_hw_resource, r_resctrl);
}
-int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d);
-int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d);
-
extern struct mutex rdtgroup_mutex;
extern struct rdt_hw_resource rdt_resources_all[];
@@ -512,24 +408,6 @@ extern struct rdtgroup rdtgroup_default;
extern struct dentry *debugfs_resctrl;
extern enum resctrl_event_id mba_mbps_default_event;
-enum resctrl_res_level {
- RDT_RESOURCE_L3,
- RDT_RESOURCE_L2,
- RDT_RESOURCE_MBA,
- RDT_RESOURCE_SMBA,
-
- /* Must be the last */
- RDT_NUM_RESOURCES,
-};
-
-static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res)
-{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(res);
-
- hw_res++;
- return &hw_res->r_resctrl;
-}
-
static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
{
return rdt_resources_all[l].cdp_enabled;
@@ -539,27 +417,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
-/*
- * To return the common struct rdt_resource, which is contained in struct
- * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource.
- */
-#define for_each_rdt_resource(r) \
- for (r = &rdt_resources_all[0].r_resctrl; \
- r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \
- r = resctrl_inc(r))
-
-#define for_each_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->alloc_capable || r->mon_capable)
-
-#define for_each_alloc_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->alloc_capable)
-
-#define for_each_mon_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->mon_capable)
-
/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
union cpuid_0x10_1_eax {
struct {
@@ -604,8 +461,6 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn);
int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
umode_t mask);
-struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id,
- struct list_head **pos);
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_schemata_show(struct kernfs_open_file *of,
@@ -620,28 +475,19 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain
unsigned long cbm);
enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
int rdtgroup_tasks_assigned(struct rdtgroup *r);
-int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
-int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm);
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d);
-int rdt_pseudo_lock_init(void);
-void rdt_pseudo_lock_release(void);
-int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
-void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
-struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r);
-struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r);
int closids_supported(void);
void closid_free(int closid);
int alloc_rmid(u32 closid);
void free_rmid(u32 closid, u32 rmid);
int rdt_get_mon_l3_config(struct rdt_resource *r);
-void __exit rdt_put_mon_l3_config(void);
+void resctrl_mon_resource_exit(void);
bool __init rdt_cpu_has(int flag);
void mon_event_count(void *info);
int rdtgroup_mondata_show(struct seq_file *m, void *arg);
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
cpumask_t *cpumask, int evtid, int first);
+int __init resctrl_mon_resource_init(void);
void mbm_setup_overflow_handler(struct rdt_mon_domain *dom,
unsigned long delay_ms,
int exclude_cpu);
@@ -658,4 +504,45 @@ void resctrl_file_fflags_init(const char *config, unsigned long fflags);
void rdt_staged_configs_clear(void);
bool closid_allocated(unsigned int closid);
int resctrl_find_cleanest_closid(void);
+
+#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm);
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d);
+int rdt_pseudo_lock_init(void);
+void rdt_pseudo_lock_release(void);
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
+#else
+static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
+{
+ return false;
+}
+
+static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
+{
+ return false;
+}
+
+static inline int rdt_pseudo_lock_init(void) { return 0; }
+static inline void rdt_pseudo_lock_release(void) { }
+static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { }
+#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */
+
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 94a1d9780461..a93ed7d2a160 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -295,11 +295,11 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *
{
struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
- if (is_mbm_total_enabled())
+ if (resctrl_arch_is_mbm_total_enabled())
memset(hw_dom->arch_mbm_total, 0,
sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);
- if (is_mbm_local_enabled())
+ if (resctrl_arch_is_mbm_local_enabled())
memset(hw_dom->arch_mbm_local, 0,
sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}
@@ -365,7 +365,7 @@ static void limbo_release_entry(struct rmid_entry *entry)
*/
void __check_limbo(struct rdt_mon_domain *d, bool force_free)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
u32 idx_limit = resctrl_arch_system_num_rmid_idx();
struct rmid_entry *entry;
u32 idx, cur_idx = 1;
@@ -521,7 +521,7 @@ int alloc_rmid(u32 closid)
static void add_rmid_to_limbo(struct rmid_entry *entry)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
struct rdt_mon_domain *d;
u32 idx;
@@ -569,7 +569,7 @@ void free_rmid(u32 closid, u32 rmid)
entry = __rmid_entry(idx);
- if (is_llc_occupancy_enabled())
+ if (resctrl_arch_is_llc_occupancy_enabled())
add_rmid_to_limbo(entry);
else
list_add_tail(&entry->list, &rmid_free_lru);
@@ -718,6 +718,22 @@ void mon_event_count(void *info)
rr->err = 0;
}
+static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu,
+ struct rdt_resource *r)
+{
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
/*
* Feedback loop for MBA software controller (mba_sc)
*
@@ -761,7 +777,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
struct rdtgroup *entry;
u32 cur_bw, user_bw;
- r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+ r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
evt_id = rgrp->mba_mbps_event;
closid = rgrp->closid;
@@ -852,10 +868,10 @@ static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
* This is protected from concurrent reads from user as both
* the user and overflow handler hold the global mutex.
*/
- if (is_mbm_total_enabled())
+ if (resctrl_arch_is_mbm_total_enabled())
mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
- if (is_mbm_local_enabled())
+ if (resctrl_arch_is_mbm_local_enabled())
mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
}
@@ -925,7 +941,7 @@ void mbm_handle_overflow(struct work_struct *work)
if (!resctrl_mounted || !resctrl_arch_mon_capable())
goto out_unlock;
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
d = container_of(work, struct rdt_mon_domain, mbm_over.work);
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
@@ -1027,7 +1043,7 @@ static int dom_data_init(struct rdt_resource *r)
/*
* RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
* are always allocated. These are used for the rdtgroup_default
- * control group, which will be setup later in rdtgroup_init().
+ * control group, which will be setup later in resctrl_init().
*/
idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
RESCTRL_RESERVED_RMID);
@@ -1040,10 +1056,13 @@ out_unlock:
return err;
}
-static void __exit dom_data_exit(void)
+static void dom_data_exit(struct rdt_resource *r)
{
mutex_lock(&rdtgroup_mutex);
+ if (!r->mon_capable)
+ goto out_unlock;
+
if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
kfree(closid_num_dirty_rmid);
closid_num_dirty_rmid = NULL;
@@ -1052,6 +1071,7 @@ static void __exit dom_data_exit(void)
kfree(rmid_ptrs);
rmid_ptrs = NULL;
+out_unlock:
mutex_unlock(&rdtgroup_mutex);
}
@@ -1081,11 +1101,11 @@ static void l3_mon_evt_init(struct rdt_resource *r)
{
INIT_LIST_HEAD(&r->evt_list);
- if (is_llc_occupancy_enabled())
+ if (resctrl_arch_is_llc_occupancy_enabled())
list_add_tail(&llc_occupancy_event.list, &r->evt_list);
- if (is_mbm_total_enabled())
+ if (resctrl_arch_is_mbm_total_enabled())
list_add_tail(&mbm_total_event.list, &r->evt_list);
- if (is_mbm_local_enabled())
+ if (resctrl_arch_is_mbm_local_enabled())
list_add_tail(&mbm_local_event.list, &r->evt_list);
}
@@ -1172,12 +1192,56 @@ static __init int snc_get_config(void)
return ret;
}
+/**
+ * resctrl_mon_resource_init() - Initialise global monitoring structures.
+ *
+ * Allocate and initialise global monitor resources that do not belong to a
+ * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists.
+ * Called once during boot after the struct rdt_resource's have been configured
+ * but before the filesystem is mounted.
+ * Resctrl's cpuhp callbacks may be called before this point to bring a domain
+ * online.
+ *
+ * Returns 0 for success, or -ENOMEM.
+ */
+int __init resctrl_mon_resource_init(void)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ int ret;
+
+ if (!r->mon_capable)
+ return 0;
+
+ ret = dom_data_init(r);
+ if (ret)
+ return ret;
+
+ l3_mon_evt_init(r);
+
+ if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
+ mbm_total_event.configurable = true;
+ resctrl_file_fflags_init("mbm_total_bytes_config",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+ }
+ if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
+ mbm_local_event.configurable = true;
+ resctrl_file_fflags_init("mbm_local_bytes_config",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+ }
+
+ if (resctrl_arch_is_mbm_local_enabled())
+ mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+ else if (resctrl_arch_is_mbm_total_enabled())
+ mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+
+ return 0;
+}
+
int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
unsigned int threshold;
- int ret;
snc_nodes_per_l3_cache = snc_get_config();
@@ -1207,39 +1271,24 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
*/
resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
- ret = dom_data_init(r);
- if (ret)
- return ret;
-
if (rdt_cpu_has(X86_FEATURE_BMEC)) {
u32 eax, ebx, ecx, edx;
/* Detect list of bandwidth sources that can be tracked */
cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
- hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
-
- if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
- mbm_total_event.configurable = true;
- resctrl_file_fflags_init("mbm_total_bytes_config",
- RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
- }
- if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
- mbm_local_event.configurable = true;
- resctrl_file_fflags_init("mbm_local_bytes_config",
- RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
- }
+ r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
}
- l3_mon_evt_init(r);
-
r->mon_capable = true;
return 0;
}
-void __exit rdt_put_mon_l3_config(void)
+void resctrl_mon_resource_exit(void)
{
- dom_data_exit();
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+
+ dom_data_exit(r);
}
void __init intel_rdt_mbm_apply_quirk(void)
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 42cc162f7fc9..01fa7890b43f 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -61,7 +61,8 @@ static const struct class pseudo_lock_class = {
};
/**
- * get_prefetch_disable_bits - prefetch disable bits of supported platforms
+ * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
+ * platforms
* @void: It takes no parameters.
*
* Capture the list of platforms that have been validated to support
@@ -75,14 +76,16 @@ static const struct class pseudo_lock_class = {
* in the SDM.
*
* When adding a platform here also add support for its cache events to
- * measure_cycles_perf_fn()
+ * resctrl_arch_measure_l*_residency()
*
* Return:
* If platform is supported, the bits to disable hardware prefetchers, 0
* if platform is not supported.
*/
-static u64 get_prefetch_disable_bits(void)
+u64 resctrl_arch_get_prefetch_disable_bits(void)
{
+ prefetch_disable_bits = 0;
+
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
boot_cpu_data.x86 != 6)
return 0;
@@ -98,7 +101,8 @@ static u64 get_prefetch_disable_bits(void)
* 3 DCU IP Prefetcher Disable (R/W)
* 63:4 Reserved
*/
- return 0xF;
+ prefetch_disable_bits = 0xF;
+ break;
case INTEL_ATOM_GOLDMONT:
case INTEL_ATOM_GOLDMONT_PLUS:
/*
@@ -109,10 +113,11 @@ static u64 get_prefetch_disable_bits(void)
* 2 DCU Hardware Prefetcher Disable (R/W)
* 63:3 Reserved
*/
- return 0x5;
+ prefetch_disable_bits = 0x5;
+ break;
}
- return 0;
+ return prefetch_disable_bits;
}
/**
@@ -408,8 +413,8 @@ static void pseudo_lock_free(struct rdtgroup *rdtgrp)
}
/**
- * pseudo_lock_fn - Load kernel memory into cache
- * @_rdtgrp: resource group to which pseudo-lock region belongs
+ * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
+ * @_plr: the pseudo-lock region descriptor
*
* This is the core pseudo-locking flow.
*
@@ -426,10 +431,9 @@ static void pseudo_lock_free(struct rdtgroup *rdtgrp)
*
* Return: 0. Waiter on waitqueue will be woken on completion.
*/
-static int pseudo_lock_fn(void *_rdtgrp)
+int resctrl_arch_pseudo_lock_fn(void *_plr)
{
- struct rdtgroup *rdtgrp = _rdtgrp;
- struct pseudo_lock_region *plr = rdtgrp->plr;
+ struct pseudo_lock_region *plr = _plr;
u32 rmid_p, closid_p;
unsigned long i;
u64 saved_msr;
@@ -489,7 +493,8 @@ static int pseudo_lock_fn(void *_rdtgrp)
* pseudo-locked followed by reading of kernel memory to load it
* into the cache.
*/
- __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
+ __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);
+
/*
* Cache was flushed earlier. Now access kernel memory to read it
* into cache region associated with just activated plr->closid.
@@ -712,8 +717,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
* Not knowing the bits to disable prefetching implies that this
* platform does not support Cache Pseudo-Locking.
*/
- prefetch_disable_bits = get_prefetch_disable_bits();
- if (prefetch_disable_bits == 0) {
+ if (resctrl_arch_get_prefetch_disable_bits() == 0) {
rdt_last_cmd_puts("Pseudo-locking not supported\n");
return -EINVAL;
}
@@ -872,7 +876,8 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
}
/**
- * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory
+ * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
+ * pseudo-locked memory
* @_plr: pseudo-lock region to measure
*
* There is no deterministic way to test if a memory region is cached. One
@@ -885,7 +890,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
*
* Return: 0. Waiter on waitqueue will be woken on completion.
*/
-static int measure_cycles_lat_fn(void *_plr)
+int resctrl_arch_measure_cycles_lat_fn(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
u32 saved_low, saved_high;
@@ -1069,7 +1074,7 @@ out:
return 0;
}
-static int measure_l2_residency(void *_plr)
+int resctrl_arch_measure_l2_residency(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
struct residency_counts counts = {0};
@@ -1107,7 +1112,7 @@ out:
return 0;
}
-static int measure_l3_residency(void *_plr)
+int resctrl_arch_measure_l3_residency(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
struct residency_counts counts = {0};
@@ -1205,14 +1210,14 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
plr->cpu = cpu;
if (sel == 1)
- thread = kthread_run_on_cpu(measure_cycles_lat_fn, plr,
- cpu, "pseudo_lock_measure/%u");
+ thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn,
+ plr, cpu, "pseudo_lock_measure/%u");
else if (sel == 2)
- thread = kthread_run_on_cpu(measure_l2_residency, plr,
- cpu, "pseudo_lock_measure/%u");
+ thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency,
+ plr, cpu, "pseudo_lock_measure/%u");
else if (sel == 3)
- thread = kthread_run_on_cpu(measure_l3_residency, plr,
- cpu, "pseudo_lock_measure/%u");
+ thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency,
+ plr, cpu, "pseudo_lock_measure/%u");
else
goto out;
@@ -1307,7 +1312,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
plr->thread_done = 0;
- thread = kthread_run_on_cpu(pseudo_lock_fn, rdtgrp,
+ thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr,
plr->cpu, "pseudo_lock/%u");
if (IS_ERR(thread)) {
ret = PTR_ERR(thread);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 6419e04d8a7b..c6274d40b217 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -57,6 +57,12 @@ static struct kernfs_node *kn_mongrp;
/* Kernel fs node for "mon_data" directory under root */
static struct kernfs_node *kn_mondata;
+/*
+ * Used to store the max resource name width to display the schemata names in
+ * a tabular format.
+ */
+int max_name_width;
+
static struct seq_buf last_cmd_status;
static char last_cmd_status_buf[512];
@@ -111,6 +117,18 @@ void rdt_staged_configs_clear(void)
}
}
+static bool resctrl_is_mbm_enabled(void)
+{
+ return (resctrl_arch_is_mbm_total_enabled() ||
+ resctrl_arch_is_mbm_local_enabled());
+}
+
+static bool resctrl_is_mbm_event(int e)
+{
+ return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
+ e <= QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
/*
* Trivial allocator for CLOSIDs. Since h/w only supports a small number,
* we can keep a bitmap of free CLOSIDs in a single integer.
@@ -157,7 +175,8 @@ static int closid_alloc(void)
lockdep_assert_held(&rdtgroup_mutex);
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
+ resctrl_arch_is_llc_occupancy_enabled()) {
cleanest_closid = resctrl_find_cleanest_closid();
if (cleanest_closid < 0)
return cleanest_closid;
@@ -348,13 +367,13 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
* from update_closid_rmid() is protected against __switch_to() because
* preemption is disabled.
*/
-static void update_cpu_closid_rmid(void *info)
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
{
- struct rdtgroup *r = info;
+ struct resctrl_cpu_defaults *r = info;
if (r) {
this_cpu_write(pqr_state.default_closid, r->closid);
- this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
+ this_cpu_write(pqr_state.default_rmid, r->rmid);
}
/*
@@ -369,11 +388,20 @@ static void update_cpu_closid_rmid(void *info)
* Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
*
* Per task closids/rmids must have been set up before calling this function.
+ * @r may be NULL.
*/
static void
update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
{
- on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1);
+ struct resctrl_cpu_defaults defaults, *p = NULL;
+
+ if (r) {
+ defaults.closid = r->closid;
+ defaults.rmid = r->mon.rmid;
+ p = &defaults;
+ }
+
+ on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1);
}
static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
@@ -971,7 +999,7 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of,
struct resctrl_schema *s = of->kn->parent->priv;
struct rdt_resource *r = s->res;
- seq_printf(seq, "%x\n", r->default_ctrl);
+ seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r));
return 0;
}
@@ -1160,10 +1188,19 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
struct resctrl_schema *s = of->kn->parent->priv;
struct rdt_resource *r = s->res;
- if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD)
+ switch (r->membw.throttle_mode) {
+ case THREAD_THROTTLE_PER_THREAD:
seq_puts(seq, "per-thread\n");
- else
+ return 0;
+ case THREAD_THROTTLE_MAX:
seq_puts(seq, "max\n");
+ return 0;
+ case THREAD_THROTTLE_UNDEFINED:
+ seq_puts(seq, "undefined\n");
+ return 0;
+ }
+
+ WARN_ON_ONCE(1);
return 0;
}
@@ -1425,7 +1462,8 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
goto out;
}
rdtgrp->mode = RDT_MODE_EXCLUSIVE;
- } else if (!strcmp(buf, "pseudo-locksetup")) {
+ } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
+ !strcmp(buf, "pseudo-locksetup")) {
ret = rdtgroup_locksetup_enter(rdtgrp);
if (ret)
goto out;
@@ -1552,11 +1590,6 @@ out:
return ret;
}
-struct mon_config_info {
- u32 evtid;
- u32 mon_config;
-};
-
#define INVALID_CONFIG_INDEX UINT_MAX
/**
@@ -1581,31 +1614,32 @@ static inline unsigned int mon_event_config_index_get(u32 evtid)
}
}
-static void mon_event_config_read(void *info)
+void resctrl_arch_mon_event_config_read(void *_config_info)
{
- struct mon_config_info *mon_info = info;
+ struct resctrl_mon_config_info *config_info = _config_info;
unsigned int index;
u64 msrval;
- index = mon_event_config_index_get(mon_info->evtid);
+ index = mon_event_config_index_get(config_info->evtid);
if (index == INVALID_CONFIG_INDEX) {
- pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
return;
}
rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval);
/* Report only the valid event configuration bits */
- mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
+ config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
}
-static void mondata_config_read(struct rdt_mon_domain *d, struct mon_config_info *mon_info)
+static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
{
- smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_read, mon_info, 1);
+ smp_call_function_any(&mon_info->d->hdr.cpu_mask,
+ resctrl_arch_mon_event_config_read, mon_info, 1);
}
static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
{
- struct mon_config_info mon_info;
+ struct resctrl_mon_config_info mon_info;
struct rdt_mon_domain *dom;
bool sep = false;
@@ -1616,9 +1650,11 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid
if (sep)
seq_puts(s, ";");
- memset(&mon_info, 0, sizeof(struct mon_config_info));
+ memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info));
+ mon_info.r = r;
+ mon_info.d = dom;
mon_info.evtid = evtid;
- mondata_config_read(dom, &mon_info);
+ mondata_config_read(&mon_info);
seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
sep = true;
@@ -1651,30 +1687,32 @@ static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
return 0;
}
-static void mon_event_config_write(void *info)
+void resctrl_arch_mon_event_config_write(void *_config_info)
{
- struct mon_config_info *mon_info = info;
+ struct resctrl_mon_config_info *config_info = _config_info;
unsigned int index;
- index = mon_event_config_index_get(mon_info->evtid);
+ index = mon_event_config_index_get(config_info->evtid);
if (index == INVALID_CONFIG_INDEX) {
- pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
return;
}
- wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
+ wrmsr(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config, 0);
}
static void mbm_config_write_domain(struct rdt_resource *r,
struct rdt_mon_domain *d, u32 evtid, u32 val)
{
- struct mon_config_info mon_info = {0};
+ struct resctrl_mon_config_info mon_info = {0};
/*
* Read the current config value first. If both are the same then
* no need to write it again.
*/
+ mon_info.r = r;
+ mon_info.d = d;
mon_info.evtid = evtid;
- mondata_config_read(d, &mon_info);
+ mondata_config_read(&mon_info);
if (mon_info.mon_config == val)
return;
@@ -1686,7 +1724,7 @@ static void mbm_config_write_domain(struct rdt_resource *r,
* are scoped at the domain level. Writing any of these MSRs
* on one CPU is observed by all the CPUs in the domain.
*/
- smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_write,
+ smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write,
&mon_info, 1);
/*
@@ -1703,7 +1741,6 @@ static void mbm_config_write_domain(struct rdt_resource *r,
static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
char *dom_str = NULL, *id_str;
unsigned long dom_id, val;
struct rdt_mon_domain *d;
@@ -1730,9 +1767,9 @@ next:
}
/* Value from user cannot be more than the supported set of events */
- if ((val & hw_res->mbm_cfg_mask) != val) {
+ if ((val & r->mbm_cfg_mask) != val) {
rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
- hw_res->mbm_cfg_mask);
+ r->mbm_cfg_mask);
return -EINVAL;
}
@@ -2036,6 +2073,28 @@ static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
return NULL;
}
+static void thread_throttle_mode_init(void)
+{
+ enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED;
+ struct rdt_resource *r_mba, *r_smba;
+
+ r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+ if (r_mba->alloc_capable &&
+ r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
+ throttle_mode = r_mba->membw.throttle_mode;
+
+ r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA);
+ if (r_smba->alloc_capable &&
+ r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
+ throttle_mode = r_smba->membw.throttle_mode;
+
+ if (throttle_mode == THREAD_THROTTLE_UNDEFINED)
+ return;
+
+ resctrl_file_fflags_init("thread_throttle_mode",
+ RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
+}
+
void resctrl_file_fflags_init(const char *config, unsigned long fflags)
{
struct rftype *rft;
@@ -2164,6 +2223,20 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
return ret;
}
+static unsigned long fflags_from_resource(struct rdt_resource *r)
+{
+ switch (r->rid) {
+ case RDT_RESOURCE_L3:
+ case RDT_RESOURCE_L2:
+ return RFTYPE_RES_CACHE;
+ case RDT_RESOURCE_MBA:
+ case RDT_RESOURCE_SMBA:
+ return RFTYPE_RES_MB;
+ }
+
+ return WARN_ON_ONCE(1);
+}
+
static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
{
struct resctrl_schema *s;
@@ -2184,14 +2257,14 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
/* loop over enabled controls, these are all alloc_capable */
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
- fflags = r->fflags | RFTYPE_CTRL_INFO;
+ fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO;
ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
if (ret)
goto out_destroy;
}
for_each_mon_capable_rdt_resource(r) {
- fflags = r->fflags | RFTYPE_MON_INFO;
+ fflags = fflags_from_resource(r) | RFTYPE_MON_INFO;
sprintf(name, "%s_MON", r->name);
ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
if (ret)
@@ -2255,7 +2328,7 @@ static void l2_qos_cfg_update(void *arg)
static inline bool is_mba_linear(void)
{
- return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear;
+ return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear;
}
static int set_cache_qos_cfg(int level, bool enable)
@@ -2345,10 +2418,10 @@ static void mba_sc_domain_destroy(struct rdt_resource *r,
*/
static bool supports_mba_mbps(void)
{
- struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+ struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
- return (is_mbm_enabled() &&
+ return (resctrl_is_mbm_enabled() &&
r->alloc_capable && is_mba_linear() &&
r->ctrl_scope == rmbm->mon_scope);
}
@@ -2359,7 +2432,7 @@ static bool supports_mba_mbps(void)
*/
static int set_mba_sc(bool mba_sc)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
u32 num_closid = resctrl_arch_get_num_closid(r);
struct rdt_ctrl_domain *d;
unsigned long fflags;
@@ -2596,6 +2669,20 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type
if (cl > max_name_width)
max_name_width = cl;
+ switch (r->schema_fmt) {
+ case RESCTRL_SCHEMA_BITMAP:
+ s->fmt_str = "%d=%x";
+ break;
+ case RESCTRL_SCHEMA_RANGE:
+ s->fmt_str = "%d=%u";
+ break;
+ }
+
+ if (WARN_ON_ONCE(!s->fmt_str)) {
+ kfree(s);
+ return -EINVAL;
+ }
+
INIT_LIST_HEAD(&s->list);
list_add(&s->list, &resctrl_schema_all);
@@ -2712,8 +2799,8 @@ static int rdt_get_tree(struct fs_context *fc)
if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
resctrl_mounted = true;
- if (is_mbm_enabled()) {
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ if (resctrl_is_mbm_enabled()) {
+ r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
list_for_each_entry(dom, &r->mon_domains, hdr.list)
mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
RESCTRL_PICK_ANY_CPU);
@@ -2823,7 +2910,7 @@ static int rdt_init_fs_context(struct fs_context *fc)
return 0;
}
-static int reset_all_ctrls(struct rdt_resource *r)
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
struct rdt_hw_ctrl_domain *hw_dom;
@@ -2847,12 +2934,12 @@ static int reset_all_ctrls(struct rdt_resource *r)
hw_dom = resctrl_to_arch_ctrl_dom(d);
for (i = 0; i < hw_res->num_closid; i++)
- hw_dom->ctrl_val[i] = r->default_ctrl;
+ hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r);
msr_param.dom = d;
smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- return 0;
+ return;
}
/*
@@ -2971,9 +3058,10 @@ static void rdt_kill_sb(struct super_block *sb)
rdt_disable_ctx();
- /*Put everything back to default values. */
+ /* Put everything back to default values. */
for_each_alloc_capable_rdt_resource(r)
- reset_all_ctrls(r);
+ resctrl_arch_reset_all_ctrls(r);
+
rmdir_all_sub();
rdt_pseudo_lock_release();
rdtgroup_default.mode = RDT_MODE_SHAREABLE;
@@ -3080,7 +3168,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
if (ret)
return ret;
- if (!do_sum && is_mbm_event(mevt->evtid))
+ if (!do_sum && resctrl_is_mbm_event(mevt->evtid))
mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
}
@@ -3382,7 +3470,7 @@ static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
}
cfg = &d->staged_config[CDP_NONE];
- cfg->new_ctrl = r->default_ctrl;
+ cfg->new_ctrl = resctrl_get_default_ctrl(r);
cfg->have_new_ctrl = true;
}
}
@@ -3696,14 +3784,21 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
{
struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+ u32 closid, rmid;
int cpu;
/* Give any tasks back to the parent group */
rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
- /* Update per cpu rmid of the moved CPUs first */
+ /*
+ * Update per cpu closid/rmid of the moved CPUs first.
+ * Note: the closid will not change, but the arch code still needs it.
+ */
+ closid = prdtgrp->closid;
+ rmid = prdtgrp->mon.rmid;
for_each_cpu(cpu, &rdtgrp->cpu_mask)
- per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
+ resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
+
/*
* Update the MSR on moved CPUs and CPUs which have moved
* task running on them.
@@ -3736,6 +3831,7 @@ static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
{
+ u32 closid, rmid;
int cpu;
/* Give any tasks back to the default group */
@@ -3746,10 +3842,10 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
&rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
/* Update per cpu closid and rmid of the moved CPUs first */
- for_each_cpu(cpu, &rdtgrp->cpu_mask) {
- per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
- per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
- }
+ closid = rdtgroup_default.closid;
+ rmid = rdtgroup_default.mon.rmid;
+ for_each_cpu(cpu, &rdtgrp->cpu_mask)
+ resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
/*
* Update the MSR on moved CPUs and CPUs which have moved
@@ -3950,7 +4046,7 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
seq_puts(seq, ",cdpl2");
- if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl))
+ if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA)))
seq_puts(seq, ",mba_MBps");
if (resctrl_debug)
@@ -4029,9 +4125,9 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d
if (resctrl_mounted && resctrl_arch_mon_capable())
rmdir_mondata_subdir_allrdtgrp(r, d);
- if (is_mbm_enabled())
+ if (resctrl_is_mbm_enabled())
cancel_delayed_work(&d->mbm_over);
- if (is_llc_occupancy_enabled() && has_busy_rmid(d)) {
+ if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
/*
* When a package is going down, forcefully
* decrement rmid->ebusy. There is no way to know
@@ -4049,17 +4145,30 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d
mutex_unlock(&rdtgroup_mutex);
}
+/**
+ * domain_setup_mon_state() - Initialise domain monitoring structures.
+ * @r: The resource for the newly online domain.
+ * @d: The newly online domain.
+ *
+ * Allocate monitor resources that belong to this domain.
+ * Called when the first CPU of a domain comes online, regardless of whether
+ * the filesystem is mounted.
+ * During boot this may be called before global allocations have been made by
+ * resctrl_mon_resource_init().
+ *
+ * Returns 0 for success, or -ENOMEM.
+ */
static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
{
u32 idx_limit = resctrl_arch_system_num_rmid_idx();
size_t tsize;
- if (is_llc_occupancy_enabled()) {
+ if (resctrl_arch_is_llc_occupancy_enabled()) {
d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
if (!d->rmid_busy_llc)
return -ENOMEM;
}
- if (is_mbm_total_enabled()) {
+ if (resctrl_arch_is_mbm_total_enabled()) {
tsize = sizeof(*d->mbm_total);
d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
if (!d->mbm_total) {
@@ -4067,7 +4176,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain
return -ENOMEM;
}
}
- if (is_mbm_local_enabled()) {
+ if (resctrl_arch_is_mbm_local_enabled()) {
tsize = sizeof(*d->mbm_local);
d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
if (!d->mbm_local) {
@@ -4106,13 +4215,13 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
if (err)
goto out_unlock;
- if (is_mbm_enabled()) {
+ if (resctrl_is_mbm_enabled()) {
INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
RESCTRL_PICK_ANY_CPU);
}
- if (is_llc_occupancy_enabled())
+ if (resctrl_arch_is_llc_occupancy_enabled())
INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
/*
@@ -4148,9 +4257,25 @@ static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
}
}
+static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu,
+ struct rdt_resource *r)
+{
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
void resctrl_offline_cpu(unsigned int cpu)
{
- struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
struct rdt_mon_domain *d;
struct rdtgroup *rdtgrp;
@@ -4167,12 +4292,12 @@ void resctrl_offline_cpu(unsigned int cpu)
d = get_mon_domain_from_cpu(cpu, l3);
if (d) {
- if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
+ if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
cancel_delayed_work(&d->mbm_over);
mbm_setup_overflow_handler(d, 0, cpu);
}
- if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
- has_busy_rmid(d)) {
+ if (resctrl_arch_is_llc_occupancy_enabled() &&
+ cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
cancel_delayed_work(&d->cqm_limbo);
cqm_setup_limbo_handler(d, 0, cpu);
}
@@ -4183,14 +4308,14 @@ out_unlock:
}
/*
- * rdtgroup_init - rdtgroup initialization
+ * resctrl_init - resctrl filesystem initialization
*
* Setup resctrl file system including set up root, create mount point,
- * register rdtgroup filesystem, and initialize files under root directory.
+ * register resctrl filesystem, and initialize files under root directory.
*
* Return: 0 on success or -errno
*/
-int __init rdtgroup_init(void)
+int __init resctrl_init(void)
{
int ret = 0;
@@ -4199,10 +4324,18 @@ int __init rdtgroup_init(void)
rdtgroup_setup_default();
- ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+ thread_throttle_mode_init();
+
+ ret = resctrl_mon_resource_init();
if (ret)
return ret;
+ ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+ if (ret) {
+ resctrl_mon_resource_exit();
+ return ret;
+ }
+
ret = register_filesystem(&rdt_fs_type);
if (ret)
goto cleanup_mountpoint;
@@ -4234,13 +4367,16 @@ int __init rdtgroup_init(void)
cleanup_mountpoint:
sysfs_remove_mount_point(fs_kobj, "resctrl");
+ resctrl_mon_resource_exit();
return ret;
}
-void __exit rdtgroup_exit(void)
+void __exit resctrl_exit(void)
{
debugfs_remove_recursive(debugfs_resctrl);
unregister_filesystem(&rdt_fs_type);
sysfs_remove_mount_point(fs_kobj, "resctrl");
+
+ resctrl_mon_resource_exit();
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 340af8155658..0be61c45400c 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -140,7 +140,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
x86_platform.guest.enc_kexec_begin();
x86_platform.guest.enc_kexec_finish();
- crash_save_cpu(regs, safe_smp_processor_id());
+ crash_save_cpu(regs, smp_processor_id());
}
#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG)
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index a7d562697e50..91639d1e4ec2 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -395,18 +395,13 @@ NOKPROBE_SYMBOL(oops_end);
static void __die_header(const char *str, struct pt_regs *regs, long err)
{
- const char *pr = "";
-
/* Save the regs of the first oops for the executive summary later. */
if (!die_counter)
exec_summary_regs = *regs;
- if (IS_ENABLED(CONFIG_PREEMPTION))
- pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
-
printk(KERN_DEFAULT
- "Oops: %s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff,
- ++die_counter, pr,
+ "Oops: %s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff,
+ ++die_counter,
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index b4905d5173fd..722fd712e1cf 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -37,7 +37,7 @@ const char *stack_type_name(enum stack_type type)
static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr);
+ unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
/*
@@ -62,7 +62,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr);
+ unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr);
unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
/*
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index f05339fee778..6c5defd6569a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -134,7 +134,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr);
+ unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
unsigned long *begin;
/*
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 82b96ed9890a..57120f0749cc 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -28,18 +28,13 @@
* the first 128 E820 memory entries in boot_params.e820_table and the remaining
* (if any) entries of the SETUP_E820_EXT nodes. We use this to:
*
- * - inform the user about the firmware's notion of memory layout
- * via /sys/firmware/memmap
- *
* - the hibernation code uses it to generate a kernel-independent CRC32
* checksum of the physical memory layout of a system.
*
* - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
* passed to us by the bootloader - the major difference between
- * e820_table_firmware[] and this one is that, the latter marks the setup_data
- * list created by the EFI boot stub as reserved, so that kexec can reuse the
- * setup_data information in the second kernel. Besides, e820_table_kexec[]
- * might also be modified by the kexec itself to fake a mptable.
+ * e820_table_firmware[] and this one is that e820_table_kexec[]
+ * might be modified by the kexec itself to fake an mptable.
* We use this to:
*
* - kexec, which is a bootloader in disguise, uses the original E820
@@ -47,6 +42,11 @@
* can have a restricted E820 map while the kexec()-ed kexec-kernel
* can have access to full memory - etc.
*
+ * Export the memory layout via /sys/firmware/memmap. kexec-tools uses
+ * the entries to create an E820 table for the kexec kernel.
+ *
+ * kexec_file_load in-kernel code uses the table for the kexec kernel.
+ *
* - 'e820_table': this is the main E820 table that is massaged by the
* low level x86 platform code, or modified by boot parameters, before
* passed on to higher level MM layers.
@@ -187,8 +187,7 @@ void __init e820__range_add(u64 start, u64 size, enum e820_type type)
static void __init e820_print_type(enum e820_type type)
{
switch (type) {
- case E820_TYPE_RAM: /* Fall through: */
- case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break;
+ case E820_TYPE_RAM: pr_cont("usable"); break;
case E820_TYPE_RESERVED: pr_cont("reserved"); break;
case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break;
case E820_TYPE_ACPI: pr_cont("ACPI data"); break;
@@ -764,7 +763,7 @@ void __init e820__register_nosave_regions(unsigned long limit_pfn)
pfn = PFN_DOWN(entry->addr + entry->size);
- if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
+ if (entry->type != E820_TYPE_RAM)
register_nosave_region(PFN_UP(entry->addr), pfn);
if (pfn >= limit_pfn)
@@ -991,60 +990,6 @@ static int __init parse_memmap_opt(char *str)
early_param("memmap", parse_memmap_opt);
/*
- * Reserve all entries from the bootloader's extensible data nodes list,
- * because if present we are going to use it later on to fetch e820
- * entries from it:
- */
-void __init e820__reserve_setup_data(void)
-{
- struct setup_indirect *indirect;
- struct setup_data *data;
- u64 pa_data, pa_next;
- u32 len;
-
- pa_data = boot_params.hdr.setup_data;
- if (!pa_data)
- return;
-
- while (pa_data) {
- data = early_memremap(pa_data, sizeof(*data));
- if (!data) {
- pr_warn("e820: failed to memremap setup_data entry\n");
- return;
- }
-
- len = sizeof(*data);
- pa_next = data->next;
-
- e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
-
- if (data->type == SETUP_INDIRECT) {
- len += data->len;
- early_memunmap(data, sizeof(*data));
- data = early_memremap(pa_data, len);
- if (!data) {
- pr_warn("e820: failed to memremap indirect setup_data\n");
- return;
- }
-
- indirect = (struct setup_indirect *)data->data;
-
- if (indirect->type != SETUP_INDIRECT)
- e820__range_update(indirect->addr, indirect->len,
- E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
- }
-
- pa_data = pa_next;
- early_memunmap(data, len);
- }
-
- e820__update_table(e820_table);
-
- pr_info("extended physical RAM map:\n");
- e820__print_table("reserve setup_data");
-}
-
-/*
* Called after parse_early_param(), after early parameters (such as mem=)
* have been processed, in which case we already have an E820 table filled in
* via the parameter callback function(s), but it's not sorted and printed yet:
@@ -1063,7 +1008,6 @@ void __init e820__finish_early_params(void)
static const char *__init e820_type_to_string(struct e820_entry *entry)
{
switch (entry->type) {
- case E820_TYPE_RESERVED_KERN: /* Fall-through: */
case E820_TYPE_RAM: return "System RAM";
case E820_TYPE_ACPI: return "ACPI Tables";
case E820_TYPE_NVS: return "ACPI Non-volatile Storage";
@@ -1079,7 +1023,6 @@ static const char *__init e820_type_to_string(struct e820_entry *entry)
static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
{
switch (entry->type) {
- case E820_TYPE_RESERVED_KERN: /* Fall-through: */
case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM;
case E820_TYPE_ACPI: /* Fall-through: */
case E820_TYPE_NVS: /* Fall-through: */
@@ -1101,7 +1044,6 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
case E820_TYPE_RESERVED: return IORES_DESC_RESERVED;
case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED;
- case E820_TYPE_RESERVED_KERN: /* Fall-through: */
case E820_TYPE_RAM: /* Fall-through: */
case E820_TYPE_UNUSABLE: /* Fall-through: */
default: return IORES_DESC_NONE;
@@ -1124,7 +1066,6 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res)
case E820_TYPE_PRAM:
case E820_TYPE_PMEM:
return false;
- case E820_TYPE_RESERVED_KERN:
case E820_TYPE_RAM:
case E820_TYPE_ACPI:
case E820_TYPE_NVS:
@@ -1176,9 +1117,9 @@ void __init e820__reserve_resources(void)
res++;
}
- /* Expose the bootloader-provided memory layout to the sysfs. */
- for (i = 0; i < e820_table_firmware->nr_entries; i++) {
- struct e820_entry *entry = e820_table_firmware->entries + i;
+ /* Expose the kexec e820 table to the sysfs. */
+ for (i = 0; i < e820_table_kexec->nr_entries; i++) {
+ struct e820_entry *entry = e820_table_kexec->entries + i;
firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
}
@@ -1302,6 +1243,36 @@ void __init e820__memblock_setup(void)
int i;
u64 end;
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Memory used by the kernel cannot be hot-removed because Linux
+ * cannot migrate the kernel pages. When memory hotplug is
+ * enabled, we should prevent memblock from allocating memory
+ * for the kernel.
+ *
+ * ACPI SRAT records all hotpluggable memory ranges. But before
+ * SRAT is parsed, we don't know about it.
+ *
+ * The kernel image is loaded into memory at very early time. We
+ * cannot prevent this anyway. So on NUMA system, we set any
+ * node the kernel resides in as un-hotpluggable.
+ *
+ * Since on modern servers, one node could have double-digit
+ * gigabytes memory, we can assume the memory around the kernel
+ * image is also un-hotpluggable. So before SRAT is parsed, just
+ * allocate memory near the kernel image to try the best to keep
+ * the kernel away from hotpluggable memory.
+ */
+ if (movable_node_is_enabled())
+ memblock_set_bottom_up(true);
+#endif
+
+ /*
+ * At this point only the first megabyte is mapped for sure, the
+ * rest of the memory cannot be used for memblock resizing
+ */
+ memblock_set_current_limit(ISA_END_ADDRESS);
+
/*
* The bootstrap memblock region count maximum is 128 entries
* (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
@@ -1323,7 +1294,7 @@ void __init e820__memblock_setup(void)
if (entry->type == E820_TYPE_SOFT_RESERVED)
memblock_reserve(entry->addr, entry->size);
- if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
+ if (entry->type != E820_TYPE_RAM)
continue;
memblock_add(entry->addr, entry->size);
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 44f937015e1e..fc1714bad045 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -19,6 +19,7 @@
#include <linux/usb/ehci_def.h>
#include <linux/usb/xhci-dbgp.h>
#include <asm/pci_x86.h>
+#include <linux/static_call.h>
/* Simple VGA output */
#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -94,26 +95,28 @@ static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
#define DLL 0 /* Divisor Latch Low */
#define DLH 1 /* Divisor latch High */
-static unsigned int io_serial_in(unsigned long addr, int offset)
+static __noendbr unsigned int io_serial_in(unsigned long addr, int offset)
{
return inb(addr + offset);
}
+ANNOTATE_NOENDBR_SYM(io_serial_in);
-static void io_serial_out(unsigned long addr, int offset, int value)
+static __noendbr void io_serial_out(unsigned long addr, int offset, int value)
{
outb(value, addr + offset);
}
+ANNOTATE_NOENDBR_SYM(io_serial_out);
-static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in;
-static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out;
+DEFINE_STATIC_CALL(serial_in, io_serial_in);
+DEFINE_STATIC_CALL(serial_out, io_serial_out);
static int early_serial_putc(unsigned char ch)
{
unsigned timeout = 0xffff;
- while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout)
+ while ((static_call(serial_in)(early_serial_base, LSR) & XMTRDY) == 0 && --timeout)
cpu_relax();
- serial_out(early_serial_base, TXR, ch);
+ static_call(serial_out)(early_serial_base, TXR, ch);
return timeout ? 0 : -1;
}
@@ -131,16 +134,16 @@ static __init void early_serial_hw_init(unsigned divisor)
{
unsigned char c;
- serial_out(early_serial_base, LCR, 0x3); /* 8n1 */
- serial_out(early_serial_base, IER, 0); /* no interrupt */
- serial_out(early_serial_base, FCR, 0); /* no fifo */
- serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */
+ static_call(serial_out)(early_serial_base, LCR, 0x3); /* 8n1 */
+ static_call(serial_out)(early_serial_base, IER, 0); /* no interrupt */
+ static_call(serial_out)(early_serial_base, FCR, 0); /* no fifo */
+ static_call(serial_out)(early_serial_base, MCR, 0x3); /* DTR + RTS */
- c = serial_in(early_serial_base, LCR);
- serial_out(early_serial_base, LCR, c | DLAB);
- serial_out(early_serial_base, DLL, divisor & 0xff);
- serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff);
- serial_out(early_serial_base, LCR, c & ~DLAB);
+ c = static_call(serial_in)(early_serial_base, LCR);
+ static_call(serial_out)(early_serial_base, LCR, c | DLAB);
+ static_call(serial_out)(early_serial_base, DLL, divisor & 0xff);
+ static_call(serial_out)(early_serial_base, DLH, (divisor >> 8) & 0xff);
+ static_call(serial_out)(early_serial_base, LCR, c & ~DLAB);
}
#define DEFAULT_BAUD 9600
@@ -183,28 +186,26 @@ static __init void early_serial_init(char *s)
/* Convert from baud to divisor value */
divisor = 115200 / baud;
- /* These will always be IO based ports */
- serial_in = io_serial_in;
- serial_out = io_serial_out;
-
/* Set up the HW */
early_serial_hw_init(divisor);
}
#ifdef CONFIG_PCI
-static void mem32_serial_out(unsigned long addr, int offset, int value)
+static __noendbr void mem32_serial_out(unsigned long addr, int offset, int value)
{
u32 __iomem *vaddr = (u32 __iomem *)addr;
/* shift implied by pointer type */
writel(value, vaddr + offset);
}
+ANNOTATE_NOENDBR_SYM(mem32_serial_out);
-static unsigned int mem32_serial_in(unsigned long addr, int offset)
+static __noendbr unsigned int mem32_serial_in(unsigned long addr, int offset)
{
u32 __iomem *vaddr = (u32 __iomem *)addr;
/* shift implied by pointer type */
return readl(vaddr + offset);
}
+ANNOTATE_NOENDBR_SYM(mem32_serial_in);
/*
* early_pci_serial_init()
@@ -278,15 +279,13 @@ static __init void early_pci_serial_init(char *s)
*/
if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) {
/* it is IO mapped */
- serial_in = io_serial_in;
- serial_out = io_serial_out;
early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK;
write_pci_config(bus, slot, func, PCI_COMMAND,
cmdreg|PCI_COMMAND_IO);
} else {
/* It is memory mapped - assume 32-bit alignment */
- serial_in = mem32_serial_in;
- serial_out = mem32_serial_out;
+ static_call_update(serial_in, mem32_serial_in);
+ static_call_update(serial_out, mem32_serial_out);
/* WARNING! assuming the address is always in the first 4G */
early_serial_base =
(unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1209c7aebb21..1b734a9ff088 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -60,9 +60,16 @@ bool irq_fpu_usable(void)
if (WARN_ON_ONCE(in_nmi()))
return false;
- /* In kernel FPU usage already active? */
- if (this_cpu_read(in_kernel_fpu))
+ /*
+ * In kernel FPU usage already active? This detects any explicitly
+ * nested usage in task or softirq context, which is unsupported. It
+ * also detects attempted usage in a hardirq that has interrupted a
+ * kernel-mode FPU section.
+ */
+ if (this_cpu_read(in_kernel_fpu)) {
+ WARN_ON_FPU(!in_hardirq());
return false;
+ }
/*
* When not in NMI or hard interrupt context, FPU can be used in:
@@ -220,7 +227,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
struct fpstate *fpstate;
unsigned int size;
- size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
+ size = fpu_kernel_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
fpstate = vzalloc(size);
if (!fpstate)
return false;
@@ -232,8 +239,8 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
fpstate->is_guest = true;
gfpu->fpstate = fpstate;
- gfpu->xfeatures = fpu_user_cfg.default_features;
- gfpu->perm = fpu_user_cfg.default_features;
+ gfpu->xfeatures = fpu_kernel_cfg.default_features;
+ gfpu->perm = fpu_kernel_cfg.default_features;
/*
* KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
@@ -420,7 +427,8 @@ EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
- preempt_disable();
+ if (!irqs_disabled())
+ fpregs_lock();
WARN_ON_FPU(!irq_fpu_usable());
WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
@@ -448,7 +456,8 @@ void kernel_fpu_end(void)
WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
this_cpu_write(in_kernel_fpu, false);
- preempt_enable();
+ if (!irqs_disabled())
+ fpregs_unlock();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);
diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h
index dbdb31f55fc7..975de070c9c9 100644
--- a/arch/x86/kernel/fpu/internal.h
+++ b/arch/x86/kernel/fpu/internal.h
@@ -18,7 +18,7 @@ static __always_inline __pure bool use_fxsr(void)
#ifdef CONFIG_X86_DEBUG_FPU
# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
#else
-# define WARN_ON_FPU(x) ({ (void)(x); 0; })
+# define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; })
#endif
/* Used in init.c */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 8f62e0666dea..6c69cb28b298 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -27,19 +27,14 @@
static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
struct _fpx_sw_bytes *fx_sw)
{
- int min_xstate_size = sizeof(struct fxregs_state) +
- sizeof(struct xstate_header);
void __user *fpstate = fxbuf;
unsigned int magic2;
if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw)))
return false;
- /* Check for the first magic field and other error scenarios. */
- if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
- fx_sw->xstate_size < min_xstate_size ||
- fx_sw->xstate_size > current->thread.fpu.fpstate->user_size ||
- fx_sw->xstate_size > fx_sw->extended_size)
+ /* Check for the first magic field */
+ if (fx_sw->magic1 != FP_XSTATE_MAGIC1)
goto setfx;
/*
@@ -48,7 +43,7 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
* fpstate layout with out copying the extended state information
* in the memory layout.
*/
- if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)))
+ if (__get_user(magic2, (__u32 __user *)(fpstate + current->thread.fpu.fpstate->user_size)))
return false;
if (likely(magic2 == FP_XSTATE_MAGIC2))
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 27417b685c1d..6a41d1610d8b 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -259,32 +259,20 @@ static void __init setup_xstate_cache(void)
}
}
-static void __init print_xstate_feature(u64 xstate_mask)
-{
- const char *feature_name;
-
- if (cpu_has_xfeatures(xstate_mask, &feature_name))
- pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
-}
-
/*
* Print out all the supported xstate features:
*/
static void __init print_xstate_features(void)
{
- print_xstate_feature(XFEATURE_MASK_FP);
- print_xstate_feature(XFEATURE_MASK_SSE);
- print_xstate_feature(XFEATURE_MASK_YMM);
- print_xstate_feature(XFEATURE_MASK_BNDREGS);
- print_xstate_feature(XFEATURE_MASK_BNDCSR);
- print_xstate_feature(XFEATURE_MASK_OPMASK);
- print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
- print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
- print_xstate_feature(XFEATURE_MASK_PKRU);
- print_xstate_feature(XFEATURE_MASK_PASID);
- print_xstate_feature(XFEATURE_MASK_CET_USER);
- print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
- print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
+ int i;
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ u64 mask = BIT_ULL(i);
+ const char *name;
+
+ if (cpu_has_xfeatures(mask, &name))
+ pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name);
+ }
}
/*
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index aa16f1a1bbcf..0fd34f53f025 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -94,30 +94,33 @@ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 ma
/* XSAVE/XRSTOR wrapper functions */
#ifdef CONFIG_X86_64
-#define REX_PREFIX "0x48, "
+#define REX_SUFFIX "64"
#else
-#define REX_PREFIX
+#define REX_SUFFIX
#endif
-/* These macros all use (%edi)/(%rdi) as the single memory argument. */
-#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
-#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
-#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27"
-#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
-#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
-#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+#define XSAVE "xsave" REX_SUFFIX " %[xa]"
+#define XSAVEOPT "xsaveopt" REX_SUFFIX " %[xa]"
+#define XSAVEC "xsavec" REX_SUFFIX " %[xa]"
+#define XSAVES "xsaves" REX_SUFFIX " %[xa]"
+#define XRSTOR "xrstor" REX_SUFFIX " %[xa]"
+#define XRSTORS "xrstors" REX_SUFFIX " %[xa]"
/*
* After this @err contains 0 on success or the trap number when the
* operation raises an exception.
+ *
+ * The [xa] input parameter below represents the struct xregs_state pointer
+ * and the asm symbolic name for the argument used in the XSAVE/XRSTOR insns
+ * above.
*/
#define XSTATE_OP(op, st, lmask, hmask, err) \
asm volatile("1:" op "\n\t" \
"xor %[err], %[err]\n" \
- "2:\n\t" \
+ "2:\n" \
_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \
: [err] "=a" (err) \
- : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \
: "memory")
/*
@@ -137,12 +140,12 @@ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 ma
XSAVEOPT, X86_FEATURE_XSAVEOPT, \
XSAVEC, X86_FEATURE_XSAVEC, \
XSAVES, X86_FEATURE_XSAVES) \
- "\n" \
+ "\n\t" \
"xor %[err], %[err]\n" \
"3:\n" \
_ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
: [err] "=r" (err) \
- : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \
: "memory")
/*
@@ -156,7 +159,7 @@ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 ma
"3:\n" \
_ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \
: \
- : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \
: "memory")
#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 166bc0ea3bdf..cace6e8d7cc7 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -118,13 +118,10 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code,
return ret;
/* replace the text with the new text */
- if (ftrace_poke_late) {
+ if (ftrace_poke_late)
text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
- } else {
- mutex_lock(&text_mutex);
- text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE);
- mutex_unlock(&text_mutex);
- }
+ else
+ text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
return 0;
}
@@ -321,7 +318,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE };
union ftrace_op_code_union op_ptr;
- void *ret;
+ int ret;
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
start_offset = (unsigned long)ftrace_regs_caller;
@@ -352,15 +349,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
/* Copy ftrace_caller onto the trampoline memory */
- ret = text_poke_copy(trampoline, (void *)start_offset, size);
- if (WARN_ON(!ret))
+ ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size);
+ if (WARN_ON(ret < 0))
goto fail;
ip = trampoline + size;
if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
__text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE);
else
- text_poke_copy(ip, retq, sizeof(retq));
+ memcpy(ip, retq, sizeof(retq));
/* No need to test direct calls on created trampolines */
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
@@ -368,7 +365,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
ip = trampoline + (jmp_offset - start_offset);
if (WARN_ON(*(char *)ip != 0x75))
goto fail;
- if (!text_poke_copy(ip, x86_nops[2], 2))
+ ret = copy_from_kernel_nofault(ip, x86_nops[2], 2);
+ if (ret < 0)
goto fail;
}
@@ -381,7 +379,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
*/
ptr = (unsigned long *)(trampoline + size + RET_SIZE);
- text_poke_copy(ptr, &ops, sizeof(unsigned long));
+ *ptr = (unsigned long)ops;
op_offset -= start_offset;
memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE);
@@ -397,7 +395,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
op_ptr.offset = offset;
/* put in the new offset to the ftrace_ops */
- text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+ memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
/* put in the call to the function */
mutex_lock(&text_mutex);
@@ -407,9 +405,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
* the depth accounting before the call already.
*/
dest = ftrace_ops_get_func(ops);
- text_poke_copy_locked(trampoline + call_offset,
- text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
- CALL_INSN_SIZE, false);
+ memcpy(trampoline + call_offset,
+ text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
+ CALL_INSN_SIZE);
mutex_unlock(&text_mutex);
/* ALLOC_TRAMP flags lets us know we created it */
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index d51647228596..367da3638167 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -146,12 +146,14 @@ SYM_FUNC_END(ftrace_stub_graph)
#ifdef CONFIG_DYNAMIC_FTRACE
SYM_FUNC_START(__fentry__)
+ ANNOTATE_NOENDBR
CALL_DEPTH_ACCOUNT
RET
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
SYM_FUNC_START(ftrace_caller)
+ ANNOTATE_NOENDBR
/* save_mcount_regs fills in first two parameters */
save_mcount_regs
@@ -197,6 +199,7 @@ SYM_FUNC_END(ftrace_caller);
STACK_FRAME_NON_STANDARD_FP(ftrace_caller)
SYM_FUNC_START(ftrace_regs_caller)
+ ANNOTATE_NOENDBR
/* Save the current flags before any operations that can change them */
pushfq
@@ -310,6 +313,7 @@ SYM_FUNC_END(ftrace_regs_caller)
STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller)
SYM_FUNC_START(ftrace_stub_direct_tramp)
+ ANNOTATE_NOENDBR
CALL_DEPTH_ACCOUNT
RET
SYM_FUNC_END(ftrace_stub_direct_tramp)
@@ -317,6 +321,7 @@ SYM_FUNC_END(ftrace_stub_direct_tramp)
#else /* ! CONFIG_DYNAMIC_FTRACE */
SYM_FUNC_START(__fentry__)
+ ANNOTATE_NOENDBR
CALL_DEPTH_ACCOUNT
cmpq $ftrace_stub, ftrace_trace_function
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 22c9ba305ac1..fa9b6339975f 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -5,8 +5,6 @@
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
*/
-#define DISABLE_BRANCH_PROFILING
-
/* cpu_feature_enabled() cannot be used this early */
#define USE_EARLY_PGTABLE_L5
@@ -567,7 +565,7 @@ void early_setup_idt(void)
*/
void __head startup_64_setup_gdt_idt(void)
{
- struct desc_struct *gdt = (void *)(__force unsigned long)init_per_cpu_var(gdt_page.gdt);
+ struct desc_struct *gdt = (void *)(__force unsigned long)gdt_page.gdt;
void *handler = NULL;
struct desc_ptr startup_gdt_descr = {
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 31345e0ba006..fefe2a25cf02 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -61,11 +61,14 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Set up the stack for verify_cpu() */
leaq __top_init_kernel_stack(%rip), %rsp
- /* Setup GSBASE to allow stack canary access for C code */
+ /*
+ * Set up GSBASE.
+ * Note that on SMP the boot CPU uses the init data section until
+ * the per-CPU areas are set up.
+ */
movl $MSR_GS_BASE, %ecx
- leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
- movl %edx, %eax
- shrq $32, %rdx
+ xorl %eax, %eax
+ xorl %edx, %edx
wrmsr
call startup_64_setup_gdt_idt
@@ -319,7 +322,7 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL)
*
* RDX contains the per-cpu offset
*/
- movq pcpu_hot + X86_current_task(%rdx), %rax
+ movq current_task(%rdx), %rax
movq TASK_threadsp(%rax), %rsp
/*
@@ -359,17 +362,12 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL)
movl %eax,%fs
movl %eax,%gs
- /* Set up %gs.
- *
- * The base of %gs always points to fixed_percpu_data. If the
- * stack protector canary is enabled, it is located at %gs:40.
+ /*
+ * Set up GSBASE.
* Note that, on SMP, the boot cpu uses init data section until
* the per cpu areas are set up.
*/
movl $MSR_GS_BASE,%ecx
-#ifndef CONFIG_SMP
- leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
-#endif
movl %edx, %eax
shrq $32, %rdx
wrmsr
@@ -435,7 +433,7 @@ SYM_CODE_START(soft_restart_cpu)
UNWIND_HINT_END_OF_STACK
/* Find the idle task stack */
- movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx
+ movq PER_CPU_VAR(current_task), %rcx
movq TASK_threadsp(%rcx), %rsp
jmp .Ljump_to_C_code
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index c20d1832c481..2bade73f49e3 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -23,6 +23,7 @@
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/i8259.h>
+#include <asm/io_apic.h>
/*
* This is the 'legacy' 8259A Programmable Interrupt Controller,
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index e2fab3ceb09f..6290dd120f5e 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -144,7 +144,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
* Update the sequence number to force a TSS update on return to
* user mode.
*/
- iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence);
+ iobm->sequence = atomic64_inc_return(&io_bitmap_sequence);
return 0;
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index feca4f20b06a..81f9b78e0f7b 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -33,6 +33,11 @@
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);
+DEFINE_PER_CPU_CACHE_HOT(u16, __softirq_pending);
+EXPORT_PER_CPU_SYMBOL(__softirq_pending);
+
+DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr);
+
atomic_t irq_err_count;
/*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index dc1049c01f9b..c7a5d2960d57 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -29,12 +29,9 @@
int sysctl_panic_on_stackoverflow __read_mostly;
/* Debugging check for stack overflow: is there less than 1KB free? */
-static int check_stack_overflow(void)
+static bool check_stack_overflow(void)
{
- long sp;
-
- __asm__ __volatile__("andl %%esp,%0" :
- "=r" (sp) : "0" (THREAD_SIZE - 1));
+ unsigned long sp = current_stack_pointer & (THREAD_SIZE - 1);
return sp < (sizeof(struct thread_info) + STACK_WARN);
}
@@ -48,18 +45,19 @@ static void print_stack_overflow(void)
}
#else
-static inline int check_stack_overflow(void) { return 0; }
+static inline bool check_stack_overflow(void) { return false; }
static inline void print_stack_overflow(void) { }
#endif
+DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr);
+
static void call_on_stack(void *func, void *stack)
{
- asm volatile("xchgl %%ebx,%%esp \n"
+ asm volatile("xchgl %[sp], %%esp\n"
CALL_NOSPEC
- "movl %%ebx,%%esp \n"
- : "=b" (stack)
- : "0" (stack),
- [thunk_target] "D"(func)
+ "movl %[sp], %%esp"
+ : [sp] "+b" (stack)
+ : [thunk_target] "D" (func)
: "memory", "cc", "edx", "ecx", "eax");
}
@@ -68,13 +66,13 @@ static inline void *current_stack(void)
return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
}
-static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
+static inline bool execute_on_irq_stack(bool overflow, struct irq_desc *desc)
{
struct irq_stack *curstk, *irqstk;
- u32 *isp, *prev_esp, arg1;
+ u32 *isp, *prev_esp;
curstk = (struct irq_stack *) current_stack();
- irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr);
+ irqstk = __this_cpu_read(hardirq_stack_ptr);
/*
* this is where we switch to the IRQ stack. However, if we are
@@ -83,7 +81,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
* current stack (which is the irq stack already after all)
*/
if (unlikely(curstk == irqstk))
- return 0;
+ return false;
isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
@@ -94,14 +92,13 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
if (unlikely(overflow))
call_on_stack(print_stack_overflow, isp);
- asm volatile("xchgl %%ebx,%%esp \n"
+ asm volatile("xchgl %[sp], %%esp\n"
CALL_NOSPEC
- "movl %%ebx,%%esp \n"
- : "=a" (arg1), "=b" (isp)
- : "0" (desc), "1" (isp),
- [thunk_target] "D" (desc->handle_irq)
- : "memory", "cc", "ecx");
- return 1;
+ "movl %[sp], %%esp"
+ : "+a" (desc), [sp] "+b" (isp)
+ : [thunk_target] "D" (desc->handle_irq)
+ : "memory", "cc", "edx", "ecx");
+ return true;
}
/*
@@ -112,7 +109,7 @@ int irq_init_percpu_irqstack(unsigned int cpu)
int node = cpu_to_node(cpu);
struct page *ph, *ps;
- if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu))
+ if (per_cpu(hardirq_stack_ptr, cpu))
return 0;
ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
@@ -124,8 +121,8 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return -ENOMEM;
}
- per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph);
- per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps);
+ per_cpu(hardirq_stack_ptr, cpu) = page_address(ph);
+ per_cpu(softirq_stack_ptr, cpu) = page_address(ps);
return 0;
}
@@ -135,7 +132,7 @@ void do_softirq_own_stack(void)
struct irq_stack *irqstk;
u32 *isp, *prev_esp;
- irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr);
+ irqstk = __this_cpu_read(softirq_stack_ptr);
/* build the stack frame on the softirq stack */
isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
@@ -150,7 +147,7 @@ void do_softirq_own_stack(void)
void __handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
- int overflow = check_stack_overflow();
+ bool overflow = check_stack_overflow();
if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) {
if (unlikely(overflow))
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index ade0043ce56e..ca78dce39361 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,8 +26,8 @@
#include <asm/io_apic.h>
#include <asm/apic.h>
+DEFINE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse);
DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible;
-DECLARE_INIT_PER_CPU(irq_stack_backing_store);
#ifdef CONFIG_VMAP_STACK
/*
@@ -51,7 +51,7 @@ static int map_irq_stack(unsigned int cpu)
return -ENOMEM;
/* Store actual TOS to avoid adjustment in the hotpath */
- per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#else
@@ -64,14 +64,14 @@ static int map_irq_stack(unsigned int cpu)
void *va = per_cpu_ptr(&irq_stack_backing_store, cpu);
/* Store actual TOS to avoid adjustment in the hotpath */
- per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#endif
int irq_init_percpu_irqstack(unsigned int cpu)
{
- if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu))
+ if (per_cpu(hardirq_stack_ptr, cpu))
return 0;
return map_irq_stack(cpu);
}
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index 7f542a7799cb..fdabd5dda154 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -9,6 +9,7 @@
*/
.pushsection .noinstr.text, "ax"
SYM_FUNC_START(native_save_fl)
+ ENDBR
pushf
pop %_ASM_AX
RET
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 72e6a45e7ec2..09608fd93687 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -373,16 +373,7 @@ out:
kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
bool *on_func_entry)
{
- u32 insn;
-
- /*
- * Since 'addr' is not guaranteed to be safe to access, use
- * copy_from_kernel_nofault() to read the instruction:
- */
- if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32)))
- return NULL;
-
- if (is_endbr(insn)) {
+ if (is_endbr((u32 *)addr)) {
*on_func_entry = !offset || offset == 4;
if (*on_func_entry)
offset = 4;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7a422a6c5983..3be9b3342c67 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void)
#ifdef CONFIG_SMP
if (pv_tlb_flush_supported()) {
pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
pr_info("KVM setup pv remote TLB flush\n");
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 8984abd91c00..a7998f351701 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -19,6 +19,7 @@
#include <linux/jump_label.h>
#include <linux/random.h>
#include <linux/memory.h>
+#include <linux/stackprotector.h>
#include <asm/text-patching.h>
#include <asm/page.h>
@@ -130,6 +131,20 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
goto overflow;
size = 4;
break;
+#if defined(CONFIG_STACKPROTECTOR) && \
+ defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000
+ case R_X86_64_REX_GOTPCRELX: {
+ static unsigned long __percpu *const addr = &__stack_chk_guard;
+
+ if (sym->st_value != (u64)addr) {
+ pr_err("%s: Unsupported GOTPCREL relocation\n", me->name);
+ return -ENOEXEC;
+ }
+
+ val = (u64)&addr + rel[i].r_addend;
+ fallthrough;
+ }
+#endif
case R_X86_64_PC32:
case R_X86_64_PLT32:
val -= (u64)loc;
@@ -146,21 +161,18 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
}
if (apply) {
- void *wr_loc = module_writable_address(me, loc);
-
- if (memcmp(wr_loc, &zero, size)) {
+ if (memcmp(loc, &zero, size)) {
pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), loc, val);
return -ENOEXEC;
}
- write(wr_loc, &val, size);
+ write(loc, &val, size);
} else {
if (memcmp(loc, &val, size)) {
pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), loc, val);
return -ENOEXEC;
}
- /* FIXME: needs care for ROX module allocations */
write(loc, &zero, size);
}
}
@@ -227,7 +239,7 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
- const Elf_Shdr *s, *alt = NULL,
+ const Elf_Shdr *s, *alt = NULL, *locks = NULL,
*orc = NULL, *orc_ip = NULL,
*retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
*calls = NULL, *cfi = NULL;
@@ -236,6 +248,8 @@ int module_finalize(const Elf_Ehdr *hdr,
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
+ if (!strcmp(".smp_locks", secstrings + s->sh_name))
+ locks = s;
if (!strcmp(".orc_unwind", secstrings + s->sh_name))
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
@@ -266,60 +280,33 @@ int module_finalize(const Elf_Ehdr *hdr,
csize = cfi->sh_size;
}
- apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me);
+ apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize);
}
if (retpolines) {
void *rseg = (void *)retpolines->sh_addr;
- apply_retpolines(rseg, rseg + retpolines->sh_size, me);
+ apply_retpolines(rseg, rseg + retpolines->sh_size);
}
if (returns) {
void *rseg = (void *)returns->sh_addr;
- apply_returns(rseg, rseg + returns->sh_size, me);
- }
- if (alt) {
- /* patch .altinstructions */
- void *aseg = (void *)alt->sh_addr;
- apply_alternatives(aseg, aseg + alt->sh_size, me);
+ apply_returns(rseg, rseg + returns->sh_size);
}
- if (calls || alt) {
+ if (calls) {
struct callthunk_sites cs = {};
- if (calls) {
- cs.call_start = (void *)calls->sh_addr;
- cs.call_end = (void *)calls->sh_addr + calls->sh_size;
- }
-
- if (alt) {
- cs.alt_start = (void *)alt->sh_addr;
- cs.alt_end = (void *)alt->sh_addr + alt->sh_size;
- }
+ cs.call_start = (void *)calls->sh_addr;
+ cs.call_end = (void *)calls->sh_addr + calls->sh_size;
callthunks_patch_module_calls(&cs, me);
}
+ if (alt) {
+ /* patch .altinstructions */
+ void *aseg = (void *)alt->sh_addr;
+ apply_alternatives(aseg, aseg + alt->sh_size);
+ }
if (ibt_endbr) {
void *iseg = (void *)ibt_endbr->sh_addr;
- apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me);
+ apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size);
}
-
- if (orc && orc_ip)
- unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
- (void *)orc->sh_addr, orc->sh_size);
-
- return 0;
-}
-
-int module_post_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
-{
- const Elf_Shdr *s, *locks = NULL;
- char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-
- for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
- if (!strcmp(".smp_locks", secstrings + s->sh_name))
- locks = s;
- }
-
if (locks) {
void *lseg = (void *)locks->sh_addr;
void *text = me->mem[MOD_TEXT].base;
@@ -329,6 +316,10 @@ int module_post_finalize(const Elf_Ehdr *hdr,
text, text_end);
}
+ if (orc && orc_ip)
+ unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+ (void *)orc->sh_addr, orc->sh_size);
+
return 0;
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ed163c8c8604..9a95d00f1423 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -40,8 +40,12 @@
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
+/*
+ * An emergency handler can be set in any context including NMI
+ */
struct nmi_desc {
raw_spinlock_t lock;
+ nmi_handler_t emerg_handler;
struct list_head head;
};
@@ -132,9 +136,22 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration)
static int nmi_handle(unsigned int type, struct pt_regs *regs)
{
struct nmi_desc *desc = nmi_to_desc(type);
+ nmi_handler_t ehandler;
struct nmiaction *a;
int handled=0;
+ /*
+ * Call the emergency handler, if set
+ *
+ * In the case of crash_nmi_callback() emergency handler, it will
+ * return in the case of the crashing CPU to enable it to complete
+ * other necessary crashing actions ASAP. Other handlers in the
+ * linked list won't need to be run.
+ */
+ ehandler = desc->emerg_handler;
+ if (ehandler)
+ return ehandler(type, regs);
+
rcu_read_lock();
/*
@@ -224,6 +241,31 @@ void unregister_nmi_handler(unsigned int type, const char *name)
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
+/**
+ * set_emergency_nmi_handler - Set emergency handler
+ * @type: NMI type
+ * @handler: the emergency handler to be stored
+ *
+ * Set an emergency NMI handler which, if set, will preempt all the other
+ * handlers in the linked list. If a NULL handler is passed in, it will clear
+ * it. It is expected that concurrent calls to this function will not happen
+ * or the system is screwed beyond repair.
+ */
+void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+
+ if (WARN_ON_ONCE(desc->emerg_handler == handler))
+ return;
+ desc->emerg_handler = handler;
+
+ /*
+ * Ensure the emergency handler is visible to other CPUs before
+ * function return
+ */
+ smp_wmb();
+}
+
static void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1ccaa3397a67..97925632c28e 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -59,21 +59,6 @@ void __init native_pv_lock_init(void)
static_branch_enable(&virt_spin_lock_key);
}
-#ifndef CONFIG_PT_RECLAIM
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- struct ptdesc *ptdesc = (struct ptdesc *)table;
-
- pagetable_dtor(ptdesc);
- tlb_remove_page(tlb, ptdesc_page(ptdesc));
-}
-#else
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_table(tlb, table);
-}
-#endif
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -90,30 +75,20 @@ void paravirt_set_sched_clock(u64 (*func)(void))
static_call_update(pv_sched_clock, func);
}
-/* These are in entry.S */
-static struct resource reserve_ioports = {
- .start = 0,
- .end = IO_SPACE_LIMIT,
- .name = "paravirt-ioport",
- .flags = IORESOURCE_IO | IORESOURCE_BUSY,
-};
+#ifdef CONFIG_PARAVIRT_XXL
+static noinstr void pv_native_write_cr2(unsigned long val)
+{
+ native_write_cr2(val);
+}
-/*
- * Reserve the whole legacy IO space to prevent any legacy drivers
- * from wasting time probing for their hardware. This is a fairly
- * brute-force approach to disabling all non-virtual drivers.
- *
- * Note that this must be called very early to have any effect.
- */
-int paravirt_disable_iospace(void)
+static noinstr unsigned long pv_native_read_cr3(void)
{
- return request_resource(&ioport_resource, &reserve_ioports);
+ return __native_read_cr3();
}
-#ifdef CONFIG_PARAVIRT_XXL
-static noinstr void pv_native_write_cr2(unsigned long val)
+static noinstr void pv_native_write_cr3(unsigned long cr3)
{
- native_write_cr2(val);
+ native_write_cr3(cr3);
}
static noinstr unsigned long pv_native_get_debugreg(int regno)
@@ -195,7 +170,6 @@ struct paravirt_patch_template pv_ops = {
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table = native_tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
@@ -203,8 +177,8 @@ struct paravirt_patch_template pv_ops = {
#ifdef CONFIG_PARAVIRT_XXL
.mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2),
.mmu.write_cr2 = pv_native_write_cr2,
- .mmu.read_cr3 = __native_read_cr3,
- .mmu.write_cr3 = native_write_cr3,
+ .mmu.read_cr3 = pv_native_read_cr3,
+ .mmu.write_cr3 = pv_native_write_cr3,
.mmu.pgd_alloc = __paravirt_pgd_alloc,
.mmu.pgd_free = paravirt_nop,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6da6769d7254..91f6ff618852 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -93,7 +93,12 @@ EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- memcpy(dst, src, arch_task_struct_size);
+ /* init_task is not dynamically sized (incomplete FPU state) */
+ if (unlikely(src == &init_task))
+ memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(init_task), 0);
+ else
+ memcpy(dst, src, arch_task_struct_size);
+
#ifdef CONFIG_VM86
dst->thread.vm86 = NULL;
#endif
@@ -1043,7 +1048,7 @@ unsigned long __get_wchan(struct task_struct *p)
return addr;
}
-long do_arch_prctl_common(int option, unsigned long arg2)
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
switch (option) {
case ARCH_GET_CPUID:
@@ -1058,5 +1063,13 @@ long do_arch_prctl_common(int option, unsigned long arg2)
return fpu_xstate_prctl(option, arg2);
}
+ if (!in_ia32_syscall())
+ return do_arch_prctl_64(current, option, arg2);
+
return -EINVAL;
}
+
+SYSCALL_DEFINE0(ni_syscall)
+{
+ return -ENOSYS;
+}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0917c7f25720..4636ef359973 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -190,13 +190,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
arch_end_context_switch(next_p);
/*
- * Reload esp0 and pcpu_hot.top_of_stack. This changes
+ * Reload esp0 and cpu_current_top_of_stack. This changes
* current_thread_info(). Refresh the SYSENTER configuration in
* case prev or next is vm86.
*/
update_task_stack(next_p);
refresh_sysenter_cs(next);
- this_cpu_write(pcpu_hot.top_of_stack,
+ this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
@@ -206,7 +206,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (prev->gs | next->gs)
loadsegment(gs, next->gs);
- raw_cpu_write(pcpu_hot.current_task, next_p);
+ raw_cpu_write(current_task, next_p);
switch_fpu_finish(next_p);
@@ -215,8 +215,3 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
return prev_p;
}
-
-SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
-{
- return do_arch_prctl_common(option, arg2);
-}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 226472332a70..7196ca7048be 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -614,7 +614,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
- this_cpu_read(pcpu_hot.hardirq_stack_inuse));
+ this_cpu_read(hardirq_stack_inuse));
if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD))
switch_fpu_prepare(prev_p, cpu);
@@ -668,8 +668,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- raw_cpu_write(pcpu_hot.current_task, next_p);
- raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p));
+ raw_cpu_write(current_task, next_p);
+ raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
switch_fpu_finish(next_p);
@@ -942,7 +942,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
case ARCH_MAP_VDSO_X32:
return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
-# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+# ifdef CONFIG_IA32_EMULATION
case ARCH_MAP_VDSO_32:
return prctl_map_vdso(&vdso_image_32, arg2);
# endif
@@ -979,26 +979,3 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
return ret;
}
-
-SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
-{
- long ret;
-
- ret = do_arch_prctl_64(current, option, arg2);
- if (ret == -EINVAL)
- ret = do_arch_prctl_common(option, arg2);
-
- return ret;
-}
-
-#ifdef CONFIG_IA32_EMULATION
-COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
-{
- return do_arch_prctl_common(option, arg2);
-}
-#endif
-
-unsigned long KSTK_ESP(struct task_struct *task)
-{
- return task_pt_regs(task)->sp;
-}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 6d0df6a58873..a92f18db9610 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -10,6 +10,8 @@
#include <asm/setup.h>
#include <asm/mce.h>
+#include <linux/platform_data/x86/apple.h>
+
#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
static void quirk_intel_irqbalance(struct pci_dev *dev)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index dc1dd3f3e67f..964f6b0a3d68 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -921,20 +921,16 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
return;
/* Make a note of crashing cpu. Will be used in NMI callback. */
- crashing_cpu = safe_smp_processor_id();
+ crashing_cpu = smp_processor_id();
shootdown_callback = callback;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
- /* Would it be better to replace the trap vector here? */
- if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
- NMI_FLAG_FIRST, "crash"))
- return; /* Return what? */
+
/*
- * Ensure the new callback function is set before sending
- * out the NMI
+ * Set emergency handler to preempt other handlers.
*/
- wmb();
+ set_emergency_nmi_handler(NMI_LOCAL, crash_nmi_callback);
apic_send_IPI_allbutself(NMI_VECTOR);
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index b44d8863e57f..ac058971a382 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -40,6 +40,16 @@ SYM_DATA(kexec_pa_table_page, .quad 0)
SYM_DATA(kexec_pa_swap_page, .quad 0)
SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
+ .balign 16
+SYM_DATA_START_LOCAL(kexec_debug_gdt)
+ .word kexec_debug_gdt_end - kexec_debug_gdt - 1
+ .long 0
+ .word 0
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)
+
.section .text..relocate_kernel,"ax";
.code64
SYM_CODE_START_NOALIGN(relocate_kernel)
@@ -116,6 +126,19 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* store the start address on the stack */
pushq %rdx
+ /* Create a GDTR (16 bits limit, 64 bits addr) on stack */
+ leaq kexec_debug_gdt(%rip), %rax
+ pushq %rax
+ pushw (%rax)
+
+ /* Load the GDT, put the stack back */
+ lgdt (%rsp)
+ addq $10, %rsp
+
+ /* Test that we can load segments */
+ movq %ds, %rax
+ movq %rax, %ds
+
/*
* Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
* below.
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cebee310e200..c7164a8de983 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -56,6 +56,9 @@
#include <asm/unwind.h>
#include <asm/vsyscall.h>
#include <linux/vmalloc.h>
+#if defined(CONFIG_X86_LOCAL_APIC)
+#include <asm/nmi.h>
+#endif
/*
* max_low_pfn_mapped: highest directly mapped pfn < 4 GB
@@ -146,6 +149,69 @@ static size_t ima_kexec_buffer_size;
/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
int bootloader_type, bootloader_version;
+static const struct ctl_table x86_sysctl_table[] = {
+ {
+ .procname = "panic_on_unrecovered_nmi",
+ .data = &panic_on_unrecovered_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_on_io_nmi",
+ .data = &panic_on_io_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "bootloader_type",
+ .data = &bootloader_type,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "bootloader_version",
+ .data = &bootloader_version,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "io_delay_type",
+ .data = &io_delay_type,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#if defined(CONFIG_X86_LOCAL_APIC)
+ {
+ .procname = "unknown_nmi_panic",
+ .data = &unknown_nmi_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#if defined(CONFIG_ACPI_SLEEP)
+ {
+ .procname = "acpi_video_flags",
+ .data = &acpi_realmode_flags,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+#endif
+};
+
+static int __init init_x86_sysctl(void)
+{
+ register_sysctl_init("kernel", x86_sysctl_table);
+ return 0;
+}
+arch_initcall(init_x86_sysctl);
+
/*
* Setup options
*/
@@ -429,6 +495,46 @@ static void __init parse_setup_data(void)
}
}
+/*
+ * Translate the fields of 'struct boot_param' into global variables
+ * representing these parameters.
+ */
+static void __init parse_boot_params(void)
+{
+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+ screen_info = boot_params.screen_info;
+ edid_info = boot_params.edid_info;
+#ifdef CONFIG_X86_32
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+#endif
+ saved_video_mode = boot_params.hdr.vid_mode;
+ bootloader_type = boot_params.hdr.type_of_loader;
+ if ((bootloader_type >> 4) == 0xe) {
+ bootloader_type &= 0xf;
+ bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+ }
+ bootloader_version = bootloader_type & 0xf;
+ bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
+
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+#endif
+#ifdef CONFIG_EFI
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI32_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI64_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
+ }
+#endif
+
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
+}
+
static void __init memblock_x86_reserve_range_setup_data(void)
{
struct setup_indirect *indirect;
@@ -527,6 +633,23 @@ void __init reserve_standard_io_resources(void)
}
+static void __init setup_kernel_resources(void)
+{
+ code_resource.start = __pa_symbol(_text);
+ code_resource.end = __pa_symbol(_etext)-1;
+ rodata_resource.start = __pa_symbol(__start_rodata);
+ rodata_resource.end = __pa_symbol(__end_rodata)-1;
+ data_resource.start = __pa_symbol(_sdata);
+ data_resource.end = __pa_symbol(_edata)-1;
+ bss_resource.start = __pa_symbol(__bss_start);
+ bss_resource.end = __pa_symbol(__bss_stop)-1;
+
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &rodata_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+}
+
static bool __init snb_gfx_workaround_needed(void)
{
#ifdef CONFIG_PCI
@@ -789,35 +912,7 @@ void __init setup_arch(char **cmdline_p)
setup_olpc_ofw_pgd();
- ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
- screen_info = boot_params.screen_info;
- edid_info = boot_params.edid_info;
-#ifdef CONFIG_X86_32
- apm_info.bios = boot_params.apm_bios_info;
- ist_info = boot_params.ist_info;
-#endif
- saved_video_mode = boot_params.hdr.vid_mode;
- bootloader_type = boot_params.hdr.type_of_loader;
- if ((bootloader_type >> 4) == 0xe) {
- bootloader_type &= 0xf;
- bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
- }
- bootloader_version = bootloader_type & 0xf;
- bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
-
-#ifdef CONFIG_BLK_DEV_RAM
- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
-#endif
-#ifdef CONFIG_EFI
- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- EFI32_LOADER_SIGNATURE, 4)) {
- set_bit(EFI_BOOT, &efi.flags);
- } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- EFI64_LOADER_SIGNATURE, 4)) {
- set_bit(EFI_BOOT, &efi.flags);
- set_bit(EFI_64BIT, &efi.flags);
- }
-#endif
+ parse_boot_params();
x86_init.oem.arch_setup();
@@ -841,19 +936,8 @@ void __init setup_arch(char **cmdline_p)
copy_edd();
- if (!boot_params.hdr.root_flags)
- root_mountflags &= ~MS_RDONLY;
setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end);
- code_resource.start = __pa_symbol(_text);
- code_resource.end = __pa_symbol(_etext)-1;
- rodata_resource.start = __pa_symbol(__start_rodata);
- rodata_resource.end = __pa_symbol(__end_rodata)-1;
- data_resource.start = __pa_symbol(_sdata);
- data_resource.end = __pa_symbol(_edata)-1;
- bss_resource.start = __pa_symbol(__bss_start);
- bss_resource.end = __pa_symbol(__bss_stop)-1;
-
/*
* x86_configure_nx() is called before parse_early_param() to detect
* whether hardware doesn't support NX (so that the early EHCI debug
@@ -866,30 +950,6 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled(EFI_BOOT))
efi_memblock_x86_reserve_range();
-#ifdef CONFIG_MEMORY_HOTPLUG
- /*
- * Memory used by the kernel cannot be hot-removed because Linux
- * cannot migrate the kernel pages. When memory hotplug is
- * enabled, we should prevent memblock from allocating memory
- * for the kernel.
- *
- * ACPI SRAT records all hotpluggable memory ranges. But before
- * SRAT is parsed, we don't know about it.
- *
- * The kernel image is loaded into memory at very early time. We
- * cannot prevent this anyway. So on NUMA system, we set any
- * node the kernel resides in as un-hotpluggable.
- *
- * Since on modern servers, one node could have double-digit
- * gigabytes memory, we can assume the memory around the kernel
- * image is also un-hotpluggable. So before SRAT is parsed, just
- * allocate memory near the kernel image to try the best to keep
- * the kernel away from hotpluggable memory.
- */
- if (movable_node_is_enabled())
- memblock_set_bottom_up(true);
-#endif
-
x86_report_nx();
apic_setup_apic_calls();
@@ -901,7 +961,6 @@ void __init setup_arch(char **cmdline_p)
setup_clear_cpu_cap(X86_FEATURE_APIC);
}
- e820__reserve_setup_data();
e820__finish_early_params();
if (efi_enabled(EFI_BOOT))
@@ -921,11 +980,11 @@ void __init setup_arch(char **cmdline_p)
tsc_early_init();
x86_init.resources.probe_roms();
- /* after parse_early_param, so could debug it */
- insert_resource(&iomem_resource, &code_resource);
- insert_resource(&iomem_resource, &rodata_resource);
- insert_resource(&iomem_resource, &data_resource);
- insert_resource(&iomem_resource, &bss_resource);
+ /*
+ * Add resources for kernel text and data to the iomem_resource.
+ * Do it after parse_early_param, so it can be debugged.
+ */
+ setup_kernel_resources();
e820_add_kernel_range();
trim_bios_range();
@@ -990,7 +1049,6 @@ void __init setup_arch(char **cmdline_p)
cleanup_highmap();
- memblock_set_current_limit(ISA_END_ADDRESS);
e820__memblock_setup();
/*
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index b30d6e180df7..bfa48e7a32a2 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -23,18 +23,13 @@
#include <asm/cpumask.h>
#include <asm/cpu.h>
-#ifdef CONFIG_X86_64
-#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
-#else
-#define BOOT_PERCPU_OFFSET 0
-#endif
+DEFINE_PER_CPU_CACHE_HOT(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
-DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, this_cpu_off);
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
- [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
-};
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init;
EXPORT_SYMBOL(__per_cpu_offset);
/*
@@ -169,7 +164,7 @@ void __init setup_per_cpu_areas(void)
for_each_possible_cpu(cpu) {
per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
- per_cpu(pcpu_hot.cpu_number, cpu) = cpu;
+ per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
/*
* Copy data used in early init routines from the
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index ef654530bf5a..98123ff10506 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -33,25 +33,55 @@
#include <asm/smap.h>
#include <asm/gsseg.h>
+/*
+ * The first GDT descriptor is reserved as 'NULL descriptor'. As bits 0
+ * and 1 of a segment selector, i.e., the RPL bits, are NOT used to index
+ * GDT, selector values 0~3 all point to the NULL descriptor, thus values
+ * 0, 1, 2 and 3 are all valid NULL selector values.
+ *
+ * However IRET zeros ES, FS, GS, and DS segment registers if any of them
+ * is found to have any nonzero NULL selector value, which can be used by
+ * userspace in pre-FRED systems to spot any interrupt/exception by loading
+ * a nonzero NULL selector and waiting for it to become zero. Before FRED
+ * there was nothing software could do to prevent such an information leak.
+ *
+ * ERETU, the only legit instruction to return to userspace from kernel
+ * under FRED, by design does NOT zero any segment register to avoid this
+ * problem behavior.
+ *
+ * As such, leave NULL selector values 0~3 unchanged.
+ */
+static inline u16 fixup_rpl(u16 sel)
+{
+ return sel <= 3 ? sel : sel | 3;
+}
+
#ifdef CONFIG_IA32_EMULATION
#include <asm/unistd_32_ia32.h>
static inline void reload_segments(struct sigcontext_32 *sc)
{
- unsigned int cur;
+ u16 cur;
+ /*
+ * Reload fs and gs if they have changed in the signal
+ * handler. This does not handle long fs/gs base changes in
+ * the handler, but does not clobber them at least in the
+ * normal case.
+ */
savesegment(gs, cur);
- if ((sc->gs | 0x03) != cur)
- load_gs_index(sc->gs | 0x03);
+ if (fixup_rpl(sc->gs) != cur)
+ load_gs_index(fixup_rpl(sc->gs));
savesegment(fs, cur);
- if ((sc->fs | 0x03) != cur)
- loadsegment(fs, sc->fs | 0x03);
+ if (fixup_rpl(sc->fs) != cur)
+ loadsegment(fs, fixup_rpl(sc->fs));
+
savesegment(ds, cur);
- if ((sc->ds | 0x03) != cur)
- loadsegment(ds, sc->ds | 0x03);
+ if (fixup_rpl(sc->ds) != cur)
+ loadsegment(ds, fixup_rpl(sc->ds));
savesegment(es, cur);
- if ((sc->es | 0x03) != cur)
- loadsegment(es, sc->es | 0x03);
+ if (fixup_rpl(sc->es) != cur)
+ loadsegment(es, fixup_rpl(sc->es));
}
#define sigset32_t compat_sigset_t
@@ -105,18 +135,12 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs,
regs->orig_ax = -1;
#ifdef CONFIG_IA32_EMULATION
- /*
- * Reload fs and gs if they have changed in the signal
- * handler. This does not handle long fs/gs base changes in
- * the handler, but does not clobber them at least in the
- * normal case.
- */
reload_segments(&sc);
#else
- loadsegment(gs, sc.gs);
- regs->fs = sc.fs;
- regs->es = sc.es;
- regs->ds = sc.ds;
+ loadsegment(gs, fixup_rpl(sc.gs));
+ regs->fs = fixup_rpl(sc.fs);
+ regs->es = fixup_rpl(sc.es);
+ regs->ds = fixup_rpl(sc.ds);
#endif
return fpu__restore_sig(compat_ptr(sc.fpstate), 1);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c10850ae6f09..d6cf1e23c2a3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -190,7 +190,7 @@ static void ap_starting(void)
apic_ap_setup();
/* Save the processor parameters. */
- smp_store_cpu_info(cpuid);
+ identify_secondary_cpu(cpuid);
/*
* The topology information must be up to date before
@@ -215,7 +215,7 @@ static void ap_calibrate_delay(void)
{
/*
* Calibrate the delay loop and update loops_per_jiffy in cpu_data.
- * smp_store_cpu_info() stored a value that is close but not as
+ * identify_secondary_cpu() stored a value that is close but not as
* accurate as the value just calculated.
*
* As this is invoked after the TSC synchronization check,
@@ -229,7 +229,7 @@ static void ap_calibrate_delay(void)
/*
* Activate a secondary processor.
*/
-static void notrace start_secondary(void *unused)
+static void notrace __noendbr start_secondary(void *unused)
{
/*
* Don't put *anything* except direct CPU state initialization
@@ -314,26 +314,7 @@ static void notrace start_secondary(void *unused)
wmb();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
-
-/*
- * The bootstrap kernel entry code has set these up. Save them for
- * a given CPU
- */
-void smp_store_cpu_info(int id)
-{
- struct cpuinfo_x86 *c = &cpu_data(id);
-
- /* Copy boot_cpu_data only on the first bringup */
- if (!c->initialized)
- *c = boot_cpu_data;
- c->cpu_index = id;
- /*
- * During boot time, CPU0 has this setup already. Save the info when
- * bringing up an AP.
- */
- identify_secondary_cpu(c);
- c->initialized = true;
-}
+ANNOTATE_NOENDBR_SYM(start_secondary);
static bool
topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
@@ -654,10 +635,9 @@ static void impress_friends(void)
* But that slows boot and resume on modern processors, which include
* many cores and don't require that delay.
*
- * Cmdline "init_cpu_udelay=" is available to over-ride this delay.
- * Modern processor families are quirked to remove the delay entirely.
+ * Cmdline "cpu_init_udelay=" is available to override this delay.
*/
-#define UDELAY_10MS_DEFAULT 10000
+#define UDELAY_10MS_LEGACY 10000
static unsigned int init_udelay = UINT_MAX;
@@ -669,21 +649,21 @@ static int __init cpu_init_udelay(char *str)
}
early_param("cpu_init_udelay", cpu_init_udelay);
-static void __init smp_quirk_init_udelay(void)
+static void __init smp_set_init_udelay(void)
{
/* if cmdline changed it from default, leave it alone */
if (init_udelay != UINT_MAX)
return;
/* if modern processor, use no delay */
- if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
- ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) ||
- ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO) ||
+ (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 >= 0x18) ||
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xF)) {
init_udelay = 0;
return;
}
/* else, use legacy delay */
- init_udelay = UDELAY_10MS_DEFAULT;
+ init_udelay = UDELAY_10MS_LEGACY;
}
/*
@@ -841,7 +821,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
/* Just in case we booted with a single CPU. */
alternatives_enable_smp();
- per_cpu(pcpu_hot.current_task, cpu) = idle;
+ per_cpu(current_task, cpu) = idle;
cpu_init_stack_canary(cpu, idle);
/* Initialize the interrupt stack(s) */
@@ -851,7 +831,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
- per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle);
+ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
#endif
return 0;
}
@@ -1094,7 +1074,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
uv_system_init();
- smp_quirk_init_udelay();
+ smp_set_init_udelay();
speculative_store_bypass_ht_init();
@@ -1262,43 +1242,9 @@ void play_dead_common(void)
* We need to flush the caches before going to sleep, lest we have
* dirty data in our caches when we come back up.
*/
-static inline void mwait_play_dead(void)
+void __noreturn mwait_play_dead(unsigned int eax_hint)
{
struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
- unsigned int eax, ebx, ecx, edx;
- unsigned int highest_cstate = 0;
- unsigned int highest_subcstate = 0;
- int i;
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
- boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
- return;
- if (!this_cpu_has(X86_FEATURE_MWAIT))
- return;
- if (!this_cpu_has(X86_FEATURE_CLFLUSH))
- return;
-
- eax = CPUID_LEAF_MWAIT;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- /*
- * eax will be 0 if EDX enumeration is not valid.
- * Initialized below to cstate, sub_cstate value when EDX is valid.
- */
- if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
- eax = 0;
- } else {
- edx >>= MWAIT_SUBSTATE_SIZE;
- for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
- if (edx & MWAIT_SUBSTATE_MASK) {
- highest_cstate = i;
- highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
- }
- }
- eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
- (highest_subcstate - 1);
- }
/* Set up state for the kexec() hack below */
md->status = CPUDEAD_MWAIT_WAIT;
@@ -1319,7 +1265,7 @@ static inline void mwait_play_dead(void)
mb();
__monitor(md, 0, 0);
mb();
- __mwait(eax, 0);
+ __mwait(eax_hint, 0);
if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
/*
@@ -1391,9 +1337,9 @@ void native_play_dead(void)
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
- mwait_play_dead();
- if (cpuidle_play_dead())
- hlt_play_dead();
+ /* Below returns only on error. */
+ cpuidle_play_dead();
+ hlt_play_dead();
}
#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4c1bcb6053fc..46b8f1f16676 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -200,8 +200,7 @@ static int tboot_setup_sleep(void)
tboot->num_mac_regions = 0;
for (i = 0; i < e820_table->nr_entries; i++) {
- if ((e820_table->entries[i].type != E820_TYPE_RAM)
- && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN))
+ if (e820_table->entries[i].type != E820_TYPE_RAM)
continue;
add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 2dbadf347b5f..9f88b8a78e50 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -94,10 +94,20 @@ __always_inline int is_valid_bugaddr(unsigned long addr)
/*
* Check for UD1 or UD2, accounting for Address Size Override Prefixes.
- * If it's a UD1, get the ModRM byte to pass along to UBSan.
+ * If it's a UD1, further decode to determine its use:
+ *
+ * FineIBT: ea (bad)
+ * FineIBT: f0 75 f9 lock jne . - 6
+ * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax
+ * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax
+ * static_call: 0f b9 cc ud1 %esp,%ecx
+ *
+ * Notably UBSAN uses EAX, static_call uses ECX.
*/
-__always_inline int decode_bug(unsigned long addr, u32 *imm)
+__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
{
+ unsigned long start = addr;
+ bool lock = false;
u8 v;
if (addr < TASK_SIZE_MAX)
@@ -106,28 +116,67 @@ __always_inline int decode_bug(unsigned long addr, u32 *imm)
v = *(u8 *)(addr++);
if (v == INSN_ASOP)
v = *(u8 *)(addr++);
- if (v != OPCODE_ESCAPE)
+
+ if (v == INSN_LOCK) {
+ lock = true;
+ v = *(u8 *)(addr++);
+ }
+
+ switch (v) {
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ addr += 1; /* d8 */
+ *len = addr - start;
+ WARN_ON_ONCE(!lock);
+ return BUG_LOCK;
+
+ case 0xea:
+ *len = addr - start;
+ return BUG_EA;
+
+ case OPCODE_ESCAPE:
+ break;
+
+ default:
return BUG_NONE;
+ }
v = *(u8 *)(addr++);
- if (v == SECOND_BYTE_OPCODE_UD2)
+ if (v == SECOND_BYTE_OPCODE_UD2) {
+ *len = addr - start;
return BUG_UD2;
+ }
- if (!IS_ENABLED(CONFIG_UBSAN_TRAP) || v != SECOND_BYTE_OPCODE_UD1)
+ if (v != SECOND_BYTE_OPCODE_UD1)
return BUG_NONE;
- /* Retrieve the immediate (type value) for the UBSAN UD1 */
- v = *(u8 *)(addr++);
- if (X86_MODRM_RM(v) == 4)
- addr++;
-
*imm = 0;
- if (X86_MODRM_MOD(v) == 1)
- *imm = *(u8 *)addr;
- else if (X86_MODRM_MOD(v) == 2)
- *imm = *(u32 *)addr;
- else
- WARN_ONCE(1, "Unexpected MODRM_MOD: %u\n", X86_MODRM_MOD(v));
+ v = *(u8 *)(addr++); /* ModRM */
+
+ if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4)
+ addr++; /* SIB */
+
+ /* Decode immediate, if present */
+ switch (X86_MODRM_MOD(v)) {
+ case 0: if (X86_MODRM_RM(v) == 5)
+ addr += 4; /* RIP + disp32 */
+ break;
+
+ case 1: *imm = *(s8 *)addr;
+ addr += 1;
+ break;
+
+ case 2: *imm = *(s32 *)addr;
+ addr += 4;
+ break;
+
+ case 3: break;
+ }
+
+ /* record instruction length */
+ *len = addr - start;
+
+ if (X86_MODRM_REG(v) == 0) /* EAX */
+ return BUG_UD1_UBSAN;
return BUG_UD1;
}
@@ -257,11 +306,12 @@ static inline void handle_invalid_op(struct pt_regs *regs)
static noinstr bool handle_bug(struct pt_regs *regs)
{
+ unsigned long addr = regs->ip;
bool handled = false;
- int ud_type;
- u32 imm;
+ int ud_type, ud_len;
+ s32 ud_imm;
- ud_type = decode_bug(regs->ip, &imm);
+ ud_type = decode_bug(addr, &ud_imm, &ud_len);
if (ud_type == BUG_NONE)
return handled;
@@ -281,15 +331,47 @@ static noinstr bool handle_bug(struct pt_regs *regs)
*/
if (regs->flags & X86_EFLAGS_IF)
raw_local_irq_enable();
- if (ud_type == BUG_UD2) {
- if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
- handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
- regs->ip += LEN_UD2;
+
+ switch (ud_type) {
+ case BUG_UD2:
+ if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
+ handled = true;
+ break;
+ }
+ fallthrough;
+
+ case BUG_EA:
+ case BUG_LOCK:
+ if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
handled = true;
+ break;
+ }
+ break;
+
+ case BUG_UD1_UBSAN:
+ if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
+ pr_crit("%s at %pS\n",
+ report_ubsan_failure(regs, ud_imm),
+ (void *)regs->ip);
}
- } else if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
- pr_crit("%s at %pS\n", report_ubsan_failure(regs, imm), (void *)regs->ip);
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * When continuing, and regs->ip hasn't changed, move it to the next
+ * instruction. When not continuing execution, restore the instruction
+ * pointer.
+ */
+ if (handled) {
+ if (regs->ip == addr)
+ regs->ip += ud_len;
+ } else {
+ regs->ip = addr;
}
+
if (regs->flags & X86_EFLAGS_IF)
raw_local_irq_disable();
instrumentation_end();
@@ -380,6 +462,21 @@ __visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
#endif
/*
+ * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64
+ * version of exc_double_fault() as noreturn. Otherwise the noreturn mismatch
+ * between configs triggers objtool warnings.
+ *
+ * This is a temporary hack until we have compiler or plugin support for
+ * annotating noreturns.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+#define always_true() true
+#else
+bool always_true(void);
+bool __weak always_true(void) { return true; }
+#endif
+
+/*
* Runs on an IST stack for x86_64 and on a special task stack for x86_32.
*
* On x86_64, this is more or less a normal kernel entry. Notwithstanding the
@@ -514,7 +611,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
die("double fault", regs, error_code);
- panic("Machine halted.");
+ if (always_true())
+ panic("Machine halted.");
instrumentation_end();
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 34dec0b72ea8..88e5a4ed9db3 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -959,7 +959,7 @@ static unsigned long long cyc2ns_suspend;
void tsc_save_sched_clock_state(void)
{
- if (!sched_clock_stable())
+ if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
return;
cyc2ns_suspend = sched_clock();
@@ -979,7 +979,7 @@ void tsc_restore_sched_clock_state(void)
unsigned long flags;
int cpu;
- if (!sched_clock_stable())
+ if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
return;
local_irq_save(flags);
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index deeb02825670..48e6cc1cb017 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -152,7 +152,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt),
X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng),
X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht),
- X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &freq_desc_ann),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &freq_desc_ann),
X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm),
{}
};
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 5a952c5ea66b..9194695662b2 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -357,19 +357,23 @@ void *arch_uprobe_trampoline(unsigned long *psize)
return &insn;
}
-static unsigned long trampoline_check_ip(void)
+static unsigned long trampoline_check_ip(unsigned long tramp)
{
- unsigned long tramp = uprobe_get_trampoline_vaddr();
-
return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry);
}
SYSCALL_DEFINE0(uretprobe)
{
struct pt_regs *regs = task_pt_regs(current);
- unsigned long err, ip, sp, r11_cx_ax[3];
+ unsigned long err, ip, sp, r11_cx_ax[3], tramp;
+
+ /* If there's no trampoline, we are called from wrong place. */
+ tramp = uprobe_get_trampoline_vaddr();
+ if (unlikely(tramp == UPROBE_NO_TRAMPOLINE_VADDR))
+ goto sigill;
- if (regs->ip != trampoline_check_ip())
+ /* Make sure the ip matches the only allowed sys_uretprobe caller. */
+ if (unlikely(regs->ip != trampoline_check_ip(tramp)))
goto sigill;
err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax));
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 1258a5872d12..37ad43792452 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -29,8 +29,12 @@
*/
#include <asm/cpufeatures.h>
+#include <asm/cpufeaturemasks.h>
#include <asm/msr-index.h>
+#define SSE_MASK \
+ (REQUIRED_MASK0 & ((1<<(X86_FEATURE_XMM & 31)) | (1<<(X86_FEATURE_XMM2 & 31))))
+
SYM_FUNC_START_LOCAL(verify_cpu)
pushf # Save caller passed flags
push $0 # Kill any dangerous flags
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0deb4887d6e9..ccdc45e5b759 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -43,7 +43,8 @@ ENTRY(phys_startup_64)
#endif
jiffies = jiffies_64;
-const_pcpu_hot = pcpu_hot;
+const_current_task = current_task;
+const_cpu_current_top_of_stack = cpu_current_top_of_stack;
#if defined(CONFIG_X86_64)
/*
@@ -112,12 +113,6 @@ ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(6); /* RW_ */
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_SMP
- percpu PT_LOAD FLAGS(6); /* RW_ */
-#endif
- init PT_LOAD FLAGS(7); /* RWE */
-#endif
note PT_NOTE FLAGS(0); /* ___ */
}
@@ -193,6 +188,8 @@ SECTIONS
PAGE_ALIGNED_DATA(PAGE_SIZE)
+ CACHE_HOT_DATA(L1_CACHE_BYTES)
+
CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
DATA_DATA
@@ -216,21 +213,7 @@ SECTIONS
__init_begin = .; /* paired with __init_end */
}
-#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
- /*
- * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
- * output PHDR, so the next output section - .init.text - should
- * start another segment - init.
- */
- PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
- ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START,
- "per-CPU data too large - increase CONFIG_PHYSICAL_START")
-#endif
-
INIT_TEXT_SECTION(PAGE_SIZE)
-#ifdef CONFIG_X86_64
- :init
-#endif
/*
* Section for code used exclusively before alternatives are run. All
@@ -347,9 +330,8 @@ SECTIONS
EXIT_DATA
}
-#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
- PERCPU_SECTION(INTERNODE_CACHE_BYTES)
-#endif
+ PERCPU_SECTION(L1_CACHE_BYTES)
+ ASSERT(__per_cpu_hot_end - __per_cpu_hot_start <= 64, "percpu cache hot data too large")
RUNTIME_CONST_VARIABLES
RUNTIME_CONST(ptr, USER_PTR_MAX)
@@ -493,19 +475,6 @@ SECTIONS
PROVIDE(__ref_stack_chk_guard = __stack_chk_guard);
#ifdef CONFIG_X86_64
-/*
- * Per-cpu symbols which need to be offset from __per_cpu_load
- * for the boot processor.
- */
-#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load
-INIT_PER_CPU(gdt_page);
-INIT_PER_CPU(fixed_percpu_data);
-INIT_PER_CPU(irq_stack_backing_store);
-
-#ifdef CONFIG_SMP
-. = ASSERT((fixed_percpu_data == 0),
- "fixed_percpu_data is not at start of per-cpu area");
-#endif
#ifdef CONFIG_MITIGATION_UNRET_ENTRY
. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ea2c4f21c1ca..fe8ea8c097de 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -22,6 +22,7 @@ config KVM_X86
select KVM_COMMON
select KVM_GENERIC_MMU_NOTIFIER
select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
+ select KVM_MMU_LOCKLESS_AGING
select HAVE_KVM_IRQCHIP
select HAVE_KVM_PFNCACHE
select HAVE_KVM_DIRTY_RING_TSO
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 121edf1f2a79..5e4d4934c0d3 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -58,25 +58,24 @@ void __init kvm_init_xstate_sizes(void)
u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
- int feature_bit = 0;
u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+ int i;
xstate_bv &= XFEATURE_MASK_EXTEND;
- while (xstate_bv) {
- if (xstate_bv & 0x1) {
- struct cpuid_xstate_sizes *xs = &xstate_sizes[feature_bit];
- u32 offset;
-
- /* ECX[1]: 64B alignment in compacted form */
- if (compacted)
- offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret;
- else
- offset = xs->ebx;
- ret = max(ret, offset + xs->eax);
- }
+ for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes) && xstate_bv; i++) {
+ struct cpuid_xstate_sizes *xs = &xstate_sizes[i];
+ u32 offset;
- xstate_bv >>= 1;
- feature_bit++;
+ if (!(xstate_bv & BIT_ULL(i)))
+ continue;
+
+ /* ECX[1]: 64B alignment in compacted form */
+ if (compacted)
+ offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret;
+ else
+ offset = xs->ebx;
+ ret = max(ret, offset + xs->eax);
+ xstate_bv &= ~BIT_ULL(i);
}
return ret;
@@ -196,6 +195,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu)
}
static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu);
+static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
@@ -300,10 +300,12 @@ static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu,
guest_cpu_cap_change(vcpu, x86_feature, has_feature);
}
-void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
+ vcpu->arch.cpuid_dynamic_bits_dirty = false;
+
best = kvm_find_cpuid_entry(vcpu, 1);
if (best) {
kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSXSAVE,
@@ -333,7 +335,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
}
-EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu)
{
@@ -646,6 +647,9 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
if (cpuid->nent < vcpu->arch.cpuid_nent)
return -E2BIG;
+ if (vcpu->arch.cpuid_dynamic_bits_dirty)
+ kvm_update_cpuid_runtime(vcpu);
+
if (copy_to_user(entries, vcpu->arch.cpuid_entries,
vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
return -EFAULT;
@@ -1704,7 +1708,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
phys_as = entry->eax & 0xff;
g_phys_as = phys_as;
if (kvm_mmu_get_max_tdp_level() < 5)
- g_phys_as = min(g_phys_as, 48);
+ g_phys_as = min(g_phys_as, 48U);
}
entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16);
@@ -1769,13 +1773,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
cpuid_entry_override(entry, CPUID_8000_0022_EAX);
- if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
- ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp;
- else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
- ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE;
- else
- ebx.split.num_core_pmc = AMD64_NUM_COUNTERS;
-
+ ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp;
entry->ebx = ebx.full;
break;
}
@@ -1985,6 +1983,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
struct kvm_cpuid_entry2 *entry;
bool exact, used_max_basic = false;
+ if (vcpu->arch.cpuid_dynamic_bits_dirty)
+ kvm_update_cpuid_runtime(vcpu);
+
entry = kvm_find_cpuid_entry_index(vcpu, function, index);
exact = !!entry;
@@ -2000,12 +2001,29 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
*edx = entry->edx;
if (function == 7 && index == 0) {
u64 data;
- if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
+ if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) &&
+ !__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
(data & TSX_CTRL_CPUID_CLEAR))
*ebx &= ~(feature_bit(RTM) | feature_bit(HLE));
} else if (function == 0x80000007) {
if (kvm_hv_invtsc_suppressed(vcpu))
*edx &= ~feature_bit(CONSTANT_TSC);
+ } else if (IS_ENABLED(CONFIG_KVM_XEN) &&
+ kvm_xen_is_tsc_leaf(vcpu, function)) {
+ /*
+ * Update guest TSC frequency information if necessary.
+ * Ignore failures, there is no sane value that can be
+ * provided if KVM can't get the TSC frequency.
+ */
+ if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu))
+ kvm_guest_time_update(vcpu);
+
+ if (index == 1) {
+ *ecx = vcpu->arch.pvclock_tsc_mul;
+ *edx = vcpu->arch.pvclock_tsc_shift;
+ } else if (index == 2) {
+ *eax = vcpu->arch.hw_tsc_khz;
+ }
}
} else {
*eax = *ebx = *ecx = *edx = 0;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 67d80aa72d50..d2884162a46a 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -11,7 +11,6 @@ extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
void kvm_set_cpu_caps(void);
void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
-void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
u32 function, u32 index);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
@@ -232,6 +231,14 @@ static __always_inline bool guest_cpu_cap_has(struct kvm_vcpu *vcpu,
{
unsigned int x86_leaf = __feature_leaf(x86_feature);
+ /*
+ * Except for MWAIT, querying dynamic feature bits is disallowed, so
+ * that KVM can defer runtime updates until the next CPUID emulation.
+ */
+ BUILD_BUG_ON(x86_feature == X86_FEATURE_APIC ||
+ x86_feature == X86_FEATURE_OSXSAVE ||
+ x86_feature == X86_FEATURE_OSPKE);
+
return vcpu->arch.cpu_caps[x86_leaf] & __feature_bit(x86_feature);
}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 60986f67c35a..1349e278cd2a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -477,8 +477,11 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
.dst_val = ctxt->dst.val64,
.src_bytes = ctxt->src.bytes,
.dst_bytes = ctxt->dst.bytes,
+ .src_type = ctxt->src.type,
+ .dst_type = ctxt->dst.type,
.ad_bytes = ctxt->ad_bytes,
- .next_rip = ctxt->eip,
+ .rip = ctxt->eip,
+ .next_rip = ctxt->_eip,
};
return ctxt->ops->intercept(ctxt, &info, stage);
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 6ebeb6cea6c0..24f0318c50d7 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -952,8 +952,7 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
{
memset(stimer, 0, sizeof(*stimer));
stimer->index = timer_index;
- hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- stimer->timer.function = stimer_timer_callback;
+ hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
stimer_prepare_msg(stimer);
}
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index d7ab8780ab9e..739aa6c0d0c3 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -690,8 +690,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
pit->kvm = kvm;
pit_state = &pit->pit_state;
- hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- pit_state->timer.function = pit_timer_fn;
+ hrtimer_setup(&pit_state->timer, pit_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 8dec646e764b..a8fb19940975 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -567,7 +567,7 @@ static void pic_irq_request(struct kvm *kvm, int level)
{
struct kvm_pic *s = kvm->arch.vpic;
- if (!s->output)
+ if (!s->output && level)
s->wakeup_needed = true;
s->output = level;
}
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 73072585e164..c1df5acfacaf 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -44,7 +44,10 @@ struct x86_instruction_info {
u64 dst_val; /* value of destination operand */
u8 src_bytes; /* size of source operand */
u8 dst_bytes; /* size of destination operand */
+ u8 src_type; /* type of source operand */
+ u8 dst_type; /* type of destination operand */
u8 ad_bytes; /* size of src/dst address */
+ u64 rip; /* rip of the instruction */
u64 next_rip; /* rip following the instruction */
};
@@ -272,8 +275,10 @@ struct operand {
};
};
+#define X86_MAX_INSTRUCTION_LENGTH 15
+
struct fetch_cache {
- u8 data[15];
+ u8 data[X86_MAX_INSTRUCTION_LENGTH];
u8 *ptr;
u8 *end;
};
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a009c94c26c2..28e3317124fd 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -221,13 +221,6 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
}
}
-static void kvm_apic_map_free(struct rcu_head *rcu)
-{
- struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu);
-
- kvfree(map);
-}
-
static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
struct kvm_vcpu *vcpu,
bool *xapic_id_mismatch)
@@ -489,7 +482,7 @@ out:
mutex_unlock(&kvm->arch.apic_map_lock);
if (old)
- call_rcu(&old->rcu, kvm_apic_map_free);
+ kvfree_rcu(old, rcu);
kvm_make_scan_ioapic_request(kvm);
}
@@ -2593,7 +2586,7 @@ static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value)
vcpu->arch.apic_base = value;
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
if (!apic)
return;
@@ -2921,9 +2914,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
- hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS_HARD);
- apic->lapic_timer.timer.function = apic_timer_fn;
+ hrtimer_setup(&apic->lapic_timer.timer, apic_timer_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
if (lapic_timer_advance)
apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
@@ -3397,9 +3389,9 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
kvm_vcpu_reset(vcpu, true);
if (kvm_vcpu_is_bsp(apic->vcpu))
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
else
- vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
}
if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) {
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
@@ -3408,7 +3400,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
sipi_vector = apic->sipi_vector;
kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu,
sipi_vector);
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
}
}
return 0;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8160870398b9..63bb77ee1bb1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -501,7 +501,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
return false;
}
- if (!spte_has_volatile_bits(old_spte))
+ if (!spte_needs_atomic_update(old_spte))
__update_clear_spte_fast(sptep, new_spte);
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
@@ -524,7 +524,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
int level = sptep_to_sp(sptep)->role.level;
if (!is_shadow_present_pte(old_spte) ||
- !spte_has_volatile_bits(old_spte))
+ !spte_needs_atomic_update(old_spte))
__update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
else
old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
@@ -853,32 +853,173 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
* About rmap_head encoding:
*
* If the bit zero of rmap_head->val is clear, then it points to the only spte
- * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
+ * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct
* pte_list_desc containing more mappings.
*/
#define KVM_RMAP_MANY BIT(0)
/*
+ * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always
+ * operates with mmu_lock held for write), but rmaps can be walked without
+ * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain
+ * being zapped/dropped _while the rmap is locked_.
+ *
+ * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be
+ * done while holding mmu_lock for write. This allows a task walking rmaps
+ * without holding mmu_lock to concurrently walk the same entries as a task
+ * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify
+ * the rmaps, thus the walks are stable.
+ *
+ * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED,
+ * only the rmap chains themselves are protected. E.g. holding an rmap's lock
+ * ensures all "struct pte_list_desc" fields are stable.
+ */
+#define KVM_RMAP_LOCKED BIT(1)
+
+static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
+{
+ unsigned long old_val, new_val;
+
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * Elide the lock if the rmap is empty, as lockless walkers (read-only
+ * mode) don't need to (and can't) walk an empty rmap, nor can they add
+ * entries to the rmap. I.e. the only paths that process empty rmaps
+ * do so while holding mmu_lock for write, and are mutually exclusive.
+ */
+ old_val = atomic_long_read(&rmap_head->val);
+ if (!old_val)
+ return 0;
+
+ do {
+ /*
+ * If the rmap is locked, wait for it to be unlocked before
+ * trying acquire the lock, e.g. to avoid bouncing the cache
+ * line.
+ */
+ while (old_val & KVM_RMAP_LOCKED) {
+ cpu_relax();
+ old_val = atomic_long_read(&rmap_head->val);
+ }
+
+ /*
+ * Recheck for an empty rmap, it may have been purged by the
+ * task that held the lock.
+ */
+ if (!old_val)
+ return 0;
+
+ new_val = old_val | KVM_RMAP_LOCKED;
+ /*
+ * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
+ * from being reordered outside of the critical section created by
+ * __kvm_rmap_lock().
+ *
+ * Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
+ *
+ * For the !old_val case, no ordering is needed, as there is no rmap
+ * to walk.
+ */
+ } while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
+
+ /*
+ * Return the old value, i.e. _without_ the LOCKED bit set. It's
+ * impossible for the return value to be 0 (see above), i.e. the read-
+ * only unlock flow can't get a false positive and fail to unlock.
+ */
+ return old_val;
+}
+
+static unsigned long kvm_rmap_lock(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return __kvm_rmap_lock(rmap_head);
+}
+
+static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
+ unsigned long val)
+{
+ KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED);
+ /*
+ * Ensure that all accesses to the rmap have completed before unlocking
+ * the rmap.
+ *
+ * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
+ */
+ atomic_long_set_release(&rmap_head->val, val);
+}
+
+static void kvm_rmap_unlock(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ unsigned long new_val)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ __kvm_rmap_unlock(rmap_head, new_val);
+}
+
+static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
+{
+ return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
+}
+
+/*
+ * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The
+ * actual locking is the same, but the caller is disallowed from modifying the
+ * rmap, and so the unlock flow is a nop if the rmap is/was empty.
+ */
+static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
+{
+ unsigned long rmap_val;
+
+ preempt_disable();
+ rmap_val = __kvm_rmap_lock(rmap_head);
+
+ if (!rmap_val)
+ preempt_enable();
+
+ return rmap_val;
+}
+
+static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
+ unsigned long old_val)
+{
+ if (!old_val)
+ return;
+
+ KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));
+
+ __kvm_rmap_unlock(rmap_head, old_val);
+ preempt_enable();
+}
+
+/*
* Returns the number of pointers in the rmap chain, not counting the new one.
*/
-static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
- struct kvm_rmap_head *rmap_head)
+static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ u64 *spte, struct kvm_rmap_head *rmap_head)
{
+ unsigned long old_val, new_val;
struct pte_list_desc *desc;
int count = 0;
- if (!rmap_head->val) {
- rmap_head->val = (unsigned long)spte;
- } else if (!(rmap_head->val & KVM_RMAP_MANY)) {
+ old_val = kvm_rmap_lock(kvm, rmap_head);
+
+ if (!old_val) {
+ new_val = (unsigned long)spte;
+ } else if (!(old_val & KVM_RMAP_MANY)) {
desc = kvm_mmu_memory_cache_alloc(cache);
- desc->sptes[0] = (u64 *)rmap_head->val;
+ desc->sptes[0] = (u64 *)old_val;
desc->sptes[1] = spte;
desc->spte_count = 2;
desc->tail_count = 0;
- rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
+ new_val = (unsigned long)desc | KVM_RMAP_MANY;
++count;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
count = desc->tail_count + desc->spte_count;
/*
@@ -887,21 +1028,25 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
*/
if (desc->spte_count == PTE_LIST_EXT) {
desc = kvm_mmu_memory_cache_alloc(cache);
- desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
desc->spte_count = 0;
desc->tail_count = count;
- rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
+ new_val = (unsigned long)desc | KVM_RMAP_MANY;
+ } else {
+ new_val = old_val;
}
desc->sptes[desc->spte_count++] = spte;
}
+
+ kvm_rmap_unlock(kvm, rmap_head, new_val);
+
return count;
}
-static void pte_list_desc_remove_entry(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
+static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val,
struct pte_list_desc *desc, int i)
{
- struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY);
int j = head_desc->spte_count - 1;
/*
@@ -928,9 +1073,9 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
* head at the next descriptor, i.e. the new head.
*/
if (!head_desc->more)
- rmap_head->val = 0;
+ *rmap_val = 0;
else
- rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
+ *rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
mmu_free_pte_list_desc(head_desc);
}
@@ -938,24 +1083,26 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
+ unsigned long rmap_val;
int i;
- if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
- return;
+ rmap_val = kvm_rmap_lock(kvm, rmap_head);
+ if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
+ goto out;
- if (!(rmap_head->val & KVM_RMAP_MANY)) {
- if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
- return;
+ if (!(rmap_val & KVM_RMAP_MANY)) {
+ if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm))
+ goto out;
- rmap_head->val = 0;
+ rmap_val = 0;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
- pte_list_desc_remove_entry(kvm, rmap_head,
+ pte_list_desc_remove_entry(kvm, &rmap_val,
desc, i);
- return;
+ goto out;
}
}
desc = desc->more;
@@ -963,6 +1110,9 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
}
+
+out:
+ kvm_rmap_unlock(kvm, rmap_head, rmap_val);
}
static void kvm_zap_one_rmap_spte(struct kvm *kvm,
@@ -977,17 +1127,19 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc, *next;
+ unsigned long rmap_val;
int i;
- if (!rmap_head->val)
+ rmap_val = kvm_rmap_lock(kvm, rmap_head);
+ if (!rmap_val)
return false;
- if (!(rmap_head->val & KVM_RMAP_MANY)) {
- mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
+ if (!(rmap_val & KVM_RMAP_MANY)) {
+ mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val);
goto out;
}
- desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
for (; desc; desc = next) {
for (i = 0; i < desc->spte_count; i++)
@@ -997,20 +1149,21 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
}
out:
/* rmap_head is meaningless now, remember to reset it */
- rmap_head->val = 0;
+ kvm_rmap_unlock(kvm, rmap_head, 0);
return true;
}
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
+ unsigned long rmap_val = kvm_rmap_get(rmap_head);
struct pte_list_desc *desc;
- if (!rmap_head->val)
+ if (!rmap_val)
return 0;
- else if (!(rmap_head->val & KVM_RMAP_MANY))
+ else if (!(rmap_val & KVM_RMAP_MANY))
return 1;
- desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
return desc->tail_count + desc->spte_count;
}
@@ -1053,6 +1206,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
*/
struct rmap_iterator {
/* private fields */
+ struct rmap_head *head;
struct pte_list_desc *desc; /* holds the sptep if not NULL */
int pos; /* index of the sptep */
};
@@ -1067,23 +1221,19 @@ struct rmap_iterator {
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
struct rmap_iterator *iter)
{
- u64 *sptep;
+ unsigned long rmap_val = kvm_rmap_get(rmap_head);
- if (!rmap_head->val)
+ if (!rmap_val)
return NULL;
- if (!(rmap_head->val & KVM_RMAP_MANY)) {
+ if (!(rmap_val & KVM_RMAP_MANY)) {
iter->desc = NULL;
- sptep = (u64 *)rmap_head->val;
- goto out;
+ return (u64 *)rmap_val;
}
- iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
iter->pos = 0;
- sptep = iter->desc->sptes[iter->pos];
-out:
- BUG_ON(!is_shadow_present_pte(*sptep));
- return sptep;
+ return iter->desc->sptes[iter->pos];
}
/*
@@ -1093,14 +1243,11 @@ out:
*/
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
- u64 *sptep;
-
if (iter->desc) {
if (iter->pos < PTE_LIST_EXT - 1) {
++iter->pos;
- sptep = iter->desc->sptes[iter->pos];
- if (sptep)
- goto out;
+ if (iter->desc->sptes[iter->pos])
+ return iter->desc->sptes[iter->pos];
}
iter->desc = iter->desc->more;
@@ -1108,20 +1255,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
if (iter->desc) {
iter->pos = 0;
/* desc->sptes[0] cannot be NULL */
- sptep = iter->desc->sptes[iter->pos];
- goto out;
+ return iter->desc->sptes[iter->pos];
}
}
return NULL;
-out:
- BUG_ON(!is_shadow_present_pte(*sptep));
- return sptep;
}
-#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
- for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
- _spte_; _spte_ = rmap_get_next(_iter_))
+#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \
+ _sptep_; _sptep_ = rmap_get_next(_iter_))
+
+#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \
+
+#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \
+ __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(sptep)))
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
@@ -1207,12 +1358,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct rmap_iterator iter;
bool flush = false;
- for_each_rmap_spte(rmap_head, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
if (spte_ad_need_write_protect(*sptep))
flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
(unsigned long *)sptep);
else
flush |= spte_clear_dirty(sptep);
+ }
return flush;
}
@@ -1401,7 +1553,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
while (++iterator->rmap <= iterator->end_rmap) {
iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
- if (iterator->rmap->val)
+ if (atomic_long_read(&iterator->rmap->val))
return;
}
@@ -1533,7 +1685,7 @@ static void __rmap_add(struct kvm *kvm,
kvm_update_page_stats(kvm, sp->role.level, 1);
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
- rmap_count = pte_list_add(cache, spte, rmap_head);
+ rmap_count = pte_list_add(kvm, cache, spte, rmap_head);
if (rmap_count > kvm->stat.max_mmu_rmap_size)
kvm->stat.max_mmu_rmap_size = rmap_count;
@@ -1552,51 +1704,67 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
}
static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
- struct kvm_gfn_range *range, bool test_only)
+ struct kvm_gfn_range *range,
+ bool test_only)
{
- struct slot_rmap_walk_iterator iterator;
+ struct kvm_rmap_head *rmap_head;
struct rmap_iterator iter;
+ unsigned long rmap_val;
bool young = false;
u64 *sptep;
+ gfn_t gfn;
+ int level;
+ u64 spte;
- for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- range->start, range->end - 1, &iterator) {
- for_each_rmap_spte(iterator.rmap, &iter, sptep) {
- u64 spte = *sptep;
+ for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ for (gfn = range->start; gfn < range->end;
+ gfn += KVM_PAGES_PER_HPAGE(level)) {
+ rmap_head = gfn_to_rmap(gfn, level, range->slot);
+ rmap_val = kvm_rmap_lock_readonly(rmap_head);
- if (!is_accessed_spte(spte))
- continue;
+ for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
+ if (!is_accessed_spte(spte))
+ continue;
- if (test_only)
- return true;
-
- if (spte_ad_enabled(spte)) {
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)sptep);
- } else {
- /*
- * WARN if mmu_spte_update() signals the need
- * for a TLB flush, as Access tracking a SPTE
- * should never trigger an _immediate_ flush.
- */
- spte = mark_spte_for_access_track(spte);
- WARN_ON_ONCE(mmu_spte_update(sptep, spte));
+ if (test_only) {
+ kvm_rmap_unlock_readonly(rmap_head, rmap_val);
+ return true;
+ }
+
+ if (spte_ad_enabled(spte))
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ else
+ /*
+ * If the following cmpxchg fails, the
+ * spte is being concurrently modified
+ * and should most likely stay young.
+ */
+ cmpxchg64(sptep, spte,
+ mark_spte_for_access_track(spte));
+ young = true;
}
- young = true;
+
+ kvm_rmap_unlock_readonly(rmap_head, rmap_val);
}
}
return young;
}
+static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm)
+{
+ return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages);
+}
+
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
- if (kvm_memslots_have_rmaps(kvm))
- young = kvm_rmap_age_gfn_range(kvm, range, false);
-
if (tdp_mmu_enabled)
- young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
+ young = kvm_tdp_mmu_age_gfn_range(kvm, range);
+
+ if (kvm_may_have_shadow_mmu_sptes(kvm))
+ young |= kvm_rmap_age_gfn_range(kvm, range, false);
return young;
}
@@ -1605,11 +1773,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
- if (kvm_memslots_have_rmaps(kvm))
- young = kvm_rmap_age_gfn_range(kvm, range, true);
-
if (tdp_mmu_enabled)
- young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
+ young = kvm_tdp_mmu_test_age_gfn(kvm, range);
+
+ if (young)
+ return young;
+
+ if (kvm_may_have_shadow_mmu_sptes(kvm))
+ young |= kvm_rmap_age_gfn_range(kvm, range, true);
return young;
}
@@ -1656,13 +1827,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}
-static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
+static void mmu_page_add_parent_pte(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache,
struct kvm_mmu_page *sp, u64 *parent_pte)
{
if (!parent_pte)
return;
- pte_list_add(cache, parent_pte, &sp->parent_ptes);
+ pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes);
}
static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -2352,7 +2524,7 @@ static void __link_shadow_page(struct kvm *kvm,
mmu_spte_set(sptep, spte);
- mmu_page_add_parent_pte(cache, sp, sptep);
+ mmu_page_add_parent_pte(kvm, cache, sp, sptep);
/*
* The non-direct sub-pagetable must be updated before linking. For
@@ -2416,7 +2588,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
* avoids retaining a large number of stale nested SPs.
*/
if (tdp_enabled && invalid_list &&
- child->role.guest_mode && !child->parent_ptes.val)
+ child->role.guest_mode &&
+ !atomic_long_read(&child->parent_ptes.val))
return kvm_mmu_prepare_zap_page(kvm, child,
invalid_list);
}
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index f4711674c47b..68e323568e95 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -510,8 +510,7 @@ error:
* Note, pte_access holds the raw RWX bits from the EPTE, not
* ACC_*_MASK flags!
*/
- walker->fault.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
- EPT_VIOLATION_RWX_SHIFT;
+ walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
}
#endif
walker->fault.address = addr;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 22551e2f1d00..0f9f47b4ab0e 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -129,25 +129,32 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
}
/*
- * Returns true if the SPTE has bits that may be set without holding mmu_lock.
- * The caller is responsible for checking if the SPTE is shadow-present, and
- * for determining whether or not the caller cares about non-leaf SPTEs.
+ * Returns true if the SPTE needs to be updated atomically due to having bits
+ * that may be changed without holding mmu_lock, and for which KVM must not
+ * lose information. E.g. KVM must not drop Dirty bit information. The caller
+ * is responsible for checking if the SPTE is shadow-present, and for
+ * determining whether or not the caller cares about non-leaf SPTEs.
*/
-bool spte_has_volatile_bits(u64 spte)
+bool spte_needs_atomic_update(u64 spte)
{
+ /* SPTEs can be made Writable bit by KVM's fast page fault handler. */
if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
return true;
- if (is_access_track_spte(spte))
+ /*
+ * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked
+ * SPTEs can be restored by KVM's fast page fault handler.
+ */
+ if (!spte_ad_enabled(spte))
return true;
- if (spte_ad_enabled(spte)) {
- if (!(spte & shadow_accessed_mask) ||
- (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
- return true;
- }
-
- return false;
+ /*
+ * Dirty and Accessed bits can be set by the CPU. Ignore the Accessed
+ * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't
+ * invalidate TLBs when aging SPTEs, and so it's safe to clobber the
+ * Accessed bit (and rare in practice).
+ */
+ return is_writable_pte(spte) && !(spte & shadow_dirty_mask);
}
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 59746854c0af..79cdceba9857 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -519,7 +519,7 @@ static inline u64 get_mmio_spte_generation(u64 spte)
return gen;
}
-bool spte_has_volatile_bits(u64 spte);
+bool spte_needs_atomic_update(u64 spte);
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 047b78333653..364c5da6c499 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -25,6 +25,13 @@ static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
return xchg(rcu_dereference(sptep), new_spte);
}
+static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask)
+{
+ atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
+
+ return (u64)atomic64_fetch_and(~mask, sptep_atomic);
+}
+
static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
{
KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
@@ -32,28 +39,21 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
}
/*
- * SPTEs must be modified atomically if they are shadow-present, leaf
- * SPTEs, and have volatile bits, i.e. has bits that can be set outside
- * of mmu_lock. The Writable bit can be set by KVM's fast page fault
- * handler, and Accessed and Dirty bits can be set by the CPU.
- *
- * Note, non-leaf SPTEs do have Accessed bits and those bits are
- * technically volatile, but KVM doesn't consume the Accessed bit of
- * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
- * logic needs to be reassessed if KVM were to use non-leaf Accessed
- * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+ * SPTEs must be modified atomically if they are shadow-present, leaf SPTEs,
+ * and have volatile bits (bits that can be set outside of mmu_lock) that
+ * must not be clobbered.
*/
-static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
+static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level)
{
return is_shadow_present_pte(old_spte) &&
is_last_spte(old_spte, level) &&
- spte_has_volatile_bits(old_spte);
+ spte_needs_atomic_update(old_spte);
}
static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
u64 new_spte, int level)
{
- if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
+ if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
__kvm_tdp_mmu_write_spte(sptep, new_spte);
@@ -63,12 +63,8 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
u64 mask, int level)
{
- atomic64_t *sptep_atomic;
-
- if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
- sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
- return (u64)atomic64_fetch_and(~mask, sptep_atomic);
- }
+ if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
+ return tdp_mmu_clear_spte_bits_atomic(sptep, mask);
__kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
return old_spte;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 046b6ba31197..7cc0564f5f97 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -193,6 +193,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
!tdp_mmu_root_match((_root), (_types)))) { \
} else
+/*
+ * Iterate over all TDP MMU roots in an RCU read-side critical section.
+ * It is safe to iterate over the SPTEs under the root, but their values will
+ * be unstable, so all writes must be atomic. As this routine is meant to be
+ * used without holding the mmu_lock at all, any bits that are flipped must
+ * be reflected in kvm_tdp_mmu_spte_need_atomic_write().
+ */
+#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \
+ list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
+ !tdp_mmu_root_match((_root), (_types))) { \
+ } else
+
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
@@ -774,9 +787,6 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
continue; \
else
-#define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end) \
- for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
-
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
struct tdp_iter *iter)
{
@@ -1235,7 +1245,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
rcu_read_lock();
- tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
+ for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
int r;
if (fault->nx_huge_page_workaround_enabled)
@@ -1332,21 +1342,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
* from the clear_young() or clear_flush_young() notifier, which uses the
* return value to determine if the page has been accessed.
*/
-static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
+static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
{
u64 new_spte;
if (spte_ad_enabled(iter->old_spte)) {
- iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
- iter->old_spte,
- shadow_accessed_mask,
- iter->level);
+ iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
+ shadow_accessed_mask);
new_spte = iter->old_spte & ~shadow_accessed_mask;
} else {
new_spte = mark_spte_for_access_track(iter->old_spte);
- iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
- iter->old_spte, new_spte,
- iter->level);
+ /*
+ * It is safe for the following cmpxchg to fail. Leave the
+ * Accessed bit set, as the spte is most likely young anyway.
+ */
+ if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
+ return;
}
trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
@@ -1371,9 +1382,9 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
* valid roots!
*/
WARN_ON(types & ~KVM_VALID_ROOTS);
- __for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
- guard(rcu)();
+ guard(rcu)();
+ for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
if (!is_accessed_spte(iter.old_spte))
continue;
@@ -1382,7 +1393,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
return true;
ret = true;
- kvm_tdp_mmu_age_spte(&iter);
+ kvm_tdp_mmu_age_spte(kvm, &iter);
}
}
@@ -1904,7 +1915,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*root_level = vcpu->arch.mmu->root_role.level;
- tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
+ for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
@@ -1931,7 +1942,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
struct tdp_iter iter;
tdp_ptep_t sptep = NULL;
- tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
+ for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
*spte = iter.old_spte;
sptep = iter.sptep;
}
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index e0ab7df27b66..699e551ec93b 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -358,7 +358,7 @@ void enter_smm(struct kvm_vcpu *vcpu)
goto error;
#endif
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
kvm_mmu_reset_context(vcpu);
return;
error:
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 04c375bf1ac2..834b67672d50 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -994,7 +994,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
/* in case we halted in L2 */
- svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
/* Give the current vmcb to the guest */
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 661108d65ee7..0bc708ee2788 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -140,7 +140,7 @@ static inline bool is_mirroring_enc_context(struct kvm *kvm)
static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
}
@@ -226,9 +226,7 @@ e_uncharge:
static unsigned int sev_get_asid(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
- return sev->asid;
+ return to_kvm_sev_info(kvm)->asid;
}
static void sev_asid_free(struct kvm_sev_info *sev)
@@ -403,7 +401,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
struct kvm_sev_init *data,
unsigned long vm_type)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_platform_init_args init_args = {0};
bool es_active = vm_type != KVM_X86_SEV_VM;
u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0;
@@ -500,10 +498,9 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_init data;
- if (!sev->need_init)
+ if (!to_kvm_sev_info(kvm)->need_init)
return -EINVAL;
if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
@@ -543,14 +540,14 @@ static int __sev_issue_cmd(int fd, int id, void *data, int *error)
static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
return __sev_issue_cmd(sev->fd, id, data, error);
}
static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_data_launch_start start;
struct kvm_sev_launch_start params;
void *dh_blob, *session_blob;
@@ -622,9 +619,9 @@ e_free_dh:
static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
unsigned long ulen, unsigned long *n,
- int write)
+ unsigned int flags)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
unsigned long npages, size;
int npinned;
unsigned long locked, lock_limit;
@@ -663,7 +660,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
return ERR_PTR(-ENOMEM);
/* Pin the user virtual address. */
- npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+ npinned = pin_user_pages_fast(uaddr, npages, flags, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
ret = -ENOMEM;
@@ -686,11 +683,9 @@ err:
static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
unsigned long npages)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
unpin_user_pages(pages, npages);
kvfree(pages);
- sev->pages_locked -= npages;
+ to_kvm_sev_info(kvm)->pages_locked -= npages;
}
static void sev_clflush_pages(struct page *pages[], unsigned long npages)
@@ -734,7 +729,6 @@ static unsigned long get_num_contig_pages(unsigned long idx,
static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_launch_update_data params;
struct sev_data_launch_update_data data;
struct page **inpages;
@@ -751,7 +745,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
vaddr_end = vaddr + size;
/* Lock the user memory. */
- inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+ inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE);
if (IS_ERR(inpages))
return PTR_ERR(inpages);
@@ -762,7 +756,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
sev_clflush_pages(inpages, npages);
data.reserved = 0;
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
int offset, len;
@@ -802,7 +796,7 @@ e_unpin:
static int sev_es_sync_vmsa(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
struct sev_es_save_area *save = svm->sev_es.vmsa;
struct xregs_state *xsave;
const u8 *s;
@@ -972,7 +966,6 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *measure = u64_to_user_ptr(argp->data);
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_measure data;
struct kvm_sev_launch_measure params;
void __user *p = NULL;
@@ -1005,7 +998,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
}
cmd:
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
/*
@@ -1033,19 +1026,17 @@ e_free_blob:
static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_finish data;
if (!sev_guest(kvm))
return -ENOTTY;
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
}
static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_guest_status params;
struct sev_data_guest_status data;
int ret;
@@ -1055,7 +1046,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
memset(&data, 0, sizeof(data));
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
if (ret)
return ret;
@@ -1074,11 +1065,10 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
unsigned long dst, int size,
int *error, bool enc)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_dbg data;
data.reserved = 0;
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
data.dst_addr = dst;
data.src_addr = src;
data.len = size;
@@ -1250,7 +1240,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (IS_ERR(src_p))
return PTR_ERR(src_p);
- dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+ dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE);
if (IS_ERR(dst_p)) {
sev_unpin_memory(kvm, src_p, n);
return PTR_ERR(dst_p);
@@ -1302,7 +1292,6 @@ err:
static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_secret data;
struct kvm_sev_launch_secret params;
struct page **pages;
@@ -1316,7 +1305,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
- pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+ pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE);
if (IS_ERR(pages))
return PTR_ERR(pages);
@@ -1358,7 +1347,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
data.hdr_address = __psp_pa(hdr);
data.hdr_len = params.hdr_len;
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
kfree(hdr);
@@ -1378,7 +1367,6 @@ e_unpin_memory:
static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *report = u64_to_user_ptr(argp->data);
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_attestation_report data;
struct kvm_sev_attestation_report params;
void __user *p;
@@ -1411,7 +1399,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
}
cmd:
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
/*
* If we query the session length, FW responded with expected data.
@@ -1441,12 +1429,11 @@ static int
__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
struct kvm_sev_send_start *params)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_send_start data;
int ret;
memset(&data, 0, sizeof(data));
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
params->session_len = data.session_len;
@@ -1459,7 +1446,6 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_send_start data;
struct kvm_sev_send_start params;
void *amd_certs, *session_data;
@@ -1520,7 +1506,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
data.amd_certs_len = params.amd_certs_len;
data.session_address = __psp_pa(session_data);
data.session_len = params.session_len;
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
@@ -1552,12 +1538,11 @@ static int
__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
struct kvm_sev_send_update_data *params)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_send_update_data data;
int ret;
memset(&data, 0, sizeof(data));
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
params->hdr_len = data.hdr_len;
@@ -1572,7 +1557,6 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_send_update_data data;
struct kvm_sev_send_update_data params;
void *hdr, *trans_data;
@@ -1626,7 +1610,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
data.guest_address |= sev_me_mask;
data.guest_len = params.guest_len;
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
@@ -1657,31 +1641,29 @@ e_unpin:
static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_send_finish data;
if (!sev_guest(kvm))
return -ENOTTY;
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
}
static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_send_cancel data;
if (!sev_guest(kvm))
return -ENOTTY;
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
}
static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_data_receive_start start;
struct kvm_sev_receive_start params;
int *error = &argp->error;
@@ -1755,7 +1737,6 @@ e_free_pdh:
static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_receive_update_data params;
struct sev_data_receive_update_data data;
void *hdr = NULL, *trans = NULL;
@@ -1798,7 +1779,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* Pin guest memory */
guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
- PAGE_SIZE, &n, 1);
+ PAGE_SIZE, &n, FOLL_WRITE);
if (IS_ERR(guest_page)) {
ret = PTR_ERR(guest_page);
goto e_free_trans;
@@ -1815,7 +1796,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
data.guest_address |= sev_me_mask;
data.guest_len = params.guest_len;
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
&argp->error);
@@ -1832,13 +1813,12 @@ e_free_hdr:
static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_receive_finish data;
if (!sev_guest(kvm))
return -ENOTTY;
- data.handle = sev->handle;
+ data.handle = to_kvm_sev_info(kvm)->handle;
return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
}
@@ -1858,8 +1838,8 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id)
static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
- struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
- struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+ struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
+ struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
int r = -EBUSY;
if (dst_kvm == src_kvm)
@@ -1893,8 +1873,8 @@ release_dst:
static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
- struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
- struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+ struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
+ struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
mutex_unlock(&dst_kvm->lock);
mutex_unlock(&src_kvm->lock);
@@ -1968,8 +1948,8 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
{
- struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
- struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info;
+ struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm);
+ struct kvm_sev_info *src = to_kvm_sev_info(src_kvm);
struct kvm_vcpu *dst_vcpu, *src_vcpu;
struct vcpu_svm *dst_svm, *src_svm;
struct kvm_sev_info *mirror;
@@ -2009,8 +1989,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
* and add the new mirror to the list.
*/
if (is_mirroring_enc_context(dst_kvm)) {
- struct kvm_sev_info *owner_sev_info =
- &to_kvm_svm(dst->enc_context_owner)->sev_info;
+ struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner);
list_del(&src->mirror_entry);
list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
@@ -2069,7 +2048,7 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
- struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm);
struct kvm_sev_info *src_sev, *cg_cleanup_sev;
CLASS(fd, f)(source_fd);
struct kvm *source_kvm;
@@ -2093,7 +2072,7 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
goto out_unlock;
}
- src_sev = &to_kvm_svm(source_kvm)->sev_info;
+ src_sev = to_kvm_sev_info(source_kvm);
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev;
@@ -2181,7 +2160,7 @@ static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
static int snp_bind_asid(struct kvm *kvm, int *error)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_data_snp_activate data = {0};
data.gctx_paddr = __psp_pa(sev->snp_context);
@@ -2191,7 +2170,7 @@ static int snp_bind_asid(struct kvm *kvm, int *error)
static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_data_snp_launch_start start = {0};
struct kvm_sev_snp_launch_start params;
int rc;
@@ -2260,7 +2239,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf
void __user *src, int order, void *opaque)
{
struct sev_gmem_populate_args *sev_populate_args = opaque;
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
int n_private = 0, ret, i;
int npages = (1 << order);
gfn_t gfn;
@@ -2350,7 +2329,7 @@ err:
static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_gmem_populate_args sev_populate_args = {0};
struct kvm_sev_snp_launch_update params;
struct kvm_memory_slot *memslot;
@@ -2434,7 +2413,7 @@ out:
static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_data_snp_launch_update data = {};
struct kvm_vcpu *vcpu;
unsigned long i;
@@ -2482,7 +2461,7 @@ static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct kvm_sev_snp_launch_finish params;
struct sev_data_snp_launch_finish *data;
void *id_block = NULL, *id_auth = NULL;
@@ -2677,7 +2656,7 @@ out:
int sev_mem_enc_register_region(struct kvm *kvm,
struct kvm_enc_region *range)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct enc_region *region;
int ret = 0;
@@ -2696,7 +2675,8 @@ int sev_mem_enc_register_region(struct kvm *kvm,
return -ENOMEM;
mutex_lock(&kvm->lock);
- region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+ region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages,
+ FOLL_WRITE | FOLL_LONGTERM);
if (IS_ERR(region->pages)) {
ret = PTR_ERR(region->pages);
mutex_unlock(&kvm->lock);
@@ -2729,7 +2709,7 @@ e_free:
static struct enc_region *
find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct list_head *head = &sev->regions_list;
struct enc_region *i;
@@ -2824,9 +2804,9 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
- source_sev = &to_kvm_svm(source_kvm)->sev_info;
+ source_sev = to_kvm_sev_info(source_kvm);
kvm_get_kvm(source_kvm);
- mirror_sev = &to_kvm_svm(kvm)->sev_info;
+ mirror_sev = to_kvm_sev_info(kvm);
list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
/* Set enc_context_owner and copy its encryption context over */
@@ -2854,7 +2834,7 @@ e_unlock:
static int snp_decommission_context(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_data_snp_addr data = {};
int ret;
@@ -2879,7 +2859,7 @@ static int snp_decommission_context(struct kvm *kvm)
void sev_vm_destroy(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
@@ -3271,7 +3251,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
if (kvm_ghcb_xcr0_is_valid(svm)) {
vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
}
/* Copy the GHCB exit information into the VMCB fields */
@@ -3430,8 +3410,7 @@ vmgexit_err:
dump_ghcb(svm);
}
- ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason);
+ svm_vmgexit_bad_input(svm, reason);
/* Resume the guest to "return" the error code. */
return 1;
@@ -3472,10 +3451,19 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm)
svm->sev_es.ghcb = NULL;
}
-void pre_sev_run(struct vcpu_svm *svm, int cpu)
+int pre_sev_run(struct vcpu_svm *svm, int cpu)
{
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
- unsigned int asid = sev_get_asid(svm->vcpu.kvm);
+ struct kvm *kvm = svm->vcpu.kvm;
+ unsigned int asid = sev_get_asid(kvm);
+
+ /*
+ * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid
+ * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP
+ * AP Destroy event.
+ */
+ if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
+ return -EINVAL;
/* Assign the asid allocated with this SEV guest */
svm->asid = asid;
@@ -3488,11 +3476,12 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
*/
if (sd->sev_vmcbs[asid] == svm->vmcb &&
svm->vcpu.arch.last_vmentry_cpu == cpu)
- return;
+ return 0;
sd->sev_vmcbs[asid] = svm->vmcb;
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ return 0;
}
#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
@@ -3574,8 +3563,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
return 0;
e_scratch:
- ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
+ svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA);
return 1;
}
@@ -3675,7 +3663,14 @@ static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
svm->sev_es.psc_inflight = 0;
svm->sev_es.psc_idx = 0;
svm->sev_es.psc_2m = false;
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
+
+ /*
+ * PSC requests always get a "no action" response in SW_EXITINFO1, with
+ * a PSC-specific return code in SW_EXITINFO2 that provides the "real"
+ * return code. E.g. if the PSC request was interrupted, the need to
+ * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1.
+ */
+ svm_vmgexit_no_action(svm, psc_ret);
}
static void __snp_complete_one_psc(struct vcpu_svm *svm)
@@ -3847,110 +3842,90 @@ next_range:
BUG();
}
-static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
+/*
+ * Invoked as part of svm_vcpu_reset() processing of an init event.
+ */
+void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_memory_slot *slot;
+ struct page *page;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
- WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex));
+ if (!sev_snp_guest(vcpu->kvm))
+ return;
+
+ guard(mutex)(&svm->sev_es.snp_vmsa_mutex);
+
+ if (!svm->sev_es.snp_ap_waiting_for_reset)
+ return;
+
+ svm->sev_es.snp_ap_waiting_for_reset = false;
/* Mark the vCPU as offline and not runnable */
vcpu->arch.pv.pv_unhalted = false;
- vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
/* Clear use of the VMSA */
svm->vmcb->control.vmsa_pa = INVALID_PAGE;
- if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) {
- gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
- struct kvm_memory_slot *slot;
- struct page *page;
- kvm_pfn_t pfn;
-
- slot = gfn_to_memslot(vcpu->kvm, gfn);
- if (!slot)
- return -EINVAL;
-
- /*
- * The new VMSA will be private memory guest memory, so
- * retrieve the PFN from the gmem backend.
- */
- if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
- return -EINVAL;
-
- /*
- * From this point forward, the VMSA will always be a
- * guest-mapped page rather than the initial one allocated
- * by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa
- * could be free'd and cleaned up here, but that involves
- * cleanups like wbinvd_on_all_cpus() which would ideally
- * be handled during teardown rather than guest boot.
- * Deferring that also allows the existing logic for SEV-ES
- * VMSAs to be re-used with minimal SNP-specific changes.
- */
- svm->sev_es.snp_has_guest_vmsa = true;
-
- /* Use the new VMSA */
- svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
-
- /* Mark the vCPU as runnable */
- vcpu->arch.pv.pv_unhalted = false;
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-
- svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
-
- /*
- * gmem pages aren't currently migratable, but if this ever
- * changes then care should be taken to ensure
- * svm->sev_es.vmsa is pinned through some other means.
- */
- kvm_release_page_clean(page);
- }
-
/*
* When replacing the VMSA during SEV-SNP AP creation,
* mark the VMCB dirty so that full state is always reloaded.
*/
vmcb_mark_all_dirty(svm->vmcb);
- return 0;
-}
+ if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa))
+ return;
-/*
- * Invoked as part of svm_vcpu_reset() processing of an init event.
- */
-void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int ret;
+ gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
+ svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
- if (!sev_snp_guest(vcpu->kvm))
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!slot)
return;
- mutex_lock(&svm->sev_es.snp_vmsa_mutex);
+ /*
+ * The new VMSA will be private memory guest memory, so retrieve the
+ * PFN from the gmem backend.
+ */
+ if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
+ return;
- if (!svm->sev_es.snp_ap_waiting_for_reset)
- goto unlock;
+ /*
+ * From this point forward, the VMSA will always be a guest-mapped page
+ * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
+ * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
+ * that involves cleanups like wbinvd_on_all_cpus() which would ideally
+ * be handled during teardown rather than guest boot. Deferring that
+ * also allows the existing logic for SEV-ES VMSAs to be re-used with
+ * minimal SNP-specific changes.
+ */
+ svm->sev_es.snp_has_guest_vmsa = true;
- svm->sev_es.snp_ap_waiting_for_reset = false;
+ /* Use the new VMSA */
+ svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
- ret = __sev_snp_update_protected_guest_state(vcpu);
- if (ret)
- vcpu_unimpl(vcpu, "snp: AP state update on init failed\n");
+ /* Mark the vCPU as runnable */
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
-unlock:
- mutex_unlock(&svm->sev_es.snp_vmsa_mutex);
+ /*
+ * gmem pages aren't currently migratable, but if this ever changes
+ * then care should be taken to ensure svm->sev_es.vmsa is pinned
+ * through some other means.
+ */
+ kvm_release_page_clean(page);
}
static int sev_snp_ap_creation(struct vcpu_svm *svm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
struct kvm_vcpu *vcpu = &svm->vcpu;
struct kvm_vcpu *target_vcpu;
struct vcpu_svm *target_svm;
unsigned int request;
unsigned int apic_id;
- bool kick;
- int ret;
request = lower_32_bits(svm->vmcb->control.exit_info_1);
apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
@@ -3963,47 +3938,23 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
return -EINVAL;
}
- ret = 0;
-
target_svm = to_svm(target_vcpu);
- /*
- * The target vCPU is valid, so the vCPU will be kicked unless the
- * request is for CREATE_ON_INIT. For any errors at this stage, the
- * kick will place the vCPU in an non-runnable state.
- */
- kick = true;
-
- mutex_lock(&target_svm->sev_es.snp_vmsa_mutex);
-
- target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
- target_svm->sev_es.snp_ap_waiting_for_reset = true;
-
- /* Interrupt injection mode shouldn't change for AP creation */
- if (request < SVM_VMGEXIT_AP_DESTROY) {
- u64 sev_features;
-
- sev_features = vcpu->arch.regs[VCPU_REGS_RAX];
- sev_features ^= sev->vmsa_features;
-
- if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) {
- vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n",
- vcpu->arch.regs[VCPU_REGS_RAX]);
- ret = -EINVAL;
- goto out;
- }
- }
+ guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex);
switch (request) {
case SVM_VMGEXIT_AP_CREATE_ON_INIT:
- kick = false;
- fallthrough;
case SVM_VMGEXIT_AP_CREATE:
+ if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) {
+ vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n",
+ vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features);
+ return -EINVAL;
+ }
+
if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
svm->vmcb->control.exit_info_2);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
/*
@@ -4017,30 +3968,32 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
vcpu_unimpl(vcpu,
"vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
svm->vmcb->control.exit_info_2);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
break;
case SVM_VMGEXIT_AP_DESTROY:
+ target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
break;
default:
vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
request);
- ret = -EINVAL;
- break;
+ return -EINVAL;
}
-out:
- if (kick) {
+ target_svm->sev_es.snp_ap_waiting_for_reset = true;
+
+ /*
+ * Unless Creation is deferred until INIT, signal the vCPU to update
+ * its state.
+ */
+ if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) {
kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
kvm_vcpu_kick(target_vcpu);
}
- mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex);
-
- return ret;
+ return 0;
}
static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
@@ -4079,7 +4032,8 @@ static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_
goto out_unlock;
}
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(0, fw_err));
+ /* No action is requested *from KVM* if there was a firmware error. */
+ svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err));
ret = 1; /* resume guest */
@@ -4135,8 +4089,7 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r
return snp_handle_guest_req(svm, req_gpa, resp_gpa);
request_invalid:
- ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
+ svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
return 1; /* resume guest */
}
@@ -4144,7 +4097,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct kvm_vcpu *vcpu = &svm->vcpu;
- struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
u64 ghcb_info;
int ret = 1;
@@ -4328,8 +4281,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
if (ret)
return ret;
- ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 0);
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 0);
+ svm_vmgexit_success(svm, 0);
exit_code = kvm_ghcb_get_sw_exit_code(control);
switch (exit_code) {
@@ -4364,7 +4316,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ret = kvm_emulate_ap_reset_hold(vcpu);
break;
case SVM_VMGEXIT_AP_JUMP_TABLE: {
- struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
switch (control->exit_info_1) {
case 0:
@@ -4373,21 +4325,19 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
break;
case 1:
/* Get AP jump table address */
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, sev->ap_jump_table);
+ svm_vmgexit_success(svm, sev->ap_jump_table);
break;
default:
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
control->exit_info_1);
- ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
+ svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
}
ret = 1;
break;
}
case SVM_VMGEXIT_HV_FEATURES:
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_HV_FT_SUPPORTED);
-
+ svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED);
ret = 1;
break;
case SVM_VMGEXIT_TERM_REQUEST:
@@ -4408,8 +4358,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
case SVM_VMGEXIT_AP_CREATION:
ret = sev_snp_ap_creation(svm);
if (ret) {
- ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
+ svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
}
ret = 1;
@@ -4575,7 +4524,7 @@ void sev_init_vmcb(struct vcpu_svm *svm)
void sev_es_vcpu_reset(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
/*
* Set the GHCB MSR value as per the GHCB specification when emulating
@@ -4655,7 +4604,7 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
* Return from an AP Reset Hold VMGEXIT, where the guest will
* set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
*/
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
+ svm_vmgexit_success(svm, 1);
break;
case AP_RESET_HOLD_MSR_PROTO:
/*
@@ -4853,7 +4802,7 @@ static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
kvm_pfn_t pfn_aligned;
gfn_t gfn_aligned;
int level, rc;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e67de787fc71..d5d0c5c3300b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -607,6 +607,9 @@ static void svm_disable_virtualization_cpu(void)
kvm_cpu_svm_disable();
amd_pmu_disable_virt();
+
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
}
static int svm_enable_virtualization_cpu(void)
@@ -684,6 +687,9 @@ static int svm_enable_virtualization_cpu(void)
rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
}
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+
return 0;
}
@@ -1297,8 +1303,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_set_intercept(svm, INTERCEPT_MWAIT);
}
- if (!kvm_hlt_in_guest(vcpu->kvm))
- svm_set_intercept(svm, INTERCEPT_HLT);
+ if (!kvm_hlt_in_guest(vcpu->kvm)) {
+ if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT))
+ svm_set_intercept(svm, INTERCEPT_IDLE_HLT);
+ else
+ svm_set_intercept(svm, INTERCEPT_HLT);
+ }
control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
@@ -1559,7 +1569,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
- if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
+ if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) &&
+ static_branch_likely(&switch_vcpu_ibpb))
indirect_branch_prediction_barrier();
}
if (kvm_vcpu_apicv_active(vcpu))
@@ -1932,7 +1943,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -2973,11 +2984,7 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
return kvm_complete_insn_gp(vcpu, err);
- ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
- X86_TRAP_GP |
- SVM_EVTINJ_TYPE_EXEPT |
- SVM_EVTINJ_VALID);
+ svm_vmgexit_inject_exception(svm, X86_TRAP_GP);
return 1;
}
@@ -3293,6 +3300,17 @@ static int invpcid_interception(struct kvm_vcpu *vcpu)
type = svm->vmcb->control.exit_info_2;
gva = svm->vmcb->control.exit_info_1;
+ /*
+ * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the
+ * stack segment is used. The intercept takes priority over all
+ * #GP checks except CPL>0, but somehow still generates a linear
+ * address? The APM is sorely lacking.
+ */
+ if (is_noncanonical_address(gva, vcpu, 0)) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+
return kvm_handle_invpcid(vcpu, type, gva);
}
@@ -3363,6 +3381,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
[SVM_EXIT_CR8_WRITE_TRAP] = cr_trap,
[SVM_EXIT_INVPCID] = invpcid_interception,
+ [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt,
[SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
@@ -3525,7 +3544,7 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
return interrupt_window_interception(vcpu);
else if (exit_code == SVM_EXIT_INTR)
return intr_interception(vcpu);
- else if (exit_code == SVM_EXIT_HLT)
+ else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT)
return kvm_emulate_halt(vcpu);
else if (exit_code == SVM_EXIT_NPF)
return npf_interception(vcpu);
@@ -3608,7 +3627,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return svm_invoke_exit_handler(vcpu, exit_code);
}
-static void pre_svm_run(struct kvm_vcpu *vcpu)
+static int pre_svm_run(struct kvm_vcpu *vcpu)
{
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3630,6 +3649,8 @@ static void pre_svm_run(struct kvm_vcpu *vcpu)
/* FIXME: handle wraparound of asid_generation */
if (svm->current_vmcb->asid_generation != sd->asid_generation)
new_asid(svm, sd);
+
+ return 0;
}
static void svm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4137,20 +4158,23 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
vcpu->arch.nmi_injected = true;
svm->nmi_l1_to_l2 = nmi_l1_to_l2;
break;
- case SVM_EXITINTINFO_TYPE_EXEPT:
+ case SVM_EXITINTINFO_TYPE_EXEPT: {
+ u32 error_code = 0;
+
/*
* Never re-inject a #VC exception.
*/
if (vector == X86_TRAP_VC)
break;
- if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
- u32 err = svm->vmcb->control.exit_int_info_err;
- kvm_requeue_exception_e(vcpu, vector, err);
+ if (exitintinfo & SVM_EXITINTINFO_VALID_ERR)
+ error_code = svm->vmcb->control.exit_int_info_err;
- } else
- kvm_requeue_exception(vcpu, vector);
+ kvm_requeue_exception(vcpu, vector,
+ exitintinfo & SVM_EXITINTINFO_VALID_ERR,
+ error_code);
break;
+ }
case SVM_EXITINTINFO_TYPE_INTR:
kvm_queue_interrupt(vcpu, vector, false);
break;
@@ -4266,7 +4290,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
if (force_immediate_exit)
smp_send_reschedule(vcpu->cpu);
- pre_svm_run(vcpu);
+ if (pre_svm_run(vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR;
+ vcpu->run->fail_entry.cpu = vcpu->cpu;
+ return EXIT_FASTPATH_EXIT_USERSPACE;
+ }
sync_lapic_to_cr8(vcpu);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index ea44c1da5a7c..d4490eaed55d 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -361,20 +361,18 @@ static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm)
#ifdef CONFIG_KVM_AMD_SEV
static __always_inline bool sev_guest(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
- return sev->active;
+ return to_kvm_sev_info(kvm)->active;
}
static __always_inline bool sev_es_guest(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
return sev->es_active && !WARN_ON_ONCE(!sev->active);
}
static __always_inline bool sev_snp_guest(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) &&
!WARN_ON_ONCE(!sev_es_guest(kvm));
@@ -581,6 +579,35 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
return false;
}
+static inline void svm_vmgexit_set_return_code(struct vcpu_svm *svm,
+ u64 response, u64 data)
+{
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, response);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, data);
+}
+
+static inline void svm_vmgexit_inject_exception(struct vcpu_svm *svm, u8 vector)
+{
+ u64 data = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT | vector;
+
+ svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_ISSUE_EXCEPTION, data);
+}
+
+static inline void svm_vmgexit_bad_input(struct vcpu_svm *svm, u64 suberror)
+{
+ svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_MALFORMED_INPUT, suberror);
+}
+
+static inline void svm_vmgexit_success(struct vcpu_svm *svm, u64 data)
+{
+ svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data);
+}
+
+static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data)
+{
+ svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data);
+}
+
/* svm.c */
#define MSR_INVALID 0xffffffffU
@@ -715,7 +742,7 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
/* sev.c */
-void pre_sev_run(struct vcpu_svm *svm, int cpu);
+int pre_sev_run(struct vcpu_svm *svm, int cpu);
void sev_init_vmcb(struct vcpu_svm *svm);
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0b844cb97978..ccda95e53f62 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -830,12 +830,12 @@ TRACE_EVENT(kvm_emulate_insn,
TP_ARGS(vcpu, failed),
TP_STRUCT__entry(
- __field( __u64, rip )
- __field( __u32, csbase )
- __field( __u8, len )
- __array( __u8, insn, 15 )
- __field( __u8, flags )
- __field( __u8, failed )
+ __field( __u64, rip )
+ __field( __u32, csbase )
+ __field( __u8, len )
+ __array( __u8, insn, X86_MAX_INSTRUCTION_LENGTH )
+ __field( __u8, flags )
+ __field( __u8, failed )
),
TP_fast_assign(
@@ -846,7 +846,7 @@ TRACE_EVENT(kvm_emulate_insn,
__entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len;
memcpy(__entry->insn,
vcpu->arch.emulate_ctxt->fetch.data,
- 15);
+ X86_MAX_INSTRUCTION_LENGTH);
__entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt->mode);
__entry->failed = failed;
),
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ed8a3cb53961..5504d9e9fd32 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2970,7 +2970,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
case INTR_TYPE_SOFT_EXCEPTION:
case INTR_TYPE_SOFT_INTR:
case INTR_TYPE_PRIV_SW_EXCEPTION:
- if (CC(vmcs12->vm_entry_instruction_len > 15) ||
+ if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) ||
CC(vmcs12->vm_entry_instruction_len == 0 &&
CC(!nested_cpu_has_zero_length_injection(vcpu))))
return -EINVAL;
@@ -3771,7 +3771,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
break;
case GUEST_ACTIVITY_WAIT_SIPI:
vmx->nested.nested_run_pending = 0;
- vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
break;
default:
break;
@@ -4618,7 +4618,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
*/
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
u32 vm_exit_reason, u32 exit_intr_info,
- unsigned long exit_qualification)
+ unsigned long exit_qualification, u32 exit_insn_len)
{
/* update exit information fields: */
vmcs12->vm_exit_reason = vm_exit_reason;
@@ -4646,7 +4646,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vm_exit_reason, exit_intr_info);
vmcs12->vm_exit_intr_info = exit_intr_info;
- vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs12->vm_exit_instruction_len = exit_insn_len;
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
/*
@@ -4930,8 +4930,9 @@ vmabort:
* and modify vmcs12 to make it see what it would expect to see there if
* L2 was its real guest. Must only be called when in L2 (is_guest_mode())
*/
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
- u32 exit_intr_info, unsigned long exit_qualification)
+void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ u32 exit_intr_info, unsigned long exit_qualification,
+ u32 exit_insn_len)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -4981,7 +4982,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
if (vm_exit_reason != -1)
prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
- exit_intr_info, exit_qualification);
+ exit_intr_info, exit_qualification,
+ exit_insn_len);
/*
* Must happen outside of sync_vmcs02_to_vmcs12() as it will
@@ -5071,7 +5073,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx->nested.need_vmcs12_to_shadow_sync = true;
/* in case we halted in L2 */
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
if (likely(!vmx->fail)) {
if (vm_exit_reason != -1)
@@ -5327,9 +5329,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
goto out_shadow_vmcs;
- hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS_PINNED);
- vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+ hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED);
vmx->nested.vpid02 = allocate_vpid();
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 2c296b6abb8c..6eedcfc91070 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -26,8 +26,26 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
bool from_vmentry);
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu);
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
- u32 exit_intr_info, unsigned long exit_qualification);
+void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ u32 exit_intr_info, unsigned long exit_qualification,
+ u32 exit_insn_len);
+
+static inline void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ u32 exit_intr_info,
+ unsigned long exit_qualification)
+{
+ u32 exit_insn_len;
+
+ if (to_vmx(vcpu)->fail || vm_exit_reason == -1 ||
+ (vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ exit_insn_len = 0;
+ else
+ exit_insn_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+ __nested_vmx_vmexit(vcpu, vm_exit_reason, exit_intr_info,
+ exit_qualification, exit_insn_len);
+}
+
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 3b92f893b239..5c5766467a61 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1477,7 +1477,8 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
* performs IBPB on nested VM-Exit (a single nested transition
* may switch the active VMCS multiple times).
*/
- if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
+ if (static_branch_likely(&switch_vcpu_ibpb) &&
+ (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)))
indirect_branch_prediction_barrier();
}
@@ -2578,6 +2579,34 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
return ctl_opt & allowed;
}
+#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \
+({ \
+ int i, r = 0; \
+ \
+ BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \
+ BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \
+ \
+ for (i = 0; i < ARRAY_SIZE(pairs); i++) { \
+ typeof(entry_controls) n_ctrl = pairs[i].entry_control; \
+ typeof(exit_controls) x_ctrl = pairs[i].exit_control; \
+ \
+ if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \
+ continue; \
+ \
+ pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \
+ "entry = %llx (%llx), exit = %llx (%llx)\n", \
+ (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \
+ (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \
+ \
+ if (error_on_inconsistent_vmcs_config) \
+ r = -EIO; \
+ \
+ entry_controls &= ~n_ctrl; \
+ exit_controls &= ~x_ctrl; \
+ } \
+ r; \
+})
+
static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
struct vmx_capability *vmx_cap)
{
@@ -2589,7 +2618,6 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
u32 _vmentry_control = 0;
u64 basic_msr;
u64 misc_msr;
- int i;
/*
* LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
@@ -2693,22 +2721,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_vmentry_control))
return -EIO;
- for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
- u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
- u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
-
- if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
- continue;
-
- pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
- _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
-
- if (error_on_inconsistent_vmcs_config)
- return -EIO;
-
- _vmentry_control &= ~n_ctrl;
- _vmexit_control &= ~x_ctrl;
- }
+ if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs,
+ _vmentry_control, _vmexit_control))
+ return -EIO;
/*
* Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
@@ -3519,7 +3534,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcs_writel(GUEST_CR4, hw_cr4);
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
}
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
@@ -5211,6 +5226,12 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
(kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
}
+static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.guest_fpu.fpstate->xfd &&
+ !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS);
+}
+
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5237,7 +5258,8 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
* point.
*/
if (is_nm_fault(intr_info)) {
- kvm_queue_exception(vcpu, NM_VECTOR);
+ kvm_queue_exception_p(vcpu, NM_VECTOR,
+ is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0);
return 1;
}
@@ -5817,7 +5839,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
? PFERR_FETCH_MASK : 0;
/* ept page table entry is present? */
- error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
+ error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
? PFERR_PRESENT_MASK : 0;
if (error_code & EPT_VIOLATION_GVA_IS_VALID)
@@ -5871,11 +5893,35 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
return 1;
}
-static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
+/*
+ * Returns true if emulation is required (due to the vCPU having invalid state
+ * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the
+ * current vCPU state.
+ */
+static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- return vmx->emulation_required && !vmx->rmode.vm86_active &&
+ if (!vmx->emulation_required)
+ return false;
+
+ /*
+ * It is architecturally impossible for emulation to be required when a
+ * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
+ * guest state is invalid and unrestricted guest is disabled, i.e. KVM
+ * should synthesize VM-Fail instead emulation L2 code. This path is
+ * only reachable if userspace modifies L2 guest state after KVM has
+ * performed the nested VM-Enter consistency checks.
+ */
+ if (vmx->nested.nested_run_pending)
+ return true;
+
+ /*
+ * KVM only supports emulating exceptions if the vCPU is in Real Mode.
+ * If emulation is required, KVM can't perform a successful VM-Enter to
+ * inject the exception.
+ */
+ return !vmx->rmode.vm86_active &&
(kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
}
@@ -5898,7 +5944,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (!kvm_emulate_instruction(vcpu, 0))
return 0;
- if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ if (vmx_unhandleable_emulation_required(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -5922,7 +5968,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
- if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ if (vmx_unhandleable_emulation_required(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -6997,16 +7043,15 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
* MSR value is not clobbered by the host activity before the guest
* has chance to consume it.
*
- * Do not blindly read xfd_err here, since this exception might
- * be caused by L1 interception on a platform which doesn't
- * support xfd at all.
+ * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
+ * interception may have been caused by L1 interception. Per the SDM,
+ * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
*
- * Do it conditionally upon guest_fpu::xfd. xfd_err matters
- * only when xfd contains a non-zero value.
- *
- * Queuing exception is done in vmx_handle_exit. See comment there.
+ * Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
+ * unlike CR2 and DR6, the value is not a payload that is attached to
+ * the #NM exception.
*/
- if (vcpu->arch.guest_fpu.fpstate->xfd)
+ if (is_xfd_nm_fault(vcpu))
rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
@@ -7157,13 +7202,17 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
case INTR_TYPE_SOFT_EXCEPTION:
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
fallthrough;
- case INTR_TYPE_HARD_EXCEPTION:
- if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
- u32 err = vmcs_read32(error_code_field);
- kvm_requeue_exception_e(vcpu, vector, err);
- } else
- kvm_requeue_exception(vcpu, vector);
+ case INTR_TYPE_HARD_EXCEPTION: {
+ u32 error_code = 0;
+
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(error_code_field);
+
+ kvm_requeue_exception(vcpu, vector,
+ idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
+ error_code);
break;
+ }
case INTR_TYPE_SOFT_INTR:
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
fallthrough;
@@ -8005,38 +8054,50 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}
-static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info)
+static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ unsigned long *exit_qualification)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
unsigned short port;
- bool intercept;
int size;
+ bool imm;
+
+ /*
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+ */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
if (info->intercept == x86_intercept_in ||
info->intercept == x86_intercept_ins) {
port = info->src_val;
size = info->dst_bytes;
+ imm = info->src_type == OP_IMM;
} else {
port = info->dst_val;
size = info->src_bytes;
+ imm = info->dst_type == OP_IMM;
}
- /*
- * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
- * VM-exits depend on the 'unconditional IO exiting' VM-execution
- * control.
- *
- * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
- */
- if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
- intercept = nested_cpu_has(vmcs12,
- CPU_BASED_UNCOND_IO_EXITING);
- else
- intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
- /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
- return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+ *exit_qualification = ((unsigned long)port << 16) | (size - 1);
+
+ if (info->intercept == x86_intercept_ins ||
+ info->intercept == x86_intercept_outs)
+ *exit_qualification |= BIT(4);
+
+ if (info->rep_prefix)
+ *exit_qualification |= BIT(5);
+
+ if (imm)
+ *exit_qualification |= BIT(6);
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
}
int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -8045,26 +8106,34 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu,
struct x86_exception *exception)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long exit_qualification = 0;
+ u32 vm_exit_reason;
+ u64 exit_insn_len;
switch (info->intercept) {
- /*
- * RDPID causes #UD if disabled through secondary execution controls.
- * Because it is marked as EmulateOnUD, we need to intercept it here.
- * Note, RDPID is hidden behind ENABLE_RDTSCP.
- */
case x86_intercept_rdpid:
+ /*
+ * RDPID causes #UD if not enabled through secondary execution
+ * controls (ENABLE_RDTSCP). Note, the implicit MSR access to
+ * TSC_AUX is NOT subject to interception, i.e. checking only
+ * the dedicated execution control is architecturally correct.
+ */
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR;
exception->error_code_valid = false;
return X86EMUL_PROPAGATE_FAULT;
}
- break;
+ return X86EMUL_CONTINUE;
case x86_intercept_in:
case x86_intercept_ins:
case x86_intercept_out:
case x86_intercept_outs:
- return vmx_check_intercept_io(vcpu, info);
+ if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_IO_INSTRUCTION;
+ break;
case x86_intercept_lgdt:
case x86_intercept_lidt:
@@ -8077,7 +8146,24 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu,
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
return X86EMUL_CONTINUE;
- /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ if (info->intercept == x86_intercept_lldt ||
+ info->intercept == x86_intercept_ltr ||
+ info->intercept == x86_intercept_sldt ||
+ info->intercept == x86_intercept_str)
+ vm_exit_reason = EXIT_REASON_LDTR_TR;
+ else
+ vm_exit_reason = EXIT_REASON_GDTR_IDTR;
+ /*
+ * FIXME: Decode the ModR/M to generate the correct exit
+ * qualification for memory operands.
+ */
+ break;
+
+ case x86_intercept_hlt:
+ if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_HLT;
break;
case x86_intercept_pause:
@@ -8090,17 +8176,24 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu,
* the PAUSE.
*/
if ((info->rep_prefix != REPE_PREFIX) ||
- !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING))
+ !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
return X86EMUL_CONTINUE;
+ vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
break;
/* TODO: check more intercepts... */
default:
- break;
+ return X86EMUL_UNHANDLEABLE;
}
- return X86EMUL_UNHANDLEABLE;
+ exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
+ if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
+ return X86EMUL_UNHANDLEABLE;
+
+ __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
+ exit_insn_len);
+ return X86EMUL_INTERCEPTED;
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 633c87e2fd92..96677576c836 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -118,7 +118,7 @@ do_exception:
#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
- asm volatile("1: vmread %2, %1\n\t"
+ asm volatile("1: vmread %[field], %[output]\n\t"
".byte 0x3e\n\t" /* branch taken hint */
"ja 3f\n\t"
@@ -127,24 +127,26 @@ do_exception:
* @field, and bounce through the trampoline to preserve
* volatile registers.
*/
- "xorl %k1, %k1\n\t"
+ "xorl %k[output], %k[output]\n\t"
"2:\n\t"
- "push %1\n\t"
- "push %2\n\t"
+ "push %[output]\n\t"
+ "push %[field]\n\t"
"call vmread_error_trampoline\n\t"
/*
* Unwind the stack. Note, the trampoline zeros out the
* memory for @fault so that the result is '0' on error.
*/
- "pop %2\n\t"
- "pop %1\n\t"
+ "pop %[field]\n\t"
+ "pop %[output]\n\t"
"3:\n\t"
/* VMREAD faulted. As above, except push '1' for @fault. */
- _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1)
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[output])
- : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc");
+ : ASM_CALL_CONSTRAINT, [output] "=&r" (value)
+ : [field] "r" (field)
+ : "cc");
return value;
#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b64ab350bcd..c841817a914a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -800,9 +800,9 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto
ex->payload = payload;
}
-static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
- unsigned nr, bool has_error, u32 error_code,
- bool has_payload, unsigned long payload, bool reinject)
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr,
+ bool has_error, u32 error_code,
+ bool has_payload, unsigned long payload)
{
u32 prev_nr;
int class1, class2;
@@ -810,13 +810,10 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
kvm_make_request(KVM_REQ_EVENT, vcpu);
/*
- * If the exception is destined for L2 and isn't being reinjected,
- * morph it to a VM-Exit if L1 wants to intercept the exception. A
- * previously injected exception is not checked because it was checked
- * when it was original queued, and re-checking is incorrect if _L1_
- * injected the exception, in which case it's exempt from interception.
+ * If the exception is destined for L2, morph it to a VM-Exit if L1
+ * wants to intercept the exception.
*/
- if (!reinject && is_guest_mode(vcpu) &&
+ if (is_guest_mode(vcpu) &&
kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
has_payload, payload);
@@ -825,28 +822,9 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
queue:
- if (reinject) {
- /*
- * On VM-Entry, an exception can be pending if and only
- * if event injection was blocked by nested_run_pending.
- * In that case, however, vcpu_enter_guest() requests an
- * immediate exit, and the guest shouldn't proceed far
- * enough to need reinjection.
- */
- WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
- vcpu->arch.exception.injected = true;
- if (WARN_ON_ONCE(has_payload)) {
- /*
- * A reinjected event has already
- * delivered its payload.
- */
- has_payload = false;
- payload = 0;
- }
- } else {
- vcpu->arch.exception.pending = true;
- vcpu->arch.exception.injected = false;
- }
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.injected = false;
+
vcpu->arch.exception.has_error_code = has_error;
vcpu->arch.exception.vector = nr;
vcpu->arch.exception.error_code = error_code;
@@ -887,29 +865,52 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
- kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);
-void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
-{
- kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception);
void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
unsigned long payload)
{
- kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
+ kvm_multiple_exception(vcpu, nr, false, 0, true, payload);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
u32 error_code, unsigned long payload)
{
- kvm_multiple_exception(vcpu, nr, true, error_code,
- true, payload, false);
+ kvm_multiple_exception(vcpu, nr, true, error_code, true, payload);
+}
+
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
+ bool has_error_code, u32 error_code)
+{
+
+ /*
+ * On VM-Entry, an exception can be pending if and only if event
+ * injection was blocked by nested_run_pending. In that case, however,
+ * vcpu_enter_guest() requests an immediate exit, and the guest
+ * shouldn't proceed far enough to need reinjection.
+ */
+ WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
+
+ /*
+ * Do not check for interception when injecting an event for L2, as the
+ * exception was checked for intercept when it was original queued, and
+ * re-checking is incorrect if _L1_ injected the exception, in which
+ * case it's exempt from interception.
+ */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ vcpu->arch.exception.injected = true;
+ vcpu->arch.exception.has_error_code = has_error_code;
+ vcpu->arch.exception.vector = nr;
+ vcpu->arch.exception.error_code = error_code;
+ vcpu->arch.exception.has_payload = false;
+ vcpu->arch.exception.payload = 0;
}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception);
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
@@ -980,16 +981,10 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu)
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
-void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
-{
- kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
-
/*
* Checks if cpl <= required_cpl; if true, return true. Otherwise queue
* a #GP and return false.
@@ -1264,7 +1259,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
vcpu->arch.xcr0 = xcr0;
if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
return 0;
}
@@ -2080,10 +2075,20 @@ EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
{
- if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT))
+ bool enabled;
+
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS))
+ goto emulate_as_nop;
+
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT))
+ enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT);
+ else
+ enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT;
+
+ if (!enabled)
return kvm_handle_invalid_op(vcpu);
+emulate_as_nop:
pr_warn_once("%s instruction emulated as NOP!\n", insn);
return kvm_emulate_as_nop(vcpu);
}
@@ -2569,6 +2574,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
{
+ if (vcpu->arch.guest_tsc_protected)
+ return;
+
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
vcpu->arch.l1_tsc_offset,
l1_offset);
@@ -2626,12 +2634,18 @@ static inline bool kvm_check_tsc_unstable(void)
* participates in.
*/
static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
- u64 ns, bool matched)
+ u64 ns, bool matched, bool user_set_tsc)
{
struct kvm *kvm = vcpu->kvm;
lockdep_assert_held(&kvm->arch.tsc_write_lock);
+ if (vcpu->arch.guest_tsc_protected)
+ return;
+
+ if (user_set_tsc)
+ vcpu->kvm->arch.user_set_tsc = true;
+
/*
* We also track th most recent recorded KHZ, write and time to
* allow the matching interval to be extended at each write.
@@ -2717,8 +2731,6 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
}
}
- if (user_value)
- kvm->arch.user_set_tsc = true;
/*
* For a reliable TSC, we can match TSC offsets, and for an unstable
@@ -2738,7 +2750,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
matched = true;
}
- __kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
+ __kvm_synchronize_tsc(vcpu, offset, data, ns, matched, !!user_value);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
}
@@ -3116,15 +3128,17 @@ u64 get_kvmclock_ns(struct kvm *kvm)
return data.clock;
}
-static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
+static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
+ struct kvm_vcpu *vcpu,
struct gfn_to_pfn_cache *gpc,
- unsigned int offset,
- bool force_tsc_unstable)
+ unsigned int offset)
{
- struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info *guest_hv_clock;
+ struct pvclock_vcpu_time_info hv_clock;
unsigned long flags;
+ memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock));
+
read_lock_irqsave(&gpc->lock, flags);
while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
read_unlock_irqrestore(&gpc->lock, flags);
@@ -3144,52 +3158,34 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
* it is consistent.
*/
- guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
+ guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1;
smp_wmb();
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
- vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
-
- if (vcpu->pvclock_set_guest_stopped_request) {
- vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
- vcpu->pvclock_set_guest_stopped_request = false;
- }
+ hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
- memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
-
- if (force_tsc_unstable)
- guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;
+ memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock));
smp_wmb();
- guest_hv_clock->version = ++vcpu->hv_clock.version;
+ guest_hv_clock->version = ++hv_clock.version;
kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);
- trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+ trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
}
-static int kvm_guest_time_update(struct kvm_vcpu *v)
+int kvm_guest_time_update(struct kvm_vcpu *v)
{
+ struct pvclock_vcpu_time_info hv_clock = {};
unsigned long flags, tgt_tsc_khz;
unsigned seq;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
u64 tsc_timestamp, host_tsc;
- u8 pvclock_flags;
bool use_master_clock;
-#ifdef CONFIG_KVM_XEN
- /*
- * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
- * explicitly told to use TSC as its clocksource Xen will not set this bit.
- * This default behaviour led to bugs in some guest kernels which cause
- * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
- */
- bool xen_pvclock_tsc_unstable =
- ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
-#endif
kernel_ns = 0;
host_tsc = 0;
@@ -3250,35 +3246,57 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
- &vcpu->hv_clock.tsc_shift,
- &vcpu->hv_clock.tsc_to_system_mul);
+ &vcpu->pvclock_tsc_shift,
+ &vcpu->pvclock_tsc_mul);
vcpu->hw_tsc_khz = tgt_tsc_khz;
- kvm_xen_update_tsc_info(v);
}
- vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
- vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+ hv_clock.tsc_shift = vcpu->pvclock_tsc_shift;
+ hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul;
+ hv_clock.tsc_timestamp = tsc_timestamp;
+ hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
/* If the host uses TSC clocksource, then it is stable */
- pvclock_flags = 0;
+ hv_clock.flags = 0;
if (use_master_clock)
- pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+ hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT;
- vcpu->hv_clock.flags = pvclock_flags;
+ if (vcpu->pv_time.active) {
+ /*
+ * GUEST_STOPPED is only supported by kvmclock, and KVM's
+ * historic behavior is to only process the request if kvmclock
+ * is active/enabled.
+ */
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+ kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0);
+
+ hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED;
+ }
+
+ kvm_hv_setup_tsc_page(v->kvm, &hv_clock);
- if (vcpu->pv_time.active)
- kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
#ifdef CONFIG_KVM_XEN
+ /*
+ * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
+ * explicitly told to use TSC as its clocksource Xen will not set this bit.
+ * This default behaviour led to bugs in some guest kernels which cause
+ * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
+ *
+ * Note! Clear TSC_STABLE only for Xen clocks, i.e. the order matters!
+ */
+ if (ka->xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
+ hv_clock.flags &= ~PVCLOCK_TSC_STABLE_BIT;
+
if (vcpu->xen.vcpu_info_cache.active)
- kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
- offsetof(struct compat_vcpu_info, time),
- xen_pvclock_tsc_unstable);
+ kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache,
+ offsetof(struct compat_vcpu_info, time));
if (vcpu->xen.vcpu_time_info_cache.active)
- kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
- xen_pvclock_tsc_unstable);
+ kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0);
#endif
- kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -3544,7 +3562,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
sizeof(u64)))
return 1;
- vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+ vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS);
vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
kvm_async_pf_wakeup_all(vcpu);
@@ -3733,7 +3751,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
u32 msr = msr_info->index;
u64 data = msr_info->data;
- if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
+ /*
+ * Do not allow host-initiated writes to trigger the Xen hypercall
+ * page setup; it could incur locking paths which are not expected
+ * if userspace sets the MSR in an unusual location.
+ */
+ if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) &&
+ !msr_info->host_initiated)
return kvm_xen_write_hypercall_page(vcpu, data);
switch (msr) {
@@ -3889,7 +3913,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3))
return 1;
vcpu->arch.ia32_misc_enable_msr = data;
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
} else {
vcpu->arch.ia32_misc_enable_msr = data;
}
@@ -3906,7 +3930,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
if (msr_info->host_initiated) {
kvm_synchronize_tsc(vcpu, &data);
- } else {
+ } else if (!vcpu->arch.guest_tsc_protected) {
u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
adjust_tsc_offset_guest(vcpu, adj);
vcpu->arch.ia32_tsc_adjust_msr += adj;
@@ -3924,7 +3948,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~kvm_caps.supported_xss)
return 1;
vcpu->arch.ia32_xss = data;
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
@@ -4573,6 +4597,11 @@ static bool kvm_is_vm_type_supported(unsigned long type)
return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
}
+static inline u32 kvm_sync_valid_fields(struct kvm *kvm)
+{
+ return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
+}
+
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r = 0;
@@ -4681,7 +4710,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
#endif
case KVM_CAP_SYNC_REGS:
- r = KVM_SYNC_X86_VALID_FIELDS;
+ r = kvm_sync_valid_fields(kvm);
break;
case KVM_CAP_ADJUST_CLOCK:
r = KVM_CLOCK_VALID_FLAGS;
@@ -4986,7 +5015,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
u64 offset = kvm_compute_l1_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_vcpu_write_tsc_offset(vcpu, offset);
- vcpu->arch.tsc_catchup = 1;
+ if (!vcpu->arch.guest_tsc_protected)
+ vcpu->arch.tsc_catchup = 1;
}
if (kvm_lapic_hv_timer_in_use(vcpu))
@@ -5725,8 +5755,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
ns = get_kvmclock_base_ns();
- kvm->arch.user_set_tsc = true;
- __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
+ __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched, true);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
r = 0;
@@ -6905,23 +6934,15 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
unsigned long i;
- int ret = 0;
-
- mutex_lock(&kvm->lock);
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!vcpu->arch.pv_time.active)
- continue;
- ret = kvm_set_guest_paused(vcpu);
- if (ret) {
- kvm_err("Failed to pause guest VCPU%d: %d\n",
- vcpu->vcpu_id, ret);
- break;
- }
- }
- mutex_unlock(&kvm->lock);
+ /*
+ * Ignore the return, marking the guest paused only "fails" if the vCPU
+ * isn't using kvmclock; continuing on is correct and desirable.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ (void)kvm_set_guest_paused(vcpu);
- return ret ? NOTIFY_BAD : NOTIFY_DONE;
+ return NOTIFY_DONE;
}
int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
@@ -11220,9 +11241,7 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
case KVM_MP_STATE_AP_RESET_HOLD:
- vcpu->arch.pv.pv_unhalted = false;
- vcpu->arch.mp_state =
- KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
fallthrough;
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
@@ -11299,9 +11318,8 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
++vcpu->stat.halt_exits;
if (lapic_in_kernel(vcpu)) {
if (kvm_vcpu_has_events(vcpu))
- vcpu->arch.pv.pv_unhalted = false;
- else
- vcpu->arch.mp_state = state;
+ state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, state);
return 1;
} else {
vcpu->run->exit_reason = reason;
@@ -11474,6 +11492,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
struct kvm_run *kvm_run = vcpu->run;
+ u32 sync_valid_fields;
int r;
r = kvm_mmu_post_init_vm(vcpu->kvm);
@@ -11519,8 +11538,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
- if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
- (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
+ sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm);
+ if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) ||
+ (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) {
r = -EINVAL;
goto out;
}
@@ -11578,7 +11598,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
out:
kvm_put_guest_fpu(vcpu);
- if (kvm_run->kvm_valid_regs)
+ if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected))
store_regs(vcpu);
post_kvm_run_save(vcpu);
kvm_vcpu_srcu_read_unlock(vcpu);
@@ -11821,10 +11841,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
goto out;
if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
- vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
} else
- vcpu->arch.mp_state = mp_state->mp_state;
+ kvm_set_mp_state(vcpu, mp_state->mp_state);
kvm_make_request(KVM_REQ_EVENT, vcpu);
ret = 0;
@@ -11951,7 +11971,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
!is_protmode(vcpu))
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
return 0;
}
@@ -12254,9 +12274,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
else
- vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_UNINITIALIZED);
r = kvm_mmu_create(vcpu);
if (r < 0)
@@ -12363,6 +12383,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
int idx;
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_mmu_unload(vcpu);
+
kvmclock_reset(vcpu);
kvm_x86_call(vcpu_free)(vcpu);
@@ -12756,31 +12779,6 @@ out:
return ret;
}
-static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
-{
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
-}
-
-static void kvm_unload_vcpu_mmus(struct kvm *kvm)
-{
- unsigned long i;
- struct kvm_vcpu *vcpu;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_unload_vcpu_mmu(vcpu);
- }
-}
-
-void kvm_arch_sync_events(struct kvm *kvm)
-{
- cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
- cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
- kvm_free_pit(kvm);
-}
-
/**
* __x86_set_memory_region: Setup KVM internal memory slot
*
@@ -12859,6 +12857,17 @@ EXPORT_SYMBOL_GPL(__x86_set_memory_region);
void kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
+ /*
+ * Stop all background workers and kthreads before destroying vCPUs, as
+ * iterating over vCPUs in a different task while vCPUs are being freed
+ * is unsafe, i.e. will lead to use-after-free. The PIT also needs to
+ * be stopped before IRQ routing is freed.
+ */
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
+
+ kvm_free_pit(kvm);
+
kvm_mmu_pre_destroy_vm(kvm);
}
@@ -12878,9 +12887,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
mutex_unlock(&kvm->slots_lock);
}
- kvm_unload_vcpu_mmus(kvm);
kvm_destroy_vcpus(kvm);
- kvm_x86_call(vm_destroy)(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
@@ -12890,6 +12897,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_page_track_cleanup(kvm);
kvm_xen_destroy_vm(kvm);
kvm_hv_destroy_vm(kvm);
+ kvm_x86_call(vm_destroy)(kvm);
}
static void memslot_rmap_free(struct kvm_memory_slot *slot)
@@ -13383,8 +13391,8 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
if (!kvm_pv_async_pf_enabled(vcpu))
return false;
- if (vcpu->arch.apf.send_user_only &&
- kvm_x86_call(get_cpl)(vcpu) == 0)
+ if (!vcpu->arch.apf.send_always &&
+ (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu)))
return false;
if (is_guest_mode(vcpu)) {
@@ -13474,7 +13482,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
}
vcpu->arch.apf.halted = false;
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
}
void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 91e50a513100..9dc32a409076 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -121,6 +121,13 @@ static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
return vcpu->arch.last_vmentry_cpu != -1;
}
+static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state)
+{
+ vcpu->arch.mp_state = mp_state;
+ if (mp_state == KVM_MP_STATE_RUNNABLE)
+ vcpu->arch.pv.pv_unhalted = false;
+}
+
static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu)
{
return vcpu->arch.exception.pending ||
@@ -362,6 +369,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp);
+int kvm_guest_time_update(struct kvm_vcpu *v);
int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index a909b817b9c0..bd21e9c335ad 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -150,11 +150,46 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu,
+ struct pvclock_vcpu_time_info *hv_clock,
+ struct gfn_to_pfn_cache *gpc,
+ unsigned int offset)
+{
+ unsigned long flags;
+ int r;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock));
+ if (r)
+ return r;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock));
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ /*
+ * Sanity check TSC shift+multiplier to verify the guest's view of time
+ * is more or less consistent.
+ */
+ if (hv_clock->tsc_shift != vcpu->arch.pvclock_tsc_shift ||
+ hv_clock->tsc_to_system_mul != vcpu->arch.pvclock_tsc_mul)
+ return -EINVAL;
+
+ return 0;
+}
+
static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
bool linux_wa)
{
+ struct kvm_vcpu_xen *xen = &vcpu->arch.xen;
int64_t kernel_now, delta;
uint64_t guest_now;
+ int r = -EOPNOTSUPP;
/*
* The guest provides the requested timeout in absolute nanoseconds
@@ -173,10 +208,29 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
* the absolute CLOCK_MONOTONIC time at which the timer should
* fire.
*/
- if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
- static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ do {
+ struct pvclock_vcpu_time_info hv_clock;
uint64_t host_tsc, guest_tsc;
+ if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
+ !vcpu->kvm->arch.use_master_clock)
+ break;
+
+ /*
+ * If both Xen PV clocks are active, arbitrarily try to use the
+ * compat clock first, but also try to use the non-compat clock
+ * if the compat clock is unusable. The two PV clocks hold the
+ * same information, but it's possible one (or both) is stale
+ * and/or currently unreachable.
+ */
+ if (xen->vcpu_info_cache.active)
+ r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_info_cache,
+ offsetof(struct compat_vcpu_info, time));
+ if (r && xen->vcpu_time_info_cache.active)
+ r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_time_info_cache, 0);
+ if (r)
+ break;
+
if (!IS_ENABLED(CONFIG_64BIT) ||
!kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
/*
@@ -197,9 +251,10 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
/* Calculate the guest kvmclock as the guest would do it. */
guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
- guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
- guest_tsc);
- } else {
+ guest_now = __pvclock_read_cycles(&hv_clock, guest_tsc);
+ } while (0);
+
+ if (r) {
/*
* Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
*
@@ -1280,10 +1335,10 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
* Note, truncation is a non-issue as 'lm' is guaranteed to be
* false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
*/
- hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
- : kvm->arch.xen_hvm_config.blob_addr_32;
- u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
- : kvm->arch.xen_hvm_config.blob_size_32;
+ hva_t blob_addr = lm ? kvm->arch.xen.hvm_config.blob_addr_64
+ : kvm->arch.xen.hvm_config.blob_addr_32;
+ u8 blob_size = lm ? kvm->arch.xen.hvm_config.blob_size_64
+ : kvm->arch.xen.hvm_config.blob_size_32;
u8 *page;
int ret;
@@ -1324,15 +1379,24 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
xhc->blob_size_32 || xhc->blob_size_64))
return -EINVAL;
+ /*
+ * Restrict the MSR to the range that is unofficially reserved for
+ * synthetic, virtualization-defined MSRs, e.g. to prevent confusing
+ * KVM by colliding with a real MSR that requires special handling.
+ */
+ if (xhc->msr &&
+ (xhc->msr < KVM_XEN_MSR_MIN_INDEX || xhc->msr > KVM_XEN_MSR_MAX_INDEX))
+ return -EINVAL;
+
mutex_lock(&kvm->arch.xen.xen_lock);
- if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
+ if (xhc->msr && !kvm->arch.xen.hvm_config.msr)
static_branch_inc(&kvm_xen_enabled.key);
- else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
+ else if (!xhc->msr && kvm->arch.xen.hvm_config.msr)
static_branch_slow_dec_deferred(&kvm_xen_enabled);
- old_flags = kvm->arch.xen_hvm_config.flags;
- memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
+ old_flags = kvm->arch.xen.hvm_config.flags;
+ memcpy(&kvm->arch.xen.hvm_config, xhc, sizeof(*xhc));
mutex_unlock(&kvm->arch.xen.xen_lock);
@@ -1413,7 +1477,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
int i;
if (!lapic_in_kernel(vcpu) ||
- !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
+ !(vcpu->kvm->arch.xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
return false;
if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
@@ -1480,7 +1544,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
- vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
if (sched_poll.timeout)
mod_timer(&vcpu->arch.xen.poll_timer,
@@ -1491,7 +1555,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
if (sched_poll.timeout)
del_timer(&vcpu->arch.xen.poll_timer);
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
}
vcpu->arch.xen.poll_evtchn = 0;
@@ -2225,8 +2289,8 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
vcpu->arch.xen.poll_evtchn = 0;
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
- hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- vcpu->arch.xen.timer.function = xen_timer_callback;
+ hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
@@ -2247,29 +2311,6 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
del_timer_sync(&vcpu->arch.xen.poll_timer);
}
-void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *entry;
- u32 function;
-
- if (!vcpu->arch.xen.cpuid.base)
- return;
-
- function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3);
- if (function > vcpu->arch.xen.cpuid.limit)
- return;
-
- entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
- if (entry) {
- entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul;
- entry->edx = vcpu->arch.hv_clock.tsc_shift;
- }
-
- entry = kvm_find_cpuid_entry_index(vcpu, function, 2);
- if (entry)
- entry->eax = vcpu->arch.hw_tsc_khz;
-}
-
void kvm_xen_init_vm(struct kvm *kvm)
{
mutex_init(&kvm->arch.xen.xen_lock);
@@ -2291,6 +2332,6 @@ void kvm_xen_destroy_vm(struct kvm *kvm)
}
idr_destroy(&kvm->arch.xen.evtchn_ports);
- if (kvm->arch.xen_hvm_config.msr)
+ if (kvm->arch.xen.hvm_config.msr)
static_branch_slow_dec_deferred(&kvm_xen_enabled);
}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index f5841d9000ae..59e6128a7bd3 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -9,6 +9,7 @@
#ifndef __ARCH_X86_KVM_XEN_H__
#define __ARCH_X86_KVM_XEN_H__
+#include <asm/xen/cpuid.h>
#include <asm/xen/hypervisor.h>
#ifdef CONFIG_KVM_XEN
@@ -35,7 +36,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe,
int kvm_xen_setup_evtchn(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue);
-void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu);
static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
{
@@ -50,16 +50,32 @@ static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
kvm_xen_inject_vcpu_vector(vcpu);
}
+static inline bool kvm_xen_is_tsc_leaf(struct kvm_vcpu *vcpu, u32 function)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.cpuid.base &&
+ function <= vcpu->arch.xen.cpuid.limit &&
+ function == (vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3));
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return static_branch_unlikely(&kvm_xen_enabled.key) &&
- kvm->arch.xen_hvm_config.msr;
+ kvm->arch.xen.hvm_config.msr;
+}
+
+static inline bool kvm_xen_is_hypercall_page_msr(struct kvm *kvm, u32 msr)
+{
+ if (!static_branch_unlikely(&kvm_xen_enabled.key))
+ return false;
+
+ return msr && msr == kvm->arch.xen.hvm_config.msr;
}
static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
{
return static_branch_unlikely(&kvm_xen_enabled.key) &&
- (kvm->arch.xen_hvm_config.flags &
+ (kvm->arch.xen.hvm_config.flags &
KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL);
}
@@ -124,6 +140,11 @@ static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
return false;
}
+static inline bool kvm_xen_is_hypercall_page_msr(struct kvm *kvm, u32 msr)
+{
+ return false;
+}
+
static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
{
return false;
@@ -157,8 +178,9 @@ static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
return false;
}
-static inline void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
+static inline bool kvm_xen_is_tsc_leaf(struct kvm_vcpu *vcpu, u32 function)
{
+ return false;
}
#endif
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 8a59c61624c2..64ccecedc9f8 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -56,7 +56,7 @@ ifeq ($(CONFIG_X86_32),y)
lib-y += string_32.o
lib-y += memmove_32.o
lib-y += cmpxchg8b_emu.o
-ifneq ($(CONFIG_X86_CMPXCHG64),y)
+ifneq ($(CONFIG_X86_CX8),y)
lib-y += atomic64_386_32.o
endif
else
@@ -66,5 +66,6 @@ endif
lib-y += clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o copy_user_uncached_64.o
- lib-y += cmpxchg16b_emu.o
+ lib-y += cmpxchg16b_emu.o
+ lib-y += bhi.o
endif
diff --git a/arch/x86/lib/bhi.S b/arch/x86/lib/bhi.S
new file mode 100644
index 000000000000..58891681261b
--- /dev/null
+++ b/arch/x86/lib/bhi.S
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/unwind_hints.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * Notably, the FineIBT preamble calling these will have ZF set and r10 zero.
+ *
+ * The very last element is in fact larger than 32 bytes, but since its the
+ * last element, this does not matter,
+ *
+ * There are 2 #UD sites, located between 0,1-2,3 and 4,5-6,7 such that they
+ * can be reached using Jcc.d8, these elements (1 and 5) have sufficiently
+ * big alignment holes for this to not stagger the array.
+ */
+
+.pushsection .noinstr.text, "ax"
+
+ .align 32
+SYM_CODE_START(__bhi_args)
+
+#ifdef CONFIG_FINEIBT_BHI
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_1
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_1, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_1
+ cmovne %r10, %rdi
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 8
+ ANNOTATE_REACHABLE
+.Lud_1: ud2
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_2, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_1
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_3, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_1
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_4, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_2
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_5, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_2
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 8
+ ANNOTATE_REACHABLE
+.Lud_2: ud2
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_6, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_2
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_7, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_2
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ cmovne %r10, %rsp
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+#endif /* CONFIG_FINEIBT_BHI */
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ nop /* Work around toolchain+objtool quirk */
+SYM_CODE_END(__bhi_args)
+
+.popsection
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 2760a15fbc00..a508e4a8c66a 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,6 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <linux/objtool.h>
#include <asm/asm.h>
/*
@@ -14,7 +16,7 @@
* Zero a page.
* %rdi - page
*/
-SYM_FUNC_START(clear_page_rep)
+SYM_TYPED_FUNC_START(clear_page_rep)
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
@@ -22,7 +24,7 @@ SYM_FUNC_START(clear_page_rep)
SYM_FUNC_END(clear_page_rep)
EXPORT_SYMBOL_GPL(clear_page_rep)
-SYM_FUNC_START(clear_page_orig)
+SYM_TYPED_FUNC_START(clear_page_orig)
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
@@ -44,7 +46,7 @@ SYM_FUNC_START(clear_page_orig)
SYM_FUNC_END(clear_page_orig)
EXPORT_SYMBOL_GPL(clear_page_orig)
-SYM_FUNC_START(clear_page_erms)
+SYM_TYPED_FUNC_START(clear_page_erms)
movl $4096,%ecx
xorl %eax,%eax
rep stosb
@@ -63,6 +65,7 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
* rcx: uncleared bytes or 0 if successful.
*/
SYM_FUNC_START(rep_stos_alternative)
+ ANNOTATE_NOENDBR
cmpq $64,%rcx
jae .Lunrolled
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index 1c96be769adc..d4bb24347ff8 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -7,7 +7,7 @@
.text
-#ifndef CONFIG_X86_CMPXCHG64
+#ifndef CONFIG_X86_CX8
/*
* Emulate 'cmpxchg8b (%esi)' on UP
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index d6ae793d08fa..d8e87fedc20d 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -3,6 +3,7 @@
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
@@ -13,7 +14,7 @@
* prefetch distance based on SMP/UP.
*/
ALIGN
-SYM_FUNC_START(copy_page)
+SYM_TYPED_FUNC_START(copy_page)
ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
movl $4096/8, %ecx
rep movsq
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index fc9fb5d06174..aa8c341b2441 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -8,6 +8,8 @@
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <linux/objtool.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/asm.h>
@@ -30,6 +32,7 @@
* it simpler for us, we can clobber rsi/rdi and rax freely.
*/
SYM_FUNC_START(rep_movs_alternative)
+ ANNOTATE_NOENDBR
cmpq $64,%rcx
jae .Llarge
diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S
index 2918e36eece2..18350b343c2a 100644
--- a/arch/x86/lib/copy_user_uncached_64.S
+++ b/arch/x86/lib/copy_user_uncached_64.S
@@ -5,6 +5,7 @@
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/asm.h>
/*
@@ -27,6 +28,7 @@
* rax uncopied bytes or 0 if successful.
*/
SYM_FUNC_START(__copy_user_nocache)
+ ANNOTATE_NOENDBR
/* If destination is not 7-byte aligned, we'll have to align it */
testb $7,%dil
jne .Lalign
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 23f81ca3f06b..e86eda2c0b04 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -131,7 +131,7 @@ static void delay_halt_mwaitx(u64 unused, u64 cycles)
* Use cpu_tss_rw as a cacheline-aligned, seldom accessed per-cpu
* variable as the monitor target.
*/
- __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
+ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
/*
* AMD, like Intel, supports the EAX hint and EAX=0xf means, do not
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 89ecd57c9d42..9d5654b8a72a 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -28,22 +28,20 @@
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/page_types.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/runtime-const.h>
#define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC
.macro check_range size:req
.if IS_ENABLED(CONFIG_X86_64)
- movq $0x0123456789abcdef,%rdx
- 1:
- .pushsection runtime_ptr_USER_PTR_MAX,"a"
- .long 1b - 8 - .
- .popsection
+ RUNTIME_CONST_PTR USER_PTR_MAX, rdx
cmp %rdx, %rax
cmova %rdx, %rax
.else
@@ -62,6 +60,7 @@
.text
SYM_FUNC_START(__get_user_1)
+ ANNOTATE_NOENDBR
check_range size=1
ASM_STAC
UACCESS movzbl (%_ASM_AX),%edx
@@ -72,6 +71,7 @@ SYM_FUNC_END(__get_user_1)
EXPORT_SYMBOL(__get_user_1)
SYM_FUNC_START(__get_user_2)
+ ANNOTATE_NOENDBR
check_range size=2
ASM_STAC
UACCESS movzwl (%_ASM_AX),%edx
@@ -82,6 +82,7 @@ SYM_FUNC_END(__get_user_2)
EXPORT_SYMBOL(__get_user_2)
SYM_FUNC_START(__get_user_4)
+ ANNOTATE_NOENDBR
check_range size=4
ASM_STAC
UACCESS movl (%_ASM_AX),%edx
@@ -92,6 +93,7 @@ SYM_FUNC_END(__get_user_4)
EXPORT_SYMBOL(__get_user_4)
SYM_FUNC_START(__get_user_8)
+ ANNOTATE_NOENDBR
#ifndef CONFIG_X86_64
xor %ecx,%ecx
#endif
@@ -111,6 +113,7 @@ EXPORT_SYMBOL(__get_user_8)
/* .. and the same for __get_user, just without the range checks */
SYM_FUNC_START(__get_user_nocheck_1)
+ ANNOTATE_NOENDBR
ASM_STAC
ASM_BARRIER_NOSPEC
UACCESS movzbl (%_ASM_AX),%edx
@@ -121,6 +124,7 @@ SYM_FUNC_END(__get_user_nocheck_1)
EXPORT_SYMBOL(__get_user_nocheck_1)
SYM_FUNC_START(__get_user_nocheck_2)
+ ANNOTATE_NOENDBR
ASM_STAC
ASM_BARRIER_NOSPEC
UACCESS movzwl (%_ASM_AX),%edx
@@ -131,6 +135,7 @@ SYM_FUNC_END(__get_user_nocheck_2)
EXPORT_SYMBOL(__get_user_nocheck_2)
SYM_FUNC_START(__get_user_nocheck_4)
+ ANNOTATE_NOENDBR
ASM_STAC
ASM_BARRIER_NOSPEC
UACCESS movl (%_ASM_AX),%edx
@@ -141,6 +146,7 @@ SYM_FUNC_END(__get_user_nocheck_4)
EXPORT_SYMBOL(__get_user_nocheck_4)
SYM_FUNC_START(__get_user_nocheck_8)
+ ANNOTATE_NOENDBR
ASM_STAC
ASM_BARRIER_NOSPEC
#ifdef CONFIG_X86_64
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index 774bdf3e6f0a..edbeb3ecad38 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/asm.h>
@@ -9,6 +10,7 @@
* %rdi: w
*/
SYM_FUNC_START(__sw_hweight32)
+ ANNOTATE_NOENDBR
#ifdef CONFIG_X86_64
movl %edi, %eax # w
@@ -42,6 +44,7 @@ EXPORT_SYMBOL(__sw_hweight32)
*/
#ifdef CONFIG_X86_64
SYM_FUNC_START(__sw_hweight64)
+ ANNOTATE_NOENDBR
pushq %rdi
pushq %rdx
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 1b60ae81ecd8..aa1f92ee6b2e 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,6 +8,7 @@
*/
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
@@ -26,7 +27,7 @@
* Output:
* rax: dest
*/
-SYM_FUNC_START(__memmove)
+SYM_TYPED_FUNC_START(__memmove)
mov %rdi, %rax
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 0199d56cb479..d66b710d628f 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -3,6 +3,7 @@
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
@@ -28,7 +29,7 @@
* only for the return value that is the same as the source input,
* which the compiler could/should do much better anyway.
*/
-SYM_FUNC_START(__memset)
+SYM_TYPED_FUNC_START(__memset)
ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
movq %rdi,%r9
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index ebd259f31496..5ef8494896e8 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <linux/errno.h>
+#include <linux/cfi_types.h>
#include <asm/asm.h>
#include <asm/msr.h>
@@ -12,7 +13,7 @@
*
*/
.macro op_safe_regs op
-SYM_FUNC_START(\op\()_safe_regs)
+SYM_TYPED_FUNC_START(\op\()_safe_regs)
pushq %rbx
pushq %r12
movq %rdi, %r10 /* Save pointer */
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 4bf4fad5b148..5a18ecc04a6c 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -103,6 +103,7 @@ int msr_set_bit(u32 msr, u8 bit)
{
return __flip_bit(msr, bit, true);
}
+EXPORT_SYMBOL_GPL(msr_set_bit);
/**
* msr_clear_bit - Clear @bit in a MSR @msr.
@@ -118,6 +119,7 @@ int msr_clear_bit(u32 msr, u8 bit)
{
return __flip_bit(msr, bit, false);
}
+EXPORT_SYMBOL_GPL(msr_clear_bit);
#ifdef CONFIG_TRACEPOINTS
void do_trace_write_msr(unsigned int msr, u64 val, int failed)
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 975c9c18263d..46d9e9b98a61 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -13,6 +13,7 @@
*/
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/thread_info.h>
#include <asm/errno.h>
#include <asm/asm.h>
@@ -45,6 +46,7 @@
.text
SYM_FUNC_START(__put_user_1)
+ ANNOTATE_NOENDBR
check_range size=1
ASM_STAC
1: movb %al,(%_ASM_CX)
@@ -55,6 +57,7 @@ SYM_FUNC_END(__put_user_1)
EXPORT_SYMBOL(__put_user_1)
SYM_FUNC_START(__put_user_nocheck_1)
+ ANNOTATE_NOENDBR
ASM_STAC
2: movb %al,(%_ASM_CX)
xor %ecx,%ecx
@@ -64,6 +67,7 @@ SYM_FUNC_END(__put_user_nocheck_1)
EXPORT_SYMBOL(__put_user_nocheck_1)
SYM_FUNC_START(__put_user_2)
+ ANNOTATE_NOENDBR
check_range size=2
ASM_STAC
3: movw %ax,(%_ASM_CX)
@@ -74,6 +78,7 @@ SYM_FUNC_END(__put_user_2)
EXPORT_SYMBOL(__put_user_2)
SYM_FUNC_START(__put_user_nocheck_2)
+ ANNOTATE_NOENDBR
ASM_STAC
4: movw %ax,(%_ASM_CX)
xor %ecx,%ecx
@@ -83,6 +88,7 @@ SYM_FUNC_END(__put_user_nocheck_2)
EXPORT_SYMBOL(__put_user_nocheck_2)
SYM_FUNC_START(__put_user_4)
+ ANNOTATE_NOENDBR
check_range size=4
ASM_STAC
5: movl %eax,(%_ASM_CX)
@@ -93,6 +99,7 @@ SYM_FUNC_END(__put_user_4)
EXPORT_SYMBOL(__put_user_4)
SYM_FUNC_START(__put_user_nocheck_4)
+ ANNOTATE_NOENDBR
ASM_STAC
6: movl %eax,(%_ASM_CX)
xor %ecx,%ecx
@@ -102,6 +109,7 @@ SYM_FUNC_END(__put_user_nocheck_4)
EXPORT_SYMBOL(__put_user_nocheck_4)
SYM_FUNC_START(__put_user_8)
+ ANNOTATE_NOENDBR
check_range size=8
ASM_STAC
7: mov %_ASM_AX,(%_ASM_CX)
@@ -115,6 +123,7 @@ SYM_FUNC_END(__put_user_8)
EXPORT_SYMBOL(__put_user_8)
SYM_FUNC_START(__put_user_nocheck_8)
+ ANNOTATE_NOENDBR
ASM_STAC
9: mov %_ASM_AX,(%_ASM_CX)
#ifdef CONFIG_X86_32
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 391059b2c6fb..a26c43abd47d 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -326,6 +326,7 @@ SYM_FUNC_END(retbleed_untrain_ret)
#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)
SYM_FUNC_START(entry_untrain_ret)
+ ANNOTATE_NOENDBR
ALTERNATIVE JMP_RETBLEED_UNTRAIN_RET, JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO
SYM_FUNC_END(entry_untrain_ret)
__EXPORT_THUNK(entry_untrain_ret)
@@ -342,7 +343,7 @@ SYM_FUNC_START(call_depth_return_thunk)
* case.
*/
CALL_THUNKS_DEBUG_INC_RETS
- shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth)
+ shlq $5, PER_CPU_VAR(__x86_call_depth)
jz 1f
ANNOTATE_UNRET_SAFE
ret
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index e9251b89a9e9..654280aaa3e9 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -18,7 +18,7 @@
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
* clean_cache_range - write back a cache range with CLWB
- * @vaddr: virtual start address
+ * @addr: virtual start address
* @size: number of bytes to write back
*
* Write back a cache range using the CLWB (cache line write back)
diff --git a/arch/x86/math-emu/control_w.h b/arch/x86/math-emu/control_w.h
index 60f4dcc5edc3..93cbc89b34e2 100644
--- a/arch/x86/math-emu/control_w.h
+++ b/arch/x86/math-emu/control_w.h
@@ -11,7 +11,7 @@
#ifndef _CONTROLW_H_
#define _CONTROLW_H_
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define _Const_(x) $##x
#else
#define _Const_(x) x
diff --git a/arch/x86/math-emu/exception.h b/arch/x86/math-emu/exception.h
index 75230b977577..59961d350bc4 100644
--- a/arch/x86/math-emu/exception.h
+++ b/arch/x86/math-emu/exception.h
@@ -10,7 +10,7 @@
#ifndef _EXCEPTION_H_
#define _EXCEPTION_H_
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define Const_(x) $##x
#else
#define Const_(x) x
@@ -37,7 +37,7 @@
#define PRECISION_LOST_UP Const_((EX_Precision | SW_C1))
#define PRECISION_LOST_DOWN Const_(EX_Precision)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef DEBUG
#define EXCEPTION(x) { printk("exception in %s at line %d\n", \
@@ -46,6 +46,6 @@
#define EXCEPTION(x) FPU_exception(x)
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _EXCEPTION_H_ */
diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h
index 0c122226ca56..def569c50b76 100644
--- a/arch/x86/math-emu/fpu_emu.h
+++ b/arch/x86/math-emu/fpu_emu.h
@@ -20,7 +20,7 @@
*/
#define PECULIAR_486
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#include "fpu_asm.h"
#define Const(x) $##x
#else
@@ -68,7 +68,7 @@
#define FPU_Exception Const(0x80000000) /* Added to tag returns. */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include "fpu_system.h"
@@ -213,6 +213,6 @@ asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy,
#include "fpu_proto.h"
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _FPU_EMU_H_ */
diff --git a/arch/x86/math-emu/status_w.h b/arch/x86/math-emu/status_w.h
index b77bafec9526..f642957330ef 100644
--- a/arch/x86/math-emu/status_w.h
+++ b/arch/x86/math-emu/status_w.h
@@ -13,7 +13,7 @@
#include "fpu_emu.h" /* for definition of PECULIAR_486 */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define Const__(x) $##x
#else
#define Const__(x) x
@@ -37,7 +37,7 @@
#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define COMP_A_gt_B 1
#define COMP_A_eq_B 2
@@ -63,6 +63,6 @@ static inline void setcc(int cc)
# define clear_C1()
#endif /* PECULIAR_486 */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _STATUS_H_ */
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 5ab7bd2f1983..bd5d101c5c37 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -101,9 +101,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
pmd_t *pmd;
bool use_gbpage;
- next = (addr & PUD_MASK) + PUD_SIZE;
- if (next > end)
- next = end;
+ next = pud_addr_end(addr, end);
/* if this is already a gbpage, this portion is already mapped */
if (pud_leaf(*pud))
@@ -154,10 +152,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
p4d_t *p4d = p4d_page + p4d_index(addr);
pud_t *pud;
- next = (addr & P4D_MASK) + P4D_SIZE;
- if (next > end)
- next = end;
-
+ next = p4d_addr_end(addr, end);
if (p4d_present(*p4d)) {
pud = pud_offset(p4d, 0);
result = ident_pud_init(info, pud, addr, next);
@@ -199,10 +194,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
pgd_t *pgd = pgd_page + pgd_index(addr);
p4d_t *p4d;
- next = (addr & PGDIR_MASK) + PGDIR_SIZE;
- if (next > end)
- next = end;
-
+ next = pgd_addr_end(addr, end);
if (pgd_present(*pgd)) {
p4d = p4d_offset(pgd, 0);
result = ident_p4d_init(info, p4d, addr, next);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 62aa4d66a032..bfa444a7dbb0 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -645,8 +645,13 @@ static void __init memory_map_top_down(unsigned long map_start,
*/
addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
map_end);
- memblock_phys_free(addr, PMD_SIZE);
- real_end = addr + PMD_SIZE;
+ if (!addr) {
+ pr_warn("Failed to release memory for alloc_low_pages()");
+ real_end = max(map_start, ALIGN_DOWN(map_end, PMD_SIZE));
+ } else {
+ memblock_phys_free(addr, PMD_SIZE);
+ real_end = addr + PMD_SIZE;
+ }
/* step_size need to be small so pgt_buf from BRK could cover it */
step_size = PMD_SIZE;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ac41b1e0940d..f288aad8dc74 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -582,7 +582,7 @@ static void __init lowmem_pfn_init(void)
"only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
#define MSG_HIGHMEM_TRIMMED \
- "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
+ "Warning: only 4GB will be used. Support for for CONFIG_HIGHMEM64G was removed!\n"
/*
* We have more RAM than fits into lowmem - we try to put it into
* highmem, also taking the highmem=x boot parameter into account:
@@ -606,18 +606,13 @@ static void __init highmem_pfn_init(void)
#ifndef CONFIG_HIGHMEM
/* Maximum memory usable is what is directly addressable */
printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
- if (max_pfn > MAX_NONPAE_PFN)
- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
- else
- printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
-#ifndef CONFIG_HIGHMEM64G
if (max_pfn > MAX_NONPAE_PFN) {
max_pfn = MAX_NONPAE_PFN;
printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
}
-#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 01ea7c6df303..519aa53114fa 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -469,8 +469,6 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN) &&
- !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_ACPI))
set_pte_init(pte, __pte(0), init);
continue;
@@ -526,8 +524,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN) &&
- !e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_ACPI))
set_pmd_init(pmd, __pmd(0), init);
continue;
@@ -615,8 +611,6 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN) &&
- !e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_ACPI))
set_pud_init(pud, __pud(0), init);
continue;
@@ -704,8 +698,6 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN) &&
- !e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_ACPI))
set_p4d_init(p4d, __p4d(0), init);
continue;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 38ff7791a9c7..42c90b420773 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -503,6 +503,14 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
+void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags)
+{
+ if ((flags & MEMREMAP_DEC) || cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return (void __force *)ioremap_cache(phys_addr, size);
+
+ return (void __force *)ioremap_encrypted(phys_addr, size);
+}
+
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 9dddf19a5571..0539efd0d216 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,5 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
-#define DISABLE_BRANCH_PROFILING
#define pr_fmt(fmt) "kasan: " fmt
/* cpu_feature_enabled() cannot be used this early */
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 11a93542d198..3c306de52fd4 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -113,8 +113,14 @@ void __init kernel_randomize_memory(void)
memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
- /* Adapt physical memory region size based on available memory */
- if (memory_tb < kaslr_regions[0].size_tb)
+ /*
+ * Adapt physical memory region size based on available memory,
+ * except when CONFIG_PCI_P2PDMA is enabled. P2PDMA exposes the
+ * device BAR space assuming the direct map space is large enough
+ * for creating a ZONE_DEVICE mapping in the direct map corresponding
+ * to the physical BAR address.
+ */
+ if (!IS_ENABLED(CONFIG_PCI_P2PDMA) && (memory_tb < kaslr_regions[0].size_tb))
kaslr_regions[0].size_tb = memory_tb;
/*
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index b56c5c073003..7490ff6d83b1 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -7,8 +7,6 @@
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
-#define DISABLE_BRANCH_PROFILING
-
#include <linux/linkage.h>
#include <linux/init.h>
#include <linux/mm.h>
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
index e25288ee33c2..f8a33b25ae86 100644
--- a/arch/x86/mm/mem_encrypt_boot.S
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -72,6 +72,7 @@ SYM_FUNC_START(sme_encrypt_execute)
SYM_FUNC_END(sme_encrypt_execute)
SYM_FUNC_START(__enc_copy)
+ ANNOTATE_NOENDBR
/*
* Routine used to encrypt memory in place.
* This routine must be run outside of the kernel proper since
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index e6c7686f443a..5eecdd92da10 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -7,8 +7,6 @@
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
-#define DISABLE_BRANCH_PROFILING
-
/*
* Since we're dealing with identity mappings, physical and virtual
* addresses are the same, so override these defines which are ultimately
@@ -565,7 +563,7 @@ void __head sme_enable(struct boot_params *bp)
}
RIP_REL_REF(sme_me_mask) = me_mask;
- physical_mask &= ~me_mask;
- cc_vendor = CC_VENDOR_AMD;
+ RIP_REL_REF(physical_mask) &= ~me_mask;
+ RIP_REL_REF(cc_vendor) = CC_VENDOR_AMD;
cc_set_mask(me_mask);
}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index b8a6ffffb451..5ed2109211da 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -84,7 +84,6 @@ static unsigned long mmap_base(unsigned long rnd, unsigned long task_size,
{
unsigned long gap = rlim_stack->rlim_cur;
unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap;
- unsigned long gap_min, gap_max;
/* Values close to RLIM_INFINITY can overflow. */
if (gap + pad > gap)
@@ -94,13 +93,7 @@ static unsigned long mmap_base(unsigned long rnd, unsigned long task_size,
* Top of mmap area (just below the process stack).
* Leave an at least ~128 MB hole with possible stack randomization.
*/
- gap_min = SIZE_128M;
- gap_max = (task_size / 6) * 5;
-
- if (gap < gap_min)
- gap = gap_min;
- else if (gap > gap_max)
- gap = gap_max;
+ gap = clamp(gap, SIZE_128M, (task_size / 6) * 5);
return PAGE_ALIGN(task_size - gap - rnd);
}
diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c
index 3d2f7f0a6ed1..ad3c1feec990 100644
--- a/arch/x86/mm/pat/cpa-test.c
+++ b/arch/x86/mm/pat/cpa-test.c
@@ -183,7 +183,7 @@ static int pageattr_test(void)
break;
case 1:
- err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1);
+ err = change_page_attr_set(addrs, len[i], PAGE_CPA_TEST, 1);
break;
case 2:
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index feb8cc6a12bf..e40861c9cb90 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -43,6 +43,7 @@
#include <linux/fs.h>
#include <linux/rbtree.h>
+#include <asm/cpu_device_id.h>
#include <asm/cacheflush.h>
#include <asm/cacheinfo.h>
#include <asm/processor.h>
@@ -290,9 +291,8 @@ void __init pat_bp_init(void)
return;
}
- if ((c->x86_vendor == X86_VENDOR_INTEL) &&
- (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
- ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
+ if ((c->x86_vfm >= INTEL_PENTIUM_PRO && c->x86_vfm <= INTEL_PENTIUM_M_DOTHAN) ||
+ (c->x86_vfm >= INTEL_P4_WILLAMETTE && c->x86_vfm <= INTEL_P4_CEDARMILL)) {
/*
* PAT support with the lower four entries. Intel Pentium 2,
* 3, M, and 4 are affected by PAT errata, which makes the
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index ef4514d64c05..72405d315b41 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -73,6 +73,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
+#define CPA_COLLAPSE 16 /* try to collapse large pages */
static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
{
@@ -105,6 +106,18 @@ static void split_page_count(int level)
direct_pages_count[level - 1] += PTRS_PER_PTE;
}
+static void collapse_page_count(int level)
+{
+ direct_pages_count[level]++;
+ if (system_state == SYSTEM_RUNNING) {
+ if (level == PG_LEVEL_2M)
+ count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE);
+ else if (level == PG_LEVEL_1G)
+ count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE);
+ }
+ direct_pages_count[level - 1] -= PTRS_PER_PTE;
+}
+
void arch_report_meminfo(struct seq_file *m)
{
seq_printf(m, "DirectMap4k: %8lu kB\n",
@@ -122,6 +135,7 @@ void arch_report_meminfo(struct seq_file *m)
}
#else
static inline void split_page_count(int level) { }
+static inline void collapse_page_count(int level) { }
#endif
#ifdef CONFIG_X86_CPA_STATISTICS
@@ -211,14 +225,14 @@ within(unsigned long addr, unsigned long start, unsigned long end)
return addr >= start && addr < end;
}
+#ifdef CONFIG_X86_64
+
static inline int
within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
{
return addr >= start && addr <= end;
}
-#ifdef CONFIG_X86_64
-
/*
* The kernel image is mapped into two places in the virtual address space
* (addresses without KASLR, of course):
@@ -394,16 +408,49 @@ static void __cpa_flush_tlb(void *data)
flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
}
-static void cpa_flush(struct cpa_data *data, int cache)
+static int collapse_large_pages(unsigned long addr, struct list_head *pgtables);
+
+static void cpa_collapse_large_pages(struct cpa_data *cpa)
+{
+ unsigned long start, addr, end;
+ struct ptdesc *ptdesc, *tmp;
+ LIST_HEAD(pgtables);
+ int collapsed = 0;
+ int i;
+
+ if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+ for (i = 0; i < cpa->numpages; i++)
+ collapsed += collapse_large_pages(__cpa_addr(cpa, i),
+ &pgtables);
+ } else {
+ addr = __cpa_addr(cpa, 0);
+ start = addr & PMD_MASK;
+ end = addr + PAGE_SIZE * cpa->numpages;
+
+ for (addr = start; within(addr, start, end); addr += PMD_SIZE)
+ collapsed += collapse_large_pages(addr, &pgtables);
+ }
+
+ if (!collapsed)
+ return;
+
+ flush_tlb_all();
+
+ list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
+ list_del(&ptdesc->pt_list);
+ __free_page(ptdesc_page(ptdesc));
+ }
+}
+
+static void cpa_flush(struct cpa_data *cpa, int cache)
{
- struct cpa_data *cpa = data;
unsigned int i;
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
cpa_flush_all(cache);
- return;
+ goto collapse_large_pages;
}
if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling)
@@ -412,7 +459,7 @@ static void cpa_flush(struct cpa_data *data, int cache)
on_each_cpu(__cpa_flush_tlb, cpa, 1);
if (!cache)
- return;
+ goto collapse_large_pages;
mb();
for (i = 0; i < cpa->numpages; i++) {
@@ -428,6 +475,10 @@ static void cpa_flush(struct cpa_data *data, int cache)
clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
}
mb();
+
+collapse_large_pages:
+ if (cpa->flags & CPA_COLLAPSE)
+ cpa_collapse_large_pages(cpa);
}
static bool overlaps(unsigned long r1_start, unsigned long r1_end,
@@ -1197,6 +1248,161 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
return 0;
}
+static int collapse_pmd_page(pmd_t *pmd, unsigned long addr,
+ struct list_head *pgtables)
+{
+ pmd_t _pmd, old_pmd;
+ pte_t *pte, first;
+ unsigned long pfn;
+ pgprot_t pgprot;
+ int i = 0;
+
+ addr &= PMD_MASK;
+ pte = pte_offset_kernel(pmd, addr);
+ first = *pte;
+ pfn = pte_pfn(first);
+
+ /* Make sure alignment is suitable */
+ if (PFN_PHYS(pfn) & ~PMD_MASK)
+ return 0;
+
+ /* The page is 4k intentionally */
+ if (pte_flags(first) & _PAGE_KERNEL_4K)
+ return 0;
+
+ /* Check that the rest of PTEs are compatible with the first one */
+ for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
+ pte_t entry = *pte;
+
+ if (!pte_present(entry))
+ return 0;
+ if (pte_flags(entry) != pte_flags(first))
+ return 0;
+ if (pte_pfn(entry) != pte_pfn(first) + i)
+ return 0;
+ }
+
+ old_pmd = *pmd;
+
+ /* Success: set up a large page */
+ pgprot = pgprot_4k_2_large(pte_pgprot(first));
+ pgprot_val(pgprot) |= _PAGE_PSE;
+ _pmd = pfn_pmd(pfn, pgprot);
+ set_pmd(pmd, _pmd);
+
+ /* Queue the page table to be freed after TLB flush */
+ list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables);
+
+ if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) {
+ struct page *page;
+
+ /* Update all PGD tables to use the same large page */
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+ /* Something is wrong if entries doesn't match */
+ if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd)))
+ continue;
+ set_pmd(pmd, _pmd);
+ }
+ }
+
+ if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
+ collapse_page_count(PG_LEVEL_2M);
+
+ return 1;
+}
+
+static int collapse_pud_page(pud_t *pud, unsigned long addr,
+ struct list_head *pgtables)
+{
+ unsigned long pfn;
+ pmd_t *pmd, first;
+ int i;
+
+ if (!direct_gbpages)
+ return 0;
+
+ addr &= PUD_MASK;
+ pmd = pmd_offset(pud, addr);
+ first = *pmd;
+
+ /*
+ * To restore PUD page all PMD entries must be large and
+ * have suitable alignment
+ */
+ pfn = pmd_pfn(first);
+ if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK))
+ return 0;
+
+ /*
+ * To restore PUD page, all following PMDs must be compatible with the
+ * first one.
+ */
+ for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) {
+ pmd_t entry = *pmd;
+
+ if (!pmd_present(entry) || !pmd_leaf(entry))
+ return 0;
+ if (pmd_flags(entry) != pmd_flags(first))
+ return 0;
+ if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE)
+ return 0;
+ }
+
+ /* Restore PUD page and queue page table to be freed after TLB flush */
+ list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables);
+ set_pud(pud, pfn_pud(pfn, pmd_pgprot(first)));
+
+ if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
+ collapse_page_count(PG_LEVEL_1G);
+
+ return 1;
+}
+
+/*
+ * Collapse PMD and PUD pages in the kernel mapping around the address where
+ * possible.
+ *
+ * Caller must flush TLB and free page tables queued on the list before
+ * touching the new entries. CPU must not see TLB entries of different size
+ * with different attributes.
+ */
+static int collapse_large_pages(unsigned long addr, struct list_head *pgtables)
+{
+ int collapsed = 0;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ addr &= PMD_MASK;
+
+ spin_lock(&pgd_lock);
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd))
+ goto out;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ goto out;
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud) || pud_leaf(*pud))
+ goto out;
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd) || pmd_leaf(*pmd))
+ goto out;
+
+ collapsed = collapse_pmd_page(pmd, addr, pgtables);
+ if (collapsed)
+ collapsed += collapse_pud_page(pud, addr, pgtables);
+
+out:
+ spin_unlock(&pgd_lock);
+ return collapsed;
+}
+
static bool try_to_free_pte_page(pte_t *pte)
{
int i;
@@ -1942,19 +2148,6 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages,
CPA_PAGES_ARRAY, pages);
}
-/*
- * __set_memory_prot is an internal helper for callers that have been passed
- * a pgprot_t value from upper layers and a reservation has already been taken.
- * If you want to set the pgprot to a specific page protocol, use the
- * set_memory_xx() functions.
- */
-int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
-{
- return change_page_attr_set_clr(&addr, numpages, prot,
- __pgprot(~pgprot_val(prot)), 0, 0,
- NULL);
-}
-
int _set_memory_uc(unsigned long addr, int numpages)
{
/*
@@ -2120,7 +2313,8 @@ int set_memory_rox(unsigned long addr, int numpages)
if (__supported_pte_mask & _PAGE_NX)
clr.pgprot |= _PAGE_NX;
- return change_page_attr_clear(&addr, numpages, clr, 0);
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0,
+ CPA_COLLAPSE, NULL);
}
int set_memory_rw(unsigned long addr, int numpages)
@@ -2147,7 +2341,8 @@ int set_memory_p(unsigned long addr, int numpages)
int set_memory_4k(unsigned long addr, int numpages)
{
- return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ return change_page_attr_set_clr(&addr, numpages,
+ __pgprot(_PAGE_KERNEL_4K),
__pgprot(0), 1, 0, NULL);
}
@@ -2420,7 +2615,7 @@ static int __set_pages_np(struct page *page, int numpages)
.pgd = NULL,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
.flags = CPA_NO_CHECK_ALIAS };
/*
@@ -2507,7 +2702,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
.pgd = pgd,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
+ .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)),
.flags = CPA_NO_CHECK_ALIAS,
};
@@ -2550,7 +2745,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
.pgd = pgd,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
.flags = CPA_NO_CHECK_ALIAS,
};
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 1fef5ad32d5a..cec321fb74f2 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -12,59 +12,15 @@ phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
#endif
-#ifdef CONFIG_HIGHPTE
-#define PGTABLE_HIGHMEM __GFP_HIGHMEM
-#else
-#define PGTABLE_HIGHMEM 0
-#endif
-
-#ifndef CONFIG_PARAVIRT
-#ifndef CONFIG_PT_RECLAIM
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- struct ptdesc *ptdesc = (struct ptdesc *)table;
-
- pagetable_dtor(ptdesc);
- tlb_remove_page(tlb, ptdesc_page(ptdesc));
-}
-#else
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_table(tlb, table);
-}
-#endif /* !CONFIG_PT_RECLAIM */
-#endif /* !CONFIG_PARAVIRT */
-
-gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
-
pgtable_t pte_alloc_one(struct mm_struct *mm)
{
- return __pte_alloc_one(mm, __userpte_alloc_gfp);
-}
-
-static int __init setup_userpte(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- /*
- * "userpte=nohigh" disables allocation of user pagetables in
- * high memory.
- */
- if (strcmp(arg, "nohigh") == 0)
- __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
- else
- return -EINVAL;
- return 0;
+ return __pte_alloc_one(mm, GFP_PGTABLE_USER);
}
-early_param("userpte", setup_userpte);
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
paravirt_release_pte(page_to_pfn(pte));
- paravirt_tlb_remove_table(tlb, page_ptdesc(pte));
+ tlb_remove_table(tlb, page_ptdesc(pte));
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -78,21 +34,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
#ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
#endif
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd));
+ tlb_remove_table(tlb, virt_to_ptdesc(pmd));
}
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud));
+ tlb_remove_table(tlb, virt_to_ptdesc(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d));
+ tlb_remove_table(tlb, virt_to_ptdesc(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6cf881a942bb..e459d97ef397 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -74,13 +74,15 @@
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
- * the canonical identifier for an mm
+ * the canonical identifier for an mm, dynamically allocated on each CPU
+ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
+ * the canonical, global identifier for an mm, identical across all CPUs
*
- * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * kPCID - [1, MAX_ASID_AVAILABLE]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
@@ -225,6 +227,20 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
return;
}
+ /*
+ * TLB consistency for global ASIDs is maintained with hardware assisted
+ * remote TLB flushing. Global ASIDs are always up to date.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ u16 global_asid = mm_global_asid(next);
+
+ if (global_asid) {
+ *new_asid = global_asid;
+ *need_flush = false;
+ return;
+ }
+ }
+
if (this_cpu_read(cpu_tlbstate.invalidate_other))
clear_asid_other();
@@ -252,6 +268,268 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
}
/*
+ * Global ASIDs are allocated for multi-threaded processes that are
+ * active on multiple CPUs simultaneously, giving each of those
+ * processes the same PCID on every CPU, for use with hardware-assisted
+ * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR.
+ *
+ * These global ASIDs are held for the lifetime of the process.
+ */
+static DEFINE_RAW_SPINLOCK(global_asid_lock);
+static u16 last_global_asid = MAX_ASID_AVAILABLE;
+static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE);
+static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE);
+static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
+
+/*
+ * When the search for a free ASID in the global ASID space reaches
+ * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously
+ * freed global ASIDs are safe to re-use.
+ *
+ * This way the global flush only needs to happen at ASID rollover
+ * time, and not at ASID allocation time.
+ */
+static void reset_global_asid_space(void)
+{
+ lockdep_assert_held(&global_asid_lock);
+
+ invlpgb_flush_all_nonglobals();
+
+ /*
+ * The TLB flush above makes it safe to re-use the previously
+ * freed global ASIDs.
+ */
+ bitmap_andnot(global_asid_used, global_asid_used,
+ global_asid_freed, MAX_ASID_AVAILABLE);
+ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
+
+ /* Restart the search from the start of global ASID space. */
+ last_global_asid = TLB_NR_DYN_ASIDS;
+}
+
+static u16 allocate_global_asid(void)
+{
+ u16 asid;
+
+ lockdep_assert_held(&global_asid_lock);
+
+ /* The previous allocation hit the edge of available address space */
+ if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
+ reset_global_asid_space();
+
+ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);
+
+ if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) {
+ /* This should never happen. */
+ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n",
+ global_asid_available);
+ return 0;
+ }
+
+ /* Claim this global ASID. */
+ __set_bit(asid, global_asid_used);
+ last_global_asid = asid;
+ global_asid_available--;
+ return asid;
+}
+
+/*
+ * Check whether a process is currently active on more than @threshold CPUs.
+ * This is a cheap estimation on whether or not it may make sense to assign
+ * a global ASID to this process, and use broadcast TLB invalidation.
+ */
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
+{
+ int count = 0;
+ int cpu;
+
+ /* This quick check should eliminate most single threaded programs. */
+ if (cpumask_weight(mm_cpumask(mm)) <= threshold)
+ return false;
+
+ /* Slower check to make sure. */
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /* Skip the CPUs that aren't really running this process. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
+ continue;
+
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ continue;
+
+ if (++count > threshold)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Assign a global ASID to the current process, protecting against
+ * races between multiple threads in the process.
+ */
+static void use_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* This process is already using broadcast TLB invalidation. */
+ if (mm_global_asid(mm))
+ return;
+
+ /*
+ * The last global ASID was consumed while waiting for the lock.
+ *
+ * If this fires, a more aggressive ASID reuse scheme might be
+ * needed.
+ */
+ if (!global_asid_available) {
+ VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
+ return;
+ }
+
+ asid = allocate_global_asid();
+ if (!asid)
+ return;
+
+ mm_assign_global_asid(mm, asid);
+}
+
+void mm_free_global_asid(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return;
+
+ if (!mm_global_asid(mm))
+ return;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* The global ASID can be re-used only after flush at wrap-around. */
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+ __set_bit(mm->context.global_asid, global_asid_freed);
+
+ mm->context.global_asid = 0;
+ global_asid_available++;
+#endif
+}
+
+/*
+ * Is the mm transitioning from a CPU-local ASID to a global ASID?
+ */
+static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
+{
+ u16 global_asid = mm_global_asid(mm);
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ /* Process is transitioning to a global ASID */
+ if (global_asid && asid != global_asid)
+ return true;
+
+ return false;
+}
+
+/*
+ * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86
+ * systems have over 8k CPUs. Because of this potential ASID shortage,
+ * global ASIDs are handed out to processes that have frequent TLB
+ * flushes and are active on 4 or more CPUs simultaneously.
+ */
+static void consider_global_asid(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return;
+
+ /* Check every once in a while. */
+ if ((current->pid & 0x1f) != (jiffies & 0x1f))
+ return;
+
+ /*
+ * Assign a global ASID if the process is active on
+ * 4 or more CPUs simultaneously.
+ */
+ if (mm_active_cpus_exceeds(mm, 3))
+ use_global_asid(mm);
+}
+
+static void finish_asid_transition(struct flush_tlb_info *info)
+{
+ struct mm_struct *mm = info->mm;
+ int bc_asid = mm_global_asid(mm);
+ int cpu;
+
+ if (!mm_in_asid_transition(mm))
+ return;
+
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /*
+ * The remote CPU is context switching. Wait for that to
+ * finish, to catch the unlikely case of it switching to
+ * the target mm with an out of date ASID.
+ */
+ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
+ cpu_relax();
+
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
+ continue;
+
+ /*
+ * If at least one CPU is not using the global ASID yet,
+ * send a TLB flush IPI. The IPI should cause stragglers
+ * to transition soon.
+ *
+ * This can race with the CPU switching to another task;
+ * that results in a (harmless) extra IPI.
+ */
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
+ flush_tlb_multi(mm_cpumask(info->mm), info);
+ return;
+ }
+ }
+
+ /* All the CPUs running this process are using the global ASID. */
+ mm_clear_asid_transition(mm);
+}
+
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+ bool pmd = info->stride_shift == PMD_SHIFT;
+ unsigned long asid = mm_global_asid(info->mm);
+ unsigned long addr = info->start;
+
+ /*
+ * TLB flushes with INVLPGB are kicked off asynchronously.
+ * The inc_mm_tlb_gen() guarantees page table updates are done
+ * before these TLB flushes happen.
+ */
+ if (info->end == TLB_FLUSH_ALL) {
+ invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ invlpgb_flush_single_pcid_nosync(user_pcid(asid));
+ } else do {
+ unsigned long nr = 1;
+
+ if (info->stride_shift <= PMD_SHIFT) {
+ nr = (info->end - addr) >> info->stride_shift;
+ nr = clamp_val(nr, 1, invlpgb_count_max);
+ }
+
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+
+ addr += nr << info->stride_shift;
+ } while (addr < info->end);
+
+ finish_asid_transition(info);
+
+ /* Wait for the INVLPGBs kicked off above to finish. */
+ __tlbsync();
+}
+
+/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*
@@ -447,8 +725,7 @@ static void cond_mitigation(struct task_struct *next)
* different context than the user space task which ran
* last on this CPU.
*/
- if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) !=
- (unsigned long)next->mm)
+ if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != (unsigned long)next->mm)
indirect_branch_prediction_barrier();
}
@@ -556,7 +833,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
*/
if (prev == next) {
/* Not actually switching mm's */
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ VM_WARN_ON(is_dyn_asid(prev_asid) &&
+ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
/*
@@ -573,6 +851,20 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
+ /* Check if the current mm is transitioning to a global ASID */
+ if (mm_needs_global_asid(next, prev_asid)) {
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ goto reload_tlb;
+ }
+
+ /*
+ * Broadcast TLB invalidation keeps this ASID up to date
+ * all the time.
+ */
+ if (is_global_asid(prev_asid))
+ return;
+
/*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
@@ -607,6 +899,13 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
cond_mitigation(tsk);
/*
+ * Let nmi_uaccess_okay() and finish_asid_transition()
+ * know that CR3 is changing.
+ */
+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+ barrier();
+
+ /*
* Leave this CPU in prev's mm_cpumask. Atomic writes to
* mm_cpumask can be expensive under contention. The CPU
* will be removed lazily at TLB flush time.
@@ -620,14 +919,12 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
-
- /* Let nmi_uaccess_okay() know that we're changing CR3. */
- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
- barrier();
}
+reload_tlb:
new_lam = mm_lam_cr3_mask(next);
if (need_flush) {
+ VM_WARN_ON_ONCE(is_global_asid(new_asid));
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
@@ -746,7 +1043,7 @@ static void flush_tlb_func(void *info)
const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+ u64 local_tlb_gen;
bool local = smp_processor_id() == f->initiating_cpu;
unsigned long nr_invalidate = 0;
u64 mm_tlb_gen;
@@ -769,6 +1066,16 @@ static void flush_tlb_func(void *info)
if (unlikely(loaded_mm == &init_mm))
return;
+ /* Reload the ASID if transitioning into or out of a global ASID */
+ if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) {
+ switch_mm_irqs_off(NULL, loaded_mm, NULL);
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ }
+
+ /* Broadcast ASIDs are always kept up to date with INVLPGB. */
+ if (is_global_asid(loaded_mm_asid))
+ return;
+
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
@@ -786,6 +1093,8 @@ static void flush_tlb_func(void *info)
return;
}
+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
f->new_tlb_gen <= local_tlb_gen)) {
/*
@@ -953,7 +1262,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
- if (info->freed_tables)
+ if (info->freed_tables || mm_in_asid_transition(info->mm))
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
@@ -1000,6 +1309,15 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
#endif
+ /*
+ * If the number of flushes is so large that a full flush
+ * would be faster, do a full flush.
+ */
+ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+ start = 0;
+ end = TLB_FLUSH_ALL;
+ }
+
info->start = start;
info->end = end;
info->mm = mm;
@@ -1026,17 +1344,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
bool freed_tables)
{
struct flush_tlb_info *info;
+ int cpu = get_cpu();
u64 new_tlb_gen;
- int cpu;
-
- cpu = get_cpu();
-
- /* Should we flush just the requested range? */
- if ((end == TLB_FLUSH_ALL) ||
- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
- start = 0;
- end = TLB_FLUSH_ALL;
- }
/* This is also a barrier that synchronizes with switch_mm(). */
new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1049,9 +1358,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ if (mm_global_asid(mm)) {
+ broadcast_tlb_flush(info);
+ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
+ consider_global_asid(mm);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
@@ -1064,7 +1376,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
-
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -1074,7 +1385,32 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
- on_each_cpu(do_flush_tlb_all, NULL, 1);
+
+ /* First try (faster) hardware-assisted TLB invalidation. */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_flush_all();
+ else
+ /* Fall back to the IPI-based invalidation. */
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+/* Flush an arbitrarily large range of memory with INVLPGB. */
+static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
+{
+ unsigned long addr, nr;
+
+ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
+ nr = (info->end - addr) >> PAGE_SHIFT;
+
+ /*
+ * INVLPGB has a limit on the size of ranges it can
+ * flush. Break up large flushes.
+ */
+ nr = clamp_val(nr, 1, invlpgb_count_max);
+
+ invlpgb_flush_addr_nosync(addr, nr);
+ }
+ __tlbsync();
}
static void do_kernel_range_flush(void *info)
@@ -1087,24 +1423,37 @@ static void do_kernel_range_flush(void *info)
flush_tlb_one_kernel(addr);
}
-void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+static void kernel_tlb_flush_all(struct flush_tlb_info *info)
{
- /* Balance as user space task's flush, a bit conservative */
- if (end == TLB_FLUSH_ALL ||
- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_flush_all();
+ else
on_each_cpu(do_flush_tlb_all, NULL, 1);
- } else {
- struct flush_tlb_info *info;
-
- preempt_disable();
- info = get_flush_tlb_info(NULL, start, end, 0, false,
- TLB_GENERATION_INVALID);
+}
+static void kernel_tlb_flush_range(struct flush_tlb_info *info)
+{
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_kernel_range_flush(info);
+ else
on_each_cpu(do_kernel_range_flush, info, 1);
+}
- put_flush_tlb_info();
- preempt_enable();
- }
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ struct flush_tlb_info *info;
+
+ guard(preempt)();
+
+ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+ TLB_GENERATION_INVALID);
+
+ if (info->end == TLB_FLUSH_ALL)
+ kernel_tlb_flush_all(info);
+ else
+ kernel_tlb_flush_range(info);
+
+ put_flush_tlb_info();
}
/*
@@ -1283,7 +1632,10 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) {
+ invlpgb_flush_all_nonglobals();
+ batch->unmapped_pages = false;
+ } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
@@ -1325,7 +1677,7 @@ bool nmi_uaccess_okay(void)
if (loaded_mm != current_mm)
return false;
- VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+ VM_WARN_ON_ONCE(__pa(current_mm->pgd) != read_cr3_pa());
return true;
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index a43fc5af973d..72776dcb75aa 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -410,16 +410,20 @@ static void emit_nops(u8 **pprog, int len)
* Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT
* in arch/x86/kernel/alternative.c
*/
+static int emit_call(u8 **prog, void *func, void *ip);
-static void emit_fineibt(u8 **pprog, u32 hash)
+static void emit_fineibt(u8 **pprog, u8 *ip, u32 hash, int arity)
{
u8 *prog = *pprog;
EMIT_ENDBR();
EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */
- EMIT2(0x74, 0x07); /* jz.d8 +7 */
- EMIT2(0x0f, 0x0b); /* ud2 */
- EMIT1(0x90); /* nop */
+ if (cfi_bhi) {
+ emit_call(&prog, __bhi_args[arity], ip + 11);
+ } else {
+ EMIT2(0x75, 0xf9); /* jne.d8 .-7 */
+ EMIT3(0x0f, 0x1f, 0x00); /* nop3 */
+ }
EMIT_ENDBR_POISON();
*pprog = prog;
@@ -448,13 +452,13 @@ static void emit_kcfi(u8 **pprog, u32 hash)
*pprog = prog;
}
-static void emit_cfi(u8 **pprog, u32 hash)
+static void emit_cfi(u8 **pprog, u8 *ip, u32 hash, int arity)
{
u8 *prog = *pprog;
switch (cfi_mode) {
case CFI_FINEIBT:
- emit_fineibt(&prog, hash);
+ emit_fineibt(&prog, ip, hash, arity);
break;
case CFI_KCFI:
@@ -505,13 +509,17 @@ static void emit_prologue_tail_call(u8 **pprog, bool is_subprog)
* bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
* while jumping to another program
*/
-static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
+static void emit_prologue(u8 **pprog, u8 *ip, u32 stack_depth, bool ebpf_from_cbpf,
bool tail_call_reachable, bool is_subprog,
bool is_exception_cb)
{
u8 *prog = *pprog;
- emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash);
+ if (is_subprog) {
+ emit_cfi(&prog, ip, cfi_bpf_subprog_hash, 5);
+ } else {
+ emit_cfi(&prog, ip, cfi_bpf_hash, 1);
+ }
/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
*/
@@ -641,7 +649,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
* See emit_prologue(), for IBT builds the trampoline hook is preceded
* with an ENDBR instruction.
*/
- if (is_endbr(*(u32 *)ip))
+ if (is_endbr(ip))
ip += ENDBR_INSN_SIZE;
return __bpf_arch_text_poke(ip, t, old_addr, new_addr);
@@ -1480,7 +1488,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
detect_reg_usage(insn, insn_cnt, callee_regs_used);
- emit_prologue(&prog, stack_depth,
+ emit_prologue(&prog, image, stack_depth,
bpf_prog_was_classic(bpf_prog), tail_call_reachable,
bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
/* Exception callback will clobber callee regs for its own use, and
@@ -3036,7 +3044,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
/* skip patched call instruction and point orig_call to actual
* body of the kernel function.
*/
- if (is_endbr(*(u32 *)orig_call))
+ if (is_endbr(orig_call))
orig_call += ENDBR_INSN_SIZE;
orig_call += X86_PATCH_SIZE;
}
@@ -3047,7 +3055,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
/*
* Indirect call for bpf_struct_ops
*/
- emit_cfi(&prog, cfi_get_func_hash(func_addr));
+ emit_cfi(&prog, image,
+ cfi_get_func_hash(func_addr),
+ cfi_get_func_arity(func_addr));
} else {
/*
* Direct-call fentry stub, as such it needs accounting for the
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 48bcada5cabe..4933fb337983 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -12,8 +12,6 @@ obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
obj-$(CONFIG_ACPI) += acpi.o
obj-y += legacy.o irq.o
-obj-$(CONFIG_STA2X11) += sta2x11-fixup.o
-
obj-$(CONFIG_X86_NUMACHIP) += numachip.o
obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
deleted file mode 100644
index 8c8ddc4dcc08..000000000000
--- a/arch/x86/pci/sta2x11-fixup.c
+++ /dev/null
@@ -1,233 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * DMA translation between STA2x11 AMBA memory mapping and the x86 memory mapping
- *
- * ST Microelectronics ConneXt (STA2X11/STA2X10)
- *
- * Copyright (c) 2010-2011 Wind River Systems, Inc.
- */
-
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
-#include <linux/export.h>
-#include <linux/list.h>
-#include <linux/dma-map-ops.h>
-#include <linux/swiotlb.h>
-#include <asm/iommu.h>
-#include <asm/sta2x11.h>
-
-#define STA2X11_SWIOTLB_SIZE (4*1024*1024)
-
-/*
- * We build a list of bus numbers that are under the ConneXt. The
- * main bridge hosts 4 busses, which are the 4 endpoints, in order.
- */
-#define STA2X11_NR_EP 4 /* 0..3 included */
-#define STA2X11_NR_FUNCS 8 /* 0..7 included */
-#define STA2X11_AMBA_SIZE (512 << 20)
-
-struct sta2x11_ahb_regs { /* saved during suspend */
- u32 base, pexlbase, pexhbase, crw;
-};
-
-struct sta2x11_mapping {
- int is_suspended;
- struct sta2x11_ahb_regs regs[STA2X11_NR_FUNCS];
-};
-
-struct sta2x11_instance {
- struct list_head list;
- int bus0;
- struct sta2x11_mapping map[STA2X11_NR_EP];
-};
-
-static LIST_HEAD(sta2x11_instance_list);
-
-/* At probe time, record new instances of this bridge (likely one only) */
-static void sta2x11_new_instance(struct pci_dev *pdev)
-{
- struct sta2x11_instance *instance;
-
- instance = kzalloc(sizeof(*instance), GFP_ATOMIC);
- if (!instance)
- return;
- /* This has a subordinate bridge, with 4 more-subordinate ones */
- instance->bus0 = pdev->subordinate->number + 1;
-
- if (list_empty(&sta2x11_instance_list)) {
- int size = STA2X11_SWIOTLB_SIZE;
- /* First instance: register your own swiotlb area */
- dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size);
- if (swiotlb_init_late(size, GFP_DMA, NULL))
- dev_emerg(&pdev->dev, "init swiotlb failed\n");
- }
- list_add(&instance->list, &sta2x11_instance_list);
-}
-DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, 0xcc17, sta2x11_new_instance);
-
-/*
- * Utility functions used in this file from below
- */
-static struct sta2x11_instance *sta2x11_pdev_to_instance(struct pci_dev *pdev)
-{
- struct sta2x11_instance *instance;
- int ep;
-
- list_for_each_entry(instance, &sta2x11_instance_list, list) {
- ep = pdev->bus->number - instance->bus0;
- if (ep >= 0 && ep < STA2X11_NR_EP)
- return instance;
- }
- return NULL;
-}
-
-static int sta2x11_pdev_to_ep(struct pci_dev *pdev)
-{
- struct sta2x11_instance *instance;
-
- instance = sta2x11_pdev_to_instance(pdev);
- if (!instance)
- return -1;
-
- return pdev->bus->number - instance->bus0;
-}
-
-/* This is exported, as some devices need to access the MFD registers */
-struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev)
-{
- return sta2x11_pdev_to_instance(pdev);
-}
-EXPORT_SYMBOL(sta2x11_get_instance);
-
-/* At setup time, we use our own ops if the device is a ConneXt one */
-static void sta2x11_setup_pdev(struct pci_dev *pdev)
-{
- struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev);
-
- if (!instance) /* either a sta2x11 bridge or another ST device */
- return;
-
- /* We must enable all devices as master, for audio DMA to work */
- pci_set_master(pdev);
-}
-DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_setup_pdev);
-
-/*
- * At boot we must set up the mappings for the pcie-to-amba bridge.
- * It involves device access, and the same happens at suspend/resume time
- */
-
-#define AHB_MAPB 0xCA4
-#define AHB_CRW(i) (AHB_MAPB + 0 + (i) * 0x10)
-#define AHB_CRW_SZMASK 0xfffffc00UL
-#define AHB_CRW_ENABLE (1 << 0)
-#define AHB_CRW_WTYPE_MEM (2 << 1)
-#define AHB_CRW_ROE (1UL << 3) /* Relax Order Ena */
-#define AHB_CRW_NSE (1UL << 4) /* No Snoop Enable */
-#define AHB_BASE(i) (AHB_MAPB + 4 + (i) * 0x10)
-#define AHB_PEXLBASE(i) (AHB_MAPB + 8 + (i) * 0x10)
-#define AHB_PEXHBASE(i) (AHB_MAPB + 12 + (i) * 0x10)
-
-/* At probe time, enable mapping for each endpoint, using the pdev */
-static void sta2x11_map_ep(struct pci_dev *pdev)
-{
- struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev);
- struct device *dev = &pdev->dev;
- u32 amba_base, max_amba_addr;
- int i, ret;
-
- if (!instance)
- return;
-
- pci_read_config_dword(pdev, AHB_BASE(0), &amba_base);
- max_amba_addr = amba_base + STA2X11_AMBA_SIZE - 1;
-
- ret = dma_direct_set_offset(dev, 0, amba_base, STA2X11_AMBA_SIZE);
- if (ret)
- dev_err(dev, "sta2x11: could not set DMA offset\n");
-
- dev->bus_dma_limit = max_amba_addr;
- dma_set_mask_and_coherent(&pdev->dev, max_amba_addr);
-
- /* Configure AHB mapping */
- pci_write_config_dword(pdev, AHB_PEXLBASE(0), 0);
- pci_write_config_dword(pdev, AHB_PEXHBASE(0), 0);
- pci_write_config_dword(pdev, AHB_CRW(0), STA2X11_AMBA_SIZE |
- AHB_CRW_WTYPE_MEM | AHB_CRW_ENABLE);
-
- /* Disable all the other windows */
- for (i = 1; i < STA2X11_NR_FUNCS; i++)
- pci_write_config_dword(pdev, AHB_CRW(i), 0);
-
- dev_info(&pdev->dev,
- "sta2x11: Map EP %i: AMBA address %#8x-%#8x\n",
- sta2x11_pdev_to_ep(pdev), amba_base, max_amba_addr);
-}
-DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_map_ep);
-
-#ifdef CONFIG_PM /* Some register values must be saved and restored */
-
-static struct sta2x11_mapping *sta2x11_pdev_to_mapping(struct pci_dev *pdev)
-{
- struct sta2x11_instance *instance;
- int ep;
-
- instance = sta2x11_pdev_to_instance(pdev);
- if (!instance)
- return NULL;
- ep = sta2x11_pdev_to_ep(pdev);
- return instance->map + ep;
-}
-
-static void suspend_mapping(struct pci_dev *pdev)
-{
- struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev);
- int i;
-
- if (!map)
- return;
-
- if (map->is_suspended)
- return;
- map->is_suspended = 1;
-
- /* Save all window configs */
- for (i = 0; i < STA2X11_NR_FUNCS; i++) {
- struct sta2x11_ahb_regs *regs = map->regs + i;
-
- pci_read_config_dword(pdev, AHB_BASE(i), &regs->base);
- pci_read_config_dword(pdev, AHB_PEXLBASE(i), &regs->pexlbase);
- pci_read_config_dword(pdev, AHB_PEXHBASE(i), &regs->pexhbase);
- pci_read_config_dword(pdev, AHB_CRW(i), &regs->crw);
- }
-}
-DECLARE_PCI_FIXUP_SUSPEND(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, suspend_mapping);
-
-static void resume_mapping(struct pci_dev *pdev)
-{
- struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev);
- int i;
-
- if (!map)
- return;
-
-
- if (!map->is_suspended)
- goto out;
- map->is_suspended = 0;
-
- /* Restore all window configs */
- for (i = 0; i < STA2X11_NR_FUNCS; i++) {
- struct sta2x11_ahb_regs *regs = map->regs + i;
-
- pci_write_config_dword(pdev, AHB_BASE(i), regs->base);
- pci_write_config_dword(pdev, AHB_PEXLBASE(i), regs->pexlbase);
- pci_write_config_dword(pdev, AHB_PEXHBASE(i), regs->pexhbase);
- pci_write_config_dword(pdev, AHB_CRW(i), regs->crw);
- }
-out:
- pci_set_master(pdev); /* Like at boot, enable master on all devices */
-}
-DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, resume_mapping);
-
-#endif /* CONFIG_PM */
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 0f2fe524f60d..b8755cde2419 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -436,7 +436,8 @@ static struct msi_domain_ops xen_pci_msi_domain_ops = {
};
static struct msi_domain_info xen_pci_msi_domain_info = {
- .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS | MSI_FLAG_DEV_SYSFS,
+ .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS |
+ MSI_FLAG_DEV_SYSFS | MSI_FLAG_NO_MASK,
.ops = &xen_pci_msi_domain_ops,
};
@@ -484,11 +485,6 @@ static __init void xen_setup_pci_msi(void)
* in allocating the native domain and never use it.
*/
x86_init.irqs.create_pci_msi_domain = xen_create_pci_msi_domain;
- /*
- * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely
- * controlled by the hypervisor.
- */
- pci_msi_ignore_mask = 1;
}
#else /* CONFIG_PCI_MSI */
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index ccb23c73cbe8..63066e7c8517 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -14,7 +14,6 @@
#include <linux/interrupt.h>
#include <linux/platform_device.h>
#include <linux/pm.h>
-#include <linux/pm_wakeup.h>
#include <linux/power_supply.h>
#include <linux/suspend.h>
#include <linux/workqueue.h>
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index cf5dca2dbb91..e108ce7dad6a 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -215,13 +215,12 @@ static u32 __init olpc_dt_get_board_revision(void)
static int __init olpc_dt_compatible_match(phandle node, const char *compat)
{
char buf[64], *p;
- int plen, len;
+ int plen;
plen = olpc_dt_getproperty(node, "compatible", buf, sizeof(buf));
if (plen <= 0)
return 0;
- len = strlen(compat);
for (p = buf; p < buf + plen; p += strlen(p) + 1) {
if (strcmp(p, compat) == 0)
return 1;
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 4733a5f467b8..cfa18ec7d55f 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -173,10 +173,14 @@ SYM_CODE_START(pvh_start_xen)
1:
UNWIND_HINT_END_OF_STACK
- /* Set base address in stack canary descriptor. */
- mov $MSR_GS_BASE,%ecx
- leal canary(%rip), %eax
- xor %edx, %edx
+ /*
+ * Set up GSBASE.
+ * Note that on SMP the boot CPU uses the init data section until
+ * the per-CPU areas are set up.
+ */
+ movl $MSR_GS_BASE,%ecx
+ xorl %eax, %eax
+ xorl %edx, %edx
wrmsr
/* Call xen_prepare_pvh() via the kernel virtual mapping */
@@ -238,8 +242,6 @@ SYM_DATA_START_LOCAL(gdt_start)
SYM_DATA_END_LABEL(gdt_start, SYM_L_LOCAL, gdt_end)
.balign 16
-SYM_DATA_LOCAL(canary, .fill 48, 1, 0)
-
SYM_DATA_START_LOCAL(early_stack)
.fill BOOT_STACK_SIZE, 1, 0
SYM_DATA_END_LABEL(early_stack, SYM_L_LOCAL, early_stack_end)
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index 0a0539e1cc81..8c534c36adfa 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -26,6 +26,7 @@
/* code below belongs to the image kernel */
.align PAGE_SIZE
SYM_FUNC_START(restore_registers)
+ ANNOTATE_NOENDBR
/* go back to the original page tables */
movq %r9, %cr3
@@ -119,6 +120,7 @@ SYM_FUNC_END(restore_image)
/* code below has been relocated to a safe page */
SYM_FUNC_START(core_restore_code)
+ ANNOTATE_NOENDBR
/* switch to temporary page tables */
movq %rax, %cr3
/* flush TLB */
diff --git a/arch/x86/realmode/rm/realmode.h b/arch/x86/realmode/rm/realmode.h
index c76041a35397..867e55f1d6af 100644
--- a/arch/x86/realmode/rm/realmode.h
+++ b/arch/x86/realmode/rm/realmode.h
@@ -2,7 +2,7 @@
#ifndef ARCH_X86_REALMODE_RM_REALMODE_H
#define ARCH_X86_REALMODE_RM_REALMODE_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
/*
* 16-bit ljmpw to the real_mode_seg
@@ -12,7 +12,7 @@
*/
#define LJMPW_RM(to) .byte 0xea ; .word (to), real_mode_seg
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
* Signature at the end of the realmode region
diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h
index 0e4fd08ae447..3b6d8fa82d3e 100644
--- a/arch/x86/realmode/rm/wakeup.h
+++ b/arch/x86/realmode/rm/wakeup.h
@@ -7,7 +7,7 @@
#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
/* This must match data at wakeup.S */
diff --git a/arch/x86/tools/cpufeaturemasks.awk b/arch/x86/tools/cpufeaturemasks.awk
new file mode 100755
index 000000000000..173d5bf2d999
--- /dev/null
+++ b/arch/x86/tools/cpufeaturemasks.awk
@@ -0,0 +1,88 @@
+#!/usr/bin/awk
+#
+# Convert cpufeatures.h to a list of compile-time masks
+# Note: this blithely assumes that each word has at least one
+# feature defined in it; if not, something else is wrong!
+#
+
+BEGIN {
+ printf "#ifndef _ASM_X86_CPUFEATUREMASKS_H\n";
+ printf "#define _ASM_X86_CPUFEATUREMASKS_H\n\n";
+
+ file = 0
+}
+
+FNR == 1 {
+ ++file;
+
+ # arch/x86/include/asm/cpufeatures.h
+ if (file == 1)
+ FS = "[ \t()*+]+";
+
+ # .config
+ if (file == 2)
+ FS = "=";
+}
+
+# Create a dictionary of sorts, containing all defined feature bits
+file == 1 && $1 ~ /^#define$/ && $2 ~ /^X86_FEATURE_/ {
+ nfeat = $3 * $4 + $5;
+ feat = $2;
+ sub(/^X86_FEATURE_/, "", feat);
+ feats[nfeat] = feat;
+}
+file == 1 && $1 ~ /^#define$/ && $2 == "NCAPINTS" {
+ ncapints = int($3);
+}
+
+# Create a dictionary featstat[REQUIRED|DISABLED, FEATURE_NAME] = on | off
+file == 2 && $1 ~ /^CONFIG_X86_(REQUIRED|DISABLED)_FEATURE_/ {
+ on = ($2 == "y");
+ if (split($1, fs, "CONFIG_X86_|_FEATURE_") == 3)
+ featstat[fs[2], fs[3]] = on;
+}
+
+END {
+ sets[1] = "REQUIRED";
+ sets[2] = "DISABLED";
+
+ for (ns in sets) {
+ s = sets[ns];
+
+ printf "/*\n";
+ printf " * %s features:\n", s;
+ printf " *\n";
+ fstr = "";
+ for (i = 0; i < ncapints; i++) {
+ mask = 0;
+ for (j = 0; j < 32; j++) {
+ feat = feats[i*32 + j];
+ if (featstat[s, feat]) {
+ nfstr = fstr " " feat;
+ if (length(nfstr) > 72) {
+ printf " * %s\n", fstr;
+ nfstr = " " feat;
+ }
+ fstr = nfstr;
+ mask += (2 ^ j);
+ }
+ }
+ masks[i] = mask;
+ }
+ printf " * %s\n */\n", fstr;
+
+ for (i = 0; i < ncapints; i++)
+ printf "#define %s_MASK%d\t0x%08xU\n", s, i, masks[i];
+
+ printf "\n#define %s_MASK_BIT_SET(x)\t\t\t\\\n", s;
+ printf "\t((\t\t\t\t\t";
+ for (i = 0; i < ncapints; i++) {
+ if (masks[i])
+ printf "\t\\\n\t\t((x) >> 5) == %2d ? %s_MASK%d :", i, s, i;
+ }
+ printf " 0\t\\\n";
+ printf "\t) & (1U << ((x) & 31)))\n\n";
+ }
+
+ printf "#endif /* _ASM_X86_CPUFEATUREMASKS_H */\n";
+}
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index e937be979ec8..5778bc498415 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -29,9 +29,13 @@ static struct relocs relocs16;
static struct relocs relocs32;
#if ELF_BITS == 64
-static struct relocs relocs32neg;
static struct relocs relocs64;
# define FMT PRIu64
+
+#ifndef R_X86_64_REX_GOTPCRELX
+# define R_X86_64_REX_GOTPCRELX 42
+#endif
+
#else
# define FMT PRIu32
#endif
@@ -86,8 +90,6 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
"__initramfs_start|"
"(jiffies|jiffies_64)|"
#if ELF_BITS == 64
- "__per_cpu_load|"
- "init_per_cpu__.*|"
"__end_rodata_hpage_align|"
#endif
"_end)$"
@@ -227,6 +229,7 @@ static const char *rel_type(unsigned type)
REL_TYPE(R_X86_64_PC16),
REL_TYPE(R_X86_64_8),
REL_TYPE(R_X86_64_PC8),
+ REL_TYPE(R_X86_64_REX_GOTPCRELX),
#else
REL_TYPE(R_386_NONE),
REL_TYPE(R_386_32),
@@ -284,34 +287,6 @@ static const char *sym_name(const char *sym_strtab, Elf_Sym *sym)
return name;
}
-static Elf_Sym *sym_lookup(const char *symname)
-{
- int i;
-
- for (i = 0; i < shnum; i++) {
- struct section *sec = &secs[i];
- long nsyms;
- char *strtab;
- Elf_Sym *symtab;
- Elf_Sym *sym;
-
- if (sec->shdr.sh_type != SHT_SYMTAB)
- continue;
-
- nsyms = sec->shdr.sh_size/sizeof(Elf_Sym);
- symtab = sec->symtab;
- strtab = sec->link->strtab;
-
- for (sym = symtab; --nsyms >= 0; sym++) {
- if (!sym->st_name)
- continue;
- if (strcmp(symname, strtab + sym->st_name) == 0)
- return sym;
- }
- }
- return 0;
-}
-
#if BYTE_ORDER == LITTLE_ENDIAN
# define le16_to_cpu(val) (val)
# define le32_to_cpu(val) (val)
@@ -760,84 +735,8 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
}
}
-/*
- * The .data..percpu section is a special case for x86_64 SMP kernels.
- * It is used to initialize the actual per_cpu areas and to provide
- * definitions for the per_cpu variables that correspond to their offsets
- * within the percpu area. Since the values of all of the symbols need
- * to be offsets from the start of the per_cpu area the virtual address
- * (sh_addr) of .data..percpu is 0 in SMP kernels.
- *
- * This means that:
- *
- * Relocations that reference symbols in the per_cpu area do not
- * need further relocation (since the value is an offset relative
- * to the start of the per_cpu area that does not change).
- *
- * Relocations that apply to the per_cpu area need to have their
- * offset adjusted by by the value of __per_cpu_load to make them
- * point to the correct place in the loaded image (because the
- * virtual address of .data..percpu is 0).
- *
- * For non SMP kernels .data..percpu is linked as part of the normal
- * kernel data and does not require special treatment.
- *
- */
-static int per_cpu_shndx = -1;
-static Elf_Addr per_cpu_load_addr;
-
-static void percpu_init(void)
-{
- int i;
-
- for (i = 0; i < shnum; i++) {
- ElfW(Sym) *sym;
-
- if (strcmp(sec_name(i), ".data..percpu"))
- continue;
-
- if (secs[i].shdr.sh_addr != 0) /* non SMP kernel */
- return;
-
- sym = sym_lookup("__per_cpu_load");
- if (!sym)
- die("can't find __per_cpu_load\n");
-
- per_cpu_shndx = i;
- per_cpu_load_addr = sym->st_value;
-
- return;
- }
-}
-
#if ELF_BITS == 64
-/*
- * Check to see if a symbol lies in the .data..percpu section.
- *
- * The linker incorrectly associates some symbols with the
- * .data..percpu section so we also need to check the symbol
- * name to make sure that we classify the symbol correctly.
- *
- * The GNU linker incorrectly associates:
- * __init_begin
- * __per_cpu_load
- *
- * The "gold" linker incorrectly associates:
- * init_per_cpu__fixed_percpu_data
- * init_per_cpu__gdt_page
- */
-static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
-{
- int shndx = sym_index(sym);
-
- return (shndx == per_cpu_shndx) &&
- strcmp(symname, "__init_begin") &&
- strcmp(symname, "__per_cpu_load") &&
- strncmp(symname, "init_per_cpu_", 13);
-}
-
-
static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
const char *symname)
{
@@ -848,12 +747,6 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
if (sym->st_shndx == SHN_UNDEF)
return 0;
- /*
- * Adjust the offset if this reloc applies to the percpu section.
- */
- if (sec->shdr.sh_info == per_cpu_shndx)
- offset += per_cpu_load_addr;
-
switch (r_type) {
case R_X86_64_NONE:
/* NONE can be ignored. */
@@ -861,33 +754,23 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
case R_X86_64_PC32:
case R_X86_64_PLT32:
+ case R_X86_64_REX_GOTPCRELX:
/*
- * PC relative relocations don't need to be adjusted unless
- * referencing a percpu symbol.
+ * PC relative relocations don't need to be adjusted.
*
* NB: R_X86_64_PLT32 can be treated as R_X86_64_PC32.
*/
- if (is_percpu_sym(sym, symname))
- add_reloc(&relocs32neg, offset);
break;
case R_X86_64_PC64:
/*
* Only used by jump labels
*/
- if (is_percpu_sym(sym, symname))
- die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n", symname);
break;
case R_X86_64_32:
case R_X86_64_32S:
case R_X86_64_64:
- /*
- * References to the percpu area don't need to be adjusted.
- */
- if (is_percpu_sym(sym, symname))
- break;
-
if (shn_abs) {
/*
* Whitelisted absolute symbols do not require
@@ -1055,7 +938,8 @@ static int cmp_relocs(const void *va, const void *vb)
static void sort_relocs(struct relocs *r)
{
- qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs);
+ if (r->count)
+ qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs);
}
static int write32(uint32_t v, FILE *f)
@@ -1099,7 +983,6 @@ static void emit_relocs(int as_text, int use_real_mode)
/* Order the relocations for more efficient processing */
sort_relocs(&relocs32);
#if ELF_BITS == 64
- sort_relocs(&relocs32neg);
sort_relocs(&relocs64);
#else
sort_relocs(&relocs16);
@@ -1131,13 +1014,6 @@ static void emit_relocs(int as_text, int use_real_mode)
/* Now print each relocation */
for (i = 0; i < relocs64.count; i++)
write_reloc(relocs64.offset[i], stdout);
-
- /* Print a stop */
- write_reloc(0, stdout);
-
- /* Now print each inverse 32-bit relocation */
- for (i = 0; i < relocs32neg.count; i++)
- write_reloc(relocs32neg.offset[i], stdout);
#endif
/* Print a stop */
@@ -1190,9 +1066,6 @@ void process(FILE *fp, int use_real_mode, int as_text,
read_symtabs(fp);
read_relocs(fp);
- if (ELF_BITS == 64)
- percpu_init();
-
if (show_absolute_syms) {
print_absolute_symbols();
return;
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index 42e74a5a7d78..fc473ca12c44 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -198,7 +198,6 @@ static void __init __snp_fixup_e820_tables(u64 pa)
pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa);
e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
- e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
if (!memblock_is_region_reserved(pa, PMD_SIZE))
memblock_reserve(pa, PMD_SIZE);
}
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 77e788e928cd..98d8a50d2aed 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -9,7 +9,7 @@ config XEN
select PARAVIRT_CLOCK
select X86_HV_CALLBACK_VECTOR
depends on X86_64 || (X86_32 && X86_PAE)
- depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MCORE2 || MATOM || MK8)
+ depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MATOM)
depends on X86_LOCAL_APIC && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 5e57835e999d..dcc2041f8e61 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -73,6 +73,7 @@
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/cpu.h>
+#include <asm/irq_stack.h>
#ifdef CONFIG_X86_IOPL_IOPERM
#include <asm/io_bitmap.h>
#endif
@@ -94,6 +95,44 @@ void *xen_initial_gdt;
static int xen_cpu_up_prepare_pv(unsigned int cpu);
static int xen_cpu_dead_pv(unsigned int cpu);
+#ifndef CONFIG_PREEMPTION
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+
+/*
+ * In case of scheduling the flag must be cleared and restored after
+ * returning from schedule as the task might move to a different CPU.
+ */
+static __always_inline bool get_and_clear_inhcall(void)
+{
+ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+ __this_cpu_write(xen_in_preemptible_hcall, false);
+ return inhcall;
+}
+
+static __always_inline void restore_inhcall(bool inhcall)
+{
+ __this_cpu_write(xen_in_preemptible_hcall, inhcall);
+}
+
+#else
+
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
+static __always_inline void restore_inhcall(bool inhcall) { }
+
+#endif
+
struct tls_descs {
struct desc_struct desc[3];
};
@@ -687,6 +726,36 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check)
}
#endif
+static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ inc_irq_stat(irq_hv_callback_count);
+
+ xen_evtchn_do_upcall();
+
+ set_irq_regs(old_regs);
+}
+
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ irqentry_state_t state = irqentry_enter(regs);
+ bool inhcall;
+
+ instrumentation_begin();
+ run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
+
+ inhcall = get_and_clear_inhcall();
+ if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
+ irqentry_exit_cond_resched();
+ instrumentation_end();
+ restore_inhcall(inhcall);
+ } else {
+ instrumentation_end();
+ irqentry_exit(regs, state);
+ }
+}
+
struct trap_array_entry {
void (*orig)(void);
void (*xen)(void);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index d078de2c952b..38971c6dcd4b 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2189,7 +2189,6 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = {
.flush_tlb_kernel = xen_flush_tlb,
.flush_tlb_one_user = xen_flush_tlb_one_user,
.flush_tlb_multi = xen_flush_tlb_multi,
- .tlb_remove_table = tlb_remove_table,
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 6863d3da7dec..688ff59318ae 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -70,7 +70,7 @@ static void cpu_bringup(void)
xen_enable_syscall();
}
cpu = smp_processor_id();
- smp_store_cpu_info(cpu);
+ identify_secondary_cpu(cpu);
set_cpu_sibling_map(cpu);
speculative_store_bypass_ht_init();
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index b518f36d1ca2..109af12f7647 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -51,6 +51,7 @@ SYM_FUNC_END(xen_hypercall_pv)
* non-zero.
*/
SYM_FUNC_START(xen_irq_disable_direct)
+ ENDBR
movb $1, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
RET
SYM_FUNC_END(xen_irq_disable_direct)
@@ -90,6 +91,7 @@ SYM_FUNC_END(check_events)
* then enter the hypervisor to get them handled.
*/
SYM_FUNC_START(xen_irq_enable_direct)
+ ENDBR
FRAME_BEGIN
/* Unmask events */
movb $0, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
@@ -120,6 +122,7 @@ SYM_FUNC_END(xen_irq_enable_direct)
* x86 use opposite senses (mask vs enable).
*/
SYM_FUNC_START(xen_save_fl_direct)
+ ENDBR
testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
setz %ah
addb %ah, %ah
@@ -127,6 +130,7 @@ SYM_FUNC_START(xen_save_fl_direct)
SYM_FUNC_END(xen_save_fl_direct)
SYM_FUNC_START(xen_read_cr2)
+ ENDBR
FRAME_BEGIN
_ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX
_ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX
@@ -135,6 +139,7 @@ SYM_FUNC_START(xen_read_cr2)
SYM_FUNC_END(xen_read_cr2);
SYM_FUNC_START(xen_read_cr2_direct)
+ ENDBR
FRAME_BEGIN
_ASM_MOV PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_arch_cr2), %_ASM_AX
FRAME_END
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 894edf8d6d62..5dad6c51cdc3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -31,16 +31,14 @@ SYM_CODE_START(startup_xen)
leaq __top_init_kernel_stack(%rip), %rsp
- /* Set up %gs.
- *
- * The base of %gs always points to fixed_percpu_data. If the
- * stack protector canary is enabled, it is located at %gs:40.
+ /*
+ * Set up GSBASE.
* Note that, on SMP, the boot cpu uses init data section until
* the per cpu areas are set up.
*/
movl $MSR_GS_BASE,%ecx
- movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax
- cdq
+ xorl %eax, %eax
+ xorl %edx, %edx
wrmsr
mov %rsi, %rdi
@@ -133,11 +131,13 @@ SYM_FUNC_START(xen_hypercall_hvm)
SYM_FUNC_END(xen_hypercall_hvm)
SYM_FUNC_START(xen_hypercall_amd)
+ ANNOTATE_NOENDBR
vmmcall
RET
SYM_FUNC_END(xen_hypercall_amd)
SYM_FUNC_START(xen_hypercall_intel)
+ ANNOTATE_NOENDBR
vmcall
RET
SYM_FUNC_END(xen_hypercall_intel)
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 37effc1b134e..f657a77314f8 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -437,3 +437,4 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 38092d21acf8..44c07c4e0833 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -629,15 +629,11 @@ DEFINE_SPINLOCK(die_lock);
void __noreturn die(const char * str, struct pt_regs * regs, long err)
{
static int die_counter;
- const char *pr = "";
-
- if (IS_ENABLED(CONFIG_PREEMPTION))
- pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
console_verbose();
spin_lock_irq(&die_lock);
- pr_info("%s: sig: %ld [#%d]%s\n", str, err, ++die_counter, pr);
+ pr_info("%s: sig: %ld [#%d]\n", str, err, ++die_counter);
show_regs(regs);
if (!user_mode(regs))
show_stack(NULL, (unsigned long *)regs->areg[1], KERN_INFO);