diff options
Diffstat (limited to 'arch/s390')
355 files changed, 16153 insertions, 19150 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 9c9ec08d78c7..ecbcbb781e40 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -32,18 +32,12 @@ config GENERIC_BUG_RELATIVE_POINTERS config GENERIC_LOCKBREAK def_bool y if PREEMPTION -config PGSTE - def_bool y if KVM - config AUDIT_ARCH def_bool y config NO_IOPORT_MAP def_bool y -config PCI_QUIRKS - def_bool n - config ARCH_SUPPORTS_UPROBES def_bool y @@ -52,6 +46,13 @@ config KASAN_SHADOW_OFFSET depends on KASAN default 0x1C000000000000 +config CC_HAS_BUILTIN_FFS + def_bool !(CC_IS_GCC && GCC_VERSION < 160000) + help + GCC versions before 16.0.0 generate library calls to ffs() + for __builtin_ffs() even when __has_builtin(__builtin_ffs) + is true. + config CC_ASM_FLAG_OUTPUT_BROKEN def_bool CC_IS_GCC && GCC_VERSION < 140200 help @@ -65,21 +66,28 @@ config CC_HAS_ASM_AOR_FORMAT_FLAGS Clang versions before 19.1.0 do not support A, O, and R inline assembly format flags. +config CC_HAS_ASM_IMMEDIATE_STRINGS + def_bool !(CC_IS_GCC && GCC_VERSION < 90000) + help + GCC versions before 9.0.0 cannot handle strings as immediate + input operands in inline assemblies. + +config CC_HAS_STACKPROTECTOR_GLOBAL + def_bool $(cc-option, -mstack-protector-guard=global -mstack-protector-guard-record) + config S390 def_bool y # # Note: keep this list sorted alphabetically # - imply IMA_SECURE_AND_OR_TRUSTED_BOOT select ALTERNATE_USER_ADDRESS_SPACE select ARCH_32BIT_USTAT_F_TINODE - select ARCH_BINFMT_ELF_STATE select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM - select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 + select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE + select ARCH_HAS_CC_CAN_LINK select ARCH_HAS_CPU_FINALIZE_INIT - select ARCH_HAS_CRC32 select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE @@ -96,6 +104,7 @@ config S390 select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_PREEMPT_LAZY + select ARCH_HAS_PTDUMP select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SCALED_CPUTIME select ARCH_HAS_SET_DIRECT_MAP @@ -106,6 +115,7 @@ config S390 select ARCH_HAS_UBSAN select ARCH_HAS_VDSO_TIME_DATA select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_HAVE_TRACE_MMIO_ACCESS select ARCH_INLINE_READ_LOCK select ARCH_INLINE_READ_LOCK_BH select ARCH_INLINE_READ_LOCK_IRQ @@ -140,7 +150,9 @@ config S390 select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_HUGETLBFS select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && CC_IS_CLANG + select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_PAGE_TABLE_CHECK select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF @@ -148,9 +160,10 @@ config S390 select ARCH_WANTS_NO_INSTR select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_IPC_PARSE_VERSION + select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select ARCH_WANT_KERNEL_PMD_MKWRITE select ARCH_WANT_LD_ORPHAN_WARN - select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP + select ARCH_WANTS_THP_SWAP select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 select DCACHE_WORD_ACCESS if !KMSAN @@ -163,13 +176,12 @@ config S390 select GENERIC_CPU_VULNERABILITIES select GENERIC_ENTRY select GENERIC_GETTIMEOFDAY - select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_TIME_NS select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_GET_SECUREBOOT select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN @@ -177,14 +189,15 @@ config S390 select HAVE_ARCH_KCSAN select HAVE_ARCH_KMSAN select HAVE_ARCH_KFENCE + select HAVE_ARCH_KSTACK_ERASE select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY - select HAVE_ARCH_STACKLEAK select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_VMAP_STACK select HAVE_ASM_MODVERSIONS + select HAVE_BUILDTIME_MCOUNT_SORT select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_DEBUG_KMEMLEAK @@ -196,10 +209,10 @@ config S390 select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST select HAVE_FENTRY select HAVE_FTRACE_GRAPH_FUNC - select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_FREGS @@ -229,6 +242,7 @@ config S390 select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_PREEMPT_DYNAMIC_KEY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE @@ -238,11 +252,14 @@ config S390 select HAVE_SAMPLE_FTRACE_DIRECT_MULTI select HAVE_SETUP_PER_CPU_AREA select HAVE_SOFTIRQ_ON_OWN_STACK + select HAVE_STACKPROTECTOR if CC_HAS_STACKPROTECTOR_GLOBAL select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING select HAVE_VIRT_CPU_ACCOUNTING_IDLE + select HOTPLUG_SMT select IOMMU_HELPER if PCI select IOMMU_SUPPORT if PCI + select IRQ_MSI_LIB if PCI select KASAN_VMALLOC if KASAN select LOCK_MM_AND_FIND_VMA select MMU_GATHER_MERGE_VMAS @@ -258,9 +275,11 @@ config S390 select PCI_DOMAINS if PCI select PCI_MSI if PCI select PCI_MSI_ARCH_FALLBACKS if PCI_MSI + select PCI_QUIRKS if PCI select SPARSE_IRQ select SWIOTLB select SYSCTL_EXCEPTION_TRACE + select SYSTEM_DATA_VERIFICATION if KEXEC_SIG select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select TTY @@ -280,6 +299,14 @@ config PGTABLE_LEVELS source "kernel/livepatch/Kconfig" +config ARCH_CC_CAN_LINK + bool + default $(cc_can_link_user,-m64) + +config ARCH_USERFLAGS + string + default "-m64" + config ARCH_SUPPORTS_KEXEC def_bool y @@ -287,7 +314,7 @@ config ARCH_SUPPORTS_KEXEC_FILE def_bool y config ARCH_SUPPORTS_KEXEC_SIG - def_bool MODULE_SIG_FORMAT + def_bool y config ARCH_SUPPORTS_KEXEC_PURGATORY def_bool y @@ -331,6 +358,10 @@ config HAVE_MARCH_Z16_FEATURES def_bool n select HAVE_MARCH_Z15_FEATURES +config HAVE_MARCH_Z17_FEATURES + def_bool n + select HAVE_MARCH_Z16_FEATURES + choice prompt "Processor type" default MARCH_Z196 @@ -396,6 +427,14 @@ config MARCH_Z16 Select this to enable optimizations for IBM z16 (3931 and 3932 series). +config MARCH_Z17 + bool "IBM z17" + select HAVE_MARCH_Z17_FEATURES + depends on $(cc-option,-march=z17) + help + Select this to enable optimizations for IBM z17 (9175 and + 9176 series). + endchoice config MARCH_Z10_TUNE @@ -419,6 +458,9 @@ config MARCH_Z15_TUNE config MARCH_Z16_TUNE def_bool TUNE_Z16 || MARCH_Z16 && TUNE_DEFAULT +config MARCH_Z17_TUNE + def_bool TUNE_Z17 || MARCH_Z17 && TUNE_DEFAULT + choice prompt "Tune code generation" default TUNE_DEFAULT @@ -463,6 +505,10 @@ config TUNE_Z16 bool "IBM z16" depends on $(cc-option,-mtune=z16) +config TUNE_Z17 + bool "IBM z17" + depends on $(cc-option,-mtune=z17) + endchoice config 64BIT @@ -476,22 +522,6 @@ config COMMAND_LINE_SIZE This allows you to specify the maximum length of the kernel command line. -config COMPAT - def_bool n - prompt "Kernel support for 31 bit emulation" - select ARCH_WANT_OLD_COMPAT_IPC - select COMPAT_OLD_SIGACTION - select HAVE_UID16 - depends on MULTIUSER - depends on !CC_IS_CLANG && !LD_IS_LLD - help - Select this option if you want to enable your system kernel to - handle system-calls from ELF binaries for 31 bit ESA. This option - (and some other stuff like libraries and such) is needed for - executing 31 bit applications. - - If unsure say N. - config SMP def_bool y @@ -524,15 +554,11 @@ config NODES_SHIFT depends on NUMA default "1" -config SCHED_SMT - def_bool n - -config SCHED_MC - def_bool n - config SCHED_TOPOLOGY def_bool y prompt "Topology scheduler support" + select ARCH_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_MC select SCHED_SMT select SCHED_MC help @@ -630,6 +656,7 @@ endchoice config RELOCATABLE def_bool y + select ARCH_VMLINUX_NEEDS_RELOCS help This builds a kernel image that retains relocation information so it can be loaded at an arbitrary address. @@ -686,11 +713,14 @@ menu "Memory setup" config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_VMEMMAP_ENABLE - select SPARSEMEM_VMEMMAP config ARCH_SPARSEMEM_DEFAULT def_bool y +config ILLEGAL_POINTER_VALUE + hex + default 0xdead000000000000 + config MAX_PHYSMEM_BITS int "Maximum size of supported physical memory in bits (42-53)" range 42 53 @@ -994,7 +1024,7 @@ config S390_KPROBES_SANITY_TEST config S390_MODULES_SANITY_TEST def_tristate n - depends on KUNIT + depends on KUNIT && m default KUNIT_ALL_TESTS prompt "Enable s390 specific modules tests" select S390_MODULES_SANITY_TEST_HELPERS diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index c4300ea4abf8..7955d7eee7d8 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -13,6 +13,16 @@ config DEBUG_ENTRY If unsure, say N. +config STRICT_MM_TYPECHECKS + bool "Strict Memory Management Type Checks" + depends on DEBUG_KERNEL + help + Enable strict type checking for memory management types like pte_t + and pmd_t. This generates slightly worse code and should be used + for debug builds. + + If unsure, say N. + config CIO_INJECT bool "CIO Inject interfaces" depends on DEBUG_KERNEL && DEBUG_FS diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 5fae311203c2..297976b41088 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -12,19 +12,19 @@ LD_BFD := elf64-s390 KBUILD_LDFLAGS := -m elf64_s390 KBUILD_AFLAGS_MODULE += -fPIC KBUILD_CFLAGS_MODULE += -fPIC -KBUILD_AFLAGS += -m64 -KBUILD_CFLAGS += -m64 +KBUILD_CPPFLAGS += -m64 KBUILD_CFLAGS += -fPIC -LDFLAGS_vmlinux := -no-pie --emit-relocs --discard-none +LDFLAGS_vmlinux := $(call ld-option,-no-pie) extra_tools := relocs aflags_dwarf := -Wa,-gdwarf-2 KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__ ifndef CONFIG_AS_IS_LLVM KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf)) endif -KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack -std=gnu11 +KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack $(CC_FLAGS_DIALECT) KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY KBUILD_CFLAGS_DECOMPRESSOR += -D__DECOMPRESSOR +KBUILD_CFLAGS_DECOMPRESSOR += -Wno-pointer-sign KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float -mbackchain KBUILD_CFLAGS_DECOMPRESSOR += -fno-asynchronous-unwind-tables KBUILD_CFLAGS_DECOMPRESSOR += -ffreestanding @@ -48,6 +48,7 @@ mflags-$(CONFIG_MARCH_Z13) := -march=z13 mflags-$(CONFIG_MARCH_Z14) := -march=z14 mflags-$(CONFIG_MARCH_Z15) := -march=z15 mflags-$(CONFIG_MARCH_Z16) := -march=z16 +mflags-$(CONFIG_MARCH_Z17) := -march=z17 export CC_FLAGS_MARCH := $(mflags-y) @@ -61,6 +62,7 @@ cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13 cflags-$(CONFIG_MARCH_Z14_TUNE) += -mtune=z14 cflags-$(CONFIG_MARCH_Z15_TUNE) += -mtune=z15 cflags-$(CONFIG_MARCH_Z16_TUNE) += -mtune=z16 +cflags-$(CONFIG_MARCH_Z17_TUNE) += -mtune=z17 cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include @@ -86,6 +88,10 @@ ifdef CONFIG_EXPOLINE aflags-y += -DCC_USING_EXPOLINE endif +ifeq ($(CONFIG_STACKPROTECTOR),y) + KBUILD_CFLAGS += -mstack-protector-guard=global -mstack-protector-guard-record +endif + ifdef CONFIG_FUNCTION_TRACER ifeq ($(call cc-option,-mfentry -mnop-mcount),) # make use of hotpatch feature if the compiler supports it @@ -131,10 +137,9 @@ zfcpdump: $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ archheaders: - $(Q)$(MAKE) $(build)=$(syscalls) uapi + $(Q)$(MAKE) $(build)=$(syscalls) all archprepare: - $(Q)$(MAKE) $(build)=$(syscalls) kapi $(Q)$(MAKE) $(build)=$(tools) kapi $(extra_tools) ifeq ($(KBUILD_EXTMOD),) # We need to generate vdso-offsets.h before compiling certain files in kernel/. @@ -145,12 +150,9 @@ ifeq ($(KBUILD_EXTMOD),) # this hack. prepare: vdso_prepare vdso_prepare: prepare0 - $(Q)$(MAKE) $(build)=arch/s390/kernel/vdso64 include/generated/vdso64-offsets.h - $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \ - $(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h) + $(Q)$(MAKE) $(build)=arch/s390/kernel/vdso include/generated/vdso-offsets.h -vdso-install-y += arch/s390/kernel/vdso64/vdso64.so.dbg -vdso-install-$(CONFIG_COMPAT) += arch/s390/kernel/vdso32/vdso32.so.dbg +vdso-install-y += arch/s390/kernel/vdso/vdso.so.dbg endif diff --git a/arch/s390/Makefile.postlink b/arch/s390/Makefile.postlink index 1ae5478cd6ac..c2b737500a91 100644 --- a/arch/s390/Makefile.postlink +++ b/arch/s390/Makefile.postlink @@ -11,7 +11,6 @@ __archpost: -include include/config/auto.conf include $(srctree)/scripts/Kbuild.include -include $(srctree)/scripts/Makefile.lib CMD_RELOCS=arch/s390/tools/relocs OUT_RELOCS = arch/s390/boot @@ -20,9 +19,8 @@ quiet_cmd_relocs = RELOCS $(OUT_RELOCS)/relocs.S mkdir -p $(OUT_RELOCS); \ $(CMD_RELOCS) $@ > $(OUT_RELOCS)/relocs.S -vmlinux: FORCE +vmlinux.unstripped: FORCE $(call cmd,relocs) - $(call cmd,strip_relocs) clean: @rm -f $(OUT_RELOCS)/relocs.S diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index dd7ba7587dd5..edbedbb2a773 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -9,9 +9,9 @@ * Author: Gerald Schaefer <gerald.schaefer@de.ibm.com> */ -#define KMSG_COMPONENT "appldata" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "appldata: " fmt +#include <linux/export.h> #include <linux/module.h> #include <linux/sched/stat.h> #include <linux/init.h> @@ -140,7 +140,7 @@ int appldata_diag(char record_nr, u16 function, unsigned long buffer, struct appldata_product_id *id; int rc; - parm_list = kmalloc(sizeof(*parm_list), GFP_KERNEL); + parm_list = kmalloc_obj(*parm_list); id = kmemdup(&appldata_id, sizeof(appldata_id), GFP_KERNEL); rc = -ENOMEM; if (parm_list && id) { @@ -350,7 +350,7 @@ int appldata_register_ops(struct appldata_ops *ops) if (ops->size > APPLDATA_MAX_REC_SIZE) return -EINVAL; - ops->ctl_table = kcalloc(1, sizeof(struct ctl_table), GFP_KERNEL); + ops->ctl_table = kzalloc_objs(struct ctl_table, 1); if (!ops->ctl_table) return -ENOMEM; diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c index fc608f9b79ab..659af5545a38 100644 --- a/arch/s390/appldata/appldata_mem.c +++ b/arch/s390/appldata/appldata_mem.c @@ -130,7 +130,7 @@ static int __init appldata_mem_init(void) { int ret; - ops.data = kzalloc(sizeof(struct appldata_mem_data), GFP_KERNEL); + ops.data = kzalloc_obj(struct appldata_mem_data); if (!ops.data) return -ENOMEM; diff --git a/arch/s390/appldata/appldata_net_sum.c b/arch/s390/appldata/appldata_net_sum.c index 59c282ca002f..972f6d592c32 100644 --- a/arch/s390/appldata/appldata_net_sum.c +++ b/arch/s390/appldata/appldata_net_sum.c @@ -132,7 +132,7 @@ static int __init appldata_net_init(void) { int ret; - ops.data = kzalloc(sizeof(struct appldata_net_sum_data), GFP_KERNEL); + ops.data = kzalloc_obj(struct appldata_net_sum_data); if (!ops.data) return -ENOMEM; diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index a363d30ce739..137d4e7e1e9a 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -8,8 +8,7 @@ * Author: Gerald Schaefer <gerald.schaefer@de.ibm.com> */ -#define KMSG_COMPONENT "appldata" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "appldata: " fmt #include <linux/module.h> #include <linux/init.h> diff --git a/arch/s390/boot/.gitignore b/arch/s390/boot/.gitignore index f5ef099e2fd3..af2a6a7bc028 100644 --- a/arch/s390/boot/.gitignore +++ b/arch/s390/boot/.gitignore @@ -5,4 +5,5 @@ relocs.S section_cmp.* vmlinux vmlinux.lds +vmlinux.map vmlinux.syms diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 8bc1308ac892..a1e719a79d38 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -19,19 +19,21 @@ CC_FLAGS_MARCH_MINIMUM := -march=z10 KBUILD_AFLAGS := $(filter-out $(CC_FLAGS_MARCH),$(KBUILD_AFLAGS_DECOMPRESSOR)) KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_MARCH),$(KBUILD_CFLAGS_DECOMPRESSOR)) -KBUILD_AFLAGS += $(CC_FLAGS_MARCH_MINIMUM) -KBUILD_CFLAGS += $(CC_FLAGS_MARCH_MINIMUM) +KBUILD_AFLAGS += $(CC_FLAGS_MARCH_MINIMUM) -D__DISABLE_EXPORTS +KBUILD_CFLAGS += $(CC_FLAGS_MARCH_MINIMUM) -D__DISABLE_EXPORTS +KBUILD_CFLAGS += $(call cc-option, -Wno-default-const-init-unsafe) CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char obj-y := head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o -obj-y += version.o pgm_check_info.o ctype.o ipl_data.o relocs.o alternative.o -obj-y += uv.o printk.o +obj-y += version.o pgm_check.o ctype.o ipl_data.o relocs.o alternative.o +obj-y += uv.o printk.o trampoline.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o obj-$(CONFIG_KMSAN) += kmsan.o +obj-$(CONFIG_STACKPROTECTOR) += stackprotector.o obj-all := $(obj-y) piggy.o syms.o targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) diff --git a/arch/s390/boot/als.c b/arch/s390/boot/als.c index 79afb5fa7f1f..25a20986b96e 100644 --- a/arch/s390/boot/als.c +++ b/arch/s390/boot/als.c @@ -65,7 +65,7 @@ static void facility_mismatch(void) boot_emerg("The Linux kernel requires more recent processor hardware\n"); boot_emerg("Detected machine-type number: %4x\n", id.machine); print_missing_facilities(); - boot_emerg("See Principles of Operations for facility bits\n"); + boot_emerg("See z/Architecture Principles of Operation - Facility Indications\n"); disabled_wait(); } diff --git a/arch/s390/boot/alternative.c b/arch/s390/boot/alternative.c index abc08d2c873d..19ea7934b918 100644 --- a/arch/s390/boot/alternative.c +++ b/arch/s390/boot/alternative.c @@ -1,3 +1,138 @@ // SPDX-License-Identifier: GPL-2.0 +#define boot_fmt(fmt) "alt: " fmt +#include "boot.h" + +#define a_debug boot_debug #include "../kernel/alternative.c" + +static void alt_debug_all(int type) +{ + int i; + + switch (type) { + case ALT_TYPE_FACILITY: + for (i = 0; i < ARRAY_SIZE(alt_debug.facilities); i++) + alt_debug.facilities[i] = -1UL; + break; + case ALT_TYPE_FEATURE: + for (i = 0; i < ARRAY_SIZE(alt_debug.mfeatures); i++) + alt_debug.mfeatures[i] = -1UL; + break; + case ALT_TYPE_SPEC: + alt_debug.spec = 1; + break; + } +} + +static void alt_debug_modify(int type, unsigned int nr, bool clear) +{ + switch (type) { + case ALT_TYPE_FACILITY: + if (clear) + __clear_facility(nr, alt_debug.facilities); + else + __set_facility(nr, alt_debug.facilities); + break; + case ALT_TYPE_FEATURE: + if (clear) + __clear_machine_feature(nr, alt_debug.mfeatures); + else + __set_machine_feature(nr, alt_debug.mfeatures); + break; + } +} + +static char *alt_debug_parse(int type, char *str) +{ + unsigned long val, endval; + char *endp; + bool clear; + int i; + + if (*str == ':') { + str++; + } else { + alt_debug_all(type); + return str; + } + clear = false; + if (*str == '!') { + alt_debug_all(type); + clear = true; + str++; + } + while (*str) { + val = simple_strtoull(str, &endp, 0); + if (str == endp) + break; + str = endp; + if (*str == '-') { + str++; + endval = simple_strtoull(str, &endp, 0); + if (str == endp) + break; + str = endp; + while (val <= endval) { + alt_debug_modify(type, val, clear); + val++; + } + } else { + alt_debug_modify(type, val, clear); + } + if (*str != ',') + break; + str++; + } + return str; +} + +/* + * Use debug-alternative command line parameter for debugging: + * "debug-alternative" + * -> print debug message for every single alternative + * + * "debug-alternative=0;2" + * -> print debug message for all alternatives with type 0 and 2 + * + * "debug-alternative=0:0-7" + * -> print debug message for all alternatives with type 0 and with + * facility numbers within the range of 0-7 + * (if type 0 is ALT_TYPE_FACILITY) + * + * "debug-alternative=0:!8;1" + * -> print debug message for all alternatives with type 0, for all + * facility number, except facility 8, and in addition print all + * alternatives with type 1 + */ +void alt_debug_setup(char *str) +{ + unsigned long type; + char *endp; + int i; + + if (!str) { + alt_debug_all(ALT_TYPE_FACILITY); + alt_debug_all(ALT_TYPE_FEATURE); + alt_debug_all(ALT_TYPE_SPEC); + return; + } + while (*str) { + type = simple_strtoull(str, &endp, 0); + if (str == endp) + break; + str = endp; + switch (type) { + case ALT_TYPE_FACILITY: + case ALT_TYPE_FEATURE: + str = alt_debug_parse(type, str); + break; + case ALT_TYPE_SPEC: + alt_debug_all(ALT_TYPE_SPEC); + break; + } + if (*str != ';') + break; + str++; + } +} diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 69f261566a64..61a205b489fb 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -6,15 +6,11 @@ #define IPL_START 0x200 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/printk.h> #include <asm/physmem_info.h> - -struct machine_info { - unsigned char has_edat1 : 1; - unsigned char has_edat2 : 1; -}; +#include <asm/stacktrace.h> struct vmlinux_info { unsigned long entry; @@ -32,6 +28,10 @@ struct vmlinux_info { unsigned long invalid_pg_dir_off; unsigned long alt_instructions; unsigned long alt_instructions_end; +#ifdef CONFIG_STACKPROTECTOR + unsigned long stack_prot_start; + unsigned long stack_prot_end; +#endif #ifdef CONFIG_KASAN unsigned long kasan_early_shadow_page_off; unsigned long kasan_early_shadow_pte_off; @@ -69,7 +69,8 @@ void parse_boot_command_line(void); void verify_facilities(void); void print_missing_facilities(void); void sclp_early_setup_buffer(void); -void print_pgm_check_info(void); +void alt_debug_setup(char *str); +void do_pgm_check(struct pt_regs *regs); unsigned long randomize_within_range(unsigned long size, unsigned long align, unsigned long min, unsigned long max); void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit); @@ -78,6 +79,7 @@ void print_stacktrace(unsigned long sp); void error(char *m); int get_random(unsigned long limit, unsigned long *value); void boot_rb_dump(void); +void __noreturn jump_to_kernel(psw_t *psw); #ifndef boot_fmt #define boot_fmt(fmt) fmt @@ -92,6 +94,13 @@ void boot_rb_dump(void); #define boot_info(fmt, ...) boot_printk(KERN_INFO boot_fmt(fmt), ##__VA_ARGS__) #define boot_debug(fmt, ...) boot_printk(KERN_DEBUG boot_fmt(fmt), ##__VA_ARGS__) +#define boot_panic(...) do { \ + boot_emerg(__VA_ARGS__); \ + print_stacktrace(current_frame_address()); \ + boot_emerg(" -- System halted\n"); \ + disabled_wait(); \ +} while (0) + extern struct machine_info machine; extern int boot_console_loglevel; extern bool boot_ignore_loglevel; @@ -125,5 +134,5 @@ static inline bool intersects(unsigned long addr0, unsigned long size0, { return addr0 + size0 > addr1 && addr1 + size1 > addr0; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* BOOT_BOOT_H */ diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c index 03500b9d9fb9..8d1bc25a6bf4 100644 --- a/arch/s390/boot/decompressor.c +++ b/arch/s390/boot/decompressor.c @@ -68,9 +68,7 @@ static void decompress_error(char *m) { if (bootdebug) boot_rb_dump(); - boot_emerg("Decompression error: %s\n", m); - boot_emerg(" -- System halted\n"); - disabled_wait(); + boot_panic("Decompression error: %s\n", m); } unsigned long mem_safe_offset(void) diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index 0a47b16f6412..0b511d5c030b 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -254,8 +254,9 @@ SYM_CODE_START_LOCAL(startup_normal) xc 0xf00(256),0xf00 larl %r13,.Lctl lctlg %c0,%c15,0(%r13) # load control registers - stcke __LC_BOOT_CLOCK - mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 + larl %r13,tod_clock_base + stcke 0(%r13) + mvc __LC_LAST_UPDATE_CLOCK(8),1(%r13) larl %r13,6f spt 0(%r13) mvc __LC_LAST_UPDATE_TIMER(8),0(%r13) @@ -292,12 +293,6 @@ SYM_CODE_END(startup_normal) #include "head_kdump.S" -# -# This program check is active immediately after kernel start -# and until early_pgm_check_handler is set in kernel/early.c -# It simply saves general/control registers and psw in -# the save area and does disabled wait with a faulty address. -# SYM_CODE_START_LOCAL(startup_pgm_check_handler) stmg %r8,%r15,__LC_SAVE_AREA la %r8,4095 @@ -311,8 +306,18 @@ SYM_CODE_START_LOCAL(startup_pgm_check_handler) oi __LC_RETURN_PSW+1,0x2 # set wait state bit larl %r9,.Lold_psw_disabled_wait stg %r9,__LC_PGM_NEW_PSW+8 - larl %r15,_dump_info_stack_end-STACK_FRAME_OVERHEAD - brasl %r14,print_pgm_check_info + larl %r15,_dump_info_stack_end-(STACK_FRAME_OVERHEAD+__PT_SIZE) + la %r2,STACK_FRAME_OVERHEAD(%r15) + mvc __PT_PSW(16,%r2),__LC_PSW_SAVE_AREA-4095(%r8) + mvc __PT_R0(128,%r2),__LC_GPREGS_SAVE_AREA-4095(%r8) + mvc __PT_LAST_BREAK(8,%r2),__LC_PGM_LAST_BREAK + mvc __PT_INT_CODE(4,%r2),__LC_PGM_INT_CODE + brasl %r14,do_pgm_check + larl %r9,startup_pgm_check_handler + stg %r9,__LC_PGM_NEW_PSW+8 + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + lpswe __LC_RETURN_PSW .Lold_psw_disabled_wait: la %r8,4095 lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r8) diff --git a/arch/s390/boot/ipl_data.c b/arch/s390/boot/ipl_data.c index 0846e2b249c6..b0fd8a526b42 100644 --- a/arch/s390/boot/ipl_data.c +++ b/arch/s390/boot/ipl_data.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/compat.h> #include <linux/ptrace.h> #include <asm/cio.h> #include <asm/asm-offsets.h> @@ -12,11 +11,13 @@ #define PSW_MASK_DISABLED (PSW_MASK_WAIT | PSW_MASK_EA | PSW_MASK_BA) struct ipl_lowcore { - psw_t32 ipl_psw; /* 0x0000 */ + psw32_t ipl_psw; /* 0x0000 */ struct ccw0 ccwpgm[2]; /* 0x0008 */ u8 fill[56]; /* 0x0018 */ struct ccw0 ccwpgmcc[20]; /* 0x0050 */ - u8 pad_0xf0[0x01a0-0x00f0]; /* 0x00f0 */ + u8 pad_0xf0[0x0140-0x00f0]; /* 0x00f0 */ + psw_t svc_old_psw; /* 0x0140 */ + u8 pad_0x150[0x01a0-0x0150]; /* 0x0150 */ psw_t restart_psw; /* 0x01a0 */ psw_t external_new_psw; /* 0x01b0 */ psw_t svc_new_psw; /* 0x01c0 */ @@ -75,6 +76,11 @@ static struct ipl_lowcore ipl_lowcore __used __section(".ipldata") = { [18] = CCW0(CCW_CMD_READ_IPL, 0x690, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), [19] = CCW0(CCW_CMD_READ_IPL, 0x6e0, 0x50, CCW_FLAG_SLI), }, + /* + * Let the GDB's lx-symbols command find the jump_to_kernel symbol + * without having to load decompressor symbols. + */ + .svc_old_psw = { .mask = 0, .addr = (unsigned long)jump_to_kernel }, .restart_psw = { .mask = 0, .addr = IPL_START, }, .external_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_EXT_NEW_PSW, }, .svc_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_SVC_NEW_PSW, }, diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index d3731f2983b7..6bc950b92be7 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -3,8 +3,10 @@ #include <linux/init.h> #include <linux/ctype.h> #include <linux/pgtable.h> +#include <asm/arch-stackprotector.h> #include <asm/abs_lowcore.h> #include <asm/page-states.h> +#include <asm/machine.h> #include <asm/ebcdic.h> #include <asm/sclp.h> #include <asm/sections.h> @@ -34,29 +36,14 @@ int vmalloc_size_set; static inline int __diag308(unsigned long subcode, void *addr) { - unsigned long reg1, reg2; - union register_pair r1; - psw_t old; - - r1.even = (unsigned long) addr; - r1.odd = 0; - asm volatile( - " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" - " epsw %[reg1],%[reg2]\n" - " st %[reg1],0(%[psw_pgm])\n" - " st %[reg2],4(%[psw_pgm])\n" - " larl %[reg1],1f\n" - " stg %[reg1],8(%[psw_pgm])\n" + union register_pair r1 = { .even = (unsigned long)addr, .odd = 0 }; + + asm_inline volatile( " diag %[r1],%[subcode],0x308\n" - "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" - : [r1] "+&d" (r1.pair), - [reg1] "=&d" (reg1), - [reg2] "=&a" (reg2), - "+Q" (get_lowcore()->program_new_psw), - "=Q" (old) - : [subcode] "d" (subcode), - [psw_old] "a" (&old), - [psw_pgm] "a" (&get_lowcore()->program_new_psw) + "0:\n" + EX_TABLE(0b, 0b) + : [r1] "+d" (r1.pair) + : [subcode] "d" (subcode) : "cc", "memory"); return r1.odd; } @@ -193,7 +180,7 @@ void setup_boot_command_line(void) if (has_ebcdic_char(parmarea.command_line)) EBCASC(parmarea.command_line, COMMAND_LINE_SIZE); /* copy arch command line */ - strcpy(early_command_line, strim(parmarea.command_line)); + strscpy(early_command_line, strim(parmarea.command_line)); /* append IPL PARM data to the boot command line */ if (!is_prot_virt_guest() && ipl_block_valid) @@ -267,7 +254,8 @@ void parse_boot_command_line(void) int rc; __kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE); - args = strcpy(command_line_buf, early_command_line); + strscpy(command_line_buf, early_command_line); + args = command_line_buf; while (*args) { args = next_arg(args, ¶m, &val); @@ -295,6 +283,9 @@ void parse_boot_command_line(void) if (!strcmp(param, "facilities") && val) modify_fac_list(val); + if (!strcmp(param, "debug-alternative")) + alt_debug_setup(val); + if (!strcmp(param, "nokaslr")) __kaslr_enabled = 0; @@ -304,6 +295,11 @@ void parse_boot_command_line(void) cmma_flag = 0; } +#ifdef CONFIG_STACKPROTECTOR + if (!strcmp(param, "debug_stackprotector")) + stack_protector_debug = 1; +#endif + #if IS_ENABLED(CONFIG_KVM) if (!strcmp(param, "prot_virt")) { rc = kstrtobool(val, &enabled); @@ -312,7 +308,7 @@ void parse_boot_command_line(void) } #endif if (!strcmp(param, "relocate_lowcore") && test_facility(193)) - relocate_lowcore = 1; + set_machine_feature(MFEATURE_LOWCORE); if (!strcmp(param, "earlyprintk")) boot_earlyprintk = true; if (!strcmp(param, "debug")) @@ -320,7 +316,7 @@ void parse_boot_command_line(void) if (!strcmp(param, "bootdebug")) { bootdebug = true; if (val) - strncpy(bootdebug_filter, val, sizeof(bootdebug_filter) - 1); + strscpy(bootdebug_filter, val); } if (!strcmp(param, "quiet")) boot_console_loglevel = CONSOLE_LOGLEVEL_QUIET; diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check.c index 633f11600aab..fa621fa5bc02 100644 --- a/arch/s390/boot/pgm_check_info.c +++ b/arch/s390/boot/pgm_check.c @@ -32,26 +32,49 @@ void print_stacktrace(unsigned long sp) } } -void print_pgm_check_info(void) +extern struct exception_table_entry __start___ex_table[]; +extern struct exception_table_entry __stop___ex_table[]; + +static inline unsigned long extable_insn(const struct exception_table_entry *x) +{ + return (unsigned long)&x->insn + x->insn; +} + +static bool ex_handler(struct pt_regs *regs) +{ + const struct exception_table_entry *ex; + + for (ex = __start___ex_table; ex < __stop___ex_table; ex++) { + if (extable_insn(ex) != regs->psw.addr) + continue; + if (ex->type != EX_TYPE_FIXUP) + return false; + regs->psw.addr = extable_fixup(ex); + return true; + } + return false; +} + +void do_pgm_check(struct pt_regs *regs) { - unsigned long *gpregs = (unsigned long *)get_lowcore()->gpregs_save_area; - struct psw_bits *psw = &psw_bits(get_lowcore()->psw_save_area); + struct psw_bits *psw = &psw_bits(regs->psw); + unsigned long *gpregs = regs->gprs; + if (ex_handler(regs)) + return; if (bootdebug) boot_rb_dump(); boot_emerg("Linux version %s\n", kernel_version); if (!is_prot_virt_guest() && early_command_line[0]) boot_emerg("Kernel command line: %s\n", early_command_line); boot_emerg("Kernel fault: interruption code %04x ilc:%d\n", - get_lowcore()->pgm_code, get_lowcore()->pgm_ilc >> 1); + regs->int_code & 0xffff, regs->int_code >> 17); if (kaslr_enabled()) { boot_emerg("Kernel random base: %lx\n", __kaslr_offset); boot_emerg("Kernel random base phys: %lx\n", __kaslr_offset_phys); } boot_emerg("PSW : %016lx %016lx (%pS)\n", - get_lowcore()->psw_save_area.mask, - get_lowcore()->psw_save_area.addr, - (void *)get_lowcore()->psw_save_area.addr); + regs->psw.mask, regs->psw.addr, (void *)regs->psw.addr); boot_emerg(" R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x P:%x AS:%x CC:%x PM:%x RI:%x EA:%x\n", psw->per, psw->dat, psw->io, psw->ext, psw->key, psw->mcheck, psw->wait, psw->pstate, psw->as, psw->cc, psw->pm, psw->ri, psw->eaba); @@ -59,8 +82,11 @@ void print_pgm_check_info(void) boot_emerg(" %016lx %016lx %016lx %016lx\n", gpregs[4], gpregs[5], gpregs[6], gpregs[7]); boot_emerg(" %016lx %016lx %016lx %016lx\n", gpregs[8], gpregs[9], gpregs[10], gpregs[11]); boot_emerg(" %016lx %016lx %016lx %016lx\n", gpregs[12], gpregs[13], gpregs[14], gpregs[15]); - print_stacktrace(get_lowcore()->gpregs_save_area[15]); + print_stacktrace(gpregs[15]); boot_emerg("Last Breaking-Event-Address:\n"); - boot_emerg(" [<%016lx>] %pS\n", (unsigned long)get_lowcore()->pgm_last_break, - (void *)get_lowcore()->pgm_last_break); + boot_emerg(" [<%016lx>] %pS\n", regs->last_break, (void *)regs->last_break); + /* Convert to disabled wait PSW */ + psw->io = 0; + psw->ext = 0; + psw->wait = 1; } diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c index aa096ef68e8c..1f2ca5435838 100644 --- a/arch/s390/boot/physmem_info.c +++ b/arch/s390/boot/physmem_info.c @@ -59,36 +59,22 @@ void add_physmem_online_range(u64 start, u64 end) static int __diag260(unsigned long rx1, unsigned long rx2) { - unsigned long reg1, reg2, ry; union register_pair rx; int cc, exception; - psw_t old; + unsigned long ry; rx.even = rx1; rx.odd = rx2; ry = 0x10; /* storage configuration */ exception = 1; - asm volatile( - " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" - " epsw %[reg1],%[reg2]\n" - " st %[reg1],0(%[psw_pgm])\n" - " st %[reg2],4(%[psw_pgm])\n" - " larl %[reg1],1f\n" - " stg %[reg1],8(%[psw_pgm])\n" + asm_inline volatile( " diag %[rx],%[ry],0x260\n" - " lhi %[exc],0\n" - "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + "0: lhi %[exc],0\n" + "1:\n" CC_IPM(cc) - : CC_OUT(cc, cc), - [exc] "+d" (exception), - [reg1] "=&d" (reg1), - [reg2] "=&a" (reg2), - [ry] "+&d" (ry), - "+Q" (get_lowcore()->program_new_psw), - "=Q" (old) - : [rx] "d" (rx.pair), - [psw_old] "a" (&old), - [psw_pgm] "a" (&get_lowcore()->program_new_psw) + EX_TABLE(0b, 1b) + : CC_OUT(cc, cc), [exc] "+d" (exception), [ry] "+d" (ry) + : [rx] "d" (rx.pair) : CC_CLOBBER_LIST("memory")); cc = exception ? -1 : CC_TRANSFORM(cc); return cc == 0 ? ry : -1; @@ -118,29 +104,15 @@ static int diag260(void) static int diag500_storage_limit(unsigned long *max_physmem_end) { unsigned long storage_limit; - unsigned long reg1, reg2; - psw_t old; - - asm volatile( - " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" - " epsw %[reg1],%[reg2]\n" - " st %[reg1],0(%[psw_pgm])\n" - " st %[reg2],4(%[psw_pgm])\n" - " larl %[reg1],1f\n" - " stg %[reg1],8(%[psw_pgm])\n" - " lghi 1,%[subcode]\n" - " lghi 2,0\n" - " diag 2,4,0x500\n" - "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" - " lgr %[slimit],2\n" - : [reg1] "=&d" (reg1), - [reg2] "=&a" (reg2), - [slimit] "=d" (storage_limit), - "=Q" (get_lowcore()->program_new_psw), - "=Q" (old) - : [psw_old] "a" (&old), - [psw_pgm] "a" (&get_lowcore()->program_new_psw), - [subcode] "i" (DIAG500_SC_STOR_LIMIT) + + asm_inline volatile( + " lghi %%r1,%[subcode]\n" + " lghi %%r2,0\n" + " diag %%r2,%%r4,0x500\n" + "0: lgr %[slimit],%%r2\n" + EX_TABLE(0b, 0b) + : [slimit] "=d" (storage_limit) + : [subcode] "i" (DIAG500_SC_STOR_LIMIT) : "memory", "1", "2"); if (!storage_limit) return -EINVAL; @@ -151,31 +123,17 @@ static int diag500_storage_limit(unsigned long *max_physmem_end) static int tprot(unsigned long addr) { - unsigned long reg1, reg2; int cc, exception; - psw_t old; exception = 1; - asm volatile( - " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" - " epsw %[reg1],%[reg2]\n" - " st %[reg1],0(%[psw_pgm])\n" - " st %[reg2],4(%[psw_pgm])\n" - " larl %[reg1],1f\n" - " stg %[reg1],8(%[psw_pgm])\n" + asm_inline volatile( " tprot 0(%[addr]),0\n" - " lhi %[exc],0\n" - "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + "0: lhi %[exc],0\n" + "1:\n" CC_IPM(cc) - : CC_OUT(cc, cc), - [exc] "+d" (exception), - [reg1] "=&d" (reg1), - [reg2] "=&a" (reg2), - "=Q" (get_lowcore()->program_new_psw.addr), - "=Q" (old) - : [psw_old] "a" (&old), - [psw_pgm] "a" (&get_lowcore()->program_new_psw), - [addr] "a" (addr) + EX_TABLE(0b, 1b) + : CC_OUT(cc, cc), [exc] "+d" (exception) + : [addr] "a" (addr) : CC_CLOBBER_LIST("memory")); cc = exception ? -EFAULT : CC_TRANSFORM(cc); return cc; @@ -270,9 +228,7 @@ static void die_oom(unsigned long size, unsigned long align, unsigned long min, boot_emerg("Usable online memory total: %lu Reserved: %lu Free: %lu\n", total_mem, total_reserved_mem, total_mem > total_reserved_mem ? total_mem - total_reserved_mem : 0); - print_stacktrace(current_frame_address()); - boot_emerg(" -- System halted\n"); - disabled_wait(); + boot_panic("Oom\n"); } static void _physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size) diff --git a/arch/s390/boot/printk.c b/arch/s390/boot/printk.c index b4c66fa667d5..4bb6bc95704e 100644 --- a/arch/s390/boot/printk.c +++ b/arch/s390/boot/printk.c @@ -8,6 +8,7 @@ #include <asm/sections.h> #include <asm/lowcore.h> #include <asm/setup.h> +#include <asm/timex.h> #include <asm/sclp.h> #include <asm/uv.h> #include "boot.h" @@ -28,7 +29,8 @@ static void boot_rb_add(const char *str, size_t len) /* store strings separated by '\0' */ if (len + 1 > avail) boot_rb_off = 0; - strcpy(boot_rb + boot_rb_off, str); + avail = sizeof(boot_rb) - boot_rb_off - 1; + strscpy(boot_rb + boot_rb_off, str, avail); boot_rb_off += len + 1; } @@ -157,10 +159,10 @@ static noinline char *strsym(char *buf, void *ip) p = findsym((unsigned long)ip, &off, &len); if (p) { - strncpy(buf, p, MAX_SYMLEN); + strscpy(buf, p, MAX_SYMLEN); /* reserve 15 bytes for offset/len in symbol+0x1234/0x1234 */ p = buf + strnlen(buf, MAX_SYMLEN - 15); - strcpy(p, "+0x"); + strscpy(p, "+0x", MAX_SYMLEN - (p - buf)); as_hex(p + 3, off, 0); strcat(p, "/0x"); as_hex(p + strlen(p), len, 0); @@ -199,8 +201,7 @@ static void boot_console_earlyprintk(const char *buf) static char *add_timestamp(char *buf) { #ifdef CONFIG_PRINTK_TIME - union tod_clock *boot_clock = (union tod_clock *)&get_lowcore()->boot_clock; - unsigned long ns = tod_to_ns(get_tod_clock() - boot_clock->tod); + unsigned long ns = tod_to_ns(__get_tod_clock_monotonic()); char ts[MAX_NUMLEN]; *buf++ = '['; diff --git a/arch/s390/boot/stackprotector.c b/arch/s390/boot/stackprotector.c new file mode 100644 index 000000000000..68494940c12a --- /dev/null +++ b/arch/s390/boot/stackprotector.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define boot_fmt(fmt) "stackprot: " fmt + +#include "boot.h" +#include "../kernel/stackprotector.c" diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 885bd1dd2c82..c1dcf8b1b579 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -6,9 +6,13 @@ #include <asm/boot_data.h> #include <asm/extmem.h> #include <asm/sections.h> +#include <asm/diag288.h> #include <asm/maccess.h> +#include <asm/machine.h> +#include <asm/sysinfo.h> #include <asm/cpu_mf.h> #include <asm/setup.h> +#include <asm/timex.h> #include <asm/kasan.h> #include <asm/kexec.h> #include <asm/sclp.h> @@ -16,6 +20,9 @@ #include <asm/uv.h> #include <asm/abs_lowcore.h> #include <asm/physmem_info.h> +#include <asm/stacktrace.h> +#include <asm/asm-offsets.h> +#include <asm/arch-stackprotector.h> #include "decompressor.h" #include "boot.h" #include "uv.h" @@ -34,64 +41,121 @@ unsigned long __bootdata_preserved(max_mappable); unsigned long __bootdata_preserved(page_noexec_mask); unsigned long __bootdata_preserved(segment_noexec_mask); unsigned long __bootdata_preserved(region_noexec_mask); -int __bootdata_preserved(relocate_lowcore); +union tod_clock __bootdata_preserved(tod_clock_base); +u64 __bootdata_preserved(clock_comparator_max) = -1UL; u64 __bootdata_preserved(stfle_fac_list[16]); struct oldmem_data __bootdata_preserved(oldmem_data); -struct machine_info machine; +static char sysinfo_page[PAGE_SIZE] __aligned(PAGE_SIZE); -void error(char *x) +static void detect_machine_type(void) { - boot_emerg("%s\n", x); - boot_emerg(" -- System halted\n"); - disabled_wait(); + struct sysinfo_3_2_2 *vmms = (struct sysinfo_3_2_2 *)&sysinfo_page; + + /* Check current-configuration-level */ + if (stsi(NULL, 0, 0, 0) <= 2) { + set_machine_feature(MFEATURE_LPAR); + return; + } + /* Get virtual-machine cpu information. */ + if (stsi(vmms, 3, 2, 2) || !vmms->count) + return; + /* Detect known hypervisors */ + if (!memcmp(vmms->vm[0].cpi, "\xd2\xe5\xd4", 3)) + set_machine_feature(MFEATURE_KVM); + else if (!memcmp(vmms->vm[0].cpi, "\xa9\x61\xe5\xd4", 4)) + set_machine_feature(MFEATURE_VM); +} + +static void detect_diag288(void) +{ + /* "BEGIN" in EBCDIC character set */ + static const char cmd[] = "\xc2\xc5\xc7\xc9\xd5"; + unsigned long action, len; + + action = machine_is_vm() ? (unsigned long)cmd : LPARWDT_RESTART; + len = machine_is_vm() ? sizeof(cmd) : 0; + if (__diag288(WDT_FUNC_INIT, MIN_INTERVAL, action, len)) + return; + __diag288(WDT_FUNC_CANCEL, 0, 0, 0); + set_machine_feature(MFEATURE_DIAG288); +} + +static void detect_diag9c(void) +{ + unsigned int cpu; + int rc = 1; + + cpu = stap(); + asm_inline volatile( + " diag %[cpu],%%r0,0x9c\n" + "0: lhi %[rc],0\n" + "1:\n" + EX_TABLE(0b, 1b) + : [rc] "+d" (rc) + : [cpu] "d" (cpu) + : "cc", "memory"); + if (!rc) + set_machine_feature(MFEATURE_DIAG9C); +} + +static void reset_tod_clock(void) +{ + union tod_clock clk; + + if (store_tod_clock_ext_cc(&clk) == 0) + return; + /* TOD clock not running. Set the clock to Unix Epoch. */ + if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk)) + disabled_wait(); + memset(&tod_clock_base, 0, sizeof(tod_clock_base)); + tod_clock_base.tod = TOD_UNIX_EPOCH; + get_lowcore()->last_update_clock = TOD_UNIX_EPOCH; } static void detect_facilities(void) { - if (test_facility(8)) { - machine.has_edat1 = 1; + if (cpu_has_edat1()) local_ctl_set_bit(0, CR0_EDAT_BIT); - } - if (test_facility(78)) - machine.has_edat2 = 1; page_noexec_mask = -1UL; segment_noexec_mask = -1UL; region_noexec_mask = -1UL; - if (!test_facility(130)) { + if (!cpu_has_nx()) { page_noexec_mask &= ~_PAGE_NOEXEC; segment_noexec_mask &= ~_SEGMENT_ENTRY_NOEXEC; region_noexec_mask &= ~_REGION_ENTRY_NOEXEC; } + if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) + set_machine_feature(MFEATURE_PCI_MIO); + reset_tod_clock(); + if (test_facility(139) && (tod_clock_base.tod >> 63)) { + /* Enable signed clock comparator comparisons */ + set_machine_feature(MFEATURE_SCC); + clock_comparator_max = -1UL >> 1; + local_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SIGN_BIT); + } + if (test_facility(50) && test_facility(73)) { + set_machine_feature(MFEATURE_TX); + local_ctl_set_bit(0, CR0_TRANSACTIONAL_EXECUTION_BIT); + } + if (cpu_has_vx()) + local_ctl_set_bit(0, CR0_VECTOR_BIT); } static int cmma_test_essa(void) { - unsigned long reg1, reg2, tmp = 0; + unsigned long tmp = 0; int rc = 1; - psw_t old; /* Test ESSA_GET_STATE */ - asm volatile( - " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" - " epsw %[reg1],%[reg2]\n" - " st %[reg1],0(%[psw_pgm])\n" - " st %[reg2],4(%[psw_pgm])\n" - " larl %[reg1],1f\n" - " stg %[reg1],8(%[psw_pgm])\n" + asm_inline volatile( " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" - " la %[rc],0\n" - "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" - : [reg1] "=&d" (reg1), - [reg2] "=&a" (reg2), - [rc] "+&d" (rc), - [tmp] "=&d" (tmp), - "+Q" (get_lowcore()->program_new_psw), - "=Q" (old) - : [psw_old] "a" (&old), - [psw_pgm] "a" (&get_lowcore()->program_new_psw), - [cmd] "i" (ESSA_GET_STATE) + "0: lhi %[rc],0\n" + "1:\n" + EX_TABLE(0b, 1b) + : [rc] "+d" (rc), [tmp] "+d" (tmp) + : [cmd] "i" (ESSA_GET_STATE) : "cc", "memory"); return rc; } @@ -152,10 +216,10 @@ static void rescue_initrd(unsigned long min, unsigned long max) static void copy_bootdata(void) { if (__boot_data_end - __boot_data_start != vmlinux.bootdata_size) - error(".boot.data section size mismatch"); + boot_panic(".boot.data section size mismatch\n"); memcpy((void *)vmlinux.bootdata_off, __boot_data_start, vmlinux.bootdata_size); if (__boot_data_preserved_end - __boot_data_preserved_start != vmlinux.bootdata_preserved_size) - error(".boot.preserved.data section size mismatch"); + boot_panic(".boot.preserved.data section size mismatch\n"); memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size); } @@ -169,7 +233,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, for (reloc = (int *)__vmlinux_relocs_64_start; reloc < (int *)__vmlinux_relocs_64_end; reloc++) { loc = (long)*reloc + phys_offset; if (loc < min_addr || loc > max_addr) - error("64-bit relocation outside of kernel!\n"); + boot_panic("64-bit relocation outside of kernel!\n"); *(u64 *)loc += offset; } } @@ -272,6 +336,7 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) BUILD_BUG_ON(!IS_ALIGNED(TEXT_OFFSET, THREAD_SIZE)); BUILD_BUG_ON(!IS_ALIGNED(__NO_KASLR_START_KERNEL, THREAD_SIZE)); BUILD_BUG_ON(__NO_KASLR_END_KERNEL > _REGION1_SIZE); + BUILD_BUG_ON(CONFIG_ILLEGAL_POINTER_VALUE < _REGION1_SIZE); vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION3_SIZE); boot_debug("vmem size estimated: 0x%016lx\n", vsize); if (IS_ENABLED(CONFIG_KASAN) || __NO_KASLR_END_KERNEL > _REGION2_SIZE || @@ -316,7 +381,7 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) kernel_start = round_down(kernel_end - kernel_size, THREAD_SIZE); boot_debug("Randomization range: 0x%016lx-0x%016lx\n", vmax - kaslr_len, vmax); boot_debug("kernel image: 0x%016lx-0x%016lx (kaslr)\n", kernel_start, - kernel_size + kernel_size); + kernel_start + kernel_size); } else if (vmax < __NO_KASLR_END_KERNEL || vsize > __NO_KASLR_END_KERNEL) { kernel_start = round_down(vmax - kernel_size, THREAD_SIZE); boot_debug("kernel image: 0x%016lx-0x%016lx (constrained)\n", kernel_start, @@ -375,7 +440,8 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) max_mappable = max(ident_map_size, MAX_DCSS_ADDR); max_mappable = min(max_mappable, vmemmap_start); #ifdef CONFIG_RANDOMIZE_IDENTITY_BASE - __identity_base = round_down(vmemmap_start - max_mappable, rte_size); + if (kaslr_enabled()) + __identity_base = round_down(vmemmap_start - max_mappable, rte_size); #endif boot_debug("identity map: 0x%016lx-0x%016lx\n", __identity_base, __identity_base + ident_map_size); @@ -416,6 +482,10 @@ static void kaslr_adjust_vmlinux_info(long offset) vmlinux.invalid_pg_dir_off += offset; vmlinux.alt_instructions += offset; vmlinux.alt_instructions_end += offset; +#ifdef CONFIG_STACKPROTECTOR + vmlinux.stack_prot_start += offset; + vmlinux.stack_prot_end += offset; +#endif #ifdef CONFIG_KASAN vmlinux.kasan_early_shadow_page_off += offset; vmlinux.kasan_early_shadow_pte_off += offset; @@ -462,7 +532,12 @@ void startup_kernel(void) read_ipl_report(); sclp_early_read_info(); + sclp_early_detect_machine_features(); detect_facilities(); + detect_diag9c(); + detect_machine_type(); + /* detect_diag288() needs machine type */ + detect_diag288(); cmma_init(); sanitize_prot_virt_host(); max_physmem_end = detect_max_physmem_end(); @@ -556,6 +631,7 @@ void startup_kernel(void) __apply_alternatives((struct alt_instr *)_vmlinux_info.alt_instructions, (struct alt_instr *)_vmlinux_info.alt_instructions_end, ALT_CTX_EARLY); + stack_protector_apply_early(text_lma); /* * Save KASLR offset for early dumps, before vmcore_info is set. @@ -569,5 +645,5 @@ void startup_kernel(void) psw.addr = __kaslr_offset + vmlinux.entry; psw.mask = PSW_KERNEL_BITS; boot_debug("Starting kernel at: 0x%016lx\n", psw.addr); - __load_psw(psw); + jump_to_kernel(&psw); } diff --git a/arch/s390/boot/string.c b/arch/s390/boot/string.c index f6b9b1df48a8..bd68161434a6 100644 --- a/arch/s390/boot/string.c +++ b/arch/s390/boot/string.c @@ -29,6 +29,18 @@ int strncmp(const char *cs, const char *ct, size_t count) return 0; } +ssize_t sized_strscpy(char *dst, const char *src, size_t count) +{ + size_t len; + + if (count == 0) + return -E2BIG; + len = strnlen(src, count - 1); + memcpy(dst, src, len); + dst[len] = '\0'; + return src[len] ? -E2BIG : len; +} + void *memset64(uint64_t *s, uint64_t v, size_t count) { uint64_t *xs = s; diff --git a/arch/s390/boot/trampoline.S b/arch/s390/boot/trampoline.S new file mode 100644 index 000000000000..1cb5adf005ea --- /dev/null +++ b/arch/s390/boot/trampoline.S @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> + +# This function is identical to __load_psw(), but the lx-symbols GDB command +# puts a breakpoint on it, so it needs to be kept separate. +SYM_CODE_START(jump_to_kernel) + lpswe 0(%r2) +SYM_CODE_END(jump_to_kernel) diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index cfca94a8eac4..7d6cc4c85af0 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #define boot_fmt(fmt) "vmem: " fmt +#include <linux/cpufeature.h> #include <linux/sched/task.h> #include <linux/pgtable.h> #include <linux/kasan.h> @@ -10,11 +11,11 @@ #include <asm/ctlreg.h> #include <asm/physmem_info.h> #include <asm/maccess.h> +#include <asm/machine.h> #include <asm/abs_lowcore.h> #include "decompressor.h" #include "boot.h" -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) struct ctlreg __bootdata_preserved(s390_invalid_asce); #ifdef CONFIG_PROC_FS @@ -243,22 +244,10 @@ static void *boot_crst_alloc(unsigned long val) static pte_t *boot_pte_alloc(void) { - static void *pte_leftover; pte_t *pte; - /* - * handling pte_leftovers this way helps to avoid memory fragmentation - * during POPULATE_KASAN_MAP_SHADOW when EDAT is off - */ - if (!pte_leftover) { - pte_leftover = (void *)physmem_alloc_or_die(RR_VMEM, PAGE_SIZE, PAGE_SIZE); - pte = pte_leftover + _PAGE_TABLE_SIZE; - __arch_set_page_dat(pte, 1); - } else { - pte = pte_leftover; - pte_leftover = NULL; - } - + pte = (void *)physmem_alloc_or_die(RR_VMEM, PAGE_SIZE, PAGE_SIZE); + __arch_set_page_dat(pte, 1); memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); return pte; } @@ -314,7 +303,7 @@ static unsigned long try_get_large_pud_pa(pud_t *pu_dir, unsigned long addr, uns { unsigned long pa, size = end - addr; - if (!machine.has_edat2 || !large_page_mapping_allowed(mode) || + if (!cpu_has_edat2() || !large_page_mapping_allowed(mode) || !IS_ALIGNED(addr, PUD_SIZE) || (size < PUD_SIZE)) return INVALID_PHYS_ADDR; @@ -330,7 +319,7 @@ static unsigned long try_get_large_pmd_pa(pmd_t *pm_dir, unsigned long addr, uns { unsigned long pa, size = end - addr; - if (!machine.has_edat1 || !large_page_mapping_allowed(mode) || + if (!cpu_has_edat1() || !large_page_mapping_allowed(mode) || !IS_ALIGNED(addr, PMD_SIZE) || (size < PMD_SIZE)) return INVALID_PHYS_ADDR; @@ -516,7 +505,7 @@ void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned l __arch_set_page_dat((void *)swapper_pg_dir, 1UL << CRST_ALLOC_ORDER); __arch_set_page_dat((void *)invalid_pg_dir, 1UL << CRST_ALLOC_ORDER); - if (relocate_lowcore) + if (machine_has_relocated_lowcore()) lowcore_address = LOWCORE_ALT_ADDRESS; /* @@ -528,6 +517,9 @@ void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned l lowcore_address + sizeof(struct lowcore), POPULATE_LOWCORE); for_each_physmem_usable_range(i, &start, &end) { + /* Do not map lowcore with identity mapping */ + if (!start) + start = sizeof(struct lowcore); pgtable_populate((unsigned long)__identity_va(start), (unsigned long)__identity_va(end), POPULATE_IDENTITY); diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index 66670212a361..070bc18babd0 100644 --- a/arch/s390/boot/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -40,6 +40,7 @@ SECTIONS *(.rodata.*) _erodata = . ; } + EXCEPTION_TABLE(16) .got : { *(.got) } @@ -136,6 +137,15 @@ SECTIONS } _end = .; + /* Sections to be discarded */ + /DISCARD/ : { + COMMON_DISCARDS + *(.eh_frame) + *(*__ksymtab*) + *(___kcrctab*) + *(.modinfo) + } + DWARF_DEBUG ELF_DETAILS @@ -160,13 +170,4 @@ SECTIONS *(.rela.*) *(.rela_*) } ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") - - /* Sections to be discarded */ - /DISCARD/ : { - COMMON_DISCARDS - *(.eh_frame) - *(__ex_table) - *(*__ksymtab*) - *(___kcrctab*) - } } diff --git a/arch/s390/configs/compat.config b/arch/s390/configs/compat.config deleted file mode 100644 index 6fd051453ae8..000000000000 --- a/arch/s390/configs/compat.config +++ /dev/null @@ -1,3 +0,0 @@ -# Help: Enable compat support -CONFIG_COMPAT=y -CONFIG_COMPAT_32BIT_TIME=y diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index d6beec5292a0..c28f9a7d0bd8 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -5,6 +5,7 @@ CONFIG_WATCH_QUEUE=y CONFIG_AUDIT=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_POSIX_AUX_CLOCKS=y CONFIG_BPF_SYSCALL=y CONFIG_BPF_JIT=y CONFIG_BPF_JIT_ALWAYS_ON=y @@ -19,6 +20,7 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y +CONFIG_SCHED_PROXY_EXEC=y CONFIG_NUMA_BALANCING=y CONFIG_MEMCG=y CONFIG_BLK_CGROUP=y @@ -38,11 +40,11 @@ CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_EXPERT=y -# CONFIG_SYSFS_SYSCALL is not set CONFIG_PROFILING=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y CONFIG_KEXEC_SIG=y +CONFIG_CRASH_DM_CRYPT=y CONFIG_LIVEPATCH=y CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=512 @@ -92,7 +94,6 @@ CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y CONFIG_BINFMT_MISC=m CONFIG_ZSWAP=y -CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y CONFIG_ZSMALLOC_STAT=y CONFIG_SLAB_BUCKETS=y CONFIG_SLUB_STATS=y @@ -100,6 +101,7 @@ CONFIG_SLUB_STATS=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y +CONFIG_PERSISTENT_HUGE_ZERO_FOLIO=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CMA_DEBUGFS=y CONFIG_CMA_SYSFS=y @@ -107,6 +109,7 @@ CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y +CONFIG_ZONE_DEVICE=y CONFIG_PERCPU_STATS=y CONFIG_GUP_TEST=y CONFIG_ANON_VMA_NAME=y @@ -116,10 +119,17 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +CONFIG_TLS_TOE=y CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_SMC=m CONFIG_SMC_DIAG=m -CONFIG_SMC_LO=y +CONFIG_DIBS=y +CONFIG_DIBS_LO=y +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y @@ -225,17 +235,19 @@ CONFIG_NETFILTER_XT_TARGET_CONNMARK=m CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m CONFIG_NETFILTER_XT_TARGET_CT=m CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HL=m CONFIG_NETFILTER_XT_TARGET_HMARK=m CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m CONFIG_NETFILTER_XT_TARGET_LOG=m CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_NAT=m CONFIG_NETFILTER_XT_TARGET_NETMAP=m CONFIG_NETFILTER_XT_TARGET_NFLOG=m CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m CONFIG_NETFILTER_XT_TARGET_TEE=m CONFIG_NETFILTER_XT_TARGET_TPROXY=m -CONFIG_NETFILTER_XT_TARGET_TRACE=m CONFIG_NETFILTER_XT_TARGET_SECMARK=m CONFIG_NETFILTER_XT_TARGET_TCPMSS=m CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m @@ -321,16 +333,8 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_TTL=m -CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_FIB_IPV6=m CONFIG_IP6_NF_IPTABLES=m @@ -343,15 +347,9 @@ CONFIG_IP6_NF_MATCH_IPV6HEADER=m CONFIG_IP6_NF_MATCH_MH=m CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m -CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_IP6_NF_RAW=m -CONFIG_IP6_NF_SECURITY=m -CONFIG_IP6_NF_NAT=m -CONFIG_IP6_NF_TARGET_MASQUERADE=m CONFIG_NF_TABLES_BRIDGE=m +CONFIG_IP_SCTP=m CONFIG_RDS=m CONFIG_RDS_RDMA=m CONFIG_RDS_TCP=m @@ -386,6 +384,7 @@ CONFIG_NET_SCH_FQ_CODEL=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_PLUG=m CONFIG_NET_SCH_ETS=m +CONFIG_NET_SCH_DUALPI2=m CONFIG_NET_CLS_BASIC=m CONFIG_NET_CLS_ROUTE4=m CONFIG_NET_CLS_FW=m @@ -395,6 +394,9 @@ CONFIG_CLS_U32_MARK=y CONFIG_NET_CLS_FLOW=m CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m @@ -405,6 +407,9 @@ CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m CONFIG_NET_ACT_GATE=m CONFIG_NET_TC_SKB_EXT=y CONFIG_DNS_RESOLVER=y @@ -441,6 +446,7 @@ CONFIG_BLK_DEV_RAM_SIZE=32768 CONFIG_VIRTIO_BLK=y CONFIG_BLK_DEV_RBD=m CONFIG_BLK_DEV_NVME=m +CONFIG_BLK_DEV_NULL_BLK=m CONFIG_ENCLOSURE_SERVICES=m CONFIG_GENWQE=m CONFIG_RAID_ATTRS=m @@ -468,7 +474,9 @@ CONFIG_SCSI_DH_EMC=m CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y +CONFIG_MD_LLBITMAP=y # CONFIG_MD_BITMAP_FILE is not set +CONFIG_MD_LINEAR=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y @@ -500,6 +508,7 @@ CONFIG_DM_VDO=m CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=m +CONFIG_OVPN=m CONFIG_EQUALIZER=m CONFIG_IFB=m CONFIG_MACVLAN=m @@ -515,7 +524,6 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_ADAPTEC is not set # CONFIG_NET_VENDOR_AGERE is not set # CONFIG_NET_VENDOR_ALACRITECH is not set -# CONFIG_NET_VENDOR_ALTEON is not set # CONFIG_NET_VENDOR_AMAZON is not set # CONFIG_NET_VENDOR_AMD is not set # CONFIG_NET_VENDOR_AQUANTIA is not set @@ -543,6 +551,7 @@ CONFIG_NLMON=m CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_SF=y # CONFIG_NET_VENDOR_META is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set @@ -551,7 +560,6 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_MYRI is not set # CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NATSEMI is not set -# CONFIG_NET_VENDOR_NETERION is not set # CONFIG_NET_VENDOR_NETRONOME is not set # CONFIG_NET_VENDOR_NVIDIA is not set # CONFIG_NET_VENDOR_OKI is not set @@ -627,8 +635,17 @@ CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_MEM=m CONFIG_VIRTIO_INPUT=y +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_VDPA_SIM_NET=m +CONFIG_VDPA_SIM_BLOCK=m +CONFIG_VDPA_USER=m +CONFIG_MLX5_VDPA_NET=m +CONFIG_VP_VDPA=m CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m +CONFIG_VHOST_VDPA=m +CONFIG_DEV_DAX=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -638,9 +655,12 @@ CONFIG_JFS_POSIX_ACL=y CONFIG_JFS_SECURITY=y CONFIG_JFS_STATISTICS=y CONFIG_XFS_FS=y +CONFIG_XFS_SUPPORT_V4=y +CONFIG_XFS_SUPPORT_ASCII_CI=y CONFIG_XFS_QUOTA=y CONFIG_XFS_POSIX_ACL=y CONFIG_XFS_RT=y +# CONFIG_XFS_ONLINE_SCRUB is not set CONFIG_XFS_DEBUG=y CONFIG_GFS2_FS=m CONFIG_GFS2_FS_LOCKING_DLM=y @@ -650,10 +670,6 @@ CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_BTRFS_DEBUG=y CONFIG_BTRFS_ASSERT=y CONFIG_NILFS2_FS=m -CONFIG_BCACHEFS_FS=y -CONFIG_BCACHEFS_QUOTA=y -CONFIG_BCACHEFS_POSIX_ACL=y -CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y CONFIG_FS_VERITY=y @@ -723,11 +739,10 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_UNICODE=y CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m CONFIG_KEY_NOTIFICATIONS=y CONFIG_SECURITY=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_FORTIFY_SOURCE=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_LOCKDOWN_LSM=y @@ -740,23 +755,24 @@ CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y -CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" +CONFIG_FORTIFY_SOURCE=y +CONFIG_HARDENED_USERCOPY=y CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_USER=m -# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set +CONFIG_CRYPTO_SELFTESTS=y +CONFIG_CRYPTO_SELFTESTS_FULL=y +CONFIG_CRYPTO_NULL=y CONFIG_CRYPTO_PCRYPT=m CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_BENCHMARK=m CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m -CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_AES=m CONFIG_CRYPTO_ANUBIS=m CONFIG_CRYPTO_ARIA=m CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_DES=m @@ -773,14 +789,13 @@ CONFIG_CRYPTO_HCTR2=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_GCM=y CONFIG_CRYPTO_SEQIV=y CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_SM3_GENERIC=m +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_CRC32=m @@ -788,20 +803,11 @@ CONFIG_CRYPTO_842=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m CONFIG_CRYPTO_ZSTD=m -CONFIG_CRYPTO_ANSI_CPRNG=m CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA512_S390=m -CONFIG_CRYPTO_SHA1_S390=m -CONFIG_CRYPTO_SHA256_S390=m -CONFIG_CRYPTO_SHA3_256_S390=m -CONFIG_CRYPTO_SHA3_512_S390=m -CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m -CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_CHACHA_S390=m CONFIG_CRYPTO_HMAC_S390=m CONFIG_ZCRYPT=m CONFIG_PKEY=m @@ -810,14 +816,13 @@ CONFIG_PKEY_EP11=m CONFIG_PKEY_PCKMO=m CONFIG_PKEY_UV=m CONFIG_CRYPTO_PAES_S390=m +CONFIG_CRYPTO_PHMAC_S390=m CONFIG_CRYPTO_DEV_VIRTIO=m CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_CRYPTO_KRB5=m +CONFIG_CRYPTO_KRB5_SELFTESTS=y CONFIG_CORDIC=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m -CONFIG_CRC4=m -CONFIG_CRC7=m -CONFIG_CRC8=m +CONFIG_TRACE_MMIO_ACCESS=y CONFIG_RANDOM32_SELFTEST=y CONFIG_XZ_DEC_MICROLZMA=y CONFIG_DMA_CMA=y @@ -875,6 +880,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=300 CONFIG_LATENCYTOP=y CONFIG_BOOTTIME_TRACING=y CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FUNCTION_GRAPH_RETADDR=y CONFIG_FPROBE=y CONFIG_FUNCTION_PROFILER=y CONFIG_STACK_TRACER=y @@ -887,12 +893,14 @@ CONFIG_USER_EVENTS=y CONFIG_HIST_TRIGGERS=y CONFIG_FTRACE_STARTUP_TEST=y # CONFIG_EVENT_TRACE_STARTUP_TEST is not set +CONFIG_FTRACE_SORT_STARTUP_TEST=y CONFIG_SAMPLES=y CONFIG_SAMPLE_TRACE_PRINTK=m CONFIG_SAMPLE_FTRACE_DIRECT=m CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m CONFIG_SAMPLE_FTRACE_OPS=m CONFIG_DEBUG_ENTRY=y +CONFIG_STRICT_MM_TYPECHECKS=y CONFIG_CIO_INJECT=y CONFIG_KUNIT=m CONFIG_KUNIT_DEBUGFS=y @@ -909,7 +917,7 @@ CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAULT_INJECTION_CONFIGFS=y CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y CONFIG_LKDTM=m -CONFIG_TEST_MIN_HEAP=y +CONFIG_MIN_HEAP_KUNIT_TEST=m CONFIG_KPROBES_SANITY_TEST=m CONFIG_RBTREE_TEST=y CONFIG_INTERVAL_TREE_TEST=m @@ -917,3 +925,5 @@ CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BITOPS=m CONFIG_TEST_BPF=m +CONFIG_PAGE_TABLE_CHECK=y +CONFIG_PAGE_TABLE_CHECK_ENFORCED=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 8cfbfb10bba8..d89c988f33ea 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -4,6 +4,7 @@ CONFIG_WATCH_QUEUE=y CONFIG_AUDIT=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_POSIX_AUX_CLOCKS=y CONFIG_BPF_SYSCALL=y CONFIG_BPF_JIT=y CONFIG_BPF_JIT_ALWAYS_ON=y @@ -17,6 +18,7 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y +CONFIG_SCHED_PROXY_EXEC=y CONFIG_NUMA_BALANCING=y CONFIG_MEMCG=y CONFIG_BLK_CGROUP=y @@ -36,16 +38,16 @@ CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_EXPERT=y -# CONFIG_SYSFS_SYSCALL is not set CONFIG_PROFILING=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y CONFIG_KEXEC_SIG=y +CONFIG_CRASH_DM_CRYPT=y CONFIG_LIVEPATCH=y CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=512 CONFIG_NUMA=y -CONFIG_HZ_100=y +CONFIG_HZ_1000=y CONFIG_CERT_STORE=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y @@ -86,19 +88,20 @@ CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y CONFIG_BINFMT_MISC=m CONFIG_ZSWAP=y -CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y CONFIG_ZSMALLOC_STAT=y CONFIG_SLAB_BUCKETS=y # CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y +CONFIG_PERSISTENT_HUGE_ZERO_FOLIO=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CMA_SYSFS=y CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y +CONFIG_ZONE_DEVICE=y CONFIG_PERCPU_STATS=y CONFIG_ANON_VMA_NAME=y CONFIG_USERFAULTFD=y @@ -107,10 +110,17 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +CONFIG_TLS_TOE=y CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_SMC=m CONFIG_SMC_DIAG=m -CONFIG_SMC_LO=y +CONFIG_DIBS=y +CONFIG_DIBS_LO=y +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y @@ -216,17 +226,19 @@ CONFIG_NETFILTER_XT_TARGET_CONNMARK=m CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m CONFIG_NETFILTER_XT_TARGET_CT=m CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HL=m CONFIG_NETFILTER_XT_TARGET_HMARK=m CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m CONFIG_NETFILTER_XT_TARGET_LOG=m CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_NAT=m CONFIG_NETFILTER_XT_TARGET_NETMAP=m CONFIG_NETFILTER_XT_TARGET_NFLOG=m CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m CONFIG_NETFILTER_XT_TARGET_TEE=m CONFIG_NETFILTER_XT_TARGET_TPROXY=m -CONFIG_NETFILTER_XT_TARGET_TRACE=m CONFIG_NETFILTER_XT_TARGET_SECMARK=m CONFIG_NETFILTER_XT_TARGET_TCPMSS=m CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m @@ -312,16 +324,8 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_TTL=m -CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_FIB_IPV6=m CONFIG_IP6_NF_IPTABLES=m @@ -334,15 +338,9 @@ CONFIG_IP6_NF_MATCH_IPV6HEADER=m CONFIG_IP6_NF_MATCH_MH=m CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m -CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_IP6_NF_RAW=m -CONFIG_IP6_NF_SECURITY=m -CONFIG_IP6_NF_NAT=m -CONFIG_IP6_NF_TARGET_MASQUERADE=m CONFIG_NF_TABLES_BRIDGE=m +CONFIG_IP_SCTP=m CONFIG_RDS=m CONFIG_RDS_RDMA=m CONFIG_RDS_TCP=m @@ -376,6 +374,7 @@ CONFIG_NET_SCH_FQ_CODEL=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_PLUG=m CONFIG_NET_SCH_ETS=m +CONFIG_NET_SCH_DUALPI2=m CONFIG_NET_CLS_BASIC=m CONFIG_NET_CLS_ROUTE4=m CONFIG_NET_CLS_FW=m @@ -385,6 +384,9 @@ CONFIG_CLS_U32_MARK=y CONFIG_NET_CLS_FLOW=m CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m @@ -395,6 +397,9 @@ CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m CONFIG_NET_ACT_GATE=m CONFIG_NET_TC_SKB_EXT=y CONFIG_DNS_RESOLVER=y @@ -431,6 +436,7 @@ CONFIG_BLK_DEV_RAM_SIZE=32768 CONFIG_VIRTIO_BLK=y CONFIG_BLK_DEV_RBD=m CONFIG_BLK_DEV_NVME=m +CONFIG_BLK_DEV_NULL_BLK=m CONFIG_ENCLOSURE_SERVICES=m CONFIG_GENWQE=m CONFIG_RAID_ATTRS=m @@ -458,7 +464,9 @@ CONFIG_SCSI_DH_EMC=m CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y +CONFIG_MD_LLBITMAP=y # CONFIG_MD_BITMAP_FILE is not set +CONFIG_MD_LINEAR=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y @@ -490,6 +498,7 @@ CONFIG_DM_VDO=m CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=m +CONFIG_OVPN=m CONFIG_EQUALIZER=m CONFIG_IFB=m CONFIG_MACVLAN=m @@ -505,7 +514,6 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_ADAPTEC is not set # CONFIG_NET_VENDOR_AGERE is not set # CONFIG_NET_VENDOR_ALACRITECH is not set -# CONFIG_NET_VENDOR_ALTEON is not set # CONFIG_NET_VENDOR_AMAZON is not set # CONFIG_NET_VENDOR_AMD is not set # CONFIG_NET_VENDOR_AQUANTIA is not set @@ -533,6 +541,7 @@ CONFIG_NLMON=m CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_SF=y # CONFIG_NET_VENDOR_META is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set @@ -541,7 +550,6 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_MYRI is not set # CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NATSEMI is not set -# CONFIG_NET_VENDOR_NETERION is not set # CONFIG_NET_VENDOR_NETRONOME is not set # CONFIG_NET_VENDOR_NVIDIA is not set # CONFIG_NET_VENDOR_OKI is not set @@ -617,8 +625,17 @@ CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_MEM=m CONFIG_VIRTIO_INPUT=y +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_VDPA_SIM_NET=m +CONFIG_VDPA_SIM_BLOCK=m +CONFIG_VDPA_USER=m +CONFIG_MLX5_VDPA_NET=m +CONFIG_VP_VDPA=m CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m +CONFIG_VHOST_VDPA=m +CONFIG_DEV_DAX=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -628,19 +645,18 @@ CONFIG_JFS_POSIX_ACL=y CONFIG_JFS_SECURITY=y CONFIG_JFS_STATISTICS=y CONFIG_XFS_FS=y +CONFIG_XFS_SUPPORT_V4=y +CONFIG_XFS_SUPPORT_ASCII_CI=y CONFIG_XFS_QUOTA=y CONFIG_XFS_POSIX_ACL=y CONFIG_XFS_RT=y +# CONFIG_XFS_ONLINE_SCRUB is not set CONFIG_GFS2_FS=m CONFIG_GFS2_FS_LOCKING_DLM=y CONFIG_OCFS2_FS=m CONFIG_BTRFS_FS=y CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m -CONFIG_BCACHEFS_FS=m -CONFIG_BCACHEFS_QUOTA=y -CONFIG_BCACHEFS_POSIX_ACL=y -CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y CONFIG_FS_VERITY=y @@ -672,7 +688,6 @@ CONFIG_TMPFS_POSIX_ACL=y CONFIG_TMPFS_INODE64=y CONFIG_TMPFS_QUOTA=y CONFIG_HUGETLBFS=y -CONFIG_CONFIGFS_FS=m CONFIG_ECRYPT_FS=m CONFIG_CRAMFS=m CONFIG_SQUASHFS=m @@ -710,6 +725,7 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_UNICODE=y CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m CONFIG_KEY_NOTIFICATIONS=y CONFIG_SECURITY=y @@ -725,24 +741,22 @@ CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y -CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_FIPS=y CONFIG_CRYPTO_USER=m -# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set +CONFIG_CRYPTO_SELFTESTS=y +CONFIG_CRYPTO_NULL=y CONFIG_CRYPTO_PCRYPT=m CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_BENCHMARK=m CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m -CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_AES=m CONFIG_CRYPTO_ANUBIS=m CONFIG_CRYPTO_ARIA=m CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_DES=m @@ -759,14 +773,13 @@ CONFIG_CRYPTO_HCTR2=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_GCM=y CONFIG_CRYPTO_SEQIV=y CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_SM3_GENERIC=m +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_CRC32=m @@ -774,21 +787,12 @@ CONFIG_CRYPTO_842=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m CONFIG_CRYPTO_ZSTD=m -CONFIG_CRYPTO_ANSI_CPRNG=m CONFIG_CRYPTO_JITTERENTROPY_OSR=1 CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA512_S390=m -CONFIG_CRYPTO_SHA1_S390=m -CONFIG_CRYPTO_SHA256_S390=m -CONFIG_CRYPTO_SHA3_256_S390=m -CONFIG_CRYPTO_SHA3_512_S390=m -CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m -CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_CHACHA_S390=m CONFIG_CRYPTO_HMAC_S390=m CONFIG_ZCRYPT=m CONFIG_PKEY=m @@ -797,15 +801,13 @@ CONFIG_PKEY_EP11=m CONFIG_PKEY_PCKMO=m CONFIG_PKEY_UV=m CONFIG_CRYPTO_PAES_S390=m +CONFIG_CRYPTO_PHMAC_S390=m CONFIG_CRYPTO_DEV_VIRTIO=m CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_CRYPTO_KRB5=m +CONFIG_CRYPTO_KRB5_SELFTESTS=y CONFIG_CORDIC=m CONFIG_PRIME_NUMBERS=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m -CONFIG_CRC4=m -CONFIG_CRC7=m -CONFIG_CRC8=m CONFIG_XZ_DEC_MICROLZMA=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=0 @@ -826,6 +828,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y CONFIG_BOOTTIME_TRACING=y CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FUNCTION_GRAPH_RETADDR=y CONFIG_FPROBE=y CONFIG_FUNCTION_PROFILER=y CONFIG_STACK_TRACER=y diff --git a/arch/s390/configs/mmtypes.config b/arch/s390/configs/mmtypes.config new file mode 100644 index 000000000000..fe32b442d789 --- /dev/null +++ b/arch/s390/configs/mmtypes.config @@ -0,0 +1,2 @@ +# Help: Enable strict memory management typechecks +CONFIG_STRICT_MM_TYPECHECKS=y diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index bcbaa069de96..b5478267d6a7 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -1,5 +1,6 @@ CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_POSIX_AUX_CLOCKS=y CONFIG_BPF_SYSCALL=y # CONFIG_CPU_ISOLATION is not set # CONFIG_UTS_NS is not set @@ -11,7 +12,7 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KEXEC=y CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=2 -CONFIG_HZ_100=y +CONFIG_HZ_1000=y # CONFIG_CHSC_SCH is not set # CONFIG_SCM_BUS is not set # CONFIG_AP is not set @@ -32,7 +33,6 @@ CONFIG_NET=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_SAFE=y CONFIG_BLK_DEV_RAM=y -# CONFIG_DCSSBLK is not set # CONFIG_DASD is not set CONFIG_ENCLOSURE_SERVICES=y CONFIG_SCSI=y @@ -62,7 +62,6 @@ CONFIG_ZFCP=y # CONFIG_INOTIFY_USER is not set # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set -CONFIG_LSM="yama,loadpin,safesetid,integrity" # CONFIG_ZLIB_DFLTCC is not set CONFIG_XZ_DEC_MICROLZMA=y CONFIG_PRINTK_TIME=y @@ -71,7 +70,6 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_INFO_DWARF4=y CONFIG_DEBUG_FS=y CONFIG_PANIC_ON_OOPS=y -# CONFIG_SCHED_DEBUG is not set CONFIG_RCU_CPU_STALL_TIMEOUT=60 # CONFIG_RCU_TRACE is not set # CONFIG_FTRACE is not set diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig index b760232537f1..00051d27db95 100644 --- a/arch/s390/crypto/Kconfig +++ b/arch/s390/crypto/Kconfig @@ -2,79 +2,10 @@ menu "Accelerated Cryptographic Algorithms for CPU (s390)" -config CRYPTO_SHA512_S390 - tristate "Hash functions: SHA-384 and SHA-512" - depends on S390 - select CRYPTO_HASH - help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180) - - Architecture: s390 - - It is available as of z10. - -config CRYPTO_SHA1_S390 - tristate "Hash functions: SHA-1" - depends on S390 - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: s390 - - It is available as of z990. - -config CRYPTO_SHA256_S390 - tristate "Hash functions: SHA-224 and SHA-256" - depends on S390 - select CRYPTO_HASH - help - SHA-224 and SHA-256 secure hash algorithms (FIPS 180) - - Architecture: s390 - - It is available as of z9. - -config CRYPTO_SHA3_256_S390 - tristate "Hash functions: SHA3-224 and SHA3-256" - depends on S390 - select CRYPTO_HASH - help - SHA3-224 and SHA3-256 secure hash algorithms (FIPS 202) - - Architecture: s390 - - It is available as of z14. - -config CRYPTO_SHA3_512_S390 - tristate "Hash functions: SHA3-384 and SHA3-512" - depends on S390 - select CRYPTO_HASH - help - SHA3-384 and SHA3-512 secure hash algorithms (FIPS 202) - - Architecture: s390 - - It is available as of z14. - -config CRYPTO_GHASH_S390 - tristate "Hash functions: GHASH" - depends on S390 - select CRYPTO_HASH - help - GCM GHASH hash function (NIST SP800-38D) - - Architecture: s390 - - It is available as of z196. - config CRYPTO_AES_S390 tristate "Ciphers: AES, modes: ECB, CBC, CTR, XTS, GCM" - depends on S390 - select CRYPTO_ALGAPI select CRYPTO_SKCIPHER help - Block cipher: AES cipher algorithms (FIPS 197) AEAD cipher: AES with GCM Length-preserving ciphers: AES with ECB, CBC, XTS, and CTR modes @@ -90,39 +21,8 @@ config CRYPTO_AES_S390 key sizes and XTS mode is hardware accelerated for 256 and 512 bit keys. -config CRYPTO_DES_S390 - tristate "Ciphers: DES and Triple DES EDE, modes: ECB, CBC, CTR" - depends on S390 - select CRYPTO_ALGAPI - select CRYPTO_SKCIPHER - select CRYPTO_LIB_DES - help - Block ciphers: DES (FIPS 46-2) cipher algorithm - Block ciphers: Triple DES EDE (FIPS 46-3) cipher algorithm - Length-preserving ciphers: DES with ECB, CBC, and CTR modes - Length-preserving ciphers: Triple DES EDED with ECB, CBC, and CTR modes - - Architecture: s390 - - As of z990 the ECB and CBC mode are hardware accelerated. - As of z196 the CTR mode is hardware accelerated. - -config CRYPTO_CHACHA_S390 - tristate "Ciphers: ChaCha20" - depends on S390 - select CRYPTO_SKCIPHER - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - help - Length-preserving cipher: ChaCha20 stream cipher (RFC 7539) - - Architecture: s390 - - It is available as of z13. - config CRYPTO_HMAC_S390 tristate "Keyed-hash message authentication code: HMAC" - depends on S390 select CRYPTO_HASH help s390 specific HMAC hardware support for SHA224, SHA256, SHA384 and diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 14dafadbcbed..48aeb0c0ffbd 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -3,18 +3,9 @@ # Cryptographic API # -obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_SHA512_S390) += sha512_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_SHA3_256_S390) += sha3_256_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o -obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o obj-$(CONFIG_S390_PRNG) += prng.o -obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o obj-$(CONFIG_CRYPTO_HMAC_S390) += hmac_s390.o +obj-$(CONFIG_CRYPTO_PHMAC_S390) += phmac_s390.o obj-y += arch_random.o - -chacha_s390-y := chacha-glue.o chacha-s390.o diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 9c46b1b630b1..62edc66d5478 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -14,14 +14,12 @@ * Derived from "crypto/aes_generic.c" */ -#define KMSG_COMPONENT "aes_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "aes_s390: " fmt #include <crypto/aes.h> #include <crypto/algapi.h> #include <crypto/ghash.h> #include <crypto/internal/aead.h> -#include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/err.h> @@ -46,7 +44,6 @@ struct s390_aes_ctx { unsigned long fc; union { struct crypto_skcipher *skcipher; - struct crypto_cipher *cip; } fallback; }; @@ -66,7 +63,6 @@ struct s390_xts_ctx { struct gcm_sg_walk { struct scatter_walk walk; unsigned int walk_bytes; - u8 *walk_ptr; unsigned int walk_bytes_remain; u8 buf[AES_BLOCK_SIZE]; unsigned int buf_bytes; @@ -74,109 +70,6 @@ struct gcm_sg_walk { unsigned int nbytes; }; -static int setkey_fallback_cip(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len) -{ - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - - sctx->fallback.cip->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; - sctx->fallback.cip->base.crt_flags |= (tfm->crt_flags & - CRYPTO_TFM_REQ_MASK); - - return crypto_cipher_setkey(sctx->fallback.cip, in_key, key_len); -} - -static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len) -{ - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - unsigned long fc; - - /* Pick the correct function code based on the key length */ - fc = (key_len == 16) ? CPACF_KM_AES_128 : - (key_len == 24) ? CPACF_KM_AES_192 : - (key_len == 32) ? CPACF_KM_AES_256 : 0; - - /* Check if the function code is available */ - sctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; - if (!sctx->fc) - return setkey_fallback_cip(tfm, in_key, key_len); - - sctx->key_len = key_len; - memcpy(sctx->key, in_key, key_len); - return 0; -} - -static void crypto_aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) -{ - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - - if (unlikely(!sctx->fc)) { - crypto_cipher_encrypt_one(sctx->fallback.cip, out, in); - return; - } - cpacf_km(sctx->fc, &sctx->key, out, in, AES_BLOCK_SIZE); -} - -static void crypto_aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) -{ - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - - if (unlikely(!sctx->fc)) { - crypto_cipher_decrypt_one(sctx->fallback.cip, out, in); - return; - } - cpacf_km(sctx->fc | CPACF_DECRYPT, - &sctx->key, out, in, AES_BLOCK_SIZE); -} - -static int fallback_init_cip(struct crypto_tfm *tfm) -{ - const char *name = tfm->__crt_alg->cra_name; - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - - sctx->fallback.cip = crypto_alloc_cipher(name, 0, - CRYPTO_ALG_NEED_FALLBACK); - - if (IS_ERR(sctx->fallback.cip)) { - pr_err("Allocating AES fallback algorithm %s failed\n", - name); - return PTR_ERR(sctx->fallback.cip); - } - - return 0; -} - -static void fallback_exit_cip(struct crypto_tfm *tfm) -{ - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - - crypto_free_cipher(sctx->fallback.cip); - sctx->fallback.cip = NULL; -} - -static struct crypto_alg aes_alg = { - .cra_name = "aes", - .cra_driver_name = "aes-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER | - CRYPTO_ALG_NEED_FALLBACK, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_aes_ctx), - .cra_module = THIS_MODULE, - .cra_init = fallback_init_cip, - .cra_exit = fallback_exit_cip, - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, - .cia_encrypt = crypto_aes_encrypt, - .cia_decrypt = crypto_aes_decrypt, - } - } -}; - static int setkey_fallback_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int len) { @@ -787,29 +680,20 @@ static void gcm_walk_start(struct gcm_sg_walk *gw, struct scatterlist *sg, static inline unsigned int _gcm_sg_clamp_and_map(struct gcm_sg_walk *gw) { - struct scatterlist *nextsg; - - gw->walk_bytes = scatterwalk_clamp(&gw->walk, gw->walk_bytes_remain); - while (!gw->walk_bytes) { - nextsg = sg_next(gw->walk.sg); - if (!nextsg) - return 0; - scatterwalk_start(&gw->walk, nextsg); - gw->walk_bytes = scatterwalk_clamp(&gw->walk, - gw->walk_bytes_remain); - } - gw->walk_ptr = scatterwalk_map(&gw->walk); + if (gw->walk_bytes_remain == 0) + return 0; + gw->walk_bytes = scatterwalk_next(&gw->walk, gw->walk_bytes_remain); return gw->walk_bytes; } static inline void _gcm_sg_unmap_and_advance(struct gcm_sg_walk *gw, - unsigned int nbytes) + unsigned int nbytes, bool out) { gw->walk_bytes_remain -= nbytes; - scatterwalk_unmap(gw->walk_ptr); - scatterwalk_advance(&gw->walk, nbytes); - scatterwalk_done(&gw->walk, 0, gw->walk_bytes_remain); - gw->walk_ptr = NULL; + if (out) + scatterwalk_done_dst(&gw->walk, nbytes); + else + scatterwalk_done_src(&gw->walk, nbytes); } static int gcm_in_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded) @@ -835,16 +719,16 @@ static int gcm_in_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded) } if (!gw->buf_bytes && gw->walk_bytes >= minbytesneeded) { - gw->ptr = gw->walk_ptr; + gw->ptr = gw->walk.addr; gw->nbytes = gw->walk_bytes; goto out; } while (1) { n = min(gw->walk_bytes, AES_BLOCK_SIZE - gw->buf_bytes); - memcpy(gw->buf + gw->buf_bytes, gw->walk_ptr, n); + memcpy(gw->buf + gw->buf_bytes, gw->walk.addr, n); gw->buf_bytes += n; - _gcm_sg_unmap_and_advance(gw, n); + _gcm_sg_unmap_and_advance(gw, n, false); if (gw->buf_bytes >= minbytesneeded) { gw->ptr = gw->buf; gw->nbytes = gw->buf_bytes; @@ -876,13 +760,12 @@ static int gcm_out_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded) } if (gw->walk_bytes >= minbytesneeded) { - gw->ptr = gw->walk_ptr; + gw->ptr = gw->walk.addr; gw->nbytes = gw->walk_bytes; goto out; } - scatterwalk_unmap(gw->walk_ptr); - gw->walk_ptr = NULL; + scatterwalk_unmap(&gw->walk); gw->ptr = gw->buf; gw->nbytes = sizeof(gw->buf); @@ -904,7 +787,7 @@ static int gcm_in_walk_done(struct gcm_sg_walk *gw, unsigned int bytesdone) } else gw->buf_bytes = 0; } else - _gcm_sg_unmap_and_advance(gw, bytesdone); + _gcm_sg_unmap_and_advance(gw, bytesdone, false); return bytesdone; } @@ -921,11 +804,11 @@ static int gcm_out_walk_done(struct gcm_sg_walk *gw, unsigned int bytesdone) if (!_gcm_sg_clamp_and_map(gw)) return i; n = min(gw->walk_bytes, bytesdone - i); - memcpy(gw->walk_ptr, gw->buf + i, n); - _gcm_sg_unmap_and_advance(gw, n); + memcpy(gw->walk.addr, gw->buf + i, n); + _gcm_sg_unmap_and_advance(gw, n, true); } } else - _gcm_sg_unmap_and_advance(gw, bytesdone); + _gcm_sg_unmap_and_advance(gw, bytesdone, true); return bytesdone; } @@ -1061,7 +944,6 @@ static struct aead_alg gcm_aes_aead = { }, }; -static struct crypto_alg *aes_s390_alg; static struct skcipher_alg *aes_s390_skcipher_algs[5]; static int aes_s390_skciphers_num; static struct aead_alg *aes_s390_aead_alg; @@ -1078,8 +960,6 @@ static int aes_s390_register_skcipher(struct skcipher_alg *alg) static void aes_s390_fini(void) { - if (aes_s390_alg) - crypto_unregister_alg(aes_s390_alg); while (aes_s390_skciphers_num--) crypto_unregister_skcipher(aes_s390_skcipher_algs[aes_s390_skciphers_num]); if (ctrblk) @@ -1102,10 +982,6 @@ static int __init aes_s390_init(void) if (cpacf_test_func(&km_functions, CPACF_KM_AES_128) || cpacf_test_func(&km_functions, CPACF_KM_AES_192) || cpacf_test_func(&km_functions, CPACF_KM_AES_256)) { - ret = crypto_register_alg(&aes_alg); - if (ret) - goto out_err; - aes_s390_alg = &aes_alg; ret = aes_s390_register_skcipher(&ecb_aes_alg); if (ret) goto out_err; @@ -1168,4 +1044,3 @@ MODULE_ALIAS_CRYPTO("aes-all"); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); MODULE_LICENSE("GPL"); -MODULE_IMPORT_NS("CRYPTO_INTERNAL"); diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c index a8a2407381af..083e8d5eada2 100644 --- a/arch/s390/crypto/arch_random.c +++ b/arch/s390/crypto/arch_random.c @@ -6,6 +6,7 @@ * Author(s): Harald Freudenberger */ +#include <linux/export.h> #include <linux/kernel.h> #include <linux/atomic.h> #include <linux/random.h> diff --git a/arch/s390/crypto/chacha-glue.c b/arch/s390/crypto/chacha-glue.c deleted file mode 100644 index f8b0c52e77a4..000000000000 --- a/arch/s390/crypto/chacha-glue.c +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * s390 ChaCha stream cipher. - * - * Copyright IBM Corp. 2021 - */ - -#define KMSG_COMPONENT "chacha_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <crypto/internal/chacha.h> -#include <crypto/internal/skcipher.h> -#include <crypto/algapi.h> -#include <linux/cpufeature.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <asm/fpu.h> -#include "chacha-s390.h" - -static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src, - unsigned int nbytes, const u32 *key, - u32 *counter) -{ - DECLARE_KERNEL_FPU_ONSTACK32(vxstate); - - kernel_fpu_begin(&vxstate, KERNEL_VXR); - chacha20_vx(dst, src, nbytes, key, counter); - kernel_fpu_end(&vxstate, KERNEL_VXR); - - *counter += round_up(nbytes, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; -} - -static int chacha20_s390(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 state[CHACHA_STATE_WORDS] __aligned(16); - struct skcipher_walk walk; - unsigned int nbytes; - int rc; - - rc = skcipher_walk_virt(&walk, req, false); - chacha_init_generic(state, ctx->key, req->iv); - - while (walk.nbytes > 0) { - nbytes = walk.nbytes; - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - if (nbytes <= CHACHA_BLOCK_SIZE) { - chacha_crypt_generic(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); - } else { - chacha20_crypt_s390(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - &state[4], &state[12]); - } - rc = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - return rc; -} - -void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) -{ - /* TODO: implement hchacha_block_arch() in assembly */ - hchacha_block_generic(state, stream, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) -{ - chacha_init_generic(state, key, iv); -} -EXPORT_SYMBOL(chacha_init_arch); - -void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - /* s390 chacha20 implementation has 20 rounds hard-coded, - * it cannot handle a block of data or less, but otherwise - * it can handle data of arbitrary size - */ - if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) - chacha_crypt_generic(state, dst, src, bytes, nrounds); - else - chacha20_crypt_s390(state, dst, src, bytes, - &state[4], &state[12]); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static struct skcipher_alg chacha_algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-s390", - .base.cra_priority = 900, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = chacha20_s390, - .decrypt = chacha20_s390, - } -}; - -static int __init chacha_mod_init(void) -{ - return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? - crypto_register_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)) : 0; -} - -static void __exit chacha_mod_fini(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) - crypto_unregister_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)); -} - -module_cpu_feature_match(S390_CPU_FEATURE_VXRS, chacha_mod_init); -module_exit(chacha_mod_fini); - -MODULE_DESCRIPTION("ChaCha20 stream cipher"); -MODULE_LICENSE("GPL v2"); - -MODULE_ALIAS_CRYPTO("chacha20"); diff --git a/arch/s390/crypto/chacha-s390.S b/arch/s390/crypto/chacha-s390.S deleted file mode 100644 index 63f3102678c0..000000000000 --- a/arch/s390/crypto/chacha-s390.S +++ /dev/null @@ -1,908 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Original implementation written by Andy Polyakov, @dot-asm. - * This is an adaptation of the original code for kernel use. - * - * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. - */ - -#include <linux/linkage.h> -#include <asm/nospec-insn.h> -#include <asm/fpu-insn.h> - -#define SP %r15 -#define FRAME (16 * 8 + 4 * 8) - - .data - .balign 32 - -SYM_DATA_START_LOCAL(sigma) - .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral - .long 1,0,0,0 - .long 2,0,0,0 - .long 3,0,0,0 - .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap - - .long 0,1,2,3 - .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma - .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e - .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 - .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 -SYM_DATA_END(sigma) - - .previous - - GEN_BR_THUNK %r14 - - .text - -############################################################################# -# void chacha20_vx_4x(u8 *out, counst u8 *inp, size_t len, -# counst u32 *key, const u32 *counter) - -#define OUT %r2 -#define INP %r3 -#define LEN %r4 -#define KEY %r5 -#define COUNTER %r6 - -#define BEPERM %v31 -#define CTR %v26 - -#define K0 %v16 -#define K1 %v17 -#define K2 %v18 -#define K3 %v19 - -#define XA0 %v0 -#define XA1 %v1 -#define XA2 %v2 -#define XA3 %v3 - -#define XB0 %v4 -#define XB1 %v5 -#define XB2 %v6 -#define XB3 %v7 - -#define XC0 %v8 -#define XC1 %v9 -#define XC2 %v10 -#define XC3 %v11 - -#define XD0 %v12 -#define XD1 %v13 -#define XD2 %v14 -#define XD3 %v15 - -#define XT0 %v27 -#define XT1 %v28 -#define XT2 %v29 -#define XT3 %v30 - -SYM_FUNC_START(chacha20_vx_4x) - stmg %r6,%r7,6*8(SP) - - larl %r7,sigma - lhi %r0,10 - lhi %r1,0 - - VL K0,0,,%r7 # load sigma - VL K1,0,,KEY # load key - VL K2,16,,KEY - VL K3,0,,COUNTER # load counter - - VL BEPERM,0x40,,%r7 - VL CTR,0x50,,%r7 - - VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma - - VREPF XB0,K1,0 # smash the key - VREPF XB1,K1,1 - VREPF XB2,K1,2 - VREPF XB3,K1,3 - - VREPF XD0,K3,0 - VREPF XD1,K3,1 - VREPF XD2,K3,2 - VREPF XD3,K3,3 - VAF XD0,XD0,CTR - - VREPF XC0,K2,0 - VREPF XC1,K2,1 - VREPF XC2,K2,2 - VREPF XC3,K2,3 - -.Loop_4x: - VAF XA0,XA0,XB0 - VX XD0,XD0,XA0 - VERLLF XD0,XD0,16 - - VAF XA1,XA1,XB1 - VX XD1,XD1,XA1 - VERLLF XD1,XD1,16 - - VAF XA2,XA2,XB2 - VX XD2,XD2,XA2 - VERLLF XD2,XD2,16 - - VAF XA3,XA3,XB3 - VX XD3,XD3,XA3 - VERLLF XD3,XD3,16 - - VAF XC0,XC0,XD0 - VX XB0,XB0,XC0 - VERLLF XB0,XB0,12 - - VAF XC1,XC1,XD1 - VX XB1,XB1,XC1 - VERLLF XB1,XB1,12 - - VAF XC2,XC2,XD2 - VX XB2,XB2,XC2 - VERLLF XB2,XB2,12 - - VAF XC3,XC3,XD3 - VX XB3,XB3,XC3 - VERLLF XB3,XB3,12 - - VAF XA0,XA0,XB0 - VX XD0,XD0,XA0 - VERLLF XD0,XD0,8 - - VAF XA1,XA1,XB1 - VX XD1,XD1,XA1 - VERLLF XD1,XD1,8 - - VAF XA2,XA2,XB2 - VX XD2,XD2,XA2 - VERLLF XD2,XD2,8 - - VAF XA3,XA3,XB3 - VX XD3,XD3,XA3 - VERLLF XD3,XD3,8 - - VAF XC0,XC0,XD0 - VX XB0,XB0,XC0 - VERLLF XB0,XB0,7 - - VAF XC1,XC1,XD1 - VX XB1,XB1,XC1 - VERLLF XB1,XB1,7 - - VAF XC2,XC2,XD2 - VX XB2,XB2,XC2 - VERLLF XB2,XB2,7 - - VAF XC3,XC3,XD3 - VX XB3,XB3,XC3 - VERLLF XB3,XB3,7 - - VAF XA0,XA0,XB1 - VX XD3,XD3,XA0 - VERLLF XD3,XD3,16 - - VAF XA1,XA1,XB2 - VX XD0,XD0,XA1 - VERLLF XD0,XD0,16 - - VAF XA2,XA2,XB3 - VX XD1,XD1,XA2 - VERLLF XD1,XD1,16 - - VAF XA3,XA3,XB0 - VX XD2,XD2,XA3 - VERLLF XD2,XD2,16 - - VAF XC2,XC2,XD3 - VX XB1,XB1,XC2 - VERLLF XB1,XB1,12 - - VAF XC3,XC3,XD0 - VX XB2,XB2,XC3 - VERLLF XB2,XB2,12 - - VAF XC0,XC0,XD1 - VX XB3,XB3,XC0 - VERLLF XB3,XB3,12 - - VAF XC1,XC1,XD2 - VX XB0,XB0,XC1 - VERLLF XB0,XB0,12 - - VAF XA0,XA0,XB1 - VX XD3,XD3,XA0 - VERLLF XD3,XD3,8 - - VAF XA1,XA1,XB2 - VX XD0,XD0,XA1 - VERLLF XD0,XD0,8 - - VAF XA2,XA2,XB3 - VX XD1,XD1,XA2 - VERLLF XD1,XD1,8 - - VAF XA3,XA3,XB0 - VX XD2,XD2,XA3 - VERLLF XD2,XD2,8 - - VAF XC2,XC2,XD3 - VX XB1,XB1,XC2 - VERLLF XB1,XB1,7 - - VAF XC3,XC3,XD0 - VX XB2,XB2,XC3 - VERLLF XB2,XB2,7 - - VAF XC0,XC0,XD1 - VX XB3,XB3,XC0 - VERLLF XB3,XB3,7 - - VAF XC1,XC1,XD2 - VX XB0,XB0,XC1 - VERLLF XB0,XB0,7 - brct %r0,.Loop_4x - - VAF XD0,XD0,CTR - - VMRHF XT0,XA0,XA1 # transpose data - VMRHF XT1,XA2,XA3 - VMRLF XT2,XA0,XA1 - VMRLF XT3,XA2,XA3 - VPDI XA0,XT0,XT1,0b0000 - VPDI XA1,XT0,XT1,0b0101 - VPDI XA2,XT2,XT3,0b0000 - VPDI XA3,XT2,XT3,0b0101 - - VMRHF XT0,XB0,XB1 - VMRHF XT1,XB2,XB3 - VMRLF XT2,XB0,XB1 - VMRLF XT3,XB2,XB3 - VPDI XB0,XT0,XT1,0b0000 - VPDI XB1,XT0,XT1,0b0101 - VPDI XB2,XT2,XT3,0b0000 - VPDI XB3,XT2,XT3,0b0101 - - VMRHF XT0,XC0,XC1 - VMRHF XT1,XC2,XC3 - VMRLF XT2,XC0,XC1 - VMRLF XT3,XC2,XC3 - VPDI XC0,XT0,XT1,0b0000 - VPDI XC1,XT0,XT1,0b0101 - VPDI XC2,XT2,XT3,0b0000 - VPDI XC3,XT2,XT3,0b0101 - - VMRHF XT0,XD0,XD1 - VMRHF XT1,XD2,XD3 - VMRLF XT2,XD0,XD1 - VMRLF XT3,XD2,XD3 - VPDI XD0,XT0,XT1,0b0000 - VPDI XD1,XT0,XT1,0b0101 - VPDI XD2,XT2,XT3,0b0000 - VPDI XD3,XT2,XT3,0b0101 - - VAF XA0,XA0,K0 - VAF XB0,XB0,K1 - VAF XC0,XC0,K2 - VAF XD0,XD0,K3 - - VPERM XA0,XA0,XA0,BEPERM - VPERM XB0,XB0,XB0,BEPERM - VPERM XC0,XC0,XC0,BEPERM - VPERM XD0,XD0,XD0,BEPERM - - VLM XT0,XT3,0,INP,0 - - VX XT0,XT0,XA0 - VX XT1,XT1,XB0 - VX XT2,XT2,XC0 - VX XT3,XT3,XD0 - - VSTM XT0,XT3,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - - VAF XA0,XA1,K0 - VAF XB0,XB1,K1 - VAF XC0,XC1,K2 - VAF XD0,XD1,K3 - - VPERM XA0,XA0,XA0,BEPERM - VPERM XB0,XB0,XB0,BEPERM - VPERM XC0,XC0,XC0,BEPERM - VPERM XD0,XD0,XD0,BEPERM - - clgfi LEN,0x40 - jl .Ltail_4x - - VLM XT0,XT3,0,INP,0 - - VX XT0,XT0,XA0 - VX XT1,XT1,XB0 - VX XT2,XT2,XC0 - VX XT3,XT3,XD0 - - VSTM XT0,XT3,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_4x - - VAF XA0,XA2,K0 - VAF XB0,XB2,K1 - VAF XC0,XC2,K2 - VAF XD0,XD2,K3 - - VPERM XA0,XA0,XA0,BEPERM - VPERM XB0,XB0,XB0,BEPERM - VPERM XC0,XC0,XC0,BEPERM - VPERM XD0,XD0,XD0,BEPERM - - clgfi LEN,0x40 - jl .Ltail_4x - - VLM XT0,XT3,0,INP,0 - - VX XT0,XT0,XA0 - VX XT1,XT1,XB0 - VX XT2,XT2,XC0 - VX XT3,XT3,XD0 - - VSTM XT0,XT3,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_4x - - VAF XA0,XA3,K0 - VAF XB0,XB3,K1 - VAF XC0,XC3,K2 - VAF XD0,XD3,K3 - - VPERM XA0,XA0,XA0,BEPERM - VPERM XB0,XB0,XB0,BEPERM - VPERM XC0,XC0,XC0,BEPERM - VPERM XD0,XD0,XD0,BEPERM - - clgfi LEN,0x40 - jl .Ltail_4x - - VLM XT0,XT3,0,INP,0 - - VX XT0,XT0,XA0 - VX XT1,XT1,XB0 - VX XT2,XT2,XC0 - VX XT3,XT3,XD0 - - VSTM XT0,XT3,0,OUT,0 - -.Ldone_4x: - lmg %r6,%r7,6*8(SP) - BR_EX %r14 - -.Ltail_4x: - VLR XT0,XC0 - VLR XT1,XD0 - - VST XA0,8*8+0x00,,SP - VST XB0,8*8+0x10,,SP - VST XT0,8*8+0x20,,SP - VST XT1,8*8+0x30,,SP - - lghi %r1,0 - -.Loop_tail_4x: - llgc %r5,0(%r1,INP) - llgc %r6,8*8(%r1,SP) - xr %r6,%r5 - stc %r6,0(%r1,OUT) - la %r1,1(%r1) - brct LEN,.Loop_tail_4x - - lmg %r6,%r7,6*8(SP) - BR_EX %r14 -SYM_FUNC_END(chacha20_vx_4x) - -#undef OUT -#undef INP -#undef LEN -#undef KEY -#undef COUNTER - -#undef BEPERM - -#undef K0 -#undef K1 -#undef K2 -#undef K3 - - -############################################################################# -# void chacha20_vx(u8 *out, counst u8 *inp, size_t len, -# counst u32 *key, const u32 *counter) - -#define OUT %r2 -#define INP %r3 -#define LEN %r4 -#define KEY %r5 -#define COUNTER %r6 - -#define BEPERM %v31 - -#define K0 %v27 -#define K1 %v24 -#define K2 %v25 -#define K3 %v26 - -#define A0 %v0 -#define B0 %v1 -#define C0 %v2 -#define D0 %v3 - -#define A1 %v4 -#define B1 %v5 -#define C1 %v6 -#define D1 %v7 - -#define A2 %v8 -#define B2 %v9 -#define C2 %v10 -#define D2 %v11 - -#define A3 %v12 -#define B3 %v13 -#define C3 %v14 -#define D3 %v15 - -#define A4 %v16 -#define B4 %v17 -#define C4 %v18 -#define D4 %v19 - -#define A5 %v20 -#define B5 %v21 -#define C5 %v22 -#define D5 %v23 - -#define T0 %v27 -#define T1 %v28 -#define T2 %v29 -#define T3 %v30 - -SYM_FUNC_START(chacha20_vx) - clgfi LEN,256 - jle chacha20_vx_4x - stmg %r6,%r7,6*8(SP) - - lghi %r1,-FRAME - lgr %r0,SP - la SP,0(%r1,SP) - stg %r0,0(SP) # back-chain - - larl %r7,sigma - lhi %r0,10 - - VLM K1,K2,0,KEY,0 # load key - VL K3,0,,COUNTER # load counter - - VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ... - -.Loop_outer_vx: - VLR A0,K0 - VLR B0,K1 - VLR A1,K0 - VLR B1,K1 - VLR A2,K0 - VLR B2,K1 - VLR A3,K0 - VLR B3,K1 - VLR A4,K0 - VLR B4,K1 - VLR A5,K0 - VLR B5,K1 - - VLR D0,K3 - VAF D1,K3,T1 # K[3]+1 - VAF D2,K3,T2 # K[3]+2 - VAF D3,K3,T3 # K[3]+3 - VAF D4,D2,T2 # K[3]+4 - VAF D5,D2,T3 # K[3]+5 - - VLR C0,K2 - VLR C1,K2 - VLR C2,K2 - VLR C3,K2 - VLR C4,K2 - VLR C5,K2 - - VLR T1,D1 - VLR T2,D2 - VLR T3,D3 - -.Loop_vx: - VAF A0,A0,B0 - VAF A1,A1,B1 - VAF A2,A2,B2 - VAF A3,A3,B3 - VAF A4,A4,B4 - VAF A5,A5,B5 - VX D0,D0,A0 - VX D1,D1,A1 - VX D2,D2,A2 - VX D3,D3,A3 - VX D4,D4,A4 - VX D5,D5,A5 - VERLLF D0,D0,16 - VERLLF D1,D1,16 - VERLLF D2,D2,16 - VERLLF D3,D3,16 - VERLLF D4,D4,16 - VERLLF D5,D5,16 - - VAF C0,C0,D0 - VAF C1,C1,D1 - VAF C2,C2,D2 - VAF C3,C3,D3 - VAF C4,C4,D4 - VAF C5,C5,D5 - VX B0,B0,C0 - VX B1,B1,C1 - VX B2,B2,C2 - VX B3,B3,C3 - VX B4,B4,C4 - VX B5,B5,C5 - VERLLF B0,B0,12 - VERLLF B1,B1,12 - VERLLF B2,B2,12 - VERLLF B3,B3,12 - VERLLF B4,B4,12 - VERLLF B5,B5,12 - - VAF A0,A0,B0 - VAF A1,A1,B1 - VAF A2,A2,B2 - VAF A3,A3,B3 - VAF A4,A4,B4 - VAF A5,A5,B5 - VX D0,D0,A0 - VX D1,D1,A1 - VX D2,D2,A2 - VX D3,D3,A3 - VX D4,D4,A4 - VX D5,D5,A5 - VERLLF D0,D0,8 - VERLLF D1,D1,8 - VERLLF D2,D2,8 - VERLLF D3,D3,8 - VERLLF D4,D4,8 - VERLLF D5,D5,8 - - VAF C0,C0,D0 - VAF C1,C1,D1 - VAF C2,C2,D2 - VAF C3,C3,D3 - VAF C4,C4,D4 - VAF C5,C5,D5 - VX B0,B0,C0 - VX B1,B1,C1 - VX B2,B2,C2 - VX B3,B3,C3 - VX B4,B4,C4 - VX B5,B5,C5 - VERLLF B0,B0,7 - VERLLF B1,B1,7 - VERLLF B2,B2,7 - VERLLF B3,B3,7 - VERLLF B4,B4,7 - VERLLF B5,B5,7 - - VSLDB C0,C0,C0,8 - VSLDB C1,C1,C1,8 - VSLDB C2,C2,C2,8 - VSLDB C3,C3,C3,8 - VSLDB C4,C4,C4,8 - VSLDB C5,C5,C5,8 - VSLDB B0,B0,B0,4 - VSLDB B1,B1,B1,4 - VSLDB B2,B2,B2,4 - VSLDB B3,B3,B3,4 - VSLDB B4,B4,B4,4 - VSLDB B5,B5,B5,4 - VSLDB D0,D0,D0,12 - VSLDB D1,D1,D1,12 - VSLDB D2,D2,D2,12 - VSLDB D3,D3,D3,12 - VSLDB D4,D4,D4,12 - VSLDB D5,D5,D5,12 - - VAF A0,A0,B0 - VAF A1,A1,B1 - VAF A2,A2,B2 - VAF A3,A3,B3 - VAF A4,A4,B4 - VAF A5,A5,B5 - VX D0,D0,A0 - VX D1,D1,A1 - VX D2,D2,A2 - VX D3,D3,A3 - VX D4,D4,A4 - VX D5,D5,A5 - VERLLF D0,D0,16 - VERLLF D1,D1,16 - VERLLF D2,D2,16 - VERLLF D3,D3,16 - VERLLF D4,D4,16 - VERLLF D5,D5,16 - - VAF C0,C0,D0 - VAF C1,C1,D1 - VAF C2,C2,D2 - VAF C3,C3,D3 - VAF C4,C4,D4 - VAF C5,C5,D5 - VX B0,B0,C0 - VX B1,B1,C1 - VX B2,B2,C2 - VX B3,B3,C3 - VX B4,B4,C4 - VX B5,B5,C5 - VERLLF B0,B0,12 - VERLLF B1,B1,12 - VERLLF B2,B2,12 - VERLLF B3,B3,12 - VERLLF B4,B4,12 - VERLLF B5,B5,12 - - VAF A0,A0,B0 - VAF A1,A1,B1 - VAF A2,A2,B2 - VAF A3,A3,B3 - VAF A4,A4,B4 - VAF A5,A5,B5 - VX D0,D0,A0 - VX D1,D1,A1 - VX D2,D2,A2 - VX D3,D3,A3 - VX D4,D4,A4 - VX D5,D5,A5 - VERLLF D0,D0,8 - VERLLF D1,D1,8 - VERLLF D2,D2,8 - VERLLF D3,D3,8 - VERLLF D4,D4,8 - VERLLF D5,D5,8 - - VAF C0,C0,D0 - VAF C1,C1,D1 - VAF C2,C2,D2 - VAF C3,C3,D3 - VAF C4,C4,D4 - VAF C5,C5,D5 - VX B0,B0,C0 - VX B1,B1,C1 - VX B2,B2,C2 - VX B3,B3,C3 - VX B4,B4,C4 - VX B5,B5,C5 - VERLLF B0,B0,7 - VERLLF B1,B1,7 - VERLLF B2,B2,7 - VERLLF B3,B3,7 - VERLLF B4,B4,7 - VERLLF B5,B5,7 - - VSLDB C0,C0,C0,8 - VSLDB C1,C1,C1,8 - VSLDB C2,C2,C2,8 - VSLDB C3,C3,C3,8 - VSLDB C4,C4,C4,8 - VSLDB C5,C5,C5,8 - VSLDB B0,B0,B0,12 - VSLDB B1,B1,B1,12 - VSLDB B2,B2,B2,12 - VSLDB B3,B3,B3,12 - VSLDB B4,B4,B4,12 - VSLDB B5,B5,B5,12 - VSLDB D0,D0,D0,4 - VSLDB D1,D1,D1,4 - VSLDB D2,D2,D2,4 - VSLDB D3,D3,D3,4 - VSLDB D4,D4,D4,4 - VSLDB D5,D5,D5,4 - brct %r0,.Loop_vx - - VAF A0,A0,K0 - VAF B0,B0,K1 - VAF C0,C0,K2 - VAF D0,D0,K3 - VAF A1,A1,K0 - VAF D1,D1,T1 # +K[3]+1 - - VPERM A0,A0,A0,BEPERM - VPERM B0,B0,B0,BEPERM - VPERM C0,C0,C0,BEPERM - VPERM D0,D0,D0,BEPERM - - clgfi LEN,0x40 - jl .Ltail_vx - - VAF D2,D2,T2 # +K[3]+2 - VAF D3,D3,T3 # +K[3]+3 - VLM T0,T3,0,INP,0 - - VX A0,A0,T0 - VX B0,B0,T1 - VX C0,C0,T2 - VX D0,D0,T3 - - VLM K0,T3,0,%r7,4 # re-load sigma and increments - - VSTM A0,D0,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_vx - - VAF B1,B1,K1 - VAF C1,C1,K2 - - VPERM A0,A1,A1,BEPERM - VPERM B0,B1,B1,BEPERM - VPERM C0,C1,C1,BEPERM - VPERM D0,D1,D1,BEPERM - - clgfi LEN,0x40 - jl .Ltail_vx - - VLM A1,D1,0,INP,0 - - VX A0,A0,A1 - VX B0,B0,B1 - VX C0,C0,C1 - VX D0,D0,D1 - - VSTM A0,D0,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_vx - - VAF A2,A2,K0 - VAF B2,B2,K1 - VAF C2,C2,K2 - - VPERM A0,A2,A2,BEPERM - VPERM B0,B2,B2,BEPERM - VPERM C0,C2,C2,BEPERM - VPERM D0,D2,D2,BEPERM - - clgfi LEN,0x40 - jl .Ltail_vx - - VLM A1,D1,0,INP,0 - - VX A0,A0,A1 - VX B0,B0,B1 - VX C0,C0,C1 - VX D0,D0,D1 - - VSTM A0,D0,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_vx - - VAF A3,A3,K0 - VAF B3,B3,K1 - VAF C3,C3,K2 - VAF D2,K3,T3 # K[3]+3 - - VPERM A0,A3,A3,BEPERM - VPERM B0,B3,B3,BEPERM - VPERM C0,C3,C3,BEPERM - VPERM D0,D3,D3,BEPERM - - clgfi LEN,0x40 - jl .Ltail_vx - - VAF D3,D2,T1 # K[3]+4 - VLM A1,D1,0,INP,0 - - VX A0,A0,A1 - VX B0,B0,B1 - VX C0,C0,C1 - VX D0,D0,D1 - - VSTM A0,D0,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_vx - - VAF A4,A4,K0 - VAF B4,B4,K1 - VAF C4,C4,K2 - VAF D4,D4,D3 # +K[3]+4 - VAF D3,D3,T1 # K[3]+5 - VAF K3,D2,T3 # K[3]+=6 - - VPERM A0,A4,A4,BEPERM - VPERM B0,B4,B4,BEPERM - VPERM C0,C4,C4,BEPERM - VPERM D0,D4,D4,BEPERM - - clgfi LEN,0x40 - jl .Ltail_vx - - VLM A1,D1,0,INP,0 - - VX A0,A0,A1 - VX B0,B0,B1 - VX C0,C0,C1 - VX D0,D0,D1 - - VSTM A0,D0,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_vx - - VAF A5,A5,K0 - VAF B5,B5,K1 - VAF C5,C5,K2 - VAF D5,D5,D3 # +K[3]+5 - - VPERM A0,A5,A5,BEPERM - VPERM B0,B5,B5,BEPERM - VPERM C0,C5,C5,BEPERM - VPERM D0,D5,D5,BEPERM - - clgfi LEN,0x40 - jl .Ltail_vx - - VLM A1,D1,0,INP,0 - - VX A0,A0,A1 - VX B0,B0,B1 - VX C0,C0,C1 - VX D0,D0,D1 - - VSTM A0,D0,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - lhi %r0,10 - aghi LEN,-0x40 - jne .Loop_outer_vx - -.Ldone_vx: - lmg %r6,%r7,FRAME+6*8(SP) - la SP,FRAME(SP) - BR_EX %r14 - -.Ltail_vx: - VSTM A0,D0,8*8,SP,3 - lghi %r1,0 - -.Loop_tail_vx: - llgc %r5,0(%r1,INP) - llgc %r6,8*8(%r1,SP) - xr %r6,%r5 - stc %r6,0(%r1,OUT) - la %r1,1(%r1) - brct LEN,.Loop_tail_vx - - lmg %r6,%r7,FRAME+6*8(SP) - la SP,FRAME(SP) - BR_EX %r14 -SYM_FUNC_END(chacha20_vx) - -.previous diff --git a/arch/s390/crypto/chacha-s390.h b/arch/s390/crypto/chacha-s390.h deleted file mode 100644 index 733744ce30f5..000000000000 --- a/arch/s390/crypto/chacha-s390.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * s390 ChaCha stream cipher. - * - * Copyright IBM Corp. 2021 - */ - -#ifndef _CHACHA_S390_H -#define _CHACHA_S390_H - -void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key, - const u32 *counter); - -#endif /* _CHACHA_S390_H */ diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c deleted file mode 100644 index 8e75b83a5ddc..000000000000 --- a/arch/s390/crypto/des_s390.c +++ /dev/null @@ -1,502 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the DES Cipher Algorithm. - * - * Copyright IBM Corp. 2003, 2011 - * Author(s): Thomas Spatzier - * Jan Glauber (jan.glauber@de.ibm.com) - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/cpufeature.h> -#include <linux/crypto.h> -#include <linux/fips.h> -#include <linux/mutex.h> -#include <crypto/algapi.h> -#include <crypto/internal/des.h> -#include <crypto/internal/skcipher.h> -#include <asm/cpacf.h> - -#define DES3_KEY_SIZE (3 * DES_KEY_SIZE) - -static u8 *ctrblk; -static DEFINE_MUTEX(ctrblk_lock); - -static cpacf_mask_t km_functions, kmc_functions, kmctr_functions; - -struct s390_des_ctx { - u8 iv[DES_BLOCK_SIZE]; - u8 key[DES3_KEY_SIZE]; -}; - -static int des_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int key_len) -{ - struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = crypto_des_verify_key(tfm, key); - if (err) - return err; - - memcpy(ctx->key, key, key_len); - return 0; -} - -static int des_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, - unsigned int key_len) -{ - return des_setkey(crypto_skcipher_tfm(tfm), key, key_len); -} - -static void s390_des_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) -{ - struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - - cpacf_km(CPACF_KM_DEA, ctx->key, out, in, DES_BLOCK_SIZE); -} - -static void s390_des_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) -{ - struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - - cpacf_km(CPACF_KM_DEA | CPACF_DECRYPT, - ctx->key, out, in, DES_BLOCK_SIZE); -} - -static struct crypto_alg des_alg = { - .cra_name = "des", - .cra_driver_name = "des-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_des_ctx), - .cra_module = THIS_MODULE, - .cra_u = { - .cipher = { - .cia_min_keysize = DES_KEY_SIZE, - .cia_max_keysize = DES_KEY_SIZE, - .cia_setkey = des_setkey, - .cia_encrypt = s390_des_encrypt, - .cia_decrypt = s390_des_decrypt, - } - } -}; - -static int ecb_desall_crypt(struct skcipher_request *req, unsigned long fc) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct s390_des_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes, n; - int ret; - - ret = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes) != 0) { - /* only use complete blocks */ - n = nbytes & ~(DES_BLOCK_SIZE - 1); - cpacf_km(fc, ctx->key, walk.dst.virt.addr, - walk.src.virt.addr, n); - ret = skcipher_walk_done(&walk, nbytes - n); - } - return ret; -} - -static int cbc_desall_crypt(struct skcipher_request *req, unsigned long fc) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct s390_des_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes, n; - int ret; - struct { - u8 iv[DES_BLOCK_SIZE]; - u8 key[DES3_KEY_SIZE]; - } param; - - ret = skcipher_walk_virt(&walk, req, false); - if (ret) - return ret; - memcpy(param.iv, walk.iv, DES_BLOCK_SIZE); - memcpy(param.key, ctx->key, DES3_KEY_SIZE); - while ((nbytes = walk.nbytes) != 0) { - /* only use complete blocks */ - n = nbytes & ~(DES_BLOCK_SIZE - 1); - cpacf_kmc(fc, ¶m, walk.dst.virt.addr, - walk.src.virt.addr, n); - memcpy(walk.iv, param.iv, DES_BLOCK_SIZE); - ret = skcipher_walk_done(&walk, nbytes - n); - } - return ret; -} - -static int ecb_des_encrypt(struct skcipher_request *req) -{ - return ecb_desall_crypt(req, CPACF_KM_DEA); -} - -static int ecb_des_decrypt(struct skcipher_request *req) -{ - return ecb_desall_crypt(req, CPACF_KM_DEA | CPACF_DECRYPT); -} - -static struct skcipher_alg ecb_des_alg = { - .base.cra_name = "ecb(des)", - .base.cra_driver_name = "ecb-des-s390", - .base.cra_priority = 400, /* combo: des + ecb */ - .base.cra_blocksize = DES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct s390_des_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = DES_KEY_SIZE, - .max_keysize = DES_KEY_SIZE, - .setkey = des_setkey_skcipher, - .encrypt = ecb_des_encrypt, - .decrypt = ecb_des_decrypt, -}; - -static int cbc_des_encrypt(struct skcipher_request *req) -{ - return cbc_desall_crypt(req, CPACF_KMC_DEA); -} - -static int cbc_des_decrypt(struct skcipher_request *req) -{ - return cbc_desall_crypt(req, CPACF_KMC_DEA | CPACF_DECRYPT); -} - -static struct skcipher_alg cbc_des_alg = { - .base.cra_name = "cbc(des)", - .base.cra_driver_name = "cbc-des-s390", - .base.cra_priority = 400, /* combo: des + cbc */ - .base.cra_blocksize = DES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct s390_des_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = DES_KEY_SIZE, - .max_keysize = DES_KEY_SIZE, - .ivsize = DES_BLOCK_SIZE, - .setkey = des_setkey_skcipher, - .encrypt = cbc_des_encrypt, - .decrypt = cbc_des_decrypt, -}; - -/* - * RFC2451: - * - * For DES-EDE3, there is no known need to reject weak or - * complementation keys. Any weakness is obviated by the use of - * multiple keys. - * - * However, if the first two or last two independent 64-bit keys are - * equal (k1 == k2 or k2 == k3), then the DES3 operation is simply the - * same as DES. Implementers MUST reject keys that exhibit this - * property. - * - * In fips mode additionally check for all 3 keys are unique. - * - */ -static int des3_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int key_len) -{ - struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = crypto_des3_ede_verify_key(tfm, key); - if (err) - return err; - - memcpy(ctx->key, key, key_len); - return 0; -} - -static int des3_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, - unsigned int key_len) -{ - return des3_setkey(crypto_skcipher_tfm(tfm), key, key_len); -} - -static void des3_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - - cpacf_km(CPACF_KM_TDEA_192, ctx->key, dst, src, DES_BLOCK_SIZE); -} - -static void des3_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - - cpacf_km(CPACF_KM_TDEA_192 | CPACF_DECRYPT, - ctx->key, dst, src, DES_BLOCK_SIZE); -} - -static struct crypto_alg des3_alg = { - .cra_name = "des3_ede", - .cra_driver_name = "des3_ede-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_des_ctx), - .cra_module = THIS_MODULE, - .cra_u = { - .cipher = { - .cia_min_keysize = DES3_KEY_SIZE, - .cia_max_keysize = DES3_KEY_SIZE, - .cia_setkey = des3_setkey, - .cia_encrypt = des3_encrypt, - .cia_decrypt = des3_decrypt, - } - } -}; - -static int ecb_des3_encrypt(struct skcipher_request *req) -{ - return ecb_desall_crypt(req, CPACF_KM_TDEA_192); -} - -static int ecb_des3_decrypt(struct skcipher_request *req) -{ - return ecb_desall_crypt(req, CPACF_KM_TDEA_192 | CPACF_DECRYPT); -} - -static struct skcipher_alg ecb_des3_alg = { - .base.cra_name = "ecb(des3_ede)", - .base.cra_driver_name = "ecb-des3_ede-s390", - .base.cra_priority = 400, /* combo: des3 + ecb */ - .base.cra_blocksize = DES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct s390_des_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = DES3_KEY_SIZE, - .max_keysize = DES3_KEY_SIZE, - .setkey = des3_setkey_skcipher, - .encrypt = ecb_des3_encrypt, - .decrypt = ecb_des3_decrypt, -}; - -static int cbc_des3_encrypt(struct skcipher_request *req) -{ - return cbc_desall_crypt(req, CPACF_KMC_TDEA_192); -} - -static int cbc_des3_decrypt(struct skcipher_request *req) -{ - return cbc_desall_crypt(req, CPACF_KMC_TDEA_192 | CPACF_DECRYPT); -} - -static struct skcipher_alg cbc_des3_alg = { - .base.cra_name = "cbc(des3_ede)", - .base.cra_driver_name = "cbc-des3_ede-s390", - .base.cra_priority = 400, /* combo: des3 + cbc */ - .base.cra_blocksize = DES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct s390_des_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = DES3_KEY_SIZE, - .max_keysize = DES3_KEY_SIZE, - .ivsize = DES_BLOCK_SIZE, - .setkey = des3_setkey_skcipher, - .encrypt = cbc_des3_encrypt, - .decrypt = cbc_des3_decrypt, -}; - -static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) -{ - unsigned int i, n; - - /* align to block size, max. PAGE_SIZE */ - n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(DES_BLOCK_SIZE - 1); - memcpy(ctrptr, iv, DES_BLOCK_SIZE); - for (i = (n / DES_BLOCK_SIZE) - 1; i > 0; i--) { - memcpy(ctrptr + DES_BLOCK_SIZE, ctrptr, DES_BLOCK_SIZE); - crypto_inc(ctrptr + DES_BLOCK_SIZE, DES_BLOCK_SIZE); - ctrptr += DES_BLOCK_SIZE; - } - return n; -} - -static int ctr_desall_crypt(struct skcipher_request *req, unsigned long fc) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct s390_des_ctx *ctx = crypto_skcipher_ctx(tfm); - u8 buf[DES_BLOCK_SIZE], *ctrptr; - struct skcipher_walk walk; - unsigned int n, nbytes; - int ret, locked; - - locked = mutex_trylock(&ctrblk_lock); - - ret = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes) >= DES_BLOCK_SIZE) { - n = DES_BLOCK_SIZE; - if (nbytes >= 2*DES_BLOCK_SIZE && locked) - n = __ctrblk_init(ctrblk, walk.iv, nbytes); - ctrptr = (n > DES_BLOCK_SIZE) ? ctrblk : walk.iv; - cpacf_kmctr(fc, ctx->key, walk.dst.virt.addr, - walk.src.virt.addr, n, ctrptr); - if (ctrptr == ctrblk) - memcpy(walk.iv, ctrptr + n - DES_BLOCK_SIZE, - DES_BLOCK_SIZE); - crypto_inc(walk.iv, DES_BLOCK_SIZE); - ret = skcipher_walk_done(&walk, nbytes - n); - } - if (locked) - mutex_unlock(&ctrblk_lock); - /* final block may be < DES_BLOCK_SIZE, copy only nbytes */ - if (nbytes) { - cpacf_kmctr(fc, ctx->key, buf, walk.src.virt.addr, - DES_BLOCK_SIZE, walk.iv); - memcpy(walk.dst.virt.addr, buf, nbytes); - crypto_inc(walk.iv, DES_BLOCK_SIZE); - ret = skcipher_walk_done(&walk, 0); - } - return ret; -} - -static int ctr_des_crypt(struct skcipher_request *req) -{ - return ctr_desall_crypt(req, CPACF_KMCTR_DEA); -} - -static struct skcipher_alg ctr_des_alg = { - .base.cra_name = "ctr(des)", - .base.cra_driver_name = "ctr-des-s390", - .base.cra_priority = 400, /* combo: des + ctr */ - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct s390_des_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = DES_KEY_SIZE, - .max_keysize = DES_KEY_SIZE, - .ivsize = DES_BLOCK_SIZE, - .setkey = des_setkey_skcipher, - .encrypt = ctr_des_crypt, - .decrypt = ctr_des_crypt, - .chunksize = DES_BLOCK_SIZE, -}; - -static int ctr_des3_crypt(struct skcipher_request *req) -{ - return ctr_desall_crypt(req, CPACF_KMCTR_TDEA_192); -} - -static struct skcipher_alg ctr_des3_alg = { - .base.cra_name = "ctr(des3_ede)", - .base.cra_driver_name = "ctr-des3_ede-s390", - .base.cra_priority = 400, /* combo: des3 + ede */ - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct s390_des_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = DES3_KEY_SIZE, - .max_keysize = DES3_KEY_SIZE, - .ivsize = DES_BLOCK_SIZE, - .setkey = des3_setkey_skcipher, - .encrypt = ctr_des3_crypt, - .decrypt = ctr_des3_crypt, - .chunksize = DES_BLOCK_SIZE, -}; - -static struct crypto_alg *des_s390_algs_ptr[2]; -static int des_s390_algs_num; -static struct skcipher_alg *des_s390_skciphers_ptr[6]; -static int des_s390_skciphers_num; - -static int des_s390_register_alg(struct crypto_alg *alg) -{ - int ret; - - ret = crypto_register_alg(alg); - if (!ret) - des_s390_algs_ptr[des_s390_algs_num++] = alg; - return ret; -} - -static int des_s390_register_skcipher(struct skcipher_alg *alg) -{ - int ret; - - ret = crypto_register_skcipher(alg); - if (!ret) - des_s390_skciphers_ptr[des_s390_skciphers_num++] = alg; - return ret; -} - -static void des_s390_exit(void) -{ - while (des_s390_algs_num--) - crypto_unregister_alg(des_s390_algs_ptr[des_s390_algs_num]); - while (des_s390_skciphers_num--) - crypto_unregister_skcipher(des_s390_skciphers_ptr[des_s390_skciphers_num]); - if (ctrblk) - free_page((unsigned long) ctrblk); -} - -static int __init des_s390_init(void) -{ - int ret; - - /* Query available functions for KM, KMC and KMCTR */ - cpacf_query(CPACF_KM, &km_functions); - cpacf_query(CPACF_KMC, &kmc_functions); - cpacf_query(CPACF_KMCTR, &kmctr_functions); - - if (cpacf_test_func(&km_functions, CPACF_KM_DEA)) { - ret = des_s390_register_alg(&des_alg); - if (ret) - goto out_err; - ret = des_s390_register_skcipher(&ecb_des_alg); - if (ret) - goto out_err; - } - if (cpacf_test_func(&kmc_functions, CPACF_KMC_DEA)) { - ret = des_s390_register_skcipher(&cbc_des_alg); - if (ret) - goto out_err; - } - if (cpacf_test_func(&km_functions, CPACF_KM_TDEA_192)) { - ret = des_s390_register_alg(&des3_alg); - if (ret) - goto out_err; - ret = des_s390_register_skcipher(&ecb_des3_alg); - if (ret) - goto out_err; - } - if (cpacf_test_func(&kmc_functions, CPACF_KMC_TDEA_192)) { - ret = des_s390_register_skcipher(&cbc_des3_alg); - if (ret) - goto out_err; - } - - if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_DEA) || - cpacf_test_func(&kmctr_functions, CPACF_KMCTR_TDEA_192)) { - ctrblk = (u8 *) __get_free_page(GFP_KERNEL); - if (!ctrblk) { - ret = -ENOMEM; - goto out_err; - } - } - - if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_DEA)) { - ret = des_s390_register_skcipher(&ctr_des_alg); - if (ret) - goto out_err; - } - if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_TDEA_192)) { - ret = des_s390_register_skcipher(&ctr_des3_alg); - if (ret) - goto out_err; - } - - return 0; -out_err: - des_s390_exit(); - return ret; -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, des_s390_init); -module_exit(des_s390_exit); - -MODULE_ALIAS_CRYPTO("des"); -MODULE_ALIAS_CRYPTO("des3_ede"); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms"); diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c deleted file mode 100644 index 0800a2a5799f..000000000000 --- a/arch/s390/crypto/ghash_s390.c +++ /dev/null @@ -1,154 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Cryptographic API. - * - * s390 implementation of the GHASH algorithm for GCM (Galois/Counter Mode). - * - * Copyright IBM Corp. 2011 - * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com> - */ - -#include <crypto/internal/hash.h> -#include <linux/module.h> -#include <linux/cpufeature.h> -#include <asm/cpacf.h> - -#define GHASH_BLOCK_SIZE 16 -#define GHASH_DIGEST_SIZE 16 - -struct ghash_ctx { - u8 key[GHASH_BLOCK_SIZE]; -}; - -struct ghash_desc_ctx { - u8 icv[GHASH_BLOCK_SIZE]; - u8 key[GHASH_BLOCK_SIZE]; - u8 buffer[GHASH_BLOCK_SIZE]; - u32 bytes; -}; - -static int ghash_init(struct shash_desc *desc) -{ - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); - - memset(dctx, 0, sizeof(*dctx)); - memcpy(dctx->key, ctx->key, GHASH_BLOCK_SIZE); - - return 0; -} - -static int ghash_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct ghash_ctx *ctx = crypto_shash_ctx(tfm); - - if (keylen != GHASH_BLOCK_SIZE) - return -EINVAL; - - memcpy(ctx->key, key, GHASH_BLOCK_SIZE); - - return 0; -} - -static int ghash_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - unsigned int n; - u8 *buf = dctx->buffer; - - if (dctx->bytes) { - u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes); - - n = min(srclen, dctx->bytes); - dctx->bytes -= n; - srclen -= n; - - memcpy(pos, src, n); - src += n; - - if (!dctx->bytes) { - cpacf_kimd(CPACF_KIMD_GHASH, dctx, buf, - GHASH_BLOCK_SIZE); - } - } - - n = srclen & ~(GHASH_BLOCK_SIZE - 1); - if (n) { - cpacf_kimd(CPACF_KIMD_GHASH, dctx, src, n); - src += n; - srclen -= n; - } - - if (srclen) { - dctx->bytes = GHASH_BLOCK_SIZE - srclen; - memcpy(buf, src, srclen); - } - - return 0; -} - -static int ghash_flush(struct ghash_desc_ctx *dctx) -{ - u8 *buf = dctx->buffer; - - if (dctx->bytes) { - u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes); - - memset(pos, 0, dctx->bytes); - cpacf_kimd(CPACF_KIMD_GHASH, dctx, buf, GHASH_BLOCK_SIZE); - dctx->bytes = 0; - } - - return 0; -} - -static int ghash_final(struct shash_desc *desc, u8 *dst) -{ - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - int ret; - - ret = ghash_flush(dctx); - if (!ret) - memcpy(dst, dctx->icv, GHASH_BLOCK_SIZE); - return ret; -} - -static struct shash_alg ghash_alg = { - .digestsize = GHASH_DIGEST_SIZE, - .init = ghash_init, - .update = ghash_update, - .final = ghash_final, - .setkey = ghash_setkey, - .descsize = sizeof(struct ghash_desc_ctx), - .base = { - .cra_name = "ghash", - .cra_driver_name = "ghash-s390", - .cra_priority = 300, - .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct ghash_ctx), - .cra_module = THIS_MODULE, - }, -}; - -static int __init ghash_mod_init(void) -{ - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_GHASH)) - return -ENODEV; - - return crypto_register_shash(&ghash_alg); -} - -static void __exit ghash_mod_exit(void) -{ - crypto_unregister_shash(&ghash_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, ghash_mod_init); -module_exit(ghash_mod_exit); - -MODULE_ALIAS_CRYPTO("ghash"); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("GHASH hash function, s390 implementation"); diff --git a/arch/s390/crypto/hmac_s390.c b/arch/s390/crypto/hmac_s390.c index bba9a818dfdc..f8cd09f341d4 100644 --- a/arch/s390/crypto/hmac_s390.c +++ b/arch/s390/crypto/hmac_s390.c @@ -5,14 +5,17 @@ * s390 specific HMAC support. */ -#define KMSG_COMPONENT "hmac_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hmac_s390: " fmt #include <asm/cpacf.h> -#include <crypto/sha2.h> #include <crypto/internal/hash.h> +#include <crypto/hmac.h> +#include <crypto/sha2.h> #include <linux/cpufeature.h> +#include <linux/errno.h> +#include <linux/kernel.h> #include <linux/module.h> +#include <linux/string.h> /* * KMAC param block layout for sha2 function codes: @@ -71,32 +74,31 @@ union s390_kmac_gr0 { struct s390_kmac_sha2_ctx { u8 param[MAX_DIGEST_SIZE + MAX_IMBL_SIZE + MAX_BLOCK_SIZE]; union s390_kmac_gr0 gr0; - u8 buf[MAX_BLOCK_SIZE]; - unsigned int buflen; + u64 buflen[2]; }; /* * kmac_sha2_set_imbl - sets the input message bit-length based on the blocksize */ -static inline void kmac_sha2_set_imbl(u8 *param, unsigned int buflen, - unsigned int blocksize) +static inline void kmac_sha2_set_imbl(u8 *param, u64 buflen_lo, + u64 buflen_hi, unsigned int blocksize) { u8 *imbl = param + SHA2_IMBL_OFFSET(blocksize); switch (blocksize) { case SHA256_BLOCK_SIZE: - *(u64 *)imbl = (u64)buflen * BITS_PER_BYTE; + *(u64 *)imbl = buflen_lo * BITS_PER_BYTE; break; case SHA512_BLOCK_SIZE: - *(u128 *)imbl = (u128)buflen * BITS_PER_BYTE; + *(u128 *)imbl = (((u128)buflen_hi << 64) + buflen_lo) << 3; break; default: break; } } -static int hash_key(const u8 *in, unsigned int inlen, - u8 *digest, unsigned int digestsize) +static int hash_data(const u8 *in, unsigned int inlen, + u8 *digest, unsigned int digestsize, bool final) { unsigned long func; union { @@ -123,19 +125,23 @@ static int hash_key(const u8 *in, unsigned int inlen, switch (digestsize) { case SHA224_DIGEST_SIZE: - func = CPACF_KLMD_SHA_256; + func = final ? CPACF_KLMD_SHA_256 : CPACF_KIMD_SHA_256; PARAM_INIT(256, 224, inlen * 8); + if (!final) + digestsize = SHA256_DIGEST_SIZE; break; case SHA256_DIGEST_SIZE: - func = CPACF_KLMD_SHA_256; + func = final ? CPACF_KLMD_SHA_256 : CPACF_KIMD_SHA_256; PARAM_INIT(256, 256, inlen * 8); break; case SHA384_DIGEST_SIZE: - func = CPACF_KLMD_SHA_512; + func = final ? CPACF_KLMD_SHA_512 : CPACF_KIMD_SHA_512; PARAM_INIT(512, 384, inlen * 8); + if (!final) + digestsize = SHA512_DIGEST_SIZE; break; case SHA512_DIGEST_SIZE: - func = CPACF_KLMD_SHA_512; + func = final ? CPACF_KLMD_SHA_512 : CPACF_KIMD_SHA_512; PARAM_INIT(512, 512, inlen * 8); break; default: @@ -151,6 +157,12 @@ static int hash_key(const u8 *in, unsigned int inlen, return 0; } +static int hash_key(const u8 *in, unsigned int inlen, + u8 *digest, unsigned int digestsize) +{ + return hash_data(in, inlen, digest, digestsize, true); +} + static int s390_hmac_sha2_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { @@ -176,7 +188,8 @@ static int s390_hmac_sha2_init(struct shash_desc *desc) memcpy(ctx->param + SHA2_KEY_OFFSET(bs), tfm_ctx->key, bs); - ctx->buflen = 0; + ctx->buflen[0] = 0; + ctx->buflen[1] = 0; ctx->gr0.reg = 0; switch (crypto_shash_digestsize(desc->tfm)) { case SHA224_DIGEST_SIZE: @@ -203,48 +216,31 @@ static int s390_hmac_sha2_update(struct shash_desc *desc, { struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc); unsigned int bs = crypto_shash_blocksize(desc->tfm); - unsigned int offset, n; - - /* check current buffer */ - offset = ctx->buflen % bs; - ctx->buflen += len; - if (offset + len < bs) - goto store; - - /* process one stored block */ - if (offset) { - n = bs - offset; - memcpy(ctx->buf + offset, data, n); - ctx->gr0.iimp = 1; - _cpacf_kmac(&ctx->gr0.reg, ctx->param, ctx->buf, bs); - data += n; - len -= n; - offset = 0; - } - /* process as many blocks as possible */ - if (len >= bs) { - n = (len / bs) * bs; - ctx->gr0.iimp = 1; - _cpacf_kmac(&ctx->gr0.reg, ctx->param, data, n); - data += n; - len -= n; - } -store: - /* store incomplete block in buffer */ - if (len) - memcpy(ctx->buf + offset, data, len); + unsigned int n = round_down(len, bs); - return 0; + ctx->buflen[0] += n; + if (ctx->buflen[0] < n) + ctx->buflen[1]++; + + /* process as many blocks as possible */ + ctx->gr0.iimp = 1; + _cpacf_kmac(&ctx->gr0.reg, ctx->param, data, n); + return len - n; } -static int s390_hmac_sha2_final(struct shash_desc *desc, u8 *out) +static int s390_hmac_sha2_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) { struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc); unsigned int bs = crypto_shash_blocksize(desc->tfm); + ctx->buflen[0] += len; + if (ctx->buflen[0] < len) + ctx->buflen[1]++; + ctx->gr0.iimp = 0; - kmac_sha2_set_imbl(ctx->param, ctx->buflen, bs); - _cpacf_kmac(&ctx->gr0.reg, ctx->param, ctx->buf, ctx->buflen % bs); + kmac_sha2_set_imbl(ctx->param, ctx->buflen[0], ctx->buflen[1], bs); + _cpacf_kmac(&ctx->gr0.reg, ctx->param, src, len); memcpy(out, ctx->param, crypto_shash_digestsize(desc->tfm)); return 0; @@ -262,7 +258,7 @@ static int s390_hmac_sha2_digest(struct shash_desc *desc, return rc; ctx->gr0.iimp = 0; - kmac_sha2_set_imbl(ctx->param, len, + kmac_sha2_set_imbl(ctx->param, len, 0, crypto_shash_blocksize(desc->tfm)); _cpacf_kmac(&ctx->gr0.reg, ctx->param, data, len); memcpy(out, ctx->param, ds); @@ -270,22 +266,93 @@ static int s390_hmac_sha2_digest(struct shash_desc *desc, return 0; } -#define S390_HMAC_SHA2_ALG(x) { \ +static int s390_hmac_export_zero(struct shash_desc *desc, void *out) +{ + struct crypto_shash *tfm = desc->tfm; + u8 ipad[SHA512_BLOCK_SIZE]; + struct s390_hmac_ctx *ctx; + unsigned int bs; + int err, i; + + ctx = crypto_shash_ctx(tfm); + bs = crypto_shash_blocksize(tfm); + for (i = 0; i < bs; i++) + ipad[i] = ctx->key[i] ^ HMAC_IPAD_VALUE; + + err = hash_data(ipad, bs, out, crypto_shash_digestsize(tfm), false); + memzero_explicit(ipad, sizeof(ipad)); + return err; +} + +static int s390_hmac_export(struct shash_desc *desc, void *out) +{ + struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc); + unsigned int bs = crypto_shash_blocksize(desc->tfm); + unsigned int ds = bs / 2; + u64 lo = ctx->buflen[0]; + union { + u8 *u8; + u64 *u64; + } p = { .u8 = out }; + int err = 0; + + if (!ctx->gr0.ikp) + err = s390_hmac_export_zero(desc, out); + else + memcpy(p.u8, ctx->param, ds); + p.u8 += ds; + lo += bs; + put_unaligned(lo, p.u64++); + if (ds == SHA512_DIGEST_SIZE) + put_unaligned(ctx->buflen[1] + (lo < bs), p.u64); + return err; +} + +static int s390_hmac_import(struct shash_desc *desc, const void *in) +{ + struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc); + unsigned int bs = crypto_shash_blocksize(desc->tfm); + unsigned int ds = bs / 2; + union { + const u8 *u8; + const u64 *u64; + } p = { .u8 = in }; + u64 lo; + int err; + + err = s390_hmac_sha2_init(desc); + memcpy(ctx->param, p.u8, ds); + p.u8 += ds; + lo = get_unaligned(p.u64++); + ctx->buflen[0] = lo - bs; + if (ds == SHA512_DIGEST_SIZE) + ctx->buflen[1] = get_unaligned(p.u64) - (lo < bs); + if (ctx->buflen[0] | ctx->buflen[1]) + ctx->gr0.ikp = 1; + return err; +} + +#define S390_HMAC_SHA2_ALG(x, ss) { \ .fc = CPACF_KMAC_HMAC_SHA_##x, \ .alg = { \ .init = s390_hmac_sha2_init, \ .update = s390_hmac_sha2_update, \ - .final = s390_hmac_sha2_final, \ + .finup = s390_hmac_sha2_finup, \ .digest = s390_hmac_sha2_digest, \ .setkey = s390_hmac_sha2_setkey, \ + .export = s390_hmac_export, \ + .import = s390_hmac_import, \ .descsize = sizeof(struct s390_kmac_sha2_ctx), \ .halg = { \ + .statesize = ss, \ .digestsize = SHA##x##_DIGEST_SIZE, \ .base = { \ .cra_name = "hmac(sha" #x ")", \ .cra_driver_name = "hmac_s390_sha" #x, \ .cra_blocksize = SHA##x##_BLOCK_SIZE, \ .cra_priority = 400, \ + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | \ + CRYPTO_AHASH_ALG_FINUP_MAX, \ .cra_ctxsize = sizeof(struct s390_hmac_ctx), \ .cra_module = THIS_MODULE, \ }, \ @@ -298,10 +365,10 @@ static struct s390_hmac_alg { unsigned int fc; struct shash_alg alg; } s390_hmac_algs[] = { - S390_HMAC_SHA2_ALG(224), - S390_HMAC_SHA2_ALG(256), - S390_HMAC_SHA2_ALG(384), - S390_HMAC_SHA2_ALG(512), + S390_HMAC_SHA2_ALG(224, sizeof(struct crypto_sha256_state)), + S390_HMAC_SHA2_ALG(256, sizeof(struct crypto_sha256_state)), + S390_HMAC_SHA2_ALG(384, SHA512_STATE_SIZE), + S390_HMAC_SHA2_ALG(512, SHA512_STATE_SIZE), }; static __always_inline void _s390_hmac_algs_unregister(void) diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index 511093713a6f..8cfe6166c193 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -5,24 +5,25 @@ * s390 implementation of the AES Cipher Algorithm with protected keys. * * s390 Version: - * Copyright IBM Corp. 2017, 2023 + * Copyright IBM Corp. 2017, 2025 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> * Harald Freudenberger <freude@de.ibm.com> */ -#define KMSG_COMPONENT "paes_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "paes_s390: " fmt -#include <crypto/aes.h> -#include <crypto/algapi.h> -#include <linux/bug.h> -#include <linux/err.h> -#include <linux/module.h> +#include <linux/atomic.h> #include <linux/cpufeature.h> +#include <linux/delay.h> +#include <linux/err.h> #include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/module.h> #include <linux/mutex.h> #include <linux/spinlock.h> -#include <linux/delay.h> +#include <crypto/aes.h> +#include <crypto/algapi.h> +#include <crypto/engine.h> #include <crypto/internal/skcipher.h> #include <crypto/xts.h> #include <asm/cpacf.h> @@ -39,28 +40,70 @@ #define PAES_256_PROTKEY_SIZE (32 + 32) /* key + verification pattern */ #define PXTS_256_PROTKEY_SIZE (32 + 32 + 32) /* k1 + k2 + verification pattern */ +static bool pkey_clrkey_allowed; +module_param_named(clrkey, pkey_clrkey_allowed, bool, 0444); +MODULE_PARM_DESC(clrkey, "Allow clear key material (default N)"); + static u8 *ctrblk; static DEFINE_MUTEX(ctrblk_lock); static cpacf_mask_t km_functions, kmc_functions, kmctr_functions; +static struct crypto_engine *paes_crypto_engine; +#define MAX_QLEN 10 + +/* + * protected key specific stuff + */ + struct paes_protkey { u32 type; u32 len; u8 protkey[PXTS_256_PROTKEY_SIZE]; }; -struct key_blob { - /* - * Small keys will be stored in the keybuf. Larger keys are - * stored in extra allocated memory. In both cases does - * key point to the memory where the key is stored. - * The code distinguishes by checking keylen against - * sizeof(keybuf). See the two following helper functions. - */ - u8 *key; - u8 keybuf[128]; +#define PK_STATE_NO_KEY 0 +#define PK_STATE_CONVERT_IN_PROGRESS 1 +#define PK_STATE_VALID 2 + +struct s390_paes_ctx { + /* source key material used to derive a protected key from */ + u8 keybuf[PAES_MAX_KEYSIZE]; + unsigned int keylen; + + /* cpacf function code to use with this protected key type */ + long fc; + + /* nr of requests enqueued via crypto engine which use this tfm ctx */ + atomic_t via_engine_ctr; + + /* spinlock to atomic read/update all the following fields */ + spinlock_t pk_lock; + + /* see PK_STATE* defines above, < 0 holds convert failure rc */ + int pk_state; + /* if state is valid, pk holds the protected key */ + struct paes_protkey pk; +}; + +struct s390_pxts_ctx { + /* source key material used to derive a protected key from */ + u8 keybuf[2 * PAES_MAX_KEYSIZE]; unsigned int keylen; + + /* cpacf function code to use with this protected key type */ + long fc; + + /* nr of requests enqueued via crypto engine which use this tfm ctx */ + atomic_t via_engine_ctr; + + /* spinlock to atomic read/update all the following fields */ + spinlock_t pk_lock; + + /* see PK_STATE* defines above, < 0 holds convert failure rc */ + int pk_state; + /* if state is valid, pk[] hold(s) the protected key(s) */ + struct paes_protkey pk[2]; }; /* @@ -89,214 +132,376 @@ static inline u32 make_clrkey_token(const u8 *ck, size_t cklen, u8 *dest) return sizeof(*token) + cklen; } -static inline int _key_to_kb(struct key_blob *kb, - const u8 *key, - unsigned int keylen) +/* + * paes_ctx_setkey() - Set key value into context, maybe construct + * a clear key token digestible by pkey from a clear key value. + */ +static inline int paes_ctx_setkey(struct s390_paes_ctx *ctx, + const u8 *key, unsigned int keylen) { + if (keylen > sizeof(ctx->keybuf)) + return -EINVAL; + switch (keylen) { case 16: case 24: case 32: /* clear key value, prepare pkey clear key token in keybuf */ - memset(kb->keybuf, 0, sizeof(kb->keybuf)); - kb->keylen = make_clrkey_token(key, keylen, kb->keybuf); - kb->key = kb->keybuf; + memset(ctx->keybuf, 0, sizeof(ctx->keybuf)); + ctx->keylen = make_clrkey_token(key, keylen, ctx->keybuf); break; default: /* other key material, let pkey handle this */ - if (keylen <= sizeof(kb->keybuf)) - kb->key = kb->keybuf; - else { - kb->key = kmalloc(keylen, GFP_KERNEL); - if (!kb->key) - return -ENOMEM; - } - memcpy(kb->key, key, keylen); - kb->keylen = keylen; + memcpy(ctx->keybuf, key, keylen); + ctx->keylen = keylen; break; } return 0; } -static inline int _xts_key_to_kb(struct key_blob *kb, - const u8 *key, - unsigned int keylen) +/* + * pxts_ctx_setkey() - Set key value into context, maybe construct + * a clear key token digestible by pkey from a clear key value. + */ +static inline int pxts_ctx_setkey(struct s390_pxts_ctx *ctx, + const u8 *key, unsigned int keylen) { size_t cklen = keylen / 2; - memset(kb->keybuf, 0, sizeof(kb->keybuf)); + if (keylen > sizeof(ctx->keybuf)) + return -EINVAL; switch (keylen) { case 32: case 64: /* clear key value, prepare pkey clear key tokens in keybuf */ - kb->key = kb->keybuf; - kb->keylen = make_clrkey_token(key, cklen, kb->key); - kb->keylen += make_clrkey_token(key + cklen, cklen, - kb->key + kb->keylen); + memset(ctx->keybuf, 0, sizeof(ctx->keybuf)); + ctx->keylen = make_clrkey_token(key, cklen, ctx->keybuf); + ctx->keylen += make_clrkey_token(key + cklen, cklen, + ctx->keybuf + ctx->keylen); break; default: /* other key material, let pkey handle this */ - if (keylen <= sizeof(kb->keybuf)) { - kb->key = kb->keybuf; - } else { - kb->key = kmalloc(keylen, GFP_KERNEL); - if (!kb->key) - return -ENOMEM; - } - memcpy(kb->key, key, keylen); - kb->keylen = keylen; + memcpy(ctx->keybuf, key, keylen); + ctx->keylen = keylen; break; } return 0; } -static inline void _free_kb_keybuf(struct key_blob *kb) +/* + * Convert the raw key material into a protected key via PKEY api. + * This function may sleep - don't call in non-sleeping context. + */ +static inline int convert_key(const u8 *key, unsigned int keylen, + struct paes_protkey *pk, bool tested) { - if (kb->key && kb->key != kb->keybuf - && kb->keylen > sizeof(kb->keybuf)) { - kfree_sensitive(kb->key); - kb->key = NULL; - } - memzero_explicit(kb->keybuf, sizeof(kb->keybuf)); -} - -struct s390_paes_ctx { - struct key_blob kb; - struct paes_protkey pk; - spinlock_t pk_lock; - unsigned long fc; -}; + u32 xflags = PKEY_XFLAG_NOMEMALLOC; + int rc, i; -struct s390_pxts_ctx { - struct key_blob kb; - struct paes_protkey pk[2]; - spinlock_t pk_lock; - unsigned long fc; -}; + if (tested && !pkey_clrkey_allowed) + xflags |= PKEY_XFLAG_NOCLEARKEY; -static inline int __paes_keyblob2pkey(const u8 *key, unsigned int keylen, - struct paes_protkey *pk) -{ - int i, rc = -EIO; + pk->len = sizeof(pk->protkey); - /* try three times in case of busy card */ - for (i = 0; rc && i < 3; i++) { - if (rc == -EBUSY && in_task()) { - if (msleep_interruptible(1000)) - return -EINTR; + /* + * In case of a busy card retry with increasing delay + * of 200, 400, 800 and 1600 ms - in total 3 s. + */ + for (rc = -EIO, i = 0; rc && i < 5; i++) { + if (rc == -EBUSY && msleep_interruptible((1 << i) * 100)) { + rc = -EINTR; + goto out; } - rc = pkey_key2protkey(key, keylen, pk->protkey, &pk->len, - &pk->type); + rc = pkey_key2protkey(key, keylen, + pk->protkey, &pk->len, &pk->type, + xflags); } +out: + pr_debug("rc=%d\n", rc); return rc; } -static inline int __paes_convert_key(struct s390_paes_ctx *ctx) +/* + * (Re-)Convert the raw key material from the ctx into a protected key + * via convert_key() function. Update the pk_state, pk_type, pk_len + * and the protected key in the tfm context. + * Please note this function may be invoked concurrently with the very + * same tfm context. The pk_lock spinlock in the context ensures an + * atomic update of the pk and the pk state but does not guarantee any + * order of update. So a fresh converted valid protected key may get + * updated with an 'old' expired key value. As the cpacf instructions + * detect this, refuse to operate with an invalid key and the calling + * code triggers a (re-)conversion this does no harm. This may lead to + * unnecessary additional conversion but never to invalid data on en- + * or decrypt operations. + */ +static int paes_convert_key(struct s390_paes_ctx *ctx, bool tested) { struct paes_protkey pk; int rc; - pk.len = sizeof(pk.protkey); - rc = __paes_keyblob2pkey(ctx->kb.key, ctx->kb.keylen, &pk); - if (rc) - return rc; + spin_lock_bh(&ctx->pk_lock); + ctx->pk_state = PK_STATE_CONVERT_IN_PROGRESS; + spin_unlock_bh(&ctx->pk_lock); + + rc = convert_key(ctx->keybuf, ctx->keylen, &pk, tested); + /* update context */ spin_lock_bh(&ctx->pk_lock); - memcpy(&ctx->pk, &pk, sizeof(pk)); + if (rc) { + ctx->pk_state = rc; + } else { + ctx->pk_state = PK_STATE_VALID; + ctx->pk = pk; + } spin_unlock_bh(&ctx->pk_lock); - return 0; + memzero_explicit(&pk, sizeof(pk)); + pr_debug("rc=%d\n", rc); + return rc; } -static int ecb_paes_init(struct crypto_skcipher *tfm) +/* + * (Re-)Convert the raw xts key material from the ctx into a + * protected key via convert_key() function. Update the pk_state, + * pk_type, pk_len and the protected key in the tfm context. + * See also comments on function paes_convert_key. + */ +static int pxts_convert_key(struct s390_pxts_ctx *ctx, bool tested) { - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct paes_protkey pk0, pk1; + size_t split_keylen; + int rc; - ctx->kb.key = NULL; - spin_lock_init(&ctx->pk_lock); + spin_lock_bh(&ctx->pk_lock); + ctx->pk_state = PK_STATE_CONVERT_IN_PROGRESS; + spin_unlock_bh(&ctx->pk_lock); - return 0; -} + rc = convert_key(ctx->keybuf, ctx->keylen, &pk0, tested); + if (rc) + goto out; -static void ecb_paes_exit(struct crypto_skcipher *tfm) -{ - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + switch (pk0.type) { + case PKEY_KEYTYPE_AES_128: + case PKEY_KEYTYPE_AES_256: + /* second keytoken required */ + if (ctx->keylen % 2) { + rc = -EINVAL; + goto out; + } + split_keylen = ctx->keylen / 2; + rc = convert_key(ctx->keybuf + split_keylen, + split_keylen, &pk1, tested); + if (rc) + goto out; + if (pk0.type != pk1.type) { + rc = -EINVAL; + goto out; + } + break; + case PKEY_KEYTYPE_AES_XTS_128: + case PKEY_KEYTYPE_AES_XTS_256: + /* single key */ + pk1.type = 0; + break; + default: + /* unsupported protected keytype */ + rc = -EINVAL; + goto out; + } + +out: + /* update context */ + spin_lock_bh(&ctx->pk_lock); + if (rc) { + ctx->pk_state = rc; + } else { + ctx->pk_state = PK_STATE_VALID; + ctx->pk[0] = pk0; + ctx->pk[1] = pk1; + } + spin_unlock_bh(&ctx->pk_lock); - _free_kb_keybuf(&ctx->kb); + memzero_explicit(&pk0, sizeof(pk0)); + memzero_explicit(&pk1, sizeof(pk1)); + pr_debug("rc=%d\n", rc); + return rc; } -static inline int __ecb_paes_set_key(struct s390_paes_ctx *ctx) +/* + * PAES ECB implementation + */ + +struct ecb_param { + u8 key[PAES_256_PROTKEY_SIZE]; +} __packed; + +struct s390_pecb_req_ctx { + unsigned long modifier; + struct skcipher_walk walk; + bool param_init_done; + struct ecb_param param; +}; + +static int ecb_paes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) { - unsigned long fc; + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + bool tested = crypto_skcipher_tested(tfm); + long fc; int rc; - rc = __paes_convert_key(ctx); + /* set raw key into context */ + rc = paes_ctx_setkey(ctx, in_key, key_len); if (rc) - return rc; + goto out; - /* Pick the correct function code based on the protected key type */ - fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 : - (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 : - (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0; + /* convert key into protected key */ + rc = paes_convert_key(ctx, tested); + if (rc) + goto out; - /* Check if the function code is available */ + /* Pick the correct function code based on the protected key type */ + switch (ctx->pk.type) { + case PKEY_KEYTYPE_AES_128: + fc = CPACF_KM_PAES_128; + break; + case PKEY_KEYTYPE_AES_192: + fc = CPACF_KM_PAES_192; + break; + case PKEY_KEYTYPE_AES_256: + fc = CPACF_KM_PAES_256; + break; + default: + fc = 0; + break; + } ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; - return ctx->fc ? 0 : -EINVAL; + rc = fc ? 0 : -EINVAL; + +out: + pr_debug("rc=%d\n", rc); + return rc; } -static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, - unsigned int key_len) +static int ecb_paes_do_crypt(struct s390_paes_ctx *ctx, + struct s390_pecb_req_ctx *req_ctx, + bool tested, bool maysleep) { - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - int rc; - - _free_kb_keybuf(&ctx->kb); - rc = _key_to_kb(&ctx->kb, in_key, key_len); + struct ecb_param *param = &req_ctx->param; + struct skcipher_walk *walk = &req_ctx->walk; + unsigned int nbytes, n, k; + int pk_state, rc = 0; + + if (!req_ctx->param_init_done) { + /* fetch and check protected key state */ + spin_lock_bh(&ctx->pk_lock); + pk_state = ctx->pk_state; + switch (pk_state) { + case PK_STATE_NO_KEY: + rc = -ENOKEY; + break; + case PK_STATE_CONVERT_IN_PROGRESS: + rc = -EKEYEXPIRED; + break; + case PK_STATE_VALID: + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); + req_ctx->param_init_done = true; + break; + default: + rc = pk_state < 0 ? pk_state : -EIO; + break; + } + spin_unlock_bh(&ctx->pk_lock); + } if (rc) - return rc; + goto out; - return __ecb_paes_set_key(ctx); + /* + * Note that in case of partial processing or failure the walk + * is NOT unmapped here. So a follow up task may reuse the walk + * or in case of unrecoverable failure needs to unmap it. + */ + while ((nbytes = walk->nbytes) != 0) { + /* only use complete blocks */ + n = nbytes & ~(AES_BLOCK_SIZE - 1); + k = cpacf_km(ctx->fc | req_ctx->modifier, param, + walk->dst.virt.addr, walk->src.virt.addr, n); + if (k) + rc = skcipher_walk_done(walk, nbytes - k); + if (k < n) { + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = paes_convert_key(ctx, tested); + if (rc) + goto out; + spin_lock_bh(&ctx->pk_lock); + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); + spin_unlock_bh(&ctx->pk_lock); + } + } + +out: + pr_debug("rc=%d\n", rc); + return rc; } static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier) { + struct s390_pecb_req_ctx *req_ctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - struct { - u8 key[PAES_256_PROTKEY_SIZE]; - } param; - struct skcipher_walk walk; - unsigned int nbytes, n, k; + struct skcipher_walk *walk = &req_ctx->walk; + bool tested = crypto_skcipher_tested(tfm); int rc; - rc = skcipher_walk_virt(&walk, req, false); + /* + * Attempt synchronous encryption first. If it fails, schedule the request + * asynchronously via the crypto engine. To preserve execution order, + * once a request is queued to the engine, further requests using the same + * tfm will also be routed through the engine. + */ + + rc = skcipher_walk_virt(walk, req, false); if (rc) - return rc; + goto out; - spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE); - spin_unlock_bh(&ctx->pk_lock); + req_ctx->modifier = modifier; + req_ctx->param_init_done = false; - while ((nbytes = walk.nbytes) != 0) { - /* only use complete blocks */ - n = nbytes & ~(AES_BLOCK_SIZE - 1); - k = cpacf_km(ctx->fc | modifier, ¶m, - walk.dst.virt.addr, walk.src.virt.addr, n); - if (k) - rc = skcipher_walk_done(&walk, nbytes - k); - if (k < n) { - if (__paes_convert_key(ctx)) - return skcipher_walk_done(&walk, -EIO); - spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE); - spin_unlock_bh(&ctx->pk_lock); - } + /* Try synchronous operation if no active engine usage */ + if (!atomic_read(&ctx->via_engine_ctr)) { + rc = ecb_paes_do_crypt(ctx, req_ctx, tested, false); + if (rc == 0) + goto out; + } + + /* + * If sync operation failed or key expired or there are already + * requests enqueued via engine, fallback to async. Mark tfm as + * using engine to serialize requests. + */ + if (rc == 0 || rc == -EKEYEXPIRED) { + atomic_inc(&ctx->via_engine_ctr); + rc = crypto_transfer_skcipher_request_to_engine(paes_crypto_engine, req); + if (rc != -EINPROGRESS) + atomic_dec(&ctx->via_engine_ctr); } + + if (rc != -EINPROGRESS) + skcipher_walk_done(walk, rc); + +out: + if (rc != -EINPROGRESS) + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("rc=%d\n", rc); return rc; } @@ -310,112 +515,259 @@ static int ecb_paes_decrypt(struct skcipher_request *req) return ecb_paes_crypt(req, CPACF_DECRYPT); } -static struct skcipher_alg ecb_paes_alg = { - .base.cra_name = "ecb(paes)", - .base.cra_driver_name = "ecb-paes-s390", - .base.cra_priority = 401, /* combo: aes + ecb + 1 */ - .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct s390_paes_ctx), - .base.cra_module = THIS_MODULE, - .base.cra_list = LIST_HEAD_INIT(ecb_paes_alg.base.cra_list), - .init = ecb_paes_init, - .exit = ecb_paes_exit, - .min_keysize = PAES_MIN_KEYSIZE, - .max_keysize = PAES_MAX_KEYSIZE, - .setkey = ecb_paes_set_key, - .encrypt = ecb_paes_encrypt, - .decrypt = ecb_paes_decrypt, -}; - -static int cbc_paes_init(struct crypto_skcipher *tfm) +static int ecb_paes_init(struct crypto_skcipher *tfm) { struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - ctx->kb.key = NULL; + memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->pk_lock); + crypto_skcipher_set_reqsize(tfm, sizeof(struct s390_pecb_req_ctx)); + return 0; } -static void cbc_paes_exit(struct crypto_skcipher *tfm) +static void ecb_paes_exit(struct crypto_skcipher *tfm) { struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - _free_kb_keybuf(&ctx->kb); + memzero_explicit(ctx, sizeof(*ctx)); } -static inline int __cbc_paes_set_key(struct s390_paes_ctx *ctx) +static int ecb_paes_do_one_request(struct crypto_engine *engine, void *areq) { - unsigned long fc; + struct skcipher_request *req = skcipher_request_cast(areq); + struct s390_pecb_req_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; + bool tested = crypto_skcipher_tested(tfm); int rc; - rc = __paes_convert_key(ctx); - if (rc) - return rc; + /* walk has already been prepared */ + + rc = ecb_paes_do_crypt(ctx, req_ctx, tested, true); + if (rc == -EKEYEXPIRED) { + /* + * Protected key expired, conversion is in process. + * Trigger a re-schedule of this request by returning + * -ENOSPC ("hardware queue is full") to the crypto engine. + * To avoid immediately re-invocation of this callback, + * tell the scheduler to voluntarily give up the CPU here. + */ + cond_resched(); + pr_debug("rescheduling request\n"); + return -ENOSPC; + } else if (rc) { + skcipher_walk_done(walk, rc); + } - /* Pick the correct function code based on the protected key type */ - fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMC_PAES_128 : - (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMC_PAES_192 : - (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KMC_PAES_256 : 0; + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("request complete with rc=%d\n", rc); + local_bh_disable(); + atomic_dec(&ctx->via_engine_ctr); + crypto_finalize_skcipher_request(engine, req, rc); + local_bh_enable(); + return rc; +} - /* Check if the function code is available */ - ctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0; +static struct skcipher_engine_alg ecb_paes_alg = { + .base = { + .base.cra_name = "ecb(paes)", + .base.cra_driver_name = "ecb-paes-s390", + .base.cra_priority = 401, /* combo: aes + ecb + 1 */ + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_paes_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(ecb_paes_alg.base.base.cra_list), + .init = ecb_paes_init, + .exit = ecb_paes_exit, + .min_keysize = PAES_MIN_KEYSIZE, + .max_keysize = PAES_MAX_KEYSIZE, + .setkey = ecb_paes_setkey, + .encrypt = ecb_paes_encrypt, + .decrypt = ecb_paes_decrypt, + }, + .op = { + .do_one_request = ecb_paes_do_one_request, + }, +}; - return ctx->fc ? 0 : -EINVAL; -} +/* + * PAES CBC implementation + */ + +struct cbc_param { + u8 iv[AES_BLOCK_SIZE]; + u8 key[PAES_256_PROTKEY_SIZE]; +} __packed; -static int cbc_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, - unsigned int key_len) +struct s390_pcbc_req_ctx { + unsigned long modifier; + struct skcipher_walk walk; + bool param_init_done; + struct cbc_param param; +}; + +static int cbc_paes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) { struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + bool tested = crypto_skcipher_tested(tfm); + long fc; int rc; - _free_kb_keybuf(&ctx->kb); - rc = _key_to_kb(&ctx->kb, in_key, key_len); + /* set raw key into context */ + rc = paes_ctx_setkey(ctx, in_key, key_len); if (rc) - return rc; + goto out; - return __cbc_paes_set_key(ctx); + /* convert raw key into protected key */ + rc = paes_convert_key(ctx, tested); + if (rc) + goto out; + + /* Pick the correct function code based on the protected key type */ + switch (ctx->pk.type) { + case PKEY_KEYTYPE_AES_128: + fc = CPACF_KMC_PAES_128; + break; + case PKEY_KEYTYPE_AES_192: + fc = CPACF_KMC_PAES_192; + break; + case PKEY_KEYTYPE_AES_256: + fc = CPACF_KMC_PAES_256; + break; + default: + fc = 0; + break; + } + ctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0; + + rc = fc ? 0 : -EINVAL; + +out: + pr_debug("rc=%d\n", rc); + return rc; } -static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) +static int cbc_paes_do_crypt(struct s390_paes_ctx *ctx, + struct s390_pcbc_req_ctx *req_ctx, + bool tested, bool maysleep) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - struct { - u8 iv[AES_BLOCK_SIZE]; - u8 key[PAES_256_PROTKEY_SIZE]; - } param; - struct skcipher_walk walk; + struct cbc_param *param = &req_ctx->param; + struct skcipher_walk *walk = &req_ctx->walk; unsigned int nbytes, n, k; - int rc; - - rc = skcipher_walk_virt(&walk, req, false); + int pk_state, rc = 0; + + if (!req_ctx->param_init_done) { + /* fetch and check protected key state */ + spin_lock_bh(&ctx->pk_lock); + pk_state = ctx->pk_state; + switch (pk_state) { + case PK_STATE_NO_KEY: + rc = -ENOKEY; + break; + case PK_STATE_CONVERT_IN_PROGRESS: + rc = -EKEYEXPIRED; + break; + case PK_STATE_VALID: + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); + req_ctx->param_init_done = true; + break; + default: + rc = pk_state < 0 ? pk_state : -EIO; + break; + } + spin_unlock_bh(&ctx->pk_lock); + } if (rc) - return rc; + goto out; - memcpy(param.iv, walk.iv, AES_BLOCK_SIZE); - spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE); - spin_unlock_bh(&ctx->pk_lock); + memcpy(param->iv, walk->iv, AES_BLOCK_SIZE); - while ((nbytes = walk.nbytes) != 0) { + /* + * Note that in case of partial processing or failure the walk + * is NOT unmapped here. So a follow up task may reuse the walk + * or in case of unrecoverable failure needs to unmap it. + */ + while ((nbytes = walk->nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); - k = cpacf_kmc(ctx->fc | modifier, ¶m, - walk.dst.virt.addr, walk.src.virt.addr, n); + k = cpacf_kmc(ctx->fc | req_ctx->modifier, param, + walk->dst.virt.addr, walk->src.virt.addr, n); if (k) { - memcpy(walk.iv, param.iv, AES_BLOCK_SIZE); - rc = skcipher_walk_done(&walk, nbytes - k); + memcpy(walk->iv, param->iv, AES_BLOCK_SIZE); + rc = skcipher_walk_done(walk, nbytes - k); } if (k < n) { - if (__paes_convert_key(ctx)) - return skcipher_walk_done(&walk, -EIO); + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = paes_convert_key(ctx, tested); + if (rc) + goto out; spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE); + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); spin_unlock_bh(&ctx->pk_lock); } } + +out: + pr_debug("rc=%d\n", rc); + return rc; +} + +static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) +{ + struct s390_pcbc_req_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; + bool tested = crypto_skcipher_tested(tfm); + int rc; + + /* + * Attempt synchronous encryption first. If it fails, schedule the request + * asynchronously via the crypto engine. To preserve execution order, + * once a request is queued to the engine, further requests using the same + * tfm will also be routed through the engine. + */ + + rc = skcipher_walk_virt(walk, req, false); + if (rc) + goto out; + + req_ctx->modifier = modifier; + req_ctx->param_init_done = false; + + /* Try synchronous operation if no active engine usage */ + if (!atomic_read(&ctx->via_engine_ctr)) { + rc = cbc_paes_do_crypt(ctx, req_ctx, tested, false); + if (rc == 0) + goto out; + } + + /* + * If sync operation failed or key expired or there are already + * requests enqueued via engine, fallback to async. Mark tfm as + * using engine to serialize requests. + */ + if (rc == 0 || rc == -EKEYEXPIRED) { + atomic_inc(&ctx->via_engine_ctr); + rc = crypto_transfer_skcipher_request_to_engine(paes_crypto_engine, req); + if (rc != -EINPROGRESS) + atomic_dec(&ctx->via_engine_ctr); + } + + if (rc != -EINPROGRESS) + skcipher_walk_done(walk, rc); + +out: + if (rc != -EINPROGRESS) + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("rc=%d\n", rc); return rc; } @@ -429,496 +781,890 @@ static int cbc_paes_decrypt(struct skcipher_request *req) return cbc_paes_crypt(req, CPACF_DECRYPT); } -static struct skcipher_alg cbc_paes_alg = { - .base.cra_name = "cbc(paes)", - .base.cra_driver_name = "cbc-paes-s390", - .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ - .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct s390_paes_ctx), - .base.cra_module = THIS_MODULE, - .base.cra_list = LIST_HEAD_INIT(cbc_paes_alg.base.cra_list), - .init = cbc_paes_init, - .exit = cbc_paes_exit, - .min_keysize = PAES_MIN_KEYSIZE, - .max_keysize = PAES_MAX_KEYSIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = cbc_paes_set_key, - .encrypt = cbc_paes_encrypt, - .decrypt = cbc_paes_decrypt, -}; - -static int xts_paes_init(struct crypto_skcipher *tfm) +static int cbc_paes_init(struct crypto_skcipher *tfm) { - struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - ctx->kb.key = NULL; + memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->pk_lock); + crypto_skcipher_set_reqsize(tfm, sizeof(struct s390_pcbc_req_ctx)); + return 0; } -static void xts_paes_exit(struct crypto_skcipher *tfm) +static void cbc_paes_exit(struct crypto_skcipher *tfm) { - struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - _free_kb_keybuf(&ctx->kb); + memzero_explicit(ctx, sizeof(*ctx)); } -static inline int __xts_paes_convert_key(struct s390_pxts_ctx *ctx) +static int cbc_paes_do_one_request(struct crypto_engine *engine, void *areq) { - struct paes_protkey pk0, pk1; - size_t split_keylen; + struct skcipher_request *req = skcipher_request_cast(areq); + struct s390_pcbc_req_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; + bool tested = crypto_skcipher_tested(tfm); int rc; - pk0.len = sizeof(pk0.protkey); - pk1.len = sizeof(pk1.protkey); - - rc = __paes_keyblob2pkey(ctx->kb.key, ctx->kb.keylen, &pk0); - if (rc) - return rc; + /* walk has already been prepared */ + + rc = cbc_paes_do_crypt(ctx, req_ctx, tested, true); + if (rc == -EKEYEXPIRED) { + /* + * Protected key expired, conversion is in process. + * Trigger a re-schedule of this request by returning + * -ENOSPC ("hardware queue is full") to the crypto engine. + * To avoid immediately re-invocation of this callback, + * tell the scheduler to voluntarily give up the CPU here. + */ + cond_resched(); + pr_debug("rescheduling request\n"); + return -ENOSPC; + } else if (rc) { + skcipher_walk_done(walk, rc); + } - switch (pk0.type) { - case PKEY_KEYTYPE_AES_128: - case PKEY_KEYTYPE_AES_256: - /* second keytoken required */ - if (ctx->kb.keylen % 2) - return -EINVAL; - split_keylen = ctx->kb.keylen / 2; + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("request complete with rc=%d\n", rc); + local_bh_disable(); + atomic_dec(&ctx->via_engine_ctr); + crypto_finalize_skcipher_request(engine, req, rc); + local_bh_enable(); + return rc; +} - rc = __paes_keyblob2pkey(ctx->kb.key + split_keylen, - split_keylen, &pk1); - if (rc) - return rc; +static struct skcipher_engine_alg cbc_paes_alg = { + .base = { + .base.cra_name = "cbc(paes)", + .base.cra_driver_name = "cbc-paes-s390", + .base.cra_priority = 402, /* cbc-paes-s390 + 1 */ + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_paes_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(cbc_paes_alg.base.base.cra_list), + .init = cbc_paes_init, + .exit = cbc_paes_exit, + .min_keysize = PAES_MIN_KEYSIZE, + .max_keysize = PAES_MAX_KEYSIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = cbc_paes_setkey, + .encrypt = cbc_paes_encrypt, + .decrypt = cbc_paes_decrypt, + }, + .op = { + .do_one_request = cbc_paes_do_one_request, + }, +}; - if (pk0.type != pk1.type) - return -EINVAL; - break; - case PKEY_KEYTYPE_AES_XTS_128: - case PKEY_KEYTYPE_AES_XTS_256: - /* single key */ - pk1.type = 0; - break; - default: - /* unsupported protected keytype */ - return -EINVAL; - } +/* + * PAES CTR implementation + */ - spin_lock_bh(&ctx->pk_lock); - ctx->pk[0] = pk0; - ctx->pk[1] = pk1; - spin_unlock_bh(&ctx->pk_lock); +struct ctr_param { + u8 key[PAES_256_PROTKEY_SIZE]; +} __packed; - return 0; -} +struct s390_pctr_req_ctx { + unsigned long modifier; + struct skcipher_walk walk; + bool param_init_done; + struct ctr_param param; +}; -static inline int __xts_paes_set_key(struct s390_pxts_ctx *ctx) +static int ctr_paes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) { - unsigned long fc; + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + bool tested = crypto_skcipher_tested(tfm); + long fc; int rc; - rc = __xts_paes_convert_key(ctx); + /* set raw key into context */ + rc = paes_ctx_setkey(ctx, in_key, key_len); if (rc) - return rc; + goto out; + + /* convert raw key into protected key */ + rc = paes_convert_key(ctx, tested); + if (rc) + goto out; /* Pick the correct function code based on the protected key type */ - switch (ctx->pk[0].type) { + switch (ctx->pk.type) { case PKEY_KEYTYPE_AES_128: - fc = CPACF_KM_PXTS_128; + fc = CPACF_KMCTR_PAES_128; break; - case PKEY_KEYTYPE_AES_256: - fc = CPACF_KM_PXTS_256; - break; - case PKEY_KEYTYPE_AES_XTS_128: - fc = CPACF_KM_PXTS_128_FULL; + case PKEY_KEYTYPE_AES_192: + fc = CPACF_KMCTR_PAES_192; break; - case PKEY_KEYTYPE_AES_XTS_256: - fc = CPACF_KM_PXTS_256_FULL; + case PKEY_KEYTYPE_AES_256: + fc = CPACF_KMCTR_PAES_256; break; default: fc = 0; break; } + ctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0; - /* Check if the function code is available */ - ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + rc = fc ? 0 : -EINVAL; + +out: + pr_debug("rc=%d\n", rc); + return rc; +} + +static inline unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) +{ + unsigned int i, n; + + /* only use complete blocks, max. PAGE_SIZE */ + memcpy(ctrptr, iv, AES_BLOCK_SIZE); + n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1); + for (i = (n / AES_BLOCK_SIZE) - 1; i > 0; i--) { + memcpy(ctrptr + AES_BLOCK_SIZE, ctrptr, AES_BLOCK_SIZE); + crypto_inc(ctrptr + AES_BLOCK_SIZE, AES_BLOCK_SIZE); + ctrptr += AES_BLOCK_SIZE; + } + return n; +} + +static int ctr_paes_do_crypt(struct s390_paes_ctx *ctx, + struct s390_pctr_req_ctx *req_ctx, + bool tested, bool maysleep) +{ + struct ctr_param *param = &req_ctx->param; + struct skcipher_walk *walk = &req_ctx->walk; + u8 buf[AES_BLOCK_SIZE], *ctrptr; + unsigned int nbytes, n, k; + int pk_state, locked, rc = 0; + + if (!req_ctx->param_init_done) { + /* fetch and check protected key state */ + spin_lock_bh(&ctx->pk_lock); + pk_state = ctx->pk_state; + switch (pk_state) { + case PK_STATE_NO_KEY: + rc = -ENOKEY; + break; + case PK_STATE_CONVERT_IN_PROGRESS: + rc = -EKEYEXPIRED; + break; + case PK_STATE_VALID: + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); + req_ctx->param_init_done = true; + break; + default: + rc = pk_state < 0 ? pk_state : -EIO; + break; + } + spin_unlock_bh(&ctx->pk_lock); + } + if (rc) + goto out; + + locked = mutex_trylock(&ctrblk_lock); + + /* + * Note that in case of partial processing or failure the walk + * is NOT unmapped here. So a follow up task may reuse the walk + * or in case of unrecoverable failure needs to unmap it. + */ + while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { + n = AES_BLOCK_SIZE; + if (nbytes >= 2 * AES_BLOCK_SIZE && locked) + n = __ctrblk_init(ctrblk, walk->iv, nbytes); + ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk->iv; + k = cpacf_kmctr(ctx->fc, param, walk->dst.virt.addr, + walk->src.virt.addr, n, ctrptr); + if (k) { + if (ctrptr == ctrblk) + memcpy(walk->iv, ctrptr + k - AES_BLOCK_SIZE, + AES_BLOCK_SIZE); + crypto_inc(walk->iv, AES_BLOCK_SIZE); + rc = skcipher_walk_done(walk, nbytes - k); + } + if (k < n) { + if (!maysleep) { + if (locked) + mutex_unlock(&ctrblk_lock); + rc = -EKEYEXPIRED; + goto out; + } + rc = paes_convert_key(ctx, tested); + if (rc) { + if (locked) + mutex_unlock(&ctrblk_lock); + goto out; + } + spin_lock_bh(&ctx->pk_lock); + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); + spin_unlock_bh(&ctx->pk_lock); + } + } + if (locked) + mutex_unlock(&ctrblk_lock); + + /* final block may be < AES_BLOCK_SIZE, copy only nbytes */ + if (nbytes) { + memset(buf, 0, AES_BLOCK_SIZE); + memcpy(buf, walk->src.virt.addr, nbytes); + while (1) { + if (cpacf_kmctr(ctx->fc, param, buf, + buf, AES_BLOCK_SIZE, + walk->iv) == AES_BLOCK_SIZE) + break; + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = paes_convert_key(ctx, tested); + if (rc) + goto out; + spin_lock_bh(&ctx->pk_lock); + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); + spin_unlock_bh(&ctx->pk_lock); + } + memcpy(walk->dst.virt.addr, buf, nbytes); + crypto_inc(walk->iv, AES_BLOCK_SIZE); + rc = skcipher_walk_done(walk, 0); + } + +out: + pr_debug("rc=%d\n", rc); + return rc; +} + +static int ctr_paes_crypt(struct skcipher_request *req) +{ + struct s390_pctr_req_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; + bool tested = crypto_skcipher_tested(tfm); + int rc; + + /* + * Attempt synchronous encryption first. If it fails, schedule the request + * asynchronously via the crypto engine. To preserve execution order, + * once a request is queued to the engine, further requests using the same + * tfm will also be routed through the engine. + */ + + rc = skcipher_walk_virt(walk, req, false); + if (rc) + goto out; + + req_ctx->param_init_done = false; + + /* Try synchronous operation if no active engine usage */ + if (!atomic_read(&ctx->via_engine_ctr)) { + rc = ctr_paes_do_crypt(ctx, req_ctx, tested, false); + if (rc == 0) + goto out; + } + + /* + * If sync operation failed or key expired or there are already + * requests enqueued via engine, fallback to async. Mark tfm as + * using engine to serialize requests. + */ + if (rc == 0 || rc == -EKEYEXPIRED) { + atomic_inc(&ctx->via_engine_ctr); + rc = crypto_transfer_skcipher_request_to_engine(paes_crypto_engine, req); + if (rc != -EINPROGRESS) + atomic_dec(&ctx->via_engine_ctr); + } + + if (rc != -EINPROGRESS) + skcipher_walk_done(walk, rc); + +out: + if (rc != -EINPROGRESS) + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("rc=%d\n", rc); + return rc; +} - return ctx->fc ? 0 : -EINVAL; +static int ctr_paes_init(struct crypto_skcipher *tfm) +{ + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->pk_lock); + + crypto_skcipher_set_reqsize(tfm, sizeof(struct s390_pctr_req_ctx)); + + return 0; +} + +static void ctr_paes_exit(struct crypto_skcipher *tfm) +{ + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + memzero_explicit(ctx, sizeof(*ctx)); +} + +static int ctr_paes_do_one_request(struct crypto_engine *engine, void *areq) +{ + struct skcipher_request *req = skcipher_request_cast(areq); + struct s390_pctr_req_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; + bool tested = crypto_skcipher_tested(tfm); + int rc; + + /* walk has already been prepared */ + + rc = ctr_paes_do_crypt(ctx, req_ctx, tested, true); + if (rc == -EKEYEXPIRED) { + /* + * Protected key expired, conversion is in process. + * Trigger a re-schedule of this request by returning + * -ENOSPC ("hardware queue is full") to the crypto engine. + * To avoid immediately re-invocation of this callback, + * tell the scheduler to voluntarily give up the CPU here. + */ + cond_resched(); + pr_debug("rescheduling request\n"); + return -ENOSPC; + } else if (rc) { + skcipher_walk_done(walk, rc); + } + + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("request complete with rc=%d\n", rc); + local_bh_disable(); + atomic_dec(&ctx->via_engine_ctr); + crypto_finalize_skcipher_request(engine, req, rc); + local_bh_enable(); + return rc; } -static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, - unsigned int in_keylen) +static struct skcipher_engine_alg ctr_paes_alg = { + .base = { + .base.cra_name = "ctr(paes)", + .base.cra_driver_name = "ctr-paes-s390", + .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct s390_paes_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(ctr_paes_alg.base.base.cra_list), + .init = ctr_paes_init, + .exit = ctr_paes_exit, + .min_keysize = PAES_MIN_KEYSIZE, + .max_keysize = PAES_MAX_KEYSIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ctr_paes_setkey, + .encrypt = ctr_paes_crypt, + .decrypt = ctr_paes_crypt, + .chunksize = AES_BLOCK_SIZE, + }, + .op = { + .do_one_request = ctr_paes_do_one_request, + }, +}; + +/* + * PAES XTS implementation + */ + +struct xts_full_km_param { + u8 key[64]; + u8 tweak[16]; + u8 nap[16]; + u8 wkvp[32]; +} __packed; + +struct xts_km_param { + u8 key[PAES_256_PROTKEY_SIZE]; + u8 init[16]; +} __packed; + +struct xts_pcc_param { + u8 key[PAES_256_PROTKEY_SIZE]; + u8 tweak[16]; + u8 block[16]; + u8 bit[16]; + u8 xts[16]; +} __packed; + +struct s390_pxts_req_ctx { + unsigned long modifier; + struct skcipher_walk walk; + bool param_init_done; + union { + struct xts_full_km_param full_km_param; + struct xts_km_param km_param; + } param; +}; + +static int xts_paes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int in_keylen) { struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + bool tested = crypto_skcipher_tested(tfm); u8 ckey[2 * AES_MAX_KEY_SIZE]; unsigned int ckey_len; + long fc; int rc; if ((in_keylen == 32 || in_keylen == 64) && xts_verify_key(tfm, in_key, in_keylen)) return -EINVAL; - _free_kb_keybuf(&ctx->kb); - rc = _xts_key_to_kb(&ctx->kb, in_key, in_keylen); + /* set raw key into context */ + rc = pxts_ctx_setkey(ctx, in_key, in_keylen); if (rc) - return rc; + goto out; - rc = __xts_paes_set_key(ctx); + /* convert raw key(s) into protected key(s) */ + rc = pxts_convert_key(ctx, tested); if (rc) - return rc; + goto out; /* - * It is not possible on a single protected key (e.g. full AES-XTS) to - * check, if k1 and k2 are the same. - */ - if (ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_128 || - ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_256) - return 0; - /* * xts_verify_key verifies the key length is not odd and makes * sure that the two keys are not the same. This can be done - * on the two protected keys as well + * on the two protected keys as well - but not for full xts keys. */ - ckey_len = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? - AES_KEYSIZE_128 : AES_KEYSIZE_256; - memcpy(ckey, ctx->pk[0].protkey, ckey_len); - memcpy(ckey + ckey_len, ctx->pk[1].protkey, ckey_len); - return xts_verify_key(tfm, ckey, 2*ckey_len); + if (ctx->pk[0].type == PKEY_KEYTYPE_AES_128 || + ctx->pk[0].type == PKEY_KEYTYPE_AES_256) { + ckey_len = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? + AES_KEYSIZE_128 : AES_KEYSIZE_256; + memcpy(ckey, ctx->pk[0].protkey, ckey_len); + memcpy(ckey + ckey_len, ctx->pk[1].protkey, ckey_len); + rc = xts_verify_key(tfm, ckey, 2 * ckey_len); + memzero_explicit(ckey, sizeof(ckey)); + if (rc) + goto out; + } + + /* Pick the correct function code based on the protected key type */ + switch (ctx->pk[0].type) { + case PKEY_KEYTYPE_AES_128: + fc = CPACF_KM_PXTS_128; + break; + case PKEY_KEYTYPE_AES_256: + fc = CPACF_KM_PXTS_256; + break; + case PKEY_KEYTYPE_AES_XTS_128: + fc = CPACF_KM_PXTS_128_FULL; + break; + case PKEY_KEYTYPE_AES_XTS_256: + fc = CPACF_KM_PXTS_256_FULL; + break; + default: + fc = 0; + break; + } + ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + + rc = fc ? 0 : -EINVAL; + +out: + pr_debug("rc=%d\n", rc); + return rc; } -static int paes_xts_crypt_full(struct skcipher_request *req, - unsigned long modifier) +static int xts_paes_do_crypt_fullkey(struct s390_pxts_ctx *ctx, + struct s390_pxts_req_ctx *req_ctx, + bool tested, bool maysleep) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct xts_full_km_param *param = &req_ctx->param.full_km_param; + struct skcipher_walk *walk = &req_ctx->walk; unsigned int keylen, offset, nbytes, n, k; - struct { - u8 key[64]; - u8 tweak[16]; - u8 nap[16]; - u8 wkvp[32]; - } fxts_param = { - .nap = {0}, - }; - struct skcipher_walk walk; - int rc; + int rc = 0; - rc = skcipher_walk_virt(&walk, req, false); - if (rc) - return rc; + /* + * The calling function xts_paes_do_crypt() ensures the + * protected key state is always PK_STATE_VALID when this + * function is invoked. + */ keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_128) ? 32 : 64; offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_128) ? 32 : 0; - spin_lock_bh(&ctx->pk_lock); - memcpy(fxts_param.key + offset, ctx->pk[0].protkey, keylen); - memcpy(fxts_param.wkvp, ctx->pk[0].protkey + keylen, - sizeof(fxts_param.wkvp)); - spin_unlock_bh(&ctx->pk_lock); - memcpy(fxts_param.tweak, walk.iv, sizeof(fxts_param.tweak)); - fxts_param.nap[0] = 0x01; /* initial alpha power (1, little-endian) */ + if (!req_ctx->param_init_done) { + memset(param, 0, sizeof(*param)); + spin_lock_bh(&ctx->pk_lock); + memcpy(param->key + offset, ctx->pk[0].protkey, keylen); + memcpy(param->wkvp, ctx->pk[0].protkey + keylen, sizeof(param->wkvp)); + spin_unlock_bh(&ctx->pk_lock); + memcpy(param->tweak, walk->iv, sizeof(param->tweak)); + param->nap[0] = 0x01; /* initial alpha power (1, little-endian) */ + req_ctx->param_init_done = true; + } - while ((nbytes = walk.nbytes) != 0) { + /* + * Note that in case of partial processing or failure the walk + * is NOT unmapped here. So a follow up task may reuse the walk + * or in case of unrecoverable failure needs to unmap it. + */ + while ((nbytes = walk->nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); - k = cpacf_km(ctx->fc | modifier, fxts_param.key + offset, - walk.dst.virt.addr, walk.src.virt.addr, n); + k = cpacf_km(ctx->fc | req_ctx->modifier, param->key + offset, + walk->dst.virt.addr, walk->src.virt.addr, n); if (k) - rc = skcipher_walk_done(&walk, nbytes - k); + rc = skcipher_walk_done(walk, nbytes - k); if (k < n) { - if (__xts_paes_convert_key(ctx)) - return skcipher_walk_done(&walk, -EIO); + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = pxts_convert_key(ctx, tested); + if (rc) + goto out; spin_lock_bh(&ctx->pk_lock); - memcpy(fxts_param.key + offset, ctx->pk[0].protkey, - keylen); - memcpy(fxts_param.wkvp, ctx->pk[0].protkey + keylen, - sizeof(fxts_param.wkvp)); + memcpy(param->key + offset, ctx->pk[0].protkey, keylen); + memcpy(param->wkvp, ctx->pk[0].protkey + keylen, sizeof(param->wkvp)); spin_unlock_bh(&ctx->pk_lock); } } +out: + pr_debug("rc=%d\n", rc); return rc; } -static int paes_xts_crypt(struct skcipher_request *req, unsigned long modifier) +static inline int __xts_2keys_prep_param(struct s390_pxts_ctx *ctx, + struct xts_km_param *param, + struct skcipher_walk *walk, + unsigned int keylen, + unsigned int offset, + bool tested, bool maysleep) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct xts_pcc_param pcc_param; + unsigned long cc = 1; + int rc = 0; + + while (cc) { + memset(&pcc_param, 0, sizeof(pcc_param)); + memcpy(pcc_param.tweak, walk->iv, sizeof(pcc_param.tweak)); + spin_lock_bh(&ctx->pk_lock); + memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen); + memcpy(param->key + offset, ctx->pk[0].protkey, keylen); + spin_unlock_bh(&ctx->pk_lock); + cc = cpacf_pcc(ctx->fc, pcc_param.key + offset); + if (cc) { + if (!maysleep) { + rc = -EKEYEXPIRED; + break; + } + rc = pxts_convert_key(ctx, tested); + if (rc) + break; + continue; + } + memcpy(param->init, pcc_param.xts, 16); + } + + memzero_explicit(pcc_param.key, sizeof(pcc_param.key)); + return rc; +} + +static int xts_paes_do_crypt_2keys(struct s390_pxts_ctx *ctx, + struct s390_pxts_req_ctx *req_ctx, + bool tested, bool maysleep) +{ + struct xts_km_param *param = &req_ctx->param.km_param; + struct skcipher_walk *walk = &req_ctx->walk; unsigned int keylen, offset, nbytes, n, k; - struct { - u8 key[PAES_256_PROTKEY_SIZE]; - u8 tweak[16]; - u8 block[16]; - u8 bit[16]; - u8 xts[16]; - } pcc_param; - struct { - u8 key[PAES_256_PROTKEY_SIZE]; - u8 init[16]; - } xts_param; - struct skcipher_walk walk; - int rc; + int rc = 0; - rc = skcipher_walk_virt(&walk, req, false); - if (rc) - return rc; + /* + * The calling function xts_paes_do_crypt() ensures the + * protected key state is always PK_STATE_VALID when this + * function is invoked. + */ keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 48 : 64; offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0; - memset(&pcc_param, 0, sizeof(pcc_param)); - memcpy(pcc_param.tweak, walk.iv, sizeof(pcc_param.tweak)); - spin_lock_bh(&ctx->pk_lock); - memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen); - memcpy(xts_param.key + offset, ctx->pk[0].protkey, keylen); - spin_unlock_bh(&ctx->pk_lock); - cpacf_pcc(ctx->fc, pcc_param.key + offset); - memcpy(xts_param.init, pcc_param.xts, 16); + if (!req_ctx->param_init_done) { + rc = __xts_2keys_prep_param(ctx, param, walk, + keylen, offset, tested, maysleep); + if (rc) + goto out; + req_ctx->param_init_done = true; + } - while ((nbytes = walk.nbytes) != 0) { + /* + * Note that in case of partial processing or failure the walk + * is NOT unmapped here. So a follow up task may reuse the walk + * or in case of unrecoverable failure needs to unmap it. + */ + while ((nbytes = walk->nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); - k = cpacf_km(ctx->fc | modifier, xts_param.key + offset, - walk.dst.virt.addr, walk.src.virt.addr, n); + k = cpacf_km(ctx->fc | req_ctx->modifier, param->key + offset, + walk->dst.virt.addr, walk->src.virt.addr, n); if (k) - rc = skcipher_walk_done(&walk, nbytes - k); + rc = skcipher_walk_done(walk, nbytes - k); if (k < n) { - if (__xts_paes_convert_key(ctx)) - return skcipher_walk_done(&walk, -EIO); + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = pxts_convert_key(ctx, tested); + if (rc) + goto out; spin_lock_bh(&ctx->pk_lock); - memcpy(xts_param.key + offset, - ctx->pk[0].protkey, keylen); + memcpy(param->key + offset, ctx->pk[0].protkey, keylen); spin_unlock_bh(&ctx->pk_lock); } } +out: + pr_debug("rc=%d\n", rc); return rc; } -static inline int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier) +static int xts_paes_do_crypt(struct s390_pxts_ctx *ctx, + struct s390_pxts_req_ctx *req_ctx, + bool tested, bool maysleep) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + int pk_state, rc = 0; + + /* fetch and check protected key state */ + spin_lock_bh(&ctx->pk_lock); + pk_state = ctx->pk_state; + switch (pk_state) { + case PK_STATE_NO_KEY: + rc = -ENOKEY; + break; + case PK_STATE_CONVERT_IN_PROGRESS: + rc = -EKEYEXPIRED; + break; + case PK_STATE_VALID: + break; + default: + rc = pk_state < 0 ? pk_state : -EIO; + break; + } + spin_unlock_bh(&ctx->pk_lock); + if (rc) + goto out; + /* Call the 'real' crypt function based on the xts prot key type. */ switch (ctx->fc) { case CPACF_KM_PXTS_128: case CPACF_KM_PXTS_256: - return paes_xts_crypt(req, modifier); + rc = xts_paes_do_crypt_2keys(ctx, req_ctx, tested, maysleep); + break; case CPACF_KM_PXTS_128_FULL: case CPACF_KM_PXTS_256_FULL: - return paes_xts_crypt_full(req, modifier); + rc = xts_paes_do_crypt_fullkey(ctx, req_ctx, tested, maysleep); + break; default: - return -EINVAL; + rc = -EINVAL; } -} -static int xts_paes_encrypt(struct skcipher_request *req) -{ - return xts_paes_crypt(req, 0); +out: + pr_debug("rc=%d\n", rc); + return rc; } -static int xts_paes_decrypt(struct skcipher_request *req) +static inline int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier) { - return xts_paes_crypt(req, CPACF_DECRYPT); -} + struct s390_pxts_req_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; + bool tested = crypto_skcipher_tested(tfm); + int rc; -static struct skcipher_alg xts_paes_alg = { - .base.cra_name = "xts(paes)", - .base.cra_driver_name = "xts-paes-s390", - .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ - .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct s390_pxts_ctx), - .base.cra_module = THIS_MODULE, - .base.cra_list = LIST_HEAD_INIT(xts_paes_alg.base.cra_list), - .init = xts_paes_init, - .exit = xts_paes_exit, - .min_keysize = 2 * PAES_MIN_KEYSIZE, - .max_keysize = 2 * PAES_MAX_KEYSIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = xts_paes_set_key, - .encrypt = xts_paes_encrypt, - .decrypt = xts_paes_decrypt, -}; + /* + * Attempt synchronous encryption first. If it fails, schedule the request + * asynchronously via the crypto engine. To preserve execution order, + * once a request is queued to the engine, further requests using the same + * tfm will also be routed through the engine. + */ -static int ctr_paes_init(struct crypto_skcipher *tfm) -{ - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + rc = skcipher_walk_virt(walk, req, false); + if (rc) + goto out; - ctx->kb.key = NULL; - spin_lock_init(&ctx->pk_lock); + req_ctx->modifier = modifier; + req_ctx->param_init_done = false; - return 0; -} + /* Try synchronous operation if no active engine usage */ + if (!atomic_read(&ctx->via_engine_ctr)) { + rc = xts_paes_do_crypt(ctx, req_ctx, tested, false); + if (rc == 0) + goto out; + } -static void ctr_paes_exit(struct crypto_skcipher *tfm) -{ - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + /* + * If sync operation failed or key expired or there are already + * requests enqueued via engine, fallback to async. Mark tfm as + * using engine to serialize requests. + */ + if (rc == 0 || rc == -EKEYEXPIRED) { + atomic_inc(&ctx->via_engine_ctr); + rc = crypto_transfer_skcipher_request_to_engine(paes_crypto_engine, req); + if (rc != -EINPROGRESS) + atomic_dec(&ctx->via_engine_ctr); + } + + if (rc != -EINPROGRESS) + skcipher_walk_done(walk, rc); - _free_kb_keybuf(&ctx->kb); +out: + if (rc != -EINPROGRESS) + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("rc=%d\n", rc); + return rc; } -static inline int __ctr_paes_set_key(struct s390_paes_ctx *ctx) +static int xts_paes_encrypt(struct skcipher_request *req) { - unsigned long fc; - int rc; - - rc = __paes_convert_key(ctx); - if (rc) - return rc; - - /* Pick the correct function code based on the protected key type */ - fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMCTR_PAES_128 : - (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMCTR_PAES_192 : - (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? - CPACF_KMCTR_PAES_256 : 0; - - /* Check if the function code is available */ - ctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0; + return xts_paes_crypt(req, 0); +} - return ctx->fc ? 0 : -EINVAL; +static int xts_paes_decrypt(struct skcipher_request *req) +{ + return xts_paes_crypt(req, CPACF_DECRYPT); } -static int ctr_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, - unsigned int key_len) +static int xts_paes_init(struct crypto_skcipher *tfm) { - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - int rc; + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); - _free_kb_keybuf(&ctx->kb); - rc = _key_to_kb(&ctx->kb, in_key, key_len); - if (rc) - return rc; + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->pk_lock); + + crypto_skcipher_set_reqsize(tfm, sizeof(struct s390_pxts_req_ctx)); - return __ctr_paes_set_key(ctx); + return 0; } -static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) +static void xts_paes_exit(struct crypto_skcipher *tfm) { - unsigned int i, n; + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); - /* only use complete blocks, max. PAGE_SIZE */ - memcpy(ctrptr, iv, AES_BLOCK_SIZE); - n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1); - for (i = (n / AES_BLOCK_SIZE) - 1; i > 0; i--) { - memcpy(ctrptr + AES_BLOCK_SIZE, ctrptr, AES_BLOCK_SIZE); - crypto_inc(ctrptr + AES_BLOCK_SIZE, AES_BLOCK_SIZE); - ctrptr += AES_BLOCK_SIZE; - } - return n; + memzero_explicit(ctx, sizeof(*ctx)); } -static int ctr_paes_crypt(struct skcipher_request *req) +static int xts_paes_do_one_request(struct crypto_engine *engine, void *areq) { + struct skcipher_request *req = skcipher_request_cast(areq); + struct s390_pxts_req_ctx *req_ctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - u8 buf[AES_BLOCK_SIZE], *ctrptr; - struct { - u8 key[PAES_256_PROTKEY_SIZE]; - } param; - struct skcipher_walk walk; - unsigned int nbytes, n, k; - int rc, locked; - - rc = skcipher_walk_virt(&walk, req, false); - if (rc) - return rc; - - spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE); - spin_unlock_bh(&ctx->pk_lock); - - locked = mutex_trylock(&ctrblk_lock); + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; + bool tested = crypto_skcipher_tested(tfm); + int rc; - while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { - n = AES_BLOCK_SIZE; - if (nbytes >= 2*AES_BLOCK_SIZE && locked) - n = __ctrblk_init(ctrblk, walk.iv, nbytes); - ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk.iv; - k = cpacf_kmctr(ctx->fc, ¶m, walk.dst.virt.addr, - walk.src.virt.addr, n, ctrptr); - if (k) { - if (ctrptr == ctrblk) - memcpy(walk.iv, ctrptr + k - AES_BLOCK_SIZE, - AES_BLOCK_SIZE); - crypto_inc(walk.iv, AES_BLOCK_SIZE); - rc = skcipher_walk_done(&walk, nbytes - k); - } - if (k < n) { - if (__paes_convert_key(ctx)) { - if (locked) - mutex_unlock(&ctrblk_lock); - return skcipher_walk_done(&walk, -EIO); - } - spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE); - spin_unlock_bh(&ctx->pk_lock); - } - } - if (locked) - mutex_unlock(&ctrblk_lock); - /* - * final block may be < AES_BLOCK_SIZE, copy only nbytes - */ - if (nbytes) { - memset(buf, 0, AES_BLOCK_SIZE); - memcpy(buf, walk.src.virt.addr, nbytes); - while (1) { - if (cpacf_kmctr(ctx->fc, ¶m, buf, - buf, AES_BLOCK_SIZE, - walk.iv) == AES_BLOCK_SIZE) - break; - if (__paes_convert_key(ctx)) - return skcipher_walk_done(&walk, -EIO); - spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE); - spin_unlock_bh(&ctx->pk_lock); - } - memcpy(walk.dst.virt.addr, buf, nbytes); - crypto_inc(walk.iv, AES_BLOCK_SIZE); - rc = skcipher_walk_done(&walk, nbytes); + /* walk has already been prepared */ + + rc = xts_paes_do_crypt(ctx, req_ctx, tested, true); + if (rc == -EKEYEXPIRED) { + /* + * Protected key expired, conversion is in process. + * Trigger a re-schedule of this request by returning + * -ENOSPC ("hardware queue is full") to the crypto engine. + * To avoid immediately re-invocation of this callback, + * tell the scheduler to voluntarily give up the CPU here. + */ + cond_resched(); + pr_debug("rescheduling request\n"); + return -ENOSPC; + } else if (rc) { + skcipher_walk_done(walk, rc); } + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("request complete with rc=%d\n", rc); + local_bh_disable(); + atomic_dec(&ctx->via_engine_ctr); + crypto_finalize_skcipher_request(engine, req, rc); + local_bh_enable(); return rc; } -static struct skcipher_alg ctr_paes_alg = { - .base.cra_name = "ctr(paes)", - .base.cra_driver_name = "ctr-paes-s390", - .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct s390_paes_ctx), - .base.cra_module = THIS_MODULE, - .base.cra_list = LIST_HEAD_INIT(ctr_paes_alg.base.cra_list), - .init = ctr_paes_init, - .exit = ctr_paes_exit, - .min_keysize = PAES_MIN_KEYSIZE, - .max_keysize = PAES_MAX_KEYSIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ctr_paes_set_key, - .encrypt = ctr_paes_crypt, - .decrypt = ctr_paes_crypt, - .chunksize = AES_BLOCK_SIZE, +static struct skcipher_engine_alg xts_paes_alg = { + .base = { + .base.cra_name = "xts(paes)", + .base.cra_driver_name = "xts-paes-s390", + .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_pxts_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(xts_paes_alg.base.base.cra_list), + .init = xts_paes_init, + .exit = xts_paes_exit, + .min_keysize = 2 * PAES_MIN_KEYSIZE, + .max_keysize = 2 * PAES_MAX_KEYSIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_paes_setkey, + .encrypt = xts_paes_encrypt, + .decrypt = xts_paes_decrypt, + }, + .op = { + .do_one_request = xts_paes_do_one_request, + }, }; -static inline void __crypto_unregister_skcipher(struct skcipher_alg *alg) +/* + * alg register, unregister, module init, exit + */ + +static struct miscdevice paes_dev = { + .name = "paes", + .minor = MISC_DYNAMIC_MINOR, +}; + +static inline void __crypto_unregister_skcipher(struct skcipher_engine_alg *alg) { - if (!list_empty(&alg->base.cra_list)) - crypto_unregister_skcipher(alg); + if (!list_empty(&alg->base.base.cra_list)) + crypto_engine_unregister_skcipher(alg); } static void paes_s390_fini(void) { + if (paes_crypto_engine) { + crypto_engine_stop(paes_crypto_engine); + crypto_engine_exit(paes_crypto_engine); + } __crypto_unregister_skcipher(&ctr_paes_alg); __crypto_unregister_skcipher(&xts_paes_alg); __crypto_unregister_skcipher(&cbc_paes_alg); __crypto_unregister_skcipher(&ecb_paes_alg); if (ctrblk) - free_page((unsigned long) ctrblk); + free_page((unsigned long)ctrblk); + misc_deregister(&paes_dev); } static int __init paes_s390_init(void) { int rc; + /* register a simple paes pseudo misc device */ + rc = misc_register(&paes_dev); + if (rc) + return rc; + + /* with this pseudo devie alloc and start a crypto engine */ + paes_crypto_engine = + crypto_engine_alloc_init_and_set(paes_dev.this_device, + true, false, MAX_QLEN); + if (!paes_crypto_engine) { + rc = -ENOMEM; + goto out_err; + } + rc = crypto_engine_start(paes_crypto_engine); + if (rc) { + crypto_engine_exit(paes_crypto_engine); + paes_crypto_engine = NULL; + goto out_err; + } + /* Query available functions for KM, KMC and KMCTR */ cpacf_query(CPACF_KM, &km_functions); cpacf_query(CPACF_KMC, &kmc_functions); @@ -927,40 +1673,45 @@ static int __init paes_s390_init(void) if (cpacf_test_func(&km_functions, CPACF_KM_PAES_128) || cpacf_test_func(&km_functions, CPACF_KM_PAES_192) || cpacf_test_func(&km_functions, CPACF_KM_PAES_256)) { - rc = crypto_register_skcipher(&ecb_paes_alg); + rc = crypto_engine_register_skcipher(&ecb_paes_alg); if (rc) goto out_err; + pr_debug("%s registered\n", ecb_paes_alg.base.base.cra_driver_name); } if (cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_128) || cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_192) || cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_256)) { - rc = crypto_register_skcipher(&cbc_paes_alg); + rc = crypto_engine_register_skcipher(&cbc_paes_alg); if (rc) goto out_err; + pr_debug("%s registered\n", cbc_paes_alg.base.base.cra_driver_name); } if (cpacf_test_func(&km_functions, CPACF_KM_PXTS_128) || cpacf_test_func(&km_functions, CPACF_KM_PXTS_256)) { - rc = crypto_register_skcipher(&xts_paes_alg); + rc = crypto_engine_register_skcipher(&xts_paes_alg); if (rc) goto out_err; + pr_debug("%s registered\n", xts_paes_alg.base.base.cra_driver_name); } if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_128) || cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_192) || cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_256)) { - ctrblk = (u8 *) __get_free_page(GFP_KERNEL); + ctrblk = (u8 *)__get_free_page(GFP_KERNEL); if (!ctrblk) { rc = -ENOMEM; goto out_err; } - rc = crypto_register_skcipher(&ctr_paes_alg); + rc = crypto_engine_register_skcipher(&ctr_paes_alg); if (rc) goto out_err; + pr_debug("%s registered\n", ctr_paes_alg.base.base.cra_driver_name); } return 0; + out_err: paes_s390_fini(); return rc; diff --git a/arch/s390/crypto/phmac_s390.c b/arch/s390/crypto/phmac_s390.c new file mode 100644 index 000000000000..03ca33ffe6cc --- /dev/null +++ b/arch/s390/crypto/phmac_s390.c @@ -0,0 +1,1074 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright IBM Corp. 2025 + * + * s390 specific HMAC support for protected keys. + */ + +#define pr_fmt(fmt) "phmac_s390: " fmt + +#include <asm/cpacf.h> +#include <asm/pkey.h> +#include <crypto/engine.h> +#include <crypto/hash.h> +#include <crypto/internal/hash.h> +#include <crypto/sha2.h> +#include <linux/atomic.h> +#include <linux/cpufeature.h> +#include <linux/delay.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/spinlock.h> + +static struct crypto_engine *phmac_crypto_engine; +#define MAX_QLEN 10 + +static bool pkey_clrkey_allowed; +module_param_named(clrkey, pkey_clrkey_allowed, bool, 0444); +MODULE_PARM_DESC(clrkey, "Allow clear key material (default N)"); + +/* + * A simple hash walk helper + */ + +struct hash_walk_helper { + struct crypto_hash_walk walk; + const u8 *walkaddr; + int walkbytes; +}; + +/* + * Prepare hash walk helper. + * Set up the base hash walk, fill walkaddr and walkbytes. + * Returns 0 on success or negative value on error. + */ +static inline int hwh_prepare(struct ahash_request *req, + struct hash_walk_helper *hwh) +{ + hwh->walkbytes = crypto_hash_walk_first(req, &hwh->walk); + if (hwh->walkbytes < 0) + return hwh->walkbytes; + hwh->walkaddr = hwh->walk.data; + return 0; +} + +/* + * Advance hash walk helper by n bytes. + * Progress the walkbytes and walkaddr fields by n bytes. + * If walkbytes is then 0, pull next hunk from hash walk + * and update walkbytes and walkaddr. + * If n is negative, unmap hash walk and return error. + * Returns 0 on success or negative value on error. + */ +static inline int hwh_advance(struct hash_walk_helper *hwh, int n) +{ + if (n < 0) + return crypto_hash_walk_done(&hwh->walk, n); + + hwh->walkbytes -= n; + hwh->walkaddr += n; + if (hwh->walkbytes > 0) + return 0; + + hwh->walkbytes = crypto_hash_walk_done(&hwh->walk, 0); + if (hwh->walkbytes < 0) + return hwh->walkbytes; + + hwh->walkaddr = hwh->walk.data; + return 0; +} + +/* + * KMAC param block layout for sha2 function codes: + * The layout of the param block for the KMAC instruction depends on the + * blocksize of the used hashing sha2-algorithm function codes. The param block + * contains the hash chaining value (cv), the input message bit-length (imbl) + * and the hmac-secret (key). To prevent code duplication, the sizes of all + * these are calculated based on the blocksize. + * + * param-block: + * +-------+ + * | cv | + * +-------+ + * | imbl | + * +-------+ + * | key | + * +-------+ + * + * sizes: + * part | sh2-alg | calculation | size | type + * -----+---------+-------------+------+-------- + * cv | 224/256 | blocksize/2 | 32 | u64[8] + * | 384/512 | | 64 | u128[8] + * imbl | 224/256 | blocksize/8 | 8 | u64 + * | 384/512 | | 16 | u128 + * key | 224/256 | blocksize | 96 | u8[96] + * | 384/512 | | 160 | u8[160] + */ + +#define MAX_DIGEST_SIZE SHA512_DIGEST_SIZE +#define MAX_IMBL_SIZE sizeof(u128) +#define MAX_BLOCK_SIZE SHA512_BLOCK_SIZE + +#define SHA2_CV_SIZE(bs) ((bs) >> 1) +#define SHA2_IMBL_SIZE(bs) ((bs) >> 3) + +#define SHA2_IMBL_OFFSET(bs) (SHA2_CV_SIZE(bs)) +#define SHA2_KEY_OFFSET(bs) (SHA2_CV_SIZE(bs) + SHA2_IMBL_SIZE(bs)) + +#define PHMAC_MAX_KEYSIZE 256 +#define PHMAC_SHA256_PK_SIZE (SHA256_BLOCK_SIZE + 32) +#define PHMAC_SHA512_PK_SIZE (SHA512_BLOCK_SIZE + 32) +#define PHMAC_MAX_PK_SIZE PHMAC_SHA512_PK_SIZE + +/* phmac protected key struct */ +struct phmac_protkey { + u32 type; + u32 len; + u8 protkey[PHMAC_MAX_PK_SIZE]; +}; + +#define PK_STATE_NO_KEY 0 +#define PK_STATE_CONVERT_IN_PROGRESS 1 +#define PK_STATE_VALID 2 + +/* phmac tfm context */ +struct phmac_tfm_ctx { + /* source key material used to derive a protected key from */ + u8 keybuf[PHMAC_MAX_KEYSIZE]; + unsigned int keylen; + + /* cpacf function code to use with this protected key type */ + long fc; + + /* nr of requests enqueued via crypto engine which use this tfm ctx */ + atomic_t via_engine_ctr; + + /* spinlock to atomic read/update all the following fields */ + spinlock_t pk_lock; + + /* see PK_STATE* defines above, < 0 holds convert failure rc */ + int pk_state; + /* if state is valid, pk holds the protected key */ + struct phmac_protkey pk; +}; + +union kmac_gr0 { + unsigned long reg; + struct { + unsigned long : 48; + unsigned long ikp : 1; + unsigned long iimp : 1; + unsigned long ccup : 1; + unsigned long : 6; + unsigned long fc : 7; + }; +}; + +struct kmac_sha2_ctx { + u8 param[MAX_DIGEST_SIZE + MAX_IMBL_SIZE + PHMAC_MAX_PK_SIZE]; + union kmac_gr0 gr0; + u8 buf[MAX_BLOCK_SIZE]; + u64 buflen[2]; +}; + +enum async_op { + OP_NOP = 0, + OP_UPDATE, + OP_FINAL, + OP_FINUP, +}; + +/* phmac request context */ +struct phmac_req_ctx { + struct hash_walk_helper hwh; + struct kmac_sha2_ctx kmac_ctx; + enum async_op async_op; +}; + +/* + * Pkey 'token' struct used to derive a protected key value from a clear key. + */ +struct hmac_clrkey_token { + u8 type; + u8 res0[3]; + u8 version; + u8 res1[3]; + u32 keytype; + u32 len; + u8 key[]; +} __packed; + +static int hash_key(const u8 *in, unsigned int inlen, + u8 *digest, unsigned int digestsize) +{ + unsigned long func; + union { + struct sha256_paramblock { + u32 h[8]; + u64 mbl; + } sha256; + struct sha512_paramblock { + u64 h[8]; + u128 mbl; + } sha512; + } __packed param; + +#define PARAM_INIT(x, y, z) \ + param.sha##x.h[0] = SHA##y ## _H0; \ + param.sha##x.h[1] = SHA##y ## _H1; \ + param.sha##x.h[2] = SHA##y ## _H2; \ + param.sha##x.h[3] = SHA##y ## _H3; \ + param.sha##x.h[4] = SHA##y ## _H4; \ + param.sha##x.h[5] = SHA##y ## _H5; \ + param.sha##x.h[6] = SHA##y ## _H6; \ + param.sha##x.h[7] = SHA##y ## _H7; \ + param.sha##x.mbl = (z) + + switch (digestsize) { + case SHA224_DIGEST_SIZE: + func = CPACF_KLMD_SHA_256; + PARAM_INIT(256, 224, inlen * 8); + break; + case SHA256_DIGEST_SIZE: + func = CPACF_KLMD_SHA_256; + PARAM_INIT(256, 256, inlen * 8); + break; + case SHA384_DIGEST_SIZE: + func = CPACF_KLMD_SHA_512; + PARAM_INIT(512, 384, inlen * 8); + break; + case SHA512_DIGEST_SIZE: + func = CPACF_KLMD_SHA_512; + PARAM_INIT(512, 512, inlen * 8); + break; + default: + return -EINVAL; + } + +#undef PARAM_INIT + + cpacf_klmd(func, ¶m, in, inlen); + + memcpy(digest, ¶m, digestsize); + + return 0; +} + +/* + * make_clrkey_token() - wrap the clear key into a pkey clearkey token. + */ +static inline int make_clrkey_token(const u8 *clrkey, size_t clrkeylen, + unsigned int digestsize, u8 *dest) +{ + struct hmac_clrkey_token *token = (struct hmac_clrkey_token *)dest; + unsigned int blocksize; + int rc; + + token->type = 0x00; + token->version = 0x02; + switch (digestsize) { + case SHA224_DIGEST_SIZE: + case SHA256_DIGEST_SIZE: + token->keytype = PKEY_KEYTYPE_HMAC_512; + blocksize = 64; + break; + case SHA384_DIGEST_SIZE: + case SHA512_DIGEST_SIZE: + token->keytype = PKEY_KEYTYPE_HMAC_1024; + blocksize = 128; + break; + default: + return -EINVAL; + } + token->len = blocksize; + + if (clrkeylen > blocksize) { + rc = hash_key(clrkey, clrkeylen, token->key, digestsize); + if (rc) + return rc; + } else { + memcpy(token->key, clrkey, clrkeylen); + } + + return 0; +} + +/* + * phmac_tfm_ctx_setkey() - Set key value into tfm context, maybe construct + * a clear key token digestible by pkey from a clear key value. + */ +static inline int phmac_tfm_ctx_setkey(struct phmac_tfm_ctx *tfm_ctx, + const u8 *key, unsigned int keylen) +{ + if (keylen > sizeof(tfm_ctx->keybuf)) + return -EINVAL; + + memcpy(tfm_ctx->keybuf, key, keylen); + tfm_ctx->keylen = keylen; + + return 0; +} + +/* + * Convert the raw key material into a protected key via PKEY api. + * This function may sleep - don't call in non-sleeping context. + */ +static inline int convert_key(const u8 *key, unsigned int keylen, + struct phmac_protkey *pk, bool tested) +{ + u32 xflags = PKEY_XFLAG_NOMEMALLOC; + int rc, i; + + if (tested && !pkey_clrkey_allowed) + xflags |= PKEY_XFLAG_NOCLEARKEY; + + pk->len = sizeof(pk->protkey); + + /* + * In case of a busy card retry with increasing delay + * of 200, 400, 800 and 1600 ms - in total 3 s. + */ + for (rc = -EIO, i = 0; rc && i < 5; i++) { + if (rc == -EBUSY && msleep_interruptible((1 << i) * 100)) { + rc = -EINTR; + goto out; + } + rc = pkey_key2protkey(key, keylen, + pk->protkey, &pk->len, &pk->type, + xflags); + } + +out: + pr_debug("rc=%d\n", rc); + return rc; +} + +/* + * (Re-)Convert the raw key material from the tfm ctx into a protected + * key via convert_key() function. Update the pk_state, pk_type, pk_len + * and the protected key in the tfm context. + * Please note this function may be invoked concurrently with the very + * same tfm context. The pk_lock spinlock in the context ensures an + * atomic update of the pk and the pk state but does not guarantee any + * order of update. So a fresh converted valid protected key may get + * updated with an 'old' expired key value. As the cpacf instructions + * detect this, refuse to operate with an invalid key and the calling + * code triggers a (re-)conversion this does no harm. This may lead to + * unnecessary additional conversion but never to invalid data on the + * hash operation. + */ +static int phmac_convert_key(struct phmac_tfm_ctx *tfm_ctx, bool tested) +{ + struct phmac_protkey pk; + int rc; + + spin_lock_bh(&tfm_ctx->pk_lock); + tfm_ctx->pk_state = PK_STATE_CONVERT_IN_PROGRESS; + spin_unlock_bh(&tfm_ctx->pk_lock); + + rc = convert_key(tfm_ctx->keybuf, tfm_ctx->keylen, &pk, tested); + + /* update context */ + spin_lock_bh(&tfm_ctx->pk_lock); + if (rc) { + tfm_ctx->pk_state = rc; + } else { + tfm_ctx->pk_state = PK_STATE_VALID; + tfm_ctx->pk = pk; + } + spin_unlock_bh(&tfm_ctx->pk_lock); + + memzero_explicit(&pk, sizeof(pk)); + pr_debug("rc=%d\n", rc); + return rc; +} + +/* + * kmac_sha2_set_imbl - sets the input message bit-length based on the blocksize + */ +static inline void kmac_sha2_set_imbl(u8 *param, u64 buflen_lo, + u64 buflen_hi, unsigned int blocksize) +{ + u8 *imbl = param + SHA2_IMBL_OFFSET(blocksize); + + switch (blocksize) { + case SHA256_BLOCK_SIZE: + *(u64 *)imbl = buflen_lo * BITS_PER_BYTE; + break; + case SHA512_BLOCK_SIZE: + *(u128 *)imbl = (((u128)buflen_hi << 64) + buflen_lo) << 3; + break; + default: + break; + } +} + +static int phmac_kmac_update(struct ahash_request *req, bool maysleep) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm); + struct phmac_req_ctx *req_ctx = ahash_request_ctx(req); + struct kmac_sha2_ctx *ctx = &req_ctx->kmac_ctx; + struct hash_walk_helper *hwh = &req_ctx->hwh; + unsigned int bs = crypto_ahash_blocksize(tfm); + bool tested = crypto_ahash_tested(tfm); + unsigned int offset, k, n; + int rc = 0; + + /* + * The walk is always mapped when this function is called. + * Note that in case of partial processing or failure the walk + * is NOT unmapped here. So a follow up task may reuse the walk + * or in case of unrecoverable failure needs to unmap it. + */ + + while (hwh->walkbytes > 0) { + /* check sha2 context buffer */ + offset = ctx->buflen[0] % bs; + if (offset + hwh->walkbytes < bs) + goto store; + + if (offset) { + /* fill ctx buffer up to blocksize and process this block */ + n = bs - offset; + memcpy(ctx->buf + offset, hwh->walkaddr, n); + ctx->gr0.iimp = 1; + for (;;) { + k = _cpacf_kmac(&ctx->gr0.reg, ctx->param, ctx->buf, bs); + if (likely(k == bs)) + break; + if (unlikely(k > 0)) { + /* + * Can't deal with hunks smaller than blocksize. + * And kmac should always return the nr of + * processed bytes as 0 or a multiple of the + * blocksize. + */ + rc = -EIO; + goto out; + } + /* protected key is invalid and needs re-conversion */ + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = phmac_convert_key(tfm_ctx, tested); + if (rc) + goto out; + spin_lock_bh(&tfm_ctx->pk_lock); + memcpy(ctx->param + SHA2_KEY_OFFSET(bs), + tfm_ctx->pk.protkey, tfm_ctx->pk.len); + spin_unlock_bh(&tfm_ctx->pk_lock); + } + ctx->buflen[0] += n; + if (ctx->buflen[0] < n) + ctx->buflen[1]++; + rc = hwh_advance(hwh, n); + if (unlikely(rc)) + goto out; + offset = 0; + } + + /* process as many blocks as possible from the walk */ + while (hwh->walkbytes >= bs) { + n = (hwh->walkbytes / bs) * bs; + ctx->gr0.iimp = 1; + k = _cpacf_kmac(&ctx->gr0.reg, ctx->param, hwh->walkaddr, n); + if (likely(k > 0)) { + ctx->buflen[0] += k; + if (ctx->buflen[0] < k) + ctx->buflen[1]++; + rc = hwh_advance(hwh, k); + if (unlikely(rc)) + goto out; + } + if (unlikely(k < n)) { + /* protected key is invalid and needs re-conversion */ + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = phmac_convert_key(tfm_ctx, tested); + if (rc) + goto out; + spin_lock_bh(&tfm_ctx->pk_lock); + memcpy(ctx->param + SHA2_KEY_OFFSET(bs), + tfm_ctx->pk.protkey, tfm_ctx->pk.len); + spin_unlock_bh(&tfm_ctx->pk_lock); + } + } + +store: + /* store incomplete block in context buffer */ + if (hwh->walkbytes) { + memcpy(ctx->buf + offset, hwh->walkaddr, hwh->walkbytes); + ctx->buflen[0] += hwh->walkbytes; + if (ctx->buflen[0] < hwh->walkbytes) + ctx->buflen[1]++; + rc = hwh_advance(hwh, hwh->walkbytes); + if (unlikely(rc)) + goto out; + } + + } /* end of while (hwh->walkbytes > 0) */ + +out: + pr_debug("rc=%d\n", rc); + return rc; +} + +static int phmac_kmac_final(struct ahash_request *req, bool maysleep) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm); + struct phmac_req_ctx *req_ctx = ahash_request_ctx(req); + struct kmac_sha2_ctx *ctx = &req_ctx->kmac_ctx; + unsigned int ds = crypto_ahash_digestsize(tfm); + unsigned int bs = crypto_ahash_blocksize(tfm); + bool tested = crypto_ahash_tested(tfm); + unsigned int k, n; + int rc = 0; + + n = ctx->buflen[0] % bs; + ctx->gr0.iimp = 0; + kmac_sha2_set_imbl(ctx->param, ctx->buflen[0], ctx->buflen[1], bs); + for (;;) { + k = _cpacf_kmac(&ctx->gr0.reg, ctx->param, ctx->buf, n); + if (likely(k == n)) + break; + if (unlikely(k > 0)) { + /* Can't deal with hunks smaller than blocksize. */ + rc = -EIO; + goto out; + } + /* protected key is invalid and needs re-conversion */ + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = phmac_convert_key(tfm_ctx, tested); + if (rc) + goto out; + spin_lock_bh(&tfm_ctx->pk_lock); + memcpy(ctx->param + SHA2_KEY_OFFSET(bs), + tfm_ctx->pk.protkey, tfm_ctx->pk.len); + spin_unlock_bh(&tfm_ctx->pk_lock); + } + + memcpy(req->result, ctx->param, ds); + +out: + pr_debug("rc=%d\n", rc); + return rc; +} + +static int phmac_init(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm); + struct phmac_req_ctx *req_ctx = ahash_request_ctx(req); + struct kmac_sha2_ctx *kmac_ctx = &req_ctx->kmac_ctx; + unsigned int bs = crypto_ahash_blocksize(tfm); + int rc = 0; + + /* zero request context (includes the kmac sha2 context) */ + memset(req_ctx, 0, sizeof(*req_ctx)); + + /* + * setkey() should have set a valid fc into the tfm context. + * Copy this function code into the gr0 field of the kmac context. + */ + if (!tfm_ctx->fc) { + rc = -ENOKEY; + goto out; + } + kmac_ctx->gr0.fc = tfm_ctx->fc; + + /* + * Copy the pk from tfm ctx into kmac ctx. The protected key + * may be outdated but update() and final() will handle this. + */ + spin_lock_bh(&tfm_ctx->pk_lock); + memcpy(kmac_ctx->param + SHA2_KEY_OFFSET(bs), + tfm_ctx->pk.protkey, tfm_ctx->pk.len); + spin_unlock_bh(&tfm_ctx->pk_lock); + +out: + pr_debug("rc=%d\n", rc); + return rc; +} + +static int phmac_update(struct ahash_request *req) +{ + struct phmac_req_ctx *req_ctx = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm); + struct kmac_sha2_ctx *kmac_ctx = &req_ctx->kmac_ctx; + struct hash_walk_helper *hwh = &req_ctx->hwh; + int rc; + + /* prep the walk in the request context */ + rc = hwh_prepare(req, hwh); + if (rc) + goto out; + + /* Try synchronous operation if no active engine usage */ + if (!atomic_read(&tfm_ctx->via_engine_ctr)) { + rc = phmac_kmac_update(req, false); + if (rc == 0) + goto out; + } + + /* + * If sync operation failed or key expired or there are already + * requests enqueued via engine, fallback to async. Mark tfm as + * using engine to serialize requests. + */ + if (rc == 0 || rc == -EKEYEXPIRED) { + req_ctx->async_op = OP_UPDATE; + atomic_inc(&tfm_ctx->via_engine_ctr); + rc = crypto_transfer_hash_request_to_engine(phmac_crypto_engine, req); + if (rc != -EINPROGRESS) + atomic_dec(&tfm_ctx->via_engine_ctr); + } + + if (rc != -EINPROGRESS) { + hwh_advance(hwh, rc); + memzero_explicit(kmac_ctx, sizeof(*kmac_ctx)); + } + +out: + pr_debug("rc=%d\n", rc); + return rc; +} + +static int phmac_final(struct ahash_request *req) +{ + struct phmac_req_ctx *req_ctx = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm); + struct kmac_sha2_ctx *kmac_ctx = &req_ctx->kmac_ctx; + int rc = 0; + + /* Try synchronous operation if no active engine usage */ + if (!atomic_read(&tfm_ctx->via_engine_ctr)) { + rc = phmac_kmac_final(req, false); + if (rc == 0) + goto out; + } + + /* + * If sync operation failed or key expired or there are already + * requests enqueued via engine, fallback to async. Mark tfm as + * using engine to serialize requests. + */ + if (rc == 0 || rc == -EKEYEXPIRED) { + req_ctx->async_op = OP_FINAL; + atomic_inc(&tfm_ctx->via_engine_ctr); + rc = crypto_transfer_hash_request_to_engine(phmac_crypto_engine, req); + if (rc != -EINPROGRESS) + atomic_dec(&tfm_ctx->via_engine_ctr); + } + +out: + if (rc != -EINPROGRESS) + memzero_explicit(kmac_ctx, sizeof(*kmac_ctx)); + pr_debug("rc=%d\n", rc); + return rc; +} + +static int phmac_finup(struct ahash_request *req) +{ + struct phmac_req_ctx *req_ctx = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm); + struct kmac_sha2_ctx *kmac_ctx = &req_ctx->kmac_ctx; + struct hash_walk_helper *hwh = &req_ctx->hwh; + int rc; + + /* prep the walk in the request context */ + rc = hwh_prepare(req, hwh); + if (rc) + goto out; + + req_ctx->async_op = OP_FINUP; + + /* Try synchronous operations if no active engine usage */ + if (!atomic_read(&tfm_ctx->via_engine_ctr)) { + rc = phmac_kmac_update(req, false); + if (rc == 0) + req_ctx->async_op = OP_FINAL; + } + if (!rc && req_ctx->async_op == OP_FINAL && + !atomic_read(&tfm_ctx->via_engine_ctr)) { + rc = phmac_kmac_final(req, false); + if (rc == 0) + goto out; + } + + /* + * If sync operation failed or key expired or there are already + * requests enqueued via engine, fallback to async. Mark tfm as + * using engine to serialize requests. + */ + if (rc == 0 || rc == -EKEYEXPIRED) { + /* req->async_op has been set to either OP_FINUP or OP_FINAL */ + atomic_inc(&tfm_ctx->via_engine_ctr); + rc = crypto_transfer_hash_request_to_engine(phmac_crypto_engine, req); + if (rc != -EINPROGRESS) + atomic_dec(&tfm_ctx->via_engine_ctr); + } + + if (rc != -EINPROGRESS) + hwh_advance(hwh, rc); + +out: + if (rc != -EINPROGRESS) + memzero_explicit(kmac_ctx, sizeof(*kmac_ctx)); + pr_debug("rc=%d\n", rc); + return rc; +} + +static int phmac_digest(struct ahash_request *req) +{ + int rc; + + rc = phmac_init(req); + if (rc) + goto out; + + rc = phmac_finup(req); + +out: + pr_debug("rc=%d\n", rc); + return rc; +} + +static int phmac_setkey(struct crypto_ahash *tfm, + const u8 *key, unsigned int keylen) +{ + struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm); + unsigned int ds = crypto_ahash_digestsize(tfm); + unsigned int bs = crypto_ahash_blocksize(tfm); + bool tested = crypto_ahash_tested(tfm); + unsigned int tmpkeylen; + u8 *tmpkey = NULL; + int rc = 0; + + if (!tested) { + /* + * selftest running: key is a raw hmac clear key and needs + * to get embedded into a 'clear key token' in order to have + * it correctly processed by the pkey module. + */ + tmpkeylen = sizeof(struct hmac_clrkey_token) + bs; + tmpkey = kzalloc(tmpkeylen, GFP_KERNEL); + if (!tmpkey) { + rc = -ENOMEM; + goto out; + } + rc = make_clrkey_token(key, keylen, ds, tmpkey); + if (rc) + goto out; + keylen = tmpkeylen; + key = tmpkey; + } + + /* copy raw key into tfm context */ + rc = phmac_tfm_ctx_setkey(tfm_ctx, key, keylen); + if (rc) + goto out; + + /* convert raw key into protected key */ + rc = phmac_convert_key(tfm_ctx, tested); + if (rc) + goto out; + + /* set function code in tfm context, check for valid pk type */ + switch (ds) { + case SHA224_DIGEST_SIZE: + if (tfm_ctx->pk.type != PKEY_KEYTYPE_HMAC_512) + rc = -EINVAL; + else + tfm_ctx->fc = CPACF_KMAC_PHMAC_SHA_224; + break; + case SHA256_DIGEST_SIZE: + if (tfm_ctx->pk.type != PKEY_KEYTYPE_HMAC_512) + rc = -EINVAL; + else + tfm_ctx->fc = CPACF_KMAC_PHMAC_SHA_256; + break; + case SHA384_DIGEST_SIZE: + if (tfm_ctx->pk.type != PKEY_KEYTYPE_HMAC_1024) + rc = -EINVAL; + else + tfm_ctx->fc = CPACF_KMAC_PHMAC_SHA_384; + break; + case SHA512_DIGEST_SIZE: + if (tfm_ctx->pk.type != PKEY_KEYTYPE_HMAC_1024) + rc = -EINVAL; + else + tfm_ctx->fc = CPACF_KMAC_PHMAC_SHA_512; + break; + default: + tfm_ctx->fc = 0; + rc = -EINVAL; + } + +out: + kfree(tmpkey); + pr_debug("rc=%d\n", rc); + return rc; +} + +static int phmac_export(struct ahash_request *req, void *out) +{ + struct phmac_req_ctx *req_ctx = ahash_request_ctx(req); + struct kmac_sha2_ctx *ctx = &req_ctx->kmac_ctx; + + memcpy(out, ctx, sizeof(*ctx)); + + return 0; +} + +static int phmac_import(struct ahash_request *req, const void *in) +{ + struct phmac_req_ctx *req_ctx = ahash_request_ctx(req); + struct kmac_sha2_ctx *ctx = &req_ctx->kmac_ctx; + + memset(req_ctx, 0, sizeof(*req_ctx)); + memcpy(ctx, in, sizeof(*ctx)); + + return 0; +} + +static int phmac_init_tfm(struct crypto_ahash *tfm) +{ + struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm); + + memset(tfm_ctx, 0, sizeof(*tfm_ctx)); + spin_lock_init(&tfm_ctx->pk_lock); + + crypto_ahash_set_reqsize(tfm, sizeof(struct phmac_req_ctx)); + + return 0; +} + +static void phmac_exit_tfm(struct crypto_ahash *tfm) +{ + struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm); + + memzero_explicit(tfm_ctx->keybuf, sizeof(tfm_ctx->keybuf)); + memzero_explicit(&tfm_ctx->pk, sizeof(tfm_ctx->pk)); +} + +static int phmac_do_one_request(struct crypto_engine *engine, void *areq) +{ + struct ahash_request *req = ahash_request_cast(areq); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm); + struct phmac_req_ctx *req_ctx = ahash_request_ctx(req); + struct kmac_sha2_ctx *kmac_ctx = &req_ctx->kmac_ctx; + struct hash_walk_helper *hwh = &req_ctx->hwh; + int rc = -EINVAL; + + /* + * Three kinds of requests come in here: + * 1. req->async_op == OP_UPDATE with req->nbytes > 0 + * 2. req->async_op == OP_FINUP with req->nbytes > 0 + * 3. req->async_op == OP_FINAL + * For update and finup the hwh walk has already been prepared + * by the caller. For final there is no hwh walk needed. + */ + + switch (req_ctx->async_op) { + case OP_UPDATE: + case OP_FINUP: + rc = phmac_kmac_update(req, true); + if (rc == -EKEYEXPIRED) { + /* + * Protected key expired, conversion is in process. + * Trigger a re-schedule of this request by returning + * -ENOSPC ("hardware queue full") to the crypto engine. + * To avoid immediately re-invocation of this callback, + * tell scheduler to voluntarily give up the CPU here. + */ + pr_debug("rescheduling request\n"); + cond_resched(); + return -ENOSPC; + } else if (rc) { + hwh_advance(hwh, rc); + goto out; + } + if (req_ctx->async_op == OP_UPDATE) + break; + req_ctx->async_op = OP_FINAL; + fallthrough; + case OP_FINAL: + rc = phmac_kmac_final(req, true); + if (rc == -EKEYEXPIRED) { + /* + * Protected key expired, conversion is in process. + * Trigger a re-schedule of this request by returning + * -ENOSPC ("hardware queue full") to the crypto engine. + * To avoid immediately re-invocation of this callback, + * tell scheduler to voluntarily give up the CPU here. + */ + pr_debug("rescheduling request\n"); + cond_resched(); + return -ENOSPC; + } + break; + default: + /* unknown/unsupported/unimplemented asynch op */ + return -EOPNOTSUPP; + } + +out: + if (rc || req_ctx->async_op == OP_FINAL) + memzero_explicit(kmac_ctx, sizeof(*kmac_ctx)); + pr_debug("request complete with rc=%d\n", rc); + local_bh_disable(); + atomic_dec(&tfm_ctx->via_engine_ctr); + crypto_finalize_hash_request(engine, req, rc); + local_bh_enable(); + return rc; +} + +#define S390_ASYNC_PHMAC_ALG(x) \ +{ \ + .base = { \ + .init = phmac_init, \ + .update = phmac_update, \ + .final = phmac_final, \ + .finup = phmac_finup, \ + .digest = phmac_digest, \ + .setkey = phmac_setkey, \ + .import = phmac_import, \ + .export = phmac_export, \ + .init_tfm = phmac_init_tfm, \ + .exit_tfm = phmac_exit_tfm, \ + .halg = { \ + .digestsize = SHA##x##_DIGEST_SIZE, \ + .statesize = sizeof(struct kmac_sha2_ctx), \ + .base = { \ + .cra_name = "phmac(sha" #x ")", \ + .cra_driver_name = "phmac_s390_sha" #x, \ + .cra_blocksize = SHA##x##_BLOCK_SIZE, \ + .cra_priority = 400, \ + .cra_flags = CRYPTO_ALG_ASYNC | \ + CRYPTO_ALG_NO_FALLBACK, \ + .cra_ctxsize = sizeof(struct phmac_tfm_ctx), \ + .cra_module = THIS_MODULE, \ + }, \ + }, \ + }, \ + .op = { \ + .do_one_request = phmac_do_one_request, \ + }, \ +} + +static struct phmac_alg { + unsigned int fc; + struct ahash_engine_alg alg; + bool registered; +} phmac_algs[] = { + { + .fc = CPACF_KMAC_PHMAC_SHA_224, + .alg = S390_ASYNC_PHMAC_ALG(224), + }, { + .fc = CPACF_KMAC_PHMAC_SHA_256, + .alg = S390_ASYNC_PHMAC_ALG(256), + }, { + .fc = CPACF_KMAC_PHMAC_SHA_384, + .alg = S390_ASYNC_PHMAC_ALG(384), + }, { + .fc = CPACF_KMAC_PHMAC_SHA_512, + .alg = S390_ASYNC_PHMAC_ALG(512), + } +}; + +static struct miscdevice phmac_dev = { + .name = "phmac", + .minor = MISC_DYNAMIC_MINOR, +}; + +static void s390_phmac_exit(void) +{ + struct phmac_alg *phmac; + int i; + + if (phmac_crypto_engine) { + crypto_engine_stop(phmac_crypto_engine); + crypto_engine_exit(phmac_crypto_engine); + } + + for (i = ARRAY_SIZE(phmac_algs) - 1; i >= 0; i--) { + phmac = &phmac_algs[i]; + if (phmac->registered) + crypto_engine_unregister_ahash(&phmac->alg); + } + + misc_deregister(&phmac_dev); +} + +static int __init s390_phmac_init(void) +{ + struct phmac_alg *phmac; + int i, rc; + + /* for selftest cpacf klmd subfunction is needed */ + if (!cpacf_query_func(CPACF_KLMD, CPACF_KLMD_SHA_256)) + return -ENODEV; + if (!cpacf_query_func(CPACF_KLMD, CPACF_KLMD_SHA_512)) + return -ENODEV; + + /* register a simple phmac pseudo misc device */ + rc = misc_register(&phmac_dev); + if (rc) + return rc; + + /* with this pseudo device alloc and start a crypto engine */ + phmac_crypto_engine = + crypto_engine_alloc_init_and_set(phmac_dev.this_device, + true, false, MAX_QLEN); + if (!phmac_crypto_engine) { + rc = -ENOMEM; + goto out_err; + } + rc = crypto_engine_start(phmac_crypto_engine); + if (rc) { + crypto_engine_exit(phmac_crypto_engine); + phmac_crypto_engine = NULL; + goto out_err; + } + + for (i = 0; i < ARRAY_SIZE(phmac_algs); i++) { + phmac = &phmac_algs[i]; + if (!cpacf_query_func(CPACF_KMAC, phmac->fc)) + continue; + rc = crypto_engine_register_ahash(&phmac->alg); + if (rc) + goto out_err; + phmac->registered = true; + pr_debug("%s registered\n", phmac->alg.base.halg.base.cra_name); + } + + return 0; + +out_err: + s390_phmac_exit(); + return rc; +} + +module_init(s390_phmac_init); +module_exit(s390_phmac_exit); + +MODULE_ALIAS_CRYPTO("phmac(sha224)"); +MODULE_ALIAS_CRYPTO("phmac(sha256)"); +MODULE_ALIAS_CRYPTO("phmac(sha384)"); +MODULE_ALIAS_CRYPTO("phmac(sha512)"); + +MODULE_DESCRIPTION("S390 HMAC driver for protected keys"); +MODULE_LICENSE("GPL"); diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 2becd77df741..84e3d3c6ba09 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -6,8 +6,7 @@ * Driver for the s390 pseudo random number generator */ -#define KMSG_COMPONENT "prng" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "prng: " fmt #include <linux/fs.h> #include <linux/fips.h> diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h deleted file mode 100644 index 2bb22db54c31..000000000000 --- a/arch/s390/crypto/sha.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Cryptographic API. - * - * s390 generic implementation of the SHA Secure Hash Algorithms. - * - * Copyright IBM Corp. 2007 - * Author(s): Jan Glauber (jang@de.ibm.com) - */ -#ifndef _CRYPTO_ARCH_S390_SHA_H -#define _CRYPTO_ARCH_S390_SHA_H - -#include <linux/crypto.h> -#include <crypto/sha1.h> -#include <crypto/sha2.h> -#include <crypto/sha3.h> - -/* must be big enough for the largest SHA variant */ -#define SHA3_STATE_SIZE 200 -#define CPACF_MAX_PARMBLOCK_SIZE SHA3_STATE_SIZE -#define SHA_MAX_BLOCK_SIZE SHA3_224_BLOCK_SIZE - -struct s390_sha_ctx { - u64 count; /* message length in bytes */ - u32 state[CPACF_MAX_PARMBLOCK_SIZE / sizeof(u32)]; - u8 buf[SHA_MAX_BLOCK_SIZE]; - int func; /* KIMD function to use */ - int first_message_part; -}; - -struct shash_desc; - -int s390_sha_update(struct shash_desc *desc, const u8 *data, unsigned int len); -int s390_sha_final(struct shash_desc *desc, u8 *out); - -#endif diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c deleted file mode 100644 index bc3a22704e09..000000000000 --- a/arch/s390/crypto/sha1_s390.c +++ /dev/null @@ -1,103 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA1 Secure Hash Algorithm. - * - * Derived from cryptoapi implementation, adapted for in-place - * scatterlist interface. Originally based on the public domain - * implementation written by Steve Reid. - * - * s390 Version: - * Copyright IBM Corp. 2003, 2007 - * Author(s): Thomas Spatzier - * Jan Glauber (jan.glauber@de.ibm.com) - * - * Derived from "crypto/sha1_generic.c" - * Copyright (c) Alan Smithee. - * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> - * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> - */ -#include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/cpufeature.h> -#include <crypto/sha1.h> -#include <asm/cpacf.h> - -#include "sha.h" - -static int s390_sha1_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA1_H0; - sctx->state[1] = SHA1_H1; - sctx->state[2] = SHA1_H2; - sctx->state[3] = SHA1_H3; - sctx->state[4] = SHA1_H4; - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA_1; - - return 0; -} - -static int s390_sha1_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - struct sha1_state *octx = out; - - octx->count = sctx->count; - memcpy(octx->state, sctx->state, sizeof(octx->state)); - memcpy(octx->buffer, sctx->buf, sizeof(octx->buffer)); - return 0; -} - -static int s390_sha1_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - const struct sha1_state *ictx = in; - - sctx->count = ictx->count; - memcpy(sctx->state, ictx->state, sizeof(ictx->state)); - memcpy(sctx->buf, ictx->buffer, sizeof(ictx->buffer)); - sctx->func = CPACF_KIMD_SHA_1; - return 0; -} - -static struct shash_alg alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = s390_sha1_init, - .update = s390_sha_update, - .final = s390_sha_final, - .export = s390_sha1_export, - .import = s390_sha1_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha1_state), - .base = { - .cra_name = "sha1", - .cra_driver_name= "sha1-s390", - .cra_priority = 300, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha1_s390_init(void) -{ - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1)) - return -ENODEV; - return crypto_register_shash(&alg); -} - -static void __exit sha1_s390_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha1_s390_init); -module_exit(sha1_s390_fini); - -MODULE_ALIAS_CRYPTO("sha1"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c deleted file mode 100644 index 6f1ccdf93d3e..000000000000 --- a/arch/s390/crypto/sha256_s390.c +++ /dev/null @@ -1,143 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA256 and SHA224 Secure Hash Algorithm. - * - * s390 Version: - * Copyright IBM Corp. 2005, 2011 - * Author(s): Jan Glauber (jang@de.ibm.com) - */ -#include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/cpufeature.h> -#include <crypto/sha2.h> -#include <asm/cpacf.h> - -#include "sha.h" - -static int s390_sha256_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA256_H0; - sctx->state[1] = SHA256_H1; - sctx->state[2] = SHA256_H2; - sctx->state[3] = SHA256_H3; - sctx->state[4] = SHA256_H4; - sctx->state[5] = SHA256_H5; - sctx->state[6] = SHA256_H6; - sctx->state[7] = SHA256_H7; - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA_256; - - return 0; -} - -static int sha256_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - struct sha256_state *octx = out; - - octx->count = sctx->count; - memcpy(octx->state, sctx->state, sizeof(octx->state)); - memcpy(octx->buf, sctx->buf, sizeof(octx->buf)); - return 0; -} - -static int sha256_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - const struct sha256_state *ictx = in; - - sctx->count = ictx->count; - memcpy(sctx->state, ictx->state, sizeof(ictx->state)); - memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); - sctx->func = CPACF_KIMD_SHA_256; - return 0; -} - -static struct shash_alg sha256_alg = { - .digestsize = SHA256_DIGEST_SIZE, - .init = s390_sha256_init, - .update = s390_sha_update, - .final = s390_sha_final, - .export = sha256_export, - .import = sha256_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name= "sha256-s390", - .cra_priority = 300, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int s390_sha224_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA224_H0; - sctx->state[1] = SHA224_H1; - sctx->state[2] = SHA224_H2; - sctx->state[3] = SHA224_H3; - sctx->state[4] = SHA224_H4; - sctx->state[5] = SHA224_H5; - sctx->state[6] = SHA224_H6; - sctx->state[7] = SHA224_H7; - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA_256; - - return 0; -} - -static struct shash_alg sha224_alg = { - .digestsize = SHA224_DIGEST_SIZE, - .init = s390_sha224_init, - .update = s390_sha_update, - .final = s390_sha_final, - .export = sha256_export, - .import = sha256_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name= "sha224-s390", - .cra_priority = 300, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha256_s390_init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) - return -ENODEV; - ret = crypto_register_shash(&sha256_alg); - if (ret < 0) - goto out; - ret = crypto_register_shash(&sha224_alg); - if (ret < 0) - crypto_unregister_shash(&sha256_alg); -out: - return ret; -} - -static void __exit sha256_s390_fini(void) -{ - crypto_unregister_shash(&sha224_alg); - crypto_unregister_shash(&sha256_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha256_s390_init); -module_exit(sha256_s390_fini); - -MODULE_ALIAS_CRYPTO("sha256"); -MODULE_ALIAS_CRYPTO("sha224"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA256 and SHA224 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c deleted file mode 100644 index a84ef692f572..000000000000 --- a/arch/s390/crypto/sha3_256_s390.c +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA256 and SHA224 Secure Hash Algorithm. - * - * s390 Version: - * Copyright IBM Corp. 2019 - * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) - */ -#include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/cpufeature.h> -#include <crypto/sha3.h> -#include <asm/cpacf.h> - -#include "sha.h" - -static int sha3_256_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - if (!test_facility(86)) /* msa 12 */ - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA3_256; - sctx->first_message_part = 1; - - return 0; -} - -static int sha3_256_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - struct sha3_state *octx = out; - - octx->rsiz = sctx->count; - memcpy(octx->st, sctx->state, sizeof(octx->st)); - memcpy(octx->buf, sctx->buf, sizeof(octx->buf)); - octx->partial = sctx->first_message_part; - - return 0; -} - -static int sha3_256_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - const struct sha3_state *ictx = in; - - sctx->count = ictx->rsiz; - memcpy(sctx->state, ictx->st, sizeof(ictx->st)); - memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); - sctx->first_message_part = ictx->partial; - sctx->func = CPACF_KIMD_SHA3_256; - - return 0; -} - -static int sha3_224_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - const struct sha3_state *ictx = in; - - sctx->count = ictx->rsiz; - memcpy(sctx->state, ictx->st, sizeof(ictx->st)); - memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); - sctx->first_message_part = ictx->partial; - sctx->func = CPACF_KIMD_SHA3_224; - - return 0; -} - -static struct shash_alg sha3_256_alg = { - .digestsize = SHA3_256_DIGEST_SIZE, /* = 32 */ - .init = sha3_256_init, - .update = s390_sha_update, - .final = s390_sha_final, - .export = sha3_256_export, - .import = sha3_256_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha3_state), - .base = { - .cra_name = "sha3-256", - .cra_driver_name = "sha3-256-s390", - .cra_priority = 300, - .cra_blocksize = SHA3_256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int sha3_224_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - if (!test_facility(86)) /* msa 12 */ - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA3_224; - sctx->first_message_part = 1; - - return 0; -} - -static struct shash_alg sha3_224_alg = { - .digestsize = SHA3_224_DIGEST_SIZE, - .init = sha3_224_init, - .update = s390_sha_update, - .final = s390_sha_final, - .export = sha3_256_export, /* same as for 256 */ - .import = sha3_224_import, /* function code different! */ - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha3_state), - .base = { - .cra_name = "sha3-224", - .cra_driver_name = "sha3-224-s390", - .cra_priority = 300, - .cra_blocksize = SHA3_224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha3_256_s390_init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA3_256)) - return -ENODEV; - - ret = crypto_register_shash(&sha3_256_alg); - if (ret < 0) - goto out; - - ret = crypto_register_shash(&sha3_224_alg); - if (ret < 0) - crypto_unregister_shash(&sha3_256_alg); -out: - return ret; -} - -static void __exit sha3_256_s390_fini(void) -{ - crypto_unregister_shash(&sha3_224_alg); - crypto_unregister_shash(&sha3_256_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha3_256_s390_init); -module_exit(sha3_256_s390_fini); - -MODULE_ALIAS_CRYPTO("sha3-256"); -MODULE_ALIAS_CRYPTO("sha3-224"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA3-256 and SHA3-224 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c deleted file mode 100644 index 07528fc98ff7..000000000000 --- a/arch/s390/crypto/sha3_512_s390.c +++ /dev/null @@ -1,161 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA512 and SHA384 Secure Hash Algorithm. - * - * Copyright IBM Corp. 2019 - * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) - */ -#include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/cpufeature.h> -#include <crypto/sha3.h> -#include <asm/cpacf.h> - -#include "sha.h" - -static int sha3_512_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - if (!test_facility(86)) /* msa 12 */ - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA3_512; - sctx->first_message_part = 1; - - return 0; -} - -static int sha3_512_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - struct sha3_state *octx = out; - - octx->rsiz = sctx->count; - octx->rsizw = sctx->count >> 32; - - memcpy(octx->st, sctx->state, sizeof(octx->st)); - memcpy(octx->buf, sctx->buf, sizeof(octx->buf)); - octx->partial = sctx->first_message_part; - - return 0; -} - -static int sha3_512_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - const struct sha3_state *ictx = in; - - if (unlikely(ictx->rsizw)) - return -ERANGE; - sctx->count = ictx->rsiz; - - memcpy(sctx->state, ictx->st, sizeof(ictx->st)); - memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); - sctx->first_message_part = ictx->partial; - sctx->func = CPACF_KIMD_SHA3_512; - - return 0; -} - -static int sha3_384_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - const struct sha3_state *ictx = in; - - if (unlikely(ictx->rsizw)) - return -ERANGE; - sctx->count = ictx->rsiz; - - memcpy(sctx->state, ictx->st, sizeof(ictx->st)); - memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); - sctx->first_message_part = ictx->partial; - sctx->func = CPACF_KIMD_SHA3_384; - - return 0; -} - -static struct shash_alg sha3_512_alg = { - .digestsize = SHA3_512_DIGEST_SIZE, - .init = sha3_512_init, - .update = s390_sha_update, - .final = s390_sha_final, - .export = sha3_512_export, - .import = sha3_512_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha3_state), - .base = { - .cra_name = "sha3-512", - .cra_driver_name = "sha3-512-s390", - .cra_priority = 300, - .cra_blocksize = SHA3_512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -MODULE_ALIAS_CRYPTO("sha3-512"); - -static int sha3_384_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - if (!test_facility(86)) /* msa 12 */ - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA3_384; - sctx->first_message_part = 1; - - return 0; -} - -static struct shash_alg sha3_384_alg = { - .digestsize = SHA3_384_DIGEST_SIZE, - .init = sha3_384_init, - .update = s390_sha_update, - .final = s390_sha_final, - .export = sha3_512_export, /* same as for 512 */ - .import = sha3_384_import, /* function code different! */ - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha3_state), - .base = { - .cra_name = "sha3-384", - .cra_driver_name = "sha3-384-s390", - .cra_priority = 300, - .cra_blocksize = SHA3_384_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_sha_ctx), - .cra_module = THIS_MODULE, - } -}; - -MODULE_ALIAS_CRYPTO("sha3-384"); - -static int __init init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA3_512)) - return -ENODEV; - ret = crypto_register_shash(&sha3_512_alg); - if (ret < 0) - goto out; - ret = crypto_register_shash(&sha3_384_alg); - if (ret < 0) - crypto_unregister_shash(&sha3_512_alg); -out: - return ret; -} - -static void __exit fini(void) -{ - crypto_unregister_shash(&sha3_512_alg); - crypto_unregister_shash(&sha3_384_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA3-512 and SHA3-384 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c deleted file mode 100644 index 04f11c407763..000000000000 --- a/arch/s390/crypto/sha512_s390.c +++ /dev/null @@ -1,149 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA512 and SHA38 Secure Hash Algorithm. - * - * Copyright IBM Corp. 2007 - * Author(s): Jan Glauber (jang@de.ibm.com) - */ -#include <crypto/internal/hash.h> -#include <crypto/sha2.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/cpufeature.h> -#include <asm/cpacf.h> - -#include "sha.h" - -static int sha512_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - - *(__u64 *)&ctx->state[0] = SHA512_H0; - *(__u64 *)&ctx->state[2] = SHA512_H1; - *(__u64 *)&ctx->state[4] = SHA512_H2; - *(__u64 *)&ctx->state[6] = SHA512_H3; - *(__u64 *)&ctx->state[8] = SHA512_H4; - *(__u64 *)&ctx->state[10] = SHA512_H5; - *(__u64 *)&ctx->state[12] = SHA512_H6; - *(__u64 *)&ctx->state[14] = SHA512_H7; - ctx->count = 0; - ctx->func = CPACF_KIMD_SHA_512; - - return 0; -} - -static int sha512_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - struct sha512_state *octx = out; - - octx->count[0] = sctx->count; - octx->count[1] = 0; - memcpy(octx->state, sctx->state, sizeof(octx->state)); - memcpy(octx->buf, sctx->buf, sizeof(octx->buf)); - return 0; -} - -static int sha512_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - const struct sha512_state *ictx = in; - - if (unlikely(ictx->count[1])) - return -ERANGE; - sctx->count = ictx->count[0]; - - memcpy(sctx->state, ictx->state, sizeof(ictx->state)); - memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); - sctx->func = CPACF_KIMD_SHA_512; - return 0; -} - -static struct shash_alg sha512_alg = { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_init, - .update = s390_sha_update, - .final = s390_sha_final, - .export = sha512_export, - .import = sha512_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha512_state), - .base = { - .cra_name = "sha512", - .cra_driver_name= "sha512-s390", - .cra_priority = 300, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -MODULE_ALIAS_CRYPTO("sha512"); - -static int sha384_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - - *(__u64 *)&ctx->state[0] = SHA384_H0; - *(__u64 *)&ctx->state[2] = SHA384_H1; - *(__u64 *)&ctx->state[4] = SHA384_H2; - *(__u64 *)&ctx->state[6] = SHA384_H3; - *(__u64 *)&ctx->state[8] = SHA384_H4; - *(__u64 *)&ctx->state[10] = SHA384_H5; - *(__u64 *)&ctx->state[12] = SHA384_H6; - *(__u64 *)&ctx->state[14] = SHA384_H7; - ctx->count = 0; - ctx->func = CPACF_KIMD_SHA_512; - - return 0; -} - -static struct shash_alg sha384_alg = { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_init, - .update = s390_sha_update, - .final = s390_sha_final, - .export = sha512_export, - .import = sha512_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha512_state), - .base = { - .cra_name = "sha384", - .cra_driver_name= "sha384-s390", - .cra_priority = 300, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_sha_ctx), - .cra_module = THIS_MODULE, - } -}; - -MODULE_ALIAS_CRYPTO("sha384"); - -static int __init init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512)) - return -ENODEV; - if ((ret = crypto_register_shash(&sha512_alg)) < 0) - goto out; - if ((ret = crypto_register_shash(&sha384_alg)) < 0) - crypto_unregister_shash(&sha512_alg); -out: - return ret; -} - -static void __exit fini(void) -{ - crypto_unregister_shash(&sha512_alg); - crypto_unregister_shash(&sha384_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA512 and SHA-384 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha_common.c b/arch/s390/crypto/sha_common.c deleted file mode 100644 index 961d7d522af1..000000000000 --- a/arch/s390/crypto/sha_common.c +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 generic implementation of the SHA Secure Hash Algorithms. - * - * Copyright IBM Corp. 2007 - * Author(s): Jan Glauber (jang@de.ibm.com) - */ - -#include <crypto/internal/hash.h> -#include <linux/module.h> -#include <asm/cpacf.h> -#include "sha.h" - -int s390_sha_update(struct shash_desc *desc, const u8 *data, unsigned int len) -{ - struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - unsigned int bsize = crypto_shash_blocksize(desc->tfm); - unsigned int index, n; - int fc; - - /* how much is already in the buffer? */ - index = ctx->count % bsize; - ctx->count += len; - - if ((index + len) < bsize) - goto store; - - fc = ctx->func; - if (ctx->first_message_part) - fc |= test_facility(86) ? CPACF_KIMD_NIP : 0; - - /* process one stored block */ - if (index) { - memcpy(ctx->buf + index, data, bsize - index); - cpacf_kimd(fc, ctx->state, ctx->buf, bsize); - ctx->first_message_part = 0; - fc &= ~CPACF_KIMD_NIP; - data += bsize - index; - len -= bsize - index; - index = 0; - } - - /* process as many blocks as possible */ - if (len >= bsize) { - n = (len / bsize) * bsize; - cpacf_kimd(fc, ctx->state, data, n); - ctx->first_message_part = 0; - data += n; - len -= n; - } -store: - if (len) - memcpy(ctx->buf + index , data, len); - - return 0; -} -EXPORT_SYMBOL_GPL(s390_sha_update); - -static int s390_crypto_shash_parmsize(int func) -{ - switch (func) { - case CPACF_KLMD_SHA_1: - return 20; - case CPACF_KLMD_SHA_256: - return 32; - case CPACF_KLMD_SHA_512: - return 64; - case CPACF_KLMD_SHA3_224: - case CPACF_KLMD_SHA3_256: - case CPACF_KLMD_SHA3_384: - case CPACF_KLMD_SHA3_512: - return 200; - default: - return -EINVAL; - } -} - -int s390_sha_final(struct shash_desc *desc, u8 *out) -{ - struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - unsigned int bsize = crypto_shash_blocksize(desc->tfm); - u64 bits; - unsigned int n; - int mbl_offset, fc; - - n = ctx->count % bsize; - bits = ctx->count * 8; - mbl_offset = s390_crypto_shash_parmsize(ctx->func); - if (mbl_offset < 0) - return -EINVAL; - - mbl_offset = mbl_offset / sizeof(u32); - - /* set total msg bit length (mbl) in CPACF parmblock */ - switch (ctx->func) { - case CPACF_KLMD_SHA_1: - case CPACF_KLMD_SHA_256: - memcpy(ctx->state + mbl_offset, &bits, sizeof(bits)); - break; - case CPACF_KLMD_SHA_512: - /* - * the SHA512 parmblock has a 128-bit mbl field, clear - * high-order u64 field, copy bits to low-order u64 field - */ - memset(ctx->state + mbl_offset, 0x00, sizeof(bits)); - mbl_offset += sizeof(u64) / sizeof(u32); - memcpy(ctx->state + mbl_offset, &bits, sizeof(bits)); - break; - case CPACF_KLMD_SHA3_224: - case CPACF_KLMD_SHA3_256: - case CPACF_KLMD_SHA3_384: - case CPACF_KLMD_SHA3_512: - break; - default: - return -EINVAL; - } - - fc = ctx->func; - fc |= test_facility(86) ? CPACF_KLMD_DUFOP : 0; - if (ctx->first_message_part) - fc |= CPACF_KLMD_NIP; - cpacf_klmd(fc, ctx->state, ctx->buf, n); - - /* copy digest to out */ - memcpy(out, ctx->state, crypto_shash_digestsize(desc->tfm)); - /* wipe context */ - memset(ctx, 0, sizeof *ctx); - - return 0; -} -EXPORT_SYMBOL_GPL(s390_sha_final); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("s390 SHA cipher common functions"); diff --git a/arch/s390/hypfs/hypfs.h b/arch/s390/hypfs/hypfs.h index 83ebf54cca6b..2bb7104124ca 100644 --- a/arch/s390/hypfs/hypfs.h +++ b/arch/s390/hypfs/hypfs.h @@ -22,11 +22,9 @@ extern struct dentry *hypfs_mkdir(struct dentry *parent, const char *name); -extern struct dentry *hypfs_create_u64(struct dentry *dir, const char *name, - __u64 value); +extern int hypfs_create_u64(struct dentry *dir, const char *name, __u64 value); -extern struct dentry *hypfs_create_str(struct dentry *dir, const char *name, - char *string); +extern int hypfs_create_str(struct dentry *dir, const char *name, char *string); /* LPAR Hypervisor */ extern int hypfs_diag_init(void); @@ -48,7 +46,7 @@ void hypfs_sprp_exit(void); int __hypfs_fs_init(void); -static inline int hypfs_fs_init(void) +static __always_inline int hypfs_fs_init(void) { if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) return __hypfs_fs_init(); diff --git a/arch/s390/hypfs/hypfs_dbfs.c b/arch/s390/hypfs/hypfs_dbfs.c index 5d9effb0867c..25a59c048857 100644 --- a/arch/s390/hypfs/hypfs_dbfs.c +++ b/arch/s390/hypfs/hypfs_dbfs.c @@ -6,6 +6,7 @@ * Author(s): Michael Holzheu <holzheu@linux.vnet.ibm.com> */ +#include <linux/security.h> #include <linux/slab.h> #include "hypfs.h" @@ -15,7 +16,7 @@ static struct hypfs_dbfs_data *hypfs_dbfs_data_alloc(struct hypfs_dbfs_file *f) { struct hypfs_dbfs_data *data; - data = kmalloc(sizeof(*data), GFP_KERNEL); + data = kmalloc_obj(*data); if (!data) return NULL; data->dbfs_file = f; @@ -66,23 +67,27 @@ static long dbfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) long rc; mutex_lock(&df->lock); - if (df->unlocked_ioctl) - rc = df->unlocked_ioctl(file, cmd, arg); - else - rc = -ENOTTY; + rc = df->unlocked_ioctl(file, cmd, arg); mutex_unlock(&df->lock); return rc; } -static const struct file_operations dbfs_ops = { +static const struct file_operations dbfs_ops_ioctl = { .read = dbfs_read, .unlocked_ioctl = dbfs_ioctl, }; +static const struct file_operations dbfs_ops = { + .read = dbfs_read, +}; + void hypfs_dbfs_create_file(struct hypfs_dbfs_file *df) { - df->dentry = debugfs_create_file(df->name, 0400, dbfs_dir, df, - &dbfs_ops); + const struct file_operations *fops = &dbfs_ops; + + if (df->unlocked_ioctl && !security_locked_down(LOCKDOWN_DEBUGFS)) + fops = &dbfs_ops_ioctl; + df->dentry = debugfs_create_file(df->name, 0400, dbfs_dir, df, fops); mutex_init(&df->lock); } diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index c8af67d20994..08777f57bd24 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -7,8 +7,7 @@ * Author(s): Michael Holzheu <holzheu@de.ibm.com> */ -#define KMSG_COMPONENT "hypfs" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hypfs: " fmt #include <linux/types.h> #include <linux/errno.h> diff --git a/arch/s390/hypfs/hypfs_diag.h b/arch/s390/hypfs/hypfs_diag.h index 7090eff27fef..b5218135b8fe 100644 --- a/arch/s390/hypfs/hypfs_diag.h +++ b/arch/s390/hypfs/hypfs_diag.h @@ -19,7 +19,7 @@ int diag204_store(void *buf, int pages); int __hypfs_diag_fs_init(void); void __hypfs_diag_fs_exit(void); -static inline int hypfs_diag_fs_init(void) +static __always_inline int hypfs_diag_fs_init(void) { if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) return __hypfs_diag_fs_init(); diff --git a/arch/s390/hypfs/hypfs_diag0c.c b/arch/s390/hypfs/hypfs_diag0c.c index 4131f0daa5ea..8f726e6558a8 100644 --- a/arch/s390/hypfs/hypfs_diag0c.c +++ b/arch/s390/hypfs/hypfs_diag0c.c @@ -9,6 +9,7 @@ #include <linux/slab.h> #include <linux/cpu.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/hypfs.h> #include "hypfs.h" @@ -34,13 +35,12 @@ static void *diag0c_store(unsigned int *count) cpus_read_lock(); cpu_count = num_online_cpus(); - cpu_vec = kmalloc_array(num_possible_cpus(), sizeof(*cpu_vec), - GFP_KERNEL); + cpu_vec = kmalloc_objs(*cpu_vec, num_possible_cpus()); if (!cpu_vec) goto fail_unlock_cpus; /* Note: Diag 0c needs 8 byte alignment and real storage */ - diag0c_data = kzalloc(struct_size(diag0c_data, entry, cpu_count), - GFP_KERNEL | GFP_DMA); + diag0c_data = kzalloc_flex(*diag0c_data, entry, cpu_count, + GFP_KERNEL | GFP_DMA); if (!diag0c_data) goto fail_kfree_cpu_vec; i = 0; @@ -107,7 +107,7 @@ static struct hypfs_dbfs_file dbfs_file_0c = { */ int __init hypfs_diag0c_init(void) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 0; hypfs_dbfs_create_file(&dbfs_file_0c); return 0; @@ -118,7 +118,7 @@ int __init hypfs_diag0c_init(void) */ void hypfs_diag0c_exit(void) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; hypfs_dbfs_remove_file(&dbfs_file_0c); } diff --git a/arch/s390/hypfs/hypfs_diag_fs.c b/arch/s390/hypfs/hypfs_diag_fs.c index 00a6d370a280..39621c326010 100644 --- a/arch/s390/hypfs/hypfs_diag_fs.c +++ b/arch/s390/hypfs/hypfs_diag_fs.c @@ -7,8 +7,7 @@ * Author(s): Michael Holzheu <holzheu@de.ibm.com> */ -#define KMSG_COMPONENT "hypfs" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hypfs: " fmt #include <linux/types.h> #include <linux/errno.h> @@ -16,6 +15,7 @@ #include <linux/string.h> #include <linux/vmalloc.h> #include <linux/mm.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include "hypfs_diag.h" @@ -203,30 +203,31 @@ static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) { struct dentry *cpu_dir; char buffer[TMP_SIZE]; - void *rc; + int rc; snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_get_info_type(), cpu_info)); cpu_dir = hypfs_mkdir(cpus_dir, buffer); + if (IS_ERR(cpu_dir)) + return PTR_ERR(cpu_dir); rc = hypfs_create_u64(cpu_dir, "mgmtime", cpu_info__acc_time(diag204_get_info_type(), cpu_info) - cpu_info__lp_time(diag204_get_info_type(), cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); + if (rc) + return rc; rc = hypfs_create_u64(cpu_dir, "cputime", cpu_info__lp_time(diag204_get_info_type(), cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); + if (rc) + return rc; if (diag204_get_info_type() == DIAG204_INFO_EXT) { rc = hypfs_create_u64(cpu_dir, "onlinetime", cpu_info__online_time(diag204_get_info_type(), cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); + if (rc) + return rc; } diag224_idx2name(cpu_info__ctidx(diag204_get_info_type(), cpu_info), buffer); - rc = hypfs_create_str(cpu_dir, "type", buffer); - return PTR_ERR_OR_ZERO(rc); + return hypfs_create_str(cpu_dir, "type", buffer); } static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) @@ -261,7 +262,7 @@ static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info) { struct dentry *cpu_dir; char buffer[TMP_SIZE]; - void *rc; + int rc; snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_get_info_type(), cpu_info)); @@ -270,11 +271,10 @@ static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info) return PTR_ERR(cpu_dir); rc = hypfs_create_u64(cpu_dir, "mgmtime", phys_cpu__mgm_time(diag204_get_info_type(), cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); + if (rc) + return rc; diag224_idx2name(phys_cpu__ctidx(diag204_get_info_type(), cpu_info), buffer); - rc = hypfs_create_str(cpu_dir, "type", buffer); - return PTR_ERR_OR_ZERO(rc); + return hypfs_create_str(cpu_dir, "type", buffer); } static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr) @@ -313,41 +313,25 @@ int hypfs_diag_create_files(struct dentry *root) return rc; systems_dir = hypfs_mkdir(root, "systems"); - if (IS_ERR(systems_dir)) { - rc = PTR_ERR(systems_dir); - goto err_out; - } + if (IS_ERR(systems_dir)) + return PTR_ERR(systems_dir); time_hdr = (struct x_info_blk_hdr *)buffer; part_hdr = time_hdr + info_blk_hdr__size(diag204_get_info_type()); for (i = 0; i < info_blk_hdr__npar(diag204_get_info_type(), time_hdr); i++) { part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr); - if (IS_ERR(part_hdr)) { - rc = PTR_ERR(part_hdr); - goto err_out; - } + if (IS_ERR(part_hdr)) + return PTR_ERR(part_hdr); } if (info_blk_hdr__flags(diag204_get_info_type(), time_hdr) & DIAG204_LPAR_PHYS_FLG) { ptr = hypfs_create_phys_files(root, part_hdr); - if (IS_ERR(ptr)) { - rc = PTR_ERR(ptr); - goto err_out; - } + if (IS_ERR(ptr)) + return PTR_ERR(ptr); } hyp_dir = hypfs_mkdir(root, "hyp"); - if (IS_ERR(hyp_dir)) { - rc = PTR_ERR(hyp_dir); - goto err_out; - } - ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor"); - if (IS_ERR(ptr)) { - rc = PTR_ERR(ptr); - goto err_out; - } - rc = 0; - -err_out: - return rc; + if (IS_ERR(hyp_dir)) + return PTR_ERR(hyp_dir); + return hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor"); } /* Diagnose 224 functions */ @@ -382,7 +366,7 @@ static void diag224_delete_name_table(void) int __init __hypfs_diag_fs_init(void) { - if (MACHINE_IS_LPAR) + if (machine_is_lpar()) return diag224_get_name_table(); return 0; } diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c index 9fc3f0dae8f0..08042d3a9d56 100644 --- a/arch/s390/hypfs/hypfs_sprp.c +++ b/arch/s390/hypfs/hypfs_sprp.c @@ -7,7 +7,6 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/compat.h> #include <linux/errno.h> #include <linux/gfp.h> #include <linux/string.h> @@ -27,7 +26,7 @@ static inline unsigned long __hypfs_sprp_diag304(void *data, unsigned long cmd) { union register_pair r1 = { .even = virt_to_phys(data), }; - asm volatile("diag %[r1],%[r3],0x304\n" + asm volatile("diag %[r1],%[r3],0x304" : [r1] "+&d" (r1.pair) : [r3] "d" (cmd) : "memory"); @@ -75,7 +74,7 @@ static int __hypfs_sprp_ioctl(void __user *user_area) rc = -ENOMEM; data = (void *)get_zeroed_page(GFP_KERNEL); - diag304 = kzalloc(sizeof(*diag304), GFP_KERNEL); + diag304 = kzalloc_obj(*diag304); if (!data || !diag304) goto out; @@ -116,10 +115,7 @@ static long hypfs_sprp_ioctl(struct file *file, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (is_compat_task()) - argp = compat_ptr(arg); - else - argp = (void __user *) arg; + argp = (void __user *)arg; switch (cmd) { case HYPFS_DIAG304: return __hypfs_sprp_ioctl(argp); diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c index 3db40ad853e0..4db2895e4da3 100644 --- a/arch/s390/hypfs/hypfs_vm.c +++ b/arch/s390/hypfs/hypfs_vm.c @@ -11,6 +11,7 @@ #include <linux/string.h> #include <linux/vmalloc.h> #include <asm/extable.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/timex.h> @@ -121,7 +122,7 @@ static struct hypfs_dbfs_file dbfs_file_2fc = { int hypfs_vm_init(void) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 0; if (diag2fc(0, all_guests, NULL) > 0) diag2fc_guest_query = all_guests; @@ -135,7 +136,7 @@ int hypfs_vm_init(void) void hypfs_vm_exit(void) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; hypfs_dbfs_remove_file(&dbfs_file_2fc); } diff --git a/arch/s390/hypfs/hypfs_vm_fs.c b/arch/s390/hypfs/hypfs_vm_fs.c index 6011289afa8c..a149a9f92e40 100644 --- a/arch/s390/hypfs/hypfs_vm_fs.c +++ b/arch/s390/hypfs/hypfs_vm_fs.c @@ -19,10 +19,9 @@ #define ATTRIBUTE(dir, name, member) \ do { \ - void *rc; \ - rc = hypfs_create_u64(dir, name, member); \ - if (IS_ERR(rc)) \ - return PTR_ERR(rc); \ + int rc = hypfs_create_u64(dir, name, member); \ + if (rc) \ + return rc; \ } while (0) static int hypfs_vm_create_guest(struct dentry *systems_dir, @@ -85,7 +84,7 @@ static int hypfs_vm_create_guest(struct dentry *systems_dir, int hypfs_vm_create_files(struct dentry *root) { - struct dentry *dir, *file; + struct dentry *dir; struct diag2fc_data *data; unsigned int count = 0; int rc, i; @@ -100,11 +99,9 @@ int hypfs_vm_create_files(struct dentry *root) rc = PTR_ERR(dir); goto failed; } - file = hypfs_create_str(dir, "type", "z/VM Hypervisor"); - if (IS_ERR(file)) { - rc = PTR_ERR(file); + rc = hypfs_create_str(dir, "type", "z/VM Hypervisor"); + if (rc) goto failed; - } /* physical cpus */ dir = hypfs_mkdir(root, "cpus"); @@ -112,11 +109,9 @@ int hypfs_vm_create_files(struct dentry *root) rc = PTR_ERR(dir); goto failed; } - file = hypfs_create_u64(dir, "count", data->lcpus); - if (IS_ERR(file)) { - rc = PTR_ERR(file); + rc = hypfs_create_u64(dir, "count", data->lcpus); + if (rc) goto failed; - } /* guests */ dir = hypfs_mkdir(root, "systems"); diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index d428635abf08..559b011e6880 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -6,8 +6,7 @@ * Author(s): Michael Holzheu <holzheu@de.ibm.com> */ -#define KMSG_COMPONENT "hypfs" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hypfs: " fmt #include <linux/types.h> #include <linux/errno.h> @@ -24,6 +23,7 @@ #include <linux/kobject.h> #include <linux/seq_file.h> #include <linux/uio.h> +#include <asm/machine.h> #include <asm/ebcdic.h> #include "hypfs.h" @@ -60,33 +60,17 @@ static void hypfs_update_update(struct super_block *sb) static void hypfs_add_dentry(struct dentry *dentry) { - dentry->d_fsdata = hypfs_last_dentry; - hypfs_last_dentry = dentry; -} - -static void hypfs_remove(struct dentry *dentry) -{ - struct dentry *parent; - - parent = dentry->d_parent; - inode_lock(d_inode(parent)); - if (simple_positive(dentry)) { - if (d_is_dir(dentry)) - simple_rmdir(d_inode(parent), dentry); - else - simple_unlink(d_inode(parent), dentry); + if (IS_ROOT(dentry->d_parent)) { + dentry->d_fsdata = hypfs_last_dentry; + hypfs_last_dentry = dentry; } - d_drop(dentry); - dput(dentry); - inode_unlock(d_inode(parent)); } -static void hypfs_delete_tree(struct dentry *root) +static void hypfs_delete_tree(void) { while (hypfs_last_dentry) { - struct dentry *next_dentry; - next_dentry = hypfs_last_dentry->d_fsdata; - hypfs_remove(hypfs_last_dentry); + struct dentry *next_dentry = hypfs_last_dentry->d_fsdata; + simple_recursive_removal(hypfs_last_dentry, NULL); hypfs_last_dentry = next_dentry; } } @@ -183,14 +167,14 @@ static ssize_t hypfs_write_iter(struct kiocb *iocb, struct iov_iter *from) rc = -EBUSY; goto out; } - hypfs_delete_tree(sb->s_root); - if (MACHINE_IS_VM) + hypfs_delete_tree(); + if (machine_is_vm()) rc = hypfs_vm_create_files(sb->s_root); else rc = hypfs_diag_create_files(sb->s_root); if (rc) { pr_err("Updating the hypfs tree failed\n"); - hypfs_delete_tree(sb->s_root); + hypfs_delete_tree(); goto out; } hypfs_update_update(sb); @@ -273,7 +257,7 @@ static int hypfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_root = root_dentry = d_make_root(root_inode); if (!root_dentry) return -ENOMEM; - if (MACHINE_IS_VM) + if (machine_is_vm()) rc = hypfs_vm_create_files(root_dentry); else rc = hypfs_diag_create_files(root_dentry); @@ -308,7 +292,7 @@ static int hypfs_init_fs_context(struct fs_context *fc) { struct hypfs_sb_info *sbi; - sbi = kzalloc(sizeof(struct hypfs_sb_info), GFP_KERNEL); + sbi = kzalloc_obj(struct hypfs_sb_info); if (!sbi) return -ENOMEM; @@ -325,13 +309,9 @@ static void hypfs_kill_super(struct super_block *sb) { struct hypfs_sb_info *sb_info = sb->s_fs_info; - if (sb->s_root) - hypfs_delete_tree(sb->s_root); - if (sb_info && sb_info->update_file) - hypfs_remove(sb_info->update_file); - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; - kill_litter_super(sb); + hypfs_last_dentry = NULL; + kill_anon_super(sb); + kfree(sb_info); } static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, @@ -340,17 +320,13 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, struct dentry *dentry; struct inode *inode; - inode_lock(d_inode(parent)); - dentry = lookup_one_len(name, parent, strlen(name)); - if (IS_ERR(dentry)) { - dentry = ERR_PTR(-ENOMEM); - goto fail; - } + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) + return ERR_PTR(-ENOMEM); inode = hypfs_make_inode(parent->d_sb, mode); if (!inode) { - dput(dentry); - dentry = ERR_PTR(-ENOMEM); - goto fail; + simple_done_creating(dentry); + return ERR_PTR(-ENOMEM); } if (S_ISREG(mode)) { inode->i_fop = &hypfs_file_ops; @@ -365,11 +341,9 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, } else BUG(); inode->i_private = data; - d_instantiate(dentry, inode); - dget(dentry); -fail: - inode_unlock(d_inode(parent)); - return dentry; + d_make_persistent(dentry, inode); + simple_done_creating(dentry); + return dentry; // borrowed } struct dentry *hypfs_mkdir(struct dentry *parent, const char *name) @@ -397,8 +371,7 @@ static struct dentry *hypfs_create_update_file(struct dentry *dir) return dentry; } -struct dentry *hypfs_create_u64(struct dentry *dir, - const char *name, __u64 value) +int hypfs_create_u64(struct dentry *dir, const char *name, __u64 value) { char *buffer; char tmp[TMP_SIZE]; @@ -407,35 +380,34 @@ struct dentry *hypfs_create_u64(struct dentry *dir, snprintf(tmp, TMP_SIZE, "%llu\n", (unsigned long long int)value); buffer = kstrdup(tmp, GFP_KERNEL); if (!buffer) - return ERR_PTR(-ENOMEM); + return -ENOMEM; dentry = hypfs_create_file(dir, name, buffer, S_IFREG | REG_FILE_MODE); if (IS_ERR(dentry)) { kfree(buffer); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } hypfs_add_dentry(dentry); - return dentry; + return 0; } -struct dentry *hypfs_create_str(struct dentry *dir, - const char *name, char *string) +int hypfs_create_str(struct dentry *dir, const char *name, char *string) { char *buffer; struct dentry *dentry; buffer = kmalloc(strlen(string) + 2, GFP_KERNEL); if (!buffer) - return ERR_PTR(-ENOMEM); + return -ENOMEM; sprintf(buffer, "%s\n", string); dentry = hypfs_create_file(dir, name, buffer, S_IFREG | REG_FILE_MODE); if (IS_ERR(dentry)) { kfree(buffer); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } hypfs_add_dentry(dentry); - return dentry; + return 0; } static const struct file_operations hypfs_file_ops = { diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 297bf7157968..80bad7de7a04 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -5,6 +5,5 @@ generated-y += syscall_table.h generated-y += unistd_nr.h generic-y += asm-offsets.h -generic-y += kvm_types.h generic-y += mcs_spinlock.h generic-y += mmzone.h diff --git a/arch/s390/include/asm/abs_lowcore.h b/arch/s390/include/asm/abs_lowcore.h index 004d17ea05cf..317c07c09ae4 100644 --- a/arch/s390/include/asm/abs_lowcore.h +++ b/arch/s390/include/asm/abs_lowcore.h @@ -25,11 +25,4 @@ static inline void put_abs_lowcore(struct lowcore *lc) put_cpu(); } -extern int relocate_lowcore; - -static inline int have_relocated_lowcore(void) -{ - return relocate_lowcore; -} - #endif /* _ASM_S390_ABS_LOWCORE_H */ diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index 73e781b56bfe..1c56480def9e 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -32,8 +32,8 @@ #define ALT_CTX_ALL (ALT_CTX_EARLY | ALT_CTX_LATE) #define ALT_TYPE_FACILITY 0 -#define ALT_TYPE_SPEC 1 -#define ALT_TYPE_LOWCORE 2 +#define ALT_TYPE_FEATURE 1 +#define ALT_TYPE_SPEC 2 #define ALT_DATA_SHIFT 0 #define ALT_TYPE_SHIFT 20 @@ -43,14 +43,15 @@ ALT_TYPE_FACILITY << ALT_TYPE_SHIFT | \ (facility) << ALT_DATA_SHIFT) +#define ALT_FEATURE(feature) (ALT_CTX_EARLY << ALT_CTX_SHIFT | \ + ALT_TYPE_FEATURE << ALT_TYPE_SHIFT | \ + (feature) << ALT_DATA_SHIFT) + #define ALT_SPEC(facility) (ALT_CTX_LATE << ALT_CTX_SHIFT | \ ALT_TYPE_SPEC << ALT_TYPE_SHIFT | \ (facility) << ALT_DATA_SHIFT) -#define ALT_LOWCORE (ALT_CTX_EARLY << ALT_CTX_SHIFT | \ - ALT_TYPE_LOWCORE << ALT_TYPE_SHIFT) - -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <linux/stddef.h> @@ -182,7 +183,7 @@ static inline void apply_alternatives(struct alt_instr *start, struct alt_instr /* Use this macro if clobbers are needed without inputs. */ #define ASM_NO_INPUT_CLOBBER(clobber...) : clobber -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ /* * Issue one struct alt_instr descriptor entry (need to put it into @@ -232,6 +233,6 @@ static inline void apply_alternatives(struct alt_instr *start, struct alt_instr .popsection .endm -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_S390_ALTERNATIVE_H */ diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h index 395b02d6a133..3b95c6531a67 100644 --- a/arch/s390/include/asm/ap.h +++ b/arch/s390/include/asm/ap.h @@ -38,16 +38,30 @@ typedef unsigned int ap_qid_t; * The ap queue status word is returned by all three AP functions * (PQAP, NQAP and DQAP). There's a set of flags in the first * byte, followed by a 1 byte response code. + * + * For convenience the 'value' field is a 32 bit access of the + * whole status and the 'status_bits' and 'rc' fields comprise + * the leftmost 8 status bits and the response_code. */ struct ap_queue_status { - unsigned int queue_empty : 1; - unsigned int replies_waiting : 1; - unsigned int queue_full : 1; - unsigned int : 3; - unsigned int async : 1; - unsigned int irq_enabled : 1; - unsigned int response_code : 8; - unsigned int : 16; + union { + unsigned int value : 32; + struct { + unsigned int status_bits : 8; + unsigned int rc : 8; + unsigned int : 16; + }; + struct { + unsigned int queue_empty : 1; + unsigned int replies_waiting : 1; + unsigned int queue_full : 1; + unsigned int : 3; + unsigned int async : 1; + unsigned int irq_enabled : 1; + unsigned int response_code : 8; + unsigned int : 16; + }; + }; }; /* @@ -64,7 +78,7 @@ union ap_queue_status_reg { }; /** - * ap_intructions_available() - Test if AP instructions are available. + * ap_instructions_available() - Test if AP instructions are available. * * Returns true if the AP instructions are installed, otherwise false. */ @@ -103,7 +117,7 @@ struct ap_tapq_hwinfo { unsigned int accel : 1; /* A */ unsigned int ep11 : 1; /* X */ unsigned int apxa : 1; /* APXA */ - unsigned int : 1; + unsigned int slcf : 1; /* Cmd filtering avail. */ unsigned int class : 8; unsigned int bs : 2; /* SE bind/assoc */ unsigned int : 14; @@ -143,7 +157,7 @@ static inline struct ap_queue_status ap_tapq(ap_qid_t qid, " lghi 2,0\n" /* 0 into gr2 */ " .insn rre,0xb2af0000,0,0\n" /* PQAP(TAPQ) */ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ - " lgr %[reg2],2\n" /* gr2 into reg2 */ + " lgr %[reg2],2" /* gr2 into reg2 */ : [reg1] "=&d" (reg1.value), [reg2] "=&d" (reg2) : [qid] "d" (qid) : "cc", "0", "1", "2"); @@ -186,7 +200,7 @@ static inline struct ap_queue_status ap_rapq(ap_qid_t qid, int fbit) asm volatile( " lgr 0,%[reg0]\n" /* qid arg into gr0 */ " .insn rre,0xb2af0000,0,0\n" /* PQAP(RAPQ) */ - " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg1],1" /* gr1 (status) into reg1 */ : [reg1] "=&d" (reg1.value) : [reg0] "d" (reg0) : "cc", "0", "1"); @@ -211,7 +225,7 @@ static inline struct ap_queue_status ap_zapq(ap_qid_t qid, int fbit) asm volatile( " lgr 0,%[reg0]\n" /* qid arg into gr0 */ " .insn rre,0xb2af0000,0,0\n" /* PQAP(ZAPQ) */ - " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg1],1" /* gr1 (status) into reg1 */ : [reg1] "=&d" (reg1.value) : [reg0] "d" (reg0) : "cc", "0", "1"); @@ -315,7 +329,7 @@ static inline struct ap_queue_status ap_aqic(ap_qid_t qid, " lgr 1,%[reg1]\n" /* irq ctrl into gr1 */ " lgr 2,%[reg2]\n" /* ni addr into gr2 */ " .insn rre,0xb2af0000,0,0\n" /* PQAP(AQIC) */ - " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg1],1" /* gr1 (status) into reg1 */ : [reg1] "+&d" (reg1.value) : [reg0] "d" (reg0), [reg2] "d" (reg2) : "cc", "memory", "0", "1", "2"); @@ -363,7 +377,7 @@ static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit, " lgr 1,%[reg1]\n" /* qact in info into gr1 */ " .insn rre,0xb2af0000,0,0\n" /* PQAP(QACT) */ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ - " lgr %[reg2],2\n" /* qact out info into reg2 */ + " lgr %[reg2],2" /* qact out info into reg2 */ : [reg1] "+&d" (reg1.value), [reg2] "=&d" (reg2) : [reg0] "d" (reg0) : "cc", "0", "1", "2"); @@ -388,7 +402,7 @@ static inline struct ap_queue_status ap_bapq(ap_qid_t qid) asm volatile( " lgr 0,%[reg0]\n" /* qid arg into gr0 */ " .insn rre,0xb2af0000,0,0\n" /* PQAP(BAPQ) */ - " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg1],1" /* gr1 (status) into reg1 */ : [reg1] "=&d" (reg1.value) : [reg0] "d" (reg0) : "cc", "0", "1"); @@ -416,7 +430,7 @@ static inline struct ap_queue_status ap_aapq(ap_qid_t qid, unsigned int sec_idx) " lgr 0,%[reg0]\n" /* qid arg into gr0 */ " lgr 2,%[reg2]\n" /* secret index into gr2 */ " .insn rre,0xb2af0000,0,0\n" /* PQAP(AAPQ) */ - " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg1],1" /* gr1 (status) into reg1 */ : [reg1] "=&d" (reg1.value) : [reg0] "d" (reg0), [reg2] "d" (reg2) : "cc", "0", "1", "2"); @@ -453,7 +467,7 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid, " lgr 0,%[reg0]\n" /* qid param in gr0 */ "0: .insn rre,0xb2ad0000,%[nqap_r1],%[nqap_r2]\n" " brc 2,0b\n" /* handle partial completion */ - " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg1],1" /* gr1 (status) into reg1 */ : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1.value), [nqap_r2] "+&d" (nqap_r2.pair) : [nqap_r1] "d" (nqap_r1.pair) @@ -518,7 +532,7 @@ static inline struct ap_queue_status ap_dqap(ap_qid_t qid, " brc 6,0b\n" /* handle partial complete */ "2: lgr %[reg0],0\n" /* gr0 (qid + info) into reg0 */ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ - " lgr %[reg2],2\n" /* gr2 (res length) into reg2 */ + " lgr %[reg2],2" /* gr2 (res length) into reg2 */ : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1.value), [reg2] "=&d" (reg2), [rp1] "+&d" (rp1.pair), [rp2] "+&d" (rp2.pair) diff --git a/arch/s390/include/asm/appldata.h b/arch/s390/include/asm/appldata.h index a92ebbc7aa7a..99b2902c10fd 100644 --- a/arch/s390/include/asm/appldata.h +++ b/arch/s390/include/asm/appldata.h @@ -9,6 +9,7 @@ #define _ASM_S390_APPLDATA_H #include <linux/io.h> +#include <asm/machine.h> #include <asm/diag.h> #define APPLDATA_START_INTERVAL_REC 0x80 @@ -48,7 +49,7 @@ static inline int appldata_asm(struct appldata_parameter_list *parm_list, { int ry; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -EOPNOTSUPP; parm_list->diag = 0xdc; parm_list->function = fn; diff --git a/arch/s390/include/asm/arch-stackprotector.h b/arch/s390/include/asm/arch-stackprotector.h new file mode 100644 index 000000000000..953627259e91 --- /dev/null +++ b/arch/s390/include/asm/arch-stackprotector.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_ARCH_STACKPROTECTOR_H +#define _ASM_S390_ARCH_STACKPROTECTOR_H + +extern unsigned long __stack_chk_guard; +extern int stack_protector_debug; + +void __stack_protector_apply_early(unsigned long kernel_start); +int __stack_protector_apply(unsigned long *start, unsigned long *end, unsigned long kernel_start); + +static inline void stack_protector_apply_early(unsigned long kernel_start) +{ + if (IS_ENABLED(CONFIG_STACKPROTECTOR)) + __stack_protector_apply_early(kernel_start); +} + +static inline int stack_protector_apply(unsigned long *start, unsigned long *end) +{ + if (IS_ENABLED(CONFIG_STACKPROTECTOR)) + return __stack_protector_apply(start, end, 0); + return 0; +} + +#endif /* _ASM_S390_ARCH_STACKPROTECTOR_H */ diff --git a/arch/s390/include/asm/asce.h b/arch/s390/include/asm/asce.h new file mode 100644 index 000000000000..f6dfaaba735a --- /dev/null +++ b/arch/s390/include/asm/asce.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_ASCE_H +#define _ASM_S390_ASCE_H + +#include <linux/thread_info.h> +#include <linux/irqflags.h> +#include <asm/lowcore.h> +#include <asm/ctlreg.h> + +static inline bool enable_sacf_uaccess(void) +{ + unsigned long flags; + + if (test_thread_flag(TIF_ASCE_PRIMARY)) + return true; + local_irq_save(flags); + local_ctl_load(1, &get_lowcore()->kernel_asce); + set_thread_flag(TIF_ASCE_PRIMARY); + local_irq_restore(flags); + return false; +} + +static inline void disable_sacf_uaccess(bool previous) +{ + unsigned long flags; + + if (previous) + return; + local_irq_save(flags); + local_ctl_load(1, &get_lowcore()->user_asce); + clear_thread_flag(TIF_ASCE_PRIMARY); + local_irq_restore(flags); +} + +#endif /* _ASM_S390_ASCE_H */ diff --git a/arch/s390/include/asm/asm-const.h b/arch/s390/include/asm/asm-const.h index 11f615eb0066..1cfffad9eea0 100644 --- a/arch/s390/include/asm/asm-const.h +++ b/arch/s390/include/asm/asm-const.h @@ -2,7 +2,7 @@ #ifndef _ASM_S390_ASM_CONST_H #define _ASM_S390_ASM_CONST_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define stringify_in_c(...) __VA_ARGS__ #else /* This version of stringify will deal with commas... */ diff --git a/arch/s390/include/asm/asm-extable.h b/arch/s390/include/asm/asm-extable.h index 2e829c16fd8a..d23ea0c94e4e 100644 --- a/arch/s390/include/asm/asm-extable.h +++ b/arch/s390/include/asm/asm-extable.h @@ -14,6 +14,8 @@ #define EX_TYPE_UA_LOAD_REGPAIR 6 #define EX_TYPE_ZEROPAD 7 #define EX_TYPE_FPC 8 +#define EX_TYPE_UA_MVCOS_TO 9 +#define EX_TYPE_UA_MVCOS_FROM 10 #define EX_DATA_REG_ERR_SHIFT 0 #define EX_DATA_REG_ERR GENMASK(3, 0) @@ -84,4 +86,10 @@ #define EX_TABLE_FPC(_fault, _target) \ __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_FPC, __stringify(%%r0), __stringify(%%r0), 0) +#define EX_TABLE_UA_MVCOS_TO(_fault, _target) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_MVCOS_TO, __stringify(%%r0), __stringify(%%r0), 0) + +#define EX_TABLE_UA_MVCOS_FROM(_fault, _target) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_MVCOS_FROM, __stringify(%%r0), __stringify(%%r0), 0) + #endif /* __ASM_EXTABLE_H */ diff --git a/arch/s390/include/asm/asm-prototypes.h b/arch/s390/include/asm/asm-prototypes.h index f662eb4b9246..7bd1801cf241 100644 --- a/arch/s390/include/asm/asm-prototypes.h +++ b/arch/s390/include/asm/asm-prototypes.h @@ -3,6 +3,7 @@ #include <linux/kvm_host.h> #include <linux/ftrace.h> +#include <asm/bug.h> #include <asm/fpu.h> #include <asm/nospec-branch.h> #include <asm-generic/asm-prototypes.h> diff --git a/arch/s390/include/asm/asm.h b/arch/s390/include/asm/asm.h index e9062b01e2a2..510901c2a5f9 100644 --- a/arch/s390/include/asm/asm.h +++ b/arch/s390/include/asm/asm.h @@ -30,7 +30,7 @@ */ #if defined(__GCC_ASM_FLAG_OUTPUTS__) && !(IS_ENABLED(CONFIG_CC_ASM_FLAG_OUTPUT_BROKEN)) -#define __HAVE_ASM_FLAG_OUTPUTS__ +#define __HAVE_ASM_FLAG_OUTPUTS__ 1 #define CC_IPM(sym) #define CC_OUT(sym, var) "=@cc" (var) diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h index 585678bbcd7a..845b77864412 100644 --- a/arch/s390/include/asm/atomic_ops.h +++ b/arch/s390/include/asm/atomic_ops.h @@ -17,7 +17,7 @@ static __always_inline int __atomic_read(const int *ptr) int val; asm volatile( - " l %[val],%[ptr]\n" + " l %[val],%[ptr]" : [val] "=d" (val) : [ptr] "R" (*ptr)); return val; } @@ -26,11 +26,11 @@ static __always_inline void __atomic_set(int *ptr, int val) { if (__builtin_constant_p(val) && val >= S16_MIN && val <= S16_MAX) { asm volatile( - " mvhi %[ptr],%[val]\n" + " mvhi %[ptr],%[val]" : [ptr] "=Q" (*ptr) : [val] "K" (val)); } else { asm volatile( - " st %[val],%[ptr]\n" + " st %[val],%[ptr]" : [ptr] "=R" (*ptr) : [val] "d" (val)); } } @@ -40,7 +40,7 @@ static __always_inline long __atomic64_read(const long *ptr) long val; asm volatile( - " lg %[val],%[ptr]\n" + " lg %[val],%[ptr]" : [val] "=d" (val) : [ptr] "RT" (*ptr)); return val; } @@ -49,11 +49,11 @@ static __always_inline void __atomic64_set(long *ptr, long val) { if (__builtin_constant_p(val) && val >= S16_MIN && val <= S16_MAX) { asm volatile( - " mvghi %[ptr],%[val]\n" + " mvghi %[ptr],%[val]" : [ptr] "=Q" (*ptr) : [val] "K" (val)); } else { asm volatile( - " stg %[val],%[ptr]\n" + " stg %[val],%[ptr]" : [ptr] "=RT" (*ptr) : [val] "d" (val)); } } @@ -66,7 +66,7 @@ static __always_inline op_type op_name(op_type val, op_type *ptr) \ op_type old; \ \ asm volatile( \ - op_string " %[old],%[val],%[ptr]\n" \ + op_string " %[old],%[val],%[ptr]" \ op_barrier \ : [old] "=d" (old), [ptr] "+QS" (*ptr) \ : [val] "d" (val) : "cc", "memory"); \ @@ -75,7 +75,7 @@ static __always_inline op_type op_name(op_type val, op_type *ptr) \ #define __ATOMIC_OPS(op_name, op_type, op_string) \ __ATOMIC_OP(op_name, op_type, op_string, "") \ - __ATOMIC_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") + __ATOMIC_OP(op_name##_barrier, op_type, op_string, "\nbcr 14,0") __ATOMIC_OPS(__atomic_add, int, "laa") __ATOMIC_OPS(__atomic_and, int, "lan") @@ -94,14 +94,14 @@ __ATOMIC_OPS(__atomic64_xor, long, "laxg") static __always_inline void op_name(op_type val, op_type *ptr) \ { \ asm volatile( \ - op_string " %[ptr],%[val]\n" \ + op_string " %[ptr],%[val]" \ op_barrier \ : [ptr] "+QS" (*ptr) : [val] "i" (val) : "cc", "memory");\ } #define __ATOMIC_CONST_OPS(op_name, op_type, op_string) \ __ATOMIC_CONST_OP(op_name, op_type, op_string, "") \ - __ATOMIC_CONST_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") + __ATOMIC_CONST_OP(op_name##_barrier, op_type, op_string, "\nbcr 14,0") __ATOMIC_CONST_OPS(__atomic_add_const, int, "asi") __ATOMIC_CONST_OPS(__atomic64_add_const, long, "agsi") @@ -163,10 +163,10 @@ __ATOMIC64_OPS(__atomic64_xor, "xgr") #undef __ATOMIC64_OPS -#define __atomic_add_const(val, ptr) __atomic_add(val, ptr) -#define __atomic_add_const_barrier(val, ptr) __atomic_add(val, ptr) -#define __atomic64_add_const(val, ptr) __atomic64_add(val, ptr) -#define __atomic64_add_const_barrier(val, ptr) __atomic64_add(val, ptr) +#define __atomic_add_const(val, ptr) ((void)__atomic_add(val, ptr)) +#define __atomic_add_const_barrier(val, ptr) ((void)__atomic_add(val, ptr)) +#define __atomic64_add_const(val, ptr) ((void)__atomic64_add(val, ptr)) +#define __atomic64_add_const_barrier(val, ptr) ((void)__atomic64_add(val, ptr)) #endif /* MARCH_HAS_Z196_FEATURES */ @@ -179,7 +179,7 @@ static __always_inline bool op_name(op_type val, op_type *ptr) \ int cc; \ \ asm volatile( \ - op_string " %[tmp],%[val],%[ptr]\n" \ + op_string " %[tmp],%[val],%[ptr]" \ op_barrier \ : "=@cc" (cc), [tmp] "=d" (tmp), [ptr] "+QS" (*ptr) \ : [val] "d" (val) \ @@ -189,7 +189,7 @@ static __always_inline bool op_name(op_type val, op_type *ptr) \ #define __ATOMIC_TEST_OPS(op_name, op_type, op_string) \ __ATOMIC_TEST_OP(op_name, op_type, op_string, "") \ - __ATOMIC_TEST_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") + __ATOMIC_TEST_OP(op_name##_barrier, op_type, op_string, "\nbcr 14,0") __ATOMIC_TEST_OPS(__atomic_add_and_test, int, "laal") __ATOMIC_TEST_OPS(__atomic64_add_and_test, long, "laalg") @@ -203,7 +203,7 @@ static __always_inline bool op_name(op_type val, op_type *ptr) \ int cc; \ \ asm volatile( \ - op_string " %[ptr],%[val]\n" \ + op_string " %[ptr],%[val]" \ op_barrier \ : "=@cc" (cc), [ptr] "+QS" (*ptr) \ : [val] "i" (val) \ @@ -213,7 +213,7 @@ static __always_inline bool op_name(op_type val, op_type *ptr) \ #define __ATOMIC_CONST_TEST_OPS(op_name, op_type, op_string) \ __ATOMIC_CONST_TEST_OP(op_name, op_type, op_string, "") \ - __ATOMIC_CONST_TEST_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") + __ATOMIC_CONST_TEST_OP(op_name##_barrier, op_type, op_string, "\nbcr 14,0") __ATOMIC_CONST_TEST_OPS(__atomic_add_const_and_test, int, "alsi") __ATOMIC_CONST_TEST_OPS(__atomic64_add_const_and_test, long, "algsi") diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h index d82130d7f2b6..dad02f5b3c8d 100644 --- a/arch/s390/include/asm/barrier.h +++ b/arch/s390/include/asm/barrier.h @@ -18,9 +18,9 @@ #ifdef MARCH_HAS_Z196_FEATURES /* Fast-BCR without checkpoint synchronization */ -#define __ASM_BCR_SERIALIZE "bcr 14,0\n" +#define __ASM_BCR_SERIALIZE "bcr 14,0" #else -#define __ASM_BCR_SERIALIZE "bcr 15,0\n" +#define __ASM_BCR_SERIALIZE "bcr 15,0" #endif static __always_inline void bcr_serialize(void) @@ -62,19 +62,19 @@ do { \ * @size: number of elements in array */ #define array_index_mask_nospec array_index_mask_nospec -static inline unsigned long array_index_mask_nospec(unsigned long index, - unsigned long size) +static __always_inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) { unsigned long mask; if (__builtin_constant_p(size) && size > 0) { asm(" clgr %2,%1\n" - " slbgr %0,%0\n" + " slbgr %0,%0" :"=d" (mask) : "d" (size-1), "d" (index) :"cc"); return mask; } asm(" clgr %1,%2\n" - " slbgr %0,%0\n" + " slbgr %0,%0" :"=d" (mask) : "d" (size), "d" (index) :"cc"); return ~mask; } diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index d5125296ade2..5f10074665b0 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -53,12 +53,16 @@ static __always_inline bool arch_test_bit(unsigned long nr, const volatile unsig unsigned long mask; int cc; - if (__builtin_constant_p(nr)) { + /* + * With CONFIG_PROFILE_ALL_BRANCHES enabled gcc fails to + * handle __builtin_constant_p() in some cases. + */ + if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && __builtin_constant_p(nr)) { addr = (const volatile unsigned char *)ptr; addr += (nr ^ (BITS_PER_LONG - BITS_PER_BYTE)) / BITS_PER_BYTE; mask = 1UL << (nr & (BITS_PER_BYTE - 1)); asm volatile( - " tm %[addr],%[mask]\n" + " tm %[addr],%[mask]" : "=@cc" (cc) : [addr] "Q" (*addr), [mask] "I" (mask) ); @@ -118,6 +122,8 @@ static inline bool test_bit_inv(unsigned long nr, return test_bit(nr ^ (BITS_PER_LONG - 1), ptr); } +#ifndef CONFIG_CC_HAS_BUILTIN_FFS + /** * __flogr - find leftmost one * @word - The word to search @@ -126,11 +132,12 @@ static inline bool test_bit_inv(unsigned long nr, * where the most significant bit has bit number 0. * If no bit is set this function returns 64. */ -static inline unsigned char __flogr(unsigned long word) +static __always_inline __attribute_const__ unsigned long __flogr(unsigned long word) { - if (__builtin_constant_p(word)) { - unsigned long bit = 0; + unsigned long bit; + if (__builtin_constant_p(word)) { + bit = 0; if (!word) return 64; if (!(word & 0xffffffff00000000UL)) { @@ -159,86 +166,49 @@ static inline unsigned char __flogr(unsigned long word) } return bit; } else { - union register_pair rp; + union register_pair rp __uninitialized; rp.even = word; - asm volatile( - " flogr %[rp],%[rp]\n" - : [rp] "+d" (rp.pair) : : "cc"); - return rp.even; + asm("flogr %[rp],%[rp]" + : [rp] "+d" (rp.pair) : : "cc"); + bit = rp.even; + /* + * The result of the flogr instruction is a value in the range + * of 0..64. Let the compiler know that the AND operation can + * be optimized away. + */ + __assume(bit <= 64); + return bit & 127; } } /** - * __ffs - find first bit in word. - * @word: The word to search - * - * Undefined if no bit exists, so code should check against 0 first. - */ -static inline unsigned long __ffs(unsigned long word) -{ - return __flogr(-word & word) ^ (BITS_PER_LONG - 1); -} - -/** * ffs - find first bit set * @word: the word to search * * This is defined the same way as the libc and * compiler builtin ffs routines (man ffs). */ -static inline int ffs(int word) +static __always_inline __flatten __attribute_const__ int ffs(int word) { - unsigned long mask = 2 * BITS_PER_LONG - 1; unsigned int val = (unsigned int)word; - return (1 + (__flogr(-val & val) ^ (BITS_PER_LONG - 1))) & mask; -} - -/** - * __fls - find last (most-significant) set bit in a long word - * @word: the word to search - * - * Undefined if no set bit exists, so code should check against 0 first. - */ -static inline unsigned long __fls(unsigned long word) -{ - return __flogr(word) ^ (BITS_PER_LONG - 1); + return BITS_PER_LONG - __flogr(-val & val); } -/** - * fls64 - find last set bit in a 64-bit word - * @word: the word to search - * - * This is defined in a similar way as the libc and compiler builtin - * ffsll, but returns the position of the most significant set bit. - * - * fls64(value) returns 0 if value is 0 or the position of the last - * set bit if value is nonzero. The last (most significant) bit is - * at position 64. - */ -static inline int fls64(unsigned long word) -{ - unsigned long mask = 2 * BITS_PER_LONG - 1; +#else /* CONFIG_CC_HAS_BUILTIN_FFS */ - return (1 + (__flogr(word) ^ (BITS_PER_LONG - 1))) & mask; -} +#include <asm-generic/bitops/builtin-ffs.h> -/** - * fls - find last (most-significant) bit set - * @word: the word to search - * - * This is defined the same way as ffs. - * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. - */ -static inline int fls(unsigned int word) -{ - return fls64(word); -} +#endif /* CONFIG_CC_HAS_BUILTIN_FFS */ +#include <asm-generic/bitops/builtin-__ffs.h> +#include <asm-generic/bitops/ffz.h> +#include <asm-generic/bitops/builtin-__fls.h> +#include <asm-generic/bitops/builtin-fls.h> +#include <asm-generic/bitops/fls64.h> #include <asm/arch_hweight.h> #include <asm-generic/bitops/const_hweight.h> -#include <asm-generic/bitops/ffz.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/le.h> #include <asm-generic/bitops/ext2-atomic-setbit.h> diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index c500d45fb465..59017fd3d935 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -3,68 +3,126 @@ #define _ASM_S390_BUG_H #include <linux/compiler.h> +#include <linux/const.h> -#ifdef CONFIG_BUG +#define MONCODE_BUG _AC(0, U) +#define MONCODE_BUG_ARG _AC(1, U) + +#ifndef __ASSEMBLER__ +#if defined(CONFIG_BUG) && defined(CONFIG_CC_HAS_ASM_IMMEDIATE_STRINGS) #ifdef CONFIG_DEBUG_BUGVERBOSE +#define __BUG_ENTRY_VERBOSE(format, file, line) \ + " .long " format " - . # bug_entry::format\n" \ + " .long " file " - . # bug_entry::file\n" \ + " .short " line " # bug_entry::line\n" +#else +#define __BUG_ENTRY_VERBOSE(format, file, line) +#endif -#define __EMIT_BUG(x) do { \ - asm_inline volatile( \ - "0: mc 0,0\n" \ - ".section .rodata.str,\"aMS\",@progbits,1\n" \ - "1: .asciz \""__FILE__"\"\n" \ - ".previous\n" \ - ".section __bug_table,\"aw\"\n" \ - "2: .long 0b-.\n" \ - " .long 1b-.\n" \ - " .short %0,%1\n" \ - " .org 2b+%2\n" \ - ".previous\n" \ - : : "i" (__LINE__), \ - "i" (x), \ - "i" (sizeof(struct bug_entry))); \ -} while (0) +#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED +#define WARN_CONDITION_STR(cond_str) cond_str +#else +#define WARN_CONDITION_STR(cond_str) "" +#endif + +#define __BUG_ENTRY(format, file, line, flags, size) \ + " .section __bug_table,\"aw\"\n" \ + "1: .long 0b - . # bug_entry::bug_addr\n" \ + __BUG_ENTRY_VERBOSE(format, file, line) \ + " .short "flags" # bug_entry::flags\n" \ + " .org 1b+"size"\n" \ + " .previous" -#else /* CONFIG_DEBUG_BUGVERBOSE */ - -#define __EMIT_BUG(x) do { \ - asm_inline volatile( \ - "0: mc 0,0\n" \ - ".section __bug_table,\"aw\"\n" \ - "1: .long 0b-.\n" \ - " .short %0\n" \ - " .org 1b+%1\n" \ - ".previous\n" \ - : : "i" (x), \ - "i" (sizeof(struct bug_entry))); \ +#define __BUG_ASM(cond_str, flags) \ +do { \ + asm_inline volatile("\n" \ + "0: mc %[monc](%%r0),0\n" \ + __BUG_ENTRY("%[frmt]", "%[file]", "%[line]", \ + "%[flgs]", "%[size]") \ + : \ + : [monc] "i" (MONCODE_BUG), \ + [frmt] "i" (WARN_CONDITION_STR(cond_str)), \ + [file] "i" (__FILE__), \ + [line] "i" (__LINE__), \ + [flgs] "i" (flags), \ + [size] "i" (sizeof(struct bug_entry))); \ } while (0) -#endif /* CONFIG_DEBUG_BUGVERBOSE */ +#define BUG() \ +do { \ + __BUG_ASM("", 0); \ + unreachable(); \ +} while (0) -#define BUG() do { \ - __EMIT_BUG(0); \ - unreachable(); \ +#define __WARN_FLAGS(cond_str, flags) \ +do { \ + __BUG_ASM(cond_str, BUGFLAG_WARNING | (flags)); \ } while (0) -#define __WARN_FLAGS(flags) do { \ - __EMIT_BUG(BUGFLAG_WARNING|(flags)); \ +#define __WARN_bug_entry(flags, format) \ +({ \ + struct bug_entry *bug; \ + \ + asm_inline volatile("\n" \ + "0: larl %[bug],1f\n" \ + __BUG_ENTRY("%[frmt]", "%[file]", "%[line]", \ + "%[flgs]", "%[size]") \ + : [bug] "=d" (bug) \ + : [frmt] "i" (format), \ + [file] "i" (__FILE__), \ + [line] "i" (__LINE__), \ + [flgs] "i" (flags), \ + [size] "i" (sizeof(struct bug_entry))); \ + bug; \ +}) + +/* + * Variable Argument List (va_list) as defined in ELF Application + * Binary Interface s390x Supplement documentation. + */ +struct arch_va_list { + long __gpr; + long __fpr; + void *__overflow_arg_area; + void *__reg_save_area; +}; + +struct bug_entry; +struct pt_regs; + +void *__warn_args(struct arch_va_list *args, struct pt_regs *regs); +void __WARN_trap(struct bug_entry *bug, ...); + +#define __WARN_print_arg(flags, format, arg...) \ +do { \ + int __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_ARGS; \ + \ + __WARN_trap(__WARN_bug_entry(__flags, format), ## arg); \ + /* prevent tail-call optimization */ \ + asm(""); \ } while (0) -#define WARN_ON(x) ({ \ - int __ret_warn_on = !!(x); \ - if (__builtin_constant_p(__ret_warn_on)) { \ - if (__ret_warn_on) \ - __WARN(); \ - } else { \ - if (unlikely(__ret_warn_on)) \ - __WARN(); \ - } \ - unlikely(__ret_warn_on); \ +#define __WARN_printf(taint, fmt, arg...) \ + __WARN_print_arg(BUGFLAG_TAINT(taint), fmt, ## arg) + +#define WARN_ONCE(cond, format, arg...) \ +({ \ + int __ret_warn_on = !!(cond); \ + \ + if (unlikely(__ret_warn_on)) { \ + __WARN_print_arg(BUGFLAG_ONCE|BUGFLAG_TAINT(TAINT_WARN),\ + format, ## arg); \ + } \ + __ret_warn_on; \ }) #define HAVE_ARCH_BUG -#define HAVE_ARCH_WARN_ON -#endif /* CONFIG_BUG */ +#define HAVE_ARCH_BUG_FORMAT +#define HAVE_ARCH_BUG_FORMAT_ARGS + +#endif /* CONFIG_BUG && CONFIG_CC_HAS_ASM_IMMEDIATE_STRINGS */ +#endif /* __ASSEMBLER__ */ #include <asm-generic/bug.h> diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h index d86dea5900e7..7e83dc2d3b06 100644 --- a/arch/s390/include/asm/checksum.h +++ b/arch/s390/include/asm/checksum.h @@ -27,7 +27,7 @@ static inline __wsum cksm(const void *buff, int len, __wsum sum) kmsan_check_memory(buff, len); asm volatile( "0: cksm %[sum],%[rp]\n" - " jo 0b\n" + " jo 0b" : [sum] "+&d" (sum), [rp] "+&d" (rp.pair) : : "cc", "memory"); return sum; } diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index b6b619f340a5..0a82ae2300b6 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -18,6 +18,8 @@ #include <asm/scsw.h> +#define CCW_MAX_BYTE_COUNT 65535 + /** * struct ccw1 - channel command word * @cmd_code: command code diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h index a9e2006033b7..008357996262 100644 --- a/arch/s390/include/asm/cmpxchg.h +++ b/arch/s390/include/asm/cmpxchg.h @@ -18,7 +18,7 @@ void __cmpxchg_called_with_bad_pointer(void); static __always_inline u32 __cs_asm(u64 ptr, u32 old, u32 new) { asm volatile( - " cs %[old],%[new],%[ptr]\n" + " cs %[old],%[new],%[ptr]" : [old] "+d" (old), [ptr] "+Q" (*(u32 *)ptr) : [new] "d" (new) : "memory", "cc"); @@ -28,7 +28,7 @@ static __always_inline u32 __cs_asm(u64 ptr, u32 old, u32 new) static __always_inline u64 __csg_asm(u64 ptr, u64 old, u64 new) { asm volatile( - " csg %[old],%[new],%[ptr]\n" + " csg %[old],%[new],%[ptr]" : [old] "+d" (old), [ptr] "+QS" (*(u64 *)ptr) : [new] "d" (new) : "memory", "cc"); @@ -126,7 +126,7 @@ static __always_inline u64 __arch_cmpxchg(u64 ptr, u64 old, u64 new, int size) } \ case 4: { \ asm volatile( \ - " cs %[__old],%[__new],%[__ptr]\n" \ + " cs %[__old],%[__new],%[__ptr]" \ : [__old] "+d" (*__oldp), \ [__ptr] "+Q" (*(ptr)), \ "=@cc" (__cc) \ @@ -136,7 +136,7 @@ static __always_inline u64 __arch_cmpxchg(u64 ptr, u64 old, u64 new, int size) } \ case 8: { \ asm volatile( \ - " csg %[__old],%[__new],%[__ptr]\n" \ + " csg %[__old],%[__new],%[__ptr]" \ : [__old] "+d" (*__oldp), \ [__ptr] "+QS" (*(ptr)), \ "=@cc" (__cc) \ @@ -241,7 +241,7 @@ static __always_inline u64 __arch_xchg(u64 ptr, u64 x, int size) static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new) { asm volatile( - " cdsg %[old],%[new],%[ptr]\n" + " cdsg %[old],%[new],%[ptr]" : [old] "+d" (old), [ptr] "+QS" (*ptr) : [new] "d" (new) : "memory", "cc"); @@ -258,7 +258,7 @@ static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, int cc; asm volatile( - " cdsg %[old],%[new],%[ptr]\n" + " cdsg %[old],%[new],%[ptr]" : [old] "+d" (*oldp), [ptr] "+QS" (*ptr), "=@cc" (cc) : [new] "d" (new) : "memory"); diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h deleted file mode 100644 index 3cb9d813f022..000000000000 --- a/arch/s390/include/asm/compat.h +++ /dev/null @@ -1,140 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390X_COMPAT_H -#define _ASM_S390X_COMPAT_H -/* - * Architecture specific compatibility types - */ -#include <linux/types.h> -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/thread_info.h> -#include <asm/ptrace.h> - -#define compat_mode_t compat_mode_t -typedef u16 compat_mode_t; - -#define __compat_uid_t __compat_uid_t -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; - -#define compat_dev_t compat_dev_t -typedef u16 compat_dev_t; - -#define compat_ipc_pid_t compat_ipc_pid_t -typedef u16 compat_ipc_pid_t; - -#define compat_statfs compat_statfs - -#include <asm-generic/compat.h> - -#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p( \ - typeof(0?(__force t)0:0ULL), u64)) - -#define __SC_DELOUSE(t,v) ({ \ - BUILD_BUG_ON(sizeof(t) > 4 && !__TYPE_IS_PTR(t)); \ - (__force t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \ -}) - -#define PSW32_MASK_USER 0x0000FF00UL - -#define PSW32_USER_BITS (PSW32_MASK_DAT | PSW32_MASK_IO | PSW32_MASK_EXT | \ - PSW32_DEFAULT_KEY | PSW32_MASK_BASE | \ - PSW32_MASK_MCHECK | PSW32_MASK_PSTATE | \ - PSW32_ASC_PRIMARY) - -#define COMPAT_UTS_MACHINE "s390\0\0\0\0" - -typedef u16 compat_nlink_t; - -typedef struct { - u32 mask; - u32 addr; -} __aligned(8) psw_compat_t; - -typedef struct { - psw_compat_t psw; - u32 gprs[NUM_GPRS]; - u32 acrs[NUM_ACRS]; - u32 orig_gpr2; -} s390_compat_regs; - -typedef struct { - u32 gprs_high[NUM_GPRS]; -} s390_compat_regs_high; - -struct compat_stat { - compat_dev_t st_dev; - u16 __pad1; - compat_ino_t st_ino; - compat_mode_t st_mode; - compat_nlink_t st_nlink; - __compat_uid_t st_uid; - __compat_gid_t st_gid; - compat_dev_t st_rdev; - u16 __pad2; - u32 st_size; - u32 st_blksize; - u32 st_blocks; - u32 st_atime; - u32 st_atime_nsec; - u32 st_mtime; - u32 st_mtime_nsec; - u32 st_ctime; - u32 st_ctime_nsec; - u32 __unused4; - u32 __unused5; -}; - -struct compat_statfs { - u32 f_type; - u32 f_bsize; - u32 f_blocks; - u32 f_bfree; - u32 f_bavail; - u32 f_files; - u32 f_ffree; - compat_fsid_t f_fsid; - u32 f_namelen; - u32 f_frsize; - u32 f_flags; - u32 f_spare[4]; -}; - -struct compat_statfs64 { - u32 f_type; - u32 f_bsize; - u64 f_blocks; - u64 f_bfree; - u64 f_bavail; - u64 f_files; - u64 f_ffree; - compat_fsid_t f_fsid; - u32 f_namelen; - u32 f_frsize; - u32 f_flags; - u32 f_spare[5]; -}; - -/* - * A pointer passed in from user mode. This should not - * be used for syscall parameters, just declare them - * as pointers because the syscall entry code will have - * appropriately converted them already. - */ - -static inline void __user *compat_ptr(compat_uptr_t uptr) -{ - return (void __user *)(unsigned long)(uptr & 0x7fffffffUL); -} -#define compat_ptr(uptr) compat_ptr(uptr) - -#ifdef CONFIG_COMPAT - -static inline int is_compat_task(void) -{ - return test_thread_flag(TIF_31BIT); -} - -#endif - -#endif /* _ASM_S390X_COMPAT_H */ diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index 59ab1192e2d5..a83683169d98 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -129,6 +129,10 @@ #define CPACF_KMAC_HMAC_SHA_256 0x71 #define CPACF_KMAC_HMAC_SHA_384 0x72 #define CPACF_KMAC_HMAC_SHA_512 0x73 +#define CPACF_KMAC_PHMAC_SHA_224 0x78 +#define CPACF_KMAC_PHMAC_SHA_256 0x79 +#define CPACF_KMAC_PHMAC_SHA_384 0x7a +#define CPACF_KMAC_PHMAC_SHA_512 0x7b /* * Function codes for the PCKMO (PERFORM CRYPTOGRAPHIC KEY MANAGEMENT) @@ -225,7 +229,7 @@ static __always_inline void __cpacf_query_rre(u32 opc, u8 r1, u8 r2, asm volatile( " la %%r1,%[pb]\n" " lghi %%r0,%[fc]\n" - " .insn rre,%[opc] << 16,%[r1],%[r2]\n" + " .insn rre,%[opc] << 16,%[r1],%[r2]" : [pb] "=R" (*pb) : [opc] "i" (opc), [fc] "i" (fc), [r1] "i" (r1), [r2] "i" (r2) @@ -238,7 +242,7 @@ static __always_inline void __cpacf_query_rrf(u32 opc, u8 r1, u8 r2, u8 r3, asm volatile( " la %%r1,%[pb]\n" " lghi %%r0,%[fc]\n" - " .insn rrf,%[opc] << 16,%[r1],%[r2],%[r3],%[m4]\n" + " .insn rrf,%[opc] << 16,%[r1],%[r2],%[r3],%[m4]" : [pb] "=R" (*pb) : [opc] "i" (opc), [fc] "i" (fc), [r1] "i" (r1), [r2] "i" (r2), [r3] "i" (r3), [m4] "i" (m4) @@ -412,7 +416,7 @@ static inline int cpacf_km(unsigned long func, void *param, " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,%[dst],%[src]\n" - " brc 1,0b\n" /* handle partial completion */ + " brc 1,0b" /* handle partial completion */ : [src] "+&d" (s.pair), [dst] "+&d" (d.pair) : [fc] "d" (func), [pba] "d" ((unsigned long)param), [opc] "i" (CPACF_KM) @@ -444,7 +448,7 @@ static inline int cpacf_kmc(unsigned long func, void *param, " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,%[dst],%[src]\n" - " brc 1,0b\n" /* handle partial completion */ + " brc 1,0b" /* handle partial completion */ : [src] "+&d" (s.pair), [dst] "+&d" (d.pair) : [fc] "d" (func), [pba] "d" ((unsigned long)param), [opc] "i" (CPACF_KMC) @@ -472,7 +476,7 @@ static inline void cpacf_kimd(unsigned long func, void *param, " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" "0: .insn rrf,%[opc] << 16,0,%[src],8,0\n" - " brc 1,0b\n" /* handle partial completion */ + " brc 1,0b" /* handle partial completion */ : [src] "+&d" (s.pair) : [fc] "d" (func), [pba] "d" ((unsigned long)(param)), [opc] "i" (CPACF_KIMD) @@ -497,7 +501,7 @@ static inline void cpacf_klmd(unsigned long func, void *param, " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" "0: .insn rrf,%[opc] << 16,0,%[src],8,0\n" - " brc 1,0b\n" /* handle partial completion */ + " brc 1,0b" /* handle partial completion */ : [src] "+&d" (s.pair) : [fc] "d" (func), [pba] "d" ((unsigned long)param), [opc] "i" (CPACF_KLMD) @@ -526,7 +530,7 @@ static inline int _cpacf_kmac(unsigned long *gr0, void *param, " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,%[src]\n" " brc 1,0b\n" /* handle partial completion */ - " lgr %[r0],0\n" + " lgr %[r0],0" : [r0] "+d" (*gr0), [src] "+&d" (s.pair) : [pba] "d" ((unsigned long)param), [opc] "i" (CPACF_KMAC) @@ -576,7 +580,7 @@ static inline int cpacf_kmctr(unsigned long func, void *param, u8 *dest, " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" "0: .insn rrf,%[opc] << 16,%[dst],%[src],%[ctr],0\n" - " brc 1,0b\n" /* handle partial completion */ + " brc 1,0b" /* handle partial completion */ : [src] "+&d" (s.pair), [dst] "+&d" (d.pair), [ctr] "+&d" (c.pair) : [fc] "d" (func), [pba] "d" ((unsigned long)param), @@ -610,7 +614,7 @@ static inline void cpacf_prno(unsigned long func, void *param, " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,%[dst],%[seed]\n" - " brc 1,0b\n" /* handle partial completion */ + " brc 1,0b" /* handle partial completion */ : [dst] "+&d" (d.pair) : [fc] "d" (func), [pba] "d" ((unsigned long)param), [seed] "d" (s.pair), [opc] "i" (CPACF_PRNO) @@ -636,7 +640,7 @@ static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len, asm volatile ( " lghi 0,%[fc]\n" "0: .insn rre,%[opc] << 16,%[ucbuf],%[cbuf]\n" - " brc 1,0b\n" /* handle partial completion */ + " brc 1,0b" /* handle partial completion */ : [ucbuf] "+&d" (u.pair), [cbuf] "+&d" (c.pair) : [fc] "K" (CPACF_PRNO_TRNG), [opc] "i" (CPACF_PRNO) : "cc", "memory", "0"); @@ -649,18 +653,30 @@ static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len, * instruction * @func: the function code passed to PCC; see CPACF_KM_xxx defines * @param: address of parameter block; see POP for details on each func + * + * Returns the condition code, this is + * 0 - cc code 0 (normal completion) + * 1 - cc code 1 (protected key wkvp mismatch or src operand out of range) + * 2 - cc code 2 (something invalid, scalar multiply infinity, ...) + * Condition code 3 (partial completion) is handled within the asm code + * and never returned. */ -static inline void cpacf_pcc(unsigned long func, void *param) +static inline int cpacf_pcc(unsigned long func, void *param) { + int cc; + asm volatile( " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,0\n" /* PCC opcode */ " brc 1,0b\n" /* handle partial completion */ - : + CC_IPM(cc) + : CC_OUT(cc, cc) : [fc] "d" (func), [pba] "d" ((unsigned long)param), [opc] "i" (CPACF_PCC) - : "cc", "memory", "0", "1"); + : CC_CLOBBER_LIST("memory", "0", "1")); + + return CC_TRANSFORM(cc); } /** @@ -676,7 +692,7 @@ static inline void cpacf_pckmo(long func, void *param) asm volatile( " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" - " .insn rre,%[opc] << 16,0,0\n" /* PCKMO opcode */ + " .insn rre,%[opc] << 16,0,0" /* PCKMO opcode */ : : [fc] "d" (func), [pba] "d" ((unsigned long)param), [opc] "i" (CPACF_PCKMO) @@ -709,7 +725,7 @@ static inline void cpacf_kma(unsigned long func, void *param, u8 *dest, " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" "0: .insn rrf,%[opc] << 16,%[dst],%[src],%[aad],0\n" - " brc 1,0b\n" /* handle partial completion */ + " brc 1,0b" /* handle partial completion */ : [dst] "+&d" (d.pair), [src] "+&d" (s.pair), [aad] "+&d" (a.pair) : [fc] "d" (func), [pba] "d" ((unsigned long)param), diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h index 26c710cd3485..5672e3fab52b 100644 --- a/arch/s390/include/asm/cpu.h +++ b/arch/s390/include/asm/cpu.h @@ -9,7 +9,7 @@ #ifndef _ASM_S390_CPU_H #define _ASM_S390_CPU_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <linux/jump_label.h> @@ -24,5 +24,5 @@ struct cpuid DECLARE_STATIC_KEY_FALSE(cpu_has_bear); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_S390_CPU_H */ diff --git a/arch/s390/include/asm/cpu_mf-insn.h b/arch/s390/include/asm/cpu_mf-insn.h index a68b362e0964..941663939cc7 100644 --- a/arch/s390/include/asm/cpu_mf-insn.h +++ b/arch/s390/include/asm/cpu_mf-insn.h @@ -8,7 +8,7 @@ #ifndef _ASM_S390_CPU_MF_INSN_H #define _ASM_S390_CPU_MF_INSN_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* Macro to generate the STCCTM instruction with a customized * M3 field designating the counter set. @@ -17,6 +17,6 @@ .insn rsy,0xeb0000000017,\r1,\m3 & 0xf,\db2 .endm -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index e1a279e0d6a6..1798fbd59068 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -171,7 +171,7 @@ static inline int qctri(struct cpumf_ctr_info *info) { int rc = -EINVAL; - asm volatile ( + asm_inline volatile ( "0: qctri %1\n" "1: lhi %0,0\n" "2:\n" @@ -185,7 +185,7 @@ static inline int lcctl(u64 ctl) { int cc; - asm volatile ( + asm_inline volatile ( " lcctl %[ctl]\n" CC_IPM(cc) : CC_OUT(cc, cc) @@ -200,7 +200,7 @@ static inline int __ecctr(u64 ctr, u64 *content) u64 _content; int cc; - asm volatile ( + asm_inline volatile ( " ecctr %[_content],%[ctr]\n" CC_IPM(cc) : CC_OUT(cc, cc), [_content] "=d" (_content) diff --git a/arch/s390/include/asm/cpufeature.h b/arch/s390/include/asm/cpufeature.h index 931204613753..d6fb999c8c6d 100644 --- a/arch/s390/include/asm/cpufeature.h +++ b/arch/s390/include/asm/cpufeature.h @@ -9,10 +9,13 @@ #ifndef __ASM_S390_CPUFEATURE_H #define __ASM_S390_CPUFEATURE_H +#include <asm/facility.h> + enum { S390_CPU_FEATURE_MSA, S390_CPU_FEATURE_VXRS, S390_CPU_FEATURE_UV, + S390_CPU_FEATURE_D288, MAX_CPU_FEATURES }; @@ -20,4 +23,15 @@ enum { int cpu_have_feature(unsigned int nr); +#define cpu_has_bear() test_facility(193) +#define cpu_has_edat1() test_facility(8) +#define cpu_has_edat2() test_facility(78) +#define cpu_has_gs() test_facility(133) +#define cpu_has_nx() test_facility(130) +#define cpu_has_rdp() test_facility(194) +#define cpu_has_seq_insn() test_facility(85) +#define cpu_has_tlb_lc() test_facility(51) +#define cpu_has_topology() test_facility(11) +#define cpu_has_vx() test_facility(129) + #endif /* __ASM_S390_CPUFEATURE_H */ diff --git a/arch/s390/include/asm/ctlreg.h b/arch/s390/include/asm/ctlreg.h index e6527f51ad0b..1765a0320933 100644 --- a/arch/s390/include/asm/ctlreg.h +++ b/arch/s390/include/asm/ctlreg.h @@ -80,7 +80,7 @@ #define CR14_EXTERNAL_DAMAGE_SUBMASK BIT(CR14_EXTERNAL_DAMAGE_SUBMASK_BIT) #define CR14_WARNING_SUBMASK BIT(CR14_WARNING_SUBMASK_BIT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/bug.h> @@ -100,7 +100,7 @@ struct ctlreg { BUILD_BUG_ON(sizeof(struct addrtype) != _esize); \ typecheck(struct ctlreg, array[0]); \ asm volatile( \ - " lctlg %[_low],%[_high],%[_arr]\n" \ + " lctlg %[_low],%[_high],%[_arr]" \ : \ : [_arr] "Q" (*(struct addrtype *)(&array)), \ [_low] "i" (low), [_high] "i" (high) \ @@ -119,7 +119,7 @@ struct ctlreg { BUILD_BUG_ON(sizeof(struct addrtype) != _esize); \ typecheck(struct ctlreg, array[0]); \ asm volatile( \ - " stctg %[_low],%[_high],%[_arr]\n" \ + " stctg %[_low],%[_high],%[_arr]" \ : [_arr] "=Q" (*(struct addrtype *)(&array)) \ : [_low] "i" (low), [_high] "i" (high)); \ } while (0) @@ -127,7 +127,7 @@ struct ctlreg { static __always_inline void local_ctl_load(unsigned int cr, struct ctlreg *reg) { asm volatile( - " lctlg %[cr],%[cr],%[reg]\n" + " lctlg %[cr],%[cr],%[reg]" : : [reg] "Q" (*reg), [cr] "i" (cr) : "memory"); @@ -136,7 +136,7 @@ static __always_inline void local_ctl_load(unsigned int cr, struct ctlreg *reg) static __always_inline void local_ctl_store(unsigned int cr, struct ctlreg *reg) { asm volatile( - " stctg %[cr],%[cr],%[reg]\n" + " stctg %[cr],%[cr],%[reg]" : [reg] "=Q" (*reg) : [cr] "i" (cr)); } @@ -252,5 +252,5 @@ union ctlreg15 { }; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_S390_CTLREG_H */ diff --git a/arch/s390/include/asm/current.h b/arch/s390/include/asm/current.h index d03a922c641e..f9529f7cf62c 100644 --- a/arch/s390/include/asm/current.h +++ b/arch/s390/include/asm/current.h @@ -11,9 +11,25 @@ #define _S390_CURRENT_H #include <asm/lowcore.h> +#include <asm/machine.h> struct task_struct; -#define current ((struct task_struct *const)get_lowcore()->current_task) +static __always_inline struct task_struct *get_current(void) +{ + unsigned long ptr, lc_current; + + lc_current = offsetof(struct lowcore, current_task); + asm_inline( + ALTERNATIVE(" lg %[ptr],%[offzero](%%r0)\n", + " lg %[ptr],%[offalt](%%r0)\n", + ALT_FEATURE(MFEATURE_LOWCORE)) + : [ptr] "=d" (ptr) + : [offzero] "i" (lc_current), + [offalt] "i" (lc_current + LOWCORE_ALT_ADDRESS)); + return (struct task_struct *)ptr; +} + +#define current get_current() #endif /* !(_S390_CURRENT_H) */ diff --git a/arch/s390/include/asm/dat-bits.h b/arch/s390/include/asm/dat-bits.h index 8d65eec2f124..c40874e0e426 100644 --- a/arch/s390/include/asm/dat-bits.h +++ b/arch/s390/include/asm/dat-bits.h @@ -9,6 +9,32 @@ #ifndef _S390_DAT_BITS_H #define _S390_DAT_BITS_H +/* + * vaddress union in order to easily decode a virtual address into its + * region first index, region second index etc. parts. + */ +union vaddress { + unsigned long addr; + struct { + unsigned long rfx : 11; + unsigned long rsx : 11; + unsigned long rtx : 11; + unsigned long sx : 11; + unsigned long px : 8; + unsigned long bx : 12; + }; + struct { + unsigned long rfx01 : 2; + unsigned long : 9; + unsigned long rsx01 : 2; + unsigned long : 9; + unsigned long rtx01 : 2; + unsigned long : 9; + unsigned long sx01 : 2; + unsigned long : 29; + }; +}; + union asce { unsigned long val; struct { @@ -98,7 +124,8 @@ union region3_table_entry { struct { unsigned long : 53; unsigned long fc: 1; /* Format-Control */ - unsigned long : 4; + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 3; unsigned long i : 1; /* Region-Invalid Bit */ unsigned long cr: 1; /* Common-Region Bit */ unsigned long tt: 2; /* Table-Type Bits */ @@ -140,7 +167,8 @@ union segment_table_entry { struct { unsigned long : 53; unsigned long fc: 1; /* Format-Control */ - unsigned long : 4; + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 3; unsigned long i : 1; /* Segment-Invalid Bit */ unsigned long cs: 1; /* Common-Segment Bit */ unsigned long tt: 2; /* Table-Type Bits */ diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index 6375276d94ea..c5440b3ee53d 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -45,7 +45,7 @@ typedef struct debug_info { struct debug_info *next; struct debug_info *prev; refcount_t ref_count; - spinlock_t lock; + raw_spinlock_t lock; int level; int nr_areas; int pages_per_area; @@ -440,7 +440,7 @@ static int VNAME(var, active_entries)[EARLY_AREAS] __initdata .next = NULL, \ .prev = NULL, \ .ref_count = REFCOUNT_INIT(1), \ - .lock = __SPIN_LOCK_UNLOCKED(var.lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(var.lock), \ .level = DEBUG_DEFAULT_LEVEL, \ .nr_areas = EARLY_AREAS, \ .pages_per_area = EARLY_PAGES, \ diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 5790630e31f0..8db8db3b1018 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -66,7 +66,7 @@ static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn) end_addr = pfn_to_phys(start_pfn + num_pfn - 1); diag_stat_inc(DIAG_STAT_X010); - asm volatile( + asm_inline volatile( "0: diag %0,%1,0x10\n" "1: nopr %%r7\n" EX_TABLE(0b, 1b) diff --git a/arch/s390/include/asm/diag288.h b/arch/s390/include/asm/diag288.h new file mode 100644 index 000000000000..5e1b43cea9d6 --- /dev/null +++ b/arch/s390/include/asm/diag288.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_DIAG288_H +#define _ASM_S390_DIAG288_H + +#include <asm/asm-extable.h> +#include <asm/types.h> + +#define MIN_INTERVAL 15 /* Minimal time supported by diag288 */ +#define MAX_INTERVAL 3600 /* One hour should be enough - pure estimation */ + +#define WDT_DEFAULT_TIMEOUT 30 + +/* Function codes - init, change, cancel */ +#define WDT_FUNC_INIT 0 +#define WDT_FUNC_CHANGE 1 +#define WDT_FUNC_CANCEL 2 +#define WDT_FUNC_CONCEAL 0x80000000 + +/* Action codes for LPAR watchdog */ +#define LPARWDT_RESTART 0 + +static inline int __diag288(unsigned int func, unsigned int timeout, + unsigned long action, unsigned int len) +{ + union register_pair r1 = { .even = func, .odd = timeout, }; + union register_pair r3 = { .even = action, .odd = len, }; + int rc = -EINVAL; + + asm volatile( + " diag %[r1],%[r3],0x288\n" + "0: lhi %[rc],0\n" + "1:" + EX_TABLE(0b, 1b) + : [rc] "+d" (rc) + : [r1] "d" (r1.pair), [r3] "d" (r3.pair) + : "cc", "memory"); + return rc; +} + +#endif /* _ASM_S390_DIAG288_H */ diff --git a/arch/s390/include/asm/dwarf.h b/arch/s390/include/asm/dwarf.h index 390906b8e386..e3ad6798d0cd 100644 --- a/arch/s390/include/asm/dwarf.h +++ b/arch/s390/include/asm/dwarf.h @@ -2,7 +2,7 @@ #ifndef _ASM_S390_DWARF_H #define _ASM_S390_DWARF_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define CFI_STARTPROC .cfi_startproc #define CFI_ENDPROC .cfi_endproc @@ -33,6 +33,6 @@ .cfi_sections .eh_frame, .debug_frame #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_S390_DWARF_H */ diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 8f2c23cc52b6..bb63fa4d20bb 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -158,15 +158,10 @@ enum { #define ELF_DATA ELFDATA2MSB #define ELF_ARCH EM_S390 -/* s390 specific phdr types */ -#define PT_S390_PGSTE 0x70000000 - /* * ELF register definitions.. */ -#include <linux/compat.h> - #include <asm/ptrace.h> #include <asm/syscall.h> #include <asm/user.h> @@ -174,9 +169,6 @@ enum { typedef s390_fp_regs elf_fpregset_t; typedef s390_regs elf_gregset_t; -typedef s390_fp_regs compat_elf_fpregset_t; -typedef s390_compat_regs compat_elf_gregset_t; - #include <linux/sched/mm.h> /* for task_struct */ #include <asm/mmu_context.h> @@ -186,39 +178,6 @@ typedef s390_compat_regs compat_elf_gregset_t; #define elf_check_arch(x) \ (((x)->e_machine == EM_S390 || (x)->e_machine == EM_S390_OLD) \ && (x)->e_ident[EI_CLASS] == ELF_CLASS) -#define compat_elf_check_arch(x) \ - (((x)->e_machine == EM_S390 || (x)->e_machine == EM_S390_OLD) \ - && (x)->e_ident[EI_CLASS] == ELF_CLASS) -#define compat_start_thread start_thread31 - -struct arch_elf_state { - int rc; -}; - -#define INIT_ARCH_ELF_STATE { .rc = 0 } - -#define arch_check_elf(ehdr, interp, interp_ehdr, state) (0) -#ifdef CONFIG_PGSTE -#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) \ -({ \ - struct arch_elf_state *_state = state; \ - if ((phdr)->p_type == PT_S390_PGSTE && \ - !page_table_allocate_pgste && \ - !test_thread_flag(TIF_PGSTE) && \ - !current->mm->context.alloc_pgste) { \ - set_thread_flag(TIF_PGSTE); \ - set_pt_regs_flag(task_pt_regs(current), \ - PIF_EXECVE_PGSTE_RESTART); \ - _state->rc = -EAGAIN; \ - } \ - _state->rc; \ -}) -#else -#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) \ -({ \ - (state)->rc; \ -}) -#endif /* For SVR4/S390 the function pointer to be registered with `atexit` is passed in R14. */ @@ -235,9 +194,7 @@ struct arch_elf_state { the loader. We need to make sure that it is out of the way of the program that it will "exec", and that there is sufficient room for the brk. 64-bit tasks are aligned to 4GB. */ -#define ELF_ET_DYN_BASE (is_compat_task() ? \ - (STACK_TOP / 3 * 2) : \ - (STACK_TOP / 3 * 2) & ~((1UL << 32) - 1)) +#define ELF_ET_DYN_BASE ((STACK_TOP / 3 * 2) & ~((1UL << 32) - 1)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. */ @@ -256,43 +213,21 @@ extern unsigned long elf_hwcap; extern char elf_platform[]; #define ELF_PLATFORM (elf_platform) -#ifndef CONFIG_COMPAT #define SET_PERSONALITY(ex) \ do { \ set_personality(PER_LINUX | \ (current->personality & (~PER_MASK))); \ - current->thread.sys_call_table = sys_call_table; \ -} while (0) -#else /* CONFIG_COMPAT */ -#define SET_PERSONALITY(ex) \ -do { \ - if (personality(current->personality) != PER_LINUX32) \ - set_personality(PER_LINUX | \ - (current->personality & ~PER_MASK)); \ - if ((ex).e_ident[EI_CLASS] == ELFCLASS32) { \ - set_thread_flag(TIF_31BIT); \ - current->thread.sys_call_table = \ - sys_call_table_emu; \ - } else { \ - clear_thread_flag(TIF_31BIT); \ - current->thread.sys_call_table = \ - sys_call_table; \ - } \ } while (0) -#endif /* CONFIG_COMPAT */ /* * Cache aliasing on the latest machines calls for a mapping granularity - * of 512KB for the anonymous mapping base. For 64-bit processes use a - * 512KB alignment and a randomization of up to 1GB. For 31-bit processes - * the virtual address space is limited, use no alignment and limit the - * randomization to 8MB. - * For the additional randomization of the program break use 32MB for - * 64-bit and 8MB for 31-bit. + * of 512KB for the anonymous mapping base. Use a 512KB alignment and a + * randomization of up to 1GB. + * For the additional randomization of the program break use 32MB. */ -#define BRK_RND_MASK (is_compat_task() ? 0x7ffUL : 0x1fffUL) -#define MMAP_RND_MASK (is_compat_task() ? 0x7ffUL : 0x3ff80UL) -#define MMAP_ALIGN_MASK (is_compat_task() ? 0 : 0x7fUL) +#define BRK_RND_MASK (0x1fffUL) +#define MMAP_RND_MASK (0x3ff80UL) +#define MMAP_ALIGN_MASK (0x7fUL) #define STACK_RND_MASK MMAP_RND_MASK /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h index 35555c944630..35450a485323 100644 --- a/arch/s390/include/asm/entry-common.h +++ b/arch/s390/include/asm/entry-common.h @@ -51,12 +51,14 @@ static __always_inline void arch_exit_to_user_mode(void) #define arch_exit_to_user_mode arch_exit_to_user_mode -static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, - unsigned long ti_work) +static __always_inline bool arch_in_rcu_eqs(void) { - choose_random_kstack_offset(get_tod_clock_fast()); + if (IS_ENABLED(CONFIG_KVM)) + return current->flags & PF_VCPU; + + return false; } -#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare +#define arch_in_rcu_eqs arch_in_rcu_eqs #endif diff --git a/arch/s390/include/asm/extmem.h b/arch/s390/include/asm/extmem.h index e0a06060afdd..225ee89c3f5e 100644 --- a/arch/s390/include/asm/extmem.h +++ b/arch/s390/include/asm/extmem.h @@ -6,7 +6,7 @@ #ifndef _ASM_S390X_DCSS_H #define _ASM_S390X_DCSS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * DCSS segment is defined as a contiguous range of pages using DEFSEG command. diff --git a/arch/s390/include/asm/fpu-insn-asm.h b/arch/s390/include/asm/fpu-insn-asm.h index d296322be4bc..cc0468fdf2d0 100644 --- a/arch/s390/include/asm/fpu-insn-asm.h +++ b/arch/s390/include/asm/fpu-insn-asm.h @@ -16,7 +16,7 @@ #error only <asm/fpu-insn.h> can be included directly #endif -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* Macros to generate vector instruction byte code */ @@ -750,5 +750,5 @@ MRXBOPC 0, 0x77, v1, v2, v3 .endm -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_S390_FPU_INSN_ASM_H */ diff --git a/arch/s390/include/asm/fpu-insn.h b/arch/s390/include/asm/fpu-insn.h index f668bffd6dd3..96727f3bd0dc 100644 --- a/arch/s390/include/asm/fpu-insn.h +++ b/arch/s390/include/asm/fpu-insn.h @@ -9,9 +9,10 @@ #include <asm/fpu-insn-asm.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/instrumented.h> +#include <linux/kmsan.h> #include <asm/asm-extable.h> asm(".include \"asm/fpu-insn-asm.h\"\n"); @@ -38,7 +39,7 @@ asm(".include \"asm/fpu-insn-asm.h\"\n"); static __always_inline void fpu_cefbr(u8 f1, s32 val) { - asm volatile("cefbr %[f1],%[val]\n" + asm volatile("cefbr %[f1],%[val]" : : [f1] "I" (f1), [val] "d" (val) : "memory"); @@ -48,7 +49,7 @@ static __always_inline unsigned long fpu_cgebr(u8 f2, u8 mode) { unsigned long val; - asm volatile("cgebr %[val],%[mode],%[f2]\n" + asm volatile("cgebr %[val],%[mode],%[f2]" : [val] "=d" (val) : [f2] "I" (f2), [mode] "I" (mode) : "memory"); @@ -57,7 +58,7 @@ static __always_inline unsigned long fpu_cgebr(u8 f2, u8 mode) static __always_inline void fpu_debr(u8 f1, u8 f2) { - asm volatile("debr %[f1],%[f2]\n" + asm volatile("debr %[f1],%[f2]" : : [f1] "I" (f1), [f2] "I" (f2) : "memory"); @@ -66,7 +67,7 @@ static __always_inline void fpu_debr(u8 f1, u8 f2) static __always_inline void fpu_ld(unsigned short fpr, freg_t *reg) { instrument_read(reg, sizeof(*reg)); - asm volatile("ld %[fpr],%[reg]\n" + asm volatile("ld %[fpr],%[reg]" : : [fpr] "I" (fpr), [reg] "Q" (reg->ui) : "memory"); @@ -74,7 +75,7 @@ static __always_inline void fpu_ld(unsigned short fpr, freg_t *reg) static __always_inline void fpu_ldgr(u8 f1, u32 val) { - asm volatile("ldgr %[f1],%[val]\n" + asm volatile("ldgr %[f1],%[val]" : : [f1] "I" (f1), [val] "d" (val) : "memory"); @@ -113,7 +114,7 @@ static inline void fpu_lfpc_safe(unsigned int *fpc) static __always_inline void fpu_std(unsigned short fpr, freg_t *reg) { instrument_write(reg, sizeof(*reg)); - asm volatile("std %[fpr],%[reg]\n" + asm volatile("std %[fpr],%[reg]" : [reg] "=Q" (reg->ui) : [fpr] "I" (fpr) : "memory"); @@ -181,7 +182,7 @@ static __always_inline void fpu_vgfmg(u8 v1, u8 v2, u8 v3) static __always_inline void fpu_vl(u8 v1, const void *vxr) { instrument_read(vxr, sizeof(__vector128)); - asm volatile("VL %[v1],%O[vxr],,%R[vxr]\n" + asm volatile("VL %[v1],%O[vxr],,%R[vxr]" : : [vxr] "Q" (*(__vector128 *)vxr), [v1] "I" (v1) @@ -195,7 +196,7 @@ static __always_inline void fpu_vl(u8 v1, const void *vxr) instrument_read(vxr, sizeof(__vector128)); asm volatile( " la 1,%[vxr]\n" - " VL %[v1],0,,1\n" + " VL %[v1],0,,1" : : [vxr] "R" (*(__vector128 *)vxr), [v1] "I" (v1) @@ -239,7 +240,7 @@ static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) size = min(index + 1, sizeof(__vector128)); instrument_read(vxr, size); - asm volatile("VLL %[v1],%[index],%O[vxr],%R[vxr]\n" + asm volatile("VLL %[v1],%[index],%O[vxr],%R[vxr]" : : [vxr] "Q" (*(u8 *)vxr), [index] "d" (index), @@ -257,7 +258,7 @@ static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) instrument_read(vxr, size); asm volatile( " la 1,%[vxr]\n" - " VLL %[v1],%[index],0,1\n" + " VLL %[v1],%[index],0,1" : : [vxr] "R" (*(u8 *)vxr), [index] "d" (index), @@ -277,7 +278,7 @@ static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) } *_v = (void *)(_vxrs); \ \ instrument_read(_v, size); \ - asm volatile("VLM %[v1],%[v3],%O[vxrs],%R[vxrs]\n" \ + asm volatile("VLM %[v1],%[v3],%O[vxrs],%R[vxrs]" \ : \ : [vxrs] "Q" (*_v), \ [v1] "I" (_v1), [v3] "I" (_v3) \ @@ -297,7 +298,7 @@ static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) instrument_read(_v, size); \ asm volatile( \ " la 1,%[vxrs]\n" \ - " VLM %[v1],%[v3],0,1\n" \ + " VLM %[v1],%[v3],0,1" \ : \ : [vxrs] "R" (*_v), \ [v1] "I" (_v1), [v3] "I" (_v3) \ @@ -360,7 +361,7 @@ static __always_inline void fpu_vsrlb(u8 v1, u8 v2, u8 v3) static __always_inline void fpu_vst(u8 v1, const void *vxr) { instrument_write(vxr, sizeof(__vector128)); - asm volatile("VST %[v1],%O[vxr],,%R[vxr]\n" + asm volatile("VST %[v1],%O[vxr],,%R[vxr]" : [vxr] "=Q" (*(__vector128 *)vxr) : [v1] "I" (v1) : "memory"); @@ -373,7 +374,7 @@ static __always_inline void fpu_vst(u8 v1, const void *vxr) instrument_write(vxr, sizeof(__vector128)); asm volatile( " la 1,%[vxr]\n" - " VST %[v1],0,,1\n" + " VST %[v1],0,,1" : [vxr] "=R" (*(__vector128 *)vxr) : [v1] "I" (v1) : "memory", "1"); @@ -389,10 +390,11 @@ static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) size = min(index + 1, sizeof(__vector128)); instrument_write(vxr, size); - asm volatile("VSTL %[v1],%[index],%O[vxr],%R[vxr]\n" + asm volatile("VSTL %[v1],%[index],%O[vxr],%R[vxr]" : [vxr] "=Q" (*(u8 *)vxr) : [index] "d" (index), [v1] "I" (v1) : "memory"); + kmsan_unpoison_memory(vxr, size); } #else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ @@ -405,10 +407,11 @@ static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) instrument_write(vxr, size); asm volatile( " la 1,%[vxr]\n" - " VSTL %[v1],%[index],0,1\n" + " VSTL %[v1],%[index],0,1" : [vxr] "=R" (*(u8 *)vxr) : [index] "d" (index), [v1] "I" (v1) : "memory", "1"); + kmsan_unpoison_memory(vxr, size); } #endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ @@ -423,7 +426,7 @@ static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) } *_v = (void *)(_vxrs); \ \ instrument_write(_v, size); \ - asm volatile("VSTM %[v1],%[v3],%O[vxrs],%R[vxrs]\n" \ + asm volatile("VSTM %[v1],%[v3],%O[vxrs],%R[vxrs]" \ : [vxrs] "=Q" (*_v) \ : [v1] "I" (_v1), [v3] "I" (_v3) \ : "memory"); \ @@ -442,7 +445,7 @@ static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) instrument_write(_v, size); \ asm volatile( \ " la 1,%[vxrs]\n" \ - " VSTM %[v1],%[v3],0,1\n" \ + " VSTM %[v1],%[v3],0,1" \ : [vxrs] "=R" (*_v) \ : [v1] "I" (_v1), [v3] "I" (_v3) \ : "memory", "1"); \ @@ -475,5 +478,5 @@ static __always_inline void fpu_vzero(u8 v) : "memory"); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_S390_FPU_INSN_H */ diff --git a/arch/s390/include/asm/fpu.h b/arch/s390/include/asm/fpu.h index c84cb33913e2..960c6c67ad6c 100644 --- a/arch/s390/include/asm/fpu.h +++ b/arch/s390/include/asm/fpu.h @@ -44,6 +44,7 @@ #ifndef _ASM_S390_FPU_H #define _ASM_S390_FPU_H +#include <linux/cpufeature.h> #include <linux/processor.h> #include <linux/preempt.h> #include <linux/string.h> @@ -51,12 +52,6 @@ #include <asm/sigcontext.h> #include <asm/fpu-types.h> #include <asm/fpu-insn.h> -#include <asm/facility.h> - -static inline bool cpu_has_vx(void) -{ - return likely(test_facility(129)); -} enum { KERNEL_FPC_BIT = 0, diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 185331e91f83..692c484ec163 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -5,7 +5,7 @@ #define ARCH_SUPPORTS_FTRACE_OPS 1 #define MCOUNT_INSN_SIZE 6 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/stacktrace.h> static __always_inline unsigned long return_address(unsigned int n) @@ -105,28 +105,11 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsi } #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ -/* - * Even though the system call numbers are identical for s390/s390x a - * different system call table is used for compat tasks. This may lead - * to e.g. incorrect or missing trace event sysfs files. - * Therefore simply do not trace compat system calls at all. - * See kernel/trace/trace_syscalls.c. - */ -#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS -static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) -{ - return is_compat_task(); -} - #define ARCH_HAS_SYSCALL_MATCH_SYM_NAME static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) { - /* - * Skip __s390_ and __s390x_ prefix - due to compat wrappers - * and aliasing some symbols of 64 bit system call functions - * may get the __s390_ prefix instead of the __s390x_ prefix. - */ + /* Skip the __s390x_ prefix. */ return !strcmp(sym + 7, name) || !strcmp(sym + 8, name); } @@ -134,7 +117,7 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); #define ftrace_graph_func ftrace_graph_func -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index f5781794356b..942f21c39697 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -13,9 +13,11 @@ static uaccess_kmsan_or_inline int \ __futex_atomic_##name(int oparg, int *old, u32 __user *uaddr) \ { \ + bool sacf_flag; \ int rc, new; \ \ instrument_copy_from_user_before(old, uaddr, sizeof(*old)); \ + sacf_flag = enable_sacf_uaccess(); \ asm_inline volatile( \ " sacf 256\n" \ "0: l %[old],%[uaddr]\n" \ @@ -32,6 +34,7 @@ __futex_atomic_##name(int oparg, int *old, u32 __user *uaddr) \ [new] "=&d" (new), [uaddr] "+Q" (*uaddr) \ : [oparg] "d" (oparg) \ : "cc"); \ + disable_sacf_uaccess(sacf_flag); \ if (!rc) \ instrument_copy_from_user_after(old, uaddr, sizeof(*old), 0); \ return rc; \ @@ -75,9 +78,11 @@ int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) static uaccess_kmsan_or_inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { + bool sacf_flag; int rc; instrument_copy_from_user_before(uval, uaddr, sizeof(*uval)); + sacf_flag = enable_sacf_uaccess(); asm_inline volatile( " sacf 256\n" "0: cs %[old],%[new],%[uaddr]\n" @@ -88,6 +93,7 @@ int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 : [rc] "=d" (rc), [old] "+d" (oldval), [uaddr] "+Q" (*uaddr) : [new] "d" (newval) : "cc", "memory"); + disable_sacf_uaccess(sacf_flag); *uval = oldval; instrument_copy_from_user_after(uval, uaddr, sizeof(*uval), 0); return rc; diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h deleted file mode 100644 index 4e73ef46d4b2..000000000000 --- a/arch/s390/include/asm/gmap.h +++ /dev/null @@ -1,177 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * KVM guest address space mapping code - * - * Copyright IBM Corp. 2007, 2016 - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - */ - -#ifndef _ASM_S390_GMAP_H -#define _ASM_S390_GMAP_H - -#include <linux/radix-tree.h> -#include <linux/refcount.h> - -/* Generic bits for GMAP notification on DAT table entry changes. */ -#define GMAP_NOTIFY_SHADOW 0x2 -#define GMAP_NOTIFY_MPROT 0x1 - -/* Status bits only for huge segment entries */ -#define _SEGMENT_ENTRY_GMAP_IN 0x0800 /* invalidation notify bit */ -#define _SEGMENT_ENTRY_GMAP_UC 0x0002 /* dirty (migration) */ - -/** - * struct gmap_struct - guest address space - * @list: list head for the mm->context gmap list - * @mm: pointer to the parent mm_struct - * @guest_to_host: radix tree with guest to host address translation - * @host_to_guest: radix tree with pointer to segment table entries - * @guest_table_lock: spinlock to protect all entries in the guest page table - * @ref_count: reference counter for the gmap structure - * @table: pointer to the page directory - * @asce: address space control element for gmap page table - * @pfault_enabled: defines if pfaults are applicable for the guest - * @guest_handle: protected virtual machine handle for the ultravisor - * @host_to_rmap: radix tree with gmap_rmap lists - * @children: list of shadow gmap structures - * @shadow_lock: spinlock to protect the shadow gmap list - * @parent: pointer to the parent gmap for shadow guest address spaces - * @orig_asce: ASCE for which the shadow page table has been created - * @edat_level: edat level to be used for the shadow translation - * @removed: flag to indicate if a shadow guest address space has been removed - * @initialized: flag to indicate if a shadow guest address space can be used - */ -struct gmap { - struct list_head list; - struct mm_struct *mm; - struct radix_tree_root guest_to_host; - struct radix_tree_root host_to_guest; - spinlock_t guest_table_lock; - refcount_t ref_count; - unsigned long *table; - unsigned long asce; - unsigned long asce_end; - void *private; - bool pfault_enabled; - /* only set for protected virtual machines */ - unsigned long guest_handle; - /* Additional data for shadow guest address spaces */ - struct radix_tree_root host_to_rmap; - struct list_head children; - spinlock_t shadow_lock; - struct gmap *parent; - unsigned long orig_asce; - int edat_level; - bool removed; - bool initialized; -}; - -/** - * struct gmap_rmap - reverse mapping for shadow page table entries - * @next: pointer to next rmap in the list - * @raddr: virtual rmap address in the shadow guest address space - */ -struct gmap_rmap { - struct gmap_rmap *next; - unsigned long raddr; -}; - -#define gmap_for_each_rmap(pos, head) \ - for (pos = (head); pos; pos = pos->next) - -#define gmap_for_each_rmap_safe(pos, n, head) \ - for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n) - -/** - * struct gmap_notifier - notify function block for page invalidation - * @notifier_call: address of callback function - */ -struct gmap_notifier { - struct list_head list; - struct rcu_head rcu; - void (*notifier_call)(struct gmap *gmap, unsigned long start, - unsigned long end); -}; - -static inline int gmap_is_shadow(struct gmap *gmap) -{ - return !!gmap->parent; -} - -struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); -void gmap_remove(struct gmap *gmap); -struct gmap *gmap_get(struct gmap *gmap); -void gmap_put(struct gmap *gmap); -void gmap_free(struct gmap *gmap); -struct gmap *gmap_alloc(unsigned long limit); - -int gmap_map_segment(struct gmap *gmap, unsigned long from, - unsigned long to, unsigned long len); -int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); -unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); -int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); -void gmap_discard(struct gmap *, unsigned long from, unsigned long to); -void __gmap_zap(struct gmap *, unsigned long gaddr); -void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); - -int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); - -void gmap_unshadow(struct gmap *sg); -int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, - int fake); -int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, - int fake); -int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, - int fake); -int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, - int fake); -int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); - -void gmap_register_pte_notifier(struct gmap_notifier *); -void gmap_unregister_pte_notifier(struct gmap_notifier *); - -int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits); - -void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], - unsigned long gaddr, unsigned long vmaddr); -int s390_disable_cow_sharing(void); -int s390_replace_asce(struct gmap *gmap); -void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); -int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, - unsigned long end, bool interruptible); -int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split); -unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level); - -/** - * s390_uv_destroy_range - Destroy a range of pages in the given mm. - * @mm: the mm on which to operate on - * @start: the start of the range - * @end: the end of the range - * - * This function will call cond_sched, so it should not generate stalls, but - * it will otherwise only return when it completed. - */ -static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - (void)__s390_uv_destroy_range(mm, start, end, false); -} - -/** - * s390_uv_destroy_range_interruptible - Destroy a range of pages in the - * given mm, but stop when a fatal signal is received. - * @mm: the mm on which to operate on - * @start: the start of the range - * @end: the end of the range - * - * This function will call cond_sched, so it should not generate stalls. If - * a fatal signal is received, it will return with -EINTR immediately, - * without finishing destroying the whole range. Upon successful - * completion, 0 is returned. - */ -static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - return __s390_uv_destroy_range(mm, start, end, true); -} -#endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h new file mode 100644 index 000000000000..2d3ae421077e --- /dev/null +++ b/arch/s390/include/asm/gmap_helpers.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Helper functions for KVM guest address space mapping code + * + * Copyright IBM Corp. 2025 + */ + +#ifndef _ASM_S390_GMAP_HELPERS_H +#define _ASM_S390_GMAP_HELPERS_H + +void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr); +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end); +int gmap_helper_disable_cow_sharing(void); +void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr); + +#endif /* _ASM_S390_GMAP_HELPERS_H */ diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 7c52acaf9f82..6983e52eaf81 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -9,12 +9,13 @@ #ifndef _ASM_S390_HUGETLB_H #define _ASM_S390_HUGETLB_H +#include <linux/cpufeature.h> #include <linux/pgtable.h> #include <linux/swap.h> #include <linux/swapops.h> #include <asm/page.h> -#define hugepages_supported() (MACHINE_HAS_EDAT1) +#define hugepages_supported() cpu_has_edat1() #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, @@ -25,14 +26,16 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); -static inline void arch_clear_hugetlb_flags(struct folio *folio) +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned long sz) { - clear_bit(PG_arch_1, &folio->flags); + return __huge_ptep_get_and_clear(mm, addr, ptep); } -#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags #define __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, @@ -48,7 +51,7 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); + return __huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS @@ -59,7 +62,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(huge_ptep_get(vma->vm_mm, addr, ptep), pte); if (changed) { - huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + __huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } return changed; @@ -69,7 +72,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep); + pte_t pte = __huge_ptep_get_and_clear(mm, addr, ptep); __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h index ac68c657b28c..06e1ec2afd5a 100644 --- a/arch/s390/include/asm/idals.h +++ b/arch/s390/include/asm/idals.h @@ -94,7 +94,7 @@ static inline int set_normalized_cda(struct ccw1 *ccw, void *vaddr) return -EINVAL; nridaws = idal_nr_words(vaddr, ccw->count); if (nridaws > 0) { - idal = kcalloc(nridaws, sizeof(*idal), GFP_ATOMIC | GFP_DMA); + idal = kzalloc_objs(*idal, nridaws, GFP_ATOMIC | GFP_DMA); if (!idal) return -ENOMEM; idal_create_words(idal, vaddr, ccw->count); @@ -137,7 +137,7 @@ static inline struct idal_buffer *idal_buffer_alloc(size_t size, int page_order) nr_ptrs = (size + IDA_BLOCK_SIZE - 1) >> IDA_SIZE_SHIFT; nr_chunks = (PAGE_SIZE << page_order) >> IDA_SIZE_SHIFT; - ib = kmalloc(struct_size(ib, data, nr_ptrs), GFP_DMA | GFP_KERNEL); + ib = kmalloc_flex(*ib, data, nr_ptrs, GFP_DMA | GFP_KERNEL); if (!ib) return ERR_PTR(-ENOMEM); ib->size = size; @@ -181,6 +181,82 @@ static inline void idal_buffer_free(struct idal_buffer *ib) } /* + * Allocate an array of IDAL buffers to cover a total data size of @size. The + * resulting array is null-terminated. + * + * The amount of individual IDAL buffers is determined based on @size. + * Each IDAL buffer can have a maximum size of @CCW_MAX_BYTE_COUNT. + */ +static inline struct idal_buffer **idal_buffer_array_alloc(size_t size, int page_order) +{ + struct idal_buffer **ibs; + size_t ib_size; /* Size of a single idal buffer */ + int count; /* Amount of individual idal buffers */ + int i; + + count = (size + CCW_MAX_BYTE_COUNT - 1) / CCW_MAX_BYTE_COUNT; + ibs = kmalloc_objs(*ibs, count + 1); + for (i = 0; i < count; i++) { + /* Determine size for the current idal buffer */ + ib_size = min(size, CCW_MAX_BYTE_COUNT); + size -= ib_size; + ibs[i] = idal_buffer_alloc(ib_size, page_order); + if (IS_ERR(ibs[i])) { + while (i--) + idal_buffer_free(ibs[i]); + kfree(ibs); + ibs = NULL; + return ERR_PTR(-ENOMEM); + } + } + ibs[i] = NULL; + return ibs; +} + +/* + * Free array of IDAL buffers + */ +static inline void idal_buffer_array_free(struct idal_buffer ***ibs) +{ + struct idal_buffer **p; + + if (!ibs || !*ibs) + return; + for (p = *ibs; *p; p++) + idal_buffer_free(*p); + kfree(*ibs); + *ibs = NULL; +} + +/* + * Determine size of IDAL buffer array + */ +static inline int idal_buffer_array_size(struct idal_buffer **ibs) +{ + int size = 0; + + while (ibs && *ibs) { + size++; + ibs++; + } + return size; +} + +/* + * Determine total data size covered by IDAL buffer array + */ +static inline size_t idal_buffer_array_datasize(struct idal_buffer **ibs) +{ + size_t size = 0; + + while (ibs && *ibs) { + size += (*ibs)->size; + ibs++; + } + return size; +} + +/* * Test if a idal list is really needed. */ static inline bool __idal_buffer_is_needed(struct idal_buffer *ib) diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h index 09f763b9eb40..32536ee34aa0 100644 --- a/arch/s390/include/asm/idle.h +++ b/arch/s390/include/asm/idle.h @@ -19,9 +19,9 @@ struct s390_idle_data { unsigned long mt_cycles_enter[8]; }; +DECLARE_PER_CPU(struct s390_idle_data, s390_idle); + extern struct device_attribute dev_attr_idle_count; extern struct device_attribute dev_attr_idle_time_us; -void psw_idle(struct s390_idle_data *data, unsigned long psw_mask); - #endif /* _S390_IDLE_H */ diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index fc9933a743d6..faddb9aef3b8 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -33,9 +33,7 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL) #define ioremap_wc(addr, size) \ - ioremap_prot((addr), (size), pgprot_val(pgprot_writecombine(PAGE_KERNEL))) -#define ioremap_wt(addr, size) \ - ioremap_prot((addr), (size), pgprot_val(pgprot_writethrough(PAGE_KERNEL))) + ioremap_prot((addr), (size), pgprot_writecombine(PAGE_KERNEL)) static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index d9e705f4a697..697497e7d13e 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -25,7 +25,7 @@ #define EXT_IRQ_CP_SERVICE 0x2603 #define EXT_IRQ_IUCV 0x4000 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/hardirq.h> #include <linux/percpu.h> @@ -54,7 +54,6 @@ enum interruption_class { IRQIO_C70, IRQIO_TAP, IRQIO_VMR, - IRQIO_LCS, IRQIO_CTC, IRQIO_ADM, IRQIO_CSC, @@ -121,6 +120,6 @@ void irq_subclass_unregister(enum irq_subclass subclass); #define irq_canonicalize(irq) (irq) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_IRQ_H */ diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index bf78cf381dfc..d9cbc18f6b2e 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -4,7 +4,7 @@ #define HAVE_JUMP_LABEL_BATCH -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <linux/stringify.h> @@ -51,5 +51,5 @@ label: return true; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/s390/include/asm/kfence.h b/arch/s390/include/asm/kfence.h index e47fd8cbe701..e95e35eb8a3f 100644 --- a/arch/s390/include/asm/kfence.h +++ b/arch/s390/include/asm/kfence.h @@ -12,27 +12,16 @@ void __kernel_map_pages(struct page *page, int numpages, int enable); static __always_inline bool arch_kfence_init_pool(void) { - return true; -} - -#define arch_kfence_test_address(addr) ((addr) & PAGE_MASK) - -/* - * Do not split kfence pool to 4k mapping with arch_kfence_init_pool(), - * but earlier where page table allocations still happen with memblock. - * Reason is that arch_kfence_init_pool() gets called when the system - * is still in a limbo state - disabling and enabling bottom halves is - * not yet allowed, but that is what our page_table_alloc() would do. - */ -static __always_inline void kfence_split_mapping(void) -{ #ifdef CONFIG_KFENCE unsigned long pool_pages = KFENCE_POOL_SIZE >> PAGE_SHIFT; set_memory_4k((unsigned long)__kfence_pool, pool_pages); #endif + return true; } +#define arch_kfence_test_address(addr) ((addr) & PAGE_MASK) + static inline bool kfence_protect_page(unsigned long addr, bool protect) { __kernel_map_pages(virt_to_page((void *)addr), 1, !protect); diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 9a367866cab0..8a4f4a39f7a2 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -20,14 +20,14 @@ #include <linux/module.h> #include <linux/pci.h> #include <linux/mmu_notifier.h> +#include <asm/kvm_host_types.h> #include <asm/debug.h> #include <asm/cpu.h> #include <asm/fpu.h> #include <asm/isc.h> #include <asm/guarded_storage.h> -#define KVM_S390_BSCA_CPU_SLOTS 64 -#define KVM_S390_ESCA_CPU_SLOTS 248 +#define KVM_HAVE_MMU_RWLOCK #define KVM_MAX_VCPUS 255 #define KVM_INTERNAL_MEM_SLOTS 1 @@ -51,342 +51,6 @@ #define KVM_REQ_REFRESH_GUEST_PREFIX \ KVM_ARCH_REQ_FLAGS(6, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define SIGP_CTRL_C 0x80 -#define SIGP_CTRL_SCN_MASK 0x3f - -union bsca_sigp_ctrl { - __u8 value; - struct { - __u8 c : 1; - __u8 r : 1; - __u8 scn : 6; - }; -}; - -union esca_sigp_ctrl { - __u16 value; - struct { - __u8 c : 1; - __u8 reserved: 7; - __u8 scn; - }; -}; - -struct esca_entry { - union esca_sigp_ctrl sigp_ctrl; - __u16 reserved1[3]; - __u64 sda; - __u64 reserved2[6]; -}; - -struct bsca_entry { - __u8 reserved0; - union bsca_sigp_ctrl sigp_ctrl; - __u16 reserved[3]; - __u64 sda; - __u64 reserved2[2]; -}; - -union ipte_control { - unsigned long val; - struct { - unsigned long k : 1; - unsigned long kh : 31; - unsigned long kg : 32; - }; -}; - -/* - * Utility is defined as two bytes but having it four bytes wide - * generates more efficient code. Since the following bytes are - * reserved this makes no functional difference. - */ -union sca_utility { - __u32 val; - struct { - __u32 mtcr : 1; - __u32 : 31; - }; -}; - -struct bsca_block { - union ipte_control ipte_control; - __u64 reserved[5]; - __u64 mcn; - union sca_utility utility; - __u8 reserved2[4]; - struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; -}; - -struct esca_block { - union ipte_control ipte_control; - __u64 reserved1[6]; - union sca_utility utility; - __u8 reserved2[4]; - __u64 mcn[4]; - __u64 reserved3[20]; - struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; -}; - -/* - * This struct is used to store some machine check info from lowcore - * for machine checks that happen while the guest is running. - * This info in host's lowcore might be overwritten by a second machine - * check from host when host is in the machine check's high-level handling. - * The size is 24 bytes. - */ -struct mcck_volatile_info { - __u64 mcic; - __u64 failing_storage_address; - __u32 ext_damage_code; - __u32 reserved; -}; - -#define CR0_INITIAL_MASK (CR0_UNUSED_56 | CR0_INTERRUPT_KEY_SUBMASK | \ - CR0_MEASUREMENT_ALERT_SUBMASK) -#define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \ - CR14_EXTERNAL_DAMAGE_SUBMASK) - -#define SIDAD_SIZE_MASK 0xff -#define sida_addr(sie_block) phys_to_virt((sie_block)->sidad & PAGE_MASK) -#define sida_size(sie_block) \ - ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE) - -#define CPUSTAT_STOPPED 0x80000000 -#define CPUSTAT_WAIT 0x10000000 -#define CPUSTAT_ECALL_PEND 0x08000000 -#define CPUSTAT_STOP_INT 0x04000000 -#define CPUSTAT_IO_INT 0x02000000 -#define CPUSTAT_EXT_INT 0x01000000 -#define CPUSTAT_RUNNING 0x00800000 -#define CPUSTAT_RETAINED 0x00400000 -#define CPUSTAT_TIMING_SUB 0x00020000 -#define CPUSTAT_SIE_SUB 0x00010000 -#define CPUSTAT_RRF 0x00008000 -#define CPUSTAT_SLSV 0x00004000 -#define CPUSTAT_SLSR 0x00002000 -#define CPUSTAT_ZARCH 0x00000800 -#define CPUSTAT_MCDS 0x00000100 -#define CPUSTAT_KSS 0x00000200 -#define CPUSTAT_SM 0x00000080 -#define CPUSTAT_IBS 0x00000040 -#define CPUSTAT_GED2 0x00000010 -#define CPUSTAT_G 0x00000008 -#define CPUSTAT_GED 0x00000004 -#define CPUSTAT_J 0x00000002 -#define CPUSTAT_P 0x00000001 - -struct kvm_s390_sie_block { - atomic_t cpuflags; /* 0x0000 */ - __u32 : 1; /* 0x0004 */ - __u32 prefix : 18; - __u32 : 1; - __u32 ibc : 12; - __u8 reserved08[4]; /* 0x0008 */ -#define PROG_IN_SIE (1<<0) - __u32 prog0c; /* 0x000c */ - union { - __u8 reserved10[16]; /* 0x0010 */ - struct { - __u64 pv_handle_cpu; - __u64 pv_handle_config; - }; - }; -#define PROG_BLOCK_SIE (1<<0) -#define PROG_REQUEST (1<<1) - atomic_t prog20; /* 0x0020 */ - __u8 reserved24[4]; /* 0x0024 */ - __u64 cputm; /* 0x0028 */ - __u64 ckc; /* 0x0030 */ - __u64 epoch; /* 0x0038 */ - __u32 svcc; /* 0x0040 */ -#define LCTL_CR0 0x8000 -#define LCTL_CR6 0x0200 -#define LCTL_CR9 0x0040 -#define LCTL_CR10 0x0020 -#define LCTL_CR11 0x0010 -#define LCTL_CR14 0x0002 - __u16 lctl; /* 0x0044 */ - __s16 icpua; /* 0x0046 */ -#define ICTL_OPEREXC 0x80000000 -#define ICTL_PINT 0x20000000 -#define ICTL_LPSW 0x00400000 -#define ICTL_STCTL 0x00040000 -#define ICTL_ISKE 0x00004000 -#define ICTL_SSKE 0x00002000 -#define ICTL_RRBE 0x00001000 -#define ICTL_TPROT 0x00000200 - __u32 ictl; /* 0x0048 */ -#define ECA_CEI 0x80000000 -#define ECA_IB 0x40000000 -#define ECA_SIGPI 0x10000000 -#define ECA_MVPGI 0x01000000 -#define ECA_AIV 0x00200000 -#define ECA_VX 0x00020000 -#define ECA_PROTEXCI 0x00002000 -#define ECA_APIE 0x00000008 -#define ECA_SII 0x00000001 - __u32 eca; /* 0x004c */ -#define ICPT_INST 0x04 -#define ICPT_PROGI 0x08 -#define ICPT_INSTPROGI 0x0C -#define ICPT_EXTREQ 0x10 -#define ICPT_EXTINT 0x14 -#define ICPT_IOREQ 0x18 -#define ICPT_WAIT 0x1c -#define ICPT_VALIDITY 0x20 -#define ICPT_STOP 0x28 -#define ICPT_OPEREXC 0x2C -#define ICPT_PARTEXEC 0x38 -#define ICPT_IOINST 0x40 -#define ICPT_KSS 0x5c -#define ICPT_MCHKREQ 0x60 -#define ICPT_INT_ENABLE 0x64 -#define ICPT_PV_INSTR 0x68 -#define ICPT_PV_NOTIFY 0x6c -#define ICPT_PV_PREF 0x70 - __u8 icptcode; /* 0x0050 */ - __u8 icptstatus; /* 0x0051 */ - __u16 ihcpu; /* 0x0052 */ - __u8 reserved54; /* 0x0054 */ -#define IICTL_CODE_NONE 0x00 -#define IICTL_CODE_MCHK 0x01 -#define IICTL_CODE_EXT 0x02 -#define IICTL_CODE_IO 0x03 -#define IICTL_CODE_RESTART 0x04 -#define IICTL_CODE_SPECIFICATION 0x10 -#define IICTL_CODE_OPERAND 0x11 - __u8 iictl; /* 0x0055 */ - __u16 ipa; /* 0x0056 */ - __u32 ipb; /* 0x0058 */ - __u32 scaoh; /* 0x005c */ -#define FPF_BPBC 0x20 - __u8 fpf; /* 0x0060 */ -#define ECB_GS 0x40 -#define ECB_TE 0x10 -#define ECB_SPECI 0x08 -#define ECB_SRSI 0x04 -#define ECB_HOSTPROTINT 0x02 -#define ECB_PTF 0x01 - __u8 ecb; /* 0x0061 */ -#define ECB2_CMMA 0x80 -#define ECB2_IEP 0x20 -#define ECB2_PFMFI 0x08 -#define ECB2_ESCA 0x04 -#define ECB2_ZPCI_LSI 0x02 - __u8 ecb2; /* 0x0062 */ -#define ECB3_AISI 0x20 -#define ECB3_AISII 0x10 -#define ECB3_DEA 0x08 -#define ECB3_AES 0x04 -#define ECB3_RI 0x01 - __u8 ecb3; /* 0x0063 */ -#define ESCA_SCAOL_MASK ~0x3fU - __u32 scaol; /* 0x0064 */ - __u8 sdf; /* 0x0068 */ - __u8 epdx; /* 0x0069 */ - __u8 cpnc; /* 0x006a */ - __u8 reserved6b; /* 0x006b */ - __u32 todpr; /* 0x006c */ -#define GISA_FORMAT1 0x00000001 - __u32 gd; /* 0x0070 */ - __u8 reserved74[12]; /* 0x0074 */ - __u64 mso; /* 0x0080 */ - __u64 msl; /* 0x0088 */ - psw_t gpsw; /* 0x0090 */ - __u64 gg14; /* 0x00a0 */ - __u64 gg15; /* 0x00a8 */ - __u8 reservedb0[8]; /* 0x00b0 */ -#define HPID_KVM 0x4 -#define HPID_VSIE 0x5 - __u8 hpid; /* 0x00b8 */ - __u8 reservedb9[7]; /* 0x00b9 */ - union { - struct { - __u32 eiparams; /* 0x00c0 */ - __u16 extcpuaddr; /* 0x00c4 */ - __u16 eic; /* 0x00c6 */ - }; - __u64 mcic; /* 0x00c0 */ - } __packed; - __u32 reservedc8; /* 0x00c8 */ - union { - struct { - __u16 pgmilc; /* 0x00cc */ - __u16 iprcc; /* 0x00ce */ - }; - __u32 edc; /* 0x00cc */ - } __packed; - union { - struct { - __u32 dxc; /* 0x00d0 */ - __u16 mcn; /* 0x00d4 */ - __u8 perc; /* 0x00d6 */ - __u8 peratmid; /* 0x00d7 */ - }; - __u64 faddr; /* 0x00d0 */ - } __packed; - __u64 peraddr; /* 0x00d8 */ - __u8 eai; /* 0x00e0 */ - __u8 peraid; /* 0x00e1 */ - __u8 oai; /* 0x00e2 */ - __u8 armid; /* 0x00e3 */ - __u8 reservede4[4]; /* 0x00e4 */ - union { - __u64 tecmc; /* 0x00e8 */ - struct { - __u16 subchannel_id; /* 0x00e8 */ - __u16 subchannel_nr; /* 0x00ea */ - __u32 io_int_parm; /* 0x00ec */ - __u32 io_int_word; /* 0x00f0 */ - }; - } __packed; - __u8 reservedf4[8]; /* 0x00f4 */ -#define CRYCB_FORMAT_MASK 0x00000003 -#define CRYCB_FORMAT0 0x00000000 -#define CRYCB_FORMAT1 0x00000001 -#define CRYCB_FORMAT2 0x00000003 - __u32 crycbd; /* 0x00fc */ - __u64 gcr[16]; /* 0x0100 */ - union { - __u64 gbea; /* 0x0180 */ - __u64 sidad; - }; - __u8 reserved188[8]; /* 0x0188 */ - __u64 sdnxo; /* 0x0190 */ - __u8 reserved198[8]; /* 0x0198 */ - __u32 fac; /* 0x01a0 */ - __u8 reserved1a4[20]; /* 0x01a4 */ - __u64 cbrlo; /* 0x01b8 */ - __u8 reserved1c0[8]; /* 0x01c0 */ -#define ECD_HOSTREGMGMT 0x20000000 -#define ECD_MEF 0x08000000 -#define ECD_ETOKENF 0x02000000 -#define ECD_ECC 0x00200000 -#define ECD_HMAC 0x00004000 - __u32 ecd; /* 0x01c8 */ - __u8 reserved1cc[18]; /* 0x01cc */ - __u64 pp; /* 0x01de */ - __u8 reserved1e6[2]; /* 0x01e6 */ - __u64 itdba; /* 0x01e8 */ - __u64 riccbd; /* 0x01f0 */ - __u64 gvrd; /* 0x01f8 */ -} __packed __aligned(512); - -struct kvm_s390_itdb { - __u8 data[256]; -}; - -struct sie_page { - struct kvm_s390_sie_block sie_block; - struct mcck_volatile_info mcck_info; /* 0x0200 */ - __u8 reserved218[360]; /* 0x0218 */ - __u64 pv_grregs[16]; /* 0x0380 */ - __u8 reserved400[512]; /* 0x0400 */ - struct kvm_s390_itdb itdb; /* 0x0600 */ - __u8 reserved700[2304]; /* 0x0700 */ -}; - struct kvm_vcpu_stat { struct kvm_vcpu_stat_generic generic; u64 exit_userspace; @@ -483,6 +147,7 @@ struct kvm_vcpu_stat { u64 instruction_diagnose_500; u64 instruction_diagnose_other; u64 pfault_sync; + u64 signal_exits; }; #define PGM_OPERATION 0x01 @@ -693,7 +358,7 @@ struct kvm_s390_float_interrupt { int counters[FIRQ_MAX_COUNT]; struct kvm_s390_mchk_info mchk; struct kvm_s390_ext_info srv_signal; - int next_rr_cpu; + int last_sleep_cpu; struct mutex ais_lock; u8 simm; u8 nimm; @@ -777,6 +442,7 @@ struct kvm_vcpu_arch { bool acrs_loaded; struct kvm_s390_pv_vcpu pv; union diag318_info diag318_info; + struct kvm_s390_mmu_cache *mc; }; struct kvm_vm_stat { @@ -966,12 +632,14 @@ struct kvm_s390_pv { void *set_aside; struct list_head need_cleanup; struct mmu_notifier mmu_notifier; + /* Protects against concurrent import-like operations */ + struct mutex import_lock; }; -struct kvm_arch{ - void *sca; - int use_esca; - rwlock_t sca_lock; +struct kvm_s390_mmu_cache; + +struct kvm_arch { + struct esca_block *sca; debug_info_t *dbf; struct kvm_s390_float_interrupt float_int; struct kvm_device *flic; @@ -987,6 +655,8 @@ struct kvm_arch{ int user_sigp; int user_stsi; int user_instr0; + int user_operexec; + int allow_vsie_esamode; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; int ipte_lock_count; @@ -1008,6 +678,7 @@ struct kvm_arch{ struct kvm_s390_pv pv; struct list_head kzdev_list; spinlock_t kzdev_list_lock; + struct kvm_s390_mmu_cache *mc; }; #define KVM_HVA_ERR_BAD (-1UL) @@ -1040,6 +711,9 @@ void kvm_arch_crypto_clear_masks(struct kvm *kvm); void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, unsigned long *aqm, unsigned long *adm); +#define SIE64_RETURN_NORMAL 0 +#define SIE64_RETURN_MCCK 1 + int __sie64a(phys_addr_t sie_block_phys, struct kvm_s390_sie_block *sie_block, u64 *rsa, unsigned long gasce); @@ -1053,10 +727,14 @@ extern char sie_exit; bool kvm_s390_pv_is_protected(struct kvm *kvm); bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu); +extern int kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb, + u64 *gprs, unsigned long gasce); + extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); -static inline void kvm_arch_sync_events(struct kvm *kvm) {} +bool kvm_s390_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa); + static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} diff --git a/arch/s390/include/asm/kvm_host_types.h b/arch/s390/include/asm/kvm_host_types.h new file mode 100644 index 000000000000..3f50942bdfe6 --- /dev/null +++ b/arch/s390/include/asm/kvm_host_types.h @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_KVM_HOST_TYPES_H +#define _ASM_KVM_HOST_TYPES_H + +#include <linux/atomic.h> +#include <linux/types.h> + +#define KVM_S390_BSCA_CPU_SLOTS 64 +#define KVM_S390_ESCA_CPU_SLOTS 248 + +#define SIGP_CTRL_C 0x80 +#define SIGP_CTRL_SCN_MASK 0x3f + +union bsca_sigp_ctrl { + __u8 value; + struct { + __u8 c : 1; + __u8 r : 1; + __u8 scn : 6; + }; +}; + +union esca_sigp_ctrl { + __u16 value; + struct { + __u8 c : 1; + __u8 reserved: 7; + __u8 scn; + }; +}; + +struct esca_entry { + union esca_sigp_ctrl sigp_ctrl; + __u16 reserved1[3]; + __u64 sda; + __u64 reserved2[6]; +}; + +struct bsca_entry { + __u8 reserved0; + union bsca_sigp_ctrl sigp_ctrl; + __u16 reserved[3]; + __u64 sda; + __u64 reserved2[2]; +}; + +union ipte_control { + unsigned long val; + struct { + unsigned long k : 1; + unsigned long kh : 31; + unsigned long kg : 32; + }; +}; + +/* + * Utility is defined as two bytes but having it four bytes wide + * generates more efficient code. Since the following bytes are + * reserved this makes no functional difference. + */ +union sca_utility { + __u32 val; + struct { + __u32 mtcr : 1; + __u32 : 31; + }; +}; + +struct bsca_block { + union ipte_control ipte_control; + __u64 reserved[5]; + __u64 mcn; + union sca_utility utility; + __u8 reserved2[4]; + struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; +}; + +struct esca_block { + union ipte_control ipte_control; + __u64 reserved1[6]; + union sca_utility utility; + __u8 reserved2[4]; + __u64 mcn[4]; + __u64 reserved3[20]; + struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; +}; + +/* + * This struct is used to store some machine check info from lowcore + * for machine checks that happen while the guest is running. + * This info in host's lowcore might be overwritten by a second machine + * check from host when host is in the machine check's high-level handling. + * The size is 24 bytes. + */ +struct mcck_volatile_info { + __u64 mcic; + __u64 failing_storage_address; + __u32 ext_damage_code; + __u32 reserved; +}; + +#define CR0_INITIAL_MASK (CR0_UNUSED_56 | CR0_INTERRUPT_KEY_SUBMASK | \ + CR0_MEASUREMENT_ALERT_SUBMASK) +#define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \ + CR14_EXTERNAL_DAMAGE_SUBMASK) + +#define SIDAD_SIZE_MASK 0xff +#define sida_addr(sie_block) phys_to_virt((sie_block)->sidad & PAGE_MASK) +#define sida_size(sie_block) \ + ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE) + +#define CPUSTAT_STOPPED 0x80000000 +#define CPUSTAT_WAIT 0x10000000 +#define CPUSTAT_ECALL_PEND 0x08000000 +#define CPUSTAT_STOP_INT 0x04000000 +#define CPUSTAT_IO_INT 0x02000000 +#define CPUSTAT_EXT_INT 0x01000000 +#define CPUSTAT_RUNNING 0x00800000 +#define CPUSTAT_RETAINED 0x00400000 +#define CPUSTAT_TIMING_SUB 0x00020000 +#define CPUSTAT_SIE_SUB 0x00010000 +#define CPUSTAT_RRF 0x00008000 +#define CPUSTAT_SLSV 0x00004000 +#define CPUSTAT_SLSR 0x00002000 +#define CPUSTAT_ZARCH 0x00000800 +#define CPUSTAT_MCDS 0x00000100 +#define CPUSTAT_KSS 0x00000200 +#define CPUSTAT_SM 0x00000080 +#define CPUSTAT_IBS 0x00000040 +#define CPUSTAT_GED2 0x00000010 +#define CPUSTAT_G 0x00000008 +#define CPUSTAT_GED 0x00000004 +#define CPUSTAT_J 0x00000002 +#define CPUSTAT_P 0x00000001 + +struct kvm_s390_sie_block { + atomic_t cpuflags; /* 0x0000 */ + __u32 : 1; /* 0x0004 */ + __u32 prefix : 19; + __u32 ibc : 12; + __u8 reserved08[4]; /* 0x0008 */ +#define PROG_IN_SIE (1<<0) + __u32 prog0c; /* 0x000c */ + union { + __u8 reserved10[16]; /* 0x0010 */ + struct { + __u64 pv_handle_cpu; + __u64 pv_handle_config; + }; + }; +#define PROG_BLOCK_SIE (1<<0) +#define PROG_REQUEST (1<<1) + atomic_t prog20; /* 0x0020 */ + __u8 reserved24[4]; /* 0x0024 */ + __u64 cputm; /* 0x0028 */ + __u64 ckc; /* 0x0030 */ + __u64 epoch; /* 0x0038 */ + __u32 svcc; /* 0x0040 */ +#define LCTL_CR0 0x8000 +#define LCTL_CR6 0x0200 +#define LCTL_CR9 0x0040 +#define LCTL_CR10 0x0020 +#define LCTL_CR11 0x0010 +#define LCTL_CR14 0x0002 + __u16 lctl; /* 0x0044 */ + __s16 icpua; /* 0x0046 */ +#define ICTL_OPEREXC 0x80000000 +#define ICTL_PINT 0x20000000 +#define ICTL_LPSW 0x00400000 +#define ICTL_STCTL 0x00040000 +#define ICTL_ISKE 0x00004000 +#define ICTL_SSKE 0x00002000 +#define ICTL_RRBE 0x00001000 +#define ICTL_TPROT 0x00000200 + __u32 ictl; /* 0x0048 */ +#define ECA_CEI 0x80000000 +#define ECA_IB 0x40000000 +#define ECA_SIGPI 0x10000000 +#define ECA_MVPGI 0x01000000 +#define ECA_AIV 0x00200000 +#define ECA_VX 0x00020000 +#define ECA_PROTEXCI 0x00002000 +#define ECA_APIE 0x00000008 +#define ECA_SII 0x00000001 + __u32 eca; /* 0x004c */ +#define ICPT_INST 0x04 +#define ICPT_PROGI 0x08 +#define ICPT_INSTPROGI 0x0C +#define ICPT_EXTREQ 0x10 +#define ICPT_EXTINT 0x14 +#define ICPT_IOREQ 0x18 +#define ICPT_WAIT 0x1c +#define ICPT_VALIDITY 0x20 +#define ICPT_STOP 0x28 +#define ICPT_OPEREXC 0x2C +#define ICPT_PARTEXEC 0x38 +#define ICPT_IOINST 0x40 +#define ICPT_KSS 0x5c +#define ICPT_MCHKREQ 0x60 +#define ICPT_INT_ENABLE 0x64 +#define ICPT_PV_INSTR 0x68 +#define ICPT_PV_NOTIFY 0x6c +#define ICPT_PV_PREF 0x70 + __u8 icptcode; /* 0x0050 */ + __u8 icptstatus; /* 0x0051 */ + __u16 ihcpu; /* 0x0052 */ + __u8 reserved54; /* 0x0054 */ +#define IICTL_CODE_NONE 0x00 +#define IICTL_CODE_MCHK 0x01 +#define IICTL_CODE_EXT 0x02 +#define IICTL_CODE_IO 0x03 +#define IICTL_CODE_RESTART 0x04 +#define IICTL_CODE_SPECIFICATION 0x10 +#define IICTL_CODE_OPERAND 0x11 + __u8 iictl; /* 0x0055 */ + __u16 ipa; /* 0x0056 */ + __u32 ipb; /* 0x0058 */ + __u32 scaoh; /* 0x005c */ +#define FPF_BPBC 0x20 + __u8 fpf; /* 0x0060 */ +#define ECB_GS 0x40 +#define ECB_TE 0x10 +#define ECB_SPECI 0x08 +#define ECB_SRSI 0x04 +#define ECB_HOSTPROTINT 0x02 +#define ECB_PTF 0x01 + __u8 ecb; /* 0x0061 */ +#define ECB2_CMMA 0x80 +#define ECB2_IEP 0x20 +#define ECB2_PFMFI 0x08 +#define ECB2_ESCA 0x04 +#define ECB2_ZPCI_LSI 0x02 + __u8 ecb2; /* 0x0062 */ +#define ECB3_AISI 0x20 +#define ECB3_AISII 0x10 +#define ECB3_DEA 0x08 +#define ECB3_AES 0x04 +#define ECB3_RI 0x01 + __u8 ecb3; /* 0x0063 */ +#define ESCA_SCAOL_MASK ~0x3fU + __u32 scaol; /* 0x0064 */ + __u8 sdf; /* 0x0068 */ + __u8 epdx; /* 0x0069 */ + __u8 cpnc; /* 0x006a */ + __u8 reserved6b; /* 0x006b */ + __u32 todpr; /* 0x006c */ +#define GISA_FORMAT1 0x00000001 + __u32 gd; /* 0x0070 */ + __u8 reserved74[12]; /* 0x0074 */ + __u64 mso; /* 0x0080 */ + __u64 msl; /* 0x0088 */ + psw_t gpsw; /* 0x0090 */ + __u64 gg14; /* 0x00a0 */ + __u64 gg15; /* 0x00a8 */ + __u8 reservedb0[8]; /* 0x00b0 */ +#define HPID_KVM 0x4 +#define HPID_VSIE 0x5 + __u8 hpid; /* 0x00b8 */ + __u8 reservedb9[7]; /* 0x00b9 */ + union { + struct { + __u32 eiparams; /* 0x00c0 */ + __u16 extcpuaddr; /* 0x00c4 */ + __u16 eic; /* 0x00c6 */ + }; + __u64 mcic; /* 0x00c0 */ + } __packed; + __u32 reservedc8; /* 0x00c8 */ + union { + struct { + __u16 pgmilc; /* 0x00cc */ + __u16 iprcc; /* 0x00ce */ + }; + __u32 edc; /* 0x00cc */ + } __packed; + union { + struct { + __u32 dxc; /* 0x00d0 */ + __u16 mcn; /* 0x00d4 */ + __u8 perc; /* 0x00d6 */ + __u8 peratmid; /* 0x00d7 */ + }; + __u64 faddr; /* 0x00d0 */ + } __packed; + __u64 peraddr; /* 0x00d8 */ + __u8 eai; /* 0x00e0 */ + __u8 peraid; /* 0x00e1 */ + __u8 oai; /* 0x00e2 */ + __u8 armid; /* 0x00e3 */ + __u8 reservede4[4]; /* 0x00e4 */ + union { + __u64 tecmc; /* 0x00e8 */ + struct { + __u16 subchannel_id; /* 0x00e8 */ + __u16 subchannel_nr; /* 0x00ea */ + __u32 io_int_parm; /* 0x00ec */ + __u32 io_int_word; /* 0x00f0 */ + }; + } __packed; + __u8 reservedf4[8]; /* 0x00f4 */ +#define CRYCB_FORMAT_MASK 0x00000003 +#define CRYCB_FORMAT0 0x00000000 +#define CRYCB_FORMAT1 0x00000001 +#define CRYCB_FORMAT2 0x00000003 + __u32 crycbd; /* 0x00fc */ + __u64 gcr[16]; /* 0x0100 */ + union { + __u64 gbea; /* 0x0180 */ + __u64 sidad; + }; + __u8 reserved188[8]; /* 0x0188 */ + __u64 sdnxo; /* 0x0190 */ + __u8 reserved198[8]; /* 0x0198 */ + __u32 fac; /* 0x01a0 */ + __u8 reserved1a4[20]; /* 0x01a4 */ + __u64 cbrlo; /* 0x01b8 */ + __u8 reserved1c0[8]; /* 0x01c0 */ +#define ECD_HOSTREGMGMT 0x20000000 +#define ECD_MEF 0x08000000 +#define ECD_ETOKENF 0x02000000 +#define ECD_ECC 0x00200000 +#define ECD_HMAC 0x00004000 + __u32 ecd; /* 0x01c8 */ + __u8 reserved1cc[18]; /* 0x01cc */ + __u64 pp; /* 0x01de */ + __u8 reserved1e6[2]; /* 0x01e6 */ + __u64 itdba; /* 0x01e8 */ + __u64 riccbd; /* 0x01f0 */ + __u64 gvrd; /* 0x01f8 */ +} __packed __aligned(512); + +struct kvm_s390_itdb { + __u8 data[256]; +}; + +struct sie_page { + struct kvm_s390_sie_block sie_block; + struct mcck_volatile_info mcck_info; /* 0x0200 */ + __u8 reserved218[360]; /* 0x0218 */ + __u64 pv_grregs[16]; /* 0x0380 */ + __u8 reserved400[512]; /* 0x0400 */ + struct kvm_s390_itdb itdb; /* 0x0600 */ + __u8 reserved700[2304]; /* 0x0700 */ +}; + +#endif /* _ASM_KVM_HOST_TYPES_H */ diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h index df73a052760c..00cc8c916cfb 100644 --- a/arch/s390/include/asm/kvm_para.h +++ b/arch/s390/include/asm/kvm_para.h @@ -76,7 +76,7 @@ long __kvm_hypercall##args(unsigned long nr HYPERCALL_PARM_##args) \ HYPERCALL_REGS_##args; \ \ asm volatile ( \ - " diag 2,4,0x500\n" \ + " diag 2,4,0x500" \ : "=d" (__rc) \ : "d" (__nr) HYPERCALL_FMT_##args \ : "memory", "cc"); \ diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 42a092fa1029..50ffe75adeb4 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -10,6 +10,7 @@ #define _ASM_S390_LOWCORE_H #include <linux/types.h> +#include <asm/machine.h> #include <asm/ptrace.h> #include <asm/ctlreg.h> #include <asm/cpu.h> @@ -21,7 +22,7 @@ #define LOWCORE_ALT_ADDRESS _AC(0x70000, UL) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct pgm_tdb { u64 data[32]; @@ -99,7 +100,8 @@ struct lowcore { /* Save areas. */ __u64 save_area[8]; /* 0x0200 */ - __u8 pad_0x0240[0x0280-0x0240]; /* 0x0240 */ + __u64 stack_canary; /* 0x0240 */ + __u8 pad_0x0248[0x0280-0x0248]; /* 0x0248 */ __u64 save_area_restart[1]; /* 0x0280 */ __u64 pcpu; /* 0x0288 */ @@ -126,7 +128,7 @@ struct lowcore { __u64 int_clock; /* 0x0318 */ __u8 pad_0x0320[0x0328-0x0320]; /* 0x0320 */ __u64 clock_comparator; /* 0x0328 */ - __u64 boot_clock[2]; /* 0x0330 */ + __u8 pad_0x0330[0x0340-0x0330]; /* 0x0330 */ /* Current process. */ __u64 current_task; /* 0x0340 */ @@ -163,9 +165,7 @@ struct lowcore { __u32 spinlock_index; /* 0x03b0 */ __u8 pad_0x03b4[0x03b8-0x03b4]; /* 0x03b4 */ __u64 percpu_offset; /* 0x03b8 */ - __u8 pad_0x03c0[0x03c8-0x03c0]; /* 0x03c0 */ - __u64 machine_flags; /* 0x03c8 */ - __u8 pad_0x03d0[0x0400-0x03d0]; /* 0x03d0 */ + __u8 pad_0x03c0[0x0400-0x03c0]; /* 0x03c0 */ __u32 return_lpswe; /* 0x0400 */ __u32 return_mcck_lpswe; /* 0x0404 */ @@ -222,9 +222,12 @@ static __always_inline struct lowcore *get_lowcore(void) if (__is_defined(__DECOMPRESSOR)) return NULL; - asm(ALTERNATIVE("llilh %[lc],0", "llilh %[lc],%[alt]", ALT_LOWCORE) - : [lc] "=d" (lc) - : [alt] "i" (LOWCORE_ALT_ADDRESS >> 16)); + asm_inline( + ALTERNATIVE(" lghi %[lc],0", + " llilh %[lc],%[alt]", + ALT_FEATURE(MFEATURE_LOWCORE)) + : [lc] "=d" (lc) + : [alt] "i" (LOWCORE_ALT_ADDRESS >> 16)); return lc; } @@ -235,19 +238,19 @@ static inline void set_prefix(__u32 address) asm volatile("spx %0" : : "Q" (address) : "memory"); } -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ .macro GET_LC reg - ALTERNATIVE "llilh \reg,0", \ + ALTERNATIVE "lghi \reg,0", \ __stringify(llilh \reg, LOWCORE_ALT_ADDRESS >> 16), \ - ALT_LOWCORE + ALT_FEATURE(MFEATURE_LOWCORE) .endm .macro STMG_LC start, end, savearea ALTERNATIVE "stmg \start, \end, \savearea", \ __stringify(stmg \start, \end, LOWCORE_ALT_ADDRESS + \savearea), \ - ALT_LOWCORE + ALT_FEATURE(MFEATURE_LOWCORE) .endm -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_S390_LOWCORE_H */ diff --git a/arch/s390/include/asm/machine.h b/arch/s390/include/asm/machine.h new file mode 100644 index 000000000000..9bd4a9dc7778 --- /dev/null +++ b/arch/s390/include/asm/machine.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2024 + */ + +#ifndef __ASM_S390_MACHINE_H +#define __ASM_S390_MACHINE_H + +#include <linux/const.h> + +#define MFEATURE_LOWCORE 0 +#define MFEATURE_PCI_MIO 1 +#define MFEATURE_SCC 2 +#define MFEATURE_TLB_GUEST 3 +#define MFEATURE_TX 4 +#define MFEATURE_ESOP 5 +#define MFEATURE_DIAG9C 6 +#define MFEATURE_VM 7 +#define MFEATURE_KVM 8 +#define MFEATURE_LPAR 9 +#define MFEATURE_DIAG288 10 + +#ifndef __ASSEMBLER__ + +#include <linux/bitops.h> +#include <asm/alternative.h> + +extern unsigned long machine_features[1]; + +#define MAX_MFEATURE_BIT (sizeof(machine_features) * BITS_PER_BYTE) + +static inline void __set_machine_feature(unsigned int nr, unsigned long *mfeatures) +{ + if (nr >= MAX_MFEATURE_BIT) + return; + __set_bit(nr, mfeatures); +} + +static inline void set_machine_feature(unsigned int nr) +{ + __set_machine_feature(nr, machine_features); +} + +static inline void __clear_machine_feature(unsigned int nr, unsigned long *mfeatures) +{ + if (nr >= MAX_MFEATURE_BIT) + return; + __clear_bit(nr, mfeatures); +} + +static inline void clear_machine_feature(unsigned int nr) +{ + __clear_machine_feature(nr, machine_features); +} + +static bool __test_machine_feature(unsigned int nr, unsigned long *mfeatures) +{ + if (nr >= MAX_MFEATURE_BIT) + return false; + return test_bit(nr, mfeatures); +} + +static bool test_machine_feature(unsigned int nr) +{ + return __test_machine_feature(nr, machine_features); +} + +static __always_inline bool __test_machine_feature_constant(unsigned int nr) +{ + asm goto( + ALTERNATIVE("brcl 15,%l[l_no]", "brcl 0,0", ALT_FEATURE(%[nr])) + : + : [nr] "i" (nr) + : + : l_no); + return true; +l_no: + return false; +} + +#define DEFINE_MACHINE_HAS_FEATURE(name, feature) \ +static __always_inline bool machine_has_##name(void) \ +{ \ + if (!__is_defined(__DECOMPRESSOR) && __builtin_constant_p(feature)) \ + return __test_machine_feature_constant(feature); \ + return test_machine_feature(feature); \ +} + +DEFINE_MACHINE_HAS_FEATURE(relocated_lowcore, MFEATURE_LOWCORE) +DEFINE_MACHINE_HAS_FEATURE(scc, MFEATURE_SCC) +DEFINE_MACHINE_HAS_FEATURE(tlb_guest, MFEATURE_TLB_GUEST) +DEFINE_MACHINE_HAS_FEATURE(tx, MFEATURE_TX) +DEFINE_MACHINE_HAS_FEATURE(esop, MFEATURE_ESOP) +DEFINE_MACHINE_HAS_FEATURE(diag9c, MFEATURE_DIAG9C) +DEFINE_MACHINE_HAS_FEATURE(vm, MFEATURE_VM) +DEFINE_MACHINE_HAS_FEATURE(kvm, MFEATURE_KVM) +DEFINE_MACHINE_HAS_FEATURE(lpar, MFEATURE_LPAR) + +#define machine_is_vm machine_has_vm +#define machine_is_kvm machine_has_kvm +#define machine_is_lpar machine_has_lpar + +#endif /* __ASSEMBLER__ */ +#endif /* __ASM_S390_MACHINE_H */ diff --git a/arch/s390/include/asm/march.h b/arch/s390/include/asm/march.h index fd9eef3be44c..11a71bd14954 100644 --- a/arch/s390/include/asm/march.h +++ b/arch/s390/include/asm/march.h @@ -33,6 +33,10 @@ #define MARCH_HAS_Z16_FEATURES 1 #endif +#ifdef CONFIG_HAVE_MARCH_Z17_FEATURES +#define MARCH_HAS_Z17_FEATURES 1 +#endif + #endif /* __DECOMPRESSOR */ #endif /* __ASM_S390_MARCH_H */ diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h index b85e13505a0f..28c83ec1f243 100644 --- a/arch/s390/include/asm/mem_encrypt.h +++ b/arch/s390/include/asm/mem_encrypt.h @@ -2,11 +2,11 @@ #ifndef S390_MEM_ENCRYPT_H__ #define S390_MEM_ENCRYPT_H__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ int set_memory_encrypted(unsigned long vaddr, int numpages); int set_memory_decrypted(unsigned long vaddr, int numpages); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* S390_MEM_ENCRYPT_H__ */ diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 4c2dc7abc285..d4fd7bf3692e 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -19,26 +19,10 @@ typedef struct { /* The mmu context belongs to a secure guest. */ atomic_t protected_count; /* - * The following bitfields need a down_write on the mm - * semaphore when they are written to. As they are only - * written once, they can be read without a lock. - * - * The mmu context allocates 4K page tables. - */ - unsigned int alloc_pgste:1; - /* The mmu context uses extended page tables. */ - unsigned int has_pgste:1; - /* The mmu context uses storage keys. */ - unsigned int uses_skeys:1; - /* The mmu context uses CMM. */ - unsigned int uses_cmm:1; - /* * The mmu context allows COW-sharing of memory pages (KSM, zeropage). * Note that COW-sharing during fork() is currently always allowed. */ unsigned int allow_cow_sharing:1; - /* The gmaps associated with this context are allowed to use huge pages. */ - unsigned int allow_gmap_hpage_1m:1; } mm_context_t; #define INIT_MM_CONTEXT(name) \ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index d56eb0a1f37b..bd1ef5e2d2eb 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -13,6 +13,7 @@ #include <linux/mm_types.h> #include <asm/tlbflush.h> #include <asm/ctlreg.h> +#include <asm/asce.h> #include <asm-generic/mm_hooks.h> #define init_new_context init_new_context @@ -28,15 +29,8 @@ static inline int init_new_context(struct task_struct *tsk, atomic_set(&mm->context.protected_count, 0); mm->context.gmap_asce = 0; mm->context.flush_mm = 0; -#ifdef CONFIG_PGSTE - mm->context.alloc_pgste = page_table_allocate_pgste || - test_thread_flag(TIF_PGSTE) || - (current->mm && current->mm->context.alloc_pgste); - mm->context.has_pgste = 0; - mm->context.uses_skeys = 0; - mm->context.uses_cmm = 0; +#if IS_ENABLED(CONFIG_KVM) mm->context.allow_cow_sharing = 1; - mm->context.allow_gmap_hpage_1m = 0; #endif switch (mm->context.asce_limit) { default: @@ -80,7 +74,8 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct * else get_lowcore()->user_asce.val = next->context.asce; cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); - /* Clear previous user-ASCE from CR7 */ + /* Clear previous user-ASCE from CR1 and CR7 */ + local_ctl_load(1, &s390_invalid_asce); local_ctl_load(7, &s390_invalid_asce); if (prev != next) cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); @@ -102,6 +97,7 @@ static inline void finish_arch_post_lock_switch(void) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; + unsigned long flags; if (mm) { preempt_disable(); @@ -111,15 +107,25 @@ static inline void finish_arch_post_lock_switch(void) __tlb_flush_mm_lazy(mm); preempt_enable(); } + local_irq_save(flags); + if (test_thread_flag(TIF_ASCE_PRIMARY)) + local_ctl_load(1, &get_lowcore()->kernel_asce); + else + local_ctl_load(1, &get_lowcore()->user_asce); local_ctl_load(7, &get_lowcore()->user_asce); + local_irq_restore(flags); } #define activate_mm activate_mm static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { - switch_mm(prev, next, current); + switch_mm_irqs_off(prev, next, current); cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); + if (test_thread_flag(TIF_ASCE_PRIMARY)) + local_ctl_load(1, &get_lowcore()->kernel_asce); + else + local_ctl_load(1, &get_lowcore()->user_asce); local_ctl_load(7, &get_lowcore()->user_asce); } diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index 227466ce9e41..6454c1531854 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -33,7 +33,7 @@ #define MCCK_CODE_FC_VALID BIT(63 - 43) #define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ union mci { unsigned long val; @@ -104,5 +104,5 @@ void nmi_free_mcesa(u64 *mcesad); void s390_handle_mcck(void); void s390_do_machine_check(struct pt_regs *regs); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_S390_NMI_H */ diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h index 192835a3e24d..81c4813cff18 100644 --- a/arch/s390/include/asm/nospec-branch.h +++ b/arch/s390/include/asm/nospec-branch.h @@ -2,7 +2,7 @@ #ifndef _ASM_S390_EXPOLINE_H #define _ASM_S390_EXPOLINE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <asm/facility.h> @@ -26,8 +26,6 @@ static inline bool nospec_uses_trampoline(void) return __is_defined(CC_USING_EXPOLINE) && !nospec_disable; } -#ifdef CONFIG_EXPOLINE_EXTERN - void __s390_indirect_jump_r1(void); void __s390_indirect_jump_r2(void); void __s390_indirect_jump_r3(void); @@ -44,8 +42,6 @@ void __s390_indirect_jump_r13(void); void __s390_indirect_jump_r14(void); void __s390_indirect_jump_r15(void); -#endif - -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_S390_EXPOLINE_H */ diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h index cb15dd25bf21..46f92bb4c9e5 100644 --- a/arch/s390/include/asm/nospec-insn.h +++ b/arch/s390/include/asm/nospec-insn.h @@ -3,9 +3,10 @@ #define _ASM_S390_NOSPEC_ASM_H #include <linux/linkage.h> +#include <linux/export.h> #include <asm/dwarf.h> -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifdef CC_USING_EXPOLINE @@ -18,7 +19,7 @@ #ifdef CONFIG_EXPOLINE_EXTERN SYM_CODE_START(\name) #else - .pushsection .text.\name,"axG",@progbits,\name,comdat + .pushsection .text..\name,"axG",@progbits,\name,comdat .globl \name .hidden \name .type \name,@function @@ -128,6 +129,6 @@ .endm #endif /* CC_USING_EXPOLINE */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_S390_NOSPEC_ASM_H */ diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 1ff145f7b52b..56da819a79e6 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -33,7 +33,7 @@ #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #include <asm/setup.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ void __storage_key_init_range(unsigned long start, unsigned long end); @@ -65,69 +65,79 @@ static inline void copy_page(void *to, void *from) : : "memory", "cc"); } -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) -/* - * These are used to make use of C type-checking.. - */ +#ifdef CONFIG_STRICT_MM_TYPECHECKS +#define STRICT_MM_TYPECHECKS +#endif + +#ifdef STRICT_MM_TYPECHECKS typedef struct { unsigned long pgprot; } pgprot_t; -typedef struct { unsigned long pgste; } pgste_t; typedef struct { unsigned long pte; } pte_t; typedef struct { unsigned long pmd; } pmd_t; typedef struct { unsigned long pud; } pud_t; typedef struct { unsigned long p4d; } p4d_t; typedef struct { unsigned long pgd; } pgd_t; -typedef pte_t *pgtable_t; -#define pgprot_val(x) ((x).pgprot) -#define pgste_val(x) ((x).pgste) - -static inline unsigned long pte_val(pte_t pte) -{ - return pte.pte; +#define DEFINE_PGVAL_FUNC(name) \ +static __always_inline unsigned long name ## _val(name ## _t name) \ +{ \ + return name.name; \ } -static inline unsigned long pmd_val(pmd_t pmd) -{ - return pmd.pmd; -} +#else /* STRICT_MM_TYPECHECKS */ -static inline unsigned long pud_val(pud_t pud) -{ - return pud.pud; -} +typedef unsigned long pgprot_t; +typedef unsigned long pte_t; +typedef unsigned long pmd_t; +typedef unsigned long pud_t; +typedef unsigned long p4d_t; +typedef unsigned long pgd_t; -static inline unsigned long p4d_val(p4d_t p4d) -{ - return p4d.p4d; +#define DEFINE_PGVAL_FUNC(name) \ +static __always_inline unsigned long name ## _val(name ## _t name) \ +{ \ + return name; \ } -static inline unsigned long pgd_val(pgd_t pgd) -{ - return pgd.pgd; -} +#endif /* STRICT_MM_TYPECHECKS */ -#define __pgste(x) ((pgste_t) { (x) } ) +DEFINE_PGVAL_FUNC(pgprot) +DEFINE_PGVAL_FUNC(pte) +DEFINE_PGVAL_FUNC(pmd) +DEFINE_PGVAL_FUNC(pud) +DEFINE_PGVAL_FUNC(p4d) +DEFINE_PGVAL_FUNC(pgd) + +typedef pte_t *pgtable_t; + +#define __pgprot(x) ((pgprot_t) { (x) } ) #define __pte(x) ((pte_t) { (x) } ) #define __pmd(x) ((pmd_t) { (x) } ) #define __pud(x) ((pud_t) { (x) } ) #define __p4d(x) ((p4d_t) { (x) } ) #define __pgd(x) ((pgd_t) { (x) } ) -#define __pgprot(x) ((pgprot_t) { (x) } ) static inline void page_set_storage_key(unsigned long addr, unsigned char skey, int mapped) { - if (!mapped) - asm volatile(".insn rrf,0xb22b0000,%0,%1,8,0" - : : "d" (skey), "a" (addr)); - else - asm volatile("sske %0,%1" : : "d" (skey), "a" (addr)); + if (!mapped) { + asm volatile( + " .insn rrf,0xb22b0000,%[skey],%[addr],8,0" + : + : [skey] "d" (skey), [addr] "a" (addr) + : "memory"); + } else { + asm volatile( + " sske %[skey],%[addr]" + : + : [skey] "d" (skey), [addr] "a" (addr) + : "memory"); + } } static inline unsigned char page_get_storage_key(unsigned long addr) @@ -151,6 +161,8 @@ static inline int page_reset_referenced(unsigned long addr) return CC_TRANSFORM(cc); } +int split_pud_page(pud_t *pudp, unsigned long addr); + /* Bits int the storage key */ #define _PAGE_CHANGED 0x02 /* HW changed bit */ #define _PAGE_REFERENCED 0x04 /* HW referenced bit */ @@ -265,9 +277,9 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug((unsigned long)(kaddr)))) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> diff --git a/arch/s390/include/asm/pai.h b/arch/s390/include/asm/pai.h index ebeabd0aaa51..534d0320e2aa 100644 --- a/arch/s390/include/asm/pai.h +++ b/arch/s390/include/asm/pai.h @@ -77,6 +77,7 @@ static __always_inline void pai_kernel_exit(struct pt_regs *regs) #define PAI_SAVE_AREA(x) ((x)->hw.event_base) #define PAI_CPU_MASK(x) ((x)->hw.addr_filters) +#define PAI_PMU_IDX(x) ((x)->hw.last_tag) #define PAI_SWLIST(x) (&(x)->hw.tp_list) #endif diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 474e1f8d1d3c..5dcf35f0f325 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -5,12 +5,16 @@ #include <linux/pci.h> #include <linux/mutex.h> #include <linux/iommu.h> +#include <linux/irqdomain.h> #include <linux/pci_hotplug.h> #include <asm/pci_clp.h> #include <asm/pci_debug.h> #include <asm/pci_insn.h> #include <asm/sclp.h> +#define ARCH_GENERIC_PCI_MMAP_RESOURCE 1 +#define arch_can_pci_mmap_wc() 1 + #define PCIBIOS_MIN_IO 0x1000 #define PCIBIOS_MIN_MEM 0x10000000 @@ -106,6 +110,7 @@ struct zpci_bus { struct list_head resources; struct list_head bus_next; struct resource bus_resource; + struct irq_domain *msi_parent_domain; int topo; /* TID if topo_is_tid, PCHID otherwise */ int domain_nr; u8 multifunction : 1; @@ -142,9 +147,8 @@ struct zpci_dev { u8 has_resources : 1; u8 is_physfn : 1; u8 util_str_avail : 1; - u8 irqs_registered : 1; u8 tid_avail : 1; - u8 reserved : 1; + u8 rtr_avail : 1; /* Relaxed translation allowed */ unsigned int devfn; /* DEVFN part of the RID*/ u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ @@ -204,6 +208,10 @@ extern const struct attribute_group zpci_ident_attr_group; &pfip_attr_group, \ &zpci_ident_attr_group, +extern const struct attribute_group zpci_slot_attr_group; + +#define ARCH_PCI_SLOT_GROUPS (&zpci_slot_attr_group) + extern unsigned int s390_pci_force_floating __initdata; extern unsigned int s390_pci_no_rid; @@ -217,6 +225,7 @@ extern struct airq_iv *zpci_aif_sbv; struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state); int zpci_add_device(struct zpci_dev *zdev); int zpci_enable_device(struct zpci_dev *); +int zpci_reenable_device(struct zpci_dev *zdev); int zpci_disable_device(struct zpci_dev *); int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh); int zpci_deconfigure_device(struct zpci_dev *zdev); @@ -242,9 +251,20 @@ int clp_refresh_fh(u32 fid, u32 *fh); /* UID */ void update_uid_checking(bool new); +/* Firmware Sysfs */ +int __init __zpci_fw_sysfs_init(void); + +static inline int __init zpci_fw_sysfs_init(void) +{ + if (IS_ENABLED(CONFIG_SYSFS)) + return __zpci_fw_sysfs_init(); + return 0; +} + /* IOMMU Interface */ int zpci_init_iommu(struct zpci_dev *zdev); void zpci_destroy_iommu(struct zpci_dev *zdev); +int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status); #ifdef CONFIG_PCI static inline bool zpci_use_mio(struct zpci_dev *zdev) @@ -296,6 +316,9 @@ int zpci_dma_exit_device(struct zpci_dev *zdev); /* IRQ */ int __init zpci_irq_init(void); void __init zpci_irq_exit(void); +int zpci_set_irq(struct zpci_dev *zdev); +int zpci_create_parent_msi_domain(struct zpci_bus *zbus); +void zpci_remove_parent_msi_domain(struct zpci_bus *zbus); /* FMB */ int zpci_fmb_enable_device(struct zpci_dev *); diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index 3fff2f7095c8..7ebff39c84b3 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -156,7 +156,9 @@ struct clp_rsp_query_pci_grp { u16 : 4; u16 noi : 12; /* number of interrupts */ u8 version; - u8 : 6; + u8 : 2; + u8 rtr : 1; /* Relaxed translation requirement */ + u8 : 3; u8 frame : 1; u8 refresh : 1; /* TLB refresh mode */ u16 : 3; diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h index 42d7cc4262ca..d12e17201661 100644 --- a/arch/s390/include/asm/pci_dma.h +++ b/arch/s390/include/asm/pci_dma.h @@ -25,6 +25,7 @@ enum zpci_ioat_dtype { #define ZPCI_KEY (PAGE_DEFAULT_KEY << 5) #define ZPCI_TABLE_SIZE_RT (1UL << 42) +#define ZPCI_TABLE_SIZE_RS (1UL << 53) #define ZPCI_IOTA_STO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_ST) #define ZPCI_IOTA_RTTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RT) @@ -55,6 +56,8 @@ enum zpci_ioat_dtype { #define ZPCI_PT_BITS 8 #define ZPCI_ST_SHIFT (ZPCI_PT_BITS + PAGE_SHIFT) #define ZPCI_RT_SHIFT (ZPCI_ST_SHIFT + ZPCI_TABLE_BITS) +#define ZPCI_RS_SHIFT (ZPCI_RT_SHIFT + ZPCI_TABLE_BITS) +#define ZPCI_RF_SHIFT (ZPCI_RS_SHIFT + ZPCI_TABLE_BITS) #define ZPCI_RTE_FLAG_MASK 0x3fffUL #define ZPCI_RTE_ADDR_MASK (~ZPCI_RTE_FLAG_MASK) diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index e5f57cfe1d45..025c6dcbf893 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -16,11 +16,11 @@ #define ZPCI_PCI_ST_FUNC_NOT_AVAIL 40 #define ZPCI_PCI_ST_ALREADY_IN_RQ_STATE 44 -/* Load/Store return codes */ -#define ZPCI_PCI_LS_OK 0 -#define ZPCI_PCI_LS_ERR 1 -#define ZPCI_PCI_LS_BUSY 2 -#define ZPCI_PCI_LS_INVAL_HANDLE 3 +/* PCI instruction condition codes */ +#define ZPCI_CC_OK 0 +#define ZPCI_CC_ERR 1 +#define ZPCI_CC_BUSY 2 +#define ZPCI_CC_INVAL_HANDLE 3 /* Load/Store address space identifiers */ #define ZPCI_PCIAS_MEMIO_0 0 diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h index 43a5ea4ee20f..f3bef5bc7223 100644 --- a/arch/s390/include/asm/pci_io.h +++ b/arch/s390/include/asm/pci_io.h @@ -18,6 +18,7 @@ #define ZPCI_IOMAP_SHIFT 48 #define ZPCI_IOMAP_ADDR_SHIFT 62 #define ZPCI_IOMAP_ADDR_BASE (1UL << ZPCI_IOMAP_ADDR_SHIFT) +#define ZPCI_IOMAP_ADDR_MAX ((1UL << (ZPCI_IOMAP_ADDR_SHIFT + 1)) - 1) #define ZPCI_IOMAP_ADDR_OFF_MASK ((1UL << ZPCI_IOMAP_SHIFT) - 1) #define ZPCI_IOMAP_MAX_ENTRIES \ (1UL << (ZPCI_IOMAP_ADDR_SHIFT - ZPCI_IOMAP_SHIFT)) diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h index 84f6b8357b45..b18a96f3a334 100644 --- a/arch/s390/include/asm/percpu.h +++ b/arch/s390/include/asm/percpu.h @@ -12,14 +12,23 @@ */ #define __my_cpu_offset get_lowcore()->percpu_offset -/* - * For 64 bit module code, the module may be more than 4G above the - * per cpu area, use weak definitions to force the compiler to - * generate external references. - */ -#if defined(MODULE) -#define ARCH_NEEDS_WEAK_PER_CPU -#endif +#define arch_raw_cpu_ptr(_ptr) \ +({ \ + unsigned long lc_percpu, tcp_ptr__; \ + \ + tcp_ptr__ = (__force unsigned long)(_ptr); \ + lc_percpu = offsetof(struct lowcore, percpu_offset); \ + asm_inline volatile( \ + ALTERNATIVE("ag %[__ptr__],%[offzero](%%r0)\n", \ + "ag %[__ptr__],%[offalt](%%r0)\n", \ + ALT_FEATURE(MFEATURE_LOWCORE)) \ + : [__ptr__] "+d" (tcp_ptr__) \ + : [offzero] "i" (lc_percpu), \ + [offalt] "i" (lc_percpu + LOWCORE_ALT_ADDRESS), \ + "m" (((struct lowcore *)0)->percpu_offset) \ + : "cc"); \ + (TYPEOF_UNQUAL(*(_ptr)) __force __kernel *)tcp_ptr__; \ +}) /* * We use a compare-and-swap loop since that uses less cpu cycles than @@ -74,13 +83,13 @@ if (__builtin_constant_p(val__) && \ ((szcast)val__ > -129) && ((szcast)val__ < 128)) { \ asm volatile( \ - op2 " %[ptr__],%[val__]\n" \ + op2 " %[ptr__],%[val__]" \ : [ptr__] "+Q" (*ptr__) \ : [val__] "i" ((szcast)val__) \ : "cc"); \ } else { \ asm volatile( \ - op1 " %[old__],%[val__],%[ptr__]\n" \ + op1 " %[old__],%[val__],%[ptr__]" \ : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ : [val__] "d" (val__) \ : "cc"); \ @@ -99,7 +108,7 @@ preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ asm volatile( \ - op " %[old__],%[val__],%[ptr__]\n" \ + op " %[old__],%[val__],%[ptr__]" \ : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ : [val__] "d" (val__) \ : "cc"); \ @@ -118,7 +127,7 @@ preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ asm volatile( \ - op " %[old__],%[val__],%[ptr__]\n" \ + op " %[old__],%[val__],%[ptr__]" \ : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ : [val__] "d" (val__) \ : "cc"); \ diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index b19b6ed2ab53..a5de9e61ea9e 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -19,14 +19,13 @@ #define CRST_ALLOC_ORDER 2 -unsigned long *crst_table_alloc(struct mm_struct *); +unsigned long *crst_table_alloc_noprof(struct mm_struct *); +#define crst_table_alloc(...) alloc_hooks(crst_table_alloc_noprof(__VA_ARGS__)) void crst_table_free(struct mm_struct *, unsigned long *); -unsigned long *page_table_alloc(struct mm_struct *); -struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm); +unsigned long *page_table_alloc_noprof(struct mm_struct *); +#define page_table_alloc(...) alloc_hooks(page_table_alloc_noprof(__VA_ARGS__)) void page_table_free(struct mm_struct *, unsigned long *); -void page_table_free_pgste(struct ptdesc *ptdesc); -extern int page_table_allocate_pgste; static inline void crst_table_init(unsigned long *crst, unsigned long entry) { @@ -49,9 +48,9 @@ static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long return addr; } -static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) +static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long address) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -60,6 +59,7 @@ static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) return (p4d_t *) table; } +#define p4d_alloc_one(...) alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { @@ -70,9 +70,9 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) crst_table_free(mm, (unsigned long *) p4d); } -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long address) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -81,6 +81,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) return (pud_t *) table; } +#define pud_alloc_one(...) alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__)) static inline void pud_free(struct mm_struct *mm, pud_t *pud) { @@ -91,19 +92,20 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) crst_table_free(mm, (unsigned long *) pud); } -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) +static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; crst_table_init(table, _SEGMENT_ENTRY_EMPTY); - if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) { + if (!pagetable_pmd_ctor(mm, virt_to_ptdesc(table))) { crst_table_free(mm, table); return NULL; } return (pmd_t *) table; } +#define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { @@ -128,9 +130,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) set_pud(pud, __pud(_REGION3_ENTRY | __pa(pmd))); } -static inline pgd_t *pgd_alloc(struct mm_struct *mm) +static inline pgd_t *pgd_alloc_noprof(struct mm_struct *mm) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -138,6 +140,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return (pgd_t *) table; } +#define pgd_alloc(...) alloc_hooks(pgd_alloc_noprof(__VA_ARGS__)) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 3ca5af4cfe43..2c6cee8241e0 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -14,9 +14,12 @@ #include <linux/sched.h> #include <linux/mm_types.h> +#include <linux/cpufeature.h> #include <linux/page-flags.h> +#include <linux/page_table_check.h> #include <linux/radix-tree.h> #include <linux/atomic.h> +#include <linux/mmap_lock.h> #include <asm/ctlreg.h> #include <asm/bug.h> #include <asm/page.h> @@ -412,28 +415,6 @@ void setup_protection_map(void); * SW-bits: y young, d dirty, r read, w write */ -/* Page status table bits for virtualization */ -#define PGSTE_ACC_BITS 0xf000000000000000UL -#define PGSTE_FP_BIT 0x0800000000000000UL -#define PGSTE_PCL_BIT 0x0080000000000000UL -#define PGSTE_HR_BIT 0x0040000000000000UL -#define PGSTE_HC_BIT 0x0020000000000000UL -#define PGSTE_GR_BIT 0x0004000000000000UL -#define PGSTE_GC_BIT 0x0002000000000000UL -#define PGSTE_ST2_MASK 0x0000ffff00000000UL -#define PGSTE_UC_BIT 0x0000000000008000UL /* user dirty (migration) */ -#define PGSTE_IN_BIT 0x0000000000004000UL /* IPTE notify bit */ -#define PGSTE_VSIE_BIT 0x0000000000002000UL /* ref'd in a shadow table */ - -/* Guest Page State used for virtualization */ -#define _PGSTE_GPS_ZERO 0x0000000080000000UL -#define _PGSTE_GPS_NODAT 0x0000000040000000UL -#define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL -#define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL -#define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL -#define _PGSTE_GPS_USAGE_POT_VOLATILE 0x0000000002000000UL -#define _PGSTE_GPS_USAGE_VOLATILE _PGSTE_GPS_USAGE_MASK - /* * A user page table pointer has the space-switch-event bit, the * private-space-control bit and the storage-alteration-event-control @@ -565,33 +546,15 @@ static inline bool mm_pmd_folded(struct mm_struct *mm) } #define mm_pmd_folded(mm) mm_pmd_folded(mm) -static inline int mm_has_pgste(struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - if (unlikely(mm->context.has_pgste)) - return 1; -#endif - return 0; -} - static inline int mm_is_protected(struct mm_struct *mm) { -#ifdef CONFIG_PGSTE +#if IS_ENABLED(CONFIG_KVM) if (unlikely(atomic_read(&mm->context.protected_count))) return 1; #endif return 0; } -static inline int mm_alloc_pgste(struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - if (unlikely(mm->context.alloc_pgste)) - return 1; -#endif - return 0; -} - static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) { return __pte(pte_val(pte) & ~pgprot_val(prot)); @@ -630,34 +593,13 @@ static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot) #define mm_forbids_zeropage mm_forbids_zeropage static inline int mm_forbids_zeropage(struct mm_struct *mm) { -#ifdef CONFIG_PGSTE +#if IS_ENABLED(CONFIG_KVM) if (!mm->context.allow_cow_sharing) return 1; #endif return 0; } -static inline int mm_uses_skeys(struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - if (mm->context.uses_skeys) - return 1; -#endif - return 0; -} - -static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new) -{ - union register_pair r1 = { .even = old, .odd = new, }; - unsigned long address = (unsigned long)ptr | 1; - - asm volatile( - " csp %[r1],%[address]" - : [r1] "+&d" (r1.pair), "+m" (*ptr) - : [address] "d" (address) - : "cc"); -} - /** * cspg() - Compare and Swap and Purge (CSPG) * @ptr: Pointer to the value to be exchanged @@ -913,7 +855,7 @@ static inline int pmd_protnone(pmd_t pmd) } #endif -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } @@ -961,6 +903,12 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) return clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_SOFT_DIRTY)); } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +#define pmd_swp_soft_dirty(pmd) pmd_soft_dirty(pmd) +#define pmd_swp_mksoft_dirty(pmd) pmd_mksoft_dirty(pmd) +#define pmd_swp_clear_soft_dirty(pmd) pmd_clear_soft_dirty(pmd) +#endif + /* * query functions pte_write/pte_dirty/pte_young only work if * pte_present() is true. Undefined behaviour if not.. @@ -1140,23 +1088,28 @@ static inline pte_t pte_mkhuge(pte_t pte) } #endif +static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) +{ + asm volatile("sske %[skey],%[addr],1" + : [addr] "+a" (addr) : [skey] "d" (skey)); + return addr; +} + #define IPTE_GLOBAL 0 #define IPTE_LOCAL 1 #define IPTE_NODAT 0x400 #define IPTE_GUEST_ASCE 0x800 -static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep, - unsigned long opt, unsigned long asce, - int local) +static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep, int local) { unsigned long pto; pto = __pa(ptep) & ~(PTRS_PER_PTE * sizeof(pte_t) - 1); - asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%[asce],%[m4]" + asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%%r0,%[m4]" : "+m" (*ptep) - : [r1] "a" (pto), [r2] "a" ((addr & PAGE_MASK) | opt), - [asce] "a" (asce), [m4] "i" (local)); + : [r1] "a" (pto), [r2] "a" (addr & PAGE_MASK), + [m4] "i" (local)); } static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, @@ -1213,8 +1166,8 @@ pte_t ptep_xchg_direct(struct mm_struct *, unsigned long, pte_t *, pte_t); pte_t ptep_xchg_lazy(struct mm_struct *, unsigned long, pte_t *, pte_t); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; @@ -1223,8 +1176,8 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, } #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { return ptep_test_and_clear_young(vma, address, ptep); } @@ -1238,7 +1191,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); /* At this point the reference through the mapping is still present */ if (mm_is_protected(mm) && pte_present(res)) - uv_convert_from_secure_pte(res); + WARN_ON_ONCE(uv_convert_from_secure_pte(res)); + page_table_check_pte_clear(mm, addr, res); return res; } @@ -1256,7 +1210,8 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); /* At this point the reference through the mapping is still present */ if (mm_is_protected(vma->vm_mm) && pte_present(res)) - uv_convert_from_secure_pte(res); + WARN_ON_ONCE(uv_convert_from_secure_pte(res)); + page_table_check_pte_clear(vma->vm_mm, addr, res); return res; } @@ -1280,6 +1235,9 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, } else { res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); } + + page_table_check_pte_clear(mm, addr, res); + /* Nothing to do */ if (!mm_is_protected(mm) || !pte_present(res)) return res; @@ -1293,9 +1251,10 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, /* * If something went wrong and the page could not be destroyed, or * if this is not a mm teardown, the slower export is used as - * fallback instead. + * fallback instead. If even that fails, print a warning and leak + * the page, to avoid crashing the whole system. */ - uv_convert_from_secure_pte(res); + WARN_ON_ONCE(uv_convert_from_secure_pte(res)); return res; } @@ -1339,8 +1298,8 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, * PTE does not have _PAGE_PROTECT set, to avoid unnecessary overhead. * A local RDP can be used to do the flush. */ - if (MACHINE_HAS_RDP && !(pte_val(*ptep) & _PAGE_PROTECT)) - __ptep_rdp(address, ptep, 0, 0, 1); + if (cpu_has_rdp() && !(pte_val(*ptep) & _PAGE_PROTECT)) + __ptep_rdp(address, ptep, 1); } #define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault @@ -1354,57 +1313,16 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, { if (pte_same(*ptep, entry)) return 0; - if (MACHINE_HAS_RDP && !mm_has_pgste(vma->vm_mm) && pte_allow_rdp(*ptep, entry)) + if (cpu_has_rdp() && pte_allow_rdp(*ptep, entry)) ptep_reset_dat_prot(vma->vm_mm, addr, ptep, entry); else ptep_xchg_direct(vma->vm_mm, addr, ptep, entry); return 1; } -/* - * Additional functions to handle KVM guest page tables - */ -void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry); -void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void ptep_notify(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, unsigned long bits); -int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr, - pte_t *ptep, int prot, unsigned long bit); -void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, - pte_t *ptep , int reset); -void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, - pte_t *sptep, pte_t *tptep, pte_t pte); -void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); - -bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long address, - pte_t *ptep); -int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char key, bool nq); -int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char key, unsigned char *oldkey, - bool nq, bool mr, bool mc); -int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr); -int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char *key); - -int set_pgste_bits(struct mm_struct *mm, unsigned long addr, - unsigned long bits, unsigned long value); -int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep); -int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, - unsigned long *oldpte, unsigned long *oldpgste); -void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr); -void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr); -void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr); -void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr); - #define pgprot_writecombine pgprot_writecombine pgprot_t pgprot_writecombine(pgprot_t prot); -#define pgprot_writethrough pgprot_writethrough -pgprot_t pgprot_writethrough(pgprot_t prot); - #define PFN_PTE_SHIFT PAGE_SHIFT /* @@ -1416,23 +1334,13 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, { if (pte_present(entry)) entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED)); - if (mm_has_pgste(mm)) { - for (;;) { - ptep_set_pte_at(mm, addr, ptep, entry); - if (--nr == 0) - break; - ptep++; - entry = __pte(pte_val(entry) + PAGE_SIZE); - addr += PAGE_SIZE; - } - } else { - for (;;) { - set_pte(ptep, entry); - if (--nr == 0) - break; - ptep++; - entry = __pte(pte_val(entry) + PAGE_SIZE); - } + page_table_check_ptes_set(mm, addr, ptep, entry, nr); + for (;;) { + set_pte(ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); } } #define set_ptes set_ptes @@ -1449,16 +1357,6 @@ static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) return pte_mkyoung(__pte); } -static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) -{ - unsigned long physpage = page_to_phys(page); - pte_t __pte = mk_pte_phys(physpage, pgprot); - - if (pte_write(__pte) && PageDirty(page)) - __pte = pte_mkdirty(__pte); - return __pte; -} - #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) #define p4d_index(address) (((address) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) @@ -1697,10 +1595,10 @@ static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot) #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */ -static inline void __pmdp_csp(pmd_t *pmdp) +static inline void __pmdp_cspg(pmd_t *pmdp) { - csp((unsigned int *)pmdp + 1, pmd_val(*pmdp), - pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); + cspg((unsigned long *)pmdp, pmd_val(*pmdp), + pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); } #define IDTE_GLOBAL 0 @@ -1793,8 +1691,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, } #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) +static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) { pmd_t pmd = *pmdp; @@ -1803,8 +1701,8 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, } #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) +static inline bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) { VM_BUG_ON(addr & ~HPAGE_MASK); return pmdp_test_and_clear_young(vma, addr, pmdp); @@ -1813,6 +1711,7 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t entry) { + page_table_check_pmd_set(mm, addr, pmdp, entry); set_pmd(pmdp, entry); } @@ -1827,7 +1726,11 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd) static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - return pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + pmd_t pmd; + + pmd = pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + page_table_check_pmd_clear(mm, addr, pmd); + return pmd; } #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL @@ -1835,12 +1738,17 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, int full) { + pmd_t pmd; + if (full) { - pmd_t pmd = *pmdp; + pmd = *pmdp; set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + page_table_check_pmd_clear(vma->vm_mm, addr, pmd); return pmd; } - return pmdp_xchg_lazy(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + pmd = pmdp_xchg_lazy(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + page_table_check_pmd_clear(vma->vm_mm, addr, pmd); + return pmd; } #define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH @@ -1854,11 +1762,16 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd; + pmd_t pmd = *pmdp; - VM_WARN_ON_ONCE(!pmd_present(*pmdp)); - pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); - return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); + VM_WARN_ON_ONCE(!pmd_present(pmd)); + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID)); +#ifdef CONFIG_PAGE_TABLE_CHECK + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_READ)); +#endif + page_table_check_pmd_set(vma->vm_mm, addr, pmdp, pmd); + pmd = pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); + return pmd; } #define __HAVE_ARCH_PMDP_SET_WRPROTECT @@ -1880,7 +1793,6 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, #define pmdp_collapse_flush pmdp_collapse_flush #define pfn_pmd(pfn, pgprot) mk_pmd_phys(((pfn) << PAGE_SHIFT), (pgprot)) -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) static inline int pmd_trans_huge(pmd_t pmd) { @@ -1890,10 +1802,33 @@ static inline int pmd_trans_huge(pmd_t pmd) #define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { - return MACHINE_HAS_EDAT1 ? 1 : 0; + return cpu_has_edat1() ? 1 : 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#ifdef CONFIG_PAGE_TABLE_CHECK +static inline bool pte_user_accessible_page(struct mm_struct *mm, unsigned long addr, pte_t pte) +{ + VM_BUG_ON(mm == &init_mm); + + return pte_present(pte); +} + +static inline bool pmd_user_accessible_page(struct mm_struct *mm, unsigned long addr, pmd_t pmd) +{ + VM_BUG_ON(mm == &init_mm); + + return pmd_leaf(pmd) && (pmd_val(pmd) & _SEGMENT_ENTRY_READ); +} + +static inline bool pud_user_accessible_page(struct mm_struct *mm, unsigned long addr, pud_t pud) +{ + VM_BUG_ON(mm == &init_mm); + + return pud_leaf(pud); +} +#endif + /* * 64 bit swap entry format: * A page-table entry has some bits we have to treat in a special way. @@ -1991,15 +1926,51 @@ static inline unsigned long __swp_offset_rste(swp_entry_t entry) #define __rste_to_swp_entry(rste) ((swp_entry_t) { rste }) +/* + * s390 has different layout for PTE and region / segment table entries (RSTE). + * This is also true for swap entries, and their swap type and offset encoding. + * For hugetlbfs PTE_MARKER support, s390 has internal __swp_type_rste() and + * __swp_offset_rste() helpers to correctly handle RSTE swap entries. + * + * But common swap code does not know about this difference, and only uses + * __swp_type(), __swp_offset() and __swp_entry() helpers for conversion between + * arch-dependent and arch-independent representation of swp_entry_t for all + * pagetable levels. On s390, those helpers only work for PTE swap entries. + * + * Therefore, implement __pmd_to_swp_entry() to build a fake PTE swap entry + * and return the arch-dependent representation of that. Correspondingly, + * implement __swp_entry_to_pmd() to convert that into a proper PMD swap + * entry again. With this, the arch-dependent swp_entry_t representation will + * always look like a PTE swap entry in common code. + * + * This is somewhat similar to fake PTEs in hugetlbfs code for s390, but only + * requires conversion of the swap type and offset, and not all the possible + * PTE bits. + */ +static inline swp_entry_t __pmd_to_swp_entry(pmd_t pmd) +{ + swp_entry_t arch_entry; + pte_t pte; + + arch_entry = __rste_to_swp_entry(pmd_val(pmd)); + pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry)); + return __pte_to_swp_entry(pte); +} + +static inline pmd_t __swp_entry_to_pmd(swp_entry_t arch_entry) +{ + pmd_t pmd; + + pmd = __pmd(mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry))); + return pmd; +} + extern int vmem_add_mapping(unsigned long start, unsigned long size); extern void vmem_remove_mapping(unsigned long start, unsigned long size); extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc); extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot); extern void vmem_unmap_4k_page(unsigned long addr); extern pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc); -extern int s390_enable_sie(void); -extern int s390_enable_skey(void); -extern void s390_reset_cmma(struct mm_struct *mm); /* s390 has a private copy of get unmapped area to deal with cache synonyms */ #define HAVE_ARCH_UNMAPPED_AREA @@ -2008,18 +1979,4 @@ extern void s390_reset_cmma(struct mm_struct *mm); #define pmd_pgtable(pmd) \ ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) -static inline unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt) -{ - unsigned long *pgstes, res; - - pgstes = pgt + _PAGE_ENTRIES; - - res = (pgstes[0] & PGSTE_ST2_MASK) << 16; - res |= pgstes[1] & PGSTE_ST2_MASK; - res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16; - res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32; - - return res; -} - #endif /* _S390_PAGE_H */ diff --git a/arch/s390/include/asm/pkey.h b/arch/s390/include/asm/pkey.h index 5dca1a46a9f6..0af5ac4f646b 100644 --- a/arch/s390/include/asm/pkey.h +++ b/arch/s390/include/asm/pkey.h @@ -20,9 +20,28 @@ * @param key pointer to a buffer containing the key blob * @param keylen size of the key blob in bytes * @param protkey pointer to buffer receiving the protected key + * @param xflags additional execution flags (see PKEY_XFLAG_* definitions below) + * As of now the only supported flags are PKEY_XFLAG_NOMEMALLOC + * and PKEY_XFLAG_NOCLEARKEY. * @return 0 on success, negative errno value on failure */ int pkey_key2protkey(const u8 *key, u32 keylen, - u8 *protkey, u32 *protkeylen, u32 *protkeytype); + u8 *protkey, u32 *protkeylen, u32 *protkeytype, + u32 xflags); + +/* + * If this flag is given in the xflags parameter, the pkey implementation + * is not allowed to allocate memory but instead should fall back to use + * preallocated memory or simple fail with -ENOMEM. + * This flag is for protected key derive within a cipher or similar + * which must not allocate memory which would cause io operations - see + * also the CRYPTO_ALG_ALLOCATES_MEMORY flag in crypto.h. + */ +#define PKEY_XFLAG_NOMEMALLOC 0x0001 + +/* + * Do not accept a clear key token as source for a protected key. + */ +#define PKEY_XFLAG_NOCLEARKEY 0x0002 #endif /* _KAPI_PKEY_H */ diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h index 6ccd033acfe5..6e5821bb047e 100644 --- a/arch/s390/include/asm/preempt.h +++ b/arch/s390/include/asm/preempt.h @@ -8,7 +8,10 @@ #include <asm/cmpxchg.h> #include <asm/march.h> -/* We use the MSB mostly because its available */ +/* + * Use MSB so it is possible to read preempt_count with LLGT which + * reads the least significant 31 bits with a single instruction. + */ #define PREEMPT_NEED_RESCHED 0x80000000 /* @@ -23,7 +26,20 @@ */ static __always_inline int preempt_count(void) { - return READ_ONCE(get_lowcore()->preempt_count) & ~PREEMPT_NEED_RESCHED; + unsigned long lc_preempt, count; + + BUILD_BUG_ON(sizeof_field(struct lowcore, preempt_count) != sizeof(int)); + lc_preempt = offsetof(struct lowcore, preempt_count); + /* READ_ONCE(get_lowcore()->preempt_count) & ~PREEMPT_NEED_RESCHED */ + asm_inline( + ALTERNATIVE("llgt %[count],%[offzero](%%r0)\n", + "llgt %[count],%[offalt](%%r0)\n", + ALT_FEATURE(MFEATURE_LOWCORE)) + : [count] "=d" (count) + : [offzero] "i" (lc_preempt), + [offalt] "i" (lc_preempt + LOWCORE_ALT_ADDRESS), + "m" (((struct lowcore *)0)->preempt_count)); + return count; } static __always_inline void preempt_count_set(int pc) @@ -68,7 +84,17 @@ static __always_inline void __preempt_count_add(int val) */ if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES)) { if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) { - __atomic_add_const(val, &get_lowcore()->preempt_count); + unsigned long lc_preempt; + + lc_preempt = offsetof(struct lowcore, preempt_count); + asm_inline( + ALTERNATIVE("asi %[offzero](%%r0),%[val]\n", + "asi %[offalt](%%r0),%[val]\n", + ALT_FEATURE(MFEATURE_LOWCORE)) + : "+m" (((struct lowcore *)0)->preempt_count) + : [offzero] "i" (lc_preempt), [val] "i" (val), + [offalt] "i" (lc_preempt + LOWCORE_ALT_ADDRESS) + : "cc"); return; } } @@ -87,7 +113,22 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { +#ifdef __HAVE_ASM_FLAG_OUTPUTS__ + unsigned long lc_preempt; + int cc; + + lc_preempt = offsetof(struct lowcore, preempt_count); + asm_inline( + ALTERNATIVE("alsi %[offzero](%%r0),%[val]\n", + "alsi %[offalt](%%r0),%[val]\n", + ALT_FEATURE(MFEATURE_LOWCORE)) + : "=@cc" (cc), "+m" (((struct lowcore *)0)->preempt_count) + : [offzero] "i" (lc_preempt), [val] "i" (-1), + [offalt] "i" (lc_preempt + LOWCORE_ALT_ADDRESS)); + return (cc == 0) || (cc == 2); +#else return __atomic_add_const_and_test(-1, &get_lowcore()->preempt_count); +#endif } /* diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 4f8d5592c298..78195ee5e99f 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -26,11 +26,13 @@ #define RESTART_FLAG_CTLREGS _AC(1 << 0, U) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/cpumask.h> #include <linux/linkage.h> #include <linux/irqflags.h> +#include <linux/instruction_pointer.h> +#include <linux/bitops.h> #include <asm/fpu-types.h> #include <asm/cpu.h> #include <asm/page.h> @@ -62,33 +64,27 @@ static __always_inline struct pcpu *this_pcpu(void) static __always_inline void set_cpu_flag(int flag) { - this_pcpu()->flags |= (1UL << flag); + set_bit(flag, &this_pcpu()->flags); } static __always_inline void clear_cpu_flag(int flag) { - this_pcpu()->flags &= ~(1UL << flag); + clear_bit(flag, &this_pcpu()->flags); } static __always_inline bool test_cpu_flag(int flag) { - return this_pcpu()->flags & (1UL << flag); + return test_bit(flag, &this_pcpu()->flags); } static __always_inline bool test_and_set_cpu_flag(int flag) { - if (test_cpu_flag(flag)) - return true; - set_cpu_flag(flag); - return false; + return test_and_set_bit(flag, &this_pcpu()->flags); } static __always_inline bool test_and_clear_cpu_flag(int flag) { - if (!test_cpu_flag(flag)) - return false; - clear_cpu_flag(flag); - return true; + return test_and_clear_bit(flag, &this_pcpu()->flags); } /* @@ -97,7 +93,7 @@ static __always_inline bool test_and_clear_cpu_flag(int flag) */ static __always_inline bool test_cpu_flag_of(int flag, int cpu) { - return per_cpu(pcpu_devices, cpu).flags & (1UL << flag); + return test_bit(flag, &per_cpu(pcpu_devices, cpu).flags); } #define arch_needs_cpu() test_cpu_flag(CIF_NOHZ_DELAY) @@ -124,18 +120,12 @@ extern void execve_tail(void); unsigned long vdso_text_size(void); unsigned long vdso_size(void); -/* - * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit. - */ - -#define TASK_SIZE (test_thread_flag(TIF_31BIT) ? \ - _REGION3_SIZE : TASK_SIZE_MAX) -#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ - (_REGION3_SIZE >> 1) : (_REGION2_SIZE >> 1)) +#define TASK_SIZE (TASK_SIZE_MAX) +#define TASK_UNMAPPED_BASE (_REGION2_SIZE >> 1) #define TASK_SIZE_MAX (-PAGE_SIZE) #define VDSO_BASE (STACK_TOP + PAGE_SIZE) -#define VDSO_LIMIT (test_thread_flag(TIF_31BIT) ? _REGION3_SIZE : _REGION2_SIZE) +#define VDSO_LIMIT (_REGION2_SIZE) #define STACK_TOP (VDSO_LIMIT - vdso_size() - PAGE_SIZE) #define STACK_TOP_MAX (_REGION2_SIZE - vdso_size() - PAGE_SIZE) @@ -168,8 +158,8 @@ static __always_inline void __stackleak_poison(unsigned long erase_low, "2: stg %[poison],0(%[addr])\n" " j 4f\n" "3: mvc 8(1,%[addr]),0(%[addr])\n" - "4:\n" - : [addr] "+&a" (erase_low), [count] "+&d" (count), [tmp] "=&a" (tmp) + "4:" + : [addr] "+&a" (erase_low), [count] "+&a" (count), [tmp] "=&a" (tmp) : [poison] "d" (poison) : "memory", "cc" ); @@ -186,7 +176,6 @@ struct thread_struct { unsigned long system_timer; /* task cputime in kernel space */ unsigned long hardirq_timer; /* task cputime in hardirq context */ unsigned long softirq_timer; /* task cputime in softirq context */ - const sys_call_ptr_t *sys_call_table; /* system call table address */ union teid gmap_teid; /* address and flags of last gmap fault */ unsigned int gmap_int_code; /* int code of last gmap fault */ int ufpu_flags; /* user fpu flags */ @@ -384,14 +373,19 @@ static inline void local_mcck_enable(void) /* * Rewind PSW instruction address by specified number of bytes. */ -static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc) +static inline unsigned long __rewind_psw(psw_t psw, long ilen) { unsigned long mask; mask = (psw.mask & PSW_MASK_EA) ? -1UL : (psw.mask & PSW_MASK_BA) ? (1UL << 31) - 1 : (1UL << 24) - 1; - return (psw.addr - ilc) & mask; + return (psw.addr - ilen) & mask; +} + +static inline unsigned long __forward_psw(psw_t psw, long ilen) +{ + return __rewind_psw(psw, -ilen); } /* @@ -416,9 +410,13 @@ static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) static __always_inline void bpon(void) { - asm volatile(ALTERNATIVE("nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82))); + asm_inline volatile( + ALTERNATIVE(" nop\n", + " .insn rrf,0xb2e80000,0,0,13,0\n", + ALT_SPEC(82)) + ); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_S390_PROCESSOR_H */ diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 788bc4467445..aaceb1d9110a 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -8,17 +8,19 @@ #define _S390_PTRACE_H #include <linux/bits.h> +#include <linux/typecheck.h> #include <uapi/asm/ptrace.h> +#include <asm/thread_info.h> #include <asm/tpi.h> #define PIF_SYSCALL 0 /* inside a system call */ -#define PIF_EXECVE_PGSTE_RESTART 1 /* restart execve for PGSTE binaries */ +#define PIF_PSW_ADDR_ADJUSTED 1 /* psw address has been adjusted */ #define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */ #define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ #define PIF_FTRACE_FULL_REGS 4 /* all register contents valid (ftrace) */ #define _PIF_SYSCALL BIT(PIF_SYSCALL) -#define _PIF_EXECVE_PGSTE_RESTART BIT(PIF_EXECVE_PGSTE_RESTART) +#define _PIF_ADDR_PSW_ADJUSTED BIT(PIF_PSW_ADDR_ADJUSTED) #define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET) #define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT) #define _PIF_FTRACE_FULL_REGS BIT(PIF_FTRACE_FULL_REGS) @@ -55,7 +57,7 @@ PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_MASK_MCHECK | \ PSW_MASK_PSTATE | PSW_ASC_PRIMARY) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct psw_bits { unsigned long : 1; @@ -100,7 +102,7 @@ enum { typedef struct { unsigned int mask; unsigned int addr; -} psw_t32 __aligned(8); +} psw32_t __aligned(8); #define PGM_INT_CODE_MASK 0x7f #define PGM_INT_CODE_PER 0x80 @@ -118,7 +120,10 @@ struct pt_regs { unsigned long gprs[NUM_GPRS]; }; }; - unsigned long orig_gpr2; + union { + unsigned long orig_gpr2; + unsigned long monitor_code; + }; union { struct { unsigned int int_code; @@ -128,7 +133,6 @@ struct pt_regs { struct tpi_info tpi_info; }; unsigned long flags; - unsigned long cr1; unsigned long last_break; }; @@ -213,16 +217,23 @@ void update_cr_regs(struct task_struct *task); #define arch_has_single_step() (1) #define arch_has_block_step() (1) -#define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) -#define instruction_pointer(regs) ((regs)->psw.addr) -#define user_stack_pointer(regs)((regs)->gprs[15]) #define profile_pc(regs) instruction_pointer(regs) -static inline long regs_return_value(struct pt_regs *regs) +static __always_inline bool user_mode(const struct pt_regs *regs) +{ + return psw_bits(regs->psw).pstate; +} + +static inline long regs_return_value(const struct pt_regs *regs) { return regs->gprs[2]; } +static __always_inline unsigned long instruction_pointer(const struct pt_regs *regs) +{ + return regs->psw.addr; +} + static inline void instruction_pointer_set(struct pt_regs *regs, unsigned long val) { @@ -231,8 +242,52 @@ static inline void instruction_pointer_set(struct pt_regs *regs, int regs_query_register_offset(const char *name); const char *regs_query_register_name(unsigned int offset); -unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); -unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); + +static __always_inline unsigned long kernel_stack_pointer(const struct pt_regs *regs) +{ + return regs->gprs[15]; +} + +static __always_inline unsigned long user_stack_pointer(const struct pt_regs *regs) +{ + return regs->gprs[15]; +} + +static __always_inline unsigned long regs_get_register(const struct pt_regs *regs, + unsigned int offset) +{ + if (offset >= NUM_GPRS) + return 0; + return regs->gprs[offset]; +} + +static __always_inline int regs_within_kernel_stack(const struct pt_regs *regs, + unsigned long addr) +{ + unsigned long ksp = kernel_stack_pointer(regs); + + return (addr & ~(THREAD_SIZE - 1)) == (ksp & ~(THREAD_SIZE - 1)); +} + +/** + * regs_get_kernel_stack_nth() - get Nth entry of the stack + * @regs:pt_regs which contains kernel stack pointer. + * @n:stack entry number. + * + * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which + * is specifined by @regs. If the @n th entry is NOT in the kernel stack, + * this returns 0. + */ +static __always_inline unsigned long regs_get_kernel_stack_nth(const struct pt_regs *regs, + unsigned int n) +{ + unsigned long addr; + + addr = kernel_stack_pointer(regs) + n * sizeof(long); + if (!regs_within_kernel_stack(regs, addr)) + return 0; + return READ_ONCE_NOCHECK(*(unsigned long *)addr); +} /** * regs_get_kernel_argument() - get Nth function argument in kernel @@ -241,8 +296,8 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); * * regs_get_kernel_argument() returns @n th argument of the function call. */ -static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, - unsigned int n) +static __always_inline unsigned long regs_get_kernel_argument(const struct pt_regs *regs, + unsigned int n) { unsigned int argoffset = STACK_FRAME_OVERHEAD / sizeof(long); @@ -253,15 +308,10 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, return regs_get_kernel_stack_nth(regs, argoffset + n); } -static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) -{ - return regs->gprs[15]; -} - -static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +static __always_inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) { regs->gprs[2] = rc; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _S390_PTRACE_H */ diff --git a/arch/s390/include/asm/purgatory.h b/arch/s390/include/asm/purgatory.h index e297bcfc476f..4c7a43bc43a1 100644 --- a/arch/s390/include/asm/purgatory.h +++ b/arch/s390/include/asm/purgatory.h @@ -7,11 +7,11 @@ #ifndef _S390_PURGATORY_H_ #define _S390_PURGATORY_H_ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/purgatory.h> int verify_sha256_digest(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _S390_PURGATORY_H_ */ diff --git a/arch/s390/include/asm/rwonce.h b/arch/s390/include/asm/rwonce.h index 91fc24520e82..402325ec20f0 100644 --- a/arch/s390/include/asm/rwonce.h +++ b/arch/s390/include/asm/rwonce.h @@ -19,7 +19,7 @@ \ BUILD_BUG_ON(sizeof(x) != 16); \ asm volatile( \ - " lpq %[val],%[_x]\n" \ + " lpq %[val],%[_x]" \ : [val] "=d" (__u.val) \ : [_x] "QS" (x) \ : "memory"); \ diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 18f37dff03c9..0f184dbdbe5e 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -21,7 +21,7 @@ #define SCLP_ERRNOTIFY_AQ_INFO_LOG 2 #define SCLP_ERRNOTIFY_AQ_OPTICS_DATA 3 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/uio.h> #include <asm/chpid.h> #include <asm/cpu.h> @@ -168,6 +168,7 @@ int sclp_early_read_storage_info(void); int sclp_early_get_core_info(struct sclp_core_info *info); void sclp_early_get_ipl_info(struct sclp_ipl_info *info); void sclp_early_detect(void); +void sclp_early_detect_machine_features(void); void sclp_early_printk(const char *s); void __sclp_early_printk(const char *s, unsigned int len); void sclp_emergency_printk(const char *s); @@ -198,5 +199,5 @@ static inline int sclp_get_core_info(struct sclp_core_info *info, int early) return _sclp_get_core_info(info); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h index 71d46f0ba97b..f904b674fee0 100644 --- a/arch/s390/include/asm/seccomp.h +++ b/arch/s390/include/asm/seccomp.h @@ -19,10 +19,5 @@ #define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X #define SECCOMP_ARCH_NATIVE_NR NR_syscalls #define SECCOMP_ARCH_NATIVE_NAME "s390x" -#ifdef CONFIG_COMPAT -# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390 -# define SECCOMP_ARCH_COMPAT_NR NR_syscalls -# define SECCOMP_ARCH_COMPAT_NAME "s390" -#endif #endif /* _ASM_S390_SECCOMP_H */ diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 70b920b32827..cbf60ade741d 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -13,28 +13,6 @@ #define PARMAREA 0x10400 #define COMMAND_LINE_SIZE CONFIG_COMMAND_LINE_SIZE -/* - * Machine features detected in early.c - */ - -#define MACHINE_FLAG_VM BIT(0) -#define MACHINE_FLAG_KVM BIT(1) -#define MACHINE_FLAG_LPAR BIT(2) -#define MACHINE_FLAG_DIAG9C BIT(3) -#define MACHINE_FLAG_ESOP BIT(4) -#define MACHINE_FLAG_IDTE BIT(5) -#define MACHINE_FLAG_EDAT1 BIT(7) -#define MACHINE_FLAG_EDAT2 BIT(8) -#define MACHINE_FLAG_TOPOLOGY BIT(10) -#define MACHINE_FLAG_TE BIT(11) -#define MACHINE_FLAG_TLB_LC BIT(12) -#define MACHINE_FLAG_TLB_GUEST BIT(14) -#define MACHINE_FLAG_NX BIT(15) -#define MACHINE_FLAG_GS BIT(16) -#define MACHINE_FLAG_SCC BIT(17) -#define MACHINE_FLAG_PCI_MIO BIT(18) -#define MACHINE_FLAG_RDP BIT(19) -#define MACHINE_FLAG_SEQ_INSN BIT(20) #define LPP_MAGIC BIT(31) #define LPP_PID_MASK _AC(0xffffffff, UL) @@ -46,7 +24,7 @@ #define LEGACY_COMMAND_LINE_SIZE 896 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/lowcore.h> #include <asm/types.h> @@ -63,6 +41,8 @@ struct parmarea { char command_line[COMMAND_LINE_SIZE]; /* 0x10480 */ }; +extern char arch_hw_string[128]; + extern struct parmarea parmarea; extern unsigned int zlib_dfltcc_support; @@ -72,32 +52,11 @@ extern unsigned int zlib_dfltcc_support; #define ZLIB_DFLTCC_INFLATE_ONLY 3 #define ZLIB_DFLTCC_FULL_DEBUG 4 -extern unsigned long ident_map_size; extern unsigned long max_mappable; /* The Write Back bit position in the physaddr is given by the SLPC PCI */ extern unsigned long mio_wb_bit_mask; -#define MACHINE_IS_VM (get_lowcore()->machine_flags & MACHINE_FLAG_VM) -#define MACHINE_IS_KVM (get_lowcore()->machine_flags & MACHINE_FLAG_KVM) -#define MACHINE_IS_LPAR (get_lowcore()->machine_flags & MACHINE_FLAG_LPAR) - -#define MACHINE_HAS_DIAG9C (get_lowcore()->machine_flags & MACHINE_FLAG_DIAG9C) -#define MACHINE_HAS_ESOP (get_lowcore()->machine_flags & MACHINE_FLAG_ESOP) -#define MACHINE_HAS_IDTE (get_lowcore()->machine_flags & MACHINE_FLAG_IDTE) -#define MACHINE_HAS_EDAT1 (get_lowcore()->machine_flags & MACHINE_FLAG_EDAT1) -#define MACHINE_HAS_EDAT2 (get_lowcore()->machine_flags & MACHINE_FLAG_EDAT2) -#define MACHINE_HAS_TOPOLOGY (get_lowcore()->machine_flags & MACHINE_FLAG_TOPOLOGY) -#define MACHINE_HAS_TE (get_lowcore()->machine_flags & MACHINE_FLAG_TE) -#define MACHINE_HAS_TLB_LC (get_lowcore()->machine_flags & MACHINE_FLAG_TLB_LC) -#define MACHINE_HAS_TLB_GUEST (get_lowcore()->machine_flags & MACHINE_FLAG_TLB_GUEST) -#define MACHINE_HAS_NX (get_lowcore()->machine_flags & MACHINE_FLAG_NX) -#define MACHINE_HAS_GS (get_lowcore()->machine_flags & MACHINE_FLAG_GS) -#define MACHINE_HAS_SCC (get_lowcore()->machine_flags & MACHINE_FLAG_SCC) -#define MACHINE_HAS_PCI_MIO (get_lowcore()->machine_flags & MACHINE_FLAG_PCI_MIO) -#define MACHINE_HAS_RDP (get_lowcore()->machine_flags & MACHINE_FLAG_RDP) -#define MACHINE_HAS_SEQ_INSN (get_lowcore()->machine_flags & MACHINE_FLAG_SEQ_INSN) - /* * Console mode. Override with conmode= */ @@ -142,5 +101,5 @@ static __always_inline u32 gen_lpswe(unsigned long addr) BUILD_BUG_ON(addr > 0xfff); return 0xb2b20000 | addr; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_S390_SETUP_H */ diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h index 472943b77066..97d77868f83c 100644 --- a/arch/s390/include/asm/sigp.h +++ b/arch/s390/include/asm/sigp.h @@ -36,7 +36,7 @@ #define SIGP_STATUS_INCORRECT_STATE 0x00000200UL #define SIGP_STATUS_NOT_RUNNING 0x00000400UL -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/asm.h> @@ -68,6 +68,6 @@ static inline int __pcpu_sigp(u16 addr, u8 order, unsigned long parm, return cc; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __S390_ASM_SIGP_H */ diff --git a/arch/s390/include/asm/skey.h b/arch/s390/include/asm/skey.h new file mode 100644 index 000000000000..84e7cf28b712 --- /dev/null +++ b/arch/s390/include/asm/skey.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_SKEY_H +#define __ASM_SKEY_H + +#include <asm/rwonce.h> + +struct skey_region { + unsigned long start; + unsigned long end; +}; + +#define SKEY_REGION(_start, _end) \ + stringify_in_c(.section .skey_region,"a";) \ + stringify_in_c(.balign 8;) \ + stringify_in_c(.quad (_start);) \ + stringify_in_c(.quad (_end);) \ + stringify_in_c(.previous) + +extern int skey_regions_initialized; +extern struct skey_region __skey_region_start[]; +extern struct skey_region __skey_region_end[]; + +void __skey_regions_initialize(void); + +static inline void skey_regions_initialize(void) +{ + if (READ_ONCE(skey_regions_initialized)) + return; + __skey_regions_initialize(); +} + +#endif /* __ASM_SKEY_H */ diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 7feca96c48c6..fb2bdbf35da5 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -7,11 +7,29 @@ #ifndef __ASM_SMP_H #define __ASM_SMP_H -#include <asm/sigp.h> -#include <asm/lowcore.h> #include <asm/processor.h> +#include <asm/lowcore.h> +#include <asm/machine.h> +#include <asm/sigp.h> + +static __always_inline unsigned int raw_smp_processor_id(void) +{ + unsigned long lc_cpu_nr; + unsigned int cpu; + + BUILD_BUG_ON(sizeof_field(struct lowcore, cpu_nr) != sizeof(cpu)); + lc_cpu_nr = offsetof(struct lowcore, cpu_nr); + asm_inline( + ALTERNATIVE(" ly %[cpu],%[offzero](%%r0)\n", + " ly %[cpu],%[offalt](%%r0)\n", + ALT_FEATURE(MFEATURE_LOWCORE)) + : [cpu] "=d" (cpu) + : [offzero] "i" (lc_cpu_nr), + [offalt] "i" (lc_cpu_nr + LOWCORE_ALT_ADDRESS), + "m" (((struct lowcore *)0)->cpu_nr)); + return cpu; +} -#define raw_smp_processor_id() (get_lowcore()->cpu_nr) #define arch_scale_cpu_capacity smp_cpu_get_capacity extern struct mutex smp_cpu_state_mutex; @@ -25,7 +43,7 @@ extern int __cpu_up(unsigned int cpu, struct task_struct *tidle); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -extern void smp_call_ipl_cpu(void (*func)(void *), void *); +extern void __noreturn smp_call_ipl_cpu(void (*func)(void *), void *data); extern void smp_emergency_stop(void); extern int smp_find_processor_id(u16 address); diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index f87dd0a84855..b06b183b7246 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -16,7 +16,23 @@ #include <asm/processor.h> #include <asm/alternative.h> -#define SPINLOCK_LOCKVAL (get_lowcore()->spinlock_lockval) +static __always_inline unsigned int spinlock_lockval(void) +{ + unsigned long lc_lockval; + unsigned int lockval; + + BUILD_BUG_ON(sizeof_field(struct lowcore, spinlock_lockval) != sizeof(lockval)); + lc_lockval = offsetof(struct lowcore, spinlock_lockval); + asm_inline( + ALTERNATIVE(" ly %[lockval],%[offzero](%%r0)\n", + " ly %[lockval],%[offalt](%%r0)\n", + ALT_FEATURE(MFEATURE_LOWCORE)) + : [lockval] "=d" (lockval) + : [offzero] "i" (lc_lockval), + [offalt] "i" (lc_lockval + LOWCORE_ALT_ADDRESS), + "m" (((struct lowcore *)0)->spinlock_lockval)); + return lockval; +} extern int spin_retry; @@ -60,7 +76,7 @@ static inline int arch_spin_trylock_once(arch_spinlock_t *lp) int old = 0; barrier(); - return likely(arch_try_cmpxchg(&lp->lock, &old, SPINLOCK_LOCKVAL)); + return likely(arch_try_cmpxchg(&lp->lock, &old, spinlock_lockval())); } static inline void arch_spin_lock(arch_spinlock_t *lp) @@ -82,7 +98,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) kcsan_release(); asm_inline volatile( ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", ALT_FACILITY(49)) /* NIAI 7 */ - " mvhhi %[lock],0\n" + " mvhhi %[lock],0" : [lock] "=Q" (((unsigned short *)&lp->lock)[1]) : : "memory"); diff --git a/arch/s390/include/asm/stackprotector.h b/arch/s390/include/asm/stackprotector.h new file mode 100644 index 000000000000..0497850103dd --- /dev/null +++ b/arch/s390/include/asm/stackprotector.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_STACKPROTECTOR_H +#define _ASM_S390_STACKPROTECTOR_H + +#include <linux/sched.h> +#include <asm/current.h> +#include <asm/lowcore.h> + +static __always_inline void boot_init_stack_canary(void) +{ + current->stack_canary = get_random_canary(); + get_lowcore()->stack_canary = current->stack_canary; +} + +#endif /* _ASM_S390_STACKPROTECTOR_H */ diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h index 1d5ca13dc90f..ac3606c3babe 100644 --- a/arch/s390/include/asm/stacktrace.h +++ b/arch/s390/include/asm/stacktrace.h @@ -62,10 +62,11 @@ struct stack_frame { struct { unsigned long sie_control_block; unsigned long sie_savearea; - unsigned long sie_reason; + unsigned long sie_return; unsigned long sie_flags; unsigned long sie_control_block_phys; unsigned long sie_guest_asce; + unsigned long sie_irq; }; }; unsigned long gprs[10]; @@ -199,7 +200,7 @@ static __always_inline unsigned long get_stack_pointer(struct task_struct *task, " lg 15,%[_stack]\n" \ " stg %[_frame],%[_bc](15)\n" \ " brasl 14,%[_fn]\n" \ - " lgr 15,%[_prev]\n" \ + " lgr 15,%[_prev]" \ : [_prev] "=&d" (prev), CALL_FMT_##nr \ : [_stack] "R" (__stack), \ [_bc] "i" (offsetof(struct stack_frame, back_chain)), \ @@ -250,7 +251,7 @@ static __always_inline unsigned long get_stack_pointer(struct task_struct *task, " lra 14,0(1)\n" \ " lpswe %[psw_enter]\n" \ "0: lpswe 0(7)\n" \ - "1:\n" \ + "1:" \ : CALL_FMT_##nr, [psw_leave] "=Q" (psw_leave) \ : [psw_enter] "Q" (psw_enter) \ : "7", CALL_CLOBBER_##nr); \ diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 2ab868cbae6c..238e721e5a22 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -26,11 +26,9 @@ void *memmove(void *dest, const void *src, size_t n); #define __HAVE_ARCH_MEMSCAN /* inline & arch function */ #define __HAVE_ARCH_STRCAT /* inline & arch function */ #define __HAVE_ARCH_STRCMP /* arch function */ -#define __HAVE_ARCH_STRCPY /* inline & arch function */ #define __HAVE_ARCH_STRLCAT /* arch function */ #define __HAVE_ARCH_STRLEN /* inline & arch function */ #define __HAVE_ARCH_STRNCAT /* arch function */ -#define __HAVE_ARCH_STRNCPY /* arch function */ #define __HAVE_ARCH_STRNLEN /* inline & arch function */ #define __HAVE_ARCH_STRSTR /* arch function */ #define __HAVE_ARCH_MEMSET16 /* arch function */ @@ -42,7 +40,6 @@ int memcmp(const void *s1, const void *s2, size_t n); int strcmp(const char *s1, const char *s2); size_t strlcat(char *dest, const char *src, size_t n); char *strncat(char *dest, const char *src, size_t n); -char *strncpy(char *dest, const char *src, size_t n); char *strstr(const char *s1, const char *s2); #endif /* !defined(CONFIG_KASAN) && !defined(CONFIG_KMSAN) */ @@ -128,7 +125,7 @@ static inline void *memscan(void *s, int c, size_t n) asm volatile( " lgr 0,%[c]\n" "0: srst %[ret],%[s]\n" - " jo 0b\n" + " jo 0b" : [ret] "+&a" (ret), [s] "+&a" (s) : [c] "d" (c) : "cc", "memory", "0"); @@ -155,22 +152,6 @@ static inline char *strcat(char *dst, const char *src) } #endif -#ifdef __HAVE_ARCH_STRCPY -static inline char *strcpy(char *dst, const char *src) -{ - char *ret = dst; - - asm volatile( - " lghi 0,0\n" - "0: mvst %[dst],%[src]\n" - " jo 0b" - : [dst] "+&a" (dst), [src] "+&a" (src) - : - : "cc", "memory", "0"); - return ret; -} -#endif - #if defined(__HAVE_ARCH_STRLEN) || (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)) static inline size_t __no_sanitize_prefix_strfunc(strlen)(const char *s) { @@ -208,7 +189,6 @@ static inline size_t strnlen(const char * s, size_t n) void *memchr(const void * s, int c, size_t n); void *memscan(void *s, int c, size_t n); char *strcat(char *dst, const char *src); -char *strcpy(char *dst, const char *src); size_t strlen(const char *s); size_t strnlen(const char * s, size_t n); #endif /* !IN_ARCH_STRING_C */ diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index 27e3d804b311..4271e4169f45 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -15,7 +15,6 @@ #include <asm/ptrace.h> extern const sys_call_ptr_t sys_call_table[]; -extern const sys_call_ptr_t sys_call_table_emu[]; static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) @@ -24,6 +23,18 @@ static inline long syscall_get_nr(struct task_struct *task, (regs->int_code & 0xffff) : -1; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + /* + * Unlike syscall_get_nr(), syscall_set_nr() can be called only when + * the target task is stopped for tracing on entering syscall, so + * there is no need to have the same check syscall_get_nr() has. + */ + regs->int_code = (regs->int_code & ~0xffff) | (nr & 0xffff); +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -34,15 +45,7 @@ static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { unsigned long error = regs->gprs[2]; -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_31BIT)) { - /* - * Sign-extend the value so (int)-EFOO becomes (long)-EFOO - * and will match correctly in comparisons. - */ - error = (long)(int)error; - } -#endif + return IS_ERR_VALUE(error) ? error : 0; } @@ -65,25 +68,24 @@ static inline void syscall_get_arguments(struct task_struct *task, unsigned long *args) { unsigned long mask = -1UL; - unsigned int n = 6; -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_31BIT)) - mask = 0xffffffff; -#endif - while (n-- > 0) - if (n > 0) - args[n] = regs->gprs[2 + n] & mask; + for (int i = 1; i < 6; i++) + args[i] = regs->gprs[2 + i] & mask; args[0] = regs->orig_gpr2 & mask; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + regs->orig_gpr2 = args[0]; + for (int n = 1; n < 6; n++) + regs->gprs[2 + n] = args[n]; +} + static inline int syscall_get_arch(struct task_struct *task) { -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_31BIT)) - return AUDIT_ARCH_S390; -#endif return AUDIT_ARCH_S390X; } @@ -136,7 +138,7 @@ long syscall##nr(unsigned long syscall SYSCALL_PARM_##nr) \ SYSCALL_REGS_##nr; \ \ asm volatile ( \ - " svc 0\n" \ + " svc 0" \ : "=d" (rc) \ : "d" (r1) SYSCALL_FMT_##nr \ : "memory"); \ diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index 35c1d1b860d8..9eb58d5348d8 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -13,101 +13,12 @@ ,, regs->orig_gpr2,, regs->gprs[3],, regs->gprs[4] \ ,, regs->gprs[5],, regs->gprs[6],, regs->gprs[7]) -#ifdef CONFIG_COMPAT - -#define __SC_COMPAT_CAST(t, a) \ -({ \ - long __ReS = a; \ - \ - BUILD_BUG_ON((sizeof(t) > 4) && !__TYPE_IS_L(t) && \ - !__TYPE_IS_UL(t) && !__TYPE_IS_PTR(t) && \ - !__TYPE_IS_LL(t)); \ - if (__TYPE_IS_L(t)) \ - __ReS = (s32)a; \ - if (__TYPE_IS_UL(t)) \ - __ReS = (u32)a; \ - if (__TYPE_IS_PTR(t)) \ - __ReS = a & 0x7fffffff; \ - if (__TYPE_IS_LL(t)) \ - return -ENOSYS; \ - (t)__ReS; \ -}) - -/* - * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias - * named __s390x_sys_*() - */ -#define COMPAT_SYSCALL_DEFINE0(sname) \ - long __s390_compat_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__s390_compat_sys_##sname, ERRNO); \ - long __s390_compat_sys_##sname(void) - -#define SYSCALL_DEFINE0(sname) \ - SYSCALL_METADATA(_##sname, 0); \ - long __s390_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__s390_sys_##sname, ERRNO); \ - long __s390x_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ - static inline long __do_sys_##sname(void); \ - long __s390_sys_##sname(void) \ - { \ - return __do_sys_##sname(); \ - } \ - long __s390x_sys_##sname(void) \ - { \ - return __do_sys_##sname(); \ - } \ - static inline long __do_sys_##sname(void) - -#define COND_SYSCALL(name) \ - cond_syscall(__s390x_sys_##name); \ - cond_syscall(__s390_sys_##name) - -#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ - long __s390_compat_sys##name(struct pt_regs *regs); \ - ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \ - static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ - static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \ - long __s390_compat_sys##name(struct pt_regs *regs) \ - { \ - return __se_compat_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ - } \ - static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ - { \ - __MAP(x, __SC_TEST, __VA_ARGS__); \ - return __do_compat_sys##name(__MAP(x, __SC_DELOUSE, __VA_ARGS__)); \ - } \ - static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)) - -/* - * As some compat syscalls may not be implemented, we need to expand - * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. - */ -#define COND_SYSCALL_COMPAT(name) \ - cond_syscall(__s390_compat_sys_##name) - -#define __S390_SYS_STUBx(x, name, ...) \ - long __s390_sys##name(struct pt_regs *regs); \ - ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ - static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ - long __s390_sys##name(struct pt_regs *regs) \ - { \ - return ___se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ - } \ - static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ - { \ - __MAP(x, __SC_TEST, __VA_ARGS__); \ - return __do_sys##name(__MAP(x, __SC_COMPAT_CAST, __VA_ARGS__)); \ - } - -#else /* CONFIG_COMPAT */ - #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ - long __s390x_sys_##sname(void); \ + long __s390x_sys_##sname(struct pt_regs *__unused); \ ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ static inline long __do_sys_##sname(void); \ - long __s390x_sys_##sname(void) \ + long __s390x_sys_##sname(struct pt_regs *__unused) \ { \ return __do_sys_##sname(); \ } \ @@ -118,8 +29,6 @@ #define __S390_SYS_STUBx(x, fullname, name, ...) -#endif /* CONFIG_COMPAT */ - #define __SYSCALL_DEFINEx(x, name, ...) \ long __s390x_sys##name(struct pt_regs *regs); \ ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index edca5a751df4..9088c5267f35 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -11,8 +11,34 @@ #ifndef __ASM_S390_SYSINFO_H #define __ASM_S390_SYSINFO_H -#include <asm/bitsperlong.h> #include <linux/uuid.h> +#include <asm/bitsperlong.h> +#include <asm/asm.h> + +/* + * stsi - store system information + * + * Returns the current configuration level if function code 0 was specified. + * Otherwise returns 0 on success or a negative value on error. + */ +static inline int stsi(void *sysinfo, int fc, int sel1, int sel2) +{ + int r0 = (fc << 28) | sel1; + int cc; + + asm volatile( + " lr %%r0,%[r0]\n" + " lr %%r1,%[r1]\n" + " stsi %[sysinfo]\n" + " lr %[r0],%%r0\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [r0] "+d" (r0), [sysinfo] "=Q" (*(char *)sysinfo) + : [r1] "d" (sel2) + : CC_CLOBBER_LIST("0", "1", "memory")); + if (cc == 3) + return -EOPNOTSUPP; + return fc ? 0 : (unsigned int)r0 >> 28; +} struct sysinfo_1_1_1 { unsigned char p:1; diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index c33f7144d1b9..6a548a819400 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -9,9 +9,7 @@ #define _ASM_THREAD_INFO_H #include <linux/bits.h> -#ifndef ASM_OFFSETS_C -#include <asm/asm-offsets.h> -#endif +#include <vdso/page.h> /* * General size of kernel stacks @@ -26,9 +24,7 @@ #define STACK_INIT_OFFSET (THREAD_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE) -#ifndef __ASSEMBLY__ -#include <asm/lowcore.h> -#include <asm/page.h> +#ifndef __ASSEMBLER__ /* * low level task data that entry.S needs immediate access to @@ -60,49 +56,29 @@ void arch_setup_new_exec(void); /* * thread information flags bit numbers + * + * Tell the generic TIF infrastructure which special bits s390 supports */ -#define TIF_NOTIFY_RESUME 0 /* callback before returning to user */ -#define TIF_SIGPENDING 1 /* signal pending */ -#define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling needed */ -#define TIF_UPROBE 4 /* breakpointed or single-stepping */ -#define TIF_PATCH_PENDING 5 /* pending live patching update */ -#define TIF_PGSTE 6 /* New mm's will use 4K page tables */ -#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ -#define TIF_GUARDED_STORAGE 8 /* load guarded storage control block */ -#define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ -#define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ -#define TIF_31BIT 16 /* 32bit process */ -#define TIF_MEMDIE 17 /* is terminating due to OOM killer */ -#define TIF_RESTORE_SIGMASK 18 /* restore signal mask in do_signal() */ -#define TIF_SINGLE_STEP 19 /* This task is single stepped */ -#define TIF_BLOCK_STEP 20 /* This task is block stepped */ -#define TIF_UPROBE_SINGLESTEP 21 /* This task is uprobe single stepped */ -#define TIF_SYSCALL_TRACE 24 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 25 /* syscall auditing active */ -#define TIF_SECCOMP 26 /* secure computing */ -#define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include <asm-generic/thread_info_tif.h> + +/* Architecture specific bits */ +#define TIF_ASCE_PRIMARY 16 /* primary asce is kernel asce */ +#define TIF_GUARDED_STORAGE 17 /* load guarded storage control block */ +#define TIF_ISOLATE_BP_GUEST 18 /* Run KVM guests with isolated BP */ +#define TIF_PER_TRAP 19 /* Need to handle PER trap on exit to usermode */ +#define TIF_SINGLE_STEP 21 /* This task is single stepped */ +#define TIF_BLOCK_STEP 22 /* This task is block stepped */ +#define TIF_UPROBE_SINGLESTEP 23 /* This task is uprobe single stepped */ -#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING BIT(TIF_SIGPENDING) -#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) -#define _TIF_UPROBE BIT(TIF_UPROBE) -#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) -#define _TIF_PGSTE BIT(TIF_PGSTE) -#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) +#define _TIF_ASCE_PRIMARY BIT(TIF_ASCE_PRIMARY) #define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) #define _TIF_PER_TRAP BIT(TIF_PER_TRAP) -#define _TIF_31BIT BIT(TIF_31BIT) -#define _TIF_MEMDIE BIT(TIF_MEMDIE) -#define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) #define _TIF_BLOCK_STEP BIT(TIF_BLOCK_STEP) #define _TIF_UPROBE_SINGLESTEP BIT(TIF_UPROBE_SINGLESTEP) -#define _TIF_SYSCALL_TRACE BIT(TIF_SYSCALL_TRACE) -#define _TIF_SYSCALL_AUDIT BIT(TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP BIT(TIF_SECCOMP) -#define _TIF_SYSCALL_TRACEPOINT BIT(TIF_SYSCALL_TRACEPOINT) #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index a9460bd6555b..49447b40f038 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -13,6 +13,7 @@ #include <linux/preempt.h> #include <linux/time64.h> #include <asm/lowcore.h> +#include <asm/machine.h> #include <asm/asm.h> /* The value of the TOD clock for 1.1.1970. */ @@ -80,7 +81,7 @@ static inline void set_tod_programmable_field(u16 val) { asm volatile( " lgr 0,%[val]\n" - " sckpf\n" + " sckpf" : : [val] "d" ((unsigned long)val) : "0"); @@ -195,13 +196,6 @@ static inline unsigned long get_tod_clock_fast(void) asm volatile("stckf %0" : "=Q" (clk) : : "cc"); return clk; } - -static inline cycles_t get_cycles(void) -{ - return (cycles_t) get_tod_clock() >> 2; -} -#define get_cycles get_cycles - int get_phys_clock(unsigned long *clock); void init_cpu_timer(void); @@ -229,6 +223,12 @@ static inline unsigned long get_tod_clock_monotonic(void) return tod; } +static inline cycles_t get_cycles(void) +{ + return (cycles_t)get_tod_clock_monotonic() >> 2; +} +#define get_cycles get_cycles + /** * tod_to_ns - convert a TOD format value to nanoseconds * @todval: to be converted TOD format value @@ -267,7 +267,7 @@ static __always_inline u128 eitod_to_ns(u128 todval) */ static inline int tod_after(unsigned long a, unsigned long b) { - if (MACHINE_HAS_SCC) + if (machine_has_scc()) return (long) a > (long) b; return a > b; } @@ -281,7 +281,7 @@ static inline int tod_after(unsigned long a, unsigned long b) */ static inline int tod_after_eq(unsigned long a, unsigned long b) { - if (MACHINE_HAS_SCC) + if (machine_has_scc()) return (long) a >= (long) b; return a >= b; } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 72655fd2d867..619fd41e710e 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -24,7 +24,7 @@ static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, bool delay_rmap, int page_size); + struct page *page, int page_size); static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); @@ -40,16 +40,14 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, /* * Release the page cache reference for a pte removed by * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page - * has already been freed, so just do free_page_and_swap_cache. + * has already been freed, so just do free_folio_and_swap_cache. * * s390 doesn't delay rmap removal. */ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, bool delay_rmap, int page_size) + struct page *page, int page_size) { - VM_WARN_ON_ONCE(delay_rmap); - - free_page_and_swap_cache(page); + free_folio_and_swap_cache(page_folio(page)); return false; } @@ -84,8 +82,6 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_pmds = 1; - if (mm_alloc_pgste(tlb->mm)) - gmap_unlink(tlb->mm, (unsigned long *)pte, address); tlb_remove_ptdesc(tlb, virt_to_ptdesc(pte)); } diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 9dfd46dd03c6..163ccbbe8c47 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -2,9 +2,11 @@ #ifndef _S390_TLBFLUSH_H #define _S390_TLBFLUSH_H +#include <linux/cpufeature.h> #include <linux/mm.h> #include <linux/sched.h> #include <asm/processor.h> +#include <asm/machine.h> /* * Flush all TLB entries on the local CPU. @@ -22,7 +24,7 @@ static inline void __tlb_flush_idte(unsigned long asce) unsigned long opt; opt = IDTE_PTOA; - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) opt |= IDTE_GUEST_ASCE; /* Global TLB flush for the mm */ asm volatile("idte 0,%1,%0" : : "a" (opt), "a" (asce) : "cc"); @@ -33,9 +35,9 @@ static inline void __tlb_flush_idte(unsigned long asce) */ static inline void __tlb_flush_global(void) { - unsigned int dummy = 0; + unsigned long dummy = 0; - csp(&dummy, 0, 0); + cspg(&dummy, 0, 0); } /* @@ -52,7 +54,7 @@ static inline void __tlb_flush_mm(struct mm_struct *mm) cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); barrier(); gmap_asce = READ_ONCE(mm->context.gmap_asce); - if (MACHINE_HAS_IDTE && gmap_asce != -1UL) { + if (gmap_asce != -1UL) { if (gmap_asce) __tlb_flush_idte(gmap_asce); __tlb_flush_idte(mm->context.asce); @@ -66,10 +68,7 @@ static inline void __tlb_flush_mm(struct mm_struct *mm) static inline void __tlb_flush_kernel(void) { - if (MACHINE_HAS_IDTE) - __tlb_flush_idte(init_mm.context.asce); - else - __tlb_flush_global(); + __tlb_flush_idte(init_mm.context.asce); } static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) @@ -84,7 +83,6 @@ static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) /* * TLB flushing: - * flush_tlb() - flushes the current mm struct TLBs * flush_tlb_all() - flushes all processes TLBs * flush_tlb_mm(mm) - flushes the specified mm context TLB's * flush_tlb_page(vma, vmaddr) - flushes one page @@ -100,7 +98,6 @@ static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) * only one user. At the end of the update the flush_tlb_mm and * flush_tlb_range functions need to do the flush. */ -#define flush_tlb() do { } while (0) #define flush_tlb_all() do { } while (0) #define flush_tlb_page(vma, addr) do { } while (0) diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index cef06bffad80..44110847342a 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -61,6 +61,12 @@ static inline void topology_expect_change(void) { } #endif /* CONFIG_SCHED_TOPOLOGY */ +static inline bool topology_is_primary_thread(unsigned int cpu) +{ + return smp_get_base_cpu(cpu) == cpu; +} +#define topology_is_primary_thread topology_is_primary_thread + #define POLARIZATION_UNKNOWN (-1) #define POLARIZATION_HRZ (0) #define POLARIZATION_VL (1) diff --git a/arch/s390/include/asm/tpi.h b/arch/s390/include/asm/tpi.h index f76e5fdff23a..71c8b6f76cdd 100644 --- a/arch/s390/include/asm/tpi.h +++ b/arch/s390/include/asm/tpi.h @@ -5,7 +5,7 @@ #include <linux/types.h> #include <uapi/asm/schid.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* I/O-Interruption Code as stored by TEST PENDING INTERRUPTION (TPI). */ struct tpi_info { @@ -32,6 +32,6 @@ struct tpi_adapter_info { u32 :27; } __packed __aligned(4); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_S390_TPI_H */ diff --git a/arch/s390/include/asm/trace/ap.h b/arch/s390/include/asm/trace/ap.h new file mode 100644 index 000000000000..5c2e6c664b4d --- /dev/null +++ b/arch/s390/include/asm/trace/ap.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Tracepoint definitions for s390 ap bus related trace events + * + * There are two AP bus related tracepoint events defined here: + * There is a tracepoint s390_ap_nqap event immediately after a request + * has been pushed into the AP firmware queue with the NQAP AP command. + * The other tracepoint s390_ap_dqap event fires immediately after a + * reply has been pulled out of the AP firmware queue via DQAP AP command. + * The idea of these two trace events focuses on performance to measure + * the runtime of a crypto request/reply as close as possible at the + * firmware level. In combination with the two zcrypt tracepoints (see the + * zcrypt.h trace event definition file) this gives measurement data about + * the runtime of a request/reply within the zcrpyt and AP bus layer. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM s390 + +#if !defined(_TRACE_S390_AP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_S390_AP_H + +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(s390_ap_nqapdqap_template, + TP_PROTO(u16 card, u16 dom, u32 status, u64 psmid), + TP_ARGS(card, dom, status, psmid), + TP_STRUCT__entry( + __field(u16, card) + __field(u16, dom) + __field(u32, status) + __field(u64, psmid)), + TP_fast_assign( + __entry->card = card; + __entry->dom = dom; + __entry->status = status; + __entry->psmid = psmid;), + TP_printk("card=%u dom=%u status=0x%08x psmid=0x%016lx", + (unsigned short)__entry->card, + (unsigned short)__entry->dom, + (unsigned int)__entry->status, + (unsigned long)__entry->psmid) +); + +/** + * trace_s390_ap_nqap - ap msg nqap tracepoint function + * @card: Crypto card number addressed. + * @dom: Domain within the crypto card addressed. + * @status: AP queue status (GR1 on return of nqap). + * @psmid: Unique id identifying this request/reply. + * + * Called immediately after a request has been enqueued into + * the AP firmware queue with the NQAP command. + */ +DEFINE_EVENT(s390_ap_nqapdqap_template, + s390_ap_nqap, + TP_PROTO(u16 card, u16 dom, u32 status, u64 psmid), + TP_ARGS(card, dom, status, psmid) +); + +/** + * trace_s390_ap_dqap - ap msg dqap tracepoint function + * @card: Crypto card number addressed. + * @dom: Domain within the crypto card addressed. + * @status: AP queue status (GR1 on return of dqap). + * @psmid: Unique id identifying this request/reply. + * + * Called immediately after a reply has been dequeued from + * the AP firmware queue with the DQAP command. + */ +DEFINE_EVENT(s390_ap_nqapdqap_template, + s390_ap_dqap, + TP_PROTO(u16 card, u16 dom, u32 status, u64 psmid), + TP_ARGS(card, dom, status, psmid) +); + +#endif /* _TRACE_S390_AP_H */ + +/* This part must be outside protection */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE + +#define TRACE_INCLUDE_PATH asm/trace +#define TRACE_INCLUDE_FILE ap + +#include <trace/define_trace.h> diff --git a/arch/s390/include/asm/trace/zcrypt.h b/arch/s390/include/asm/trace/zcrypt.h index 457ddaa99e19..bfb01e335f31 100644 --- a/arch/s390/include/asm/trace/zcrypt.h +++ b/arch/s390/include/asm/trace/zcrypt.h @@ -2,7 +2,7 @@ /* * Tracepoint definitions for the s390 zcrypt device driver * - * Copyright IBM Corp. 2016 + * Copyright IBM Corp. 2016,2025 * Author(s): Harald Freudenberger <freude@de.ibm.com> * * Currently there are two tracepoint events defined here. @@ -73,14 +73,15 @@ TRACE_EVENT(s390_zcrypt_req, /** * trace_s390_zcrypt_rep - zcrypt reply tracepoint function - * @ptr: Address of the local buffer where the request from userspace - * is stored. Can be used as a unique id to match together - * request and reply. - * @fc: Function code. - * @rc: The bare returncode as returned by the device driver ioctl - * function. - * @dev: The adapter nr where this request was actually processed. - * @dom: Domain id of the device where this request was processed. + * @ptr: Address of the local buffer where the request from userspace + * is stored. Can be used as a unique id to match together + * request and reply. + * @fc: Function code. + * @rc: The bare returncode as returned by the device driver ioctl + * function. + * @card: The adapter nr where this request was actually processed. + * @dom: Domain id of the device where this request was processed. + * @psmid: Unique id identifying this request/reply. * * Called upon recognising the reply from the crypto adapter. This * message may act as the exit timestamp for the request but also @@ -88,26 +89,29 @@ TRACE_EVENT(s390_zcrypt_req, * and the returncode from the device driver. */ TRACE_EVENT(s390_zcrypt_rep, - TP_PROTO(void *ptr, u32 fc, u32 rc, u16 dev, u16 dom), - TP_ARGS(ptr, fc, rc, dev, dom), + TP_PROTO(void *ptr, u32 fc, u32 rc, u16 card, u16 dom, u64 psmid), + TP_ARGS(ptr, fc, rc, card, dom, psmid), TP_STRUCT__entry( __field(void *, ptr) __field(u32, fc) __field(u32, rc) - __field(u16, device) - __field(u16, domain)), + __field(u16, card) + __field(u16, dom) + __field(u64, psmid)), TP_fast_assign( __entry->ptr = ptr; __entry->fc = fc; __entry->rc = rc; - __entry->device = dev; - __entry->domain = dom;), - TP_printk("ptr=%p fc=0x%04x rc=%d dev=0x%02hx domain=0x%04hx", + __entry->card = card; + __entry->dom = dom; + __entry->psmid = psmid;), + TP_printk("ptr=%p fc=0x%04x rc=%d card=%u dom=%u psmid=0x%016lx", __entry->ptr, - (unsigned int) __entry->fc, - (int) __entry->rc, - (unsigned short) __entry->device, - (unsigned short) __entry->domain) + (unsigned int)__entry->fc, + (int)__entry->rc, + (unsigned short)__entry->card, + (unsigned short)__entry->dom, + (unsigned long)__entry->psmid) ); #endif /* _TRACE_S390_ZCRYPT_H */ diff --git a/arch/s390/include/asm/types.h b/arch/s390/include/asm/types.h index 0b5d550a0478..53695b2196f7 100644 --- a/arch/s390/include/asm/types.h +++ b/arch/s390/include/asm/types.h @@ -5,7 +5,7 @@ #include <uapi/asm/types.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ union register_pair { unsigned __int128 pair; @@ -15,5 +15,5 @@ union register_pair { }; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_S390_TYPES_H */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index f5920163ee97..dff035372601 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -13,125 +13,80 @@ /* * User space memory access functions */ +#include <linux/pgtable.h> #include <asm/asm-extable.h> #include <asm/processor.h> #include <asm/extable.h> #include <asm/facility.h> #include <asm-generic/access_ok.h> +#include <asm/asce.h> #include <linux/instrumented.h> void debug_user_asce(int exit); -union oac { - unsigned int val; - struct { - struct { - unsigned short key : 4; - unsigned short : 4; - unsigned short as : 2; - unsigned short : 4; - unsigned short k : 1; - unsigned short a : 1; - } oac1; - struct { - unsigned short key : 4; - unsigned short : 4; - unsigned short as : 2; - unsigned short : 4; - unsigned short k : 1; - unsigned short a : 1; - } oac2; - }; -}; - -static __always_inline __must_check unsigned long -raw_copy_from_user_key(void *to, const void __user *from, unsigned long size, unsigned long key) -{ - unsigned long rem; - union oac spec = { - .oac2.key = key, - .oac2.as = PSW_BITS_AS_SECONDARY, - .oac2.k = 1, - .oac2.a = 1, - }; - - asm_inline volatile( - " lr %%r0,%[spec]\n" - "0: mvcos 0(%[to]),0(%[from]),%[size]\n" - "1: jz 5f\n" - " algr %[size],%[val]\n" - " slgr %[from],%[val]\n" - " slgr %[to],%[val]\n" - " j 0b\n" - "2: la %[rem],4095(%[from])\n" /* rem = from + 4095 */ - " nr %[rem],%[val]\n" /* rem = (from + 4095) & -4096 */ - " slgr %[rem],%[from]\n" - " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ - " jnh 6f\n" - "3: mvcos 0(%[to]),0(%[from]),%[rem]\n" - "4: slgr %[size],%[rem]\n" - " j 6f\n" - "5: lghi %[size],0\n" - "6:\n" - EX_TABLE(0b, 2b) - EX_TABLE(1b, 2b) - EX_TABLE(3b, 6b) - EX_TABLE(4b, 6b) - : [size] "+&a" (size), [from] "+&a" (from), [to] "+&a" (to), [rem] "=&a" (rem) - : [val] "a" (-4096UL), [spec] "d" (spec.val) - : "cc", "memory", "0"); - return size; -} +#ifdef CONFIG_KMSAN +#define uaccess_kmsan_or_inline noinline __maybe_unused __no_sanitize_memory +#else +#define uaccess_kmsan_or_inline __always_inline +#endif -static __always_inline __must_check unsigned long -raw_copy_from_user(void *to, const void __user *from, unsigned long n) -{ - return raw_copy_from_user_key(to, from, n, 0); -} +#define INLINE_COPY_FROM_USER +#define INLINE_COPY_TO_USER -static __always_inline __must_check unsigned long -raw_copy_to_user_key(void __user *to, const void *from, unsigned long size, unsigned long key) +static uaccess_kmsan_or_inline __must_check unsigned long +raw_copy_from_user(void *to, const void __user *from, unsigned long size) { - unsigned long rem; - union oac spec = { - .oac1.key = key, - .oac1.as = PSW_BITS_AS_SECONDARY, - .oac1.k = 1, - .oac1.a = 1, - }; - - asm_inline volatile( - " lr %%r0,%[spec]\n" - "0: mvcos 0(%[to]),0(%[from]),%[size]\n" - "1: jz 5f\n" - " algr %[size],%[val]\n" - " slgr %[to],%[val]\n" - " slgr %[from],%[val]\n" - " j 0b\n" - "2: la %[rem],4095(%[to])\n" /* rem = to + 4095 */ - " nr %[rem],%[val]\n" /* rem = (to + 4095) & -4096 */ - " slgr %[rem],%[to]\n" - " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ - " jnh 6f\n" - "3: mvcos 0(%[to]),0(%[from]),%[rem]\n" - "4: slgr %[size],%[rem]\n" - " j 6f\n" - "5: lghi %[size],0\n" - "6:\n" - EX_TABLE(0b, 2b) - EX_TABLE(1b, 2b) - EX_TABLE(3b, 6b) - EX_TABLE(4b, 6b) - : [size] "+&a" (size), [to] "+&a" (to), [from] "+&a" (from), [rem] "=&a" (rem) - : [val] "a" (-4096UL), [spec] "d" (spec.val) - : "cc", "memory", "0"); - return size; + unsigned long osize; + int cc; + + while (1) { + osize = size; + asm_inline volatile( + " lhi %%r0,%[spec]\n" + "0: mvcos %[to],%[from],%[size]\n" + "1: nopr %%r7\n" + CC_IPM(cc) + EX_TABLE_UA_MVCOS_FROM(0b, 0b) + EX_TABLE_UA_MVCOS_FROM(1b, 0b) + : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char *)to) + : [spec] "I" (0x81), [from] "Q" (*(const char __user *)from) + : CC_CLOBBER_LIST("memory", "0")); + if (__builtin_constant_p(osize) && osize <= 4096) + return osize - size; + if (likely(CC_TRANSFORM(cc) == 0)) + return osize - size; + size -= 4096; + to += 4096; + from += 4096; + } } -static __always_inline __must_check unsigned long -raw_copy_to_user(void __user *to, const void *from, unsigned long n) +static uaccess_kmsan_or_inline __must_check unsigned long +raw_copy_to_user(void __user *to, const void *from, unsigned long size) { - return raw_copy_to_user_key(to, from, n, 0); + unsigned long osize; + int cc; + + while (1) { + osize = size; + asm_inline volatile( + " llilh %%r0,%[spec]\n" + "0: mvcos %[to],%[from],%[size]\n" + "1: nopr %%r7\n" + CC_IPM(cc) + EX_TABLE_UA_MVCOS_TO(0b, 0b) + EX_TABLE_UA_MVCOS_TO(1b, 0b) + : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char __user *)to) + : [spec] "I" (0x81), [from] "Q" (*(const char *)from) + : CC_CLOBBER_LIST("memory", "0")); + if (__builtin_constant_p(osize) && osize <= 4096) + return osize - size; + if (likely(CC_TRANSFORM(cc) == 0)) + return osize - size; + size -= 4096; + to += 4096; + from += 4096; + } } unsigned long __must_check @@ -158,12 +113,6 @@ copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned lo int __noreturn __put_user_bad(void); -#ifdef CONFIG_KMSAN -#define uaccess_kmsan_or_inline noinline __maybe_unused __no_sanitize_memory -#else -#define uaccess_kmsan_or_inline __always_inline -#endif - #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define DEFINE_PUT_USER_NOINSTR(type) \ @@ -199,7 +148,7 @@ __put_user_##type##_noinstr(unsigned type __user *to, \ { \ int rc; \ \ - asm volatile( \ + asm_inline volatile( \ " llilh %%r0,%[spec]\n" \ "0: mvcos %[to],%[from],%[size]\n" \ "1: lhi %[rc],0\n" \ @@ -315,7 +264,7 @@ __get_user_##type##_noinstr(unsigned type *to, \ { \ int rc; \ \ - asm volatile( \ + asm_inline volatile( \ " lhi %%r0,%[spec]\n" \ "0: mvcos %[to],%[from],%[size]\n" \ "1: lhi %[rc],0\n" \ @@ -415,12 +364,34 @@ long __must_check strncpy_from_user(char *dst, const char __user *src, long coun long __must_check strnlen_user(const char __user *src, long count); -/* - * Zero Userspace - */ -unsigned long __must_check __clear_user(void __user *to, unsigned long size); +static uaccess_kmsan_or_inline __must_check unsigned long +__clear_user(void __user *to, unsigned long size) +{ + unsigned long osize; + int cc; + + while (1) { + osize = size; + asm_inline volatile( + " llilh %%r0,%[spec]\n" + "0: mvcos %[to],%[from],%[size]\n" + "1: nopr %%r7\n" + CC_IPM(cc) + EX_TABLE_UA_MVCOS_TO(0b, 0b) + EX_TABLE_UA_MVCOS_TO(1b, 0b) + : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char __user *)to) + : [spec] "I" (0x81), [from] "Q" (*(const char *)empty_zero_page) + : CC_CLOBBER_LIST("memory", "0")); + if (__builtin_constant_p(osize) && osize <= 4096) + return osize - size; + if (CC_TRANSFORM(cc) == 0) + return osize - size; + size -= 4096; + to += 4096; + } +} -static inline unsigned long __must_check clear_user(void __user *to, unsigned long n) +static __always_inline unsigned long __must_check clear_user(void __user *to, unsigned long n) { might_fault(); return __clear_user(to, n); @@ -497,215 +468,18 @@ do { \ #endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT && CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ -#define __get_kernel_nofault __mvc_kernel_nofault -#define __put_kernel_nofault __mvc_kernel_nofault - -void __cmpxchg_user_key_called_with_bad_pointer(void); - -#define CMPXCHG_USER_KEY_MAX_LOOPS 128 - -static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval, - __uint128_t old, __uint128_t new, - unsigned long key, int size) -{ - int rc = 0; - - switch (size) { - case 1: { - unsigned int prev, shift, mask, _old, _new; - unsigned long count; - - shift = (3 ^ (address & 3)) << 3; - address ^= address & 3; - _old = ((unsigned int)old & 0xff) << shift; - _new = ((unsigned int)new & 0xff) << shift; - mask = ~(0xff << shift); - asm volatile( - " spka 0(%[key])\n" - " sacf 256\n" - " llill %[count],%[max_loops]\n" - "0: l %[prev],%[address]\n" - "1: nr %[prev],%[mask]\n" - " xilf %[mask],0xffffffff\n" - " or %[new],%[prev]\n" - " or %[prev],%[tmp]\n" - "2: lr %[tmp],%[prev]\n" - "3: cs %[prev],%[new],%[address]\n" - "4: jnl 5f\n" - " xr %[tmp],%[prev]\n" - " xr %[new],%[tmp]\n" - " nr %[tmp],%[mask]\n" - " jnz 5f\n" - " brct %[count],2b\n" - "5: sacf 768\n" - " spka %[default_key]\n" - EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) - EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) - EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) - EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) - : [rc] "+&d" (rc), - [prev] "=&d" (prev), - [address] "+Q" (*(int *)address), - [tmp] "+&d" (_old), - [new] "+&d" (_new), - [mask] "+&d" (mask), - [count] "=a" (count) - : [key] "%[count]" (key << 4), - [default_key] "J" (PAGE_DEFAULT_KEY), - [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) - : "memory", "cc"); - *(unsigned char *)uval = prev >> shift; - if (!count) - rc = -EAGAIN; - return rc; - } - case 2: { - unsigned int prev, shift, mask, _old, _new; - unsigned long count; - - shift = (2 ^ (address & 2)) << 3; - address ^= address & 2; - _old = ((unsigned int)old & 0xffff) << shift; - _new = ((unsigned int)new & 0xffff) << shift; - mask = ~(0xffff << shift); - asm volatile( - " spka 0(%[key])\n" - " sacf 256\n" - " llill %[count],%[max_loops]\n" - "0: l %[prev],%[address]\n" - "1: nr %[prev],%[mask]\n" - " xilf %[mask],0xffffffff\n" - " or %[new],%[prev]\n" - " or %[prev],%[tmp]\n" - "2: lr %[tmp],%[prev]\n" - "3: cs %[prev],%[new],%[address]\n" - "4: jnl 5f\n" - " xr %[tmp],%[prev]\n" - " xr %[new],%[tmp]\n" - " nr %[tmp],%[mask]\n" - " jnz 5f\n" - " brct %[count],2b\n" - "5: sacf 768\n" - " spka %[default_key]\n" - EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) - EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) - EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) - EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) - : [rc] "+&d" (rc), - [prev] "=&d" (prev), - [address] "+Q" (*(int *)address), - [tmp] "+&d" (_old), - [new] "+&d" (_new), - [mask] "+&d" (mask), - [count] "=a" (count) - : [key] "%[count]" (key << 4), - [default_key] "J" (PAGE_DEFAULT_KEY), - [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) - : "memory", "cc"); - *(unsigned short *)uval = prev >> shift; - if (!count) - rc = -EAGAIN; - return rc; - } - case 4: { - unsigned int prev = old; - - asm volatile( - " spka 0(%[key])\n" - " sacf 256\n" - "0: cs %[prev],%[new],%[address]\n" - "1: sacf 768\n" - " spka %[default_key]\n" - EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) - EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) - : [rc] "+&d" (rc), - [prev] "+&d" (prev), - [address] "+Q" (*(int *)address) - : [new] "d" ((unsigned int)new), - [key] "a" (key << 4), - [default_key] "J" (PAGE_DEFAULT_KEY) - : "memory", "cc"); - *(unsigned int *)uval = prev; - return rc; - } - case 8: { - unsigned long prev = old; - - asm volatile( - " spka 0(%[key])\n" - " sacf 256\n" - "0: csg %[prev],%[new],%[address]\n" - "1: sacf 768\n" - " spka %[default_key]\n" - EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) - EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) - : [rc] "+&d" (rc), - [prev] "+&d" (prev), - [address] "+QS" (*(long *)address) - : [new] "d" ((unsigned long)new), - [key] "a" (key << 4), - [default_key] "J" (PAGE_DEFAULT_KEY) - : "memory", "cc"); - *(unsigned long *)uval = prev; - return rc; - } - case 16: { - __uint128_t prev = old; - - asm volatile( - " spka 0(%[key])\n" - " sacf 256\n" - "0: cdsg %[prev],%[new],%[address]\n" - "1: sacf 768\n" - " spka %[default_key]\n" - EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev]) - EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev]) - : [rc] "+&d" (rc), - [prev] "+&d" (prev), - [address] "+QS" (*(__int128_t *)address) - : [new] "d" (new), - [key] "a" (key << 4), - [default_key] "J" (PAGE_DEFAULT_KEY) - : "memory", "cc"); - *(__uint128_t *)uval = prev; - return rc; - } - } - __cmpxchg_user_key_called_with_bad_pointer(); - return rc; -} - -/** - * cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys - * @ptr: User space address of value to compare to @old and exchange with - * @new. Must be aligned to sizeof(*@ptr). - * @uval: Address where the old value of *@ptr is written to. - * @old: Old value. Compared to the content pointed to by @ptr in order to - * determine if the exchange occurs. The old value read from *@ptr is - * written to *@uval. - * @new: New value to place at *@ptr. - * @key: Access key to use for checking storage key protection. - * - * Perform a cmpxchg on a user space target, honoring storage key protection. - * @key alone determines how key checking is performed, neither - * storage-protection-override nor fetch-protection-override apply. - * The caller must compare *@uval and @old to determine if values have been - * exchanged. In case of an exception *@uval is set to zero. - * - * Return: 0: cmpxchg executed - * -EFAULT: an exception happened when trying to access *@ptr - * -EAGAIN: maxed out number of retries (byte and short only) - */ -#define cmpxchg_user_key(ptr, uval, old, new, key) \ -({ \ - __typeof__(ptr) __ptr = (ptr); \ - __typeof__(uval) __uval = (uval); \ - \ - BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \ - might_fault(); \ - __chk_user_ptr(__ptr); \ - __cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \ - (old), (new), (key), sizeof(*(__ptr))); \ -}) +#define arch_get_kernel_nofault __mvc_kernel_nofault +#define arch_put_kernel_nofault __mvc_kernel_nofault + +int __cmpxchg_key1(void *address, unsigned char *uval, unsigned char old, + unsigned char new, unsigned long key); +int __cmpxchg_key2(void *address, unsigned short *uval, unsigned short old, + unsigned short new, unsigned long key); +int __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old, + unsigned int new, unsigned long key); +int __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old, + unsigned long new, unsigned long key); +int __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old, + __uint128_t new, unsigned long key); #endif /* __S390_UACCESS_H */ diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index 70fc671397da..921c3fb3586b 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -8,7 +8,8 @@ #define _ASM_S390_UNISTD_H_ #include <uapi/asm/unistd.h> -#include <asm/unistd_nr.h> + +#define NR_syscalls (__NR_syscalls) #define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_OLD_READDIR @@ -27,11 +28,6 @@ #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK -# ifdef CONFIG_COMPAT -# define __ARCH_WANT_COMPAT_STAT -# define __ARCH_WANT_SYS_TIME32 -# define __ARCH_WANT_SYS_UTIME32 -# endif #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index b11f5b6d0bd1..d919e69662f5 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -16,7 +16,6 @@ #include <linux/bug.h> #include <linux/sched.h> #include <asm/page.h> -#include <asm/gmap.h> #include <asm/asm.h> #define UVC_CC_OK 0 @@ -616,8 +615,9 @@ static inline int uv_remove_shared(unsigned long addr) return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS); } -int uv_get_secret_metadata(const u8 secret_id[UV_SECRET_ID_LEN], - struct uv_secret_list_item_hdr *secret); +int uv_find_secret(const u8 secret_id[UV_SECRET_ID_LEN], + struct uv_secret_list *list, + struct uv_secret_list_item_hdr *secret); int uv_retrieve_secret(u16 secret_idx, u8 *buf, size_t buf_size); extern int prot_virt_host; @@ -631,7 +631,8 @@ int uv_pin_shared(unsigned long paddr); int uv_destroy_folio(struct folio *folio); int uv_destroy_pte(pte_t pte); int uv_convert_from_secure_pte(pte_t pte); -int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb); +int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio); +int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb); int uv_convert_from_secure(unsigned long paddr); int uv_convert_from_secure_folio(struct folio *folio); diff --git a/arch/s390/include/asm/vdso-symbols.h b/arch/s390/include/asm/vdso-symbols.h index 0df17574d788..e3561e67c4e3 100644 --- a/arch/s390/include/asm/vdso-symbols.h +++ b/arch/s390/include/asm/vdso-symbols.h @@ -2,16 +2,8 @@ #ifndef __S390_VDSO_SYMBOLS_H__ #define __S390_VDSO_SYMBOLS_H__ -#include <generated/vdso64-offsets.h> -#ifdef CONFIG_COMPAT -#include <generated/vdso32-offsets.h> -#endif +#include <generated/vdso-offsets.h> -#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) -#ifdef CONFIG_COMPAT -#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name)) -#else -#define VDSO32_SYMBOL(tsk, name) (-1UL) -#endif +#define VDSO_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso_offset_##name)) #endif /* __S390_VDSO_SYMBOLS_H__ */ diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index 92c73e4d97a9..8e2fffa0ca68 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -4,15 +4,13 @@ #include <vdso/datapage.h> -#ifndef __ASSEMBLY__ - -extern struct vdso_data *vdso_data; +#ifndef __ASSEMBLER__ int vdso_getcpu_init(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ -#define __VVAR_PAGES 2 +#define __VDSO_PAGES 4 #define VDSO_VERSION_STRING LINUX_2.6.29 diff --git a/arch/s390/include/asm/vdso/getrandom.h b/arch/s390/include/asm/vdso/getrandom.h index 36355af7160b..6741a27199f8 100644 --- a/arch/s390/include/asm/vdso/getrandom.h +++ b/arch/s390/include/asm/vdso/getrandom.h @@ -3,7 +3,7 @@ #ifndef __ASM_VDSO_GETRANDOM_H #define __ASM_VDSO_GETRANDOM_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <vdso/datapage.h> #include <asm/vdso/vsyscall.h> @@ -23,18 +23,6 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsig return syscall3(__NR_getrandom, (long)buffer, (long)len, (long)flags); } -static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) -{ - /* - * The RNG data is in the real VVAR data page, but if a task belongs to a time namespace - * then VVAR_DATA_PAGE_OFFSET points to the namespace-specific VVAR page and VVAR_TIMENS_ - * PAGE_OFFSET points to the real VVAR page. - */ - if (IS_ENABLED(CONFIG_TIME_NS) && _vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) - return (void *)&_vdso_rng_data + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE; - return &_vdso_rng_data; -} - -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h index 7937765ccfa5..c31ac5f61c83 100644 --- a/arch/s390/include/asm/vdso/gettimeofday.h +++ b/arch/s390/include/asm/vdso/gettimeofday.h @@ -14,20 +14,9 @@ #include <linux/compiler.h> -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) +static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_time_data *vd) { - return _vdso_data; -} - -static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd) -{ - u64 adj, now; - - now = get_tod_clock(); - adj = vd->arch_data.tod_steering_end - now; - if (unlikely((s64) adj > 0)) - now += (vd->arch_data.tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15); - return now; + return get_tod_clock() - vd->arch_data.tod_delta; } static __always_inline @@ -49,12 +38,4 @@ long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts) return syscall2(__NR_clock_getres, (long)clkid, (long)ts); } -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return _timens_data; -} -#endif - #endif diff --git a/arch/s390/include/asm/vdso/time_data.h b/arch/s390/include/asm/vdso/time_data.h index 8a08752422e6..25c4e0d9f596 100644 --- a/arch/s390/include/asm/vdso/time_data.h +++ b/arch/s390/include/asm/vdso/time_data.h @@ -5,8 +5,7 @@ #include <linux/types.h> struct arch_vdso_time_data { - __s64 tod_steering_delta; - __u64 tod_steering_end; + __s64 tod_delta; }; #endif /* __S390_ASM_VDSO_TIME_DATA_H */ diff --git a/arch/s390/include/asm/vdso/vsyscall.h b/arch/s390/include/asm/vdso/vsyscall.h index 3eb576ecd3bd..b00acec8ddbc 100644 --- a/arch/s390/include/asm/vdso/vsyscall.h +++ b/arch/s390/include/asm/vdso/vsyscall.h @@ -2,35 +2,15 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H -#define __VDSO_RND_DATA_OFFSET 768 - -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/hrtimer.h> #include <vdso/datapage.h> #include <asm/vdso.h> -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES -}; - -static __always_inline struct vdso_data *__s390_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __s390_get_k_vdso_data - -static __always_inline struct vdso_rng_data *__s390_get_k_vdso_rnd_data(void) -{ - return (void *)vdso_data + __VDSO_RND_DATA_OFFSET; -} -#define __arch_get_k_vdso_rng_data __s390_get_k_vdso_rnd_data - /* The asm-generic header needs to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h index 9d25fb35a042..b1db75d14e9d 100644 --- a/arch/s390/include/asm/vtime.h +++ b/arch/s390/include/asm/vtime.h @@ -2,6 +2,12 @@ #ifndef _S390_VTIME_H #define _S390_VTIME_H +#include <asm/lowcore.h> +#include <asm/cpu_mf.h> +#include <asm/idle.h> + +DECLARE_PER_CPU(u64, mt_cycles[8]); + static inline void update_timer_sys(void) { struct lowcore *lc = get_lowcore(); @@ -20,4 +26,32 @@ static inline void update_timer_mcck(void) lc->last_update_timer = lc->mcck_enter_timer; } +static inline void update_timer_idle(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + struct lowcore *lc = get_lowcore(); + u64 cycles_new[8]; + int i, mtid; + + mtid = smp_cpu_mtid; + if (mtid) { + stcctm(MT_DIAG, mtid, cycles_new); + for (i = 0; i < mtid; i++) + __this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); + } + /* + * This is a bit subtle: Forward last_update_clock so it excludes idle + * time. For correct steal time calculation in do_account_vtime() add + * passed wall time before idle_enter to steal_timer: + * During the passed wall time before idle_enter CPU time may have + * been accounted to system, hardirq, softirq, etc. lowcore fields. + * The accounted CPU times will be subtracted again from steal_timer + * when accumulated steal time is calculated in do_account_vtime(). + */ + lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; + lc->last_update_clock = lc->int_clock; + lc->system_timer += lc->last_update_timer - idle->timer_idle_enter; + lc->last_update_timer = lc->sys_enter_timer; +} + #endif /* _S390_VTIME_H */ diff --git a/arch/s390/include/asm/word-at-a-time.h b/arch/s390/include/asm/word-at-a-time.h index 203acd6e431b..eaa19dee7699 100644 --- a/arch/s390/include/asm/word-at-a-time.h +++ b/arch/s390/include/asm/word-at-a-time.h @@ -52,7 +52,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr) { unsigned long data; - asm volatile( + asm_inline volatile( "0: lg %[data],0(%[addr])\n" "1: nopr %%r7\n" EX_TABLE_ZEROPAD(0b, 1b, %[data], %[addr]) diff --git a/arch/s390/include/asm/xor.h b/arch/s390/include/asm/xor.h deleted file mode 100644 index 857d6759b67f..000000000000 --- a/arch/s390/include/asm/xor.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Optimited xor routines - * - * Copyright IBM Corp. 2016 - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - */ -#ifndef _ASM_S390_XOR_H -#define _ASM_S390_XOR_H - -extern struct xor_block_template xor_block_xc; - -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ -do { \ - xor_speed(&xor_block_xc); \ -} while (0) - -#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_xc) - -#endif /* _ASM_S390_XOR_H */ diff --git a/arch/s390/include/uapi/asm/bitsperlong.h b/arch/s390/include/uapi/asm/bitsperlong.h index cceaf47b021a..7af27a985f25 100644 --- a/arch/s390/include/uapi/asm/bitsperlong.h +++ b/arch/s390/include/uapi/asm/bitsperlong.h @@ -2,11 +2,7 @@ #ifndef __ASM_S390_BITSPERLONG_H #define __ASM_S390_BITSPERLONG_H -#ifndef __s390x__ -#define __BITS_PER_LONG 32 -#else #define __BITS_PER_LONG 64 -#endif #include <asm-generic/bitsperlong.h> diff --git a/arch/s390/include/uapi/asm/ipcbuf.h b/arch/s390/include/uapi/asm/ipcbuf.h index 1030cd186899..9277e76d6d72 100644 --- a/arch/s390/include/uapi/asm/ipcbuf.h +++ b/arch/s390/include/uapi/asm/ipcbuf.h @@ -24,9 +24,6 @@ struct ipc64_perm __kernel_mode_t mode; unsigned short __pad1; unsigned short seq; -#ifndef __s390x__ - unsigned short __pad2; -#endif /* ! __s390x__ */ unsigned long __unused1; unsigned long __unused2; }; diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h index 2cd28af50dd4..3d64a2251699 100644 --- a/arch/s390/include/uapi/asm/ipl.h +++ b/arch/s390/include/uapi/asm/ipl.h @@ -15,6 +15,7 @@ struct ipl_pl_hdr { #define IPL_PL_FLAG_IPLPS 0x80 #define IPL_PL_FLAG_SIPL 0x40 #define IPL_PL_FLAG_IPLSR 0x20 +#define IPL_PL_FLAG_SBP 0x10 /* IPL Parameter Block header */ struct ipl_pb_hdr { diff --git a/arch/s390/include/uapi/asm/posix_types.h b/arch/s390/include/uapi/asm/posix_types.h index 1913613e71b6..ad5ab940d192 100644 --- a/arch/s390/include/uapi/asm/posix_types.h +++ b/arch/s390/include/uapi/asm/posix_types.h @@ -26,17 +26,6 @@ typedef unsigned short __kernel_old_gid_t; #define __kernel_old_uid_t __kernel_old_uid_t #endif -#ifndef __s390x__ - -typedef unsigned long __kernel_ino_t; -typedef unsigned short __kernel_mode_t; -typedef unsigned short __kernel_ipc_pid_t; -typedef unsigned short __kernel_uid_t; -typedef unsigned short __kernel_gid_t; -typedef int __kernel_ptrdiff_t; - -#else /* __s390x__ */ - typedef unsigned int __kernel_ino_t; typedef unsigned int __kernel_mode_t; typedef int __kernel_ipc_pid_t; @@ -45,8 +34,6 @@ typedef unsigned int __kernel_gid_t; typedef long __kernel_ptrdiff_t; typedef unsigned long __kernel_sigset_t; /* at least 32 bits */ -#endif /* __s390x__ */ - #define __kernel_ino_t __kernel_ino_t #define __kernel_mode_t __kernel_mode_t #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/s390/include/uapi/asm/ptrace.h b/arch/s390/include/uapi/asm/ptrace.h index bb0826024bb9..ea29ba470e5a 100644 --- a/arch/s390/include/uapi/asm/ptrace.h +++ b/arch/s390/include/uapi/asm/ptrace.h @@ -14,94 +14,6 @@ * Offsets in the user_regs_struct. They are used for the ptrace * system call and in entry.S */ -#ifndef __s390x__ - -#define PT_PSWMASK 0x00 -#define PT_PSWADDR 0x04 -#define PT_GPR0 0x08 -#define PT_GPR1 0x0C -#define PT_GPR2 0x10 -#define PT_GPR3 0x14 -#define PT_GPR4 0x18 -#define PT_GPR5 0x1C -#define PT_GPR6 0x20 -#define PT_GPR7 0x24 -#define PT_GPR8 0x28 -#define PT_GPR9 0x2C -#define PT_GPR10 0x30 -#define PT_GPR11 0x34 -#define PT_GPR12 0x38 -#define PT_GPR13 0x3C -#define PT_GPR14 0x40 -#define PT_GPR15 0x44 -#define PT_ACR0 0x48 -#define PT_ACR1 0x4C -#define PT_ACR2 0x50 -#define PT_ACR3 0x54 -#define PT_ACR4 0x58 -#define PT_ACR5 0x5C -#define PT_ACR6 0x60 -#define PT_ACR7 0x64 -#define PT_ACR8 0x68 -#define PT_ACR9 0x6C -#define PT_ACR10 0x70 -#define PT_ACR11 0x74 -#define PT_ACR12 0x78 -#define PT_ACR13 0x7C -#define PT_ACR14 0x80 -#define PT_ACR15 0x84 -#define PT_ORIGGPR2 0x88 -#define PT_FPC 0x90 -/* - * A nasty fact of life that the ptrace api - * only supports passing of longs. - */ -#define PT_FPR0_HI 0x98 -#define PT_FPR0_LO 0x9C -#define PT_FPR1_HI 0xA0 -#define PT_FPR1_LO 0xA4 -#define PT_FPR2_HI 0xA8 -#define PT_FPR2_LO 0xAC -#define PT_FPR3_HI 0xB0 -#define PT_FPR3_LO 0xB4 -#define PT_FPR4_HI 0xB8 -#define PT_FPR4_LO 0xBC -#define PT_FPR5_HI 0xC0 -#define PT_FPR5_LO 0xC4 -#define PT_FPR6_HI 0xC8 -#define PT_FPR6_LO 0xCC -#define PT_FPR7_HI 0xD0 -#define PT_FPR7_LO 0xD4 -#define PT_FPR8_HI 0xD8 -#define PT_FPR8_LO 0XDC -#define PT_FPR9_HI 0xE0 -#define PT_FPR9_LO 0xE4 -#define PT_FPR10_HI 0xE8 -#define PT_FPR10_LO 0xEC -#define PT_FPR11_HI 0xF0 -#define PT_FPR11_LO 0xF4 -#define PT_FPR12_HI 0xF8 -#define PT_FPR12_LO 0xFC -#define PT_FPR13_HI 0x100 -#define PT_FPR13_LO 0x104 -#define PT_FPR14_HI 0x108 -#define PT_FPR14_LO 0x10C -#define PT_FPR15_HI 0x110 -#define PT_FPR15_LO 0x114 -#define PT_CR_9 0x118 -#define PT_CR_10 0x11C -#define PT_CR_11 0x120 -#define PT_IEEE_IP 0x13C -#define PT_LASTOFF PT_IEEE_IP -#define PT_ENDREGS 0x140-1 - -#define GPR_SIZE 4 -#define CR_SIZE 4 - -#define STACK_FRAME_OVERHEAD 96 /* size of minimum stack frame */ - -#else /* __s390x__ */ - #define PT_PSWMASK 0x00 #define PT_PSWADDR 0x08 #define PT_GPR0 0x10 @@ -166,38 +78,6 @@ #define STACK_FRAME_OVERHEAD 160 /* size of minimum stack frame */ -#endif /* __s390x__ */ - -#ifndef __s390x__ - -#define PSW_MASK_PER _AC(0x40000000, UL) -#define PSW_MASK_DAT _AC(0x04000000, UL) -#define PSW_MASK_IO _AC(0x02000000, UL) -#define PSW_MASK_EXT _AC(0x01000000, UL) -#define PSW_MASK_KEY _AC(0x00F00000, UL) -#define PSW_MASK_BASE _AC(0x00080000, UL) /* always one */ -#define PSW_MASK_MCHECK _AC(0x00040000, UL) -#define PSW_MASK_WAIT _AC(0x00020000, UL) -#define PSW_MASK_PSTATE _AC(0x00010000, UL) -#define PSW_MASK_ASC _AC(0x0000C000, UL) -#define PSW_MASK_CC _AC(0x00003000, UL) -#define PSW_MASK_PM _AC(0x00000F00, UL) -#define PSW_MASK_RI _AC(0x00000000, UL) -#define PSW_MASK_EA _AC(0x00000000, UL) -#define PSW_MASK_BA _AC(0x00000000, UL) - -#define PSW_MASK_USER _AC(0x0000FF00, UL) - -#define PSW_ADDR_AMODE _AC(0x80000000, UL) -#define PSW_ADDR_INSN _AC(0x7FFFFFFF, UL) - -#define PSW_ASC_PRIMARY _AC(0x00000000, UL) -#define PSW_ASC_ACCREG _AC(0x00004000, UL) -#define PSW_ASC_SECONDARY _AC(0x00008000, UL) -#define PSW_ASC_HOME _AC(0x0000C000, UL) - -#else /* __s390x__ */ - #define PSW_MASK_PER _AC(0x4000000000000000, UL) #define PSW_MASK_DAT _AC(0x0400000000000000, UL) #define PSW_MASK_IO _AC(0x0200000000000000, UL) @@ -224,8 +104,6 @@ #define PSW_ASC_SECONDARY _AC(0x0000800000000000, UL) #define PSW_ASC_HOME _AC(0x0000C00000000000, UL) -#endif /* __s390x__ */ - #define NUM_GPRS 16 #define NUM_FPRS 16 #define NUM_CRS 16 @@ -242,7 +120,8 @@ #define PTRACE_OLDSETOPTIONS 21 #define PTRACE_SYSEMU 31 #define PTRACE_SYSEMU_SINGLESTEP 32 -#ifndef __ASSEMBLY__ + +#ifndef __ASSEMBLER__ #include <linux/stddef.h> #include <linux/types.h> @@ -307,9 +186,7 @@ typedef struct { #define PER_EM_MASK 0xE8000000UL typedef struct { -#ifdef __s390x__ unsigned : 32; -#endif /* __s390x__ */ unsigned em_branching : 1; unsigned em_instruction_fetch : 1; /* @@ -450,6 +327,6 @@ struct user_regs_struct { unsigned long ieee_instruction_pointer; /* obsolete, always 0 */ }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_S390_PTRACE_H */ diff --git a/arch/s390/include/uapi/asm/schid.h b/arch/s390/include/uapi/asm/schid.h index a3e1cf168553..d804d1a5b1b3 100644 --- a/arch/s390/include/uapi/asm/schid.h +++ b/arch/s390/include/uapi/asm/schid.h @@ -4,7 +4,7 @@ #include <linux/types.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct subchannel_id { __u32 cssid : 8; @@ -15,6 +15,6 @@ struct subchannel_id { __u32 sch_no : 16; } __attribute__ ((packed, aligned(4))); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPIASM_SCHID_H */ diff --git a/arch/s390/include/uapi/asm/sigcontext.h b/arch/s390/include/uapi/asm/sigcontext.h index 8b35033334c4..7c90b30c50fd 100644 --- a/arch/s390/include/uapi/asm/sigcontext.h +++ b/arch/s390/include/uapi/asm/sigcontext.h @@ -17,24 +17,12 @@ #define __NUM_VXRS_LOW 16 #define __NUM_VXRS_HIGH 16 -#ifndef __s390x__ - -/* Has to be at least _NSIG_WORDS from asm/signal.h */ -#define _SIGCONTEXT_NSIG 64 -#define _SIGCONTEXT_NSIG_BPW 32 -/* Size of stack frame allocated when calling signal handler. */ -#define __SIGNAL_FRAMESIZE 96 - -#else /* __s390x__ */ - /* Has to be at least _NSIG_WORDS from asm/signal.h */ #define _SIGCONTEXT_NSIG 64 #define _SIGCONTEXT_NSIG_BPW 64 /* Size of stack frame allocated when calling signal handler. */ #define __SIGNAL_FRAMESIZE 160 -#endif /* __s390x__ */ - #define _SIGCONTEXT_NSIG_WORDS (_SIGCONTEXT_NSIG / _SIGCONTEXT_NSIG_BPW) #define _SIGMASK_COPY_SIZE (sizeof(unsigned long)*_SIGCONTEXT_NSIG_WORDS) @@ -66,9 +54,6 @@ typedef struct typedef struct { -#ifndef __s390x__ - unsigned long gprs_high[__NUM_GPRS]; -#endif unsigned long long vxrs_low[__NUM_VXRS_LOW]; __vector128 vxrs_high[__NUM_VXRS_HIGH]; unsigned char __reserved[128]; diff --git a/arch/s390/include/uapi/asm/stat.h b/arch/s390/include/uapi/asm/stat.h index ac253d23606b..21599298c2f5 100644 --- a/arch/s390/include/uapi/asm/stat.h +++ b/arch/s390/include/uapi/asm/stat.h @@ -8,74 +8,6 @@ #ifndef _S390_STAT_H #define _S390_STAT_H -#ifndef __s390x__ -struct __old_kernel_stat { - unsigned short st_dev; - unsigned short st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; - unsigned short st_rdev; - unsigned long st_size; - unsigned long st_atime; - unsigned long st_mtime; - unsigned long st_ctime; -}; - -struct stat { - unsigned short st_dev; - unsigned short __pad1; - unsigned long st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; - unsigned short st_rdev; - unsigned short __pad2; - unsigned long st_size; - unsigned long st_blksize; - unsigned long st_blocks; - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; - unsigned long __unused4; - unsigned long __unused5; -}; - -/* This matches struct stat64 in glibc2.1, hence the absolutely - * insane amounts of padding around dev_t's. - */ -struct stat64 { - unsigned long long st_dev; - unsigned int __pad1; -#define STAT64_HAS_BROKEN_ST_INO 1 - unsigned long __st_ino; - unsigned int st_mode; - unsigned int st_nlink; - unsigned long st_uid; - unsigned long st_gid; - unsigned long long st_rdev; - unsigned int __pad3; - long long st_size; - unsigned long st_blksize; - unsigned char __pad4[4]; - unsigned long __pad5; /* future possible st_blocks high bits */ - unsigned long st_blocks; /* Number 512-byte blocks allocated. */ - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; /* will be high 32 bits of ctime someday */ - unsigned long long st_ino; -}; - -#else /* __s390x__ */ - struct stat { unsigned long st_dev; unsigned long st_ino; @@ -97,8 +29,6 @@ struct stat { unsigned long __unused[3]; }; -#endif /* __s390x__ */ - #define STAT_HAVE_NSEC 1 #endif diff --git a/arch/s390/include/uapi/asm/tape390.h b/arch/s390/include/uapi/asm/tape390.h deleted file mode 100644 index 90266c696486..000000000000 --- a/arch/s390/include/uapi/asm/tape390.h +++ /dev/null @@ -1,103 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/************************************************************************* - * - * enables user programs to display messages and control encryption - * on s390 tape devices - * - * Copyright IBM Corp. 2001, 2006 - * Author(s): Michael Holzheu <holzheu@de.ibm.com> - * - *************************************************************************/ - -#ifndef _TAPE390_H -#define _TAPE390_H - -#define TAPE390_DISPLAY _IOW('d', 1, struct display_struct) - -/* - * The TAPE390_DISPLAY ioctl calls the Load Display command - * which transfers 17 bytes of data from the channel to the subsystem: - * - 1 format control byte, and - * - two 8-byte messages - * - * Format control byte: - * 0-2: New Message Overlay - * 3: Alternate Messages - * 4: Blink Message - * 5: Display Low/High Message - * 6: Reserved - * 7: Automatic Load Request - * - */ - -typedef struct display_struct { - char cntrl; - char message1[8]; - char message2[8]; -} display_struct; - -/* - * Tape encryption support - */ - -struct tape390_crypt_info { - char capability; - char status; - char medium_status; -} __attribute__ ((packed)); - - -/* Macros for "capable" field */ -#define TAPE390_CRYPT_SUPPORTED_MASK 0x01 -#define TAPE390_CRYPT_SUPPORTED(x) \ - ((x.capability & TAPE390_CRYPT_SUPPORTED_MASK)) - -/* Macros for "status" field */ -#define TAPE390_CRYPT_ON_MASK 0x01 -#define TAPE390_CRYPT_ON(x) (((x.status) & TAPE390_CRYPT_ON_MASK)) - -/* Macros for "medium status" field */ -#define TAPE390_MEDIUM_LOADED_MASK 0x01 -#define TAPE390_MEDIUM_ENCRYPTED_MASK 0x02 -#define TAPE390_MEDIUM_ENCRYPTED(x) \ - (((x.medium_status) & TAPE390_MEDIUM_ENCRYPTED_MASK)) -#define TAPE390_MEDIUM_LOADED(x) \ - (((x.medium_status) & TAPE390_MEDIUM_LOADED_MASK)) - -/* - * The TAPE390_CRYPT_SET ioctl is used to switch on/off encryption. - * The "encryption_capable" and "tape_status" fields are ignored for this ioctl! - */ -#define TAPE390_CRYPT_SET _IOW('d', 2, struct tape390_crypt_info) - -/* - * The TAPE390_CRYPT_QUERY ioctl is used to query the encryption state. - */ -#define TAPE390_CRYPT_QUERY _IOR('d', 3, struct tape390_crypt_info) - -/* Values for "kekl1/2_type" and "kekl1/2_type_on_tape" fields */ -#define TAPE390_KEKL_TYPE_NONE 0 -#define TAPE390_KEKL_TYPE_LABEL 1 -#define TAPE390_KEKL_TYPE_HASH 2 - -struct tape390_kekl { - unsigned char type; - unsigned char type_on_tape; - char label[65]; -} __attribute__ ((packed)); - -struct tape390_kekl_pair { - struct tape390_kekl kekl[2]; -} __attribute__ ((packed)); - -/* - * The TAPE390_KEKL_SET ioctl is used to set Key Encrypting Key labels. - */ -#define TAPE390_KEKL_SET _IOW('d', 4, struct tape390_kekl_pair) - -/* - * The TAPE390_KEKL_QUERY ioctl is used to query Key Encrypting Key labels. - */ -#define TAPE390_KEKL_QUERY _IOR('d', 5, struct tape390_kekl_pair) - -#endif diff --git a/arch/s390/include/uapi/asm/types.h b/arch/s390/include/uapi/asm/types.h index 84457dbb26b4..4ab468c5032e 100644 --- a/arch/s390/include/uapi/asm/types.h +++ b/arch/s390/include/uapi/asm/types.h @@ -10,7 +10,7 @@ #include <asm-generic/int-ll64.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef unsigned long addr_t; typedef __signed__ long saddr_t; @@ -25,6 +25,6 @@ typedef struct { }; } __attribute__((packed, aligned(4))) __vector128; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_S390_TYPES_H */ diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h index 01b5fe8b9db6..b0c5afe19db2 100644 --- a/arch/s390/include/uapi/asm/unistd.h +++ b/arch/s390/include/uapi/asm/unistd.h @@ -8,10 +8,6 @@ #ifndef _UAPI_ASM_S390_UNISTD_H_ #define _UAPI_ASM_S390_UNISTD_H_ -#ifdef __s390x__ #include <asm/unistd_64.h> -#else -#include <asm/unistd_32.h> -#endif #endif /* _UAPI_ASM_S390_UNISTD_H_ */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index db5f3a3faefb..137e4793ec11 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -36,17 +36,17 @@ CFLAGS_stacktrace.o += -fno-optimize-sibling-calls CFLAGS_dumpstack.o += -fno-optimize-sibling-calls CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls -obj-y := head64.o traps.o time.o process.o early.o setup.o idle.o vtime.o +obj-y := head.o traps.o time.o process.o early.o setup.o idle.o vtime.o obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o obj-y += debug.o irq.o ipl.o dis.o vdso.o cpufeature.o obj-y += sysinfo.o lgr.o os_info.o ctlreg.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o -obj-y += entry.o reipl.o kdebugfs.o alternative.o +obj-y += entry.o reipl.o kdebugfs.o alternative.o skey.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o uv.o wti.o obj-y += diag/ -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds obj-$(CONFIG_SYSFS) += nospec-sysfs.o CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) @@ -56,9 +56,6 @@ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o hiperdispatch.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_AUDIT) += audit.o -compat-obj-$(CONFIG_AUDIT) += compat_audit.o -obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o -obj-$(CONFIG_COMPAT) += $(compat-obj-y) obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KPROBES) += mcount.o @@ -70,19 +67,19 @@ obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o - +obj-$(CONFIG_STACKPROTECTOR) += stackprotector.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o obj-$(CONFIG_CERT_STORE) += cert_store.o -obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o -obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o +obj-$(CONFIG_PERF_EVENTS) += perf_pai.o obj-$(CONFIG_TRACEPOINTS) += trace.o +obj-$(CONFIG_BPF_SYSCALL) += bpf.o + # vdso -obj-y += vdso64/ -obj-$(CONFIG_COMPAT) += vdso32/ +obj-y += vdso/ diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c index 88f0b91d7a73..6252b7d115dd 100644 --- a/arch/s390/kernel/abs_lowcore.c +++ b/arch/s390/kernel/abs_lowcore.c @@ -5,7 +5,6 @@ #include <asm/sections.h> unsigned long __bootdata_preserved(__abs_lowcore); -int __bootdata_preserved(relocate_lowcore); int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc) { diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index 8d5d0de35de0..02d04ae621ba 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -1,41 +1,91 @@ // SPDX-License-Identifier: GPL-2.0 +#ifndef pr_fmt +#define pr_fmt(fmt) "alt: " fmt +#endif + +#include <linux/hex.h> #include <linux/uaccess.h> +#include <linux/printk.h> #include <asm/nospec-branch.h> #include <asm/abs_lowcore.h> #include <asm/alternative.h> #include <asm/facility.h> +#include <asm/sections.h> +#include <asm/machine.h> + +#ifndef a_debug +#define a_debug pr_debug +#endif + +#ifndef __kernel_va +#define __kernel_va(x) (void *)(x) +#endif + +unsigned long __bootdata_preserved(machine_features[1]); + +struct alt_debug { + unsigned long facilities[MAX_FACILITY_BIT / BITS_PER_LONG]; + unsigned long mfeatures[MAX_MFEATURE_BIT / BITS_PER_LONG]; + int spec; +}; + +static struct alt_debug __bootdata_preserved(alt_debug); + +static void alternative_dump(u8 *old, u8 *new, unsigned int len, unsigned int type, unsigned int data) +{ + char oinsn[33], ninsn[33]; + unsigned long kptr; + unsigned int pos; + + for (pos = 0; pos < len && 2 * pos < sizeof(oinsn) - 3; pos++) + hex_byte_pack(&oinsn[2 * pos], old[pos]); + oinsn[2 * pos] = 0; + for (pos = 0; pos < len && 2 * pos < sizeof(ninsn) - 3; pos++) + hex_byte_pack(&ninsn[2 * pos], new[pos]); + ninsn[2 * pos] = 0; + kptr = (unsigned long)__kernel_va(old); + a_debug("[%d/%3d] %016lx: %s -> %s\n", type, data, kptr, oinsn, ninsn); +} void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsigned int ctx) { - u8 *instr, *replacement; + struct alt_debug *d; struct alt_instr *a; - bool replace; + bool debug, replace; + u8 *old, *new; /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. */ + d = &alt_debug; for (a = start; a < end; a++) { if (!(a->ctx & ctx)) continue; switch (a->type) { case ALT_TYPE_FACILITY: replace = test_facility(a->data); + debug = __test_facility(a->data, d->facilities); + break; + case ALT_TYPE_FEATURE: + replace = test_machine_feature(a->data); + debug = __test_machine_feature(a->data, d->mfeatures); break; case ALT_TYPE_SPEC: replace = nobp_enabled(); - break; - case ALT_TYPE_LOWCORE: - replace = have_relocated_lowcore(); + debug = d->spec; break; default: replace = false; + debug = false; } if (!replace) continue; - instr = (u8 *)&a->instr_offset + a->instr_offset; - replacement = (u8 *)&a->repl_offset + a->repl_offset; - s390_kernel_write(instr, replacement, a->instrlen); + old = (u8 *)&a->instr_offset + a->instr_offset; + new = (u8 *)&a->repl_offset + a->repl_offset; + if (debug) + alternative_dump(old, new, a->instrlen, a->type, a->data); + s390_kernel_write(old, new, a->instrlen); } } diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 36709112ae7a..fbd26f3e9f96 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -4,16 +4,16 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ - -#define ASM_OFFSETS_C +#define COMPILE_OFFSETS #include <linux/kbuild.h> -#include <linux/kvm_host.h> #include <linux/sched.h> #include <linux/purgatory.h> #include <linux/pgtable.h> -#include <linux/ftrace.h> +#include <linux/ftrace_regs.h> +#include <asm/kvm_host_types.h> #include <asm/stacktrace.h> +#include <asm/ptrace.h> int main(void) { @@ -21,6 +21,9 @@ int main(void) OFFSET(__TASK_stack, task_struct, stack); OFFSET(__TASK_thread, task_struct, thread); OFFSET(__TASK_pid, task_struct, pid); +#ifdef CONFIG_STACKPROTECTOR + OFFSET(__TASK_stack_canary, task_struct, stack_canary); +#endif BLANK(); /* thread struct offsets */ OFFSET(__THREAD_ksp, thread_struct, ksp); @@ -49,8 +52,8 @@ int main(void) OFFSET(__PT_R14, pt_regs, gprs[14]); OFFSET(__PT_R15, pt_regs, gprs[15]); OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2); + OFFSET(__PT_INT_CODE, pt_regs, int_code); OFFSET(__PT_FLAGS, pt_regs, flags); - OFFSET(__PT_CR1, pt_regs, cr1); OFFSET(__PT_LAST_BREAK, pt_regs, last_break); DEFINE(__PT_SIZE, sizeof(struct pt_regs)); BLANK(); @@ -60,10 +63,11 @@ int main(void) OFFSET(__SF_EMPTY, stack_frame, empty[0]); OFFSET(__SF_SIE_CONTROL, stack_frame, sie_control_block); OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea); - OFFSET(__SF_SIE_REASON, stack_frame, sie_reason); + OFFSET(__SF_SIE_RETURN, stack_frame, sie_return); OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags); OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys); OFFSET(__SF_SIE_GUEST_ASCE, stack_frame, sie_guest_asce); + OFFSET(__SF_SIE_IRQ, stack_frame, sie_irq); DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); BLANK(); OFFSET(__SFUSER_BACKCHAIN, stack_frame_user, back_chain); @@ -76,7 +80,8 @@ int main(void) OFFSET(__LC_EXT_CPU_ADDR, lowcore, ext_cpu_addr); OFFSET(__LC_EXT_INT_CODE, lowcore, ext_int_code); OFFSET(__LC_PGM_ILC, lowcore, pgm_ilc); - OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_code); + OFFSET(__LC_PGM_CODE, lowcore, pgm_code); + OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_int_code); OFFSET(__LC_DATA_EXC_CODE, lowcore, data_exc_code); OFFSET(__LC_MON_CLASS_NR, lowcore, mon_class_num); OFFSET(__LC_PER_CODE, lowcore, per_code); @@ -122,7 +127,6 @@ int main(void) OFFSET(__LC_LAST_UPDATE_TIMER, lowcore, last_update_timer); OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); - OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); @@ -139,6 +143,7 @@ int main(void) OFFSET(__LC_CURRENT_PID, lowcore, current_pid); OFFSET(__LC_LAST_BREAK, lowcore, last_break); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ + OFFSET(__LC_STACK_CANARY, lowcore, stack_canary); OFFSET(__LC_DUMP_REIPL, lowcore, ipib); OFFSET(__LC_VMCORE_INFO, lowcore, vmcore_info); OFFSET(__LC_OS_INFO, lowcore, os_info); diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c index 02051a596b87..7897d9411e13 100644 --- a/arch/s390/kernel/audit.c +++ b/arch/s390/kernel/audit.c @@ -3,7 +3,6 @@ #include <linux/types.h> #include <linux/audit.h> #include <asm/unistd.h> -#include "audit.h" static unsigned dir_class[] = { #include <asm-generic/audit_dir_write.h> @@ -32,19 +31,11 @@ static unsigned signal_class[] = { int audit_classify_arch(int arch) { -#ifdef CONFIG_COMPAT - if (arch == AUDIT_ARCH_S390) - return 1; -#endif return 0; } int audit_classify_syscall(int abi, unsigned syscall) { -#ifdef CONFIG_COMPAT - if (abi == AUDIT_ARCH_S390) - return s390_classify_syscall(syscall); -#endif switch(syscall) { case __NR_open: return AUDITSC_OPEN; @@ -63,13 +54,6 @@ int audit_classify_syscall(int abi, unsigned syscall) static int __init audit_classes_init(void) { -#ifdef CONFIG_COMPAT - audit_register_class(AUDIT_CLASS_WRITE_32, s390_write_class); - audit_register_class(AUDIT_CLASS_READ_32, s390_read_class); - audit_register_class(AUDIT_CLASS_DIR_WRITE_32, s390_dir_class); - audit_register_class(AUDIT_CLASS_CHATTR_32, s390_chattr_class); - audit_register_class(AUDIT_CLASS_SIGNAL_32, s390_signal_class); -#endif audit_register_class(AUDIT_CLASS_WRITE, write_class); audit_register_class(AUDIT_CLASS_READ, read_class); audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); diff --git a/arch/s390/kernel/audit.h b/arch/s390/kernel/audit.h deleted file mode 100644 index 4d4b596412ec..000000000000 --- a/arch/s390/kernel/audit.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ARCH_S390_KERNEL_AUDIT_H -#define __ARCH_S390_KERNEL_AUDIT_H - -#include <linux/types.h> - -#ifdef CONFIG_COMPAT -extern int s390_classify_syscall(unsigned); -extern __u32 s390_dir_class[]; -extern __u32 s390_write_class[]; -extern __u32 s390_read_class[]; -extern __u32 s390_chattr_class[]; -extern __u32 s390_signal_class[]; -#endif /* CONFIG_COMPAT */ - -#endif /* __ARCH_S390_KERNEL_AUDIT_H */ diff --git a/arch/s390/kernel/bpf.c b/arch/s390/kernel/bpf.c new file mode 100644 index 000000000000..713337fae626 --- /dev/null +++ b/arch/s390/kernel/bpf.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <asm/lowcore.h> +#include <linux/btf.h> + +__bpf_kfunc_start_defs(); + +__bpf_kfunc struct lowcore *bpf_get_lowcore(void) +{ + return get_lowcore(); +} + +__bpf_kfunc_end_defs(); diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c index bf983513dd33..dc1992a675de 100644 --- a/arch/s390/kernel/cert_store.c +++ b/arch/s390/kernel/cert_store.c @@ -138,7 +138,7 @@ static void cert_store_key_describe(const struct key *key, struct seq_file *m) * First 64 bytes of the key description is key name in EBCDIC CP 500. * Convert it to ASCII for displaying in /proc/keys. */ - strscpy(ascii, key->description, sizeof(ascii)); + strscpy(ascii, key->description); EBCASC_500(ascii, VC_NAME_LEN_BYTES); seq_puts(m, ascii); @@ -235,7 +235,7 @@ static int __diag320(unsigned long subcode, void *addr) { union register_pair rp = { .even = (unsigned long)addr, }; - asm volatile( + asm_inline volatile( " diag %[rp],%[subcode],0x320\n" "0: nopr %%r7\n" EX_TABLE(0b, 0b) @@ -322,7 +322,7 @@ static int invalidate_keyring_keys(struct key *keyring) keyring_payload_len = key_type_keyring.read(keyring, NULL, 0); num_keys = keyring_payload_len / sizeof(key_serial_t); - key_array = kcalloc(num_keys, sizeof(key_serial_t), GFP_KERNEL); + key_array = kzalloc_objs(key_serial_t, num_keys); if (!key_array) return -ENOMEM; diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c deleted file mode 100644 index a7c46e8310f0..000000000000 --- a/arch/s390/kernel/compat_audit.c +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#undef __s390x__ -#include <linux/audit_arch.h> -#include <asm/unistd.h> -#include "audit.h" - -unsigned s390_dir_class[] = { -#include <asm-generic/audit_dir_write.h> -~0U -}; - -unsigned s390_chattr_class[] = { -#include <asm-generic/audit_change_attr.h> -~0U -}; - -unsigned s390_write_class[] = { -#include <asm-generic/audit_write.h> -~0U -}; - -unsigned s390_read_class[] = { -#include <asm-generic/audit_read.h> -~0U -}; - -unsigned s390_signal_class[] = { -#include <asm-generic/audit_signal.h> -~0U -}; - -int s390_classify_syscall(unsigned syscall) -{ - switch(syscall) { - case __NR_open: - return AUDITSC_OPEN; - case __NR_openat: - return AUDITSC_OPENAT; - case __NR_socketcall: - return AUDITSC_SOCKETCALL; - case __NR_execve: - return AUDITSC_EXECVE; - case __NR_openat2: - return AUDITSC_OPENAT2; - default: - return AUDITSC_COMPAT; - } -} diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c deleted file mode 100644 index f9d418d1b619..000000000000 --- a/arch/s390/kernel/compat_linux.c +++ /dev/null @@ -1,289 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * S390 version - * Copyright IBM Corp. 2000 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), - * Gerhard Tonn (ton@de.ibm.com) - * Thomas Spatzier (tspat@de.ibm.com) - * - * Conversion between 31bit and 64bit native syscalls. - * - * Heavily inspired by the 32-bit Sparc compat code which is - * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) - * - */ - - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/file.h> -#include <linux/signal.h> -#include <linux/resource.h> -#include <linux/times.h> -#include <linux/smp.h> -#include <linux/sem.h> -#include <linux/msg.h> -#include <linux/shm.h> -#include <linux/uio.h> -#include <linux/quota.h> -#include <linux/poll.h> -#include <linux/personality.h> -#include <linux/stat.h> -#include <linux/filter.h> -#include <linux/highmem.h> -#include <linux/mman.h> -#include <linux/ipv6.h> -#include <linux/in.h> -#include <linux/icmpv6.h> -#include <linux/syscalls.h> -#include <linux/sysctl.h> -#include <linux/binfmts.h> -#include <linux/capability.h> -#include <linux/compat.h> -#include <linux/vfs.h> -#include <linux/ptrace.h> -#include <linux/fadvise.h> -#include <linux/ipc.h> -#include <linux/slab.h> - -#include <asm/types.h> -#include <linux/uaccess.h> - -#include <net/scm.h> -#include <net/sock.h> - -#include "compat_linux.h" - -#ifdef CONFIG_SYSVIPC -COMPAT_SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, compat_ulong_t, second, - compat_ulong_t, third, compat_uptr_t, ptr) -{ - if (call >> 16) /* hack for backward compatibility */ - return -EINVAL; - return compat_ksys_ipc(call, first, second, third, ptr, third); -} -#endif - -COMPAT_SYSCALL_DEFINE3(s390_truncate64, const char __user *, path, u32, high, u32, low) -{ - return ksys_truncate(path, (unsigned long)high << 32 | low); -} - -COMPAT_SYSCALL_DEFINE3(s390_ftruncate64, unsigned int, fd, u32, high, u32, low) -{ - return ksys_ftruncate(fd, (unsigned long)high << 32 | low); -} - -COMPAT_SYSCALL_DEFINE5(s390_pread64, unsigned int, fd, char __user *, ubuf, - compat_size_t, count, u32, high, u32, low) -{ - if ((compat_ssize_t) count < 0) - return -EINVAL; - return ksys_pread64(fd, ubuf, count, (unsigned long)high << 32 | low); -} - -COMPAT_SYSCALL_DEFINE5(s390_pwrite64, unsigned int, fd, const char __user *, ubuf, - compat_size_t, count, u32, high, u32, low) -{ - if ((compat_ssize_t) count < 0) - return -EINVAL; - return ksys_pwrite64(fd, ubuf, count, (unsigned long)high << 32 | low); -} - -COMPAT_SYSCALL_DEFINE4(s390_readahead, int, fd, u32, high, u32, low, s32, count) -{ - return ksys_readahead(fd, (unsigned long)high << 32 | low, count); -} - -struct stat64_emu31 { - unsigned long long st_dev; - unsigned int __pad1; -#define STAT64_HAS_BROKEN_ST_INO 1 - u32 __st_ino; - unsigned int st_mode; - unsigned int st_nlink; - u32 st_uid; - u32 st_gid; - unsigned long long st_rdev; - unsigned int __pad3; - long st_size; - u32 st_blksize; - unsigned char __pad4[4]; - u32 __pad5; /* future possible st_blocks high bits */ - u32 st_blocks; /* Number 512-byte blocks allocated. */ - u32 st_atime; - u32 __pad6; - u32 st_mtime; - u32 __pad7; - u32 st_ctime; - u32 __pad8; /* will be high 32 bits of ctime someday */ - unsigned long st_ino; -}; - -static int cp_stat64(struct stat64_emu31 __user *ubuf, struct kstat *stat) -{ - struct stat64_emu31 tmp; - - memset(&tmp, 0, sizeof(tmp)); - - tmp.st_dev = huge_encode_dev(stat->dev); - tmp.st_ino = stat->ino; - tmp.__st_ino = (u32)stat->ino; - tmp.st_mode = stat->mode; - tmp.st_nlink = (unsigned int)stat->nlink; - tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); - tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); - tmp.st_rdev = huge_encode_dev(stat->rdev); - tmp.st_size = stat->size; - tmp.st_blksize = (u32)stat->blksize; - tmp.st_blocks = (u32)stat->blocks; - tmp.st_atime = (u32)stat->atime.tv_sec; - tmp.st_mtime = (u32)stat->mtime.tv_sec; - tmp.st_ctime = (u32)stat->ctime.tv_sec; - - return copy_to_user(ubuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; -} - -COMPAT_SYSCALL_DEFINE2(s390_stat64, const char __user *, filename, struct stat64_emu31 __user *, statbuf) -{ - struct kstat stat; - int ret = vfs_stat(filename, &stat); - if (!ret) - ret = cp_stat64(statbuf, &stat); - return ret; -} - -COMPAT_SYSCALL_DEFINE2(s390_lstat64, const char __user *, filename, struct stat64_emu31 __user *, statbuf) -{ - struct kstat stat; - int ret = vfs_lstat(filename, &stat); - if (!ret) - ret = cp_stat64(statbuf, &stat); - return ret; -} - -COMPAT_SYSCALL_DEFINE2(s390_fstat64, unsigned int, fd, struct stat64_emu31 __user *, statbuf) -{ - struct kstat stat; - int ret = vfs_fstat(fd, &stat); - if (!ret) - ret = cp_stat64(statbuf, &stat); - return ret; -} - -COMPAT_SYSCALL_DEFINE4(s390_fstatat64, unsigned int, dfd, const char __user *, filename, - struct stat64_emu31 __user *, statbuf, int, flag) -{ - struct kstat stat; - int error; - - error = vfs_fstatat(dfd, filename, &stat, flag); - if (error) - return error; - return cp_stat64(statbuf, &stat); -} - -/* - * Linux/i386 didn't use to be able to handle more than - * 4 system call parameters, so these system calls used a memory - * block for parameter passing.. - */ - -struct mmap_arg_struct_emu31 { - compat_ulong_t addr; - compat_ulong_t len; - compat_ulong_t prot; - compat_ulong_t flags; - compat_ulong_t fd; - compat_ulong_t offset; -}; - -COMPAT_SYSCALL_DEFINE1(s390_old_mmap, struct mmap_arg_struct_emu31 __user *, arg) -{ - struct mmap_arg_struct_emu31 a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - if (a.offset & ~PAGE_MASK) - return -EINVAL; - return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, - a.offset >> PAGE_SHIFT); -} - -COMPAT_SYSCALL_DEFINE1(s390_mmap2, struct mmap_arg_struct_emu31 __user *, arg) -{ - struct mmap_arg_struct_emu31 a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); -} - -COMPAT_SYSCALL_DEFINE3(s390_read, unsigned int, fd, char __user *, buf, compat_size_t, count) -{ - if ((compat_ssize_t) count < 0) - return -EINVAL; - - return ksys_read(fd, buf, count); -} - -COMPAT_SYSCALL_DEFINE3(s390_write, unsigned int, fd, const char __user *, buf, compat_size_t, count) -{ - if ((compat_ssize_t) count < 0) - return -EINVAL; - - return ksys_write(fd, buf, count); -} - -/* - * 31 bit emulation wrapper functions for sys_fadvise64/fadvise64_64. - * These need to rewrite the advise values for POSIX_FADV_{DONTNEED,NOREUSE} - * because the 31 bit values differ from the 64 bit values. - */ - -COMPAT_SYSCALL_DEFINE5(s390_fadvise64, int, fd, u32, high, u32, low, compat_size_t, len, int, advise) -{ - if (advise == 4) - advise = POSIX_FADV_DONTNEED; - else if (advise == 5) - advise = POSIX_FADV_NOREUSE; - return ksys_fadvise64_64(fd, (unsigned long)high << 32 | low, len, - advise); -} - -struct fadvise64_64_args { - int fd; - long long offset; - long long len; - int advice; -}; - -COMPAT_SYSCALL_DEFINE1(s390_fadvise64_64, struct fadvise64_64_args __user *, args) -{ - struct fadvise64_64_args a; - - if ( copy_from_user(&a, args, sizeof(a)) ) - return -EFAULT; - if (a.advice == 4) - a.advice = POSIX_FADV_DONTNEED; - else if (a.advice == 5) - a.advice = POSIX_FADV_NOREUSE; - return ksys_fadvise64_64(a.fd, a.offset, a.len, a.advice); -} - -COMPAT_SYSCALL_DEFINE6(s390_sync_file_range, int, fd, u32, offhigh, u32, offlow, - u32, nhigh, u32, nlow, unsigned int, flags) -{ - return ksys_sync_file_range(fd, ((loff_t)offhigh << 32) + offlow, - ((u64)nhigh << 32) + nlow, flags); -} - -COMPAT_SYSCALL_DEFINE6(s390_fallocate, int, fd, int, mode, u32, offhigh, u32, offlow, - u32, lenhigh, u32, lenlow) -{ - return ksys_fallocate(fd, mode, ((loff_t)offhigh << 32) + offlow, - ((u64)lenhigh << 32) + lenlow); -} diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h deleted file mode 100644 index ef23739b277c..000000000000 --- a/arch/s390/kernel/compat_linux.h +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390X_S390_H -#define _ASM_S390X_S390_H - -#include <linux/compat.h> -#include <linux/socket.h> -#include <linux/syscalls.h> -#include <asm/ptrace.h> - -/* - * Macro that masks the high order bit of a 32 bit pointer and - * converts it to a 64 bit pointer. - */ -#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) -#define AA(__x) ((unsigned long)(__x)) - -/* Now 32bit compatibility types */ -struct ipc_kludge_32 { - __u32 msgp; /* pointer */ - __s32 msgtyp; -}; - -/* asm/sigcontext.h */ -typedef union { - __u64 d; - __u32 f; -} freg_t32; - -typedef struct { - unsigned int fpc; - unsigned int pad; - freg_t32 fprs[__NUM_FPRS]; -} _s390_fp_regs32; - -typedef struct { - psw_t32 psw; - __u32 gprs[__NUM_GPRS]; - __u32 acrs[__NUM_ACRS]; -} _s390_regs_common32; - -typedef struct { - _s390_regs_common32 regs; - _s390_fp_regs32 fpregs; -} _sigregs32; - -typedef struct { - __u32 gprs_high[__NUM_GPRS]; - __u64 vxrs_low[__NUM_VXRS_LOW]; - __vector128 vxrs_high[__NUM_VXRS_HIGH]; - __u8 __reserved[128]; -} _sigregs_ext32; - -#define _SIGCONTEXT_NSIG32 64 -#define _SIGCONTEXT_NSIG_BPW32 32 -#define __SIGNAL_FRAMESIZE32 96 -#define _SIGMASK_COPY_SIZE32 (sizeof(u32) * 2) - -struct sigcontext32 { - __u32 oldmask[_COMPAT_NSIG_WORDS]; - __u32 sregs; /* pointer */ -}; - -/* asm/signal.h */ - -/* asm/ucontext.h */ -struct ucontext32 { - __u32 uc_flags; - __u32 uc_link; /* pointer */ - compat_stack_t uc_stack; - _sigregs32 uc_mcontext; - compat_sigset_t uc_sigmask; - /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ - unsigned char __unused[128 - sizeof(compat_sigset_t)]; - _sigregs_ext32 uc_mcontext_ext; -}; - -struct stat64_emu31; -struct mmap_arg_struct_emu31; -struct fadvise64_64_args; - -long compat_sys_s390_truncate64(const char __user *path, u32 high, u32 low); -long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low); -long compat_sys_s390_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 high, u32 low); -long compat_sys_s390_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count, u32 high, u32 low); -long compat_sys_s390_readahead(int fd, u32 high, u32 low, s32 count); -long compat_sys_s390_stat64(const char __user *filename, struct stat64_emu31 __user *statbuf); -long compat_sys_s390_lstat64(const char __user *filename, struct stat64_emu31 __user *statbuf); -long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbuf); -long compat_sys_s390_fstatat64(unsigned int dfd, const char __user *filename, struct stat64_emu31 __user *statbuf, int flag); -long compat_sys_s390_old_mmap(struct mmap_arg_struct_emu31 __user *arg); -long compat_sys_s390_mmap2(struct mmap_arg_struct_emu31 __user *arg); -long compat_sys_s390_read(unsigned int fd, char __user *buf, compat_size_t count); -long compat_sys_s390_write(unsigned int fd, const char __user *buf, compat_size_t count); -long compat_sys_s390_fadvise64(int fd, u32 high, u32 low, compat_size_t len, int advise); -long compat_sys_s390_fadvise64_64(struct fadvise64_64_args __user *args); -long compat_sys_s390_sync_file_range(int fd, u32 offhigh, u32 offlow, u32 nhigh, u32 nlow, unsigned int flags); -long compat_sys_s390_fallocate(int fd, int mode, u32 offhigh, u32 offlow, u32 lenhigh, u32 lenlow); -long compat_sys_sigreturn(void); -long compat_sys_rt_sigreturn(void); - -#endif /* _ASM_S390X_S390_H */ diff --git a/arch/s390/kernel/compat_ptrace.h b/arch/s390/kernel/compat_ptrace.h deleted file mode 100644 index 3c400fc7e987..000000000000 --- a/arch/s390/kernel/compat_ptrace.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PTRACE32_H -#define _PTRACE32_H - -#include <asm/ptrace.h> /* needed for NUM_CR_WORDS */ -#include "compat_linux.h" /* needed for psw_compat_t */ - -struct compat_per_struct_kernel { - __u32 cr9; /* PER control bits */ - __u32 cr10; /* PER starting address */ - __u32 cr11; /* PER ending address */ - __u32 bits; /* Obsolete software bits */ - __u32 starting_addr; /* User specified start address */ - __u32 ending_addr; /* User specified end address */ - __u16 perc_atmid; /* PER trap ATMID */ - __u32 address; /* PER trap instruction address */ - __u8 access_id; /* PER trap access identification */ -}; - -struct compat_user_regs_struct -{ - psw_compat_t psw; - u32 gprs[NUM_GPRS]; - u32 acrs[NUM_ACRS]; - u32 orig_gpr2; - /* nb: there's a 4-byte hole here */ - s390_fp_regs fp_regs; - /* - * These per registers are in here so that gdb can modify them - * itself as there is no "official" ptrace interface for hardware - * watchpoints. This is the way intel does it. - */ - struct compat_per_struct_kernel per_info; - u32 ieee_instruction_pointer; /* obsolete, always 0 */ -}; - -struct compat_user { - /* We start with the registers, to mimic the way that "memory" - is returned from the ptrace(3,...) function. */ - struct compat_user_regs_struct regs; - /* The rest of this junk is to help gdb figure out what goes where */ - u32 u_tsize; /* Text segment size (pages). */ - u32 u_dsize; /* Data segment size (pages). */ - u32 u_ssize; /* Stack segment size (pages). */ - u32 start_code; /* Starting virtual address of text. */ - u32 start_stack; /* Starting virtual address of stack area. - This is actually the bottom of the stack, - the top of the stack is always found in the - esp register. */ - s32 signal; /* Signal that caused the core dump. */ - u32 u_ar0; /* Used by gdb to help find the values for */ - /* the registers. */ - u32 magic; /* To uniquely identify a core file */ - char u_comm[32]; /* User command that was responsible */ -}; - -typedef struct -{ - __u32 len; - __u32 kernel_addr; - __u32 process_addr; -} compat_ptrace_area; - -#endif /* _PTRACE32_H */ diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c deleted file mode 100644 index 5a86b9d1da71..000000000000 --- a/arch/s390/kernel/compat_signal.c +++ /dev/null @@ -1,420 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright IBM Corp. 2000, 2006 - * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) - * Gerhard Tonn (ton@de.ibm.com) - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson - */ - -#include <linux/compat.h> -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/kernel.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/wait.h> -#include <linux/ptrace.h> -#include <linux/unistd.h> -#include <linux/stddef.h> -#include <linux/tty.h> -#include <linux/personality.h> -#include <linux/binfmts.h> -#include <asm/vdso-symbols.h> -#include <asm/access-regs.h> -#include <asm/ucontext.h> -#include <linux/uaccess.h> -#include <asm/lowcore.h> -#include <asm/fpu.h> -#include "compat_linux.h" -#include "compat_ptrace.h" -#include "entry.h" - -typedef struct -{ - __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; - struct sigcontext32 sc; - _sigregs32 sregs; - int signo; - _sigregs_ext32 sregs_ext; - __u16 svc_insn; /* Offset of svc_insn is NOT fixed! */ -} sigframe32; - -typedef struct -{ - __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; - __u16 svc_insn; - compat_siginfo_t info; - struct ucontext32 uc; -} rt_sigframe32; - -/* Store registers needed to create the signal frame */ -static void store_sigregs(void) -{ - save_access_regs(current->thread.acrs); - save_user_fpu_regs(); -} - -/* Load registers after signal return */ -static void load_sigregs(void) -{ - restore_access_regs(current->thread.acrs); -} - -static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) -{ - _sigregs32 user_sregs; - int i; - - user_sregs.regs.psw.mask = (__u32)(regs->psw.mask >> 32); - user_sregs.regs.psw.mask &= PSW32_MASK_USER | PSW32_MASK_RI; - user_sregs.regs.psw.mask |= PSW32_USER_BITS; - user_sregs.regs.psw.addr = (__u32) regs->psw.addr | - (__u32)(regs->psw.mask & PSW_MASK_BA); - for (i = 0; i < NUM_GPRS; i++) - user_sregs.regs.gprs[i] = (__u32) regs->gprs[i]; - memcpy(&user_sregs.regs.acrs, current->thread.acrs, - sizeof(user_sregs.regs.acrs)); - fpregs_store((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.ufpu); - if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs32))) - return -EFAULT; - return 0; -} - -static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) -{ - _sigregs32 user_sregs; - int i; - - /* Always make any pending restarted system call return -EINTR */ - current->restart_block.fn = do_no_restart_syscall; - - if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) - return -EFAULT; - - if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW32_MASK_RI)) - return -EINVAL; - - /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ - regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | - (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_USER) << 32 | - (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_RI) << 32 | - (__u64)(user_sregs.regs.psw.addr & PSW32_ADDR_AMODE); - /* Check for invalid user address space control. */ - if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_HOME) - regs->psw.mask = PSW_ASC_PRIMARY | - (regs->psw.mask & ~PSW_MASK_ASC); - regs->psw.addr = (__u64)(user_sregs.regs.psw.addr & PSW32_ADDR_INSN); - for (i = 0; i < NUM_GPRS; i++) - regs->gprs[i] = (__u64) user_sregs.regs.gprs[i]; - memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, - sizeof(current->thread.acrs)); - fpregs_load((_s390_fp_regs *)&user_sregs.fpregs, ¤t->thread.ufpu); - - clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ - return 0; -} - -static int save_sigregs_ext32(struct pt_regs *regs, - _sigregs_ext32 __user *sregs_ext) -{ - __u32 gprs_high[NUM_GPRS]; - __u64 vxrs[__NUM_VXRS_LOW]; - int i; - - /* Save high gprs to signal stack */ - for (i = 0; i < NUM_GPRS; i++) - gprs_high[i] = regs->gprs[i] >> 32; - if (__copy_to_user(&sregs_ext->gprs_high, &gprs_high, - sizeof(sregs_ext->gprs_high))) - return -EFAULT; - - /* Save vector registers to signal stack */ - if (cpu_has_vx()) { - for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = current->thread.ufpu.vxrs[i].low; - if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, - sizeof(sregs_ext->vxrs_low)) || - __copy_to_user(&sregs_ext->vxrs_high, - current->thread.ufpu.vxrs + __NUM_VXRS_LOW, - sizeof(sregs_ext->vxrs_high))) - return -EFAULT; - } - return 0; -} - -static int restore_sigregs_ext32(struct pt_regs *regs, - _sigregs_ext32 __user *sregs_ext) -{ - __u32 gprs_high[NUM_GPRS]; - __u64 vxrs[__NUM_VXRS_LOW]; - int i; - - /* Restore high gprs from signal stack */ - if (__copy_from_user(&gprs_high, &sregs_ext->gprs_high, - sizeof(sregs_ext->gprs_high))) - return -EFAULT; - for (i = 0; i < NUM_GPRS; i++) - *(__u32 *)®s->gprs[i] = gprs_high[i]; - - /* Restore vector registers from signal stack */ - if (cpu_has_vx()) { - if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, - sizeof(sregs_ext->vxrs_low)) || - __copy_from_user(current->thread.ufpu.vxrs + __NUM_VXRS_LOW, - &sregs_ext->vxrs_high, - sizeof(sregs_ext->vxrs_high))) - return -EFAULT; - for (i = 0; i < __NUM_VXRS_LOW; i++) - current->thread.ufpu.vxrs[i].low = vxrs[i]; - } - return 0; -} - -COMPAT_SYSCALL_DEFINE0(sigreturn) -{ - struct pt_regs *regs = task_pt_regs(current); - sigframe32 __user *frame = (sigframe32 __user *)regs->gprs[15]; - sigset_t set; - - if (get_compat_sigset(&set, (compat_sigset_t __user *)frame->sc.oldmask)) - goto badframe; - set_current_blocked(&set); - save_user_fpu_regs(); - if (restore_sigregs32(regs, &frame->sregs)) - goto badframe; - if (restore_sigregs_ext32(regs, &frame->sregs_ext)) - goto badframe; - load_sigregs(); - return regs->gprs[2]; -badframe: - force_sig(SIGSEGV); - return 0; -} - -COMPAT_SYSCALL_DEFINE0(rt_sigreturn) -{ - struct pt_regs *regs = task_pt_regs(current); - rt_sigframe32 __user *frame = (rt_sigframe32 __user *)regs->gprs[15]; - sigset_t set; - - if (get_compat_sigset(&set, &frame->uc.uc_sigmask)) - goto badframe; - set_current_blocked(&set); - if (compat_restore_altstack(&frame->uc.uc_stack)) - goto badframe; - save_user_fpu_regs(); - if (restore_sigregs32(regs, &frame->uc.uc_mcontext)) - goto badframe; - if (restore_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) - goto badframe; - load_sigregs(); - return regs->gprs[2]; -badframe: - force_sig(SIGSEGV); - return 0; -} - -/* - * Set up a signal frame. - */ - - -/* - * Determine which stack to use.. - */ -static inline void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) -{ - unsigned long sp; - - /* Default to using normal stack */ - sp = (unsigned long) A(regs->gprs[15]); - - /* Overflow on alternate signal stack gives SIGSEGV. */ - if (on_sig_stack(sp) && !on_sig_stack((sp - frame_size) & -8UL)) - return (void __user *) -1UL; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (! sas_ss_flags(sp)) - sp = current->sas_ss_sp + current->sas_ss_size; - } - - return (void __user *)((sp - frame_size) & -8ul); -} - -static int setup_frame32(struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - int sig = ksig->sig; - sigframe32 __user *frame; - unsigned long restorer; - size_t frame_size; - - /* - * gprs_high are always present for 31-bit compat tasks. - * The space for vector registers is only allocated if - * the machine supports it - */ - frame_size = sizeof(*frame) - sizeof(frame->sregs_ext.__reserved); - if (!cpu_has_vx()) - frame_size -= sizeof(frame->sregs_ext.vxrs_low) + - sizeof(frame->sregs_ext.vxrs_high); - frame = get_sigframe(&ksig->ka, regs, frame_size); - if (frame == (void __user *) -1UL) - return -EFAULT; - - /* Set up backchain. */ - if (__put_user(regs->gprs[15], (unsigned int __user *) frame)) - return -EFAULT; - - /* Create struct sigcontext32 on the signal stack */ - if (put_compat_sigset((compat_sigset_t __user *)frame->sc.oldmask, - set, sizeof(compat_sigset_t))) - return -EFAULT; - if (__put_user(ptr_to_compat(&frame->sregs), &frame->sc.sregs)) - return -EFAULT; - - /* Store registers needed to create the signal frame */ - store_sigregs(); - - /* Create _sigregs32 on the signal stack */ - if (save_sigregs32(regs, &frame->sregs)) - return -EFAULT; - - /* Place signal number on stack to allow backtrace from handler. */ - if (__put_user(regs->gprs[2], (int __force __user *) &frame->signo)) - return -EFAULT; - - /* Create _sigregs_ext32 on the signal stack */ - if (save_sigregs_ext32(regs, &frame->sregs_ext)) - return -EFAULT; - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - restorer = (unsigned long __force) - ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; - } else { - restorer = VDSO32_SYMBOL(current, sigreturn); - } - - /* Set up registers for signal handler */ - regs->gprs[14] = restorer; - regs->gprs[15] = (__force __u64) frame; - /* Force 31 bit amode and default user address space control. */ - regs->psw.mask = PSW_MASK_BA | - (PSW_USER_BITS & PSW_MASK_ASC) | - (regs->psw.mask & ~PSW_MASK_ASC); - regs->psw.addr = (__force __u64) ksig->ka.sa.sa_handler; - - regs->gprs[2] = sig; - regs->gprs[3] = (__force __u64) &frame->sc; - - /* We forgot to include these in the sigcontext. - To avoid breaking binary compatibility, they are passed as args. */ - if (sig == SIGSEGV || sig == SIGBUS || sig == SIGILL || - sig == SIGTRAP || sig == SIGFPE) { - /* set extra registers only for synchronous signals */ - regs->gprs[4] = regs->int_code & 127; - regs->gprs[5] = regs->int_parm_long; - regs->gprs[6] = current->thread.last_break; - } - - return 0; -} - -static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - rt_sigframe32 __user *frame; - unsigned long restorer; - size_t frame_size; - u32 uc_flags; - - frame_size = sizeof(*frame) - - sizeof(frame->uc.uc_mcontext_ext.__reserved); - /* - * gprs_high are always present for 31-bit compat tasks. - * The space for vector registers is only allocated if - * the machine supports it - */ - uc_flags = UC_GPRS_HIGH; - if (cpu_has_vx()) { - uc_flags |= UC_VXRS; - } else { - frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) + - sizeof(frame->uc.uc_mcontext_ext.vxrs_high); - } - frame = get_sigframe(&ksig->ka, regs, frame_size); - if (frame == (void __user *) -1UL) - return -EFAULT; - - /* Set up backchain. */ - if (__put_user(regs->gprs[15], (unsigned int __force __user *) frame)) - return -EFAULT; - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - restorer = (unsigned long __force) - ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; - } else { - restorer = VDSO32_SYMBOL(current, rt_sigreturn); - } - - /* Create siginfo on the signal stack */ - if (copy_siginfo_to_user32(&frame->info, &ksig->info)) - return -EFAULT; - - /* Store registers needed to create the signal frame */ - store_sigregs(); - - /* Create ucontext on the signal stack. */ - if (__put_user(uc_flags, &frame->uc.uc_flags) || - __put_user(0, &frame->uc.uc_link) || - __compat_save_altstack(&frame->uc.uc_stack, regs->gprs[15]) || - save_sigregs32(regs, &frame->uc.uc_mcontext) || - put_compat_sigset(&frame->uc.uc_sigmask, set, sizeof(compat_sigset_t)) || - save_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->gprs[14] = restorer; - regs->gprs[15] = (__force __u64) frame; - /* Force 31 bit amode and default user address space control. */ - regs->psw.mask = PSW_MASK_BA | - (PSW_USER_BITS & PSW_MASK_ASC) | - (regs->psw.mask & ~PSW_MASK_ASC); - regs->psw.addr = (__u64 __force) ksig->ka.sa.sa_handler; - - regs->gprs[2] = ksig->sig; - regs->gprs[3] = (__force __u64) &frame->info; - regs->gprs[4] = (__force __u64) &frame->uc; - regs->gprs[5] = current->thread.last_break; - return 0; -} - -/* - * OK, we're invoking a handler - */ - -void handle_signal32(struct ksignal *ksig, sigset_t *oldset, - struct pt_regs *regs) -{ - int ret; - - /* Set up the stack frame */ - if (ksig->ka.sa.sa_flags & SA_SIGINFO) - ret = setup_rt_frame32(ksig, oldset, regs); - else - ret = setup_frame32(ksig, oldset, regs); - - signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLE_STEP)); -} - diff --git a/arch/s390/kernel/cpacf.c b/arch/s390/kernel/cpacf.c index 4b9b34f95d72..9d85b4bc7036 100644 --- a/arch/s390/kernel/cpacf.c +++ b/arch/s390/kernel/cpacf.c @@ -3,8 +3,7 @@ * Copyright IBM Corp. 2024 */ -#define KMSG_COMPONENT "cpacf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpacf: " fmt #include <linux/cpu.h> #include <linux/device.h> @@ -101,7 +100,7 @@ static const struct bin_attribute *const cpacf_attrs[] = { static const struct attribute_group cpacf_attr_grp = { .name = "cpacf", - .bin_attrs_new = cpacf_attrs, + .bin_attrs = cpacf_attrs, }; static int __init cpacf_init(void) diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index 2f4174b961de..ab611764642a 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -6,8 +6,7 @@ * Christian Borntraeger (cborntra@de.ibm.com), */ -#define KMSG_COMPONENT "cpcmd" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpcmd: " fmt #include <linux/kernel.h> #include <linux/export.h> diff --git a/arch/s390/kernel/cpufeature.c b/arch/s390/kernel/cpufeature.c index 1b2ae42a0c15..c9eef9ed876b 100644 --- a/arch/s390/kernel/cpufeature.c +++ b/arch/s390/kernel/cpufeature.c @@ -4,12 +4,15 @@ */ #include <linux/cpufeature.h> +#include <linux/export.h> #include <linux/bug.h> +#include <asm/machine.h> #include <asm/elf.h> enum { TYPE_HWCAP, TYPE_FACILITY, + TYPE_MACHINE, }; struct s390_cpu_feature { @@ -21,6 +24,7 @@ static struct s390_cpu_feature s390_cpu_features[MAX_CPU_FEATURES] = { [S390_CPU_FEATURE_MSA] = {.type = TYPE_HWCAP, .num = HWCAP_NR_MSA}, [S390_CPU_FEATURE_VXRS] = {.type = TYPE_HWCAP, .num = HWCAP_NR_VXRS}, [S390_CPU_FEATURE_UV] = {.type = TYPE_FACILITY, .num = 158}, + [S390_CPU_FEATURE_D288] = {.type = TYPE_MACHINE, .num = MFEATURE_DIAG288}, }; /* @@ -38,6 +42,8 @@ int cpu_have_feature(unsigned int num) return !!(elf_hwcap & BIT(feature->num)); case TYPE_FACILITY: return test_facility(feature->num); + case TYPE_MACHINE: + return test_machine_feature(feature->num); default: WARN_ON_ONCE(1); return 0; diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 276cb4c1e11b..d4839de8ce9d 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -7,6 +7,7 @@ */ #include <linux/crash_dump.h> +#include <linux/export.h> #include <asm/lowcore.h> #include <linux/kernel.h> #include <linux/init.h> @@ -246,15 +247,6 @@ bool is_kdump_kernel(void) } EXPORT_SYMBOL_GPL(is_kdump_kernel); -static const char *nt_name(Elf64_Word type) -{ - const char *name = "LINUX"; - - if (type == NT_PRPSINFO || type == NT_PRSTATUS || type == NT_PRFPREG) - name = KEXEC_CORE_NOTE_NAME; - return name; -} - /* * Initialize ELF note */ @@ -279,10 +271,8 @@ static void *nt_init_name(void *buf, Elf64_Word type, void *desc, int d_len, return PTR_ADD(buf, len); } -static inline void *nt_init(void *buf, Elf64_Word type, void *desc, int d_len) -{ - return nt_init_name(buf, type, desc, d_len, nt_name(type)); -} +#define nt_init(buf, type, desc) \ + nt_init_name(buf, NT_ ## type, &(desc), sizeof(desc), NN_ ## type) /* * Calculate the size of ELF note @@ -298,10 +288,7 @@ static size_t nt_size_name(int d_len, const char *name) return size; } -static inline size_t nt_size(Elf64_Word type, int d_len) -{ - return nt_size_name(d_len, nt_name(type)); -} +#define nt_size(type, desc) nt_size_name(sizeof(desc), NN_ ## type) /* * Fill ELF notes for one CPU with save area registers @@ -322,18 +309,16 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc)); memcpy(&nt_fpregset.fprs, &sa->fprs, sizeof(sa->fprs)); /* Create ELF notes for the CPU */ - ptr = nt_init(ptr, NT_PRSTATUS, &nt_prstatus, sizeof(nt_prstatus)); - ptr = nt_init(ptr, NT_PRFPREG, &nt_fpregset, sizeof(nt_fpregset)); - ptr = nt_init(ptr, NT_S390_TIMER, &sa->timer, sizeof(sa->timer)); - ptr = nt_init(ptr, NT_S390_TODCMP, &sa->todcmp, sizeof(sa->todcmp)); - ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg)); - ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs)); - ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix)); + ptr = nt_init(ptr, PRSTATUS, nt_prstatus); + ptr = nt_init(ptr, PRFPREG, nt_fpregset); + ptr = nt_init(ptr, S390_TIMER, sa->timer); + ptr = nt_init(ptr, S390_TODCMP, sa->todcmp); + ptr = nt_init(ptr, S390_TODPREG, sa->todpreg); + ptr = nt_init(ptr, S390_CTRS, sa->ctrs); + ptr = nt_init(ptr, S390_PREFIX, sa->prefix); if (cpu_has_vx()) { - ptr = nt_init(ptr, NT_S390_VXRS_HIGH, - &sa->vxrs_high, sizeof(sa->vxrs_high)); - ptr = nt_init(ptr, NT_S390_VXRS_LOW, - &sa->vxrs_low, sizeof(sa->vxrs_low)); + ptr = nt_init(ptr, S390_VXRS_HIGH, sa->vxrs_high); + ptr = nt_init(ptr, S390_VXRS_LOW, sa->vxrs_low); } return ptr; } @@ -346,16 +331,16 @@ static size_t get_cpu_elf_notes_size(void) struct save_area *sa = NULL; size_t size; - size = nt_size(NT_PRSTATUS, sizeof(struct elf_prstatus)); - size += nt_size(NT_PRFPREG, sizeof(elf_fpregset_t)); - size += nt_size(NT_S390_TIMER, sizeof(sa->timer)); - size += nt_size(NT_S390_TODCMP, sizeof(sa->todcmp)); - size += nt_size(NT_S390_TODPREG, sizeof(sa->todpreg)); - size += nt_size(NT_S390_CTRS, sizeof(sa->ctrs)); - size += nt_size(NT_S390_PREFIX, sizeof(sa->prefix)); + size = nt_size(PRSTATUS, struct elf_prstatus); + size += nt_size(PRFPREG, elf_fpregset_t); + size += nt_size(S390_TIMER, sa->timer); + size += nt_size(S390_TODCMP, sa->todcmp); + size += nt_size(S390_TODPREG, sa->todpreg); + size += nt_size(S390_CTRS, sa->ctrs); + size += nt_size(S390_PREFIX, sa->prefix); if (cpu_has_vx()) { - size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high)); - size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low)); + size += nt_size(S390_VXRS_HIGH, sa->vxrs_high); + size += nt_size(S390_VXRS_LOW, sa->vxrs_low); } return size; @@ -370,8 +355,8 @@ static void *nt_prpsinfo(void *ptr) memset(&prpsinfo, 0, sizeof(prpsinfo)); prpsinfo.pr_sname = 'R'; - strcpy(prpsinfo.pr_fname, "vmlinux"); - return nt_init(ptr, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo)); + strscpy(prpsinfo.pr_fname, "vmlinux"); + return nt_init(ptr, PRPSINFO, prpsinfo); } /* @@ -610,7 +595,7 @@ static size_t get_elfcorehdr_size(int phdr_count) /* PT_NOTES */ size += sizeof(Elf64_Phdr); /* nt_prpsinfo */ - size += nt_size(NT_PRPSINFO, sizeof(struct elf_prpsinfo)); + size += nt_size(PRPSINFO, struct elf_prpsinfo); /* regsets */ size += get_cpu_cnt() * get_cpu_elf_notes_size(); /* nt_vmcoreinfo */ diff --git a/arch/s390/kernel/ctlreg.c b/arch/s390/kernel/ctlreg.c index 8cc26cf2c64a..a0501f4c7e7a 100644 --- a/arch/s390/kernel/ctlreg.c +++ b/arch/s390/kernel/ctlreg.c @@ -5,6 +5,7 @@ #include <linux/irqflags.h> #include <linux/spinlock.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/smp.h> diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index ce038e9205f7..31430e9bcfdd 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -10,8 +10,7 @@ * Bugreports to: <Linux390@de.ibm.com> */ -#define KMSG_COMPONENT "s390dbf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "s390dbf: " fmt #include <linux/stddef.h> #include <linux/kernel.h> @@ -182,14 +181,13 @@ static debug_entry_t ***debug_areas_alloc(int pages_per_area, int nr_areas) debug_entry_t ***areas; int i, j; - areas = kmalloc_array(nr_areas, sizeof(debug_entry_t **), GFP_KERNEL); + areas = kmalloc_objs(debug_entry_t **, nr_areas); if (!areas) goto fail_malloc_areas; for (i = 0; i < nr_areas; i++) { /* GFP_NOWARN to avoid user triggerable WARN, we handle fails */ - areas[i] = kmalloc_array(pages_per_area, - sizeof(debug_entry_t *), - GFP_KERNEL | __GFP_NOWARN); + areas[i] = kmalloc_objs(debug_entry_t *, pages_per_area, + GFP_KERNEL | __GFP_NOWARN); if (!areas[i]) goto fail_malloc_areas2; for (j = 0; j < pages_per_area; j++) { @@ -226,13 +224,13 @@ static debug_info_t *debug_info_alloc(const char *name, int pages_per_area, debug_info_t *rc; /* alloc everything */ - rc = kmalloc(sizeof(debug_info_t), GFP_KERNEL); + rc = kmalloc_obj(debug_info_t); if (!rc) goto fail_malloc_rc; - rc->active_entries = kcalloc(nr_areas, sizeof(int), GFP_KERNEL); + rc->active_entries = kzalloc_objs(int, nr_areas); if (!rc->active_entries) goto fail_malloc_active_entries; - rc->active_pages = kcalloc(nr_areas, sizeof(int), GFP_KERNEL); + rc->active_pages = kzalloc_objs(int, nr_areas); if (!rc->active_pages) goto fail_malloc_active_pages; if ((mode == ALL_AREAS) && (pages_per_area != 0)) { @@ -244,14 +242,14 @@ static debug_info_t *debug_info_alloc(const char *name, int pages_per_area, } /* initialize members */ - spin_lock_init(&rc->lock); + raw_spin_lock_init(&rc->lock); rc->pages_per_area = pages_per_area; rc->nr_areas = nr_areas; rc->active_area = 0; rc->level = level; rc->buf_size = buf_size; rc->entry_size = sizeof(debug_entry_t) + buf_size; - strscpy(rc->name, name, sizeof(rc->name)); + strscpy(rc->name, name); memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *)); memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *)); refcount_set(&(rc->ref_count), 0); @@ -334,7 +332,7 @@ static debug_info_t *debug_info_copy(debug_info_t *in, int mode) do { rc = debug_info_alloc(in->name, in->pages_per_area, in->nr_areas, in->buf_size, in->level, mode); - spin_lock_irqsave(&in->lock, flags); + raw_spin_lock_irqsave(&in->lock, flags); if (!rc) goto out; /* has something changed in the meantime ? */ @@ -342,7 +340,7 @@ static debug_info_t *debug_info_copy(debug_info_t *in, int mode) (rc->nr_areas == in->nr_areas)) { break; } - spin_unlock_irqrestore(&in->lock, flags); + raw_spin_unlock_irqrestore(&in->lock, flags); debug_info_free(rc); } while (1); @@ -357,7 +355,7 @@ static debug_info_t *debug_info_copy(debug_info_t *in, int mode) } rc->active_area = in->active_area; out: - spin_unlock_irqrestore(&in->lock, flags); + raw_spin_unlock_irqrestore(&in->lock, flags); return rc; } @@ -632,7 +630,7 @@ static file_private_info_t *debug_file_private_alloc(debug_info_t *debug_info, if (!debug_info_snapshot) return NULL; - p_info = kmalloc(sizeof(file_private_info_t), GFP_KERNEL); + p_info = kmalloc_obj(file_private_info_t); if (!p_info) { debug_info_free(debug_info_snapshot); return NULL; @@ -880,20 +878,20 @@ void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas) pr_err("Registering debug feature %s failed\n", id->name); /* Clear pointers to prevent tracing into released initdata. */ - spin_lock_irqsave(&id->lock, flags); + raw_spin_lock_irqsave(&id->lock, flags); id->areas = NULL; id->active_pages = NULL; id->active_entries = NULL; - spin_unlock_irqrestore(&id->lock, flags); + raw_spin_unlock_irqrestore(&id->lock, flags); return; } /* Replace static trace area with dynamic copy. */ - spin_lock_irqsave(&id->lock, flags); + raw_spin_lock_irqsave(&id->lock, flags); debug_events_append(copy, id); debug_areas_swap(id, copy); - spin_unlock_irqrestore(&id->lock, flags); + raw_spin_unlock_irqrestore(&id->lock, flags); /* Clear pointers to initdata and discard copy. */ copy->areas = NULL; @@ -967,11 +965,11 @@ static int debug_set_size(debug_info_t *id, int nr_areas, int pages_per_area) return -ENOMEM; } - spin_lock_irqsave(&id->lock, flags); + raw_spin_lock_irqsave(&id->lock, flags); debug_events_append(new_id, id); debug_areas_swap(new_id, id); + raw_spin_unlock_irqrestore(&id->lock, flags); debug_info_free(new_id); - spin_unlock_irqrestore(&id->lock, flags); pr_info("%s: set new size (%i pages)\n", id->name, pages_per_area); return 0; @@ -1001,9 +999,9 @@ void debug_set_level(debug_info_t *id, int new_level) return; } - spin_lock_irqsave(&id->lock, flags); + raw_spin_lock_irqsave(&id->lock, flags); id->level = new_level; - spin_unlock_irqrestore(&id->lock, flags); + raw_spin_unlock_irqrestore(&id->lock, flags); } EXPORT_SYMBOL(debug_set_level); @@ -1185,10 +1183,10 @@ debug_entry_t *debug_event_common(debug_info_t *id, int level, const void *buf, if (!debug_active || !id->areas) return NULL; if (debug_critical) { - if (!spin_trylock_irqsave(&id->lock, flags)) + if (!raw_spin_trylock_irqsave(&id->lock, flags)) return NULL; } else { - spin_lock_irqsave(&id->lock, flags); + raw_spin_lock_irqsave(&id->lock, flags); } do { active = get_active_entry(id); @@ -1200,7 +1198,7 @@ debug_entry_t *debug_event_common(debug_info_t *id, int level, const void *buf, buf += id->buf_size; } while (len > 0); - spin_unlock_irqrestore(&id->lock, flags); + raw_spin_unlock_irqrestore(&id->lock, flags); return active; } EXPORT_SYMBOL(debug_event_common); @@ -1218,10 +1216,10 @@ debug_entry_t *debug_exception_common(debug_info_t *id, int level, if (!debug_active || !id->areas) return NULL; if (debug_critical) { - if (!spin_trylock_irqsave(&id->lock, flags)) + if (!raw_spin_trylock_irqsave(&id->lock, flags)) return NULL; } else { - spin_lock_irqsave(&id->lock, flags); + raw_spin_lock_irqsave(&id->lock, flags); } do { active = get_active_entry(id); @@ -1233,7 +1231,7 @@ debug_entry_t *debug_exception_common(debug_info_t *id, int level, buf += id->buf_size; } while (len > 0); - spin_unlock_irqrestore(&id->lock, flags); + raw_spin_unlock_irqrestore(&id->lock, flags); return active; } EXPORT_SYMBOL(debug_exception_common); @@ -1268,10 +1266,10 @@ debug_entry_t *__debug_sprintf_event(debug_info_t *id, int level, char *string, numargs = debug_count_numargs(string); if (debug_critical) { - if (!spin_trylock_irqsave(&id->lock, flags)) + if (!raw_spin_trylock_irqsave(&id->lock, flags)) return NULL; } else { - spin_lock_irqsave(&id->lock, flags); + raw_spin_lock_irqsave(&id->lock, flags); } active = get_active_entry(id); curr_event = (debug_sprintf_entry_t *) DEBUG_DATA(active); @@ -1281,7 +1279,7 @@ debug_entry_t *__debug_sprintf_event(debug_info_t *id, int level, char *string, curr_event->args[idx] = va_arg(ap, long); va_end(ap); debug_finish_entry(id, active, level, 0); - spin_unlock_irqrestore(&id->lock, flags); + raw_spin_unlock_irqrestore(&id->lock, flags); return active; } @@ -1304,10 +1302,10 @@ debug_entry_t *__debug_sprintf_exception(debug_info_t *id, int level, char *stri numargs = debug_count_numargs(string); if (debug_critical) { - if (!spin_trylock_irqsave(&id->lock, flags)) + if (!raw_spin_trylock_irqsave(&id->lock, flags)) return NULL; } else { - spin_lock_irqsave(&id->lock, flags); + raw_spin_lock_irqsave(&id->lock, flags); } active = get_active_entry(id); curr_event = (debug_sprintf_entry_t *)DEBUG_DATA(active); @@ -1317,7 +1315,7 @@ debug_entry_t *__debug_sprintf_exception(debug_info_t *id, int level, char *stri curr_event->args[idx] = va_arg(ap, long); va_end(ap); debug_finish_entry(id, active, level, 1); - spin_unlock_irqrestore(&id->lock, flags); + raw_spin_unlock_irqrestore(&id->lock, flags); return active; } @@ -1351,7 +1349,7 @@ int debug_register_view(debug_info_t *id, struct debug_view *view) mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH); pde = debugfs_create_file(view->name, mode, id->debugfs_root_entry, id, &debug_file_ops); - spin_lock_irqsave(&id->lock, flags); + raw_spin_lock_irqsave(&id->lock, flags); for (i = 0; i < DEBUG_MAX_VIEWS; i++) { if (!id->views[i]) break; @@ -1362,7 +1360,7 @@ int debug_register_view(debug_info_t *id, struct debug_view *view) id->views[i] = view; id->debugfs_entries[i] = pde; } - spin_unlock_irqrestore(&id->lock, flags); + raw_spin_unlock_irqrestore(&id->lock, flags); if (rc) { pr_err("Registering view %s/%s would exceed the maximum " "number of views %i\n", id->name, view->name, i); @@ -1392,7 +1390,7 @@ int debug_unregister_view(debug_info_t *id, struct debug_view *view) if (!id) goto out; - spin_lock_irqsave(&id->lock, flags); + raw_spin_lock_irqsave(&id->lock, flags); for (i = 0; i < DEBUG_MAX_VIEWS; i++) { if (id->views[i] == view) break; @@ -1404,7 +1402,7 @@ int debug_unregister_view(debug_info_t *id, struct debug_view *view) id->views[i] = NULL; id->debugfs_entries[i] = NULL; } - spin_unlock_irqrestore(&id->lock, flags); + raw_spin_unlock_irqrestore(&id->lock, flags); debugfs_remove(dentry); out: return rc; @@ -1416,18 +1414,12 @@ static inline char *debug_get_user_string(const char __user *user_buf, { char *buffer; - buffer = kmalloc(user_len + 1, GFP_KERNEL); - if (!buffer) - return ERR_PTR(-ENOMEM); - if (copy_from_user(buffer, user_buf, user_len) != 0) { - kfree(buffer); - return ERR_PTR(-EFAULT); - } + buffer = memdup_user_nul(user_buf, user_len); + if (IS_ERR(buffer)) + return buffer; /* got the string, now strip linefeed. */ if (buffer[user_len - 1] == '\n') buffer[user_len - 1] = 0; - else - buffer[user_len] = 0; return buffer; } @@ -1564,7 +1556,7 @@ static void debug_flush(debug_info_t *id, int area) if (!id || !id->areas) return; - spin_lock_irqsave(&id->lock, flags); + raw_spin_lock_irqsave(&id->lock, flags); if (area == DEBUG_FLUSH_ALL) { id->active_area = 0; memset(id->active_entries, 0, id->nr_areas * sizeof(int)); @@ -1579,7 +1571,7 @@ static void debug_flush(debug_info_t *id, int area) for (i = 0; i < id->pages_per_area; i++) memset(id->areas[area][i], 0, PAGE_SIZE); } - spin_unlock_irqrestore(&id->lock, flags); + raw_spin_unlock_irqrestore(&id->lock, flags); } /* @@ -1677,7 +1669,7 @@ EXPORT_SYMBOL(debug_dflt_header_fn); /* * prints debug data sprintf-formatted: - * debug_sprinf_event/exception calls must be used together with this view + * debug_sprintf_event/exception calls must be used together with this view */ #define DEBUG_SPRINTF_MAX_ARGS 10 diff --git a/arch/s390/kernel/diag/diag.c b/arch/s390/kernel/diag/diag.c index e15b8dee3228..56b862ba9be8 100644 --- a/arch/s390/kernel/diag/diag.c +++ b/arch/s390/kernel/diag/diag.c @@ -195,7 +195,7 @@ static inline int __diag204(unsigned long *subcode, unsigned long size, void *ad { union register_pair rp = { .even = *subcode, .odd = size }; - asm volatile( + asm_inline volatile( " diag %[addr],%[rp],0x204\n" "0: nopr %%r7\n" EX_TABLE(0b,0b) @@ -286,7 +286,7 @@ int diag224(void *ptr) int rc = -EOPNOTSUPP; diag_stat_inc(DIAG_STAT_X224); - asm volatile("\n" + asm_inline volatile("\n" " diag %[type],%[addr],0x224\n" "0: lhi %[rc],0\n" "1:\n" diff --git a/arch/s390/kernel/diag/diag310.c b/arch/s390/kernel/diag/diag310.c index d6a34454aa5a..f411562aa7f6 100644 --- a/arch/s390/kernel/diag/diag310.c +++ b/arch/s390/kernel/diag/diag310.c @@ -66,7 +66,7 @@ static inline unsigned long diag310(unsigned long subcode, unsigned long size, v union register_pair rp = { .even = (unsigned long)addr, .odd = size }; diag_stat_inc(DIAG_STAT_X310); - asm volatile("diag %[rp],%[subcode],0x310\n" + asm volatile("diag %[rp],%[subcode],0x310" : [rp] "+d" (rp.pair) : [subcode] "d" (subcode) : "memory"); diff --git a/arch/s390/kernel/diag/diag324.c b/arch/s390/kernel/diag/diag324.c index 7fa4c0b7eb6c..fe325c2a2d0d 100644 --- a/arch/s390/kernel/diag/diag324.c +++ b/arch/s390/kernel/diag/diag324.c @@ -101,7 +101,7 @@ static unsigned long diag324(unsigned long subcode, void *addr) union register_pair rp = { .even = (unsigned long)addr }; diag_stat_inc(DIAG_STAT_X324); - asm volatile("diag %[rp],%[subcode],0x324\n" + asm volatile("diag %[rp],%[subcode],0x324" : [rp] "+d" (rp.pair) : [subcode] "d" (subcode) : "memory"); @@ -116,7 +116,7 @@ static void pibwork_handler(struct work_struct *work) mutex_lock(&pibmutex); timedout = ktime_add_ns(data->expire, PIBWORK_DELAY); if (ktime_before(ktime_get(), timedout)) { - mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); + mod_delayed_work(system_percpu_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); goto out; } vfree(data->pib); @@ -174,7 +174,7 @@ long diag324_pibbuf(unsigned long arg) pib_update(data); data->sequence++; data->expire = ktime_add_ns(ktime_get(), tod_to_ns(data->pib->intv)); - mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); + mod_delayed_work(system_percpu_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); first = false; } rc = data->rc; diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 94eb8168ea44..1cec93895b3a 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -17,7 +17,6 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/delay.h> -#include <linux/export.h> #include <linux/kallsyms.h> #include <linux/reboot.h> #include <linux/kprobes.h> @@ -504,24 +503,27 @@ static int copy_from_regs(struct pt_regs *regs, void *dst, void *src, int len) void show_code(struct pt_regs *regs) { char *mode = user_mode(regs) ? "User" : "Krnl"; + unsigned long addr, pswaddr; unsigned char code[64]; char buffer[128], *ptr; - unsigned long addr; int start, end, opsize, hops, i; + pswaddr = regs->psw.addr; + if (test_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED)) + pswaddr = __forward_psw(regs->psw, regs->int_code >> 16); /* Get a snapshot of the 64 bytes surrounding the fault address. */ - for (start = 32; start && regs->psw.addr >= 34 - start; start -= 2) { - addr = regs->psw.addr - 34 + start; + for (start = 32; start && pswaddr >= 34 - start; start -= 2) { + addr = pswaddr - 34 + start; if (copy_from_regs(regs, code + start - 2, (void *)addr, 2)) break; } for (end = 32; end < 64; end += 2) { - addr = regs->psw.addr + end - 32; + addr = pswaddr + end - 32; if (copy_from_regs(regs, code + end, (void *)addr, 2)) break; } /* Code snapshot usable ? */ - if ((regs->psw.addr & 1) || start >= end) { + if ((pswaddr & 1) || start >= end) { printk("%s Code: Bad PSW.\n", mode); return; } @@ -544,12 +546,12 @@ void show_code(struct pt_regs *regs) while (start < end && hops < 8) { opsize = insn_length(code[start]); if (start + opsize == 32) - *ptr++ = '#'; + *ptr++ = '*'; else if (start == 32) *ptr++ = '>'; else *ptr++ = ' '; - addr = regs->psw.addr + start - 32; + addr = pswaddr + start - 32; ptr += sprintf(ptr, "%px: ", (void *)addr); if (start + opsize >= end) break; diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 1ecd0580561f..f9d52e05e01e 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> +#include <asm/asm-offsets.h> #include <asm/processor.h> #include <asm/debug.h> #include <asm/dis.h> @@ -154,12 +155,16 @@ static void show_last_breaking_event(struct pt_regs *regs) void show_registers(struct pt_regs *regs) { struct psw_bits *psw = &psw_bits(regs->psw); + unsigned long pswaddr; char *mode; + pswaddr = regs->psw.addr; + if (test_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED)) + pswaddr = __forward_psw(regs->psw, regs->int_code >> 16); mode = user_mode(regs) ? "User" : "Krnl"; - printk("%s PSW : %px %px", mode, (void *)regs->psw.mask, (void *)regs->psw.addr); + printk("%s PSW : %px %px", mode, (void *)regs->psw.mask, (void *)pswaddr); if (!user_mode(regs)) - pr_cont(" (%pSR)", (void *)regs->psw.addr); + pr_cont(" (%pSR)", (void *)pswaddr); pr_cont("\n"); printk(" R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x " "P:%x AS:%x CC:%x PM:%x", psw->per, psw->dat, psw->io, psw->ext, @@ -198,13 +203,8 @@ void __noreturn die(struct pt_regs *regs, const char *str) console_verbose(); spin_lock_irq(&die_lock); bust_spinlocks(1); - printk("%s: %04x ilc:%d [#%d] ", str, regs->int_code & 0xffff, + printk("%s: %04x ilc:%d [#%d]", str, regs->int_code & 0xffff, regs->int_code >> 17, ++die_counter); -#ifdef CONFIG_PREEMPT - pr_cont("PREEMPT "); -#elif defined(CONFIG_PREEMPT_RT) - pr_cont("PREEMPT_RT "); -#endif pr_cont("SMP "); if (debug_pagealloc_enabled()) pr_cont("DEBUG_PAGEALLOC"); diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 2fa25164df7d..b27239c03d79 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -4,10 +4,10 @@ * Author(s): Hongjie Yang <hongjie@us.ibm.com>, */ -#define KMSG_COMPONENT "setup" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "setup: " fmt #include <linux/sched/debug.h> +#include <linux/cpufeature.h> #include <linux/compiler.h> #include <linux/init.h> #include <linux/errno.h> @@ -20,7 +20,10 @@ #include <linux/kernel.h> #include <asm/asm-extable.h> #include <linux/memblock.h> +#include <linux/kasan.h> #include <asm/access-regs.h> +#include <asm/asm-offsets.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/fpu.h> @@ -36,12 +39,14 @@ #include <asm/boot_data.h> #include "entry.h" -#define decompressor_handled_param(param) \ -static int __init ignore_decompressor_param_##param(char *s) \ +#define __decompressor_handled_param(func, param) \ +static int __init ignore_decompressor_param_##func(char *s) \ { \ return 0; \ } \ -early_param(#param, ignore_decompressor_param_##param) +early_param(#param, ignore_decompressor_param_##func) + +#define decompressor_handled_param(param) __decompressor_handled_param(param, param) decompressor_handled_param(mem); decompressor_handled_param(vmalloc); @@ -51,6 +56,7 @@ decompressor_handled_param(nokaslr); decompressor_handled_param(cmma); decompressor_handled_param(relocate_lowcore); decompressor_handled_param(bootdebug); +__decompressor_handled_param(debug_alternative, debug-alternative); #if IS_ENABLED(CONFIG_KVM) decompressor_handled_param(prot_virt); #endif @@ -59,25 +65,10 @@ static void __init kasan_early_init(void) { #ifdef CONFIG_KASAN init_task.kasan_depth = 0; - pr_info("KernelAddressSanitizer initialized\n"); + kasan_init_generic(); #endif } -static void __init reset_tod_clock(void) -{ - union tod_clock clk; - - if (store_tod_clock_ext_cc(&clk) == 0) - return; - /* TOD clock not running. Set the clock to Unix Epoch. */ - if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk)) - disabled_wait(); - - memset(&tod_clock_base, 0, sizeof(tod_clock_base)); - tod_clock_base.tod = TOD_UNIX_EPOCH; - get_lowcore()->last_update_clock = TOD_UNIX_EPOCH; -} - /* * Initialize storage key for kernel pages */ @@ -96,26 +87,6 @@ static noinline __init void init_kernel_storage_key(void) static __initdata char sysinfo_page[PAGE_SIZE] __aligned(PAGE_SIZE); -static noinline __init void detect_machine_type(void) -{ - struct sysinfo_3_2_2 *vmms = (struct sysinfo_3_2_2 *)&sysinfo_page; - - /* Check current-configuration-level */ - if (stsi(NULL, 0, 0, 0) <= 2) { - get_lowcore()->machine_flags |= MACHINE_FLAG_LPAR; - return; - } - /* Get virtual-machine cpu information. */ - if (stsi(vmms, 3, 2, 2) || !vmms->count) - return; - - /* Detect known hypervisors */ - if (!memcmp(vmms->vm[0].cpi, "\xd2\xe5\xd4", 3)) - get_lowcore()->machine_flags |= MACHINE_FLAG_KVM; - else if (!memcmp(vmms->vm[0].cpi, "\xa9\x61\xe5\xd4", 4)) - get_lowcore()->machine_flags |= MACHINE_FLAG_VM; -} - /* Remove leading, trailing and double whitespace. */ static inline void strim_all(char *str) { @@ -134,6 +105,8 @@ static inline void strim_all(char *str) } } +char arch_hw_string[128]; + static noinline __init void setup_arch_string(void) { struct sysinfo_1_1_1 *mach = (struct sysinfo_1_1_1 *)&sysinfo_page; @@ -146,20 +119,21 @@ static noinline __init void setup_arch_string(void) EBCASC(mach->type, sizeof(mach->type)); EBCASC(mach->model, sizeof(mach->model)); EBCASC(mach->model_capacity, sizeof(mach->model_capacity)); - sprintf(mstr, "%-16.16s %-4.4s %-16.16s %-16.16s", - mach->manufacturer, mach->type, - mach->model, mach->model_capacity); + scnprintf(mstr, sizeof(mstr), "%-16.16s %-4.4s %-16.16s %-16.16s", + mach->manufacturer, mach->type, + mach->model, mach->model_capacity); strim_all(mstr); if (stsi(vm, 3, 2, 2) == 0 && vm->count) { EBCASC(vm->vm[0].cpi, sizeof(vm->vm[0].cpi)); - sprintf(hvstr, "%-16.16s", vm->vm[0].cpi); + scnprintf(hvstr, sizeof(hvstr), "%-16.16s", vm->vm[0].cpi); strim_all(hvstr); } else { - sprintf(hvstr, "%s", - MACHINE_IS_LPAR ? "LPAR" : - MACHINE_IS_VM ? "z/VM" : - MACHINE_IS_KVM ? "KVM" : "unknown"); + scnprintf(hvstr, sizeof(hvstr), "%s", + machine_is_lpar() ? "LPAR" : + machine_is_vm() ? "z/VM" : + machine_is_kvm() ? "KVM" : "unknown"); } + scnprintf(arch_hw_string, sizeof(arch_hw_string), "HW: %s (%s)", mstr, hvstr); dump_stack_set_arch_desc("%s (%s)", mstr, hvstr); } @@ -167,9 +141,8 @@ static __init void setup_topology(void) { int max_mnest; - if (!test_facility(11)) + if (!cpu_has_topology()) return; - get_lowcore()->machine_flags |= MACHINE_FLAG_TOPOLOGY; for (max_mnest = 6; max_mnest > 1; max_mnest--) { if (stsi(&sysinfo_page, 15, 1, max_mnest) == 0) break; @@ -184,6 +157,7 @@ void __init __do_early_pgm_check(struct pt_regs *regs) regs->int_code = lc->pgm_int_code; regs->int_parm_long = lc->trans_exc_code; + regs->last_break = lc->pgm_last_break; ip = __rewind_psw(regs->psw, regs->int_code >> 16); /* Monitor Event? Might be a warning */ @@ -218,65 +192,10 @@ static noinline __init void setup_lowcore_early(void) lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); } -static __init void detect_diag9c(void) -{ - unsigned int cpu_address; - int rc; - - cpu_address = stap(); - diag_stat_inc(DIAG_STAT_X09C); - asm volatile( - " diag %2,0,0x9c\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=d" (rc) : "0" (-EOPNOTSUPP), "d" (cpu_address) : "cc"); - if (!rc) - get_lowcore()->machine_flags |= MACHINE_FLAG_DIAG9C; -} - -static __init void detect_machine_facilities(void) -{ - if (test_facility(8)) { - get_lowcore()->machine_flags |= MACHINE_FLAG_EDAT1; - system_ctl_set_bit(0, CR0_EDAT_BIT); - } - if (test_facility(78)) - get_lowcore()->machine_flags |= MACHINE_FLAG_EDAT2; - if (test_facility(3)) - get_lowcore()->machine_flags |= MACHINE_FLAG_IDTE; - if (test_facility(50) && test_facility(73)) { - get_lowcore()->machine_flags |= MACHINE_FLAG_TE; - system_ctl_set_bit(0, CR0_TRANSACTIONAL_EXECUTION_BIT); - } - if (test_facility(51)) - get_lowcore()->machine_flags |= MACHINE_FLAG_TLB_LC; - if (test_facility(129)) - system_ctl_set_bit(0, CR0_VECTOR_BIT); - if (test_facility(130)) - get_lowcore()->machine_flags |= MACHINE_FLAG_NX; - if (test_facility(133)) - get_lowcore()->machine_flags |= MACHINE_FLAG_GS; - if (test_facility(139) && (tod_clock_base.tod >> 63)) { - /* Enabled signed clock comparator comparisons */ - get_lowcore()->machine_flags |= MACHINE_FLAG_SCC; - clock_comparator_max = -1ULL >> 1; - system_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SIGN_BIT); - } - if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) { - get_lowcore()->machine_flags |= MACHINE_FLAG_PCI_MIO; - /* the control bit is set during PCI initialization */ - } - if (test_facility(194)) - get_lowcore()->machine_flags |= MACHINE_FLAG_RDP; - if (test_facility(85)) - get_lowcore()->machine_flags |= MACHINE_FLAG_SEQ_INSN; -} - static inline void save_vector_registers(void) { #ifdef CONFIG_CRASH_DUMP - if (test_facility(129)) + if (cpu_has_vx()) save_vx_regs(boot_cpu_vector_save_area); #endif } @@ -308,17 +227,13 @@ static void __init sort_amode31_extable(void) void __init startup_init(void) { kasan_early_init(); - reset_tod_clock(); time_early_init(); init_kernel_storage_key(); lockdep_off(); sort_amode31_extable(); setup_lowcore_early(); - detect_machine_type(); setup_arch_string(); setup_boot_command_line(); - detect_diag9c(); - detect_machine_facilities(); save_vector_registers(); setup_topology(); sclp_early_detect(); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 4cc3408c4dac..bb806d1ddae0 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -23,12 +23,14 @@ #include <asm/unistd.h> #include <asm/page.h> #include <asm/sigp.h> +#include <asm/bug.h> #include <asm/irq.h> #include <asm/fpu-insn.h> #include <asm/setup.h> #include <asm/nmi.h> #include <asm/nospec-insn.h> #include <asm/lowcore.h> +#include <asm/machine.h> _LPP_OFFSET = __LC_LPP @@ -44,7 +46,7 @@ _LPP_OFFSET = __LC_LPP ALTERNATIVE_2 "b \lpswe;nopr", \ ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY(193), \ __stringify(.insn siy,0xeb0000000071,LOWCORE_ALT_ADDRESS+\address,0), \ - ALT_LOWCORE + ALT_FEATURE(MFEATURE_LOWCORE) .endm .macro MBEAR reg, lowcore @@ -67,7 +69,7 @@ _LPP_OFFSET = __LC_LPP clg %r14,__LC_RESTART_STACK(\lowcore) je \oklabel la %r14,\savearea(\lowcore) - j stack_overflow + j stack_invalid .endm /* @@ -115,7 +117,7 @@ _LPP_OFFSET = __LC_LPP .macro SIEEXIT sie_control,lowcore lg %r9,\sie_control # get control block pointer ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE(\lowcore) # load primary asce + lctlg %c1,%c1,__LC_USER_ASCE(\lowcore) # load primary asce lg %r9,__LC_CURRENT(\lowcore) mvi __TI_sie(%r9),0 larl %r9,sie_exit # skip forward to sie_exit @@ -123,7 +125,7 @@ _LPP_OFFSET = __LC_LPP #endif .macro STACKLEAK_ERASE -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK +#ifdef CONFIG_KSTACK_ERASE brasl %r14,stackleak_erase_on_task_stack #endif .endm @@ -161,13 +163,27 @@ SYM_FUNC_START(__switch_to_asm) stg %r3,__LC_CURRENT(%r13) # store task struct of next stg %r15,__LC_KERNEL_STACK(%r13) # store end of kernel stack lg %r15,__THREAD_ksp(%r1,%r3) # load kernel stack of next - aghi %r3,__TASK_pid - mvc __LC_CURRENT_PID(4,%r13),0(%r3) # store pid of next + lay %r4,__TASK_pid(%r3) + mvc __LC_CURRENT_PID(4,%r13),0(%r4) # store pid of next ALTERNATIVE "nop", "lpp _LPP_OFFSET(%r13)", ALT_FACILITY(40) +#ifdef CONFIG_STACKPROTECTOR + lg %r3,__TASK_stack_canary(%r3) + stg %r3,__LC_STACK_CANARY(%r13) +#endif lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task BR_EX %r14 SYM_FUNC_END(__switch_to_asm) +#if defined(CONFIG_BUG) && defined(CONFIG_CC_HAS_ASM_IMMEDIATE_STRINGS) + +SYM_FUNC_START(__WARN_trap) + mc MONCODE_BUG_ARG(%r0),0 + BR_EX %r14 +SYM_FUNC_END(__WARN_trap) +EXPORT_SYMBOL(__WARN_trap) + +#endif /* CONFIG_BUG && CONFIG_CC_HAS_ASM_IMMEDIATE_STRINGS */ + #if IS_ENABLED(CONFIG_KVM) /* * __sie64a calling convention: @@ -184,10 +200,11 @@ SYM_FUNC_START(__sie64a) stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area stg %r5,__SF_SIE_GUEST_ASCE(%r15) # save guest asce - xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 + xc __SF_SIE_RETURN(8,%r15),__SF_SIE_RETURN(%r15) # return code = 0 mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r14) # copy thread flags lmg %r0,%r13,0(%r4) # load guest gprs 0-13 mvi __TI_sie(%r14),1 + stosm __SF_SIE_IRQ(%r15),0x03 # enable interrupts lctlg %c1,%c1,__SF_SIE_GUEST_ASCE(%r15) # load primary asce lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now @@ -207,10 +224,11 @@ SYM_FUNC_START(__sie64a) lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE GET_LC %r14 - lctlg %c1,%c1,__LC_KERNEL_ASCE(%r14) # load primary asce + lctlg %c1,%c1,__LC_USER_ASCE(%r14) # load primary asce lg %r14,__LC_CURRENT(%r14) mvi __TI_sie(%r14),0 SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL) + stnsm __SF_SIE_IRQ(%r15),0xfc # disable interrupts lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area stmg %r0,%r13,0(%r14) # save guest gprs 0-13 xgr %r0,%r0 # clear guest registers to @@ -219,7 +237,7 @@ SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL) xgr %r4,%r4 xgr %r5,%r5 lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers - lg %r2,__SF_SIE_REASON(%r15) # return exit reason code + lg %r2,__SF_SIE_RETURN(%r15) # return sie return code BR_EX %r14 SYM_FUNC_END(__sie64a) EXPORT_SYMBOL(__sie64a) @@ -239,7 +257,6 @@ SYM_CODE_START(system_call) lghi %r14,0 .Lsysc_per: STBEAR __LC_LAST_BREAK(%r13) - lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) lg %r15,__LC_KERNEL_STACK(%r13) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15) @@ -254,13 +271,13 @@ SYM_CODE_START(system_call) xgr %r9,%r9 xgr %r10,%r10 xgr %r11,%r11 + xgr %r12,%r12 la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs mvc __PT_R8(64,%r2),__LC_SAVE_AREA(%r13) MBEAR %r2,%r13 lgr %r3,%r14 brasl %r14,__do_syscall STACKLEAK_ERASE - lctlg %c1,%c1,__LC_USER_ASCE(%r13) mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) BPON LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) @@ -277,7 +294,6 @@ SYM_CODE_START(ret_from_fork) brasl %r14,__ret_from_fork STACKLEAK_ERASE GET_LC %r13 - lctlg %c1,%c1,__LC_USER_ASCE(%r13) mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) BPON LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) @@ -298,10 +314,7 @@ SYM_CODE_START(pgm_check_handler) lmg %r8,%r9,__LC_PGM_OLD_PSW(%r13) xgr %r10,%r10 tmhh %r8,0x0001 # coming from user space? - jno .Lpgm_skip_asce - lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) - j 3f # -> fault in user space -.Lpgm_skip_asce: + jo 3f # -> fault in user space #if IS_ENABLED(CONFIG_KVM) lg %r11,__LC_CURRENT(%r13) tm __TI_sie(%r11),0xff @@ -315,7 +328,7 @@ SYM_CODE_START(pgm_check_handler) tm __LC_PGM_ILC+3(%r13),0x80 # check for per exception jnz .Lpgm_svcper # -> single stepped svc 2: aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - # CHECK_VMAP_STACK branches to stack_overflow or 4f + # CHECK_VMAP_STACK branches to stack_invalid or 4f CHECK_VMAP_STACK __LC_SAVE_AREA,%r13,4f 3: lg %r15,__LC_KERNEL_STACK(%r13) 4: la %r11,STACK_FRAME_OVERHEAD(%r15) @@ -339,7 +352,6 @@ SYM_CODE_START(pgm_check_handler) tmhh %r8,0x0001 # returning to user space? jno .Lpgm_exit_kernel STACKLEAK_ERASE - lctlg %c1,%c1,__LC_USER_ASCE(%r13) BPON stpt __LC_EXIT_TIMER(%r13) .Lpgm_exit_kernel: @@ -383,8 +395,7 @@ SYM_CODE_START(\name) #endif 0: aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) j 2f -1: lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) - lg %r15,__LC_KERNEL_STACK(%r13) +1: lg %r15,__LC_KERNEL_STACK(%r13) 2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) @@ -397,6 +408,7 @@ SYM_CODE_START(\name) xgr %r6,%r6 xgr %r7,%r7 xgr %r10,%r10 + xgr %r12,%r12 xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) MBEAR %r11,%r13 @@ -407,7 +419,6 @@ SYM_CODE_START(\name) tmhh %r8,0x0001 # returning to user ? jno 2f STACKLEAK_ERASE - lctlg %c1,%c1,__LC_USER_ASCE(%r13) BPON stpt __LC_EXIT_TIMER(%r13) 2: LBEAR __PT_LAST_BREAK(%r11) @@ -467,7 +478,7 @@ SYM_CODE_START(mcck_int_handler) clgrjl %r9,%r14, 4f larl %r14,.Lsie_leave clgrjhe %r9,%r14, 4f - lg %r10,__LC_PCPU + lg %r10,__LC_PCPU(%r13) oi __PCPU_FLAGS+7(%r10), _CIF_MCCK_GUEST 4: BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST SIEEXIT __SF_SIE_CONTROL(%r15),%r13 @@ -475,8 +486,6 @@ SYM_CODE_START(mcck_int_handler) .Lmcck_user: lg %r15,__LC_MCCK_STACK(%r13) la %r11,STACK_FRAME_OVERHEAD(%r15) - stctg %c1,%c1,__PT_CR1(%r11) - lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lay %r14,__LC_GPREGS_SAVE_AREA(%r13) mvc __PT_R0(128,%r11),0(%r14) @@ -489,12 +498,12 @@ SYM_CODE_START(mcck_int_handler) xgr %r6,%r6 xgr %r7,%r7 xgr %r10,%r10 + xgr %r12,%r12 stmg %r8,%r9,__PT_PSW(%r11) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,s390_do_machine_check - lctlg %c1,%c1,__PT_CR1(%r11) lmg %r0,%r10,__PT_R0(%r11) mvc __LC_RETURN_MCCK_PSW(16,%r13),__PT_PSW(%r11) # move return PSW tm __LC_RETURN_MCCK_PSW+1(%r13),0x01 # returning to user ? @@ -590,22 +599,23 @@ SYM_CODE_END(early_pgm_check_handler) .section .kprobes.text, "ax" /* - * The synchronous or the asynchronous stack overflowed. We are dead. + * The synchronous or the asynchronous stack pointer is invalid. We are dead. * No need to properly save the registers, we are going to panic anyway. * Setup a pt_regs so that show_trace can provide a good call trace. */ -SYM_CODE_START(stack_overflow) +SYM_CODE_START(stack_invalid) GET_LC %r15 lg %r15,__LC_NODAT_STACK(%r15) # change to panic stack la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) stmg %r8,%r9,__PT_PSW(%r11) mvc __PT_R8(64,%r11),0(%r14) - stg %r10,__PT_ORIG_GPR2(%r11) # store last break to orig_gpr2 + GET_LC %r2 + mvc __PT_ORIG_GPR2(8,%r11),__LC_PGM_LAST_BREAK(%r2) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs - jg kernel_stack_overflow -SYM_CODE_END(stack_overflow) + jg kernel_stack_invalid +SYM_CODE_END(stack_invalid) .section .data, "aw" .balign 4 @@ -616,20 +626,3 @@ SYM_DATA_START_LOCAL(daton_psw) .quad PSW_KERNEL_BITS .quad .Ldaton SYM_DATA_END(daton_psw) - - .section .rodata, "a" - .balign 8 -#define SYSCALL(esame,emu) .quad __s390x_ ## esame -SYM_DATA_START(sys_call_table) -#include "asm/syscall_table.h" -SYM_DATA_END(sys_call_table) -#undef SYSCALL - -#ifdef CONFIG_COMPAT - -#define SYSCALL(esame,emu) .quad __s390_ ## emu -SYM_DATA_START(sys_call_table_emu) -#include "asm/syscall_table.h" -SYM_DATA_END(sys_call_table_emu) -#undef SYSCALL -#endif diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index a1f28879c87e..fb67b4abe68c 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -31,7 +31,7 @@ void do_secure_storage_access(struct pt_regs *regs); void do_non_secure_storage_access(struct pt_regs *regs); void do_secure_storage_violation(struct pt_regs *regs); void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str); -void kernel_stack_overflow(struct pt_regs * regs); +void kernel_stack_invalid(struct pt_regs *regs); void handle_signal32(struct ksignal *ksig, sigset_t *oldset, struct pt_regs *regs); @@ -56,8 +56,6 @@ long sys_s390_pci_mmio_write(unsigned long, const void __user *, size_t); long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t); long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user *return_code, unsigned long flags); -DECLARE_PER_CPU(u64, mt_cycles[8]); - unsigned long stack_alloc(void); void stack_free(unsigned long stack); diff --git a/arch/s390/kernel/facility.c b/arch/s390/kernel/facility.c index f02127219a27..d028b0be5c1d 100644 --- a/arch/s390/kernel/facility.c +++ b/arch/s390/kernel/facility.c @@ -3,6 +3,7 @@ * Copyright IBM Corp. 2023 */ +#include <linux/export.h> #include <asm/facility.h> unsigned int stfle_size(void) diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c index 6f2e87920288..03a8973aec3c 100644 --- a/arch/s390/kernel/fpu.c +++ b/arch/s390/kernel/fpu.c @@ -5,6 +5,8 @@ * Copyright IBM Corp. 2015 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ + +#include <linux/export.h> #include <linux/kernel.h> #include <linux/cpu.h> #include <linux/sched.h> diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 63ba6306632e..e94bb98f5231 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/kmsan-checks.h> +#include <linux/cpufeature.h> #include <linux/kprobes.h> #include <linux/execmem.h> #include <trace/syscall.h> @@ -69,7 +70,7 @@ static const char *ftrace_shared_hotpatch_trampoline(const char **end) bool ftrace_need_init_nop(void) { - return !MACHINE_HAS_SEQ_INSN; + return !cpu_has_seq_insn(); } int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) @@ -189,7 +190,7 @@ static int ftrace_modify_trampoline_call(struct dyn_ftrace *rec, int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { - if (MACHINE_HAS_SEQ_INSN) + if (cpu_has_seq_insn()) return ftrace_patch_branch_insn(rec->ip, old_addr, addr); else return ftrace_modify_trampoline_call(rec, old_addr, addr); @@ -213,8 +214,8 @@ static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable) int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - /* Expect brcl 0xf,... for the !MACHINE_HAS_SEQ_INSN case */ - if (MACHINE_HAS_SEQ_INSN) + /* Expect brcl 0xf,... for the !cpu_has_seq_insn() case */ + if (cpu_has_seq_insn()) return ftrace_patch_branch_insn(rec->ip, addr, 0); else return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false); @@ -234,7 +235,7 @@ static int ftrace_make_trampoline_call(struct dyn_ftrace *rec, unsigned long add int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - if (MACHINE_HAS_SEQ_INSN) + if (cpu_has_seq_insn()) return ftrace_patch_branch_insn(rec->ip, 0, addr); else return ftrace_make_trampoline_call(rec, addr); @@ -266,12 +267,13 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { unsigned long *parent = &arch_ftrace_regs(fregs)->regs.gprs[14]; + unsigned long sp = arch_ftrace_regs(fregs)->regs.gprs[15]; if (unlikely(ftrace_graph_is_dead())) return; if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; - if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs)) + if (!function_graph_enter_regs(*parent, ip, 0, (unsigned long *)sp, fregs)) *parent = (unsigned long)&return_to_handler; } diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c index 0b68168d9566..cab8ac25a010 100644 --- a/arch/s390/kernel/guarded_storage.c +++ b/arch/s390/kernel/guarded_storage.c @@ -4,6 +4,7 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ +#include <linux/cpufeature.h> #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/signal.h> @@ -23,7 +24,7 @@ static int gs_enable(void) struct gs_cb *gs_cb; if (!current->thread.gs_cb) { - gs_cb = kzalloc(sizeof(*gs_cb), GFP_KERNEL); + gs_cb = kzalloc_obj(*gs_cb); if (!gs_cb) return -ENOMEM; gs_cb->gsd = 25; @@ -54,7 +55,7 @@ static int gs_set_bc_cb(struct gs_cb __user *u_gs_cb) gs_cb = current->thread.gs_bc_cb; if (!gs_cb) { - gs_cb = kzalloc(sizeof(*gs_cb), GFP_KERNEL); + gs_cb = kzalloc_obj(*gs_cb); if (!gs_cb) return -ENOMEM; current->thread.gs_bc_cb = gs_cb; @@ -109,7 +110,7 @@ static int gs_broadcast(void) SYSCALL_DEFINE2(s390_guarded_storage, int, command, struct gs_cb __user *, gs_cb) { - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -EOPNOTSUPP; switch (command) { case GS_ENABLE: diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head.S index 396034b2fe67..7edb9ded199c 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head.S @@ -18,12 +18,10 @@ __HEAD SYM_CODE_START(startup_continue) - larl %r1,tod_clock_base - GET_LC %r2 - mvc 0(16,%r1),__LC_BOOT_CLOCK(%r2) # # Setup stack # + GET_LC %r2 larl %r14,init_task stg %r14,__LC_CURRENT(%r2) larl %r15,init_thread_union+STACK_INIT_OFFSET diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c index 7857a7e8e56c..217206522266 100644 --- a/arch/s390/kernel/hiperdispatch.c +++ b/arch/s390/kernel/hiperdispatch.c @@ -3,8 +3,7 @@ * Copyright IBM Corp. 2024 */ -#define KMSG_COMPONENT "hd" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hd: " fmt /* * Hiperdispatch: @@ -45,6 +44,7 @@ * therefore delaying the throughput loss caused by using SMP threads. */ +#include <linux/cpufeature.h> #include <linux/cpumask.h> #include <linux/debugfs.h> #include <linux/device.h> @@ -64,7 +64,7 @@ #define HD_DELAY_FACTOR (4) #define HD_DELAY_INTERVAL (HZ / 4) -#define HD_STEAL_THRESHOLD 30 +#define HD_STEAL_THRESHOLD 10 #define HD_STEAL_AVG_WEIGHT 16 static cpumask_t hd_vl_coremask; /* Mask containing all vertical low COREs */ @@ -87,7 +87,7 @@ static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn); static int hd_set_hiperdispatch_mode(int enable) { - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) enable = 0; if (hd_enabled == enable) return 0; @@ -190,7 +190,7 @@ int hd_enable_hiperdispatch(void) return 0; if (hd_online_cores <= hd_entitled_cores) return 0; - mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); + mod_delayed_work(system_dfl_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); hd_update_capacities(); return 1; } diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 39cb8d0ae348..1f1b06b6b4ef 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -15,37 +15,22 @@ #include <trace/events/power.h> #include <asm/cpu_mf.h> #include <asm/cputime.h> +#include <asm/idle.h> #include <asm/nmi.h> #include <asm/smp.h> -#include "entry.h" -static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); +DEFINE_PER_CPU(struct s390_idle_data, s390_idle); void account_idle_time_irq(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); - struct lowcore *lc = get_lowcore(); unsigned long idle_time; - u64 cycles_new[8]; - int i; - if (smp_cpu_mtid) { - stcctm(MT_DIAG, smp_cpu_mtid, cycles_new); - for (i = 0; i < smp_cpu_mtid; i++) - this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); - } - - idle_time = lc->int_clock - idle->clock_idle_enter; - - lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; - lc->last_update_clock = lc->int_clock; - - lc->system_timer += lc->last_update_timer - idle->timer_idle_enter; - lc->last_update_timer = lc->sys_enter_timer; + idle_time = get_lowcore()->int_clock - idle->clock_idle_enter; /* Account time spent with enabled wait psw loaded as idle time. */ - WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time); - WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1); + __atomic64_add(idle_time, &idle->idle_time); + __atomic64_add_const(1, &idle->idle_count); account_idle_time(cputime_to_nsecs(idle_time)); } diff --git a/arch/s390/kernel/ima_arch.c b/arch/s390/kernel/ima_arch.c deleted file mode 100644 index f3c3e6e1c5d3..000000000000 --- a/arch/s390/kernel/ima_arch.c +++ /dev/null @@ -1,14 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/ima.h> -#include <asm/boot_data.h> - -bool arch_ima_get_secureboot(void) -{ - return ipl_secure_flag; -} - -const char * const *arch_get_ima_policy(void) -{ - return NULL; -} diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 69be2309cde0..3c346b02ceb9 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -21,7 +21,9 @@ #include <linux/crash_dump.h> #include <linux/debug_locks.h> #include <linux/vmalloc.h> +#include <linux/secure_boot.h> #include <asm/asm-extable.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/ipl.h> #include <asm/smp.h> @@ -185,7 +187,7 @@ static inline int __diag308(unsigned long subcode, unsigned long addr) r1.even = addr; r1.odd = 0; - asm volatile( + asm_inline volatile( " diag %[r1],%[subcode],0x308\n" "0: nopr %%r7\n" EX_TABLE(0b,0b) @@ -261,6 +263,24 @@ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) +#define DEFINE_IPL_ATTR_BOOTPROG_RW(_prefix, _name, _fmt_out, _fmt_in, _hdr, _value) \ + IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, (unsigned long long) _value) \ +static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + unsigned long long value; \ + if (sscanf(buf, _fmt_in, &value) != 1) \ + return -EINVAL; \ + (_value) = value; \ + (_hdr).flags &= ~IPL_PL_FLAG_SBP; \ + return len; \ +} \ +static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ + __ATTR(_name, 0644, \ + sys_##_prefix##_##_name##_show, \ + sys_##_prefix##_##_name##_store) + #define DEFINE_IPL_ATTR_STR_RW(_prefix, _name, _fmt_out, _fmt_in, _value)\ IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, _value) \ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ @@ -269,7 +289,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ { \ if (len >= sizeof(_value)) \ return -E2BIG; \ - len = strscpy(_value, buf, sizeof(_value)); \ + len = strscpy(_value, buf); \ if ((ssize_t)len < 0) \ return len; \ strim(_value); \ @@ -595,7 +615,7 @@ static struct attribute *ipl_fcp_attrs[] = { static const struct attribute_group ipl_fcp_attr_group = { .attrs = ipl_fcp_attrs, - .bin_attrs_new = ipl_fcp_bin_attrs, + .bin_attrs = ipl_fcp_bin_attrs, }; static struct attribute *ipl_nvme_attrs[] = { @@ -609,7 +629,7 @@ static struct attribute *ipl_nvme_attrs[] = { static const struct attribute_group ipl_nvme_attr_group = { .attrs = ipl_nvme_attrs, - .bin_attrs_new = ipl_nvme_bin_attrs, + .bin_attrs = ipl_nvme_bin_attrs, }; static struct attribute *ipl_eckd_attrs[] = { @@ -622,7 +642,7 @@ static struct attribute *ipl_eckd_attrs[] = { static const struct attribute_group ipl_eckd_attr_group = { .attrs = ipl_eckd_attrs, - .bin_attrs_new = ipl_eckd_bin_attrs, + .bin_attrs = ipl_eckd_bin_attrs, }; /* CCW ipl device attributes */ @@ -685,7 +705,7 @@ static int __init ipl_init(void) goto out; switch (ipl_info.type) { case IPL_TYPE_CCW: - if (MACHINE_IS_VM) + if (machine_is_vm()) rc = sysfs_create_group(&ipl_kset->kobj, &ipl_ccw_attr_group_vm); else @@ -817,12 +837,13 @@ DEFINE_IPL_ATTR_RW(reipl_fcp, wwpn, "0x%016llx\n", "%llx\n", reipl_block_fcp->fcp.wwpn); DEFINE_IPL_ATTR_RW(reipl_fcp, lun, "0x%016llx\n", "%llx\n", reipl_block_fcp->fcp.lun); -DEFINE_IPL_ATTR_RW(reipl_fcp, bootprog, "%lld\n", "%lld\n", - reipl_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_RW(reipl_fcp, br_lba, "%lld\n", "%lld\n", reipl_block_fcp->fcp.br_lba); DEFINE_IPL_ATTR_RW(reipl_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", reipl_block_fcp->fcp.devno); +DEFINE_IPL_ATTR_BOOTPROG_RW(reipl_fcp, bootprog, "%lld\n", "%lld\n", + reipl_block_fcp->hdr, + reipl_block_fcp->fcp.bootprog); static void reipl_get_ascii_loadparm(char *loadparm, struct ipl_parameter_block *ibp) @@ -919,7 +940,7 @@ static struct attribute *reipl_fcp_attrs[] = { static const struct attribute_group reipl_fcp_attr_group = { .attrs = reipl_fcp_attrs, - .bin_attrs_new = reipl_fcp_bin_attrs, + .bin_attrs = reipl_fcp_bin_attrs, }; static struct kobj_attribute sys_reipl_fcp_clear_attr = @@ -941,10 +962,11 @@ DEFINE_IPL_ATTR_RW(reipl_nvme, fid, "0x%08llx\n", "%llx\n", reipl_block_nvme->nvme.fid); DEFINE_IPL_ATTR_RW(reipl_nvme, nsid, "0x%08llx\n", "%llx\n", reipl_block_nvme->nvme.nsid); -DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", - reipl_block_nvme->nvme.bootprog); DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n", reipl_block_nvme->nvme.br_lba); +DEFINE_IPL_ATTR_BOOTPROG_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", + reipl_block_nvme->hdr, + reipl_block_nvme->nvme.bootprog); static struct attribute *reipl_nvme_attrs[] = { &sys_reipl_nvme_fid_attr.attr, @@ -957,7 +979,7 @@ static struct attribute *reipl_nvme_attrs[] = { static const struct attribute_group reipl_nvme_attr_group = { .attrs = reipl_nvme_attrs, - .bin_attrs_new = reipl_nvme_bin_attrs + .bin_attrs = reipl_nvme_bin_attrs }; static ssize_t reipl_nvme_clear_show(struct kobject *kobj, @@ -1037,8 +1059,9 @@ static const struct bin_attribute *const reipl_eckd_bin_attrs[] = { }; DEFINE_IPL_CCW_ATTR_RW(reipl_eckd, device, reipl_block_eckd->eckd); -DEFINE_IPL_ATTR_RW(reipl_eckd, bootprog, "%lld\n", "%lld\n", - reipl_block_eckd->eckd.bootprog); +DEFINE_IPL_ATTR_BOOTPROG_RW(reipl_eckd, bootprog, "%lld\n", "%lld\n", + reipl_block_eckd->hdr, + reipl_block_eckd->eckd.bootprog); static struct attribute *reipl_eckd_attrs[] = { &sys_reipl_eckd_device_attr.attr, @@ -1050,7 +1073,7 @@ static struct attribute *reipl_eckd_attrs[] = { static const struct attribute_group reipl_eckd_attr_group = { .attrs = reipl_eckd_attrs, - .bin_attrs_new = reipl_eckd_bin_attrs + .bin_attrs = reipl_eckd_bin_attrs }; static ssize_t reipl_eckd_clear_show(struct kobject *kobj, @@ -1272,7 +1295,7 @@ static void reipl_block_ccw_fill_parms(struct ipl_parameter_block *ipb) ipb->ccw.flags = IPL_PB0_FLAG_LOADPARM; /* VM PARM */ - if (MACHINE_IS_VM && ipl_block_valid && + if (machine_is_vm() && ipl_block_valid && (ipl_block.ccw.vm_flags & IPL_PB0_CCW_VM_FLAG_VP)) { ipb->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_VP; @@ -1286,7 +1309,7 @@ static int __init reipl_nss_init(void) { int rc; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 0; reipl_block_nss = (void *) get_zeroed_page(GFP_KERNEL); @@ -1311,8 +1334,8 @@ static int __init reipl_ccw_init(void) return -ENOMEM; rc = sysfs_create_group(&reipl_kset->kobj, - MACHINE_IS_VM ? &reipl_ccw_attr_group_vm - : &reipl_ccw_attr_group_lpar); + machine_is_vm() ? &reipl_ccw_attr_group_vm + : &reipl_ccw_attr_group_lpar); if (rc) return rc; @@ -1566,12 +1589,13 @@ DEFINE_IPL_ATTR_RW(dump_fcp, wwpn, "0x%016llx\n", "%llx\n", dump_block_fcp->fcp.wwpn); DEFINE_IPL_ATTR_RW(dump_fcp, lun, "0x%016llx\n", "%llx\n", dump_block_fcp->fcp.lun); -DEFINE_IPL_ATTR_RW(dump_fcp, bootprog, "%lld\n", "%lld\n", - dump_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_RW(dump_fcp, br_lba, "%lld\n", "%lld\n", dump_block_fcp->fcp.br_lba); DEFINE_IPL_ATTR_RW(dump_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", dump_block_fcp->fcp.devno); +DEFINE_IPL_ATTR_BOOTPROG_RW(dump_fcp, bootprog, "%lld\n", "%lld\n", + dump_block_fcp->hdr, + dump_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_SCP_DATA_RW(dump_fcp, dump_block_fcp->hdr, dump_block_fcp->fcp, @@ -1595,7 +1619,7 @@ static const struct bin_attribute *const dump_fcp_bin_attrs[] = { static const struct attribute_group dump_fcp_attr_group = { .name = IPL_FCP_STR, .attrs = dump_fcp_attrs, - .bin_attrs_new = dump_fcp_bin_attrs, + .bin_attrs = dump_fcp_bin_attrs, }; /* NVME dump device attributes */ @@ -1603,10 +1627,11 @@ DEFINE_IPL_ATTR_RW(dump_nvme, fid, "0x%08llx\n", "%llx\n", dump_block_nvme->nvme.fid); DEFINE_IPL_ATTR_RW(dump_nvme, nsid, "0x%08llx\n", "%llx\n", dump_block_nvme->nvme.nsid); -DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", - dump_block_nvme->nvme.bootprog); DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n", dump_block_nvme->nvme.br_lba); +DEFINE_IPL_ATTR_BOOTPROG_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", + dump_block_nvme->hdr, + dump_block_nvme->nvme.bootprog); DEFINE_IPL_ATTR_SCP_DATA_RW(dump_nvme, dump_block_nvme->hdr, dump_block_nvme->nvme, @@ -1629,13 +1654,14 @@ static const struct bin_attribute *const dump_nvme_bin_attrs[] = { static const struct attribute_group dump_nvme_attr_group = { .name = IPL_NVME_STR, .attrs = dump_nvme_attrs, - .bin_attrs_new = dump_nvme_bin_attrs, + .bin_attrs = dump_nvme_bin_attrs, }; /* ECKD dump device attributes */ DEFINE_IPL_CCW_ATTR_RW(dump_eckd, device, dump_block_eckd->eckd); -DEFINE_IPL_ATTR_RW(dump_eckd, bootprog, "%lld\n", "%llx\n", - dump_block_eckd->eckd.bootprog); +DEFINE_IPL_ATTR_BOOTPROG_RW(dump_eckd, bootprog, "%lld\n", "%llx\n", + dump_block_eckd->hdr, + dump_block_eckd->eckd.bootprog); IPL_ATTR_BR_CHR_SHOW_FN(dump, dump_block_eckd->eckd); IPL_ATTR_BR_CHR_STORE_FN(dump, dump_block_eckd->eckd); @@ -1663,7 +1689,7 @@ static const struct bin_attribute *const dump_eckd_bin_attrs[] = { static const struct attribute_group dump_eckd_attr_group = { .name = IPL_ECKD_STR, .attrs = dump_eckd_attrs, - .bin_attrs_new = dump_eckd_bin_attrs, + .bin_attrs = dump_eckd_bin_attrs, }; /* CCW dump device attributes */ @@ -1987,7 +2013,7 @@ static void vmcmd_run(struct shutdown_trigger *trigger) static int vmcmd_init(void) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -EOPNOTSUPP; vmcmd_kset = kset_create_and_add("vmcmd", NULL, firmware_kobj); if (!vmcmd_kset) @@ -2248,26 +2274,28 @@ static int __init s390_ipl_init(void) __initcall(s390_ipl_init); -static void __init strncpy_skip_quote(char *dst, char *src, int n) +static void __init strscpy_skip_quote(char *dst, char *src, int n) { int sx, dx; - dx = 0; - for (sx = 0; src[sx] != 0; sx++) { + if (!n) + return; + for (sx = 0, dx = 0; src[sx]; sx++) { if (src[sx] == '"') continue; - dst[dx++] = src[sx]; - if (dx >= n) + dst[dx] = src[sx]; + if (dx + 1 == n) break; + dx++; } + dst[dx] = '\0'; } static int __init vmcmd_on_reboot_setup(char *str) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 1; - strncpy_skip_quote(vmcmd_on_reboot, str, VMCMD_MAX_SIZE); - vmcmd_on_reboot[VMCMD_MAX_SIZE] = 0; + strscpy_skip_quote(vmcmd_on_reboot, str, sizeof(vmcmd_on_reboot)); on_reboot_trigger.action = &vmcmd_action; return 1; } @@ -2275,10 +2303,9 @@ __setup("vmreboot=", vmcmd_on_reboot_setup); static int __init vmcmd_on_panic_setup(char *str) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 1; - strncpy_skip_quote(vmcmd_on_panic, str, VMCMD_MAX_SIZE); - vmcmd_on_panic[VMCMD_MAX_SIZE] = 0; + strscpy_skip_quote(vmcmd_on_panic, str, sizeof(vmcmd_on_panic)); on_panic_trigger.action = &vmcmd_action; return 1; } @@ -2286,10 +2313,9 @@ __setup("vmpanic=", vmcmd_on_panic_setup); static int __init vmcmd_on_halt_setup(char *str) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 1; - strncpy_skip_quote(vmcmd_on_halt, str, VMCMD_MAX_SIZE); - vmcmd_on_halt[VMCMD_MAX_SIZE] = 0; + strscpy_skip_quote(vmcmd_on_halt, str, sizeof(vmcmd_on_halt)); on_halt_trigger.action = &vmcmd_action; return 1; } @@ -2297,10 +2323,9 @@ __setup("vmhalt=", vmcmd_on_halt_setup); static int __init vmcmd_on_poff_setup(char *str) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 1; - strncpy_skip_quote(vmcmd_on_poff, str, VMCMD_MAX_SIZE); - vmcmd_on_poff[VMCMD_MAX_SIZE] = 0; + strscpy_skip_quote(vmcmd_on_poff, str, sizeof(vmcmd_on_poff)); on_poff_trigger.action = &vmcmd_action; return 1; } @@ -2353,7 +2378,7 @@ void __init setup_ipl(void) atomic_notifier_chain_register(&panic_notifier_list, &on_panic_nb); } -void s390_reset_system(void) +void __no_stack_protector s390_reset_system(void) { /* Disable prefixing */ set_prefix(0); @@ -2363,6 +2388,11 @@ void s390_reset_system(void) diag_amode31_ops.diag308_reset(); } +bool arch_get_secureboot(void) +{ + return ipl_secure_flag; +} + #ifdef CONFIG_KEXEC_FILE int ipl_report_add_component(struct ipl_report *report, struct kexec_buf *kbuf, diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index ef7be599e1f7..d10a17e6531d 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -9,6 +9,7 @@ */ #include <linux/kernel_stat.h> +#include <linux/cpufeature.h> #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> @@ -25,6 +26,7 @@ #include <asm/irq_regs.h> #include <asm/cputime.h> #include <asm/lowcore.h> +#include <asm/machine.h> #include <asm/irq.h> #include <asm/hw_irq.h> #include <asm/stacktrace.h> @@ -84,7 +86,6 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQIO_C70, .name = "C70", .desc = "[I/O] 3270"}, {.irq = IRQIO_TAP, .name = "TAP", .desc = "[I/O] Tape"}, {.irq = IRQIO_VMR, .name = "VMR", .desc = "[I/O] Unit Record Devices"}, - {.irq = IRQIO_LCS, .name = "LCS", .desc = "[I/O] LCS"}, {.irq = IRQIO_CTC, .name = "CTC", .desc = "[I/O] CTC"}, {.irq = IRQIO_ADM, .name = "ADM", .desc = "[I/O] EADM Subchannel"}, {.irq = IRQIO_CSC, .name = "CSC", .desc = "[I/O] CHSC Subchannel"}, @@ -145,15 +146,18 @@ void noinstr do_io_irq(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); bool from_idle; + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); + if (from_idle) + update_timer_idle(); + irq_enter_rcu(); if (user_mode(regs)) { update_timer_sys(); - if (static_branch_likely(&cpu_has_bear)) + if (cpu_has_bear()) current->thread.last_break = regs->last_break; } - from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) account_idle_time_irq(); @@ -164,7 +168,7 @@ void noinstr do_io_irq(struct pt_regs *regs) do_irq_async(regs, THIN_INTERRUPT); else do_irq_async(regs, IO_INTERRUPT); - } while (MACHINE_IS_LPAR && irq_pending(regs)); + } while (machine_is_lpar() && irq_pending(regs)); irq_exit_rcu(); @@ -181,11 +185,15 @@ void noinstr do_ext_irq(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); bool from_idle; + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); + if (from_idle) + update_timer_idle(); + irq_enter_rcu(); if (user_mode(regs)) { update_timer_sys(); - if (static_branch_likely(&cpu_has_bear)) + if (cpu_has_bear()) current->thread.last_break = regs->last_break; } @@ -193,7 +201,6 @@ void noinstr do_ext_irq(struct pt_regs *regs) regs->int_parm = get_lowcore()->ext_params; regs->int_parm_long = get_lowcore()->ext_params2; - from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) account_idle_time_irq(); @@ -311,7 +318,7 @@ int register_external_irq(u16 code, ext_int_handler_t handler) unsigned long flags; int index; - p = kmalloc(sizeof(*p), GFP_ATOMIC); + p = kmalloc_obj(*p, GFP_ATOMIC); if (!p) return -ENOMEM; p->code = code; diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 4d364de43799..143e34a4eca5 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -16,7 +16,7 @@ static int kexec_file_add_kernel_elf(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; Elf_Addr entry; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index a32ce8bea745..9a439175723c 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -16,7 +16,7 @@ static int kexec_file_add_kernel_image(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; buf.image = image; diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 8b80ea57125f..c450120b4474 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -13,6 +13,7 @@ #include <linux/ptrace.h> #include <linux/preempt.h> #include <linux/stop_machine.h> +#include <linux/cpufeature.h> #include <linux/kdebug.h> #include <linux/uaccess.h> #include <linux/extable.h> @@ -153,7 +154,7 @@ void arch_arm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 1}; - if (MACHINE_HAS_SEQ_INSN) { + if (cpu_has_seq_insn()) { swap_instruction(&args); text_poke_sync(); } else { @@ -166,7 +167,7 @@ void arch_disarm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 0}; - if (MACHINE_HAS_SEQ_INSN) { + if (cpu_has_seq_insn()) { swap_instruction(&args); text_poke_sync(); } else { diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 8f681ccfb83a..baeb3dcfc1c8 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -13,7 +13,9 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/debug_locks.h> +#include <linux/cpufeature.h> #include <asm/guarded_storage.h> +#include <asm/machine.h> #include <asm/pfault.h> #include <asm/cio.h> #include <asm/fpu.h> @@ -94,7 +96,7 @@ static noinline void __machine_kdump(void *image) mcesa = __va(get_lowcore()->mcesad & MCESA_ORIGIN_MASK); if (cpu_has_vx()) save_vx_regs((__vector128 *) mcesa->vector_save_area); - if (MACHINE_HAS_GS) { + if (cpu_has_gs()) { local_ctl_store(2, &cr2_old.reg); cr2_new = cr2_old; cr2_new.gse = 1; @@ -178,7 +180,7 @@ void arch_kexec_unprotect_crashkres(void) static int machine_kexec_prepare_kdump(void) { #ifdef CONFIG_CRASH_DUMP - if (MACHINE_IS_VM) + if (machine_is_vm()) diag10_range(PFN_DOWN(crashk_res.start), PFN_DOWN(crashk_res.end - crashk_res.start + 1)); return 0; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index c2bac14dd668..6f0852d5a3a9 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -28,7 +28,7 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { #ifdef CONFIG_KEXEC_SIG int s390_verify_sig(const char *kernel, unsigned long kernel_len) { - const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1; + const unsigned long marker_len = sizeof(MODULE_SIGNATURE_MARKER) - 1; struct module_signature *ms; unsigned long sig_len; int ret; @@ -40,7 +40,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) if (marker_len > kernel_len) return -EKEYREJECTED; - if (memcmp(kernel + kernel_len - marker_len, MODULE_SIG_STRING, + if (memcmp(kernel + kernel_len - marker_len, MODULE_SIGNATURE_MARKER, marker_len)) return -EKEYREJECTED; kernel_len -= marker_len; @@ -53,7 +53,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) return -EKEYREJECTED; kernel_len -= sig_len; - if (ms->id_type != PKEY_ID_PKCS7) + if (ms->id_type != MODULE_SIGNATURE_TYPE_PKCS7) return -EKEYREJECTED; if (ms->algo != 0 || @@ -129,7 +129,7 @@ static int kexec_file_update_purgatory(struct kimage *image, static int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; int ret; buf.image = image; @@ -152,7 +152,7 @@ static int kexec_file_add_purgatory(struct kimage *image, static int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; int ret; buf.image = image; @@ -184,7 +184,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, { __u32 *lc_ipl_parmblock_ptr; unsigned int len, ncerts; - struct kexec_buf buf; + struct kexec_buf buf = {}; unsigned long addr; void *ptr, *end; int ret; @@ -270,8 +270,10 @@ void *kexec_file_add_components(struct kimage *image, if (image->kernel_buf_len < minsize + max_command_line_size) goto out; - if (image->cmdline_buf_len >= max_command_line_size) + if (image->cmdline_buf_len >= max_command_line_size) { + pr_err("Kernel command line exceeds supported limit of %lu", max_command_line_size); goto out; + } memcpy(data.parm->command_line, image->cmdline_buf, image->cmdline_buf_len); diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 91e207b50394..9d1f8a50f5a4 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -22,12 +22,14 @@ #include <linux/bug.h> #include <linux/memory.h> #include <linux/execmem.h> +#include <asm/arch-stackprotector.h> #include <asm/alternative.h> #include <asm/nospec-branch.h> #include <asm/facility.h> #include <asm/ftrace.lds.h> #include <asm/set_memory.h> #include <asm/setup.h> +#include <asm/asm-offsets.h> #if 0 #define DEBUGP printk @@ -495,9 +497,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *s; char *secstrings, *secname; void *aseg; -#ifdef CONFIG_FUNCTION_TRACER - int ret; -#endif + int rc = 0; if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable && me->arch.plt_size) { @@ -527,14 +527,21 @@ int module_finalize(const Elf_Ehdr *hdr, (str_has_prefix(secname, ".s390_return"))) nospec_revert(aseg, aseg + s->sh_size); + if (IS_ENABLED(CONFIG_STACKPROTECTOR) && + (str_has_prefix(secname, "__stack_protector_loc"))) { + rc = stack_protector_apply(aseg, aseg + s->sh_size); + if (rc) + break; + } + #ifdef CONFIG_FUNCTION_TRACER if (!strcmp(FTRACE_CALLSITE_SECTION, secname)) { - ret = module_alloc_ftrace_hotpatch_trampolines(me, s); - if (ret < 0) - return ret; + rc = module_alloc_ftrace_hotpatch_trampolines(me, s); + if (rc) + break; } #endif /* CONFIG_FUNCTION_TRACER */ } - return 0; + return rc; } diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index fbd218b6fc8e..94fbfad49f62 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -9,6 +9,8 @@ */ #include <linux/kernel_stat.h> +#include <linux/utsname.h> +#include <linux/cpufeature.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/entry-common.h> @@ -20,7 +22,6 @@ #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/kvm_host.h> -#include <linux/export.h> #include <asm/lowcore.h> #include <asm/ctlreg.h> #include <asm/fpu.h> @@ -45,7 +46,7 @@ static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); static inline int nmi_needs_mcesa(void) { - return cpu_has_vx() || MACHINE_HAS_GS; + return cpu_has_vx() || cpu_has_gs(); } /* @@ -61,7 +62,7 @@ void __init nmi_alloc_mcesa_early(u64 *mcesad) if (!nmi_needs_mcesa()) return; *mcesad = __pa(&boot_mcesa); - if (MACHINE_HAS_GS) + if (cpu_has_gs()) *mcesad |= ilog2(MCESA_MAX_SIZE); } @@ -73,14 +74,14 @@ int nmi_alloc_mcesa(u64 *mcesad) *mcesad = 0; if (!nmi_needs_mcesa()) return 0; - size = MACHINE_HAS_GS ? MCESA_MAX_SIZE : MCESA_MIN_SIZE; + size = cpu_has_gs() ? MCESA_MAX_SIZE : MCESA_MIN_SIZE; origin = kmalloc(size, GFP_KERNEL); if (!origin) return -ENOMEM; /* The pointer is stored with mcesa_bits ORed in */ kmemleak_not_leak(origin); *mcesad = __pa(origin); - if (MACHINE_HAS_GS) + if (cpu_has_gs()) *mcesad |= ilog2(MCESA_MAX_SIZE); return 0; } @@ -115,18 +116,82 @@ static __always_inline char *u64_to_hex(char *dest, u64 val) return dest; } -static notrace void s390_handle_damage(void) +static notrace void nmi_print_info(void) { struct lowcore *lc = get_lowcore(); - union ctlreg0 cr0, cr0_new; char message[100]; - psw_t psw_save; char *ptr; + int i; + + ptr = nmi_puts(message, "Unrecoverable machine check, code: "); + ptr = u64_to_hex(ptr, lc->mcck_interruption_code); + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + + ptr = nmi_puts(message, init_utsname()->release); + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + + ptr = nmi_puts(message, arch_hw_string); + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + + ptr = nmi_puts(message, "PSW: "); + ptr = u64_to_hex(ptr, lc->mcck_old_psw.mask); + ptr = nmi_puts(ptr, " "); + ptr = u64_to_hex(ptr, lc->mcck_old_psw.addr); + ptr = nmi_puts(ptr, " PFX: "); + ptr = u64_to_hex(ptr, (u64)get_lowcore()); + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + + ptr = nmi_puts(message, "LBA: "); + ptr = u64_to_hex(ptr, lc->last_break_save_area); + ptr = nmi_puts(ptr, " EDC: "); + ptr = u64_to_hex(ptr, lc->external_damage_code); + ptr = nmi_puts(ptr, " FSA: "); + ptr = u64_to_hex(ptr, lc->failing_storage_address); + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + + ptr = nmi_puts(message, "CRS:\n"); + sclp_emergency_printk(message); + ptr = message; + for (i = 0; i < 16; i++) { + ptr = u64_to_hex(ptr, lc->cregs_save_area[i].val); + ptr = nmi_puts(ptr, " "); + if ((i + 1) % 4 == 0) { + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + ptr = message; + } + } + + ptr = nmi_puts(message, "GPRS:\n"); + sclp_emergency_printk(message); + ptr = message; + for (i = 0; i < 16; i++) { + ptr = u64_to_hex(ptr, lc->gpregs_save_area[i]); + ptr = nmi_puts(ptr, " "); + if ((i + 1) % 4 == 0) { + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + ptr = message; + } + } + + ptr = nmi_puts(message, "System stopped\n"); + sclp_emergency_printk(message); +} + +static notrace void __noreturn s390_handle_damage(void) +{ + struct lowcore *lc = get_lowcore(); + union ctlreg0 cr0, cr0_new; + psw_t psw_save; smp_emergency_stop(); diag_amode31_ops.diag308_reset(); - ptr = nmi_puts(message, "System stopped due to unrecoverable machine check, code: 0x"); - u64_to_hex(ptr, lc->mcck_interruption_code); /* * Disable low address protection and make machine check new PSW a @@ -140,7 +205,7 @@ static notrace void s390_handle_damage(void) psw_bits(lc->mcck_new_psw).io = 0; psw_bits(lc->mcck_new_psw).ext = 0; psw_bits(lc->mcck_new_psw).wait = 1; - sclp_emergency_printk(message); + nmi_print_info(); /* * Restore machine check new PSW and control register 0 to original @@ -149,7 +214,6 @@ static notrace void s390_handle_damage(void) lc->mcck_new_psw = psw_save; local_ctl_load(0, &cr0.reg); disabled_wait(); - while (1); } NOKPROBE_SYMBOL(s390_handle_damage); @@ -423,8 +487,8 @@ void notrace s390_do_machine_check(struct pt_regs *regs) mcck_dam_code = (mci.val & MCIC_SUBCLASS_MASK); if (test_cpu_flag(CIF_MCCK_GUEST) && (mcck_dam_code & MCCK_CODE_NO_GUEST) != mcck_dam_code) { - /* Set exit reason code for host's later handling */ - *((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR; + /* Set sie return code for host's later handling */ + ((struct stack_frame *)regs->gprs[15])->sie_return = SIE64_RETURN_MCCK; } clear_cpu_flag(CIF_MCCK_GUEST); diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index c2a468986212..4a30c10ae63b 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -6,8 +6,7 @@ * Author(s): Michael Holzheu <holzheu@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "os_info" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "os_info: " fmt #include <linux/crash_dump.h> #include <linux/kernel.h> @@ -155,7 +154,7 @@ static void os_info_old_init(void) goto fail; if (addr == 0 || addr % PAGE_SIZE) goto fail; - os_info_old = kzalloc(sizeof(*os_info_old), GFP_KERNEL); + os_info_old = kzalloc_obj(*os_info_old); if (!os_info_old) goto fail; if (copy_oldmem_kernel(os_info_old, addr, sizeof(*os_info_old))) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 33205dd410e4..7aa655664ecc 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -6,15 +6,13 @@ * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> * Thomas Richter <tmricht@linux.ibm.com> */ -#define KMSG_COMPONENT "cpum_cf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpum_cf: " fmt #include <linux/kernel.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/init.h> -#include <linux/export.h> #include <linux/miscdevice.h> #include <linux/perf_event.h> @@ -254,7 +252,7 @@ static int cpum_cf_alloc_cpu(int cpu) cpuhw = p->cpucf; if (!cpuhw) { - cpuhw = kzalloc(sizeof(*cpuhw), GFP_KERNEL); + cpuhw = kzalloc_obj(*cpuhw); if (cpuhw) { p->cpucf = cpuhw; refcount_set(&cpuhw->refcnt, 1); @@ -442,7 +440,7 @@ static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset) ctrset_size = 48; else if (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5) ctrset_size = 128; - else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7) + else if (cpumf_ctr_info.csvn >= 6 && cpumf_ctr_info.csvn <= 8) ctrset_size = 160; break; case CPUMF_CTR_SET_MT_DIAG: @@ -761,8 +759,6 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) break; case PERF_TYPE_HARDWARE: - if (is_sampling_event(event)) /* No sampling support */ - return -ENOENT; ev = attr->config; if (!attr->exclude_user && attr->exclude_kernel) { /* @@ -858,18 +854,15 @@ static int cpumf_pmu_event_type(struct perf_event *event) static int cpumf_pmu_event_init(struct perf_event *event) { unsigned int type = event->attr.type; - int err; + int err = -ENOENT; + if (is_sampling_event(event)) /* No sampling support */ + return err; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_RAW) err = __hw_perf_event_init(event, type); else if (event->pmu->type == type) /* Registered as unknown PMU */ err = __hw_perf_event_init(event, cpumf_pmu_event_type(event)); - else - return -ENOENT; - - if (unlikely(err) && event->destroy) - event->destroy(event); return err; } @@ -985,8 +978,6 @@ static int cfdiag_push_sample(struct perf_event *event, } overflow = perf_event_overflow(event, &data, ®s); - if (overflow) - event->pmu->stop(event, 0); perf_event_update_userpage(event); return overflow; @@ -1214,7 +1205,7 @@ static int __init cpumf_pmu_init(void) } /* Setup s390dbf facility */ - cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); + cf_dbg = debug_register("cpum_cf", 2, 1, 128); if (!cf_dbg) { pr_err("Registration of s390dbf(cpum_cf) failed\n"); rc = -ENOMEM; @@ -1625,7 +1616,7 @@ static long cfset_ioctl_start(unsigned long arg, struct file *file) if (!start.counter_sets) return -EINVAL; /* No counter set at all? */ - preq = kzalloc(sizeof(*preq), GFP_KERNEL); + preq = kzalloc_obj(*preq); if (!preq) return -ENOMEM; cpumask_clear(&preq->mask); @@ -1697,7 +1688,6 @@ static const struct file_operations cfset_fops = { .open = cfset_open, .release = cfset_release, .unlocked_ioctl = cfset_ioctl, - .compat_ioctl = cfset_ioctl, }; static struct miscdevice cfset_dev = { @@ -1819,8 +1809,6 @@ static int cfdiag_event_init(struct perf_event *event) event->destroy = hw_perf_event_destroy; err = cfdiag_event_init2(event); - if (unlikely(err)) - event->destroy(event); out: return err; } diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index e4a6bfc91080..ad8c32f0aaed 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -237,7 +237,6 @@ CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_NO_SPECIAL, 0x00f4); CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); - CPUMF_EVENT_ATTR(cf_z15, L1D_RO_EXCL_WRITES, 0x0080); CPUMF_EVENT_ATTR(cf_z15, DTLB2_WRITES, 0x0081); CPUMF_EVENT_ATTR(cf_z15, DTLB2_MISSES, 0x0082); @@ -291,8 +290,8 @@ CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_NO_SPECIAL, 0x00f4); CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_SPECIAL, 0x00f5); CPUMF_EVENT_ATTR(cf_z15, DFLT_ACCESS, 0x00f7); CPUMF_EVENT_ATTR(cf_z15, DFLT_CYCLES, 0x00fc); -CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108); -CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x00109); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x0108); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x0109); CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); CPUMF_EVENT_ATTR(cf_z16, L1D_RO_EXCL_WRITES, 0x0080); @@ -365,6 +364,83 @@ CPUMF_EVENT_ATTR(cf_z16, NNPA_WAIT_LOCK, 0x010d); CPUMF_EVENT_ATTR(cf_z16, NNPA_HOLD_LOCK, 0x010e); CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z17, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z17, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z17, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z17, CRSTE_1MB_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z17, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z17, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z17, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z17, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z17, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z17, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z17, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z17, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z17, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z17, DCW_REQ, 0x0091); +CPUMF_EVENT_ATTR(cf_z17, DCW_REQ_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z17, DCW_REQ_CHIP_HIT, 0x0093); +CPUMF_EVENT_ATTR(cf_z17, DCW_REQ_DRAWER_HIT, 0x0094); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP, 0x0095); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP_CHIP_HIT, 0x0097); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP_DRAWER_HIT, 0x0098); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_MODULE, 0x0099); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_DRAWER, 0x009a); +CPUMF_EVENT_ATTR(cf_z17, DCW_OFF_DRAWER, 0x009b); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP_MEMORY, 0x009c); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_MODULE_MEMORY, 0x009d); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_DRAWER_MEMORY, 0x009e); +CPUMF_EVENT_ATTR(cf_z17, DCW_OFF_DRAWER_MEMORY, 0x009f); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_MODULE_IV, 0x00a0); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_MODULE_CHIP_HIT, 0x00a1); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_MODULE_DRAWER_HIT, 0x00a2); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_DRAWER_IV, 0x00a3); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_DRAWER_CHIP_HIT, 0x00a4); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_DRAWER_DRAWER_HIT, 0x00a5); +CPUMF_EVENT_ATTR(cf_z17, IDCW_OFF_DRAWER_IV, 0x00a6); +CPUMF_EVENT_ATTR(cf_z17, IDCW_OFF_DRAWER_CHIP_HIT, 0x00a7); +CPUMF_EVENT_ATTR(cf_z17, IDCW_OFF_DRAWER_DRAWER_HIT, 0x00a8); +CPUMF_EVENT_ATTR(cf_z17, ICW_REQ, 0x00a9); +CPUMF_EVENT_ATTR(cf_z17, ICW_REQ_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z17, ICW_REQ_CHIP_HIT, 0x00ab); +CPUMF_EVENT_ATTR(cf_z17, ICW_REQ_DRAWER_HIT, 0x00ac); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_CHIP, 0x00ad); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_CHIP_IV, 0x00ae); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_CHIP_CHIP_HIT, 0x00af); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_CHIP_DRAWER_HIT, 0x00b0); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_MODULE, 0x00b1); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_DRAWER, 0x00b2); +CPUMF_EVENT_ATTR(cf_z17, ICW_OFF_DRAWER, 0x00b3); +CPUMF_EVENT_ATTR(cf_z17, CYCLES_SAMETHRD, 0x00ca); +CPUMF_EVENT_ATTR(cf_z17, CYCLES_DIFFTHRD, 0x00cb); +CPUMF_EVENT_ATTR(cf_z17, INST_SAMETHRD, 0x00cc); +CPUMF_EVENT_ATTR(cf_z17, INST_DIFFTHRD, 0x00cd); +CPUMF_EVENT_ATTR(cf_z17, WRONG_BRANCH_PREDICTION, 0x00ce); +CPUMF_EVENT_ATTR(cf_z17, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z17, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z17, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z17, TX_NC_TABORT, 0x00f4); +CPUMF_EVENT_ATTR(cf_z17, TX_C_TABORT_NO_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z17, TX_C_TABORT_SPECIAL, 0x00f6); +CPUMF_EVENT_ATTR(cf_z17, DFLT_ACCESS, 0x00f8); +CPUMF_EVENT_ATTR(cf_z17, DFLT_CYCLES, 0x00fd); +CPUMF_EVENT_ATTR(cf_z17, SORTL, 0x0100); +CPUMF_EVENT_ATTR(cf_z17, DFLT_CC, 0x0109); +CPUMF_EVENT_ATTR(cf_z17, DFLT_CCFINISH, 0x010a); +CPUMF_EVENT_ATTR(cf_z17, NNPA_INVOCATIONS, 0x010b); +CPUMF_EVENT_ATTR(cf_z17, NNPA_COMPLETIONS, 0x010c); +CPUMF_EVENT_ATTR(cf_z17, NNPA_WAIT_LOCK, 0x010d); +CPUMF_EVENT_ATTR(cf_z17, NNPA_HOLD_LOCK, 0x010e); +CPUMF_EVENT_ATTR(cf_z17, NNPA_INST_ONCHIP, 0x0110); +CPUMF_EVENT_ATTR(cf_z17, NNPA_INST_OFFCHIP, 0x0111); +CPUMF_EVENT_ATTR(cf_z17, NNPA_INST_DIFF, 0x0112); +CPUMF_EVENT_ATTR(cf_z17, NNPA_4K_PREFETCH, 0x0114); +CPUMF_EVENT_ATTR(cf_z17, NNPA_COMPL_LOCK, 0x0115); +CPUMF_EVENT_ATTR(cf_z17, NNPA_RETRY_LOCK, 0x0116); +CPUMF_EVENT_ATTR(cf_z17, NNPA_RETRY_LOCK_WITH_PLO, 0x0117); +CPUMF_EVENT_ATTR(cf_z17, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z17, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES), @@ -414,7 +490,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { NULL, }; -static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = { +static struct attribute *cpumcf_svn_678_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), @@ -779,6 +855,87 @@ static struct attribute *cpumcf_z16_pmu_event_attr[] __initdata = { NULL, }; +static struct attribute *cpumcf_z17_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z17, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z17, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z17, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z17, CRSTE_1MB_WRITES), + CPUMF_EVENT_PTR(cf_z17, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z17, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z17, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z17, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z17, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z17, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z17, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z17, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z17, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z17, DCW_REQ), + CPUMF_EVENT_PTR(cf_z17, DCW_REQ_IV), + CPUMF_EVENT_PTR(cf_z17, DCW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, DCW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z17, DCW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z17, DCW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_MODULE_IV), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_MODULE_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_MODULE_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_OFF_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z17, IDCW_OFF_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_OFF_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_REQ), + CPUMF_EVENT_PTR(cf_z17, ICW_REQ_IV), + CPUMF_EVENT_PTR(cf_z17, ICW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z17, ICW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z17, CYCLES_SAMETHRD), + CPUMF_EVENT_PTR(cf_z17, CYCLES_DIFFTHRD), + CPUMF_EVENT_PTR(cf_z17, INST_SAMETHRD), + CPUMF_EVENT_PTR(cf_z17, INST_DIFFTHRD), + CPUMF_EVENT_PTR(cf_z17, WRONG_BRANCH_PREDICTION), + CPUMF_EVENT_PTR(cf_z17, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z17, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z17, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z17, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z17, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z17, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z17, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z17, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z17, SORTL), + CPUMF_EVENT_PTR(cf_z17, DFLT_CC), + CPUMF_EVENT_PTR(cf_z17, DFLT_CCFINISH), + CPUMF_EVENT_PTR(cf_z17, NNPA_INVOCATIONS), + CPUMF_EVENT_PTR(cf_z17, NNPA_COMPLETIONS), + CPUMF_EVENT_PTR(cf_z17, NNPA_WAIT_LOCK), + CPUMF_EVENT_PTR(cf_z17, NNPA_HOLD_LOCK), + CPUMF_EVENT_PTR(cf_z17, NNPA_INST_ONCHIP), + CPUMF_EVENT_PTR(cf_z17, NNPA_INST_OFFCHIP), + CPUMF_EVENT_PTR(cf_z17, NNPA_INST_DIFF), + CPUMF_EVENT_PTR(cf_z17, NNPA_4K_PREFETCH), + CPUMF_EVENT_PTR(cf_z17, NNPA_COMPL_LOCK), + CPUMF_EVENT_PTR(cf_z17, NNPA_RETRY_LOCK), + CPUMF_EVENT_PTR(cf_z17, NNPA_RETRY_LOCK_WITH_PLO), + CPUMF_EVENT_PTR(cf_z17, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z17, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ static struct attribute_group cpumcf_pmu_events_group = { @@ -819,7 +976,7 @@ static __init struct attribute **merge_attr(struct attribute **a, j++; j++; - new = kmalloc_array(j, sizeof(struct attribute *), GFP_KERNEL); + new = kmalloc_objs(struct attribute *, j); if (!new) return NULL; j = 0; @@ -859,7 +1016,7 @@ __init const struct attribute_group **cpumf_cf_event_group(void) if (ci.csvn >= 1 && ci.csvn <= 5) csvn = cpumcf_svn_12345_pmu_event_attr; else if (ci.csvn >= 6) - csvn = cpumcf_svn_67_pmu_event_attr; + csvn = cpumcf_svn_678_pmu_event_attr; /* Determine model-specific counter set(s) */ get_cpu_id(&cpu_id); @@ -892,6 +1049,10 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 0x3932: model = cpumcf_z16_pmu_event_attr; break; + case 0x9175: + case 0x9176: + model = cpumcf_z17_pmu_event_attr; + break; default: model = none; break; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 5f60248cb468..7bfeb5208177 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -5,8 +5,7 @@ * Copyright IBM Corp. 2013, 2018 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "cpum_sf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpum_sf: " fmt #include <linux/kernel.h> #include <linux/kernel_stat.h> @@ -14,7 +13,6 @@ #include <linux/percpu.h> #include <linux/pid.h> #include <linux/notifier.h> -#include <linux/export.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/moduleparam.h> @@ -843,7 +841,7 @@ static bool is_callchain_event(struct perf_event *event) u64 sample_type = event->attr.sample_type; return sample_type & (PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_REGS_USER | - PERF_SAMPLE_STACK_USER); + PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_STACK_USER); } static int cpumsf_pmu_event_init(struct perf_event *event) @@ -885,9 +883,6 @@ static int cpumsf_pmu_event_init(struct perf_event *event) event->attr.exclude_idle = 0; err = __hw_perf_event_init(event); - if (unlikely(err)) - if (event->destroy) - event->destroy(event); return err; } @@ -1075,10 +1070,7 @@ static int perf_push_sample(struct perf_event *event, overflow = 0; if (perf_event_exclude(event, ®s, sde_regs)) goto out; - if (perf_event_overflow(event, &data, ®s)) { - overflow = 1; - event->pmu->stop(event, 0); - } + overflow = perf_event_overflow(event, &data, ®s); perf_event_update_userpage(event); out: return overflow; @@ -1100,7 +1092,7 @@ static void perf_event_count_update(struct perf_event *event, u64 count) * combined-sampling data entry consists of a basic- and a diagnostic-sampling * data entry. The sampling function is determined by the flags in the perf * event hardware structure. The function always works with a combined-sampling - * data entry but ignores the the diagnostic portion if it is not available. + * data entry but ignores the diagnostic portion if it is not available. * * Note that the implementation focuses on basic-sampling data entries and, if * such an entry is not valid, the entire combined-sampling data entry is @@ -1176,6 +1168,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, static void hw_perf_event_update(struct perf_event *event, int flush_all) { unsigned long long event_overflow, sampl_overflow, num_sdb; + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); struct hw_perf_event *hwc = &event->hw; union hws_trailer_header prev, new; struct hws_trailer_entry *te; @@ -1255,8 +1248,11 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) * are dropped. * Slightly increase the interval to avoid hitting this limit. */ - if (event_overflow) + if (event_overflow) { SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10); + if (SAMPL_RATE(hwc) > cpuhw->qsi.max_sampl_rate) + SAMPL_RATE(hwc) = cpuhw->qsi.max_sampl_rate; + } } static inline unsigned long aux_sdb_index(struct aux_buffer *aux, @@ -1617,7 +1613,7 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, } /* Allocate aux_buffer struct for the event */ - aux = kzalloc(sizeof(struct aux_buffer), GFP_KERNEL); + aux = kzalloc_obj(struct aux_buffer); if (!aux) goto no_aux; sfb = &aux->sfb; @@ -2077,7 +2073,7 @@ static int __init init_cpum_sampling_pmu(void) CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG); } - sfdbg = debug_register(KMSG_COMPONENT, 2, 1, 80); + sfdbg = debug_register("cpum_sf", 2, 1, 80); if (!sfdbg) { pr_err("Registering for s390dbf failed\n"); return -ENOMEM; diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index 2b9611c4718e..606750bae508 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -5,18 +5,15 @@ * Copyright IBM Corp. 2012, 2013 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "perf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "perf: " fmt #include <linux/kernel.h> #include <linux/perf_event.h> #include <linux/kvm_host.h> #include <linux/percpu.h> -#include <linux/export.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/uaccess.h> -#include <linux/compat.h> #include <linux/sysfs.h> #include <asm/stacktrace.h> #include <asm/irq.h> diff --git a/arch/s390/kernel/perf_pai.c b/arch/s390/kernel/perf_pai.c new file mode 100644 index 000000000000..86f71a3d1ef2 --- /dev/null +++ b/arch/s390/kernel/perf_pai.c @@ -0,0 +1,1229 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Facility + * + * Copyright IBM Corp. 2026 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#define pr_fmt(fmt) "pai: " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/perf_event.h> +#include <asm/ctlreg.h> +#include <asm/pai.h> +#include <asm/debug.h> + +static debug_info_t *paidbg; + +DEFINE_STATIC_KEY_FALSE(pai_key); + +enum { + PAI_PMU_CRYPTO, /* Index of PMU pai_crypto */ + PAI_PMU_EXT, /* Index of PMU pai_ext */ + PAI_PMU_MAX /* # of PAI PMUs */ +}; + +enum { + PAIE1_CB_SZ = 0x200, /* Size of PAIE1 control block */ + PAIE1_CTRBLOCK_SZ = 0x400 /* Size of PAIE1 counter blocks */ +}; + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +/* Create the PAI extension 1 control block area. + * The PAI extension control block 1 is pointed to by lowcore + * address 0x1508 for each CPU. This control block is 512 bytes in size + * and requires a 512 byte boundary alignment. + */ +struct paiext_cb { /* PAI extension 1 control block */ + u64 header; /* Not used */ + u64 reserved1; + u64 acc; /* Addr to analytics counter control block */ + u8 reserved2[PAIE1_CTRBLOCK_SZ - 3 * sizeof(u64)]; +} __packed; + +struct pai_map { + unsigned long *area; /* Area for CPU to store counters */ + struct pai_userdata *save; /* Page to store no-zero counters */ + unsigned int active_events; /* # of PAI crypto users */ + refcount_t refcnt; /* Reference count mapped buffers */ + struct perf_event *event; /* Perf event for sampling */ + struct list_head syswide_list; /* List system-wide sampling events */ + struct paiext_cb *paiext_cb; /* PAI extension control block area */ + bool fullpage; /* True: counter area is a full page */ +}; + +struct pai_mapptr { + struct pai_map *mapptr; +}; + +static struct pai_root { /* Anchor to per CPU data */ + refcount_t refcnt; /* Overall active events */ + struct pai_mapptr __percpu *mapptr; +} pai_root[PAI_PMU_MAX]; + +/* This table defines the different parameters of the PAI PMUs. During + * initialization the machine dependent values are extracted and saved. + * However most of the values are static and do not change. + * There is one table entry per PAI PMU. + */ +struct pai_pmu { /* Define PAI PMU characteristics */ + const char *pmuname; /* Name of PMU */ + const int facility_nr; /* Facility number to check for support */ + unsigned int num_avail; /* # Counters defined by hardware */ + unsigned int num_named; /* # Counters known by name */ + unsigned long base; /* Counter set base number */ + unsigned long kernel_offset; /* Offset to kernel part in counter page */ + unsigned long area_size; /* Size of counter area */ + const char * const *names; /* List of counter names */ + struct pmu *pmu; /* Ptr to supporting PMU */ + int (*init)(struct pai_pmu *p); /* PMU support init function */ + void (*exit)(struct pai_pmu *p); /* PMU support exit function */ + struct attribute_group *event_group; /* Ptr to attribute of events */ +}; + +static struct pai_pmu pai_pmu[]; /* Forward declaration */ + +/* Free per CPU data when the last event is removed. */ +static void pai_root_free(int idx) +{ + if (refcount_dec_and_test(&pai_root[idx].refcnt)) { + free_percpu(pai_root[idx].mapptr); + pai_root[idx].mapptr = NULL; + } + debug_sprintf_event(paidbg, 5, "%s root[%d].refcount %d\n", __func__, + idx, refcount_read(&pai_root[idx].refcnt)); +} + +/* + * On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int pai_root_alloc(int idx) +{ + if (!refcount_inc_not_zero(&pai_root[idx].refcnt)) { + /* The memory is already zeroed. */ + pai_root[idx].mapptr = alloc_percpu(struct pai_mapptr); + if (!pai_root[idx].mapptr) + return -ENOMEM; + refcount_set(&pai_root[idx].refcnt, 1); + } + return 0; +} + +/* Release the PMU if event is the last perf event */ +static DEFINE_MUTEX(pai_reserve_mutex); + +/* Free all memory allocated for event counting/sampling setup */ +static void pai_free(struct pai_mapptr *mp) +{ + if (mp->mapptr->fullpage) + free_page((unsigned long)mp->mapptr->area); + else + kfree(mp->mapptr->area); + kfree(mp->mapptr->paiext_cb); + kvfree(mp->mapptr->save); + kfree(mp->mapptr); + mp->mapptr = NULL; +} + +/* Adjust usage counters and remove allocated memory when all users are + * gone. + */ +static void pai_event_destroy_cpu(struct perf_event *event, int cpu) +{ + int idx = PAI_PMU_IDX(event); + struct pai_mapptr *mp = per_cpu_ptr(pai_root[idx].mapptr, cpu); + struct pai_map *cpump = mp->mapptr; + + mutex_lock(&pai_reserve_mutex); + debug_sprintf_event(paidbg, 5, "%s event %#llx idx %d cpu %d users %d " + "refcnt %u\n", __func__, event->attr.config, idx, + event->cpu, cpump->active_events, + refcount_read(&cpump->refcnt)); + if (refcount_dec_and_test(&cpump->refcnt)) + pai_free(mp); + pai_root_free(idx); + mutex_unlock(&pai_reserve_mutex); +} + +static void pai_event_destroy(struct perf_event *event) +{ + int cpu; + + free_page(PAI_SAVE_AREA(event)); + if (event->cpu == -1) { + struct cpumask *mask = PAI_CPU_MASK(event); + + for_each_cpu(cpu, mask) + pai_event_destroy_cpu(event, cpu); + kfree(mask); + } else { + pai_event_destroy_cpu(event, event->cpu); + } +} + +static void paicrypt_event_destroy(struct perf_event *event) +{ + static_branch_dec(&pai_key); + pai_event_destroy(event); +} + +static u64 pai_getctr(unsigned long *page, int nr, unsigned long offset) +{ + if (offset) + nr += offset / sizeof(*page); + return page[nr]; +} + +/* Read the counter values. Return value from location in CMP. For base + * event xxx_ALL sum up all events. Returns counter value. + */ +static u64 pai_getdata(struct perf_event *event, bool kernel) +{ + int idx = PAI_PMU_IDX(event); + struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr); + struct pai_pmu *pp = &pai_pmu[idx]; + struct pai_map *cpump = mp->mapptr; + unsigned int i; + u64 sum = 0; + + if (event->attr.config != pp->base) { + return pai_getctr(cpump->area, + event->attr.config - pp->base, + kernel ? pp->kernel_offset : 0); + } + + for (i = 1; i <= pp->num_avail; i++) { + u64 val = pai_getctr(cpump->area, i, + kernel ? pp->kernel_offset : 0); + + if (!val) + continue; + sum += val; + } + return sum; +} + +static u64 paicrypt_getall(struct perf_event *event) +{ + u64 sum = 0; + + if (!event->attr.exclude_kernel) + sum += pai_getdata(event, true); + if (!event->attr.exclude_user) + sum += pai_getdata(event, false); + + return sum; +} + +/* Check concurrent access of counting and sampling for crypto events. + * This function is called in process context and it is save to block. + * When the event initialization functions fails, no other call back will + * be invoked. + * + * Allocate the memory for the event. + */ +static int pai_alloc_cpu(struct perf_event *event, int cpu) +{ + int rc, idx = PAI_PMU_IDX(event); + struct pai_map *cpump = NULL; + bool need_paiext_cb = false; + struct pai_mapptr *mp; + + mutex_lock(&pai_reserve_mutex); + /* Allocate root node */ + rc = pai_root_alloc(idx); + if (rc) + goto unlock; + + /* Allocate node for this event */ + mp = per_cpu_ptr(pai_root[idx].mapptr, cpu); + cpump = mp->mapptr; + if (!cpump) { /* Paicrypt_map allocated? */ + rc = -ENOMEM; + cpump = kzalloc_obj(*cpump); + if (!cpump) + goto undo; + /* Allocate memory for counter page and counter extraction. + * Only the first counting event has to allocate a page. + */ + mp->mapptr = cpump; + if (idx == PAI_PMU_CRYPTO) { + cpump->area = (unsigned long *)get_zeroed_page(GFP_KERNEL); + /* free_page() can handle 0x0 address */ + cpump->fullpage = true; + } else { /* PAI_PMU_EXT */ + /* + * Allocate memory for counter area and counter extraction. + * These are + * - a 512 byte block and requires 512 byte boundary + * alignment. + * - a 1KB byte block and requires 1KB boundary + * alignment. + * Only the first counting event has to allocate the area. + * + * Note: This works with commit 59bb47985c1d by default. + * Backporting this to kernels without this commit might + * needs adjustment. + */ + cpump->area = kzalloc(pai_pmu[idx].area_size, GFP_KERNEL); + cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL); + need_paiext_cb = true; + } + cpump->save = kvmalloc_objs(struct pai_userdata, + pai_pmu[idx].num_avail + 1); + if (!cpump->area || !cpump->save || + (need_paiext_cb && !cpump->paiext_cb)) { + pai_free(mp); + goto undo; + } + INIT_LIST_HEAD(&cpump->syswide_list); + refcount_set(&cpump->refcnt, 1); + rc = 0; + } else { + refcount_inc(&cpump->refcnt); + } + +undo: + if (rc) { + /* Error in allocation of event, decrement anchor. Since + * the event in not created, its destroy() function is never + * invoked. Adjust the reference counter for the anchor. + */ + pai_root_free(idx); + } +unlock: + mutex_unlock(&pai_reserve_mutex); + /* If rc is non-zero, no increment of counter/sampler was done. */ + return rc; +} + +static int pai_alloc(struct perf_event *event) +{ + struct cpumask *maskptr; + int cpu, rc = -ENOMEM; + + maskptr = kzalloc_obj(*maskptr); + if (!maskptr) + goto out; + + for_each_online_cpu(cpu) { + rc = pai_alloc_cpu(event, cpu); + if (rc) { + for_each_cpu(cpu, maskptr) + pai_event_destroy_cpu(event, cpu); + kfree(maskptr); + goto out; + } + cpumask_set_cpu(cpu, maskptr); + } + + /* + * On error all cpumask are freed and all events have been destroyed. + * Save of which CPUs data structures have been allocated for. + * Release them in pai_event_destroy call back function + * for this event. + */ + PAI_CPU_MASK(event) = maskptr; + rc = 0; +out: + return rc; +} + +/* Validate event number and return error if event is not supported. + * On successful return, PAI_PMU_IDX(event) is set to the index of + * the supporting paing_support[] array element. + */ +static int pai_event_valid(struct perf_event *event, int idx) +{ + struct perf_event_attr *a = &event->attr; + struct pai_pmu *pp = &pai_pmu[idx]; + + /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ + if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) + return -ENOENT; + /* Allow only CRYPTO_ALL/NNPA_ALL for sampling */ + if (a->sample_period && a->config != pp->base) + return -EINVAL; + /* PAI crypto event must be in valid range, try others if not */ + if (a->config < pp->base || a->config > pp->base + pp->num_avail) + return -ENOENT; + if (idx == PAI_PMU_EXT && a->exclude_user) + return -EINVAL; + PAI_PMU_IDX(event) = idx; + return 0; +} + +/* Might be called on different CPU than the one the event is intended for. */ +static int pai_event_init(struct perf_event *event, int idx) +{ + struct perf_event_attr *a = &event->attr; + int rc; + + /* PAI event must be valid and in supported range */ + rc = pai_event_valid(event, idx); + if (rc) + goto out; + /* Get a page to store last counter values for sampling */ + if (a->sample_period) { + PAI_SAVE_AREA(event) = get_zeroed_page(GFP_KERNEL); + if (!PAI_SAVE_AREA(event)) { + rc = -ENOMEM; + goto out; + } + } + + if (event->cpu >= 0) + rc = pai_alloc_cpu(event, event->cpu); + else + rc = pai_alloc(event); + if (rc) { + free_page(PAI_SAVE_AREA(event)); + goto out; + } + + if (a->sample_period) { + a->sample_period = 1; + a->freq = 0; + /* Register for paicrypt_sched_task() to be called */ + event->attach_state |= PERF_ATTACH_SCHED_CB; + /* Add raw data which contain the memory mapped counters */ + a->sample_type |= PERF_SAMPLE_RAW; + /* Turn off inheritance */ + a->inherit = 0; + } +out: + return rc; +} + +static int paicrypt_event_init(struct perf_event *event) +{ + int rc = pai_event_init(event, PAI_PMU_CRYPTO); + + if (!rc) { + event->destroy = paicrypt_event_destroy; + static_branch_inc(&pai_key); + } + return rc; +} + +static void pai_read(struct perf_event *event, + u64 (*fct)(struct perf_event *event)) +{ + u64 prev, new, delta; + + prev = local64_read(&event->hw.prev_count); + new = fct(event); + local64_set(&event->hw.prev_count, new); + delta = (prev <= new) ? new - prev : (-1ULL - prev) + new + 1; + local64_add(delta, &event->count); +} + +static void paicrypt_read(struct perf_event *event) +{ + pai_read(event, paicrypt_getall); +} + +static void pai_start(struct perf_event *event, int flags, + u64 (*fct)(struct perf_event *event)) +{ + int idx = PAI_PMU_IDX(event); + struct pai_pmu *pp = &pai_pmu[idx]; + struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr); + struct pai_map *cpump = mp->mapptr; + u64 sum; + + if (!event->attr.sample_period) { /* Counting */ + sum = fct(event); /* Get current value */ + local64_set(&event->hw.prev_count, sum); + } else { /* Sampling */ + memcpy((void *)PAI_SAVE_AREA(event), cpump->area, pp->area_size); + /* Enable context switch callback for system-wide sampling */ + if (!(event->attach_state & PERF_ATTACH_TASK)) { + list_add_tail(PAI_SWLIST(event), &cpump->syswide_list); + perf_sched_cb_inc(event->pmu); + } else { + cpump->event = event; + } + } +} + +static void paicrypt_start(struct perf_event *event, int flags) +{ + pai_start(event, flags, paicrypt_getall); +} + +static int pai_add(struct perf_event *event, int flags) +{ + int idx = PAI_PMU_IDX(event); + struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr); + struct pai_map *cpump = mp->mapptr; + struct paiext_cb *pcb = cpump->paiext_cb; + unsigned long ccd; + + if (++cpump->active_events == 1) { + if (!pcb) { /* PAI crypto */ + ccd = virt_to_phys(cpump->area) | PAI_CRYPTO_KERNEL_OFFSET; + WRITE_ONCE(get_lowcore()->ccd, ccd); + local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); + } else { /* PAI extension 1 */ + ccd = virt_to_phys(pcb); + WRITE_ONCE(get_lowcore()->aicd, ccd); + pcb->acc = virt_to_phys(cpump->area) | 0x1; + /* Enable CPU instruction lookup for PAIE1 control block */ + local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT); + } + } + if (flags & PERF_EF_START) + pai_pmu[idx].pmu->start(event, PERF_EF_RELOAD); + event->hw.state = 0; + return 0; +} + +static int paicrypt_add(struct perf_event *event, int flags) +{ + return pai_add(event, flags); +} + +static void pai_have_sample(struct perf_event *, struct pai_map *); +static void pai_stop(struct perf_event *event, int flags) +{ + int idx = PAI_PMU_IDX(event); + struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr); + struct pai_map *cpump = mp->mapptr; + + if (!event->attr.sample_period) { /* Counting */ + pai_pmu[idx].pmu->read(event); + } else { /* Sampling */ + if (!(event->attach_state & PERF_ATTACH_TASK)) { + perf_sched_cb_dec(event->pmu); + list_del(PAI_SWLIST(event)); + } else { + pai_have_sample(event, cpump); + cpump->event = NULL; + } + } + event->hw.state = PERF_HES_STOPPED; +} + +static void paicrypt_stop(struct perf_event *event, int flags) +{ + pai_stop(event, flags); +} + +static void pai_del(struct perf_event *event, int flags) +{ + int idx = PAI_PMU_IDX(event); + struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr); + struct pai_map *cpump = mp->mapptr; + struct paiext_cb *pcb = cpump->paiext_cb; + + pai_pmu[idx].pmu->stop(event, PERF_EF_UPDATE); + if (--cpump->active_events == 0) { + if (!pcb) { /* PAI crypto */ + local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); + WRITE_ONCE(get_lowcore()->ccd, 0); + } else { /* PAI extension 1 */ + /* Disable CPU instruction lookup for PAIE1 control block */ + local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT); + pcb->acc = 0; + WRITE_ONCE(get_lowcore()->aicd, 0); + } + } +} + +static void paicrypt_del(struct perf_event *event, int flags) +{ + pai_del(event, flags); +} + +/* Create raw data and save it in buffer. Calculate the delta for each + * counter between this invocation and the last invocation. + * Returns number of bytes copied. + * Saves only entries with positive counter difference of the form + * 2 bytes: Number of counter + * 8 bytes: Value of counter + */ +static size_t pai_copy(struct pai_userdata *userdata, unsigned long *page, + struct pai_pmu *pp, unsigned long *page_old, + bool exclude_user, bool exclude_kernel) +{ + int i, outidx = 0; + + for (i = 1; i <= pp->num_avail; i++) { + u64 val = 0, val_old = 0; + + if (!exclude_kernel) { + val += pai_getctr(page, i, pp->kernel_offset); + val_old += pai_getctr(page_old, i, pp->kernel_offset); + } + if (!exclude_user) { + val += pai_getctr(page, i, 0); + val_old += pai_getctr(page_old, i, 0); + } + if (val >= val_old) + val -= val_old; + else + val = (~0ULL - val_old) + val + 1; + if (val) { + userdata[outidx].num = i; + userdata[outidx].value = val; + outidx++; + } + } + return outidx * sizeof(*userdata); +} + +/* Write sample when one or more counters values are nonzero. + * + * Note: The function paicrypt_sched_task() and pai_push_sample() are not + * invoked after function paicrypt_del() has been called because of function + * perf_sched_cb_dec(). Both functions are only + * called when sampling is active. Function perf_sched_cb_inc() + * has been invoked to install function paicrypt_sched_task() as call back + * to run at context switch time. + * + * This causes function perf_event_context_sched_out() and + * perf_event_context_sched_in() to check whether the PMU has installed an + * sched_task() callback. That callback is not active after paicrypt_del() + * returns and has deleted the event on that CPU. + */ +static int pai_push_sample(size_t rawsize, struct pai_map *cpump, + struct perf_event *event) +{ + int idx = PAI_PMU_IDX(event); + struct pai_pmu *pp = &pai_pmu[idx]; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + int overflow; + + /* Setup perf sample */ + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + memset(&data, 0, sizeof(data)); + perf_sample_data_init(&data, 0, event->hw.last_period); + if (event->attr.sample_type & PERF_SAMPLE_TID) { + data.tid_entry.pid = task_tgid_nr(current); + data.tid_entry.tid = task_pid_nr(current); + } + if (event->attr.sample_type & PERF_SAMPLE_TIME) + data.time = event->clock(); + if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) + data.id = event->id; + if (event->attr.sample_type & PERF_SAMPLE_CPU) { + data.cpu_entry.cpu = smp_processor_id(); + data.cpu_entry.reserved = 0; + } + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = rawsize; + raw.frag.data = cpump->save; + perf_sample_save_raw_data(&data, event, &raw); + } + + overflow = perf_event_overflow(event, &data, ®s); + perf_event_update_userpage(event); + /* Save crypto counter lowcore page after reading event data. */ + memcpy((void *)PAI_SAVE_AREA(event), cpump->area, pp->area_size); + return overflow; +} + +/* Check if there is data to be saved on schedule out of a task. */ +static void pai_have_sample(struct perf_event *event, struct pai_map *cpump) +{ + struct pai_pmu *pp; + size_t rawsize; + + if (!event) /* No event active */ + return; + pp = &pai_pmu[PAI_PMU_IDX(event)]; + rawsize = pai_copy(cpump->save, cpump->area, pp, + (unsigned long *)PAI_SAVE_AREA(event), + event->attr.exclude_user, + event->attr.exclude_kernel); + if (rawsize) /* No incremented counters */ + pai_push_sample(rawsize, cpump, event); +} + +/* Check if there is data to be saved on schedule out of a task. */ +static void pai_have_samples(int idx) +{ + struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr); + struct pai_map *cpump = mp->mapptr; + struct perf_event *event; + + list_for_each_entry(event, &cpump->syswide_list, hw.tp_list) + pai_have_sample(event, cpump); +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event CRYPTO_ALL is allowed. + */ +static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, save old values. + */ + if (!sched_in) + pai_have_samples(PAI_PMU_CRYPTO); +} + +/* ============================= paiext ====================================*/ + +static void paiext_event_destroy(struct perf_event *event) +{ + pai_event_destroy(event); +} + +/* Might be called on different CPU than the one the event is intended for. */ +static int paiext_event_init(struct perf_event *event) +{ + int rc = pai_event_init(event, PAI_PMU_EXT); + + if (!rc) { + event->attr.exclude_kernel = true; /* No kernel space part */ + event->destroy = paiext_event_destroy; + /* Offset of NNPA in paiext_cb */ + event->hw.config_base = offsetof(struct paiext_cb, acc); + } + return rc; +} + +static u64 paiext_getall(struct perf_event *event) +{ + return pai_getdata(event, false); +} + +static void paiext_read(struct perf_event *event) +{ + pai_read(event, paiext_getall); +} + +static void paiext_start(struct perf_event *event, int flags) +{ + pai_start(event, flags, paiext_getall); +} + +static int paiext_add(struct perf_event *event, int flags) +{ + return pai_add(event, flags); +} + +static void paiext_stop(struct perf_event *event, int flags) +{ + pai_stop(event, flags); +} + +static void paiext_del(struct perf_event *event, int flags) +{ + pai_del(event, flags); +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event NNPA_ALL is allowed. + */ +static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, save old values. + */ + if (!sched_in) + pai_have_samples(PAI_PMU_EXT); +} + +/* Attribute definitions for paicrypt interface. As with other CPU + * Measurement Facilities, there is one attribute per mapped counter. + * The number of mapped counters may vary per machine generation. Use + * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction + * to determine the number of mapped counters. The instructions returns + * a positive number, which is the highest number of supported counters. + * All counters less than this number are also supported, there are no + * holes. A returned number of zero means no support for mapped counters. + * + * The identification of the counter is a unique number. The chosen range + * is 0x1000 + offset in mapped kernel page. + * All CPU Measurement Facility counters identifiers must be unique and + * the numbers from 0 to 496 are already used for the CPU Measurement + * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already + * used for the CPU Measurement Sampling facility. + */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *paicrypt_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paicrypt_events_group = { + .name = "events", + .attrs = NULL /* Filled in attr_event_init() */ +}; + +static struct attribute_group paicrypt_format_group = { + .name = "format", + .attrs = paicrypt_format_attr, +}; + +static const struct attribute_group *paicrypt_attr_groups[] = { + &paicrypt_events_group, + &paicrypt_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paicrypt = { + .task_ctx_nr = perf_hw_context, + .event_init = paicrypt_event_init, + .add = paicrypt_add, + .del = paicrypt_del, + .start = paicrypt_start, + .stop = paicrypt_stop, + .read = paicrypt_read, + .sched_task = paicrypt_sched_task, + .attr_groups = paicrypt_attr_groups +}; + +/* List of symbolic PAI counter names. */ +static const char * const paicrypt_ctrnames[] = { + [0] = "CRYPTO_ALL", + [1] = "KM_DEA", + [2] = "KM_TDEA_128", + [3] = "KM_TDEA_192", + [4] = "KM_ENCRYPTED_DEA", + [5] = "KM_ENCRYPTED_TDEA_128", + [6] = "KM_ENCRYPTED_TDEA_192", + [7] = "KM_AES_128", + [8] = "KM_AES_192", + [9] = "KM_AES_256", + [10] = "KM_ENCRYPTED_AES_128", + [11] = "KM_ENCRYPTED_AES_192", + [12] = "KM_ENCRYPTED_AES_256", + [13] = "KM_XTS_AES_128", + [14] = "KM_XTS_AES_256", + [15] = "KM_XTS_ENCRYPTED_AES_128", + [16] = "KM_XTS_ENCRYPTED_AES_256", + [17] = "KMC_DEA", + [18] = "KMC_TDEA_128", + [19] = "KMC_TDEA_192", + [20] = "KMC_ENCRYPTED_DEA", + [21] = "KMC_ENCRYPTED_TDEA_128", + [22] = "KMC_ENCRYPTED_TDEA_192", + [23] = "KMC_AES_128", + [24] = "KMC_AES_192", + [25] = "KMC_AES_256", + [26] = "KMC_ENCRYPTED_AES_128", + [27] = "KMC_ENCRYPTED_AES_192", + [28] = "KMC_ENCRYPTED_AES_256", + [29] = "KMC_PRNG", + [30] = "KMA_GCM_AES_128", + [31] = "KMA_GCM_AES_192", + [32] = "KMA_GCM_AES_256", + [33] = "KMA_GCM_ENCRYPTED_AES_128", + [34] = "KMA_GCM_ENCRYPTED_AES_192", + [35] = "KMA_GCM_ENCRYPTED_AES_256", + [36] = "KMF_DEA", + [37] = "KMF_TDEA_128", + [38] = "KMF_TDEA_192", + [39] = "KMF_ENCRYPTED_DEA", + [40] = "KMF_ENCRYPTED_TDEA_128", + [41] = "KMF_ENCRYPTED_TDEA_192", + [42] = "KMF_AES_128", + [43] = "KMF_AES_192", + [44] = "KMF_AES_256", + [45] = "KMF_ENCRYPTED_AES_128", + [46] = "KMF_ENCRYPTED_AES_192", + [47] = "KMF_ENCRYPTED_AES_256", + [48] = "KMCTR_DEA", + [49] = "KMCTR_TDEA_128", + [50] = "KMCTR_TDEA_192", + [51] = "KMCTR_ENCRYPTED_DEA", + [52] = "KMCTR_ENCRYPTED_TDEA_128", + [53] = "KMCTR_ENCRYPTED_TDEA_192", + [54] = "KMCTR_AES_128", + [55] = "KMCTR_AES_192", + [56] = "KMCTR_AES_256", + [57] = "KMCTR_ENCRYPTED_AES_128", + [58] = "KMCTR_ENCRYPTED_AES_192", + [59] = "KMCTR_ENCRYPTED_AES_256", + [60] = "KMO_DEA", + [61] = "KMO_TDEA_128", + [62] = "KMO_TDEA_192", + [63] = "KMO_ENCRYPTED_DEA", + [64] = "KMO_ENCRYPTED_TDEA_128", + [65] = "KMO_ENCRYPTED_TDEA_192", + [66] = "KMO_AES_128", + [67] = "KMO_AES_192", + [68] = "KMO_AES_256", + [69] = "KMO_ENCRYPTED_AES_128", + [70] = "KMO_ENCRYPTED_AES_192", + [71] = "KMO_ENCRYPTED_AES_256", + [72] = "KIMD_SHA_1", + [73] = "KIMD_SHA_256", + [74] = "KIMD_SHA_512", + [75] = "KIMD_SHA3_224", + [76] = "KIMD_SHA3_256", + [77] = "KIMD_SHA3_384", + [78] = "KIMD_SHA3_512", + [79] = "KIMD_SHAKE_128", + [80] = "KIMD_SHAKE_256", + [81] = "KIMD_GHASH", + [82] = "KLMD_SHA_1", + [83] = "KLMD_SHA_256", + [84] = "KLMD_SHA_512", + [85] = "KLMD_SHA3_224", + [86] = "KLMD_SHA3_256", + [87] = "KLMD_SHA3_384", + [88] = "KLMD_SHA3_512", + [89] = "KLMD_SHAKE_128", + [90] = "KLMD_SHAKE_256", + [91] = "KMAC_DEA", + [92] = "KMAC_TDEA_128", + [93] = "KMAC_TDEA_192", + [94] = "KMAC_ENCRYPTED_DEA", + [95] = "KMAC_ENCRYPTED_TDEA_128", + [96] = "KMAC_ENCRYPTED_TDEA_192", + [97] = "KMAC_AES_128", + [98] = "KMAC_AES_192", + [99] = "KMAC_AES_256", + [100] = "KMAC_ENCRYPTED_AES_128", + [101] = "KMAC_ENCRYPTED_AES_192", + [102] = "KMAC_ENCRYPTED_AES_256", + [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA", + [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128", + [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192", + [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA", + [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128", + [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192", + [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128", + [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192", + [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256", + [112] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128", + [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192", + [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256", + [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128", + [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256", + [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128", + [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256", + [119] = "PCC_SCALAR_MULTIPLY_P256", + [120] = "PCC_SCALAR_MULTIPLY_P384", + [121] = "PCC_SCALAR_MULTIPLY_P521", + [122] = "PCC_SCALAR_MULTIPLY_ED25519", + [123] = "PCC_SCALAR_MULTIPLY_ED448", + [124] = "PCC_SCALAR_MULTIPLY_X25519", + [125] = "PCC_SCALAR_MULTIPLY_X448", + [126] = "PRNO_SHA_512_DRNG", + [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO", + [128] = "PRNO_TRNG", + [129] = "KDSA_ECDSA_VERIFY_P256", + [130] = "KDSA_ECDSA_VERIFY_P384", + [131] = "KDSA_ECDSA_VERIFY_P521", + [132] = "KDSA_ECDSA_SIGN_P256", + [133] = "KDSA_ECDSA_SIGN_P384", + [134] = "KDSA_ECDSA_SIGN_P521", + [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256", + [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384", + [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521", + [138] = "KDSA_EDDSA_VERIFY_ED25519", + [139] = "KDSA_EDDSA_VERIFY_ED448", + [140] = "KDSA_EDDSA_SIGN_ED25519", + [141] = "KDSA_EDDSA_SIGN_ED448", + [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519", + [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448", + [144] = "PCKMO_ENCRYPT_DEA_KEY", + [145] = "PCKMO_ENCRYPT_TDEA_128_KEY", + [146] = "PCKMO_ENCRYPT_TDEA_192_KEY", + [147] = "PCKMO_ENCRYPT_AES_128_KEY", + [148] = "PCKMO_ENCRYPT_AES_192_KEY", + [149] = "PCKMO_ENCRYPT_AES_256_KEY", + [150] = "PCKMO_ENCRYPT_ECC_P256_KEY", + [151] = "PCKMO_ENCRYPT_ECC_P384_KEY", + [152] = "PCKMO_ENCRYPT_ECC_P521_KEY", + [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY", + [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", + [155] = "IBM_RESERVED_155", + [156] = "IBM_RESERVED_156", + [157] = "KM_FULL_XTS_AES_128", + [158] = "KM_FULL_XTS_AES_256", + [159] = "KM_FULL_XTS_ENCRYPTED_AES_128", + [160] = "KM_FULL_XTS_ENCRYPTED_AES_256", + [161] = "KMAC_HMAC_SHA_224", + [162] = "KMAC_HMAC_SHA_256", + [163] = "KMAC_HMAC_SHA_384", + [164] = "KMAC_HMAC_SHA_512", + [165] = "KMAC_HMAC_ENCRYPTED_SHA_224", + [166] = "KMAC_HMAC_ENCRYPTED_SHA_256", + [167] = "KMAC_HMAC_ENCRYPTED_SHA_384", + [168] = "KMAC_HMAC_ENCRYPTED_SHA_512", + [169] = "PCKMO_ENCRYPT_HMAC_512_KEY", + [170] = "PCKMO_ENCRYPT_HMAC_1024_KEY", + [171] = "PCKMO_ENCRYPT_AES_XTS_128", + [172] = "PCKMO_ENCRYPT_AES_XTS_256", +}; + +static struct attribute *paiext_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paiext_events_group = { + .name = "events", + .attrs = NULL, /* Filled in attr_event_init() */ +}; + +static struct attribute_group paiext_format_group = { + .name = "format", + .attrs = paiext_format_attr, +}; + +static const struct attribute_group *paiext_attr_groups[] = { + &paiext_events_group, + &paiext_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paiext = { + .task_ctx_nr = perf_hw_context, + .event_init = paiext_event_init, + .add = paiext_add, + .del = paiext_del, + .start = paiext_start, + .stop = paiext_stop, + .read = paiext_read, + .sched_task = paiext_sched_task, + .attr_groups = paiext_attr_groups, +}; + +/* List of symbolic PAI extension 1 NNPA counter names. */ +static const char * const paiext_ctrnames[] = { + [0] = "NNPA_ALL", + [1] = "NNPA_ADD", + [2] = "NNPA_SUB", + [3] = "NNPA_MUL", + [4] = "NNPA_DIV", + [5] = "NNPA_MIN", + [6] = "NNPA_MAX", + [7] = "NNPA_LOG", + [8] = "NNPA_EXP", + [9] = "NNPA_IBM_RESERVED_9", + [10] = "NNPA_RELU", + [11] = "NNPA_TANH", + [12] = "NNPA_SIGMOID", + [13] = "NNPA_SOFTMAX", + [14] = "NNPA_BATCHNORM", + [15] = "NNPA_MAXPOOL2D", + [16] = "NNPA_AVGPOOL2D", + [17] = "NNPA_LSTMACT", + [18] = "NNPA_GRUACT", + [19] = "NNPA_CONVOLUTION", + [20] = "NNPA_MATMUL_OP", + [21] = "NNPA_MATMUL_OP_BCAST23", + [22] = "NNPA_SMALLBATCH", + [23] = "NNPA_LARGEDIM", + [24] = "NNPA_SMALLTENSOR", + [25] = "NNPA_1MFRAME", + [26] = "NNPA_2GFRAME", + [27] = "NNPA_ACCESSEXCEPT", + [28] = "NNPA_TRANSFORM", + [29] = "NNPA_GELU", + [30] = "NNPA_MOMENTS", + [31] = "NNPA_LAYERNORM", + [32] = "NNPA_MATMUL_OP_BCAST1", + [33] = "NNPA_SQRT", + [34] = "NNPA_INVSQRT", + [35] = "NNPA_NORM", + [36] = "NNPA_REDUCE", +}; + +static void __init attr_event_free(struct attribute **attrs) +{ + struct perf_pmu_events_attr *pa; + unsigned int i; + + for (i = 0; attrs[i]; i++) { + struct device_attribute *dap; + + dap = container_of(attrs[i], struct device_attribute, attr); + pa = container_of(dap, struct perf_pmu_events_attr, attr); + kfree(pa); + } + kfree(attrs); +} + +static struct attribute * __init attr_event_init_one(int num, + unsigned long base, + const char *name) +{ + struct perf_pmu_events_attr *pa; + + pa = kzalloc_obj(*pa); + if (!pa) + return NULL; + + sysfs_attr_init(&pa->attr.attr); + pa->id = base + num; + pa->attr.attr.name = name; + pa->attr.attr.mode = 0444; + pa->attr.show = cpumf_events_sysfs_show; + pa->attr.store = NULL; + return &pa->attr.attr; +} + +static struct attribute ** __init attr_event_init(struct pai_pmu *p) +{ + unsigned int min_attr = min_t(unsigned int, p->num_named, p->num_avail); + struct attribute **attrs; + unsigned int i; + + attrs = kmalloc_objs(*attrs, min_attr + 1, GFP_KERNEL | __GFP_ZERO); + if (!attrs) + goto out; + for (i = 0; i < min_attr; i++) { + attrs[i] = attr_event_init_one(i, p->base, p->names[i]); + if (!attrs[i]) { + attr_event_free(attrs); + attrs = NULL; + goto out; + } + } + attrs[i] = NULL; +out: + return attrs; +} + +static void __init pai_pmu_exit(struct pai_pmu *p) +{ + attr_event_free(p->event_group->attrs); + p->event_group->attrs = NULL; +} + +/* Add a PMU. Install its events and register the PMU device driver + * call back functions. + */ +static int __init pai_pmu_init(struct pai_pmu *p) +{ + int rc = -ENOMEM; + + + /* Export known PAI events */ + p->event_group->attrs = attr_event_init(p); + if (!p->event_group->attrs) { + pr_err("Creation of PMU %s /sysfs failed\n", p->pmuname); + goto out; + } + + rc = perf_pmu_register(p->pmu, p->pmuname, -1); + if (rc) { + pai_pmu_exit(p); + pr_err("Registering PMU %s failed with rc=%i\n", p->pmuname, + rc); + } +out: + return rc; +} + +/* PAI PMU characteristics table */ +static struct pai_pmu pai_pmu[] __refdata = { + [PAI_PMU_CRYPTO] = { + .pmuname = "pai_crypto", + .facility_nr = 196, + .num_named = ARRAY_SIZE(paicrypt_ctrnames), + .names = paicrypt_ctrnames, + .base = PAI_CRYPTO_BASE, + .kernel_offset = PAI_CRYPTO_KERNEL_OFFSET, + .area_size = PAGE_SIZE, + .init = pai_pmu_init, + .exit = pai_pmu_exit, + .pmu = &paicrypt, + .event_group = &paicrypt_events_group + }, + [PAI_PMU_EXT] = { + .pmuname = "pai_ext", + .facility_nr = 197, + .num_named = ARRAY_SIZE(paiext_ctrnames), + .names = paiext_ctrnames, + .base = PAI_NNPA_BASE, + .kernel_offset = 0, + .area_size = PAIE1_CTRBLOCK_SZ, + .init = pai_pmu_init, + .exit = pai_pmu_exit, + .pmu = &paiext, + .event_group = &paiext_events_group + } +}; + +/* + * Check if the PMU (via facility) is supported by machine. Try all of the + * supported PAI PMUs. + * Return number of successfully installed PMUs. + */ +static int __init paipmu_setup(void) +{ + struct qpaci_info_block ib; + int install_ok = 0, rc; + struct pai_pmu *p; + size_t i; + + for (i = 0; i < ARRAY_SIZE(pai_pmu); ++i) { + p = &pai_pmu[i]; + + if (!test_facility(p->facility_nr)) + continue; + + qpaci(&ib); + switch (i) { + case PAI_PMU_CRYPTO: + p->num_avail = ib.num_cc; + if (p->num_avail >= PAI_CRYPTO_MAXCTR) { + pr_err("Too many PMU %s counters %d\n", + p->pmuname, p->num_avail); + continue; + } + break; + case PAI_PMU_EXT: + p->num_avail = ib.num_nnpa; + break; + } + p->num_avail += 1; /* Add xxx_ALL event */ + if (p->init) { + rc = p->init(p); + if (!rc) + ++install_ok; + } + } + return install_ok; +} + +static int __init pai_init(void) +{ + /* Setup s390dbf facility */ + paidbg = debug_register("pai", 32, 256, 128); + if (!paidbg) { + pr_err("Registration of s390dbf pai failed\n"); + return -ENOMEM; + } + debug_register_view(paidbg, &debug_sprintf_view); + + if (!paipmu_setup()) { + /* No PMU registration, no need for debug buffer */ + debug_unregister_view(paidbg, &debug_sprintf_view); + debug_unregister(paidbg); + return -ENODEV; + } + return 0; +} + +device_initcall(pai_init); diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c deleted file mode 100644 index 10725f5a6f0f..000000000000 --- a/arch/s390/kernel/perf_pai_crypto.c +++ /dev/null @@ -1,861 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Performance event support - Processor Activity Instrumentation Facility - * - * Copyright IBM Corp. 2022 - * Author(s): Thomas Richter <tmricht@linux.ibm.com> - */ -#define KMSG_COMPONENT "pai_crypto" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/io.h> -#include <linux/perf_event.h> -#include <asm/ctlreg.h> -#include <asm/pai.h> -#include <asm/debug.h> - -static debug_info_t *cfm_dbg; -static unsigned int paicrypt_cnt; /* Size of the mapped counter sets */ - /* extracted with QPACI instruction */ - -DEFINE_STATIC_KEY_FALSE(pai_key); - -struct pai_userdata { - u16 num; - u64 value; -} __packed; - -struct paicrypt_map { - unsigned long *page; /* Page for CPU to store counters */ - struct pai_userdata *save; /* Page to store no-zero counters */ - unsigned int active_events; /* # of PAI crypto users */ - refcount_t refcnt; /* Reference count mapped buffers */ - struct perf_event *event; /* Perf event for sampling */ - struct list_head syswide_list; /* List system-wide sampling events */ -}; - -struct paicrypt_mapptr { - struct paicrypt_map *mapptr; -}; - -static struct paicrypt_root { /* Anchor to per CPU data */ - refcount_t refcnt; /* Overall active events */ - struct paicrypt_mapptr __percpu *mapptr; -} paicrypt_root; - -/* Free per CPU data when the last event is removed. */ -static void paicrypt_root_free(void) -{ - if (refcount_dec_and_test(&paicrypt_root.refcnt)) { - free_percpu(paicrypt_root.mapptr); - paicrypt_root.mapptr = NULL; - } - debug_sprintf_event(cfm_dbg, 5, "%s root.refcount %d\n", __func__, - refcount_read(&paicrypt_root.refcnt)); -} - -/* - * On initialization of first event also allocate per CPU data dynamically. - * Start with an array of pointers, the array size is the maximum number of - * CPUs possible, which might be larger than the number of CPUs currently - * online. - */ -static int paicrypt_root_alloc(void) -{ - if (!refcount_inc_not_zero(&paicrypt_root.refcnt)) { - /* The memory is already zeroed. */ - paicrypt_root.mapptr = alloc_percpu(struct paicrypt_mapptr); - if (!paicrypt_root.mapptr) - return -ENOMEM; - refcount_set(&paicrypt_root.refcnt, 1); - } - return 0; -} - -/* Release the PMU if event is the last perf event */ -static DEFINE_MUTEX(pai_reserve_mutex); - -/* Adjust usage counters and remove allocated memory when all users are - * gone. - */ -static void paicrypt_event_destroy_cpu(struct perf_event *event, int cpu) -{ - struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr, cpu); - struct paicrypt_map *cpump = mp->mapptr; - - mutex_lock(&pai_reserve_mutex); - debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d " - "refcnt %u\n", __func__, event->attr.config, - event->cpu, cpump->active_events, - refcount_read(&cpump->refcnt)); - if (refcount_dec_and_test(&cpump->refcnt)) { - debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n", - __func__, (unsigned long)cpump->page, - cpump->save); - free_page((unsigned long)cpump->page); - kvfree(cpump->save); - kfree(cpump); - mp->mapptr = NULL; - } - paicrypt_root_free(); - mutex_unlock(&pai_reserve_mutex); -} - -static void paicrypt_event_destroy(struct perf_event *event) -{ - int cpu; - - static_branch_dec(&pai_key); - free_page(PAI_SAVE_AREA(event)); - if (event->cpu == -1) { - struct cpumask *mask = PAI_CPU_MASK(event); - - for_each_cpu(cpu, mask) - paicrypt_event_destroy_cpu(event, cpu); - kfree(mask); - } else { - paicrypt_event_destroy_cpu(event, event->cpu); - } -} - -static u64 paicrypt_getctr(unsigned long *page, int nr, bool kernel) -{ - if (kernel) - nr += PAI_CRYPTO_MAXCTR; - return page[nr]; -} - -/* Read the counter values. Return value from location in CMP. For event - * CRYPTO_ALL sum up all events. - */ -static u64 paicrypt_getdata(struct perf_event *event, bool kernel) -{ - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - u64 sum = 0; - int i; - - if (event->attr.config != PAI_CRYPTO_BASE) { - return paicrypt_getctr(cpump->page, - event->attr.config - PAI_CRYPTO_BASE, - kernel); - } - - for (i = 1; i <= paicrypt_cnt; i++) { - u64 val = paicrypt_getctr(cpump->page, i, kernel); - - if (!val) - continue; - sum += val; - } - return sum; -} - -static u64 paicrypt_getall(struct perf_event *event) -{ - u64 sum = 0; - - if (!event->attr.exclude_kernel) - sum += paicrypt_getdata(event, true); - if (!event->attr.exclude_user) - sum += paicrypt_getdata(event, false); - - return sum; -} - -/* Check concurrent access of counting and sampling for crypto events. - * This function is called in process context and it is save to block. - * When the event initialization functions fails, no other call back will - * be invoked. - * - * Allocate the memory for the event. - */ -static struct paicrypt_map *paicrypt_busy(struct perf_event *event, int cpu) -{ - struct paicrypt_map *cpump = NULL; - struct paicrypt_mapptr *mp; - int rc; - - mutex_lock(&pai_reserve_mutex); - - /* Allocate root node */ - rc = paicrypt_root_alloc(); - if (rc) - goto unlock; - - /* Allocate node for this event */ - mp = per_cpu_ptr(paicrypt_root.mapptr, cpu); - cpump = mp->mapptr; - if (!cpump) { /* Paicrypt_map allocated? */ - cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); - if (!cpump) { - rc = -ENOMEM; - goto free_root; - } - INIT_LIST_HEAD(&cpump->syswide_list); - } - - /* Allocate memory for counter page and counter extraction. - * Only the first counting event has to allocate a page. - */ - if (cpump->page) { - refcount_inc(&cpump->refcnt); - goto unlock; - } - - rc = -ENOMEM; - cpump->page = (unsigned long *)get_zeroed_page(GFP_KERNEL); - if (!cpump->page) - goto free_paicrypt_map; - cpump->save = kvmalloc_array(paicrypt_cnt + 1, - sizeof(struct pai_userdata), GFP_KERNEL); - if (!cpump->save) { - free_page((unsigned long)cpump->page); - cpump->page = NULL; - goto free_paicrypt_map; - } - - /* Set mode and reference count */ - rc = 0; - refcount_set(&cpump->refcnt, 1); - mp->mapptr = cpump; - debug_sprintf_event(cfm_dbg, 5, "%s users %d refcnt %u page %#lx " - "save %p rc %d\n", __func__, cpump->active_events, - refcount_read(&cpump->refcnt), - (unsigned long)cpump->page, cpump->save, rc); - goto unlock; - -free_paicrypt_map: - /* Undo memory allocation */ - kfree(cpump); - mp->mapptr = NULL; -free_root: - paicrypt_root_free(); -unlock: - mutex_unlock(&pai_reserve_mutex); - return rc ? ERR_PTR(rc) : cpump; -} - -static int paicrypt_event_init_all(struct perf_event *event) -{ - struct paicrypt_map *cpump; - struct cpumask *maskptr; - int cpu, rc = -ENOMEM; - - maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL); - if (!maskptr) - goto out; - - for_each_online_cpu(cpu) { - cpump = paicrypt_busy(event, cpu); - if (IS_ERR(cpump)) { - for_each_cpu(cpu, maskptr) - paicrypt_event_destroy_cpu(event, cpu); - kfree(maskptr); - rc = PTR_ERR(cpump); - goto out; - } - cpumask_set_cpu(cpu, maskptr); - } - - /* - * On error all cpumask are freed and all events have been destroyed. - * Save of which CPUs data structures have been allocated for. - * Release them in paicrypt_event_destroy call back function - * for this event. - */ - PAI_CPU_MASK(event) = maskptr; - rc = 0; -out: - return rc; -} - -/* Might be called on different CPU than the one the event is intended for. */ -static int paicrypt_event_init(struct perf_event *event) -{ - struct perf_event_attr *a = &event->attr; - struct paicrypt_map *cpump; - int rc = 0; - - /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ - if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) - return -ENOENT; - /* PAI crypto event must be in valid range */ - if (a->config < PAI_CRYPTO_BASE || - a->config > PAI_CRYPTO_BASE + paicrypt_cnt) - return -EINVAL; - /* Allow only CRYPTO_ALL for sampling */ - if (a->sample_period && a->config != PAI_CRYPTO_BASE) - return -EINVAL; - /* Get a page to store last counter values for sampling */ - if (a->sample_period) { - PAI_SAVE_AREA(event) = get_zeroed_page(GFP_KERNEL); - if (!PAI_SAVE_AREA(event)) { - rc = -ENOMEM; - goto out; - } - } - - if (event->cpu >= 0) { - cpump = paicrypt_busy(event, event->cpu); - if (IS_ERR(cpump)) - rc = PTR_ERR(cpump); - } else { - rc = paicrypt_event_init_all(event); - } - if (rc) { - free_page(PAI_SAVE_AREA(event)); - goto out; - } - event->destroy = paicrypt_event_destroy; - - if (a->sample_period) { - a->sample_period = 1; - a->freq = 0; - /* Register for paicrypt_sched_task() to be called */ - event->attach_state |= PERF_ATTACH_SCHED_CB; - /* Add raw data which contain the memory mapped counters */ - a->sample_type |= PERF_SAMPLE_RAW; - /* Turn off inheritance */ - a->inherit = 0; - } - - static_branch_inc(&pai_key); -out: - return rc; -} - -static void paicrypt_read(struct perf_event *event) -{ - u64 prev, new, delta; - - prev = local64_read(&event->hw.prev_count); - new = paicrypt_getall(event); - local64_set(&event->hw.prev_count, new); - delta = (prev <= new) ? new - prev - : (-1ULL - prev) + new + 1; /* overflow */ - local64_add(delta, &event->count); -} - -static void paicrypt_start(struct perf_event *event, int flags) -{ - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - u64 sum; - - if (!event->attr.sample_period) { /* Counting */ - sum = paicrypt_getall(event); /* Get current value */ - local64_set(&event->hw.prev_count, sum); - } else { /* Sampling */ - memcpy((void *)PAI_SAVE_AREA(event), cpump->page, PAGE_SIZE); - /* Enable context switch callback for system-wide sampling */ - if (!(event->attach_state & PERF_ATTACH_TASK)) { - list_add_tail(PAI_SWLIST(event), &cpump->syswide_list); - perf_sched_cb_inc(event->pmu); - } else { - cpump->event = event; - } - } -} - -static int paicrypt_add(struct perf_event *event, int flags) -{ - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - unsigned long ccd; - - if (++cpump->active_events == 1) { - ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET; - WRITE_ONCE(get_lowcore()->ccd, ccd); - local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); - } - if (flags & PERF_EF_START) - paicrypt_start(event, PERF_EF_RELOAD); - event->hw.state = 0; - return 0; -} - -static void paicrypt_have_sample(struct perf_event *, struct paicrypt_map *); -static void paicrypt_stop(struct perf_event *event, int flags) -{ - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - - if (!event->attr.sample_period) { /* Counting */ - paicrypt_read(event); - } else { /* Sampling */ - if (!(event->attach_state & PERF_ATTACH_TASK)) { - perf_sched_cb_dec(event->pmu); - list_del(PAI_SWLIST(event)); - } else { - paicrypt_have_sample(event, cpump); - cpump->event = NULL; - } - } - event->hw.state = PERF_HES_STOPPED; -} - -static void paicrypt_del(struct perf_event *event, int flags) -{ - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - - paicrypt_stop(event, PERF_EF_UPDATE); - if (--cpump->active_events == 0) { - local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); - WRITE_ONCE(get_lowcore()->ccd, 0); - } -} - -/* Create raw data and save it in buffer. Calculate the delta for each - * counter between this invocation and the last invocation. - * Returns number of bytes copied. - * Saves only entries with positive counter difference of the form - * 2 bytes: Number of counter - * 8 bytes: Value of counter - */ -static size_t paicrypt_copy(struct pai_userdata *userdata, unsigned long *page, - unsigned long *page_old, bool exclude_user, - bool exclude_kernel) -{ - int i, outidx = 0; - - for (i = 1; i <= paicrypt_cnt; i++) { - u64 val = 0, val_old = 0; - - if (!exclude_kernel) { - val += paicrypt_getctr(page, i, true); - val_old += paicrypt_getctr(page_old, i, true); - } - if (!exclude_user) { - val += paicrypt_getctr(page, i, false); - val_old += paicrypt_getctr(page_old, i, false); - } - if (val >= val_old) - val -= val_old; - else - val = (~0ULL - val_old) + val + 1; - if (val) { - userdata[outidx].num = i; - userdata[outidx].value = val; - outidx++; - } - } - return outidx * sizeof(struct pai_userdata); -} - -static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump, - struct perf_event *event) -{ - struct perf_sample_data data; - struct perf_raw_record raw; - struct pt_regs regs; - int overflow; - - /* Setup perf sample */ - memset(®s, 0, sizeof(regs)); - memset(&raw, 0, sizeof(raw)); - memset(&data, 0, sizeof(data)); - perf_sample_data_init(&data, 0, event->hw.last_period); - if (event->attr.sample_type & PERF_SAMPLE_TID) { - data.tid_entry.pid = task_tgid_nr(current); - data.tid_entry.tid = task_pid_nr(current); - } - if (event->attr.sample_type & PERF_SAMPLE_TIME) - data.time = event->clock(); - if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) - data.id = event->id; - if (event->attr.sample_type & PERF_SAMPLE_CPU) { - data.cpu_entry.cpu = smp_processor_id(); - data.cpu_entry.reserved = 0; - } - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.frag.size = rawsize; - raw.frag.data = cpump->save; - perf_sample_save_raw_data(&data, event, &raw); - } - - overflow = perf_event_overflow(event, &data, ®s); - perf_event_update_userpage(event); - /* Save crypto counter lowcore page after reading event data. */ - memcpy((void *)PAI_SAVE_AREA(event), cpump->page, PAGE_SIZE); - return overflow; -} - -/* Check if there is data to be saved on schedule out of a task. */ -static void paicrypt_have_sample(struct perf_event *event, - struct paicrypt_map *cpump) -{ - size_t rawsize; - - if (!event) /* No event active */ - return; - rawsize = paicrypt_copy(cpump->save, cpump->page, - (unsigned long *)PAI_SAVE_AREA(event), - event->attr.exclude_user, - event->attr.exclude_kernel); - if (rawsize) /* No incremented counters */ - paicrypt_push_sample(rawsize, cpump, event); -} - -/* Check if there is data to be saved on schedule out of a task. */ -static void paicrypt_have_samples(void) -{ - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - struct perf_event *event; - - list_for_each_entry(event, &cpump->syswide_list, hw.tp_list) - paicrypt_have_sample(event, cpump); -} - -/* Called on schedule-in and schedule-out. No access to event structure, - * but for sampling only event CRYPTO_ALL is allowed. - */ -static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) -{ - /* We started with a clean page on event installation. So read out - * results on schedule_out and if page was dirty, save old values. - */ - if (!sched_in) - paicrypt_have_samples(); -} - -/* Attribute definitions for paicrypt interface. As with other CPU - * Measurement Facilities, there is one attribute per mapped counter. - * The number of mapped counters may vary per machine generation. Use - * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction - * to determine the number of mapped counters. The instructions returns - * a positive number, which is the highest number of supported counters. - * All counters less than this number are also supported, there are no - * holes. A returned number of zero means no support for mapped counters. - * - * The identification of the counter is a unique number. The chosen range - * is 0x1000 + offset in mapped kernel page. - * All CPU Measurement Facility counters identifiers must be unique and - * the numbers from 0 to 496 are already used for the CPU Measurement - * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already - * used for the CPU Measurement Sampling facility. - */ -PMU_FORMAT_ATTR(event, "config:0-63"); - -static struct attribute *paicrypt_format_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group paicrypt_events_group = { - .name = "events", - .attrs = NULL /* Filled in attr_event_init() */ -}; - -static struct attribute_group paicrypt_format_group = { - .name = "format", - .attrs = paicrypt_format_attr, -}; - -static const struct attribute_group *paicrypt_attr_groups[] = { - &paicrypt_events_group, - &paicrypt_format_group, - NULL, -}; - -/* Performance monitoring unit for mapped counters */ -static struct pmu paicrypt = { - .task_ctx_nr = perf_hw_context, - .event_init = paicrypt_event_init, - .add = paicrypt_add, - .del = paicrypt_del, - .start = paicrypt_start, - .stop = paicrypt_stop, - .read = paicrypt_read, - .sched_task = paicrypt_sched_task, - .attr_groups = paicrypt_attr_groups -}; - -/* List of symbolic PAI counter names. */ -static const char * const paicrypt_ctrnames[] = { - [0] = "CRYPTO_ALL", - [1] = "KM_DEA", - [2] = "KM_TDEA_128", - [3] = "KM_TDEA_192", - [4] = "KM_ENCRYPTED_DEA", - [5] = "KM_ENCRYPTED_TDEA_128", - [6] = "KM_ENCRYPTED_TDEA_192", - [7] = "KM_AES_128", - [8] = "KM_AES_192", - [9] = "KM_AES_256", - [10] = "KM_ENCRYPTED_AES_128", - [11] = "KM_ENCRYPTED_AES_192", - [12] = "KM_ENCRYPTED_AES_256", - [13] = "KM_XTS_AES_128", - [14] = "KM_XTS_AES_256", - [15] = "KM_XTS_ENCRYPTED_AES_128", - [16] = "KM_XTS_ENCRYPTED_AES_256", - [17] = "KMC_DEA", - [18] = "KMC_TDEA_128", - [19] = "KMC_TDEA_192", - [20] = "KMC_ENCRYPTED_DEA", - [21] = "KMC_ENCRYPTED_TDEA_128", - [22] = "KMC_ENCRYPTED_TDEA_192", - [23] = "KMC_AES_128", - [24] = "KMC_AES_192", - [25] = "KMC_AES_256", - [26] = "KMC_ENCRYPTED_AES_128", - [27] = "KMC_ENCRYPTED_AES_192", - [28] = "KMC_ENCRYPTED_AES_256", - [29] = "KMC_PRNG", - [30] = "KMA_GCM_AES_128", - [31] = "KMA_GCM_AES_192", - [32] = "KMA_GCM_AES_256", - [33] = "KMA_GCM_ENCRYPTED_AES_128", - [34] = "KMA_GCM_ENCRYPTED_AES_192", - [35] = "KMA_GCM_ENCRYPTED_AES_256", - [36] = "KMF_DEA", - [37] = "KMF_TDEA_128", - [38] = "KMF_TDEA_192", - [39] = "KMF_ENCRYPTED_DEA", - [40] = "KMF_ENCRYPTED_TDEA_128", - [41] = "KMF_ENCRYPTED_TDEA_192", - [42] = "KMF_AES_128", - [43] = "KMF_AES_192", - [44] = "KMF_AES_256", - [45] = "KMF_ENCRYPTED_AES_128", - [46] = "KMF_ENCRYPTED_AES_192", - [47] = "KMF_ENCRYPTED_AES_256", - [48] = "KMCTR_DEA", - [49] = "KMCTR_TDEA_128", - [50] = "KMCTR_TDEA_192", - [51] = "KMCTR_ENCRYPTED_DEA", - [52] = "KMCTR_ENCRYPTED_TDEA_128", - [53] = "KMCTR_ENCRYPTED_TDEA_192", - [54] = "KMCTR_AES_128", - [55] = "KMCTR_AES_192", - [56] = "KMCTR_AES_256", - [57] = "KMCTR_ENCRYPTED_AES_128", - [58] = "KMCTR_ENCRYPTED_AES_192", - [59] = "KMCTR_ENCRYPTED_AES_256", - [60] = "KMO_DEA", - [61] = "KMO_TDEA_128", - [62] = "KMO_TDEA_192", - [63] = "KMO_ENCRYPTED_DEA", - [64] = "KMO_ENCRYPTED_TDEA_128", - [65] = "KMO_ENCRYPTED_TDEA_192", - [66] = "KMO_AES_128", - [67] = "KMO_AES_192", - [68] = "KMO_AES_256", - [69] = "KMO_ENCRYPTED_AES_128", - [70] = "KMO_ENCRYPTED_AES_192", - [71] = "KMO_ENCRYPTED_AES_256", - [72] = "KIMD_SHA_1", - [73] = "KIMD_SHA_256", - [74] = "KIMD_SHA_512", - [75] = "KIMD_SHA3_224", - [76] = "KIMD_SHA3_256", - [77] = "KIMD_SHA3_384", - [78] = "KIMD_SHA3_512", - [79] = "KIMD_SHAKE_128", - [80] = "KIMD_SHAKE_256", - [81] = "KIMD_GHASH", - [82] = "KLMD_SHA_1", - [83] = "KLMD_SHA_256", - [84] = "KLMD_SHA_512", - [85] = "KLMD_SHA3_224", - [86] = "KLMD_SHA3_256", - [87] = "KLMD_SHA3_384", - [88] = "KLMD_SHA3_512", - [89] = "KLMD_SHAKE_128", - [90] = "KLMD_SHAKE_256", - [91] = "KMAC_DEA", - [92] = "KMAC_TDEA_128", - [93] = "KMAC_TDEA_192", - [94] = "KMAC_ENCRYPTED_DEA", - [95] = "KMAC_ENCRYPTED_TDEA_128", - [96] = "KMAC_ENCRYPTED_TDEA_192", - [97] = "KMAC_AES_128", - [98] = "KMAC_AES_192", - [99] = "KMAC_AES_256", - [100] = "KMAC_ENCRYPTED_AES_128", - [101] = "KMAC_ENCRYPTED_AES_192", - [102] = "KMAC_ENCRYPTED_AES_256", - [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA", - [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128", - [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192", - [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA", - [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128", - [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192", - [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128", - [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192", - [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256", - [112] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128", - [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192", - [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256A", - [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128", - [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256", - [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128", - [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256", - [119] = "PCC_SCALAR_MULTIPLY_P256", - [120] = "PCC_SCALAR_MULTIPLY_P384", - [121] = "PCC_SCALAR_MULTIPLY_P521", - [122] = "PCC_SCALAR_MULTIPLY_ED25519", - [123] = "PCC_SCALAR_MULTIPLY_ED448", - [124] = "PCC_SCALAR_MULTIPLY_X25519", - [125] = "PCC_SCALAR_MULTIPLY_X448", - [126] = "PRNO_SHA_512_DRNG", - [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO", - [128] = "PRNO_TRNG", - [129] = "KDSA_ECDSA_VERIFY_P256", - [130] = "KDSA_ECDSA_VERIFY_P384", - [131] = "KDSA_ECDSA_VERIFY_P521", - [132] = "KDSA_ECDSA_SIGN_P256", - [133] = "KDSA_ECDSA_SIGN_P384", - [134] = "KDSA_ECDSA_SIGN_P521", - [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256", - [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384", - [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521", - [138] = "KDSA_EDDSA_VERIFY_ED25519", - [139] = "KDSA_EDDSA_VERIFY_ED448", - [140] = "KDSA_EDDSA_SIGN_ED25519", - [141] = "KDSA_EDDSA_SIGN_ED448", - [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519", - [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448", - [144] = "PCKMO_ENCRYPT_DEA_KEY", - [145] = "PCKMO_ENCRYPT_TDEA_128_KEY", - [146] = "PCKMO_ENCRYPT_TDEA_192_KEY", - [147] = "PCKMO_ENCRYPT_AES_128_KEY", - [148] = "PCKMO_ENCRYPT_AES_192_KEY", - [149] = "PCKMO_ENCRYPT_AES_256_KEY", - [150] = "PCKMO_ENCRYPT_ECC_P256_KEY", - [151] = "PCKMO_ENCRYPT_ECC_P384_KEY", - [152] = "PCKMO_ENCRYPT_ECC_P521_KEY", - [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY", - [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", - [155] = "IBM_RESERVED_155", - [156] = "IBM_RESERVED_156", - [157] = "KM_FULL_XTS_AES_128", - [158] = "KM_FULL_XTS_AES_256", - [159] = "KM_FULL_XTS_ENCRYPTED_AES_128", - [160] = "KM_FULL_XTS_ENCRYPTED_AES_256", - [161] = "KMAC_HMAC_SHA_224", - [162] = "KMAC_HMAC_SHA_256", - [163] = "KMAC_HMAC_SHA_384", - [164] = "KMAC_HMAC_SHA_512", - [165] = "KMAC_HMAC_ENCRYPTED_SHA_224", - [166] = "KMAC_HMAC_ENCRYPTED_SHA_256", - [167] = "KMAC_HMAC_ENCRYPTED_SHA_384", - [168] = "KMAC_HMAC_ENCRYPTED_SHA_512", - [169] = "PCKMO_ENCRYPT_HMAC_512_KEY", - [170] = "PCKMO_ENCRYPT_HMAC_1024_KEY", - [171] = "PCKMO_ENCRYPT_AES_XTS_128", - [172] = "PCKMO_ENCRYPT_AES_XTS_256", -}; - -static void __init attr_event_free(struct attribute **attrs, int num) -{ - struct perf_pmu_events_attr *pa; - int i; - - for (i = 0; i < num; i++) { - struct device_attribute *dap; - - dap = container_of(attrs[i], struct device_attribute, attr); - pa = container_of(dap, struct perf_pmu_events_attr, attr); - kfree(pa); - } - kfree(attrs); -} - -static int __init attr_event_init_one(struct attribute **attrs, int num) -{ - struct perf_pmu_events_attr *pa; - - /* Index larger than array_size, no counter name available */ - if (num >= ARRAY_SIZE(paicrypt_ctrnames)) { - attrs[num] = NULL; - return 0; - } - - pa = kzalloc(sizeof(*pa), GFP_KERNEL); - if (!pa) - return -ENOMEM; - - sysfs_attr_init(&pa->attr.attr); - pa->id = PAI_CRYPTO_BASE + num; - pa->attr.attr.name = paicrypt_ctrnames[num]; - pa->attr.attr.mode = 0444; - pa->attr.show = cpumf_events_sysfs_show; - pa->attr.store = NULL; - attrs[num] = &pa->attr.attr; - return 0; -} - -/* Create PMU sysfs event attributes on the fly. */ -static int __init attr_event_init(void) -{ - struct attribute **attrs; - int ret, i; - - attrs = kmalloc_array(paicrypt_cnt + 2, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return -ENOMEM; - for (i = 0; i <= paicrypt_cnt; i++) { - ret = attr_event_init_one(attrs, i); - if (ret) { - attr_event_free(attrs, i); - return ret; - } - } - attrs[i] = NULL; - paicrypt_events_group.attrs = attrs; - return 0; -} - -static int __init paicrypt_init(void) -{ - struct qpaci_info_block ib; - int rc; - - if (!test_facility(196)) - return 0; - - qpaci(&ib); - paicrypt_cnt = ib.num_cc; - if (paicrypt_cnt == 0) - return 0; - if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR) { - pr_err("Too many PMU pai_crypto counters %d\n", paicrypt_cnt); - return -E2BIG; - } - - rc = attr_event_init(); /* Export known PAI crypto events */ - if (rc) { - pr_err("Creation of PMU pai_crypto /sysfs failed\n"); - return rc; - } - - /* Setup s390dbf facility */ - cfm_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); - if (!cfm_dbg) { - pr_err("Registration of s390dbf pai_crypto failed\n"); - return -ENOMEM; - } - debug_register_view(cfm_dbg, &debug_sprintf_view); - - rc = perf_pmu_register(&paicrypt, "pai_crypto", -1); - if (rc) { - pr_err("Registering the pai_crypto PMU failed with rc=%i\n", - rc); - debug_unregister_view(cfm_dbg, &debug_sprintf_view); - debug_unregister(cfm_dbg); - return rc; - } - return 0; -} - -device_initcall(paicrypt_init); diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c deleted file mode 100644 index a8f0bad99cf0..000000000000 --- a/arch/s390/kernel/perf_pai_ext.c +++ /dev/null @@ -1,756 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Performance event support - Processor Activity Instrumentation Extension - * Facility - * - * Copyright IBM Corp. 2022 - * Author(s): Thomas Richter <tmricht@linux.ibm.com> - */ -#define KMSG_COMPONENT "pai_ext" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/io.h> -#include <linux/perf_event.h> -#include <asm/ctlreg.h> -#include <asm/pai.h> -#include <asm/debug.h> - -#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */ -#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */ - -static debug_info_t *paiext_dbg; -static unsigned int paiext_cnt; /* Extracted with QPACI instruction */ - -struct pai_userdata { - u16 num; - u64 value; -} __packed; - -/* Create the PAI extension 1 control block area. - * The PAI extension control block 1 is pointed to by lowcore - * address 0x1508 for each CPU. This control block is 512 bytes in size - * and requires a 512 byte boundary alignment. - */ -struct paiext_cb { /* PAI extension 1 control block */ - u64 header; /* Not used */ - u64 reserved1; - u64 acc; /* Addr to analytics counter control block */ - u8 reserved2[488]; -} __packed; - -struct paiext_map { - unsigned long *area; /* Area for CPU to store counters */ - struct pai_userdata *save; /* Area to store non-zero counters */ - unsigned int active_events; /* # of PAI Extension users */ - refcount_t refcnt; - struct perf_event *event; /* Perf event for sampling */ - struct paiext_cb *paiext_cb; /* PAI extension control block area */ - struct list_head syswide_list; /* List system-wide sampling events */ -}; - -struct paiext_mapptr { - struct paiext_map *mapptr; -}; - -static struct paiext_root { /* Anchor to per CPU data */ - refcount_t refcnt; /* Overall active events */ - struct paiext_mapptr __percpu *mapptr; -} paiext_root; - -/* Free per CPU data when the last event is removed. */ -static void paiext_root_free(void) -{ - if (refcount_dec_and_test(&paiext_root.refcnt)) { - free_percpu(paiext_root.mapptr); - paiext_root.mapptr = NULL; - } - debug_sprintf_event(paiext_dbg, 5, "%s root.refcount %d\n", __func__, - refcount_read(&paiext_root.refcnt)); -} - -/* On initialization of first event also allocate per CPU data dynamically. - * Start with an array of pointers, the array size is the maximum number of - * CPUs possible, which might be larger than the number of CPUs currently - * online. - */ -static int paiext_root_alloc(void) -{ - if (!refcount_inc_not_zero(&paiext_root.refcnt)) { - /* The memory is already zeroed. */ - paiext_root.mapptr = alloc_percpu(struct paiext_mapptr); - if (!paiext_root.mapptr) { - /* Returning without refcnt adjustment is ok. The - * error code is handled by paiext_alloc() which - * decrements refcnt when an event can not be - * created. - */ - return -ENOMEM; - } - refcount_set(&paiext_root.refcnt, 1); - } - return 0; -} - -/* Protects against concurrent increment of sampler and counter member - * increments at the same time and prohibits concurrent execution of - * counting and sampling events. - * Ensures that analytics counter block is deallocated only when the - * sampling and counting on that cpu is zero. - * For details see paiext_alloc(). - */ -static DEFINE_MUTEX(paiext_reserve_mutex); - -/* Free all memory allocated for event counting/sampling setup */ -static void paiext_free(struct paiext_mapptr *mp) -{ - kfree(mp->mapptr->area); - kfree(mp->mapptr->paiext_cb); - kvfree(mp->mapptr->save); - kfree(mp->mapptr); - mp->mapptr = NULL; -} - -/* Release the PMU if event is the last perf event */ -static void paiext_event_destroy_cpu(struct perf_event *event, int cpu) -{ - struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, cpu); - struct paiext_map *cpump = mp->mapptr; - - mutex_lock(&paiext_reserve_mutex); - if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */ - paiext_free(mp); - paiext_root_free(); - mutex_unlock(&paiext_reserve_mutex); -} - -static void paiext_event_destroy(struct perf_event *event) -{ - int cpu; - - free_page(PAI_SAVE_AREA(event)); - if (event->cpu == -1) { - struct cpumask *mask = PAI_CPU_MASK(event); - - for_each_cpu(cpu, mask) - paiext_event_destroy_cpu(event, cpu); - kfree(mask); - } else { - paiext_event_destroy_cpu(event, event->cpu); - } - debug_sprintf_event(paiext_dbg, 4, "%s cpu %d\n", __func__, - event->cpu); -} - -/* Used to avoid races in checking concurrent access of counting and - * sampling for pai_extension events. - * - * Only one instance of event pai_ext/NNPA_ALL/ for sampling is - * allowed and when this event is running, no counting event is allowed. - * Several counting events are allowed in parallel, but no sampling event - * is allowed while one (or more) counting events are running. - * - * This function is called in process context and it is safe to block. - * When the event initialization functions fails, no other call back will - * be invoked. - * - * Allocate the memory for the event. - */ -static int paiext_alloc_cpu(struct perf_event *event, int cpu) -{ - struct paiext_mapptr *mp; - struct paiext_map *cpump; - int rc; - - mutex_lock(&paiext_reserve_mutex); - rc = paiext_root_alloc(); - if (rc) - goto unlock; - - mp = per_cpu_ptr(paiext_root.mapptr, cpu); - cpump = mp->mapptr; - if (!cpump) { /* Paiext_map allocated? */ - rc = -ENOMEM; - cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); - if (!cpump) - goto undo; - - /* Allocate memory for counter area and counter extraction. - * These are - * - a 512 byte block and requires 512 byte boundary alignment. - * - a 1KB byte block and requires 1KB boundary alignment. - * Only the first counting event has to allocate the area. - * - * Note: This works with commit 59bb47985c1d by default. - * Backporting this to kernels without this commit might - * need adjustment. - */ - mp->mapptr = cpump; - cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL); - cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL); - cpump->save = kvmalloc_array(paiext_cnt + 1, - sizeof(struct pai_userdata), - GFP_KERNEL); - if (!cpump->save || !cpump->area || !cpump->paiext_cb) { - paiext_free(mp); - goto undo; - } - INIT_LIST_HEAD(&cpump->syswide_list); - refcount_set(&cpump->refcnt, 1); - rc = 0; - } else { - refcount_inc(&cpump->refcnt); - } - -undo: - if (rc) { - /* Error in allocation of event, decrement anchor. Since - * the event in not created, its destroy() function is never - * invoked. Adjust the reference counter for the anchor. - */ - paiext_root_free(); - } -unlock: - mutex_unlock(&paiext_reserve_mutex); - /* If rc is non-zero, no increment of counter/sampler was done. */ - return rc; -} - -static int paiext_alloc(struct perf_event *event) -{ - struct cpumask *maskptr; - int cpu, rc = -ENOMEM; - - maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL); - if (!maskptr) - goto out; - - for_each_online_cpu(cpu) { - rc = paiext_alloc_cpu(event, cpu); - if (rc) { - for_each_cpu(cpu, maskptr) - paiext_event_destroy_cpu(event, cpu); - kfree(maskptr); - goto out; - } - cpumask_set_cpu(cpu, maskptr); - } - - /* - * On error all cpumask are freed and all events have been destroyed. - * Save of which CPUs data structures have been allocated for. - * Release them in paicrypt_event_destroy call back function - * for this event. - */ - PAI_CPU_MASK(event) = maskptr; - rc = 0; -out: - return rc; -} - -/* The PAI extension 1 control block supports up to 128 entries. Return - * the index within PAIE1_CB given the event number. Also validate event - * number. - */ -static int paiext_event_valid(struct perf_event *event) -{ - u64 cfg = event->attr.config; - - if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) { - /* Offset NNPA in paiext_cb */ - event->hw.config_base = offsetof(struct paiext_cb, acc); - return 0; - } - return -EINVAL; -} - -/* Might be called on different CPU than the one the event is intended for. */ -static int paiext_event_init(struct perf_event *event) -{ - struct perf_event_attr *a = &event->attr; - int rc; - - /* PMU pai_ext registered as PERF_TYPE_RAW, check event type */ - if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) - return -ENOENT; - /* PAI extension event must be valid and in supported range */ - rc = paiext_event_valid(event); - if (rc) - return rc; - /* Allow only event NNPA_ALL for sampling. */ - if (a->sample_period && a->config != PAI_NNPA_BASE) - return -EINVAL; - /* Prohibit exclude_user event selection */ - if (a->exclude_user) - return -EINVAL; - /* Get a page to store last counter values for sampling */ - if (a->sample_period) { - PAI_SAVE_AREA(event) = get_zeroed_page(GFP_KERNEL); - if (!PAI_SAVE_AREA(event)) - return -ENOMEM; - } - - if (event->cpu >= 0) - rc = paiext_alloc_cpu(event, event->cpu); - else - rc = paiext_alloc(event); - if (rc) { - free_page(PAI_SAVE_AREA(event)); - return rc; - } - event->destroy = paiext_event_destroy; - - if (a->sample_period) { - a->sample_period = 1; - a->freq = 0; - /* Register for paicrypt_sched_task() to be called */ - event->attach_state |= PERF_ATTACH_SCHED_CB; - /* Add raw data which are the memory mapped counters */ - a->sample_type |= PERF_SAMPLE_RAW; - /* Turn off inheritance */ - a->inherit = 0; - } - - return 0; -} - -static u64 paiext_getctr(unsigned long *area, int nr) -{ - return area[nr]; -} - -/* Read the counter values. Return value from location in buffer. For event - * NNPA_ALL sum up all events. - */ -static u64 paiext_getdata(struct perf_event *event) -{ - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - u64 sum = 0; - int i; - - if (event->attr.config != PAI_NNPA_BASE) - return paiext_getctr(cpump->area, - event->attr.config - PAI_NNPA_BASE); - - for (i = 1; i <= paiext_cnt; i++) - sum += paiext_getctr(cpump->area, i); - - return sum; -} - -static u64 paiext_getall(struct perf_event *event) -{ - return paiext_getdata(event); -} - -static void paiext_read(struct perf_event *event) -{ - u64 prev, new, delta; - - prev = local64_read(&event->hw.prev_count); - new = paiext_getall(event); - local64_set(&event->hw.prev_count, new); - delta = new - prev; - local64_add(delta, &event->count); -} - -static void paiext_start(struct perf_event *event, int flags) -{ - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - u64 sum; - - if (!event->attr.sample_period) { /* Counting */ - sum = paiext_getall(event); /* Get current value */ - local64_set(&event->hw.prev_count, sum); - } else { /* Sampling */ - memcpy((void *)PAI_SAVE_AREA(event), cpump->area, - PAIE1_CTRBLOCK_SZ); - /* Enable context switch callback for system-wide sampling */ - if (!(event->attach_state & PERF_ATTACH_TASK)) { - list_add_tail(PAI_SWLIST(event), &cpump->syswide_list); - perf_sched_cb_inc(event->pmu); - } else { - cpump->event = event; - } - } -} - -static int paiext_add(struct perf_event *event, int flags) -{ - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - struct paiext_cb *pcb = cpump->paiext_cb; - - if (++cpump->active_events == 1) { - get_lowcore()->aicd = virt_to_phys(cpump->paiext_cb); - pcb->acc = virt_to_phys(cpump->area) | 0x1; - /* Enable CPU instruction lookup for PAIE1 control block */ - local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT); - } - if (flags & PERF_EF_START) - paiext_start(event, PERF_EF_RELOAD); - event->hw.state = 0; - return 0; -} - -static void paiext_have_sample(struct perf_event *, struct paiext_map *); -static void paiext_stop(struct perf_event *event, int flags) -{ - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - - if (!event->attr.sample_period) { /* Counting */ - paiext_read(event); - } else { /* Sampling */ - if (!(event->attach_state & PERF_ATTACH_TASK)) { - list_del(PAI_SWLIST(event)); - perf_sched_cb_dec(event->pmu); - } else { - paiext_have_sample(event, cpump); - cpump->event = NULL; - } - } - event->hw.state = PERF_HES_STOPPED; -} - -static void paiext_del(struct perf_event *event, int flags) -{ - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - struct paiext_cb *pcb = cpump->paiext_cb; - - paiext_stop(event, PERF_EF_UPDATE); - if (--cpump->active_events == 0) { - /* Disable CPU instruction lookup for PAIE1 control block */ - local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT); - pcb->acc = 0; - get_lowcore()->aicd = 0; - } -} - -/* Create raw data and save it in buffer. Returns number of bytes copied. - * Saves only positive counter entries of the form - * 2 bytes: Number of counter - * 8 bytes: Value of counter - */ -static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area, - unsigned long *area_old) -{ - int i, outidx = 0; - - for (i = 1; i <= paiext_cnt; i++) { - u64 val = paiext_getctr(area, i); - u64 val_old = paiext_getctr(area_old, i); - - if (val >= val_old) - val -= val_old; - else - val = (~0ULL - val_old) + val + 1; - if (val) { - userdata[outidx].num = i; - userdata[outidx].value = val; - outidx++; - } - } - return outidx * sizeof(*userdata); -} - -/* Write sample when one or more counters values are nonzero. - * - * Note: The function paiext_sched_task() and paiext_push_sample() are not - * invoked after function paiext_del() has been called because of function - * perf_sched_cb_dec(). - * The function paiext_sched_task() and paiext_push_sample() are only - * called when sampling is active. Function perf_sched_cb_inc() - * has been invoked to install function paiext_sched_task() as call back - * to run at context switch time (see paiext_add()). - * - * This causes function perf_event_context_sched_out() and - * perf_event_context_sched_in() to check whether the PMU has installed an - * sched_task() callback. That callback is not active after paiext_del() - * returns and has deleted the event on that CPU. - */ -static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump, - struct perf_event *event) -{ - struct perf_sample_data data; - struct perf_raw_record raw; - struct pt_regs regs; - int overflow; - - /* Setup perf sample */ - memset(®s, 0, sizeof(regs)); - memset(&raw, 0, sizeof(raw)); - memset(&data, 0, sizeof(data)); - perf_sample_data_init(&data, 0, event->hw.last_period); - if (event->attr.sample_type & PERF_SAMPLE_TID) { - data.tid_entry.pid = task_tgid_nr(current); - data.tid_entry.tid = task_pid_nr(current); - } - if (event->attr.sample_type & PERF_SAMPLE_TIME) - data.time = event->clock(); - if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) - data.id = event->id; - if (event->attr.sample_type & PERF_SAMPLE_CPU) - data.cpu_entry.cpu = smp_processor_id(); - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.frag.size = rawsize; - raw.frag.data = cpump->save; - perf_sample_save_raw_data(&data, event, &raw); - } - - overflow = perf_event_overflow(event, &data, ®s); - perf_event_update_userpage(event); - /* Save NNPA lowcore area after read in event */ - memcpy((void *)PAI_SAVE_AREA(event), cpump->area, - PAIE1_CTRBLOCK_SZ); - return overflow; -} - -/* Check if there is data to be saved on schedule out of a task. */ -static void paiext_have_sample(struct perf_event *event, - struct paiext_map *cpump) -{ - size_t rawsize; - - if (!event) - return; - rawsize = paiext_copy(cpump->save, cpump->area, - (unsigned long *)PAI_SAVE_AREA(event)); - if (rawsize) /* Incremented counters */ - paiext_push_sample(rawsize, cpump, event); -} - -/* Check if there is data to be saved on schedule out of a task. */ -static void paiext_have_samples(void) -{ - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - struct perf_event *event; - - list_for_each_entry(event, &cpump->syswide_list, hw.tp_list) - paiext_have_sample(event, cpump); -} - -/* Called on schedule-in and schedule-out. No access to event structure, - * but for sampling only event NNPA_ALL is allowed. - */ -static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) -{ - /* We started with a clean page on event installation. So read out - * results on schedule_out and if page was dirty, save old values. - */ - if (!sched_in) - paiext_have_samples(); -} - -/* Attribute definitions for pai extension1 interface. As with other CPU - * Measurement Facilities, there is one attribute per mapped counter. - * The number of mapped counters may vary per machine generation. Use - * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction - * to determine the number of mapped counters. The instructions returns - * a positive number, which is the highest number of supported counters. - * All counters less than this number are also supported, there are no - * holes. A returned number of zero means no support for mapped counters. - * - * The identification of the counter is a unique number. The chosen range - * is 0x1800 + offset in mapped kernel page. - * All CPU Measurement Facility counters identifiers must be unique and - * the numbers from 0 to 496 are already used for the CPU Measurement - * Counter facility. Number 0x1000 to 0x103e are used for PAI cryptography - * counters. - * Numbers 0xb0000, 0xbc000 and 0xbd000 are already - * used for the CPU Measurement Sampling facility. - */ -PMU_FORMAT_ATTR(event, "config:0-63"); - -static struct attribute *paiext_format_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group paiext_events_group = { - .name = "events", - .attrs = NULL, /* Filled in attr_event_init() */ -}; - -static struct attribute_group paiext_format_group = { - .name = "format", - .attrs = paiext_format_attr, -}; - -static const struct attribute_group *paiext_attr_groups[] = { - &paiext_events_group, - &paiext_format_group, - NULL, -}; - -/* Performance monitoring unit for mapped counters */ -static struct pmu paiext = { - .task_ctx_nr = perf_hw_context, - .event_init = paiext_event_init, - .add = paiext_add, - .del = paiext_del, - .start = paiext_start, - .stop = paiext_stop, - .read = paiext_read, - .sched_task = paiext_sched_task, - .attr_groups = paiext_attr_groups, -}; - -/* List of symbolic PAI extension 1 NNPA counter names. */ -static const char * const paiext_ctrnames[] = { - [0] = "NNPA_ALL", - [1] = "NNPA_ADD", - [2] = "NNPA_SUB", - [3] = "NNPA_MUL", - [4] = "NNPA_DIV", - [5] = "NNPA_MIN", - [6] = "NNPA_MAX", - [7] = "NNPA_LOG", - [8] = "NNPA_EXP", - [9] = "NNPA_IBM_RESERVED_9", - [10] = "NNPA_RELU", - [11] = "NNPA_TANH", - [12] = "NNPA_SIGMOID", - [13] = "NNPA_SOFTMAX", - [14] = "NNPA_BATCHNORM", - [15] = "NNPA_MAXPOOL2D", - [16] = "NNPA_AVGPOOL2D", - [17] = "NNPA_LSTMACT", - [18] = "NNPA_GRUACT", - [19] = "NNPA_CONVOLUTION", - [20] = "NNPA_MATMUL_OP", - [21] = "NNPA_MATMUL_OP_BCAST23", - [22] = "NNPA_SMALLBATCH", - [23] = "NNPA_LARGEDIM", - [24] = "NNPA_SMALLTENSOR", - [25] = "NNPA_1MFRAME", - [26] = "NNPA_2GFRAME", - [27] = "NNPA_ACCESSEXCEPT", - [28] = "NNPA_TRANSFORM", - [29] = "NNPA_GELU", - [30] = "NNPA_MOMENTS", - [31] = "NNPA_LAYERNORM", - [32] = "NNPA_MATMUL_OP_BCAST1", - [33] = "NNPA_SQRT", - [34] = "NNPA_INVSQRT", - [35] = "NNPA_NORM", - [36] = "NNPA_REDUCE", -}; - -static void __init attr_event_free(struct attribute **attrs, int num) -{ - struct perf_pmu_events_attr *pa; - struct device_attribute *dap; - int i; - - for (i = 0; i < num; i++) { - dap = container_of(attrs[i], struct device_attribute, attr); - pa = container_of(dap, struct perf_pmu_events_attr, attr); - kfree(pa); - } - kfree(attrs); -} - -static int __init attr_event_init_one(struct attribute **attrs, int num) -{ - struct perf_pmu_events_attr *pa; - - /* Index larger than array_size, no counter name available */ - if (num >= ARRAY_SIZE(paiext_ctrnames)) { - attrs[num] = NULL; - return 0; - } - - pa = kzalloc(sizeof(*pa), GFP_KERNEL); - if (!pa) - return -ENOMEM; - - sysfs_attr_init(&pa->attr.attr); - pa->id = PAI_NNPA_BASE + num; - pa->attr.attr.name = paiext_ctrnames[num]; - pa->attr.attr.mode = 0444; - pa->attr.show = cpumf_events_sysfs_show; - pa->attr.store = NULL; - attrs[num] = &pa->attr.attr; - return 0; -} - -/* Create PMU sysfs event attributes on the fly. */ -static int __init attr_event_init(void) -{ - struct attribute **attrs; - int ret, i; - - attrs = kmalloc_array(paiext_cnt + 2, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return -ENOMEM; - for (i = 0; i <= paiext_cnt; i++) { - ret = attr_event_init_one(attrs, i); - if (ret) { - attr_event_free(attrs, i); - return ret; - } - } - attrs[i] = NULL; - paiext_events_group.attrs = attrs; - return 0; -} - -static int __init paiext_init(void) -{ - struct qpaci_info_block ib; - int rc = -ENOMEM; - - if (!test_facility(197)) - return 0; - - qpaci(&ib); - paiext_cnt = ib.num_nnpa; - if (paiext_cnt >= PAI_NNPA_MAXCTR) - paiext_cnt = PAI_NNPA_MAXCTR; - if (!paiext_cnt) - return 0; - - rc = attr_event_init(); - if (rc) { - pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n"); - return rc; - } - - /* Setup s390dbf facility */ - paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); - if (!paiext_dbg) { - pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n"); - rc = -ENOMEM; - goto out_init; - } - debug_register_view(paiext_dbg, &debug_sprintf_view); - - rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1); - if (rc) { - pr_err("Registration of " KMSG_COMPONENT " PMU failed with " - "rc=%i\n", rc); - goto out_pmu; - } - - return 0; - -out_pmu: - debug_unregister_view(paiext_dbg, &debug_sprintf_view); - debug_unregister(paiext_dbg); -out_init: - attr_event_free(paiext_events_group.attrs, - ARRAY_SIZE(paiext_ctrnames) + 1); - return rc; -} - -device_initcall(paiext_init); diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c index a6b058ee4a36..7b305f1456f8 100644 --- a/arch/s390/kernel/perf_regs.c +++ b/arch/s390/kernel/perf_regs.c @@ -44,9 +44,6 @@ int perf_reg_validate(u64 mask) u64 perf_reg_abi(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_31BIT)) - return PERF_SAMPLE_REGS_ABI_32; - return PERF_SAMPLE_REGS_ABI_64; } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 9637aee43c40..0df95dcb2101 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -24,10 +24,8 @@ #include <linux/tick.h> #include <linux/personality.h> #include <linux/syscalls.h> -#include <linux/compat.h> #include <linux/kprobes.h> #include <linux/random.h> -#include <linux/export.h> #include <linux/init_task.h> #include <linux/entry-common.h> #include <linux/io.h> @@ -107,7 +105,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long new_stackp = args->stack; unsigned long tls = args->tls; struct fake_frame @@ -167,12 +165,8 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) /* Set a new TLS ? */ if (clone_flags & CLONE_SETTLS) { - if (is_compat_task()) { - p->thread.acrs[0] = (unsigned int)tls; - } else { - p->thread.acrs[0] = (unsigned int)(tls >> 32); - p->thread.acrs[1] = (unsigned int)tls; - } + p->thread.acrs[0] = (unsigned int)(tls >> 32); + p->thread.acrs[1] = (unsigned int)tls; } /* * s390 stores the svc return address in arch_data when calling diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 5ce9a795a0fe..e33a3eccda56 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -4,10 +4,10 @@ * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) */ -#define KMSG_COMPONENT "cpu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpu: " fmt #include <linux/stop_machine.h> +#include <linux/cpufeature.h> #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/random.h> @@ -19,6 +19,7 @@ #include <linux/cpu.h> #include <linux/smp.h> #include <asm/text-patching.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/facility.h> #include <asm/elf.h> @@ -72,7 +73,7 @@ void notrace stop_machine_yield(const struct cpumask *cpumask) this_cpu = smp_processor_id(); if (__this_cpu_inc_return(cpu_relax_retry) >= spin_retry) { __this_cpu_write(cpu_relax_retry, 0); - cpu = cpumask_next_wrap(this_cpu, cpumask, this_cpu, false); + cpu = cpumask_next_wrap(this_cpu, cpumask); if (cpu >= nr_cpu_ids) return; if (arch_vcpu_is_preempted(cpu)) @@ -209,14 +210,14 @@ static int __init setup_hwcaps(void) elf_hwcap |= HWCAP_DFP; /* huge page support */ - if (MACHINE_HAS_EDAT1) + if (cpu_has_edat1()) elf_hwcap |= HWCAP_HPAGE; /* 64-bit register support for 31-bit processes */ elf_hwcap |= HWCAP_HIGH_GPRS; /* transactional execution */ - if (MACHINE_HAS_TE) + if (machine_has_tx()) elf_hwcap |= HWCAP_TE; /* vector */ @@ -244,10 +245,10 @@ static int __init setup_hwcaps(void) elf_hwcap |= HWCAP_NNPA; /* guarded storage */ - if (MACHINE_HAS_GS) + if (cpu_has_gs()) elf_hwcap |= HWCAP_GS; - if (MACHINE_HAS_PCI_MIO) + if (test_machine_feature(MFEATURE_PCI_MIO)) elf_hwcap |= HWCAP_PCI_MIO; /* virtualization support */ @@ -266,31 +267,35 @@ static int __init setup_elf_platform(void) add_device_randomness(&cpu_id, sizeof(cpu_id)); switch (cpu_id.machine) { default: /* Use "z10" as default. */ - strcpy(elf_platform, "z10"); + strscpy(elf_platform, "z10"); break; case 0x2817: case 0x2818: - strcpy(elf_platform, "z196"); + strscpy(elf_platform, "z196"); break; case 0x2827: case 0x2828: - strcpy(elf_platform, "zEC12"); + strscpy(elf_platform, "zEC12"); break; case 0x2964: case 0x2965: - strcpy(elf_platform, "z13"); + strscpy(elf_platform, "z13"); break; case 0x3906: case 0x3907: - strcpy(elf_platform, "z14"); + strscpy(elf_platform, "z14"); break; case 0x8561: case 0x8562: - strcpy(elf_platform, "z15"); + strscpy(elf_platform, "z15"); break; case 0x3931: case 0x3932: - strcpy(elf_platform, "z16"); + strscpy(elf_platform, "z16"); + break; + case 0x9175: + case 0x9176: + strscpy(elf_platform, "z17"); break; } return 0; diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 1cfed8b710b8..125ca4c4e30c 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -7,10 +7,10 @@ * Martin Schwidefsky (schwidefsky@de.ibm.com) */ -#include "asm/ptrace.h" #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> +#include <linux/cpufeature.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> @@ -22,7 +22,6 @@ #include <linux/elf.h> #include <linux/regset.h> #include <linux/seccomp.h> -#include <linux/compat.h> #include <trace/syscall.h> #include <asm/guarded_storage.h> #include <asm/access-regs.h> @@ -31,14 +30,13 @@ #include <asm/unistd.h> #include <asm/runtime_instr.h> #include <asm/facility.h> +#include <asm/machine.h> +#include <asm/ptrace.h> +#include <asm/rwonce.h> #include <asm/fpu.h> #include "entry.h" -#ifdef CONFIG_COMPAT -#include "compat_ptrace.h" -#endif - void update_cr_regs(struct task_struct *task) { struct pt_regs *regs = task_pt_regs(task); @@ -60,7 +58,7 @@ void update_cr_regs(struct task_struct *task) cr0_new = cr0_old; cr2_new = cr2_old; /* Take care of the enable/disable of transactional execution. */ - if (MACHINE_HAS_TE) { + if (machine_has_tx()) { /* Set or clear transaction execution TXC bit 8. */ cr0_new.tcx = 1; if (task->thread.per_flags & PER_FLAG_NO_TE) @@ -75,7 +73,7 @@ void update_cr_regs(struct task_struct *task) } } /* Take care of enable/disable of guarded storage. */ - if (MACHINE_HAS_GS) { + if (cpu_has_gs()) { cr2_new.gse = 0; if (task->thread.gs_cb) cr2_new.gse = 1; @@ -470,18 +468,18 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_GET_LAST_BREAK: return put_user(child->thread.last_break, (unsigned long __user *)data); case PTRACE_ENABLE_TE: - if (!MACHINE_HAS_TE) + if (!machine_has_tx()) return -EIO; child->thread.per_flags &= ~PER_FLAG_NO_TE; return 0; case PTRACE_DISABLE_TE: - if (!MACHINE_HAS_TE) + if (!machine_has_tx()) return -EIO; child->thread.per_flags |= PER_FLAG_NO_TE; child->thread.per_flags &= ~PER_FLAG_TE_ABORT_RAND; return 0; case PTRACE_TE_ABORT_RAND: - if (!MACHINE_HAS_TE || (child->thread.per_flags & PER_FLAG_NO_TE)) + if (!machine_has_tx() || (child->thread.per_flags & PER_FLAG_NO_TE)) return -EIO; switch (data) { case 0UL: @@ -504,308 +502,6 @@ long arch_ptrace(struct task_struct *child, long request, } } -#ifdef CONFIG_COMPAT -/* - * Now the fun part starts... a 31 bit program running in the - * 31 bit emulation tracing another program. PTRACE_PEEKTEXT, - * PTRACE_PEEKDATA, PTRACE_POKETEXT and PTRACE_POKEDATA are easy - * to handle, the difference to the 64 bit versions of the requests - * is that the access is done in multiples of 4 byte instead of - * 8 bytes (sizeof(unsigned long) on 31/64 bit). - * The ugly part are PTRACE_PEEKUSR, PTRACE_PEEKUSR_AREA, - * PTRACE_POKEUSR and PTRACE_POKEUSR_AREA. If the traced program - * is a 31 bit program too, the content of struct user can be - * emulated. A 31 bit program peeking into the struct user of - * a 64 bit program is a no-no. - */ - -/* - * Same as peek_user_per but for a 31 bit program. - */ -static inline __u32 __peek_user_per_compat(struct task_struct *child, - addr_t addr) -{ - if (addr == offsetof(struct compat_per_struct_kernel, cr9)) - /* Control bits of the active per set. */ - return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? - PER_EVENT_IFETCH : child->thread.per_user.control; - else if (addr == offsetof(struct compat_per_struct_kernel, cr10)) - /* Start address of the active per set. */ - return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? - 0 : child->thread.per_user.start; - else if (addr == offsetof(struct compat_per_struct_kernel, cr11)) - /* End address of the active per set. */ - return test_thread_flag(TIF_SINGLE_STEP) ? - PSW32_ADDR_INSN : child->thread.per_user.end; - else if (addr == offsetof(struct compat_per_struct_kernel, bits)) - /* Single-step bit. */ - return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? - 0x80000000 : 0; - else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr)) - /* Start address of the user specified per set. */ - return (__u32) child->thread.per_user.start; - else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr)) - /* End address of the user specified per set. */ - return (__u32) child->thread.per_user.end; - else if (addr == offsetof(struct compat_per_struct_kernel, perc_atmid)) - /* PER code, ATMID and AI of the last PER trap */ - return (__u32) child->thread.per_event.cause << 16; - else if (addr == offsetof(struct compat_per_struct_kernel, address)) - /* Address of the last PER trap */ - return (__u32) child->thread.per_event.address; - else if (addr == offsetof(struct compat_per_struct_kernel, access_id)) - /* Access id of the last PER trap */ - return (__u32) child->thread.per_event.paid << 24; - return 0; -} - -/* - * Same as peek_user but for a 31 bit program. - */ -static u32 __peek_user_compat(struct task_struct *child, addr_t addr) -{ - addr_t offset; - __u32 tmp; - - if (addr < offsetof(struct compat_user, regs.acrs)) { - struct pt_regs *regs = task_pt_regs(child); - /* - * psw and gprs are stored on the stack - */ - if (addr == offsetof(struct compat_user, regs.psw.mask)) { - /* Fake a 31 bit psw mask. */ - tmp = (__u32)(regs->psw.mask >> 32); - tmp &= PSW32_MASK_USER | PSW32_MASK_RI; - tmp |= PSW32_USER_BITS; - } else if (addr == offsetof(struct compat_user, regs.psw.addr)) { - /* Fake a 31 bit psw address. */ - tmp = (__u32) regs->psw.addr | - (__u32)(regs->psw.mask & PSW_MASK_BA); - } else { - /* gpr 0-15 */ - tmp = *(__u32 *)((addr_t) ®s->psw + addr*2 + 4); - } - } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) { - /* - * access registers are stored in the thread structure - */ - offset = addr - offsetof(struct compat_user, regs.acrs); - tmp = *(__u32*)((addr_t) &child->thread.acrs + offset); - - } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) { - /* - * orig_gpr2 is stored on the kernel stack - */ - tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4); - - } else if (addr < offsetof(struct compat_user, regs.fp_regs)) { - /* - * prevent reads of padding hole between - * orig_gpr2 and fp_regs on s390. - */ - tmp = 0; - - } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) { - /* - * floating point control reg. is in the thread structure - */ - tmp = child->thread.ufpu.fpc; - - } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { - /* - * floating point regs. are in the child->thread.ufpu.vxrs array - */ - offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); - tmp = *(__u32 *)((addr_t)child->thread.ufpu.vxrs + 2 * offset); - } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { - /* - * Handle access to the per_info structure. - */ - addr -= offsetof(struct compat_user, regs.per_info); - tmp = __peek_user_per_compat(child, addr); - - } else - tmp = 0; - - return tmp; -} - -static int peek_user_compat(struct task_struct *child, - addr_t addr, addr_t data) -{ - __u32 tmp; - - if (!is_compat_task() || (addr & 3) || addr > sizeof(struct user) - 3) - return -EIO; - - tmp = __peek_user_compat(child, addr); - return put_user(tmp, (__u32 __user *) data); -} - -/* - * Same as poke_user_per but for a 31 bit program. - */ -static inline void __poke_user_per_compat(struct task_struct *child, - addr_t addr, __u32 data) -{ - if (addr == offsetof(struct compat_per_struct_kernel, cr9)) - /* PER event mask of the user specified per set. */ - child->thread.per_user.control = - data & (PER_EVENT_MASK | PER_CONTROL_MASK); - else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr)) - /* Starting address of the user specified per set. */ - child->thread.per_user.start = data; - else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr)) - /* Ending address of the user specified per set. */ - child->thread.per_user.end = data; -} - -/* - * Same as poke_user but for a 31 bit program. - */ -static int __poke_user_compat(struct task_struct *child, - addr_t addr, addr_t data) -{ - __u32 tmp = (__u32) data; - addr_t offset; - - if (addr < offsetof(struct compat_user, regs.acrs)) { - struct pt_regs *regs = task_pt_regs(child); - /* - * psw, gprs, acrs and orig_gpr2 are stored on the stack - */ - if (addr == offsetof(struct compat_user, regs.psw.mask)) { - __u32 mask = PSW32_MASK_USER; - - mask |= is_ri_task(child) ? PSW32_MASK_RI : 0; - /* Build a 64 bit psw mask from 31 bit mask. */ - if ((tmp ^ PSW32_USER_BITS) & ~mask) - /* Invalid psw mask. */ - return -EINVAL; - if ((data & PSW32_MASK_ASC) == PSW32_ASC_HOME) - /* Invalid address-space-control bits */ - return -EINVAL; - regs->psw.mask = (regs->psw.mask & ~PSW_MASK_USER) | - (regs->psw.mask & PSW_MASK_BA) | - (__u64)(tmp & mask) << 32; - } else if (addr == offsetof(struct compat_user, regs.psw.addr)) { - /* Build a 64 bit psw address from 31 bit address. */ - regs->psw.addr = (__u64) tmp & PSW32_ADDR_INSN; - /* Transfer 31 bit amode bit to psw mask. */ - regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) | - (__u64)(tmp & PSW32_ADDR_AMODE); - } else { - if (test_pt_regs_flag(regs, PIF_SYSCALL) && - addr == offsetof(struct compat_user, regs.gprs[2])) { - struct pt_regs *regs = task_pt_regs(child); - - regs->int_code = 0x20000 | (data & 0xffff); - } - /* gpr 0-15 */ - *(__u32*)((addr_t) ®s->psw + addr*2 + 4) = tmp; - } - } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) { - /* - * access registers are stored in the thread structure - */ - offset = addr - offsetof(struct compat_user, regs.acrs); - *(__u32*)((addr_t) &child->thread.acrs + offset) = tmp; - - } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) { - /* - * orig_gpr2 is stored on the kernel stack - */ - *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp; - - } else if (addr < offsetof(struct compat_user, regs.fp_regs)) { - /* - * prevent writess of padding hole between - * orig_gpr2 and fp_regs on s390. - */ - return 0; - - } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) { - /* - * floating point control reg. is in the thread structure - */ - child->thread.ufpu.fpc = data; - - } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { - /* - * floating point regs. are in the child->thread.ufpu.vxrs array - */ - offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); - *(__u32 *)((addr_t)child->thread.ufpu.vxrs + 2 * offset) = tmp; - } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { - /* - * Handle access to the per_info structure. - */ - addr -= offsetof(struct compat_user, regs.per_info); - __poke_user_per_compat(child, addr, data); - } - - return 0; -} - -static int poke_user_compat(struct task_struct *child, - addr_t addr, addr_t data) -{ - if (!is_compat_task() || (addr & 3) || - addr > sizeof(struct compat_user) - 3) - return -EIO; - - return __poke_user_compat(child, addr, data); -} - -long compat_arch_ptrace(struct task_struct *child, compat_long_t request, - compat_ulong_t caddr, compat_ulong_t cdata) -{ - unsigned long addr = caddr; - unsigned long data = cdata; - compat_ptrace_area parea; - int copied, ret; - - switch (request) { - case PTRACE_PEEKUSR: - /* read the word at location addr in the USER area. */ - return peek_user_compat(child, addr, data); - - case PTRACE_POKEUSR: - /* write the word at location addr in the USER area */ - return poke_user_compat(child, addr, data); - - case PTRACE_PEEKUSR_AREA: - case PTRACE_POKEUSR_AREA: - if (copy_from_user(&parea, (void __force __user *) addr, - sizeof(parea))) - return -EFAULT; - addr = parea.kernel_addr; - data = parea.process_addr; - copied = 0; - while (copied < parea.len) { - if (request == PTRACE_PEEKUSR_AREA) - ret = peek_user_compat(child, addr, data); - else { - __u32 utmp; - if (get_user(utmp, - (__u32 __force __user *) data)) - return -EFAULT; - ret = poke_user_compat(child, addr, utmp); - } - if (ret) - return ret; - addr += sizeof(unsigned int); - data += sizeof(unsigned int); - copied += sizeof(unsigned int); - } - return 0; - case PTRACE_GET_LAST_BREAK: - return put_user(child->thread.last_break, (unsigned int __user *)data); - } - return compat_ptrace_request(child, request, addr, data); -} -#endif - /* * user_regset definitions. */ @@ -1033,7 +729,7 @@ static int s390_gs_cb_get(struct task_struct *target, { struct gs_cb *data = target->thread.gs_cb; - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -ENODEV; if (!data) return -ENODATA; @@ -1050,10 +746,10 @@ static int s390_gs_cb_set(struct task_struct *target, struct gs_cb gs_cb = { }, *data = NULL; int rc; - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -ENODEV; if (!target->thread.gs_cb) { - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc_obj(*data); if (!data) return -ENOMEM; } @@ -1087,7 +783,7 @@ static int s390_gs_bc_get(struct task_struct *target, { struct gs_cb *data = target->thread.gs_bc_cb; - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -ENODEV; if (!data) return -ENODATA; @@ -1101,10 +797,10 @@ static int s390_gs_bc_set(struct task_struct *target, { struct gs_cb *data = target->thread.gs_bc_cb; - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -ENODEV; if (!data) { - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc_obj(*data); if (!data) return -ENOMEM; target->thread.gs_bc_cb = data; @@ -1165,7 +861,7 @@ static int s390_runtime_instr_set(struct task_struct *target, return -ENODEV; if (!target->thread.ri_cb) { - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc_obj(*data); if (!data) return -ENOMEM; } @@ -1206,7 +902,7 @@ static int s390_runtime_instr_set(struct task_struct *target, static const struct user_regset s390_regsets[] = { { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = sizeof(s390_regs) / sizeof(long), .size = sizeof(long), .align = sizeof(long), @@ -1214,7 +910,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_regs_set, }, { - .core_note_type = NT_PRFPREG, + USER_REGSET_NOTE_TYPE(PRFPREG), .n = sizeof(s390_fp_regs) / sizeof(long), .size = sizeof(long), .align = sizeof(long), @@ -1222,7 +918,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_fpregs_set, }, { - .core_note_type = NT_S390_SYSTEM_CALL, + USER_REGSET_NOTE_TYPE(S390_SYSTEM_CALL), .n = 1, .size = sizeof(unsigned int), .align = sizeof(unsigned int), @@ -1230,7 +926,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_system_call_set, }, { - .core_note_type = NT_S390_LAST_BREAK, + USER_REGSET_NOTE_TYPE(S390_LAST_BREAK), .n = 1, .size = sizeof(long), .align = sizeof(long), @@ -1238,7 +934,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_last_break_set, }, { - .core_note_type = NT_S390_TDB, + USER_REGSET_NOTE_TYPE(S390_TDB), .n = 1, .size = 256, .align = 1, @@ -1246,7 +942,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_tdb_set, }, { - .core_note_type = NT_S390_VXRS_LOW, + USER_REGSET_NOTE_TYPE(S390_VXRS_LOW), .n = __NUM_VXRS_LOW, .size = sizeof(__u64), .align = sizeof(__u64), @@ -1254,7 +950,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_vxrs_low_set, }, { - .core_note_type = NT_S390_VXRS_HIGH, + USER_REGSET_NOTE_TYPE(S390_VXRS_HIGH), .n = __NUM_VXRS_HIGH, .size = sizeof(__vector128), .align = sizeof(__vector128), @@ -1262,7 +958,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_vxrs_high_set, }, { - .core_note_type = NT_S390_GS_CB, + USER_REGSET_NOTE_TYPE(S390_GS_CB), .n = sizeof(struct gs_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), @@ -1270,7 +966,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_gs_cb_set, }, { - .core_note_type = NT_S390_GS_BC, + USER_REGSET_NOTE_TYPE(S390_GS_BC), .n = sizeof(struct gs_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), @@ -1278,7 +974,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_gs_bc_set, }, { - .core_note_type = NT_S390_RI_CB, + USER_REGSET_NOTE_TYPE(S390_RI_CB), .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), @@ -1294,225 +990,8 @@ static const struct user_regset_view user_s390_view = { .n = ARRAY_SIZE(s390_regsets) }; -#ifdef CONFIG_COMPAT -static int s390_compat_regs_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) -{ - unsigned n; - - if (target == current) - save_access_regs(target->thread.acrs); - - for (n = 0; n < sizeof(s390_compat_regs); n += sizeof(compat_ulong_t)) - membuf_store(&to, __peek_user_compat(target, n)); - return 0; -} - -static int s390_compat_regs_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int rc = 0; - - if (target == current) - save_access_regs(target->thread.acrs); - - if (kbuf) { - const compat_ulong_t *k = kbuf; - while (count > 0 && !rc) { - rc = __poke_user_compat(target, pos, *k++); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - const compat_ulong_t __user *u = ubuf; - while (count > 0 && !rc) { - compat_ulong_t word; - rc = __get_user(word, u++); - if (rc) - break; - rc = __poke_user_compat(target, pos, word); - count -= sizeof(*u); - pos += sizeof(*u); - } - } - - if (rc == 0 && target == current) - restore_access_regs(target->thread.acrs); - - return rc; -} - -static int s390_compat_regs_high_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) -{ - compat_ulong_t *gprs_high; - int i; - - gprs_high = (compat_ulong_t *)task_pt_regs(target)->gprs; - for (i = 0; i < NUM_GPRS; i++, gprs_high += 2) - membuf_store(&to, *gprs_high); - return 0; -} - -static int s390_compat_regs_high_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - compat_ulong_t *gprs_high; - int rc = 0; - - gprs_high = (compat_ulong_t *) - &task_pt_regs(target)->gprs[pos / sizeof(compat_ulong_t)]; - if (kbuf) { - const compat_ulong_t *k = kbuf; - while (count > 0) { - *gprs_high = *k++; - *gprs_high += 2; - count -= sizeof(*k); - } - } else { - const compat_ulong_t __user *u = ubuf; - while (count > 0 && !rc) { - unsigned long word; - rc = __get_user(word, u++); - if (rc) - break; - *gprs_high = word; - *gprs_high += 2; - count -= sizeof(*u); - } - } - - return rc; -} - -static int s390_compat_last_break_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) -{ - compat_ulong_t last_break = target->thread.last_break; - - return membuf_store(&to, (unsigned long)last_break); -} - -static int s390_compat_last_break_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - return 0; -} - -static const struct user_regset s390_compat_regsets[] = { - { - .core_note_type = NT_PRSTATUS, - .n = sizeof(s390_compat_regs) / sizeof(compat_long_t), - .size = sizeof(compat_long_t), - .align = sizeof(compat_long_t), - .regset_get = s390_compat_regs_get, - .set = s390_compat_regs_set, - }, - { - .core_note_type = NT_PRFPREG, - .n = sizeof(s390_fp_regs) / sizeof(compat_long_t), - .size = sizeof(compat_long_t), - .align = sizeof(compat_long_t), - .regset_get = s390_fpregs_get, - .set = s390_fpregs_set, - }, - { - .core_note_type = NT_S390_SYSTEM_CALL, - .n = 1, - .size = sizeof(compat_uint_t), - .align = sizeof(compat_uint_t), - .regset_get = s390_system_call_get, - .set = s390_system_call_set, - }, - { - .core_note_type = NT_S390_LAST_BREAK, - .n = 1, - .size = sizeof(long), - .align = sizeof(long), - .regset_get = s390_compat_last_break_get, - .set = s390_compat_last_break_set, - }, - { - .core_note_type = NT_S390_TDB, - .n = 1, - .size = 256, - .align = 1, - .regset_get = s390_tdb_get, - .set = s390_tdb_set, - }, - { - .core_note_type = NT_S390_VXRS_LOW, - .n = __NUM_VXRS_LOW, - .size = sizeof(__u64), - .align = sizeof(__u64), - .regset_get = s390_vxrs_low_get, - .set = s390_vxrs_low_set, - }, - { - .core_note_type = NT_S390_VXRS_HIGH, - .n = __NUM_VXRS_HIGH, - .size = sizeof(__vector128), - .align = sizeof(__vector128), - .regset_get = s390_vxrs_high_get, - .set = s390_vxrs_high_set, - }, - { - .core_note_type = NT_S390_HIGH_GPRS, - .n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t), - .size = sizeof(compat_long_t), - .align = sizeof(compat_long_t), - .regset_get = s390_compat_regs_high_get, - .set = s390_compat_regs_high_set, - }, - { - .core_note_type = NT_S390_GS_CB, - .n = sizeof(struct gs_cb) / sizeof(__u64), - .size = sizeof(__u64), - .align = sizeof(__u64), - .regset_get = s390_gs_cb_get, - .set = s390_gs_cb_set, - }, - { - .core_note_type = NT_S390_GS_BC, - .n = sizeof(struct gs_cb) / sizeof(__u64), - .size = sizeof(__u64), - .align = sizeof(__u64), - .regset_get = s390_gs_bc_get, - .set = s390_gs_bc_set, - }, - { - .core_note_type = NT_S390_RI_CB, - .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), - .size = sizeof(__u64), - .align = sizeof(__u64), - .regset_get = s390_runtime_instr_get, - .set = s390_runtime_instr_set, - }, -}; - -static const struct user_regset_view user_s390_compat_view = { - .name = "s390", - .e_machine = EM_S390, - .regsets = s390_compat_regsets, - .n = ARRAY_SIZE(s390_compat_regsets) -}; -#endif - const struct user_regset_view *task_user_regset_view(struct task_struct *task) { -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_31BIT)) - return &user_s390_compat_view; -#endif return &user_s390_view; } @@ -1521,13 +1000,6 @@ static const char *gpr_names[NUM_GPRS] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", }; -unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset) -{ - if (offset >= NUM_GPRS) - return 0; - return regs->gprs[offset]; -} - int regs_query_register_offset(const char *name) { unsigned long offset; @@ -1547,29 +1019,3 @@ const char *regs_query_register_name(unsigned int offset) return NULL; return gpr_names[offset]; } - -static int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) -{ - unsigned long ksp = kernel_stack_pointer(regs); - - return (addr & ~(THREAD_SIZE - 1)) == (ksp & ~(THREAD_SIZE - 1)); -} - -/** - * regs_get_kernel_stack_nth() - get Nth entry of the stack - * @regs:pt_regs which contains kernel stack pointer. - * @n:stack entry number. - * - * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which - * is specifined by @regs. If the @n th entry is NOT in the kernel stack, - * this returns 0. - */ -unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) -{ - unsigned long addr; - - addr = kernel_stack_pointer(regs) + n * sizeof(long); - if (!regs_within_kernel_stack(regs, addr)) - return 0; - return *(unsigned long *)addr; -} diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c index 1788a5454b6f..928bff8fcd4a 100644 --- a/arch/s390/kernel/runtime_instr.c +++ b/arch/s390/kernel/runtime_instr.c @@ -83,7 +83,7 @@ SYSCALL_DEFINE2(s390_runtime_instr, int, command, int, signum) return -EINVAL; if (!current->thread.ri_cb) { - cb = kzalloc(sizeof(*cb), GFP_KERNEL); + cb = kzalloc_obj(*cb); if (!cb) return -ENOMEM; } else { diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index d78bcfe707b5..b60284328fe3 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -13,8 +13,7 @@ * This file handles the architecture-dependent parts of initialization */ -#define KMSG_COMPONENT "setup" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "setup: " fmt #include <linux/errno.h> #include <linux/export.h> @@ -47,13 +46,13 @@ #include <linux/kexec.h> #include <linux/crash_dump.h> #include <linux/memory.h> -#include <linux/compat.h> #include <linux/start_kernel.h> #include <linux/hugetlb.h> #include <linux/kmemleak.h> #include <asm/archrandom.h> #include <asm/boot_data.h> +#include <asm/machine.h> #include <asm/ipl.h> #include <asm/facility.h> #include <asm/smp.h> @@ -111,7 +110,7 @@ struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amod * Because the AMODE31 sections are relocated below 2G at startup, * the content of control registers CR2, CR5 and CR15 must be updated * with new addresses after the relocation. The initial initialization of - * control registers occurs in head64.S and then gets updated again after AMODE31 + * control registers occurs in head.S and then gets updated again after AMODE31 * relocation. We must access the relevant AMODE31 tables indirectly via * pointers placed in the .amode31.refs linker section. Those pointers get * updated automatically during AMODE31 relocation and always contain a valid @@ -180,8 +179,6 @@ unsigned long __bootdata_preserved(MODULES_END); struct lowcore *lowcore_ptr[NR_CPUS]; EXPORT_SYMBOL(lowcore_ptr); -DEFINE_STATIC_KEY_FALSE(cpu_has_bear); - /* * The Write Back bit position in the physaddr is given by the SLPC PCI. * Leaving the mask zero always uses write through which is safe @@ -251,7 +248,7 @@ static void __init conmode_default(void) char query_buffer[1024]; char *ptr; - if (MACHINE_IS_VM) { + if (machine_is_vm()) { cpcmd("QUERY CONSOLE", query_buffer, 1024, NULL); console_devno = simple_strtoul(query_buffer + 5, NULL, 16); ptr = strstr(query_buffer, "SUBCHANNEL ="); @@ -289,7 +286,7 @@ static void __init conmode_default(void) SET_CONSOLE_SCLP; #endif } - } else if (MACHINE_IS_KVM) { + } else if (machine_is_kvm()) { if (sclp.has_vt220 && IS_ENABLED(CONFIG_SCLP_VT220_CONSOLE)) SET_CONSOLE_VT220; else if (sclp.has_linemode && IS_ENABLED(CONFIG_SCLP_CONSOLE)) @@ -415,7 +412,6 @@ static void __init setup_lowcore(void) lc->clock_comparator = clock_comparator_max; lc->current_task = (unsigned long)&init_task; lc->lpp = LPP_MAGIC; - lc->machine_flags = get_lowcore()->machine_flags; lc->preempt_count = get_lowcore()->preempt_count; nmi_alloc_mcesa_early(&lc->mcesad); lc->sys_enter_timer = get_lowcore()->sys_enter_timer; @@ -607,7 +603,7 @@ static void __init reserve_crashkernel(void) int rc; rc = parse_crashkernel(boot_command_line, ident_map_size, - &crash_size, &crash_base, NULL, NULL); + &crash_size, &crash_base, NULL, NULL, NULL); crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN); crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN); @@ -652,7 +648,7 @@ static void __init reserve_crashkernel(void) return; } - if (!oldmem_data.start && MACHINE_IS_VM) + if (!oldmem_data.start && machine_is_vm()) diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; @@ -721,6 +717,11 @@ static void __init memblock_add_physmem_info(void) memblock_set_node(0, ULONG_MAX, &memblock.memory, 0); } +static void __init setup_high_memory(void) +{ + high_memory = __va(ident_map_size); +} + /* * Reserve memory used for lowcore. */ @@ -836,7 +837,7 @@ static void __init setup_control_program_code(void) return; diag_stat_inc(DIAG_STAT_X318); - asm volatile("diag %0,0,0x318\n" : : "d" (diag318_info.val)); + asm volatile("diag %0,0,0x318" : : "d" (diag318_info.val)); } /* @@ -898,12 +899,12 @@ void __init setup_arch(char **cmdline_p) /* * print what head.S has found out about the machine */ - if (MACHINE_IS_VM) + if (machine_is_vm()) pr_info("Linux is running as a z/VM " "guest operating system in 64-bit mode\n"); - else if (MACHINE_IS_KVM) + else if (machine_is_kvm()) pr_info("Linux is running under KVM in 64-bit mode\n"); - else if (MACHINE_IS_LPAR) + else if (machine_is_lpar()) pr_info("Linux is running natively in 64-bit mode\n"); else pr_info("Linux is running as a guest in 64-bit mode\n"); @@ -911,7 +912,7 @@ void __init setup_arch(char **cmdline_p) if (!boot_earlyprintk) boot_rb_foreach(print_rb_entry); - if (have_relocated_lowcore()) + if (machine_has_relocated_lowcore()) pr_info("Lowcore relocated to 0x%px\n", get_lowcore()); log_component_list(); @@ -953,6 +954,7 @@ void __init setup_arch(char **cmdline_p) free_physmem_info(); setup_memory_end(); + setup_high_memory(); memblock_dump_all(); setup_memory(); @@ -961,8 +963,6 @@ void __init setup_arch(char **cmdline_p) setup_uv(); dma_contiguous_reserve(ident_map_size); vmcp_cma_reserve(); - if (MACHINE_HAS_EDAT2) - hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); reserve_crashkernel(); #ifdef CONFIG_CRASH_DUMP @@ -981,10 +981,6 @@ void __init setup_arch(char **cmdline_p) numa_setup(); smp_detect_cpus(); topology_init_early(); - - if (test_facility(193)) - static_branch_enable(&cpu_has_bear); - setup_protection_map(); /* * Create kernel page tables. diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index e48013cd832c..4874de5edea0 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -27,7 +27,6 @@ #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/syscalls.h> -#include <linux/compat.h> #include <asm/ucontext.h> #include <linux/uaccess.h> #include <asm/vdso-symbols.h> @@ -290,12 +289,6 @@ static int setup_frame(int sig, struct k_sigaction *ka, unsigned long restorer; size_t frame_size; - /* - * gprs_high are only present for a 31-bit task running on - * a 64-bit kernel (see compat_signal.c) but the space for - * gprs_high need to be allocated if vector registers are - * included in the signal frame on a 31-bit system. - */ frame_size = sizeof(*frame) - sizeof(frame->sregs_ext); if (cpu_has_vx()) frame_size += sizeof(frame->sregs_ext); @@ -333,7 +326,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, if (ka->sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ka->sa.sa_restorer; else - restorer = VDSO64_SYMBOL(current, sigreturn); + restorer = VDSO_SYMBOL(current, sigreturn); /* Set up registers for signal handler */ regs->gprs[14] = restorer; @@ -367,12 +360,6 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, size_t frame_size; frame_size = sizeof(struct rt_sigframe) - sizeof(_sigregs_ext); - /* - * gprs_high are only present for a 31-bit task running on - * a 64-bit kernel (see compat_signal.c) but the space for - * gprs_high need to be allocated if vector registers are - * included in the signal frame on a 31-bit system. - */ uc_flags = 0; if (cpu_has_vx()) { frame_size += sizeof(_sigregs_ext); @@ -391,7 +378,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ksig->ka.sa.sa_restorer; else - restorer = VDSO64_SYMBOL(current, rt_sigreturn); + restorer = VDSO_SYMBOL(current, rt_sigreturn); /* Create siginfo on the signal stack */ if (copy_siginfo_to_user(&frame->info, &ksig->info)) @@ -490,10 +477,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs) clear_pt_regs_flag(regs, PIF_SYSCALL); rseq_signal_deliver(&ksig, regs); - if (is_compat_task()) - handle_signal32(&ksig, oldset, regs); - else - handle_signal(&ksig, oldset, regs); + handle_signal(&ksig, oldset, regs); return; } @@ -506,10 +490,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs) /* Restart with sys_restart_syscall */ regs->gprs[2] = regs->orig_gpr2; current->restart_block.arch_data = regs->psw.addr; - if (is_compat_task()) - regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall); - else - regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall); + regs->psw.addr = VDSO_SYMBOL(current, restart_syscall); if (test_thread_flag(TIF_SINGLE_STEP)) clear_thread_flag(TIF_PER_TRAP); break; diff --git a/arch/s390/kernel/skey.c b/arch/s390/kernel/skey.c new file mode 100644 index 000000000000..cc869de6e3a5 --- /dev/null +++ b/arch/s390/kernel/skey.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <asm/rwonce.h> +#include <asm/page.h> +#include <asm/skey.h> + +int skey_regions_initialized; + +static inline unsigned long load_real_address(unsigned long address) +{ + unsigned long real; + + asm volatile( + " lra %[real],0(%[address])" + : [real] "=d" (real) + : [address] "a" (address) + : "cc"); + return real; +} + +/* + * Initialize storage keys of registered memory regions with the + * default key. This is useful for code which is executed with a + * non-default access key. + */ +void __skey_regions_initialize(void) +{ + unsigned long address, real; + struct skey_region *r, *end; + + r = __skey_region_start; + end = __skey_region_end; + while (r < end) { + address = r->start & PAGE_MASK; + do { + real = load_real_address(address); + page_set_storage_key(real, PAGE_DEFAULT_KEY, 1); + address += PAGE_SIZE; + } while (address < r->end); + r++; + } + /* + * Make sure storage keys are initialized before + * skey_regions_initialized is changed. + */ + barrier(); + WRITE_ONCE(skey_regions_initialized, 1); +} diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 7b08399b0846..50bb499cf3e5 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -15,9 +15,9 @@ * operates on physical cpu numbers needs to go into smp.c. */ -#define KMSG_COMPONENT "cpu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpu: " fmt +#include <linux/cpufeature.h> #include <linux/workqueue.h> #include <linux/memblock.h> #include <linux/export.h> @@ -38,6 +38,7 @@ #include <linux/kprobes.h> #include <asm/access-regs.h> #include <asm/asm-offsets.h> +#include <asm/machine.h> #include <asm/ctlreg.h> #include <asm/pfault.h> #include <asm/diag.h> @@ -97,13 +98,6 @@ __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS]; static unsigned int smp_max_threads __initdata = -1U; cpumask_t cpu_setup_mask; -static int __init early_nosmt(char *s) -{ - smp_max_threads = 1; - return 0; -} -early_param("nosmt", early_nosmt); - static int __init early_smt(char *s) { get_option(&s, &smp_max_threads); @@ -180,13 +174,10 @@ static struct pcpu *pcpu_find_address(const struct cpumask *mask, u16 address) static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit) { - int order; - if (test_and_set_bit(ec_bit, &pcpu->ec_mask)) return; - order = pcpu_running(pcpu) ? SIGP_EXTERNAL_CALL : SIGP_EMERGENCY_SIGNAL; pcpu->ec_clk = get_tod_clock_fast(); - pcpu_sigp_retry(pcpu, order, 0); + pcpu_sigp_retry(pcpu, SIGP_EXTERNAL_CALL, 0); } static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) @@ -263,13 +254,12 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) lc->percpu_offset = __per_cpu_offset[cpu]; lc->kernel_asce = get_lowcore()->kernel_asce; lc->user_asce = s390_invalid_asce; - lc->machine_flags = get_lowcore()->machine_flags; lc->user_timer = lc->system_timer = lc->steal_timer = lc->avg_steal_timer = 0; abs_lc = get_abs_lowcore(); memcpy(lc->cregs_save_area, abs_lc->cregs_save_area, sizeof(lc->cregs_save_area)); put_abs_lowcore(abs_lc); - lc->cregs_save_area[1] = lc->kernel_asce; + lc->cregs_save_area[1] = lc->user_asce; lc->cregs_save_area[7] = lc->user_asce; save_access_regs((unsigned int *) lc->access_regs_save_area); arch_spin_lock_setup(cpu); @@ -290,6 +280,9 @@ static void pcpu_attach_task(int cpu, struct task_struct *tsk) lc->hardirq_timer = tsk->thread.hardirq_timer; lc->softirq_timer = tsk->thread.softirq_timer; lc->steal_timer = 0; +#ifdef CONFIG_STACKPROTECTOR + lc->stack_canary = tsk->stack_canary; +#endif } static void pcpu_start_fn(int cpu, void (*func)(void *), void *data) @@ -314,9 +307,9 @@ static void __pcpu_delegate(pcpu_delegate_fn *func, void *data) func(data); /* should not return */ } -static void pcpu_delegate(struct pcpu *pcpu, int cpu, - pcpu_delegate_fn *func, - void *data, unsigned long stack) +static void __noreturn pcpu_delegate(struct pcpu *pcpu, int cpu, + pcpu_delegate_fn *func, + void *data, unsigned long stack) { struct lowcore *lc, *abs_lc; unsigned int source_cpu; @@ -349,7 +342,7 @@ static void pcpu_delegate(struct pcpu *pcpu, int cpu, "0: sigp 0,%0,%2 # sigp restart to target cpu\n" " brc 2,0b # busy, try again\n" "1: sigp 0,%1,%3 # sigp stop to current cpu\n" - " brc 2,1b # busy, try again\n" + " brc 2,1b # busy, try again" : : "d" (pcpu->address), "d" (source_cpu), "K" (SIGP_RESTART), "K" (SIGP_STOP) : "0", "1", "cc"); @@ -379,7 +372,7 @@ static int pcpu_set_smt(unsigned int mtid) /* * Call function on the ipl CPU. */ -void smp_call_ipl_cpu(void (*func)(void *), void *data) +void __noreturn smp_call_ipl_cpu(void (*func)(void *), void *data) { struct lowcore *lc = lowcore_ptr[0]; @@ -416,7 +409,7 @@ EXPORT_SYMBOL(arch_vcpu_is_preempted); void notrace smp_yield_cpu(int cpu) { - if (!MACHINE_HAS_DIAG9C) + if (!machine_has_diag9c()) return; diag_stat_inc_norecursion(DIAG_STAT_X09C); asm volatile("diag %0,0,0x9c" @@ -439,16 +432,16 @@ void notrace smp_emergency_stop(void) cpumask_copy(&cpumask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &cpumask); - end = get_tod_clock() + (1000000UL << 12); + end = get_tod_clock_monotonic() + (1000000UL << 12); for_each_cpu(cpu, &cpumask) { struct pcpu *pcpu = per_cpu_ptr(&pcpu_devices, cpu); set_bit(ec_stop_cpu, &pcpu->ec_mask); while (__pcpu_sigp(pcpu->address, SIGP_EMERGENCY_SIGNAL, 0, NULL) == SIGP_CC_BUSY && - get_tod_clock() < end) + get_tod_clock_monotonic() < end) cpu_relax(); } - while (get_tod_clock() < end) { + while (get_tod_clock_monotonic() < end) { for_each_cpu(cpu, &cpumask) if (pcpu_stopped(per_cpu_ptr(&pcpu_devices, cpu))) cpumask_clear_cpu(cpu, &cpumask); @@ -561,10 +554,10 @@ int smp_store_status(int cpu) if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS, pa) != SIGP_CC_ORDER_CODE_ACCEPTED) return -EIO; - if (!cpu_has_vx() && !MACHINE_HAS_GS) + if (!cpu_has_vx() && !cpu_has_gs()) return 0; pa = lc->mcesad & MCESA_ORIGIN_MASK; - if (MACHINE_HAS_GS) + if (cpu_has_gs()) pa |= lc->mcesad & MCESA_LC_MASK; if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS, pa) != SIGP_CC_ORDER_CODE_ACCEPTED) @@ -706,6 +699,7 @@ static void __ref smp_get_core_info(struct sclp_core_info *info, int early) continue; info->core[info->configured].core_id = address >> smp_cpu_mt_shift; + info->core[info->configured].type = boot_core_type; info->configured++; } info->combined = info->configured; @@ -807,6 +801,7 @@ void __init smp_detect_cpus(void) mtid = boot_core_type ? sclp.mtid : sclp.mtid_cp; mtid = (mtid < smp_max_threads) ? mtid : smp_max_threads - 1; pcpu_set_smt(mtid); + cpu_smt_set_num_threads(smp_cpu_mtid + 1, smp_cpu_mtid + 1); /* Print number of CPUs */ c_cpus = s_cpus = 0; @@ -1150,13 +1145,13 @@ int __ref smp_rescan_cpus(bool early) struct sclp_core_info *info; int nr; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = kzalloc_obj(*info); if (!info) return -ENOMEM; smp_get_core_info(info, 0); nr = __smp_rescan_cpus(info, early); kfree(info); - if (nr) + if (nr && !early) topology_schedule_update(); return 0; } diff --git a/arch/s390/kernel/stackprotector.c b/arch/s390/kernel/stackprotector.c new file mode 100644 index 000000000000..8bd3ecf9200a --- /dev/null +++ b/arch/s390/kernel/stackprotector.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef pr_fmt +#define pr_fmt(fmt) "stackprot: " fmt +#endif + +#include <linux/export.h> +#include <linux/hex.h> +#include <linux/uaccess.h> +#include <linux/printk.h> +#include <asm/abs_lowcore.h> +#include <asm/sections.h> +#include <asm/machine.h> +#include <asm/asm-offsets.h> +#include <asm/arch-stackprotector.h> + +#ifdef __DECOMPRESSOR + +#define DEBUGP boot_debug +#define EMERGP boot_emerg +#define PANIC boot_panic + +#else /* __DECOMPRESSOR */ + +#define DEBUGP pr_debug +#define EMERGP pr_emerg +#define PANIC panic + +#endif /* __DECOMPRESSOR */ + +int __bootdata_preserved(stack_protector_debug); + +unsigned long __stack_chk_guard; +EXPORT_SYMBOL(__stack_chk_guard); + +struct insn_ril { + u8 opc1 : 8; + u8 r1 : 4; + u8 opc2 : 4; + u32 imm; +} __packed; + +/* + * Convert a virtual instruction address to a real instruction address. The + * decompressor needs to patch instructions within the kernel image based on + * their virtual addresses, while dynamic address translation is still + * disabled. Therefore a translation from virtual kernel image addresses to + * the corresponding physical addresses is required. + * + * After dynamic address translation is enabled and when the kernel needs to + * patch instructions such a translation is not required since the addresses + * are identical. + */ +static struct insn_ril *vaddress_to_insn(unsigned long vaddress) +{ +#ifdef __DECOMPRESSOR + return (struct insn_ril *)__kernel_pa(vaddress); +#else + return (struct insn_ril *)vaddress; +#endif +} + +static unsigned long insn_to_vaddress(struct insn_ril *insn) +{ +#ifdef __DECOMPRESSOR + return (unsigned long)__kernel_va(insn); +#else + return (unsigned long)insn; +#endif +} + +#define INSN_RIL_STRING_SIZE (sizeof(struct insn_ril) * 2 + 1) + +static void insn_ril_to_string(char *str, struct insn_ril *insn) +{ + u8 *ptr = (u8 *)insn; + int i; + + for (i = 0; i < sizeof(*insn); i++) + hex_byte_pack(&str[2 * i], ptr[i]); + str[2 * i] = 0; +} + +static void stack_protector_dump(struct insn_ril *old, struct insn_ril *new) +{ + char ostr[INSN_RIL_STRING_SIZE]; + char nstr[INSN_RIL_STRING_SIZE]; + + insn_ril_to_string(ostr, old); + insn_ril_to_string(nstr, new); + DEBUGP("%016lx: %s -> %s\n", insn_to_vaddress(old), ostr, nstr); +} + +static int stack_protector_verify(struct insn_ril *insn, unsigned long kernel_start) +{ + char istr[INSN_RIL_STRING_SIZE]; + unsigned long vaddress, offset; + + /* larl */ + if (insn->opc1 == 0xc0 && insn->opc2 == 0x0) + return 0; + /* lgrl */ + if (insn->opc1 == 0xc4 && insn->opc2 == 0x8) + return 0; + insn_ril_to_string(istr, insn); + vaddress = insn_to_vaddress(insn); + if (__is_defined(__DECOMPRESSOR)) { + offset = (unsigned long)insn - kernel_start + TEXT_OFFSET; + EMERGP("Unexpected instruction at %016lx/%016lx: %s\n", vaddress, offset, istr); + PANIC("Stackprotector error\n"); + } else { + EMERGP("Unexpected instruction at %016lx: %s\n", vaddress, istr); + } + return -EINVAL; +} + +int __stack_protector_apply(unsigned long *start, unsigned long *end, unsigned long kernel_start) +{ + unsigned long canary, *loc; + struct insn_ril *insn, new; + int rc; + + /* + * Convert LARL/LGRL instructions to LLILF so register R1 contains the + * address of the per-cpu / per-process stack canary: + * + * LARL/LGRL R1,__stack_chk_guard => LLILF R1,__lc_stack_canary + */ + canary = __LC_STACK_CANARY; + if (machine_has_relocated_lowcore()) + canary += LOWCORE_ALT_ADDRESS; + for (loc = start; loc < end; loc++) { + insn = vaddress_to_insn(*loc); + rc = stack_protector_verify(insn, kernel_start); + if (rc) + return rc; + new = *insn; + new.opc1 = 0xc0; + new.opc2 = 0xf; + new.imm = canary; + if (stack_protector_debug) + stack_protector_dump(insn, &new); + s390_kernel_write(insn, &new, sizeof(*insn)); + } + return 0; +} + +#ifdef __DECOMPRESSOR +void __stack_protector_apply_early(unsigned long kernel_start) +{ + unsigned long *start, *end; + + start = (unsigned long *)vmlinux.stack_prot_start; + end = (unsigned long *)vmlinux.stack_prot_end; + __stack_protector_apply(start, end, kernel_start); +} +#endif diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 40edfde25f5b..18520d333058 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -8,7 +8,7 @@ #include <linux/perf_event.h> #include <linux/stacktrace.h> #include <linux/uaccess.h> -#include <linux/compat.h> +#include <asm/asm-offsets.h> #include <asm/stacktrace.h> #include <asm/unwind.h> #include <asm/kprobes.h> @@ -104,10 +104,7 @@ void arch_stack_walk_user_common(stack_trace_consume_fn consume_entry, void *coo struct stack_frame_vdso_wrapper __user *sf_vdso; struct stack_frame_user __user *sf; unsigned long ip, sp; - bool first = true; - if (is_compat_task()) - return; if (!current->mm) return; ip = instruction_pointer(regs); @@ -135,24 +132,11 @@ void arch_stack_walk_user_common(stack_trace_consume_fn consume_entry, void *coo if (__get_user(ip, &sf->gprs[8])) break; } - /* Sanity check: ABI requires SP to be 8 byte aligned. */ - if (sp & 0x7) + /* Validate SP and RA (ABI requires SP to be 8 byte aligned). */ + if (sp & 0x7 || ip_invalid(ip)) break; - if (ip_invalid(ip)) { - /* - * If the instruction address is invalid, and this - * is the first stack frame, assume r14 has not - * been written to the stack yet. Otherwise exit. - */ - if (!first) - break; - ip = regs->gprs[14]; - if (ip_invalid(ip)) - break; - } if (!store_ip(consume_entry, cookie, entry, perf, ip)) break; - first = false; } pagefault_enable(); } diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index d40f0b983e74..5eae2e25997a 100644 --- a/arch/s390/kernel/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -5,6 +5,8 @@ * Copyright IBM Corp. 2016 * Author(s): Janosch Frank <frankja@linux.vnet.ibm.com> */ + +#include <linux/export.h> #include <linux/errno.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> @@ -251,7 +253,7 @@ static void fill_diag_mac(struct sthyi_sctns *sctns, sctns->mac.infmval1 |= MAC_CNT_VLD; } -/* Returns a pointer to the the next partition block. */ +/* Returns a pointer to the next partition block. */ static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, bool this_lpar, void *diag224_buf, diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index 5ec28028315b..75d5a3cab14e 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -12,6 +12,8 @@ * platform. */ +#include <linux/cpufeature.h> +#include <linux/nospec.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/mm.h> @@ -38,6 +40,16 @@ #include "entry.h" +#define __SYSCALL(nr, sym) long __s390x_##sym(struct pt_regs *); +#include <asm/syscall_table.h> +#undef __SYSCALL + +#define __SYSCALL(nr, sym) [nr] = (__s390x_##sym), +const sys_call_ptr_t sys_call_table[__NR_syscalls] = { +#include <asm/syscall_table.h> +}; +#undef __SYSCALL + #ifdef CONFIG_SYSVIPC /* * sys_ipc() is the de-multiplexer for the SysV IPC calls. @@ -81,25 +93,35 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -static void do_syscall(struct pt_regs *regs) +void noinstr __do_syscall(struct pt_regs *regs, int per_trap) { unsigned long nr; + enter_from_user_mode(regs); + add_random_kstack_offset(); + regs->psw = get_lowcore()->svc_old_psw; + regs->int_code = get_lowcore()->svc_int_code; + update_timer_sys(); + if (cpu_has_bear()) + current->thread.last_break = regs->last_break; + local_irq_enable(); + regs->orig_gpr2 = regs->gprs[2]; + if (unlikely(per_trap)) + set_thread_flag(TIF_PER_TRAP); + regs->flags = 0; + set_pt_regs_flag(regs, PIF_SYSCALL); nr = regs->int_code & 0xffff; - if (!nr) { + if (likely(!nr)) { nr = regs->gprs[1] & 0xffff; regs->int_code &= ~0xffffUL; regs->int_code |= nr; } - regs->gprs[2] = nr; - if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) { regs->psw.addr = current->restart_block.arch_data; current->restart_block.arch_data = 1; } nr = syscall_enter_from_user_mode_work(regs, nr); - /* * In the s390 ptrace ABI, both the syscall number and the return value * use gpr2. However, userspace puts the syscall number either in the @@ -107,37 +129,13 @@ static void do_syscall(struct pt_regs *regs) * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here * and if set, the syscall will be skipped. */ - if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET))) goto out; regs->gprs[2] = -ENOSYS; - if (likely(nr >= NR_syscalls)) - goto out; - do { - regs->gprs[2] = current->thread.sys_call_table[nr](regs); - } while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART)); + if (likely(nr < NR_syscalls)) { + nr = array_index_nospec(nr, NR_syscalls); + regs->gprs[2] = sys_call_table[nr](regs); + } out: - syscall_exit_to_user_mode_work(regs); -} - -void noinstr __do_syscall(struct pt_regs *regs, int per_trap) -{ - add_random_kstack_offset(); - enter_from_user_mode(regs); - regs->psw = get_lowcore()->svc_old_psw; - regs->int_code = get_lowcore()->svc_int_code; - update_timer_sys(); - if (static_branch_likely(&cpu_has_bear)) - current->thread.last_break = regs->last_break; - - local_irq_enable(); - regs->orig_gpr2 = regs->gprs[2]; - - if (per_trap) - set_thread_flag(TIF_PER_TRAP); - - regs->flags = 0; - set_pt_regs_flag(regs, PIF_SYSCALL); - do_syscall(regs); - exit_to_user_mode(); + syscall_exit_to_user_mode(regs); } diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile index c5d958a09ff4..d5fca0ca0890 100644 --- a/arch/s390/kernel/syscalls/Makefile +++ b/arch/s390/kernel/syscalls/Makefile @@ -1,48 +1,32 @@ # SPDX-License-Identifier: GPL-2.0 +kapi := arch/$(SRCARCH)/include/generated/asm +uapi := arch/$(SRCARCH)/include/generated/uapi/asm -gen := arch/$(ARCH)/include/generated -kapi := $(gen)/asm -uapi := $(gen)/uapi/asm - -syscall := $(src)/syscall.tbl -systbl := $(src)/syscalltbl - -gen-y := $(kapi)/syscall_table.h -kapi-hdrs-y := $(kapi)/unistd_nr.h -uapi-hdrs-y := $(uapi)/unistd_32.h -uapi-hdrs-y += $(uapi)/unistd_64.h - -targets += $(addprefix ../../../../,$(gen-y) $(kapi-hdrs-y) $(uapi-hdrs-y)) - -PHONY += kapi uapi - -kapi: $(gen-y) $(kapi-hdrs-y) -uapi: $(uapi-hdrs-y) - - -# Create output directory if not already present $(shell mkdir -p $(uapi) $(kapi)) -quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$@" < $< > $@ - -quiet_cmd_sysnr = SYSNR $@ - cmd_sysnr = $(CONFIG_SHELL) '$(systbl)' -N -a $(sysnr_abi_$(basetarget)) < $< > $@ +syscall := $(src)/syscall.tbl +syshdr := $(srctree)/scripts/syscallhdr.sh +systbl := $(srctree)/scripts/syscalltbl.sh -quiet_cmd_syscalls = SYSTBL $@ - cmd_syscalls = $(CONFIG_SHELL) '$(systbl)' -S < $< > $@ +quiet_cmd_syshdr = SYSHDR $@ + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr --abis common,$* $< $@ -syshdr_abi_unistd_32 := common,32 -$(uapi)/unistd_32.h: $(syscall) $(systbl) FORCE - $(call if_changed,syshdr) +quiet_cmd_systbl = SYSTBL $@ + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis common,$* $< $@ -syshdr_abi_unistd_64 := common,64 -$(uapi)/unistd_64.h: $(syscall) $(systbl) FORCE +$(uapi)/unistd_%.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) $(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE - $(call if_changed,syscalls) + $(call if_changed,systbl) + +uapisyshdr-y += unistd_64.h +kapisyshdr-y += syscall_table.h + +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) -sysnr_abi_unistd_nr := common,32,64 -$(kapi)/unistd_nr.h: $(syscall) $(systbl) FORCE - $(call if_changed,sysnr) +PHONY += all +all: $(uapisyshdr-y) $(kapisyshdr-y) + @: diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index e9115b4d8b63..09a7ef04d979 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -3,469 +3,398 @@ # System call table for s390 # # Format: +# <nr> <abi> <syscall> <entry> # -# <nr> <abi> <syscall> <entry-64bit> <compat-entry> -# -# where <abi> can be common, 64, or 32 +# <abi> is always common. -1 common exit sys_exit sys_exit -2 common fork sys_fork sys_fork -3 common read sys_read compat_sys_s390_read -4 common write sys_write compat_sys_s390_write -5 common open sys_open compat_sys_open -6 common close sys_close sys_close -7 common restart_syscall sys_restart_syscall sys_restart_syscall -8 common creat sys_creat sys_creat -9 common link sys_link sys_link -10 common unlink sys_unlink sys_unlink -11 common execve sys_execve compat_sys_execve -12 common chdir sys_chdir sys_chdir -13 32 time - sys_time32 -14 common mknod sys_mknod sys_mknod -15 common chmod sys_chmod sys_chmod -16 32 lchown - sys_lchown16 -19 common lseek sys_lseek compat_sys_lseek -20 common getpid sys_getpid sys_getpid -21 common mount sys_mount sys_mount -22 common umount sys_oldumount sys_oldumount -23 32 setuid - sys_setuid16 -24 32 getuid - sys_getuid16 -25 32 stime - sys_stime32 -26 common ptrace sys_ptrace compat_sys_ptrace -27 common alarm sys_alarm sys_alarm -29 common pause sys_pause sys_pause -30 common utime sys_utime sys_utime32 -33 common access sys_access sys_access -34 common nice sys_nice sys_nice -36 common sync sys_sync sys_sync -37 common kill sys_kill sys_kill -38 common rename sys_rename sys_rename -39 common mkdir sys_mkdir sys_mkdir -40 common rmdir sys_rmdir sys_rmdir -41 common dup sys_dup sys_dup -42 common pipe sys_pipe sys_pipe -43 common times sys_times compat_sys_times -45 common brk sys_brk sys_brk -46 32 setgid - sys_setgid16 -47 32 getgid - sys_getgid16 -48 common signal sys_signal sys_signal -49 32 geteuid - sys_geteuid16 -50 32 getegid - sys_getegid16 -51 common acct sys_acct sys_acct -52 common umount2 sys_umount sys_umount -54 common ioctl sys_ioctl compat_sys_ioctl -55 common fcntl sys_fcntl compat_sys_fcntl -57 common setpgid sys_setpgid sys_setpgid -60 common umask sys_umask sys_umask -61 common chroot sys_chroot sys_chroot -62 common ustat sys_ustat compat_sys_ustat -63 common dup2 sys_dup2 sys_dup2 -64 common getppid sys_getppid sys_getppid -65 common getpgrp sys_getpgrp sys_getpgrp -66 common setsid sys_setsid sys_setsid -67 common sigaction sys_sigaction compat_sys_sigaction -70 32 setreuid - sys_setreuid16 -71 32 setregid - sys_setregid16 -72 common sigsuspend sys_sigsuspend sys_sigsuspend -73 common sigpending sys_sigpending compat_sys_sigpending -74 common sethostname sys_sethostname sys_sethostname -75 common setrlimit sys_setrlimit compat_sys_setrlimit -76 32 getrlimit - compat_sys_old_getrlimit -77 common getrusage sys_getrusage compat_sys_getrusage -78 common gettimeofday sys_gettimeofday compat_sys_gettimeofday -79 common settimeofday sys_settimeofday compat_sys_settimeofday -80 32 getgroups - sys_getgroups16 -81 32 setgroups - sys_setgroups16 -83 common symlink sys_symlink sys_symlink -85 common readlink sys_readlink sys_readlink -86 common uselib sys_uselib sys_uselib -87 common swapon sys_swapon sys_swapon -88 common reboot sys_reboot sys_reboot -89 common readdir - compat_sys_old_readdir -90 common mmap sys_old_mmap compat_sys_s390_old_mmap -91 common munmap sys_munmap sys_munmap -92 common truncate sys_truncate compat_sys_truncate -93 common ftruncate sys_ftruncate compat_sys_ftruncate -94 common fchmod sys_fchmod sys_fchmod -95 32 fchown - sys_fchown16 -96 common getpriority sys_getpriority sys_getpriority -97 common setpriority sys_setpriority sys_setpriority -99 common statfs sys_statfs compat_sys_statfs -100 common fstatfs sys_fstatfs compat_sys_fstatfs -101 32 ioperm - - -102 common socketcall sys_socketcall compat_sys_socketcall -103 common syslog sys_syslog sys_syslog -104 common setitimer sys_setitimer compat_sys_setitimer -105 common getitimer sys_getitimer compat_sys_getitimer -106 common stat sys_newstat compat_sys_newstat -107 common lstat sys_newlstat compat_sys_newlstat -108 common fstat sys_newfstat compat_sys_newfstat -110 common lookup_dcookie - - -111 common vhangup sys_vhangup sys_vhangup -112 common idle - - -114 common wait4 sys_wait4 compat_sys_wait4 -115 common swapoff sys_swapoff sys_swapoff -116 common sysinfo sys_sysinfo compat_sys_sysinfo -117 common ipc sys_s390_ipc compat_sys_s390_ipc -118 common fsync sys_fsync sys_fsync -119 common sigreturn sys_sigreturn compat_sys_sigreturn -120 common clone sys_clone sys_clone -121 common setdomainname sys_setdomainname sys_setdomainname -122 common uname sys_newuname sys_newuname -124 common adjtimex sys_adjtimex sys_adjtimex_time32 -125 common mprotect sys_mprotect sys_mprotect -126 common sigprocmask sys_sigprocmask compat_sys_sigprocmask -127 common create_module - - -128 common init_module sys_init_module sys_init_module -129 common delete_module sys_delete_module sys_delete_module -130 common get_kernel_syms - - -131 common quotactl sys_quotactl sys_quotactl -132 common getpgid sys_getpgid sys_getpgid -133 common fchdir sys_fchdir sys_fchdir -134 common bdflush sys_ni_syscall sys_ni_syscall -135 common sysfs sys_sysfs sys_sysfs -136 common personality sys_s390_personality sys_s390_personality -137 common afs_syscall - - -138 32 setfsuid - sys_setfsuid16 -139 32 setfsgid - sys_setfsgid16 -140 32 _llseek - sys_llseek -141 common getdents sys_getdents compat_sys_getdents -142 32 _newselect - compat_sys_select -142 64 select sys_select - -143 common flock sys_flock sys_flock -144 common msync sys_msync sys_msync -145 common readv sys_readv sys_readv -146 common writev sys_writev sys_writev -147 common getsid sys_getsid sys_getsid -148 common fdatasync sys_fdatasync sys_fdatasync -149 common _sysctl - - -150 common mlock sys_mlock sys_mlock -151 common munlock sys_munlock sys_munlock -152 common mlockall sys_mlockall sys_mlockall -153 common munlockall sys_munlockall sys_munlockall -154 common sched_setparam sys_sched_setparam sys_sched_setparam -155 common sched_getparam sys_sched_getparam sys_sched_getparam -156 common sched_setscheduler sys_sched_setscheduler sys_sched_setscheduler -157 common sched_getscheduler sys_sched_getscheduler sys_sched_getscheduler -158 common sched_yield sys_sched_yield sys_sched_yield -159 common sched_get_priority_max sys_sched_get_priority_max sys_sched_get_priority_max -160 common sched_get_priority_min sys_sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 -162 common nanosleep sys_nanosleep sys_nanosleep_time32 -163 common mremap sys_mremap sys_mremap -164 32 setresuid - sys_setresuid16 -165 32 getresuid - sys_getresuid16 -167 common query_module - - -168 common poll sys_poll sys_poll -169 common nfsservctl - - -170 32 setresgid - sys_setresgid16 -171 32 getresgid - sys_getresgid16 -172 common prctl sys_prctl sys_prctl -173 common rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn -174 common rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction -175 common rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask -176 common rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 -178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo -179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend -180 common pread64 sys_pread64 compat_sys_s390_pread64 -181 common pwrite64 sys_pwrite64 compat_sys_s390_pwrite64 -182 32 chown - sys_chown16 -183 common getcwd sys_getcwd sys_getcwd -184 common capget sys_capget sys_capget -185 common capset sys_capset sys_capset -186 common sigaltstack sys_sigaltstack compat_sys_sigaltstack -187 common sendfile sys_sendfile64 compat_sys_sendfile -188 common getpmsg - - -189 common putpmsg - - -190 common vfork sys_vfork sys_vfork -191 32 ugetrlimit - compat_sys_getrlimit -191 64 getrlimit sys_getrlimit - -192 32 mmap2 - compat_sys_s390_mmap2 -193 32 truncate64 - compat_sys_s390_truncate64 -194 32 ftruncate64 - compat_sys_s390_ftruncate64 -195 32 stat64 - compat_sys_s390_stat64 -196 32 lstat64 - compat_sys_s390_lstat64 -197 32 fstat64 - compat_sys_s390_fstat64 -198 32 lchown32 - sys_lchown -198 64 lchown sys_lchown - -199 32 getuid32 - sys_getuid -199 64 getuid sys_getuid - -200 32 getgid32 - sys_getgid -200 64 getgid sys_getgid - -201 32 geteuid32 - sys_geteuid -201 64 geteuid sys_geteuid - -202 32 getegid32 - sys_getegid -202 64 getegid sys_getegid - -203 32 setreuid32 - sys_setreuid -203 64 setreuid sys_setreuid - -204 32 setregid32 - sys_setregid -204 64 setregid sys_setregid - -205 32 getgroups32 - sys_getgroups -205 64 getgroups sys_getgroups - -206 32 setgroups32 - sys_setgroups -206 64 setgroups sys_setgroups - -207 32 fchown32 - sys_fchown -207 64 fchown sys_fchown - -208 32 setresuid32 - sys_setresuid -208 64 setresuid sys_setresuid - -209 32 getresuid32 - sys_getresuid -209 64 getresuid sys_getresuid - -210 32 setresgid32 - sys_setresgid -210 64 setresgid sys_setresgid - -211 32 getresgid32 - sys_getresgid -211 64 getresgid sys_getresgid - -212 32 chown32 - sys_chown -212 64 chown sys_chown - -213 32 setuid32 - sys_setuid -213 64 setuid sys_setuid - -214 32 setgid32 - sys_setgid -214 64 setgid sys_setgid - -215 32 setfsuid32 - sys_setfsuid -215 64 setfsuid sys_setfsuid - -216 32 setfsgid32 - sys_setfsgid -216 64 setfsgid sys_setfsgid - -217 common pivot_root sys_pivot_root sys_pivot_root -218 common mincore sys_mincore sys_mincore -219 common madvise sys_madvise sys_madvise -220 common getdents64 sys_getdents64 sys_getdents64 -221 32 fcntl64 - compat_sys_fcntl64 -222 common readahead sys_readahead compat_sys_s390_readahead -223 32 sendfile64 - compat_sys_sendfile64 -224 common setxattr sys_setxattr sys_setxattr -225 common lsetxattr sys_lsetxattr sys_lsetxattr -226 common fsetxattr sys_fsetxattr sys_fsetxattr -227 common getxattr sys_getxattr sys_getxattr -228 common lgetxattr sys_lgetxattr sys_lgetxattr -229 common fgetxattr sys_fgetxattr sys_fgetxattr -230 common listxattr sys_listxattr sys_listxattr -231 common llistxattr sys_llistxattr sys_llistxattr -232 common flistxattr sys_flistxattr sys_flistxattr -233 common removexattr sys_removexattr sys_removexattr -234 common lremovexattr sys_lremovexattr sys_lremovexattr -235 common fremovexattr sys_fremovexattr sys_fremovexattr -236 common gettid sys_gettid sys_gettid -237 common tkill sys_tkill sys_tkill -238 common futex sys_futex sys_futex_time32 -239 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity -240 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity -241 common tgkill sys_tgkill sys_tgkill -243 common io_setup sys_io_setup compat_sys_io_setup -244 common io_destroy sys_io_destroy sys_io_destroy -245 common io_getevents sys_io_getevents sys_io_getevents_time32 -246 common io_submit sys_io_submit compat_sys_io_submit -247 common io_cancel sys_io_cancel sys_io_cancel -248 common exit_group sys_exit_group sys_exit_group -249 common epoll_create sys_epoll_create sys_epoll_create -250 common epoll_ctl sys_epoll_ctl sys_epoll_ctl -251 common epoll_wait sys_epoll_wait sys_epoll_wait -252 common set_tid_address sys_set_tid_address sys_set_tid_address -253 common fadvise64 sys_fadvise64_64 compat_sys_s390_fadvise64 -254 common timer_create sys_timer_create compat_sys_timer_create -255 common timer_settime sys_timer_settime sys_timer_settime32 -256 common timer_gettime sys_timer_gettime sys_timer_gettime32 -257 common timer_getoverrun sys_timer_getoverrun sys_timer_getoverrun -258 common timer_delete sys_timer_delete sys_timer_delete -259 common clock_settime sys_clock_settime sys_clock_settime32 -260 common clock_gettime sys_clock_gettime sys_clock_gettime32 -261 common clock_getres sys_clock_getres sys_clock_getres_time32 -262 common clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 -264 32 fadvise64_64 - compat_sys_s390_fadvise64_64 -265 common statfs64 sys_statfs64 compat_sys_statfs64 -266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 -267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages -268 common mbind sys_mbind sys_mbind -269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy -270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy -271 common mq_open sys_mq_open compat_sys_mq_open -272 common mq_unlink sys_mq_unlink sys_mq_unlink -273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 -274 common mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 -275 common mq_notify sys_mq_notify compat_sys_mq_notify -276 common mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr -277 common kexec_load sys_kexec_load compat_sys_kexec_load -278 common add_key sys_add_key sys_add_key -279 common request_key sys_request_key sys_request_key -280 common keyctl sys_keyctl compat_sys_keyctl -281 common waitid sys_waitid compat_sys_waitid -282 common ioprio_set sys_ioprio_set sys_ioprio_set -283 common ioprio_get sys_ioprio_get sys_ioprio_get -284 common inotify_init sys_inotify_init sys_inotify_init -285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch -286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch -287 common migrate_pages sys_migrate_pages sys_migrate_pages -288 common openat sys_openat compat_sys_openat -289 common mkdirat sys_mkdirat sys_mkdirat -290 common mknodat sys_mknodat sys_mknodat -291 common fchownat sys_fchownat sys_fchownat -292 common futimesat sys_futimesat sys_futimesat_time32 -293 32 fstatat64 - compat_sys_s390_fstatat64 -293 64 newfstatat sys_newfstatat - -294 common unlinkat sys_unlinkat sys_unlinkat -295 common renameat sys_renameat sys_renameat -296 common linkat sys_linkat sys_linkat -297 common symlinkat sys_symlinkat sys_symlinkat -298 common readlinkat sys_readlinkat sys_readlinkat -299 common fchmodat sys_fchmodat sys_fchmodat -300 common faccessat sys_faccessat sys_faccessat -301 common pselect6 sys_pselect6 compat_sys_pselect6_time32 -302 common ppoll sys_ppoll compat_sys_ppoll_time32 -303 common unshare sys_unshare sys_unshare -304 common set_robust_list sys_set_robust_list compat_sys_set_robust_list -305 common get_robust_list sys_get_robust_list compat_sys_get_robust_list -306 common splice sys_splice sys_splice -307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range -308 common tee sys_tee sys_tee -309 common vmsplice sys_vmsplice sys_vmsplice -310 common move_pages sys_move_pages sys_move_pages -311 common getcpu sys_getcpu sys_getcpu -312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait -313 common utimes sys_utimes sys_utimes_time32 -314 common fallocate sys_fallocate compat_sys_s390_fallocate -315 common utimensat sys_utimensat sys_utimensat_time32 -316 common signalfd sys_signalfd compat_sys_signalfd -317 common timerfd - - -318 common eventfd sys_eventfd sys_eventfd -319 common timerfd_create sys_timerfd_create sys_timerfd_create -320 common timerfd_settime sys_timerfd_settime sys_timerfd_settime32 -321 common timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 -322 common signalfd4 sys_signalfd4 compat_sys_signalfd4 -323 common eventfd2 sys_eventfd2 sys_eventfd2 -324 common inotify_init1 sys_inotify_init1 sys_inotify_init1 -325 common pipe2 sys_pipe2 sys_pipe2 -326 common dup3 sys_dup3 sys_dup3 -327 common epoll_create1 sys_epoll_create1 sys_epoll_create1 -328 common preadv sys_preadv compat_sys_preadv -329 common pwritev sys_pwritev compat_sys_pwritev -330 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo -331 common perf_event_open sys_perf_event_open sys_perf_event_open -332 common fanotify_init sys_fanotify_init sys_fanotify_init -333 common fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark -334 common prlimit64 sys_prlimit64 sys_prlimit64 -335 common name_to_handle_at sys_name_to_handle_at sys_name_to_handle_at -336 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -337 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32 -338 common syncfs sys_syncfs sys_syncfs -339 common setns sys_setns sys_setns -340 common process_vm_readv sys_process_vm_readv sys_process_vm_readv -341 common process_vm_writev sys_process_vm_writev sys_process_vm_writev -342 common s390_runtime_instr sys_s390_runtime_instr sys_s390_runtime_instr -343 common kcmp sys_kcmp sys_kcmp -344 common finit_module sys_finit_module sys_finit_module -345 common sched_setattr sys_sched_setattr sys_sched_setattr -346 common sched_getattr sys_sched_getattr sys_sched_getattr -347 common renameat2 sys_renameat2 sys_renameat2 -348 common seccomp sys_seccomp sys_seccomp -349 common getrandom sys_getrandom sys_getrandom -350 common memfd_create sys_memfd_create sys_memfd_create -351 common bpf sys_bpf sys_bpf -352 common s390_pci_mmio_write sys_s390_pci_mmio_write sys_s390_pci_mmio_write -353 common s390_pci_mmio_read sys_s390_pci_mmio_read sys_s390_pci_mmio_read -354 common execveat sys_execveat compat_sys_execveat -355 common userfaultfd sys_userfaultfd sys_userfaultfd -356 common membarrier sys_membarrier sys_membarrier -357 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 -358 common sendmmsg sys_sendmmsg compat_sys_sendmmsg -359 common socket sys_socket sys_socket -360 common socketpair sys_socketpair sys_socketpair -361 common bind sys_bind sys_bind -362 common connect sys_connect sys_connect -363 common listen sys_listen sys_listen -364 common accept4 sys_accept4 sys_accept4 -365 common getsockopt sys_getsockopt sys_getsockopt -366 common setsockopt sys_setsockopt sys_setsockopt -367 common getsockname sys_getsockname sys_getsockname -368 common getpeername sys_getpeername sys_getpeername -369 common sendto sys_sendto sys_sendto -370 common sendmsg sys_sendmsg compat_sys_sendmsg -371 common recvfrom sys_recvfrom compat_sys_recvfrom -372 common recvmsg sys_recvmsg compat_sys_recvmsg -373 common shutdown sys_shutdown sys_shutdown -374 common mlock2 sys_mlock2 sys_mlock2 -375 common copy_file_range sys_copy_file_range sys_copy_file_range -376 common preadv2 sys_preadv2 compat_sys_preadv2 -377 common pwritev2 sys_pwritev2 compat_sys_pwritev2 -378 common s390_guarded_storage sys_s390_guarded_storage sys_s390_guarded_storage -379 common statx sys_statx sys_statx -380 common s390_sthyi sys_s390_sthyi sys_s390_sthyi -381 common kexec_file_load sys_kexec_file_load sys_kexec_file_load -382 common io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents -383 common rseq sys_rseq sys_rseq -384 common pkey_mprotect sys_pkey_mprotect sys_pkey_mprotect -385 common pkey_alloc sys_pkey_alloc sys_pkey_alloc -386 common pkey_free sys_pkey_free sys_pkey_free +1 common exit sys_exit +2 common fork sys_fork +3 common read sys_read +4 common write sys_write +5 common open sys_open +6 common close sys_close +7 common restart_syscall sys_restart_syscall +8 common creat sys_creat +9 common link sys_link +10 common unlink sys_unlink +11 common execve sys_execve +12 common chdir sys_chdir +14 common mknod sys_mknod +15 common chmod sys_chmod +19 common lseek sys_lseek +20 common getpid sys_getpid +21 common mount sys_mount +22 common umount sys_oldumount +26 common ptrace sys_ptrace +27 common alarm sys_alarm +29 common pause sys_pause +30 common utime sys_utime +33 common access sys_access +34 common nice sys_nice +36 common sync sys_sync +37 common kill sys_kill +38 common rename sys_rename +39 common mkdir sys_mkdir +40 common rmdir sys_rmdir +41 common dup sys_dup +42 common pipe sys_pipe +43 common times sys_times +45 common brk sys_brk +48 common signal sys_signal +51 common acct sys_acct +52 common umount2 sys_umount +54 common ioctl sys_ioctl +55 common fcntl sys_fcntl +57 common setpgid sys_setpgid +60 common umask sys_umask +61 common chroot sys_chroot +62 common ustat sys_ustat +63 common dup2 sys_dup2 +64 common getppid sys_getppid +65 common getpgrp sys_getpgrp +66 common setsid sys_setsid +67 common sigaction sys_sigaction +72 common sigsuspend sys_sigsuspend +73 common sigpending sys_sigpending +74 common sethostname sys_sethostname +75 common setrlimit sys_setrlimit +77 common getrusage sys_getrusage +78 common gettimeofday sys_gettimeofday +79 common settimeofday sys_settimeofday +83 common symlink sys_symlink +85 common readlink sys_readlink +86 common uselib sys_uselib +87 common swapon sys_swapon +88 common reboot sys_reboot +89 common readdir sys_ni_syscall +90 common mmap sys_old_mmap +91 common munmap sys_munmap +92 common truncate sys_truncate +93 common ftruncate sys_ftruncate +94 common fchmod sys_fchmod +96 common getpriority sys_getpriority +97 common setpriority sys_setpriority +99 common statfs sys_statfs +100 common fstatfs sys_fstatfs +102 common socketcall sys_socketcall +103 common syslog sys_syslog +104 common setitimer sys_setitimer +105 common getitimer sys_getitimer +106 common stat sys_newstat +107 common lstat sys_newlstat +108 common fstat sys_newfstat +110 common lookup_dcookie sys_ni_syscall +111 common vhangup sys_vhangup +112 common idle sys_ni_syscall +114 common wait4 sys_wait4 +115 common swapoff sys_swapoff +116 common sysinfo sys_sysinfo +117 common ipc sys_s390_ipc +118 common fsync sys_fsync +119 common sigreturn sys_sigreturn +120 common clone sys_clone +121 common setdomainname sys_setdomainname +122 common uname sys_newuname +124 common adjtimex sys_adjtimex +125 common mprotect sys_mprotect +126 common sigprocmask sys_sigprocmask +127 common create_module sys_ni_syscall +128 common init_module sys_init_module +129 common delete_module sys_delete_module +130 common get_kernel_syms sys_ni_syscall +131 common quotactl sys_quotactl +132 common getpgid sys_getpgid +133 common fchdir sys_fchdir +134 common bdflush sys_ni_syscall +135 common sysfs sys_sysfs +136 common personality sys_s390_personality +137 common afs_syscall sys_ni_syscall +141 common getdents sys_getdents +142 common select sys_select +143 common flock sys_flock +144 common msync sys_msync +145 common readv sys_readv +146 common writev sys_writev +147 common getsid sys_getsid +148 common fdatasync sys_fdatasync +149 common _sysctl sys_ni_syscall +150 common mlock sys_mlock +151 common munlock sys_munlock +152 common mlockall sys_mlockall +153 common munlockall sys_munlockall +154 common sched_setparam sys_sched_setparam +155 common sched_getparam sys_sched_getparam +156 common sched_setscheduler sys_sched_setscheduler +157 common sched_getscheduler sys_sched_getscheduler +158 common sched_yield sys_sched_yield +159 common sched_get_priority_max sys_sched_get_priority_max +160 common sched_get_priority_min sys_sched_get_priority_min +161 common sched_rr_get_interval sys_sched_rr_get_interval +162 common nanosleep sys_nanosleep +163 common mremap sys_mremap +167 common query_module sys_ni_syscall +168 common poll sys_poll +169 common nfsservctl sys_ni_syscall +172 common prctl sys_prctl +173 common rt_sigreturn sys_rt_sigreturn +174 common rt_sigaction sys_rt_sigaction +175 common rt_sigprocmask sys_rt_sigprocmask +176 common rt_sigpending sys_rt_sigpending +177 common rt_sigtimedwait sys_rt_sigtimedwait +178 common rt_sigqueueinfo sys_rt_sigqueueinfo +179 common rt_sigsuspend sys_rt_sigsuspend +180 common pread64 sys_pread64 +181 common pwrite64 sys_pwrite64 +183 common getcwd sys_getcwd +184 common capget sys_capget +185 common capset sys_capset +186 common sigaltstack sys_sigaltstack +187 common sendfile sys_sendfile64 +188 common getpmsg sys_ni_syscall +189 common putpmsg sys_ni_syscall +190 common vfork sys_vfork +191 common getrlimit sys_getrlimit +198 common lchown sys_lchown +199 common getuid sys_getuid +200 common getgid sys_getgid +201 common geteuid sys_geteuid +202 common getegid sys_getegid +203 common setreuid sys_setreuid +204 common setregid sys_setregid +205 common getgroups sys_getgroups +206 common setgroups sys_setgroups +207 common fchown sys_fchown +208 common setresuid sys_setresuid +209 common getresuid sys_getresuid +210 common setresgid sys_setresgid +211 common getresgid sys_getresgid +212 common chown sys_chown +213 common setuid sys_setuid +214 common setgid sys_setgid +215 common setfsuid sys_setfsuid +216 common setfsgid sys_setfsgid +217 common pivot_root sys_pivot_root +218 common mincore sys_mincore +219 common madvise sys_madvise +220 common getdents64 sys_getdents64 +222 common readahead sys_readahead +224 common setxattr sys_setxattr +225 common lsetxattr sys_lsetxattr +226 common fsetxattr sys_fsetxattr +227 common getxattr sys_getxattr +228 common lgetxattr sys_lgetxattr +229 common fgetxattr sys_fgetxattr +230 common listxattr sys_listxattr +231 common llistxattr sys_llistxattr +232 common flistxattr sys_flistxattr +233 common removexattr sys_removexattr +234 common lremovexattr sys_lremovexattr +235 common fremovexattr sys_fremovexattr +236 common gettid sys_gettid +237 common tkill sys_tkill +238 common futex sys_futex +239 common sched_setaffinity sys_sched_setaffinity +240 common sched_getaffinity sys_sched_getaffinity +241 common tgkill sys_tgkill +243 common io_setup sys_io_setup +244 common io_destroy sys_io_destroy +245 common io_getevents sys_io_getevents +246 common io_submit sys_io_submit +247 common io_cancel sys_io_cancel +248 common exit_group sys_exit_group +249 common epoll_create sys_epoll_create +250 common epoll_ctl sys_epoll_ctl +251 common epoll_wait sys_epoll_wait +252 common set_tid_address sys_set_tid_address +253 common fadvise64 sys_fadvise64_64 +254 common timer_create sys_timer_create +255 common timer_settime sys_timer_settime +256 common timer_gettime sys_timer_gettime +257 common timer_getoverrun sys_timer_getoverrun +258 common timer_delete sys_timer_delete +259 common clock_settime sys_clock_settime +260 common clock_gettime sys_clock_gettime +261 common clock_getres sys_clock_getres +262 common clock_nanosleep sys_clock_nanosleep +265 common statfs64 sys_statfs64 +266 common fstatfs64 sys_fstatfs64 +267 common remap_file_pages sys_remap_file_pages +268 common mbind sys_mbind +269 common get_mempolicy sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy +271 common mq_open sys_mq_open +272 common mq_unlink sys_mq_unlink +273 common mq_timedsend sys_mq_timedsend +274 common mq_timedreceive sys_mq_timedreceive +275 common mq_notify sys_mq_notify +276 common mq_getsetattr sys_mq_getsetattr +277 common kexec_load sys_kexec_load +278 common add_key sys_add_key +279 common request_key sys_request_key +280 common keyctl sys_keyctl +281 common waitid sys_waitid +282 common ioprio_set sys_ioprio_set +283 common ioprio_get sys_ioprio_get +284 common inotify_init sys_inotify_init +285 common inotify_add_watch sys_inotify_add_watch +286 common inotify_rm_watch sys_inotify_rm_watch +287 common migrate_pages sys_migrate_pages +288 common openat sys_openat +289 common mkdirat sys_mkdirat +290 common mknodat sys_mknodat +291 common fchownat sys_fchownat +292 common futimesat sys_futimesat +293 common newfstatat sys_newfstatat +294 common unlinkat sys_unlinkat +295 common renameat sys_renameat +296 common linkat sys_linkat +297 common symlinkat sys_symlinkat +298 common readlinkat sys_readlinkat +299 common fchmodat sys_fchmodat +300 common faccessat sys_faccessat +301 common pselect6 sys_pselect6 +302 common ppoll sys_ppoll +303 common unshare sys_unshare +304 common set_robust_list sys_set_robust_list +305 common get_robust_list sys_get_robust_list +306 common splice sys_splice +307 common sync_file_range sys_sync_file_range +308 common tee sys_tee +309 common vmsplice sys_vmsplice +310 common move_pages sys_move_pages +311 common getcpu sys_getcpu +312 common epoll_pwait sys_epoll_pwait +313 common utimes sys_utimes +314 common fallocate sys_fallocate +315 common utimensat sys_utimensat +316 common signalfd sys_signalfd +317 common timerfd sys_ni_syscall +318 common eventfd sys_eventfd +319 common timerfd_create sys_timerfd_create +320 common timerfd_settime sys_timerfd_settime +321 common timerfd_gettime sys_timerfd_gettime +322 common signalfd4 sys_signalfd4 +323 common eventfd2 sys_eventfd2 +324 common inotify_init1 sys_inotify_init1 +325 common pipe2 sys_pipe2 +326 common dup3 sys_dup3 +327 common epoll_create1 sys_epoll_create1 +328 common preadv sys_preadv +329 common pwritev sys_pwritev +330 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +331 common perf_event_open sys_perf_event_open +332 common fanotify_init sys_fanotify_init +333 common fanotify_mark sys_fanotify_mark +334 common prlimit64 sys_prlimit64 +335 common name_to_handle_at sys_name_to_handle_at +336 common open_by_handle_at sys_open_by_handle_at +337 common clock_adjtime sys_clock_adjtime +338 common syncfs sys_syncfs +339 common setns sys_setns +340 common process_vm_readv sys_process_vm_readv +341 common process_vm_writev sys_process_vm_writev +342 common s390_runtime_instr sys_s390_runtime_instr +343 common kcmp sys_kcmp +344 common finit_module sys_finit_module +345 common sched_setattr sys_sched_setattr +346 common sched_getattr sys_sched_getattr +347 common renameat2 sys_renameat2 +348 common seccomp sys_seccomp +349 common getrandom sys_getrandom +350 common memfd_create sys_memfd_create +351 common bpf sys_bpf +352 common s390_pci_mmio_write sys_s390_pci_mmio_write +353 common s390_pci_mmio_read sys_s390_pci_mmio_read +354 common execveat sys_execveat +355 common userfaultfd sys_userfaultfd +356 common membarrier sys_membarrier +357 common recvmmsg sys_recvmmsg +358 common sendmmsg sys_sendmmsg +359 common socket sys_socket +360 common socketpair sys_socketpair +361 common bind sys_bind +362 common connect sys_connect +363 common listen sys_listen +364 common accept4 sys_accept4 +365 common getsockopt sys_getsockopt +366 common setsockopt sys_setsockopt +367 common getsockname sys_getsockname +368 common getpeername sys_getpeername +369 common sendto sys_sendto +370 common sendmsg sys_sendmsg +371 common recvfrom sys_recvfrom +372 common recvmsg sys_recvmsg +373 common shutdown sys_shutdown +374 common mlock2 sys_mlock2 +375 common copy_file_range sys_copy_file_range +376 common preadv2 sys_preadv2 +377 common pwritev2 sys_pwritev2 +378 common s390_guarded_storage sys_s390_guarded_storage +379 common statx sys_statx +380 common s390_sthyi sys_s390_sthyi +381 common kexec_file_load sys_kexec_file_load +382 common io_pgetevents sys_io_pgetevents +383 common rseq sys_rseq +384 common pkey_mprotect sys_pkey_mprotect +385 common pkey_alloc sys_pkey_alloc +386 common pkey_free sys_pkey_free # room for arch specific syscalls -392 64 semtimedop sys_semtimedop - -393 common semget sys_semget sys_semget -394 common semctl sys_semctl compat_sys_semctl -395 common shmget sys_shmget sys_shmget -396 common shmctl sys_shmctl compat_sys_shmctl -397 common shmat sys_shmat compat_sys_shmat -398 common shmdt sys_shmdt sys_shmdt -399 common msgget sys_msgget sys_msgget -400 common msgsnd sys_msgsnd compat_sys_msgsnd -401 common msgrcv sys_msgrcv compat_sys_msgrcv -402 common msgctl sys_msgctl compat_sys_msgctl -403 32 clock_gettime64 - sys_clock_gettime -404 32 clock_settime64 - sys_clock_settime -405 32 clock_adjtime64 - sys_clock_adjtime -406 32 clock_getres_time64 - sys_clock_getres -407 32 clock_nanosleep_time64 - sys_clock_nanosleep -408 32 timer_gettime64 - sys_timer_gettime -409 32 timer_settime64 - sys_timer_settime -410 32 timerfd_gettime64 - sys_timerfd_gettime -411 32 timerfd_settime64 - sys_timerfd_settime -412 32 utimensat_time64 - sys_utimensat -413 32 pselect6_time64 - compat_sys_pselect6_time64 -414 32 ppoll_time64 - compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 - compat_sys_io_pgetevents_time64 -417 32 recvmmsg_time64 - compat_sys_recvmmsg_time64 -418 32 mq_timedsend_time64 - sys_mq_timedsend -419 32 mq_timedreceive_time64 - sys_mq_timedreceive -420 32 semtimedop_time64 - sys_semtimedop -421 32 rt_sigtimedwait_time64 - compat_sys_rt_sigtimedwait_time64 -422 32 futex_time64 - sys_futex -423 32 sched_rr_get_interval_time64 - sys_sched_rr_get_interval -424 common pidfd_send_signal sys_pidfd_send_signal sys_pidfd_send_signal -425 common io_uring_setup sys_io_uring_setup sys_io_uring_setup -426 common io_uring_enter sys_io_uring_enter sys_io_uring_enter -427 common io_uring_register sys_io_uring_register sys_io_uring_register -428 common open_tree sys_open_tree sys_open_tree -429 common move_mount sys_move_mount sys_move_mount -430 common fsopen sys_fsopen sys_fsopen -431 common fsconfig sys_fsconfig sys_fsconfig -432 common fsmount sys_fsmount sys_fsmount -433 common fspick sys_fspick sys_fspick -434 common pidfd_open sys_pidfd_open sys_pidfd_open -435 common clone3 sys_clone3 sys_clone3 -436 common close_range sys_close_range sys_close_range -437 common openat2 sys_openat2 sys_openat2 -438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd -439 common faccessat2 sys_faccessat2 sys_faccessat2 -440 common process_madvise sys_process_madvise sys_process_madvise -441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 -442 common mount_setattr sys_mount_setattr sys_mount_setattr -443 common quotactl_fd sys_quotactl_fd sys_quotactl_fd -444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset -445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule -446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self -447 common memfd_secret sys_memfd_secret sys_memfd_secret -448 common process_mrelease sys_process_mrelease sys_process_mrelease -449 common futex_waitv sys_futex_waitv sys_futex_waitv -450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node -451 common cachestat sys_cachestat sys_cachestat -452 common fchmodat2 sys_fchmodat2 sys_fchmodat2 -453 common map_shadow_stack sys_map_shadow_stack sys_map_shadow_stack -454 common futex_wake sys_futex_wake sys_futex_wake -455 common futex_wait sys_futex_wait sys_futex_wait -456 common futex_requeue sys_futex_requeue sys_futex_requeue -457 common statmount sys_statmount sys_statmount -458 common listmount sys_listmount sys_listmount -459 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr -460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr -461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules -462 common mseal sys_mseal sys_mseal -463 common setxattrat sys_setxattrat sys_setxattrat -464 common getxattrat sys_getxattrat sys_getxattrat -465 common listxattrat sys_listxattrat sys_listxattrat -466 common removexattrat sys_removexattrat sys_removexattrat +392 common semtimedop sys_semtimedop +393 common semget sys_semget +394 common semctl sys_semctl +395 common shmget sys_shmget +396 common shmctl sys_shmctl +397 common shmat sys_shmat +398 common shmdt sys_shmdt +399 common msgget sys_msgget +400 common msgsnd sys_msgsnd +401 common msgrcv sys_msgrcv +402 common msgctl sys_msgctl +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick +434 common pidfd_open sys_pidfd_open +435 common clone3 sys_clone3 +436 common close_range sys_close_range +437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr +443 common quotactl_fd sys_quotactl_fd +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self +447 common memfd_secret sys_memfd_secret +448 common process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat +452 common fchmodat2 sys_fchmodat2 +453 common map_shadow_stack sys_map_shadow_stack +454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount +459 common lsm_get_self_attr sys_lsm_get_self_attr +460 common lsm_set_self_attr sys_lsm_set_self_attr +461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr +470 common listns sys_listns +471 common rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/s390/kernel/syscalls/syscalltbl b/arch/s390/kernel/syscalls/syscalltbl deleted file mode 100755 index fbac1732f874..000000000000 --- a/arch/s390/kernel/syscalls/syscalltbl +++ /dev/null @@ -1,232 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# Generate system call table and header files -# -# Copyright IBM Corp. 2018 -# Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - -# -# File path to the system call table definition. -# You can set the path with the -i option. If omitted, -# system call table definitions are read from standard input. -# -SYSCALL_TBL="" - - -create_syscall_table_entries() -{ - local nr abi name entry64 entry32 _ignore - local temp=$(mktemp ${TMPDIR:-/tmp}/syscalltbl-common.XXXXXXXXX) - - ( - # - # Initialize with 0 to create an NI_SYSCALL for 0 - # - local prev_nr=0 prev_32=sys_ni_syscall prev_64=sys_ni_syscall - while read nr abi name entry64 entry32 _ignore; do - test x$entry32 = x- && entry32=sys_ni_syscall - test x$entry64 = x- && entry64=sys_ni_syscall - - if test $prev_nr -eq $nr; then - # - # Same syscall but different ABI, just update - # the respective entry point - # - case $abi in - 32) - prev_32=$entry32 - ;; - 64) - prev_64=$entry64 - ;; - esac - continue; - else - printf "%d\t%s\t%s\n" $prev_nr $prev_64 $prev_32 - fi - - prev_nr=$nr - prev_64=$entry64 - prev_32=$entry32 - done - printf "%d\t%s\t%s\n" $prev_nr $prev_64 $prev_32 - ) >> $temp - - # - # Check for duplicate syscall numbers - # - if ! cat $temp |cut -f1 |uniq -d 2>&1; then - echo "Error: generated system call table contains duplicate entries: $temp" >&2 - exit 1 - fi - - # - # Generate syscall table - # - prev_nr=0 - while read nr entry64 entry32; do - while test $prev_nr -lt $((nr - 1)); do - printf "NI_SYSCALL\n" - prev_nr=$((prev_nr + 1)) - done - if test x$entry64 = xsys_ni_syscall && - test x$entry32 = xsys_ni_syscall; then - printf "NI_SYSCALL\n" - else - printf "SYSCALL(%s,%s)\n" $entry64 $entry32 - fi - prev_nr=$nr - done < $temp - rm $temp -} - -generate_syscall_table() -{ - cat <<-EoHEADER - /* SPDX-License-Identifier: GPL-2.0 */ - /* - * Definitions for sys_call_table, each line represents an - * entry in the table in the form - * SYSCALL(64 bit syscall, 31 bit emulated syscall) - * - * This file is meant to be included from entry.S. - */ - - #define NI_SYSCALL SYSCALL(sys_ni_syscall,sys_ni_syscall) - -EoHEADER - grep -Ev '^(#|[[:blank:]]*$)' $SYSCALL_TBL \ - |sort -k1 -n \ - |create_syscall_table_entries -} - -create_header_defines() -{ - local nr abi name _ignore - - while read nr abi name _ignore; do - printf "#define __NR_%s %d\n" $name $nr - done -} - -normalize_fileguard() -{ - local fileguard="$1" - - echo "$1" |tr '[[:lower:]]' '[[:upper:]]' \ - |sed -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g' -} - -generate_syscall_header() -{ - local abis=$(echo "($1)" | tr ',' '|') - local filename="$2" - local fileguard suffix - - if test "$filename"; then - fileguard=$(normalize_fileguard "__UAPI_ASM_S390_$2") - else - case "$abis" in - *64*) suffix=64 ;; - *32*) suffix=32 ;; - esac - fileguard=$(normalize_fileguard "__UAPI_ASM_S390_SYSCALLS_$suffix") - fi - - cat <<-EoHEADER - /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ - #ifndef ${fileguard} - #define ${fileguard} - -EoHEADER - - grep -E "^[[:digit:]]+[[:space:]]+${abis}" $SYSCALL_TBL \ - |sort -k1 -n \ - |create_header_defines - - cat <<-EoFOOTER - - #endif /* ${fileguard} */ -EoFOOTER -} - -__max_syscall_nr() -{ - local abis=$(echo "($1)" | tr ',' '|') - - grep -E "^[[:digit:]]+[[:space:]]+${abis}" $SYSCALL_TBL \ - |sed -ne 's/^\([[:digit:]]*\)[[:space:]].*/\1/p' \ - |sort -n \ - |tail -1 -} - - -generate_syscall_nr() -{ - local abis="$1" - local max_syscall_nr num_syscalls - - max_syscall_nr=$(__max_syscall_nr "$abis") - num_syscalls=$((max_syscall_nr + 1)) - - cat <<-EoHEADER - /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ - #ifndef __ASM_S390_SYSCALLS_NR - #define __ASM_S390_SYSCALLS_NR - - #define NR_syscalls ${num_syscalls} - - #endif /* __ASM_S390_SYSCALLS_NR */ -EoHEADER -} - - -# -# Parse command line arguments -# -do_syscall_header="" -do_syscall_table="" -do_syscall_nr="" -output_file="" -abi_list="common,64" -filename="" -while getopts ":HNSXi:a:f:" arg; do - case $arg in - a) - abi_list="$OPTARG" - ;; - i) - SYSCALL_TBL="$OPTARG" - ;; - f) - filename=${OPTARG##*/} - ;; - H) - do_syscall_header=1 - ;; - N) - do_syscall_nr=1 - ;; - S) - do_syscall_table=1 - ;; - X) - set -x - ;; - :) - echo "Missing argument for -$OPTARG" >&2 - exit 1 - ;; - \?) - echo "Invalid option specified" >&2 - exit 1 - ;; - esac -done - -test "$do_syscall_header" && generate_syscall_header "$abi_list" "$filename" -test "$do_syscall_table" && generate_syscall_table -test "$do_syscall_nr" && generate_syscall_nr "$abi_list" - -exit 0 diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index 88055f58fbda..33ca3e47a0e6 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -5,6 +5,7 @@ * Martin Schwidefsky <schwidefsky@de.ibm.com>, */ +#include <linux/cpufeature.h> #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -15,54 +16,17 @@ #include <linux/export.h> #include <linux/slab.h> #include <asm/asm-extable.h> +#include <asm/machine.h> #include <asm/ebcdic.h> #include <asm/debug.h> #include <asm/sysinfo.h> #include <asm/cpcmd.h> #include <asm/topology.h> #include <asm/fpu.h> +#include <asm/asm.h> int topology_max_mnest; -static inline int __stsi(void *sysinfo, int fc, int sel1, int sel2, int *lvl) -{ - int r0 = (fc << 28) | sel1; - int rc = 0; - - asm volatile( - " lr 0,%[r0]\n" - " lr 1,%[r1]\n" - " stsi 0(%[sysinfo])\n" - "0: jz 2f\n" - "1: lhi %[rc],%[retval]\n" - "2: lr %[r0],0\n" - EX_TABLE(0b, 1b) - : [r0] "+d" (r0), [rc] "+d" (rc) - : [r1] "d" (sel2), - [sysinfo] "a" (sysinfo), - [retval] "K" (-EOPNOTSUPP) - : "cc", "0", "1", "memory"); - *lvl = ((unsigned int) r0) >> 28; - return rc; -} - -/* - * stsi - store system information - * - * Returns the current configuration level if function code 0 was specified. - * Otherwise returns 0 on success or a negative value on error. - */ -int stsi(void *sysinfo, int fc, int sel1, int sel2) -{ - int lvl, rc; - - rc = __stsi(sysinfo, fc, sel1, sel2, &lvl); - if (rc) - return rc; - return fc ? 0 : lvl; -} -EXPORT_SYMBOL(stsi); - #ifdef CONFIG_PROC_FS static bool convert_ext_name(unsigned char encoding, char *name, size_t len) @@ -154,7 +118,7 @@ static void stsi_15_1_x(struct seq_file *m, struct sysinfo_15_1_x *info) int i; seq_putc(m, '\n'); - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) return; if (stsi(info, 15, 1, topology_max_mnest)) return; @@ -415,7 +379,7 @@ static struct service_level service_level_vm = { static __init int create_proc_service_level(void) { proc_create_seq("service_levels", 0, NULL, &service_level_seq_ops); - if (MACHINE_IS_VM) + if (machine_is_vm()) register_service_level(&service_level_vm); return 0; } @@ -559,10 +523,10 @@ static __init int stsi_init_debugfs(void) sf = &stsi_file[i]; debugfs_create_file(sf->name, 0400, stsi_root, NULL, sf->fops); } - if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY) && MACHINE_HAS_TOPOLOGY) { + if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY) && cpu_has_topology()) { char link_to[10]; - sprintf(link_to, "15_1_%d", topology_mnest_limit()); + snprintf(link_to, sizeof(link_to), "15_1_%d", topology_mnest_limit()); debugfs_create_symlink("topology", stsi_root, link_to); } return 0; diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index e9f47c3a6197..bd0df61d1907 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -12,8 +12,7 @@ * Copyright (C) 1991, 1992, 1995 Linus Torvalds */ -#define KMSG_COMPONENT "time" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "time: " fmt #include <linux/kernel_stat.h> #include <linux/errno.h> @@ -54,10 +53,10 @@ #include <asm/cio.h> #include "entry.h" -union tod_clock tod_clock_base __section(".data"); +union tod_clock __bootdata_preserved(tod_clock_base); EXPORT_SYMBOL_GPL(tod_clock_base); -u64 clock_comparator_max = -1ULL; +u64 __bootdata_preserved(clock_comparator_max); EXPORT_SYMBOL_GPL(clock_comparator_max); static DEFINE_PER_CPU(struct clock_event_device, comparators); @@ -69,8 +68,6 @@ unsigned char ptff_function_mask[16]; static unsigned long lpar_offset; static unsigned long initial_leap_seconds; -static unsigned long tod_steering_end; -static long tod_steering_delta; /* * Get time offsets with PTFF @@ -79,12 +76,8 @@ void __init time_early_init(void) { struct ptff_qto qto; struct ptff_qui qui; - int cs; - /* Initialize TOD steering parameters */ - tod_steering_end = tod_clock_base.tod; - for (cs = 0; cs < CS_BASES; cs++) - vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; + vdso_k_time_data->arch_data.tod_delta = tod_clock_base.tod; if (!test_facility(28)) return; @@ -228,21 +221,7 @@ void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, static u64 read_tod_clock(struct clocksource *cs) { - unsigned long now, adj; - - preempt_disable(); /* protect from changes to steering parameters */ - now = get_tod_clock(); - adj = tod_steering_end - now; - if (unlikely((s64) adj > 0)) - /* - * manually steer by 1 cycle every 2^16 cycles. This - * corresponds to shifting the tod delta by 15. 1s is - * therefore steered in ~9h. The adjust will decrease - * over time, until it finally reaches 0. - */ - now += (tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15); - preempt_enable(); - return now; + return get_tod_clock_monotonic(); } static struct clocksource clocksource_tod = { @@ -371,29 +350,11 @@ static inline int check_sync_clock(void) */ static void clock_sync_global(long delta) { - unsigned long now, adj; struct ptff_qto qto; - int cs; /* Fixup the monotonic sched clock. */ tod_clock_base.eitod += delta; - /* Adjust TOD steering parameters. */ - now = get_tod_clock(); - adj = tod_steering_end - now; - if (unlikely((s64) adj >= 0)) - /* Calculate how much of the old adjustment is left. */ - tod_steering_delta = (tod_steering_delta < 0) ? - -(adj >> 15) : (adj >> 15); - tod_steering_delta += delta; - if ((abs(tod_steering_delta) >> 48) != 0) - panic("TOD clock sync offset %li is too large to drift\n", - tod_steering_delta); - tod_steering_end = now + (abs(tod_steering_delta) << 15); - for (cs = 0; cs < CS_BASES; cs++) { - vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; - vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta; - } - + vdso_k_time_data->arch_data.tod_delta = tod_clock_base.tod; /* Update LPAR offset. */ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) lpar_offset = qto.tod_epoch_difference; @@ -435,7 +396,7 @@ struct clock_sync_data { /* * Server Time Protocol (STP) code. */ -static bool stp_online; +static bool stp_online = true; static struct stp_sstpi stp_info; static void *stp_page; @@ -461,7 +422,6 @@ static void __init stp_reset(void) if (rc == 0) set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags); else if (stp_online) { - pr_warn("The real or virtual hardware system does not provide an STP interface\n"); free_page((unsigned long) stp_page); stp_page = NULL; stp_online = false; @@ -585,7 +545,7 @@ static int stp_sync_clock(void *data) atomic_dec(&sync->cpus); /* Wait for in_sync to be set. */ while (READ_ONCE(sync->in_sync) == 0) - __udelay(1); + ; } if (sync->in_sync != 1) /* Didn't work. Clear per-cpu in sync bit again. */ @@ -596,81 +556,6 @@ static int stp_sync_clock(void *data) return 0; } -static int stp_clear_leap(void) -{ - struct __kernel_timex txc; - int ret; - - memset(&txc, 0, sizeof(txc)); - - ret = do_adjtimex(&txc); - if (ret < 0) - return ret; - - txc.modes = ADJ_STATUS; - txc.status &= ~(STA_INS|STA_DEL); - return do_adjtimex(&txc); -} - -static void stp_check_leap(void) -{ - struct stp_stzi stzi; - struct stp_lsoib *lsoib = &stzi.lsoib; - struct __kernel_timex txc; - int64_t timediff; - int leapdiff, ret; - - if (!stp_info.lu || !check_sync_clock()) { - /* - * Either a scheduled leap second was removed by the operator, - * or STP is out of sync. In both cases, clear the leap second - * kernel flags. - */ - if (stp_clear_leap() < 0) - pr_err("failed to clear leap second flags\n"); - return; - } - - if (chsc_stzi(stp_page, &stzi, sizeof(stzi))) { - pr_err("stzi failed\n"); - return; - } - - timediff = tod_to_ns(lsoib->nlsout - get_tod_clock()) / NSEC_PER_SEC; - leapdiff = lsoib->nlso - lsoib->also; - - if (leapdiff != 1 && leapdiff != -1) { - pr_err("Cannot schedule %d leap seconds\n", leapdiff); - return; - } - - if (timediff < 0) { - if (stp_clear_leap() < 0) - pr_err("failed to clear leap second flags\n"); - } else if (timediff < 7200) { - memset(&txc, 0, sizeof(txc)); - ret = do_adjtimex(&txc); - if (ret < 0) - return; - - txc.modes = ADJ_STATUS; - if (leapdiff > 0) - txc.status |= STA_INS; - else - txc.status |= STA_DEL; - ret = do_adjtimex(&txc); - if (ret < 0) - pr_err("failed to set leap second flags\n"); - /* arm Timer to clear leap second flags */ - mod_timer(&stp_timer, jiffies + secs_to_jiffies(14400)); - } else { - /* The day the leap second is scheduled for hasn't been reached. Retry - * in one hour. - */ - mod_timer(&stp_timer, jiffies + secs_to_jiffies(3600)); - } -} - /* * STP work. Check for the STP state and take over the clock * synchronization if the STP clock source is usable. @@ -685,7 +570,7 @@ static void stp_work_fn(struct work_struct *work) if (!stp_online) { chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL); - del_timer_sync(&stp_timer); + timer_delete_sync(&stp_timer); goto out_unlock; } @@ -712,8 +597,6 @@ static void stp_work_fn(struct work_struct *work) * Retry after a second. */ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC)); - else if (stp_info.lu) - stp_check_leap(); out_unlock: mutex_unlock(&stp_mutex); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 211cc8382e4a..1913a5566ac2 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -3,9 +3,9 @@ * Copyright IBM Corp. 2007, 2011 */ -#define KMSG_COMPONENT "cpu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpu: " fmt +#include <linux/cpufeature.h> #include <linux/workqueue.h> #include <linux/memblock.h> #include <linux/uaccess.h> @@ -240,7 +240,7 @@ int topology_set_cpu_management(int fc) { int cpu, rc; - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) return -EOPNOTSUPP; if (fc) rc = ptf(PTF_VERTICAL); @@ -315,13 +315,13 @@ static int __arch_update_cpu_topology(void) hd_status = 0; rc = 0; mutex_lock(&smp_cpu_state_mutex); - if (MACHINE_HAS_TOPOLOGY) { + if (cpu_has_topology()) { rc = 1; store_topology(info); tl_to_masks(info); } update_cpu_masks(); - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) topology_update_polarization_simple(); if (cpu_management == 1) hd_status = hd_enable_hiperdispatch(); @@ -376,7 +376,7 @@ static void set_topology_timer(void) void topology_expect_change(void) { - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) return; /* This is racy, but it doesn't matter since it is just a heuristic. * Worst case is that we poll in a higher frequency for a bit longer. @@ -500,7 +500,7 @@ int topology_cpu_init(struct cpu *cpu) int rc; rc = sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group); - if (rc || !MACHINE_HAS_TOPOLOGY) + if (rc || !cpu_has_topology()) return rc; rc = sysfs_create_group(&cpu->dev.kobj, &topology_extra_cpu_attr_group); if (rc) @@ -508,33 +508,27 @@ int topology_cpu_init(struct cpu *cpu) return rc; } -static const struct cpumask *cpu_thread_mask(int cpu) -{ - return &cpu_topology[cpu].thread_mask; -} - - const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_mask; } -static const struct cpumask *cpu_book_mask(int cpu) +static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].book_mask; } -static const struct cpumask *cpu_drawer_mask(int cpu) +static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].drawer_mask; } static struct sched_domain_topology_level s390_topology[] = { - { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, - { cpu_book_mask, SD_INIT_NAME(BOOK) }, - { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), + SDTL_INIT(tl_book_mask, NULL, BOOK), + SDTL_INIT(tl_drawer_mask, NULL, DRAWER), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; @@ -569,12 +563,12 @@ void __init topology_init_early(void) set_sched_topology(s390_topology); if (topology_mode == TOPOLOGY_MODE_UNINITIALIZED) { - if (MACHINE_HAS_TOPOLOGY) + if (cpu_has_topology()) topology_mode = TOPOLOGY_MODE_HW; else topology_mode = TOPOLOGY_MODE_SINGLE; } - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) goto out; tl_info = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); info = tl_info; @@ -596,7 +590,7 @@ static inline int topology_get_mode(int enabled) { if (!enabled) return TOPOLOGY_MODE_SINGLE; - return MACHINE_HAS_TOPOLOGY ? TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE; + return cpu_has_topology() ? TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE; } static inline int topology_is_enabled(void) @@ -686,7 +680,7 @@ static int __init topology_init(void) int rc = 0; timer_setup(&topology_timer, topology_timer_fn, TIMER_DEFERRABLE); - if (MACHINE_HAS_TOPOLOGY) + if (cpu_has_topology()) set_topology_timer(); else topology_update_polarization_simple(); diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 24fee11b030d..1b5c6fc431cc 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -3,18 +3,13 @@ * S390 version * Copyright IBM Corp. 1999, 2000 * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), - * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), + * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), * * Derived from "arch/i386/kernel/traps.c" * Copyright (C) 1991, 1992 Linus Torvalds */ -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'asm.s'. - */ -#include "asm/irqflags.h" -#include "asm/ptrace.h" +#include <linux/cpufeature.h> #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/randomize_kstack.h> @@ -28,7 +23,10 @@ #include <linux/cpu.h> #include <linux/entry-common.h> #include <linux/kmsan.h> +#include <linux/bug.h> #include <asm/asm-extable.h> +#include <asm/irqflags.h> +#include <asm/ptrace.h> #include <asm/vtime.h> #include <asm/fpu.h> #include <asm/fault.h> @@ -42,7 +40,7 @@ static inline void __user *get_trap_ip(struct pt_regs *regs) address = current->thread.trap_tdb.data[3]; else address = regs->psw.addr; - return (void __user *) (address - (regs->int_code >> 16)); + return (void __user *)(address - (regs->int_code >> 16)); } #ifdef CONFIG_GENERIC_BUG @@ -57,16 +55,15 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) if (user_mode(regs)) { force_sig_fault(si_signo, si_code, get_trap_ip(regs)); report_user_fault(regs, si_signo, 0); - } else { + } else { if (!fixup_exception(regs)) die(regs, str); - } + } } static void do_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) { - if (notify_die(DIE_TRAP, str, regs, 0, - regs->int_code, si_signo) == NOTIFY_STOP) + if (notify_die(DIE_TRAP, str, regs, 0, regs->int_code, si_signo) == NOTIFY_STOP) return; do_report_trap(regs, si_signo, si_code, str); } @@ -78,8 +75,7 @@ void do_per_trap(struct pt_regs *regs) return; if (!current->ptrace) return; - force_sig_fault(SIGTRAP, TRAP_HWBKPT, - (void __force __user *) current->thread.per_event.address); + force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __force __user *)current->thread.per_event.address); } NOKPROBE_SYMBOL(do_per_trap); @@ -98,36 +94,25 @@ static void name(struct pt_regs *regs) \ do_trap(regs, signr, sicode, str); \ } -DO_ERROR_INFO(addressing_exception, SIGILL, ILL_ILLADR, - "addressing exception") -DO_ERROR_INFO(execute_exception, SIGILL, ILL_ILLOPN, - "execute exception") -DO_ERROR_INFO(divide_exception, SIGFPE, FPE_INTDIV, - "fixpoint divide exception") -DO_ERROR_INFO(overflow_exception, SIGFPE, FPE_INTOVF, - "fixpoint overflow exception") -DO_ERROR_INFO(hfp_overflow_exception, SIGFPE, FPE_FLTOVF, - "HFP overflow exception") -DO_ERROR_INFO(hfp_underflow_exception, SIGFPE, FPE_FLTUND, - "HFP underflow exception") -DO_ERROR_INFO(hfp_significance_exception, SIGFPE, FPE_FLTRES, - "HFP significance exception") -DO_ERROR_INFO(hfp_divide_exception, SIGFPE, FPE_FLTDIV, - "HFP divide exception") -DO_ERROR_INFO(hfp_sqrt_exception, SIGFPE, FPE_FLTINV, - "HFP square root exception") -DO_ERROR_INFO(operand_exception, SIGILL, ILL_ILLOPN, - "operand exception") -DO_ERROR_INFO(privileged_op, SIGILL, ILL_PRVOPC, - "privileged operation") -DO_ERROR_INFO(special_op_exception, SIGILL, ILL_ILLOPN, - "special operation exception") -DO_ERROR_INFO(transaction_exception, SIGILL, ILL_ILLOPN, - "transaction constraint exception") +DO_ERROR_INFO(addressing_exception, SIGILL, ILL_ILLADR, "addressing exception") +DO_ERROR_INFO(divide_exception, SIGFPE, FPE_INTDIV, "fixpoint divide exception") +DO_ERROR_INFO(execute_exception, SIGILL, ILL_ILLOPN, "execute exception") +DO_ERROR_INFO(hfp_divide_exception, SIGFPE, FPE_FLTDIV, "HFP divide exception") +DO_ERROR_INFO(hfp_overflow_exception, SIGFPE, FPE_FLTOVF, "HFP overflow exception") +DO_ERROR_INFO(hfp_significance_exception, SIGFPE, FPE_FLTRES, "HFP significance exception") +DO_ERROR_INFO(hfp_sqrt_exception, SIGFPE, FPE_FLTINV, "HFP square root exception") +DO_ERROR_INFO(hfp_underflow_exception, SIGFPE, FPE_FLTUND, "HFP underflow exception") +DO_ERROR_INFO(operand_exception, SIGILL, ILL_ILLOPN, "operand exception") +DO_ERROR_INFO(overflow_exception, SIGFPE, FPE_INTOVF, "fixpoint overflow exception") +DO_ERROR_INFO(privileged_op, SIGILL, ILL_PRVOPC, "privileged operation") +DO_ERROR_INFO(special_op_exception, SIGILL, ILL_ILLOPN, "special operation exception") +DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, "specification exception"); +DO_ERROR_INFO(transaction_exception, SIGILL, ILL_ILLOPN, "transaction constraint exception") static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc) { int si_code = 0; + /* FPC[2] is Data Exception Code */ if ((fpc & 0x00000300) == 0) { /* bits 6 and 7 of DXC are 0 iff IEEE exception */ @@ -153,36 +138,35 @@ static void translation_specification_exception(struct pt_regs *regs) static void illegal_op(struct pt_regs *regs) { - __u8 opcode[6]; - __u16 __user *location; int is_uprobe_insn = 0; + u16 __user *location; int signal = 0; + u16 opcode; location = get_trap_ip(regs); - if (user_mode(regs)) { - if (get_user(*((__u16 *) opcode), (__u16 __user *) location)) + if (get_user(opcode, location)) return; - if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) { + if (opcode == S390_BREAKPOINT_U16) { if (current->ptrace) force_sig_fault(SIGTRAP, TRAP_BRKPT, location); else signal = SIGILL; #ifdef CONFIG_UPROBES - } else if (*((__u16 *) opcode) == UPROBE_SWBP_INSN) { + } else if (opcode == UPROBE_SWBP_INSN) { is_uprobe_insn = 1; #endif - } else + } else { signal = SIGILL; + } } /* - * We got either an illegal op in kernel mode, or user space trapped + * This is either an illegal op in kernel mode, or user space trapped * on a uprobes illegal instruction. See if kprobes or uprobes picks * it up. If not, SIGILL. */ if (is_uprobe_insn || !user_mode(regs)) { - if (notify_die(DIE_BPT, "bpt", regs, 0, - 3, SIGTRAP) != NOTIFY_STOP) + if (notify_die(DIE_BPT, "bpt", regs, 0, 3, SIGTRAP) != NOTIFY_STOP) signal = SIGILL; } if (signal) @@ -190,18 +174,10 @@ static void illegal_op(struct pt_regs *regs) } NOKPROBE_SYMBOL(illegal_op); -DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, - "specification exception"); - static void vector_exception(struct pt_regs *regs) { int si_code, vic; - if (!cpu_has_vx()) { - do_trap(regs, SIGILL, ILL_ILLOPN, "illegal operation"); - return; - } - /* get vector interrupt code from fpc */ save_user_fpu_regs(); vic = (current->thread.ufpu.fpc & 0xf00) >> 8; @@ -245,12 +221,48 @@ static void space_switch_exception(struct pt_regs *regs) do_trap(regs, SIGILL, ILL_PRVOPC, "space switch event"); } +#if defined(CONFIG_BUG) && defined(CONFIG_CC_HAS_ASM_IMMEDIATE_STRINGS) + +void *__warn_args(struct arch_va_list *args, struct pt_regs *regs) +{ + struct stack_frame *stack_frame; + + /* + * Generate va_list from pt_regs. See ELF Application Binary Interface + * s390x Supplement documentation for details. + * + * - __overflow_arg_area needs to point to the parameter area, which + * is right above the standard stack frame (160 bytes) + * + * - __reg_save_area needs to point to a register save area where + * general registers (%r2 - %r6) can be found at offset 16. Which + * means that the gprs save area of pt_regs can be used + * + * - __gpr must be set to one, since the first parameter has been + * processed (pointer to bug_entry) + */ + stack_frame = (struct stack_frame *)regs->gprs[15]; + args->__overflow_arg_area = stack_frame + 1; + args->__reg_save_area = regs->gprs; + args->__gpr = 1; + return args; +} + +#endif /* CONFIG_BUG && CONFIG_CC_HAS_ASM_IMMEDIATE_STRINGS */ + static void monitor_event_exception(struct pt_regs *regs) { + enum bug_trap_type btt; + if (user_mode(regs)) return; - - switch (report_bug(regs->psw.addr - (regs->int_code >> 16), regs)) { + if (regs->monitor_code == MONCODE_BUG_ARG) { + regs->psw.addr = regs->gprs[14]; + btt = report_bug_entry((struct bug_entry *)regs->gprs[2], regs); + } else { + btt = report_bug(regs->psw.addr - (regs->int_code >> 16), regs); + } + switch (btt) { case BUG_TRAP_TYPE_NONE: fixup_exception(regs); break; @@ -262,7 +274,7 @@ static void monitor_event_exception(struct pt_regs *regs) } } -void kernel_stack_overflow(struct pt_regs *regs) +void kernel_stack_invalid(struct pt_regs *regs) { /* * Normally regs are unpoisoned by the generic entry code, but @@ -270,12 +282,12 @@ void kernel_stack_overflow(struct pt_regs *regs) */ kmsan_unpoison_entry_regs(regs); bust_spinlocks(1); - printk("Kernel stack overflow.\n"); + pr_emerg("Kernel stack pointer invalid\n"); show_regs(regs); bust_spinlocks(0); - panic("Corrupt kernel stack, can't continue."); + panic("Invalid kernel stack pointer, cannot continue"); } -NOKPROBE_SYMBOL(kernel_stack_overflow); +NOKPROBE_SYMBOL(kernel_stack_invalid); static void __init test_monitor_call(void) { @@ -283,12 +295,13 @@ static void __init test_monitor_call(void) if (!IS_ENABLED(CONFIG_BUG)) return; - asm volatile( - " mc 0,0\n" - "0: xgr %0,%0\n" + asm_inline volatile( + " mc %[monc](%%r0),0\n" + "0: lhi %[val],0\n" "1:\n" - EX_TABLE(0b,1b) - : "+d" (val)); + EX_TABLE(0b, 1b) + : [val] "+d" (val) + : [monc] "i" (MONCODE_BUG)); if (!val) panic("Monitor call doesn't work!\n"); } @@ -323,7 +336,7 @@ void noinstr __do_pgm_check(struct pt_regs *regs) teid.val = lc->trans_exc_code; regs->int_code = lc->pgm_int_code; regs->int_parm_long = teid.val; - + regs->monitor_code = lc->monitor_code; /* * In case of a guest fault, short-circuit the fault handler and return. * This way the sie64a() function will return 0; fault address and @@ -336,23 +349,19 @@ void noinstr __do_pgm_check(struct pt_regs *regs) current->thread.gmap_int_code = regs->int_code & 0xffff; return; } - state = irqentry_enter(regs); - if (user_mode(regs)) { update_timer_sys(); - if (!static_branch_likely(&cpu_has_bear)) { + if (!cpu_has_bear()) { if (regs->last_break < 4096) regs->last_break = 1; } current->thread.last_break = regs->last_break; } - if (lc->pgm_code & 0x0200) { /* transaction abort */ current->thread.trap_tdb = lc->pgm_tdb; } - if (lc->pgm_code & PGM_INT_CODE_PER) { if (user_mode(regs)) { struct per_event *ev = ¤t->thread.per_event; @@ -368,11 +377,9 @@ void noinstr __do_pgm_check(struct pt_regs *regs) goto out; } } - if (!irqs_disabled_flags(regs->psw.mask)) trace_hardirqs_on(); __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); - trapnr = regs->int_code & PGM_INT_CODE_MASK; if (trapnr) pgm_check_table[trapnr](regs); diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c index cd44be2b6ce8..0f88caca4eaf 100644 --- a/arch/s390/kernel/unwind_bc.c +++ b/arch/s390/kernel/unwind_bc.c @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/export.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index 5b0633ea8d93..c624f3361e43 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -8,7 +8,6 @@ #include <linux/uaccess.h> #include <linux/uprobes.h> -#include <linux/compat.h> #include <linux/kdebug.h> #include <linux/sched/task_stack.h> @@ -29,7 +28,7 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (psw_bits(regs->psw).eaba == PSW_BITS_AMODE_24BIT) return -EINVAL; - if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) + if (psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) return -EINVAL; clear_thread_flag(TIF_PER_TRAP); auprobe->saved_per = psw_bits(regs->psw).per; @@ -161,11 +160,6 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, /* Instruction Emulation */ -static void adjust_psw_addr(psw_t *psw, unsigned long len) -{ - psw->addr = __rewind_psw(*psw, -len); -} - #define EMU_ILLEGAL_OP 1 #define EMU_SPECIFICATION 2 #define EMU_ADDRESSING 3 @@ -353,7 +347,7 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) } break; } - adjust_psw_addr(®s->psw, ilen); + regs->psw.addr = __forward_psw(regs->psw, ilen); switch (rc) { case EMU_ILLEGAL_OP: regs->int_code = ilen << 16 | 0x0001; @@ -373,8 +367,7 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { if ((psw_bits(regs->psw).eaba == PSW_BITS_AMODE_24BIT) || - ((psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) && - !is_compat_task())) { + (psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT)) { regs->psw.addr = __rewind_psw(regs->psw, UPROBE_SWBP_INSN_SIZE); do_report_trap(regs, SIGILL, ILL_ILLADR, NULL); return true; diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 9f05df2da2f7..a284f98d9716 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -4,9 +4,9 @@ * * Copyright IBM Corp. 2019, 2024 */ -#define KMSG_COMPONENT "prot_virt" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "prot_virt: " fmt +#include <linux/export.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/sizes.h> @@ -15,6 +15,7 @@ #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/pagewalk.h> +#include <linux/backing-dev.h> #include <asm/facility.h> #include <asm/sections.h> #include <asm/uv.h> @@ -133,16 +134,17 @@ static int uv_destroy(unsigned long paddr) */ int uv_destroy_folio(struct folio *folio) { + unsigned long i; int rc; - /* See gmap_make_secure(): large folios cannot be secure */ - if (unlikely(folio_test_large(folio))) - return 0; - folio_get(folio); - rc = uv_destroy(folio_to_phys(folio)); + for (i = 0; i < (1 << folio_order(folio)); i++) { + rc = uv_destroy(folio_to_phys(folio) + i * PAGE_SIZE); + if (rc) + break; + } if (!rc) - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); return rc; } @@ -182,16 +184,17 @@ EXPORT_SYMBOL_GPL(uv_convert_from_secure); */ int uv_convert_from_secure_folio(struct folio *folio) { + unsigned long i; int rc; - /* See gmap_make_secure(): large folios cannot be secure */ - if (unlikely(folio_test_large(folio))) - return 0; - folio_get(folio); - rc = uv_convert_from_secure(folio_to_phys(folio)); + for (i = 0; i < (1 << folio_order(folio)); i++) { + rc = uv_convert_from_secure(folio_to_phys(folio) + i * PAGE_SIZE); + if (rc) + break; + } if (!rc) - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); return rc; } @@ -228,7 +231,7 @@ static int expected_folio_refs(struct folio *folio) } /** - * make_folio_secure() - make a folio secure + * __make_folio_secure() - make a folio secure * @folio: the folio to make secure * @uvcb: the uvcb that describes the UVC to be used * @@ -237,26 +240,24 @@ static int expected_folio_refs(struct folio *folio) * * Return: 0 on success; * -EBUSY if the folio is in writeback or has too many references; - * -E2BIG if the folio is large; * -EAGAIN if the UVC needs to be attempted again; * -ENXIO if the address is not mapped; * -EINVAL if the UVC failed for other reasons. * * Context: The caller must hold exactly one extra reference on the folio - * (it's the same logic as split_folio()) + * (it's the same logic as split_folio()), and the folio must be + * locked. */ -int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) +int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) { int expected, cc = 0; - if (folio_test_large(folio)) - return -E2BIG; if (folio_test_writeback(folio)) return -EBUSY; expected = expected_folio_refs(folio) + 1; if (!folio_ref_freeze(folio, expected)) return -EBUSY; - set_bit(PG_arch_1, &folio->flags); + set_bit(PG_arch_1, &folio->flags.f); /* * If the UVC does not succeed or fail immediately, we don't want to * loop for long, or we might get stall notifications. @@ -277,42 +278,126 @@ int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) return -EAGAIN; return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; } -EXPORT_SYMBOL_GPL(make_folio_secure); +EXPORT_SYMBOL(__make_folio_secure); + +/** + * s390_wiggle_split_folio() - try to drain extra references to a folio and + * split the folio if it is large. + * @mm: the mm containing the folio to work on + * @folio: the folio + * + * Context: Must be called while holding an extra reference to the folio; + * the mm lock should not be held. + * Return: 0 if the operation was successful; + * -EAGAIN if splitting the large folio was not successful, + * but another attempt can be made; + * -EINVAL in case of other folio splitting errors. See split_folio(). + */ +int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio) +{ + int rc, tried_splits; + + lockdep_assert_not_held(&mm->mmap_lock); + folio_wait_writeback(folio); + lru_add_drain_all(); + + if (!folio_test_large(folio)) + return 0; + + for (tried_splits = 0; tried_splits < 2; tried_splits++) { + struct address_space *mapping; + loff_t lstart, lend; + struct inode *inode; + + folio_lock(folio); + rc = split_folio(folio); + if (rc != -EBUSY) { + folio_unlock(folio); + return rc; + } + + /* + * Splitting with -EBUSY can fail for various reasons, but we + * have to handle one case explicitly for now: some mappings + * don't allow for splitting dirty folios; writeback will + * mark them clean again, including marking all page table + * entries mapping the folio read-only, to catch future write + * attempts. + * + * While the system should be writing back dirty folios in the + * background, we obtained this folio by looking up a writable + * page table entry. On these problematic mappings, writable + * page table entries imply dirty folios, preventing the + * split in the first place. + * + * To prevent a livelock when trigger writeback manually and + * letting the caller look up the folio again in the page + * table (turning it dirty), immediately try to split again. + * + * This is only a problem for some mappings (e.g., XFS); + * mappings that do not support writeback (e.g., shmem) do not + * apply. + */ + if (!folio_test_dirty(folio) || folio_test_anon(folio) || + !folio->mapping || !mapping_can_writeback(folio->mapping)) { + folio_unlock(folio); + break; + } + + /* + * Ideally, we'd only trigger writeback on this exact folio. But + * there is no easy way to do that, so we'll stabilize the + * mapping while we still hold the folio lock, so we can drop + * the folio lock to trigger writeback on the range currently + * covered by the folio instead. + */ + mapping = folio->mapping; + lstart = folio_pos(folio); + lend = lstart + folio_size(folio) - 1; + inode = igrab(mapping->host); + folio_unlock(folio); + + if (unlikely(!inode)) + break; + + filemap_write_and_wait_range(mapping, lstart, lend); + iput(mapping->host); + } + return -EAGAIN; +} +EXPORT_SYMBOL_GPL(s390_wiggle_split_folio); /* * To be called with the folio locked or with an extra reference! This will - * prevent gmap_make_secure from touching the folio concurrently. Having 2 - * parallel arch_make_folio_accessible is fine, as the UV calls will become a - * no-op if the folio is already exported. + * prevent kvm_s390_pv_make_secure() from touching the folio concurrently. + * Having 2 parallel arch_make_folio_accessible is fine, as the UV calls will + * become a no-op if the folio is already exported. */ int arch_make_folio_accessible(struct folio *folio) { int rc = 0; - /* See gmap_make_secure(): large folios cannot be secure */ - if (unlikely(folio_test_large(folio))) - return 0; - /* - * PG_arch_1 is used in 2 places: - * 1. for storage keys of hugetlb folios and KVM - * 2. As an indication that this small folio might be secure. This can - * overindicate, e.g. we set the bit before calling - * convert_to_secure. - * As secure pages are never large folios, both variants can co-exists. + * PG_arch_1 is used as an indication that this small folio might be + * secure. This can overindicate, e.g. we set the bit before calling + * convert_to_secure. */ - if (!test_bit(PG_arch_1, &folio->flags)) + if (!test_bit(PG_arch_1, &folio->flags.f)) return 0; + /* Large folios cannot be secure. */ + if (WARN_ON_ONCE(folio_test_large(folio))) + return -EFAULT; + rc = uv_pin_shared(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); return 0; } rc = uv_convert_from_secure(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); return 0; } @@ -660,7 +745,12 @@ out_kobj: device_initcall(uv_sysfs_init); /* - * Find the secret with the secret_id in the provided list. + * Locate a secret in the list by its id. + * @secret_id: search pattern. + * @list: ephemeral buffer space + * @secret: output data, containing the secret's metadata. + * + * Search for a secret with the given secret_id in the Ultravisor secret store. * * Context: might sleep. */ @@ -679,14 +769,17 @@ static int find_secret_in_page(const u8 secret_id[UV_SECRET_ID_LEN], return -ENOENT; } -/* - * Do the actual search for `uv_get_secret_metadata`. +/** + * uv_find_secret() - search secret metadata for a given secret id. + * @secret_id: search pattern. + * @list: ephemeral buffer space + * @secret: output data, containing the secret's metadata. * * Context: might sleep. */ -static int find_secret(const u8 secret_id[UV_SECRET_ID_LEN], - struct uv_secret_list *list, - struct uv_secret_list_item_hdr *secret) +int uv_find_secret(const u8 secret_id[UV_SECRET_ID_LEN], + struct uv_secret_list *list, + struct uv_secret_list_item_hdr *secret) { u16 start_idx = 0; u16 list_rc; @@ -708,36 +801,7 @@ static int find_secret(const u8 secret_id[UV_SECRET_ID_LEN], return -ENOENT; } - -/** - * uv_get_secret_metadata() - get secret metadata for a given secret id. - * @secret_id: search pattern. - * @secret: output data, containing the secret's metadata. - * - * Search for a secret with the given secret_id in the Ultravisor secret store. - * - * Context: might sleep. - * - * Return: - * * %0: - Found entry; secret->idx and secret->type are valid. - * * %ENOENT - No entry found. - * * %ENODEV: - Not supported: UV not available or command not available. - * * %EIO: - Other unexpected UV error. - */ -int uv_get_secret_metadata(const u8 secret_id[UV_SECRET_ID_LEN], - struct uv_secret_list_item_hdr *secret) -{ - struct uv_secret_list *buf; - int rc; - - buf = kzalloc(sizeof(*buf), GFP_KERNEL); - if (!buf) - return -ENOMEM; - rc = find_secret(secret_id, buf, secret); - kfree(buf); - return rc; -} -EXPORT_SYMBOL_GPL(uv_get_secret_metadata); +EXPORT_SYMBOL_GPL(uv_find_secret); /** * uv_retrieve_secret() - get the secret value for the secret index. diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 598b512cde01..0fe616c49fcc 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -7,7 +7,6 @@ */ #include <linux/binfmts.h> -#include <linux/compat.h> #include <linux/elf.h> #include <linux/errno.h> #include <linux/init.h> @@ -16,94 +15,14 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/smp.h> -#include <linux/time_namespace.h> #include <linux/random.h> +#include <linux/vdso_datastore.h> #include <vdso/datapage.h> #include <asm/vdso/vsyscall.h> #include <asm/alternative.h> #include <asm/vdso.h> -extern char vdso64_start[], vdso64_end[]; -extern char vdso32_start[], vdso32_end[]; - -static struct vm_special_mapping vvar_mapping; - -static union vdso_data_store vdso_data_store __page_aligned_data; - -struct vdso_data *vdso_data = vdso_data_store.data; - -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -/* - * The VVAR page layout depends on whether a task belongs to the root or - * non-root time namespace. Whenever a task changes its namespace, the VVAR - * page tables are cleared and then they will be re-faulted with a - * corresponding layout. - * See also the comment near timens_setup_vdso_data() for details. - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - VMA_ITERATOR(vmi, mm, 0); - struct vm_area_struct *vma; - - mmap_read_lock(mm); - for_each_vma(vmi, vma) { - if (!vma_is_special_mapping(vma, &vvar_mapping)) - continue; - zap_vma_pages(vma); - break; - } - mmap_read_unlock(mm); - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long addr, pfn; - vm_fault_t err; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - pfn = virt_to_pfn(vdso_data); - if (timens_page) { - /* - * Fault in VVAR page too, since it will be accessed - * to get clock data anyway. - */ - addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE; - err = vmf_insert_pfn(vma, addr, pfn); - if (unlikely(err & VM_FAULT_ERROR)) - return err; - pfn = page_to_pfn(timens_page); - } - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = virt_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - return vmf_insert_pfn(vma, vmf->address, pfn); -} +extern char vdso_start[], vdso_end[]; static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *vma) @@ -112,17 +31,7 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -static struct vm_special_mapping vvar_mapping = { - .name = "[vvar]", - .fault = vvar_fault, -}; - -static struct vm_special_mapping vdso64_mapping = { - .name = "[vdso]", - .mremap = vdso_mremap, -}; - -static struct vm_special_mapping vdso32_mapping = { +static struct vm_special_mapping vdso_mapping = { .name = "[vdso]", .mremap = vdso_mremap, }; @@ -137,39 +46,29 @@ early_initcall(vdso_getcpu_init); /* Must be called before SMP init */ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) { unsigned long vvar_start, vdso_text_start, vdso_text_len; - struct vm_special_mapping *vdso_mapping; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); if (mmap_write_lock_killable(mm)) return -EINTR; - if (is_compat_task()) { - vdso_text_len = vdso32_end - vdso32_start; - vdso_mapping = &vdso32_mapping; - } else { - vdso_text_len = vdso64_end - vdso64_start; - vdso_mapping = &vdso64_mapping; - } + vdso_text_len = vdso_end - vdso_start; vvar_start = get_unmapped_area(NULL, addr, vdso_mapping_len, 0, 0); rc = vvar_start; if (IS_ERR_VALUE(vvar_start)) goto out; - vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| - VM_PFNMAP, - &vvar_mapping); + vma = vdso_install_vvar_mapping(mm, vvar_start); rc = PTR_ERR(vma); if (IS_ERR(vma)) goto out; - vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE; + vdso_text_start = vvar_start + VDSO_NR_PAGES * PAGE_SIZE; /* VM_MAYWRITE for COW so gdb can set breakpoints */ vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, - VM_READ|VM_EXEC| + VM_READ|VM_EXEC|VM_SEALED_SYSMAP| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso_mapping); + &vdso_mapping); if (IS_ERR(vma)) { do_munmap(mm, vvar_start, PAGE_SIZE, NULL); rc = PTR_ERR(vma); @@ -209,18 +108,12 @@ static unsigned long vdso_addr(unsigned long start, unsigned long len) unsigned long vdso_text_size(void) { - unsigned long size; - - if (is_compat_task()) - size = vdso32_end - vdso32_start; - else - size = vdso64_end - vdso64_start; - return PAGE_ALIGN(size); + return PAGE_ALIGN(vdso_end - vdso_start); } unsigned long vdso_size(void) { - return vdso_text_size() + VVAR_NR_PAGES * PAGE_SIZE; + return vdso_text_size() + VDSO_NR_PAGES * PAGE_SIZE; } int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) @@ -239,7 +132,7 @@ static struct page ** __init vdso_setup_pages(void *start, void *end) struct page **pagelist; int i; - pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); + pagelist = kzalloc_objs(struct page *, pages + 1); if (!pagelist) panic("%s: Cannot allocate page list for VDSO", __func__); for (i = 0; i < pages; i++) @@ -253,7 +146,7 @@ static void vdso_apply_alternatives(void) struct alt_instr *start, *end; const struct elf64_hdr *hdr; - hdr = (struct elf64_hdr *)vdso64_start; + hdr = (struct elf64_hdr *)vdso_start; shdr = (void *)hdr + hdr->e_shoff; alt = find_section(hdr, shdr, ".altinstructions"); if (!alt) @@ -266,9 +159,7 @@ static void vdso_apply_alternatives(void) static int __init vdso_init(void) { vdso_apply_alternatives(); - vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end); - if (IS_ENABLED(CONFIG_COMPAT)) - vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end); + vdso_mapping.pages = vdso_setup_pages(vdso_start, vdso_end); return 0; } arch_initcall(vdso_init); diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso/.gitignore index 5167384843b9..652e31d82582 100644 --- a/arch/s390/kernel/vdso32/.gitignore +++ b/arch/s390/kernel/vdso/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -vdso32.lds +vdso.lds diff --git a/arch/s390/kernel/vdso/Makefile b/arch/s390/kernel/vdso/Makefile new file mode 100644 index 000000000000..fece5d975eaf --- /dev/null +++ b/arch/s390/kernel/vdso/Makefile @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: GPL-2.0 +# List of files in the vdso + +# Include the generic Makefile to check the built vdso. +include $(srctree)/lib/vdso/Makefile.include +obj-vdso = vdso_user_wrapper.o note.o vgetrandom-chacha.o +obj-cvdso = vdso_generic.o getcpu.o vgetrandom.o +VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) +CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vgetrandom.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vdso_generic.o = $(VDSO_CFLAGS_REMOVE) + +ifneq ($(c-getrandom-y),) + CFLAGS_vgetrandom.o += -include $(c-getrandom-y) +endif + +# Build rules + +targets := $(obj-vdso) $(obj-cvdso) vdso.so vdso.so.dbg +obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) +obj-cvdso := $(addprefix $(obj)/, $(obj-cvdso)) + +KBUILD_AFLAGS_VDSO := $(KBUILD_AFLAGS) -DBUILD_VDSO + +KBUILD_CFLAGS_VDSO := $(KBUILD_CFLAGS) -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS_VDSO := $(filter-out -mpacked-stack,$(KBUILD_CFLAGS_VDSO)) +KBUILD_CFLAGS_VDSO := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_VDSO)) +KBUILD_CFLAGS_VDSO := $(filter-out -munaligned-symbols,$(KBUILD_CFLAGS_VDSO)) +KBUILD_CFLAGS_VDSO := $(filter-out -fno-asynchronous-unwind-tables,$(KBUILD_CFLAGS_VDSO)) +KBUILD_CFLAGS_VDSO += -fPIC -fno-common -fno-builtin -fasynchronous-unwind-tables +KBUILD_CFLAGS_VDSO += -fno-stack-protector $(DISABLE_KSTACK_ERASE) +ldflags-y := -shared -soname=linux-vdso.so.1 \ + --hash-style=both --build-id=sha1 -T + +$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_VDSO) +$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_VDSO) + +obj-y += vdso_wrapper.o +targets += vdso.lds +CPPFLAGS_vdso.lds += -P -C -U$(ARCH) + +# Force dependency (incbin is bad) +$(obj)/vdso_wrapper.o : $(obj)/vdso.so + +quiet_cmd_vdso_and_check = VDSO $@ + cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) + +# link rule for the .so file, .lds has to be first +$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) $(obj-cvdso) FORCE + $(call if_changed,vdso_and_check) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +# assembly rules for the .S files +$(obj-vdso): %.o: %.S FORCE + $(call if_changed_dep,vdsoas) + +$(obj-cvdso): %.o: %.c FORCE + $(call if_changed_dep,vdsocc) + +# actual build commands +quiet_cmd_vdsoas = VDSOA $@ + cmd_vdsoas = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdsocc = VDSOC $@ + cmd_vdsocc = $(CC) $(c_flags) -c -o $@ $< + +# Generate VDSO offsets using helper script +gen-vdsosym := $(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso/gen_vdso_offsets.sh index 37f05cb38dad..359982fb002d 100755 --- a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh +++ b/arch/s390/kernel/vdso/gen_vdso_offsets.sh @@ -12,4 +12,4 @@ # LC_ALL=C -sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p' +sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso64/getcpu.c b/arch/s390/kernel/vdso/getcpu.c index 5c5d4a848b76..1e17665616c5 100644 --- a/arch/s390/kernel/vdso64/getcpu.c +++ b/arch/s390/kernel/vdso/getcpu.c @@ -2,11 +2,10 @@ /* Copyright IBM Corp. 2020 */ #include <linux/compiler.h> -#include <linux/getcpu.h> #include <asm/timex.h> #include "vdso.h" -int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, void *unused) { union tod_clock clk; diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso/note.S index db19d0680a0a..db19d0680a0a 100644 --- a/arch/s390/kernel/vdso32/note.S +++ b/arch/s390/kernel/vdso/note.S diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso/vdso.h index 9e5397e7b590..1fe52a6f5a56 100644 --- a/arch/s390/kernel/vdso64/vdso.h +++ b/arch/s390/kernel/vdso/vdso.h @@ -1,15 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ARCH_S390_KERNEL_VDSO64_VDSO_H -#define __ARCH_S390_KERNEL_VDSO64_VDSO_H +#ifndef __ARCH_S390_KERNEL_VDSO_VDSO_H +#define __ARCH_S390_KERNEL_VDSO_VDSO_H #include <vdso/datapage.h> -struct getcpu_cache; - -int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); +int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, void *unused); int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts); ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len); -#endif /* __ARCH_S390_KERNEL_VDSO64_VDSO_H */ +#endif /* __ARCH_S390_KERNEL_VDSO_VDSO_H */ diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso/vdso.lds.S index ec42b7d9cb53..7bec4de0e8e0 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso/vdso.lds.S @@ -7,17 +7,16 @@ #include <asm/vdso/vsyscall.h> #include <asm/page.h> #include <asm/vdso.h> +#include <asm-generic/vmlinux.lds.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") OUTPUT_ARCH(s390:64-bit) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); - PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text @@ -61,47 +60,9 @@ SECTIONS _end = .; PROVIDE(end = .); - /* - * Stabs debugging sections are here too. - */ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } + STABS_DEBUG + DWARF_DEBUG .comment 0 : { *(.comment) } - - /* - * DWARF debug sections. - * Symbols in the DWARF debugging sections are relative to the - * beginning of the section so we begin them at 0. - */ - /* DWARF 1 */ - .debug 0 : { *(.debug) } - .line 0 : { *(.line) } - /* GNU DWARF 1 extensions */ - .debug_srcinfo 0 : { *(.debug_srcinfo) } - .debug_sfnames 0 : { *(.debug_sfnames) } - /* DWARF 1.1 and DWARF 2 */ - .debug_aranges 0 : { *(.debug_aranges) } - .debug_pubnames 0 : { *(.debug_pubnames) } - /* DWARF 2 */ - .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } - .debug_abbrev 0 : { *(.debug_abbrev) } - .debug_line 0 : { *(.debug_line) } - .debug_frame 0 : { *(.debug_frame) } - .debug_str 0 : { *(.debug_str) } - .debug_loc 0 : { *(.debug_loc) } - .debug_macinfo 0 : { *(.debug_macinfo) } - /* SGI/MIPS DWARF 2 extensions */ - .debug_weaknames 0 : { *(.debug_weaknames) } - .debug_funcnames 0 : { *(.debug_funcnames) } - .debug_typenames 0 : { *(.debug_typenames) } - .debug_varnames 0 : { *(.debug_varnames) } - /* DWARF 3 */ - .debug_pubtypes 0 : { *(.debug_pubtypes) } - .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } /DISCARD/ : { diff --git a/arch/s390/kernel/vdso64/vdso64_generic.c b/arch/s390/kernel/vdso/vdso_generic.c index a9aa75643c08..a9aa75643c08 100644 --- a/arch/s390/kernel/vdso64/vdso64_generic.c +++ b/arch/s390/kernel/vdso/vdso_generic.c diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso/vdso_user_wrapper.S index aa06c85bcbd3..aa06c85bcbd3 100644 --- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S +++ b/arch/s390/kernel/vdso/vdso_user_wrapper.S diff --git a/arch/s390/kernel/vdso64/vdso64_wrapper.S b/arch/s390/kernel/vdso/vdso_wrapper.S index 672184998623..f69e62a14978 100644 --- a/arch/s390/kernel/vdso64/vdso64_wrapper.S +++ b/arch/s390/kernel/vdso/vdso_wrapper.S @@ -5,11 +5,11 @@ __PAGE_ALIGNED_DATA - .globl vdso64_start, vdso64_end + .globl vdso_start, vdso_end .balign PAGE_SIZE -vdso64_start: - .incbin "arch/s390/kernel/vdso64/vdso64.so" +vdso_start: + .incbin "arch/s390/kernel/vdso/vdso.so" .balign PAGE_SIZE -vdso64_end: +vdso_end: .previous diff --git a/arch/s390/kernel/vdso64/vgetrandom-chacha.S b/arch/s390/kernel/vdso/vgetrandom-chacha.S index 09c034c2f853..09c034c2f853 100644 --- a/arch/s390/kernel/vdso64/vgetrandom-chacha.S +++ b/arch/s390/kernel/vdso/vgetrandom-chacha.S diff --git a/arch/s390/kernel/vdso64/vgetrandom.c b/arch/s390/kernel/vdso/vgetrandom.c index b5268b507fb5..b5268b507fb5 100644 --- a/arch/s390/kernel/vdso64/vgetrandom.c +++ b/arch/s390/kernel/vdso/vgetrandom.c diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile deleted file mode 100644 index 2c5afb88d298..000000000000 --- a/arch/s390/kernel/vdso32/Makefile +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# List of files in the vdso - -# Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile -obj-vdso32 = vdso_user_wrapper-32.o note-32.o - -# Build rules - -targets := $(obj-vdso32) vdso32.so vdso32.so.dbg -obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) - -KBUILD_AFLAGS += -DBUILD_VDSO -KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING - -KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -KBUILD_AFLAGS_32 += -m31 -s - -KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_32 := $(filter-out -mpacked-stack,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_32 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_32)) -KBUILD_CFLAGS_32 := $(filter-out -fno-asynchronous-unwind-tables,$(KBUILD_CFLAGS_32)) -KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin -fasynchronous-unwind-tables - -LDFLAGS_vdso32.so.dbg += -shared -soname=linux-vdso32.so.1 \ - --hash-style=both --build-id=sha1 -melf_s390 -T - -$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) -$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) - -obj-y += vdso32_wrapper.o -targets += vdso32.lds -CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) - -# Force dependency (incbin is bad) -$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so - -quiet_cmd_vdso_and_check = VDSO $@ - cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) - -$(obj)/vdso32.so.dbg: $(obj)/vdso32.lds $(obj-vdso32) FORCE - $(call if_changed,vdso_and_check) - -# strip rule for the .so file -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE - $(call if_changed,objcopy) - -$(obj-vdso32): %-32.o: %.S FORCE - $(call if_changed_dep,vdso32as) - -# actual build commands -quiet_cmd_vdso32as = VDSO32A $@ - cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $< -quiet_cmd_vdso32cc = VDSO32C $@ - cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $< - -# Generate VDSO offsets using helper script -gen-vdsosym := $(src)/gen_vdso_offsets.sh -quiet_cmd_vdsosym = VDSOSYM $@ - cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ - -include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE - $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh deleted file mode 100755 index 9c4f951e227d..000000000000 --- a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# -# Match symbols in the DSO that look like VDSO_*; produce a header file -# of constant offsets into the shared object. -# -# Doing this inside the Makefile will break the $(filter-out) function, -# causing Kbuild to rebuild the vdso-offsets header file every time. -# -# Inspired by arm64 version. -# - -LC_ALL=C -sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S deleted file mode 100644 index c916c4f73f76..000000000000 --- a/arch/s390/kernel/vdso32/vdso32.lds.S +++ /dev/null @@ -1,141 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This is the infamous ld script for the 64 bits vdso - * library - */ - -#include <asm/page.h> -#include <asm/vdso.h> - -OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") -OUTPUT_ARCH(s390:31-bit) - -SECTIONS -{ - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif - . = SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - .note : { *(.note.*) } :text :note - - . = ALIGN(16); - .text : { - *(.text .stub .text.* .gnu.linkonce.t.*) - } :text - PROVIDE(__etext = .); - PROVIDE(_etext = .); - PROVIDE(etext = .); - - /* - * Other stuff is appended to the text segment: - */ - .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } - .rodata1 : { *(.rodata1) } - - .dynamic : { *(.dynamic) } :text :dynamic - - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) } - - .rela.dyn ALIGN(8) : { *(.rela.dyn) } - .got ALIGN(8) : { *(.got .toc) } - .got.plt ALIGN(8) : { *(.got.plt) } - - _end = .; - PROVIDE(end = .); - - /* - * Stabs debugging sections are here too. - */ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } - .comment 0 : { *(.comment) } - - /* - * DWARF debug sections. - * Symbols in the DWARF debugging sections are relative to the - * beginning of the section so we begin them at 0. - */ - /* DWARF 1 */ - .debug 0 : { *(.debug) } - .line 0 : { *(.line) } - /* GNU DWARF 1 extensions */ - .debug_srcinfo 0 : { *(.debug_srcinfo) } - .debug_sfnames 0 : { *(.debug_sfnames) } - /* DWARF 1.1 and DWARF 2 */ - .debug_aranges 0 : { *(.debug_aranges) } - .debug_pubnames 0 : { *(.debug_pubnames) } - /* DWARF 2 */ - .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } - .debug_abbrev 0 : { *(.debug_abbrev) } - .debug_line 0 : { *(.debug_line) } - .debug_frame 0 : { *(.debug_frame) } - .debug_str 0 : { *(.debug_str) } - .debug_loc 0 : { *(.debug_loc) } - .debug_macinfo 0 : { *(.debug_macinfo) } - /* SGI/MIPS DWARF 2 extensions */ - .debug_weaknames 0 : { *(.debug_weaknames) } - .debug_funcnames 0 : { *(.debug_funcnames) } - .debug_typenames 0 : { *(.debug_typenames) } - .debug_varnames 0 : { *(.debug_varnames) } - /* DWARF 3 */ - .debug_pubtypes 0 : { *(.debug_pubtypes) } - .debug_ranges 0 : { *(.debug_ranges) } - .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - - /DISCARD/ : { - *(.note.GNU-stack) - *(.branch_lt) - *(.data .data.* .gnu.linkonce.d.* .sdata*) - *(.bss .sbss .dynbss .dynsbss) - } -} - -/* - * Very old versions of ld do not recognize this name token; use the constant. - */ -#define PT_GNU_EH_FRAME 0x6474e550 - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr PT_GNU_EH_FRAME; -} - -/* - * This controls what symbols we export from the DSO. - */ -VERSION -{ - VDSO_VERSION_STRING { - global: - /* - * Has to be there for the kernel to find - */ - __kernel_compat_restart_syscall; - __kernel_compat_rt_sigreturn; - __kernel_compat_sigreturn; - local: *; - }; -} diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S deleted file mode 100644 index de2fb930471a..000000000000 --- a/arch/s390/kernel/vdso32/vdso32_wrapper.S +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include <linux/init.h> -#include <linux/linkage.h> -#include <asm/page.h> - - __PAGE_ALIGNED_DATA - - .globl vdso32_start, vdso32_end - .balign PAGE_SIZE -vdso32_start: - .incbin "arch/s390/kernel/vdso32/vdso32.so" - .balign PAGE_SIZE -vdso32_end: - - .previous diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S deleted file mode 100644 index 2e645003fdaf..000000000000 --- a/arch/s390/kernel/vdso32/vdso_user_wrapper.S +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#include <linux/linkage.h> -#include <asm/unistd.h> -#include <asm/dwarf.h> - -.macro vdso_syscall func,syscall - .globl __kernel_compat_\func - .type __kernel_compat_\func,@function - __ALIGN -__kernel_compat_\func: - CFI_STARTPROC - svc \syscall - /* Make sure we notice when a syscall returns, which shouldn't happen */ - .word 0 - CFI_ENDPROC - .size __kernel_compat_\func,.-__kernel_compat_\func -.endm - -vdso_syscall restart_syscall,__NR_restart_syscall -vdso_syscall sigreturn,__NR_sigreturn -vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/kernel/vdso64/.gitignore b/arch/s390/kernel/vdso64/.gitignore deleted file mode 100644 index 4ec80685fecc..000000000000 --- a/arch/s390/kernel/vdso64/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -vdso64.lds diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile deleted file mode 100644 index ad206f2068d8..000000000000 --- a/arch/s390/kernel/vdso64/Makefile +++ /dev/null @@ -1,79 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# List of files in the vdso - -# Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile -obj-vdso64 = vdso_user_wrapper.o note.o vgetrandom-chacha.o -obj-cvdso64 = vdso64_generic.o getcpu.o vgetrandom.o -VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) -CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) -CFLAGS_REMOVE_vgetrandom.o = $(VDSO_CFLAGS_REMOVE) -CFLAGS_REMOVE_vdso64_generic.o = $(VDSO_CFLAGS_REMOVE) - -ifneq ($(c-getrandom-y),) - CFLAGS_vgetrandom.o += -include $(c-getrandom-y) -endif - -# Build rules - -targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg -obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) -obj-cvdso64 := $(addprefix $(obj)/, $(obj-cvdso64)) - -KBUILD_AFLAGS += -DBUILD_VDSO -KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING - -KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS)) -KBUILD_AFLAGS_64 += -m64 - -KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_64 := $(filter-out -mpacked-stack,$(KBUILD_CFLAGS_64)) -KBUILD_CFLAGS_64 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_64)) -KBUILD_CFLAGS_64 := $(filter-out -munaligned-symbols,$(KBUILD_CFLAGS_64)) -KBUILD_CFLAGS_64 := $(filter-out -fno-asynchronous-unwind-tables,$(KBUILD_CFLAGS_64)) -KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin -fasynchronous-unwind-tables -ldflags-y := -shared -soname=linux-vdso64.so.1 \ - --hash-style=both --build-id=sha1 -T - -$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) -$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64) - -obj-y += vdso64_wrapper.o -targets += vdso64.lds -CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) - -# Force dependency (incbin is bad) -$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so - -quiet_cmd_vdso_and_check = VDSO $@ - cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) - -# link rule for the .so file, .lds has to be first -$(obj)/vdso64.so.dbg: $(obj)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE - $(call if_changed,vdso_and_check) - -# strip rule for the .so file -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE - $(call if_changed,objcopy) - -# assembly rules for the .S files -$(obj-vdso64): %.o: %.S FORCE - $(call if_changed_dep,vdso64as) - -$(obj-cvdso64): %.o: %.c FORCE - $(call if_changed_dep,vdso64cc) - -# actual build commands -quiet_cmd_vdso64as = VDSO64A $@ - cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< -quiet_cmd_vdso64cc = VDSO64C $@ - cmd_vdso64cc = $(CC) $(c_flags) -c -o $@ $< - -# Generate VDSO offsets using helper script -gen-vdsosym := $(src)/gen_vdso_offsets.sh -quiet_cmd_vdsosym = VDSOSYM $@ - cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ - -include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE - $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso64/note.S b/arch/s390/kernel/vdso64/note.S deleted file mode 100644 index db19d0680a0a..000000000000 --- a/arch/s390/kernel/vdso64/note.S +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. - * Here we can supply some information useful to userland. - */ - -#include <linux/uts.h> -#include <linux/version.h> -#include <linux/elfnote.h> - -ELFNOTE_START(Linux, 0, "a") - .long LINUX_VERSION_CODE -ELFNOTE_END diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index ff1ddba96352..2b62395e35bf 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -51,7 +51,7 @@ SECTIONS IRQENTRY_TEXT SOFTIRQENTRY_TEXT FTRACE_HOTPATCH_TRAMPOLINES_TEXT - *(.text.*_indirect_*) + *(.text..*_indirect_*) *(.gnu.warning) . = ALIGN(PAGE_SIZE); _etext = .; /* End of text section */ @@ -71,6 +71,13 @@ SECTIONS . = ALIGN(PAGE_SIZE); __end_ro_after_init = .; + . = ALIGN(8); + .skey_region_table : { + __skey_region_start = .; + KEEP(*(.skey_region)) + __skey_region_end = .; + } + .data.rel.ro : { *(.data.rel.ro .data.rel.ro.*) } @@ -143,6 +150,15 @@ SECTIONS *(.altinstr_replacement) } +#ifdef CONFIG_STACKPROTECTOR + . = ALIGN(8); + .stack_prot_table : { + __stack_prot_start = .; + KEEP(*(__stack_protector_loc)) + __stack_prot_end = .; + } +#endif + /* * Table with the patch locations to undo expolines */ @@ -202,6 +218,34 @@ SECTIONS . = ALIGN(PAGE_SIZE); _end = . ; + /* Debugging sections. */ + STABS_DEBUG + DWARF_DEBUG + MODINFO + ELF_DETAILS + + /* + * Make sure that the .got.plt is either completely empty or it + * contains only the three reserved double words. + */ + .got.plt : { + *(.got.plt) + } + ASSERT(SIZEOF(.got.plt) == 0 || SIZEOF(.got.plt) == 0x18, "Unexpected GOT/PLT entries detected!") + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .plt : { + *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + .rela.dyn : { + *(.rela.*) *(.rela_*) + } + ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") + /* * uncompressed image info used by the decompressor * it should match struct vmlinux_info @@ -223,6 +267,10 @@ SECTIONS QUAD(invalid_pg_dir) QUAD(__alt_instructions) QUAD(__alt_instructions_end) +#ifdef CONFIG_STACKPROTECTOR + QUAD(__stack_prot_start) + QUAD(__stack_prot_end) +#endif #ifdef CONFIG_KASAN QUAD(kasan_early_shadow_page) QUAD(kasan_early_shadow_pte) @@ -232,33 +280,6 @@ SECTIONS #endif } :NONE - /* Debugging sections. */ - STABS_DEBUG - DWARF_DEBUG - ELF_DETAILS - - /* - * Make sure that the .got.plt is either completely empty or it - * contains only the three reserved double words. - */ - .got.plt : { - *(.got.plt) - } - ASSERT(SIZEOF(.got.plt) == 0 || SIZEOF(.got.plt) == 0x18, "Unexpected GOT/PLT entries detected!") - - /* - * Sections that should stay zero sized, which is safer to - * explicitly check instead of blindly discarding. - */ - .plt : { - *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) - } - ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") - .rela.dyn : { - *(.rela.*) *(.rela_*) - } - ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") - /* Sections to be discarded */ DISCARDS /DISCARD/ : { diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 234a0ba30510..bf48744d0912 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -48,8 +48,7 @@ static inline void set_vtimer(u64 expires) static inline int virt_timer_forward(u64 elapsed) { - BUG_ON(!irqs_disabled()); - + lockdep_assert_irqs_disabled(); if (list_empty(&virt_timer_list)) return 0; elapsed = atomic64_add_return(elapsed, &virt_timer_elapsed); @@ -137,23 +136,16 @@ static int do_account_vtime(struct task_struct *tsk) lc->system_timer += timer; /* Update MT utilization calculation */ - if (smp_cpu_mtid && - time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) + if (smp_cpu_mtid && time_after64(jiffies_64, __this_cpu_read(mt_scaling_jiffies))) update_mt_scaling(); /* Calculate cputime delta */ - user = update_tsk_timer(&tsk->thread.user_timer, - READ_ONCE(lc->user_timer)); - guest = update_tsk_timer(&tsk->thread.guest_timer, - READ_ONCE(lc->guest_timer)); - system = update_tsk_timer(&tsk->thread.system_timer, - READ_ONCE(lc->system_timer)); - hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, - READ_ONCE(lc->hardirq_timer)); - softirq = update_tsk_timer(&tsk->thread.softirq_timer, - READ_ONCE(lc->softirq_timer)); - lc->steal_timer += - clock - user - guest - system - hardirq - softirq; + user = update_tsk_timer(&tsk->thread.user_timer, lc->user_timer); + guest = update_tsk_timer(&tsk->thread.guest_timer, lc->guest_timer); + system = update_tsk_timer(&tsk->thread.system_timer, lc->system_timer); + hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, lc->hardirq_timer); + softirq = update_tsk_timer(&tsk->thread.softirq_timer, lc->softirq_timer); + lc->steal_timer += clock - user - guest - system - hardirq - softirq; /* Push account value */ if (user) { @@ -225,10 +217,6 @@ static u64 vtime_delta(void) return timer - lc->last_update_timer; } -/* - * Update process times based on virtual cpu times stored by entry.S - * to the lowcore fields user_timer, system_timer & steal_clock. - */ void vtime_account_kernel(struct task_struct *tsk) { struct lowcore *lc = get_lowcore(); @@ -238,27 +226,17 @@ void vtime_account_kernel(struct task_struct *tsk) lc->guest_timer += delta; else lc->system_timer += delta; - - virt_timer_forward(delta); } EXPORT_SYMBOL_GPL(vtime_account_kernel); void vtime_account_softirq(struct task_struct *tsk) { - u64 delta = vtime_delta(); - - get_lowcore()->softirq_timer += delta; - - virt_timer_forward(delta); + get_lowcore()->softirq_timer += vtime_delta(); } void vtime_account_hardirq(struct task_struct *tsk) { - u64 delta = vtime_delta(); - - get_lowcore()->hardirq_timer += delta; - - virt_timer_forward(delta); + get_lowcore()->hardirq_timer += vtime_delta(); } /* diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index cae908d64550..5b835bc6a194 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -20,7 +20,6 @@ config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" select HAVE_KVM_CPU_RELAX_INTERCEPT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC select KVM_COMMON @@ -29,7 +28,8 @@ config KVM select HAVE_KVM_INVALID_WAKEUPS select HAVE_KVM_NO_POLL select KVM_VFIO - select MMU_NOTIFIER + select VIRT_XFER_TO_GUEST_WORK + select KVM_MMU_LOCKLESS_AGING help Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index f0ffe874adc2..dac9d53b23d8 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -8,7 +8,8 @@ include $(srctree)/virt/kvm/Makefile.kvm ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap.o gmap-vsie.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o +kvm-y += dat.o gmap.o faultin.o kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c new file mode 100644 index 000000000000..7b8d70fe406d --- /dev/null +++ b/arch/s390/kvm/dat.c @@ -0,0 +1,1321 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2020, 2024 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + * Martin Schwidefsky <schwidefsky@de.ibm.com> + * David Hildenbrand <david@redhat.com> + * Janosch Frank <frankja@linux.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/pagewalk.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/swapops.h> +#include <linux/ksm.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/pgtable.h> +#include <linux/kvm_types.h> +#include <linux/kvm_host.h> +#include <linux/pgalloc.h> + +#include <asm/page-states.h> +#include <asm/tlb.h> +#include "dat.h" + +int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc) +{ + void *o; + + for ( ; mc->n_crsts < KVM_S390_MMU_CACHE_N_CRSTS; mc->n_crsts++) { + o = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER); + if (!o) + return -ENOMEM; + mc->crsts[mc->n_crsts] = o; + } + for ( ; mc->n_pts < KVM_S390_MMU_CACHE_N_PTS; mc->n_pts++) { + o = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (!o) + return -ENOMEM; + mc->pts[mc->n_pts] = o; + } + for ( ; mc->n_rmaps < KVM_S390_MMU_CACHE_N_RMAPS; mc->n_rmaps++) { + o = kzalloc_obj(*mc->rmaps[0], GFP_KERNEL_ACCOUNT); + if (!o) + return -ENOMEM; + mc->rmaps[mc->n_rmaps] = o; + } + return 0; +} + +static inline struct page_table *dat_alloc_pt_noinit(struct kvm_s390_mmu_cache *mc) +{ + struct page_table *res; + + res = kvm_s390_mmu_cache_alloc_pt(mc); + if (res) + __arch_set_page_dat(res, 1); + return res; +} + +static inline struct crst_table *dat_alloc_crst_noinit(struct kvm_s390_mmu_cache *mc) +{ + struct crst_table *res; + + res = kvm_s390_mmu_cache_alloc_crst(mc); + if (res) + __arch_set_page_dat(res, 1UL << CRST_ALLOC_ORDER); + return res; +} + +struct crst_table *dat_alloc_crst_sleepable(unsigned long init) +{ + struct page *page; + void *virt; + + page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER); + if (!page) + return NULL; + virt = page_to_virt(page); + __arch_set_page_dat(virt, 1UL << CRST_ALLOC_ORDER); + crst_table_init(virt, init); + return virt; +} + +void dat_free_level(struct crst_table *table, bool owns_ptes) +{ + unsigned int i; + + for (i = 0; i < _CRST_ENTRIES; i++) { + if (table->crstes[i].h.fc || table->crstes[i].h.i) + continue; + if (!is_pmd(table->crstes[i])) + dat_free_level(dereference_crste(table->crstes[i]), owns_ptes); + else if (owns_ptes) + dat_free_pt(dereference_pmd(table->crstes[i].pmd)); + } + dat_free_crst(table); +} + +int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype) +{ + struct crst_table *table; + union crste crste; + + while (asce->dt > newtype) { + table = dereference_asce(*asce); + crste = table->crstes[0]; + if (crste.h.fc) + return 0; + if (!crste.h.i) { + asce->rsto = crste.h.fc0.to; + dat_free_crst(table); + } else { + crste.h.tt--; + crst_table_init((void *)table, crste.val); + } + asce->dt--; + } + while (asce->dt < newtype) { + crste = _crste_fc0(asce->rsto, asce->dt + 1); + table = dat_alloc_crst_noinit(mc); + if (!table) + return -ENOMEM; + crst_table_init((void *)table, _CRSTE_HOLE(crste.h.tt).val); + table->crstes[0] = crste; + asce->rsto = __pa(table) >> PAGE_SHIFT; + asce->dt++; + } + return 0; +} + +/** + * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another. + * @crstep: Pointer to the CRST entry. + * @old: Expected old value. + * @new: Replacement entry. + * @gfn: The affected guest address. + * @asce: The asce of the address space. + * + * This function is needed to atomically exchange a CRSTE that potentially + * maps a prefix area, without having to invalidate it inbetween. + * + * Context: This function is assumed to be called with kvm->mmu_lock held. + * + * Return: %true if the exchange was successful. + */ +bool __must_check dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, + gfn_t gfn, union asce asce) +{ + if (old.h.i) + return arch_try_cmpxchg((long *)crstep, &old.val, new.val); + if (cpu_has_edat2()) + return crdte_crste(crstep, old, new, gfn, asce); + return cspg_crste(crstep, old, new); +} + +static void dat_set_storage_key_from_pgste(union pte pte, union pgste pgste) +{ + union skey nkey = { .acc = pgste.acc, .fp = pgste.fp }; + + page_set_storage_key(pte_origin(pte), nkey.skey, 0); +} + +static void dat_move_storage_key(union pte old, union pte new) +{ + page_set_storage_key(pte_origin(new), page_get_storage_key(pte_origin(old)), 1); +} + +static union pgste dat_save_storage_key_into_pgste(union pte pte, union pgste pgste) +{ + union skey skey; + + skey.skey = page_get_storage_key(pte_origin(pte)); + + pgste.acc = skey.acc; + pgste.fp = skey.fp; + pgste.gr |= skey.r; + pgste.gc |= skey.c; + + return pgste; +} + +union pgste __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, gfn_t gfn, + union asce asce, bool uses_skeys) +{ + union pte old = READ_ONCE(*ptep); + + /* Updating only the software bits while holding the pgste lock. */ + if (!((ptep->val ^ new.val) & ~_PAGE_SW_BITS)) { + WRITE_ONCE(ptep->swbyte, new.swbyte); + return pgste; + } + + if (!old.h.i) { + unsigned long opts = IPTE_GUEST_ASCE | (pgste.nodat ? IPTE_NODAT : 0); + + if (machine_has_tlb_guest()) + __ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, opts, asce.val, IPTE_GLOBAL); + else + __ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, 0, 0, IPTE_GLOBAL); + } + + if (uses_skeys) { + if (old.h.i && !new.h.i) + /* Invalid to valid: restore storage keys from PGSTE. */ + dat_set_storage_key_from_pgste(new, pgste); + else if (!old.h.i && new.h.i) + /* Valid to invalid: save storage keys to PGSTE. */ + pgste = dat_save_storage_key_into_pgste(old, pgste); + else if (!old.h.i && !new.h.i) + /* Valid to valid: move storage keys. */ + if (old.h.pfra != new.h.pfra) + dat_move_storage_key(old, new); + /* Invalid to invalid: nothing to do. */ + } + + WRITE_ONCE(*ptep, new); + return pgste; +} + +/* + * dat_split_ste() - Split a segment table entry into page table entries. + * + * Context: This function is assumed to be called with kvm->mmu_lock held. + * + * Return: 0 in case of success, -ENOMEM if running out of memory. + */ +static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t gfn, + union asce asce, bool uses_skeys) +{ + union pgste pgste_init; + struct page_table *pt; + union pmd new, old; + union pte init; + int i; + + BUG_ON(!mc); + old = READ_ONCE(*pmdp); + + /* Already split, nothing to do. */ + if (!old.h.i && !old.h.fc) + return 0; + + pt = dat_alloc_pt_noinit(mc); + if (!pt) + return -ENOMEM; + new.val = virt_to_phys(pt); + + while (old.h.i || old.h.fc) { + init.val = pmd_origin_large(old); + init.h.p = old.h.p; + init.h.i = old.h.i; + init.s.d = old.s.fc1.d; + init.s.w = old.s.fc1.w; + init.s.y = old.s.fc1.y; + init.s.sd = old.s.fc1.sd; + init.s.pr = old.s.fc1.pr; + pgste_init.val = 0; + if (old.h.fc) { + for (i = 0; i < _PAGE_ENTRIES; i++) + pt->ptes[i].val = init.val | i * PAGE_SIZE; + /* No need to take locks as the page table is not installed yet. */ + pgste_init.prefix_notif = old.s.fc1.prefix_notif; + pgste_init.vsie_notif = old.s.fc1.vsie_notif; + pgste_init.pcl = uses_skeys && init.h.i; + dat_init_pgstes(pt, pgste_init.val); + } else { + dat_init_page_table(pt, init.val, 0); + } + + if (dat_pmdp_xchg_atomic(pmdp, old, new, gfn, asce)) { + if (!pgste_init.pcl) + return 0; + for (i = 0; i < _PAGE_ENTRIES; i++) { + union pgste pgste = pt->pgstes[i]; + + pgste = dat_save_storage_key_into_pgste(pt->ptes[i], pgste); + pgste_set_unlock(pt->ptes + i, pgste); + } + return 0; + } + old = READ_ONCE(*pmdp); + } + + dat_free_pt(pt); + return 0; +} + +/* + * dat_split_crste() - Split a crste into smaller crstes. + * + * Context: This function is assumed to be called with kvm->mmu_lock held. + * + * Return: %0 in case of success, %-ENOMEM if running out of memory. + */ +static int dat_split_crste(struct kvm_s390_mmu_cache *mc, union crste *crstep, + gfn_t gfn, union asce asce, bool uses_skeys) +{ + struct crst_table *table; + union crste old, new, init; + int i; + + old = READ_ONCE(*crstep); + if (is_pmd(old)) + return dat_split_ste(mc, &crstep->pmd, gfn, asce, uses_skeys); + + BUG_ON(!mc); + + /* Already split, nothing to do. */ + if (!old.h.i && !old.h.fc) + return 0; + + table = dat_alloc_crst_noinit(mc); + if (!table) + return -ENOMEM; + + new.val = virt_to_phys(table); + new.h.tt = old.h.tt; + new.h.fc0.tl = _REGION_ENTRY_LENGTH; + + while (old.h.i || old.h.fc) { + init = old; + init.h.tt--; + if (old.h.fc) { + for (i = 0; i < _CRST_ENTRIES; i++) + table->crstes[i].val = init.val | i * HPAGE_SIZE; + } else { + crst_table_init((void *)table, init.val); + } + if (dat_crstep_xchg_atomic(crstep, old, new, gfn, asce)) + return 0; + old = READ_ONCE(*crstep); + } + + dat_free_crst(table); + return 0; +} + +/** + * dat_entry_walk() - Walk the gmap page tables. + * @mc: Cache to use to allocate dat tables, if needed; can be NULL if neither + * %DAT_WALK_SPLIT or %DAT_WALK_ALLOC is specified in @flags. + * @gfn: Guest frame. + * @asce: The ASCE of the address space. + * @flags: Flags from WALK_* macros. + * @walk_level: Level to walk to, from LEVEL_* macros. + * @last: Will be filled the last visited non-pte DAT entry. + * @ptepp: Will be filled the last visited pte entry, if any, otherwise NULL. + * + * Returns a table entry pointer for the given guest address and @walk_level. + * + * The @flags have the following meanings: + * * %DAT_WALK_IGN_HOLES: consider holes as normal table entries + * * %DAT_WALK_ALLOC: allocate new tables to reach the requested level, if needed + * * %DAT_WALK_SPLIT: split existing large pages to reach the requested level, if needed + * * %DAT_WALK_LEAF: return successfully whenever a large page is encountered + * * %DAT_WALK_ANY: return successfully even if the requested level could not be reached + * * %DAT_WALK_CONTINUE: walk to the requested level with the specified flags, and then try to + * continue walking to ptes with only DAT_WALK_ANY + * * %DAT_WALK_USES_SKEYS: storage keys are in use + * + * Context: called with kvm->mmu_lock held. + * + * Return: + * * %PGM_ADDRESSING if the requested address lies outside memory + * * a PIC number if the requested address lies in a memory hole of type _DAT_TOKEN_PIC + * * %-EFAULT if the requested address lies inside a memory hole of a different type + * * %-EINVAL if the given ASCE is not compatible with the requested level + * * %-EFBIG if the requested level could not be reached because a larger frame was found + * * %-ENOENT if the requested level could not be reached for other reasons + * * %-ENOMEM if running out of memory while allocating or splitting a table + */ +int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags, + int walk_level, union crste **last, union pte **ptepp) +{ + union vaddress vaddr = { .addr = gfn_to_gpa(gfn) }; + bool continue_anyway = flags & DAT_WALK_CONTINUE; + bool uses_skeys = flags & DAT_WALK_USES_SKEYS; + bool ign_holes = flags & DAT_WALK_IGN_HOLES; + bool allocate = flags & DAT_WALK_ALLOC; + bool split = flags & DAT_WALK_SPLIT; + bool leaf = flags & DAT_WALK_LEAF; + bool any = flags & DAT_WALK_ANY; + struct page_table *pgtable; + struct crst_table *table; + union crste entry; + int rc; + + *last = NULL; + *ptepp = NULL; + if (WARN_ON_ONCE(unlikely(!asce.val))) + return -EINVAL; + if (WARN_ON_ONCE(unlikely(walk_level > asce.dt))) + return -EINVAL; + if (!asce_contains_gfn(asce, gfn)) + return PGM_ADDRESSING; + + table = dereference_asce(asce); + if (asce.dt >= ASCE_TYPE_REGION1) { + *last = table->crstes + vaddr.rfx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION1)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (walk_level == TABLE_TYPE_REGION1) + return 0; + if (entry.pgd.h.i) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + table = dereference_crste(entry.pgd); + } + + if (asce.dt >= ASCE_TYPE_REGION2) { + *last = table->crstes + vaddr.rsx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION2)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (walk_level == TABLE_TYPE_REGION2) + return 0; + if (entry.p4d.h.i) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + table = dereference_crste(entry.p4d); + } + + if (asce.dt >= ASCE_TYPE_REGION3) { + *last = table->crstes + vaddr.rtx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION3)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (walk_level == TABLE_TYPE_REGION3 && + continue_anyway && !entry.pud.h.fc && !entry.h.i) { + walk_level = TABLE_TYPE_PAGE_TABLE; + allocate = false; + } + if (walk_level == TABLE_TYPE_REGION3 || ((leaf || any) && entry.pud.h.fc)) + return 0; + if (entry.pud.h.i && !entry.pud.h.fc) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + if (walk_level <= TABLE_TYPE_SEGMENT && entry.pud.h.fc) { + if (!split) + return -EFBIG; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + table = dereference_crste(entry.pud); + } + + *last = table->crstes + vaddr.sx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_SEGMENT)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (continue_anyway && !entry.pmd.h.fc && !entry.h.i) { + walk_level = TABLE_TYPE_PAGE_TABLE; + allocate = false; + } + if (walk_level == TABLE_TYPE_SEGMENT || ((leaf || any) && entry.pmd.h.fc)) + return 0; + + if (entry.pmd.h.i && !entry.pmd.h.fc) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + if (walk_level <= TABLE_TYPE_PAGE_TABLE && entry.pmd.h.fc) { + if (!split) + return -EFBIG; + rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + pgtable = dereference_pmd(entry.pmd); + *ptepp = pgtable->ptes + vaddr.px; + if (pte_hole(**ptepp) && !ign_holes) + return (*ptepp)->tok.type == _DAT_TOKEN_PIC ? (*ptepp)->tok.par : -EFAULT; + return 0; +} + +static long dat_pte_walk_range(gfn_t gfn, gfn_t end, struct page_table *table, struct dat_walk *w) +{ + unsigned int idx = gfn & (_PAGE_ENTRIES - 1); + long rc = 0; + + for ( ; gfn < end; idx++, gfn++) { + if (pte_hole(READ_ONCE(table->ptes[idx]))) { + if (!(w->flags & DAT_WALK_IGN_HOLES)) + return -EFAULT; + if (!(w->flags & DAT_WALK_ANY)) + continue; + } + + rc = w->ops->pte_entry(table->ptes + idx, gfn, gfn + 1, w); + if (rc) + break; + } + return rc; +} + +static long dat_crste_walk_range(gfn_t start, gfn_t end, struct crst_table *table, + struct dat_walk *walk) +{ + unsigned long idx, cur_shift, cur_size; + dat_walk_op the_op; + union crste crste; + gfn_t cur, next; + long rc = 0; + + cur_shift = 8 + table->crstes[0].h.tt * 11; + idx = (start >> cur_shift) & (_CRST_ENTRIES - 1); + cur_size = 1UL << cur_shift; + + for (cur = ALIGN_DOWN(start, cur_size); cur < end; idx++, cur = next) { + next = cur + cur_size; + walk->last = table->crstes + idx; + crste = READ_ONCE(*walk->last); + + if (crste_hole(crste)) { + if (!(walk->flags & DAT_WALK_IGN_HOLES)) + return -EFAULT; + if (!(walk->flags & DAT_WALK_ANY)) + continue; + } + + the_op = walk->ops->crste_ops[crste.h.tt]; + if (the_op) { + rc = the_op(walk->last, cur, next, walk); + crste = READ_ONCE(*walk->last); + } + if (rc) + break; + if (!crste.h.i && !crste.h.fc) { + if (!is_pmd(crste)) + rc = dat_crste_walk_range(max(start, cur), min(end, next), + _dereference_crste(crste), walk); + else if (walk->ops->pte_entry) + rc = dat_pte_walk_range(max(start, cur), min(end, next), + dereference_pmd(crste.pmd), walk); + } + } + return rc; +} + +/** + * _dat_walk_gfn_range() - Walk DAT tables. + * @start: The first guest page frame to walk. + * @end: The guest page frame immediately after the last one to walk. + * @asce: The ASCE of the guest mapping. + * @ops: The gmap_walk_ops that will be used to perform the walk. + * @flags: Flags from WALK_* (currently only WALK_IGN_HOLES is supported). + * @priv: Will be passed as-is to the callbacks. + * + * Any callback returning non-zero causes the walk to stop immediately. + * + * Return: %-EINVAL in case of error, %-EFAULT if @start is too high for the + * given ASCE unless the DAT_WALK_IGN_HOLES flag is specified, + * otherwise it returns whatever the callbacks return. + */ +long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce, + const struct dat_walk_ops *ops, int flags, void *priv) +{ + struct crst_table *table = dereference_asce(asce); + struct dat_walk walk = { + .ops = ops, + .asce = asce, + .priv = priv, + .flags = flags, + .start = start, + .end = end, + }; + + if (WARN_ON_ONCE(unlikely(!asce.val))) + return -EINVAL; + if (!asce_contains_gfn(asce, start)) + return (flags & DAT_WALK_IGN_HOLES) ? 0 : -EFAULT; + + return dat_crste_walk_range(start, min(end, asce_end(asce)), table, &walk); +} + +int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + int rc; + + skey->skey = 0; + rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) { + union crste crste; + + crste = READ_ONCE(*crstep); + if (!crste.h.fc || !crste.s.fc1.pr) + return 0; + skey->skey = page_get_storage_key(large_crste_to_phys(crste, gfn)); + return 0; + } + pgste = pgste_get_lock(ptep); + if (ptep->h.i) { + skey->acc = pgste.acc; + skey->fp = pgste.fp; + } else { + skey->skey = page_get_storage_key(pte_origin(*ptep)); + } + skey->r |= pgste.gr; + skey->c |= pgste.gc; + pgste_set_unlock(ptep, pgste); + return 0; +} + +static void dat_update_ptep_sd(union pgste old, union pgste pgste, union pte *ptep) +{ + if (pgste.acc != old.acc || pgste.fp != old.fp || pgste.gr != old.gr || pgste.gc != old.gc) + __atomic64_or(_PAGE_SD, &ptep->val); +} + +int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + union skey skey, bool nq) +{ + union pgste pgste, old; + union crste *crstep; + union pte *ptep; + int rc; + + rc = dat_entry_walk(mc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE, + &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) { + page_set_storage_key(large_crste_to_phys(*crstep, gfn), skey.skey, !nq); + return 0; + } + + old = pgste_get_lock(ptep); + pgste = old; + + pgste.acc = skey.acc; + pgste.fp = skey.fp; + pgste.gc = skey.c; + pgste.gr = skey.r; + + if (!ptep->h.i) { + union skey old_skey; + + old_skey.skey = page_get_storage_key(pte_origin(*ptep)); + pgste.hc |= old_skey.c; + pgste.hr |= old_skey.r; + old_skey.c = old.gc; + old_skey.r = old.gr; + skey.r = 0; + skey.c = 0; + page_set_storage_key(pte_origin(*ptep), skey.skey, !nq); + } + + dat_update_ptep_sd(old, pgste, ptep); + pgste_set_unlock(ptep, pgste); + return 0; +} + +static bool page_cond_set_storage_key(phys_addr_t paddr, union skey skey, union skey *oldkey, + bool nq, bool mr, bool mc) +{ + oldkey->skey = page_get_storage_key(paddr); + if (oldkey->acc == skey.acc && oldkey->fp == skey.fp && + (oldkey->r == skey.r || mr) && (oldkey->c == skey.c || mc)) + return false; + page_set_storage_key(paddr, skey.skey, !nq); + return true; +} + +int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn, + union skey skey, union skey *oldkey, bool nq, bool mr, bool mc) +{ + union pgste pgste, old; + union crste *crstep; + union skey prev; + union pte *ptep; + int rc; + + rc = dat_entry_walk(mmc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE, + &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) + return page_cond_set_storage_key(large_crste_to_phys(*crstep, gfn), skey, oldkey, + nq, mr, mc); + + old = pgste_get_lock(ptep); + pgste = old; + + rc = 1; + pgste.acc = skey.acc; + pgste.fp = skey.fp; + pgste.gc = skey.c; + pgste.gr = skey.r; + + if (!ptep->h.i) { + rc = page_cond_set_storage_key(pte_origin(*ptep), skey, &prev, nq, mr, mc); + pgste.hc |= prev.c; + pgste.hr |= prev.r; + prev.c |= old.gc; + prev.r |= old.gr; + } else { + prev.acc = old.acc; + prev.fp = old.fp; + prev.c = old.gc; + prev.r = old.gr; + } + if (oldkey) + *oldkey = prev; + + dat_update_ptep_sd(old, pgste, ptep); + pgste_set_unlock(ptep, pgste); + return rc; +} + +int dat_reset_reference_bit(union asce asce, gfn_t gfn) +{ + union pgste pgste, old; + union crste *crstep; + union pte *ptep; + int rc; + + rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) { + union crste crste = READ_ONCE(*crstep); + + if (!crste.h.fc || !crste.s.fc1.pr) + return 0; + return page_reset_referenced(large_crste_to_phys(*crstep, gfn)); + } + old = pgste_get_lock(ptep); + pgste = old; + + if (!ptep->h.i) { + rc = page_reset_referenced(pte_origin(*ptep)); + pgste.hr = rc >> 1; + } + rc |= (pgste.gr << 1) | pgste.gc; + pgste.gr = 0; + + dat_update_ptep_sd(old, pgste, ptep); + pgste_set_unlock(ptep, pgste); + return rc; +} + +static long dat_reset_skeys_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste.acc = 0; + pgste.fp = 0; + pgste.gr = 0; + pgste.gc = 0; + if (ptep->s.pr) + page_set_storage_key(pte_origin(*ptep), PAGE_DEFAULT_KEY, 1); + pgste_set_unlock(ptep, pgste); + + if (need_resched()) + return next; + return 0; +} + +static long dat_reset_skeys_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + phys_addr_t addr, end, origin = crste_origin_large(*crstep); + + if (!crstep->h.fc || !crstep->s.fc1.pr) + return 0; + + addr = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin; + end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin; + while (ALIGN(addr + 1, _SEGMENT_SIZE) <= end) + addr = sske_frame(addr, PAGE_DEFAULT_KEY); + for ( ; addr < end; addr += PAGE_SIZE) + page_set_storage_key(addr, PAGE_DEFAULT_KEY, 1); + + if (need_resched()) + return next; + return 0; +} + +long dat_reset_skeys(union asce asce, gfn_t start) +{ + const struct dat_walk_ops ops = { + .pte_entry = dat_reset_skeys_pte, + .pmd_entry = dat_reset_skeys_crste, + .pud_entry = dat_reset_skeys_crste, + }; + + return _dat_walk_gfn_range(start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, NULL); +} + +struct slot_priv { + unsigned long token; + struct kvm_s390_mmu_cache *mc; +}; + +static long _dat_slot_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct slot_priv *p = walk->priv; + union crste dummy = { .val = p->token }; + union pte new_pte, pte = READ_ONCE(*ptep); + + new_pte = _PTE_TOK(dummy.tok.type, dummy.tok.par); + + /* Table entry already in the desired state. */ + if (pte.val == new_pte.val) + return 0; + + dat_ptep_xchg(ptep, new_pte, gfn, walk->asce, false); + return 0; +} + +static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union crste new_crste, crste = READ_ONCE(*crstep); + struct slot_priv *p = walk->priv; + + new_crste.val = p->token; + new_crste.h.tt = crste.h.tt; + + /* Table entry already in the desired state. */ + if (crste.val == new_crste.val) + return 0; + + /* This table entry needs to be updated. */ + if (walk->start <= gfn && walk->end >= next) { + if (!dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce)) + return -EINVAL; + /* A lower level table was present, needs to be freed. */ + if (!crste.h.fc && !crste.h.i) { + if (is_pmd(crste)) + dat_free_pt(dereference_pmd(crste.pmd)); + else + dat_free_level(dereference_crste(crste), true); + } + return 0; + } + + /* A lower level table is present, things will handled there. */ + if (!crste.h.fc && !crste.h.i) + return 0; + /* Split (install a lower level table), and handle things there. */ + return dat_split_crste(p->mc, crstep, gfn, walk->asce, false); +} + +static const struct dat_walk_ops dat_slot_ops = { + .pte_entry = _dat_slot_pte, + .crste_ops = { _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, }, +}; + +int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end, + u16 type, u16 param) +{ + struct slot_priv priv = { + .token = _CRSTE_TOK(0, type, param).val, + .mc = mc, + }; + + return _dat_walk_gfn_range(start, end, asce, &dat_slot_ops, + DAT_WALK_IGN_HOLES | DAT_WALK_ANY, &priv); +} + +static void pgste_set_unlock_multiple(union pte *first, int n, union pgste *pgstes) +{ + int i; + + for (i = 0; i < n; i++) { + if (!pgstes[i].pcl) + break; + pgste_set_unlock(first + i, pgstes[i]); + } +} + +static bool pgste_get_trylock_multiple(union pte *first, int n, union pgste *pgstes) +{ + int i; + + for (i = 0; i < n; i++) { + if (!pgste_get_trylock(first + i, pgstes + i)) + break; + } + if (i == n) + return true; + pgste_set_unlock_multiple(first, n, pgstes); + return false; +} + +unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param) +{ + union pgste pgstes[4] = {}; + unsigned long res = 0; + int i, n; + + n = param.len + 1; + + while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes)) + cpu_relax(); + + for (i = 0; i < n; i++) + res = res << 16 | pgstes[i].val16; + + pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes); + return res; +} + +void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val) +{ + union pgste pgstes[4] = {}; + int i, n; + + n = param.len + 1; + + while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes)) + cpu_relax(); + + for (i = param.len; i >= 0; i--) { + pgstes[i].val16 = val; + val = val >> 16; + } + + pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes); +} + +static long _dat_test_young_pte(union pte *ptep, gfn_t start, gfn_t end, struct dat_walk *walk) +{ + return ptep->s.y; +} + +static long _dat_test_young_crste(union crste *crstep, gfn_t start, gfn_t end, + struct dat_walk *walk) +{ + return crstep->h.fc && crstep->s.fc1.y; +} + +static const struct dat_walk_ops test_age_ops = { + .pte_entry = _dat_test_young_pte, + .pmd_entry = _dat_test_young_crste, + .pud_entry = _dat_test_young_crste, +}; + +/** + * dat_test_age_gfn() - Test young. + * @asce: The ASCE whose address range is to be tested. + * @start: The first guest frame of the range to check. + * @end: The guest frame after the last in the range. + * + * Context: called by KVM common code with the kvm mmu write lock held. + * + * Return: %true if any page in the given range is young, otherwise %false. + */ +bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end) +{ + return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0; +} + +static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union crste newcrste, oldcrste; + int *n = walk->priv; + + do { + oldcrste = READ_ONCE(*crstep); + if (!oldcrste.h.fc || oldcrste.h.i || oldcrste.h.p) + return 0; + if (oldcrste.s.fc1.prefix_notif) + break; + newcrste = oldcrste; + newcrste.s.fc1.prefix_notif = 1; + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, walk->asce)); + *n = 2; + return 0; +} + +static long dat_set_pn_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + int *n = walk->priv; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + if (!ptep->h.i && !ptep->h.p) { + pgste.prefix_notif = 1; + *n += 1; + } + pgste_set_unlock(ptep, pgste); + return 0; +} + +int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn) +{ + static const struct dat_walk_ops ops = { + .pte_entry = dat_set_pn_pte, + .pmd_entry = dat_set_pn_crste, + .pud_entry = dat_set_pn_crste, + }; + + int n = 0; + + _dat_walk_gfn_range(gfn, gfn + 2, asce, &ops, DAT_WALK_IGN_HOLES, &n); + if (n != 2) + return -EAGAIN; + return 0; +} + +/** + * dat_perform_essa() - Perform ESSA actions on the PGSTE. + * @asce: The asce to operate on. + * @gfn: The guest page frame to operate on. + * @orc: The specific action to perform, see the ESSA_SET_* macros. + * @state: The storage attributes to be returned to the guest. + * @dirty: Returns whether the function dirtied a previously clean entry. + * + * Context: Called with kvm->mmu_lock held. + * + * Return: + * * %1 if the page state has been altered and the page is to be added to the CBRL + * * %0 if the page state has been altered, but the page is not to be added to the CBRL + * * %-1 if the page state has not been altered and the page is not to be added to the CBRL + */ +int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + int res = 0; + + if (dat_entry_walk(NULL, gfn, asce, 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) { + *state = (union essa_state) { .exception = 1 }; + return -1; + } + + pgste = pgste_get_lock(ptep); + + *state = (union essa_state) { + .content = (ptep->h.i << 1) + (ptep->h.i && pgste.zero), + .nodat = pgste.nodat, + .usage = pgste.usage, + }; + + switch (orc) { + case ESSA_GET_STATE: + res = -1; + break; + case ESSA_SET_STABLE: + pgste.usage = PGSTE_GPS_USAGE_STABLE; + pgste.nodat = 0; + break; + case ESSA_SET_UNUSED: + pgste.usage = PGSTE_GPS_USAGE_UNUSED; + if (ptep->h.i) + res = 1; + break; + case ESSA_SET_VOLATILE: + pgste.usage = PGSTE_GPS_USAGE_VOLATILE; + if (ptep->h.i) + res = 1; + break; + case ESSA_SET_POT_VOLATILE: + if (!ptep->h.i) { + pgste.usage = PGSTE_GPS_USAGE_POT_VOLATILE; + } else if (pgste.zero) { + pgste.usage = PGSTE_GPS_USAGE_VOLATILE; + } else if (!pgste.gc) { + pgste.usage = PGSTE_GPS_USAGE_VOLATILE; + res = 1; + } + break; + case ESSA_SET_STABLE_RESIDENT: + pgste.usage = PGSTE_GPS_USAGE_STABLE; + /* + * Since the resident state can go away any time after this + * call, we will not make this page resident. We can revisit + * this decision if a guest will ever start using this. + */ + break; + case ESSA_SET_STABLE_IF_RESIDENT: + if (!ptep->h.i) + pgste.usage = PGSTE_GPS_USAGE_STABLE; + break; + case ESSA_SET_STABLE_NODAT: + pgste.usage = PGSTE_GPS_USAGE_STABLE; + pgste.nodat = 1; + break; + default: + WARN_ONCE(1, "Invalid ORC!"); + res = -1; + break; + } + /* If we are discarding a page, set it to logical zero. */ + pgste.zero = res == 1; + if (orc > 0) { + *dirty = !pgste.cmma_d; + pgste.cmma_d = 1; + } + + pgste_set_unlock(ptep, pgste); + + return res; +} + +static long dat_reset_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste.usage = 0; + pgste.nodat = 0; + pgste.cmma_d = 0; + pgste_set_unlock(ptep, pgste); + if (need_resched()) + return next; + return 0; +} + +long dat_reset_cmma(union asce asce, gfn_t start) +{ + const struct dat_walk_ops dat_reset_cmma_ops = { + .pte_entry = dat_reset_cmma_pte, + }; + + return _dat_walk_gfn_range(start, asce_end(asce), asce, &dat_reset_cmma_ops, + DAT_WALK_IGN_HOLES, NULL); +} + +struct dat_get_cmma_state { + gfn_t start; + gfn_t end; + unsigned int count; + u8 *values; + atomic64_t *remaining; +}; + +static long __dat_peek_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_get_cmma_state *state = walk->priv; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + state->values[gfn - walk->start] = pgste.usage | (pgste.nodat << 6); + pgste_set_unlock(ptep, pgste); + state->end = next; + + return 0; +} + +static long __dat_peek_cmma_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_get_cmma_state *state = walk->priv; + + if (crstep->h.i) + state->end = min(walk->end, next); + return 0; +} + +int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values) +{ + const struct dat_walk_ops ops = { + .pte_entry = __dat_peek_cmma_pte, + .pmd_entry = __dat_peek_cmma_crste, + .pud_entry = __dat_peek_cmma_crste, + .p4d_entry = __dat_peek_cmma_crste, + .pgd_entry = __dat_peek_cmma_crste, + }; + struct dat_get_cmma_state state = { .values = values, }; + int rc; + + rc = _dat_walk_gfn_range(start, start + *count, asce, &ops, DAT_WALK_DEFAULT, &state); + *count = state.end - start; + /* Return success if at least one value was saved, otherwise an error. */ + return (rc == -EFAULT && *count > 0) ? 0 : rc; +} + +static long __dat_get_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_get_cmma_state *state = walk->priv; + union pgste pgste; + + if (state->start != -1) { + if ((gfn - state->end) > KVM_S390_MAX_BIT_DISTANCE) + return 1; + if (gfn - state->start >= state->count) + return 1; + } + + if (!READ_ONCE(*pgste_of(ptep)).cmma_d) + return 0; + + pgste = pgste_get_lock(ptep); + if (pgste.cmma_d) { + if (state->start == -1) + state->start = gfn; + pgste.cmma_d = 0; + atomic64_dec(state->remaining); + state->values[gfn - state->start] = pgste.usage | pgste.nodat << 6; + state->end = next; + } + pgste_set_unlock(ptep, pgste); + return 0; +} + +int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem) +{ + const struct dat_walk_ops ops = { .pte_entry = __dat_get_cmma_pte, }; + struct dat_get_cmma_state state = { + .remaining = rem, + .values = values, + .count = *count, + .start = -1, + }; + + _dat_walk_gfn_range(*start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, &state); + + if (state.start == -1) { + *count = 0; + } else { + *count = state.end - state.start; + *start = state.start; + } + + return 0; +} + +struct dat_set_cmma_state { + unsigned long mask; + const u8 *bits; +}; + +static long __dat_set_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_set_cmma_state *state = walk->priv; + union pgste pgste, tmp; + + tmp.val = (state->bits[gfn - walk->start] << 24) & state->mask; + + pgste = pgste_get_lock(ptep); + pgste.usage = tmp.usage; + pgste.nodat = tmp.nodat; + pgste_set_unlock(ptep, pgste); + + return 0; +} + +/** + * dat_set_cmma_bits() - Set CMMA bits for a range of guest pages. + * @mc: Cache used for allocations. + * @asce: The ASCE of the guest. + * @gfn: The guest frame of the fist page whose CMMA bits are to set. + * @count: How many pages need to be processed. + * @mask: Which PGSTE bits should be set. + * @bits: Points to an array with the CMMA attributes. + * + * This function sets the CMMA attributes for the given pages. If the input + * buffer has zero length, no action is taken, otherwise the attributes are + * set and the mm->context.uses_cmm flag is set. + * + * Each byte in @bits contains new values for bits 32-39 of the PGSTE. + * Currently, only the fields NT and US are applied. + * + * Return: %0 in case of success, a negative error value otherwise. + */ +int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + unsigned long count, unsigned long mask, const uint8_t *bits) +{ + const struct dat_walk_ops ops = { .pte_entry = __dat_set_cmma_pte, }; + struct dat_set_cmma_state state = { .mask = mask, .bits = bits, }; + union crste *crstep; + union pte *ptep; + gfn_t cur; + int rc; + + for (cur = ALIGN_DOWN(gfn, _PAGE_ENTRIES); cur < gfn + count; cur += _PAGE_ENTRIES) { + rc = dat_entry_walk(mc, cur, asce, DAT_WALK_ALLOC, TABLE_TYPE_PAGE_TABLE, + &crstep, &ptep); + if (rc) + return rc; + } + return _dat_walk_gfn_range(gfn, gfn + count, asce, &ops, DAT_WALK_IGN_HOLES, &state); +} diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h new file mode 100644 index 000000000000..8f8278c44879 --- /dev/null +++ b/arch/s390/kvm/dat.h @@ -0,0 +1,976 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2024, 2025 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + */ + +#ifndef __KVM_S390_DAT_H +#define __KVM_S390_DAT_H + +#include <linux/radix-tree.h> +#include <linux/refcount.h> +#include <linux/io.h> +#include <linux/kvm_types.h> +#include <linux/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/dat-bits.h> + +/* + * Base address and length must be sent at the start of each block, therefore + * it's cheaper to send some clean data, as long as it's less than the size of + * two longs. + */ +#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) +/* For consistency */ +#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) + +#define _ASCE(x) ((union asce) { .val = (x), }) +#define NULL_ASCE _ASCE(0) + +enum { + _DAT_TOKEN_NONE = 0, + _DAT_TOKEN_PIC, +}; + +#define _CRSTE_TOK(l, t, p) ((union crste) { \ + .tok.i = 1, \ + .tok.tt = (l), \ + .tok.type = (t), \ + .tok.par = (p) \ + }) +#define _CRSTE_PIC(l, p) _CRSTE_TOK(l, _DAT_TOKEN_PIC, p) + +#define _CRSTE_HOLE(l) _CRSTE_PIC(l, PGM_ADDRESSING) +#define _CRSTE_EMPTY(l) _CRSTE_TOK(l, _DAT_TOKEN_NONE, 0) + +#define _PMD_EMPTY _CRSTE_EMPTY(TABLE_TYPE_SEGMENT) + +#define _PTE_TOK(t, p) ((union pte) { .tok.i = 1, .tok.type = (t), .tok.par = (p) }) +#define _PTE_EMPTY _PTE_TOK(_DAT_TOKEN_NONE, 0) + +/* This fake table type is used for page table walks (both for normal page tables and vSIE) */ +#define TABLE_TYPE_PAGE_TABLE -1 + +enum dat_walk_flags { + DAT_WALK_USES_SKEYS = 0x40, + DAT_WALK_CONTINUE = 0x20, + DAT_WALK_IGN_HOLES = 0x10, + DAT_WALK_SPLIT = 0x08, + DAT_WALK_ALLOC = 0x04, + DAT_WALK_ANY = 0x02, + DAT_WALK_LEAF = 0x01, + DAT_WALK_DEFAULT = 0 +}; + +#define DAT_WALK_SPLIT_ALLOC (DAT_WALK_SPLIT | DAT_WALK_ALLOC) +#define DAT_WALK_ALLOC_CONTINUE (DAT_WALK_CONTINUE | DAT_WALK_ALLOC) +#define DAT_WALK_LEAF_ALLOC (DAT_WALK_LEAF | DAT_WALK_ALLOC) + +union pte { + unsigned long val; + union page_table_entry h; + struct { + unsigned long :56; /* Hardware bits */ + unsigned long u : 1; /* Page unused */ + unsigned long s : 1; /* Special */ + unsigned long w : 1; /* Writable */ + unsigned long r : 1; /* Readable */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long sd: 1; /* Soft dirty */ + unsigned long pr: 1; /* Present */ + } s; + struct { + unsigned char hwbytes[7]; + unsigned char swbyte; + }; + union { + struct { + unsigned long type :16; /* Token type */ + unsigned long par :16; /* Token parameter */ + unsigned long :20; + unsigned long : 1; /* Must be 0 */ + unsigned long i : 1; /* Must be 1 */ + unsigned long : 2; + unsigned long : 7; + unsigned long pr : 1; /* Must be 0 */ + }; + struct { + unsigned long token:32; /* Token and parameter */ + unsigned long :32; + }; + } tok; +}; + +#define _SEGMENT_FR_MASK (_SEGMENT_MASK >> PAGE_SHIFT) +#define _REGION3_FR_MASK (_REGION3_MASK >> PAGE_SHIFT) +#define _PAGES_PER_SEGMENT _PAGE_ENTRIES +#define _PAGES_PER_REGION3 (_PAGES_PER_SEGMENT * _CRST_ENTRIES) + +/* Soft dirty, needed as macro for atomic operations on ptes */ +#define _PAGE_SD 0x002 + +/* Needed as macro to perform atomic operations */ +#define PGSTE_PCL_BIT 0x0080000000000000UL /* PCL lock, HW bit */ +#define PGSTE_CMMA_D_BIT 0x0000000000008000UL /* CMMA dirty soft-bit */ + +enum pgste_gps_usage { + PGSTE_GPS_USAGE_STABLE = 0, + PGSTE_GPS_USAGE_UNUSED, + PGSTE_GPS_USAGE_POT_VOLATILE, + PGSTE_GPS_USAGE_VOLATILE, +}; + +union pgste { + unsigned long val; + struct { + unsigned long acc : 4; + unsigned long fp : 1; + unsigned long : 3; + unsigned long pcl : 1; + unsigned long hr : 1; + unsigned long hc : 1; + unsigned long : 2; + unsigned long gr : 1; + unsigned long gc : 1; + unsigned long : 1; + unsigned long :16; /* val16 */ + unsigned long zero : 1; + unsigned long nodat : 1; + unsigned long : 4; + unsigned long usage : 2; + unsigned long : 8; + unsigned long cmma_d : 1; /* Dirty flag for CMMA bits */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 5; + unsigned long : 8; + }; + struct { + unsigned short hwbytes0; + unsigned short val16; /* Used to store chunked values, see dat_{s,g}et_ptval() */ + unsigned short hwbytes4; + unsigned char flags; /* Maps to the software bits */ + unsigned char hwbyte7; + } __packed; +}; + +union pmd { + unsigned long val; + union segment_table_entry h; + struct { + struct { + unsigned long :44; /* HW */ + unsigned long : 3; /* Unused */ + unsigned long : 1; /* HW */ + unsigned long s : 1; /* Special */ + unsigned long w : 1; /* Writable soft-bit */ + unsigned long r : 1; /* Readable soft-bit */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long : 3; /* HW */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 4; /* HW */ + unsigned long sd : 1; /* Soft-Dirty */ + unsigned long pr : 1; /* Present */ + } fc1; + } s; +}; + +union pud { + unsigned long val; + union region3_table_entry h; + struct { + struct { + unsigned long :33; /* HW */ + unsigned long :14; /* Unused */ + unsigned long : 1; /* HW */ + unsigned long s : 1; /* Special */ + unsigned long w : 1; /* Writable soft-bit */ + unsigned long r : 1; /* Readable soft-bit */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long : 3; /* HW */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 4; /* HW */ + unsigned long sd : 1; /* Soft-Dirty */ + unsigned long pr : 1; /* Present */ + } fc1; + } s; +}; + +union p4d { + unsigned long val; + union region2_table_entry h; +}; + +union pgd { + unsigned long val; + union region1_table_entry h; +}; + +union crste { + unsigned long val; + union { + struct { + unsigned long :52; + unsigned long : 1; + unsigned long fc: 1; + unsigned long p : 1; + unsigned long : 1; + unsigned long : 2; + unsigned long i : 1; + unsigned long : 1; + unsigned long tt: 2; + unsigned long : 2; + }; + struct { + unsigned long to:52; + unsigned long : 1; + unsigned long fc: 1; + unsigned long p : 1; + unsigned long : 1; + unsigned long tf: 2; + unsigned long i : 1; + unsigned long : 1; + unsigned long tt: 2; + unsigned long tl: 2; + } fc0; + struct { + unsigned long :47; + unsigned long av : 1; /* ACCF-Validity Control */ + unsigned long acc: 4; /* Access-Control Bits */ + unsigned long f : 1; /* Fetch-Protection Bit */ + unsigned long fc : 1; /* Format-Control */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ + unsigned long : 2; + unsigned long i : 1; /* Segment-Invalid Bit */ + unsigned long cs : 1; /* Common-Segment Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long : 2; + } fc1; + } h; + struct { + struct { + unsigned long :47; + unsigned long : 1; /* HW (should be 0) */ + unsigned long s : 1; /* Special */ + unsigned long w : 1; /* Writable */ + unsigned long r : 1; /* Readable */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long : 3; /* HW */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 4; /* HW */ + unsigned long sd : 1; /* Soft-Dirty */ + unsigned long pr : 1; /* Present */ + } fc1; + } s; + union { + struct { + unsigned long type :16; /* Token type */ + unsigned long par :16; /* Token parameter */ + unsigned long :26; + unsigned long i : 1; /* Must be 1 */ + unsigned long : 1; + unsigned long tt : 2; + unsigned long : 1; + unsigned long pr : 1; /* Must be 0 */ + }; + struct { + unsigned long token:32; /* Token and parameter */ + unsigned long :32; + }; + } tok; + union pmd pmd; + union pud pud; + union p4d p4d; + union pgd pgd; +}; + +union skey { + unsigned char skey; + struct { + unsigned char acc :4; + unsigned char fp :1; + unsigned char r :1; + unsigned char c :1; + unsigned char zero:1; + }; +}; + +static_assert(sizeof(union pgste) == sizeof(unsigned long)); +static_assert(sizeof(union pte) == sizeof(unsigned long)); +static_assert(sizeof(union pmd) == sizeof(unsigned long)); +static_assert(sizeof(union pud) == sizeof(unsigned long)); +static_assert(sizeof(union p4d) == sizeof(unsigned long)); +static_assert(sizeof(union pgd) == sizeof(unsigned long)); +static_assert(sizeof(union crste) == sizeof(unsigned long)); +static_assert(sizeof(union skey) == sizeof(char)); + +struct segment_table { + union pmd pmds[_CRST_ENTRIES]; +}; + +struct region3_table { + union pud puds[_CRST_ENTRIES]; +}; + +struct region2_table { + union p4d p4ds[_CRST_ENTRIES]; +}; + +struct region1_table { + union pgd pgds[_CRST_ENTRIES]; +}; + +struct crst_table { + union { + union crste crstes[_CRST_ENTRIES]; + struct segment_table segment; + struct region3_table region3; + struct region2_table region2; + struct region1_table region1; + }; +}; + +struct page_table { + union pte ptes[_PAGE_ENTRIES]; + union pgste pgstes[_PAGE_ENTRIES]; +}; + +static_assert(sizeof(struct crst_table) == _CRST_TABLE_SIZE); +static_assert(sizeof(struct page_table) == PAGE_SIZE); + +struct dat_walk; + +typedef long (*dat_walk_op)(union crste *crste, gfn_t gfn, gfn_t next, struct dat_walk *w); + +struct dat_walk_ops { + union { + dat_walk_op crste_ops[4]; + struct { + dat_walk_op pmd_entry; + dat_walk_op pud_entry; + dat_walk_op p4d_entry; + dat_walk_op pgd_entry; + }; + }; + long (*pte_entry)(union pte *pte, gfn_t gfn, gfn_t next, struct dat_walk *w); +}; + +struct dat_walk { + const struct dat_walk_ops *ops; + union crste *last; + union pte *last_pte; + union asce asce; + gfn_t start; + gfn_t end; + int flags; + void *priv; +}; + +struct ptval_param { + unsigned char offset : 6; + unsigned char len : 2; +}; + +/** + * _pte() - Useful constructor for union pte + * @pfn: the pfn this pte should point to. + * @writable: whether the pte should be writable. + * @dirty: whether the pte should be dirty. + * @special: whether the pte should be marked as special + * + * The pte is also marked as young and present. If the pte is marked as dirty, + * it gets marked as soft-dirty too. If the pte is not dirty, the hardware + * protect bit is set (independently of the write softbit); this way proper + * dirty tracking can be performed. + * + * Return: a union pte value. + */ +static inline union pte _pte(kvm_pfn_t pfn, bool writable, bool dirty, bool special) +{ + union pte res = { .val = PFN_PHYS(pfn) }; + + res.h.p = !dirty; + res.s.y = 1; + res.s.pr = 1; + res.s.w = writable; + res.s.d = dirty; + res.s.sd = dirty; + res.s.s = special; + return res; +} + +static inline union crste _crste_fc0(kvm_pfn_t pfn, int tt) +{ + union crste res = { .val = PFN_PHYS(pfn) }; + + res.h.tt = tt; + res.h.fc0.tl = _REGION_ENTRY_LENGTH; + res.h.fc0.tf = 0; + return res; +} + +/** + * _crste() - Useful constructor for union crste with FC=1 + * @pfn: the pfn this pte should point to. + * @tt: the table type + * @writable: whether the pte should be writable. + * @dirty: whether the pte should be dirty. + * + * The crste is also marked as young and present. If the crste is marked as + * dirty, it gets marked as soft-dirty too. If the crste is not dirty, the + * hardware protect bit is set (independently of the write softbit); this way + * proper dirty tracking can be performed. + * + * Return: a union crste value. + */ +static inline union crste _crste_fc1(kvm_pfn_t pfn, int tt, bool writable, bool dirty) +{ + union crste res = { .val = PFN_PHYS(pfn) & _SEGMENT_MASK }; + + res.h.tt = tt; + res.h.p = !dirty; + res.h.fc = 1; + res.s.fc1.y = 1; + res.s.fc1.pr = 1; + res.s.fc1.w = writable; + res.s.fc1.d = dirty; + res.s.fc1.sd = dirty; + return res; +} + +union essa_state { + unsigned char val; + struct { + unsigned char : 2; + unsigned char nodat : 1; + unsigned char exception : 1; + unsigned char usage : 2; + unsigned char content : 2; + }; +}; + +/** + * struct vsie_rmap - reverse mapping for shadow page table entries + * @next: pointer to next rmap in the list + * @r_gfn: virtual rmap address in the shadow guest address space + */ +struct vsie_rmap { + struct vsie_rmap *next; + union { + unsigned long val; + struct { + long level: 8; + unsigned long : 4; + unsigned long r_gfn:52; + }; + }; +}; + +static_assert(sizeof(struct vsie_rmap) == 2 * sizeof(long)); + +#define KVM_S390_MMU_CACHE_N_CRSTS 6 +#define KVM_S390_MMU_CACHE_N_PTS 2 +#define KVM_S390_MMU_CACHE_N_RMAPS 16 +struct kvm_s390_mmu_cache { + void *crsts[KVM_S390_MMU_CACHE_N_CRSTS]; + void *pts[KVM_S390_MMU_CACHE_N_PTS]; + void *rmaps[KVM_S390_MMU_CACHE_N_RMAPS]; + short int n_crsts; + short int n_pts; + short int n_rmaps; +}; + +struct guest_fault { + gfn_t gfn; /* Guest frame */ + kvm_pfn_t pfn; /* Host PFN */ + struct page *page; /* Host page */ + union pte *ptep; /* Used to resolve the fault, or NULL */ + union crste *crstep; /* Used to resolve the fault, or NULL */ + bool writable; /* Mapping is writable */ + bool write_attempt; /* Write access attempted */ + bool attempt_pfault; /* Attempt a pfault first */ + bool valid; /* This entry contains valid data */ + void (*callback)(struct guest_fault *f); + void *priv; +}; + +/* + * 0 1 2 3 4 5 6 7 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | | PGT_ADDR | + * 8 | VMADDR | | + * 16 | | + * 24 | | + */ +#define MKPTVAL(o, l) ((struct ptval_param) { .offset = (o), .len = ((l) + 1) / 2 - 1}) +#define PTVAL_PGT_ADDR MKPTVAL(4, 8) +#define PTVAL_VMADDR MKPTVAL(8, 6) + +union pgste __must_check __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, + gfn_t gfn, union asce asce, bool uses_skeys); +bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn, + union asce asce); +void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce); + +long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce, + const struct dat_walk_ops *ops, int flags, void *priv); + +int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags, + int walk_level, union crste **last, union pte **ptepp); +void dat_free_level(struct crst_table *table, bool owns_ptes); +struct crst_table *dat_alloc_crst_sleepable(unsigned long init); +int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype); +int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey); +int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + union skey skey, bool nq); +int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn, + union skey skey, union skey *oldkey, bool nq, bool mr, bool mc); +int dat_reset_reference_bit(union asce asce, gfn_t gfn); +long dat_reset_skeys(union asce asce, gfn_t start); + +unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param); +void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val); + +int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end, + u16 type, u16 param); +int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn); +bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end); + +int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty); +long dat_reset_cmma(union asce asce, gfn_t start_gfn); +int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values); +int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem); +int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + unsigned long count, unsigned long mask, const uint8_t *bits); + +int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc); + +#define GFP_KVM_S390_MMU_CACHE (GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN) + +static inline struct page_table *kvm_s390_mmu_cache_alloc_pt(struct kvm_s390_mmu_cache *mc) +{ + if (mc->n_pts) + return mc->pts[--mc->n_pts]; + return (void *)__get_free_page(GFP_KVM_S390_MMU_CACHE); +} + +static inline struct crst_table *kvm_s390_mmu_cache_alloc_crst(struct kvm_s390_mmu_cache *mc) +{ + if (mc->n_crsts) + return mc->crsts[--mc->n_crsts]; + return (void *)__get_free_pages(GFP_KVM_S390_MMU_CACHE | __GFP_COMP, CRST_ALLOC_ORDER); +} + +static inline struct vsie_rmap *kvm_s390_mmu_cache_alloc_rmap(struct kvm_s390_mmu_cache *mc) +{ + if (mc->n_rmaps) + return mc->rmaps[--mc->n_rmaps]; + return kzalloc_obj(struct vsie_rmap, GFP_KVM_S390_MMU_CACHE); +} + +static inline struct crst_table *crste_table_start(union crste *crstep) +{ + return (struct crst_table *)ALIGN_DOWN((unsigned long)crstep, _CRST_TABLE_SIZE); +} + +static inline struct page_table *pte_table_start(union pte *ptep) +{ + return (struct page_table *)ALIGN_DOWN((unsigned long)ptep, _PAGE_TABLE_SIZE); +} + +static inline bool crdte_crste(union crste *crstep, union crste old, union crste new, gfn_t gfn, + union asce asce) +{ + unsigned long dtt = 0x10 | new.h.tt << 2; + void *table = crste_table_start(crstep); + + return crdte(old.val, new.val, table, dtt, gfn_to_gpa(gfn), asce.val); +} + +/** + * idte_crste() - invalidate a crste entry using idte + * @crstep: pointer to the crste to be invalidated + * @gfn: a gfn mapped by the crste + * @opt: options for the idte instruction + * @asce: the asce + * @local: whether the operation is cpu-local + */ +static __always_inline void idte_crste(union crste *crstep, gfn_t gfn, unsigned long opt, + union asce asce, int local) +{ + unsigned long table_origin = __pa(crste_table_start(crstep)); + unsigned long gaddr = gfn_to_gpa(gfn) & HPAGE_MASK; + + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile("idte %[table_origin],0,%[gaddr],%[local]" + : "+m" (*crstep) + : [table_origin] "a" (table_origin), [gaddr] "a" (gaddr), + [local] "i" (local) + : "cc"); + } else { + /* flush with guest asce */ + asm volatile("idte %[table_origin],%[asce],%[gaddr_opt],%[local]" + : "+m" (*crstep) + : [table_origin] "a" (table_origin), [gaddr_opt] "a" (gaddr | opt), + [asce] "a" (asce.val), [local] "i" (local) + : "cc"); + } +} + +static inline void dat_init_pgstes(struct page_table *pt, unsigned long val) +{ + memset64((void *)pt->pgstes, val, PTRS_PER_PTE); +} + +static inline void dat_init_page_table(struct page_table *pt, unsigned long ptes, + unsigned long pgstes) +{ + memset64((void *)pt->ptes, ptes, PTRS_PER_PTE); + dat_init_pgstes(pt, pgstes); +} + +static inline gfn_t asce_end(union asce asce) +{ + return 1ULL << ((asce.dt + 1) * 11 + _SEGMENT_SHIFT - PAGE_SHIFT); +} + +#define _CRSTE(x) ((union crste) { .val = _Generic((x), \ + union pgd : (x).val, \ + union p4d : (x).val, \ + union pud : (x).val, \ + union pmd : (x).val, \ + union crste : (x).val)}) + +#define _CRSTEP(x) ((union crste *)_Generic((*(x)), \ + union pgd : (x), \ + union p4d : (x), \ + union pud : (x), \ + union pmd : (x), \ + union crste : (x))) + +#define _CRSTP(x) ((struct crst_table *)_Generic((*(x)), \ + struct crst_table : (x), \ + struct segment_table : (x), \ + struct region3_table : (x), \ + struct region2_table : (x), \ + struct region1_table : (x))) + +static inline bool asce_contains_gfn(union asce asce, gfn_t gfn) +{ + return gfn < asce_end(asce); +} + +static inline bool is_pmd(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_SEGMENT; +} + +static inline bool is_pud(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_REGION3; +} + +static inline bool is_p4d(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_REGION2; +} + +static inline bool is_pgd(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_REGION1; +} + +static inline phys_addr_t pmd_origin_large(union pmd pmd) +{ + return pmd.val & _SEGMENT_ENTRY_ORIGIN_LARGE; +} + +static inline phys_addr_t pud_origin_large(union pud pud) +{ + return pud.val & _REGION3_ENTRY_ORIGIN_LARGE; +} + +/** + * crste_origin_large() - Return the large frame origin of a large crste + * @crste: The crste whose origin is to be returned. Should be either a + * region-3 table entry or a segment table entry, in both cases with + * FC set to 1 (large pages). + * + * Return: The origin of the large frame pointed to by @crste, or -1 if the + * crste was not large (wrong table type, or FC==0) + */ +static inline phys_addr_t crste_origin_large(union crste crste) +{ + if (unlikely(!crste.h.fc || crste.h.tt > TABLE_TYPE_REGION3)) + return -1; + if (is_pmd(crste)) + return pmd_origin_large(crste.pmd); + return pud_origin_large(crste.pud); +} + +#define crste_origin(x) (_Generic((x), \ + union pmd : (x).val & _SEGMENT_ENTRY_ORIGIN, \ + union pud : (x).val & _REGION_ENTRY_ORIGIN, \ + union p4d : (x).val & _REGION_ENTRY_ORIGIN, \ + union pgd : (x).val & _REGION_ENTRY_ORIGIN)) + +static inline unsigned long pte_origin(union pte pte) +{ + return pte.val & PAGE_MASK; +} + +static inline bool pmd_prefix(union pmd pmd) +{ + return pmd.h.fc && pmd.s.fc1.prefix_notif; +} + +static inline bool pud_prefix(union pud pud) +{ + return pud.h.fc && pud.s.fc1.prefix_notif; +} + +static inline bool crste_leaf(union crste crste) +{ + return (crste.h.tt <= TABLE_TYPE_REGION3) && crste.h.fc; +} + +static inline bool crste_prefix(union crste crste) +{ + return crste_leaf(crste) && crste.s.fc1.prefix_notif; +} + +static inline bool crste_dirty(union crste crste) +{ + return crste_leaf(crste) && crste.s.fc1.d; +} + +static inline union pgste *pgste_of(union pte *pte) +{ + return (union pgste *)(pte + _PAGE_ENTRIES); +} + +static inline bool pte_hole(union pte pte) +{ + return pte.h.i && !pte.tok.pr && pte.tok.type != _DAT_TOKEN_NONE; +} + +static inline bool _crste_hole(union crste crste) +{ + return crste.h.i && !crste.tok.pr && crste.tok.type != _DAT_TOKEN_NONE; +} + +#define crste_hole(x) _crste_hole(_CRSTE(x)) + +static inline bool _crste_none(union crste crste) +{ + return crste.h.i && !crste.tok.pr && crste.tok.type == _DAT_TOKEN_NONE; +} + +#define crste_none(x) _crste_none(_CRSTE(x)) + +static inline phys_addr_t large_pud_to_phys(union pud pud, gfn_t gfn) +{ + return pud_origin_large(pud) | (gfn_to_gpa(gfn) & ~_REGION3_MASK); +} + +static inline phys_addr_t large_pmd_to_phys(union pmd pmd, gfn_t gfn) +{ + return pmd_origin_large(pmd) | (gfn_to_gpa(gfn) & ~_SEGMENT_MASK); +} + +static inline phys_addr_t large_crste_to_phys(union crste crste, gfn_t gfn) +{ + if (unlikely(!crste.h.fc || crste.h.tt > TABLE_TYPE_REGION3)) + return -1; + if (is_pmd(crste)) + return large_pmd_to_phys(crste.pmd, gfn); + return large_pud_to_phys(crste.pud, gfn); +} + +static inline bool cspg_crste(union crste *crstep, union crste old, union crste new) +{ + return cspg(&crstep->val, old.val, new.val); +} + +static inline struct page_table *dereference_pmd(union pmd pmd) +{ + return phys_to_virt(crste_origin(pmd)); +} + +static inline struct segment_table *dereference_pud(union pud pud) +{ + return phys_to_virt(crste_origin(pud)); +} + +static inline struct region3_table *dereference_p4d(union p4d p4d) +{ + return phys_to_virt(crste_origin(p4d)); +} + +static inline struct region2_table *dereference_pgd(union pgd pgd) +{ + return phys_to_virt(crste_origin(pgd)); +} + +static inline struct crst_table *_dereference_crste(union crste crste) +{ + if (unlikely(is_pmd(crste))) + return NULL; + return phys_to_virt(crste_origin(crste.pud)); +} + +#define dereference_crste(x) (_Generic((x), \ + union pud : _dereference_crste(_CRSTE(x)), \ + union p4d : _dereference_crste(_CRSTE(x)), \ + union pgd : _dereference_crste(_CRSTE(x)), \ + union crste : _dereference_crste(_CRSTE(x)))) + +static inline struct crst_table *dereference_asce(union asce asce) +{ + return phys_to_virt(asce.val & _ASCE_ORIGIN); +} + +static inline void asce_flush_tlb(union asce asce) +{ + __tlb_flush_idte(asce.val); +} + +static inline bool pgste_get_trylock(union pte *ptep, union pgste *res) +{ + union pgste *pgstep = pgste_of(ptep); + union pgste old_pgste; + + if (READ_ONCE(pgstep->val) & PGSTE_PCL_BIT) + return false; + old_pgste.val = __atomic64_or_barrier(PGSTE_PCL_BIT, &pgstep->val); + if (old_pgste.pcl) + return false; + old_pgste.pcl = 1; + *res = old_pgste; + return true; +} + +static inline union pgste pgste_get_lock(union pte *ptep) +{ + union pgste res; + + while (!pgste_get_trylock(ptep, &res)) + cpu_relax(); + return res; +} + +static inline void pgste_set_unlock(union pte *ptep, union pgste pgste) +{ + pgste.pcl = 0; + barrier(); + WRITE_ONCE(*pgste_of(ptep), pgste); +} + +static inline void dat_ptep_xchg(union pte *ptep, union pte new, gfn_t gfn, union asce asce, + bool has_skeys) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, asce, has_skeys); + pgste_set_unlock(ptep, pgste); +} + +static inline void dat_ptep_clear(union pte *ptep, gfn_t gfn, union asce asce, bool has_skeys) +{ + dat_ptep_xchg(ptep, _PTE_EMPTY, gfn, asce, has_skeys); +} + +static inline void dat_free_pt(struct page_table *pt) +{ + free_page((unsigned long)pt); +} + +static inline void _dat_free_crst(struct crst_table *table) +{ + free_pages((unsigned long)table, CRST_ALLOC_ORDER); +} + +#define dat_free_crst(x) _dat_free_crst(_CRSTP(x)) + +static inline void kvm_s390_free_mmu_cache(struct kvm_s390_mmu_cache *mc) +{ + if (!mc) + return; + while (mc->n_pts) + dat_free_pt(mc->pts[--mc->n_pts]); + while (mc->n_crsts) + _dat_free_crst(mc->crsts[--mc->n_crsts]); + while (mc->n_rmaps) + kfree(mc->rmaps[--mc->n_rmaps]); + kfree(mc); +} + +DEFINE_FREE(kvm_s390_mmu_cache, struct kvm_s390_mmu_cache *, if (_T) kvm_s390_free_mmu_cache(_T)) + +static inline struct kvm_s390_mmu_cache *kvm_s390_new_mmu_cache(void) +{ + struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL; + + mc = kzalloc_obj(*mc, GFP_KERNEL_ACCOUNT); + if (mc && !kvm_s390_mmu_cache_topup(mc)) + return_ptr(mc); + return NULL; +} + +static inline bool dat_pmdp_xchg_atomic(union pmd *pmdp, union pmd old, union pmd new, + gfn_t gfn, union asce asce) +{ + return dat_crstep_xchg_atomic(_CRSTEP(pmdp), _CRSTE(old), _CRSTE(new), gfn, asce); +} + +static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pud new, + gfn_t gfn, union asce asce) +{ + return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce); +} + +static inline union crste dat_crstep_clear_atomic(union crste *crstep, gfn_t gfn, union asce asce) +{ + union crste oldcrste, empty = _CRSTE_EMPTY(crstep->h.tt); + + do { + oldcrste = READ_ONCE(*crstep); + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, empty, gfn, asce)); + return oldcrste; +} + +static inline int get_level(union crste *crstep, union pte *ptep) +{ + return ptep ? TABLE_TYPE_PAGE_TABLE : crstep->h.tt; +} + +static inline int dat_delete_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, + unsigned long npages) +{ + return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_PIC, PGM_ADDRESSING); +} + +static inline int dat_create_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, + unsigned long npages) +{ + return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_NONE, 0); +} + +static inline bool crste_is_ucas(union crste crste) +{ + return is_pmd(crste) && crste.h.i && crste.h.fc0.tl == 1 && crste.h.fc == 0; +} + +#endif /* __KVM_S390_DAT_H */ diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 74f73141f9b9..d89d1c381522 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -10,12 +10,30 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> -#include <asm/gmap.h> +#include <asm/gmap_helpers.h> #include <asm/virtio-ccw.h> #include "kvm-s390.h" #include "trace.h" #include "trace-s390.h" #include "gaccess.h" +#include "gmap.h" + +static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end) +{ + struct kvm_memslot_iter iter; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + unsigned long start, end; + + slots = kvm_vcpu_memslots(vcpu); + + kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { + slot = iter.slot; + start = __gfn_to_hva_memslot(slot, max(gfn_start, slot->base_gfn)); + end = __gfn_to_hva_memslot(slot, min(gfn_end, slot->base_gfn + slot->npages)); + gmap_helper_discard(vcpu->kvm->mm, start, end); + } +} static int diag_release_pages(struct kvm_vcpu *vcpu) { @@ -32,12 +50,13 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 5, "diag release pages %lX %lX", start, end); + mmap_read_lock(vcpu->kvm->mm); /* * We checked for start >= end above, so lets check for the * fast path (no prefix swap page involved) */ if (end <= prefix || start >= prefix + 2 * PAGE_SIZE) { - gmap_discard(vcpu->arch.gmap, start, end); + do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(end)); } else { /* * This is slow path. gmap_discard will check for start @@ -45,13 +64,14 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) * prefix and let gmap_discard make some of these calls * NOPs. */ - gmap_discard(vcpu->arch.gmap, start, prefix); + do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(prefix)); if (start <= prefix) - gmap_discard(vcpu->arch.gmap, 0, PAGE_SIZE); + do_discard_gfn_range(vcpu, 0, 1); if (end > prefix + PAGE_SIZE) - gmap_discard(vcpu->arch.gmap, PAGE_SIZE, 2 * PAGE_SIZE); - gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end); + do_discard_gfn_range(vcpu, 1, 2); + do_discard_gfn_range(vcpu, gpa_to_gfn(prefix) + 2, gpa_to_gfn(end)); } + mmap_read_unlock(vcpu->kvm->mm); return 0; } diff --git a/arch/s390/kvm/faultin.c b/arch/s390/kvm/faultin.c new file mode 100644 index 000000000000..ddf0ca71f374 --- /dev/null +++ b/arch/s390/kvm/faultin.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest fault handling. + * + * Copyright IBM Corp. 2025 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + */ +#include <linux/kvm_types.h> +#include <linux/kvm_host.h> + +#include "gmap.h" +#include "trace.h" +#include "faultin.h" + +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu); + +/* + * kvm_s390_faultin_gfn() - handle a dat fault. + * @vcpu: The vCPU whose gmap is to be fixed up, or NULL if operating on the VM. + * @kvm: The VM whose gmap is to be fixed up, or NULL if operating on a vCPU. + * @f: The guest fault that needs to be resolved. + * + * Return: + * * 0 on success + * * < 0 in case of error + * * > 0 in case of guest exceptions + * + * Context: + * * The mm lock must not be held before calling + * * kvm->srcu must be held + * * may sleep + */ +int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f) +{ + struct kvm_s390_mmu_cache *local_mc __free(kvm_s390_mmu_cache) = NULL; + struct kvm_s390_mmu_cache *mc = NULL; + struct kvm_memory_slot *slot; + unsigned long inv_seq; + int foll, rc = 0; + + foll = f->write_attempt ? FOLL_WRITE : 0; + foll |= f->attempt_pfault ? FOLL_NOWAIT : 0; + + if (vcpu) { + kvm = vcpu->kvm; + mc = vcpu->arch.mc; + } + + lockdep_assert_held(&kvm->srcu); + + scoped_guard(read_lock, &kvm->mmu_lock) { + if (gmap_try_fixup_minor(kvm->arch.gmap, f) == 0) + return 0; + } + + while (1) { + f->valid = false; + inv_seq = kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); + + if (vcpu) + slot = kvm_vcpu_gfn_to_memslot(vcpu, f->gfn); + else + slot = gfn_to_memslot(kvm, f->gfn); + f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page); + + /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT). */ + if (f->pfn == KVM_PFN_ERR_NEEDS_IO) { + if (unlikely(!f->attempt_pfault)) + return -EAGAIN; + if (unlikely(!vcpu)) + return -EINVAL; + trace_kvm_s390_major_guest_pfault(vcpu); + if (kvm_arch_setup_async_pf(vcpu)) + return 0; + vcpu->stat.pfault_sync++; + /* Could not setup async pfault, try again synchronously. */ + foll &= ~FOLL_NOWAIT; + f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page); + } + + /* Access outside memory, addressing exception. */ + if (is_noslot_pfn(f->pfn)) + return PGM_ADDRESSING; + /* Signal pending: try again. */ + if (f->pfn == KVM_PFN_ERR_SIGPENDING) + return -EAGAIN; + /* Check if it's read-only memory; don't try to actually handle that case. */ + if (f->pfn == KVM_PFN_ERR_RO_FAULT) + return -EOPNOTSUPP; + /* Any other error. */ + if (is_error_pfn(f->pfn)) + return -EFAULT; + + if (!mc) { + local_mc = kvm_s390_new_mmu_cache(); + if (!local_mc) + return -ENOMEM; + mc = local_mc; + } + + /* Loop, will automatically release the faulted page. */ + if (mmu_invalidate_retry_gfn_unsafe(kvm, inv_seq, f->gfn)) { + kvm_release_faultin_page(kvm, f->page, true, false); + continue; + } + + scoped_guard(read_lock, &kvm->mmu_lock) { + if (!mmu_invalidate_retry_gfn(kvm, inv_seq, f->gfn)) { + f->valid = true; + rc = gmap_link(mc, kvm->arch.gmap, f, slot); + kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt); + f->page = NULL; + } + } + kvm_release_faultin_page(kvm, f->page, true, false); + + if (rc == -ENOMEM) { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + } else if (rc != -EAGAIN) { + return rc; + } + } +} + +int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w) +{ + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + int foll = w ? FOLL_WRITE : 0; + + f->write_attempt = w; + f->gfn = gfn; + f->pfn = __kvm_faultin_pfn(slot, gfn, foll, &f->writable, &f->page); + if (is_noslot_pfn(f->pfn)) + return PGM_ADDRESSING; + if (is_sigpending_pfn(f->pfn)) + return -EINTR; + if (f->pfn == KVM_PFN_ERR_NEEDS_IO) + return -EAGAIN; + if (is_error_pfn(f->pfn)) + return -EFAULT; + + f->valid = true; + return 0; +} diff --git a/arch/s390/kvm/faultin.h b/arch/s390/kvm/faultin.h new file mode 100644 index 000000000000..f86176d2769c --- /dev/null +++ b/arch/s390/kvm/faultin.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM guest fault handling. + * + * Copyright IBM Corp. 2025 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + */ + +#ifndef __KVM_S390_FAULTIN_H +#define __KVM_S390_FAULTIN_H + +#include <linux/kvm_host.h> + +#include "dat.h" + +int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f); +int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w); + +static inline int kvm_s390_faultin_gfn_simple(struct kvm_vcpu *vcpu, struct kvm *kvm, + gfn_t gfn, bool wr) +{ + struct guest_fault f = { .gfn = gfn, .write_attempt = wr, }; + + return kvm_s390_faultin_gfn(vcpu, kvm, &f); +} + +static inline int kvm_s390_get_guest_page_and_read_gpa(struct kvm *kvm, struct guest_fault *f, + gpa_t gaddr, unsigned long *val) +{ + int rc; + + rc = kvm_s390_get_guest_page(kvm, f, gpa_to_gfn(gaddr), false); + if (rc) + return rc; + + *val = *(unsigned long *)phys_to_virt(pfn_to_phys(f->pfn) | offset_in_page(gaddr)); + + return 0; +} + +static inline void kvm_s390_release_multiple(struct kvm *kvm, struct guest_fault *guest_faults, + int n, bool ignore) +{ + int i; + + for (i = 0; i < n; i++) { + kvm_release_faultin_page(kvm, guest_faults[i].page, ignore, + guest_faults[i].write_attempt); + guest_faults[i].page = NULL; + } +} + +static inline bool kvm_s390_multiple_faults_need_retry(struct kvm *kvm, unsigned long seq, + struct guest_fault *guest_faults, int n, + bool unsafe) +{ + int i; + + for (i = 0; i < n; i++) { + if (!guest_faults[i].valid) + continue; + if (unsafe && mmu_invalidate_retry_gfn_unsafe(kvm, seq, guest_faults[i].gfn)) + return true; + if (!unsafe && mmu_invalidate_retry_gfn(kvm, seq, guest_faults[i].gfn)) + return true; + } + return false; +} + +static inline int kvm_s390_get_guest_pages(struct kvm *kvm, struct guest_fault *guest_faults, + gfn_t start, int n_pages, bool write_attempt) +{ + int i, rc; + + for (i = 0; i < n_pages; i++) { + rc = kvm_s390_get_guest_page(kvm, guest_faults + i, start + i, write_attempt); + if (rc) + break; + } + return rc; +} + +#define kvm_s390_release_faultin_array(kvm, array, ignore) \ + kvm_s390_release_multiple(kvm, array, ARRAY_SIZE(array), ignore) + +#define kvm_s390_array_needs_retry_unsafe(kvm, seq, array) \ + kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), true) + +#define kvm_s390_array_needs_retry_safe(kvm, seq, array) \ + kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), false) + +#endif /* __KVM_S390_FAULTIN_H */ diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index f6fded15633a..b07accd19618 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -11,40 +11,43 @@ #include <linux/err.h> #include <linux/pgtable.h> #include <linux/bitfield.h> +#include <linux/kvm_host.h> +#include <linux/kvm_types.h> +#include <asm/diag.h> #include <asm/access-regs.h> #include <asm/fault.h> -#include <asm/gmap.h> #include <asm/dat-bits.h> #include "kvm-s390.h" +#include "dat.h" #include "gmap.h" #include "gaccess.h" +#include "faultin.h" -/* - * vaddress union in order to easily decode a virtual address into its - * region first index, region second index etc. parts. - */ -union vaddress { - unsigned long addr; - struct { - unsigned long rfx : 11; - unsigned long rsx : 11; - unsigned long rtx : 11; - unsigned long sx : 11; - unsigned long px : 8; - unsigned long bx : 12; - }; - struct { - unsigned long rfx01 : 2; - unsigned long : 9; - unsigned long rsx01 : 2; - unsigned long : 9; - unsigned long rtx01 : 2; - unsigned long : 9; - unsigned long sx01 : 2; - unsigned long : 29; - }; +#define GMAP_SHADOW_FAKE_TABLE 1ULL + +union dat_table_entry { + unsigned long val; + union region1_table_entry pgd; + union region2_table_entry p4d; + union region3_table_entry pud; + union segment_table_entry pmd; + union page_table_entry pte; }; +#define WALK_N_ENTRIES 7 +#define LEVEL_MEM -2 +struct pgtwalk { + struct guest_fault raw_entries[WALK_N_ENTRIES]; + gpa_t last_addr; + int level; + bool p; +}; + +static inline struct guest_fault *get_entries(struct pgtwalk *w) +{ + return w->raw_entries - LEVEL_MEM; +} + /* * raddress union which will contain the result (real or absolute address) * after a page table walk. The rfaa, sfaa and pfra members are used to @@ -106,16 +109,33 @@ struct aste { /* .. more fields there */ }; +union oac { + unsigned int val; + struct { + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac1; + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac2; + }; +}; + int ipte_lock_held(struct kvm *kvm) { - if (sclp.has_siif) { - int rc; + if (sclp.has_siif) + return kvm->arch.sca->ipte_control.kh != 0; - read_lock(&kvm->arch.sca_lock); - rc = kvm_s390_get_ipte_control(kvm)->kh != 0; - read_unlock(&kvm->arch.sca_lock); - return rc; - } return kvm->arch.ipte_lock_count != 0; } @@ -128,19 +148,16 @@ static void ipte_lock_simple(struct kvm *kvm) if (kvm->arch.ipte_lock_count > 1) goto out; retry: - read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { if (old.k) { - read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } new = old; new.k = 1; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); out: mutex_unlock(&kvm->arch.ipte_mutex); } @@ -153,14 +170,12 @@ static void ipte_unlock_simple(struct kvm *kvm) kvm->arch.ipte_lock_count--; if (kvm->arch.ipte_lock_count) goto out; - read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { new = old; new.k = 0; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); wake_up(&kvm->arch.ipte_wq); out: mutex_unlock(&kvm->arch.ipte_mutex); @@ -171,12 +186,10 @@ static void ipte_lock_siif(struct kvm *kvm) union ipte_control old, new, *ic; retry: - read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { if (old.kg) { - read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } @@ -184,15 +197,13 @@ retry: new.k = 1; new.kh++; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); } static void ipte_unlock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; - read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { new = old; @@ -200,7 +211,6 @@ static void ipte_unlock_siif(struct kvm *kvm) if (!new.kh) new.k = 0; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); if (!new.kh) wake_up(&kvm->arch.ipte_wq); } @@ -318,7 +328,7 @@ enum prot_type { PROT_TYPE_DAT = 3, PROT_TYPE_IEP = 4, /* Dummy value for passing an initialized value when code != PGM_PROTECTION */ - PROT_NONE, + PROT_TYPE_DUMMY, }; static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, @@ -334,7 +344,7 @@ static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (code) { case PGM_PROTECTION: switch (prot) { - case PROT_NONE: + case PROT_TYPE_DUMMY: /* We should never get here, acts like termination */ WARN_ON_ONCE(1); break; @@ -437,7 +447,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) } /** - * guest_translate - translate a guest virtual into a guest absolute address + * guest_translate_gva() - translate a guest virtual into a guest absolute address * @vcpu: virtual cpu * @gva: guest virtual address * @gpa: points to where guest physical (absolute) address should be stored @@ -457,9 +467,9 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) * the returned value is the program interruption code as defined * by the architecture */ -static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, - unsigned long *gpa, const union asce asce, - enum gacc_mode mode, enum prot_type *prot) +static unsigned long guest_translate_gva(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long *gpa, const union asce asce, + enum gacc_mode mode, enum prot_type *prot) { union vaddress vaddr = {.addr = gva}; union raddress raddr = {.addr = gva}; @@ -640,31 +650,19 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int vm_check_access_key(struct kvm *kvm, u8 access_key, - enum gacc_mode mode, gpa_t gpa) +static int vm_check_access_key_gpa(struct kvm *kvm, u8 access_key, + enum gacc_mode mode, gpa_t gpa) { - u8 storage_key, access_control; - bool fetch_protected; - unsigned long hva; + union skey storage_key; int r; - if (access_key == 0) - return 0; - - hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); - if (kvm_is_error_hva(hva)) - return PGM_ADDRESSING; - - mmap_read_lock(current->mm); - r = get_guest_storage_key(current->mm, hva, &storage_key); - mmap_read_unlock(current->mm); + scoped_guard(read_lock, &kvm->mmu_lock) + r = dat_get_storage_key(kvm->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key); if (r) return r; - access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); - if (access_control == access_key) + if (access_key == 0 || storage_key.acc == access_key) return 0; - fetch_protected = storage_key & _PAGE_FP_BIT; - if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected) + if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !storage_key.fp) return 0; return PGM_PROTECTION; } @@ -703,12 +701,11 @@ static bool storage_prot_override_applies(u8 access_control) return access_control == PAGE_SPO_ACC; } -static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key, - enum gacc_mode mode, union asce asce, gpa_t gpa, - unsigned long ga, unsigned int len) +static int vcpu_check_access_key_gpa(struct kvm_vcpu *vcpu, u8 access_key, + enum gacc_mode mode, union asce asce, gpa_t gpa, + unsigned long ga, unsigned int len) { - u8 storage_key, access_control; - unsigned long hva; + union skey storage_key; int r; /* access key 0 matches any storage key -> allow */ @@ -718,26 +715,23 @@ static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key, * caller needs to ensure that gfn is accessible, so we can * assume that this cannot fail */ - hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa)); - mmap_read_lock(current->mm); - r = get_guest_storage_key(current->mm, hva, &storage_key); - mmap_read_unlock(current->mm); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + r = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key); if (r) return r; - access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); /* access key matches storage key -> allow */ - if (access_control == access_key) + if (storage_key.acc == access_key) return 0; if (mode == GACC_FETCH || mode == GACC_IFETCH) { /* it is a fetch and fetch protection is off -> allow */ - if (!(storage_key & _PAGE_FP_BIT)) + if (!storage_key.fp) return 0; if (fetch_prot_override_applicable(vcpu, mode, asce) && fetch_prot_override_applies(ga, len)) return 0; } if (storage_prot_override_applicable(vcpu) && - storage_prot_override_applies(access_control)) + storage_prot_override_applies(storage_key.acc)) return 0; return PGM_PROTECTION; } @@ -797,20 +791,19 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, PROT_TYPE_LA); if (psw_bits(*psw).dat) { - rc = guest_translate(vcpu, ga, &gpa, asce, mode, &prot); + rc = guest_translate_gva(vcpu, ga, &gpa, asce, mode, &prot); if (rc < 0) return rc; } else { gpa = kvm_s390_real_to_abs(vcpu, ga); if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) { rc = PGM_ADDRESSING; - prot = PROT_NONE; + prot = PROT_TYPE_DUMMY; } } if (rc) return trans_exc(vcpu, rc, ga, ar, mode, prot); - rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga, - fragment_len); + rc = vcpu_check_access_key_gpa(vcpu, access_key, mode, asce, gpa, ga, fragment_len); if (rc) return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC); if (gpas) @@ -822,8 +815,8 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return 0; } -static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, - void *data, unsigned int len) +static int access_guest_page_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len) { const unsigned int offset = offset_in_page(gpa); const gfn_t gfn = gpa_to_gfn(gpa); @@ -838,38 +831,79 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, return rc; } -static int -access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, - void *data, unsigned int len, u8 access_key) +static int mvcos_key(void *to, const void *from, unsigned long size, u8 dst_key, u8 src_key) { - struct kvm_memory_slot *slot; - bool writable; - gfn_t gfn; - hva_t hva; - int rc; + union oac spec = { + .oac1.key = dst_key, + .oac1.k = !!dst_key, + .oac2.key = src_key, + .oac2.k = !!src_key, + }; + int exception = PGM_PROTECTION; + + asm_inline volatile( + " lr %%r0,%[spec]\n" + "0: mvcos %[to],%[from],%[size]\n" + "1: lhi %[exc],0\n" + "2:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + : [size] "+d" (size), [to] "=Q" (*(char *)to), [exc] "+d" (exception) + : [spec] "d" (spec.val), [from] "Q" (*(const char *)from) + : "memory", "cc", "0"); + return exception; +} - gfn = gpa >> PAGE_SHIFT; - slot = gfn_to_memslot(kvm, gfn); - hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); +struct acc_page_key_context { + void *data; + int exception; + unsigned short offset; + unsigned short len; + bool store; + u8 access_key; +}; - if (kvm_is_error_hva(hva)) - return PGM_ADDRESSING; - /* - * Check if it's a ro memslot, even tho that can't occur (they're unsupported). - * Don't try to actually handle that case. - */ - if (!writable && mode == GACC_STORE) - return -EOPNOTSUPP; - hva += offset_in_page(gpa); - if (mode == GACC_STORE) - rc = copy_to_user_key((void __user *)hva, data, len, access_key); +static void _access_guest_page_with_key_gpa(struct guest_fault *f) +{ + struct acc_page_key_context *context = f->priv; + void *ptr; + int r; + + ptr = __va(PFN_PHYS(f->pfn) | context->offset); + + if (context->store) + r = mvcos_key(ptr, context->data, context->len, context->access_key, 0); else - rc = copy_from_user_key(data, (void __user *)hva, len, access_key); + r = mvcos_key(context->data, ptr, context->len, 0, context->access_key); + + context->exception = r; +} + +static int access_guest_page_with_key_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len, u8 acc) +{ + struct acc_page_key_context context = { + .offset = offset_in_page(gpa), + .len = len, + .data = data, + .access_key = acc, + .store = mode == GACC_STORE, + }; + struct guest_fault fault = { + .gfn = gpa_to_gfn(gpa), + .priv = &context, + .write_attempt = mode == GACC_STORE, + .callback = _access_guest_page_with_key_gpa, + }; + int rc; + + if (KVM_BUG_ON((len + context.offset) > PAGE_SIZE, kvm)) + return -EINVAL; + + rc = kvm_s390_faultin_gfn(NULL, kvm, &fault); if (rc) - return PGM_PROTECTION; - if (mode == GACC_STORE) - mark_page_dirty_in_slot(kvm, slot, gfn); - return 0; + return rc; + return context.exception; } int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, @@ -881,7 +915,7 @@ int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, while (min(PAGE_SIZE - offset, len) > 0) { fragment_len = min(PAGE_SIZE - offset, len); - rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key); + rc = access_guest_page_with_key_gpa(kvm, mode, gpa, data, fragment_len, access_key); if (rc) return rc; offset = 0; @@ -941,15 +975,14 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, for (idx = 0; idx < nr_pages; idx++) { fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len); if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) { - rc = access_guest_page(vcpu->kvm, mode, gpas[idx], - data, fragment_len); + rc = access_guest_page_gpa(vcpu->kvm, mode, gpas[idx], data, fragment_len); } else { - rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], - data, fragment_len, access_key); + rc = access_guest_page_with_key_gpa(vcpu->kvm, mode, gpas[idx], + data, fragment_len, access_key); } if (rc == PGM_PROTECTION && try_storage_prot_override) - rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], - data, fragment_len, PAGE_SPO_ACC); + rc = access_guest_page_with_key_gpa(vcpu->kvm, mode, gpas[idx], + data, fragment_len, PAGE_SPO_ACC); if (rc) break; len -= fragment_len; @@ -962,7 +995,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, if (rc == PGM_PROTECTION) prot = PROT_TYPE_KEYC; else - prot = PROT_NONE; + prot = PROT_TYPE_DUMMY; rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate); } out_unlock: @@ -983,7 +1016,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, while (len && !rc) { gpa = kvm_s390_real_to_abs(vcpu, gra); fragment_len = min(PAGE_SIZE - offset_in_page(gpa), len); - rc = access_guest_page(vcpu->kvm, mode, gpa, data, fragment_len); + rc = access_guest_page_gpa(vcpu->kvm, mode, gpa, data, fragment_len); len -= fragment_len; gra += fragment_len; data += fragment_len; @@ -994,17 +1027,101 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, } /** + * __cmpxchg_with_key() - Perform cmpxchg, honoring storage keys. + * @ptr: Address of value to compare to *@old and exchange with + * @new. Must be aligned to @size. + * @old: Old value. Compared to the content pointed to by @ptr in order to + * determine if the exchange occurs. The old value read from *@ptr is + * written here. + * @new: New value to place at *@ptr. + * @size: Size of the operation in bytes, may only be a power of two up to 16. + * @access_key: Access key to use for checking storage key protection. + * + * Perform a cmpxchg on guest memory, honoring storage key protection. + * @access_key alone determines how key checking is performed, neither + * storage-protection-override nor fetch-protection-override apply. + * In case of an exception *@uval is set to zero. + * + * Return: + * * %0: cmpxchg executed successfully + * * %1: cmpxchg executed unsuccessfully + * * %PGM_PROTECTION: an exception happened when trying to access *@ptr + * * %-EAGAIN: maxed out number of retries (byte and short only) + * * %-EINVAL: invalid value for @size + */ +static int __cmpxchg_with_key(union kvm_s390_quad *ptr, union kvm_s390_quad *old, + union kvm_s390_quad new, int size, u8 access_key) +{ + union kvm_s390_quad tmp = { .sixteen = 0 }; + int rc; + + /* + * The cmpxchg_key macro depends on the type of "old", so we need + * a case for each valid length and get some code duplication as long + * as we don't introduce a new macro. + */ + switch (size) { + case 1: + rc = __cmpxchg_key1(&ptr->one, &tmp.one, old->one, new.one, access_key); + break; + case 2: + rc = __cmpxchg_key2(&ptr->two, &tmp.two, old->two, new.two, access_key); + break; + case 4: + rc = __cmpxchg_key4(&ptr->four, &tmp.four, old->four, new.four, access_key); + break; + case 8: + rc = __cmpxchg_key8(&ptr->eight, &tmp.eight, old->eight, new.eight, access_key); + break; + case 16: + rc = __cmpxchg_key16(&ptr->sixteen, &tmp.sixteen, old->sixteen, new.sixteen, + access_key); + break; + default: + return -EINVAL; + } + if (!rc && memcmp(&tmp, old, size)) + rc = 1; + *old = tmp; + /* + * Assume that the fault is caused by protection, either key protection + * or user page write protection. + */ + if (rc == -EFAULT) + rc = PGM_PROTECTION; + return rc; +} + +struct cmpxchg_key_context { + union kvm_s390_quad new; + union kvm_s390_quad *old; + int exception; + unsigned short offset; + u8 access_key; + u8 len; +}; + +static void _cmpxchg_guest_abs_with_key(struct guest_fault *f) +{ + struct cmpxchg_key_context *context = f->priv; + + context->exception = __cmpxchg_with_key(__va(PFN_PHYS(f->pfn) | context->offset), + context->old, context->new, context->len, + context->access_key); +} + +/** * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address. * @kvm: Virtual machine instance. * @gpa: Absolute guest address of the location to be changed. * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a * non power of two will result in failure. - * @old_addr: Pointer to old value. If the location at @gpa contains this value, - * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() - * *@old_addr contains the value at @gpa before the attempt to - * exchange the value. + * @old: Pointer to old value. If the location at @gpa contains this value, + * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() + * *@old contains the value at @gpa before the attempt to + * exchange the value. * @new: The value to place at @gpa. - * @access_key: The access key to use for the guest access. + * @acc: The access key to use for the guest access. * @success: output value indicating if an exchange occurred. * * Atomically exchange the value at @gpa by @new, if it contains *@old. @@ -1017,89 +1134,36 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * * -EAGAIN: transient failure (len 1 or 2) * * -EOPNOTSUPP: read-only memslot (should never occur) */ -int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, - __uint128_t *old_addr, __uint128_t new, - u8 access_key, bool *success) +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old, + union kvm_s390_quad new, u8 acc, bool *success) { - gfn_t gfn = gpa_to_gfn(gpa); - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - bool writable; - hva_t hva; - int ret; - - if (!IS_ALIGNED(gpa, len)) - return -EINVAL; - - hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); - if (kvm_is_error_hva(hva)) - return PGM_ADDRESSING; - /* - * Check if it's a read-only memslot, even though that cannot occur - * since those are unsupported. - * Don't try to actually handle that case. - */ - if (!writable) - return -EOPNOTSUPP; - - hva += offset_in_page(gpa); - /* - * The cmpxchg_user_key macro depends on the type of "old", so we need - * a case for each valid length and get some code duplication as long - * as we don't introduce a new macro. - */ - switch (len) { - case 1: { - u8 old; - - ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - case 2: { - u16 old; - - ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - case 4: { - u32 old; - - ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - case 8: { - u64 old; + struct cmpxchg_key_context context = { + .old = old, + .new = new, + .offset = offset_in_page(gpa), + .len = len, + .access_key = acc, + }; + struct guest_fault fault = { + .gfn = gpa_to_gfn(gpa), + .priv = &context, + .write_attempt = true, + .callback = _cmpxchg_guest_abs_with_key, + }; + int rc; - ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - case 16: { - __uint128_t old; + lockdep_assert_held(&kvm->srcu); - ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - default: + if (len > 16 || !IS_ALIGNED(gpa, len)) return -EINVAL; - } - if (*success) - mark_page_dirty_in_slot(kvm, slot, gfn); - /* - * Assume that the fault is caused by protection, either key protection - * or user page write protection. - */ - if (ret == -EFAULT) - ret = PGM_PROTECTION; - return ret; + + rc = kvm_s390_faultin_gfn(NULL, kvm, &fault); + if (rc) + return rc; + *success = !context.exception; + if (context.exception == 1) + return 0; + return context.exception; } /** @@ -1174,7 +1238,7 @@ int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, while (length && !rc) { fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length); - rc = vm_check_access_key(kvm, access_key, mode, gpa); + rc = vm_check_access_key_gpa(kvm, access_key, mode, gpa); length -= fragment_len; gpa += fragment_len; } @@ -1201,304 +1265,409 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) } /** - * kvm_s390_shadow_tables - walk the guest page table and create shadow tables - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @pgt: pointer to the beginning of the page table for the given address if - * successful (return value 0), or to the first invalid DAT entry in - * case of exceptions (return value > 0) - * @dat_protection: referenced memory is write protected - * @fake: pgt references contiguous guest memory block, not a pgtable + * walk_guest_tables() - Walk the guest page table and pin the dat tables. + * @sg: Pointer to the shadow guest address space structure. + * @saddr: Faulting address in the shadow gmap. + * @w: Will be filled with information on the pinned pages. + * @wr: Wndicates a write access if true. + * + * Return: + * * %0 in case of success, + * * a PIC code > 0 in case the address translation fails + * * an error code < 0 if other errors happen in the host */ -static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, - int *fake) +static int walk_guest_tables(struct gmap *sg, unsigned long saddr, struct pgtwalk *w, bool wr) { - struct kvm *kvm; - struct gmap *parent; - union asce asce; + struct gmap *parent = sg->parent; + struct guest_fault *entries; + union dat_table_entry table; union vaddress vaddr; unsigned long ptr; + struct kvm *kvm; + union asce asce; int rc; - *fake = 0; - *dat_protection = 0; - kvm = sg->private; - parent = sg->parent; + if (!parent) + return -EAGAIN; + kvm = parent->kvm; + WARN_ON(!kvm); + asce = sg->guest_asce; + entries = get_entries(w); + + w->level = LEVEL_MEM; + w->last_addr = saddr; + if (asce.r) + return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, gpa_to_gfn(saddr), false); + vaddr.addr = saddr; - asce.val = sg->orig_asce; ptr = asce.rsto * PAGE_SIZE; - if (asce.r) { - *fake = 1; - ptr = 0; - asce.dt = ASCE_TYPE_REGION1; - } + + if (!asce_contains_gfn(asce, gpa_to_gfn(saddr))) + return PGM_ASCE_TYPE; switch (asce.dt) { case ASCE_TYPE_REGION1: - if (vaddr.rfx01 > asce.tl && !*fake) + if (vaddr.rfx01 > asce.tl) return PGM_REGION_FIRST_TRANS; break; case ASCE_TYPE_REGION2: - if (vaddr.rfx) - return PGM_ASCE_TYPE; if (vaddr.rsx01 > asce.tl) return PGM_REGION_SECOND_TRANS; break; case ASCE_TYPE_REGION3: - if (vaddr.rfx || vaddr.rsx) - return PGM_ASCE_TYPE; if (vaddr.rtx01 > asce.tl) return PGM_REGION_THIRD_TRANS; break; case ASCE_TYPE_SEGMENT: - if (vaddr.rfx || vaddr.rsx || vaddr.rtx) - return PGM_ASCE_TYPE; if (vaddr.sx01 > asce.tl) return PGM_SEGMENT_TRANSLATION; break; } + w->level = asce.dt; switch (asce.dt) { - case ASCE_TYPE_REGION1: { - union region1_table_entry rfte; - - if (*fake) { - ptr += vaddr.rfx * _REGION1_SIZE; - rfte.val = ptr; - goto shadow_r2t; - } - *pgt = ptr + vaddr.rfx * 8; - rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); + case ASCE_TYPE_REGION1: + w->last_addr = ptr + vaddr.rfx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (rfte.i) + if (table.pgd.i) return PGM_REGION_FIRST_TRANS; - if (rfte.tt != TABLE_TYPE_REGION1) + if (table.pgd.tt != TABLE_TYPE_REGION1) return PGM_TRANSLATION_SPEC; - if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) + if (vaddr.rsx01 < table.pgd.tf || vaddr.rsx01 > table.pgd.tl) return PGM_REGION_SECOND_TRANS; if (sg->edat_level >= 1) - *dat_protection |= rfte.p; - ptr = rfte.rto * PAGE_SIZE; -shadow_r2t: - rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); - if (rc) - return rc; - kvm->stat.gmap_shadow_r1_entry++; - } + w->p |= table.pgd.p; + ptr = table.pgd.rto * PAGE_SIZE; + w->level--; fallthrough; - case ASCE_TYPE_REGION2: { - union region2_table_entry rste; - - if (*fake) { - ptr += vaddr.rsx * _REGION2_SIZE; - rste.val = ptr; - goto shadow_r3t; - } - *pgt = ptr + vaddr.rsx * 8; - rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); + case ASCE_TYPE_REGION2: + w->last_addr = ptr + vaddr.rsx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (rste.i) + if (table.p4d.i) return PGM_REGION_SECOND_TRANS; - if (rste.tt != TABLE_TYPE_REGION2) + if (table.p4d.tt != TABLE_TYPE_REGION2) return PGM_TRANSLATION_SPEC; - if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) + if (vaddr.rtx01 < table.p4d.tf || vaddr.rtx01 > table.p4d.tl) return PGM_REGION_THIRD_TRANS; if (sg->edat_level >= 1) - *dat_protection |= rste.p; - ptr = rste.rto * PAGE_SIZE; -shadow_r3t: - rste.p |= *dat_protection; - rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); - if (rc) - return rc; - kvm->stat.gmap_shadow_r2_entry++; - } + w->p |= table.p4d.p; + ptr = table.p4d.rto * PAGE_SIZE; + w->level--; fallthrough; - case ASCE_TYPE_REGION3: { - union region3_table_entry rtte; - - if (*fake) { - ptr += vaddr.rtx * _REGION3_SIZE; - rtte.val = ptr; - goto shadow_sgt; - } - *pgt = ptr + vaddr.rtx * 8; - rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); + case ASCE_TYPE_REGION3: + w->last_addr = ptr + vaddr.rtx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (rtte.i) + if (table.pud.i) return PGM_REGION_THIRD_TRANS; - if (rtte.tt != TABLE_TYPE_REGION3) + if (table.pud.tt != TABLE_TYPE_REGION3) return PGM_TRANSLATION_SPEC; - if (rtte.cr && asce.p && sg->edat_level >= 2) + if (table.pud.cr && asce.p && sg->edat_level >= 2) return PGM_TRANSLATION_SPEC; - if (rtte.fc && sg->edat_level >= 2) { - *dat_protection |= rtte.fc0.p; - *fake = 1; - ptr = rtte.fc1.rfaa * _REGION3_SIZE; - rtte.val = ptr; - goto shadow_sgt; + if (sg->edat_level >= 1) + w->p |= table.pud.p; + if (table.pud.fc && sg->edat_level >= 2) { + table.val = u64_replace_bits(table.val, saddr, ~_REGION3_MASK); + goto edat_applies; } - if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) + if (vaddr.sx01 < table.pud.fc0.tf || vaddr.sx01 > table.pud.fc0.tl) return PGM_SEGMENT_TRANSLATION; - if (sg->edat_level >= 1) - *dat_protection |= rtte.fc0.p; - ptr = rtte.fc0.sto * PAGE_SIZE; -shadow_sgt: - rtte.fc0.p |= *dat_protection; - rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); - if (rc) - return rc; - kvm->stat.gmap_shadow_r3_entry++; - } + ptr = table.pud.fc0.sto * PAGE_SIZE; + w->level--; fallthrough; - case ASCE_TYPE_SEGMENT: { - union segment_table_entry ste; - - if (*fake) { - ptr += vaddr.sx * _SEGMENT_SIZE; - ste.val = ptr; - goto shadow_pgt; - } - *pgt = ptr + vaddr.sx * 8; - rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); + case ASCE_TYPE_SEGMENT: + w->last_addr = ptr + vaddr.sx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (ste.i) + if (table.pmd.i) return PGM_SEGMENT_TRANSLATION; - if (ste.tt != TABLE_TYPE_SEGMENT) + if (table.pmd.tt != TABLE_TYPE_SEGMENT) return PGM_TRANSLATION_SPEC; - if (ste.cs && asce.p) + if (table.pmd.cs && asce.p) return PGM_TRANSLATION_SPEC; - *dat_protection |= ste.fc0.p; - if (ste.fc && sg->edat_level >= 1) { - *fake = 1; - ptr = ste.fc1.sfaa * _SEGMENT_SIZE; - ste.val = ptr; - goto shadow_pgt; + w->p |= table.pmd.p; + if (table.pmd.fc && sg->edat_level >= 1) { + table.val = u64_replace_bits(table.val, saddr, ~_SEGMENT_MASK); + goto edat_applies; } - ptr = ste.fc0.pto * (PAGE_SIZE / 2); -shadow_pgt: - ste.fc0.p |= *dat_protection; - rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); + ptr = table.pmd.fc0.pto * (PAGE_SIZE / 2); + w->level--; + } + w->last_addr = ptr + vaddr.px * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); + if (rc) + return rc; + if (table.pte.i) + return PGM_PAGE_TRANSLATION; + if (table.pte.z) + return PGM_TRANSLATION_SPEC; + w->p |= table.pte.p; +edat_applies: + if (wr && w->p) + return PGM_PROTECTION; + + return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, table.pte.pfra, wr); +} + +static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union pte *ptep, + struct guest_fault *f, bool p) +{ + union pgste pgste; + union pte newpte; + int rc; + + lockdep_assert_held(&sg->kvm->mmu_lock); + lockdep_assert_held(&sg->parent->children_lock); + + scoped_guard(spinlock, &sg->host_to_rmap_lock) + rc = gmap_insert_rmap(sg, f->gfn, gpa_to_gfn(raddr), TABLE_TYPE_PAGE_TABLE); + if (rc) + return rc; + + if (!pgste_get_trylock(ptep_h, &pgste)) + return -EAGAIN; + newpte = _pte(f->pfn, f->writable, !p, ptep_h->s.s); + newpte.s.d |= ptep_h->s.d; + newpte.s.sd |= ptep_h->s.sd; + newpte.h.p &= ptep_h->h.p; + if (!newpte.h.p && !f->writable) { + rc = -EOPNOTSUPP; + } else { + pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false); + pgste.vsie_notif = 1; + } + pgste_set_unlock(ptep_h, pgste); + if (rc) + return rc; + if (sg->invalidated) + return -EAGAIN; + + newpte = _pte(f->pfn, 0, !p, 0); + if (!pgste_get_trylock(ptep, &pgste)) + return -EAGAIN; + pgste = __dat_ptep_xchg(ptep, pgste, newpte, gpa_to_gfn(raddr), sg->asce, uses_skeys(sg)); + pgste_set_unlock(ptep, pgste); + + return 0; +} + +static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table, + struct guest_fault *f, bool p) +{ + union crste newcrste, oldcrste; + gfn_t gfn; + int rc; + + lockdep_assert_held(&sg->kvm->mmu_lock); + lockdep_assert_held(&sg->parent->children_lock); + + gfn = f->gfn & (is_pmd(*table) ? _SEGMENT_FR_MASK : _REGION3_FR_MASK); + scoped_guard(spinlock, &sg->host_to_rmap_lock) + rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt); + if (rc) + return rc; + + do { + /* _gmap_crstep_xchg_atomic() could have unshadowed this shadow gmap */ + if (sg->invalidated) + return -EAGAIN; + oldcrste = READ_ONCE(*host); + newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, f->writable, !p); + newcrste.s.fc1.d |= oldcrste.s.fc1.d; + newcrste.s.fc1.sd |= oldcrste.s.fc1.sd; + newcrste.h.p &= oldcrste.h.p; + newcrste.s.fc1.vsie_notif = 1; + newcrste.s.fc1.prefix_notif = oldcrste.s.fc1.prefix_notif; + newcrste.s.fc1.s = oldcrste.s.fc1.s; + if (!newcrste.h.p && !f->writable) + return -EOPNOTSUPP; + } while (!_gmap_crstep_xchg_atomic(sg->parent, host, oldcrste, newcrste, f->gfn, false)); + if (sg->invalidated) + return -EAGAIN; + + newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p); + gfn = gpa_to_gfn(raddr); + while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce)) + ; + return 0; +} + +static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg, + unsigned long saddr, struct pgtwalk *w) +{ + struct guest_fault *entries; + int flags, i, hl, gl, l, rc; + union crste *table, *host; + union pte *ptep, *ptep_h; + + lockdep_assert_held(&sg->kvm->mmu_lock); + lockdep_assert_held(&sg->parent->children_lock); + + entries = get_entries(w); + ptep_h = NULL; + ptep = NULL; + + rc = dat_entry_walk(NULL, gpa_to_gfn(saddr), sg->asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, + &table, &ptep); + if (rc) + return rc; + + /* A race occurred. The shadow mapping is already valid, nothing to do */ + if ((ptep && !ptep->h.i && ptep->h.p == w->p) || + (!ptep && crste_leaf(*table) && !table->h.i && table->h.p == w->p)) + return 0; + + gl = get_level(table, ptep); + + /* In case of a real address space */ + if (w->level <= LEVEL_MEM) { + l = TABLE_TYPE_PAGE_TABLE; + hl = TABLE_TYPE_REGION1; + goto real_address_space; + } + + /* + * Skip levels that are already protected. For each level, protect + * only the page containing the entry, not the whole table. + */ + for (i = gl ; i >= w->level; i--) { + rc = gmap_protect_rmap(mc, sg, entries[i].gfn, gpa_to_gfn(saddr), + entries[i].pfn, i + 1, entries[i].writable); if (rc) return rc; - kvm->stat.gmap_shadow_sg_entry++; + if (sg->invalidated) + return -EAGAIN; } + + rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF, + TABLE_TYPE_PAGE_TABLE, &host, &ptep_h); + if (rc) + return rc; + + hl = get_level(host, ptep_h); + /* Get the smallest granularity */ + l = min3(gl, hl, w->level); + +real_address_space: + flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0); + /* If necessary, create the shadow mapping */ + if (l < gl) { + rc = dat_entry_walk(mc, gpa_to_gfn(saddr), sg->asce, flags, l, &table, &ptep); + if (rc) + return rc; } - /* Return the parent address of the page table */ - *pgt = ptr; - return 0; + if (l < hl) { + rc = dat_entry_walk(mc, entries[LEVEL_MEM].gfn, sg->parent->asce, + flags, l, &host, &ptep_h); + if (rc) + return rc; + } + + if (KVM_BUG_ON(l > TABLE_TYPE_REGION3, sg->kvm)) + return -EFAULT; + if (l == TABLE_TYPE_PAGE_TABLE) + return _do_shadow_pte(sg, saddr, ptep_h, ptep, entries + LEVEL_MEM, w->p); + return _do_shadow_crste(sg, saddr, host, table, entries + LEVEL_MEM, w->p); } -/** - * shadow_pgt_lookup() - find a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: the address in the shadow aguest address space - * @pgt: parent gmap address of the page table to get shadowed - * @dat_protection: if the pgtable is marked as protected by dat - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if the shadow page table was found and -EAGAIN if the page - * table was not found. - * - * Called with sg->mm->mmap_lock in read. - */ -static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, - int *dat_protection, int *fake) +static inline int _gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + unsigned long seq, struct pgtwalk *walk) { - unsigned long pt_index; - unsigned long *table; - struct page *page; + struct gmap *parent; int rc; - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ - if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { - /* Shadow page tables are full pages (pte+pgste) */ - page = pfn_to_page(*table >> PAGE_SHIFT); - pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page)); - *pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE; - *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); - *fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE); - rc = 0; - } else { - rc = -EAGAIN; + if (kvm_s390_array_needs_retry_unsafe(vcpu->kvm, seq, walk->raw_entries)) + return -EAGAIN; +again: + rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc); + if (rc) + return rc; + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { + if (kvm_s390_array_needs_retry_safe(vcpu->kvm, seq, walk->raw_entries)) + return -EAGAIN; + parent = READ_ONCE(sg->parent); + if (!parent) + return -EAGAIN; + scoped_guard(spinlock, &parent->children_lock) { + if (READ_ONCE(sg->parent) != parent) + return -EAGAIN; + sg->invalidated = false; + rc = _gaccess_do_shadow(vcpu->arch.mc, sg, saddr, walk); + } + if (rc == -ENOMEM) + goto again; + if (!rc) + kvm_s390_release_faultin_array(vcpu->kvm, walk->raw_entries, false); } - spin_unlock(&sg->guest_table_lock); return rc; } /** - * kvm_s390_shadow_fault - handle fault on a shadow page table - * @vcpu: virtual cpu - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @datptr: will contain the address of the faulting DAT table entry, or of - * the valid leaf, plus some flags + * __gaccess_shadow_fault() - Handle fault on a shadow page table. + * @vcpu: Virtual cpu that triggered the action. + * @sg: The shadow guest address space structure. + * @saddr: Faulting address in the shadow gmap. + * @datptr: Will contain the address of the faulting DAT table entry, or of + * the valid leaf, plus some flags. + * @wr: Whether this is a write access. * - * Returns: - 0 if the shadow fault was successfully resolved - * - > 0 (pgm exception code) on exceptions while faulting - * - -EAGAIN if the caller can retry immediately - * - -EFAULT when accessing invalid guest addresses - * - -ENOMEM if out of memory + * Return: + * * %0 if the shadow fault was successfully resolved + * * > 0 (pgm exception code) on exceptions while faulting + * * %-EAGAIN if the caller can retry immediately + * * %-EFAULT when accessing invalid guest addresses + * * %-ENOMEM if out of memory */ -int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, - unsigned long saddr, unsigned long *datptr) +static int __gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + union mvpg_pei *datptr, bool wr) { - union vaddress vaddr; - union page_table_entry pte; - unsigned long pgt = 0; - int dat_protection, fake; + struct pgtwalk walk = { .p = false, }; + unsigned long seq; int rc; - if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm)) - return -EFAULT; - - mmap_read_lock(sg->mm); - /* - * We don't want any guest-2 tables to change - so the parent - * tables/pointers we read stay valid - unshadowing is however - * always possible - only guest_table_lock protects us. - */ - ipte_lock(vcpu->kvm); + seq = vcpu->kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); - rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); + rc = walk_guest_tables(sg, saddr, &walk, wr); + if (datptr) { + datptr->val = walk.last_addr; + datptr->dat_prot = wr && walk.p; + datptr->not_pte = walk.level > TABLE_TYPE_PAGE_TABLE; + datptr->real = sg->guest_asce.r; + } + if (!rc) + rc = _gaccess_shadow_fault(vcpu, sg, saddr, seq, &walk); if (rc) - rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, - &fake); + kvm_s390_release_faultin_array(vcpu->kvm, walk.raw_entries, true); + return rc; +} - vaddr.addr = saddr; - if (fake) { - pte.val = pgt + vaddr.px * PAGE_SIZE; - goto shadow_page; - } +int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + union mvpg_pei *datptr, bool wr) +{ + int rc; - switch (rc) { - case PGM_SEGMENT_TRANSLATION: - case PGM_REGION_THIRD_TRANS: - case PGM_REGION_SECOND_TRANS: - case PGM_REGION_FIRST_TRANS: - pgt |= PEI_NOT_PTE; - break; - case 0: - pgt += vaddr.px * 8; - rc = gmap_read_table(sg->parent, pgt, &pte.val); - } - if (datptr) - *datptr = pgt | dat_protection * PEI_DAT_PROT; - if (!rc && pte.i) - rc = PGM_PAGE_TRANSLATION; - if (!rc && pte.z) - rc = PGM_TRANSLATION_SPEC; -shadow_page: - pte.p |= dat_protection; - if (!rc) - rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); - vcpu->kvm->stat.gmap_shadow_pg_entry++; + if (KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &sg->flags), vcpu->kvm)) + return -EFAULT; + + rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc); + if (rc) + return rc; + + ipte_lock(vcpu->kvm); + rc = __gaccess_shadow_fault(vcpu, sg, saddr, datptr, wr || sg->guest_asce.r); ipte_unlock(vcpu->kvm); - mmap_read_unlock(sg->mm); + return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 3fde45a151f2..b5385cec60f4 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -206,8 +206,8 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode); -int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old, - __uint128_t new, u8 access_key, bool *success); +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old, + union kvm_s390_quad new, u8 access_key, bool *success); /** * write_guest_with_key - copy data from kernel space to guest space @@ -450,11 +450,17 @@ void ipte_unlock(struct kvm *kvm); int ipte_lock_held(struct kvm *kvm); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); -/* MVPG PEI indication bits */ -#define PEI_DAT_PROT 2 -#define PEI_NOT_PTE 4 +union mvpg_pei { + unsigned long val; + struct { + unsigned long addr : 61; + unsigned long not_pte : 1; + unsigned long dat_prot: 1; + unsigned long real : 1; + }; +}; -int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, - unsigned long saddr, unsigned long *datptr); +int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + union mvpg_pei *datptr, bool wr); #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c deleted file mode 100644 index a6d1dbb04c97..000000000000 --- a/arch/s390/kvm/gmap-vsie.c +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Guest memory management for KVM/s390 nested VMs. - * - * Copyright IBM Corp. 2008, 2020, 2024 - * - * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> - * Martin Schwidefsky <schwidefsky@de.ibm.com> - * David Hildenbrand <david@redhat.com> - * Janosch Frank <frankja@linux.vnet.ibm.com> - */ - -#include <linux/compiler.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/pgtable.h> -#include <linux/pagemap.h> -#include <linux/mman.h> - -#include <asm/lowcore.h> -#include <asm/gmap.h> -#include <asm/uv.h> - -#include "kvm-s390.h" -#include "gmap.h" - -/** - * gmap_find_shadow - find a specific asce in the list of shadow tables - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * Returns the pointer to a gmap if a shadow table with the given asce is - * already available, ERR_PTR(-EAGAIN) if another one is just being created, - * otherwise NULL - * - * Context: Called with parent->shadow_lock held - */ -static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level) -{ - struct gmap *sg; - - lockdep_assert_held(&parent->shadow_lock); - list_for_each_entry(sg, &parent->children, list) { - if (!gmap_shadow_valid(sg, asce, edat_level)) - continue; - if (!sg->initialized) - return ERR_PTR(-EAGAIN); - refcount_inc(&sg->ref_count); - return sg; - } - return NULL; -} - -/** - * gmap_shadow - create/find a shadow guest address space - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * The pages of the top level page table referred by the asce parameter - * will be set to read-only and marked in the PGSTEs of the kvm process. - * The shadow table will be removed automatically on any change to the - * PTE mapping for the source table. - * - * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, - * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the - * parent gmap table could not be protected. - */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level) -{ - struct gmap *sg, *new; - unsigned long limit; - int rc; - - if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) || - KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private)) - return ERR_PTR(-EFAULT); - spin_lock(&parent->shadow_lock); - sg = gmap_find_shadow(parent, asce, edat_level); - spin_unlock(&parent->shadow_lock); - if (sg) - return sg; - /* Create a new shadow gmap */ - limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); - if (asce & _ASCE_REAL_SPACE) - limit = -1UL; - new = gmap_alloc(limit); - if (!new) - return ERR_PTR(-ENOMEM); - new->mm = parent->mm; - new->parent = gmap_get(parent); - new->private = parent->private; - new->orig_asce = asce; - new->edat_level = edat_level; - new->initialized = false; - spin_lock(&parent->shadow_lock); - /* Recheck if another CPU created the same shadow */ - sg = gmap_find_shadow(parent, asce, edat_level); - if (sg) { - spin_unlock(&parent->shadow_lock); - gmap_free(new); - return sg; - } - if (asce & _ASCE_REAL_SPACE) { - /* only allow one real-space gmap shadow */ - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce & _ASCE_REAL_SPACE) { - spin_lock(&sg->guest_table_lock); - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - break; - } - } - } - refcount_set(&new->ref_count, 2); - list_add(&new->list, &parent->children); - if (asce & _ASCE_REAL_SPACE) { - /* nothing to protect, return right away */ - new->initialized = true; - spin_unlock(&parent->shadow_lock); - return new; - } - spin_unlock(&parent->shadow_lock); - /* protect after insertion, so it will get properly invalidated */ - mmap_read_lock(parent->mm); - rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1), - PROT_READ, GMAP_NOTIFY_SHADOW); - mmap_read_unlock(parent->mm); - spin_lock(&parent->shadow_lock); - new->initialized = true; - if (rc) { - list_del(&new->list); - gmap_free(new); - new = ERR_PTR(rc); - } - spin_unlock(&parent->shadow_lock); - return new; -} diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index 02adf151d4de..3c26e35af0ef 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -7,7 +7,7 @@ * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> * Martin Schwidefsky <schwidefsky@de.ibm.com> * David Hildenbrand <david@redhat.com> - * Janosch Frank <frankja@linux.vnet.ibm.com> + * Janosch Frank <frankja@linux.ibm.com> */ #include <linux/compiler.h> @@ -15,198 +15,1321 @@ #include <linux/kvm_host.h> #include <linux/pgtable.h> #include <linux/pagemap.h> - #include <asm/lowcore.h> -#include <asm/gmap.h> #include <asm/uv.h> +#include <asm/gmap_helpers.h> +#include "dat.h" #include "gmap.h" +#include "kvm-s390.h" +#include "faultin.h" + +static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.sie_block->prog0c & PROG_IN_SIE; +} + +static int gmap_limit_to_type(gfn_t limit) +{ + if (!limit) + return TABLE_TYPE_REGION1; + if (limit <= _REGION3_SIZE >> PAGE_SHIFT) + return TABLE_TYPE_SEGMENT; + if (limit <= _REGION2_SIZE >> PAGE_SHIFT) + return TABLE_TYPE_REGION3; + if (limit <= _REGION1_SIZE >> PAGE_SHIFT) + return TABLE_TYPE_REGION2; + return TABLE_TYPE_REGION1; +} + +/** + * gmap_new() - Allocate and initialize a guest address space. + * @kvm: The kvm owning the guest. + * @limit: Maximum address of the gmap address space. + * + * Return: A guest address space structure. + */ +struct gmap *gmap_new(struct kvm *kvm, gfn_t limit) +{ + struct crst_table *table; + struct gmap *gmap; + int type; + + type = gmap_limit_to_type(limit); + + gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT); + if (!gmap) + return NULL; + INIT_LIST_HEAD(&gmap->children); + INIT_LIST_HEAD(&gmap->list); + INIT_LIST_HEAD(&gmap->scb_users); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE); + spin_lock_init(&gmap->children_lock); + spin_lock_init(&gmap->host_to_rmap_lock); + refcount_set(&gmap->refcount, 1); + + table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val); + if (!table) { + kfree(gmap); + return NULL; + } + + gmap->asce.val = __pa(table); + gmap->asce.dt = type; + gmap->asce.tl = _ASCE_TABLE_LENGTH; + gmap->asce.x = 1; + gmap->asce.p = 1; + gmap->asce.s = 1; + gmap->kvm = kvm; + set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags); + + return gmap; +} + +static void gmap_add_child(struct gmap *parent, struct gmap *child) +{ + KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm); + KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm); + KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm); + lockdep_assert_held(&parent->children_lock); + + child->parent = parent; + + if (is_ucontrol(parent)) + set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags); + else + clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags); + + if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags)) + set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags); + else + clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags); + + if (kvm_is_ucontrol(parent->kvm)) + clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags); + list_add(&child->list, &parent->children); +} + +struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit) +{ + struct gmap *res; + + lockdep_assert_not_held(&parent->children_lock); + res = gmap_new(parent->kvm, limit); + if (res) { + scoped_guard(spinlock, &parent->children_lock) + gmap_add_child(parent, res); + } + return res; +} + +int gmap_set_limit(struct gmap *gmap, gfn_t limit) +{ + struct kvm_s390_mmu_cache *mc; + int rc, type; + + type = gmap_limit_to_type(limit); + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + do { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + scoped_guard(write_lock, &gmap->kvm->mmu_lock) + rc = dat_set_asce_limit(mc, &gmap->asce, type); + } while (rc == -ENOMEM); + + kvm_s390_free_mmu_cache(mc); + return 0; +} + +static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) +{ + struct vsie_rmap *rmap, *rnext, *head; + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void __rcu **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + head = radix_tree_delete(root, index); + gmap_for_each_rmap_safe(rmap, rnext, head) + kfree(rmap); + } + } while (nr > 0); +} + +void gmap_remove_child(struct gmap *child) +{ + if (KVM_BUG_ON(!child->parent, child->kvm)) + return; + lockdep_assert_held(&child->parent->children_lock); + + list_del(&child->list); + child->parent = NULL; + child->invalidated = true; +} + +/** + * gmap_dispose() - Remove and free a guest address space and its children. + * @gmap: Pointer to the guest address space structure. + */ +void gmap_dispose(struct gmap *gmap) +{ + /* The gmap must have been removed from the parent beforehands */ + KVM_BUG_ON(gmap->parent, gmap->kvm); + /* All children of this gmap must have been removed beforehands */ + KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm); + /* No VSIE shadow block is allowed to use this gmap */ + KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm); + /* The ASCE must be valid */ + KVM_BUG_ON(!gmap->asce.val, gmap->kvm); + /* The refcount must be 0 */ + KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm); + + /* Flush tlb of all gmaps */ + asce_flush_tlb(gmap->asce); + + /* Free all DAT tables. */ + dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap)); + + /* Free additional data for a shadow gmap */ + if (is_shadow(gmap)) + gmap_rmap_radix_tree_free(&gmap->host_to_rmap); + + kfree(gmap); +} /** - * should_export_before_import - Determine whether an export is needed - * before an import-like operation - * @uvcb: the Ultravisor control block of the UVC to be performed - * @mm: the mm of the process + * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy. + * @gmap: The gmap whose ASCE needs to be replaced. * - * Returns whether an export is needed before every import-like operation. - * This is needed for shared pages, which don't trigger a secure storage - * exception when accessed from a different guest. + * If the ASCE is a SEGMENT type then this function will return -EINVAL, + * otherwise the pointers in the host_to_guest radix tree will keep pointing + * to the wrong pages, causing use-after-free and memory corruption. + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. * - * Although considered as one, the Unpin Page UVC is not an actual import, - * so it is not affected. + * Return: 0 in case of success, -EINVAL if the ASCE is segment type ASCE, + * -ENOMEM if runinng out of memory. + */ +int s390_replace_asce(struct gmap *gmap) +{ + struct crst_table *table; + union asce asce; + + /* Replacing segment type ASCEs would cause serious issues */ + if (gmap->asce.dt == ASCE_TYPE_SEGMENT) + return -EINVAL; + + table = dat_alloc_crst_sleepable(0); + if (!table) + return -ENOMEM; + memcpy(table, dereference_asce(gmap->asce), sizeof(*table)); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = gmap->asce; + asce.rsto = virt_to_pfn(table); + WRITE_ONCE(gmap->asce, asce); + + return 0; +} + +bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint) +{ + struct kvm *kvm = gmap->kvm; + struct kvm_vcpu *vcpu; + gfn_t prefix_gfn; + unsigned long i; + + if (is_shadow(gmap)) + return false; + kvm_for_each_vcpu(i, vcpu, kvm) { + /* Match against both prefix pages */ + prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu)); + if (prefix_gfn < end && gfn <= prefix_gfn + 1) { + if (hint && kvm_s390_is_in_sie(vcpu)) + return false; + VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx", + gfn_to_gpa(gfn), gfn_to_gpa(end)); + kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); + } + } + return true; +} + +struct clear_young_pte_priv { + struct gmap *gmap; + bool young; +}; + +static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk) +{ + struct clear_young_pte_priv *p = walk->priv; + union pgste pgste; + union pte pte, new; + + pte = READ_ONCE(*ptep); + + if (!pte.s.pr || (!pte.s.y && pte.h.i)) + return 0; + + pgste = pgste_get_lock(ptep); + if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) { + new = pte; + new.h.i = 1; + new.s.y = 0; + if ((new.s.d || !new.h.p) && !new.s.s) + folio_set_dirty(pfn_folio(pte.h.pfra)); + new.s.d = 0; + new.h.p = 1; + + pgste.prefix_notif = 0; + pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap)); + } + p->young = 1; + pgste_set_unlock(ptep, pgste); + return 0; +} + +static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk) +{ + struct clear_young_pte_priv *priv = walk->priv; + union crste crste, new; + + do { + crste = READ_ONCE(*crstep); + + if (!crste.h.fc) + return 0; + if (!crste.s.fc1.y && crste.h.i) + return 0; + if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end)) + break; + + new = crste; + new.h.i = 1; + new.s.fc1.y = 0; + new.s.fc1.prefix_notif = 0; + if (new.s.fc1.d || !new.h.p) + folio_set_dirty(phys_to_folio(crste_origin_large(crste))); + new.s.fc1.d = 0; + new.h.p = 1; + } while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce)); + + priv->young = 1; + return 0; +} + +/** + * gmap_age_gfn() - Clear young. + * @gmap: The guest gmap. + * @start: The first gfn to test. + * @end: The gfn after the last one to test. * - * No export is needed also when there is only one protected VM, because the - * page cannot belong to the wrong VM in that case (there is no "other VM" - * it can belong to). + * Context: Called with the kvm mmu write lock held. + * Return: 1 if any page in the given range was young, otherwise 0. + */ +bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end) +{ + const struct dat_walk_ops ops = { + .pte_entry = gmap_clear_young_pte, + .pmd_entry = gmap_clear_young_crste, + .pud_entry = gmap_clear_young_crste, + }; + struct clear_young_pte_priv priv = { + .gmap = gmap, + .young = false, + }; + + _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); + + return priv.young; +} + +struct gmap_unmap_priv { + struct gmap *gmap; + struct kvm_memory_slot *slot; +}; + +static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w) +{ + struct gmap_unmap_priv *priv = w->priv; + struct folio *folio = NULL; + unsigned long vmaddr; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) { + vmaddr = __gfn_to_hva_memslot(priv->slot, gfn); + gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr); + } + if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) + folio = pfn_folio(ptep->h.pfra); + pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn); + pgste_set_unlock(ptep, pgste); + if (folio) + uv_convert_from_secure_folio(folio); + + return 0; +} + +static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct gmap_unmap_priv *priv = walk->priv; + struct folio *folio = NULL; + union crste old = *crstep; + + if (!old.h.fc) + return 0; + + if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) + folio = phys_to_folio(crste_origin_large(old)); + /* No races should happen because kvm->mmu_lock is held in write mode */ + KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn), + priv->gmap->kvm); + if (folio) + uv_convert_from_secure_folio(folio); + + return 0; +} + +/** + * gmap_unmap_gfn_range() - Unmap a range of guest addresses. + * @gmap: The gmap to act on. + * @slot: The memslot in which the range is located. + * @start: The first gfn to unmap. + * @end: The gfn after the last one to unmap. * - * Return: true if an export is needed before every import, otherwise false. + * Context: Called with the kvm mmu write lock held. + * Return: false */ -static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end) { + const struct dat_walk_ops ops = { + .pte_entry = _gmap_unmap_pte, + .pmd_entry = _gmap_unmap_crste, + .pud_entry = _gmap_unmap_crste, + }; + struct gmap_unmap_priv priv = { + .gmap = gmap, + .slot = slot, + }; + + lockdep_assert_held_write(&gmap->kvm->mmu_lock); + + _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); + return false; +} + +static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn, + struct gmap *gmap) +{ + union pte pte = READ_ONCE(*ptep); + + if (!pte.s.pr || (pte.h.p && !pte.s.sd)) + return pgste; + /* - * The misc feature indicates, among other things, that importing a - * shared page from a different protected VM will automatically also - * transfer its ownership. + * If this page contains one or more prefixes of vCPUS that are currently + * running, do not reset the protection, leave it marked as dirty. */ - if (uv_has_feature(BIT_UV_FEAT_MISC)) - return false; - if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) - return false; - return atomic_read(&mm->context.protected_count) > 1; + if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) { + pte.h.p = 1; + pte.s.sd = 0; + pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn); + } + + mark_page_dirty(gmap->kvm, gfn); + + return pgste; +} + +static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end, + struct dat_walk *walk) +{ + struct gmap *gmap = walk->priv; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap); + pgste_set_unlock(ptep, pgste); + return 0; +} + +static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end, + struct dat_walk *walk) +{ + struct gmap *gmap = walk->priv; + union crste crste, new; + + if (fatal_signal_pending(current)) + return 1; + do { + crste = READ_ONCE(*table); + if (!crste.h.fc) + return 0; + if (crste.h.p && !crste.s.fc1.sd) + return 0; + + /* + * If this large page contains one or more prefixes of vCPUs that are + * currently running, do not reset the protection, leave it marked as + * dirty. + */ + if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end)) + break; + new = crste; + new.h.p = 1; + new.s.fc1.sd = 0; + } while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn)); + + for ( ; gfn < end; gfn++) + mark_page_dirty(gmap->kvm, gfn); + + return 0; } -static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb) +void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end) { - struct folio *folio = page_folio(page); + const struct dat_walk_ops walk_ops = { + .pte_entry = _pte_test_and_clear_softdirty, + .pmd_entry = _crste_test_and_clear_softdirty, + .pud_entry = _crste_test_and_clear_softdirty, + }; + + lockdep_assert_held(&gmap->kvm->mmu_lock); + + _dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap); +} + +static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f) +{ + union crste newcrste, oldcrste = READ_ONCE(*f->crstep); + + /* Somehow the crste is not large anymore, let the slow path deal with it. */ + if (!oldcrste.h.fc) + return 1; + + f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn)); + f->writable = oldcrste.s.fc1.w; + + /* Appropriate permissions already (race with another handler), nothing to do. */ + if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p)) + return 0; + + if (!f->write_attempt || oldcrste.s.fc1.w) { + f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d; + newcrste = oldcrste; + newcrste.h.i = 0; + newcrste.s.fc1.y = 1; + if (f->write_attempt) { + newcrste.h.p = 0; + newcrste.s.fc1.d = 1; + newcrste.s.fc1.sd = 1; + } + /* In case of races, let the slow path deal with it. */ + return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn); + } + /* Trying to write on a read-only page, let the slow path deal with it. */ + return 1; +} + +static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste, + struct guest_fault *f) +{ + union pte newpte, oldpte = READ_ONCE(*f->ptep); + + f->pfn = oldpte.h.pfra; + f->writable = oldpte.s.w; + + /* Appropriate permissions already (race with another handler), nothing to do. */ + if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p)) + return 0; + /* Trying to write on a read-only page, let the slow path deal with it. */ + if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w)) + return 1; + + newpte = oldpte; + newpte.h.i = 0; + newpte.s.y = 1; + if (f->write_attempt) { + newpte.h.p = 0; + newpte.s.d = 1; + newpte.s.sd = 1; + } + *pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn); + + return 0; +} + +/** + * gmap_try_fixup_minor() -- Try to fixup a minor gmap fault. + * @gmap: The gmap whose fault needs to be resolved. + * @fault: Describes the fault that is being resolved. + * + * A minor fault is a fault that can be resolved quickly within gmap. + * The page is already mapped, the fault is only due to dirty/young tracking. + * + * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could + * not be resolved and needs to go through the slow path. + */ +int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault) +{ + union pgste pgste; int rc; - /* - * Secure pages cannot be huge and userspace should not combine both. - * In case userspace does it anyway this will result in an -EFAULT for - * the unpack. The guest is thus never reaching secure mode. - * If userspace plays dirty tricks and decides to map huge pages at a - * later point in time, it will receive a segmentation fault or - * KVM_RUN will return -EFAULT. - */ - if (folio_test_hugetlb(folio)) - return -EFAULT; - if (folio_test_large(folio)) { - mmap_read_unlock(gmap->mm); - rc = kvm_s390_wiggle_split_folio(gmap->mm, folio, true); - mmap_read_lock(gmap->mm); - if (rc) - return rc; - folio = page_folio(page); + lockdep_assert_held(&gmap->kvm->mmu_lock); + + rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE, + &fault->crstep, &fault->ptep); + /* If a PTE or a leaf CRSTE could not be reached, slow path. */ + if (rc) + return 1; + + if (fault->ptep) { + pgste = pgste_get_lock(fault->ptep); + rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault); + if (!rc && fault->callback) + fault->callback(fault); + pgste_set_unlock(fault->ptep, pgste); + } else { + rc = gmap_handle_minor_crste_fault(gmap, fault); + if (!rc && fault->callback) + fault->callback(fault); } + return rc; +} - if (!folio_trylock(folio)) - return -EAGAIN; - if (should_export_before_import(uvcb, gmap->mm)) - uv_convert_from_secure(folio_to_phys(folio)); - rc = make_folio_secure(folio, uvcb); - folio_unlock(folio); +static inline bool gmap_2g_allowed(struct gmap *gmap, struct guest_fault *f, + struct kvm_memory_slot *slot) +{ + return false; +} - /* - * In theory a race is possible and the folio might have become - * large again before the folio_trylock() above. In that case, no - * action is performed and -EAGAIN is returned; the callers will - * have to try again later. - * In most cases this implies running the VM again, getting the same - * exception again, and make another attempt in this function. - * This is expected to happen extremely rarely. - */ - if (rc == -E2BIG) - return -EAGAIN; - /* The folio has too many references, try to shake some off */ - if (rc == -EBUSY) { - mmap_read_unlock(gmap->mm); - kvm_s390_wiggle_split_folio(gmap->mm, folio, false); - mmap_read_lock(gmap->mm); +/** + * gmap_1m_allowed() - Check whether a 1M hugepage is allowed. + * @gmap: The gmap of the guest. + * @f: Describes the fault that is being resolved. + * @slot: The memslot the faulting address belongs to. + * + * The function checks whether the GMAP_FLAG_ALLOW_HPAGE_1M flag is set for + * @gmap, whether the offset of the address in the 1M virtual frame is the + * same as the offset in the physical 1M frame, and finally whether the whole + * 1M page would fit in the given memslot. + * + * Return: true if a 1M hugepage is allowed to back the faulting address, false + * otherwise. + */ +static inline bool gmap_1m_allowed(struct gmap *gmap, struct guest_fault *f, + struct kvm_memory_slot *slot) +{ + return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags) && + !((f->gfn ^ f->pfn) & ~_SEGMENT_FR_MASK) && + slot->base_gfn <= ALIGN_DOWN(f->gfn, _PAGES_PER_SEGMENT) && + slot->base_gfn + slot->npages >= ALIGN(f->gfn + 1, _PAGES_PER_SEGMENT); +} + +static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level, + struct guest_fault *f) +{ + union crste oldval, newval; + union pte newpte, oldpte; + union pgste pgste; + int rc = 0; + + rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level, + &f->crstep, &f->ptep); + if (rc == -ENOMEM) + return rc; + if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm)) + return rc; + if (rc) return -EAGAIN; + if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm)) + return -EINVAL; + + if (f->ptep) { + pgste = pgste_get_lock(f->ptep); + oldpte = *f->ptep; + newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page); + newpte.s.sd = oldpte.s.sd; + oldpte.s.sd = 0; + if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) { + pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn); + if (f->callback) + f->callback(f); + } else { + rc = -EAGAIN; + } + pgste_set_unlock(f->ptep, pgste); + } else { + do { + oldval = READ_ONCE(*f->crstep); + newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable, + f->write_attempt | oldval.s.fc1.d); + newval.s.fc1.s = !f->page; + newval.s.fc1.sd = oldval.s.fc1.sd; + if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val && + crste_origin_large(oldval) != crste_origin_large(newval)) + return -EAGAIN; + } while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn)); + if (f->callback) + f->callback(f); } return rc; } +int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f, + struct kvm_memory_slot *slot) +{ + unsigned int order; + int level; + + lockdep_assert_held(&gmap->kvm->mmu_lock); + + level = TABLE_TYPE_PAGE_TABLE; + if (f->page) { + order = folio_order(page_folio(f->page)); + if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f, slot)) + level = TABLE_TYPE_REGION3; + else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f, slot)) + level = TABLE_TYPE_SEGMENT; + } + return _gmap_link(mc, gmap, level, f); +} + +static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, + gfn_t p_gfn, gfn_t c_gfn, bool force_alloc) +{ + union crste newcrste, oldcrste; + struct page_table *pt; + union crste *crstep; + union pte *ptep; + int rc; + + if (force_alloc) + rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC, + TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + else + rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE, + TABLE_TYPE_SEGMENT, &crstep, &ptep); + if (rc) + return rc; + if (!ptep) { + newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT); + newcrste.h.i = 1; + newcrste.h.fc0.tl = 1; + } else { + pt = pte_table_start(ptep); + dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT)); + newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT); + } + rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT, + &crstep, &ptep); + if (rc) + return rc; + do { + oldcrste = READ_ONCE(*crstep); + if (oldcrste.val == newcrste.val) + break; + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce)); + return 0; +} + +static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp) +{ + union pte *ptep; + int rc; + + rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE, + TABLE_TYPE_SEGMENT, crstepp, &ptep); + if (rc || (!ptep && !crste_is_ucas(**crstepp))) + return -EREMOTE; + if (!ptep) + return 1; + *gaddr &= ~_SEGMENT_MASK; + *gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT; + return 0; +} + /** - * gmap_make_secure() - make one guest page secure - * @gmap: the guest gmap - * @gaddr: the guest address that needs to be made secure - * @uvcb: the UVCB specifying which operation needs to be performed + * gmap_ucas_translate() - Translate a vcpu address into a host gmap address + * @mc: The memory cache to be used for allocations. + * @gmap: The per-cpu gmap. + * @gaddr: Pointer to the address to be translated, will get overwritten with + * the translated address in case of success. + * Translates the per-vCPU guest address into a fake guest address, which can + * then be used with the fake memslots that are identity mapping userspace. + * This allows ucontrol VMs to use the normal fault resolution path, like + * normal VMs. * - * Context: needs to be called with kvm->srcu held. - * Return: 0 on success, < 0 in case of error (see __gmap_make_secure()). + * Return: %0 in case of success, otherwise %-EREMOTE. */ -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) +int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr) { - struct kvm *kvm = gmap->private; - struct page *page; - int rc = 0; + gpa_t translated_address; + union crste *crstep; + gfn_t gfn; + int rc; + + gfn = gpa_to_gfn(*gaddr); + + scoped_guard(read_lock, &gmap->kvm->mmu_lock) { + rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); + if (rc <= 0) + return rc; + } + do { + scoped_guard(write_lock, &gmap->kvm->mmu_lock) { + rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); + if (rc <= 0) + return rc; + translated_address = (*gaddr & ~_SEGMENT_MASK) | + (crstep->val & _SEGMENT_MASK); + rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true); + } + if (!rc) { + *gaddr = translated_address; + return 0; + } + if (rc != -ENOMEM) + return -EREMOTE; + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + } while (1); + return 0; +} + +int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count) +{ + struct kvm_s390_mmu_cache *mc; + int rc; + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + while (count) { + scoped_guard(write_lock, &gmap->kvm->mmu_lock) + rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false); + if (rc == -ENOMEM) { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + continue; + } + if (rc) + return rc; + + count--; + c_gfn += _PAGE_ENTRIES; + p_gfn += _PAGE_ENTRIES; + } + return rc; +} + +static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn) +{ + union crste *crstep; + union pte *ptep; + int rc; + + rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep); + if (rc) + return; + while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce)) + ; +} + +void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count) +{ + guard(read_lock)(&gmap->kvm->mmu_lock); - lockdep_assert_held(&kvm->srcu); + for ( ; count; count--, c_gfn += _PAGE_ENTRIES) + gmap_ucas_unmap_one(gmap, c_gfn); +} - page = gfn_to_page(kvm, gpa_to_gfn(gaddr)); - mmap_read_lock(gmap->mm); - if (page) - rc = __gmap_make_secure(gmap, page, uvcb); - kvm_release_page_clean(page); - mmap_read_unlock(gmap->mm); +static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct gmap *gmap = walk->priv; + union crste crste, newcrste; + + crste = READ_ONCE(*crstep); + newcrste = _CRSTE_EMPTY(crste.h.tt); + + while (crste_leaf(crste)) { + if (crste_prefix(crste)) + gmap_unmap_prefix(gmap, gfn, next); + if (crste.s.fc1.vsie_notif) + gmap_handle_vsie_unshadow_event(gmap, gfn); + if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce)) + break; + crste = READ_ONCE(*crstep); + } + + if (need_resched()) + return next; + + return 0; +} + +void gmap_split_huge_pages(struct gmap *gmap) +{ + const struct dat_walk_ops ops = { + .pmd_entry = _gmap_split_crste, + .pud_entry = _gmap_split_crste, + }; + gfn_t start = 0; + + do { + scoped_guard(read_lock, &gmap->kvm->mmu_lock) + start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce, + &ops, DAT_WALK_IGN_HOLES, gmap); + cond_resched(); + } while (start); +} +static int _gmap_enable_skeys(struct gmap *gmap) +{ + gfn_t start = 0; + int rc; + + if (uses_skeys(gmap)) + return 0; + + set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); + rc = gmap_helper_disable_cow_sharing(); + if (rc) { + clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); + return rc; + } + + do { + scoped_guard(write_lock, &gmap->kvm->mmu_lock) + start = dat_reset_skeys(gmap->asce, start); + cond_resched(); + } while (start); + return 0; +} + +int gmap_enable_skeys(struct gmap *gmap) +{ + int rc; + + mmap_write_lock(gmap->kvm->mm); + rc = _gmap_enable_skeys(gmap); + mmap_write_unlock(gmap->kvm->mm); return rc; } -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) +static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) { - struct uv_cb_cts uvcb = { - .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, - .header.len = sizeof(uvcb), - .guest_handle = gmap->guest_handle, - .gaddr = gaddr, + if (!ptep->s.pr) + return 0; + __kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep))); + if (need_resched()) + return next; + return 0; +} + +static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + phys_addr_t origin, cur, end; + + if (!crstep->h.fc || !crstep->s.fc1.pr) + return 0; + + origin = crste_origin_large(*crstep); + cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin; + end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin; + for ( ; cur < end; cur += PAGE_SIZE) + __kvm_s390_pv_destroy_page(phys_to_page(cur)); + if (need_resched()) + return next; + return 0; +} + +int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible) +{ + const struct dat_walk_ops ops = { + .pte_entry = _destroy_pages_pte, + .pmd_entry = _destroy_pages_crste, + .pud_entry = _destroy_pages_crste, }; - return gmap_make_secure(gmap, gaddr, &uvcb); + do { + scoped_guard(read_lock, &gmap->kvm->mmu_lock) + start = _dat_walk_gfn_range(start, end, gmap->asce, &ops, + DAT_WALK_IGN_HOLES, NULL); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + cond_resched(); + } while (start && start < end); + return 0; +} + +int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level) +{ + struct vsie_rmap *rmap __free(kvfree) = NULL; + struct vsie_rmap *temp; + void __rcu **slot; + int rc = 0; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + lockdep_assert_held(&sg->host_to_rmap_lock); + + rmap = kzalloc_obj(*rmap, GFP_ATOMIC); + if (!rmap) + return -ENOMEM; + + rmap->r_gfn = r_gfn; + rmap->level = level; + slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->val == rmap->val) + return 0; + } + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); + } else { + rmap->next = NULL; + rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap); + if (rc) + return rc; + } + rmap = NULL; + + return 0; +} + +int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, + kvm_pfn_t pfn, int level, bool wr) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + union pte pte; + int flags, rc; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + lockdep_assert_held(&sg->parent->children_lock); + + flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0); + rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags, + TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + if (rc) + return rc; + if (level <= TABLE_TYPE_REGION1) { + scoped_guard(spinlock, &sg->host_to_rmap_lock) + rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level); + } + if (rc) + return rc; + + if (!pgste_get_trylock(ptep, &pgste)) + return -EAGAIN; + pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false); + pte.h.p = 1; + pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false); + pgste.vsie_notif = 1; + pgste_set_unlock(ptep, pgste); + + return 0; +} + +static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + __atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val); + if (need_resched()) + return next; + return 0; +} + +void gmap_set_cmma_all_dirty(struct gmap *gmap) +{ + const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, }; + gfn_t gfn = 0; + + do { + scoped_guard(read_lock, &gmap->kvm->mmu_lock) + gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops, + DAT_WALK_IGN_HOLES, NULL); + cond_resched(); + } while (gfn); +} + +static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level) +{ + unsigned long align = PAGE_SIZE; + gpa_t gaddr = gfn_to_gpa(r_gfn); + union crste *crstep; + union crste crste; + union pte *ptep; + + if (level > TABLE_TYPE_PAGE_TABLE) + align = 1UL << (11 * level + _SEGMENT_SHIFT); + kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align)); + sg->invalidated = true; + if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep)) + return; + if (ptep) { + if (READ_ONCE(*ptep).val != _PTE_EMPTY.val) + dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg)); + return; + } + + crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce); + if (crste_leaf(crste) || crste.h.i) + return; + if (is_pmd(crste)) + dat_free_pt(dereference_pmd(crste.pmd)); + else + dat_free_level(dereference_crste(crste), true); +} + +static void gmap_unshadow(struct gmap *sg) +{ + struct gmap_cache *gmap_cache, *next; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + KVM_BUG_ON(!sg->parent, sg->kvm); + + lockdep_assert_held(&sg->parent->children_lock); + + gmap_remove_child(sg); + kvm_s390_vsie_gmap_notifier(sg, 0, -1UL); + + list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) { + gmap_cache->gmap = NULL; + list_del(&gmap_cache->list); + } + + gmap_put(sg); +} + +void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn) +{ + struct vsie_rmap *rmap, *rnext, *head; + struct gmap *sg, *next; + gfn_t start, end; + + list_for_each_entry_safe(sg, next, &parent->children, list) { + start = sg->guest_asce.rsto; + end = start + sg->guest_asce.tl + 1; + if (!sg->guest_asce.r && gfn >= start && gfn < end) { + gmap_unshadow(sg); + continue; + } + scoped_guard(spinlock, &sg->host_to_rmap_lock) + head = radix_tree_delete(&sg->host_to_rmap, gfn); + gmap_for_each_rmap_safe(rmap, rnext, head) + gmap_unshadow_level(sg, rmap->r_gfn, rmap->level); + } } /** - * __gmap_destroy_page() - Destroy a guest page. - * @gmap: the gmap of the guest - * @page: the page to destroy + * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables. + * @parent: Pointer to the parent gmap. + * @asce: ASCE for which the shadow table is created. + * @edat_level: Edat level to be used for the shadow translation. * - * An attempt will be made to destroy the given guest page. If the attempt - * fails, an attempt is made to export the page. If both attempts fail, an - * appropriate error is returned. + * Context: Called with parent->children_lock held. * - * Context: must be called holding the mm lock for gmap->mm + * Return: The pointer to a gmap if a shadow table with the given asce is + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL. */ -static int __gmap_destroy_page(struct gmap *gmap, struct page *page) +static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level) +{ + struct gmap *sg; + + lockdep_assert_held(&parent->children_lock); + list_for_each_entry(sg, &parent->children, list) { + if (!gmap_is_shadow_valid(sg, asce, edat_level)) + continue; + return sg; + } + return NULL; +} + +#define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE) +struct gmap_protect_asce_top_level { + unsigned long seq; + struct guest_fault f[CRST_TABLE_PAGES]; +}; + +static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, + struct gmap_protect_asce_top_level *context) +{ + struct gmap *parent; + int rc, i; + + guard(write_lock)(&sg->kvm->mmu_lock); + + if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f)) + return -EAGAIN; + + parent = READ_ONCE(sg->parent); + if (!parent) + return -EAGAIN; + scoped_guard(spinlock, &parent->children_lock) { + if (READ_ONCE(sg->parent) != parent) + return -EAGAIN; + sg->invalidated = false; + for (i = 0; i < CRST_TABLE_PAGES; i++) { + if (!context->f[i].valid) + continue; + rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn, + TABLE_TYPE_REGION1 + 1, context->f[i].writable); + if (rc) + return rc; + } + gmap_add_child(sg->parent, sg); + } + + kvm_s390_release_faultin_array(sg->kvm, context->f, false); + return 0; +} + +static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, + struct gmap_protect_asce_top_level *context) { - struct folio *folio = page_folio(page); int rc; - /* - * See gmap_make_secure(): large folios cannot be secure. Small - * folio implies FW_LEVEL_PTE. - */ - if (folio_test_large(folio)) - return -EFAULT; + if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f)) + return -EAGAIN; + do { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + rc = radix_tree_preload(GFP_KERNEL); + if (rc) + return rc; + rc = __gmap_protect_asce_top_level(mc, sg, context); + radix_tree_preload_end(); + } while (rc == -ENOMEM); - rc = uv_destroy_folio(folio); - /* - * Fault handlers can race; it is possible that two CPUs will fault - * on the same secure page. One CPU can destroy the page, reboot, - * re-enter secure mode and import it, while the second CPU was - * stuck at the beginning of the handler. At some point the second - * CPU will be able to progress, and it will not be able to destroy - * the page. In that case we do not want to terminate the process, - * we instead try to export the page. - */ - if (rc) - rc = uv_convert_from_secure_folio(folio); + return rc; +} + +static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg) +{ + struct gmap_protect_asce_top_level context = {}; + union asce asce = sg->guest_asce; + int rc; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + + context.seq = sg->kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); + rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false); + if (rc > 0) + rc = -EFAULT; + if (!rc) + rc = _gmap_protect_asce_top_level(mc, sg, &context); + if (rc) + kvm_s390_release_faultin_array(sg->kvm, context.f, true); return rc; } /** - * gmap_destroy_page() - Destroy a guest page. - * @gmap: the gmap of the guest - * @gaddr: the guest address to destroy + * gmap_create_shadow() - Create/find a shadow guest address space. + * @mc: The cache to use to allocate dat tables. + * @parent: Pointer to the parent gmap. + * @asce: ASCE for which the shadow table is created. + * @edat_level: Edat level to be used for the shadow translation. + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. * - * An attempt will be made to destroy the given guest page. If the attempt - * fails, an attempt is made to export the page. If both attempts fail, an - * appropriate error is returned. + * The returned shadow gmap will be returned with one extra reference. * - * Context: may sleep. + * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. */ -int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) +struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent, + union asce asce, int edat_level) { - struct page *page; - int rc = 0; + struct gmap *sg, *new; + int rc; - mmap_read_lock(gmap->mm); - page = gfn_to_page(gmap->private, gpa_to_gfn(gaddr)); - if (page) - rc = __gmap_destroy_page(gmap, page); - kvm_release_page_clean(page); - mmap_read_unlock(gmap->mm); - return rc; + if (WARN_ON(!parent)) + return ERR_PTR(-EINVAL); + + scoped_guard(spinlock, &parent->children_lock) { + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + gmap_get(sg); + return sg; + } + } + /* Create a new shadow gmap. */ + new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce)); + if (!new) + return ERR_PTR(-ENOMEM); + new->guest_asce = asce; + new->edat_level = edat_level; + set_bit(GMAP_FLAG_SHADOW, &new->flags); + + scoped_guard(spinlock, &parent->children_lock) { + /* Recheck if another CPU created the same shadow. */ + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + gmap_put(new); + gmap_get(sg); + return sg; + } + if (asce.r) { + /* Only allow one real-space gmap shadow. */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->guest_asce.r) { + scoped_guard(write_lock, &parent->kvm->mmu_lock) + gmap_unshadow(sg); + break; + } + } + gmap_add_child(parent, new); + /* Nothing to protect, return right away. */ + gmap_get(new); + return new; + } + } + + gmap_get(new); + new->parent = parent; + /* Protect while inserting, protects against invalidation races. */ + rc = gmap_protect_asce_top_level(mc, new); + if (rc) { + new->parent = NULL; + gmap_put(new); + gmap_put(new); + return ERR_PTR(rc); + } + return new; } diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h index c8f031c9ea5f..96ee1395a592 100644 --- a/arch/s390/kvm/gmap.h +++ b/arch/s390/kvm/gmap.h @@ -10,30 +10,246 @@ #ifndef ARCH_KVM_S390_GMAP_H #define ARCH_KVM_S390_GMAP_H -#define GMAP_SHADOW_FAKE_TABLE 1ULL +#include "dat.h" -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); -int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); +/** + * enum gmap_flags - Flags of a gmap. + * + * @GMAP_FLAG_SHADOW: The gmap is a vsie shadow gmap. + * @GMAP_FLAG_OWNS_PAGETABLES: The gmap owns all dat levels; normally 1, is 0 + * only for ucontrol per-cpu gmaps, since they + * share the page tables with the main gmap. + * @GMAP_FLAG_IS_UCONTROL: The gmap is ucontrol (main gmap or per-cpu gmap). + * @GMAP_FLAG_ALLOW_HPAGE_1M: 1M hugepages are allowed for this gmap, + * independently of the page size used by userspace. + * @GMAP_FLAG_ALLOW_HPAGE_2G: 2G hugepages are allowed for this gmap, + * independently of the page size used by userspace. + * @GMAP_FLAG_PFAULT_ENABLED: Pfault is enabled for the gmap. + * @GMAP_FLAG_USES_SKEYS: If the guest uses storage keys. + * @GMAP_FLAG_USES_CMM: Whether the guest uses CMMA. + * @GMAP_FLAG_EXPORT_ON_UNMAP: Whether to export guest pages when unmapping. + */ +enum gmap_flags { + GMAP_FLAG_SHADOW = 0, + GMAP_FLAG_OWNS_PAGETABLES, + GMAP_FLAG_IS_UCONTROL, + GMAP_FLAG_ALLOW_HPAGE_1M, + GMAP_FLAG_ALLOW_HPAGE_2G, + GMAP_FLAG_PFAULT_ENABLED, + GMAP_FLAG_USES_SKEYS, + GMAP_FLAG_USES_CMM, + GMAP_FLAG_EXPORT_ON_UNMAP, +}; /** - * gmap_shadow_valid - check if a shadow guest address space matches the - * given properties and is still valid - * @sg: pointer to the shadow guest address space structure - * @asce: ASCE for which the shadow table is requested - * @edat_level: edat level to be used for the shadow translation + * struct gmap_struct - Guest address space. * - * Returns 1 if the gmap shadow is still valid and matches the given - * properties, the caller can continue using it. Returns 0 otherwise, the - * caller has to request a new shadow gmap in this case. + * @flags: GMAP_FLAG_* flags. + * @edat_level: The edat level of this shadow gmap. + * @kvm: The vm. + * @asce: The ASCE used by this gmap. + * @list: List head used in children gmaps for the children gmap list. + * @children_lock: Protects children and scb_users. + * @children: List of child gmaps of this gmap. + * @scb_users: List of vsie_scb that use this shadow gmap. + * @parent: Parent gmap of a child gmap. + * @guest_asce: Original ASCE of this shadow gmap. + * @host_to_rmap_lock: Protects host_to_rmap. + * @host_to_rmap: Radix tree mapping host addresses to guest addresses. + */ +struct gmap { + unsigned long flags; + unsigned char edat_level; + bool invalidated; + struct kvm *kvm; + union asce asce; + struct list_head list; + spinlock_t children_lock; /* Protects: children, scb_users */ + struct list_head children; + struct list_head scb_users; + struct gmap *parent; + union asce guest_asce; + spinlock_t host_to_rmap_lock; /* Protects host_to_rmap */ + struct radix_tree_root host_to_rmap; + refcount_t refcount; +}; + +struct gmap_cache { + struct list_head list; + struct gmap *gmap; +}; + +#define gmap_for_each_rmap_safe(pos, n, head) \ + for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n) + +int s390_replace_asce(struct gmap *gmap); +bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint); +bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end); +bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end); +int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault); +struct gmap *gmap_new(struct kvm *kvm, gfn_t limit); +struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit); +void gmap_remove_child(struct gmap *child); +void gmap_dispose(struct gmap *gmap); +int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *fault, + struct kvm_memory_slot *slot); +void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end); +int gmap_set_limit(struct gmap *gmap, gfn_t limit); +int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr); +int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count); +void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count); +int gmap_enable_skeys(struct gmap *gmap); +int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible); +int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level); +int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, + kvm_pfn_t pfn, int level, bool wr); +void gmap_set_cmma_all_dirty(struct gmap *gmap); +void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn); +struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, + union asce asce, int edat_level); +void gmap_split_huge_pages(struct gmap *gmap); + +static inline bool uses_skeys(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); +} + +static inline bool uses_cmm(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_USES_CMM, &gmap->flags); +} + +static inline bool pfault_enabled(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_PFAULT_ENABLED, &gmap->flags); +} + +static inline bool is_ucontrol(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_IS_UCONTROL, &gmap->flags); +} + +static inline bool is_shadow(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_SHADOW, &gmap->flags); +} + +static inline bool owns_page_tables(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags); +} + +static inline struct gmap *gmap_put(struct gmap *gmap) +{ + if (refcount_dec_and_test(&gmap->refcount)) + gmap_dispose(gmap); + return NULL; +} + +static inline void gmap_get(struct gmap *gmap) +{ + WARN_ON_ONCE(unlikely(!refcount_inc_not_zero(&gmap->refcount))); +} + +static inline void gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn) +{ + scoped_guard(spinlock, &parent->children_lock) + _gmap_handle_vsie_unshadow_event(parent, gfn); +} + +static inline bool gmap_mkold_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end) +{ + return _gmap_unmap_prefix(gmap, gfn, end, true); +} + +static inline bool gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end) +{ + return _gmap_unmap_prefix(gmap, gfn, end, false); +} + +static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte, + union pgste pgste, gfn_t gfn, bool needs_lock) +{ + lockdep_assert_held(&gmap->kvm->mmu_lock); + if (!needs_lock) + lockdep_assert_held(&gmap->children_lock); + else + lockdep_assert_not_held(&gmap->children_lock); + + if (pgste.prefix_notif && (newpte.h.p || newpte.h.i)) { + pgste.prefix_notif = 0; + gmap_unmap_prefix(gmap, gfn, gfn + 1); + } + if (pgste.vsie_notif && (ptep->h.p != newpte.h.p || newpte.h.i)) { + pgste.vsie_notif = 0; + if (needs_lock) + gmap_handle_vsie_unshadow_event(gmap, gfn); + else + _gmap_handle_vsie_unshadow_event(gmap, gfn); + } + if (!ptep->s.d && newpte.s.d && !newpte.s.s) + SetPageDirty(pfn_to_page(newpte.h.pfra)); + return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap)); +} + +static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte, + union pgste pgste, gfn_t gfn) +{ + return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true); +} + +static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep, + union crste oldcrste, union crste newcrste, + gfn_t gfn, bool needs_lock) +{ + unsigned long align = is_pmd(newcrste) ? _PAGE_ENTRIES : _PAGE_ENTRIES * _CRST_ENTRIES; + + if (KVM_BUG_ON(crstep->h.tt != oldcrste.h.tt || newcrste.h.tt != oldcrste.h.tt, gmap->kvm)) + return true; + + lockdep_assert_held(&gmap->kvm->mmu_lock); + if (!needs_lock) + lockdep_assert_held(&gmap->children_lock); + + gfn = ALIGN_DOWN(gfn, align); + if (crste_prefix(oldcrste) && (newcrste.h.p || newcrste.h.i || !crste_prefix(newcrste))) { + newcrste.s.fc1.prefix_notif = 0; + gmap_unmap_prefix(gmap, gfn, gfn + align); + } + if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif && + (newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) { + newcrste.s.fc1.vsie_notif = 0; + if (needs_lock) + gmap_handle_vsie_unshadow_event(gmap, gfn); + else + _gmap_handle_vsie_unshadow_event(gmap, gfn); + } + if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s) + SetPageDirty(phys_to_page(crste_origin_large(newcrste))); + return dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce); +} + +static inline bool __must_check gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep, + union crste oldcrste, union crste newcrste, + gfn_t gfn) +{ + return _gmap_crstep_xchg_atomic(gmap, crstep, oldcrste, newcrste, gfn, true); +} + +/** + * gmap_is_shadow_valid() - check if a shadow guest address space matches the + * given properties and is still valid. + * @sg: Pointer to the shadow guest address space structure. + * @asce: ASCE for which the shadow table is requested. + * @edat_level: Edat level to be used for the shadow translation. * + * Return: true if the gmap shadow is still valid and matches the given + * properties and the caller can continue using it; false otherwise, the + * caller has to request a new shadow gmap in this case. */ -static inline int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +static inline bool gmap_is_shadow_valid(struct gmap *sg, union asce asce, int edat_level) { - if (sg->removed) - return 0; - return sg->orig_asce == asce && sg->edat_level == edat_level; + return sg->guest_asce.val == asce.val && sg->edat_level == edat_level; } -#endif +#endif /* ARCH_KVM_S390_GMAP_H */ diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 80879fc73c90..69835e1d4f20 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -232,18 +232,14 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, } if (nr_wp > 0) { - wp_info = kmalloc_array(nr_wp, - sizeof(*wp_info), - GFP_KERNEL_ACCOUNT); + wp_info = kmalloc_objs(*wp_info, nr_wp, GFP_KERNEL_ACCOUNT); if (!wp_info) { ret = -ENOMEM; goto error; } } if (nr_bp > 0) { - bp_info = kmalloc_array(nr_bp, - sizeof(*bp_info), - GFP_KERNEL_ACCOUNT); + bp_info = kmalloc_objs(*bp_info, nr_bp, GFP_KERNEL_ACCOUNT); if (!bp_info) { ret = -ENOMEM; goto error; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 610dd44a948b..39aff324203e 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -21,7 +21,7 @@ #include "gaccess.h" #include "trace.h" #include "trace-s390.h" -#include "gmap.h" +#include "faultin.h" u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu) { @@ -95,7 +95,7 @@ static int handle_validity(struct kvm_vcpu *vcpu) vcpu->stat.exit_validity++; trace_kvm_s390_intercept_validity(vcpu, viwhy); - KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%pK)", viwhy, + KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%p)", viwhy, current->pid, vcpu->kvm); /* do not warn on invalid runtime instrumentation mode */ @@ -368,8 +368,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg2, &srcaddr, GACC_FETCH, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0); - if (rc != 0) + + do { + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(srcaddr), false); + } while (rc == -EAGAIN); + if (rc) return rc; /* Ensure that the source is paged-in, no actual access -> no key checking */ @@ -377,8 +380,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg1, &dstaddr, GACC_STORE, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE); - if (rc != 0) + + do { + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(dstaddr), true); + } while (rc == -EAGAIN); + if (rc) return rc; kvm_s390_retry_instr(vcpu); @@ -472,6 +478,9 @@ static int handle_operexc(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->ipa == 0xb256) return handle_sthyi(vcpu); + if (vcpu->kvm->arch.user_operexec) + return -EOPNOTSUPP; + if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) return -EOPNOTSUPP; rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t)); @@ -545,7 +554,7 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu) guest_uvcb->header.cmd); return 0; } - rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb); + rc = kvm_s390_pv_make_secure(vcpu->kvm, uvcb.gaddr, &uvcb); /* * If the unpin did not succeed, the guest will exit again for the UVC * and we will retry the unpin. @@ -653,10 +662,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) break; case ICPT_PV_PREF: rc = 0; - gmap_convert_to_secure(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu)); - gmap_convert_to_secure(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu) + PAGE_SIZE); + kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu)); + kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu) + PAGE_SIZE); break; default: return -EOPNOTSUPP; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 07ff0e10cb7f..07f59c3b9a7b 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -7,12 +7,13 @@ * Author(s): Carsten Otte <cotte@de.ibm.com> */ -#define KMSG_COMPONENT "kvm-s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "kvm-s390: " fmt +#include <linux/cpufeature.h> #include <linux/interrupt.h> #include <linux/kvm_host.h> #include <linux/hrtimer.h> +#include <linux/export.h> #include <linux/mmu_context.h> #include <linux/nospec.h> #include <linux/signal.h> @@ -25,7 +26,6 @@ #include <linux/uaccess.h> #include <asm/sclp.h> #include <asm/isc.h> -#include <asm/gmap.h> #include <asm/nmi.h> #include <asm/airq.h> #include <asm/tpi.h> @@ -33,6 +33,7 @@ #include "gaccess.h" #include "trace-s390.h" #include "pci.h" +#include "gmap.h" #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 @@ -43,70 +44,34 @@ static struct kvm_s390_gib *gib; /* handle external calls via sigp interpretation facility */ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) { - int c, scn; + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl sigp_ctrl = sca->cpu[vcpu->vcpu_id].sigp_ctrl; if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND)) return 0; BUG_ON(!kvm_s390_use_sca_entries()); - read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - union esca_sigp_ctrl sigp_ctrl = - sca->cpu[vcpu->vcpu_id].sigp_ctrl; - - c = sigp_ctrl.c; - scn = sigp_ctrl.scn; - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - union bsca_sigp_ctrl sigp_ctrl = - sca->cpu[vcpu->vcpu_id].sigp_ctrl; - - c = sigp_ctrl.c; - scn = sigp_ctrl.scn; - } - read_unlock(&vcpu->kvm->arch.sca_lock); if (src_id) - *src_id = scn; + *src_id = sigp_ctrl.scn; - return c; + return sigp_ctrl.c; } static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) { + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; + union esca_sigp_ctrl old_val, new_val = {.scn = src_id, .c = 1}; int expect, rc; BUG_ON(!kvm_s390_use_sca_entries()); - read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - union esca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl new_val = {0}, old_val; - - old_val = READ_ONCE(*sigp_ctrl); - new_val.scn = src_id; - new_val.c = 1; - old_val.c = 0; - - expect = old_val.value; - rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - union bsca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl new_val = {0}, old_val; - old_val = READ_ONCE(*sigp_ctrl); - new_val.scn = src_id; - new_val.c = 1; - old_val.c = 0; + old_val = READ_ONCE(*sigp_ctrl); + old_val.c = 0; - expect = old_val.value; - rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); - } - read_unlock(&vcpu->kvm->arch.sca_lock); + expect = old_val.value; + rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); if (rc != expect) { /* another external call is pending */ @@ -118,24 +83,14 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) static void sca_clear_ext_call(struct kvm_vcpu *vcpu) { + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; + if (!kvm_s390_use_sca_entries()) return; kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND); - read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - union esca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - WRITE_ONCE(sigp_ctrl->value, 0); - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - union bsca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - - WRITE_ONCE(sigp_ctrl->value, 0); - } - read_unlock(&vcpu->kvm->arch.sca_lock); + WRITE_ONCE(sigp_ctrl->value, 0); } int psw_extint_disabled(struct kvm_vcpu *vcpu) @@ -577,7 +532,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, /* take care of lazy register loading */ kvm_s390_fpu_store(vcpu->run); save_access_regs(vcpu->run->s.regs.acrs); - if (MACHINE_HAS_GS && vcpu->arch.gs_enabled) + if (cpu_has_gs() && vcpu->arch.gs_enabled) save_gs_cb(current->thread.gs_cb); /* Extended save area */ @@ -948,8 +903,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC); rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea, (u64 *) __LC_PGM_LAST_BREAK); - rc |= put_guest_lc(vcpu, pgm_info.code, - (u16 *)__LC_PGM_INT_CODE); + rc |= put_guest_lc(vcpu, pgm_info.code, (u16 *)__LC_PGM_CODE); rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= read_guest_lc(vcpu, __LC_PGM_NEW_PSW, @@ -1002,6 +956,9 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu) set_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); spin_unlock(&fi->lock); + if (!ext.ext_params) + return 0; + VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x", ext.ext_params); vcpu->stat.deliver_service_signal++; @@ -1223,7 +1180,7 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - if (!sclp.has_sigpif) + if (!kvm_s390_use_sca_entries()) return test_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs); return sca_ext_call_pending(vcpu, NULL); @@ -1322,6 +1279,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime); no_timer: kvm_vcpu_srcu_read_unlock(vcpu); + vcpu->kvm->arch.float_int.last_sleep_cpu = vcpu->vcpu_idx; kvm_vcpu_halt(vcpu); vcpu->valid_wakeup = false; __unset_cpu_idle(vcpu); @@ -1547,7 +1505,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL) return -EINVAL; - if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu)) + if (kvm_s390_use_sca_entries() && !kvm_s390_pv_cpu_get_handle(vcpu)) return sca_inject_ext_call(vcpu, src_id); if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) @@ -1794,7 +1752,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, goto out; } gisa_out: - tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); + tmp_inti = kzalloc_obj(*inti, GFP_KERNEL_ACCOUNT); if (tmp_inti) { tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0); tmp_inti->io.io_int_word = isc_to_int_word(isc); @@ -1948,18 +1906,15 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) if (!online_vcpus) return; - /* find idle VCPUs first, then round robin */ - sigcpu = find_first_bit(kvm->arch.idle_mask, online_vcpus); - if (sigcpu == online_vcpus) { - do { - sigcpu = kvm->arch.float_int.next_rr_cpu++; - kvm->arch.float_int.next_rr_cpu %= online_vcpus; - /* avoid endless loops if all vcpus are stopped */ - if (nr_tries++ >= online_vcpus) - return; - } while (is_vcpu_stopped(kvm_get_vcpu(kvm, sigcpu))); + for (sigcpu = kvm->arch.float_int.last_sleep_cpu; ; sigcpu++) { + sigcpu %= online_vcpus; + dst_vcpu = kvm_get_vcpu(kvm, sigcpu); + if (!is_vcpu_stopped(dst_vcpu)) + break; + /* avoid endless loops if all vcpus are stopped */ + if (nr_tries++ >= online_vcpus) + return; } - dst_vcpu = kvm_get_vcpu(kvm, sigcpu); /* make the VCPU drop out of the SIE, or wake it up if sleeping */ switch (type) { @@ -2016,7 +1971,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti; int rc; - inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); + inti = kzalloc_obj(*inti, GFP_KERNEL_ACCOUNT); if (!inti) return -ENOMEM; @@ -2422,7 +2377,7 @@ static int enqueue_floating_irq(struct kvm_device *dev, return -EINVAL; while (len >= sizeof(struct kvm_s390_irq)) { - inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); + inti = kzalloc_obj(*inti, GFP_KERNEL_ACCOUNT); if (!inti) return -ENOMEM; @@ -2470,7 +2425,7 @@ static int register_io_adapter(struct kvm_device *dev, if (dev->kvm->arch.adapters[adapter_info.id] != NULL) return -EINVAL; - adapter = kzalloc(sizeof(*adapter), GFP_KERNEL_ACCOUNT); + adapter = kzalloc_obj(*adapter, GFP_KERNEL_ACCOUNT); if (!adapter) return -ENOMEM; @@ -2680,12 +2635,12 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case KVM_DEV_FLIC_APF_ENABLE: if (kvm_is_ucontrol(dev->kvm)) return -EINVAL; - dev->kvm->arch.gmap->pfault_enabled = 1; + set_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags); break; case KVM_DEV_FLIC_APF_DISABLE_WAIT: if (kvm_is_ucontrol(dev->kvm)) return -EINVAL; - dev->kvm->arch.gmap->pfault_enabled = 0; + clear_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags); /* * Make sure no async faults are in transition when * clearing the queues. So we don't need to worry @@ -2772,17 +2727,27 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap) bit = bit_nr + (addr % PAGE_SIZE) * 8; + /* kvm_set_routing_entry() should never allow this to happen */ + WARN_ON_ONCE(bit > (PAGE_SIZE * BITS_PER_BYTE - 1)); + return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit; } static struct page *get_map_page(struct kvm *kvm, u64 uaddr) { + struct mm_struct *mm = kvm->mm; struct page *page = NULL; + int locked = 1; + + if (mmget_not_zero(mm)) { + mmap_read_lock(mm); + get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, + &page, &locked); + if (locked) + mmap_read_unlock(mm); + mmput(mm); + } - mmap_read_lock(kvm->mm); - get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, - &page, NULL); - mmap_read_unlock(kvm->mm); return page; } @@ -2809,13 +2774,13 @@ static int adapter_indicators_set(struct kvm *kvm, bit = get_ind_bit(adapter_int->ind_addr, adapter_int->ind_offset, adapter->swap); set_bit(bit, map); - mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); + mark_page_dirty(kvm, adapter_int->ind_gaddr >> PAGE_SHIFT); set_page_dirty_lock(ind_page); map = page_address(summary_page); bit = get_ind_bit(adapter_int->summary_addr, adapter_int->summary_offset, adapter->swap); summary_set = test_and_set_bit(bit, map); - mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); + mark_page_dirty(kvm, adapter_int->summary_gaddr >> PAGE_SHIFT); set_page_dirty_lock(summary_page); srcu_read_unlock(&kvm->srcu, idx); @@ -2865,6 +2830,12 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, int rc; mci.val = mcck_info->mcic; + + /* log machine checks being reinjected on all debugs */ + VCPU_EVENT(vcpu, 2, "guest machine check %lx", mci.val); + KVM_EVENT(2, "guest machine check %lx", mci.val); + pr_info("guest machine check pid %d: %lx", current->pid, mci.val); + if (mci.sr) cr14 |= CR14_RECOVERY_SUBMASK; if (mci.dg) @@ -2893,6 +2864,7 @@ int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { + const struct kvm_irq_routing_s390_adapter *adapter; u64 uaddr_s, uaddr_i; int idx; @@ -2903,6 +2875,14 @@ int kvm_set_routing_entry(struct kvm *kvm, return -EINVAL; e->set = set_adapter_int; + adapter = &ue->u.adapter; + if (adapter->summary_addr + (adapter->summary_offset / 8) >= + (adapter->summary_addr & PAGE_MASK) + PAGE_SIZE) + return -EINVAL; + if (adapter->ind_addr + (adapter->ind_offset / 8) >= + (adapter->ind_addr & PAGE_MASK) + PAGE_SIZE) + return -EINVAL; + idx = srcu_read_lock(&kvm->srcu); uaddr_s = gpa_to_hva(kvm, ue->u.adapter.summary_addr); uaddr_i = gpa_to_hva(kvm, ue->u.adapter.ind_addr); @@ -2911,7 +2891,9 @@ int kvm_set_routing_entry(struct kvm *kvm, if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i)) return -EFAULT; e->adapter.summary_addr = uaddr_s; + e->adapter.summary_gaddr = ue->u.adapter.summary_addr; e->adapter.ind_addr = uaddr_i; + e->adapter.ind_gaddr = ue->u.adapter.ind_addr; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; @@ -3161,7 +3143,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm) if (!gi->origin) return; gisa_clear_ipm(gi->origin); - VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin); + VM_EVENT(kvm, 3, "gisa 0x%p cleared", gi->origin); } void kvm_s390_gisa_init(struct kvm *kvm) @@ -3174,11 +3156,10 @@ void kvm_s390_gisa_init(struct kvm *kvm) gi->alert.mask = 0; spin_lock_init(&gi->alert.ref_lock); gi->expires = 50 * 1000; /* 50 usec */ - hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - gi->timer.function = gisa_vcpu_kicker; + hrtimer_setup(&gi->timer, gisa_vcpu_kicker, CLOCK_MONOTONIC, HRTIMER_MODE_REL); memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); gi->origin->next_alert = (u32)virt_to_phys(gi->origin); - VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); + VM_EVENT(kvm, 3, "gisa 0x%p initialized", gi->origin); } void kvm_s390_gisa_enable(struct kvm *kvm) @@ -3219,7 +3200,7 @@ void kvm_s390_gisa_destroy(struct kvm *kvm) process_gib_alert_list(); hrtimer_cancel(&gi->timer); gi->origin = NULL; - VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa); + VM_EVENT(kvm, 3, "gisa 0x%p destroyed", gisa); } void kvm_s390_gisa_disable(struct kvm *kvm) @@ -3468,7 +3449,7 @@ int __init kvm_s390_gib_init(u8 nisc) } } - KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); + KVM_EVENT(3, "gib 0x%p (nisc=%d) initialized", gib, gib->nisc); goto out; out_unreg_gal: diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ebecb96bacce..e09960c2e6ed 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -10,10 +10,11 @@ * Jason J. Herne <jjherne@us.ibm.com> */ -#define KMSG_COMPONENT "kvm-s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "kvm-s390: " fmt #include <linux/compiler.h> +#include <linux/entry-virt.h> +#include <linux/export.h> #include <linux/err.h> #include <linux/fs.h> #include <linux/hrtimer.h> @@ -23,6 +24,7 @@ #include <linux/mman.h> #include <linux/module.h> #include <linux/moduleparam.h> +#include <linux/cpufeature.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/timer.h> @@ -36,8 +38,9 @@ #include <asm/access-regs.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> +#include <asm/machine.h> #include <asm/stp.h> -#include <asm/gmap.h> +#include <asm/gmap_helpers.h> #include <asm/nmi.h> #include <asm/isc.h> #include <asm/sclp.h> @@ -49,8 +52,9 @@ #include <asm/uv.h> #include "kvm-s390.h" #include "gaccess.h" -#include "pci.h" #include "gmap.h" +#include "faultin.h" +#include "pci.h" #define CREATE_TRACE_POINTS #include "trace.h" @@ -61,7 +65,7 @@ #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \ (KVM_MAX_VCPUS + LOCAL_IRQS)) -const struct _kvm_stats_desc kvm_vm_stats_desc[] = { +const struct kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), STATS_DESC_COUNTER(VM, inject_io), STATS_DESC_COUNTER(VM, inject_float_mchk), @@ -87,7 +91,7 @@ const struct kvm_stats_header kvm_vm_stats_header = { sizeof(kvm_vm_stats_desc), }; -const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { +const struct kvm_stats_desc kvm_vcpu_stats_desc[] = { KVM_GENERIC_VCPU_STATS(), STATS_DESC_COUNTER(VCPU, exit_userspace), STATS_DESC_COUNTER(VCPU, exit_null), @@ -182,7 +186,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, instruction_diagnose_308), STATS_DESC_COUNTER(VCPU, instruction_diagnose_500), STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), - STATS_DESC_COUNTER(VCPU, pfault_sync) + STATS_DESC_COUNTER(VCPU, pfault_sync), + STATS_DESC_COUNTER(VCPU, signal_exits) }; const struct kvm_stats_header kvm_vcpu_stats_header = { @@ -260,17 +265,11 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS) /* available subfunctions indicated via query / "test bit" */ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; -static struct gmap_notifier gmap_notifier; -static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ /* forward declarations */ -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end); -static int sca_switch_to_extended(struct kvm *kvm); - static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) { u8 delta_idx = 0; @@ -353,7 +352,7 @@ static __always_inline void pfcr_query(u8 (*query)[16]) { asm volatile( " lghi 0,0\n" - " .insn rsy,0xeb0000000016,0,0,%[query]\n" + " .insn rsy,0xeb0000000016,0,0,%[query]" : [query] "=QS" (*query) : : "cc", "0"); @@ -365,7 +364,7 @@ static __always_inline void __sortl_query(u8 (*query)[32]) " lghi 0,0\n" " la 1,%[query]\n" /* Parameter registers are ignored */ - " .insn rre,0xb9380000,2,4\n" + " .insn rre,0xb9380000,2,4" : [query] "=R" (*query) : : "cc", "0", "1"); @@ -377,7 +376,7 @@ static __always_inline void __dfltcc_query(u8 (*query)[32]) " lghi 0,0\n" " la 1,%[query]\n" /* Parameter registers are ignored */ - " .insn rrf,0xb9390000,2,4,6,0\n" + " .insn rrf,0xb9390000,2,4,6,0" : [query] "=R" (*query) : : "cc", "0", "1"); @@ -443,13 +442,13 @@ static void __init kvm_s390_cpu_feat_init(void) if (test_facility(201)) /* PFCR */ pfcr_query(&kvm_s390_available_subfunc.pfcr); - if (MACHINE_HAS_ESOP) + if (machine_has_esop()) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); /* * We need SIE support, ESOP (PROT_READ protection for gmap_shadow), * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing). */ - if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao || + if (!sclp.has_sief2 || !machine_has_esop() || !sclp.has_64bscao || !test_facility(3) || !nested) return; allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2); @@ -526,10 +525,6 @@ static int __init __kvm_s390_init(void) if (rc) goto err_gib; - gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_pte_notifier(&gmap_notifier); - vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; - gmap_register_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_register(&s390_epoch_delta_notifier, &kvm_clock_notifier); @@ -549,8 +544,6 @@ err_kvm_uv: static void __kvm_s390_exit(void) { - gmap_unregister_pte_notifier(&gmap_notifier); - gmap_unregister_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, &kvm_clock_notifier); @@ -561,12 +554,43 @@ static void __kvm_s390_exit(void) debug_unregister(kvm_s390_dbf_uv); } +static int kvm_s390_keyop(struct kvm_s390_mmu_cache *mc, struct kvm *kvm, int op, + unsigned long addr, union skey skey) +{ + union asce asce = kvm->arch.gmap->asce; + gfn_t gfn = gpa_to_gfn(addr); + int r; + + guard(read_lock)(&kvm->mmu_lock); + + switch (op) { + case KVM_S390_KEYOP_SSKE: + r = dat_cond_set_storage_key(mc, asce, gfn, skey, &skey, 0, 0, 0); + if (r >= 0) + return skey.skey; + break; + case KVM_S390_KEYOP_ISKE: + r = dat_get_storage_key(asce, gfn, &skey); + if (!r) + return skey.skey; + break; + case KVM_S390_KEYOP_RRBE: + r = dat_reset_reference_bit(asce, gfn); + if (r > 0) + return r << 1; + break; + default: + return -EINVAL; + } + return r; +} + /* Section: device related */ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { if (ioctl == KVM_S390_ENABLE_SIE) - return s390_enable_sie(); + return 0; return -EINVAL; } @@ -577,7 +601,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_S390_PSW: case KVM_CAP_S390_GMAP: - case KVM_CAP_SYNC_MMU: #ifdef CONFIG_KVM_S390_UCONTROL case KVM_CAP_S390_UCONTROL: #endif @@ -604,6 +627,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_S390_DIAG318: case KVM_CAP_IRQFD_RESAMPLE: + case KVM_CAP_S390_USER_OPEREXEC: + case KVM_CAP_S390_KEYOP: + case KVM_CAP_S390_VSIE_ESAMODE: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -629,16 +655,18 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: - r = KVM_S390_BSCA_CPU_SLOTS; + /* + * Return the same value for KVM_CAP_MAX_VCPUS and + * KVM_CAP_MAX_VCPU_ID to conform with the KVM API. + */ + r = KVM_S390_ESCA_CPU_SLOTS; if (!kvm_s390_use_sca_entries()) r = KVM_MAX_VCPUS; - else if (sclp.has_esca && sclp.has_64bscao) - r = KVM_S390_ESCA_CPU_SLOTS; if (ext == KVM_CAP_NR_VCPUS) r = min_t(unsigned int, num_online_cpus(), r); break; case KVM_CAP_S390_COW: - r = MACHINE_HAS_ESOP; + r = machine_has_esop(); break; case KVM_CAP_S390_VECTOR_REGISTERS: r = test_facility(129); @@ -692,32 +720,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - int i; - gfn_t cur_gfn, last_gfn; - unsigned long gaddr, vmaddr; - struct gmap *gmap = kvm->arch.gmap; - DECLARE_BITMAP(bitmap, _PAGE_ENTRIES); - - /* Loop over all guest segments */ - cur_gfn = memslot->base_gfn; - last_gfn = memslot->base_gfn + memslot->npages; - for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) { - gaddr = gfn_to_gpa(cur_gfn); - vmaddr = gfn_to_hva_memslot(memslot, cur_gfn); - if (kvm_is_error_hva(vmaddr)) - continue; - - bitmap_zero(bitmap, _PAGE_ENTRIES); - gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr); - for (i = 0; i < _PAGE_ENTRIES; i++) { - if (test_bit(i, bitmap)) - mark_page_dirty(kvm, cur_gfn + i); - } + gfn_t last_gfn = memslot->base_gfn + memslot->npages; - if (fatal_signal_pending(current)) - return; - cond_resched(); - } + scoped_guard(read_lock, &kvm->mmu_lock) + gmap_sync_dirty_log(kvm->arch.gmap, memslot->base_gfn, last_gfn); } /* Section: vm related */ @@ -877,9 +883,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = -EINVAL; else { r = 0; - mmap_write_lock(kvm->mm); - kvm->mm->context.allow_gmap_hpage_1m = 1; - mmap_write_unlock(kvm->mm); + set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags); /* * We might have to create fake 4k page * tables. To avoid that the hardware works on @@ -917,6 +921,17 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", r ? "(not available)" : "(success)"); break; + case KVM_CAP_S390_USER_OPEREXEC: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_OPEREXEC"); + kvm->arch.user_operexec = 1; + icpt_operexc_on_all_vcpus(kvm); + r = 0; + break; + case KVM_CAP_S390_VSIE_ESAMODE: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_VSIE_ESAMODE"); + kvm->arch.allow_vsie_esamode = 1; + r = 0; + break; default: r = -EINVAL; break; @@ -946,7 +961,7 @@ static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *att static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; - unsigned int idx; + switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: ret = -ENXIO; @@ -957,8 +972,6 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att mutex_lock(&kvm->lock); if (kvm->created_vcpus) ret = -EBUSY; - else if (kvm->mm->context.allow_gmap_hpage_1m) - ret = -EINVAL; else { kvm->arch.use_cmma = 1; /* Not compatible with cmma. */ @@ -967,7 +980,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att } mutex_unlock(&kvm->lock); break; - case KVM_S390_VM_MEM_CLR_CMMA: + case KVM_S390_VM_MEM_CLR_CMMA: { + gfn_t start_gfn = 0; + ret = -ENXIO; if (!sclp.has_cmma) break; @@ -976,13 +991,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att break; VM_EVENT(kvm, 3, "%s", "RESET: CMMA states"); - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - s390_reset_cmma(kvm->arch.gmap->mm); - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); + do { + start_gfn = dat_reset_cmma(kvm->arch.gmap->asce, start_gfn); + cond_resched(); + } while (start_gfn); ret = 0; break; + } case KVM_S390_VM_MEM_LIMIT_SIZE: { unsigned long new_limit; @@ -999,29 +1014,12 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - /* gmap_create takes last usable address */ - if (new_limit != KVM_S390_NO_MEM_LIMIT) - new_limit -= 1; - ret = -EBUSY; - mutex_lock(&kvm->lock); - if (!kvm->created_vcpus) { - /* gmap_create will round the limit up */ - struct gmap *new = gmap_create(current->mm, new_limit); - - if (!new) { - ret = -ENOMEM; - } else { - gmap_remove(kvm->arch.gmap); - new->private = kvm; - kvm->arch.gmap = new; - ret = 0; - } - } - mutex_unlock(&kvm->lock); + if (!kvm->created_vcpus) + ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit); - VM_EVENT(kvm, 3, "New guest asce: 0x%pK", - (void *) kvm->arch.gmap->asce); + VM_EVENT(kvm, 3, "New guest asce: 0x%p", + (void *)kvm->arch.gmap->asce.val); break; } default: @@ -1186,19 +1184,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) kvm->arch.migration_mode = 1; return 0; } - /* mark all the pages in active slots as dirty */ kvm_for_each_memslot(ms, bkt, slots) { if (!ms->dirty_bitmap) return -EINVAL; - /* - * The second half of the bitmap is only used on x86, - * and would be wasted otherwise, so we put it to good - * use here to keep track of the state of the storage - * attributes. - */ - memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms)); ram_pages += ms->npages; } + /* mark all the pages as dirty */ + gmap_set_cmma_all_dirty(kvm->arch.gmap); atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages); kvm->arch.migration_mode = 1; kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); @@ -1431,7 +1423,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) ret = -EBUSY; goto out; } - proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT); + proc = kzalloc_obj(*proc, GFP_KERNEL_ACCOUNT); if (!proc) { ret = -ENOMEM; goto out; @@ -1631,7 +1623,7 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_vm_cpu_processor *proc; int ret = 0; - proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT); + proc = kzalloc_obj(*proc, GFP_KERNEL_ACCOUNT); if (!proc) { ret = -ENOMEM; goto out; @@ -1659,7 +1651,7 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_vm_cpu_machine *mach; int ret = 0; - mach = kzalloc(sizeof(*mach), GFP_KERNEL_ACCOUNT); + mach = kzalloc_obj(*mach, GFP_KERNEL_ACCOUNT); if (!mach) { ret = -ENOMEM; goto out; @@ -1928,22 +1920,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) * Updates the Multiprocessor Topology-Change-Report bit to signal * the guest with a topology change. * This is only relevant if the topology facility is present. - * - * The SCA version, bsca or esca, doesn't matter as offset is the same. */ static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) { union sca_utility new, old; - struct bsca_block *sca; + struct esca_block *sca; - read_lock(&kvm->arch.sca_lock); sca = kvm->arch.sca; old = READ_ONCE(sca->utility); do { new = old; new.mtcr = val; } while (!try_cmpxchg(&sca->utility.val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); } static int kvm_s390_set_topo_change_indication(struct kvm *kvm, @@ -1964,9 +1952,7 @@ static int kvm_s390_get_topo_change_indication(struct kvm *kvm, if (!test_kvm_facility(kvm, 11)) return -ENXIO; - read_lock(&kvm->arch.sca_lock); - topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; - read_unlock(&kvm->arch.sca_lock); + topo = kvm->arch.sca->utility.mtcr; return put_user(topo, (u8 __user *)attr->addr); } @@ -2110,40 +2096,32 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { - uint8_t *keys; - uint64_t hva; - int srcu_idx, i, r = 0; + union skey *keys; + int i, r = 0; if (args->flags != 0) return -EINVAL; /* Is this guest using storage keys? */ - if (!mm_uses_skeys(current->mm)) + if (!uses_skeys(kvm->arch.gmap)) return KVM_S390_GET_SKEYS_NONE; /* Enforce sane limit on memory allocation */ if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); + keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; - mmap_read_lock(current->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < args->count; i++) { - hva = gfn_to_hva(kvm, args->start_gfn + i); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; - break; + scoped_guard(read_lock, &kvm->mmu_lock) { + for (i = 0; i < args->count; i++) { + r = dat_get_storage_key(kvm->arch.gmap->asce, + args->start_gfn + i, keys + i); + if (r) + break; } - - r = get_guest_storage_key(current->mm, hva, &keys[i]); - if (r) - break; } - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(current->mm); if (!r) { r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, @@ -2158,10 +2136,9 @@ static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { - uint8_t *keys; - uint64_t hva; - int srcu_idx, i, r = 0; - bool unlocked; + struct kvm_s390_mmu_cache *mc; + union skey *keys; + int i, r = 0; if (args->flags != 0) return -EINVAL; @@ -2170,7 +2147,7 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); + keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; @@ -2182,159 +2159,41 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) } /* Enable storage key handling for the guest */ - r = s390_enable_skey(); + r = gmap_enable_skeys(kvm->arch.gmap); if (r) goto out; - i = 0; - mmap_read_lock(current->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - while (i < args->count) { - unlocked = false; - hva = gfn_to_hva(kvm, args->start_gfn + i); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; - break; - } - + r = -EINVAL; + for (i = 0; i < args->count; i++) { /* Lowest order bit is reserved */ - if (keys[i] & 0x01) { - r = -EINVAL; - break; - } - - r = set_guest_storage_key(current->mm, hva, keys[i], 0); - if (r) { - r = fixup_user_fault(current->mm, hva, - FAULT_FLAG_WRITE, &unlocked); - if (r) - break; - } - if (!r) - i++; - } - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(current->mm); -out: - kvfree(keys); - return r; -} - -/* - * Base address and length must be sent at the start of each block, therefore - * it's cheaper to send some clean data, as long as it's less than the size of - * two longs. - */ -#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) -/* for consistency */ -#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) - -static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, - u8 *res, unsigned long bufsize) -{ - unsigned long pgstev, hva, cur_gfn = args->start_gfn; - - args->count = 0; - while (args->count < bufsize) { - hva = gfn_to_hva(kvm, cur_gfn); - /* - * We return an error if the first value was invalid, but we - * return successfully if at least one value was copied. - */ - if (kvm_is_error_hva(hva)) - return args->count ? 0 : -EFAULT; - if (get_pgste(kvm->mm, hva, &pgstev) < 0) - pgstev = 0; - res[args->count++] = (pgstev >> 24) & 0x43; - cur_gfn++; - } - - return 0; -} - -static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots, - gfn_t gfn) -{ - return ____gfn_to_memslot(slots, gfn, true); -} - -static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, - unsigned long cur_gfn) -{ - struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn); - unsigned long ofs = cur_gfn - ms->base_gfn; - struct rb_node *mnode = &ms->gfn_node[slots->node_idx]; - - if (ms->base_gfn + ms->npages <= cur_gfn) { - mnode = rb_next(mnode); - /* If we are above the highest slot, wrap around */ - if (!mnode) - mnode = rb_first(&slots->gfn_tree); - - ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); - ofs = 0; + if (keys[i].zero) + goto out; } - if (cur_gfn < ms->base_gfn) - ofs = 0; - - ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); - while (ofs >= ms->npages && (mnode = rb_next(mnode))) { - ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); - ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages); + mc = kvm_s390_new_mmu_cache(); + if (!mc) { + r = -ENOMEM; + goto out; } - return ms->base_gfn + ofs; -} -static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, - u8 *res, unsigned long bufsize) -{ - unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev; - struct kvm_memslots *slots = kvm_memslots(kvm); - struct kvm_memory_slot *ms; - - if (unlikely(kvm_memslots_empty(slots))) - return 0; - - cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); - ms = gfn_to_memslot(kvm, cur_gfn); - args->count = 0; - args->start_gfn = cur_gfn; - if (!ms) - return 0; - next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); - mem_end = kvm_s390_get_gfn_end(slots); - - while (args->count < bufsize) { - hva = gfn_to_hva(kvm, cur_gfn); - if (kvm_is_error_hva(hva)) - return 0; - /* Decrement only if we actually flipped the bit to 0 */ - if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) - atomic64_dec(&kvm->arch.cmma_dirty_pages); - if (get_pgste(kvm->mm, hva, &pgstev) < 0) - pgstev = 0; - /* Save the value */ - res[args->count++] = (pgstev >> 24) & 0x43; - /* If the next bit is too far away, stop. */ - if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE) - return 0; - /* If we reached the previous "next", find the next one */ - if (cur_gfn == next_gfn) - next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); - /* Reached the end of memory or of the buffer, stop */ - if ((next_gfn >= mem_end) || - (next_gfn - args->start_gfn >= bufsize)) - return 0; - cur_gfn++; - /* Reached the end of the current memslot, take the next one. */ - if (cur_gfn - ms->base_gfn >= ms->npages) { - ms = gfn_to_memslot(kvm, cur_gfn); - if (!ms) - return 0; + r = 0; + do { + r = kvm_s390_mmu_cache_topup(mc); + if (r == -ENOMEM) + break; + scoped_guard(read_lock, &kvm->mmu_lock) { + for (i = 0 ; i < args->count; i++) { + r = dat_set_storage_key(mc, kvm->arch.gmap->asce, + args->start_gfn + i, keys[i], 0); + if (r) + break; + } } - } - return 0; + } while (r == -ENOMEM); + kvm_s390_free_mmu_cache(mc); +out: + kvfree(keys); + return r; } /* @@ -2348,8 +2207,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, static int kvm_s390_get_cmma_bits(struct kvm *kvm, struct kvm_s390_cmma_log *args) { - unsigned long bufsize; - int srcu_idx, peek, ret; + int peek, ret; u8 *values; if (!kvm->arch.use_cmma) @@ -2362,8 +2220,8 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, if (!peek && !kvm->arch.migration_mode) return -EINVAL; /* CMMA is disabled or was not used, or the buffer has length zero */ - bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); - if (!bufsize || !kvm->mm->context.uses_cmm) { + args->count = min(args->count, KVM_S390_CMMA_SIZE_MAX); + if (!args->count || !uses_cmm(kvm->arch.gmap)) { memset(args, 0, sizeof(*args)); return 0; } @@ -2373,18 +2231,18 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, return 0; } - values = vmalloc(bufsize); + values = vmalloc(args->count); if (!values) return -ENOMEM; - mmap_read_lock(kvm->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - if (peek) - ret = kvm_s390_peek_cmma(kvm, args, values, bufsize); - else - ret = kvm_s390_get_cmma(kvm, args, values, bufsize); - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(kvm->mm); + scoped_guard(read_lock, &kvm->mmu_lock) { + if (peek) + ret = dat_peek_cmma(args->start_gfn, kvm->arch.gmap->asce, &args->count, + values); + else + ret = dat_get_cmma(kvm->arch.gmap->asce, &args->start_gfn, &args->count, + values, &kvm->arch.cmma_dirty_pages); + } if (kvm->arch.migration_mode) args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages); @@ -2406,11 +2264,9 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, static int kvm_s390_set_cmma_bits(struct kvm *kvm, const struct kvm_s390_cmma_log *args) { - unsigned long hva, mask, pgstev, i; - uint8_t *bits; - int srcu_idx, r = 0; - - mask = args->mask; + struct kvm_s390_mmu_cache *mc; + u8 *bits = NULL; + int r = 0; if (!kvm->arch.use_cmma) return -ENXIO; @@ -2424,9 +2280,12 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, if (args->count == 0) return 0; + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; bits = vmalloc(array_size(sizeof(*bits), args->count)); if (!bits) - return -ENOMEM; + goto out; r = copy_from_user(bits, (void __user *)args->values, args->count); if (r) { @@ -2434,29 +2293,19 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, goto out; } - mmap_read_lock(kvm->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < args->count; i++) { - hva = gfn_to_hva(kvm, args->start_gfn + i); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; + do { + r = kvm_s390_mmu_cache_topup(mc); + if (r) break; + scoped_guard(read_lock, &kvm->mmu_lock) { + r = dat_set_cmma_bits(mc, kvm->arch.gmap->asce, args->start_gfn, + args->count, args->mask, bits); } + } while (r == -ENOMEM); - pgstev = bits[i]; - pgstev = pgstev << 24; - mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT; - set_pgste_bits(kvm->mm, hva, mask, pgstev); - } - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(kvm->mm); - - if (!kvm->mm->context.uses_cmm) { - mmap_write_lock(kvm->mm); - kvm->mm->context.uses_cmm = 1; - mmap_write_unlock(kvm->mm); - } + set_bit(GMAP_FLAG_USES_CMM, &kvm->arch.gmap->flags); out: + kvm_s390_free_mmu_cache(mc); vfree(bits); return r; } @@ -2664,15 +2513,16 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) if (kvm_s390_pv_is_protected(kvm)) break; + mmap_write_lock(kvm->mm); /* - * FMT 4 SIE needs esca. As we never switch back to bsca from - * esca, we need no cleanup in the error cases below + * Disable creation of new THPs. Existing THPs can stay, they + * will be split when any part of them gets imported. */ - r = sca_switch_to_extended(kvm); - if (r) - break; - - r = s390_disable_cow_sharing(); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, kvm->mm); + mm_flags_set(MMF_DISABLE_THP_COMPLETELY, kvm->mm); + set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags); + r = gmap_helper_disable_cow_sharing(); + mmap_write_unlock(kvm->mm); if (r) break; @@ -2744,9 +2594,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) if (copy_from_user(&parms, argp, sizeof(parms))) break; - /* Currently restricted to 8KB */ + /* Currently restricted to 1MiB */ r = -EINVAL; - if (parms.length > PAGE_SIZE * 2) + if (parms.length > SZ_1M) break; r = -ENOMEM; @@ -2900,9 +2750,9 @@ static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_fla static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + void *tmpbuf __free(kvfree) = NULL; enum gacc_mode acc_mode; - void *tmpbuf = NULL; - int r, srcu_idx; + int r; r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION | KVM_S390_MEMOP_F_CHECK_ONLY); @@ -2915,52 +2765,32 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) return -ENOMEM; } - srcu_idx = srcu_read_lock(&kvm->srcu); + acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE; - if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { - r = PGM_ADDRESSING; - goto out_unlock; - } + scoped_guard(srcu, &kvm->srcu) { + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) + return check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); - acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE; - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); - goto out_unlock; - } - if (acc_mode == GACC_FETCH) { + if (acc_mode == GACC_STORE && copy_from_user(tmpbuf, uaddr, mop->size)) + return -EFAULT; r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, - mop->size, GACC_FETCH, mop->key); + mop->size, acc_mode, mop->key); if (r) - goto out_unlock; - if (copy_to_user(uaddr, tmpbuf, mop->size)) - r = -EFAULT; - } else { - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - goto out_unlock; - } - r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, - mop->size, GACC_STORE, mop->key); + return r; + if (acc_mode != GACC_STORE && copy_to_user(uaddr, tmpbuf, mop->size)) + return -EFAULT; } - -out_unlock: - srcu_read_unlock(&kvm->srcu, srcu_idx); - - vfree(tmpbuf); - return r; + return 0; } static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; void __user *old_addr = (void __user *)mop->old_addr; - union { - __uint128_t quad; - char raw[sizeof(__uint128_t)]; - } old = { .quad = 0}, new = { .quad = 0 }; - unsigned int off_in_quad = sizeof(new) - mop->size; - int r, srcu_idx; - bool success; + union kvm_s390_quad old = { .sixteen = 0 }; + union kvm_s390_quad new = { .sixteen = 0 }; + bool success = false; + int r; r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION); if (r) @@ -2972,25 +2802,18 @@ static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *m */ if (mop->size > sizeof(new)) return -EINVAL; - if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size)) + if (copy_from_user(&new, uaddr, mop->size)) return -EFAULT; - if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size)) + if (copy_from_user(&old, old_addr, mop->size)) return -EFAULT; - srcu_idx = srcu_read_lock(&kvm->srcu); + scoped_guard(srcu, &kvm->srcu) { + r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old, new, + mop->key, &success); - if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { - r = PGM_ADDRESSING; - goto out_unlock; + if (!success && copy_to_user(old_addr, &old, mop->size)) + return -EFAULT; } - - r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad, - new.quad, mop->key, &success); - if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size)) - r = -EFAULT; - -out_unlock: - srcu_read_unlock(&kvm->srcu, srcu_idx); return r; } @@ -3145,6 +2968,32 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) r = -EFAULT; break; } + case KVM_S390_KEYOP: { + struct kvm_s390_mmu_cache *mc; + struct kvm_s390_keyop kop; + union skey skey; + + if (copy_from_user(&kop, argp, sizeof(kop))) { + r = -EFAULT; + break; + } + skey.skey = kop.key; + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + r = kvm_s390_keyop(mc, kvm, kop.operation, kop.guest_addr, skey); + kvm_s390_free_mmu_cache(mc); + if (r < 0) + break; + + kop.key = r; + r = 0; + if (copy_to_user(argp, &kop, sizeof(kop))) + r = -EFAULT; + break; + } case KVM_S390_ZPCI_OP: { struct kvm_s390_zpci_op args; @@ -3312,10 +3161,7 @@ static void kvm_s390_crypto_init(struct kvm *kvm) static void sca_dispose(struct kvm *kvm) { - if (kvm->arch.use_esca) - free_pages_exact(kvm->arch.sca, sizeof(struct esca_block)); - else - free_page((unsigned long)(kvm->arch.sca)); + free_pages_exact(kvm->arch.sca, sizeof(*kvm->arch.sca)); kvm->arch.sca = NULL; } @@ -3329,10 +3175,11 @@ void kvm_arch_free_vm(struct kvm *kvm) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - gfp_t alloc_flags = GFP_KERNEL_ACCOUNT; - int i, rc; + gfp_t alloc_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO; char debug_name[16]; - static unsigned long sca_offset; + int i, rc; + + mutex_init(&kvm->arch.pv.import_lock); rc = -EINVAL; #ifdef CONFIG_KVM_S390_UCONTROL @@ -3344,29 +3191,18 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (type) goto out_err; #endif - - rc = s390_enable_sie(); - if (rc) - goto out_err; - rc = -ENOMEM; if (!sclp.has_64bscao) alloc_flags |= GFP_DMA; - rwlock_init(&kvm->arch.sca_lock); - /* start with basic SCA */ - kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); - if (!kvm->arch.sca) - goto out_err; mutex_lock(&kvm_lock); - sca_offset += 16; - if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE) - sca_offset = 0; - kvm->arch.sca = (struct bsca_block *) - ((char *) kvm->arch.sca + sca_offset); + + kvm->arch.sca = alloc_pages_exact(sizeof(*kvm->arch.sca), alloc_flags); mutex_unlock(&kvm_lock); + if (!kvm->arch.sca) + goto out_err; - sprintf(debug_name, "kvm-%u", current->pid); + snprintf(debug_name, sizeof(debug_name), "kvm-%u", current->pid); kvm->arch.dbf = debug_register(debug_name, 32, 1, 7 * sizeof(long)); if (!kvm->arch.dbf) @@ -3396,7 +3232,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* we emulate STHYI in kvm */ set_kvm_facility(kvm->arch.model.fac_mask, 74); set_kvm_facility(kvm->arch.model.fac_list, 74); - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { set_kvm_facility(kvm->arch.model.fac_mask, 147); set_kvm_facility(kvm->arch.model.fac_list, 147); } @@ -3428,6 +3264,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) debug_register_view(kvm->arch.dbf, &debug_sprintf_view); VM_EVENT(kvm, 3, "vm created with type %lu", type); + kvm->arch.mem_limit = type & KVM_VM_S390_UCONTROL ? KVM_S390_NO_MEM_LIMIT : sclp.hamax + 1; + kvm->arch.gmap = gmap_new(kvm, gpa_to_gfn(kvm->arch.mem_limit)); + if (!kvm->arch.gmap) + goto out_err; + clear_bit(GMAP_FLAG_PFAULT_ENABLED, &kvm->arch.gmap->flags); + if (type & KVM_VM_S390_UCONTROL) { struct kvm_userspace_memory_region2 fake_memslot = { .slot = KVM_S390_UCONTROL_MEMSLOT, @@ -3437,23 +3279,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) .flags = 0, }; - kvm->arch.gmap = NULL; - kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT; /* one flat fake memslot covering the whole address-space */ mutex_lock(&kvm->slots_lock); KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm); mutex_unlock(&kvm->slots_lock); + set_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags); } else { - if (sclp.hamax == U64_MAX) - kvm->arch.mem_limit = TASK_SIZE_MAX; - else - kvm->arch.mem_limit = min_t(unsigned long, TASK_SIZE_MAX, - sclp.hamax + 1); - kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1); - if (!kvm->arch.gmap) - goto out_err; - kvm->arch.gmap->private = kvm; - kvm->arch.gmap->pfault_enabled = 0; + struct crst_table *table = dereference_asce(kvm->arch.gmap->asce); + + crst_table_init((void *)table, _CRSTE_HOLE(table->crstes[0].h.tt).val); } kvm->arch.use_pfmfi = sclp.has_pfmfi; @@ -3464,7 +3298,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_s390_gisa_init(kvm); INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup); kvm->arch.pv.set_aside = NULL; - KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); + KVM_EVENT(3, "vm 0x%p created by pid %u", kvm, current->pid); return 0; out_err: @@ -3487,8 +3321,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) sca_del_vcpu(vcpu); kvm_s390_update_topology_change_report(vcpu->kvm, 1); - if (kvm_is_ucontrol(vcpu->kvm)) - gmap_remove(vcpu->arch.gmap); + if (kvm_is_ucontrol(vcpu->kvm)) { + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) + gmap_remove_child(vcpu->arch.gmap); + vcpu->arch.gmap = gmap_put(vcpu->arch.gmap); + } if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); @@ -3496,6 +3333,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (kvm_s390_pv_cpu_get_handle(vcpu)) kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc); free_page((unsigned long)(vcpu->arch.sie_block)); + kvm_s390_free_mmu_cache(vcpu->arch.mc); } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -3522,154 +3360,48 @@ void kvm_arch_destroy_vm(struct kvm *kvm) debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); - if (!kvm_is_ucontrol(kvm)) - gmap_remove(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); kvm_s390_vsie_destroy(kvm); - KVM_EVENT(3, "vm 0x%pK destroyed", kvm); + kvm->arch.gmap = gmap_put(kvm->arch.gmap); + KVM_EVENT(3, "vm 0x%p destroyed", kvm); } /* Section: vcpu related */ -static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) -{ - vcpu->arch.gmap = gmap_create(current->mm, -1UL); - if (!vcpu->arch.gmap) - return -ENOMEM; - vcpu->arch.gmap->private = vcpu->kvm; - - return 0; -} - static void sca_del_vcpu(struct kvm_vcpu *vcpu) { + struct esca_block *sca = vcpu->kvm->arch.sca; + if (!kvm_s390_use_sca_entries()) return; - read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - - clear_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); - sca->cpu[vcpu->vcpu_id].sda = 0; - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - clear_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); - sca->cpu[vcpu->vcpu_id].sda = 0; - } - read_unlock(&vcpu->kvm->arch.sca_lock); + clear_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn); + sca->cpu[vcpu->vcpu_id].sda = 0; } static void sca_add_vcpu(struct kvm_vcpu *vcpu) { - if (!kvm_s390_use_sca_entries()) { - phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca); - - /* we still need the basic sca for the ipte control */ - vcpu->arch.sie_block->scaoh = sca_phys >> 32; - vcpu->arch.sie_block->scaol = sca_phys; - return; - } - read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - phys_addr_t sca_phys = virt_to_phys(sca); - - sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); - vcpu->arch.sie_block->scaoh = sca_phys >> 32; - vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; - vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; - set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - phys_addr_t sca_phys = virt_to_phys(sca); - - sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); - vcpu->arch.sie_block->scaoh = sca_phys >> 32; - vcpu->arch.sie_block->scaol = sca_phys; - set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); - } - read_unlock(&vcpu->kvm->arch.sca_lock); -} - -/* Basic SCA to Extended SCA data copy routines */ -static inline void sca_copy_entry(struct esca_entry *d, struct bsca_entry *s) -{ - d->sda = s->sda; - d->sigp_ctrl.c = s->sigp_ctrl.c; - d->sigp_ctrl.scn = s->sigp_ctrl.scn; -} - -static void sca_copy_b_to_e(struct esca_block *d, struct bsca_block *s) -{ - int i; - - d->ipte_control = s->ipte_control; - d->mcn[0] = s->mcn; - for (i = 0; i < KVM_S390_BSCA_CPU_SLOTS; i++) - sca_copy_entry(&d->cpu[i], &s->cpu[i]); -} - -static int sca_switch_to_extended(struct kvm *kvm) -{ - struct bsca_block *old_sca = kvm->arch.sca; - struct esca_block *new_sca; - struct kvm_vcpu *vcpu; - unsigned long vcpu_idx; - u32 scaol, scaoh; - phys_addr_t new_sca_phys; - - if (kvm->arch.use_esca) - return 0; - - new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!new_sca) - return -ENOMEM; - - new_sca_phys = virt_to_phys(new_sca); - scaoh = new_sca_phys >> 32; - scaol = new_sca_phys & ESCA_SCAOL_MASK; + struct esca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - kvm_s390_vcpu_block_all(kvm); - write_lock(&kvm->arch.sca_lock); - - sca_copy_b_to_e(new_sca, old_sca); - - kvm_for_each_vcpu(vcpu_idx, vcpu, kvm) { - vcpu->arch.sie_block->scaoh = scaoh; - vcpu->arch.sie_block->scaol = scaol; - vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; - } - kvm->arch.sca = new_sca; - kvm->arch.use_esca = 1; + /* we still need the sca header for the ipte control */ + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; + vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; - write_unlock(&kvm->arch.sca_lock); - kvm_s390_vcpu_unblock_all(kvm); - - free_page((unsigned long)old_sca); + if (!kvm_s390_use_sca_entries()) + return; - VM_EVENT(kvm, 2, "Switched to ESCA (0x%pK -> 0x%pK)", - old_sca, kvm->arch.sca); - return 0; + set_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn); + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); } static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) { - int rc; - - if (!kvm_s390_use_sca_entries()) { - if (id < KVM_MAX_VCPUS) - return true; - return false; - } - if (id < KVM_S390_BSCA_CPU_SLOTS) - return true; - if (!sclp.has_esca || !sclp.has_64bscao) - return false; - - rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm); + if (!kvm_s390_use_sca_entries()) + return id < KVM_MAX_VCPUS; - return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; + return id < KVM_S390_ESCA_CPU_SLOTS; } /* needs disabled preemption to protect from TOD sync and vcpu_load/put */ @@ -3892,8 +3624,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_setup_model(vcpu); - /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */ - if (MACHINE_HAS_ESOP) + /* pgste_set_pte has special handling for !machine_has_esop() */ + if (machine_has_esop()) vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT; if (test_kvm_facility(vcpu->kvm, 9)) vcpu->arch.sie_block->ecb |= ECB_SRSI; @@ -3915,7 +3647,7 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->eca |= ECA_IB; if (sclp.has_siif) vcpu->arch.sie_block->eca |= ECA_SII; - if (sclp.has_sigpif) + if (kvm_s390_use_sca_entries()) vcpu->arch.sie_block->eca |= ECA_SIGPI; if (test_kvm_facility(vcpu->kvm, 129)) { vcpu->arch.sie_block->eca |= ECA_VX; @@ -3943,8 +3675,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) if (rc) return rc; } - hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup; + hrtimer_setup(&vcpu->arch.ckc_timer, kvm_s390_idle_wakeup, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); vcpu->arch.sie_block->hpid = HPID_KVM; @@ -3976,9 +3708,15 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) int rc; BUILD_BUG_ON(sizeof(struct sie_page) != 4096); + vcpu->arch.mc = kvm_s390_new_mmu_cache(); + if (!vcpu->arch.mc) + return -ENOMEM; sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!sie_page) + if (!sie_page) { + kvm_s390_free_mmu_cache(vcpu->arch.mc); + vcpu->arch.mc = NULL; return -ENOMEM; + } vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb); @@ -4020,12 +3758,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; if (kvm_is_ucontrol(vcpu->kvm)) { - rc = __kvm_ucontrol_vcpu_init(vcpu); - if (rc) + rc = -ENOMEM; + vcpu->arch.gmap = gmap_new_child(vcpu->kvm->arch.gmap, -1UL); + if (!vcpu->arch.gmap) goto out_free_sie_block; } - VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", + VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%p, sie block at 0x%p", vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); @@ -4037,8 +3776,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return 0; out_ucontrol_uninit: - if (kvm_is_ucontrol(vcpu->kvm)) - gmap_remove(vcpu->arch.gmap); + if (kvm_is_ucontrol(vcpu->kvm)) { + gmap_remove_child(vcpu->arch.gmap); + vcpu->arch.gmap = gmap_put(vcpu->arch.gmap); + } out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); return rc; @@ -4102,32 +3843,6 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) kvm_s390_vcpu_request(vcpu); } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end) -{ - struct kvm *kvm = gmap->private; - struct kvm_vcpu *vcpu; - unsigned long prefix; - unsigned long i; - - trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap)); - - if (gmap_is_shadow(gmap)) - return; - if (start >= 1UL << 31) - /* We are only interested in prefix pages */ - return; - kvm_for_each_vcpu(i, vcpu, kvm) { - /* match against both prefix pages */ - prefix = kvm_s390_get_prefix(vcpu); - if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { - VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", - start, end); - kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); - } - } -} - bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { /* do not poll with more than halt_poll_max_steal percent of steal time */ @@ -4362,8 +4077,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - int ret = 0; - vcpu_load(vcpu); vcpu->run->s.regs.fpc = fpu->fpc; @@ -4374,7 +4087,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs)); vcpu_put(vcpu); - return ret; + return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) @@ -4511,72 +4224,41 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu) return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS); } -static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags) -{ - struct kvm *kvm = gmap->private; - gfn_t gfn = gpa_to_gfn(gaddr); - bool unlocked; - hva_t vmaddr; - gpa_t tmp; - int rc; - - if (kvm_is_ucontrol(kvm)) { - tmp = __gmap_translate(gmap, gaddr); - gfn = gpa_to_gfn(tmp); - } - - vmaddr = gfn_to_hva(kvm, gfn); - rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); - if (!rc) - rc = __gmap_link(gmap, gaddr, vmaddr); - return rc; -} - -/** - * __kvm_s390_mprotect_many() - Apply specified protection to guest pages - * @gmap: the gmap of the guest - * @gpa: the starting guest address - * @npages: how many pages to protect - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: pgste notification bits to set - * - * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one() - * - * Context: kvm->srcu and gmap->mm need to be held in read mode - */ -int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, - unsigned long bits) +static int vcpu_ucontrol_translate(struct kvm_vcpu *vcpu, gpa_t *gaddr) { - unsigned int fault_flag = (prot & PROT_WRITE) ? FAULT_FLAG_WRITE : 0; - gpa_t end = gpa + npages * PAGE_SIZE; int rc; - for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) { - rc = gmap_protect_one(gmap, gpa, prot, bits); - if (rc == -EAGAIN) { - __kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag); - rc = gmap_protect_one(gmap, gpa, prot, bits); + if (kvm_is_ucontrol(vcpu->kvm)) { + rc = gmap_ucas_translate(vcpu->arch.mc, vcpu->arch.gmap, gaddr); + if (rc == -EREMOTE) { + vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; + vcpu->run->s390_ucontrol.trans_exc_code = *gaddr; + vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; } - if (rc < 0) - return rc; + return rc; } - return 0; } -static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu) +static int kvm_s390_fixup_prefix(struct kvm_vcpu *vcpu) { gpa_t gaddr = kvm_s390_get_prefix(vcpu); - int idx, rc; - - idx = srcu_read_lock(&vcpu->kvm->srcu); - mmap_read_lock(vcpu->arch.gmap->mm); + gfn_t gfn; + int rc; - rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT); + if (vcpu_ucontrol_translate(vcpu, &gaddr)) + return -EREMOTE; + gfn = gpa_to_gfn(gaddr); - mmap_read_unlock(vcpu->arch.gmap->mm); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn, true); + if (rc) + return rc; + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn + 1, true); + if (rc) + return rc; + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) + rc = dat_set_prefix_notif_bit(vcpu->kvm->arch.gmap->asce, gfn); return rc; } @@ -4596,7 +4278,7 @@ retry: if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { int rc; - rc = kvm_s390_mprotect_notify_prefix(vcpu); + rc = kvm_s390_fixup_prefix(vcpu); if (rc) { kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); return rc; @@ -4645,8 +4327,7 @@ retry: * Re-enable CMM virtualization if CMMA is available and * CMM has been used. */ - if ((vcpu->kvm->arch.use_cmma) && - (vcpu->kvm->mm->context.uses_cmm)) + if (vcpu->kvm->arch.use_cmma && uses_cmm(vcpu->arch.gmap)) vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; goto retry; } @@ -4742,7 +4423,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) return true; } -static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) { hva_t hva; struct kvm_arch_async_pf arch; @@ -4758,7 +4439,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) return false; if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) return false; - if (!vcpu->arch.gmap->pfault_enabled) + if (!pfault_enabled(vcpu->arch.gmap)) return false; hva = gfn_to_hva(vcpu->kvm, current->thread.gmap_teid.addr); @@ -4782,9 +4463,6 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gg14 = vcpu->run->s.regs.gprs[14]; vcpu->arch.sie_block->gg15 = vcpu->run->s.regs.gprs[15]; - if (need_resched()) - schedule(); - if (!kvm_is_ucontrol(vcpu->kvm)) { rc = kvm_s390_deliver_pending_interrupts(vcpu); if (rc || guestdbg_exit_pending(vcpu)) @@ -4854,123 +4532,41 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu) current->thread.gmap_int_code, current->thread.gmap_teid.val); } -/* - * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu - * @vcpu: the vCPU whose gmap is to be fixed up - * @gfn: the guest frame number used for memslots (including fake memslots) - * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps - * @flags: FOLL_* flags - * - * Return: 0 on success, < 0 in case of error. - * Context: The mm lock must not be held before calling. May sleep. - */ -int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags) -{ - struct kvm_memory_slot *slot; - unsigned int fault_flags; - bool writable, unlocked; - unsigned long vmaddr; - struct page *page; - kvm_pfn_t pfn; +static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, gpa_t gaddr, bool wr) +{ + struct guest_fault f = { + .write_attempt = wr, + .attempt_pfault = pfault_enabled(vcpu->arch.gmap), + }; int rc; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) - return vcpu_post_run_addressing_exception(vcpu); - - fault_flags = flags & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; - if (vcpu->arch.gmap->pfault_enabled) - flags |= FOLL_NOWAIT; - vmaddr = __gfn_to_hva_memslot(slot, gfn); - -try_again: - pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page); + if (vcpu_ucontrol_translate(vcpu, &gaddr)) + return -EREMOTE; + f.gfn = gpa_to_gfn(gaddr); - /* Access outside memory, inject addressing exception */ - if (is_noslot_pfn(pfn)) + rc = kvm_s390_faultin_gfn(vcpu, NULL, &f); + if (rc <= 0) + return rc; + if (rc == PGM_ADDRESSING) return vcpu_post_run_addressing_exception(vcpu); - /* Signal pending: try again */ - if (pfn == KVM_PFN_ERR_SIGPENDING) - return -EAGAIN; - - /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */ - if (pfn == KVM_PFN_ERR_NEEDS_IO) { - trace_kvm_s390_major_guest_pfault(vcpu); - if (kvm_arch_setup_async_pf(vcpu)) - return 0; - vcpu->stat.pfault_sync++; - /* Could not setup async pfault, try again synchronously */ - flags &= ~FOLL_NOWAIT; - goto try_again; - } - /* Any other error */ - if (is_error_pfn(pfn)) - return -EFAULT; - - /* Success */ - mmap_read_lock(vcpu->arch.gmap->mm); - /* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */ - rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked); - if (!rc) - rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr); - scoped_guard(spinlock, &vcpu->kvm->mmu_lock) { - kvm_release_faultin_page(vcpu->kvm, page, false, writable); - } - mmap_read_unlock(vcpu->arch.gmap->mm); - return rc; -} - -static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags) -{ - unsigned long gaddr_tmp; - gfn_t gfn; - - gfn = gpa_to_gfn(gaddr); - if (kvm_is_ucontrol(vcpu->kvm)) { - /* - * This translates the per-vCPU guest address into a - * fake guest address, which can then be used with the - * fake memslots that are identity mapping userspace. - * This allows ucontrol VMs to use the normal fault - * resolution path, like normal VMs. - */ - mmap_read_lock(vcpu->arch.gmap->mm); - gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr); - mmap_read_unlock(vcpu->arch.gmap->mm); - if (gaddr_tmp == -EFAULT) { - vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; - vcpu->run->s390_ucontrol.trans_exc_code = gaddr; - vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; - return -EREMOTE; - } - gfn = gpa_to_gfn(gaddr_tmp); - } - return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags); + KVM_BUG_ON(rc, vcpu->kvm); + return -EINVAL; } static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) { - unsigned int flags = 0; + unsigned int foll = 0; unsigned long gaddr; + int rc; gaddr = current->thread.gmap_teid.addr * PAGE_SIZE; if (kvm_s390_cur_gmap_fault_is_write()) - flags = FAULT_FLAG_WRITE; + foll = FOLL_WRITE; switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) { case 0: vcpu->stat.exit_null++; break; - case PGM_NON_SECURE_STORAGE_ACCESS: - kvm_s390_assert_primary_as(vcpu); - /* - * This is normal operation; a page belonging to a protected - * guest has not been imported yet. Try to import the page into - * the protected guest. - */ - if (gmap_convert_to_secure(vcpu->arch.gmap, gaddr) == -EINVAL) - send_sig(SIGSEGV, current, 0); - break; case PGM_SECURE_STORAGE_ACCESS: case PGM_SECURE_STORAGE_VIOLATION: kvm_s390_assert_primary_as(vcpu); @@ -4980,7 +4576,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) * previous protected guest. The old pages need to be destroyed * so the new guest can use them. */ - if (gmap_destroy_page(vcpu->arch.gmap, gaddr)) { + if (kvm_s390_pv_destroy_page(vcpu->kvm, gaddr)) { /* * Either KVM messed up the secure guest mapping or the * same page is mapped into multiple secure guests. @@ -4995,6 +4591,20 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) send_sig(SIGSEGV, current, 0); } break; + case PGM_NON_SECURE_STORAGE_ACCESS: + kvm_s390_assert_primary_as(vcpu); + /* + * This is normal operation; a page belonging to a protected + * guest has not been imported yet. Try to import the page into + * the protected guest. + */ + rc = kvm_s390_pv_convert_to_secure(vcpu->kvm, gaddr); + if (rc == -EINVAL) + send_sig(SIGSEGV, current, 0); + if (rc != -ENXIO) + break; + foll = FOLL_WRITE; + fallthrough; case PGM_PROTECTION: case PGM_SEGMENT_TRANSLATION: case PGM_PAGE_TRANSLATION: @@ -5003,7 +4613,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) case PGM_REGION_SECOND_TRANS: case PGM_REGION_THIRD_TRANS: kvm_s390_assert_primary_as(vcpu); - return vcpu_dat_fault_handler(vcpu, gaddr, flags); + return vcpu_dat_fault_handler(vcpu, gaddr, foll); default: KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx", current->thread.gmap_int_code, current->thread.gmap_teid.val); @@ -5013,7 +4623,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) return 0; } -static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) +static int vcpu_post_run(struct kvm_vcpu *vcpu, int sie_return) { struct mcck_volatile_info *mcck_info; struct sie_page *sie_page; @@ -5029,14 +4639,14 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) vcpu->run->s.regs.gprs[14] = vcpu->arch.sie_block->gg14; vcpu->run->s.regs.gprs[15] = vcpu->arch.sie_block->gg15; - if (exit_reason == -EINTR) { - VCPU_EVENT(vcpu, 3, "%s", "machine check"); + if (sie_return == SIE64_RETURN_MCCK) { sie_page = container_of(vcpu->arch.sie_block, struct sie_page, sie_block); mcck_info = &sie_page->mcck_info; kvm_s390_reinject_machine_check(vcpu, mcck_info); return 0; } + WARN_ON_ONCE(sie_return != SIE64_RETURN_NORMAL); if (vcpu->arch.sie_block->icptcode > 0) { rc = kvm_handle_sie_intercept(vcpu); @@ -5053,10 +4663,29 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) return vcpu_post_run_handle_fault(vcpu); } +int noinstr kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb, + u64 *gprs, unsigned long gasce) +{ + int ret; + + guest_state_enter_irqoff(); + + /* + * The guest_state_{enter,exit}_irqoff() functions inform lockdep and + * tracing that entry to the guest will enable host IRQs, and exit from + * the guest will disable host IRQs. + */ + ret = sie64a(scb, gprs, gasce); + + guest_state_exit_irqoff(); + + return ret; +} + #define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK) static int __vcpu_run(struct kvm_vcpu *vcpu) { - int rc, exit_reason; + int rc, sie_return; struct sie_page *sie_page = (struct sie_page *)vcpu->arch.sie_block; /* @@ -5065,28 +4694,45 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) */ kvm_vcpu_srcu_read_lock(vcpu); - do { + while (true) { rc = vcpu_pre_run(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); if (rc || guestdbg_exit_pending(vcpu)) break; - kvm_vcpu_srcu_read_unlock(vcpu); /* * As PF_VCPU will be used in fault handler, between - * guest_enter and guest_exit should be no uaccess. + * guest_timing_enter_irqoff and guest_timing_exit_irqoff + * should be no uaccess. */ - local_irq_disable(); - guest_enter_irqoff(); - __disable_cpu_timer_accounting(vcpu); - local_irq_enable(); if (kvm_s390_pv_cpu_is_protected(vcpu)) { memcpy(sie_page->pv_grregs, vcpu->run->s.regs.gprs, sizeof(sie_page->pv_grregs)); } - exit_reason = sie64a(vcpu->arch.sie_block, - vcpu->run->s.regs.gprs, - vcpu->arch.gmap->asce); + +xfer_to_guest_mode_check: + local_irq_disable(); + xfer_to_guest_mode_prepare(); + if (xfer_to_guest_mode_work_pending()) { + local_irq_enable(); + rc = kvm_xfer_to_guest_mode_handle_work(vcpu); + if (rc) + break; + goto xfer_to_guest_mode_check; + } + + guest_timing_enter_irqoff(); + __disable_cpu_timer_accounting(vcpu); + + sie_return = kvm_s390_enter_exit_sie(vcpu->arch.sie_block, + vcpu->run->s.regs.gprs, + vcpu->arch.gmap->asce.val); + + __enable_cpu_timer_accounting(vcpu); + guest_timing_exit_irqoff(); + local_irq_enable(); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { memcpy(vcpu->run->s.regs.gprs, sie_page->pv_grregs, @@ -5102,16 +4748,15 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; } } - local_irq_disable(); - __enable_cpu_timer_accounting(vcpu); - guest_exit_irqoff(); - local_irq_enable(); kvm_vcpu_srcu_read_lock(vcpu); - rc = vcpu_post_run(vcpu, exit_reason); - } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc); + rc = vcpu_post_run(vcpu, sie_return); + if (rc || guestdbg_exit_pending(vcpu)) { + kvm_vcpu_srcu_read_unlock(vcpu); + break; + } + } - kvm_vcpu_srcu_read_unlock(vcpu); return rc; } @@ -5171,7 +4816,7 @@ static void sync_regs_fmt2(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0; } - if (MACHINE_HAS_GS) { + if (cpu_has_gs()) { preempt_disable(); local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); if (current->thread.gs_cb) { @@ -5237,7 +4882,7 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu) kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val; - if (MACHINE_HAS_GS) { + if (cpu_has_gs()) { preempt_disable(); local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); if (vcpu->arch.gs_enabled) @@ -5327,6 +4972,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (signal_pending(current) && !rc) { kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->stat.signal_exits++; rc = -EINTR; } @@ -5616,8 +5262,8 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + void *tmpbuf __free(kvfree) = NULL; enum gacc_mode acc_mode; - void *tmpbuf = NULL; int r; r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION | @@ -5639,32 +5285,21 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, acc_mode, mop->key); - goto out_inject; - } - if (acc_mode == GACC_FETCH) { + } else if (acc_mode == GACC_FETCH) { r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size, mop->key); - if (r) - goto out_inject; - if (copy_to_user(uaddr, tmpbuf, mop->size)) { - r = -EFAULT; - goto out_free; - } + if (!r && copy_to_user(uaddr, tmpbuf, mop->size)) + return -EFAULT; } else { - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - goto out_free; - } + if (copy_from_user(tmpbuf, uaddr, mop->size)) + return -EFAULT; r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size, mop->key); } -out_inject: if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); -out_free: - vfree(tmpbuf); return r; } @@ -5693,8 +5328,8 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; @@ -5854,44 +5489,58 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #ifdef CONFIG_KVM_S390_UCONTROL case KVM_S390_UCAS_MAP: { - struct kvm_s390_ucas_mapping ucasmap; + struct kvm_s390_ucas_mapping ucas; - if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { - r = -EFAULT; + r = -EFAULT; + if (copy_from_user(&ucas, argp, sizeof(ucas))) break; - } - if (!kvm_is_ucontrol(vcpu->kvm)) { - r = -EINVAL; + r = -EINVAL; + if (!kvm_is_ucontrol(vcpu->kvm)) + break; + if (!IS_ALIGNED(ucas.user_addr | ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE)) break; - } - r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr, - ucasmap.vcpu_addr, ucasmap.length); + r = gmap_ucas_map(vcpu->arch.gmap, gpa_to_gfn(ucas.user_addr), + gpa_to_gfn(ucas.vcpu_addr), + ucas.length >> _SEGMENT_SHIFT); break; } case KVM_S390_UCAS_UNMAP: { - struct kvm_s390_ucas_mapping ucasmap; + struct kvm_s390_ucas_mapping ucas; - if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { - r = -EFAULT; + r = -EFAULT; + if (copy_from_user(&ucas, argp, sizeof(ucas))) break; - } - if (!kvm_is_ucontrol(vcpu->kvm)) { - r = -EINVAL; + r = -EINVAL; + if (!kvm_is_ucontrol(vcpu->kvm)) + break; + if (!IS_ALIGNED(ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE)) break; - } - r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr, - ucasmap.length); + gmap_ucas_unmap(vcpu->arch.gmap, gpa_to_gfn(ucas.vcpu_addr), + ucas.length >> _SEGMENT_SHIFT); + r = 0; break; } #endif case KVM_S390_VCPU_FAULT: { - idx = srcu_read_lock(&vcpu->kvm->srcu); - r = vcpu_dat_fault_handler(vcpu, arg, 0); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + gpa_t gaddr = arg; + + scoped_guard(srcu, &vcpu->kvm->srcu) { + r = vcpu_ucontrol_translate(vcpu, &gaddr); + if (r) + break; + + r = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(gaddr), false); + if (r == PGM_ADDRESSING) + r = -EFAULT; + if (r <= 0) + break; + r = -EIO; + KVM_BUG_ON(r, vcpu->kvm); + } break; } case KVM_ENABLE_CAP: @@ -6005,9 +5654,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *new, enum kvm_mr_change change) { - gpa_t size; - - if (kvm_is_ucontrol(kvm) && new->id < KVM_USER_MEM_SLOTS) + if (kvm_is_ucontrol(kvm) && new && new->id < KVM_USER_MEM_SLOTS) return -EINVAL; /* When we are protected, we should not change the memory slots */ @@ -6016,20 +5663,14 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) { /* - * A few sanity checks. We can have memory slots which have to be - * located/ended at a segment boundary (1MB). The memory in userland is - * ok to be fragmented into various different vmas. It is okay to mmap() - * and munmap() stuff in this slot after doing this call at any time + * A few sanity checks. The memory in userland is ok to be + * fragmented into various different vmas. It is okay to mmap() + * and munmap() stuff in this slot after doing this call at any + * time. */ - - if (new->userspace_addr & 0xffffful) - return -EINVAL; - - size = new->npages * PAGE_SIZE; - if (size & 0xffffful) + if (new->userspace_addr & ~PAGE_MASK) return -EINVAL; - - if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit) + if ((new->base_gfn + new->npages) * PAGE_SIZE > kvm->arch.mem_limit) return -EINVAL; } @@ -6057,37 +5698,89 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { + struct kvm_s390_mmu_cache *mc = NULL; int rc = 0; - if (kvm_is_ucontrol(kvm)) + if (change == KVM_MR_FLAGS_ONLY) return; - switch (change) { - case KVM_MR_DELETE: - rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, - old->npages * PAGE_SIZE); - break; - case KVM_MR_MOVE: - rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, - old->npages * PAGE_SIZE); - if (rc) + mc = kvm_s390_new_mmu_cache(); + if (!mc) { + rc = -ENOMEM; + goto out; + } + + scoped_guard(write_lock, &kvm->mmu_lock) { + switch (change) { + case KVM_MR_DELETE: + rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages); break; - fallthrough; - case KVM_MR_CREATE: - rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr, - new->base_gfn * PAGE_SIZE, - new->npages * PAGE_SIZE); - break; - case KVM_MR_FLAGS_ONLY: - break; - default: - WARN(1, "Unknown KVM MR CHANGE: %d\n", change); + case KVM_MR_MOVE: + rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages); + if (rc) + break; + fallthrough; + case KVM_MR_CREATE: + rc = dat_create_slot(mc, kvm->arch.gmap->asce, new->base_gfn, new->npages); + break; + case KVM_MR_FLAGS_ONLY: + break; + default: + WARN(1, "Unknown KVM MR CHANGE: %d\n", change); + } } +out: if (rc) pr_warn("failed to commit memory region\n"); + kvm_s390_free_mmu_cache(mc); return; } +/** + * kvm_test_age_gfn() - test young + * @kvm: the kvm instance + * @range: the range of guest addresses whose young status needs to be cleared + * + * Context: called by KVM common code without holding the kvm mmu lock + * Return: true if any page in the given range is young, otherwise 0. + */ +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + scoped_guard(read_lock, &kvm->mmu_lock) + return dat_test_age_gfn(kvm->arch.gmap->asce, range->start, range->end); +} + +/** + * kvm_age_gfn() - clear young + * @kvm: the kvm instance + * @range: the range of guest addresses whose young status needs to be cleared + * + * Context: called by KVM common code without holding the kvm mmu lock + * Return: true if any page in the given range was young, otherwise 0. + */ +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + scoped_guard(read_lock, &kvm->mmu_lock) + return gmap_age_gfn(kvm->arch.gmap, range->start, range->end); +} + +/** + * kvm_unmap_gfn_range() - Unmap a range of guest addresses + * @kvm: the kvm instance + * @range: the range of guest page frames to invalidate + * + * This function always returns false because every DAT table modification + * has to use the appropriate DAT table manipulation instructions, which will + * keep the TLB coherent, hence no additional TLB flush is ever required. + * + * Context: called by KVM common code with the kvm mmu write lock held + * Return: false + */ +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +{ + return gmap_unmap_gfn_range(kvm->arch.gmap, range->slot, range->start, range->end); +} + static inline unsigned long nonhyp_mask(int i) { unsigned int nonhyp_fai = (sclp.hmfai << i * 2) >> 30; @@ -6104,11 +5797,6 @@ static int __init kvm_s390_init(void) return -ENODEV; } - if (nested && hpage) { - pr_info("A KVM host that supports nesting cannot back its KVM guests with huge pages\n"); - return -EINVAL; - } - for (i = 0; i < 16; i++) kvm_s390_fac_base[i] |= stfle_fac_list[i] & nonhyp_mask(i); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 8d3bbb2dd8d2..dc0573b7aa4b 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -19,9 +19,19 @@ #include <asm/facility.h> #include <asm/processor.h> #include <asm/sclp.h> +#include "dat.h" +#include "gmap.h" #define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0) +union kvm_s390_quad { + __uint128_t sixteen; + unsigned long eight; + unsigned int four; + unsigned short two; + unsigned char one; +}; + static inline void kvm_s390_fpu_store(struct kvm_run *run) { fpu_stfpc(&run->s.regs.fpc); @@ -106,15 +116,15 @@ static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) static inline int kvm_is_ucontrol(struct kvm *kvm) { #ifdef CONFIG_KVM_S390_UCONTROL - if (kvm->arch.gmap) - return 0; - return 1; + return test_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags); #else return 0; #endif } -#define GUEST_PREFIX_SHIFT 13 +#define GUEST_PREFIX_SHIFT 12 +#define GUEST_PREFIX_MASK_ZARCH 0x7fffe +#define GUEST_PREFIX_MASK_ESA 0x7ffff static inline u32 kvm_s390_get_prefix(struct kvm_vcpu *vcpu) { return vcpu->arch.sie_block->prefix << GUEST_PREFIX_SHIFT; @@ -125,6 +135,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) VCPU_EVENT(vcpu, 3, "set prefix of cpu %03u to 0x%x", vcpu->vcpu_id, prefix); vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT; + vcpu->arch.sie_block->prefix &= GUEST_PREFIX_MASK_ZARCH; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); } @@ -308,6 +319,9 @@ int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc); int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, u16 *rc, u16 *rrc); +int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr); +int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr); +int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb); static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) { @@ -319,6 +333,41 @@ static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu) return vcpu->arch.pv.handle; } +/** + * __kvm_s390_pv_destroy_page() - Destroy a guest page. + * @page: the page to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + * + * Context: must be called holding the mm lock for gmap->mm + */ +static inline int __kvm_s390_pv_destroy_page(struct page *page) +{ + struct folio *folio = page_folio(page); + int rc; + + /* Large folios cannot be secure. Small folio implies FW_LEVEL_PTE. */ + if (folio_test_large(folio)) + return -EFAULT; + + rc = uv_destroy_folio(folio); + /* + * Fault handlers can race; it is possible that two CPUs will fault + * on the same secure page. One CPU can destroy the page, reboot, + * re-enter secure mode and import it, while the second CPU was + * stuck at the beginning of the handler. At some point the second + * CPU will be able to progress, and it will not be able to destroy + * the page. In that case we do not want to terminate the process, + * we instead try to export the page. + */ + if (rc) + rc = uv_convert_from_secure_folio(folio); + + return rc; +} + /* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); @@ -394,8 +443,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu); /* implemented in vsie.c */ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu); -void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end); +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end); void kvm_s390_vsie_init(struct kvm *kvm); void kvm_s390_vsie_destroy(struct kvm *kvm); @@ -419,14 +467,10 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); -int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags); int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, unsigned long bits); -static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags) -{ - return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags); -} +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu); /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); @@ -528,13 +572,6 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); -/* support for Basic/Extended SCA handling */ -static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm) -{ - struct bsca_block *sca = kvm->arch.sca; /* SCA version doesn't matter */ - - return &sca->ipte_control; -} static inline int kvm_s390_use_sca_entries(void) { /* @@ -542,7 +579,7 @@ static inline int kvm_s390_use_sca_entries(void) * might use the entries. By not setting the entries and keeping them * invalid, hardware will not access them but intercept. */ - return sclp.has_sigpif; + return sclp.has_sigpif && sclp.has_esca; } void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, struct mcck_volatile_info *mcck_info); diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index 9b9e7fdd5380..86d93e8dddae 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -54,7 +54,7 @@ static int zpci_setup_aipb(u8 nisc) struct page *page; int size, rc; - zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL); + zpci_aipb = kzalloc_obj(union zpci_sic_iib); if (!zpci_aipb) return -ENOMEM; @@ -126,8 +126,7 @@ int kvm_s390_pci_aen_init(u8 nisc) return -EPERM; mutex_lock(&aift->aift_lock); - aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *), - GFP_KERNEL); + aift->kzdev = kzalloc_objs(struct kvm_zdev *, ZPCI_NR_DEVICES); if (!aift->kzdev) { rc = -ENOMEM; goto unlock; @@ -404,7 +403,7 @@ static int kvm_s390_pci_dev_open(struct zpci_dev *zdev) { struct kvm_zdev *kzdev; - kzdev = kzalloc(sizeof(struct kvm_zdev), GFP_KERNEL); + kzdev = kzalloc_obj(struct kvm_zdev); if (!kzdev) return -ENOMEM; @@ -433,7 +432,6 @@ static void kvm_s390_pci_dev_release(struct zpci_dev *zdev) static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm) { struct zpci_dev *zdev = opaque; - u8 status; int rc; if (!zdev) @@ -480,13 +478,7 @@ static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm) */ zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); - rc = zpci_enable_device(zdev); - if (rc) - goto clear_gisa; - - /* Re-register the IOMMU that was already created */ - rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table), &status); + rc = zpci_reenable_device(zdev); if (rc) goto clear_gisa; @@ -516,7 +508,6 @@ static void kvm_s390_pci_unregister_kvm(void *opaque) { struct zpci_dev *zdev = opaque; struct kvm *kvm; - u8 status; if (!zdev) return; @@ -550,12 +541,7 @@ static void kvm_s390_pci_unregister_kvm(void *opaque) goto out; } - if (zpci_enable_device(zdev)) - goto out; - - /* Re-register the IOMMU that was already created */ - zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table), &status); + zpci_reenable_device(zdev); out: spin_lock(&kvm->arch.kzdev_list_lock); @@ -679,7 +665,7 @@ int __init kvm_s390_pci_init(void) if (!kvm_s390_pci_interp_allowed()) return 0; - aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL); + aift = kzalloc_obj(struct zpci_aift); if (!aift) return -ENOMEM; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 1a49b89706f8..cc0553da14cb 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -21,13 +21,14 @@ #include <asm/ebcdic.h> #include <asm/sysinfo.h> #include <asm/page-states.h> -#include <asm/gmap.h> #include <asm/ptrace.h> #include <asm/sclp.h> #include <asm/ap.h> +#include <asm/gmap_helpers.h> #include "gaccess.h" #include "kvm-s390.h" #include "trace.h" +#include "gmap.h" static int handle_ri(struct kvm_vcpu *vcpu) { @@ -222,7 +223,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu) if (vcpu->arch.skey_enabled) return 0; - rc = s390_enable_skey(); + rc = gmap_enable_skeys(vcpu->arch.gmap); VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); if (rc) return rc; @@ -255,10 +256,9 @@ static int try_handle_skey(struct kvm_vcpu *vcpu) static int handle_iske(struct kvm_vcpu *vcpu) { - unsigned long gaddr, vmaddr; - unsigned char key; + unsigned long gaddr; int reg1, reg2; - bool unlocked; + union skey key; int rc; vcpu->stat.instruction_iske++; @@ -275,37 +275,21 @@ static int handle_iske(struct kvm_vcpu *vcpu) gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; gaddr = kvm_s390_logical_to_effective(vcpu, gaddr); gaddr = kvm_s390_real_to_abs(vcpu, gaddr); - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr)); - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); -retry: - unlocked = false; - mmap_read_lock(current->mm); - rc = get_guest_storage_key(current->mm, vmaddr, &key); - - if (rc) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - if (!rc) { - mmap_read_unlock(current->mm); - goto retry; - } - } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + rc = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr), &key); + if (rc > 0) + return kvm_s390_inject_program_int(vcpu, rc); if (rc < 0) return rc; vcpu->run->s.regs.gprs[reg1] &= ~0xff; - vcpu->run->s.regs.gprs[reg1] |= key; + vcpu->run->s.regs.gprs[reg1] |= key.skey; return 0; } static int handle_rrbe(struct kvm_vcpu *vcpu) { - unsigned long vmaddr, gaddr; + unsigned long gaddr; int reg1, reg2; - bool unlocked; int rc; vcpu->stat.instruction_rrbe++; @@ -322,24 +306,10 @@ static int handle_rrbe(struct kvm_vcpu *vcpu) gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; gaddr = kvm_s390_logical_to_effective(vcpu, gaddr); gaddr = kvm_s390_real_to_abs(vcpu, gaddr); - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr)); - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); -retry: - unlocked = false; - mmap_read_lock(current->mm); - rc = reset_guest_reference_bit(current->mm, vmaddr); - if (rc < 0) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - if (!rc) { - mmap_read_unlock(current->mm); - goto retry; - } - } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + rc = dat_reset_reference_bit(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr)); + if (rc > 0) + return kvm_s390_inject_program_int(vcpu, rc); if (rc < 0) return rc; kvm_s390_set_psw_cc(vcpu, rc); @@ -354,9 +324,8 @@ static int handle_sske(struct kvm_vcpu *vcpu) { unsigned char m3 = vcpu->arch.sie_block->ipb >> 28; unsigned long start, end; - unsigned char key, oldkey; + union skey key, oldkey; int reg1, reg2; - bool unlocked; int rc; vcpu->stat.instruction_sske++; @@ -377,7 +346,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) kvm_s390_get_regs_rre(vcpu, ®1, ®2); - key = vcpu->run->s.regs.gprs[reg1] & 0xfe; + key.skey = vcpu->run->s.regs.gprs[reg1] & 0xfe; start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); if (m3 & SSKE_MB) { @@ -389,27 +358,17 @@ static int handle_sske(struct kvm_vcpu *vcpu) } while (start != end) { - unsigned long vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); - unlocked = false; - - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - mmap_read_lock(current->mm); - rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey, - m3 & SSKE_NQ, m3 & SSKE_MR, - m3 & SSKE_MC); - - if (rc < 0) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - rc = !rc ? -EAGAIN : rc; + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { + rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce, + gpa_to_gfn(start), key, &oldkey, + m3 & SSKE_NQ, m3 & SSKE_MR, m3 & SSKE_MC); } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) + if (rc > 1) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (rc == -EAGAIN) + if (rc == -ENOMEM) { + kvm_s390_mmu_cache_topup(vcpu->arch.mc); continue; + } if (rc < 0) return rc; start += PAGE_SIZE; @@ -422,7 +381,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) } else { kvm_s390_set_psw_cc(vcpu, rc); vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL; - vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8; + vcpu->run->s.regs.gprs[reg1] |= (u64)oldkey.skey << 8; } } if (m3 & SSKE_MB) { @@ -605,6 +564,14 @@ static int handle_io_inst(struct kvm_vcpu *vcpu) } } +#if IS_ENABLED(CONFIG_VFIO_AP) +bool kvm_s390_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa) +{ + return kvm_is_gpa_in_memslot(kvm, gpa); +} +EXPORT_SYMBOL_FOR_MODULES(kvm_s390_is_gpa_in_memslot, "vfio_ap"); +#endif + /* * handle_pqap: Handling pqap interception * @vcpu: the vcpu having issue the pqap instruction @@ -746,13 +713,14 @@ int is_valid_psw(psw_t *psw) int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) { psw_t *gpsw = &vcpu->arch.sie_block->gpsw; - psw_compat_t new_psw; - u64 addr; + psw32_t new_psw; + u64 addr, iaddr; int rc; u8 ar; vcpu->stat.instruction_lpsw++; + iaddr = gpsw->addr - kvm_s390_get_ilen(vcpu); if (gpsw->mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -770,18 +738,20 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) gpsw->addr = new_psw.addr & ~PSW32_ADDR_AMODE; if (!is_valid_psw(gpsw)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + vcpu->arch.sie_block->gbea = iaddr; return 0; } static int handle_lpswe(struct kvm_vcpu *vcpu) { psw_t new_psw; - u64 addr; + u64 addr, iaddr; int rc; u8 ar; vcpu->stat.instruction_lpswe++; + iaddr = vcpu->arch.sie_block->gpsw.addr - kvm_s390_get_ilen(vcpu); if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -794,6 +764,7 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gpsw = new_psw; if (!is_valid_psw(&vcpu->arch.sie_block->gpsw)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + vcpu->arch.sie_block->gbea = iaddr; return 0; } @@ -1074,7 +1045,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) bool mr = false, mc = false, nq; int reg1, reg2; unsigned long start, end; - unsigned char key; + union skey key; vcpu->stat.instruction_pfmf++; @@ -1102,7 +1073,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ; - key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; + key.skey = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); @@ -1133,14 +1104,6 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } while (start != end) { - unsigned long vmaddr; - bool unlocked = false; - - /* Translate guest address to host address */ - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); @@ -1151,19 +1114,17 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (rc) return rc; - mmap_read_lock(current->mm); - rc = cond_set_guest_storage_key(current->mm, vmaddr, - key, NULL, nq, mr, mc); - if (rc < 0) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - rc = !rc ? -EAGAIN : rc; + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { + rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce, + gpa_to_gfn(start), key, + NULL, nq, mr, mc); } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (rc == -EAGAIN) + if (rc > 1) + return kvm_s390_inject_program_int(vcpu, rc); + if (rc == -ENOMEM) { + kvm_s390_mmu_cache_topup(vcpu->arch.mc); continue; + } if (rc < 0) return rc; } @@ -1187,8 +1148,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) { int r1, r2, nappended, entries; - unsigned long gfn, hva, res, pgstev, ptev; + union essa_state state; unsigned long *cbrlo; + unsigned long gfn; + bool dirtied; /* * We don't need to set SD.FPF.SK to 1 here, because if we have a @@ -1197,33 +1160,12 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) kvm_s390_get_regs_rre(vcpu, &r1, &r2); gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT; - hva = gfn_to_hva(vcpu->kvm, gfn); entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; - if (kvm_is_error_hva(hva)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev); - if (nappended < 0) { - res = orc ? 0x10 : 0; - vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */ + nappended = dat_perform_essa(vcpu->arch.gmap->asce, gfn, orc, &state, &dirtied); + vcpu->run->s.regs.gprs[r1] = state.val; + if (nappended < 0) return 0; - } - res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22; - /* - * Set the block-content state part of the result. 0 means resident, so - * nothing to do if the page is valid. 2 is for preserved pages - * (non-present and non-zero), and 3 for zero pages (non-present and - * zero). - */ - if (ptev & _PAGE_INVALID) { - res |= 2; - if (pgstev & _PGSTE_GPS_ZERO) - res |= 1; - } - if (pgstev & _PGSTE_GPS_NODAT) - res |= 0x20; - vcpu->run->s.regs.gprs[r1] = res; /* * It is possible that all the normal 511 slots were full, in which case * we will now write in the 512th slot, which is reserved for host use. @@ -1235,27 +1177,44 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) cbrlo[entries] = gfn << PAGE_SHIFT; } - if (orc) { - struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn); - - /* Increment only if we are really flipping the bit */ - if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) - atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); - } + if (dirtied) + atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); return nappended; } +static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + int i; + + lockdep_assert_held(&vcpu->kvm->mmu_lock); + + for (i = 0; i < len; i++) { + if (dat_entry_walk(NULL, gpa_to_gfn(cbrl[i]), vcpu->arch.gmap->asce, + 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) + continue; + if (!ptep || ptep->s.pr) + continue; + pgste = pgste_get_lock(ptep); + if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) + gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]); + pgste_set_unlock(ptep, pgste); + } +} + static int handle_essa(struct kvm_vcpu *vcpu) { + lockdep_assert_held(&vcpu->kvm->srcu); + /* entries expected to be 1FF */ int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; unsigned long *cbrlo; - struct gmap *gmap; int i, orc; VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); - gmap = vcpu->arch.gmap; vcpu->stat.instruction_essa++; if (!vcpu->kvm->arch.use_cmma) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); @@ -1279,11 +1238,7 @@ static int handle_essa(struct kvm_vcpu *vcpu) * value really needs to be written to; if the value is * already correct, we do nothing and avoid the lock. */ - if (vcpu->kvm->mm->context.uses_cmm == 0) { - mmap_write_lock(vcpu->kvm->mm); - vcpu->kvm->mm->context.uses_cmm = 1; - mmap_write_unlock(vcpu->kvm->mm); - } + set_bit(GMAP_FLAG_USES_CMM, &vcpu->arch.gmap->flags); /* * If we are here, we are supposed to have CMMA enabled in * the SIE block. Enabling CMMA works on a per-CPU basis, @@ -1297,24 +1252,22 @@ static int handle_essa(struct kvm_vcpu *vcpu) /* Retry the ESSA instruction */ kvm_s390_retry_instr(vcpu); } else { - int srcu_idx; - - mmap_read_lock(vcpu->kvm->mm); - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - i = __do_essa(vcpu, orc); - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - mmap_read_unlock(vcpu->kvm->mm); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + i = __do_essa(vcpu, orc); if (i < 0) return i; /* Account for the possible extra cbrl entry */ entries += i; } - vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ + /* reset nceo */ + vcpu->arch.sie_block->cbrlo &= PAGE_MASK; cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); - mmap_read_lock(gmap->mm); - for (i = 0; i < entries; ++i) - __gmap_zap(gmap, cbrlo[i]); - mmap_read_unlock(gmap->mm); + + mmap_read_lock(vcpu->kvm->mm); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + _essa_clear_cbrl(vcpu, cbrlo, entries); + mmap_read_unlock(vcpu->kvm->mm); + return 0; } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 22c012aa5206..c2dafd812a3b 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -5,19 +5,23 @@ * Copyright IBM Corp. 2019, 2020 * Author(s): Janosch Frank <frankja@linux.ibm.com> */ + +#include <linux/export.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/minmax.h> #include <linux/pagemap.h> #include <linux/sched/signal.h> -#include <asm/gmap.h> #include <asm/uv.h> #include <asm/mman.h> #include <linux/pagewalk.h> #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> #include "kvm-s390.h" +#include "dat.h" +#include "gaccess.h" #include "gmap.h" +#include "faultin.h" bool kvm_s390_pv_is_protected(struct kvm *kvm) { @@ -34,6 +38,163 @@ bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); /** + * should_export_before_import() - Determine whether an export is needed + * before an import-like operation. + * @uvcb: The Ultravisor control block of the UVC to be performed. + * @mm: The mm of the process. + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: %true if an export is needed before every import, otherwise %false. + */ +static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +{ + /* + * The misc feature indicates, among other things, that importing a + * shared page from a different protected VM will automatically also + * transfer its ownership. + */ + if (uv_has_feature(BIT_UV_FEAT_MISC)) + return false; + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) + return false; + return atomic_read(&mm->context.protected_count) > 1; +} + +struct pv_make_secure { + void *uvcb; + struct folio *folio; + int rc; + bool needs_export; +}; + +static int __kvm_s390_pv_make_secure(struct guest_fault *f, struct folio *folio) +{ + struct pv_make_secure *priv = f->priv; + int rc; + + if (priv->needs_export) + uv_convert_from_secure(folio_to_phys(folio)); + + if (folio_test_hugetlb(folio)) + return -EFAULT; + if (folio_test_large(folio)) + return -E2BIG; + + if (!f->page) + folio_get(folio); + rc = __make_folio_secure(folio, priv->uvcb); + if (!f->page) + folio_put(folio); + + return rc; +} + +static void _kvm_s390_pv_make_secure(struct guest_fault *f) +{ + struct pv_make_secure *priv = f->priv; + struct folio *folio; + + folio = pfn_folio(f->pfn); + priv->rc = -EAGAIN; + if (folio_trylock(folio)) { + priv->rc = __kvm_s390_pv_make_secure(f, folio); + if (priv->rc == -E2BIG || priv->rc == -EBUSY) { + priv->folio = folio; + folio_get(folio); + } + folio_unlock(folio); + } +} + +/** + * kvm_s390_pv_make_secure() - make one guest page secure + * @kvm: the guest + * @gaddr: the guest address that needs to be made secure + * @uvcb: the UVCB specifying which operation needs to be performed + * + * Context: needs to be called with kvm->srcu held. + * Return: 0 on success, < 0 in case of error. + */ +int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb) +{ + struct pv_make_secure priv = { .uvcb = uvcb }; + struct guest_fault f = { + .write_attempt = true, + .gfn = gpa_to_gfn(gaddr), + .callback = _kvm_s390_pv_make_secure, + .priv = &priv, + }; + int rc; + + lockdep_assert_held(&kvm->srcu); + + priv.needs_export = should_export_before_import(uvcb, kvm->mm); + + scoped_guard(mutex, &kvm->arch.pv.import_lock) { + rc = kvm_s390_faultin_gfn(NULL, kvm, &f); + + if (!rc) { + rc = priv.rc; + if (priv.folio) { + rc = s390_wiggle_split_folio(kvm->mm, priv.folio); + if (!rc) + rc = -EAGAIN; + } + } + } + if (priv.folio) + folio_put(priv.folio); + return rc; +} + +int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr) +{ + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(kvm), + .gaddr = gaddr, + }; + + return kvm_s390_pv_make_secure(kvm, gaddr, &uvcb); +} + +/** + * kvm_s390_pv_destroy_page() - Destroy a guest page. + * @kvm: the guest + * @gaddr: the guest address to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + * + * Context: may sleep. + */ +int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr) +{ + struct page *page; + int rc = 0; + + mmap_read_lock(kvm->mm); + page = gfn_to_page(kvm, gpa_to_gfn(gaddr)); + if (page) + rc = __kvm_s390_pv_destroy_page(page); + kvm_release_page_clean(page); + mmap_read_unlock(kvm->mm); + return rc; +} + +/** * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to * be destroyed * @@ -240,35 +401,6 @@ done_fast: return 0; } -/** - * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. - * @kvm: the VM whose memory is to be cleared. - * - * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. - * The CPUs of the protected VM need to be destroyed beforehand. - */ -static void kvm_s390_destroy_lower_2g(struct kvm *kvm) -{ - const unsigned long pages_2g = SZ_2G / PAGE_SIZE; - struct kvm_memory_slot *slot; - unsigned long len; - int srcu_idx; - - srcu_idx = srcu_read_lock(&kvm->srcu); - - /* Take the memslot containing guest absolute address 0 */ - slot = gfn_to_memslot(kvm, 0); - /* Clear all slots or parts thereof that are below 2GB */ - while (slot && slot->base_gfn < pages_2g) { - len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; - s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); - /* Take the next memslot */ - slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); - } - - srcu_read_unlock(&kvm->srcu, srcu_idx); -} - static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) { struct uv_cb_destroy_fast uvcb = { @@ -283,7 +415,6 @@ static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) *rc = uvcb.header.rc; if (rrc) *rrc = uvcb.header.rrc; - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x", uvcb.header.rc, uvcb.header.rrc); WARN_ONCE(cc && uvcb.header.rc != 0x104, @@ -332,10 +463,10 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) return -EINVAL; /* Guest with segment type ASCE, refuse to destroy asynchronously */ - if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + if (kvm->arch.gmap->asce.dt == TABLE_TYPE_SEGMENT) return -EINVAL; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + priv = kzalloc_obj(*priv); if (!priv) return -ENOMEM; @@ -345,8 +476,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) priv->stor_var = kvm->arch.pv.stor_var; priv->stor_base = kvm->arch.pv.stor_base; priv->handle = kvm_s390_pv_get_handle(kvm); - priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + priv->old_gmap_table = (unsigned long)dereference_asce(kvm->arch.gmap->asce); if (s390_replace_asce(kvm->arch.gmap)) res = -ENOMEM; } @@ -356,7 +486,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) return res; } - kvm_s390_destroy_lower_2g(kvm); + gmap_pv_destroy_range(kvm->arch.gmap, 0, gpa_to_gfn(SZ_2G), false); kvm_s390_clear_pv_state(kvm); kvm->arch.pv.set_aside = priv; @@ -390,7 +520,6 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), UVC_CMD_DESTROY_SEC_CONF, rc, rrc); - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); if (!cc) { atomic_dec(&kvm->mm->context.protected_count); kvm_s390_pv_dealloc_vm(kvm); @@ -473,7 +602,7 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc) * cleanup has been performed. */ if (need_zap && mmget_not_zero(kvm->mm)) { - s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), false); mmput(kvm->mm); } @@ -511,7 +640,7 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc) return -EINVAL; /* When a fatal signal is received, stop immediately */ - if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) + if (gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), true)) goto done; if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) ret = -EIO; @@ -550,6 +679,7 @@ static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm)) kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy); + set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags); } static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { @@ -565,6 +695,17 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) int cc, ret; u16 dummy; + /* Add the notifier only once. No races because we hold kvm->lock */ + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { + /* The notifier will be unregistered when the VM is destroyed */ + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; + ret = mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); + if (ret) { + kvm->arch.pv.mmu_notifier.ops = NULL; + return ret; + } + } + ret = kvm_s390_pv_alloc_vm(kvm); if (ret) return ret; @@ -572,7 +713,7 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) /* Inputs */ uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ uvcb.guest_stor_len = kvm->arch.pv.guest_len; - uvcb.guest_asce = kvm->arch.gmap->asce; + uvcb.guest_asce = kvm->arch.gmap->asce.val; uvcb.guest_sca = virt_to_phys(kvm->arch.sca); uvcb.conf_base_stor_origin = virt_to_phys((void *)kvm->arch.pv.stor_base); @@ -580,6 +721,9 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap; uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr; + clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags); + gmap_split_huge_pages(kvm->arch.gmap); + cc = uv_call_sched(0, (u64)&uvcb); *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; @@ -599,12 +743,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) } return -EIO; } - kvm->arch.gmap->guest_handle = uvcb.guest_handle; - /* Add the notifier only once. No races because we hold kvm->lock */ - if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { - kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; - mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); - } return 0; } @@ -638,27 +776,15 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, .tweak[0] = tweak, .tweak[1] = offset, }; - int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); - unsigned long vmaddr; - bool unlocked; + int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb); *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; if (ret == -ENXIO) { - mmap_read_lock(kvm->mm); - vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr)); - if (kvm_is_error_hva(vmaddr)) { - ret = -EFAULT; - } else { - ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); - if (!ret) - ret = __gmap_link(kvm->arch.gmap, addr, vmaddr); - } - mmap_read_unlock(kvm->mm); + ret = kvm_s390_faultin_gfn_simple(NULL, kvm, gpa_to_gfn(addr), true); if (!ret) return -EAGAIN; - return ret; } if (ret && ret != -EAGAIN) diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 9ac92dbf680d..9e28f165c114 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -56,7 +56,7 @@ TRACE_EVENT(kvm_s390_create_vcpu, __entry->sie_block = sie_block; ), - TP_printk("create cpu %d at 0x%pK, sie block at 0x%pK", + TP_printk("create cpu %d at 0x%p, sie block at 0x%p", __entry->id, __entry->vcpu, __entry->sie_block) ); @@ -255,7 +255,7 @@ TRACE_EVENT(kvm_s390_enable_css, __entry->kvm = kvm; ), - TP_printk("enabling channel I/O support (kvm @ %pK)\n", + TP_printk("enabling channel I/O support (kvm @ %p)\n", __entry->kvm) ); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index a78df3a4f353..e5a23f1c9749 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -15,7 +15,6 @@ #include <linux/io.h> #include <linux/mman.h> -#include <asm/gmap.h> #include <asm/mmu_context.h> #include <asm/sclp.h> #include <asm/nmi.h> @@ -42,8 +41,11 @@ struct vsie_page { * are reused conditionally, should be accessed via READ_ONCE. */ struct kvm_s390_sie_block *scb_o; /* 0x0218 */ - /* the shadow gmap in use by the vsie_page */ - struct gmap *gmap; /* 0x0220 */ + /* + * Flags: must be set/cleared atomically after the vsie page can be + * looked up by other CPUs. + */ + unsigned long flags; /* 0x0220 */ /* address of the last reported fault to guest2 */ unsigned long fault_addr; /* 0x0228 */ /* calculated guest addresses of satellite control blocks */ @@ -58,16 +60,15 @@ struct vsie_page { * radix tree. */ gpa_t scb_gpa; /* 0x0258 */ - /* - * Flags: must be set/cleared atomically after the vsie page can be - * looked up by other CPUs. - */ - unsigned long flags; /* 0x0260 */ - __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */ + /* the shadow gmap in use by the vsie_page */ + struct gmap_cache gmap_cache; /* 0x0260 */ + __u8 reserved[0x0700 - 0x0278]; /* 0x0278 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ }; +static_assert(sizeof(struct vsie_page) == PAGE_SIZE); + /* trigger a validity icpt for the given scb */ static int set_validity_icpt(struct kvm_s390_sie_block *scb, __u16 reason_code) @@ -124,8 +125,8 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; int newflags, cpuflags = atomic_read(&scb_o->cpuflags); - /* we don't allow ESA/390 guests */ - if (!(cpuflags & CPUSTAT_ZARCH)) + /* we don't allow ESA/390 guests unless explicitly enabled */ + if (!(cpuflags & CPUSTAT_ZARCH) && !vcpu->kvm->arch.allow_vsie_esamode) return set_validity_icpt(scb_s, 0x0001U); if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS)) @@ -134,7 +135,9 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return set_validity_icpt(scb_s, 0x0007U); /* intervention requests will be set later */ - newflags = CPUSTAT_ZARCH; + newflags = 0; + if (cpuflags & CPUSTAT_ZARCH) + newflags = CPUSTAT_ZARCH; if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8)) newflags |= CPUSTAT_GED; if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) { @@ -384,6 +387,17 @@ end: return 0; } +static void shadow_esa(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + + /* Ensure these bits are indeed turned off */ + scb_s->eca &= ~ECA_VX; + scb_s->ecb &= ~(ECB_GS | ECB_TE); + scb_s->ecb3 &= ~ECB3_RI; + scb_s->ecd &= ~ECD_HOSTREGMGMT; +} + /* shadow (round up/down) the ibc to avoid validity icpt */ static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { @@ -465,7 +479,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; /* READ_ONCE does not work on bitfields - use a temporary variable */ const uint32_t __new_prefix = scb_o->prefix; - const uint32_t new_prefix = READ_ONCE(__new_prefix); + uint32_t new_prefix = READ_ONCE(__new_prefix); const bool wants_tx = READ_ONCE(scb_o->ecb) & ECB_TE; bool had_tx = scb_s->ecb & ECB_TE; unsigned long new_mso = 0; @@ -513,6 +527,11 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->icpua = scb_o->icpua; + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_ZARCH)) + new_prefix &= GUEST_PREFIX_MASK_ESA; + else + new_prefix &= GUEST_PREFIX_MASK_ZARCH; + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM)) new_mso = READ_ONCE(scb_o->mso) & 0xfffffffffff00000UL; /* if the hva of the prefix changes, we have to remap the prefix */ @@ -587,6 +606,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->hpid = HPID_VSIE; scb_s->cpnc = scb_o->cpnc; + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_ZARCH)) + shadow_esa(vcpu, vsie_page); + prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); out: @@ -595,26 +617,17 @@ out: return rc; } -void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end) +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end) { - struct kvm *kvm = gmap->private; - struct vsie_page *cur; + struct vsie_page *cur, *next; unsigned long prefix; - int i; - if (!gmap_is_shadow(gmap)) - return; + KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &gmap->flags), gmap->kvm); /* * Only new shadow blocks are added to the list during runtime, * therefore we can safely reference them all the time. */ - for (i = 0; i < kvm->arch.vsie.page_count; i++) { - cur = READ_ONCE(kvm->arch.vsie.pages[i]); - if (!cur) - continue; - if (READ_ONCE(cur->gmap) != gmap) - continue; + list_for_each_entry_safe(cur, next, &gmap->scb_users, gmap_cache.list) { prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; /* with mso/msl, the prefix lies at an offset */ prefix += cur->scb_s.mso; @@ -635,7 +648,7 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, * - -EAGAIN if the caller can retry immediately * - -ENOMEM if out of memory */ -static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; @@ -650,10 +663,9 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* with mso/msl, the prefix lies at offset *mso* */ prefix += scb_s->mso; - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL); + rc = gaccess_shadow_fault(vcpu, sg, prefix, NULL, true); if (!rc && (scb_s->ecb & ECB_TE)) - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - prefix + PAGE_SIZE, NULL); + rc = gaccess_shadow_fault(vcpu, sg, prefix + PAGE_SIZE, NULL, true); /* * We don't have to mprotect, we will be called for all unshadows. * SIE will detect if protection applies and trigger a validity. @@ -765,7 +777,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu)) rc = set_validity_icpt(scb_s, 0x0011U); else if ((gpa & PAGE_MASK) != - ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK)) + ((gpa + offsetof(struct bsca_block, cpu[0]) - 1) & PAGE_MASK)) rc = set_validity_icpt(scb_s, 0x003bU); if (!rc) { rc = pin_guest_page(vcpu->kvm, gpa, &hpa); @@ -934,8 +946,9 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, * - > 0 if control has to be given to guest 2 * - < 0 if an error occurred */ -static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { + bool wr = kvm_s390_cur_gmap_fault_is_write(); int rc; if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION) @@ -943,12 +956,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return inject_fault(vcpu, PGM_PROTECTION, current->thread.gmap_teid.addr * PAGE_SIZE, 1); - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - current->thread.gmap_teid.addr * PAGE_SIZE, NULL); + rc = gaccess_shadow_fault(vcpu, sg, current->thread.gmap_teid.addr * PAGE_SIZE, NULL, wr); if (rc > 0) { rc = inject_fault(vcpu, rc, - current->thread.gmap_teid.addr * PAGE_SIZE, - kvm_s390_cur_gmap_fault_is_write()); + current->thread.gmap_teid.addr * PAGE_SIZE, wr); if (rc >= 0) vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE; } @@ -961,12 +972,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * * Will ignore any errors. The next SIE fault will do proper fault handling. */ -static void handle_last_fault(struct kvm_vcpu *vcpu, - struct vsie_page *vsie_page) +static void handle_last_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { if (vsie_page->fault_addr) - kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - vsie_page->fault_addr, NULL); + gaccess_shadow_fault(vcpu, sg, vsie_page->fault_addr, NULL, true); vsie_page->fault_addr = 0; } @@ -1048,11 +1057,12 @@ static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, } } -static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - unsigned long pei_dest, pei_src, src, dest, mask, prefix; + unsigned long src, dest, mask, prefix; u64 *pei_block = &vsie_page->scb_o->mcic; + union mvpg_pei pei_dest, pei_src; int edat, rc_dest, rc_src; union ctlreg0 cr0; @@ -1066,8 +1076,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask; src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso; - rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest); - rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src); + rc_dest = gaccess_shadow_fault(vcpu, sg, dest, &pei_dest, true); + rc_src = gaccess_shadow_fault(vcpu, sg, src, &pei_src, false); /* * Either everything went well, or something non-critical went wrong * e.g. because of a race. In either case, simply retry. @@ -1102,8 +1112,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0; } if (!rc_dest && !rc_src) { - pei_block[0] = pei_dest; - pei_block[1] = pei_src; + pei_block[0] = pei_dest.val; + pei_block[1] = pei_src.val; return 1; } @@ -1127,16 +1137,17 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * - > 0 if control has to be given to guest 2 * - < 0 if an error occurred */ -static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) __releases(vcpu->kvm->srcu) __acquires(vcpu->kvm->srcu) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + unsigned long sie_return = SIE64_RETURN_NORMAL; int guest_bp_isolation; int rc = 0; - handle_last_fault(vcpu, vsie_page); + handle_last_fault(vcpu, vsie_page, sg); kvm_vcpu_srcu_read_unlock(vcpu); @@ -1153,10 +1164,6 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) vcpu->arch.sie_block->fpf & FPF_BPBC) set_thread_flag(TIF_ISOLATE_BP_GUEST); - local_irq_disable(); - guest_enter_irqoff(); - local_irq_enable(); - /* * Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking * and VCPU requests also hinder the vSIE from running and lead @@ -1166,31 +1173,44 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) vcpu->arch.sie_block->prog0c |= PROG_IN_SIE; current->thread.gmap_int_code = 0; barrier(); - if (!kvm_s390_vcpu_sie_inhibited(vcpu)) - rc = sie64a(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce); + if (!kvm_s390_vcpu_sie_inhibited(vcpu)) { +xfer_to_guest_mode_check: + local_irq_disable(); + xfer_to_guest_mode_prepare(); + if (xfer_to_guest_mode_work_pending()) { + local_irq_enable(); + rc = kvm_xfer_to_guest_mode_handle_work(vcpu); + if (rc) + goto skip_sie; + goto xfer_to_guest_mode_check; + } + guest_timing_enter_irqoff(); + sie_return = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, sg->asce.val); + guest_timing_exit_irqoff(); + local_irq_enable(); + } + +skip_sie: barrier(); vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE; - local_irq_disable(); - guest_exit_irqoff(); - local_irq_enable(); - /* restore guest state for bp isolation override */ if (!guest_bp_isolation) clear_thread_flag(TIF_ISOLATE_BP_GUEST); kvm_vcpu_srcu_read_lock(vcpu); - if (rc == -EINTR) { - VCPU_EVENT(vcpu, 3, "%s", "machine check"); + if (sie_return == SIE64_RETURN_MCCK) { kvm_s390_reinject_machine_check(vcpu, &vsie_page->mcck_info); return 0; } + WARN_ON_ONCE(sie_return != SIE64_RETURN_NORMAL); + if (rc > 0) rc = 0; /* we could still have an icpt */ else if (current->thread.gmap_int_code) - return handle_fault(vcpu, vsie_page); + return handle_fault(vcpu, vsie_page, sg); switch (scb_s->icptcode) { case ICPT_INST: @@ -1208,7 +1228,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) break; case ICPT_PARTEXEC: if (scb_s->ipa == 0xb254) - rc = vsie_handle_mvpg(vcpu, vsie_page); + rc = vsie_handle_mvpg(vcpu, vsie_page, sg); break; } return rc; @@ -1216,43 +1236,67 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) static void release_gmap_shadow(struct vsie_page *vsie_page) { - if (vsie_page->gmap) - gmap_put(vsie_page->gmap); - WRITE_ONCE(vsie_page->gmap, NULL); + struct gmap *gmap = vsie_page->gmap_cache.gmap; + + lockdep_assert_held(&gmap->kvm->arch.gmap->children_lock); + + list_del(&vsie_page->gmap_cache.list); + vsie_page->gmap_cache.gmap = NULL; prefix_unmapped(vsie_page); + + if (list_empty(&gmap->scb_users)) { + gmap_remove_child(gmap); + gmap_put(gmap); + } } -static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, - struct vsie_page *vsie_page) +static struct gmap *acquire_gmap_shadow(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { - unsigned long asce; union ctlreg0 cr0; struct gmap *gmap; + union asce asce; int edat; - asce = vcpu->arch.sie_block->gcr[1]; + asce.val = vcpu->arch.sie_block->gcr[1]; cr0.val = vcpu->arch.sie_block->gcr[0]; edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); edat += edat && test_kvm_facility(vcpu->kvm, 78); - /* - * ASCE or EDAT could have changed since last icpt, or the gmap - * we're holding has been unshadowed. If the gmap is still valid, - * we can safely reuse it. - */ - if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) { - vcpu->kvm->stat.gmap_shadow_reuse++; - return 0; + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) { + gmap = vsie_page->gmap_cache.gmap; + if (gmap) { + /* + * ASCE or EDAT could have changed since last icpt, or the gmap + * we're holding has been unshadowed. If the gmap is still valid, + * we can safely reuse it. + */ + if (gmap_is_shadow_valid(gmap, asce, edat)) { + vcpu->kvm->stat.gmap_shadow_reuse++; + gmap_get(gmap); + return gmap; + } + /* release the old shadow and mark the prefix as unmapped */ + release_gmap_shadow(vsie_page); + } } - - /* release the old shadow - if any, and mark the prefix as unmapped */ - release_gmap_shadow(vsie_page); - gmap = gmap_shadow(vcpu->arch.gmap, asce, edat); +again: + gmap = gmap_create_shadow(vcpu->arch.mc, vcpu->kvm->arch.gmap, asce, edat); if (IS_ERR(gmap)) - return PTR_ERR(gmap); - vcpu->kvm->stat.gmap_shadow_create++; - WRITE_ONCE(vsie_page->gmap, gmap); - return 0; + return gmap; + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) { + /* unlikely race condition, remove the previous shadow */ + if (vsie_page->gmap_cache.gmap) + release_gmap_shadow(vsie_page); + if (!gmap->parent) { + gmap_put(gmap); + goto again; + } + vcpu->kvm->stat.gmap_shadow_create++; + list_add(&vsie_page->gmap_cache.list, &gmap->scb_users); + vsie_page->gmap_cache.gmap = gmap; + prefix_unmapped(vsie_page); + } + return gmap; } /* @@ -1305,15 +1349,20 @@ static void unregister_shadow_scb(struct kvm_vcpu *vcpu) static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct gmap *sg = NULL; int rc = 0; while (1) { - rc = acquire_gmap_shadow(vcpu, vsie_page); + sg = acquire_gmap_shadow(vcpu, vsie_page); + if (IS_ERR(sg)) { + rc = PTR_ERR(sg); + sg = NULL; + } if (!rc) - rc = map_prefix(vcpu, vsie_page); + rc = map_prefix(vcpu, vsie_page, sg); if (!rc) { update_intervention_requests(vsie_page); - rc = do_vsie_run(vcpu, vsie_page); + rc = do_vsie_run(vcpu, vsie_page, sg); } atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20); @@ -1331,14 +1380,17 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * but rewind the PSW to re-enter SIE once that's completed * instead of passing a "no action" intercept to the guest. */ - if (signal_pending(current) || - kvm_s390_vcpu_has_irq(vcpu, 0) || + if (kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) { kvm_s390_rewind_psw(vcpu, 4); break; } + if (sg) + sg = gmap_put(sg); cond_resched(); } + if (sg) + sg = gmap_put(sg); if (rc == -EFAULT) { /* @@ -1434,8 +1486,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) vsie_page->scb_gpa = ULONG_MAX; /* Double use of the same address or allocation failure. */ - if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, - vsie_page)) { + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, vsie_page)) { put_vsie_page(vsie_page); mutex_unlock(&kvm->arch.vsie.mutex); return NULL; @@ -1444,7 +1495,12 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) mutex_unlock(&kvm->arch.vsie.mutex); memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); - release_gmap_shadow(vsie_page); + if (vsie_page->gmap_cache.gmap) { + scoped_guard(spinlock, &kvm->arch.gmap->children_lock) + if (vsie_page->gmap_cache.gmap) + release_gmap_shadow(vsie_page); + } + prefix_unmapped(vsie_page); vsie_page->fault_addr = 0; vsie_page->scb_s.ihcpu = 0xffffU; return vsie_page; @@ -1469,18 +1525,19 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) if (unlikely(scb_addr & 0x1ffUL)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) || - kvm_s390_vcpu_sie_inhibited(vcpu)) { + if (kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) { kvm_s390_rewind_psw(vcpu, 4); return 0; } vsie_page = get_vsie_page(vcpu->kvm, scb_addr); - if (IS_ERR(vsie_page)) + if (IS_ERR(vsie_page)) { return PTR_ERR(vsie_page); - else if (!vsie_page) + } else if (!vsie_page) { /* double use of sie control block - simply do nothing */ + kvm_s390_rewind_psw(vcpu, 4); return 0; + } rc = pin_scb(vcpu, vsie_page, scb_addr); if (rc) @@ -1521,8 +1578,10 @@ void kvm_s390_vsie_destroy(struct kvm *kvm) mutex_lock(&kvm->arch.vsie.mutex); for (i = 0; i < kvm->arch.vsie.page_count; i++) { vsie_page = kvm->arch.vsie.pages[i]; + scoped_guard(spinlock, &kvm->arch.gmap->children_lock) + if (vsie_page->gmap_cache.gmap) + release_gmap_shadow(vsie_page); kvm->arch.vsie.pages[i] = NULL; - release_gmap_shadow(vsie_page); /* free the radix tree entry */ if (vsie_page->scb_gpa != ULONG_MAX) radix_tree_delete(&kvm->arch.vsie.addr_to_page, diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 14bbfe50033c..2bf47204f6ab 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -5,7 +5,7 @@ lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o lib-y += csum-partial.o -obj-y += mem.o xor.o +obj-y += mem.o lib-$(CONFIG_KPROBES) += probes.o lib-$(CONFIG_UPROBES) += probes.o obj-$(CONFIG_S390_KPROBES_SANITY_TEST) += test_kprobes_s390.o @@ -24,6 +24,3 @@ obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o obj-$(CONFIG_EXPOLINE_EXTERN) += expoline.o - -obj-$(CONFIG_CRC32_ARCH) += crc32-s390.o -crc32-s390-y := crc32-glue.o crc32le-vx.o crc32be-vx.o diff --git a/arch/s390/lib/crc32-glue.c b/arch/s390/lib/crc32-glue.c deleted file mode 100644 index 137080e61f90..000000000000 --- a/arch/s390/lib/crc32-glue.c +++ /dev/null @@ -1,92 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * CRC-32 implemented with the z/Architecture Vector Extension Facility. - * - * Copyright IBM Corp. 2015 - * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - */ -#define KMSG_COMPONENT "crc32-vx" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/module.h> -#include <linux/cpufeature.h> -#include <linux/crc32.h> -#include <asm/fpu.h> -#include "crc32-vx.h" - -#define VX_MIN_LEN 64 -#define VX_ALIGNMENT 16L -#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) - -static DEFINE_STATIC_KEY_FALSE(have_vxrs); - -/* - * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension - * - * Creates a function to perform a particular CRC-32 computation. Depending - * on the message buffer, the hardware-accelerated or software implementation - * is used. Note that the message buffer is aligned to improve fetch - * operations of VECTOR LOAD MULTIPLE instructions. - */ -#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \ - u32 ___fname(u32 crc, const u8 *data, size_t datalen) \ - { \ - unsigned long prealign, aligned, remaining; \ - DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \ - \ - if (datalen < VX_MIN_LEN + VX_ALIGN_MASK || \ - !static_branch_likely(&have_vxrs)) \ - return ___crc32_sw(crc, data, datalen); \ - \ - if ((unsigned long)data & VX_ALIGN_MASK) { \ - prealign = VX_ALIGNMENT - \ - ((unsigned long)data & VX_ALIGN_MASK); \ - datalen -= prealign; \ - crc = ___crc32_sw(crc, data, prealign); \ - data = (void *)((unsigned long)data + prealign); \ - } \ - \ - aligned = datalen & ~VX_ALIGN_MASK; \ - remaining = datalen & VX_ALIGN_MASK; \ - \ - kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \ - crc = ___crc32_vx(crc, data, aligned); \ - kernel_fpu_end(&vxstate, KERNEL_VXR_LOW); \ - \ - if (remaining) \ - crc = ___crc32_sw(crc, data + aligned, remaining); \ - \ - return crc; \ - } \ - EXPORT_SYMBOL(___fname); - -DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base) -DEFINE_CRC32_VX(crc32_be_arch, crc32_be_vgfm_16, crc32_be_base) -DEFINE_CRC32_VX(crc32c_le_arch, crc32c_le_vgfm_16, crc32c_le_base) - -static int __init crc32_s390_init(void) -{ - if (cpu_have_feature(S390_CPU_FEATURE_VXRS)) - static_branch_enable(&have_vxrs); - return 0; -} -arch_initcall(crc32_s390_init); - -static void __exit crc32_s390_exit(void) -{ -} -module_exit(crc32_s390_exit); - -u32 crc32_optimizations(void) -{ - if (static_key_enabled(&have_vxrs)) - return CRC32_LE_OPTIMIZATION | - CRC32_BE_OPTIMIZATION | - CRC32C_OPTIMIZATION; - return 0; -} -EXPORT_SYMBOL(crc32_optimizations); - -MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>"); -MODULE_DESCRIPTION("CRC-32 algorithms using z/Architecture Vector Extension Facility"); -MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/crc32-vx.h b/arch/s390/lib/crc32-vx.h deleted file mode 100644 index 652c96e1a822..000000000000 --- a/arch/s390/lib/crc32-vx.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _CRC32_VX_S390_H -#define _CRC32_VX_S390_H - -#include <linux/types.h> - -u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size); -u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); -u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); - -#endif /* _CRC32_VX_S390_H */ diff --git a/arch/s390/lib/crc32be-vx.c b/arch/s390/lib/crc32be-vx.c deleted file mode 100644 index fed7c9c70d05..000000000000 --- a/arch/s390/lib/crc32be-vx.c +++ /dev/null @@ -1,174 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Hardware-accelerated CRC-32 variants for Linux on z Systems - * - * Use the z/Architecture Vector Extension Facility to accelerate the - * computing of CRC-32 checksums. - * - * This CRC-32 implementation algorithm processes the most-significant - * bit first (BE). - * - * Copyright IBM Corp. 2015 - * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - */ - -#include <linux/types.h> -#include <asm/fpu.h> -#include "crc32-vx.h" - -/* Vector register range containing CRC-32 constants */ -#define CONST_R1R2 9 -#define CONST_R3R4 10 -#define CONST_R5 11 -#define CONST_R6 12 -#define CONST_RU_POLY 13 -#define CONST_CRC_POLY 14 - -/* - * The CRC-32 constant block contains reduction constants to fold and - * process particular chunks of the input data stream in parallel. - * - * For the CRC-32 variants, the constants are precomputed according to - * these definitions: - * - * R1 = x4*128+64 mod P(x) - * R2 = x4*128 mod P(x) - * R3 = x128+64 mod P(x) - * R4 = x128 mod P(x) - * R5 = x96 mod P(x) - * R6 = x64 mod P(x) - * - * Barret reduction constant, u, is defined as floor(x**64 / P(x)). - * - * where P(x) is the polynomial in the normal domain and the P'(x) is the - * polynomial in the reversed (bitreflected) domain. - * - * Note that the constant definitions below are extended in order to compute - * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. - * The rightmost doubleword can be 0 to prevent contribution to the result or - * can be multiplied by 1 to perform an XOR without the need for a separate - * VECTOR EXCLUSIVE OR instruction. - * - * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: - * - * P(x) = 0x04C11DB7 - * P'(x) = 0xEDB88320 - */ - -static unsigned long constants_CRC_32_BE[] = { - 0x08833794c, 0x0e6228b11, /* R1, R2 */ - 0x0c5b9cd4c, 0x0e8a45605, /* R3, R4 */ - 0x0f200aa66, 1UL << 32, /* R5, x32 */ - 0x0490d678d, 1, /* R6, 1 */ - 0x104d101df, 0, /* u */ - 0x104C11DB7, 0, /* P(x) */ -}; - -/** - * crc32_be_vgfm_16 - Compute CRC-32 (BE variant) with vector registers - * @crc: Initial CRC value, typically ~0. - * @buf: Input buffer pointer, performance might be improved if the - * buffer is on a doubleword boundary. - * @size: Size of the buffer, must be 64 bytes or greater. - * - * Register usage: - * V0: Initial CRC value and intermediate constants and results. - * V1..V4: Data for CRC computation. - * V5..V8: Next data chunks that are fetched from the input buffer. - * V9..V14: CRC-32 constants. - */ -u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size) -{ - /* Load CRC-32 constants */ - fpu_vlm(CONST_R1R2, CONST_CRC_POLY, &constants_CRC_32_BE); - fpu_vzero(0); - - /* Load the initial CRC value into the leftmost word of V0. */ - fpu_vlvgf(0, crc, 0); - - /* Load a 64-byte data chunk and XOR with CRC */ - fpu_vlm(1, 4, buf); - fpu_vx(1, 0, 1); - buf += 64; - size -= 64; - - while (size >= 64) { - /* Load the next 64-byte data chunk into V5 to V8 */ - fpu_vlm(5, 8, buf); - - /* - * Perform a GF(2) multiplication of the doublewords in V1 with - * the reduction constants in V0. The intermediate result is - * then folded (accumulated) with the next data chunk in V5 and - * stored in V1. Repeat this step for the register contents - * in V2, V3, and V4 respectively. - */ - fpu_vgfmag(1, CONST_R1R2, 1, 5); - fpu_vgfmag(2, CONST_R1R2, 2, 6); - fpu_vgfmag(3, CONST_R1R2, 3, 7); - fpu_vgfmag(4, CONST_R1R2, 4, 8); - buf += 64; - size -= 64; - } - - /* Fold V1 to V4 into a single 128-bit value in V1 */ - fpu_vgfmag(1, CONST_R3R4, 1, 2); - fpu_vgfmag(1, CONST_R3R4, 1, 3); - fpu_vgfmag(1, CONST_R3R4, 1, 4); - - while (size >= 16) { - fpu_vl(2, buf); - fpu_vgfmag(1, CONST_R3R4, 1, 2); - buf += 16; - size -= 16; - } - - /* - * The R5 constant is used to fold a 128-bit value into an 96-bit value - * that is XORed with the next 96-bit input data chunk. To use a single - * VGFMG instruction, multiply the rightmost 64-bit with x^32 (1<<32) to - * form an intermediate 96-bit value (with appended zeros) which is then - * XORed with the intermediate reduction result. - */ - fpu_vgfmg(1, CONST_R5, 1); - - /* - * Further reduce the remaining 96-bit value to a 64-bit value using a - * single VGFMG, the rightmost doubleword is multiplied with 0x1. The - * intermediate result is then XORed with the product of the leftmost - * doubleword with R6. The result is a 64-bit value and is subject to - * the Barret reduction. - */ - fpu_vgfmg(1, CONST_R6, 1); - - /* - * The input values to the Barret reduction are the degree-63 polynomial - * in V1 (R(x)), degree-32 generator polynomial, and the reduction - * constant u. The Barret reduction result is the CRC value of R(x) mod - * P(x). - * - * The Barret reduction algorithm is defined as: - * - * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u - * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) - * 3. C(x) = R(x) XOR T2(x) mod x^32 - * - * Note: To compensate the division by x^32, use the vector unpack - * instruction to move the leftmost word into the leftmost doubleword - * of the vector register. The rightmost doubleword is multiplied - * with zero to not contribute to the intermediate results. - */ - - /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ - fpu_vupllf(2, 1); - fpu_vgfmg(2, CONST_RU_POLY, 2); - - /* - * Compute the GF(2) product of the CRC polynomial in VO with T1(x) in - * V2 and XOR the intermediate result, T2(x), with the value in V1. - * The final result is in the rightmost word of V2. - */ - fpu_vupllf(2, 2); - fpu_vgfmag(2, CONST_CRC_POLY, 2, 1); - return fpu_vlgvf(2, 3); -} diff --git a/arch/s390/lib/crc32le-vx.c b/arch/s390/lib/crc32le-vx.c deleted file mode 100644 index 2f629f394df7..000000000000 --- a/arch/s390/lib/crc32le-vx.c +++ /dev/null @@ -1,240 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Hardware-accelerated CRC-32 variants for Linux on z Systems - * - * Use the z/Architecture Vector Extension Facility to accelerate the - * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet - * and Castagnoli. - * - * This CRC-32 implementation algorithm is bitreflected and processes - * the least-significant bit first (Little-Endian). - * - * Copyright IBM Corp. 2015 - * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - */ - -#include <linux/types.h> -#include <asm/fpu.h> -#include "crc32-vx.h" - -/* Vector register range containing CRC-32 constants */ -#define CONST_PERM_LE2BE 9 -#define CONST_R2R1 10 -#define CONST_R4R3 11 -#define CONST_R5 12 -#define CONST_RU_POLY 13 -#define CONST_CRC_POLY 14 - -/* - * The CRC-32 constant block contains reduction constants to fold and - * process particular chunks of the input data stream in parallel. - * - * For the CRC-32 variants, the constants are precomputed according to - * these definitions: - * - * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1 - * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1 - * R3 = [(x128+32 mod P'(x) << 32)]' << 1 - * R4 = [(x128-32 mod P'(x) << 32)]' << 1 - * R5 = [(x64 mod P'(x) << 32)]' << 1 - * R6 = [(x32 mod P'(x) << 32)]' << 1 - * - * The bitreflected Barret reduction constant, u', is defined as - * the bit reversal of floor(x**64 / P(x)). - * - * where P(x) is the polynomial in the normal domain and the P'(x) is the - * polynomial in the reversed (bitreflected) domain. - * - * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: - * - * P(x) = 0x04C11DB7 - * P'(x) = 0xEDB88320 - * - * CRC-32C (Castagnoli) polynomials: - * - * P(x) = 0x1EDC6F41 - * P'(x) = 0x82F63B78 - */ - -static unsigned long constants_CRC_32_LE[] = { - 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */ - 0x1c6e41596, 0x154442bd4, /* R2, R1 */ - 0x0ccaa009e, 0x1751997d0, /* R4, R3 */ - 0x0, 0x163cd6124, /* R5 */ - 0x0, 0x1f7011641, /* u' */ - 0x0, 0x1db710641 /* P'(x) << 1 */ -}; - -static unsigned long constants_CRC_32C_LE[] = { - 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */ - 0x09e4addf8, 0x740eef02, /* R2, R1 */ - 0x14cd00bd6, 0xf20c0dfe, /* R4, R3 */ - 0x0, 0x0dd45aab8, /* R5 */ - 0x0, 0x0dea713f1, /* u' */ - 0x0, 0x105ec76f0 /* P'(x) << 1 */ -}; - -/** - * crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers - * @crc: Initial CRC value, typically ~0. - * @buf: Input buffer pointer, performance might be improved if the - * buffer is on a doubleword boundary. - * @size: Size of the buffer, must be 64 bytes or greater. - * @constants: CRC-32 constant pool base pointer. - * - * Register usage: - * V0: Initial CRC value and intermediate constants and results. - * V1..V4: Data for CRC computation. - * V5..V8: Next data chunks that are fetched from the input buffer. - * V9: Constant for BE->LE conversion and shift operations - * V10..V14: CRC-32 constants. - */ -static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const *buf, size_t size, unsigned long *constants) -{ - /* Load CRC-32 constants */ - fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants); - - /* - * Load the initial CRC value. - * - * The CRC value is loaded into the rightmost word of the - * vector register and is later XORed with the LSB portion - * of the loaded input data. - */ - fpu_vzero(0); /* Clear V0 */ - fpu_vlvgf(0, crc, 3); /* Load CRC into rightmost word */ - - /* Load a 64-byte data chunk and XOR with CRC */ - fpu_vlm(1, 4, buf); - fpu_vperm(1, 1, 1, CONST_PERM_LE2BE); - fpu_vperm(2, 2, 2, CONST_PERM_LE2BE); - fpu_vperm(3, 3, 3, CONST_PERM_LE2BE); - fpu_vperm(4, 4, 4, CONST_PERM_LE2BE); - - fpu_vx(1, 0, 1); /* V1 ^= CRC */ - buf += 64; - size -= 64; - - while (size >= 64) { - fpu_vlm(5, 8, buf); - fpu_vperm(5, 5, 5, CONST_PERM_LE2BE); - fpu_vperm(6, 6, 6, CONST_PERM_LE2BE); - fpu_vperm(7, 7, 7, CONST_PERM_LE2BE); - fpu_vperm(8, 8, 8, CONST_PERM_LE2BE); - /* - * Perform a GF(2) multiplication of the doublewords in V1 with - * the R1 and R2 reduction constants in V0. The intermediate - * result is then folded (accumulated) with the next data chunk - * in V5 and stored in V1. Repeat this step for the register - * contents in V2, V3, and V4 respectively. - */ - fpu_vgfmag(1, CONST_R2R1, 1, 5); - fpu_vgfmag(2, CONST_R2R1, 2, 6); - fpu_vgfmag(3, CONST_R2R1, 3, 7); - fpu_vgfmag(4, CONST_R2R1, 4, 8); - buf += 64; - size -= 64; - } - - /* - * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3 - * and R4 and accumulating the next 128-bit chunk until a single 128-bit - * value remains. - */ - fpu_vgfmag(1, CONST_R4R3, 1, 2); - fpu_vgfmag(1, CONST_R4R3, 1, 3); - fpu_vgfmag(1, CONST_R4R3, 1, 4); - - while (size >= 16) { - fpu_vl(2, buf); - fpu_vperm(2, 2, 2, CONST_PERM_LE2BE); - fpu_vgfmag(1, CONST_R4R3, 1, 2); - buf += 16; - size -= 16; - } - - /* - * Set up a vector register for byte shifts. The shift value must - * be loaded in bits 1-4 in byte element 7 of a vector register. - * Shift by 8 bytes: 0x40 - * Shift by 4 bytes: 0x20 - */ - fpu_vleib(9, 0x40, 7); - - /* - * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes - * to move R4 into the rightmost doubleword and set the leftmost - * doubleword to 0x1. - */ - fpu_vsrlb(0, CONST_R4R3, 9); - fpu_vleig(0, 1, 0); - - /* - * Compute GF(2) product of V1 and V0. The rightmost doubleword - * of V1 is multiplied with R4. The leftmost doubleword of V1 is - * multiplied by 0x1 and is then XORed with rightmost product. - * Implicitly, the intermediate leftmost product becomes padded - */ - fpu_vgfmg(1, 0, 1); - - /* - * Now do the final 32-bit fold by multiplying the rightmost word - * in V1 with R5 and XOR the result with the remaining bits in V1. - * - * To achieve this by a single VGFMAG, right shift V1 by a word - * and store the result in V2 which is then accumulated. Use the - * vector unpack instruction to load the rightmost half of the - * doubleword into the rightmost doubleword element of V1; the other - * half is loaded in the leftmost doubleword. - * The vector register with CONST_R5 contains the R5 constant in the - * rightmost doubleword and the leftmost doubleword is zero to ignore - * the leftmost product of V1. - */ - fpu_vleib(9, 0x20, 7); /* Shift by words */ - fpu_vsrlb(2, 1, 9); /* Store remaining bits in V2 */ - fpu_vupllf(1, 1); /* Split rightmost doubleword */ - fpu_vgfmag(1, CONST_R5, 1, 2); /* V1 = (V1 * R5) XOR V2 */ - - /* - * Apply a Barret reduction to compute the final 32-bit CRC value. - * - * The input values to the Barret reduction are the degree-63 polynomial - * in V1 (R(x)), degree-32 generator polynomial, and the reduction - * constant u. The Barret reduction result is the CRC value of R(x) mod - * P(x). - * - * The Barret reduction algorithm is defined as: - * - * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u - * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) - * 3. C(x) = R(x) XOR T2(x) mod x^32 - * - * Note: The leftmost doubleword of vector register containing - * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product - * is zero and does not contribute to the final result. - */ - - /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ - fpu_vupllf(2, 1); - fpu_vgfmg(2, CONST_RU_POLY, 2); - - /* - * Compute the GF(2) product of the CRC polynomial with T1(x) in - * V2 and XOR the intermediate result, T2(x), with the value in V1. - * The final result is stored in word element 2 of V2. - */ - fpu_vupllf(2, 2); - fpu_vgfmag(2, CONST_CRC_POLY, 2, 1); - - return fpu_vlgvf(2, 2); -} - -u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size) -{ - return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32_LE[0]); -} - -u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size) -{ - return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32C_LE[0]); -} diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index be14c58cb989..c1ea14e3c927 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -7,6 +7,7 @@ */ #include <linux/processor.h> +#include <linux/export.h> #include <linux/delay.h> #include <asm/div64.h> #include <asm/timex.h> diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index a81a01c44927..10db1e56a811 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -10,11 +10,13 @@ #include <linux/export.h> #include <linux/spinlock.h> #include <linux/jiffies.h> +#include <linux/sysctl.h> #include <linux/init.h> #include <linux/smp.h> #include <linux/percpu.h> #include <linux/io.h> #include <asm/alternative.h> +#include <asm/machine.h> #include <asm/asm.h> int spin_retry = -1; @@ -37,6 +39,23 @@ static int __init spin_retry_setup(char *str) } __setup("spin_retry=", spin_retry_setup); +static const struct ctl_table s390_spin_sysctl_table[] = { + { + .procname = "spin_retry", + .data = &spin_retry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_s390_spin_sysctls(void) +{ + register_sysctl_init("kernel", s390_spin_sysctl_table); + return 0; +} +arch_initcall(init_s390_spin_sysctls); + struct spin_wait { struct spin_wait *next, *prev; int node_id; @@ -77,7 +96,7 @@ static inline int arch_load_niai4(int *lock) asm_inline volatile( ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", ALT_FACILITY(49)) /* NIAI 4 */ - " l %[owner],%[lock]\n" + " l %[owner],%[lock]" : [owner] "=d" (owner) : [lock] "R" (*lock) : "memory"); return owner; } @@ -90,7 +109,7 @@ static inline int arch_try_cmpxchg_niai8(int *lock, int old, int new) asm_inline volatile( ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", ALT_FACILITY(49)) /* NIAI 8 */ - " cs %[old],%[new],%[lock]\n" + " cs %[old],%[new],%[lock]" : [old] "+d" (old), [lock] "+Q" (*lock), "=@cc" (cc) : [new] "d" (new) : "memory"); @@ -105,7 +124,7 @@ static inline int arch_try_cmpxchg_niai8(int *lock, int old, int new) asm_inline volatile( ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", ALT_FACILITY(49)) /* NIAI 8 */ - " cs %[old],%[new],%[lock]\n" + " cs %[old],%[new],%[lock]" : [old] "+d" (old), [lock] "+Q" (*lock) : [new] "d" (new) : "cc", "memory"); @@ -141,7 +160,7 @@ static inline void arch_spin_lock_queued(arch_spinlock_t *lp) ix = get_lowcore()->spinlock_index++; barrier(); - lockval = SPINLOCK_LOCKVAL; /* cpu + 1 */ + lockval = spinlock_lockval(); /* cpu + 1 */ node = this_cpu_ptr(&spin_wait[ix]); node->prev = node->next = NULL; node_id = node->node_id; @@ -212,7 +231,7 @@ static inline void arch_spin_lock_queued(arch_spinlock_t *lp) if (count-- >= 0) continue; count = spin_retry; - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) + if (!machine_is_lpar() || arch_vcpu_is_preempted(owner - 1)) smp_yield_cpu(owner - 1); } @@ -232,7 +251,7 @@ static inline void arch_spin_lock_classic(arch_spinlock_t *lp) { int lockval, old, new, owner, count; - lockval = SPINLOCK_LOCKVAL; /* cpu + 1 */ + lockval = spinlock_lockval(); /* cpu + 1 */ /* Pass the virtual CPU to the lock holder if it is not running */ owner = arch_spin_yield_target(READ_ONCE(lp->lock), NULL); @@ -255,7 +274,7 @@ static inline void arch_spin_lock_classic(arch_spinlock_t *lp) if (count-- >= 0) continue; count = spin_retry; - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) + if (!machine_is_lpar() || arch_vcpu_is_preempted(owner - 1)) smp_yield_cpu(owner - 1); } } @@ -271,7 +290,7 @@ EXPORT_SYMBOL(arch_spin_lock_wait); int arch_spin_trylock_retry(arch_spinlock_t *lp) { - int cpu = SPINLOCK_LOCKVAL; + int cpu = spinlock_lockval(); int owner, count; for (count = spin_retry; count > 0; count--) { @@ -337,7 +356,7 @@ void arch_spin_relax(arch_spinlock_t *lp) cpu = READ_ONCE(lp->lock) & _Q_LOCK_CPU_MASK; if (!cpu) return; - if (MACHINE_IS_LPAR && !arch_vcpu_is_preempted(cpu - 1)) + if (machine_is_lpar() && !arch_vcpu_is_preempted(cpu - 1)) return; smp_yield_cpu(cpu - 1); } diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index 373fa1f01937..757f58960198 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -27,7 +27,7 @@ static inline char *__strend(const char *s) asm volatile( " lghi 0,0\n" "0: srst %[e],%[s]\n" - " jo 0b\n" + " jo 0b" : [e] "+&a" (e), [s] "+&a" (s) : : "cc", "memory", "0"); @@ -41,7 +41,7 @@ static inline char *__strnend(const char *s, size_t n) asm volatile( " lghi 0,0\n" "0: srst %[p],%[s]\n" - " jo 0b\n" + " jo 0b" : [p] "+&d" (p), [s] "+&a" (s) : : "cc", "memory", "0"); @@ -78,50 +78,6 @@ EXPORT_SYMBOL(strnlen); #endif /** - * strcpy - Copy a %NUL terminated string - * @dest: Where to copy the string to - * @src: Where to copy the string from - * - * returns a pointer to @dest - */ -#ifdef __HAVE_ARCH_STRCPY -char *strcpy(char *dest, const char *src) -{ - char *ret = dest; - - asm volatile( - " lghi 0,0\n" - "0: mvst %[dest],%[src]\n" - " jo 0b\n" - : [dest] "+&a" (dest), [src] "+&a" (src) - : - : "cc", "memory", "0"); - return ret; -} -EXPORT_SYMBOL(strcpy); -#endif - -/** - * strncpy - Copy a length-limited, %NUL-terminated string - * @dest: Where to copy the string to - * @src: Where to copy the string from - * @n: The maximum number of bytes to copy - * - * The result is not %NUL-terminated if the source exceeds - * @n bytes. - */ -#ifdef __HAVE_ARCH_STRNCPY -char *strncpy(char *dest, const char *src, size_t n) -{ - size_t len = __strnend(src, n) - src; - memset(dest + len, 0, n - len); - memcpy(dest, src, len); - return dest; -} -EXPORT_SYMBOL(strncpy); -#endif - -/** * strcat - Append one %NUL-terminated string to another * @dest: The string to be appended to * @src: The string to append to it @@ -139,7 +95,7 @@ char *strcat(char *dest, const char *src) "0: srst %[dummy],%[dest]\n" " jo 0b\n" "1: mvst %[dummy],%[src]\n" - " jo 1b\n" + " jo 1b" : [dummy] "+&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src) : : "cc", "memory", "0"); @@ -181,9 +137,6 @@ EXPORT_SYMBOL(strlcat); * @n: The maximum numbers of bytes to copy * * returns a pointer to @dest - * - * Note that in contrast to strncpy, strncat ensures the result is - * terminated. */ #ifdef __HAVE_ARCH_STRNCAT char *strncat(char *dest, const char *src, size_t n) @@ -338,7 +291,7 @@ void *memscan(void *s, int c, size_t n) asm volatile( " lgr 0,%[c]\n" "0: srst %[ret],%[s]\n" - " jo 0b\n" + " jo 0b" : [ret] "+&a" (ret), [s] "+&a" (s) : [c] "d" (c) : "cc", "memory", "0"); diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c index 6e42100875e7..6bb3fa5bf925 100644 --- a/arch/s390/lib/test_unwind.c +++ b/arch/s390/lib/test_unwind.c @@ -150,7 +150,7 @@ static __always_inline struct pt_regs fake_pt_regs(void) regs.gprs[15] = current_stack_pointer; asm volatile( - "basr %[psw_addr],0\n" + "basr %[psw_addr],0" : [psw_addr] "=d" (regs.psw.addr)); return regs; } @@ -232,7 +232,7 @@ static noinline void test_unwind_kprobed_func(void) asm volatile( " nopr %%r7\n" "test_unwind_kprobed_insn:\n" - " nopr %%r7\n" + " nopr %%r7" :); } diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index f977b7c37efc..0ac2f3998b14 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -8,89 +8,196 @@ * Gerald Schaefer (gerald.schaefer@de.ibm.com) */ +#include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/export.h> #include <linux/mm.h> #include <asm/asm-extable.h> #include <asm/ctlreg.h> +#include <asm/skey.h> #ifdef CONFIG_DEBUG_ENTRY void debug_user_asce(int exit) { + struct lowcore *lc = get_lowcore(); struct ctlreg cr1, cr7; local_ctl_store(1, &cr1); local_ctl_store(7, &cr7); - if (cr1.val == get_lowcore()->kernel_asce.val && cr7.val == get_lowcore()->user_asce.val) + if (cr1.val == lc->user_asce.val && cr7.val == lc->user_asce.val) return; panic("incorrect ASCE on kernel %s\n" "cr1: %016lx cr7: %016lx\n" "kernel: %016lx user: %016lx\n", exit ? "exit" : "entry", cr1.val, cr7.val, - get_lowcore()->kernel_asce.val, get_lowcore()->user_asce.val); + lc->kernel_asce.val, lc->user_asce.val); } #endif /*CONFIG_DEBUG_ENTRY */ -unsigned long _copy_from_user_key(void *to, const void __user *from, - unsigned long n, unsigned long key) +#define CMPXCHG_USER_KEY_MAX_LOOPS 128 + +static nokprobe_inline int __cmpxchg_key_small(void *address, unsigned int *uval, + unsigned int old, unsigned int new, + unsigned int mask, unsigned long key) +{ + unsigned long count; + unsigned int prev; + int rc = 0; + + skey_regions_initialize(); + asm_inline volatile( + "20: spka 0(%[key])\n" + " llill %[count],%[max_loops]\n" + "0: l %[prev],%[address]\n" + "1: nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "2: lr %[tmp],%[prev]\n" + "3: cs %[prev],%[new],%[address]\n" + "4: jnl 5f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jnz 5f\n" + " brct %[count],2b\n" + "5: spka %[default_key]\n" + "21:\n" + EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) + SKEY_REGION(20b, 21b) + : [rc] "+&d" (rc), + [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (old), + [new] "+&d" (new), + [mask] "+&d" (mask), + [count] "=a" (count) + : [key] "%[count]" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY), + [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) + : "memory", "cc"); + *uval = prev; + if (!count) + rc = -EAGAIN; + return rc; +} + +int __kprobes __cmpxchg_key1(void *addr, unsigned char *uval, unsigned char old, + unsigned char new, unsigned long key) +{ + unsigned long address = (unsigned long)addr; + unsigned int prev, shift, mask, _old, _new; + int rc; + + shift = (3 ^ (address & 3)) << 3; + address ^= address & 3; + _old = (unsigned int)old << shift; + _new = (unsigned int)new << shift; + mask = ~(0xff << shift); + rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key); + *uval = prev >> shift; + return rc; +} +EXPORT_SYMBOL(__cmpxchg_key1); + +int __kprobes __cmpxchg_key2(void *addr, unsigned short *uval, unsigned short old, + unsigned short new, unsigned long key) +{ + unsigned long address = (unsigned long)addr; + unsigned int prev, shift, mask, _old, _new; + int rc; + + shift = (2 ^ (address & 2)) << 3; + address ^= address & 2; + _old = (unsigned int)old << shift; + _new = (unsigned int)new << shift; + mask = ~(0xffff << shift); + rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key); + *uval = prev >> shift; + return rc; +} +EXPORT_SYMBOL(__cmpxchg_key2); + +int __kprobes __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old, + unsigned int new, unsigned long key) { - unsigned long res = n; - - might_fault(); - if (!should_fail_usercopy()) { - instrument_copy_from_user_before(to, from, n); - res = raw_copy_from_user_key(to, from, n, key); - instrument_copy_from_user_after(to, from, n, res); - } - if (unlikely(res)) - memset(to + (n - res), 0, res); - return res; + unsigned int prev = old; + int rc = 0; + + skey_regions_initialize(); + asm_inline volatile( + "20: spka 0(%[key])\n" + "0: cs %[prev],%[new],%[address]\n" + "1: spka %[default_key]\n" + "21:\n" + EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) + SKEY_REGION(20b, 21b) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+Q" (*(int *)address) + : [new] "d" (new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *uval = prev; + return rc; } -EXPORT_SYMBOL(_copy_from_user_key); +EXPORT_SYMBOL(__cmpxchg_key4); -unsigned long _copy_to_user_key(void __user *to, const void *from, - unsigned long n, unsigned long key) +int __kprobes __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old, + unsigned long new, unsigned long key) { - might_fault(); - if (should_fail_usercopy()) - return n; - instrument_copy_to_user(to, from, n); - return raw_copy_to_user_key(to, from, n, key); + unsigned long prev = old; + int rc = 0; + + skey_regions_initialize(); + asm_inline volatile( + "20: spka 0(%[key])\n" + "0: csg %[prev],%[new],%[address]\n" + "1: spka %[default_key]\n" + "21:\n" + EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) + SKEY_REGION(20b, 21b) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+QS" (*(long *)address) + : [new] "d" (new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *uval = prev; + return rc; } -EXPORT_SYMBOL(_copy_to_user_key); +EXPORT_SYMBOL(__cmpxchg_key8); -unsigned long __clear_user(void __user *to, unsigned long size) +int __kprobes __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old, + __uint128_t new, unsigned long key) { - unsigned long rem; - union oac spec = { - .oac1.as = PSW_BITS_AS_SECONDARY, - .oac1.a = 1, - }; - - asm volatile( - " lr 0,%[spec]\n" - "0: mvcos 0(%[to]),0(%[zeropg]),%[size]\n" - "1: jz 5f\n" - " algr %[size],%[val]\n" - " slgr %[to],%[val]\n" - " j 0b\n" - "2: la %[rem],4095(%[to])\n" /* rem = to + 4095 */ - " nr %[rem],%[val]\n" /* rem = (to + 4095) & -4096 */ - " slgr %[rem],%[to]\n" - " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ - " jnh 6f\n" - "3: mvcos 0(%[to]),0(%[zeropg]),%[rem]\n" - "4: slgr %[size],%[rem]\n" - " j 6f\n" - "5: slgr %[size],%[size]\n" - "6:\n" - EX_TABLE(0b, 2b) - EX_TABLE(1b, 2b) - EX_TABLE(3b, 6b) - EX_TABLE(4b, 6b) - : [size] "+&a" (size), [to] "+&a" (to), [rem] "=&a" (rem) - : [val] "a" (-4096UL), [zeropg] "a" (empty_zero_page), [spec] "d" (spec.val) - : "cc", "memory", "0"); - return size; + __uint128_t prev = old; + int rc = 0; + + skey_regions_initialize(); + asm_inline volatile( + "20: spka 0(%[key])\n" + "0: cdsg %[prev],%[new],%[address]\n" + "1: spka %[default_key]\n" + "21:\n" + EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev]) + SKEY_REGION(20b, 21b) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+QS" (*(__int128_t *)address) + : [new] "d" (new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *uval = prev; + return rc; } -EXPORT_SYMBOL(__clear_user); +EXPORT_SYMBOL(__cmpxchg_key16); diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c deleted file mode 100644 index ce7bcf7c0032..000000000000 --- a/arch/s390/lib/xor.c +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Optimized xor_block operation for RAID4/5 - * - * Copyright IBM Corp. 2016 - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - */ - -#include <linux/types.h> -#include <linux/export.h> -#include <linux/raid/xor.h> -#include <asm/xor.h> - -static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2) -{ - asm volatile( - " aghi %0,-1\n" - " jm 3f\n" - " srlg 0,%0,8\n" - " ltgr 0,0\n" - " jz 1f\n" - "0: xc 0(256,%1),0(%2)\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - " brctg 0,0b\n" - "1: exrl %0,2f\n" - " j 3f\n" - "2: xc 0(1,%1),0(%2)\n" - "3:\n" - : : "d" (bytes), "a" (p1), "a" (p2) - : "0", "cc", "memory"); -} - -static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - asm volatile( - " aghi %0,-1\n" - " jm 4f\n" - " srlg 0,%0,8\n" - " ltgr 0,0\n" - " jz 1f\n" - "0: xc 0(256,%1),0(%2)\n" - " xc 0(256,%1),0(%3)\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - " la %3,256(%3)\n" - " brctg 0,0b\n" - "1: exrl %0,2f\n" - " exrl %0,3f\n" - " j 4f\n" - "2: xc 0(1,%1),0(%2)\n" - "3: xc 0(1,%1),0(%3)\n" - "4:\n" - : "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3) - : : "0", "cc", "memory"); -} - -static void xor_xc_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - asm volatile( - " aghi %0,-1\n" - " jm 5f\n" - " srlg 0,%0,8\n" - " ltgr 0,0\n" - " jz 1f\n" - "0: xc 0(256,%1),0(%2)\n" - " xc 0(256,%1),0(%3)\n" - " xc 0(256,%1),0(%4)\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - " la %3,256(%3)\n" - " la %4,256(%4)\n" - " brctg 0,0b\n" - "1: exrl %0,2f\n" - " exrl %0,3f\n" - " exrl %0,4f\n" - " j 5f\n" - "2: xc 0(1,%1),0(%2)\n" - "3: xc 0(1,%1),0(%3)\n" - "4: xc 0(1,%1),0(%4)\n" - "5:\n" - : "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3), "+a" (p4) - : : "0", "cc", "memory"); -} - -static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) -{ - asm volatile( - " larl 1,2f\n" - " aghi %0,-1\n" - " jm 6f\n" - " srlg 0,%0,8\n" - " ltgr 0,0\n" - " jz 1f\n" - "0: xc 0(256,%1),0(%2)\n" - " xc 0(256,%1),0(%3)\n" - " xc 0(256,%1),0(%4)\n" - " xc 0(256,%1),0(%5)\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - " la %3,256(%3)\n" - " la %4,256(%4)\n" - " la %5,256(%5)\n" - " brctg 0,0b\n" - "1: exrl %0,2f\n" - " exrl %0,3f\n" - " exrl %0,4f\n" - " exrl %0,5f\n" - " j 6f\n" - "2: xc 0(1,%1),0(%2)\n" - "3: xc 0(1,%1),0(%3)\n" - "4: xc 0(1,%1),0(%4)\n" - "5: xc 0(1,%1),0(%5)\n" - "6:\n" - : "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3), "+a" (p4), - "+a" (p5) - : : "0", "cc", "memory"); -} - -struct xor_block_template xor_block_xc = { - .name = "xc", - .do_2 = xor_xc_2, - .do_3 = xor_xc_3, - .do_4 = xor_xc_4, - .do_5 = xor_xc_5, -}; -EXPORT_SYMBOL(xor_block_xc); diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index f6c2db7a8669..193899c39ca7 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -9,6 +9,7 @@ obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o -obj-$(CONFIG_PGSTE) += gmap.o +obj-$(CONFIG_PTDUMP) += dump_pagetables.o obj-$(CONFIG_PFAULT) += pfault.o + +obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 39f44b6256e0..eb7ef63fab1e 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -201,7 +201,7 @@ static void cmm_set_timer(void) { if (cmm_timed_pages_target <= 0 || cmm_timeout_seconds <= 0) { if (timer_pending(&cmm_timer)) - del_timer(&cmm_timer); + timer_delete(&cmm_timer); return; } mod_timer(&cmm_timer, jiffies + secs_to_jiffies(cmm_timeout_seconds)); @@ -321,8 +321,8 @@ static int cmm_timeout_handler(const struct ctl_table *ctl, int write, cmm_set_timeout(nr, seconds); *ppos += *lenp; } else { - len = sprintf(buf, "%ld %ld\n", - cmm_timeout_pages, cmm_timeout_seconds); + len = scnprintf(buf, sizeof(buf), "%ld %ld\n", + cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; memcpy(buffer, buf, len); @@ -424,7 +424,7 @@ out_smsg: #endif unregister_sysctl_table(cmm_sysctl_header); out_sysctl: - del_timer_sync(&cmm_timer); + timer_delete_sync(&cmm_timer); return rc; } module_init(cmm_init); @@ -437,7 +437,7 @@ static void __exit cmm_exit(void) #endif unregister_oom_notifier(&cmm_oom_nb); kthread_stop(cmm_thread_ptr); - del_timer_sync(&cmm_timer); + timer_delete_sync(&cmm_timer); cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); } diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index fa54f3bc0c8d..89badbe72ae7 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 + +#include <linux/cpufeature.h> #include <linux/set_memory.h> #include <linux/ptdump.h> #include <linux/seq_file.h> @@ -49,7 +51,7 @@ struct pg_state { struct seq_file *__m = (m); \ \ if (__m) \ - seq_printf(__m, fmt); \ + seq_puts(__m, fmt); \ }) static void print_prot(struct seq_file *m, unsigned int pr, int level) @@ -82,7 +84,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) * in which case we have two lpswe instructions in lowcore that need * to be executable. */ - if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) + if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !cpu_has_bear())) return; WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), "s390/mm: Found insecure W+X mapping at address %pS\n", @@ -145,11 +147,48 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); +} + bool ptdump_check_wx(void) { struct pg_state st = { .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {.start = 0, .end = max_addr}, {.start = 0, .end = 0}, @@ -167,7 +206,7 @@ bool ptdump_check_wx(void) }, }; - if (!MACHINE_HAS_NX) + if (!cpu_has_nx()) return true; ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); if (st.wx_pages) { @@ -176,7 +215,7 @@ bool ptdump_check_wx(void) return false; } else { pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", - (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? + (nospec_uses_trampoline() || !cpu_has_bear()) ? "unexpected " : ""); return true; @@ -188,7 +227,12 @@ static int ptdump_show(struct seq_file *m, void *v) { struct pg_state st = { .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {.start = 0, .end = max_addr}, {.start = 0, .end = 0}, @@ -203,11 +247,9 @@ static int ptdump_show(struct seq_file *m, void *v) .marker = markers, }; - get_online_mems(); mutex_lock(&cpa_mutex); ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); mutex_unlock(&cpa_mutex); - put_online_mems(); return 0; } DEFINE_SHOW_ATTRIBUTE(ptdump); @@ -249,16 +291,14 @@ static int ptdump_cmp(const void *a, const void *b) static int add_marker(unsigned long start, unsigned long end, const char *name) { - size_t oldsize, newsize; - - oldsize = markers_cnt * sizeof(*markers); - newsize = oldsize + 2 * sizeof(*markers); - if (!oldsize) - markers = kvmalloc(newsize, GFP_KERNEL); - else - markers = kvrealloc(markers, newsize, GFP_KERNEL); - if (!markers) - goto error; + struct addr_marker *new; + size_t newsize; + + newsize = (markers_cnt + 2) * sizeof(*markers); + new = kvrealloc(markers, newsize, GFP_KERNEL); + if (!new) + return -ENOMEM; + markers = new; markers[markers_cnt].is_start = 1; markers[markers_cnt].start_address = start; markers[markers_cnt].size = end - start; @@ -270,9 +310,6 @@ static int add_marker(unsigned long start, unsigned long end, const char *name) markers[markers_cnt].name = name; markers_cnt++; return 0; -error: - markers_cnt = 0; - return -ENOMEM; } static int pt_dump_init(void) diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c index a046be1715cf..7498e858c401 100644 --- a/arch/s390/mm/extable.c +++ b/arch/s390/mm/extable.c @@ -73,6 +73,49 @@ static bool ex_handler_fpc(const struct exception_table_entry *ex, struct pt_reg return true; } +struct insn_ssf { + u64 opc1 : 8; + u64 r3 : 4; + u64 opc2 : 4; + u64 b1 : 4; + u64 d1 : 12; + u64 b2 : 4; + u64 d2 : 12; +} __packed; + +static bool ex_handler_ua_mvcos(const struct exception_table_entry *ex, + bool from, struct pt_regs *regs) +{ + unsigned long uaddr, remainder; + struct insn_ssf *insn; + + /* + * If the faulting user space access crossed a page boundary retry by + * limiting the access to the first page (adjust length accordingly). + * Then the mvcos instruction will either complete with condition code + * zero, or generate another fault where the user space access did not + * cross a page boundary. + * If the faulting user space access did not cross a page boundary set + * length to zero and retry. In this case no user space access will + * happen, and the mvcos instruction will complete with condition code + * zero. + * In both cases the instruction will complete with condition code + * zero (copying finished), and the register which contains the + * length, indicates the number of bytes copied. + */ + regs->psw.addr = extable_fixup(ex); + insn = (struct insn_ssf *)regs->psw.addr; + if (from) + uaddr = regs->gprs[insn->b2] + insn->d2; + else + uaddr = regs->gprs[insn->b1] + insn->d1; + remainder = PAGE_SIZE - (uaddr & (PAGE_SIZE - 1)); + if (regs->gprs[insn->r3] <= remainder) + remainder = 0; + regs->gprs[insn->r3] = remainder; + return true; +} + bool fixup_exception(struct pt_regs *regs) { const struct exception_table_entry *ex; @@ -95,6 +138,10 @@ bool fixup_exception(struct pt_regs *regs) return ex_handler_zeropad(ex, regs); case EX_TYPE_FPC: return ex_handler_fpc(ex, regs); + case EX_TYPE_UA_MVCOS_TO: + return ex_handler_ua_mvcos(ex, false, regs); + case EX_TYPE_UA_MVCOS_FROM: + return ex_handler_ua_mvcos(ex, true, regs); } panic("invalid exception table entry"); } diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 4692136c0af1..1c3a1eed4381 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -7,8 +7,7 @@ * Copyright IBM Corp. 2002, 2004 */ -#define KMSG_COMPONENT "extmem" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "extmem: " fmt #include <linux/kernel.h> #include <linux/string.h> @@ -21,6 +20,7 @@ #include <linux/ioport.h> #include <linux/refcount.h> #include <linux/pgtable.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/page.h> #include <asm/ebcdic.h> @@ -171,8 +171,8 @@ query_segment_type (struct dcss_segment *seg) struct qout64 *qout; struct qin64 *qin; - qin = kmalloc(sizeof(*qin), GFP_KERNEL | GFP_DMA); - qout = kmalloc(sizeof(*qout), GFP_KERNEL | GFP_DMA); + qin = kmalloc_obj(*qin, GFP_KERNEL | GFP_DMA); + qout = kmalloc_obj(*qout, GFP_KERNEL | GFP_DMA); if ((qin == NULL) || (qout == NULL)) { rc = -ENOMEM; goto out_free; @@ -255,7 +255,7 @@ segment_type (char* name) int rc; struct dcss_segment seg; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -ENOSYS; dcss_mkname(name, seg.dcss_name); @@ -302,7 +302,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long start_addr = end_addr = 0; segtype = -1; - seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA); + seg = kmalloc_obj(*seg, GFP_KERNEL | GFP_DMA); if (seg == NULL) { rc = -ENOMEM; goto out; @@ -317,7 +317,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long goto out_free; } - seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL); + seg->res = kzalloc_obj(struct resource); if (seg->res == NULL) { rc = -ENOMEM; goto out_free; @@ -418,7 +418,7 @@ segment_load (char *name, int do_nonshared, unsigned long *addr, struct dcss_segment *seg; int rc; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -ENOSYS; mutex_lock(&dcss_lock); @@ -529,6 +529,14 @@ segment_modify_shared (char *name, int do_nonshared) return rc; } +static void __dcss_diag_purge_on_cpu_0(void *data) +{ + struct dcss_segment *seg = (struct dcss_segment *)data; + unsigned long dummy; + + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); +} + /* * Decrease the use count of a DCSS segment and remove * it from the address space if nobody is using it @@ -537,10 +545,9 @@ segment_modify_shared (char *name, int do_nonshared) void segment_unload(char *name) { - unsigned long dummy; struct dcss_segment *seg; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; mutex_lock(&dcss_lock); @@ -555,7 +562,14 @@ segment_unload(char *name) kfree(seg->res); vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); list_del(&seg->list); - dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + /* + * Workaround for z/VM issue, where calling the DCSS unload diag on + * a non-IPL CPU would cause bogus sclp maximum memory detection on + * next IPL. + * IPL CPU 0 cannot be set offline, so the dcss_diag() call can + * directly be scheduled to that CPU. + */ + smp_call_function_single(0, __dcss_diag_purge_on_cpu_0, seg, 1); kfree(seg); out_unlock: mutex_unlock(&dcss_lock); @@ -572,7 +586,7 @@ segment_save(char *name) char cmd2[80]; int i, response; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; mutex_lock(&dcss_lock); @@ -583,14 +597,16 @@ segment_save(char *name) goto out; } - sprintf(cmd1, "DEFSEG %s", name); + snprintf(cmd1, sizeof(cmd1), "DEFSEG %s", name); for (i=0; i<seg->segcnt; i++) { - sprintf(cmd1+strlen(cmd1), " %lX-%lX %s", - seg->range[i].start >> PAGE_SHIFT, - seg->range[i].end >> PAGE_SHIFT, - segtype_string[seg->range[i].start & 0xff]); + size_t len = strlen(cmd1); + + snprintf(cmd1 + len, sizeof(cmd1) - len, " %lX-%lX %s", + seg->range[i].start >> PAGE_SHIFT, + seg->range[i].end >> PAGE_SHIFT, + segtype_string[seg->range[i].start & 0xff]); } - sprintf(cmd2, "SAVESEG %s", name); + snprintf(cmd2, sizeof(cmd2), "SAVESEG %s", name); response = 0; cpcmd(cmd1, NULL, 0, &response); if (response) { diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 9b681f74dccc..191cc53caead 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -11,11 +11,11 @@ #include <linux/kernel_stat.h> #include <linux/mmu_context.h> +#include <linux/cpufeature.h> #include <linux/perf_event.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/debug.h> -#include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> @@ -23,7 +23,6 @@ #include <linux/ptrace.h> #include <linux/mman.h> #include <linux/mm.h> -#include <linux/compat.h> #include <linux/smp.h> #include <linux/kdebug.h> #include <linux/init.h> @@ -40,22 +39,11 @@ #include <asm/ptrace.h> #include <asm/fault.h> #include <asm/diag.h> -#include <asm/gmap.h> #include <asm/irq.h> #include <asm/facility.h> #include <asm/uv.h> #include "../kernel/entry.h" -static DEFINE_STATIC_KEY_FALSE(have_store_indication); - -static int __init fault_init(void) -{ - if (test_facility(75)) - static_branch_enable(&have_store_indication); - return 0; -} -early_initcall(fault_init); - /* * Find out which address space caused the exception. */ @@ -81,7 +69,7 @@ static __always_inline bool fault_is_write(struct pt_regs *regs) { union teid teid = { .val = regs->int_parm_long }; - if (static_branch_likely(&have_store_indication)) + if (test_facility(75)) return teid.fsi == TEID_FSI_STORE; return false; } @@ -144,8 +132,17 @@ static void dump_fault_info(struct pt_regs *regs) union teid teid = { .val = regs->int_parm_long }; unsigned long asce; - pr_alert("Failing address: %016lx TEID: %016lx\n", + pr_alert("Failing address: %016lx TEID: %016lx", get_fault_address(regs), teid.val); + if (test_facility(131)) + pr_cont(" ESOP-2"); + else if (machine_has_esop()) + pr_cont(" ESOP-1"); + else + pr_cont(" SOP"); + if (test_facility(75)) + pr_cont(" FSI"); + pr_cont("\n"); pr_alert("Fault in "); switch (teid.as) { case PSW_BITS_AS_HOME: @@ -175,6 +172,23 @@ static void dump_fault_info(struct pt_regs *regs) int show_unhandled_signals = 1; +static const struct ctl_table s390_fault_sysctl_table[] = { + { + .procname = "userprocess_debug", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_s390_fault_sysctls(void) +{ + register_sysctl_init("kernel", s390_fault_sysctl_table); + return 0; +} +arch_initcall(init_s390_fault_sysctls); + void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -359,25 +373,23 @@ void do_protection_exception(struct pt_regs *regs) * The exception to this rule are aborted transactions, for these * the PSW already points to the correct location. */ - if (!(regs->int_code & 0x200)) + if (!(regs->int_code & 0x200)) { regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); + set_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED); + } /* - * Check for low-address protection. This needs to be treated - * as a special case because the translation exception code - * field is not guaranteed to contain valid data in this case. + * If bit 61 if the TEID is not set, the remainder of the + * TEID is unpredictable. Special handling is required. */ if (unlikely(!teid.b61)) { if (user_mode(regs)) { - /* Low-address protection in user mode: cannot happen */ - die(regs, "Low-address protection"); + dump_fault_info(regs); + die(regs, "Unexpected TEID"); } - /* - * Low-address protection in kernel mode means - * NULL pointer write access in kernel mode. - */ + /* Assume low-address protection in kernel mode. */ return handle_fault_error_nolock(regs, 0); } - if (unlikely(MACHINE_HAS_NX && teid.b56)) { + if (unlikely(cpu_has_nx() && teid.b56)) { regs->int_parm_long = (teid.addr * PAGE_SIZE) | (regs->psw.addr & PAGE_MASK); return handle_fault_error_nolock(regs, SEGV_ACCERR); } @@ -391,7 +403,7 @@ void do_dat_exception(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_dat_exception); -#if IS_ENABLED(CONFIG_PGSTE) +#if IS_ENABLED(CONFIG_KVM) void do_secure_storage_access(struct pt_regs *regs) { @@ -429,11 +441,20 @@ void do_secure_storage_access(struct pt_regs *regs) folio = phys_to_folio(addr); if (unlikely(!folio_try_get(folio))) return; - rc = arch_make_folio_accessible(folio); + rc = uv_convert_from_secure(folio_to_phys(folio)); + if (!rc) + clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); + /* + * There are some valid fixup types for kernel + * accesses to donated secure memory. zeropad is one + * of them. + */ if (rc) - BUG(); + return handle_fault_error_nolock(regs, 0); } else { + if (faulthandler_disabled()) + return handle_fault_error_nolock(regs, 0); mm = current->mm; mmap_read_lock(mm); vma = find_vma(mm, addr); @@ -456,4 +477,4 @@ void do_secure_storage_access(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_secure_storage_access); -#endif /* CONFIG_PGSTE */ +#endif /* CONFIG_KVM */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c deleted file mode 100644 index 94d927785800..000000000000 --- a/arch/s390/mm/gmap.c +++ /dev/null @@ -1,2656 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * KVM guest address space mapping code - * - * Copyright IBM Corp. 2007, 2020 - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - * David Hildenbrand <david@redhat.com> - * Janosch Frank <frankja@linux.vnet.ibm.com> - */ - -#include <linux/kernel.h> -#include <linux/pagewalk.h> -#include <linux/swap.h> -#include <linux/smp.h> -#include <linux/spinlock.h> -#include <linux/slab.h> -#include <linux/swapops.h> -#include <linux/ksm.h> -#include <linux/mman.h> -#include <linux/pgtable.h> -#include <asm/page-states.h> -#include <asm/pgalloc.h> -#include <asm/gmap.h> -#include <asm/page.h> -#include <asm/tlb.h> - -/* - * The address is saved in a radix tree directly; NULL would be ambiguous, - * since 0 is a valid address, and NULL is returned when nothing was found. - * The lower bits are ignored by all users of the macro, so it can be used - * to distinguish a valid address 0 from a NULL. - */ -#define VALID_GADDR_FLAG 1 -#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) -#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) - -#define GMAP_SHADOW_FAKE_TABLE 1ULL - -static struct page *gmap_alloc_crst(void) -{ - struct page *page; - - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); - if (!page) - return NULL; - __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); - return page; -} - -/** - * gmap_alloc - allocate and initialize a guest address space - * @limit: maximum address of the gmap address space - * - * Returns a guest address space structure. - */ -struct gmap *gmap_alloc(unsigned long limit) -{ - struct gmap *gmap; - struct page *page; - unsigned long *table; - unsigned long etype, atype; - - if (limit < _REGION3_SIZE) { - limit = _REGION3_SIZE - 1; - atype = _ASCE_TYPE_SEGMENT; - etype = _SEGMENT_ENTRY_EMPTY; - } else if (limit < _REGION2_SIZE) { - limit = _REGION2_SIZE - 1; - atype = _ASCE_TYPE_REGION3; - etype = _REGION3_ENTRY_EMPTY; - } else if (limit < _REGION1_SIZE) { - limit = _REGION1_SIZE - 1; - atype = _ASCE_TYPE_REGION2; - etype = _REGION2_ENTRY_EMPTY; - } else { - limit = -1UL; - atype = _ASCE_TYPE_REGION1; - etype = _REGION1_ENTRY_EMPTY; - } - gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); - if (!gmap) - goto out; - INIT_LIST_HEAD(&gmap->children); - INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); - INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); - INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); - spin_lock_init(&gmap->guest_table_lock); - spin_lock_init(&gmap->shadow_lock); - refcount_set(&gmap->ref_count, 1); - page = gmap_alloc_crst(); - if (!page) - goto out_free; - table = page_to_virt(page); - crst_table_init(table, etype); - gmap->table = table; - gmap->asce = atype | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | __pa(table); - gmap->asce_end = limit; - return gmap; - -out_free: - kfree(gmap); -out: - return NULL; -} -EXPORT_SYMBOL_GPL(gmap_alloc); - -/** - * gmap_create - create a guest address space - * @mm: pointer to the parent mm_struct - * @limit: maximum size of the gmap address space - * - * Returns a guest address space structure. - */ -struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) -{ - struct gmap *gmap; - unsigned long gmap_asce; - - gmap = gmap_alloc(limit); - if (!gmap) - return NULL; - gmap->mm = mm; - spin_lock(&mm->context.lock); - list_add_rcu(&gmap->list, &mm->context.gmap_list); - if (list_is_singular(&mm->context.gmap_list)) - gmap_asce = gmap->asce; - else - gmap_asce = -1UL; - WRITE_ONCE(mm->context.gmap_asce, gmap_asce); - spin_unlock(&mm->context.lock); - return gmap; -} -EXPORT_SYMBOL_GPL(gmap_create); - -static void gmap_flush_tlb(struct gmap *gmap) -{ - if (MACHINE_HAS_IDTE) - __tlb_flush_idte(gmap->asce); - else - __tlb_flush_global(); -} - -static void gmap_radix_tree_free(struct radix_tree_root *root) -{ - struct radix_tree_iter iter; - unsigned long indices[16]; - unsigned long index; - void __rcu **slot; - int i, nr; - - /* A radix tree is freed by deleting all of its entries */ - index = 0; - do { - nr = 0; - radix_tree_for_each_slot(slot, root, &iter, index) { - indices[nr] = iter.index; - if (++nr == 16) - break; - } - for (i = 0; i < nr; i++) { - index = indices[i]; - radix_tree_delete(root, index); - } - } while (nr > 0); -} - -static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) -{ - struct gmap_rmap *rmap, *rnext, *head; - struct radix_tree_iter iter; - unsigned long indices[16]; - unsigned long index; - void __rcu **slot; - int i, nr; - - /* A radix tree is freed by deleting all of its entries */ - index = 0; - do { - nr = 0; - radix_tree_for_each_slot(slot, root, &iter, index) { - indices[nr] = iter.index; - if (++nr == 16) - break; - } - for (i = 0; i < nr; i++) { - index = indices[i]; - head = radix_tree_delete(root, index); - gmap_for_each_rmap_safe(rmap, rnext, head) - kfree(rmap); - } - } while (nr > 0); -} - -static void gmap_free_crst(unsigned long *table, bool free_ptes) -{ - bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; - int i; - - if (is_segment) { - if (!free_ptes) - goto out; - for (i = 0; i < _CRST_ENTRIES; i++) - if (!(table[i] & _SEGMENT_ENTRY_INVALID)) - page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); - } else { - for (i = 0; i < _CRST_ENTRIES; i++) - if (!(table[i] & _REGION_ENTRY_INVALID)) - gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); - } - -out: - free_pages((unsigned long)table, CRST_ALLOC_ORDER); -} - -/** - * gmap_free - free a guest address space - * @gmap: pointer to the guest address space structure - * - * No locks required. There are no references to this gmap anymore. - */ -void gmap_free(struct gmap *gmap) -{ - /* Flush tlb of all gmaps (if not already done for shadows) */ - if (!(gmap_is_shadow(gmap) && gmap->removed)) - gmap_flush_tlb(gmap); - /* Free all segment & region tables. */ - gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); - - gmap_radix_tree_free(&gmap->guest_to_host); - gmap_radix_tree_free(&gmap->host_to_guest); - - /* Free additional data for a shadow gmap */ - if (gmap_is_shadow(gmap)) { - gmap_rmap_radix_tree_free(&gmap->host_to_rmap); - /* Release reference to the parent */ - gmap_put(gmap->parent); - } - - kfree(gmap); -} -EXPORT_SYMBOL_GPL(gmap_free); - -/** - * gmap_get - increase reference counter for guest address space - * @gmap: pointer to the guest address space structure - * - * Returns the gmap pointer - */ -struct gmap *gmap_get(struct gmap *gmap) -{ - refcount_inc(&gmap->ref_count); - return gmap; -} -EXPORT_SYMBOL_GPL(gmap_get); - -/** - * gmap_put - decrease reference counter for guest address space - * @gmap: pointer to the guest address space structure - * - * If the reference counter reaches zero the guest address space is freed. - */ -void gmap_put(struct gmap *gmap) -{ - if (refcount_dec_and_test(&gmap->ref_count)) - gmap_free(gmap); -} -EXPORT_SYMBOL_GPL(gmap_put); - -/** - * gmap_remove - remove a guest address space but do not free it yet - * @gmap: pointer to the guest address space structure - */ -void gmap_remove(struct gmap *gmap) -{ - struct gmap *sg, *next; - unsigned long gmap_asce; - - /* Remove all shadow gmaps linked to this gmap */ - if (!list_empty(&gmap->children)) { - spin_lock(&gmap->shadow_lock); - list_for_each_entry_safe(sg, next, &gmap->children, list) { - list_del(&sg->list); - gmap_put(sg); - } - spin_unlock(&gmap->shadow_lock); - } - /* Remove gmap from the pre-mm list */ - spin_lock(&gmap->mm->context.lock); - list_del_rcu(&gmap->list); - if (list_empty(&gmap->mm->context.gmap_list)) - gmap_asce = 0; - else if (list_is_singular(&gmap->mm->context.gmap_list)) - gmap_asce = list_first_entry(&gmap->mm->context.gmap_list, - struct gmap, list)->asce; - else - gmap_asce = -1UL; - WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); - spin_unlock(&gmap->mm->context.lock); - synchronize_rcu(); - /* Put reference */ - gmap_put(gmap); -} -EXPORT_SYMBOL_GPL(gmap_remove); - -/* - * gmap_alloc_table is assumed to be called with mmap_lock held - */ -static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, - unsigned long init, unsigned long gaddr) -{ - struct page *page; - unsigned long *new; - - /* since we dont free the gmap table until gmap_free we can unlock */ - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - new = page_to_virt(page); - crst_table_init(new, init); - spin_lock(&gmap->guest_table_lock); - if (*table & _REGION_ENTRY_INVALID) { - *table = __pa(new) | _REGION_ENTRY_LENGTH | - (*table & _REGION_ENTRY_TYPE_MASK); - page = NULL; - } - spin_unlock(&gmap->guest_table_lock); - if (page) - __free_pages(page, CRST_ALLOC_ORDER); - return 0; -} - -static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) -{ - return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); -} - -static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) -{ - return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); -} - -static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, - unsigned long *gaddr) -{ - *gaddr = host_to_guest_delete(gmap, vmaddr); - if (IS_GADDR_VALID(*gaddr)) - return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); - return NULL; -} - -/** - * __gmap_unlink_by_vmaddr - unlink a single segment via a host address - * @gmap: pointer to the guest address space structure - * @vmaddr: address in the host process address space - * - * Returns 1 if a TLB flush is required - */ -static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) -{ - unsigned long gaddr; - int flush = 0; - pmd_t *pmdp; - - BUG_ON(gmap_is_shadow(gmap)); - spin_lock(&gmap->guest_table_lock); - - pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); - if (pmdp) { - flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); - *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); - } - - spin_unlock(&gmap->guest_table_lock); - return flush; -} - -/** - * __gmap_unmap_by_gaddr - unmap a single segment via a guest address - * @gmap: pointer to the guest address space structure - * @gaddr: address in the guest address space - * - * Returns 1 if a TLB flush is required - */ -static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr; - - vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; -} - -/** - * gmap_unmap_segment - unmap segment from the guest address space - * @gmap: pointer to the guest address space structure - * @to: address in the guest address space - * @len: length of the memory area to unmap - * - * Returns 0 if the unmap succeeded, -EINVAL if not. - */ -int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) -{ - unsigned long off; - int flush; - - BUG_ON(gmap_is_shadow(gmap)); - if ((to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || to + len < to) - return -EINVAL; - - flush = 0; - mmap_write_lock(gmap->mm); - for (off = 0; off < len; off += PMD_SIZE) - flush |= __gmap_unmap_by_gaddr(gmap, to + off); - mmap_write_unlock(gmap->mm); - if (flush) - gmap_flush_tlb(gmap); - return 0; -} -EXPORT_SYMBOL_GPL(gmap_unmap_segment); - -/** - * gmap_map_segment - map a segment to the guest address space - * @gmap: pointer to the guest address space structure - * @from: source address in the parent address space - * @to: target address in the guest address space - * @len: length of the memory area to map - * - * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. - */ -int gmap_map_segment(struct gmap *gmap, unsigned long from, - unsigned long to, unsigned long len) -{ - unsigned long off; - int flush; - - BUG_ON(gmap_is_shadow(gmap)); - if ((from | to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || from + len < from || to + len < to || - from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end) - return -EINVAL; - - flush = 0; - mmap_write_lock(gmap->mm); - for (off = 0; off < len; off += PMD_SIZE) { - /* Remove old translation */ - flush |= __gmap_unmap_by_gaddr(gmap, to + off); - /* Store new translation */ - if (radix_tree_insert(&gmap->guest_to_host, - (to + off) >> PMD_SHIFT, - (void *) from + off)) - break; - } - mmap_write_unlock(gmap->mm); - if (flush) - gmap_flush_tlb(gmap); - if (off >= len) - return 0; - gmap_unmap_segment(gmap, to, len); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(gmap_map_segment); - -/** - * __gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - * The mmap_lock of the mm that belongs to the address space must be held - * when this function gets called. - * - * Note: Can also be called for shadow gmaps. - */ -unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr; - - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); - /* Note: guest_to_host is empty for a shadow gmap */ - return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; -} -EXPORT_SYMBOL_GPL(__gmap_translate); - -/** - * gmap_unlink - disconnect a page table from the gmap shadow tables - * @mm: pointer to the parent mm_struct - * @table: pointer to the host page table - * @vmaddr: vm address associated with the host page table - */ -void gmap_unlink(struct mm_struct *mm, unsigned long *table, - unsigned long vmaddr) -{ - struct gmap *gmap; - int flush; - - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); - if (flush) - gmap_flush_tlb(gmap); - } - rcu_read_unlock(); -} - -static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, - unsigned long gaddr); - -/** - * __gmap_link - set up shadow page tables to connect a host to a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @vmaddr: vm address - * - * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT - * if the vm address is already mapped to a different guest segment. - * The mmap_lock of the mm that belongs to the address space must be held - * when this function gets called. - */ -int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) -{ - struct mm_struct *mm; - unsigned long *table; - spinlock_t *ptl; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - u64 unprot; - int rc; - - BUG_ON(gmap_is_shadow(gmap)); - /* Create higher level tables in the gmap page table */ - table = gmap->table; - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { - table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, - gaddr & _REGION1_MASK)) - return -ENOMEM; - table = __va(*table & _REGION_ENTRY_ORIGIN); - } - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { - table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, - gaddr & _REGION2_MASK)) - return -ENOMEM; - table = __va(*table & _REGION_ENTRY_ORIGIN); - } - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { - table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, - gaddr & _REGION3_MASK)) - return -ENOMEM; - table = __va(*table & _REGION_ENTRY_ORIGIN); - } - table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; - /* Walk the parent mm page table */ - mm = gmap->mm; - pgd = pgd_offset(mm, vmaddr); - VM_BUG_ON(pgd_none(*pgd)); - p4d = p4d_offset(pgd, vmaddr); - VM_BUG_ON(p4d_none(*p4d)); - pud = pud_offset(p4d, vmaddr); - VM_BUG_ON(pud_none(*pud)); - /* large puds cannot yet be handled */ - if (pud_leaf(*pud)) - return -EFAULT; - pmd = pmd_offset(pud, vmaddr); - VM_BUG_ON(pmd_none(*pmd)); - /* Are we allowed to use huge pages? */ - if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) - return -EFAULT; - /* Link gmap segment table entry location to page table. */ - rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); - if (rc) - return rc; - ptl = pmd_lock(mm, pmd); - spin_lock(&gmap->guest_table_lock); - if (*table == _SEGMENT_ENTRY_EMPTY) { - rc = radix_tree_insert(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT, - (void *)MAKE_VALID_GADDR(gaddr)); - if (!rc) { - if (pmd_leaf(*pmd)) { - *table = (pmd_val(*pmd) & - _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) - | _SEGMENT_ENTRY_GMAP_UC - | _SEGMENT_ENTRY; - } else - *table = pmd_val(*pmd) & - _SEGMENT_ENTRY_HARDWARE_BITS; - } - } else if (*table & _SEGMENT_ENTRY_PROTECT && - !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { - unprot = (u64)*table; - unprot &= ~_SEGMENT_ENTRY_PROTECT; - unprot |= _SEGMENT_ENTRY_GMAP_UC; - gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); - } - spin_unlock(&gmap->guest_table_lock); - spin_unlock(ptl); - radix_tree_preload_end(); - return rc; -} -EXPORT_SYMBOL(__gmap_link); - -/* - * this function is assumed to be called with mmap_lock held - */ -void __gmap_zap(struct gmap *gmap, unsigned long gaddr) -{ - struct vm_area_struct *vma; - unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; - - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (vmaddr) { - vmaddr |= gaddr & ~PMD_MASK; - - vma = vma_lookup(gmap->mm, vmaddr); - if (!vma || is_vm_hugetlb_page(vma)) - return; - - /* Get pointer to the page table entry */ - ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (likely(ptep)) { - ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); - pte_unmap_unlock(ptep, ptl); - } - } -} -EXPORT_SYMBOL_GPL(__gmap_zap); - -void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) -{ - unsigned long gaddr, vmaddr, size; - struct vm_area_struct *vma; - - mmap_read_lock(gmap->mm); - for (gaddr = from; gaddr < to; - gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (!vmaddr) - continue; - vmaddr |= gaddr & ~PMD_MASK; - /* Find vma in the parent mm */ - vma = find_vma(gmap->mm, vmaddr); - if (!vma) - continue; - /* - * We do not discard pages that are backed by - * hugetlbfs, so we don't have to refault them. - */ - if (is_vm_hugetlb_page(vma)) - continue; - size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range_single(vma, vmaddr, size, NULL); - } - mmap_read_unlock(gmap->mm); -} -EXPORT_SYMBOL_GPL(gmap_discard); - -static LIST_HEAD(gmap_notifier_list); -static DEFINE_SPINLOCK(gmap_notifier_lock); - -/** - * gmap_register_pte_notifier - register a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_register_pte_notifier(struct gmap_notifier *nb) -{ - spin_lock(&gmap_notifier_lock); - list_add_rcu(&nb->list, &gmap_notifier_list); - spin_unlock(&gmap_notifier_lock); -} -EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); - -/** - * gmap_unregister_pte_notifier - remove a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_unregister_pte_notifier(struct gmap_notifier *nb) -{ - spin_lock(&gmap_notifier_lock); - list_del_rcu(&nb->list); - spin_unlock(&gmap_notifier_lock); - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); - -/** - * gmap_call_notifier - call all registered invalidation callbacks - * @gmap: pointer to guest mapping meta data structure - * @start: start virtual address in the guest address space - * @end: end virtual address in the guest address space - */ -static void gmap_call_notifier(struct gmap *gmap, unsigned long start, - unsigned long end) -{ - struct gmap_notifier *nb; - - list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(gmap, start, end); -} - -/** - * gmap_table_walk - walk the gmap page tables - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @level: page table level to stop at - * - * Returns a table entry pointer for the given guest address and @level - * @level=0 : returns a pointer to a page table table entry (or NULL) - * @level=1 : returns a pointer to a segment table entry (or NULL) - * @level=2 : returns a pointer to a region-3 table entry (or NULL) - * @level=3 : returns a pointer to a region-2 table entry (or NULL) - * @level=4 : returns a pointer to a region-1 table entry (or NULL) - * - * Returns NULL if the gmap page tables could not be walked to the - * requested level. - * - * Note: Can also be called for shadow gmaps. - */ -unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) -{ - const int asce_type = gmap->asce & _ASCE_TYPE_MASK; - unsigned long *table = gmap->table; - - if (gmap_is_shadow(gmap) && gmap->removed) - return NULL; - - if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) - return NULL; - - if (asce_type != _ASCE_TYPE_REGION1 && - gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) - return NULL; - - switch (asce_type) { - case _ASCE_TYPE_REGION1: - table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; - if (level == 4) - break; - if (*table & _REGION_ENTRY_INVALID) - return NULL; - table = __va(*table & _REGION_ENTRY_ORIGIN); - fallthrough; - case _ASCE_TYPE_REGION2: - table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; - if (level == 3) - break; - if (*table & _REGION_ENTRY_INVALID) - return NULL; - table = __va(*table & _REGION_ENTRY_ORIGIN); - fallthrough; - case _ASCE_TYPE_REGION3: - table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; - if (level == 2) - break; - if (*table & _REGION_ENTRY_INVALID) - return NULL; - table = __va(*table & _REGION_ENTRY_ORIGIN); - fallthrough; - case _ASCE_TYPE_SEGMENT: - table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; - if (level == 1) - break; - if (*table & _REGION_ENTRY_INVALID) - return NULL; - table = __va(*table & _SEGMENT_ENTRY_ORIGIN); - table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT; - } - return table; -} -EXPORT_SYMBOL(gmap_table_walk); - -/** - * gmap_pte_op_walk - walk the gmap page table, get the page table lock - * and return the pte pointer - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @ptl: pointer to the spinlock pointer - * - * Returns a pointer to the locked pte for a guest address, or NULL - */ -static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, - spinlock_t **ptl) -{ - unsigned long *table; - - BUG_ON(gmap_is_shadow(gmap)); - /* Walk the gmap page table, lock and get pte pointer */ - table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ - if (!table || *table & _SEGMENT_ENTRY_INVALID) - return NULL; - return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); -} - -/** - * gmap_pte_op_fixup - force a page in and connect the gmap page table - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @vmaddr: address in the host process address space - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * - * Returns 0 if the caller can retry __gmap_translate (might fail again), - * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing - * up or connecting the gmap page table. - */ -static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, - unsigned long vmaddr, int prot) -{ - struct mm_struct *mm = gmap->mm; - unsigned int fault_flags; - bool unlocked = false; - - BUG_ON(gmap_is_shadow(gmap)); - fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; - if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) - return -EFAULT; - if (unlocked) - /* lost mmap_lock, caller has to retry __gmap_translate */ - return 0; - /* Connect the page tables */ - return __gmap_link(gmap, gaddr, vmaddr); -} - -/** - * gmap_pte_op_end - release the page table lock - * @ptep: pointer to the locked pte - * @ptl: pointer to the page table spinlock - */ -static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) -{ - pte_unmap_unlock(ptep, ptl); -} - -/** - * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock - * and return the pmd pointer - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * - * Returns a pointer to the pmd for a guest address, or NULL - */ -static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) -{ - pmd_t *pmdp; - - BUG_ON(gmap_is_shadow(gmap)); - pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); - if (!pmdp) - return NULL; - - /* without huge pages, there is no need to take the table lock */ - if (!gmap->mm->context.allow_gmap_hpage_1m) - return pmd_none(*pmdp) ? NULL : pmdp; - - spin_lock(&gmap->guest_table_lock); - if (pmd_none(*pmdp)) { - spin_unlock(&gmap->guest_table_lock); - return NULL; - } - - /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ - if (!pmd_leaf(*pmdp)) - spin_unlock(&gmap->guest_table_lock); - return pmdp; -} - -/** - * gmap_pmd_op_end - release the guest_table_lock if needed - * @gmap: pointer to the guest mapping meta data structure - * @pmdp: pointer to the pmd - */ -static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) -{ - if (pmd_leaf(*pmdp)) - spin_unlock(&gmap->guest_table_lock); -} - -/* - * gmap_protect_pmd - remove access rights to memory and set pmd notification bits - * @pmdp: pointer to the pmd to be protected - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: notification bits to set - * - * Returns: - * 0 if successfully protected - * -EAGAIN if a fixup is needed - * -EINVAL if unsupported notifier bits have been specified - * - * Expected to be called with sg->mm->mmap_lock in read and - * guest_table_lock held. - */ -static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, - pmd_t *pmdp, int prot, unsigned long bits) -{ - int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; - int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; - pmd_t new = *pmdp; - - /* Fixup needed */ - if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) - return -EAGAIN; - - if (prot == PROT_NONE && !pmd_i) { - new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); - gmap_pmdp_xchg(gmap, pmdp, new, gaddr); - } - - if (prot == PROT_READ && !pmd_p) { - new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); - new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); - gmap_pmdp_xchg(gmap, pmdp, new, gaddr); - } - - if (bits & GMAP_NOTIFY_MPROT) - set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); - - /* Shadow GMAP protection needs split PMDs */ - if (bits & GMAP_NOTIFY_SHADOW) - return -EINVAL; - - return 0; -} - -/* - * gmap_protect_pte - remove access rights to memory and set pgste bits - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @pmdp: pointer to the pmd associated with the pte - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: notification bits to set - * - * Returns 0 if successfully protected, -ENOMEM if out of memory and - * -EAGAIN if a fixup is needed. - * - * Expected to be called with sg->mm->mmap_lock in read - */ -static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, - pmd_t *pmdp, int prot, unsigned long bits) -{ - int rc; - pte_t *ptep; - spinlock_t *ptl; - unsigned long pbits = 0; - - if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) - return -EAGAIN; - - ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); - if (!ptep) - return -ENOMEM; - - pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; - pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; - /* Protect and unlock. */ - rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); - gmap_pte_op_end(ptep, ptl); - return rc; -} - -/* - * gmap_protect_range - remove access rights to memory and set pgste bits - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @len: size of area - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: pgste notification bits to set - * - * Returns: - * PAGE_SIZE if a small page was successfully protected; - * HPAGE_SIZE if a large page was successfully protected; - * -ENOMEM if out of memory; - * -EFAULT if gaddr is invalid (or mapping for shadows is missing); - * -EAGAIN if the guest mapping is missing and should be fixed by the caller. - * - * Context: Called with sg->mm->mmap_lock in read. - */ -int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) -{ - pmd_t *pmdp; - int rc = 0; - - BUG_ON(gmap_is_shadow(gmap)); - - pmdp = gmap_pmd_op_walk(gmap, gaddr); - if (!pmdp) - return -EAGAIN; - - if (!pmd_leaf(*pmdp)) { - rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); - if (!rc) - rc = PAGE_SIZE; - } else { - rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); - if (!rc) - rc = HPAGE_SIZE; - } - gmap_pmd_op_end(gmap, pmdp); - - return rc; -} -EXPORT_SYMBOL_GPL(gmap_protect_one); - -/** - * gmap_read_table - get an unsigned long value from a guest page table using - * absolute addressing, without marking the page referenced. - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @val: pointer to the unsigned long value to return - * - * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT - * if reading using the virtual address failed. -EINVAL if called on a gmap - * shadow. - * - * Called with gmap->mm->mmap_lock in read. - */ -int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) -{ - unsigned long address, vmaddr; - spinlock_t *ptl; - pte_t *ptep, pte; - int rc; - - if (gmap_is_shadow(gmap)) - return -EINVAL; - - while (1) { - rc = -EAGAIN; - ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); - if (ptep) { - pte = *ptep; - if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { - address = pte_val(pte) & PAGE_MASK; - address += gaddr & ~PAGE_MASK; - *val = *(unsigned long *)__va(address); - set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); - /* Do *NOT* clear the _PAGE_INVALID bit! */ - rc = 0; - } - gmap_pte_op_end(ptep, ptl); - } - if (!rc) - break; - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - break; - } - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); - if (rc) - break; - } - return rc; -} -EXPORT_SYMBOL_GPL(gmap_read_table); - -/** - * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree - * @sg: pointer to the shadow guest address space structure - * @vmaddr: vm address associated with the rmap - * @rmap: pointer to the rmap structure - * - * Called with the sg->guest_table_lock - */ -static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, - struct gmap_rmap *rmap) -{ - struct gmap_rmap *temp; - void __rcu **slot; - - BUG_ON(!gmap_is_shadow(sg)); - slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); - if (slot) { - rmap->next = radix_tree_deref_slot_protected(slot, - &sg->guest_table_lock); - for (temp = rmap->next; temp; temp = temp->next) { - if (temp->raddr == rmap->raddr) { - kfree(rmap); - return; - } - } - radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); - } else { - rmap->next = NULL; - radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, - rmap); - } -} - -/** - * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow gmap - * @paddr: address in the parent guest address space - * @len: length of the memory area to protect - * - * Returns 0 if successfully protected and the rmap was created, -ENOMEM - * if out of memory and -EFAULT if paddr is invalid. - */ -static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, - unsigned long paddr, unsigned long len) -{ - struct gmap *parent; - struct gmap_rmap *rmap; - unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - parent = sg->parent; - while (len) { - vmaddr = __gmap_translate(parent, paddr); - if (IS_ERR_VALUE(vmaddr)) - return vmaddr; - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); - if (!rmap) - return -ENOMEM; - rmap->raddr = raddr; - rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); - if (rc) { - kfree(rmap); - return rc; - } - rc = -EAGAIN; - ptep = gmap_pte_op_walk(parent, paddr, &ptl); - if (ptep) { - spin_lock(&sg->guest_table_lock); - rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ, - PGSTE_VSIE_BIT); - if (!rc) - gmap_insert_rmap(sg, vmaddr, rmap); - spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptep, ptl); - } - radix_tree_preload_end(); - if (rc) { - kfree(rmap); - rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ); - if (rc) - return rc; - continue; - } - paddr += PAGE_SIZE; - len -= PAGE_SIZE; - } - return 0; -} - -#define _SHADOW_RMAP_MASK 0x7 -#define _SHADOW_RMAP_REGION1 0x5 -#define _SHADOW_RMAP_REGION2 0x4 -#define _SHADOW_RMAP_REGION3 0x3 -#define _SHADOW_RMAP_SEGMENT 0x2 -#define _SHADOW_RMAP_PGTABLE 0x1 - -/** - * gmap_idte_one - invalidate a single region or segment table entry - * @asce: region or segment table *origin* + table-type bits - * @vaddr: virtual address to identify the table entry to flush - * - * The invalid bit of a single region or segment table entry is set - * and the associated TLB entries depending on the entry are flushed. - * The table-type of the @asce identifies the portion of the @vaddr - * that is used as the invalidation index. - */ -static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) -{ - asm volatile( - " idte %0,0,%1" - : : "a" (asce), "a" (vaddr) : "cc", "memory"); -} - -/** - * gmap_unshadow_page - remove a page from a shadow page table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * - * Called with the sg->guest_table_lock - */ -static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) -{ - unsigned long *table; - - BUG_ON(!gmap_is_shadow(sg)); - table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ - if (!table || *table & _PAGE_INVALID) - return; - gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1); - ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); -} - -/** - * __gmap_unshadow_pgt - remove all entries from a shadow page table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * @pgt: pointer to the start of a shadow page table - * - * Called with the sg->guest_table_lock - */ -static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, - unsigned long *pgt) -{ - int i; - - BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE) - pgt[i] = _PAGE_INVALID; -} - -/** - * gmap_unshadow_pgt - remove a shadow page table from a segment entry - * @sg: pointer to the shadow guest address space structure - * @raddr: address in the shadow guest address space - * - * Called with the sg->guest_table_lock - */ -static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) -{ - unsigned long *ste; - phys_addr_t sto, pgt; - struct ptdesc *ptdesc; - - BUG_ON(!gmap_is_shadow(sg)); - ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ - if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) - return; - gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); - sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); - gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); - pgt = *ste & _SEGMENT_ENTRY_ORIGIN; - *ste = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, __va(pgt)); - /* Free page table */ - ptdesc = page_ptdesc(phys_to_page(pgt)); - page_table_free_pgste(ptdesc); -} - -/** - * __gmap_unshadow_sgt - remove all entries from a shadow segment table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * @sgt: pointer to the start of a shadow segment table - * - * Called with the sg->guest_table_lock - */ -static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, - unsigned long *sgt) -{ - struct ptdesc *ptdesc; - phys_addr_t pgt; - int i; - - BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { - if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) - continue; - pgt = sgt[i] & _REGION_ENTRY_ORIGIN; - sgt[i] = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, __va(pgt)); - /* Free page table */ - ptdesc = page_ptdesc(phys_to_page(pgt)); - page_table_free_pgste(ptdesc); - } -} - -/** - * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * - * Called with the shadow->guest_table_lock - */ -static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) -{ - unsigned long r3o, *r3e; - phys_addr_t sgt; - struct page *page; - - BUG_ON(!gmap_is_shadow(sg)); - r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ - if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) - return; - gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); - r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); - gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); - sgt = *r3e & _REGION_ENTRY_ORIGIN; - *r3e = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, __va(sgt)); - /* Free segment table */ - page = phys_to_page(sgt); - __free_pages(page, CRST_ALLOC_ORDER); -} - -/** - * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table - * @sg: pointer to the shadow guest address space structure - * @raddr: address in the shadow guest address space - * @r3t: pointer to the start of a shadow region-3 table - * - * Called with the sg->guest_table_lock - */ -static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, - unsigned long *r3t) -{ - struct page *page; - phys_addr_t sgt; - int i; - - BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { - if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) - continue; - sgt = r3t[i] & _REGION_ENTRY_ORIGIN; - r3t[i] = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, __va(sgt)); - /* Free segment table */ - page = phys_to_page(sgt); - __free_pages(page, CRST_ALLOC_ORDER); - } -} - -/** - * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * - * Called with the sg->guest_table_lock - */ -static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) -{ - unsigned long r2o, *r2e; - phys_addr_t r3t; - struct page *page; - - BUG_ON(!gmap_is_shadow(sg)); - r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ - if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) - return; - gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); - r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); - gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); - r3t = *r2e & _REGION_ENTRY_ORIGIN; - *r2e = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, __va(r3t)); - /* Free region 3 table */ - page = phys_to_page(r3t); - __free_pages(page, CRST_ALLOC_ORDER); -} - -/** - * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * @r2t: pointer to the start of a shadow region-2 table - * - * Called with the sg->guest_table_lock - */ -static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, - unsigned long *r2t) -{ - phys_addr_t r3t; - struct page *page; - int i; - - BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { - if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) - continue; - r3t = r2t[i] & _REGION_ENTRY_ORIGIN; - r2t[i] = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, __va(r3t)); - /* Free region 3 table */ - page = phys_to_page(r3t); - __free_pages(page, CRST_ALLOC_ORDER); - } -} - -/** - * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * - * Called with the sg->guest_table_lock - */ -static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) -{ - unsigned long r1o, *r1e; - struct page *page; - phys_addr_t r2t; - - BUG_ON(!gmap_is_shadow(sg)); - r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ - if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) - return; - gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); - r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); - gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); - r2t = *r1e & _REGION_ENTRY_ORIGIN; - *r1e = _REGION1_ENTRY_EMPTY; - __gmap_unshadow_r2t(sg, raddr, __va(r2t)); - /* Free region 2 table */ - page = phys_to_page(r2t); - __free_pages(page, CRST_ALLOC_ORDER); -} - -/** - * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * @r1t: pointer to the start of a shadow region-1 table - * - * Called with the shadow->guest_table_lock - */ -static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, - unsigned long *r1t) -{ - unsigned long asce; - struct page *page; - phys_addr_t r2t; - int i; - - BUG_ON(!gmap_is_shadow(sg)); - asce = __pa(r1t) | _ASCE_TYPE_REGION1; - for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { - if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) - continue; - r2t = r1t[i] & _REGION_ENTRY_ORIGIN; - __gmap_unshadow_r2t(sg, raddr, __va(r2t)); - /* Clear entry and flush translation r1t -> r2t */ - gmap_idte_one(asce, raddr); - r1t[i] = _REGION1_ENTRY_EMPTY; - /* Free region 2 table */ - page = phys_to_page(r2t); - __free_pages(page, CRST_ALLOC_ORDER); - } -} - -/** - * gmap_unshadow - remove a shadow page table completely - * @sg: pointer to the shadow guest address space structure - * - * Called with sg->guest_table_lock - */ -void gmap_unshadow(struct gmap *sg) -{ - unsigned long *table; - - BUG_ON(!gmap_is_shadow(sg)); - if (sg->removed) - return; - sg->removed = 1; - gmap_call_notifier(sg, 0, -1UL); - gmap_flush_tlb(sg); - table = __va(sg->asce & _ASCE_ORIGIN); - switch (sg->asce & _ASCE_TYPE_MASK) { - case _ASCE_TYPE_REGION1: - __gmap_unshadow_r1t(sg, 0, table); - break; - case _ASCE_TYPE_REGION2: - __gmap_unshadow_r2t(sg, 0, table); - break; - case _ASCE_TYPE_REGION3: - __gmap_unshadow_r3t(sg, 0, table); - break; - case _ASCE_TYPE_SEGMENT: - __gmap_unshadow_sgt(sg, 0, table); - break; - } -} -EXPORT_SYMBOL(gmap_unshadow); - -/** - * gmap_shadow_r2t - create an empty shadow region 2 table - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @r2t: parent gmap address of the region 2 table to get shadowed - * @fake: r2t references contiguous guest memory block, not a r2t - * - * The r2t parameter specifies the address of the source table. The - * four pages of the source table are made read-only in the parent gmap - * address space. A write to the source table area @r2t will automatically - * remove the shadow r2 table and all of its descendants. - * - * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory and - * -EFAULT if an address in the parent gmap could not be resolved. - * - * Called with sg->mm->mmap_lock in read. - */ -int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, - int fake) -{ - unsigned long raddr, origin, offset, len; - unsigned long *table; - phys_addr_t s_r2t; - struct page *page; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - /* Allocate a shadow region second table */ - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - s_r2t = page_to_phys(page); - /* Install shadow region second table */ - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ - if (!table) { - rc = -EAGAIN; /* Race with unshadow */ - goto out_free; - } - if (!(*table & _REGION_ENTRY_INVALID)) { - rc = 0; /* Already established */ - goto out_free; - } else if (*table & _REGION_ENTRY_ORIGIN) { - rc = -EAGAIN; /* Race with shadow */ - goto out_free; - } - crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); - /* mark as invalid as long as the parent table is not protected */ - *table = s_r2t | _REGION_ENTRY_LENGTH | - _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; - if (sg->edat_level >= 1) - *table |= (r2t & _REGION_ENTRY_PROTECT); - if (fake) { - /* nothing to protect for fake tables */ - *table &= ~_REGION_ENTRY_INVALID; - spin_unlock(&sg->guest_table_lock); - return 0; - } - spin_unlock(&sg->guest_table_lock); - /* Make r2t read-only in parent gmap page table */ - raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; - origin = r2t & _REGION_ENTRY_ORIGIN; - offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; - len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len); - spin_lock(&sg->guest_table_lock); - if (!rc) { - table = gmap_table_walk(sg, saddr, 4); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) - rc = -EAGAIN; /* Race with unshadow */ - else - *table &= ~_REGION_ENTRY_INVALID; - } else { - gmap_unshadow_r2t(sg, raddr); - } - spin_unlock(&sg->guest_table_lock); - return rc; -out_free: - spin_unlock(&sg->guest_table_lock); - __free_pages(page, CRST_ALLOC_ORDER); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_shadow_r2t); - -/** - * gmap_shadow_r3t - create a shadow region 3 table - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @r3t: parent gmap address of the region 3 table to get shadowed - * @fake: r3t references contiguous guest memory block, not a r3t - * - * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory and - * -EFAULT if an address in the parent gmap could not be resolved. - * - * Called with sg->mm->mmap_lock in read. - */ -int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, - int fake) -{ - unsigned long raddr, origin, offset, len; - unsigned long *table; - phys_addr_t s_r3t; - struct page *page; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - /* Allocate a shadow region second table */ - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - s_r3t = page_to_phys(page); - /* Install shadow region second table */ - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ - if (!table) { - rc = -EAGAIN; /* Race with unshadow */ - goto out_free; - } - if (!(*table & _REGION_ENTRY_INVALID)) { - rc = 0; /* Already established */ - goto out_free; - } else if (*table & _REGION_ENTRY_ORIGIN) { - rc = -EAGAIN; /* Race with shadow */ - goto out_free; - } - crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); - /* mark as invalid as long as the parent table is not protected */ - *table = s_r3t | _REGION_ENTRY_LENGTH | - _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; - if (sg->edat_level >= 1) - *table |= (r3t & _REGION_ENTRY_PROTECT); - if (fake) { - /* nothing to protect for fake tables */ - *table &= ~_REGION_ENTRY_INVALID; - spin_unlock(&sg->guest_table_lock); - return 0; - } - spin_unlock(&sg->guest_table_lock); - /* Make r3t read-only in parent gmap page table */ - raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; - origin = r3t & _REGION_ENTRY_ORIGIN; - offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; - len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len); - spin_lock(&sg->guest_table_lock); - if (!rc) { - table = gmap_table_walk(sg, saddr, 3); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) - rc = -EAGAIN; /* Race with unshadow */ - else - *table &= ~_REGION_ENTRY_INVALID; - } else { - gmap_unshadow_r3t(sg, raddr); - } - spin_unlock(&sg->guest_table_lock); - return rc; -out_free: - spin_unlock(&sg->guest_table_lock); - __free_pages(page, CRST_ALLOC_ORDER); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_shadow_r3t); - -/** - * gmap_shadow_sgt - create a shadow segment table - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @sgt: parent gmap address of the segment table to get shadowed - * @fake: sgt references contiguous guest memory block, not a sgt - * - * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory and - * -EFAULT if an address in the parent gmap could not be resolved. - * - * Called with sg->mm->mmap_lock in read. - */ -int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, - int fake) -{ - unsigned long raddr, origin, offset, len; - unsigned long *table; - phys_addr_t s_sgt; - struct page *page; - int rc; - - BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); - /* Allocate a shadow segment table */ - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - s_sgt = page_to_phys(page); - /* Install shadow region second table */ - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ - if (!table) { - rc = -EAGAIN; /* Race with unshadow */ - goto out_free; - } - if (!(*table & _REGION_ENTRY_INVALID)) { - rc = 0; /* Already established */ - goto out_free; - } else if (*table & _REGION_ENTRY_ORIGIN) { - rc = -EAGAIN; /* Race with shadow */ - goto out_free; - } - crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); - /* mark as invalid as long as the parent table is not protected */ - *table = s_sgt | _REGION_ENTRY_LENGTH | - _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; - if (sg->edat_level >= 1) - *table |= sgt & _REGION_ENTRY_PROTECT; - if (fake) { - /* nothing to protect for fake tables */ - *table &= ~_REGION_ENTRY_INVALID; - spin_unlock(&sg->guest_table_lock); - return 0; - } - spin_unlock(&sg->guest_table_lock); - /* Make sgt read-only in parent gmap page table */ - raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; - origin = sgt & _REGION_ENTRY_ORIGIN; - offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; - len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len); - spin_lock(&sg->guest_table_lock); - if (!rc) { - table = gmap_table_walk(sg, saddr, 2); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) - rc = -EAGAIN; /* Race with unshadow */ - else - *table &= ~_REGION_ENTRY_INVALID; - } else { - gmap_unshadow_sgt(sg, raddr); - } - spin_unlock(&sg->guest_table_lock); - return rc; -out_free: - spin_unlock(&sg->guest_table_lock); - __free_pages(page, CRST_ALLOC_ORDER); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_shadow_sgt); - -static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr) -{ - unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc)); - - pgstes += _PAGE_ENTRIES; - - pgstes[0] &= ~PGSTE_ST2_MASK; - pgstes[1] &= ~PGSTE_ST2_MASK; - pgstes[2] &= ~PGSTE_ST2_MASK; - pgstes[3] &= ~PGSTE_ST2_MASK; - - pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK; - pgstes[1] |= pgt_addr & PGSTE_ST2_MASK; - pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK; - pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK; -} - -/** - * gmap_shadow_pgt - instantiate a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @pgt: parent gmap address of the page table to get shadowed - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory, - * -EFAULT if an address in the parent gmap could not be resolved and - * - * Called with gmap->mm->mmap_lock in read - */ -int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, - int fake) -{ - unsigned long raddr, origin; - unsigned long *table; - struct ptdesc *ptdesc; - phys_addr_t s_pgt; - int rc; - - BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); - /* Allocate a shadow page table */ - ptdesc = page_table_alloc_pgste(sg->mm); - if (!ptdesc) - return -ENOMEM; - origin = pgt & _SEGMENT_ENTRY_ORIGIN; - if (fake) - origin |= GMAP_SHADOW_FAKE_TABLE; - gmap_pgste_set_pgt_addr(ptdesc, origin); - s_pgt = page_to_phys(ptdesc_page(ptdesc)); - /* Install shadow page table */ - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ - if (!table) { - rc = -EAGAIN; /* Race with unshadow */ - goto out_free; - } - if (!(*table & _SEGMENT_ENTRY_INVALID)) { - rc = 0; /* Already established */ - goto out_free; - } else if (*table & _SEGMENT_ENTRY_ORIGIN) { - rc = -EAGAIN; /* Race with shadow */ - goto out_free; - } - /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | - (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; - if (fake) { - /* nothing to protect for fake tables */ - *table &= ~_SEGMENT_ENTRY_INVALID; - spin_unlock(&sg->guest_table_lock); - return 0; - } - spin_unlock(&sg->guest_table_lock); - /* Make pgt read-only in parent gmap page table (not the pgste) */ - raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; - origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; - rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE); - spin_lock(&sg->guest_table_lock); - if (!rc) { - table = gmap_table_walk(sg, saddr, 1); - if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) - rc = -EAGAIN; /* Race with unshadow */ - else - *table &= ~_SEGMENT_ENTRY_INVALID; - } else { - gmap_unshadow_pgt(sg, raddr); - } - spin_unlock(&sg->guest_table_lock); - return rc; -out_free: - spin_unlock(&sg->guest_table_lock); - page_table_free_pgste(ptdesc); - return rc; - -} -EXPORT_SYMBOL_GPL(gmap_shadow_pgt); - -/** - * gmap_shadow_page - create a shadow page mapping - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @pte: pte in parent gmap address space to get shadowed - * - * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory and - * -EFAULT if an address in the parent gmap could not be resolved. - * - * Called with sg->mm->mmap_lock in read. - */ -int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) -{ - struct gmap *parent; - struct gmap_rmap *rmap; - unsigned long vmaddr, paddr; - spinlock_t *ptl; - pte_t *sptep, *tptep; - int prot; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - parent = sg->parent; - prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; - - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); - if (!rmap) - return -ENOMEM; - rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; - - while (1) { - paddr = pte_val(pte) & PAGE_MASK; - vmaddr = __gmap_translate(parent, paddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - break; - } - rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); - if (rc) - break; - rc = -EAGAIN; - sptep = gmap_pte_op_walk(parent, paddr, &ptl); - if (sptep) { - spin_lock(&sg->guest_table_lock); - /* Get page table pointer */ - tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); - if (!tptep) { - spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(sptep, ptl); - radix_tree_preload_end(); - break; - } - rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); - if (rc > 0) { - /* Success and a new mapping */ - gmap_insert_rmap(sg, vmaddr, rmap); - rmap = NULL; - rc = 0; - } - gmap_pte_op_end(sptep, ptl); - spin_unlock(&sg->guest_table_lock); - } - radix_tree_preload_end(); - if (!rc) - break; - rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); - if (rc) - break; - } - kfree(rmap); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_shadow_page); - -/* - * gmap_shadow_notify - handle notifications for shadow gmap - * - * Called with sg->parent->shadow_lock. - */ -static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, - unsigned long gaddr) -{ - struct gmap_rmap *rmap, *rnext, *head; - unsigned long start, end, bits, raddr; - - BUG_ON(!gmap_is_shadow(sg)); - - spin_lock(&sg->guest_table_lock); - if (sg->removed) { - spin_unlock(&sg->guest_table_lock); - return; - } - /* Check for top level table */ - start = sg->orig_asce & _ASCE_ORIGIN; - end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; - if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && - gaddr < end) { - /* The complete shadow table has to go */ - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - return; - } - /* Remove the page table tree from on specific entry */ - head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); - gmap_for_each_rmap_safe(rmap, rnext, head) { - bits = rmap->raddr & _SHADOW_RMAP_MASK; - raddr = rmap->raddr ^ bits; - switch (bits) { - case _SHADOW_RMAP_REGION1: - gmap_unshadow_r2t(sg, raddr); - break; - case _SHADOW_RMAP_REGION2: - gmap_unshadow_r3t(sg, raddr); - break; - case _SHADOW_RMAP_REGION3: - gmap_unshadow_sgt(sg, raddr); - break; - case _SHADOW_RMAP_SEGMENT: - gmap_unshadow_pgt(sg, raddr); - break; - case _SHADOW_RMAP_PGTABLE: - gmap_unshadow_page(sg, raddr); - break; - } - kfree(rmap); - } - spin_unlock(&sg->guest_table_lock); -} - -/** - * ptep_notify - call all invalidation callbacks for a specific pte. - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - * @pte: pointer to the page table entry - * @bits: bits from the pgste that caused the notify call - * - * This function is assumed to be called with the page table lock held - * for the pte to notify. - */ -void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, - pte_t *pte, unsigned long bits) -{ - unsigned long offset, gaddr = 0; - struct gmap *gmap, *sg, *next; - - offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - offset = offset * (PAGE_SIZE / sizeof(pte_t)); - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - spin_lock(&gmap->guest_table_lock); - gaddr = host_to_guest_lookup(gmap, vmaddr) + offset; - spin_unlock(&gmap->guest_table_lock); - if (!IS_GADDR_VALID(gaddr)) - continue; - - if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { - spin_lock(&gmap->shadow_lock); - list_for_each_entry_safe(sg, next, - &gmap->children, list) - gmap_shadow_notify(sg, vmaddr, gaddr); - spin_unlock(&gmap->shadow_lock); - } - if (bits & PGSTE_IN_BIT) - gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(ptep_notify); - -static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, - unsigned long gaddr) -{ - set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); - gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); -} - -/** - * gmap_pmdp_xchg - exchange a gmap pmd with another - * @gmap: pointer to the guest address space structure - * @pmdp: pointer to the pmd entry - * @new: replacement entry - * @gaddr: the affected guest address - * - * This function is assumed to be called with the guest_table_lock - * held. - */ -static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, - unsigned long gaddr) -{ - gaddr &= HPAGE_MASK; - pmdp_notify_gmap(gmap, pmdp, gaddr); - new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); - if (MACHINE_HAS_TLB_GUEST) - __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, - IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) - __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); - else - __pmdp_csp(pmdp); - set_pmd(pmdp, new); -} - -static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, - int purge) -{ - pmd_t *pmdp; - struct gmap *gmap; - unsigned long gaddr; - - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - spin_lock(&gmap->guest_table_lock); - pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); - if (pmdp) { - pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); - if (purge) - __pmdp_csp(pmdp); - set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); - } - spin_unlock(&gmap->guest_table_lock); - } - rcu_read_unlock(); -} - -/** - * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without - * flushing - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) -{ - gmap_pmdp_clear(mm, vmaddr, 0); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); - -/** - * gmap_pmdp_csp - csp all affected guest pmd entries - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr) -{ - gmap_pmdp_clear(mm, vmaddr, 1); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_csp); - -/** - * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) -{ - unsigned long gaddr; - struct gmap *gmap; - pmd_t *pmdp; - - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - spin_lock(&gmap->guest_table_lock); - pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); - if (pmdp) { - pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); - if (MACHINE_HAS_TLB_GUEST) - __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, - gmap->asce, IDTE_LOCAL); - else if (MACHINE_HAS_IDTE) - __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); - *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); - } - spin_unlock(&gmap->guest_table_lock); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); - -/** - * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) -{ - unsigned long gaddr; - struct gmap *gmap; - pmd_t *pmdp; - - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - spin_lock(&gmap->guest_table_lock); - pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); - if (pmdp) { - pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); - if (MACHINE_HAS_TLB_GUEST) - __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, - gmap->asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) - __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); - else - __pmdp_csp(pmdp); - *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); - } - spin_unlock(&gmap->guest_table_lock); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); - -/** - * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status - * @gmap: pointer to guest address space - * @pmdp: pointer to the pmd to be tested - * @gaddr: virtual address in the guest address space - * - * This function is assumed to be called with the guest_table_lock - * held. - */ -static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, - unsigned long gaddr) -{ - if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) - return false; - - /* Already protected memory, which did not change is clean */ - if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && - !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) - return false; - - /* Clear UC indication and reset protection */ - set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); - gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); - return true; -} - -/** - * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment - * @gmap: pointer to guest address space - * @bitmap: dirty bitmap for this pmd - * @gaddr: virtual address in the guest address space - * @vmaddr: virtual address in the host address space - * - * This function is assumed to be called with the guest_table_lock - * held. - */ -void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], - unsigned long gaddr, unsigned long vmaddr) -{ - int i; - pmd_t *pmdp; - pte_t *ptep; - spinlock_t *ptl; - - pmdp = gmap_pmd_op_walk(gmap, gaddr); - if (!pmdp) - return; - - if (pmd_leaf(*pmdp)) { - if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) - bitmap_fill(bitmap, _PAGE_ENTRIES); - } else { - for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { - ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); - if (!ptep) - continue; - if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) - set_bit(i, bitmap); - pte_unmap_unlock(ptep, ptl); - } - } - gmap_pmd_op_end(gmap, pmdp); -} -EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct vm_area_struct *vma = walk->vma; - - split_huge_pmd(vma, pmd, addr); - return 0; -} - -static const struct mm_walk_ops thp_split_walk_ops = { - .pmd_entry = thp_split_walk_pmd_entry, - .walk_lock = PGWALK_WRLOCK_VERIFY, -}; - -static inline void thp_split_mm(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - for_each_vma(vmi, vma) { - vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); - walk_page_vma(vma, &thp_split_walk_ops, NULL); - } - mm->def_flags |= VM_NOHUGEPAGE; -} -#else -static inline void thp_split_mm(struct mm_struct *mm) -{ -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -/* - * switch on pgstes for its userspace process (for kvm) - */ -int s390_enable_sie(void) -{ - struct mm_struct *mm = current->mm; - - /* Do we have pgstes? if yes, we are done */ - if (mm_has_pgste(mm)) - return 0; - /* Fail if the page tables are 2K */ - if (!mm_alloc_pgste(mm)) - return -EINVAL; - mmap_write_lock(mm); - mm->context.has_pgste = 1; - /* split thp mappings and disable thp for future mappings */ - thp_split_mm(mm); - mmap_write_unlock(mm); - return 0; -} -EXPORT_SYMBOL_GPL(s390_enable_sie); - -static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - unsigned long *found_addr = walk->private; - - /* Return 1 of the page is a zeropage. */ - if (is_zero_pfn(pte_pfn(*pte))) { - /* - * Shared zeropage in e.g., a FS DAX mapping? We cannot do the - * right thing and likely don't care: FAULT_FLAG_UNSHARE - * currently only works in COW mappings, which is also where - * mm_forbids_zeropage() is checked. - */ - if (!is_cow_mapping(walk->vma->vm_flags)) - return -EFAULT; - - *found_addr = addr; - return 1; - } - return 0; -} - -static const struct mm_walk_ops find_zeropage_ops = { - .pte_entry = find_zeropage_pte_entry, - .walk_lock = PGWALK_WRLOCK, -}; - -/* - * Unshare all shared zeropages, replacing them by anonymous pages. Note that - * we cannot simply zap all shared zeropages, because this could later - * trigger unexpected userfaultfd missing events. - * - * This must be called after mm->context.allow_cow_sharing was - * set to 0, to avoid future mappings of shared zeropages. - * - * mm contracts with s390, that even if mm were to remove a page table, - * and racing with walk_page_range_vma() calling pte_offset_map_lock() - * would fail, it will never insert a page table containing empty zero - * pages once mm_forbids_zeropage(mm) i.e. - * mm->context.allow_cow_sharing is set to 0. - */ -static int __s390_unshare_zeropages(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - unsigned long addr; - vm_fault_t fault; - int rc; - - for_each_vma(vmi, vma) { - /* - * We could only look at COW mappings, but it's more future - * proof to catch unexpected zeropages in other mappings and - * fail. - */ - if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma)) - continue; - addr = vma->vm_start; - -retry: - rc = walk_page_range_vma(vma, addr, vma->vm_end, - &find_zeropage_ops, &addr); - if (rc < 0) - return rc; - else if (!rc) - continue; - - /* addr was updated by find_zeropage_pte_entry() */ - fault = handle_mm_fault(vma, addr, - FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, - NULL); - if (fault & VM_FAULT_OOM) - return -ENOMEM; - /* - * See break_ksm(): even after handle_mm_fault() returned 0, we - * must start the lookup from the current address, because - * handle_mm_fault() may back out if there's any difficulty. - * - * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but - * maybe they could trigger in the future on concurrent - * truncation. In that case, the shared zeropage would be gone - * and we can simply retry and make progress. - */ - cond_resched(); - goto retry; - } - - return 0; -} - -static int __s390_disable_cow_sharing(struct mm_struct *mm) -{ - int rc; - - if (!mm->context.allow_cow_sharing) - return 0; - - mm->context.allow_cow_sharing = 0; - - /* Replace all shared zeropages by anonymous pages. */ - rc = __s390_unshare_zeropages(mm); - /* - * Make sure to disable KSM (if enabled for the whole process or - * individual VMAs). Note that nothing currently hinders user space - * from re-enabling it. - */ - if (!rc) - rc = ksm_disable(mm); - if (rc) - mm->context.allow_cow_sharing = 1; - return rc; -} - -/* - * Disable most COW-sharing of memory pages for the whole process: - * (1) Disable KSM and unmerge/unshare any KSM pages. - * (2) Disallow shared zeropages and unshare any zerpages that are mapped. - * - * Not that we currently don't bother with COW-shared pages that are shared - * with parent/child processes due to fork(). - */ -int s390_disable_cow_sharing(void) -{ - int rc; - - mmap_write_lock(current->mm); - rc = __s390_disable_cow_sharing(current->mm); - mmap_write_unlock(current->mm); - return rc; -} -EXPORT_SYMBOL_GPL(s390_disable_cow_sharing); - -/* - * Enable storage key handling from now on and initialize the storage - * keys with the default key. - */ -static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - /* Clear storage key */ - ptep_zap_key(walk->mm, addr, pte); - return 0; -} - -/* - * Give a chance to schedule after setting a key to 256 pages. - * We only hold the mm lock, which is a rwsem and the kvm srcu. - * Both can sleep. - */ -static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - cond_resched(); - return 0; -} - -static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, - unsigned long hmask, unsigned long next, - struct mm_walk *walk) -{ - pmd_t *pmd = (pmd_t *)pte; - unsigned long start, end; - struct folio *folio = page_folio(pmd_page(*pmd)); - - /* - * The write check makes sure we do not set a key on shared - * memory. This is needed as the walker does not differentiate - * between actual guest memory and the process executable or - * shared libraries. - */ - if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || - !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) - return 0; - - start = pmd_val(*pmd) & HPAGE_MASK; - end = start + HPAGE_SIZE; - __storage_key_init_range(start, end); - set_bit(PG_arch_1, &folio->flags); - cond_resched(); - return 0; -} - -static const struct mm_walk_ops enable_skey_walk_ops = { - .hugetlb_entry = __s390_enable_skey_hugetlb, - .pte_entry = __s390_enable_skey_pte, - .pmd_entry = __s390_enable_skey_pmd, - .walk_lock = PGWALK_WRLOCK, -}; - -int s390_enable_skey(void) -{ - struct mm_struct *mm = current->mm; - int rc = 0; - - mmap_write_lock(mm); - if (mm_uses_skeys(mm)) - goto out_up; - - mm->context.uses_skeys = 1; - rc = __s390_disable_cow_sharing(mm); - if (rc) { - mm->context.uses_skeys = 0; - goto out_up; - } - walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); - -out_up: - mmap_write_unlock(mm); - return rc; -} -EXPORT_SYMBOL_GPL(s390_enable_skey); - -/* - * Reset CMMA state, make all pages stable again. - */ -static int __s390_reset_cmma(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - ptep_zap_unused(walk->mm, addr, pte, 1); - return 0; -} - -static const struct mm_walk_ops reset_cmma_walk_ops = { - .pte_entry = __s390_reset_cmma, - .walk_lock = PGWALK_WRLOCK, -}; - -void s390_reset_cmma(struct mm_struct *mm) -{ - mmap_write_lock(mm); - walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); - mmap_write_unlock(mm); -} -EXPORT_SYMBOL_GPL(s390_reset_cmma); - -#define GATHER_GET_PAGES 32 - -struct reset_walk_state { - unsigned long next; - unsigned long count; - unsigned long pfns[GATHER_GET_PAGES]; -}; - -static int s390_gather_pages(pte_t *ptep, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - struct reset_walk_state *p = walk->private; - pte_t pte = READ_ONCE(*ptep); - - if (pte_present(pte)) { - /* we have a reference from the mapping, take an extra one */ - get_page(phys_to_page(pte_val(pte))); - p->pfns[p->count] = phys_to_pfn(pte_val(pte)); - p->next = next; - p->count++; - } - return p->count >= GATHER_GET_PAGES; -} - -static const struct mm_walk_ops gather_pages_ops = { - .pte_entry = s390_gather_pages, - .walk_lock = PGWALK_RDLOCK, -}; - -/* - * Call the Destroy secure page UVC on each page in the given array of PFNs. - * Each page needs to have an extra reference, which will be released here. - */ -void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) -{ - struct folio *folio; - unsigned long i; - - for (i = 0; i < count; i++) { - folio = pfn_folio(pfns[i]); - /* we always have an extra reference */ - uv_destroy_folio(folio); - /* get rid of the extra reference */ - folio_put(folio); - cond_resched(); - } -} -EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); - -/** - * __s390_uv_destroy_range - Call the destroy secure page UVC on each page - * in the given range of the given address space. - * @mm: the mm to operate on - * @start: the start of the range - * @end: the end of the range - * @interruptible: if not 0, stop when a fatal signal is received - * - * Walk the given range of the given address space and call the destroy - * secure page UVC on each page. Optionally exit early if a fatal signal is - * pending. - * - * Return: 0 on success, -EINTR if the function stopped before completing - */ -int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, - unsigned long end, bool interruptible) -{ - struct reset_walk_state state = { .next = start }; - int r = 1; - - while (r > 0) { - state.count = 0; - mmap_read_lock(mm); - r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); - mmap_read_unlock(mm); - cond_resched(); - s390_uv_destroy_pfns(state.count, state.pfns); - if (interruptible && fatal_signal_pending(current)) - return -EINTR; - } - return 0; -} -EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); - -/** - * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy - * @gmap: the gmap whose ASCE needs to be replaced - * - * If the ASCE is a SEGMENT type then this function will return -EINVAL, - * otherwise the pointers in the host_to_guest radix tree will keep pointing - * to the wrong pages, causing use-after-free and memory corruption. - * If the allocation of the new top level page table fails, the ASCE is not - * replaced. - * In any case, the old ASCE is always removed from the gmap CRST list. - * Therefore the caller has to make sure to save a pointer to it - * beforehand, unless a leak is actually intended. - */ -int s390_replace_asce(struct gmap *gmap) -{ - unsigned long asce; - struct page *page; - void *table; - - /* Replacing segment type ASCEs would cause serious issues */ - if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) - return -EINVAL; - - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - table = page_to_virt(page); - memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); - - /* Set new table origin while preserving existing ASCE control bits */ - asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); - WRITE_ONCE(gmap->asce, asce); - WRITE_ONCE(gmap->mm->context.gmap_asce, asce); - WRITE_ONCE(gmap->table, table); - - return 0; -} -EXPORT_SYMBOL_GPL(s390_replace_asce); - -/** - * kvm_s390_wiggle_split_folio() - try to drain extra references to a folio and optionally split - * @mm: the mm containing the folio to work on - * @folio: the folio - * @split: whether to split a large folio - * - * Context: Must be called while holding an extra reference to the folio; - * the mm lock should not be held. - */ -int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split) -{ - int rc; - - lockdep_assert_not_held(&mm->mmap_lock); - folio_wait_writeback(folio); - lru_add_drain_all(); - if (split) { - folio_lock(folio); - rc = split_folio(folio); - folio_unlock(folio); - - if (rc != -EBUSY) - return rc; - } - return -EAGAIN; -} -EXPORT_SYMBOL_GPL(kvm_s390_wiggle_split_folio); diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c new file mode 100644 index 000000000000..f8789ffcc05c --- /dev/null +++ b/arch/s390/mm/gmap_helpers.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helper functions for KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2025 + */ + +#include <linux/export.h> +#include <linux/mm_types.h> +#include <linux/mmap_lock.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/leafops.h> +#include <linux/pagewalk.h> +#include <linux/ksm.h> +#include <asm/gmap_helpers.h> + +/** + * ptep_zap_softleaf_entry() - discard a software leaf entry. + * @mm: the mm + * @entry: the software leaf entry that needs to be zapped + * + * Discards the given software leaf entry. If the leaf entry was an actual + * swap entry (and not a migration entry, for example), the actual swapped + * page is also discarded from swap. + */ +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) +{ + if (softleaf_is_swap(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (softleaf_is_migration(entry)) + dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); + swap_put_entries_direct(entry, 1); +} + +/** + * gmap_helper_zap_one_page() - discard a page if it was swapped. + * @mm: the mm + * @vmaddr: the userspace virtual address that needs to be discarded + * + * If the given address maps to a swap entry, discard it. + * + * Context: needs to be called while holding the mmap lock. + */ +void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) +{ + struct vm_area_struct *vma; + spinlock_t *ptl; + pte_t *ptep; + + mmap_assert_locked(mm); + + /* Find the vm address for the guest address */ + vma = vma_lookup(mm, vmaddr); + if (!vma || is_vm_hugetlb_page(vma)) + return; + + /* Get pointer to the page table entry */ + ptep = get_locked_pte(mm, vmaddr, &ptl); + if (unlikely(!ptep)) + return; + if (pte_swap(*ptep)) { + ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); + pte_clear(mm, vmaddr, ptep); + } + pte_unmap_unlock(ptep, ptl); +} +EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page); + +/** + * gmap_helper_discard() - discard user pages in the given range + * @mm: the mm + * @vmaddr: starting userspace address + * @end: end address (first address outside the range) + * + * All userpace pages in the range [@vamddr, @end) are discarded and unmapped. + * + * Context: needs to be called while holding the mmap lock. + */ +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + + while (vmaddr < end) { + vma = find_vma_intersection(mm, vmaddr, end); + if (!vma) + return; + if (!is_vm_hugetlb_page(vma)) + zap_vma_range(vma, vmaddr, min(end, vma->vm_end) - vmaddr); + vmaddr = vma->vm_end; + } +} +EXPORT_SYMBOL_GPL(gmap_helper_discard); + +/** + * gmap_helper_try_set_pte_unused() - mark a pte entry as unused + * @mm: the mm + * @vmaddr: the userspace address whose pte is to be marked + * + * Mark the pte corresponding the given address as unused. This will cause + * core mm code to just drop this page instead of swapping it. + * + * This function needs to be called with interrupts disabled (for example + * while holding a spinlock), or while holding the mmap lock. Normally this + * function is called as a result of an unmap operation, and thus KVM common + * code will already hold kvm->mmu_lock in write mode. + * + * Context: Needs to be called while holding the mmap lock or with interrupts + * disabled. + */ +void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr) +{ + pmd_t *pmdp, pmd, pmdval; + pud_t *pudp, pud; + p4d_t *p4dp, p4d; + pgd_t *pgdp, pgd; + spinlock_t *ptl; /* Lock for the host (userspace) page table */ + pte_t *ptep; + + pgdp = pgd_offset(mm, vmaddr); + pgd = pgdp_get(pgdp); + if (pgd_none(pgd) || !pgd_present(pgd)) + return; + + p4dp = p4d_offset(pgdp, vmaddr); + p4d = p4dp_get(p4dp); + if (p4d_none(p4d) || !p4d_present(p4d)) + return; + + pudp = pud_offset(p4dp, vmaddr); + pud = pudp_get(pudp); + if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud)) + return; + + pmdp = pmd_offset(pudp, vmaddr); + pmd = pmdp_get_lockless(pmdp); + if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd)) + return; + + ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl); + if (!ptep) + return; + + /* + * Several paths exists that takes the ptl lock and then call the + * mmu_notifier, which takes the mmu_lock. The unmap path, instead, + * takes the mmu_lock in write mode first, and then potentially + * calls this function, which takes the ptl lock. This can lead to a + * deadlock. + * The unused page mechanism is only an optimization, if the + * _PAGE_UNUSED bit is not set, the unused page is swapped as normal + * instead of being discarded. + * If the lock is contended the bit is not set and the deadlock is + * avoided. + */ + if (spin_trylock(ptl)) { + /* + * Make sure the pte we are touching is still the correct + * one. In theory this check should not be needed, but + * better safe than sorry. + * Disabling interrupts or holding the mmap lock is enough to + * guarantee that no concurrent updates to the page tables + * are possible. + */ + if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp)))) + __atomic64_or(_PAGE_UNUSED, (long *)ptep); + spin_unlock(ptl); + } + + pte_unmap(ptep); +} +EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused); + +static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + unsigned long *found_addr = walk->private; + + /* Return 1 of the page is a zeropage. */ + if (is_zero_pfn(pte_pfn(*pte))) { + /* + * Shared zeropage in e.g., a FS DAX mapping? We cannot do the + * right thing and likely don't care: FAULT_FLAG_UNSHARE + * currently only works in COW mappings, which is also where + * mm_forbids_zeropage() is checked. + */ + if (!is_cow_mapping(walk->vma->vm_flags)) + return -EFAULT; + + *found_addr = addr; + return 1; + } + return 0; +} + +static const struct mm_walk_ops find_zeropage_ops = { + .pte_entry = find_zeropage_pte_entry, + .walk_lock = PGWALK_WRLOCK, +}; + +/** __gmap_helper_unshare_zeropages() - unshare all shared zeropages + * @mm: the mm whose zero pages are to be unshared + * + * Unshare all shared zeropages, replacing them by anonymous pages. Note that + * we cannot simply zap all shared zeropages, because this could later + * trigger unexpected userfaultfd missing events. + * + * This must be called after mm->context.allow_cow_sharing was + * set to 0, to avoid future mappings of shared zeropages. + * + * mm contracts with s390, that even if mm were to remove a page table, + * and racing with walk_page_range_vma() calling pte_offset_map_lock() + * would fail, it will never insert a page table containing empty zero + * pages once mm_forbids_zeropage(mm) i.e. + * mm->context.allow_cow_sharing is set to 0. + */ +static int __gmap_helper_unshare_zeropages(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + unsigned long addr; + vm_fault_t fault; + int rc; + + for_each_vma(vmi, vma) { + /* + * We could only look at COW mappings, but it's more future + * proof to catch unexpected zeropages in other mappings and + * fail. + */ + if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma)) + continue; + addr = vma->vm_start; + +retry: + rc = walk_page_range_vma(vma, addr, vma->vm_end, + &find_zeropage_ops, &addr); + if (rc < 0) + return rc; + else if (!rc) + continue; + + /* addr was updated by find_zeropage_pte_entry() */ + fault = handle_mm_fault(vma, addr, + FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, + NULL); + if (fault & VM_FAULT_OOM) + return -ENOMEM; + /* + * See break_ksm(): even after handle_mm_fault() returned 0, we + * must start the lookup from the current address, because + * handle_mm_fault() may back out if there's any difficulty. + * + * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but + * maybe they could trigger in the future on concurrent + * truncation. In that case, the shared zeropage would be gone + * and we can simply retry and make progress. + */ + cond_resched(); + goto retry; + } + + return 0; +} + +/** + * gmap_helper_disable_cow_sharing() - disable all COW sharing + * + * Disable most COW-sharing of memory pages for the whole process: + * (1) Disable KSM and unmerge/unshare any KSM pages. + * (2) Disallow shared zeropages and unshare any zerpages that are mapped. + * + * Not that we currently don't bother with COW-shared pages that are shared + * with parent/child processes due to fork(). + */ +int gmap_helper_disable_cow_sharing(void) +{ + struct mm_struct *mm = current->mm; + int rc; + + mmap_assert_write_locked(mm); + + if (!mm->context.allow_cow_sharing) + return 0; + + mm->context.allow_cow_sharing = 0; + + /* Replace all shared zeropages by anonymous pages. */ + rc = __gmap_helper_unshare_zeropages(mm); + /* + * Make sure to disable KSM (if enabled for the whole process or + * individual VMAs). Note that nothing currently hinders user space + * from re-enabling it. + */ + if (!rc) + rc = ksm_disable(mm); + if (rc) + mm->context.allow_cow_sharing = 1; + return rc; +} +EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d9ce199953de..302ef5781b65 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -6,15 +6,15 @@ * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com> */ -#define KMSG_COMPONENT "hugetlb" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hugetlb: " fmt -#include <asm/pgalloc.h> +#include <linux/cpufeature.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/sched/mm.h> #include <linux/security.h> +#include <asm/pgalloc.h> /* * If the bit selected by single-bit bitmask "a" is set within "x", move @@ -135,29 +135,6 @@ static inline pte_t __rste_to_pte(unsigned long rste) return __pte(pteval); } -static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) -{ - struct folio *folio; - unsigned long size, paddr; - - if (!mm_uses_skeys(mm) || - rste & _SEGMENT_ENTRY_INVALID) - return; - - if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { - folio = page_folio(pud_page(__pud(rste))); - size = PUD_SIZE; - paddr = rste & PUD_MASK; - } else { - folio = page_folio(pmd_page(__pmd(rste))); - size = PMD_SIZE; - paddr = rste & PMD_MASK; - } - - if (!test_and_set_bit(PG_arch_1, &folio->flags)) - __storage_key_init_range(paddr, paddr + size); -} - void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -173,7 +150,6 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, } else if (likely(pte_present(pte))) rste |= _SEGMENT_ENTRY_LARGE; - clear_huge_pte_skeys(mm, rste); set_pte(ptep, __pte(rste)); } @@ -188,8 +164,8 @@ pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) return __rste_to_pte(pte_val(*ptep)); } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { pte_t pte = huge_ptep_get(mm, addr, ptep); pmd_t *pmdp = (pmd_t *) ptep; @@ -248,10 +224,18 @@ pte_t *huge_pte_offset(struct mm_struct *mm, bool __init arch_hugetlb_valid_size(unsigned long size) { - if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) + if (cpu_has_edat1() && size == PMD_SIZE) return true; - else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) + else if (cpu_has_edat2() && size == PUD_SIZE) return true; else return false; } + +unsigned int __init arch_hugetlb_cma_order(void) +{ + if (cpu_has_edat2()) + return PUD_SHIFT - PAGE_SHIFT; + + return 0; +} diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index f2298f7a3f21..1f72efc2a579 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -8,6 +8,7 @@ * Copyright (C) 1995 Linus Torvalds */ +#include <linux/cpufeature.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -39,7 +40,6 @@ #include <asm/kfence.h> #include <asm/dma.h> #include <asm/abs_lowcore.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/sclp.h> @@ -69,12 +69,10 @@ unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); -static void __init setup_zero_pages(void) +void __init arch_setup_zero_pages(void) { unsigned long total_pages = memblock_estimated_nr_free_pages(); unsigned int order; - struct page *page; - int i; /* Latest machines require a mapping granularity of 512KB */ order = 7; @@ -83,41 +81,31 @@ static void __init setup_zero_pages(void) while (order > 2 && (total_pages >> 10) < (1UL << order)) order--; - empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!empty_zero_page) - panic("Out of memory in setup_zero_pages"); - - page = virt_to_page((void *) empty_zero_page); - split_page(page, order); - for (i = 1 << order; i > 0; i--) { - mark_page_reserved(page); - page++; - } + empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE); zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; } +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +} + /* * paging_init() sets up the page tables */ void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - vmem_map_init(); - sparse_init(); zone_dma_limit = DMA_BIT_MASK(31); - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init(max_zone_pfns); } void mark_rodata_ro(void) { unsigned long size = __end_ro_after_init - __start_ro_after_init; - if (MACHINE_HAS_NX) + if (cpu_has_nx()) system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT); __set_memory_ro(__start_ro_after_init, __end_ro_after_init); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); @@ -153,7 +141,7 @@ bool force_dma_unencrypted(struct device *dev) } /* protected virtualization */ -static void pv_init(void) +static void __init pv_init(void) { if (!is_prot_virt_guest()) return; @@ -165,20 +153,12 @@ static void pv_init(void) swiotlb_update_mem_attributes(); } -void __init mem_init(void) +void __init arch_mm_preinit(void) { cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(0, mm_cpumask(&init_mm)); - set_max_mapnr(max_low_pfn); - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - pv_init(); - kfence_split_mapping(); - - /* this will put all low memory onto the freelists */ - memblock_free_all(); - setup_zero_pages(); /* Setup zeroed pages. */ } unsigned long memory_block_size_bytes(void) @@ -239,16 +219,13 @@ struct s390_cma_mem_data { static int s390_cma_check_range(struct cma *cma, void *data) { struct s390_cma_mem_data *mem_data; - unsigned long start, end; mem_data = data; - start = cma_get_base(cma); - end = start + cma_get_size(cma); - if (end < mem_data->start) - return 0; - if (start >= mem_data->end) - return 0; - return -EBUSY; + + if (cma_intersects(cma, mem_data->start, mem_data->end)) + return -EBUSY; + + return 0; } static int s390_cma_mem_notifier(struct notifier_block *nb, @@ -285,7 +262,7 @@ int arch_add_memory(int nid, u64 start, u64 size, unsigned long size_pages = PFN_DOWN(size); int rc; - if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) + if (WARN_ON_ONCE(pgprot_val(params->pgprot) != pgprot_val(PAGE_KERNEL))) return -EINVAL; VM_BUG_ON(!mhp_range_allowed(start, size, true)); diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 44426e0f2944..cfd219fe495c 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -41,7 +41,7 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz " ex %1,0(1)\n" " lg %1,0(%3)\n" " lra %0,0(%0)\n" - " sturg %1,%0\n" + " sturg %1,%0" : "+&a" (aligned), "+&a" (count), "=m" (tmp) : "a" (&tmp), "a" (&tmp[offset]), "a" (src) : "cc", "memory", "1"); diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 76f376876e0d..2a222a7e14f4 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -15,7 +15,6 @@ #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/random.h> -#include <linux/compat.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <asm/elf.h> @@ -27,7 +26,7 @@ static unsigned long stack_maxrandom_size(void) return STACK_RND_MASK << PAGE_SHIFT; } -static inline int mmap_is_legacy(struct rlimit *rlim_stack) +static inline int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; @@ -47,11 +46,10 @@ static unsigned long mmap_base_legacy(unsigned long rnd) } static inline unsigned long mmap_base(unsigned long rnd, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size() + stack_guard_gap; - unsigned long gap_min, gap_max; /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) @@ -61,13 +59,7 @@ static inline unsigned long mmap_base(unsigned long rnd, * Top of mmap area (just below the process stack). * Leave at least a ~128 MB hole. */ - gap_min = SZ_128M; - gap_max = (STACK_TOP / 6) * 5; - - if (gap < gap_min) - gap = gap_min; - else if (gap > gap_max) - gap = gap_max; + gap = clamp(gap, SZ_128M, (STACK_TOP / 6) * 5); return PAGE_ALIGN(STACK_TOP - gap - rnd); } @@ -176,7 +168,7 @@ check_asce_limit: * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -189,10 +181,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) */ if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = mmap_base_legacy(random_factor); - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); } } diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 01f9b39e65f5..5bee173db72e 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -13,6 +13,7 @@ #include <asm/page.h> int __bootdata_preserved(cmma_flag); +EXPORT_SYMBOL(cmma_flag); void arch_free_page(struct page *page, int order) { diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index eae97fb61712..bb29c38ae624 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -3,6 +3,7 @@ * Copyright IBM Corp. 2011 * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ +#include <linux/cpufeature.h> #include <linux/hugetlb.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> @@ -15,19 +16,12 @@ #include <asm/asm.h> #include <asm/set_memory.h> -static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) -{ - asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0" - : [addr] "+a" (addr) : [skey] "d" (skey)); - return addr; -} - void __storage_key_init_range(unsigned long start, unsigned long end) { unsigned long boundary, size; while (start < end) { - if (MACHINE_HAS_EDAT1) { + if (cpu_has_edat1()) { /* set storage keys for a 1MB frame */ size = 1UL << 20; boundary = (start + size) & ~(size - 1); @@ -63,7 +57,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, unsigned long *table, mask; mask = 0; - if (MACHINE_HAS_EDAT2) { + if (cpu_has_edat2()) { switch (dtt) { case CRDTE_DTT_REGION3: mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1); @@ -77,10 +71,8 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, } table = (unsigned long *)((unsigned long)old & mask); crdte(*old, new, table, dtt, addr, get_lowcore()->kernel_asce.val); - } else if (MACHINE_HAS_IDTE) { - cspg(old, *old, new); } else { - csp((unsigned int *)old + 1, *old, new); + cspg(old, *old, new); } } @@ -205,7 +197,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, return rc; } -static int split_pud_page(pud_t *pudp, unsigned long addr) +int split_pud_page(pud_t *pudp, unsigned long addr) { unsigned long pmd_addr, prot; pmd_t *pm_dir, *pmdp; @@ -373,7 +365,7 @@ int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags unsigned long end; int rc; - if (!MACHINE_HAS_NX) + if (!cpu_has_nx()) flags &= ~(SET_MEMORY_NX | SET_MEMORY_X); if (!flags) return 0; diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c index 1aac13bb8f53..6ecd6b0a22a8 100644 --- a/arch/s390/mm/pfault.c +++ b/arch/s390/mm/pfault.c @@ -9,6 +9,7 @@ #include <linux/init.h> #include <linux/irq.h> #include <asm/asm-extable.h> +#include <asm/asm-offsets.h> #include <asm/pfault.h> #include <asm/diag.h> @@ -56,12 +57,12 @@ int __pfault_init(void) if (pfault_disable) return rc; diag_stat_inc(DIAG_STAT_X258); - asm volatile( + asm_inline volatile( " diag %[refbk],%[rc],0x258\n" "0: nopr %%r7\n" EX_TABLE(0b, 0b) : [rc] "+d" (rc) - : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : [refbk] "a" (virt_to_phys(&pfault_init_refbk)), "m" (pfault_init_refbk) : "cc"); return rc; } @@ -78,12 +79,12 @@ void __pfault_fini(void) if (pfault_disable) return; diag_stat_inc(DIAG_STAT_X258); - asm volatile( + asm_inline volatile( " diag %[refbk],0,0x258\n" "0: nopr %%r7\n" EX_TABLE(0b, 0b) : - : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : [refbk] "a" (virt_to_phys(&pfault_fini_refbk)), "m" (pfault_fini_refbk) : "cc"); } @@ -198,8 +199,7 @@ block: * return to userspace schedule() to block. */ __set_current_state(TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); - set_preempt_need_resched(); + set_need_resched_current(); } } out: diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 30387a6e98ff..7ac44543e051 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -12,43 +12,20 @@ #include <asm/mmu_context.h> #include <asm/page-states.h> #include <asm/pgalloc.h> -#include <asm/gmap.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> -#ifdef CONFIG_PGSTE - -int page_table_allocate_pgste = 0; -EXPORT_SYMBOL(page_table_allocate_pgste); - -static const struct ctl_table page_table_sysctl[] = { - { - .procname = "allocate_pgste", - .data = &page_table_allocate_pgste, - .maxlen = sizeof(int), - .mode = S_IRUGO | S_IWUSR, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -}; - -static int __init page_table_register_sysctl(void) +unsigned long *crst_table_alloc_noprof(struct mm_struct *mm) { - return register_sysctl("vm", page_table_sysctl) ? 0 : -ENOMEM; -} -__initcall(page_table_register_sysctl); - -#endif /* CONFIG_PGSTE */ - -unsigned long *crst_table_alloc(struct mm_struct *mm) -{ - struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct ptdesc *ptdesc; unsigned long *table; + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc_noprof(gfp, CRST_ALLOC_ORDER); if (!ptdesc) return NULL; - table = ptdesc_to_virt(ptdesc); + table = ptdesc_address(ptdesc); __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } @@ -63,11 +40,15 @@ void crst_table_free(struct mm_struct *mm, unsigned long *table) static void __crst_table_upgrade(void *arg) { struct mm_struct *mm = arg; + struct ctlreg asce; /* change all active ASCEs to avoid the creation of new TLBs */ if (current->active_mm == mm) { - get_lowcore()->user_asce.val = mm->context.asce; - local_ctl_load(7, &get_lowcore()->user_asce); + asce.val = mm->context.asce; + get_lowcore()->user_asce = asce; + local_ctl_load(7, &asce); + if (!test_thread_flag(TIF_ASCE_PRIMARY)) + local_ctl_load(1, &asce); } __tlb_flush_local(); } @@ -77,6 +58,8 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) unsigned long *pgd = NULL, *p4d = NULL, *__pgd; unsigned long asce_limit = mm->context.asce_limit; + mmap_assert_write_locked(mm); + /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ VM_BUG_ON(asce_limit < _REGION2_SIZE); @@ -100,13 +83,6 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) spin_lock_bh(&mm->page_table_lock); - /* - * This routine gets called with mmap_lock lock held and there is - * no reason to optimize for the case of otherwise. However, if - * that would ever change, the below check will let us know. - */ - VM_BUG_ON(asce_limit != mm->context.asce_limit); - if (p4d) { __pgd = (unsigned long *) mm->pgd; p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd); @@ -138,43 +114,22 @@ err_p4d: return -ENOMEM; } -#ifdef CONFIG_PGSTE - -struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm) -{ - struct ptdesc *ptdesc; - u64 *table; - - ptdesc = pagetable_alloc(GFP_KERNEL, 0); - if (ptdesc) { - table = (u64 *)ptdesc_to_virt(ptdesc); - __arch_set_page_dat(table, 1); - memset64(table, _PAGE_INVALID, PTRS_PER_PTE); - memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); - } - return ptdesc; -} - -void page_table_free_pgste(struct ptdesc *ptdesc) -{ - pagetable_free(ptdesc); -} - -#endif /* CONFIG_PGSTE */ - -unsigned long *page_table_alloc(struct mm_struct *mm) +unsigned long *page_table_alloc_noprof(struct mm_struct *mm) { + gfp_t gfp = GFP_KERNEL_ACCOUNT; struct ptdesc *ptdesc; unsigned long *table; - ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; - if (!pagetable_pte_ctor(ptdesc)) { + if (!pagetable_pte_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } - table = ptdesc_to_virt(ptdesc); + table = ptdesc_address(ptdesc); __arch_set_page_dat(table, 1); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); @@ -185,6 +140,8 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) { struct ptdesc *ptdesc = virt_to_ptdesc(table); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); pagetable_dtor_free(ptdesc); } @@ -201,11 +158,6 @@ void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); call_rcu(&ptdesc->pt_rcu_head, pte_free_now); - /* - * THPs are not allowed for KVM guests. Warn if pgste ever reaches here. - * Turn to the generic pte_free_defer() version once gmap is removed. - */ - WARN_ON_ONCE(mm_has_pgste(mm)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -271,7 +223,7 @@ static inline unsigned long base_lra(unsigned long address) unsigned long real; asm volatile( - " lra %0,0(%1)\n" + " lra %0,0(%1)" : "=d" (real) : "a" (address) : "cc"); return real; } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index f05e62e037c2..4acd8b140c4b 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -4,6 +4,8 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ +#include <linux/cpufeature.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -14,15 +16,15 @@ #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/slab.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/sysctl.h> #include <linux/ksm.h> #include <linux/mman.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/page-states.h> +#include <asm/machine.h> pgprot_t pgprot_writecombine(pgprot_t prot) { @@ -34,22 +36,12 @@ pgprot_t pgprot_writecombine(pgprot_t prot) } EXPORT_SYMBOL_GPL(pgprot_writecombine); -pgprot_t pgprot_writethrough(pgprot_t prot) -{ - /* - * mio_wb_bit_mask may be set on a different CPU, but it is only set - * once at init and only read afterwards. - */ - return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask); -} -EXPORT_SYMBOL_GPL(pgprot_writethrough); - static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int nodat) { unsigned long opt, asce; - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { opt = 0; asce = READ_ONCE(mm->context.gmap_asce); if (asce == 0UL || nodat) @@ -69,7 +61,7 @@ static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, { unsigned long opt, asce; - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { opt = 0; asce = READ_ONCE(mm->context.gmap_asce); if (asce == 0UL || nodat) @@ -94,7 +86,7 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm, if (unlikely(pte_val(old) & _PAGE_INVALID)) return old; atomic_inc(&mm->context.flush_count); - if (MACHINE_HAS_TLB_LC && + if (cpu_has_tlb_lc() && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) ptep_ipte_local(mm, addr, ptep, nodat); else @@ -123,171 +115,14 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, return old; } -static inline pgste_t pgste_get_lock(pte_t *ptep) -{ - unsigned long value = 0; -#ifdef CONFIG_PGSTE - unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); - - do { - value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); - } while (value & PGSTE_PCL_BIT); - value |= PGSTE_PCL_BIT; -#endif - return __pgste(value); -} - -static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - barrier(); - WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); -#endif -} - -static inline pgste_t pgste_get(pte_t *ptep) -{ - unsigned long pgste = 0; -#ifdef CONFIG_PGSTE - pgste = *(unsigned long *)(ptep + PTRS_PER_PTE); -#endif - return __pgste(pgste); -} - -static inline void pgste_set(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste; -#endif -} - -static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, - struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - unsigned long address, bits, skey; - - if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID) - return pgste; - address = pte_val(pte) & PAGE_MASK; - skey = (unsigned long) page_get_storage_key(address); - bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); - /* Transfer page changed & referenced bit to guest bits in pgste */ - pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */ - /* Copy page access key and fetch protection bit to pgste */ - pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; -#endif - return pgste; - -} - -static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry, - struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - unsigned long address; - unsigned long nkey; - - if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID) - return; - VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID)); - address = pte_val(entry) & PAGE_MASK; - /* - * Set page access key and fetch protection bit from pgste. - * The guest C/R information is still in the PGSTE, set real - * key C/R to 0. - */ - nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; - nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; - page_set_storage_key(address, nkey, 0); -#endif -} - -static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) -{ -#ifdef CONFIG_PGSTE - if ((pte_val(entry) & _PAGE_PRESENT) && - (pte_val(entry) & _PAGE_WRITE) && - !(pte_val(entry) & _PAGE_INVALID)) { - if (!MACHINE_HAS_ESOP) { - /* - * Without enhanced suppression-on-protection force - * the dirty bit on for all writable ptes. - */ - entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY)); - entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT)); - } - if (!(pte_val(entry) & _PAGE_PROTECT)) - /* This pte allows write access, set user-dirty */ - pgste_val(pgste) |= PGSTE_UC_BIT; - } -#endif - set_pte(ptep, entry); - return pgste; -} - -static inline pgste_t pgste_pte_notify(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - unsigned long bits; - - bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); - if (bits) { - pgste_val(pgste) ^= bits; - ptep_notify(mm, addr, ptep, bits); - } -#endif - return pgste; -} - -static inline pgste_t ptep_xchg_start(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - pgste_t pgste = __pgste(0); - - if (mm_has_pgste(mm)) { - pgste = pgste_get_lock(ptep); - pgste = pgste_pte_notify(mm, addr, ptep, pgste); - } - return pgste; -} - -static inline pte_t ptep_xchg_commit(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, - pgste_t pgste, pte_t old, pte_t new) -{ - if (mm_has_pgste(mm)) { - if (pte_val(old) & _PAGE_INVALID) - pgste_set_key(ptep, pgste, new, mm); - if (pte_val(new) & _PAGE_INVALID) { - pgste = pgste_update_all(old, pgste, mm); - if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == - _PGSTE_GPS_USAGE_UNUSED) - old = set_pte_bit(old, __pgprot(_PAGE_UNUSED)); - } - pgste = pgste_set_pte(ptep, pgste, new); - pgste_set_unlock(ptep, pgste); - } else { - set_pte(ptep, new); - } - return old; -} - pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { - pgste_t pgste; pte_t old; - int nodat; preempt_disable(); - pgste = ptep_xchg_start(mm, addr, ptep); - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - old = ptep_flush_direct(mm, addr, ptep, nodat); - old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + old = ptep_flush_direct(mm, addr, ptep, 1); + set_pte(ptep, new); preempt_enable(); return old; } @@ -303,9 +138,9 @@ void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, preempt_disable(); atomic_inc(&mm->context.flush_count); if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __ptep_rdp(addr, ptep, 0, 0, 1); + __ptep_rdp(addr, ptep, 1); else - __ptep_rdp(addr, ptep, 0, 0, 0); + __ptep_rdp(addr, ptep, 0); /* * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That * means it is still valid and active, and must not be changed according @@ -321,15 +156,11 @@ EXPORT_SYMBOL(ptep_reset_dat_prot); pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { - pgste_t pgste; pte_t old; - int nodat; preempt_disable(); - pgste = ptep_xchg_start(mm, addr, ptep); - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - old = ptep_flush_lazy(mm, addr, ptep, nodat); - old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + old = ptep_flush_lazy(mm, addr, ptep, 1); + set_pte(ptep, new); preempt_enable(); return old; } @@ -338,67 +169,32 @@ EXPORT_SYMBOL(ptep_xchg_lazy); pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - pgste_t pgste; - pte_t old; - int nodat; - struct mm_struct *mm = vma->vm_mm; - - preempt_disable(); - pgste = ptep_xchg_start(mm, addr, ptep); - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - old = ptep_flush_lazy(mm, addr, ptep, nodat); - if (mm_has_pgste(mm)) { - pgste = pgste_update_all(old, pgste, mm); - pgste_set(ptep, pgste); - } - return old; + return ptep_flush_lazy(vma->vm_mm, addr, ptep, 1); } void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { - pgste_t pgste; - struct mm_struct *mm = vma->vm_mm; - - if (mm_has_pgste(mm)) { - pgste = pgste_get(ptep); - pgste_set_key(ptep, pgste, pte, mm); - pgste = pgste_set_pte(ptep, pgste, pte); - pgste_set_unlock(ptep, pgste); - } else { - set_pte(ptep, pte); - } - preempt_enable(); + set_pte(ptep, pte); } static inline void pmdp_idte_local(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (MACHINE_HAS_TLB_GUEST) - __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, - mm->context.asce, IDTE_LOCAL); + if (machine_has_tlb_guest()) + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_LOCAL); else __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_idte_local(mm, addr); } static inline void pmdp_idte_global(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_idte_global(mm, addr); - } else if (MACHINE_HAS_IDTE) { - __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_idte_global(mm, addr); } else { - __pmdp_csp(pmdp); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_csp(mm, addr); + __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); } } @@ -411,7 +207,7 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) return old; atomic_inc(&mm->context.flush_count); - if (MACHINE_HAS_TLB_LC && + if (cpu_has_tlb_lc() && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) pmdp_idte_local(mm, addr, pmdp); else @@ -433,8 +229,6 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, cpumask_of(smp_processor_id()))) { set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID))); mm->context.flush_mm = 1; - if (mm_has_pgste(mm)) - gmap_pmdp_invalidate(mm, addr); } else { pmdp_idte_global(mm, addr, pmdp); } @@ -442,40 +236,6 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, return old; } -#ifdef CONFIG_PGSTE -static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) -{ - struct vm_area_struct *vma; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - - /* We need a valid VMA, otherwise this is clearly a fault. */ - vma = vma_lookup(mm, addr); - if (!vma) - return -EFAULT; - - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - return -ENOENT; - - p4d = p4d_offset(pgd, addr); - if (!p4d_present(*p4d)) - return -ENOENT; - - pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) - return -ENOENT; - - /* Large PUDs are not supported yet. */ - if (pud_leaf(*pud)) - return -EFAULT; - - *pmdp = pmd_offset(pud, addr); - return 0; -} -#endif - pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t new) { @@ -505,7 +265,7 @@ EXPORT_SYMBOL(pmdp_xchg_lazy); static inline void pudp_idte_local(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_LOCAL); else @@ -515,17 +275,11 @@ static inline void pudp_idte_local(struct mm_struct *mm, static inline void pudp_idte_global(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) - __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); else - /* - * Invalid bit position is the same for pmd and pud, so we can - * reuse _pmd_csp() here - */ - __pmdp_csp((pmd_t *) pudp); + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); } static inline pud_t pudp_flush_direct(struct mm_struct *mm, @@ -537,7 +291,7 @@ static inline pud_t pudp_flush_direct(struct mm_struct *mm, if (pud_val(old) & _REGION_ENTRY_INVALID) return old; atomic_inc(&mm->context.flush_count); - if (MACHINE_HAS_TLB_LC && + if (cpu_has_tlb_lc() && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) pudp_idte_local(mm, addr, pudp); else @@ -599,598 +353,3 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) return pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -#ifdef CONFIG_PGSTE -void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) -{ - pgste_t pgste; - - /* the mm_has_pgste() check is done in set_pte_at() */ - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; - pgste_set_key(ptep, pgste, entry, mm); - pgste = pgste_set_pte(ptep, pgste, entry); - pgste_set_unlock(ptep, pgste); - preempt_enable(); -} - -void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - pgste_t pgste; - - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgste_val(pgste) |= PGSTE_IN_BIT; - pgste_set_unlock(ptep, pgste); - preempt_enable(); -} - -/** - * ptep_force_prot - change access rights of a locked pte - * @mm: pointer to the process mm_struct - * @addr: virtual address in the guest address space - * @ptep: pointer to the page table entry - * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bit: pgste bit to set (e.g. for notification) - * - * Returns 0 if the access rights were changed and -EAGAIN if the current - * and requested access rights are incompatible. - */ -int ptep_force_prot(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, int prot, unsigned long bit) -{ - pte_t entry; - pgste_t pgste; - int pte_i, pte_p, nodat; - - pgste = pgste_get_lock(ptep); - entry = *ptep; - /* Check pte entry after all locks have been acquired */ - pte_i = pte_val(entry) & _PAGE_INVALID; - pte_p = pte_val(entry) & _PAGE_PROTECT; - if ((pte_i && (prot != PROT_NONE)) || - (pte_p && (prot & PROT_WRITE))) { - pgste_set_unlock(ptep, pgste); - return -EAGAIN; - } - /* Change access rights and set pgste bit */ - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - if (prot == PROT_NONE && !pte_i) { - ptep_flush_direct(mm, addr, ptep, nodat); - pgste = pgste_update_all(entry, pgste, mm); - entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID)); - } - if (prot == PROT_READ && !pte_p) { - ptep_flush_direct(mm, addr, ptep, nodat); - entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); - entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); - } - pgste_val(pgste) |= bit; - pgste = pgste_set_pte(ptep, pgste, entry); - pgste_set_unlock(ptep, pgste); - return 0; -} - -int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, - pte_t *sptep, pte_t *tptep, pte_t pte) -{ - pgste_t spgste, tpgste; - pte_t spte, tpte; - int rc = -EAGAIN; - - if (!(pte_val(*tptep) & _PAGE_INVALID)) - return 0; /* already shadowed */ - spgste = pgste_get_lock(sptep); - spte = *sptep; - if (!(pte_val(spte) & _PAGE_INVALID) && - !((pte_val(spte) & _PAGE_PROTECT) && - !(pte_val(pte) & _PAGE_PROTECT))) { - pgste_val(spgste) |= PGSTE_VSIE_BIT; - tpgste = pgste_get_lock(tptep); - tpte = __pte((pte_val(spte) & PAGE_MASK) | - (pte_val(pte) & _PAGE_PROTECT)); - /* don't touch the storage key - it belongs to parent pgste */ - tpgste = pgste_set_pte(tptep, tpgste, tpte); - pgste_set_unlock(tptep, tpgste); - rc = 1; - } - pgste_set_unlock(sptep, spgste); - return rc; -} - -void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) -{ - pgste_t pgste; - int nodat; - - pgste = pgste_get_lock(ptep); - /* notifier is called by the caller */ - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - ptep_flush_direct(mm, saddr, ptep, nodat); - /* don't touch the storage key - it belongs to parent pgste */ - pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); - pgste_set_unlock(ptep, pgste); -} - -static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) -{ - if (!non_swap_entry(entry)) - dec_mm_counter(mm, MM_SWAPENTS); - else if (is_migration_entry(entry)) { - struct folio *folio = pfn_swap_entry_folio(entry); - - dec_mm_counter(mm, mm_counter(folio)); - } - free_swap_and_cache(entry); -} - -void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, int reset) -{ - unsigned long pgstev; - pgste_t pgste; - pte_t pte; - - /* Zap unused and logically-zero pages */ - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgstev = pgste_val(pgste); - pte = *ptep; - if (!reset && pte_swap(pte) && - ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || - (pgstev & _PGSTE_GPS_ZERO))) { - ptep_zap_swap_entry(mm, pte_to_swp_entry(pte)); - pte_clear(mm, addr, ptep); - } - if (reset) - pgste_val(pgste) &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); - pgste_set_unlock(ptep, pgste); - preempt_enable(); -} - -void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - unsigned long ptev; - pgste_t pgste; - - /* Clear storage key ACC and F, but set R/C */ - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT; - ptev = pte_val(*ptep); - if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) - page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); - pgste_set_unlock(ptep, pgste); - preempt_enable(); -} - -/* - * Test and reset if a guest page is dirty - */ -bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - pgste_t pgste; - pte_t pte; - bool dirty; - int nodat; - - pgste = pgste_get_lock(ptep); - dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); - pgste_val(pgste) &= ~PGSTE_UC_BIT; - pte = *ptep; - if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { - pgste = pgste_pte_notify(mm, addr, ptep, pgste); - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - ptep_ipte_global(mm, addr, ptep, nodat); - if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) - pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); - else - pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); - set_pte(ptep, pte); - } - pgste_set_unlock(ptep, pgste); - return dirty; -} -EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); - -int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char key, bool nq) -{ - unsigned long keyul, paddr; - spinlock_t *ptl; - pgste_t old, new; - pmd_t *pmdp; - pte_t *ptep; - - /* - * If we don't have a PTE table and if there is no huge page mapped, - * we can ignore attempts to set the key to 0, because it already is 0. - */ - switch (pmd_lookup(mm, addr, &pmdp)) { - case -ENOENT: - return key ? -EFAULT : 0; - case 0: - break; - default: - return -EFAULT; - } -again: - ptl = pmd_lock(mm, pmdp); - if (!pmd_present(*pmdp)) { - spin_unlock(ptl); - return key ? -EFAULT : 0; - } - - if (pmd_leaf(*pmdp)) { - paddr = pmd_val(*pmdp) & HPAGE_MASK; - paddr |= addr & ~HPAGE_MASK; - /* - * Huge pmds need quiescing operations, they are - * always mapped. - */ - page_set_storage_key(paddr, key, 1); - spin_unlock(ptl); - return 0; - } - spin_unlock(ptl); - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - if (!ptep) - goto again; - new = old = pgste_get_lock(ptep); - pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | - PGSTE_ACC_BITS | PGSTE_FP_BIT); - keyul = (unsigned long) key; - pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; - pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - unsigned long bits, skey; - - paddr = pte_val(*ptep) & PAGE_MASK; - skey = (unsigned long) page_get_storage_key(paddr); - bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); - skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); - /* Set storage key ACC and FP */ - page_set_storage_key(paddr, skey, !nq); - /* Merge host changed & referenced into pgste */ - pgste_val(new) |= bits << 52; - } - /* changing the guest storage key is considered a change of the page */ - if ((pgste_val(new) ^ pgste_val(old)) & - (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) - pgste_val(new) |= PGSTE_UC_BIT; - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(set_guest_storage_key); - -/* - * Conditionally set a guest storage key (handling csske). - * oldkey will be updated when either mr or mc is set and a pointer is given. - * - * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest - * storage key was updated and -EFAULT on access errors. - */ -int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char key, unsigned char *oldkey, - bool nq, bool mr, bool mc) -{ - unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT; - int rc; - - /* we can drop the pgste lock between getting and setting the key */ - if (mr | mc) { - rc = get_guest_storage_key(current->mm, addr, &tmp); - if (rc) - return rc; - if (oldkey) - *oldkey = tmp; - if (!mr) - mask |= _PAGE_REFERENCED; - if (!mc) - mask |= _PAGE_CHANGED; - if (!((tmp ^ key) & mask)) - return 0; - } - rc = set_guest_storage_key(current->mm, addr, key, nq); - return rc < 0 ? rc : 1; -} -EXPORT_SYMBOL(cond_set_guest_storage_key); - -/* - * Reset a guest reference bit (rrbe), returning the reference and changed bit. - * - * Returns < 0 in case of error, otherwise the cc to be reported to the guest. - */ -int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) -{ - spinlock_t *ptl; - unsigned long paddr; - pgste_t old, new; - pmd_t *pmdp; - pte_t *ptep; - int cc = 0; - - /* - * If we don't have a PTE table and if there is no huge page mapped, - * the storage key is 0 and there is nothing for us to do. - */ - switch (pmd_lookup(mm, addr, &pmdp)) { - case -ENOENT: - return 0; - case 0: - break; - default: - return -EFAULT; - } -again: - ptl = pmd_lock(mm, pmdp); - if (!pmd_present(*pmdp)) { - spin_unlock(ptl); - return 0; - } - - if (pmd_leaf(*pmdp)) { - paddr = pmd_val(*pmdp) & HPAGE_MASK; - paddr |= addr & ~HPAGE_MASK; - cc = page_reset_referenced(paddr); - spin_unlock(ptl); - return cc; - } - spin_unlock(ptl); - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - if (!ptep) - goto again; - new = old = pgste_get_lock(ptep); - /* Reset guest reference bit only */ - pgste_val(new) &= ~PGSTE_GR_BIT; - - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - paddr = pte_val(*ptep) & PAGE_MASK; - cc = page_reset_referenced(paddr); - /* Merge real referenced bit into host-set */ - pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; - } - /* Reflect guest's logical view, not physical */ - cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; - /* Changing the guest storage key is considered a change of the page */ - if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) - pgste_val(new) |= PGSTE_UC_BIT; - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(ptep, ptl); - return cc; -} -EXPORT_SYMBOL(reset_guest_reference_bit); - -int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char *key) -{ - unsigned long paddr; - spinlock_t *ptl; - pgste_t pgste; - pmd_t *pmdp; - pte_t *ptep; - - /* - * If we don't have a PTE table and if there is no huge page mapped, - * the storage key is 0. - */ - *key = 0; - - switch (pmd_lookup(mm, addr, &pmdp)) { - case -ENOENT: - return 0; - case 0: - break; - default: - return -EFAULT; - } -again: - ptl = pmd_lock(mm, pmdp); - if (!pmd_present(*pmdp)) { - spin_unlock(ptl); - return 0; - } - - if (pmd_leaf(*pmdp)) { - paddr = pmd_val(*pmdp) & HPAGE_MASK; - paddr |= addr & ~HPAGE_MASK; - *key = page_get_storage_key(paddr); - spin_unlock(ptl); - return 0; - } - spin_unlock(ptl); - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - if (!ptep) - goto again; - pgste = pgste_get_lock(ptep); - *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; - paddr = pte_val(*ptep) & PAGE_MASK; - if (!(pte_val(*ptep) & _PAGE_INVALID)) - *key = page_get_storage_key(paddr); - /* Reflect guest's logical view, not physical */ - *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; - pgste_set_unlock(ptep, pgste); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(get_guest_storage_key); - -/** - * pgste_perform_essa - perform ESSA actions on the PGSTE. - * @mm: the memory context. It must have PGSTEs, no check is performed here! - * @hva: the host virtual address of the page whose PGSTE is to be processed - * @orc: the specific action to perform, see the ESSA_SET_* macros. - * @oldpte: the PTE will be saved there if the pointer is not NULL. - * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL. - * - * Return: 1 if the page is to be added to the CBRL, otherwise 0, - * or < 0 in case of error. -EINVAL is returned for invalid values - * of orc, -EFAULT for invalid addresses. - */ -int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, - unsigned long *oldpte, unsigned long *oldpgste) -{ - struct vm_area_struct *vma; - unsigned long pgstev; - spinlock_t *ptl; - pgste_t pgste; - pte_t *ptep; - int res = 0; - - WARN_ON_ONCE(orc > ESSA_MAX); - if (unlikely(orc > ESSA_MAX)) - return -EINVAL; - - vma = vma_lookup(mm, hva); - if (!vma || is_vm_hugetlb_page(vma)) - return -EFAULT; - ptep = get_locked_pte(mm, hva, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - pgste = pgste_get_lock(ptep); - pgstev = pgste_val(pgste); - if (oldpte) - *oldpte = pte_val(*ptep); - if (oldpgste) - *oldpgste = pgstev; - - switch (orc) { - case ESSA_GET_STATE: - break; - case ESSA_SET_STABLE: - pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); - pgstev |= _PGSTE_GPS_USAGE_STABLE; - break; - case ESSA_SET_UNUSED: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_UNUSED; - if (pte_val(*ptep) & _PAGE_INVALID) - res = 1; - break; - case ESSA_SET_VOLATILE: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_VOLATILE; - if (pte_val(*ptep) & _PAGE_INVALID) - res = 1; - break; - case ESSA_SET_POT_VOLATILE: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE; - break; - } - if (pgstev & _PGSTE_GPS_ZERO) { - pgstev |= _PGSTE_GPS_USAGE_VOLATILE; - break; - } - if (!(pgstev & PGSTE_GC_BIT)) { - pgstev |= _PGSTE_GPS_USAGE_VOLATILE; - res = 1; - break; - } - break; - case ESSA_SET_STABLE_RESIDENT: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_STABLE; - /* - * Since the resident state can go away any time after this - * call, we will not make this page resident. We can revisit - * this decision if a guest will ever start using this. - */ - break; - case ESSA_SET_STABLE_IF_RESIDENT: - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_STABLE; - } - break; - case ESSA_SET_STABLE_NODAT: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT; - break; - default: - /* we should never get here! */ - break; - } - /* If we are discarding a page, set it to logical zero */ - if (res) - pgstev |= _PGSTE_GPS_ZERO; - - pgste_val(pgste) = pgstev; - pgste_set_unlock(ptep, pgste); - pte_unmap_unlock(ptep, ptl); - return res; -} -EXPORT_SYMBOL(pgste_perform_essa); - -/** - * set_pgste_bits - set specific PGSTE bits. - * @mm: the memory context. It must have PGSTEs, no check is performed here! - * @hva: the host virtual address of the page whose PGSTE is to be processed - * @bits: a bitmask representing the bits that will be touched - * @value: the values of the bits to be written. Only the bits in the mask - * will be written. - * - * Return: 0 on success, < 0 in case of error. - */ -int set_pgste_bits(struct mm_struct *mm, unsigned long hva, - unsigned long bits, unsigned long value) -{ - struct vm_area_struct *vma; - spinlock_t *ptl; - pgste_t new; - pte_t *ptep; - - vma = vma_lookup(mm, hva); - if (!vma || is_vm_hugetlb_page(vma)) - return -EFAULT; - ptep = get_locked_pte(mm, hva, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - new = pgste_get_lock(ptep); - - pgste_val(new) &= ~bits; - pgste_val(new) |= value & bits; - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(set_pgste_bits); - -/** - * get_pgste - get the current PGSTE for the given address. - * @mm: the memory context. It must have PGSTEs, no check is performed here! - * @hva: the host virtual address of the page whose PGSTE is to be processed - * @pgstep: will be written with the current PGSTE for the given address. - * - * Return: 0 on success, < 0 in case of error. - */ -int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) -{ - struct vm_area_struct *vma; - spinlock_t *ptl; - pte_t *ptep; - - vma = vma_lookup(mm, hva); - if (!vma || is_vm_hugetlb_page(vma)) - return -EFAULT; - ptep = get_locked_pte(mm, hva, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - *pgstep = pgste_val(pgste_get(ptep)); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(get_pgste); -#endif diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 8ead999e340b..eeadff45e0e1 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -4,6 +4,8 @@ */ #include <linux/memory_hotplug.h> +#include <linux/bootmem_info.h> +#include <linux/cpufeature.h> #include <linux/memblock.h> #include <linux/pfn.h> #include <linux/mm.h> @@ -38,15 +40,21 @@ static void __ref *vmem_alloc_pages(unsigned int order) static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap) { + unsigned int nr_pages = 1 << order; + struct page *page; + if (altmap) { vmem_altmap_free(altmap, 1 << order); return; } - /* We don't expect boot memory to be removed ever. */ - if (!slab_is_available() || - WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr)))) - return; - free_pages(addr, order); + page = virt_to_page((void *)addr); + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_bootmem_page(page++); + } else { + free_pages(addr, order); + } } void *vmem_crst_alloc(unsigned long val) @@ -63,13 +71,12 @@ void *vmem_crst_alloc(unsigned long val) pte_t __ref *vmem_pte_alloc(void) { - unsigned long size = PTRS_PER_PTE * sizeof(pte_t); pte_t *pte; if (slab_is_available()) - pte = (pte_t *) page_table_alloc(&init_mm); + pte = (pte_t *)page_table_alloc(&init_mm); else - pte = (pte_t *) memblock_alloc(size, size); + pte = (pte_t *)memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!pte) return NULL; memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); @@ -79,10 +86,6 @@ pte_t __ref *vmem_pte_alloc(void) static void vmem_pte_free(unsigned long *table) { - /* We don't expect boot memory to be removed ever. */ - if (!slab_is_available() || - WARN_ON_ONCE(PageReserved(virt_to_page(table)))) - return; page_table_free(&init_mm, table); } @@ -249,12 +252,12 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, } else if (pmd_none(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE) && - MACHINE_HAS_EDAT1 && direct && + cpu_has_edat1() && direct && !debug_pagealloc_enabled()) { set_pmd(pmd, __pmd(__pa(addr) | prot)); pages++; continue; - } else if (!direct && MACHINE_HAS_EDAT1) { + } else if (!direct && cpu_has_edat1()) { void *new_page; /* @@ -327,15 +330,19 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (pud_leaf(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { + if (!direct) + vmem_free_pages(pud_deref(*pud), get_order(PUD_SIZE), altmap); pud_clear(pud); pages++; + continue; + } else { + split_pud_page(pud, addr & PUD_MASK); } - continue; } } else if (pud_none(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE) && - MACHINE_HAS_EDAT2 && direct && + cpu_has_edat2() && direct && !debug_pagealloc_enabled()) { set_pud(pud, __pud(__pa(addr) | prot)); pages++; @@ -430,9 +437,15 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) return -EINVAL; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + /* Don't mess with any tables not fully in 1:1 mapping, vmemmap & kasan area */ +#ifdef CONFIG_KASAN + if (WARN_ON_ONCE(!(start >= KASAN_SHADOW_START && end <= KASAN_SHADOW_END) && + end > __abs_lowcore)) + return -EINVAL; +#else if (WARN_ON_ONCE(end > __abs_lowcore)) return -EINVAL; +#endif for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); @@ -659,7 +672,7 @@ void __init vmem_map_init(void) * prefix page is used to return to the previous context with * an LPSWE instruction and therefore must be executable. */ - if (!static_key_enabled(&cpu_has_bear)) + if (!cpu_has_bear()) set_memory_x(0, 1); if (debug_pagealloc_enabled()) __set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size); diff --git a/arch/s390/net/Makefile b/arch/s390/net/Makefile index 8cab6deb0403..9275cf63192a 100644 --- a/arch/s390/net/Makefile +++ b/arch/s390/net/Makefile @@ -2,5 +2,5 @@ # # Arch-specific network modules # -obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_timed_may_goto.o obj-$(CONFIG_HAVE_PNETID) += pnet.o diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h deleted file mode 100644 index 7822ea92e54a..000000000000 --- a/arch/s390/net/bpf_jit.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * BPF Jit compiler defines - * - * Copyright IBM Corp. 2012,2015 - * - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - * Michael Holzheu <holzheu@linux.vnet.ibm.com> - */ - -#ifndef __ARCH_S390_NET_BPF_JIT_H -#define __ARCH_S390_NET_BPF_JIT_H - -#ifndef __ASSEMBLY__ - -#include <linux/filter.h> -#include <linux/types.h> - -#endif /* __ASSEMBLY__ */ - -/* - * Stackframe layout (packed stack): - * - * ^ high - * +---------------+ | - * | old backchain | | - * +---------------+ | - * | r15 - r6 | | - * +---------------+ | - * | 4 byte align | | - * | tail_call_cnt | | - * BFP -> +===============+ | - * | | | - * | BPF stack | | - * | | | - * R15+160 -> +---------------+ | - * | new backchain | | - * R15+152 -> +---------------+ | - * | + 152 byte SA | | - * R15 -> +---------------+ + low - * - * We get 160 bytes stack space from calling function, but only use - * 12 * 8 byte for old backchain, r15..r6, and tail_call_cnt. - * - * The stack size used by the BPF program ("BPF stack" above) is passed - * via "aux->stack_depth". - */ -#define STK_SPACE_ADD (160) -#define STK_160_UNUSED (160 - 12 * 8) -#define STK_OFF (STK_SPACE_ADD - STK_160_UNUSED) - -#define STK_OFF_R6 (160 - 11 * 8) /* Offset of r6 on stack */ -#define STK_OFF_TCCNT (160 - 12 * 8) /* Offset of tail_call_cnt on stack */ - -#endif /* __ARCH_S390_NET_BPF_JIT_H */ diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 9d440a0b729e..94128fe6be23 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -15,8 +15,7 @@ * Michael Holzheu <holzheu@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "bpf_jit" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "bpf_jit: " fmt #include <linux/netdevice.h> #include <linux/filter.h> @@ -32,7 +31,6 @@ #include <asm/set_memory.h> #include <asm/text-patching.h> #include <asm/unwind.h> -#include "bpf_jit.h" struct bpf_jit { u32 seen; /* Flags to remember seen eBPF instructions */ @@ -48,14 +46,13 @@ struct bpf_jit { int lit64; /* Current position in 64-bit literal pool */ int base_ip; /* Base address for literal pool */ int exit_ip; /* Address of exit */ - int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */ - int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ int excnt; /* Number of exception table entries */ int prologue_plt_ret; /* Return address for prologue hotpatch PLT */ int prologue_plt; /* Start of prologue hotpatch PLT */ int kern_arena; /* Pool offset of kernel arena address */ u64 user_arena; /* User arena address */ + u32 frame_off; /* Offset of struct bpf_prog from %r15 */ }; #define SEEN_MEM BIT(0) /* use mem[] for temporary storage */ @@ -127,6 +124,18 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) jit->seen_regs |= (1 << r1); } +static s32 off_to_pcrel(struct bpf_jit *jit, u32 off) +{ + return off - jit->prg; +} + +static s64 ptr_to_pcrel(struct bpf_jit *jit, const void *ptr) +{ + if (jit->prg_buf) + return (const u8 *)ptr - ((const u8 *)jit->prg_buf + jit->prg); + return 0; +} + #define REG_SET_SEEN(b1) \ ({ \ reg_set_seen(jit, b1); \ @@ -201,7 +210,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT4_PCREL_RIC(op, mask, target) \ ({ \ - int __rel = ((target) - jit->prg) / 2; \ + int __rel = off_to_pcrel(jit, target) / 2; \ _EMIT4((op) | (mask) << 20 | (__rel & 0xffff)); \ }) @@ -239,7 +248,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL_RIEB(op1, op2, b1, b2, mask, target) \ ({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ + unsigned int rel = off_to_pcrel(jit, target) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \ (op2) | (mask) << 12); \ REG_SET_SEEN(b1); \ @@ -248,7 +257,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL_RIEC(op1, op2, b1, imm, mask, target) \ ({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ + unsigned int rel = off_to_pcrel(jit, target) / 2; \ _EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \ (rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \ REG_SET_SEEN(b1); \ @@ -257,29 +266,41 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \ ({ \ - int rel = (addrs[(i) + (off) + 1] - jit->prg) / 2; \ + int rel = off_to_pcrel(jit, addrs[(i) + (off) + 1]) / 2;\ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) +static void emit6_pcrel_ril(struct bpf_jit *jit, u32 op, s64 pcrel) +{ + u32 pc32dbl = (s32)(pcrel / 2); + + _EMIT6(op | pc32dbl >> 16, pc32dbl & 0xffff); +} + +static void emit6_pcrel_rilb(struct bpf_jit *jit, u32 op, u8 b, s64 pcrel) +{ + emit6_pcrel_ril(jit, op | reg_high(b) << 16, pcrel); + REG_SET_SEEN(b); +} + #define EMIT6_PCREL_RILB(op, b, target) \ -({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ - _EMIT6((op) | reg_high(b) << 16 | rel >> 16, rel & 0xffff);\ - REG_SET_SEEN(b); \ -}) + emit6_pcrel_rilb(jit, op, b, off_to_pcrel(jit, target)) -#define EMIT6_PCREL_RIL(op, target) \ -({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ - _EMIT6((op) | rel >> 16, rel & 0xffff); \ -}) +#define EMIT6_PCREL_RILB_PTR(op, b, target_ptr) \ + emit6_pcrel_rilb(jit, op, b, ptr_to_pcrel(jit, target_ptr)) + +static void emit6_pcrel_rilc(struct bpf_jit *jit, u32 op, u8 mask, s64 pcrel) +{ + emit6_pcrel_ril(jit, op | mask << 20, pcrel); +} #define EMIT6_PCREL_RILC(op, mask, target) \ -({ \ - EMIT6_PCREL_RIL((op) | (mask) << 20, (target)); \ -}) + emit6_pcrel_rilc(jit, op, mask, off_to_pcrel(jit, target)) + +#define EMIT6_PCREL_RILC_PTR(op, mask, target_ptr) \ + emit6_pcrel_rilc(jit, op, mask, ptr_to_pcrel(jit, target_ptr)) #define _EMIT6_IMM(op, imm) \ ({ \ @@ -404,11 +425,25 @@ static void jit_fill_hole(void *area, unsigned int size) } /* + * Caller-allocated part of the frame. + * Thanks to packed stack, its otherwise unused initial part can be used for + * the BPF stack and for the next frame. + */ +struct prog_frame { + u64 unused[8]; + /* BPF stack starts here and grows towards 0 */ + u32 tail_call_cnt; + u32 pad; + u64 r6[10]; /* r6 - r15 */ + u64 backchain; +} __packed; + +/* * Save registers from "rs" (register start) to "re" (register end) on stack */ static void save_regs(struct bpf_jit *jit, u32 rs, u32 re) { - u32 off = STK_OFF_R6 + (rs - 6) * 8; + u32 off = offsetof(struct prog_frame, r6) + (rs - 6) * 8; if (rs == re) /* stg %rs,off(%r15) */ @@ -421,12 +456,9 @@ static void save_regs(struct bpf_jit *jit, u32 rs, u32 re) /* * Restore registers from "rs" (register start) to "re" (register end) on stack */ -static void restore_regs(struct bpf_jit *jit, u32 rs, u32 re, u32 stack_depth) +static void restore_regs(struct bpf_jit *jit, u32 rs, u32 re) { - u32 off = STK_OFF_R6 + (rs - 6) * 8; - - if (jit->seen & SEEN_STACK) - off += STK_OFF + stack_depth; + u32 off = jit->frame_off + offsetof(struct prog_frame, r6) + (rs - 6) * 8; if (rs == re) /* lg %rs,off(%r15) */ @@ -470,8 +502,7 @@ static int get_end(u16 seen_regs, int start) * Save and restore clobbered registers (6-15) on stack. * We save/restore registers in chunks with gap >= 2 registers. */ -static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth, - u16 extra_regs) +static void save_restore_regs(struct bpf_jit *jit, int op, u16 extra_regs) { u16 seen_regs = jit->seen_regs | extra_regs; const int last = 15, save_restore_size = 6; @@ -494,7 +525,7 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth, if (op == REGS_SAVE) save_regs(jit, rs, re); else - restore_regs(jit, rs, re, stack_depth); + restore_regs(jit, rs, re); re++; } while (re <= last); } @@ -503,7 +534,7 @@ static void bpf_skip(struct bpf_jit *jit, int size) { if (size >= 6 && !is_valid_rel(size)) { /* brcl 0xf,size */ - EMIT6_PCREL_RIL(0xc0f4000000, size); + EMIT6_PCREL_RILC(0xc0040000, 0xf, size); size -= 6; } else if (size >= 4 && is_valid_rel(size)) { /* brc 0xf,size */ @@ -544,18 +575,27 @@ static void bpf_jit_plt(struct bpf_plt *plt, void *ret, void *target) { memcpy(plt, &bpf_plt, sizeof(*plt)); plt->ret = ret; - plt->target = target; + /* + * (target == NULL) implies that the branch to this PLT entry was + * patched and became a no-op. However, some CPU could have jumped + * to this PLT entry before patching and may be still executing it. + * + * Since the intention in this case is to make the PLT entry a no-op, + * make the target point to the return label instead of NULL. + */ + plt->target = target ?: ret; } /* * Emit function prologue * * Save registers and create stack frame if necessary. - * See stack frame layout description in "bpf_jit.h"! + * Stack frame layout is described by struct prog_frame. */ -static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp, - u32 stack_depth) +static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp) { + BUILD_BUG_ON(sizeof(struct prog_frame) != STACK_FRAME_OVERHEAD); + /* No-op for hotpatching */ /* brcl 0,prologue_plt */ EMIT6_PCREL_RILC(0xc0040000, 0, jit->prologue_plt); @@ -563,8 +603,9 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp, if (!bpf_is_subprog(fp)) { /* Initialize the tail call counter in the main program. */ - /* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */ - _EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT); + /* xc tail_call_cnt(4,%r15),tail_call_cnt(%r15) */ + _EMIT6(0xd703f000 | offsetof(struct prog_frame, tail_call_cnt), + 0xf000 | offsetof(struct prog_frame, tail_call_cnt)); } else { /* * Skip the tail call counter initialization in subprograms. @@ -587,7 +628,7 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp, jit->seen_regs |= NVREGS; } else { /* Save registers */ - save_restore_regs(jit, REGS_SAVE, stack_depth, + save_restore_regs(jit, REGS_SAVE, fp->aux->exception_boundary ? NVREGS : 0); } /* Setup literal pool */ @@ -605,77 +646,44 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp, } /* Setup stack and backchain */ if (is_first_pass(jit) || (jit->seen & SEEN_STACK)) { - if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) - /* lgr %w1,%r15 (backchain) */ - EMIT4(0xb9040000, REG_W1, REG_15); - /* la %bfp,STK_160_UNUSED(%r15) (BPF frame pointer) */ - EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, STK_160_UNUSED); - /* aghi %r15,-STK_OFF */ - EMIT4_IMM(0xa70b0000, REG_15, -(STK_OFF + stack_depth)); - if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) - /* stg %w1,152(%r15) (backchain) */ - EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, - REG_15, 152); + /* lgr %w1,%r15 (backchain) */ + EMIT4(0xb9040000, REG_W1, REG_15); + /* la %bfp,unused_end(%r15) (BPF frame pointer) */ + EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, + offsetofend(struct prog_frame, unused)); + /* aghi %r15,-frame_off */ + EMIT4_IMM(0xa70b0000, REG_15, -jit->frame_off); + /* stg %w1,backchain(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, + REG_15, + offsetof(struct prog_frame, backchain)); } } /* - * Emit an expoline for a jump that follows + * Jump using a register either directly or via an expoline thunk */ -static void emit_expoline(struct bpf_jit *jit) -{ - /* exrl %r0,.+10 */ - EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); - /* j . */ - EMIT4_PCREL(0xa7f40000, 0); -} - -/* - * Emit __s390_indirect_jump_r1 thunk if necessary - */ -static void emit_r1_thunk(struct bpf_jit *jit) -{ - if (nospec_uses_trampoline()) { - jit->r1_thunk_ip = jit->prg; - emit_expoline(jit); - /* br %r1 */ - _EMIT2(0x07f1); - } -} - -/* - * Call r1 either directly or via __s390_indirect_jump_r1 thunk - */ -static void call_r1(struct bpf_jit *jit) -{ - if (nospec_uses_trampoline()) - /* brasl %r14,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); - else - /* basr %r14,%r1 */ - EMIT2(0x0d00, REG_14, REG_1); -} +#define EMIT_JUMP_REG(reg) do { \ + if (nospec_uses_trampoline()) \ + /* brcl 0xf,__s390_indirect_jump_rN */ \ + EMIT6_PCREL_RILC_PTR(0xc0040000, 0x0f, \ + __s390_indirect_jump_r ## reg); \ + else \ + /* br %rN */ \ + _EMIT2(0x07f0 | reg); \ +} while (0) /* * Function epilogue */ -static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) +static void bpf_jit_epilogue(struct bpf_jit *jit) { jit->exit_ip = jit->prg; /* Load exit code: lgr %r2,%b0 */ EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ - save_restore_regs(jit, REGS_RESTORE, stack_depth, 0); - if (nospec_uses_trampoline()) { - jit->r14_thunk_ip = jit->prg; - /* Generate __s390_indirect_jump_r14 thunk */ - emit_expoline(jit); - } - /* br %r14 */ - _EMIT2(0x07fe); - - if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) - emit_r1_thunk(jit); + save_restore_regs(jit, REGS_RESTORE, 0); + EMIT_JUMP_REG(14); jit->prg = ALIGN(jit->prg, 8); jit->prologue_plt = jit->prg; @@ -822,25 +830,34 @@ static int bpf_jit_probe_post(struct bpf_jit *jit, struct bpf_prog *fp, } /* - * Sign-extend the register if necessary + * Sign- or zero-extend the register if necessary */ -static int sign_extend(struct bpf_jit *jit, int r, u8 size, u8 flags) +static int sign_zero_extend(struct bpf_jit *jit, int r, u8 size, u8 flags) { - if (!(flags & BTF_FMODEL_SIGNED_ARG)) - return 0; - switch (size) { case 1: - /* lgbr %r,%r */ - EMIT4(0xb9060000, r, r); + if (flags & BTF_FMODEL_SIGNED_ARG) + /* lgbr %r,%r */ + EMIT4(0xb9060000, r, r); + else + /* llgcr %r,%r */ + EMIT4(0xb9840000, r, r); return 0; case 2: - /* lghr %r,%r */ - EMIT4(0xb9070000, r, r); + if (flags & BTF_FMODEL_SIGNED_ARG) + /* lghr %r,%r */ + EMIT4(0xb9070000, r, r); + else + /* llghr %r,%r */ + EMIT4(0xb9850000, r, r); return 0; case 4: - /* lgfr %r,%r */ - EMIT4(0xb9140000, r, r); + if (flags & BTF_FMODEL_SIGNED_ARG) + /* lgfr %r,%r */ + EMIT4(0xb9140000, r, r); + else + /* llgfr %r,%r */ + EMIT4(0xb9160000, r, r); return 0; case 8: return 0; @@ -856,7 +873,7 @@ static int sign_extend(struct bpf_jit *jit, int r, u8 size, u8 flags) * stack space for the large switch statement. */ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, - int i, bool extra_pass, u32 stack_depth) + int i, bool extra_pass) { struct bpf_insn *insn = &fp->insnsi[i]; s32 branch_oc_off = insn->off; @@ -1767,19 +1784,21 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; + /* * Copy the tail call counter to where the callee expects it. - * - * Note 1: The callee can increment the tail call counter, but - * we do not load it back, since the x86 JIT does not do this - * either. - * - * Note 2: We assume that the verifier does not let us call the - * main program, which clears the tail call counter on entry. */ - /* mvc STK_OFF_TCCNT(4,%r15),N(%r15) */ - _EMIT6(0xd203f000 | STK_OFF_TCCNT, - 0xf000 | (STK_OFF_TCCNT + STK_OFF + stack_depth)); + + if (insn->src_reg == BPF_PSEUDO_CALL) + /* + * mvc tail_call_cnt(4,%r15), + * frame_off+tail_call_cnt(%r15) + */ + _EMIT6(0xd203f000 | offsetof(struct prog_frame, + tail_call_cnt), + 0xf000 | (jit->frame_off + + offsetof(struct prog_frame, + tail_call_cnt))); /* Sign-extend the kfunc arguments. */ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { @@ -1788,19 +1807,45 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, return -1; for (j = 0; j < m->nr_args; j++) { - if (sign_extend(jit, BPF_REG_1 + j, - m->arg_size[j], - m->arg_flags[j])) + if (sign_zero_extend(jit, BPF_REG_1 + j, + m->arg_size[j], + m->arg_flags[j])) return -1; } } - /* lgrl %w1,func */ - EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func)); - /* %r1() */ - call_r1(jit); - /* lgr %b0,%r2: load return value into %b0 */ - EMIT4(0xb9040000, BPF_REG_0, REG_2); + if ((void *)func == arch_bpf_timed_may_goto) { + /* + * arch_bpf_timed_may_goto() has a special ABI: the + * parameters are in BPF_REG_AX and BPF_REG_10; the + * return value is in BPF_REG_AX; and all GPRs except + * REG_W0, REG_W1, and BPF_REG_AX are callee-saved. + */ + + /* brasl %r0,func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_0, (void *)func); + } else { + /* brasl %r14,func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, (void *)func); + /* lgr %b0,%r2: load return value into %b0 */ + EMIT4(0xb9040000, BPF_REG_0, REG_2); + } + + /* + * Copy the potentially updated tail call counter back. + */ + + if (insn->src_reg == BPF_PSEUDO_CALL) + /* + * mvc frame_off+tail_call_cnt(%r15), + * tail_call_cnt(4,%r15) + */ + _EMIT6(0xd203f000 | (jit->frame_off + + offsetof(struct prog_frame, + tail_call_cnt)), + 0xf000 | offsetof(struct prog_frame, + tail_call_cnt)); + break; } case BPF_JMP | BPF_TAIL_CALL: { @@ -1826,22 +1871,21 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, jit->prg); /* - * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) + * if (tail_call_cnt >= MAX_TAIL_CALL_CNT) * goto out; + * + * tail_call_cnt is read into %w0, which needs to be preserved + * until it's incremented and flushed. */ - if (jit->seen & SEEN_STACK) - off = STK_OFF_TCCNT + STK_OFF + stack_depth; - else - off = STK_OFF_TCCNT; - /* lhi %w0,1 */ - EMIT4_IMM(0xa7080000, REG_W0, 1); - /* laal %w1,%w0,off(%r15) */ - EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off); - /* clij %w1,MAX_TAIL_CALL_CNT-1,0x2,out */ + off = jit->frame_off + + offsetof(struct prog_frame, tail_call_cnt); + /* ly %w0,off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0058, REG_W0, REG_0, REG_15, off); + /* clij %w0,MAX_TAIL_CALL_CNT,0xa,out */ patch_2_clij = jit->prg; - EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT - 1, - 2, jit->prg); + EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W0, MAX_TAIL_CALL_CNT, + 0xa, jit->prg); /* * prog = array->ptrs[index]; @@ -1860,10 +1904,16 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, patch_3_brc = jit->prg; EMIT4_PCREL_RIC(0xa7040000, 8, jit->prg); + /* tail_call_cnt++; */ + /* ahi %w0,1 */ + EMIT4_IMM(0xa70a0000, REG_W0, 1); + /* sty %w0,off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0050, REG_W0, REG_0, REG_15, off); + /* * Restore registers before calling function */ - save_restore_regs(jit, REGS_RESTORE, stack_depth, 0); + save_restore_regs(jit, REGS_RESTORE, 0); /* * goto *(prog->bpf_func + tail_call_start); @@ -1877,7 +1927,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* aghi %r1,tail_call_start */ EMIT4_IMM(0xa70b0000, REG_1, jit->tail_call_start); /* brcl 0xf,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->r1_thunk_ip); + EMIT6_PCREL_RILC_PTR(0xc0040000, 0xf, + __s390_indirect_jump_r1); } else { /* bc 0xf,tail_call_start(%r1) */ _EMIT4(0x47f01000 + jit->tail_call_start); @@ -2155,7 +2206,7 @@ static int bpf_set_addr(struct bpf_jit *jit, int i) * Compile eBPF program into s390x code */ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, - bool extra_pass, u32 stack_depth) + bool extra_pass) { int i, insn_count, lit32_size, lit64_size; u64 kern_arena; @@ -2164,24 +2215,30 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, jit->lit64 = jit->lit64_start; jit->prg = 0; jit->excnt = 0; + if (is_first_pass(jit) || (jit->seen & SEEN_STACK)) + jit->frame_off = sizeof(struct prog_frame) - + offsetofend(struct prog_frame, unused) + + round_up(fp->aux->stack_depth, 8); + else + jit->frame_off = 0; kern_arena = bpf_arena_get_kern_vm_start(fp->aux->arena); if (kern_arena) jit->kern_arena = _EMIT_CONST_U64(kern_arena); jit->user_arena = bpf_arena_get_user_vm_start(fp->aux->arena); - bpf_jit_prologue(jit, fp, stack_depth); + bpf_jit_prologue(jit, fp); if (bpf_set_addr(jit, 0) < 0) return -1; for (i = 0; i < fp->len; i += insn_count) { - insn_count = bpf_jit_insn(jit, fp, i, extra_pass, stack_depth); + insn_count = bpf_jit_insn(jit, fp, i, extra_pass); if (insn_count < 0) return -1; /* Next instruction address */ if (bpf_set_addr(jit, i + insn_count) < 0) return -1; } - bpf_jit_epilogue(jit, stack_depth); + bpf_jit_epilogue(jit); lit32_size = jit->lit32 - jit->lit32_start; lit64_size = jit->lit64 - jit->lit64_start; @@ -2255,39 +2312,22 @@ static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit, /* * Compile eBPF program "fp" */ -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *fp) { - u32 stack_depth = round_up(fp->aux->stack_depth, 8); - struct bpf_prog *tmp, *orig_fp = fp; struct bpf_binary_header *header; struct s390_jit_data *jit_data; - bool tmp_blinded = false; bool extra_pass = false; struct bpf_jit jit; int pass; if (!fp->jit_requested) - return orig_fp; - - tmp = bpf_jit_blind_constants(fp); - /* - * If blinding was requested and we failed during blinding, - * we must fall back to the interpreter. - */ - if (IS_ERR(tmp)) - return orig_fp; - if (tmp != fp) { - tmp_blinded = true; - fp = tmp; - } + return fp; jit_data = fp->aux->jit_data; if (!jit_data) { - jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); - if (!jit_data) { - fp = orig_fp; - goto out; - } + jit_data = kzalloc_obj(*jit_data); + if (!jit_data) + return fp; fp->aux->jit_data = jit_data; } if (jit_data->ctx.addrs) { @@ -2300,34 +2340,27 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) memset(&jit, 0, sizeof(jit)); jit.addrs = kvcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); - if (jit.addrs == NULL) { - fp = orig_fp; - goto free_addrs; - } + if (jit.addrs == NULL) + goto out_err; /* * Three initial passes: * - 1/2: Determine clobbered registers * - 3: Calculate program size and addrs array */ for (pass = 1; pass <= 3; pass++) { - if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) { - fp = orig_fp; - goto free_addrs; - } + if (bpf_jit_prog(&jit, fp, extra_pass)) + goto out_err; } /* * Final pass: Allocate and generate program */ header = bpf_jit_alloc(&jit, fp); - if (!header) { - fp = orig_fp; - goto free_addrs; - } + if (!header) + goto out_err; skip_init_ctx: - if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) { + if (bpf_jit_prog(&jit, fp, extra_pass)) { bpf_jit_binary_free(header); - fp = orig_fp; - goto free_addrs; + goto out_err; } if (bpf_jit_enable > 1) { bpf_jit_dump(fp->len, jit.size, pass, jit.prg_buf); @@ -2336,8 +2369,7 @@ skip_init_ctx: if (!fp->is_func || extra_pass) { if (bpf_jit_binary_lock_ro(header)) { bpf_jit_binary_free(header); - fp = orig_fp; - goto free_addrs; + goto out_err; } } else { jit_data->header = header; @@ -2355,11 +2387,16 @@ free_addrs: kfree(jit_data); fp->aux->jit_data = NULL; } -out: - if (tmp_blinded) - bpf_jit_prog_release_other(fp, fp == orig_fp ? - tmp : orig_fp); + return fp; + +out_err: + if (extra_pass) { + fp->bpf_func = NULL; + fp->jited = 0; + fp->jited_len = 0; + } + goto free_addrs; } bool bpf_jit_supports_kfunc_call(void) @@ -2372,8 +2409,9 @@ bool bpf_jit_supports_far_kfunc_call(void) return true; } -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *old_addr, void *new_addr) +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr) { struct bpf_plt expected_plt, current_plt, new_plt, *plt; struct { @@ -2390,7 +2428,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, if (insn.opc != (0xc004 | (old_addr ? 0xf0 : 0))) return -EINVAL; - if (t == BPF_MOD_JUMP && + if ((new_t == BPF_MOD_JUMP || old_t == BPF_MOD_JUMP) && insn.disp == ((char *)new_addr - (char *)ip) >> 1) { /* * The branch already points to the destination, @@ -2439,8 +2477,8 @@ struct bpf_tramp_jit { int ip_off; /* For bpf_get_func_ip(), has to be at * (ctx - 16) */ - int arg_cnt_off; /* For bpf_get_func_arg_cnt(), has to be at - * (ctx - 8) + int func_meta_off; /* For bpf_get_func_arg_cnt()/fsession, has + * to be at (ctx - 8) */ int bpf_args_off; /* Offset of BPF_PROG context, which consists * of BPF arguments followed by return value @@ -2465,6 +2503,13 @@ static void load_imm64(struct bpf_jit *jit, int dst_reg, u64 val) EMIT6_IMM(0xc00d0000, dst_reg, val); } +static void emit_store_stack_imm64(struct bpf_jit *jit, int tmp_reg, int stack_off, u64 imm) +{ + load_imm64(jit, tmp_reg, imm); + /* stg %tmp_reg,stack_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, tmp_reg, REG_0, REG_15, stack_off); +} + static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, const struct btf_func_model *m, struct bpf_tramp_link *tlink, bool save_ret) @@ -2479,24 +2524,19 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, * run_ctx.cookie = tlink->cookie; */ - /* %r0 = tlink->cookie */ - load_imm64(jit, REG_W0, tlink->cookie); - /* stg %r0,cookie_off(%r15) */ - EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W0, REG_0, REG_15, cookie_off); + emit_store_stack_imm64(jit, REG_W0, cookie_off, tlink->cookie); /* * if ((start = __bpf_prog_enter(p, &run_ctx)) == 0) * goto skip; */ - /* %r1 = __bpf_prog_enter */ - load_imm64(jit, REG_1, (u64)bpf_trampoline_enter(p)); /* %r2 = p */ load_imm64(jit, REG_2, (u64)p); /* la %r3,run_ctx_off(%r15) */ EMIT4_DISP(0x41000000, REG_3, REG_15, tjit->run_ctx_off); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_prog_enter */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, bpf_trampoline_enter(p)); /* ltgr %r7,%r2 */ EMIT4(0xb9020000, REG_7, REG_2); /* brcl 8,skip */ @@ -2507,18 +2547,16 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, * retval = bpf_func(args, p->insnsi); */ - /* %r1 = p->bpf_func */ - load_imm64(jit, REG_1, (u64)p->bpf_func); /* la %r2,bpf_args_off(%r15) */ EMIT4_DISP(0x41000000, REG_2, REG_15, tjit->bpf_args_off); /* %r3 = p->insnsi */ if (!p->jited) load_imm64(jit, REG_3, (u64)p->insnsi); - /* %r1() */ - call_r1(jit); + /* brasl %r14,p->bpf_func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, p->bpf_func); /* stg %r2,retval_off(%r15) */ if (save_ret) { - if (sign_extend(jit, REG_2, m->ret_size, m->ret_flags)) + if (sign_zero_extend(jit, REG_2, m->ret_size, m->ret_flags)) return -1; EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, tjit->retval_off); @@ -2532,16 +2570,36 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, * __bpf_prog_exit(p, start, &run_ctx); */ - /* %r1 = __bpf_prog_exit */ - load_imm64(jit, REG_1, (u64)bpf_trampoline_exit(p)); /* %r2 = p */ load_imm64(jit, REG_2, (u64)p); /* lgr %r3,%r7 */ EMIT4(0xb9040000, REG_3, REG_7); /* la %r4,run_ctx_off(%r15) */ EMIT4_DISP(0x41000000, REG_4, REG_15, tjit->run_ctx_off); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_prog_exit */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, bpf_trampoline_exit(p)); + + return 0; +} + +static int invoke_bpf(struct bpf_tramp_jit *tjit, + const struct btf_func_model *m, + struct bpf_tramp_links *tl, bool save_ret, + u64 func_meta, int cookie_off) +{ + int i, cur_cookie = (tjit->bpf_args_off - cookie_off) / sizeof(u64); + struct bpf_jit *jit = &tjit->common; + + for (i = 0; i < tl->nr_links; i++) { + if (bpf_prog_calls_session_cookie(tl->links[i])) { + u64 meta = func_meta | ((u64)cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT); + + emit_store_stack_imm64(jit, REG_0, tjit->func_meta_off, meta); + cur_cookie--; + } + if (invoke_bpf_prog(tjit, m, tl->links[i], save_ret)) + return -EINVAL; + } return 0; } @@ -2575,8 +2633,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; int nr_bpf_args, nr_reg_args, nr_stack_args; + int cookie_cnt, cookie_off, fsession_cnt; struct bpf_jit *jit = &tjit->common; int arg, bpf_arg_off; + u64 func_meta; int i, j; /* Support as many stack arguments as "mvc" instruction can handle. */ @@ -2585,9 +2645,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, if (nr_stack_args > MAX_NR_STACK_ARGS) return -ENOTSUPP; - /* Return to %r14, since func_addr and %r0 are not available. */ - if ((!func_addr && !(flags & BPF_TRAMP_F_ORIG_STACK)) || - (flags & BPF_TRAMP_F_INDIRECT)) + /* Return to %r14 in the struct_ops case. */ + if (flags & BPF_TRAMP_F_INDIRECT) flags |= BPF_TRAMP_F_SKIP_FRAME; /* @@ -2609,6 +2668,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, return -ENOTSUPP; } + cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + fsession_cnt = bpf_fsession_cnt(tlinks); + /* * Calculate the stack layout. */ @@ -2621,8 +2683,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, tjit->backchain_off = tjit->stack_size - sizeof(u64); tjit->stack_args_off = alloc_stack(tjit, nr_stack_args * sizeof(u64)); tjit->reg_args_off = alloc_stack(tjit, nr_reg_args * sizeof(u64)); + cookie_off = alloc_stack(tjit, cookie_cnt * sizeof(u64)); tjit->ip_off = alloc_stack(tjit, sizeof(u64)); - tjit->arg_cnt_off = alloc_stack(tjit, sizeof(u64)); + tjit->func_meta_off = alloc_stack(tjit, sizeof(u64)); tjit->bpf_args_off = alloc_stack(tjit, nr_bpf_args * sizeof(u64)); tjit->retval_off = alloc_stack(tjit, sizeof(u64)); tjit->r7_r8_off = alloc_stack(tjit, 2 * sizeof(u64)); @@ -2645,9 +2708,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, /* stg %r1,backchain_off(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_1, REG_0, REG_15, tjit->backchain_off); - /* mvc tccnt_off(4,%r15),stack_size+STK_OFF_TCCNT(%r15) */ + /* mvc tccnt_off(4,%r15),stack_size+tail_call_cnt(%r15) */ _EMIT6(0xd203f000 | tjit->tccnt_off, - 0xf000 | (tjit->stack_size + STK_OFF_TCCNT)); + 0xf000 | (tjit->stack_size + + offsetof(struct prog_frame, tail_call_cnt))); /* stmg %r2,%rN,fwd_reg_args_off(%r15) */ if (nr_reg_args) EMIT6_DISP_LH(0xeb000000, 0x0024, REG_2, @@ -2701,9 +2765,6 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, /* lgr %r8,%r0 */ EMIT4(0xb9040000, REG_8, REG_0); - } else { - /* %r8 = func_addr + S390X_PATCH_SIZE */ - load_imm64(jit, REG_8, (u64)func_addr + S390X_PATCH_SIZE); } /* @@ -2711,36 +2772,37 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, * arg_cnt = m->nr_args; */ - if (flags & BPF_TRAMP_F_IP_ARG) { - /* %r0 = func_addr */ - load_imm64(jit, REG_0, (u64)func_addr); - /* stg %r0,ip_off(%r15) */ - EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15, - tjit->ip_off); - } - /* lghi %r0,nr_bpf_args */ - EMIT4_IMM(0xa7090000, REG_0, nr_bpf_args); - /* stg %r0,arg_cnt_off(%r15) */ + if (flags & BPF_TRAMP_F_IP_ARG) + emit_store_stack_imm64(jit, REG_0, tjit->ip_off, (u64)func_addr); + func_meta = nr_bpf_args; + /* lghi %r0,func_meta */ + EMIT4_IMM(0xa7090000, REG_0, func_meta); + /* stg %r0,func_meta_off(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15, - tjit->arg_cnt_off); + tjit->func_meta_off); if (flags & BPF_TRAMP_F_CALL_ORIG) { /* * __bpf_tramp_enter(im); */ - /* %r1 = __bpf_tramp_enter */ - load_imm64(jit, REG_1, (u64)__bpf_tramp_enter); /* %r2 = im */ load_imm64(jit, REG_2, (u64)im); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_tramp_enter */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, __bpf_tramp_enter); } - for (i = 0; i < fentry->nr_links; i++) - if (invoke_bpf_prog(tjit, m, fentry->links[i], - flags & BPF_TRAMP_F_RET_FENTRY_RET)) - return -EINVAL; + if (fsession_cnt) { + /* Clear all the session cookies' value. */ + for (i = 0; i < cookie_cnt; i++) + emit_store_stack_imm64(jit, REG_0, cookie_off + 8 * i, 0); + /* Clear the return value to make sure fentry always gets 0. */ + emit_store_stack_imm64(jit, REG_0, tjit->retval_off, 0); + } + + if (invoke_bpf(tjit, m, fentry, flags & BPF_TRAMP_F_RET_FENTRY_RET, + func_meta, cookie_off)) + return -EINVAL; if (fmod_ret->nr_links) { /* @@ -2784,15 +2846,28 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, (nr_stack_args * sizeof(u64) - 1) << 16 | tjit->stack_args_off, 0xf000 | tjit->orig_stack_args_off); - /* mvc STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */ - _EMIT6(0xd203f000 | STK_OFF_TCCNT, 0xf000 | tjit->tccnt_off); - /* lgr %r1,%r8 */ - EMIT4(0xb9040000, REG_1, REG_8); - /* %r1() */ - call_r1(jit); + /* mvc tail_call_cnt(4,%r15),tccnt_off(%r15) */ + _EMIT6(0xd203f000 | offsetof(struct prog_frame, tail_call_cnt), + 0xf000 | tjit->tccnt_off); + if (flags & BPF_TRAMP_F_ORIG_STACK) { + if (nospec_uses_trampoline()) + /* brasl %r14,__s390_indirect_jump_r8 */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, + __s390_indirect_jump_r8); + else + /* basr %r14,%r8 */ + EMIT2(0x0d00, REG_14, REG_8); + } else { + /* brasl %r14,func_addr+S390X_PATCH_SIZE */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, + func_addr + S390X_PATCH_SIZE); + } /* stg %r2,retval_off(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, tjit->retval_off); + /* mvc tccnt_off(%r15),tail_call_cnt(4,%r15) */ + _EMIT6(0xd203f000 | tjit->tccnt_off, + 0xf000 | offsetof(struct prog_frame, tail_call_cnt)); im->ip_after_call = jit->prg_buf + jit->prg; @@ -2804,11 +2879,16 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, EMIT6_PCREL_RILC(0xc0040000, 0, (u64)im->ip_epilogue); } + /* Set the "is_return" flag for fsession. */ + func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT); + if (fsession_cnt) + emit_store_stack_imm64(jit, REG_W0, tjit->func_meta_off, + func_meta); + /* do_fexit: */ tjit->do_fexit = jit->prg; - for (i = 0; i < fexit->nr_links; i++) - if (invoke_bpf_prog(tjit, m, fexit->links[i], false)) - return -EINVAL; + if (invoke_bpf(tjit, m, fexit, false, func_meta, cookie_off)) + return -EINVAL; if (flags & BPF_TRAMP_F_CALL_ORIG) { im->ip_epilogue = jit->prg_buf + jit->prg; @@ -2817,12 +2897,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, * __bpf_tramp_exit(im); */ - /* %r1 = __bpf_tramp_exit */ - load_imm64(jit, REG_1, (u64)__bpf_tramp_exit); /* %r2 = im */ load_imm64(jit, REG_2, (u64)im); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_tramp_exit */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, __bpf_tramp_exit); } /* lmg %r2,%rN,reg_args_off(%r15) */ @@ -2831,7 +2909,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, REG_2 + (nr_reg_args - 1), REG_15, tjit->reg_args_off); /* lgr %r1,%r8 */ - if (!(flags & BPF_TRAMP_F_SKIP_FRAME)) + if (!(flags & BPF_TRAMP_F_SKIP_FRAME) && + (flags & BPF_TRAMP_F_ORIG_STACK)) EMIT4(0xb9040000, REG_1, REG_8); /* lmg %r7,%r8,r7_r8_off(%r15) */ EMIT6_DISP_LH(0xeb000000, 0x0004, REG_7, REG_8, REG_15, @@ -2842,23 +2921,20 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, if (flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET)) EMIT6_DISP_LH(0xe3000000, 0x0004, REG_2, REG_0, REG_15, tjit->retval_off); - /* mvc stack_size+STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */ - _EMIT6(0xd203f000 | (tjit->stack_size + STK_OFF_TCCNT), + /* mvc stack_size+tail_call_cnt(4,%r15),tccnt_off(%r15) */ + _EMIT6(0xd203f000 | (tjit->stack_size + + offsetof(struct prog_frame, tail_call_cnt)), 0xf000 | tjit->tccnt_off); /* aghi %r15,stack_size */ EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size); - /* Emit an expoline for the following indirect jump. */ - if (nospec_uses_trampoline()) - emit_expoline(jit); if (flags & BPF_TRAMP_F_SKIP_FRAME) - /* br %r14 */ - _EMIT2(0x07fe); + EMIT_JUMP_REG(14); + else if (flags & BPF_TRAMP_F_ORIG_STACK) + EMIT_JUMP_REG(1); else - /* br %r1 */ - _EMIT2(0x07f1); - - emit_r1_thunk(jit); - + /* brcl 0xf,func_addr+S390X_PATCH_SIZE */ + EMIT6_PCREL_RILC_PTR(0xc0040000, 0xf, + func_addr + S390X_PATCH_SIZE); return 0; } @@ -2917,12 +2993,28 @@ bool bpf_jit_supports_arena(void) return true; } +bool bpf_jit_supports_fsession(void) +{ + return true; +} + bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) { - /* - * Currently the verifier uses this function only to check which - * atomic stores to arena are supported, and they all are. - */ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (bpf_atomic_is_load_store(insn)) + return false; + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; + } return true; } @@ -2960,3 +3052,8 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *, u64, u64, u64), prev_addr = addr; } } + +bool bpf_jit_supports_timed_may_goto(void) +{ + return true; +} diff --git a/arch/s390/net/bpf_timed_may_goto.S b/arch/s390/net/bpf_timed_may_goto.S new file mode 100644 index 000000000000..06f567a460d7 --- /dev/null +++ b/arch/s390/net/bpf_timed_may_goto.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/export.h> +#include <linux/linkage.h> +#include <asm/asm-offsets.h> +#include <asm/nospec-insn.h> + +#define R2_OFF 0 +#define R5_OFF (R2_OFF + (5 - 2 + 1) * 8) +#define R14_OFF (R5_OFF + 8) +#define RETADDR_OFF (R14_OFF + 8) +#define R15_OFF (RETADDR_OFF + 8) +#define BACKCHAIN_OFF (R15_OFF + 8) +#define FRAME_SIZE (BACKCHAIN_OFF + 8) +#define FRAME_OFF (STACK_FRAME_OVERHEAD - FRAME_SIZE) +#if (FRAME_OFF + BACKCHAIN_OFF) != __SF_BACKCHAIN +#error Stack frame layout calculation is broken +#endif + + GEN_BR_THUNK %r1 + +SYM_FUNC_START(arch_bpf_timed_may_goto) + /* + * This function has a special ABI: the parameters are in %r12 and + * %r13; the return value is in %r12; all GPRs except %r0, %r1, and + * %r12 are callee-saved; and the return address is in %r0. + */ + stmg %r2,%r5,FRAME_OFF+R2_OFF(%r15) + stg %r14,FRAME_OFF+R14_OFF(%r15) + stg %r0,FRAME_OFF+RETADDR_OFF(%r15) + stg %r15,FRAME_OFF+R15_OFF(%r15) + lgr %r1,%r15 + lay %r15,-FRAME_SIZE(%r15) + stg %r1,__SF_BACKCHAIN(%r15) + + lay %r2,0(%r12,%r13) + brasl %r14,bpf_check_timed_may_goto + lgr %r12,%r2 + + lg %r15,FRAME_SIZE+FRAME_OFF+R15_OFF(%r15) + lmg %r2,%r5,FRAME_OFF+R2_OFF(%r15) + lg %r14,FRAME_OFF+R14_OFF(%r15) + lg %r1,FRAME_OFF+RETADDR_OFF(%r15) + BR_EX %r1 +SYM_FUNC_END(arch_bpf_timed_may_goto) diff --git a/arch/s390/net/pnet.c b/arch/s390/net/pnet.c index 79211bec0fc8..03089ef479b2 100644 --- a/arch/s390/net/pnet.c +++ b/arch/s390/net/pnet.c @@ -6,6 +6,7 @@ */ #include <linux/device.h> +#include <linux/export.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/types.h> diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile index df73c5182990..1810e0944a4e 100644 --- a/arch/s390/pci/Makefile +++ b/arch/s390/pci/Makefile @@ -5,6 +5,6 @@ obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_clp.o \ pci_event.o pci_debug.o pci_insn.o pci_mmio.o \ - pci_bus.o pci_kvm_hook.o pci_report.o + pci_bus.o pci_kvm_hook.o pci_report.o pci_fixup.o obj-$(CONFIG_PCI_IOV) += pci_iov.o obj-$(CONFIG_SYSFS) += pci_sysfs.o diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 88f72745fa59..39bd2adfc240 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -16,8 +16,7 @@ * Thomas Klein */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/slab.h> @@ -31,6 +30,7 @@ #include <linux/lockdep.h> #include <linux/list_sort.h> +#include <asm/machine.h> #include <asm/isc.h> #include <asm/airq.h> #include <asm/facility.h> @@ -44,6 +44,7 @@ /* list of all detected zpci devices */ static LIST_HEAD(zpci_list); static DEFINE_SPINLOCK(zpci_list_lock); +static DEFINE_MUTEX(zpci_add_remove_lock); static DECLARE_BITMAP(zpci_domain, ZPCI_DOMAIN_BITMAP_SIZE); static DEFINE_SPINLOCK(zpci_domain_lock); @@ -69,6 +70,15 @@ EXPORT_SYMBOL_GPL(zpci_aipb); struct airq_iv *zpci_aif_sbv; EXPORT_SYMBOL_GPL(zpci_aif_sbv); +void zpci_zdev_put(struct zpci_dev *zdev) +{ + if (!zdev) + return; + mutex_lock(&zpci_add_remove_lock); + kref_put_lock(&zdev->kref, zpci_release_device, &zpci_list_lock); + mutex_unlock(&zpci_add_remove_lock); +} + struct zpci_dev *get_zdev_by_fid(u32 fid) { struct zpci_dev *tmp, *zdev = NULL; @@ -124,14 +134,13 @@ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, struct zpci_fib fib = {0}; u8 cc; - WARN_ON_ONCE(iota & 0x3fff); fib.pba = base; /* Work around off by one in ISM virt device */ if (zdev->pft == PCI_FUNC_TYPE_ISM && limit > base) fib.pal = limit + (1 << 12); else fib.pal = limit; - fib.iota = iota | ZPCI_IOTA_RTTO_FLAG; + fib.iota = iota; fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, status); if (cc) @@ -222,24 +231,33 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev) static int zpci_cfg_load(struct zpci_dev *zdev, int offset, u32 *val, u8 len) { u64 req = ZPCI_CREATE_REQ(zdev->fh, ZPCI_PCIAS_CFGSPC, len); + int rc = -ENODEV; u64 data; - int rc; + + if (!zdev_enabled(zdev)) + goto out_err; rc = __zpci_load(&data, req, offset); - if (!rc) { - data = le64_to_cpu((__force __le64) data); - data >>= (8 - len) * 8; - *val = (u32) data; - } else - *val = 0xffffffff; + if (rc) + goto out_err; + data = le64_to_cpu((__force __le64)data); + data >>= (8 - len) * 8; + *val = (u32)data; + return 0; + +out_err: + PCI_SET_ERROR_RESPONSE(val); return rc; } static int zpci_cfg_store(struct zpci_dev *zdev, int offset, u32 val, u8 len) { u64 req = ZPCI_CREATE_REQ(zdev->fh, ZPCI_PCIAS_CFGSPC, len); + int rc = -ENODEV; u64 data = val; - int rc; + + if (!zdev_enabled(zdev)) + return rc; data <<= (8 - len) * 8; data = (__force u64) cpu_to_le64(data); @@ -248,6 +266,7 @@ static int zpci_cfg_store(struct zpci_dev *zdev, int offset, u32 val, u8 len) } resource_size_t pcibios_align_resource(void *data, const struct resource *res, + const struct resource *empty_res, resource_size_t size, resource_size_t align) { @@ -255,7 +274,7 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, } void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot) + pgprot_t prot) { /* * When PCI MIO instructions are unavailable the "physical" address @@ -265,7 +284,7 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, if (!static_branch_unlikely(&have_mio)) return (void __iomem *)phys_addr; - return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); + return generic_ioremap_prot(phys_addr, size, prot); } EXPORT_SYMBOL(ioremap_prot); @@ -388,7 +407,9 @@ static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, { struct zpci_dev *zdev = zdev_from_bus(bus, devfn); - return (zdev) ? zpci_cfg_load(zdev, where, val, size) : -ENODEV; + if (!zdev || zpci_cfg_load(zdev, where, val, size)) + return PCIBIOS_DEVICE_NOT_FOUND; + return PCIBIOS_SUCCESSFUL; } static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, @@ -396,7 +417,9 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, { struct zpci_dev *zdev = zdev_from_bus(bus, devfn); - return (zdev) ? zpci_cfg_store(zdev, where, val, size) : -ENODEV; + if (!zdev || zpci_cfg_store(zdev, where, val, size)) + return PCIBIOS_DEVICE_NOT_FOUND; + return PCIBIOS_SUCCESSFUL; } static struct pci_ops pci_root_ops = { @@ -501,7 +524,7 @@ static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start, { struct resource *r; - r = kzalloc(sizeof(*r), GFP_KERNEL); + r = kzalloc_obj(*r); if (!r) return NULL; @@ -690,6 +713,29 @@ int zpci_enable_device(struct zpci_dev *zdev) } EXPORT_SYMBOL_GPL(zpci_enable_device); +int zpci_reenable_device(struct zpci_dev *zdev) +{ + u8 status; + int rc; + + rc = zpci_enable_device(zdev); + if (rc) + return rc; + + if (zdev->msi_nr_irqs > 0) { + rc = zpci_set_irq(zdev); + if (rc) + return rc; + } + + rc = zpci_iommu_register_ioat(zdev, &status); + if (rc) + zpci_disable_device(zdev); + + return rc; +} +EXPORT_SYMBOL_GPL(zpci_reenable_device); + int zpci_disable_device(struct zpci_dev *zdev) { u32 fh = zdev->fh; @@ -739,7 +785,6 @@ EXPORT_SYMBOL_GPL(zpci_disable_device); */ int zpci_hot_reset_device(struct zpci_dev *zdev) { - u8 status; int rc; lockdep_assert_held(&zdev->state_lock); @@ -758,19 +803,9 @@ int zpci_hot_reset_device(struct zpci_dev *zdev) return rc; } - rc = zpci_enable_device(zdev); - if (rc) - return rc; - - if (zdev->dma_table) - rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table), &status); - if (rc) { - zpci_disable_device(zdev); - return rc; - } + rc = zpci_reenable_device(zdev); - return 0; + return rc; } /** @@ -790,7 +825,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) struct zpci_dev *zdev; int rc; - zdev = kzalloc(sizeof(*zdev), GFP_KERNEL); + zdev = kzalloc_obj(*zdev); if (!zdev) return ERR_PTR(-ENOMEM); @@ -831,6 +866,7 @@ int zpci_add_device(struct zpci_dev *zdev) { int rc; + mutex_lock(&zpci_add_remove_lock); zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", zdev->fid, zdev->fh, zdev->state); rc = zpci_init_iommu(zdev); if (rc) @@ -844,12 +880,14 @@ int zpci_add_device(struct zpci_dev *zdev) spin_lock(&zpci_list_lock); list_add_tail(&zdev->entry, &zpci_list); spin_unlock(&zpci_list_lock); + mutex_unlock(&zpci_add_remove_lock); return 0; error_destroy_iommu: zpci_destroy_iommu(zdev); error: zpci_dbg(0, "add fid:%x, rc:%d\n", zdev->fid, rc); + mutex_unlock(&zpci_add_remove_lock); return rc; } @@ -919,35 +957,36 @@ int zpci_deconfigure_device(struct zpci_dev *zdev) * @zdev: the zpci_dev that was reserved * * Handle the case that a given zPCI function was reserved by another system. - * After a call to this function the zpci_dev can not be found via - * get_zdev_by_fid() anymore but may still be accessible via existing - * references though it will not be functional anymore. */ void zpci_device_reserved(struct zpci_dev *zdev) { - /* - * Remove device from zpci_list as it is going away. This also - * makes sure we ignore subsequent zPCI events for this device. - */ - spin_lock(&zpci_list_lock); - list_del(&zdev->entry); - spin_unlock(&zpci_list_lock); + lockdep_assert_held(&zdev->state_lock); + /* We may declare the device reserved multiple times */ + if (zdev->state == ZPCI_FN_STATE_RESERVED) + return; zdev->state = ZPCI_FN_STATE_RESERVED; zpci_dbg(3, "rsv fid:%x\n", zdev->fid); + /* + * The underlying device is gone. Allow the zdev to be freed + * as soon as all other references are gone by accounting for + * the removal as a dropped reference. + */ zpci_zdev_put(zdev); } void zpci_release_device(struct kref *kref) + __releases(&zpci_list_lock) { struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); + lockdep_assert_held(&zpci_add_remove_lock); WARN_ON(zdev->state != ZPCI_FN_STATE_RESERVED); - - if (zdev->zbus->bus) - zpci_bus_remove_device(zdev, false); - - if (zdev_enabled(zdev)) - zpci_disable_device(zdev); + /* + * We already hold zpci_list_lock thanks to kref_put_lock(). + * This makes sure no new reference can be taken from the list. + */ + list_del(&zdev->entry); + spin_unlock(&zpci_list_lock); if (zdev->has_hp_slot) zpci_exit_slot(zdev); @@ -1027,14 +1066,15 @@ static int zpci_mem_init(void) { BUILD_BUG_ON(!is_power_of_2(__alignof__(struct zpci_fmb)) || __alignof__(struct zpci_fmb) < sizeof(struct zpci_fmb)); + BUILD_BUG_ON((CONFIG_ILLEGAL_POINTER_VALUE + 0x10000 > ZPCI_IOMAP_ADDR_BASE) && + (CONFIG_ILLEGAL_POINTER_VALUE <= ZPCI_IOMAP_ADDR_MAX)); zdev_fmb_cache = kmem_cache_create("PCI_FMB_cache", sizeof(struct zpci_fmb), __alignof__(struct zpci_fmb), 0, NULL); if (!zdev_fmb_cache) goto error_fmb; - zpci_iomap_start = kcalloc(ZPCI_IOMAP_ENTRIES, - sizeof(*zpci_iomap_start), GFP_KERNEL); + zpci_iomap_start = kzalloc_objs(*zpci_iomap_start, ZPCI_IOMAP_ENTRIES); if (!zpci_iomap_start) goto error_iomap; @@ -1073,7 +1113,7 @@ char * __init pcibios_setup(char *str) return NULL; } if (!strcmp(str, "nomio")) { - get_lowcore()->machine_flags &= ~MACHINE_FLAG_PCI_MIO; + clear_machine_feature(MFEATURE_PCI_MIO); return NULL; } if (!strcmp(str, "force_floating")) { @@ -1124,6 +1164,7 @@ static void zpci_add_devices(struct list_head *scan_list) int zpci_scan_devices(void) { + struct zpci_bus *zbus; LIST_HEAD(scan_list); int rc; @@ -1132,7 +1173,10 @@ int zpci_scan_devices(void) return rc; zpci_add_devices(&scan_list); - zpci_bus_scan_busses(); + zpci_bus_for_each(zbus) { + zpci_bus_scan_bus(zbus); + cond_resched(); + } return 0; } @@ -1148,7 +1192,7 @@ static int __init pci_base_init(void) return 0; } - if (MACHINE_HAS_PCI_MIO) { + if (test_machine_feature(MFEATURE_PCI_MIO)) { static_branch_enable(&have_mio); system_ctl_set_bit(2, CR2_MIO_ADDRESSING_BIT); } @@ -1169,6 +1213,10 @@ static int __init pci_base_init(void) if (rc) goto out_find; + rc = zpci_fw_sysfs_init(); + if (rc) + goto out_find; + s390_pci_initialized = 1; return 0; diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 857afbc4828f..36a4807285fa 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -7,18 +7,18 @@ * */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/slab.h> #include <linux/err.h> -#include <linux/export.h> #include <linux/delay.h> #include <linux/seq_file.h> +#include <linux/irqdomain.h> #include <linux/jump_label.h> #include <linux/pci.h> #include <linux/printk.h> +#include <linux/dma-direct.h> #include <asm/pci_clp.h> #include <asm/pci_dma.h> @@ -45,8 +45,10 @@ static int zpci_bus_prepare_device(struct zpci_dev *zdev) if (!zdev_enabled(zdev)) { rc = zpci_enable_device(zdev); - if (rc) + if (rc) { + pr_err("Enabling PCI function %08x failed\n", zdev->fid); return rc; + } } if (!zdev->has_resources) { @@ -151,23 +153,6 @@ int zpci_bus_scan_bus(struct zpci_bus *zbus) return ret; } -/* zpci_bus_scan_busses - Scan all registered busses - * - * Scan all available zbusses - * - */ -void zpci_bus_scan_busses(void) -{ - struct zpci_bus *zbus = NULL; - - mutex_lock(&zbus_list_lock); - list_for_each_entry(zbus, &zbus_list, bus_next) { - zpci_bus_scan_bus(zbus); - cond_resched(); - } - mutex_unlock(&zbus_list_lock); -} - static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev) { return !s390_pci_no_rid && zdev->rid_available && @@ -197,25 +182,52 @@ static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, s zbus->multifunction = zpci_bus_is_multifunction_root(fr); zbus->max_bus_speed = fr->max_bus_speed; + if (zpci_create_parent_msi_domain(zbus)) + goto out_free_domain; + /* * Note that the zbus->resources are taken over and zbus->resources * is empty after a successful call */ bus = pci_create_root_bus(NULL, ZPCI_BUS_NR, ops, zbus, &zbus->resources); - if (!bus) { - zpci_free_domain(zbus->domain_nr); - return -EFAULT; - } + if (!bus) + goto out_remove_msi_domain; zbus->bus = bus; + dev_set_msi_domain(&zbus->bus->dev, zbus->msi_parent_domain); return 0; + +out_remove_msi_domain: + zpci_remove_parent_msi_domain(zbus); +out_free_domain: + zpci_free_domain(zbus->domain_nr); + return -ENOMEM; } -static void zpci_bus_release(struct kref *kref) +/** + * zpci_bus_release - Un-initialize resources associated with the zbus and + * free memory + * @kref: refcount * that is part of struct zpci_bus + * + * MUST be called with `zbus_list_lock` held, but the lock is released during + * run of the function. + */ +static inline void zpci_bus_release(struct kref *kref) + __releases(&zbus_list_lock) { struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref); + lockdep_assert_held(&zbus_list_lock); + + list_del(&zbus->bus_next); + mutex_unlock(&zbus_list_lock); + + /* + * At this point no-one should see this object, or be able to get a new + * reference to it. + */ + if (zbus->bus) { pci_lock_rescan_remove(); pci_stop_root_bus(zbus->bus); @@ -227,15 +239,19 @@ static void zpci_bus_release(struct kref *kref) pci_unlock_rescan_remove(); } - mutex_lock(&zbus_list_lock); - list_del(&zbus->bus_next); - mutex_unlock(&zbus_list_lock); + zpci_remove_parent_msi_domain(zbus); kfree(zbus); } -static void zpci_bus_put(struct zpci_bus *zbus) +static inline void __zpci_bus_get(struct zpci_bus *zbus) { - kref_put(&zbus->kref, zpci_bus_release); + lockdep_assert_held(&zbus_list_lock); + kref_get(&zbus->kref); +} + +static inline void zpci_bus_put(struct zpci_bus *zbus) +{ + kref_put_mutex(&zbus->kref, zpci_bus_release, &zbus_list_lock); } static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid) @@ -247,7 +263,7 @@ static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid) if (!zbus->multifunction) continue; if (topo_is_tid == zbus->topo_is_tid && topo == zbus->topo) { - kref_get(&zbus->kref); + __zpci_bus_get(zbus); goto out_unlock; } } @@ -257,20 +273,55 @@ out_unlock: return zbus; } +/** + * zpci_bus_get_next - get the next zbus object from given position in the list + * @pos: current position/cursor in the global zbus list + * + * Acquires and releases references as the cursor iterates (might also free/ + * release the cursor). Is tolerant of concurrent operations on the list. + * + * To begin the iteration, set *@pos to %NULL before calling the function. + * + * *@pos is set to %NULL in cases where either the list is empty, or *@pos is + * the last element in the list. + * + * Context: Process context. May sleep. + */ +void zpci_bus_get_next(struct zpci_bus **pos) +{ + struct zpci_bus *curp = *pos, *next = NULL; + + mutex_lock(&zbus_list_lock); + if (curp) + next = list_next_entry(curp, bus_next); + else + next = list_first_entry(&zbus_list, typeof(*curp), bus_next); + + if (list_entry_is_head(next, &zbus_list, bus_next)) + next = NULL; + + if (next) + __zpci_bus_get(next); + + *pos = next; + mutex_unlock(&zbus_list_lock); + + /* zpci_bus_put() might drop refcount to 0 and locks zbus_list_lock */ + if (curp) + zpci_bus_put(curp); +} + static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid) { struct zpci_bus *zbus; - zbus = kzalloc(sizeof(*zbus), GFP_KERNEL); + zbus = kzalloc_obj(*zbus); if (!zbus) return NULL; zbus->topo = topo; zbus->topo_is_tid = topo_is_tid; INIT_LIST_HEAD(&zbus->bus_next); - mutex_lock(&zbus_list_lock); - list_add_tail(&zbus->bus_next, &zbus_list); - mutex_unlock(&zbus_list_lock); kref_init(&zbus->kref); INIT_LIST_HEAD(&zbus->resources); @@ -280,13 +331,39 @@ static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid) zbus->bus_resource.flags = IORESOURCE_BUS; pci_add_resource(&zbus->resources, &zbus->bus_resource); + mutex_lock(&zbus_list_lock); + list_add_tail(&zbus->bus_next, &zbus_list); + mutex_unlock(&zbus_list_lock); + return zbus; } +static void pci_dma_range_setup(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + u64 aligned_end, size; + dma_addr_t dma_start; + int ret; + + dma_start = PAGE_ALIGN(zdev->start_dma); + aligned_end = PAGE_ALIGN_DOWN(zdev->end_dma + 1); + if (aligned_end >= dma_start) + size = aligned_end - dma_start; + else + size = 0; + WARN_ON_ONCE(size == 0); + + ret = dma_direct_set_offset(&pdev->dev, 0, dma_start, size); + if (ret) + pr_err("Failed to allocate DMA range map for %s\n", pci_name(pdev)); +} + void pcibios_bus_add_device(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); + pci_dma_range_setup(pdev); + /* * With pdev->no_vf_scan the common PCI probing code does not * perform PF/VF linking. @@ -331,6 +408,20 @@ error: return rc; } +static bool zpci_bus_is_isolated_vf(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + struct pci_dev *pdev; + + if (!zdev->vfn) + return false; + + pdev = zpci_iov_find_parent_pf(zbus, zdev); + if (!pdev) + return true; + pci_dev_put(pdev); + return false; +} + int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) { bool topo_is_tid = zdev->tid_avail; @@ -345,6 +436,15 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) topo = topo_is_tid ? zdev->tid : zdev->pchid; zbus = zpci_bus_get(topo, topo_is_tid); + /* + * An isolated VF gets its own domain/bus even if there exists + * a matching domain/bus already + */ + if (zbus && zpci_bus_is_isolated_vf(zbus, zdev)) { + zpci_bus_put(zbus); + zbus = NULL; + } + if (!zbus) { zbus = zpci_bus_alloc(topo, topo_is_tid); if (!zbus) diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h index e86a9419d233..e440742e3145 100644 --- a/arch/s390/pci/pci_bus.h +++ b/arch/s390/pci/pci_bus.h @@ -15,17 +15,27 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops); void zpci_bus_device_unregister(struct zpci_dev *zdev); int zpci_bus_scan_bus(struct zpci_bus *zbus); -void zpci_bus_scan_busses(void); +void zpci_bus_get_next(struct zpci_bus **pos); + +/** + * zpci_bus_for_each - iterate over all the registered zbus objects + * @pos: a struct zpci_bus * as cursor + * + * Acquires and releases references as the cursor iterates over the registered + * objects. Is tolerant against concurrent removals of objects. + * + * Context: Process context. May sleep. + */ +#define zpci_bus_for_each(pos) \ + for ((pos) = NULL, zpci_bus_get_next(&(pos)); (pos) != NULL; \ + zpci_bus_get_next(&(pos))) int zpci_bus_scan_device(struct zpci_dev *zdev); void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error); void zpci_release_device(struct kref *kref); -static inline void zpci_zdev_put(struct zpci_dev *zdev) -{ - if (zdev) - kref_put(&zdev->kref, zpci_release_device); -} + +void zpci_zdev_put(struct zpci_dev *zdev); static inline void zpci_zdev_get(struct zpci_dev *zdev) { diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 14bf7e8d06b7..177aa0214547 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -6,10 +6,8 @@ * Jan Glauber <jang@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt -#include <linux/compat.h> #include <linux/kernel.h> #include <linux/miscdevice.h> #include <linux/slab.h> @@ -56,7 +54,7 @@ static inline int clp_get_ilp(unsigned long *ilp) int cc, exception; exception = 1; - asm volatile ( + asm_inline volatile ( " .insn rrf,0xb9a00000,%[mask],%[cmd],8,0\n" "0: lhi %[exc],0\n" "1:\n" @@ -79,7 +77,7 @@ static __always_inline int clp_req(void *data, unsigned int lps) u64 ignored; exception = 1; - asm volatile ( + asm_inline volatile ( " .insn rrf,0xb9a00000,%[ign],%[req],0,%[lps]\n" "0: lhi %[exc],0\n" "1:\n" @@ -112,6 +110,7 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev, zdev->version = response->version; zdev->maxstbl = response->maxstbl; zdev->dtsm = response->dtsm; + zdev->rtr_avail = response->rtr; switch (response->version) { case 1: @@ -427,6 +426,8 @@ static void __clp_add(struct clp_fh_list_entry *entry, void *data) return; } zdev = zpci_create_device(entry->fid, entry->fh, entry->config_state); + if (IS_ERR(zdev)) + return; list_add_tail(&zdev->entry, scan_list); } @@ -648,7 +649,7 @@ static long clp_misc_ioctl(struct file *filp, unsigned int cmd, if (cmd != CLP_SYNC) return -EINVAL; - argp = is_compat_task() ? compat_ptr(arg) : (void __user *) arg; + argp = (void __user *)arg; if (copy_from_user(&req, argp, sizeof(req))) return -EFAULT; if (req.r != 0) @@ -666,7 +667,6 @@ static const struct file_operations clp_misc_fops = { .open = nonseekable_open, .release = clp_misc_release, .unlocked_ioctl = clp_misc_ioctl, - .compat_ioctl = clp_misc_ioctl, }; static struct miscdevice clp_misc_device = { diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index 38014206c16b..c7ed7bf254b5 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -6,8 +6,7 @@ * Jan Glauber <jang@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/seq_file.h> diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index 7bd7721c1239..839bd91c056e 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -6,8 +6,7 @@ * Jan Glauber <jang@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/pci.h> @@ -54,6 +53,7 @@ static inline bool ers_result_indicates_abort(pci_ers_result_t ers_res) case PCI_ERS_RESULT_CAN_RECOVER: case PCI_ERS_RESULT_RECOVERED: case PCI_ERS_RESULT_NEED_RESET: + case PCI_ERS_RESULT_NONE: return false; default: return true; @@ -78,10 +78,6 @@ static bool is_driver_supported(struct pci_driver *driver) return false; if (!driver->err_handler->error_detected) return false; - if (!driver->err_handler->slot_reset) - return false; - if (!driver->err_handler->resume) - return false; return true; } @@ -91,6 +87,7 @@ static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev, pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; ers_res = driver->err_handler->error_detected(pdev, pdev->error_state); + pci_uevent_ers(pdev, ers_res); if (ers_result_indicates_abort(ers_res)) pr_info("%s: Automatic recovery failed after initial reporting\n", pci_name(pdev)); else if (ers_res == PCI_ERS_RESULT_NEED_RESET) @@ -106,6 +103,10 @@ static pci_ers_result_t zpci_event_do_error_state_clear(struct pci_dev *pdev, struct zpci_dev *zdev = to_zpci(pdev); int rc; + /* The underlying device may have been disabled by the event */ + if (!zdev_enabled(zdev)) + return PCI_ERS_RESULT_NEED_RESET; + pr_info("%s: Unblocking device access for examination\n", pci_name(pdev)); rc = zpci_reset_load_store_blocked(zdev); if (rc) { @@ -114,16 +115,18 @@ static pci_ers_result_t zpci_event_do_error_state_clear(struct pci_dev *pdev, return PCI_ERS_RESULT_NEED_RESET; } - if (driver->err_handler->mmio_enabled) { + if (driver->err_handler->mmio_enabled) ers_res = driver->err_handler->mmio_enabled(pdev); - if (ers_result_indicates_abort(ers_res)) { - pr_info("%s: Automatic recovery failed after MMIO re-enable\n", - pci_name(pdev)); - return ers_res; - } else if (ers_res == PCI_ERS_RESULT_NEED_RESET) { - pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev)); - return ers_res; - } + else + ers_res = PCI_ERS_RESULT_NONE; + + if (ers_result_indicates_abort(ers_res)) { + pr_info("%s: Automatic recovery failed after MMIO re-enable\n", + pci_name(pdev)); + return ers_res; + } else if (ers_res == PCI_ERS_RESULT_NEED_RESET) { + pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev)); + return ers_res; } pr_debug("%s: Unblocking DMA\n", pci_name(pdev)); @@ -150,7 +153,12 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev, return ers_res; } pdev->error_state = pci_channel_io_normal; - ers_res = driver->err_handler->slot_reset(pdev); + + if (driver->err_handler->slot_reset) + ers_res = driver->err_handler->slot_reset(pdev); + else + ers_res = PCI_ERS_RESULT_NONE; + if (ers_result_indicates_abort(ers_res)) { pr_info("%s: Automatic recovery failed after slot reset\n", pci_name(pdev)); return ers_res; @@ -179,7 +187,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) * is unbound or probed and that userspace can't access its * configuration space while we perform recovery. */ - pci_dev_lock(pdev); + device_lock(&pdev->dev); if (pdev->error_state == pci_channel_io_perm_failure) { ers_res = PCI_ERS_RESULT_DISCONNECT; goto out_unlock; @@ -214,7 +222,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) goto out_unlock; } - if (ers_res == PCI_ERS_RESULT_CAN_RECOVER) { + if (ers_res != PCI_ERS_RESULT_NEED_RESET) { ers_res = zpci_event_do_error_state_clear(pdev, driver); if (ers_result_indicates_abort(ers_res)) { status_str = "failed (abort on MMIO enable)"; @@ -225,7 +233,18 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) if (ers_res == PCI_ERS_RESULT_NEED_RESET) ers_res = zpci_event_do_reset(pdev, driver); + /* + * ers_res can be PCI_ERS_RESULT_NONE either because the driver + * decided to return it, indicating that it abstains from voting + * on how to recover, or because it didn't implement the callback. + * Both cases assume, that if there is nothing else causing a + * disconnect, we recovered successfully. + */ + if (ers_res == PCI_ERS_RESULT_NONE) + ers_res = PCI_ERS_RESULT_RECOVERED; + if (ers_res != PCI_ERS_RESULT_RECOVERED) { + pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT); pr_err("%s: Automatic recovery failed; operator intervention is required\n", pci_name(pdev)); status_str = "failed (driver can't recover)"; @@ -235,8 +254,9 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) pr_info("%s: The device is ready to resume operations\n", pci_name(pdev)); if (driver->err_handler->resume) driver->err_handler->resume(pdev); + pci_uevent_ers(pdev, PCI_ERS_RESULT_RECOVERED); out_unlock: - pci_dev_unlock(pdev); + device_unlock(&pdev->dev); zpci_report_status(zdev, "recovery", status_str); return ers_res; @@ -273,6 +293,8 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf) struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); struct pci_dev *pdev = NULL; pci_ers_result_t ers_res; + u32 fh = 0; + int rc; zpci_dbg(3, "err fid:%x, fh:%x, pec:%x\n", ccdf->fid, ccdf->fh, ccdf->pec); @@ -281,6 +303,15 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf) if (zdev) { mutex_lock(&zdev->state_lock); + rc = clp_refresh_fh(zdev->fid, &fh); + if (rc) + goto no_pdev; + if (!fh || ccdf->fh != fh) { + /* Ignore events with stale handles */ + zpci_dbg(3, "err fid:%x, fh:%x (stale %x)\n", + ccdf->fid, fh, ccdf->fh); + goto no_pdev; + } zpci_update_fh(zdev, ccdf->fh); if (zdev->zbus->bus) pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); @@ -335,6 +366,22 @@ static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh) zdev->state = ZPCI_FN_STATE_STANDBY; } +static void zpci_event_reappear(struct zpci_dev *zdev) +{ + lockdep_assert_held(&zdev->state_lock); + /* + * The zdev is in the reserved state. This means that it was presumed to + * go away but there are still undropped references. Now, the platform + * announced its availability again. Bring back the lingering zdev + * to standby. This is safe because we hold a temporary reference + * now so that it won't go away. Account for the re-appearance of the + * underlying device by incrementing the reference count. + */ + zdev->state = ZPCI_FN_STATE_STANDBY; + zpci_zdev_get(zdev); + zpci_dbg(1, "rea fid:%x, fh:%x\n", zdev->fid, zdev->fh); +} + static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); @@ -358,8 +405,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) break; } } else { + if (zdev->state == ZPCI_FN_STATE_RESERVED) + zpci_event_reappear(zdev); /* the configuration request may be stale */ - if (zdev->state != ZPCI_FN_STATE_STANDBY) + else if (zdev->state != ZPCI_FN_STATE_STANDBY) break; zdev->state = ZPCI_FN_STATE_CONFIGURED; } @@ -375,6 +424,8 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) break; } } else { + if (zdev->state == ZPCI_FN_STATE_RESERVED) + zpci_event_reappear(zdev); zpci_update_fh(zdev, ccdf->fh); } break; diff --git a/arch/s390/pci/pci_fixup.c b/arch/s390/pci/pci_fixup.c new file mode 100644 index 000000000000..35688b645098 --- /dev/null +++ b/arch/s390/pci/pci_fixup.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Exceptions for specific devices, + * + * Copyright IBM Corp. 2025 + * + * Author(s): + * Niklas Schnelle <schnelle@linux.ibm.com> + */ +#include <linux/pci.h> + +static void zpci_ism_bar_no_mmap(struct pci_dev *pdev) +{ + /* + * ISM's BAR is special. Drivers written for ISM know + * how to handle this but others need to be aware of their + * special nature e.g. to prevent attempts to mmap() it. + */ + pdev->non_mappable_bars = 1; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, + PCI_DEVICE_ID_IBM_ISM, + zpci_ism_bar_no_mmap); diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index f5a75ea7629a..35ceb1bea1c6 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -145,7 +145,7 @@ int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) return -EIO; asm volatile( - ".insn rsy,0xeb00000000d1,%[ctl],%[isc],%[iib]\n" + ".insn rsy,0xeb00000000d1,%[ctl],%[isc],%[iib]" : : [ctl] "d" (ctl), [isc] "d" (isc << 27), [iib] "Q" (*iib)); return 0; @@ -160,7 +160,7 @@ static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status) u64 __data; exception = 1; - asm volatile ( + asm_inline volatile ( " .insn rre,0xb9d20000,%[data],%[req_off]\n" "0: lhi %[exc],0\n" "1:\n" @@ -229,7 +229,7 @@ static inline int __pcilg_mio(u64 *data, u64 ioaddr, u64 len, u8 *status) u64 __data; exception = 1; - asm volatile ( + asm_inline volatile ( " .insn rre,0xb9d60000,%[data],%[ioaddr_len]\n" "0: lhi %[exc],0\n" "1:\n" @@ -267,7 +267,7 @@ static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status) int cc, exception; exception = 1; - asm volatile ( + asm_inline volatile ( " .insn rre,0xb9d00000,%[data],%[req_off]\n" "0: lhi %[exc],0\n" "1:\n" @@ -321,7 +321,7 @@ static inline int __pcistg_mio(u64 data, u64 ioaddr, u64 len, u8 *status) int cc, exception; exception = 1; - asm volatile ( + asm_inline volatile ( " .insn rre,0xb9d40000,%[data],%[ioaddr_len]\n" "0: lhi %[exc],0\n" "1:\n" @@ -356,7 +356,7 @@ static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status) int cc, exception; exception = 1; - asm volatile ( + asm_inline volatile ( " .insn rsy,0xeb00000000d0,%[req],%[offset],%[data]\n" "0: lhi %[exc],0\n" "1:\n" @@ -410,7 +410,7 @@ static inline int __pcistb_mio(const u64 *data, u64 ioaddr, u64 len, u8 *status) int cc, exception; exception = 1; - asm volatile ( + asm_inline volatile ( " .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[data]\n" "0: lhi %[exc],0\n" "1:\n" @@ -442,7 +442,7 @@ EXPORT_SYMBOL_GPL(zpci_write_block); static inline void __pciwb_mio(void) { - asm volatile (".insn rre,0xb9d50000,0,0\n"); + asm volatile (".insn rre,0xb9d50000,0,0"); } void zpci_barrier(void) diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c index ead062bf2b41..13050ce5c3e9 100644 --- a/arch/s390/pci/pci_iov.c +++ b/arch/s390/pci/pci_iov.c @@ -7,8 +7,7 @@ * */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/pci.h> @@ -60,18 +59,35 @@ static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, in return 0; } -int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +/** + * zpci_iov_find_parent_pf - Find the parent PF, if any, of the given function + * @zbus: The bus that the PCI function is on, or would be added on + * @zdev: The PCI function + * + * Finds the parent PF, if it exists and is configured, of the given PCI function + * and increments its refcount. Th PF is searched for on the provided bus so the + * caller has to ensure that this is the correct bus to search. This function may + * be used before adding the PCI function to a zbus. + * + * Return: Pointer to the struct pci_dev of the parent PF or NULL if it not + * found. If the function is not a VF or has no RequesterID information, + * NULL is returned as well. + */ +struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) { - int i, cand_devfn; - struct zpci_dev *zdev; + int i, vfid, devfn, cand_devfn; struct pci_dev *pdev; - int vfid = vfn - 1; /* Linux' vfid's start at 0 vfn at 1*/ - int rc = 0; if (!zbus->multifunction) - return 0; - - /* If the parent PF for the given VF is also configured in the + return NULL; + /* Non-VFs and VFs without RID available don't have a parent */ + if (!zdev->vfn || !zdev->rid_available) + return NULL; + /* Linux vfid starts at 0 vfn at 1 */ + vfid = zdev->vfn - 1; + devfn = zdev->rid & ZPCI_RID_MASK_DEVFN; + /* + * If the parent PF for the given VF is also configured in the * instance, it must be on the same zbus. * We can then identify the parent PF by checking what * devfn the VF would have if it belonged to that PF using the PF's @@ -85,15 +101,26 @@ int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn if (!pdev) continue; cand_devfn = pci_iov_virtfn_devfn(pdev, vfid); - if (cand_devfn == virtfn->devfn) { - rc = zpci_iov_link_virtfn(pdev, virtfn, vfid); - /* balance pci_get_slot() */ - pci_dev_put(pdev); - break; - } + if (cand_devfn == devfn) + return pdev; /* balance pci_get_slot() */ pci_dev_put(pdev); } } + return NULL; +} + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + struct zpci_dev *zdev = to_zpci(virtfn); + struct pci_dev *pdev_pf; + int rc = 0; + + pdev_pf = zpci_iov_find_parent_pf(zbus, zdev); + if (pdev_pf) { + /* Linux' vfids start at 0 while zdev->vfn starts at 1 */ + rc = zpci_iov_link_virtfn(pdev_pf, virtfn, zdev->vfn - 1); + pci_dev_put(pdev_pf); + } return rc; } diff --git a/arch/s390/pci/pci_iov.h b/arch/s390/pci/pci_iov.h index e3fa4e77fc86..d2c2793eb0f3 100644 --- a/arch/s390/pci/pci_iov.h +++ b/arch/s390/pci/pci_iov.h @@ -19,6 +19,8 @@ void zpci_iov_map_resources(struct pci_dev *pdev); int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn); +struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev); + #else /* CONFIG_PCI_IOV */ static inline void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) {} @@ -28,5 +30,10 @@ static inline int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *v { return 0; } + +static inline struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + return NULL; +} #endif /* CONFIG_PCI_IOV */ #endif /* __S390_PCI_IOV_h */ diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 84482a921332..9c9ed3d8d959 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/irq.h> #include <linux/kernel_stat.h> #include <linux/pci.h> #include <linux/msi.h> +#include <linux/irqchip/irq-msi-lib.h> #include <linux/smp.h> #include <asm/isc.h> @@ -98,7 +98,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev) } /* Register adapter interruptions */ -static int zpci_set_irq(struct zpci_dev *zdev) +int zpci_set_irq(struct zpci_dev *zdev) { int rc; @@ -107,9 +107,6 @@ static int zpci_set_irq(struct zpci_dev *zdev) else rc = zpci_set_airq(zdev); - if (!rc) - zdev->irqs_registered = 1; - return rc; } @@ -123,36 +120,59 @@ static int zpci_clear_irq(struct zpci_dev *zdev) else rc = zpci_clear_airq(zdev); - if (!rc) - zdev->irqs_registered = 0; - return rc; } static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { - struct msi_desc *entry = irq_data_get_msi_desc(data); - struct msi_msg msg = entry->msg; - int cpu_addr = smp_cpu_get_cpu_address(cpumask_first(dest)); + irq_data_update_affinity(data, dest); + return IRQ_SET_MASK_OK; +} - msg.address_lo &= 0xff0000ff; - msg.address_lo |= (cpu_addr << 8); - pci_write_msi_msg(data->irq, &msg); +/* + * Encode the hwirq number for the parent domain. The encoding must be unique + * for each IRQ of each device in the parent domain, so it uses the devfn to + * identify the device and the msi_index to identify the IRQ within that device. + */ +static inline u32 zpci_encode_hwirq(u8 devfn, u16 msi_index) +{ + return (devfn << 16) | msi_index; +} - return IRQ_SET_MASK_OK; +static inline u16 zpci_decode_hwirq_msi_index(irq_hw_number_t hwirq) +{ + return hwirq & 0xffff; +} + +static void zpci_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + struct zpci_dev *zdev = to_zpci_dev(desc->dev); + + if (irq_delivery == DIRECTED) { + int cpu = cpumask_first(irq_data_get_affinity_mask(data)); + + msg->address_lo = zdev->msi_addr & 0xff0000ff; + msg->address_lo |= (smp_cpu_get_cpu_address(cpu) << 8); + } else { + msg->address_lo = zdev->msi_addr & 0xffffffff; + } + msg->address_hi = zdev->msi_addr >> 32; + msg->data = zpci_decode_hwirq_msi_index(data->hwirq); } static struct irq_chip zpci_irq_chip = { .name = "PCI-MSI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, + .irq_compose_msi_msg = zpci_compose_msi_msg, }; static void zpci_handle_cpu_local_irq(bool rescan) { struct airq_iv *dibv = zpci_ibv[smp_processor_id()]; union zpci_sic_iib iib = {{0}}; + struct irq_domain *msi_domain; + irq_hw_number_t hwirq; unsigned long bit; int irqs_on = 0; @@ -170,7 +190,9 @@ static void zpci_handle_cpu_local_irq(bool rescan) continue; } inc_irq_stat(IRQIO_MSI); - generic_handle_irq(airq_iv_get_data(dibv, bit)); + hwirq = airq_iv_get_data(dibv, bit); + msi_domain = (struct irq_domain *)airq_iv_get_ptr(dibv, bit); + generic_handle_domain_irq(msi_domain, hwirq); } } @@ -235,6 +257,8 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, struct tpi_info *tpi_info) { union zpci_sic_iib iib = {{0}}; + struct irq_domain *msi_domain; + irq_hw_number_t hwirq; unsigned long si, ai; struct airq_iv *aibv; int irqs_on = 0; @@ -262,7 +286,9 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, break; inc_irq_stat(IRQIO_MSI); airq_iv_lock(aibv, ai); - generic_handle_irq(airq_iv_get_data(aibv, ai)); + hwirq = airq_iv_get_data(aibv, ai); + msi_domain = (struct irq_domain *)airq_iv_get_ptr(aibv, ai); + generic_handle_domain_irq(msi_domain, hwirq); airq_iv_unlock(aibv, ai); } } @@ -284,7 +310,9 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs, zdev->aisb = *bit; /* Create adapter interrupt vector */ - zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL); + zdev->aibv = airq_iv_create(msi_vecs, + AIRQ_IV_PTR | AIRQ_IV_DATA | AIRQ_IV_BITLOCK, + NULL); if (!zdev->aibv) return -ENOMEM; @@ -296,147 +324,220 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs, return 0; } -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +bool arch_restore_msi_irqs(struct pci_dev *pdev) { - unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu; struct zpci_dev *zdev = to_zpci(pdev); - struct msi_desc *msi; - struct msi_msg msg; - unsigned long bit; - int cpu_addr; - int rc, irq; + zpci_set_irq(zdev); + return true; +} + +static struct airq_struct zpci_airq = { + .handler = zpci_floating_irq_handler, + .isc = PCI_ISC, +}; + +static void zpci_msi_teardown_directed(struct zpci_dev *zdev) +{ + airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->max_msi); + zdev->msi_first_bit = -1U; + zdev->msi_nr_irqs = 0; +} + +static void zpci_msi_teardown_floating(struct zpci_dev *zdev) +{ + airq_iv_release(zdev->aibv); + zdev->aibv = NULL; + airq_iv_free_bit(zpci_sbv, zdev->aisb); zdev->aisb = -1UL; zdev->msi_first_bit = -1U; + zdev->msi_nr_irqs = 0; +} + +static void zpci_msi_teardown(struct irq_domain *domain, msi_alloc_info_t *arg) +{ + struct zpci_dev *zdev = to_zpci_dev(domain->dev); + + zpci_clear_irq(zdev); + if (irq_delivery == DIRECTED) + zpci_msi_teardown_directed(zdev); + else + zpci_msi_teardown_floating(zdev); +} + +static int zpci_msi_prepare(struct irq_domain *domain, + struct device *dev, int nvec, + msi_alloc_info_t *info) +{ + struct zpci_dev *zdev = to_zpci_dev(dev); + struct pci_dev *pdev = to_pci_dev(dev); + unsigned long bit; + int msi_vecs, rc; msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); if (msi_vecs < nvec) { - pr_info("%s requested %d irqs, allocate system limit of %d", + pr_info("%s requested %d IRQs, allocate system limit of %d\n", pci_name(pdev), nvec, zdev->max_msi); } rc = __alloc_airq(zdev, msi_vecs, &bit); - if (rc < 0) + if (rc) { + pr_err("Allocating adapter IRQs for %s failed\n", pci_name(pdev)); return rc; + } - /* - * Request MSI interrupts: - * When using MSI, nvec_used interrupt sources and their irq - * descriptors are controlled through one msi descriptor. - * Thus the outer loop over msi descriptors shall run only once, - * while two inner loops iterate over the interrupt vectors. - * When using MSI-X, each interrupt vector/irq descriptor - * is bound to exactly one msi descriptor (nvec_used is one). - * So the inner loops are executed once, while the outer iterates - * over the MSI-X descriptors. - */ - hwirq = bit; - msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) { - if (hwirq - bit >= msi_vecs) - break; - irqs_per_msi = min_t(unsigned int, msi_vecs, msi->nvec_used); - irq = __irq_alloc_descs(-1, 0, irqs_per_msi, 0, THIS_MODULE, - (irq_delivery == DIRECTED) ? - msi->affinity : NULL); - if (irq < 0) - return -ENOMEM; + zdev->msi_first_bit = bit; + zdev->msi_nr_irqs = msi_vecs; + rc = zpci_set_irq(zdev); + if (rc) { + pr_err("Registering adapter IRQs for %s failed\n", + pci_name(pdev)); + + if (irq_delivery == DIRECTED) + zpci_msi_teardown_directed(zdev); + else + zpci_msi_teardown_floating(zdev); + return rc; + } + return 0; +} - for (i = 0; i < irqs_per_msi; i++) { - rc = irq_set_msi_desc_off(irq, i, msi); - if (rc) - return rc; - irq_set_chip_and_handler(irq + i, &zpci_irq_chip, - handle_percpu_irq); - } +static int zpci_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct msi_desc *desc = ((msi_alloc_info_t *)args)->desc; + struct zpci_dev *zdev = to_zpci_dev(desc->dev); + struct zpci_bus *zbus = zdev->zbus; + unsigned int cpu, hwirq; + unsigned long bit; + int i; - msg.data = hwirq - bit; - if (irq_delivery == DIRECTED) { - if (msi->affinity) - cpu = cpumask_first(&msi->affinity->mask); - else - cpu = 0; - cpu_addr = smp_cpu_get_cpu_address(cpu); + bit = zdev->msi_first_bit + desc->msi_index; + hwirq = zpci_encode_hwirq(zdev->devfn, desc->msi_index); + + if (desc->msi_index + nr_irqs > zdev->max_msi) + return -EINVAL; - msg.address_lo = zdev->msi_addr & 0xff0000ff; - msg.address_lo |= (cpu_addr << 8); + for (i = 0; i < nr_irqs; i++) { + irq_domain_set_info(domain, virq + i, hwirq + i, + &zpci_irq_chip, zdev, + handle_percpu_irq, NULL, NULL); + if (irq_delivery == DIRECTED) { for_each_possible_cpu(cpu) { - for (i = 0; i < irqs_per_msi; i++) - airq_iv_set_data(zpci_ibv[cpu], - hwirq + i, irq + i); + airq_iv_set_ptr(zpci_ibv[cpu], bit + i, + (unsigned long)zbus->msi_parent_domain); + airq_iv_set_data(zpci_ibv[cpu], bit + i, hwirq + i); } } else { - msg.address_lo = zdev->msi_addr & 0xffffffff; - for (i = 0; i < irqs_per_msi; i++) - airq_iv_set_data(zdev->aibv, hwirq + i, irq + i); + airq_iv_set_ptr(zdev->aibv, bit + i, + (unsigned long)zbus->msi_parent_domain); + airq_iv_set_data(zdev->aibv, bit + i, hwirq + i); } - msg.address_hi = zdev->msi_addr >> 32; - pci_write_msi_msg(irq, &msg); - hwirq += irqs_per_msi; } - zdev->msi_first_bit = bit; - zdev->msi_nr_irqs = hwirq - bit; - - rc = zpci_set_irq(zdev); - if (rc) - return rc; - - return (zdev->msi_nr_irqs == nvec) ? 0 : zdev->msi_nr_irqs; + return 0; } -void arch_teardown_msi_irqs(struct pci_dev *pdev) +static void zpci_msi_clear_airq(struct irq_data *d, int i) { - struct zpci_dev *zdev = to_zpci(pdev); - struct msi_desc *msi; - unsigned int i; - int rc; + struct msi_desc *desc = irq_data_get_msi_desc(d); + struct zpci_dev *zdev = to_zpci_dev(desc->dev); + unsigned long bit; + unsigned int cpu; + u16 msi_index; - /* Disable interrupts */ - rc = zpci_clear_irq(zdev); - if (rc) - return; + msi_index = zpci_decode_hwirq_msi_index(d->hwirq); + bit = zdev->msi_first_bit + msi_index; - /* Release MSI interrupts */ - msi_for_each_desc(msi, &pdev->dev, MSI_DESC_ASSOCIATED) { - for (i = 0; i < msi->nvec_used; i++) { - irq_set_msi_desc(msi->irq + i, NULL); - irq_free_desc(msi->irq + i); + if (irq_delivery == DIRECTED) { + for_each_possible_cpu(cpu) { + airq_iv_set_ptr(zpci_ibv[cpu], bit + i, 0); + airq_iv_set_data(zpci_ibv[cpu], bit + i, 0); } - msi->msg.address_lo = 0; - msi->msg.address_hi = 0; - msi->msg.data = 0; - msi->irq = 0; + } else { + airq_iv_set_ptr(zdev->aibv, bit + i, 0); + airq_iv_set_data(zdev->aibv, bit + i, 0); } +} - if (zdev->aisb != -1UL) { - zpci_ibv[zdev->aisb] = NULL; - airq_iv_free_bit(zpci_sbv, zdev->aisb); - zdev->aisb = -1UL; - } - if (zdev->aibv) { - airq_iv_release(zdev->aibv); - zdev->aibv = NULL; - } +static void zpci_msi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d; + int i; - if ((irq_delivery == DIRECTED) && zdev->msi_first_bit != -1U) - airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs); + for (i = 0; i < nr_irqs; i++) { + d = irq_domain_get_irq_data(domain, virq + i); + zpci_msi_clear_airq(d, i); + irq_domain_reset_irq_data(d); + } } -bool arch_restore_msi_irqs(struct pci_dev *pdev) +static const struct irq_domain_ops zpci_msi_domain_ops = { + .alloc = zpci_msi_domain_alloc, + .free = zpci_msi_domain_free, +}; + +static bool zpci_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, + struct msi_domain_info *info) { - struct zpci_dev *zdev = to_zpci(pdev); + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) + return false; + + info->ops->msi_prepare = zpci_msi_prepare; + info->ops->msi_teardown = zpci_msi_teardown; - if (!zdev->irqs_registered) - zpci_set_irq(zdev); return true; } -static struct airq_struct zpci_airq = { - .handler = zpci_floating_irq_handler, - .isc = PCI_ISC, +static struct msi_parent_ops zpci_msi_parent_ops = { + .supported_flags = MSI_GENERIC_FLAGS_MASK | + MSI_FLAG_PCI_MSIX | + MSI_FLAG_MULTI_PCI_MSI, + .required_flags = MSI_FLAG_USE_DEF_DOM_OPS | + MSI_FLAG_USE_DEF_CHIP_OPS, + .init_dev_msi_info = zpci_init_dev_msi_info, }; +int zpci_create_parent_msi_domain(struct zpci_bus *zbus) +{ + char fwnode_name[18]; + + snprintf(fwnode_name, sizeof(fwnode_name), "ZPCI_MSI_DOM_%04x", zbus->domain_nr); + struct irq_domain_info info = { + .fwnode = irq_domain_alloc_named_fwnode(fwnode_name), + .ops = &zpci_msi_domain_ops, + }; + + if (!info.fwnode) { + pr_err("Failed to allocate fwnode for MSI IRQ domain\n"); + return -ENOMEM; + } + + if (irq_delivery == FLOATING) + zpci_msi_parent_ops.required_flags |= MSI_FLAG_NO_AFFINITY; + + zbus->msi_parent_domain = msi_create_parent_irq_domain(&info, &zpci_msi_parent_ops); + if (!zbus->msi_parent_domain) { + irq_domain_free_fwnode(info.fwnode); + pr_err("Failed to create MSI IRQ domain\n"); + return -ENOMEM; + } + + return 0; +} + +void zpci_remove_parent_msi_domain(struct zpci_bus *zbus) +{ + struct fwnode_handle *fn; + + fn = zbus->msi_parent_domain->fwnode; + irq_domain_remove(zbus->msi_parent_domain); + irq_domain_free_fwnode(fn); +} + static void __init cpu_enable_directed_irq(void *unused) { union zpci_sic_iib iib = {{0}}; @@ -462,8 +563,7 @@ static int __init zpci_directed_irq_init(void) iib.diib.disb_addr = virt_to_phys(zpci_sbv->vector); zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); - zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv), - GFP_KERNEL); + zpci_ibv = kzalloc_objs(*zpci_ibv, num_possible_cpus()); if (!zpci_ibv) return -ENOMEM; @@ -473,6 +573,7 @@ static int __init zpci_directed_irq_init(void) * is only done on the first vector. */ zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE, + AIRQ_IV_PTR | AIRQ_IV_DATA | AIRQ_IV_CACHELINE | (!cpu ? AIRQ_IV_ALLOC : 0), NULL); @@ -488,7 +589,7 @@ static int __init zpci_directed_irq_init(void) static int __init zpci_floating_irq_init(void) { - zpci_ibv = kcalloc(ZPCI_NR_DEVICES, sizeof(*zpci_ibv), GFP_KERNEL); + zpci_ibv = kzalloc_objs(*zpci_ibv, ZPCI_NR_DEVICES); if (!zpci_ibv) return -ENOMEM; diff --git a/arch/s390/pci/pci_kvm_hook.c b/arch/s390/pci/pci_kvm_hook.c index ff34baf50a3e..df5b25dbe9ca 100644 --- a/arch/s390/pci/pci_kvm_hook.c +++ b/arch/s390/pci/pci_kvm_hook.c @@ -5,7 +5,9 @@ * Copyright (C) IBM Corp. 2022. All rights reserved. * Author(s): Pierre Morel <pmorel@linux.ibm.com> */ + #include <linux/kvm_host.h> +#include <linux/export.h> struct zpci_kvm_hook zpci_kvm_hook; EXPORT_SYMBOL_GPL(zpci_kvm_hook); diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 46f99dc164ad..51e7a28af899 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -32,9 +32,11 @@ static inline int __pcistb_mio_inuser( u64 len, u8 *status) { int cc, exception; + bool sacf_flag; exception = 1; - asm volatile ( + sacf_flag = enable_sacf_uaccess(); + asm_inline volatile ( " sacf 256\n" "0: .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[src]\n" "1: lhi %[exc],0\n" @@ -44,6 +46,7 @@ static inline int __pcistb_mio_inuser( : CC_OUT(cc, cc), [len] "+d" (len), [exc] "+d" (exception) : [ioaddr] "a" (ioaddr), [src] "Q" (*((u8 __force *)src)) : CC_CLOBBER_LIST("memory")); + disable_sacf_uaccess(sacf_flag); *status = len >> 24 & 0xff; return exception ? -ENXIO : CC_TRANSFORM(cc); } @@ -54,6 +57,7 @@ static inline int __pcistg_mio_inuser( { union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen}; int cc, exception; + bool sacf_flag; u64 val = 0; u64 cnt = ulen; u8 tmp; @@ -64,7 +68,8 @@ static inline int __pcistg_mio_inuser( * address space. pcistg then uses the user mappings. */ exception = 1; - asm volatile ( + sacf_flag = enable_sacf_uaccess(); + asm_inline volatile ( " sacf 256\n" "0: llgc %[tmp],0(%[src])\n" "4: sllg %[val],%[val],8\n" @@ -81,6 +86,7 @@ static inline int __pcistg_mio_inuser( CC_OUT(cc, cc), [ioaddr_len] "+&d" (ioaddr_len.pair) : : CC_CLOBBER_LIST("memory")); + disable_sacf_uaccess(sacf_flag); *status = ioaddr_len.odd >> 24 & 0xff; cc = exception ? -ENXIO : CC_TRANSFORM(cc); @@ -175,8 +181,12 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, args.address = mmio_addr; args.vma = vma; ret = follow_pfnmap_start(&args); - if (ret) - goto out_unlock_mmap; + if (ret) { + fixup_user_fault(current->mm, mmio_addr, FAULT_FLAG_WRITE, NULL); + ret = follow_pfnmap_start(&args); + if (ret) + goto out_unlock_mmap; + } io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); @@ -200,6 +210,7 @@ static inline int __pcilg_mio_inuser( u64 ulen, u8 *status) { union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen}; + bool sacf_flag; u64 cnt = ulen; int shift = ulen * 8; int cc, exception; @@ -211,7 +222,8 @@ static inline int __pcilg_mio_inuser( * user address @dst */ exception = 1; - asm volatile ( + sacf_flag = enable_sacf_uaccess(); + asm_inline volatile ( " sacf 256\n" "0: .insn rre,0xb9d60000,%[val],%[ioaddr_len]\n" "1: lhi %[exc],0\n" @@ -232,10 +244,10 @@ static inline int __pcilg_mio_inuser( : [ioaddr_len] "+&d" (ioaddr_len.pair), [exc] "+d" (exception), CC_OUT(cc, cc), [val] "=d" (val), [dst] "+a" (dst), [cnt] "+d" (cnt), [tmp] "=d" (tmp), - [shift] "+d" (shift) + [shift] "+a" (shift) : : CC_CLOBBER_LIST("memory")); - + disable_sacf_uaccess(sacf_flag); cc = exception ? -ENXIO : CC_TRANSFORM(cc); /* did we write everything to the user space buffer? */ if (!cc && cnt != 0) @@ -315,14 +327,18 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out_unlock_mmap; ret = -EACCES; - if (!(vma->vm_flags & VM_WRITE)) + if (!(vma->vm_flags & VM_READ)) goto out_unlock_mmap; args.vma = vma; args.address = mmio_addr; ret = follow_pfnmap_start(&args); - if (ret) - goto out_unlock_mmap; + if (ret) { + fixup_user_fault(current->mm, mmio_addr, 0, NULL); + ret = follow_pfnmap_start(&args); + if (ret) + goto out_unlock_mmap; + } io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); diff --git a/arch/s390/pci/pci_report.c b/arch/s390/pci/pci_report.c index 1b494e5ecc4d..7030f7052926 100644 --- a/arch/s390/pci/pci_report.c +++ b/arch/s390/pci/pci_report.c @@ -7,8 +7,7 @@ * */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/sprintf.h> diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index 2de1ea6c3a8c..d98d97df792a 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -6,8 +6,7 @@ * Jan Glauber <jang@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/stat.h> @@ -41,6 +40,9 @@ zpci_attr(segment1, "0x%02x\n", pfip[1]); zpci_attr(segment2, "0x%02x\n", pfip[2]); zpci_attr(segment3, "0x%02x\n", pfip[3]); +#define ZPCI_FW_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + static ssize_t mio_enabled_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -52,7 +54,6 @@ static DEVICE_ATTR_RO(mio_enabled); static int _do_recover(struct pci_dev *pdev, struct zpci_dev *zdev) { - u8 status; int ret; pci_stop_and_remove_bus_device(pdev); @@ -70,16 +71,8 @@ static int _do_recover(struct pci_dev *pdev, struct zpci_dev *zdev) return ret; } - ret = zpci_enable_device(zdev); - if (ret) - return ret; + ret = zpci_reenable_device(zdev); - if (zdev->dma_table) { - ret = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table), &status); - if (ret) - zpci_disable_device(zdev); - } return ret; } @@ -173,6 +166,13 @@ static ssize_t uid_is_unique_show(struct device *dev, } static DEVICE_ATTR_RO(uid_is_unique); +static ssize_t uid_checking_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", zpci_unique_uid ? 1 : 0); +} +ZPCI_FW_ATTR_RO(uid_checking); + /* analogous to smbios index */ static ssize_t index_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -187,6 +187,17 @@ static ssize_t index_show(struct device *dev, } static DEVICE_ATTR_RO(index); +static ssize_t zpci_uid_slot_show(struct pci_slot *slot, char *buf) +{ + struct zpci_dev *zdev = container_of(slot->hotplug, struct zpci_dev, + hotplug_slot); + + return sysfs_emit(buf, "0x%x\n", zdev->uid); +} + +static struct pci_slot_attribute zpci_slot_attr_uid = + __ATTR(uid, 0444, zpci_uid_slot_show, NULL); + static umode_t zpci_index_is_visible(struct kobject *kobj, struct attribute *attr, int n) { @@ -227,7 +238,7 @@ static struct attribute *zpci_dev_attrs[] = { const struct attribute_group zpci_attr_group = { .attrs = zpci_dev_attrs, - .bin_attrs_new = zpci_bin_attrs, + .bin_attrs = zpci_bin_attrs, }; static struct attribute *pfip_attrs[] = { @@ -242,3 +253,27 @@ const struct attribute_group pfip_attr_group = { .name = "pfip", .attrs = pfip_attrs, }; + +static struct attribute *zpci_slot_attrs[] = { + &zpci_slot_attr_uid.attr, + NULL, +}; + +const struct attribute_group zpci_slot_attr_group = { + .attrs = zpci_slot_attrs, +}; + +static struct attribute *clp_fw_attrs[] = { + &uid_checking_attr.attr, + NULL, +}; + +static struct attribute_group clp_fw_attr_group = { + .name = "clp", + .attrs = clp_fw_attrs, +}; + +int __init __zpci_fw_sysfs_init(void) +{ + return sysfs_create_group(firmware_kobj, &clp_fw_attr_group); +} diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index bdcf2a3b6c41..95a8ac45b67e 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -8,20 +8,23 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) -CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY +CFLAGS_sha256.o := -D__NO_FORTIFY $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(call if_changed_rule,as_o_S) -KBUILD_CFLAGS := -std=gnu11 -fno-strict-aliasing -Wall -Wstrict-prototypes +KBUILD_CFLAGS := $(CC_FLAGS_DIALECT) -fno-strict-aliasing -Wall -Wstrict-prototypes KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += $(CLANG_FLAGS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) +KBUILD_CFLAGS += $(call cc-option, -Wno-default-const-init-unsafe) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS += -D__DISABLE_EXPORTS # Since we link purgatory with -r unresolved symbols are not checked, so we # also link a purgatory.chk binary without -r to check for unresolved symbols. diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c index 030efda05dbe..ecb38102187c 100644 --- a/arch/s390/purgatory/purgatory.c +++ b/arch/s390/purgatory/purgatory.c @@ -16,7 +16,7 @@ int verify_sha256_digest(void) { struct kexec_sha_region *ptr, *end; u8 digest[SHA256_DIGEST_SIZE]; - struct sha256_state sctx; + struct sha256_ctx sctx; sha256_init(&sctx); end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions); diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 855f818deb98..2d28a569f793 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -29,6 +29,7 @@ static struct facility_def facility_defs[] = { .bits = (int[]){ 0, /* N3 instructions */ 1, /* z/Arch mode installed */ + 3, /* dat-enhancement 1 */ 18, /* long displacement facility */ 21, /* extended-immediate facility */ 25, /* store clock fast */ @@ -54,6 +55,9 @@ static struct facility_def facility_defs[] = { #ifdef CONFIG_HAVE_MARCH_Z15_FEATURES 61, /* miscellaneous-instruction-extension 3 */ #endif +#ifdef CONFIG_HAVE_MARCH_Z17_FEATURES + 84, /* miscellaneous-instruction-extension 4 */ +#endif -1 /* END */ } }, |
