Diffstat (limited to 'arch/arm64/kernel')
73 files changed, 3020 insertions, 2207 deletions
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 71c29a2a2f19..74b76bb70452 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -34,7 +34,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ syscall.o proton-pack.o idle.o patching.o pi/ \ - rsi.o + rsi.o jump_label.o obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o @@ -46,8 +46,8 @@ obj-$(CONFIG_MODULES) += module.o module-plts.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o obj-$(CONFIG_CPU_PM) += sleep.o suspend.o -obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_EFI) += efi.o efi-rt-wrapper.o obj-$(CONFIG_PCI) += pci.o @@ -68,6 +68,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_ARM64_MPAM) += mpam.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o @@ -78,10 +79,10 @@ $(obj)/vdso32-wrap.o: $(obj)/vdso32/vdso.so obj-y += probes/ obj-y += head.o -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds ifeq ($(CONFIG_DEBUG_EFI),y) -AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\"" +AFLAGS_head.o += -DVMLINUX_PATH="\"$(abspath vmlinux)\"" endif # for cleaning diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index e6f66491fbe9..5891f92c2035 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -133,7 +133,7 @@ static int __init acpi_fadt_sanity_check(void) /* * FADT is required on arm64; retrieve it to check its presence - * and carry out revision and ACPI HW reduced compliancy tests + * and carry out revision and ACPI HW reduced compliance tests */ status = acpi_get_table(ACPI_SIG_FADT, 0, &table); if (ACPI_FAILURE(status)) { @@ -252,8 +252,6 @@ done: */ acpi_parse_spcr(earlycon_acpi_spcr_enable, !param_acpi_nospcr); - pr_info("Use ACPI SPCR as default console: %s\n", - param_acpi_nospcr ? 
"No" : "Yes"); if (IS_ENABLED(CONFIG_ACPI_BGRT)) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); @@ -379,7 +377,7 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) prot = __acpi_get_writethrough_mem_attribute(); } } - return ioremap_prot(phys, size, pgprot_val(prot)); + return __ioremap_prot(phys, size, prot); } /* @@ -403,7 +401,7 @@ int apei_claim_sea(struct pt_regs *regs) return_to_irqs_enabled = !irqs_disabled_flags(arch_local_save_flags()); if (regs) - return_to_irqs_enabled = interrupts_enabled(regs); + return_to_irqs_enabled = !regs_irqs_disabled(regs); /* * SEA can interrupt SError, mask it and describe this as an NMI so @@ -425,7 +423,7 @@ int apei_claim_sea(struct pt_regs *regs) irq_work_run(); __irq_exit(); } else { - pr_warn_ratelimited("APEI work queued but not completed"); + pr_warn_ratelimited("APEI work queued but not completed\n"); err = -EINPROGRESS; } } @@ -460,3 +458,33 @@ int acpi_unmap_cpu(int cpu) } EXPORT_SYMBOL(acpi_unmap_cpu); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ + +int acpi_get_cpu_uid(unsigned int cpu, u32 *uid) +{ + struct acpi_madt_generic_interrupt *gicc; + + if (cpu >= nr_cpu_ids) + return -EINVAL; + + gicc = acpi_cpu_get_madt_gicc(cpu); + if (!gicc) + return -ENODEV; + + *uid = gicc->uid; + return 0; +} +EXPORT_SYMBOL_GPL(acpi_get_cpu_uid); + +int get_cpu_for_acpi_id(u32 uid) +{ + u32 cpu_uid; + int ret; + + for (int cpu = 0; cpu < nr_cpu_ids; cpu++) { + ret = acpi_get_cpu_uid(cpu, &cpu_uid); + if (ret == 0 && uid == cpu_uid) + return cpu; + } + + return -EINVAL; +} diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 8ff6610af496..f5ec7e7c1d3f 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -139,9 +139,9 @@ static noinstr void clean_dcache_range_nopatch(u64 start, u64 end) } while (cur += d_size, cur < end); } -static void __apply_alternatives(const struct alt_region *region, - bool is_module, - unsigned long *cpucap_mask) +static int __apply_alternatives(const struct alt_region *region, + bool is_module, + unsigned long *cpucap_mask) { struct alt_instr *alt; __le32 *origptr, *updptr; @@ -166,10 +166,13 @@ static void __apply_alternatives(const struct alt_region *region, updptr = is_module ? 
origptr : lm_alias(origptr); nr_inst = alt->orig_len / AARCH64_INSN_SIZE; - if (ALT_HAS_CB(alt)) + if (ALT_HAS_CB(alt)) { alt_cb = ALT_REPL_PTR(alt); - else + if (is_module && !core_kernel_text((unsigned long)alt_cb)) + return -ENOEXEC; + } else { alt_cb = patch_alternative; + } alt_cb(alt, origptr, updptr, nr_inst); @@ -193,6 +196,8 @@ static void __apply_alternatives(const struct alt_region *region, bitmap_and(applied_alternatives, applied_alternatives, system_cpucaps, ARM64_NCAPS); } + + return 0; } static void __init apply_alternatives_vdso(void) @@ -277,7 +282,7 @@ void __init apply_boot_alternatives(void) } #ifdef CONFIG_MODULES -void apply_alternatives_module(void *start, size_t length) +int apply_alternatives_module(void *start, size_t length) { struct alt_region region = { .begin = start, @@ -287,7 +292,7 @@ void apply_alternatives_module(void *start, size_t length) bitmap_fill(all_capabilities, ARM64_NCAPS); - __apply_alternatives(®ion, true, &all_capabilities[0]); + return __apply_alternatives(®ion, true, &all_capabilities[0]); } #endif diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index e737c6295ec7..b7a1f8b788bb 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -610,6 +610,20 @@ static int __init armv8_deprecated_init(void) } #endif + +#ifdef CONFIG_SWP_EMULATION + /* + * The purpose of supporting LSUI is to eliminate PAN toggling. CPUs + * that support LSUI are unlikely to support a 32-bit runtime. Rather + * than emulating the SWP instruction using LSUI instructions, simply + * disable SWP emulation. + */ + if (cpus_have_final_cap(ARM64_HAS_LSUI)) { + insn_swp.status = INSN_UNAVAILABLE; + pr_info("swp/swpb instruction emulation is not supported on this system\n"); + } +#endif + for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { struct insn_emulation *ie = insn_emulations[i]; diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index eb1a840e4110..b6367ff3a49c 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -6,6 +6,7 @@ * 2001-2002 Keith Owens * Copyright (C) 2012 ARM Ltd. 
*/ +#define COMPILE_OFFSETS #include <linux/arm_sdei.h> #include <linux/sched.h> @@ -182,5 +183,7 @@ int main(void) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS DEFINE(FTRACE_OPS_DIRECT_CALL, offsetof(struct ftrace_ops, direct_call)); #endif + DEFINE(PIE_E0_ASM, PIE_E0); + DEFINE(PIE_E1_ASM, PIE_E1); return 0; } diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c index deff21bfa680..b68e1d328d4c 100644 --- a/arch/arm64/kernel/compat_alignment.c +++ b/arch/arm64/kernel/compat_alignment.c @@ -368,6 +368,8 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) return 1; } + if (!handler) + return 1; type = handler(addr, instr, regs); if (type == TYPE_ERROR || type == TYPE_FAULT) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 7ce555862895..5c0ab6bfd44a 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -14,31 +14,85 @@ #include <asm/kvm_asm.h> #include <asm/smp_plat.h> +static u64 target_impl_cpu_num; +static struct target_impl_cpu *target_impl_cpus; + +bool cpu_errata_set_target_impl(u64 num, void *impl_cpus) +{ + if (target_impl_cpu_num || !num || !impl_cpus) + return false; + + target_impl_cpu_num = num; + target_impl_cpus = impl_cpus; + return true; +} + +static inline bool is_midr_in_range(struct midr_range const *range) +{ + int i; + + if (!target_impl_cpu_num) + return midr_is_cpu_model_range(read_cpuid_id(), range->model, + range->rv_min, range->rv_max); + + for (i = 0; i < target_impl_cpu_num; i++) { + if (midr_is_cpu_model_range(target_impl_cpus[i].midr, + range->model, + range->rv_min, range->rv_max)) + return true; + } + return false; +} + +bool is_midr_in_range_list(struct midr_range const *ranges) +{ + while (ranges->model) + if (is_midr_in_range(ranges++)) + return true; + return false; +} +EXPORT_SYMBOL_GPL(is_midr_in_range_list); + static bool __maybe_unused -is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) +__is_affected_midr_range(const struct arm64_cpu_capabilities *entry, + u32 midr, u32 revidr) { const struct arm64_midr_revidr *fix; - u32 midr = read_cpuid_id(), revidr; - - WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - if (!is_midr_in_range(midr, &entry->midr_range)) + if (!is_midr_in_range(&entry->midr_range)) return false; midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK; - revidr = read_cpuid(REVIDR_EL1); for (fix = entry->fixed_revs; fix && fix->revidr_mask; fix++) if (midr == fix->midr_rv && (revidr & fix->revidr_mask)) return false; - return true; } static bool __maybe_unused +is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) +{ + int i; + + if (!target_impl_cpu_num) { + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return __is_affected_midr_range(entry, read_cpuid_id(), + read_cpuid(REVIDR_EL1)); + } + + for (i = 0; i < target_impl_cpu_num; i++) { + if (__is_affected_midr_range(entry, target_impl_cpus[i].midr, + target_impl_cpus[i].midr)) + return true; + } + return false; +} + +static bool __maybe_unused is_affected_midr_range_list(const struct arm64_cpu_capabilities *entry, int scope) { WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list); + return is_midr_in_range_list(entry->midr_range_list); } static bool __maybe_unused @@ -87,6 +141,30 @@ has_mismatched_cache_type(const struct arm64_cpu_capabilities *entry, return (ctr_real != sys) && (ctr_raw != sys); } +#ifdef CONFIG_ARM64_ERRATUM_4311569 
+static DEFINE_STATIC_KEY_FALSE(arm_si_l1_workaround_4311569); +static int __init early_arm_si_l1_workaround_4311569_cfg(char *arg) +{ + static_branch_enable(&arm_si_l1_workaround_4311569); + pr_info("Enabling cache maintenance workaround for ARM SI-L1 erratum 4311569\n"); + + return 0; +} +early_param("arm_si_l1_workaround_4311569", early_arm_si_l1_workaround_4311569_cfg); + +/* + * We have some earlier use cases to call cache maintenance operation functions, for example, + * dcache_inval_poc() and dcache_clean_poc() in head.S, before making decision to turn on this + * workaround. Since the scope of this workaround is limited to non-coherent DMA agents, its + * safe to have the workaround off by default. + */ +static bool +need_arm_si_l1_workaround_4311569(const struct arm64_cpu_capabilities *entry, int scope) +{ + return static_branch_unlikely(&arm_si_l1_workaround_4311569); +} +#endif + static void cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap) { @@ -186,12 +264,48 @@ static bool __maybe_unused has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry, int scope) { - u32 midr = read_cpuid_id(); bool has_dic = read_cpuid_cachetype() & BIT(CTR_EL0_DIC_SHIFT); const struct midr_range range = MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1); WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range(midr, &range) && has_dic; + return is_midr_in_range(&range) && has_dic; +} + +static const struct midr_range impdef_pmuv3_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {}, +}; + +static bool has_impdef_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + unsigned int pmuver; + + if (!is_kernel_in_hyp_mode()) + return false; + + pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_EL1_PMUVer_SHIFT); + if (pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return false; + + return is_midr_in_range_list(impdef_pmuv3_cpus); +} + +static void cpu_enable_impdef_pmuv3_traps(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set_s(SYS_HACR_EL2, 0, BIT(56)); } #ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI @@ -245,7 +359,7 @@ static const struct midr_range cavium_erratum_23154_cpus[] = { #endif #ifdef CONFIG_CAVIUM_ERRATUM_27456 -const struct midr_range cavium_erratum_27456_cpus[] = { +static const struct midr_range cavium_erratum_27456_cpus[] = { /* Cavium ThunderX, T88 pass 1.x - 2.1 */ MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), /* Cavium ThunderX, T81 pass 1.0 */ @@ -441,6 +555,7 @@ static const struct midr_range erratum_spec_ssbs_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), MIDR_ALL_VERSIONS(MIDR_CORTEX_A725), MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), @@ -455,6 +570,7 @@ static const struct midr_range erratum_spec_ssbs_list[] = { MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), 
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3AE), {} }; #endif @@ -467,6 +583,13 @@ static const struct midr_range erratum_ac03_cpu_38_list[] = { }; #endif +#ifdef CONFIG_AMPERE_ERRATUM_AC04_CPU_23 +static const struct midr_range erratum_ac04_cpu_23_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { @@ -771,6 +894,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_RANGE_LIST(erratum_spec_ssbs_list), }, #endif +#ifdef CONFIG_ARM64_ERRATUM_4311569 + { + .capability = ARM64_WORKAROUND_4311569, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = need_arm_si_l1_workaround_4311569, + }, +#endif #ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD { .desc = "ARM errata 2966298, 3117295", @@ -786,6 +916,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list), }, #endif +#ifdef CONFIG_AMPERE_ERRATUM_AC04_CPU_23 + { + .desc = "AmpereOne erratum AC04_CPU_23", + .capability = ARM64_WORKAROUND_AMPERE_AC04_CPU_23, + ERRATA_MIDR_RANGE_LIST(erratum_ac04_cpu_23_list), + }, +#endif { .desc = "Broken CNTVOFF_EL2", .capability = ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF, @@ -795,5 +932,12 @@ const struct arm64_cpu_capabilities arm64_errata[] = { })), }, { + .desc = "Apple IMPDEF PMUv3 Traps", + .capability = ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_impdef_pmuv3, + .cpu_enable = cpu_enable_impdef_pmuv3_traps, + }, + { } }; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d561cf3b8ac7..6d53bb15cf7b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -77,6 +77,7 @@ #include <linux/percpu.h> #include <linux/sched/isolation.h> +#include <asm/arm_pmuv3.h> #include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/cpu_ops.h> @@ -84,8 +85,11 @@ #include <asm/hwcap.h> #include <asm/insn.h> #include <asm/kvm_host.h> +#include <asm/mmu.h> #include <asm/mmu_context.h> +#include <asm/mpam.h> #include <asm/mte.h> +#include <asm/hypervisor.h> #include <asm/processor.h> #include <asm/smp.h> #include <asm/sysreg.h> @@ -93,6 +97,7 @@ #include <asm/vectors.h> #include <asm/virt.h> +#include <asm/spectre.h> /* Kernel representation of AT_HWCAP and AT_HWCAP2 */ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; @@ -113,7 +118,14 @@ static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NC DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS); -bool arm64_use_ng_mappings = false; +/* + * arm64_use_ng_mappings must be placed in the .data section, otherwise it + * ends up in the .bss section where it is initialized in early_map_kernel() + * after the MMU (with the idmap) was enabled. create_init_idmap() - which + * runs before early_map_kernel() and reads the variable via PTE_MAYBE_NG - + * may end up generating an incorrect idmap page table attributes. 
+ */ +bool arm64_use_ng_mappings __read_mostly = false; EXPORT_SYMBOL(arm64_use_ng_mappings); DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; @@ -230,6 +242,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_LS64_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0), @@ -270,6 +283,8 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSUI_SHIFT, 4, ID_AA64ISAR3_EL1_LSUI_NI), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSFE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -295,8 +310,10 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_DF2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_frac_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), @@ -311,6 +328,9 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { static const struct arm64_ftr_bits ftr_id_aa64pfr2[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_FPMR_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_GCIE_SHIFT, 4, ID_AA64PFR2_EL1_GCIE_NI), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_MTEFAR_SHIFT, 4, ID_AA64PFR2_EL1_MTEFAR_NI), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_MTESTOREONLY_SHIFT, 4, ID_AA64PFR2_EL1_MTESTOREONLY_NI), ARM64_FTR_END, }; @@ -491,12 +511,14 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE), FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_SCTLRX_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = { S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_NV_frac_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -547,7 +569,7 @@ static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { * We can instantiate multiple PMU instances with different levels * of support. 
*/ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_EL1_DebugVer_SHIFT, 4, 0x6), ARM64_FTR_END, }; @@ -691,7 +713,7 @@ static const struct arm64_ftr_bits ftr_id_pfr2[] = { static const struct arm64_ftr_bits ftr_id_dfr0[] = { /* [31:28] TraceFilt */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_EL1_PerfMon_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_EL1_PerfMon_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MProfDbg_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MMapTrc_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopTrc_SHIFT, 4, 0), @@ -756,17 +778,17 @@ static const struct arm64_ftr_bits ftr_raz[] = { #define ARM64_FTR_REG(id, table) \ __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) -struct arm64_ftr_override id_aa64mmfr0_override; -struct arm64_ftr_override id_aa64mmfr1_override; -struct arm64_ftr_override id_aa64mmfr2_override; -struct arm64_ftr_override id_aa64pfr0_override; -struct arm64_ftr_override id_aa64pfr1_override; -struct arm64_ftr_override id_aa64zfr0_override; -struct arm64_ftr_override id_aa64smfr0_override; -struct arm64_ftr_override id_aa64isar1_override; -struct arm64_ftr_override id_aa64isar2_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr0_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr1_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr2_override; +struct arm64_ftr_override __read_mostly id_aa64pfr0_override; +struct arm64_ftr_override __read_mostly id_aa64pfr1_override; +struct arm64_ftr_override __read_mostly id_aa64zfr0_override; +struct arm64_ftr_override __read_mostly id_aa64smfr0_override; +struct arm64_ftr_override __read_mostly id_aa64isar1_override; +struct arm64_ftr_override __read_mostly id_aa64isar2_override; -struct arm64_ftr_override arm64_sw_feature_override; +struct arm64_ftr_override __read_mostly arm64_sw_feature_override; static const struct __ftr_reg_entry { u32 sys_id; @@ -986,7 +1008,7 @@ static void __init sort_ftr_regs(void) /* * Initialise the CPU feature register from Boot CPU values. - * Also initiliases the strict_mask for the register. + * Also initialises the strict_mask for the register. * Any bits that are not covered by an arm64_ftr_bits entry are considered * RES0 for the system-wide value, and must strictly match. 
*/ @@ -1189,8 +1211,10 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) cpacr_restore(cpacr); } - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); + } if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); @@ -1401,6 +1425,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR3_EL1, cpu, info->reg_id_aa64mmfr3, boot->reg_id_aa64mmfr3); + taint |= check_update_ftr_reg(SYS_ID_AA64MMFR4_EL1, cpu, + info->reg_id_aa64mmfr4, boot->reg_id_aa64mmfr4); taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu, info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); @@ -1441,7 +1467,8 @@ void update_cpu_features(int cpu, cpacr_restore(cpacr); } - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) { + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, info->reg_mpamidr, boot->reg_mpamidr); } @@ -1647,7 +1674,7 @@ const struct cpumask *system_32bit_el0_cpumask(void) const struct cpumask *task_cpu_fallback_mask(struct task_struct *p) { - return __task_cpu_possible_mask(p, housekeeping_cpumask(HK_TYPE_TICK)); + return __task_cpu_possible_mask(p, housekeeping_cpumask(HK_TYPE_DOMAIN)); } static int __init parse_32bit_el0_param(char *str) @@ -1792,7 +1819,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, char const *str = "kpti command line option"; bool meltdown_safe; - meltdown_safe = is_midr_in_range_list(read_cpuid_id(), kpti_safe_list); + meltdown_safe = is_midr_in_range_list(kpti_safe_list); /* Defer to CPU feature registers */ if (has_cpuid_feature(entry, scope)) @@ -1862,7 +1889,7 @@ static bool has_nv1(const struct arm64_cpu_capabilities *entry, int scope) return (__system_matches_cap(ARM64_HAS_NESTED_VIRT) && !(has_cpuid_feature(entry, scope) || - is_midr_in_range_list(read_cpuid_id(), nv1_ni_list))); + is_midr_in_range_list(nv1_ni_list))); } #if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2) @@ -1898,102 +1925,18 @@ static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) } #endif -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -#define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT)) - -extern -void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, - phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), int flags); - -static phys_addr_t __initdata kpti_ng_temp_alloc; - -static phys_addr_t __init kpti_ng_pgd_alloc(int shift) +#ifdef CONFIG_HW_PERF_EVENTS +static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) { - kpti_ng_temp_alloc -= PAGE_SIZE; - return kpti_ng_temp_alloc; -} - -static int __init __kpti_install_ng_mappings(void *__unused) -{ - typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long); - extern kpti_remap_fn idmap_kpti_install_ng_mappings; - kpti_remap_fn *remap_fn; - - int cpu = smp_processor_id(); - int levels = CONFIG_PGTABLE_LEVELS; - int order = order_base_2(levels); - u64 kpti_ng_temp_pgd_pa = 0; - pgd_t *kpti_ng_temp_pgd; - u64 alloc = 0; - - if (levels == 5 && !pgtable_l5_enabled()) - levels = 4; - else if (levels == 4 && !pgtable_l4_enabled()) - levels = 3; - - remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); - - if (!cpu) { - 
alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); - kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE); - kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd); - - // - // Create a minimal page table hierarchy that permits us to map - // the swapper page tables temporarily as we traverse them. - // - // The physical pages are laid out as follows: - // - // +--------+-/-------+-/------ +-/------ +-\\\--------+ - // : PTE[] : | PMD[] : | PUD[] : | P4D[] : ||| PGD[] : - // +--------+-\-------+-\------ +-\------ +-///--------+ - // ^ - // The first page is mapped into this hierarchy at a PMD_SHIFT - // aligned virtual address, so that we can manipulate the PTE - // level entries while the mapping is active. The first entry - // covers the PTE[] page itself, the remaining entries are free - // to be used as a ad-hoc fixmap. - // - create_kpti_ng_temp_pgd(kpti_ng_temp_pgd, __pa(alloc), - KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL, - kpti_ng_pgd_alloc, 0); - } + u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + unsigned int pmuver; - cpu_install_idmap(); - remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA); - cpu_uninstall_idmap(); + pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_EL1_PMUVer_SHIFT); - if (!cpu) { - free_pages(alloc, order); - arm64_use_ng_mappings = true; - } - - return 0; + return pmuv3_implemented(pmuver); } - -static void __init kpti_install_ng_mappings(void) -{ - /* Check whether KPTI is going to be used */ - if (!arm64_kernel_unmapped_at_el0()) - return; - - /* - * We don't need to rewrite the page-tables if either we've done - * it already or we have KASLR enabled and therefore have not - * created any global mappings at all. - */ - if (arm64_use_ng_mappings) - return; - - stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask); -} - -#else -static inline void kpti_install_ng_mappings(void) -{ -} -#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ +#endif static void cpu_enable_kpti(struct arm64_cpu_capabilities const *cap) { @@ -2023,7 +1966,7 @@ static struct cpumask dbm_cpus __read_mostly; static inline void __cpu_enable_hw_dbm(void) { - u64 tcr = read_sysreg(tcr_el1) | TCR_HD; + u64 tcr = read_sysreg(tcr_el1) | TCR_EL1_HD; write_sysreg(tcr, tcr_el1); isb(); @@ -2045,7 +1988,7 @@ static bool cpu_has_broken_dbm(void) {}, }; - return is_midr_in_range_list(read_cpuid_id(), cpus); + return is_midr_in_range_list(cpus); } static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap) @@ -2162,7 +2105,7 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap, if (kvm_get_mode() != KVM_MODE_NV) return false; - if (!has_cpuid_feature(cap, scope)) { + if (!cpucap_multi_entry_cap_matches(cap, scope)) { pr_warn("unavailable: %s\n", cap->desc); return false; } @@ -2176,7 +2119,47 @@ static bool hvhe_possible(const struct arm64_cpu_capabilities *entry, return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE); } -#ifdef CONFIG_ARM64_PAN +bool cpu_supports_bbml2_noabort(void) +{ + /* + * We want to allow usage of BBML2 in as wide a range of kernel contexts + * as possible. This list is therefore an allow-list of known-good + * implementations that both support BBML2 and additionally, fulfill the + * extra constraint of never generating TLB conflict aborts when using + * the relaxed BBML2 semantics (such aborts make use of BBML2 in certain + * kernel contexts difficult to prove safe against recursive aborts). 
+ * + * Note that implementations can only be considered "known-good" if their + * implementors attest to the fact that the implementation never raises + * TLB conflict aborts for BBML2 mapping granularity changes. + */ + static const struct midr_range supports_bbml2_noabort_list[] = { + MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf), + MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf), + MIDR_REV_RANGE(MIDR_NEOVERSE_V3AE, 0, 2, 0xf), + MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS), + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {} + }; + + /* Does our cpu guarantee to never raise TLB conflict aborts? */ + if (!is_midr_in_range_list(supports_bbml2_noabort_list)) + return false; + + /* + * We currently ignore the ID_AA64MMFR2_EL1 register, and only care + * about whether the MIDR check passes. + */ + + return true; +} + +static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope) +{ + return cpu_supports_bbml2_noabort(); +} + static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) { /* @@ -2188,7 +2171,6 @@ static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) sysreg_clear_set(sctlr_el1, SCTLR_EL1_SPAN, 0); set_pstate_pan(1); } -#endif /* CONFIG_ARM64_PAN */ #ifdef CONFIG_ARM64_RAS_EXTN static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused) @@ -2196,6 +2178,24 @@ static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused) /* Firmware may have left a deferred SError in this register. */ write_sysreg_s(0, SYS_DISR_EL1); } +static bool has_rasv1p1(const struct arm64_cpu_capabilities *__unused, int scope) +{ + const struct arm64_cpu_capabilities rasv1p1_caps[] = { + { + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, V1P1) + }, + { + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP) + }, + { + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, RAS_frac, RASv1p1) + }, + }; + + return (has_cpuid_feature(&rasv1p1_caps[0], scope) || + (has_cpuid_feature(&rasv1p1_caps[1], scope) && + has_cpuid_feature(&rasv1p1_caps[2], scope))); +} #endif /* CONFIG_ARM64_RAS_EXTN */ #ifdef CONFIG_ARM64_PTR_AUTH @@ -2250,20 +2250,30 @@ static bool has_generic_auth(const struct arm64_cpu_capabilities *entry, static void cpu_enable_e0pd(struct arm64_cpu_capabilities const *cap) { if (this_cpu_has_cap(ARM64_HAS_E0PD)) - sysreg_clear_set(tcr_el1, 0, TCR_E0PD1); + sysreg_clear_set(tcr_el1, 0, TCR_EL1_E0PD1); } #endif /* CONFIG_ARM64_E0PD */ +static void cpu_enable_ls64(struct arm64_cpu_capabilities const *cap) +{ + sysreg_clear_set(sctlr_el1, SCTLR_EL1_EnALS, SCTLR_EL1_EnALS); +} + +static void cpu_enable_ls64_v(struct arm64_cpu_capabilities const *cap) +{ + sysreg_clear_set(sctlr_el1, SCTLR_EL1_EnASR, 0); +} + #ifdef CONFIG_ARM64_PSEUDO_NMI static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry, int scope) { /* - * ARM64_HAS_GIC_CPUIF_SYSREGS has a lower index, and is a boot CPU + * ARM64_HAS_GICV3_CPUIF has a lower index, and is a boot CPU * feature, so will be detected earlier. 
*/ - BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_MASKING <= ARM64_HAS_GIC_CPUIF_SYSREGS); - if (!cpus_have_cap(ARM64_HAS_GIC_CPUIF_SYSREGS)) + BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_MASKING <= ARM64_HAS_GICV3_CPUIF); + if (!cpus_have_cap(ARM64_HAS_GICV3_CPUIF)) return false; return enable_pseudo_nmi; @@ -2298,6 +2308,58 @@ static bool has_gic_prio_relaxed_sync(const struct arm64_cpu_capabilities *entry } #endif +static bool can_trap_icv_dir_el1(const struct arm64_cpu_capabilities *entry, + int scope) +{ + static const struct midr_range has_vgic_v3[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {}, + }; + struct arm_smccc_res res = {}; + + BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV3_CPUIF); + BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV5_LEGACY); + if (!is_hyp_mode_available()) + return false; + + if (this_cpu_has_cap(ARM64_HAS_GICV5_LEGACY)) + return true; + + if (!this_cpu_has_cap(ARM64_HAS_GICV3_CPUIF) && + !is_midr_in_range_list(has_vgic_v3)) + return false; + + /* + * pKVM prevents late onlining of CPUs. This means that whatever + * state the capability is in after deprivilege cannot be affected + * by a new CPU booting -- this is garanteed to be a CPU we have + * already seen, and the cap is therefore unchanged. + */ + if (system_capabilities_finalized() && is_protected_kvm_enabled()) + return cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR); + + if (is_kernel_in_hyp_mode()) + res.a1 = read_sysreg_s(SYS_ICH_VTR_EL2); + else + arm_smccc_1_1_hvc(HVC_GET_ICH_VTR_EL2, &res); + + if (res.a0 == HVC_STUB_ERR) + return false; + + return res.a1 & ICH_VTR_EL2_TDS; +} + #ifdef CONFIG_ARM64_BTI static void bti_enable(const struct arm64_cpu_capabilities *__unused) { @@ -2316,17 +2378,21 @@ static void bti_enable(const struct arm64_cpu_capabilities *__unused) #ifdef CONFIG_ARM64_MTE static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) { + static bool cleared_zero_page = false; + sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ATA | SCTLR_EL1_ATA0); mte_cpu_setup(); /* * Clear the tags in the zero page. This needs to be done via the - * linear map which has the Tagged attribute. + * linear map which has the Tagged attribute. Since this page is + * always mapped as pte_special(), set_pte_at() will not attempt to + * clear the tags or set PG_mte_tagged. */ - if (try_page_mte_tagging(ZERO_PAGE(0))) { + if (!cleared_zero_page) { + cleared_zero_page = true; mte_clear_page_tags(lm_alias(empty_zero_page)); - set_page_mte_tagged(ZERO_PAGE(0)); } kasan_init_hw_tags_cpu(); @@ -2430,13 +2496,19 @@ test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) static void cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) { - /* - * Access by the kernel (at EL1) should use the reserved PARTID - * which is configured unrestricted. This avoids priority-inversion - * where latency sensitive tasks have to wait for a task that has - * been throttled to release the lock. 
- */ - write_sysreg_s(0, SYS_MPAM1_EL1); + int cpu = smp_processor_id(); + u64 regval = 0; + + if (IS_ENABLED(CONFIG_ARM64_MPAM) && static_branch_likely(&mpam_enabled)) + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (cpus_have_cap(ARM64_SME)) + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1); + isb(); + + /* Synchronising the EL0 write is left until the ERET to EL0 */ + write_sysreg_s(regval, SYS_MPAM0_EL1); } static bool @@ -2447,6 +2519,15 @@ test_has_mpam_hcr(const struct arm64_cpu_capabilities *entry, int scope) return idr & MPAMIDR_EL1_HAS_HCR; } +static bool +test_has_gicv5_legacy(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!this_cpu_has_cap(ARM64_HAS_GICV5_CPUIF)) + return false; + + return !!(read_sysreg_s(SYS_ICC_IDR0_EL1) & ICC_IDR0_EL1_GCIE_LEGACY); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .capability = ARM64_ALWAYS_BOOT, @@ -2459,8 +2540,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_always, }, { - .desc = "GIC system register CPU interface", - .capability = ARM64_HAS_GIC_CPUIF_SYSREGS, + .desc = "GICv3 CPU interface", + .capability = ARM64_HAS_GICV3_CPUIF, .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_useable_gicv3_cpuif, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, GIC, IMP) @@ -2479,7 +2560,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, ECV, CNTPOFF) }, -#ifdef CONFIG_ARM64_PAN { .desc = "Privileged Access Never", .capability = ARM64_HAS_PAN, @@ -2488,7 +2568,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_pan, ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, PAN, IMP) }, -#endif /* CONFIG_ARM64_PAN */ #ifdef CONFIG_ARM64_EPAN { .desc = "Enhanced Privileged Access Never", @@ -2498,7 +2577,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, PAN, PAN3) }, #endif /* CONFIG_ARM64_EPAN */ -#ifdef CONFIG_ARM64_LSE_ATOMICS { .desc = "LSE atomic instructions", .capability = ARM64_HAS_LSE_ATOMICS, @@ -2506,7 +2584,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, ATOMIC, IMP) }, -#endif /* CONFIG_ARM64_LSE_ATOMICS */ { .desc = "Virtualization Host Extensions", .capability = ARM64_HAS_VIRT_HOST_EXTN, @@ -2519,7 +2596,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_NESTED_VIRT, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_nested_virt_support, - ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2) + .match_list = (const struct arm64_cpu_capabilities []){ + { + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2) + }, + { + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY) + }, + { /* Sentinel */ } + }, }, { .capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE, @@ -2604,6 +2691,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_clear_disr, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP) }, + { + .desc = "RASv1p1 Extension Support", + .capability = ARM64_HAS_RASV1P1_EXTN, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_rasv1p1, + }, #endif /* CONFIG_ARM64_RAS_EXTN */ #ifdef CONFIG_ARM64_AMU_EXTN { @@ -2780,6 +2873,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { 
.matches = has_gic_prio_relaxed_sync, }, #endif + { + /* + * Depends on having GICv3 + */ + .desc = "ICV_DIR_EL1 trapping", + .capability = ARM64_HAS_ICH_HCR_EL2_TDIR, + .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE, + .matches = can_trap_icv_dir_el1, + }, #ifdef CONFIG_ARM64_E0PD { .desc = "E0PD", @@ -2827,6 +2929,20 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, MTE, MTE3) }, + { + .desc = "FAR on MTE Tag Check Fault", + .capability = ARM64_MTE_FAR, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, MTEFAR, IMP) + }, + { + .desc = "Store Only MTE Tag Check", + .capability = ARM64_MTE_STORE_ONLY, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, MTESTOREONLY, IMP) + }, #endif /* CONFIG_ARM64_MTE */ { .desc = "RCpc load-acquire (LDAPR)", @@ -2842,6 +2958,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, IMP) }, + { + .desc = "Fine Grained Traps 2", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_FGT2, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, FGT2) + }, #ifdef CONFIG_ARM64_SME { .desc = "Scalable Matrix Extension", @@ -2927,6 +3050,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP) }, { + .desc = "BBM Level 2 without TLB conflict abort", + .capability = ARM64_HAS_BBML2_NOABORT, + .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE, + .matches = has_bbml2_noabort, + }, + { .desc = "52-bit Virtual Addressing for KVM (LPA2)", .capability = ARM64_HAS_LPA2, .type = ARM64_CPUCAP_SYSTEM_FEATURE, @@ -2999,6 +3128,66 @@ static const struct arm64_cpu_capabilities arm64_features[] = { ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP) }, #endif +#ifdef CONFIG_HW_PERF_EVENTS + { + .desc = "PMUv3", + .capability = ARM64_HAS_PMUV3, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_pmuv3, + }, +#endif + { + .desc = "SCTLR2", + .capability = ARM64_HAS_SCTLR2, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, SCTLRX, IMP) + }, + { + .desc = "GICv5 CPU interface", + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .capability = ARM64_HAS_GICV5_CPUIF, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, GCIE, IMP) + }, + { + .desc = "GICv5 Legacy vCPU interface", + .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE, + .capability = ARM64_HAS_GICV5_LEGACY, + .matches = test_has_gicv5_legacy, + }, + { + .desc = "XNX", + .capability = ARM64_HAS_XNX, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, XNX, IMP) + }, + { + .desc = "LS64", + .capability = ARM64_HAS_LS64, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_ls64, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, LS64, LS64) + }, + { + .desc = "LS64_V", + .capability = ARM64_HAS_LS64_V, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_ls64_v, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, LS64, LS64_V) + }, +#ifdef CONFIG_ARM64_LSUI + { + .desc = "Unprivileged Load Store Instructions (LSUI)", + .capability = ARM64_HAS_LSUI, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR3_EL1, LSUI, IMP) + }, 
+#endif {}, }; @@ -3073,6 +3262,13 @@ static bool has_sve_feature(const struct arm64_cpu_capabilities *cap, int scope) } #endif +#ifdef CONFIG_ARM64_SME +static bool has_sme_feature(const struct arm64_cpu_capabilities *cap, int scope) +{ + return system_supports_sme() && has_user_cpuid_feature(cap, scope); +} +#endif + static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, AES, PMULL, CAP_HWCAP, KERNEL_HWCAP_PMULL), HWCAP_CAP(ID_AA64ISAR0_EL1, AES, AES, CAP_HWCAP, KERNEL_HWCAP_AES), @@ -3111,8 +3307,10 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_EBF16), HWCAP_CAP(ID_AA64ISAR1_EL1, DGH, IMP, CAP_HWCAP, KERNEL_HWCAP_DGH), HWCAP_CAP(ID_AA64ISAR1_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_I8MM), + HWCAP_CAP(ID_AA64ISAR1_EL1, LS64, LS64, CAP_HWCAP, KERNEL_HWCAP_LS64), HWCAP_CAP(ID_AA64ISAR2_EL1, LUT, IMP, CAP_HWCAP, KERNEL_HWCAP_LUT), HWCAP_CAP(ID_AA64ISAR3_EL1, FAMINMAX, IMP, CAP_HWCAP, KERNEL_HWCAP_FAMINMAX), + HWCAP_CAP(ID_AA64ISAR3_EL1, LSFE, IMP, CAP_HWCAP, KERNEL_HWCAP_LSFE), HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), @@ -3149,6 +3347,8 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { #ifdef CONFIG_ARM64_MTE HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE2, CAP_HWCAP, KERNEL_HWCAP_MTE), HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE3, CAP_HWCAP, KERNEL_HWCAP_MTE3), + HWCAP_CAP(ID_AA64PFR2_EL1, MTEFAR, IMP, CAP_HWCAP, KERNEL_HWCAP_MTE_FAR), + HWCAP_CAP(ID_AA64PFR2_EL1, MTESTOREONLY, IMP, CAP_HWCAP , KERNEL_HWCAP_MTE_STORE_ONLY), #endif /* CONFIG_ARM64_MTE */ HWCAP_CAP(ID_AA64MMFR0_EL1, ECV, IMP, CAP_HWCAP, KERNEL_HWCAP_ECV), HWCAP_CAP(ID_AA64MMFR1_EL1, AFP, IMP, CAP_HWCAP, KERNEL_HWCAP_AFP), @@ -3161,31 +3361,31 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR2_EL1, BC, IMP, CAP_HWCAP, KERNEL_HWCAP_HBC), #ifdef CONFIG_ARM64_SME HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME), - HWCAP_CAP(ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), - HWCAP_CAP(ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2), - HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2), - HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1), - HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2), - HWCAP_CAP(ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), - HWCAP_CAP(ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64), - HWCAP_CAP(ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32), - HWCAP_CAP(ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16), - HWCAP_CAP(ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16), - HWCAP_CAP(ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16), - HWCAP_CAP(ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32), - HWCAP_CAP(ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), - HWCAP_CAP(ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), - HWCAP_CAP(ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), - HWCAP_CAP(ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32), - HWCAP_CAP(ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), - 
HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), - HWCAP_CAP(ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), - HWCAP_CAP(ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES), - HWCAP_CAP(ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), - HWCAP_CAP(ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP), - HWCAP_CAP(ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4), #endif /* CONFIG_ARM64_SME */ HWCAP_CAP(ID_AA64FPFR0_EL1, F8CVT, IMP, CAP_HWCAP, KERNEL_HWCAP_F8CVT), HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA), @@ -3308,18 +3508,49 @@ static void update_cpu_capabilities(u16 scope_mask) scope_mask &= ARM64_CPUCAP_SCOPE_MASK; for (i = 0; i < ARM64_NCAPS; i++) { + bool match_all = false; + bool caps_set = false; 
+ bool boot_cpu = false; + caps = cpucap_ptrs[i]; - if (!caps || !(caps->type & scope_mask) || - cpus_have_cap(caps->capability) || - !caps->matches(caps, cpucap_default_scope(caps))) + if (!caps || !(caps->type & scope_mask)) continue; - if (caps->desc && !caps->cpus) + match_all = cpucap_match_all_early_cpus(caps); + caps_set = cpus_have_cap(caps->capability); + boot_cpu = scope_mask & SCOPE_BOOT_CPU; + + /* + * Unless it's a match-all CPUs feature, avoid probing if + * already detected. + */ + if (!match_all && caps_set) + continue; + + /* + * A match-all CPUs capability is only set when probing the + * boot CPU. It may be cleared subsequently if not detected on + * secondary ones. + */ + if (match_all && !caps_set && !boot_cpu) + continue; + + if (!caps->matches(caps, cpucap_default_scope(caps))) { + if (match_all) + __clear_bit(caps->capability, system_cpucaps); + continue; + } + + /* + * Match-all CPUs capabilities are logged later when the + * system capabilities are finalised. + */ + if (!match_all && caps->desc && !caps->cpus) pr_info("detected: %s\n", caps->desc); __set_bit(caps->capability, system_cpucaps); - if ((scope_mask & SCOPE_BOOT_CPU) && (caps->type & SCOPE_BOOT_CPU)) + if (boot_cpu && (caps->type & SCOPE_BOOT_CPU)) set_bit(caps->capability, boot_cpucaps); } } @@ -3680,6 +3911,7 @@ unsigned long cpu_get_elf_hwcap3(void) static void __init setup_boot_cpu_capabilities(void) { + kvm_arm_target_impl_cpu_init(); /* * The boot CPU's feature register values have been recorded. Detect * boot cpucaps and local cpucaps for the boot CPU, then enable and @@ -3719,17 +3951,24 @@ static void __init setup_system_capabilities(void) enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU); apply_alternatives_all(); - /* - * Log any cpucaps with a cpumask as these aren't logged by - * update_cpu_capabilities(). - */ for (int i = 0; i < ARM64_NCAPS; i++) { const struct arm64_cpu_capabilities *caps = cpucap_ptrs[i]; - if (caps && caps->cpus && caps->desc && - cpumask_any(caps->cpus) < nr_cpu_ids) + if (!caps || !caps->desc) + continue; + + /* + * Log any cpucaps with a cpumask as these aren't logged by + * update_cpu_capabilities(). + */ + if (caps->cpus && cpumask_any(caps->cpus) < nr_cpu_ids) pr_info("detected: %s on CPU%*pbl\n", caps->desc, cpumask_pr_args(caps->cpus)); + + /* Log match-all CPUs capabilities */ + if (cpucap_match_all_early_cpus(caps) && + cpus_have_cap(caps->capability)) + pr_info("detected: %s\n", caps->desc); } /* @@ -3737,12 +3976,18 @@ static void __init setup_system_capabilities(void) */ if (system_uses_ttbr0_pan()) pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); + + /* + * Report Spectre mitigations status. 
+ */ + spectre_print_disabled_mitigations(); } void __init setup_system_features(void) { setup_system_capabilities(); + linear_map_maybe_split_to_ptes(); kpti_install_ng_mappings(); sve_setup(); @@ -3783,8 +4028,8 @@ static int enable_mismatched_32bit_el0(unsigned int cpu) bool cpu_32bit = false; if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - if (!housekeeping_cpu(cpu, HK_TYPE_TICK)) - pr_info("Treating adaptive-ticks CPU %u as 64-bit only\n", cpu); + if (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN)) + pr_info("Treating domain isolated CPU %u as 64-bit only\n", cpu); else cpu_32bit = true; } diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 285d7d538342..6149bc91251d 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -81,6 +81,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_PACA] = "paca", [KERNEL_HWCAP_PACG] = "pacg", [KERNEL_HWCAP_GCS] = "gcs", + [KERNEL_HWCAP_LS64] = "ls64", [KERNEL_HWCAP_DCPODP] = "dcpodp", [KERNEL_HWCAP_SVE2] = "sve2", [KERNEL_HWCAP_SVEAES] = "sveaes", @@ -160,6 +161,9 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SME_SFEXPA] = "smesfexpa", [KERNEL_HWCAP_SME_STMOP] = "smestmop", [KERNEL_HWCAP_SME_SMOP4] = "smesmop4", + [KERNEL_HWCAP_MTE_FAR] = "mtefar", + [KERNEL_HWCAP_MTE_STORE_ONLY] = "mtestoreonly", + [KERNEL_HWCAP_LSFE] = "lsfe", }; #ifdef CONFIG_COMPAT @@ -209,80 +213,79 @@ static const char *const compat_hwcap2_str[] = { static int c_show(struct seq_file *m, void *v) { - int i, j; + int j; + int cpu = m->index; bool compat = personality(current->personality) == PER_LINUX32; + struct cpuinfo_arm64 *cpuinfo = v; + u32 midr = cpuinfo->reg_midr; - for_each_online_cpu(i) { - struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i); - u32 midr = cpuinfo->reg_midr; - - /* - * glibc reads /proc/cpuinfo to determine the number of - * online processors, looking for lines beginning with - * "processor". Give glibc what it expects. - */ - seq_printf(m, "processor\t: %d\n", i); - if (compat) - seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", - MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); + /* + * glibc reads /proc/cpuinfo to determine the number of + * online processors, looking for lines beginning with + * "processor". Give glibc what it expects. + */ + seq_printf(m, "processor\t: %d\n", cpu); + if (compat) + seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", + MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); - seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", - loops_per_jiffy / (500000UL/HZ), - loops_per_jiffy / (5000UL/HZ) % 100); + seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", + loops_per_jiffy / (500000UL/HZ), + loops_per_jiffy / (5000UL/HZ) % 100); - /* - * Dump out the common processor features in a single line. - * Userspace should read the hwcaps with getauxval(AT_HWCAP) - * rather than attempting to parse this, but there's a body of - * software which does already (at least for 32-bit). - */ - seq_puts(m, "Features\t:"); - if (compat) { + /* + * Dump out the common processor features in a single line. + * Userspace should read the hwcaps with getauxval(AT_HWCAP) + * rather than attempting to parse this, but there's a body of + * software which does already (at least for 32-bit). + */ + seq_puts(m, "Features\t:"); + if (compat) { #ifdef CONFIG_COMPAT - for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { - if (compat_elf_hwcap & (1 << j)) { - /* - * Warn once if any feature should not - * have been present on arm64 platform. 
- */ - if (WARN_ON_ONCE(!compat_hwcap_str[j])) - continue; - - seq_printf(m, " %s", compat_hwcap_str[j]); - } + for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { + if (compat_elf_hwcap & (1 << j)) { + /* + * Warn once if any feature should not + * have been present on arm64 platform. + */ + if (WARN_ON_ONCE(!compat_hwcap_str[j])) + continue; + + seq_printf(m, " %s", compat_hwcap_str[j]); } + } - for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) - if (compat_elf_hwcap2 & (1 << j)) - seq_printf(m, " %s", compat_hwcap2_str[j]); + for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) + if (compat_elf_hwcap2 & (1 << j)) + seq_printf(m, " %s", compat_hwcap2_str[j]); #endif /* CONFIG_COMPAT */ - } else { - for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) - if (cpu_have_feature(j)) - seq_printf(m, " %s", hwcap_str[j]); - } - seq_puts(m, "\n"); - - seq_printf(m, "CPU implementer\t: 0x%02x\n", - MIDR_IMPLEMENTOR(midr)); - seq_printf(m, "CPU architecture: 8\n"); - seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr)); - seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr)); - seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr)); + } else { + for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) + if (cpu_have_feature(j)) + seq_printf(m, " %s", hwcap_str[j]); } + seq_puts(m, "\n"); + + seq_printf(m, "CPU implementer\t: 0x%02x\n", + MIDR_IMPLEMENTOR(midr)); + seq_puts(m, "CPU architecture: 8\n"); + seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr)); + seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr)); + seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr)); return 0; } static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < 1 ? (void *)1 : NULL; + *pos = cpumask_next(*pos - 1, cpu_online_mask); + return *pos < nr_cpu_ids ? &per_cpu(cpu_data, *pos) : NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { ++*pos; - return NULL; + return c_start(m, pos); } static void c_stop(struct seq_file *m, void *v) @@ -328,11 +331,13 @@ static const struct kobj_type cpuregs_kobj_type = { CPUREGS_ATTR_RO(midr_el1, midr); CPUREGS_ATTR_RO(revidr_el1, revidr); +CPUREGS_ATTR_RO(aidr_el1, aidr); CPUREGS_ATTR_RO(smidr_el1, smidr); static struct attribute *cpuregs_id_attrs[] = { &cpuregs_attr_midr_el1.attr, &cpuregs_attr_revidr_el1.attr, + &cpuregs_attr_aidr_el1.attr, NULL }; @@ -469,6 +474,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_dczid = read_cpuid(DCZID_EL0); info->reg_midr = read_cpuid_id(); info->reg_revidr = read_cpuid(REVIDR_EL1); + info->reg_aidr = read_cpuid(AIDR_EL1); info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1); info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); @@ -494,8 +500,11 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) __cpuinfo_store_cpu_32bit(&info->aarch32); - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) - info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + /* + * info->reg_mpamidr deferred to {init,update}_cpu_features because we + * don't want to read it (and trigger a trap on buggy firmware) if + * using an aa64pfr0_el1 override to unconditionally disable MPAM. 
+ */ if (IS_ENABLED(CONFIG_ARM64_SME) && id_aa64pfr1_sme(info->reg_id_aa64pfr1)) { diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 58f047de3e1c..29307642f4c9 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -21,8 +21,12 @@ #include <asm/cputype.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> +#include <asm/exception.h> +#include <asm/kgdb.h> +#include <asm/kprobes.h> #include <asm/system_misc.h> #include <asm/traps.h> +#include <asm/uprobes.h> /* Determine debug architecture. */ u8 debug_monitors_arch(void) @@ -34,7 +38,7 @@ u8 debug_monitors_arch(void) /* * MDSCR access routines. */ -static void mdscr_write(u32 mdscr) +static void mdscr_write(u64 mdscr) { unsigned long flags; flags = local_daif_save(); @@ -43,7 +47,7 @@ static void mdscr_write(u32 mdscr) } NOKPROBE_SYMBOL(mdscr_write); -static u32 mdscr_read(void) +static u64 mdscr_read(void) { return read_sysreg(mdscr_el1); } @@ -79,16 +83,16 @@ static DEFINE_PER_CPU(int, kde_ref_count); void enable_debug_monitors(enum dbg_active_el el) { - u32 mdscr, enable = 0; + u64 mdscr, enable = 0; WARN_ON(preemptible()); if (this_cpu_inc_return(mde_ref_count) == 1) - enable = DBG_MDSCR_MDE; + enable = MDSCR_EL1_MDE; if (el == DBG_ACTIVE_EL1 && this_cpu_inc_return(kde_ref_count) == 1) - enable |= DBG_MDSCR_KDE; + enable |= MDSCR_EL1_KDE; if (enable && debug_enabled) { mdscr = mdscr_read(); @@ -100,16 +104,16 @@ NOKPROBE_SYMBOL(enable_debug_monitors); void disable_debug_monitors(enum dbg_active_el el) { - u32 mdscr, disable = 0; + u64 mdscr, disable = 0; WARN_ON(preemptible()); if (this_cpu_dec_return(mde_ref_count) == 0) - disable = ~DBG_MDSCR_MDE; + disable = ~MDSCR_EL1_MDE; if (el == DBG_ACTIVE_EL1 && this_cpu_dec_return(kde_ref_count) == 0) - disable &= ~DBG_MDSCR_KDE; + disable &= ~MDSCR_EL1_KDE; if (disable) { mdscr = mdscr_read(); @@ -156,187 +160,124 @@ NOKPROBE_SYMBOL(clear_user_regs_spsr_ss); #define set_regs_spsr_ss(r) set_user_regs_spsr_ss(&(r)->user_regs) #define clear_regs_spsr_ss(r) clear_user_regs_spsr_ss(&(r)->user_regs) -static DEFINE_SPINLOCK(debug_hook_lock); -static LIST_HEAD(user_step_hook); -static LIST_HEAD(kernel_step_hook); - -static void register_debug_hook(struct list_head *node, struct list_head *list) +static void send_user_sigtrap(int si_code) { - spin_lock(&debug_hook_lock); - list_add_rcu(node, list); - spin_unlock(&debug_hook_lock); - -} + struct pt_regs *regs = current_pt_regs(); -static void unregister_debug_hook(struct list_head *node) -{ - spin_lock(&debug_hook_lock); - list_del_rcu(node); - spin_unlock(&debug_hook_lock); - synchronize_rcu(); -} + if (WARN_ON(!user_mode(regs))) + return; -void register_user_step_hook(struct step_hook *hook) -{ - register_debug_hook(&hook->node, &user_step_hook); -} + if (!regs_irqs_disabled(regs)) + local_irq_enable(); -void unregister_user_step_hook(struct step_hook *hook) -{ - unregister_debug_hook(&hook->node); + arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs), + "User debug trap"); } -void register_kernel_step_hook(struct step_hook *hook) +/* + * We have already unmasked interrupts and enabled preemption + * when calling do_el0_softstep() from entry-common.c. 
+ */ +void do_el0_softstep(unsigned long esr, struct pt_regs *regs) { - register_debug_hook(&hook->node, &kernel_step_hook); -} + if (uprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) + return; -void unregister_kernel_step_hook(struct step_hook *hook) -{ - unregister_debug_hook(&hook->node); + send_user_sigtrap(TRAP_TRACE); + /* + * ptrace will disable single step unless explicitly + * asked to re-enable it. For other clients, it makes + * sense to leave it enabled (i.e. rewind the controls + * to the active-not-pending state). + */ + user_rewind_single_step(current); } -/* - * Call registered single step handlers - * There is no Syndrome info to check for determining the handler. - * So we call all the registered handlers, until the right handler is - * found which returns zero. - */ -static int call_step_hook(struct pt_regs *regs, unsigned long esr) +void do_el1_softstep(unsigned long esr, struct pt_regs *regs) { - struct step_hook *hook; - struct list_head *list; - int retval = DBG_HOOK_ERROR; - - list = user_mode(regs) ? &user_step_hook : &kernel_step_hook; + if (kgdb_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) + return; + pr_warn("Unexpected kernel single-step exception at EL1\n"); /* - * Since single-step exception disables interrupt, this function is - * entirely not preemptible, and we can use rcu list safely here. + * Re-enable stepping since we know that we will be + * returning to regs. */ - list_for_each_entry_rcu(hook, list, node) { - retval = hook->fn(regs, esr); - if (retval == DBG_HOOK_HANDLED) - break; - } - - return retval; + set_regs_spsr_ss(regs); } -NOKPROBE_SYMBOL(call_step_hook); +NOKPROBE_SYMBOL(do_el1_softstep); -static void send_user_sigtrap(int si_code) +static int call_el1_break_hook(struct pt_regs *regs, unsigned long esr) { - struct pt_regs *regs = current_pt_regs(); + if (esr_brk_comment(esr) == BUG_BRK_IMM) + return bug_brk_handler(regs, esr); - if (WARN_ON(!user_mode(regs))) - return; + if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) + return cfi_brk_handler(regs, esr); - if (interrupts_enabled(regs)) - local_irq_enable(); + if (esr_brk_comment(esr) == FAULT_BRK_IMM) + return reserved_fault_brk_handler(regs, esr); - arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs), - "User debug trap"); -} + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) && + (esr_brk_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) + return kasan_brk_handler(regs, esr); -static int single_step_handler(unsigned long unused, unsigned long esr, - struct pt_regs *regs) -{ - bool handler_found = false; + if (IS_ENABLED(CONFIG_UBSAN_TRAP) && esr_is_ubsan_brk(esr)) + return ubsan_brk_handler(regs, esr); - /* - * If we are stepping a pending breakpoint, call the hw_breakpoint - * handler first. - */ - if (!reinstall_suspended_bps(regs)) - return 0; - - if (!handler_found && call_step_hook(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; - - if (!handler_found && user_mode(regs)) { - send_user_sigtrap(TRAP_TRACE); - - /* - * ptrace will disable single step unless explicitly - * asked to re-enable it. For other clients, it makes - * sense to leave it enabled (i.e. rewind the controls - * to the active-not-pending state). - */ - user_rewind_single_step(current); - } else if (!handler_found) { - pr_warn("Unexpected kernel single-step exception at EL1\n"); - /* - * Re-enable stepping since we know that we will be - * returning to regs. 
- */ - set_regs_spsr_ss(regs); + if (IS_ENABLED(CONFIG_KGDB)) { + if (esr_brk_comment(esr) == KGDB_DYN_DBG_BRK_IMM) + return kgdb_brk_handler(regs, esr); + if (esr_brk_comment(esr) == KGDB_COMPILED_DBG_BRK_IMM) + return kgdb_compiled_brk_handler(regs, esr); } - return 0; -} -NOKPROBE_SYMBOL(single_step_handler); - -static LIST_HEAD(user_break_hook); -static LIST_HEAD(kernel_break_hook); + if (IS_ENABLED(CONFIG_KPROBES)) { + if (esr_brk_comment(esr) == KPROBES_BRK_IMM) + return kprobe_brk_handler(regs, esr); + if (esr_brk_comment(esr) == KPROBES_BRK_SS_IMM) + return kprobe_ss_brk_handler(regs, esr); + } -void register_user_break_hook(struct break_hook *hook) -{ - register_debug_hook(&hook->node, &user_break_hook); -} + if (IS_ENABLED(CONFIG_KRETPROBES) && + esr_brk_comment(esr) == KRETPROBES_BRK_IMM) + return kretprobe_brk_handler(regs, esr); -void unregister_user_break_hook(struct break_hook *hook) -{ - unregister_debug_hook(&hook->node); + return DBG_HOOK_ERROR; } +NOKPROBE_SYMBOL(call_el1_break_hook); -void register_kernel_break_hook(struct break_hook *hook) +/* + * We have already unmasked interrupts and enabled preemption + * when calling do_el0_brk64() from entry-common.c. + */ +void do_el0_brk64(unsigned long esr, struct pt_regs *regs) { - register_debug_hook(&hook->node, &kernel_break_hook); -} + if (IS_ENABLED(CONFIG_UPROBES) && + esr_brk_comment(esr) == UPROBES_BRK_IMM && + uprobe_brk_handler(regs, esr) == DBG_HOOK_HANDLED) + return; -void unregister_kernel_break_hook(struct break_hook *hook) -{ - unregister_debug_hook(&hook->node); + send_user_sigtrap(TRAP_BRKPT); } -static int call_break_hook(struct pt_regs *regs, unsigned long esr) +void do_el1_brk64(unsigned long esr, struct pt_regs *regs) { - struct break_hook *hook; - struct list_head *list; - - list = user_mode(regs) ? &user_break_hook : &kernel_break_hook; - - /* - * Since brk exception disables interrupt, this function is - * entirely not preemptible, and we can use rcu list safely here. 
- */ - list_for_each_entry_rcu(hook, list, node) { - if ((esr_brk_comment(esr) & ~hook->mask) == hook->imm) - return hook->fn(regs, esr); - } + if (call_el1_break_hook(regs, esr) == DBG_HOOK_HANDLED) + return; - return DBG_HOOK_ERROR; + die("Oops - BRK", regs, esr); } -NOKPROBE_SYMBOL(call_break_hook); +NOKPROBE_SYMBOL(do_el1_brk64); -static int brk_handler(unsigned long unused, unsigned long esr, - struct pt_regs *regs) +#ifdef CONFIG_COMPAT +void do_bkpt32(unsigned long esr, struct pt_regs *regs) { - if (call_break_hook(regs, esr) == DBG_HOOK_HANDLED) - return 0; - - if (user_mode(regs)) { - send_user_sigtrap(TRAP_BRKPT); - } else { - pr_warn("Unexpected kernel BRK exception at EL1\n"); - return -EFAULT; - } - - return 0; + arm64_notify_die("aarch32 BKPT", regs, SIGTRAP, TRAP_BRKPT, regs->pc, esr); } -NOKPROBE_SYMBOL(brk_handler); +#endif /* CONFIG_COMPAT */ -int aarch32_break_handler(struct pt_regs *regs) +bool try_handle_aarch32_break(struct pt_regs *regs) { u32 arm_instr; u16 thumb_instr; @@ -344,7 +285,7 @@ int aarch32_break_handler(struct pt_regs *regs) void __user *pc = (void __user *)instruction_pointer(regs); if (!compat_user_mode(regs)) - return -EFAULT; + return false; if (compat_thumb_mode(regs)) { /* get 16-bit Thumb instruction */ @@ -368,20 +309,12 @@ int aarch32_break_handler(struct pt_regs *regs) } if (!bp) - return -EFAULT; + return false; send_user_sigtrap(TRAP_BRKPT); - return 0; -} -NOKPROBE_SYMBOL(aarch32_break_handler); - -void __init debug_traps_init(void) -{ - hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP, - TRAP_TRACE, "single-step handler"); - hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP, - TRAP_BRKPT, "BRK handler"); + return true; } +NOKPROBE_SYMBOL(try_handle_aarch32_break); /* Re-enable single step for syscall restarting. */ void user_rewind_single_step(struct task_struct *task) @@ -415,7 +348,7 @@ void kernel_enable_single_step(struct pt_regs *regs) { WARN_ON(!irqs_disabled()); set_regs_spsr_ss(regs); - mdscr_write(mdscr_read() | DBG_MDSCR_SS); + mdscr_write(mdscr_read() | MDSCR_EL1_SS); enable_debug_monitors(DBG_ACTIVE_EL1); } NOKPROBE_SYMBOL(kernel_enable_single_step); @@ -423,7 +356,7 @@ NOKPROBE_SYMBOL(kernel_enable_single_step); void kernel_disable_single_step(void) { WARN_ON(!irqs_disabled()); - mdscr_write(mdscr_read() & ~DBG_MDSCR_SS); + mdscr_write(mdscr_read() & ~MDSCR_EL1_SS); disable_debug_monitors(DBG_ACTIVE_EL1); } NOKPROBE_SYMBOL(kernel_disable_single_step); @@ -431,7 +364,7 @@ NOKPROBE_SYMBOL(kernel_disable_single_step); int kernel_active_single_step(void) { WARN_ON(!irqs_disabled()); - return mdscr_read() & DBG_MDSCR_SS; + return mdscr_read() & MDSCR_EL1_SS; } NOKPROBE_SYMBOL(kernel_active_single_step); diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index 11d7f7de202d..329e8df9215f 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -28,7 +28,7 @@ .macro __EFI_PE_HEADER #ifdef CONFIG_EFI .set .Lpe_header_offset, . 
- .L_head - .long PE_MAGIC + .long IMAGE_NT_SIGNATURE .short IMAGE_FILE_MACHINE_ARM64 // Machine .short .Lsection_count // NumberOfSections .long 0 // TimeDateStamp @@ -40,7 +40,7 @@ IMAGE_FILE_LINE_NUMS_STRIPPED // Characteristics .Loptional_header: - .short PE_OPT_MAGIC_PE32PLUS // PE32+ format + .short IMAGE_NT_OPTIONAL_HDR64_MAGIC // PE32+ format .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion .long __initdata_begin - .Lefi_header_end // SizeOfCode @@ -66,7 +66,7 @@ .long .Lefi_header_end - .L_head // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem - .short IMAGE_DLL_CHARACTERISTICS_NX_COMPAT // DllCharacteristics + .short IMAGE_DLLCHARACTERISTICS_NX_COMPAT // DllCharacteristics .quad 0 // SizeOfStackReserve .quad 0 // SizeOfStackCommit .quad 0 // SizeOfHeapReserve diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 1d25d8899dbf..a81cb4aa4738 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -10,11 +10,13 @@ #include <linux/efi.h> #include <linux/init.h> #include <linux/kmemleak.h> +#include <linux/kthread.h> #include <linux/screen_info.h> #include <linux/vmalloc.h> #include <asm/efi.h> #include <asm/stacktrace.h> +#include <asm/vmap_stack.h> static bool region_is_misaligned(const efi_memory_desc_t *md) { @@ -29,7 +31,7 @@ static bool region_is_misaligned(const efi_memory_desc_t *md) * executable, everything else can be mapped with the XN bits * set. Also take the new (optional) RO/XP bits into account. */ -static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) +static __init ptdesc_t create_mapping_protection(efi_memory_desc_t *md) { u64 attr = md->attribute; u32 type = md->type; @@ -83,7 +85,7 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { - pteval_t prot_val = create_mapping_protection(md); + ptdesc_t prot_val = create_mapping_protection(md); bool page_mappings_only = (md->type == EFI_RUNTIME_SERVICES_CODE || md->type == EFI_RUNTIME_SERVICES_DATA); @@ -164,20 +166,53 @@ asmlinkage efi_status_t efi_handle_corrupted_x18(efi_status_t s, const char *f) return s; } -static DEFINE_RAW_SPINLOCK(efi_rt_lock); - void arch_efi_call_virt_setup(void) { - efi_virtmap_load(); + efi_runtime_assert_lock_held(); + + if (preemptible() && (current->flags & PF_KTHREAD)) { + /* + * Disable migration to ensure that a preempted EFI runtime + * service call will be resumed on the same CPU. This avoids + * potential issues with EFI runtime calls that are preempted + * while polling for an asynchronous completion of a secure + * firmware call, which may not permit the CPU to change. + */ + migrate_disable(); + kthread_use_mm(&efi_mm); + } else { + efi_virtmap_load(); + } + + /* + * Enable access to the valid TTBR0_EL1 and invoke the errata + * workaround directly since there is no return from exception when + * invoking the EFI run-time services. + */ + uaccess_ttbr0_enable(); + post_ttbr_update_workaround(); + __efi_fpsimd_begin(); - raw_spin_lock(&efi_rt_lock); } void arch_efi_call_virt_teardown(void) { - raw_spin_unlock(&efi_rt_lock); __efi_fpsimd_end(); - efi_virtmap_unload(); + + /* + * Defer the switch to the current thread's TTBR0_EL1 until + * uaccess_enable(). Do so before efi_virtmap_unload() updates the + * saved TTBR0 value, so the userland page tables are not activated + * inadvertently over the back of an exception. 
+ */ + uaccess_ttbr0_disable(); + + if (preemptible() && (current->flags & PF_KTHREAD)) { + kthread_unuse_mm(&efi_mm); + migrate_enable(); + } else { + efi_virtmap_unload(); + } } asmlinkage u64 *efi_rt_stack_top __ro_after_init; @@ -214,9 +249,8 @@ static int __init arm64_efi_rt_init(void) if (!efi_enabled(EFI_RUNTIME_SERVICES)) return 0; - p = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, GFP_KERNEL, - NUMA_NO_NODE, &&l); -l: if (!p) { + p = arch_alloc_vmap_stack(THREAD_SIZE, NUMA_NO_NODE); + if (!p) { pr_warn("Failed to allocate EFI runtime stack\n"); clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return -ENOMEM; diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c index 2e94d20c4ac7..b735f4c2fe5e 100644 --- a/arch/arm64/kernel/elfcore.c +++ b/arch/arm64/kernel/elfcore.c @@ -27,9 +27,10 @@ static int mte_dump_tag_range(struct coredump_params *cprm, int ret = 1; unsigned long addr; void *tags = NULL; + int locked = 0; for (addr = start; addr < start + len; addr += PAGE_SIZE) { - struct page *page = get_dump_page(addr); + struct page *page = get_dump_page(addr, &locked); /* * get_dump_page() returns NULL when encountering an empty diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index b260ddc4d3e9..f42ce7b5c67f 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -6,8 +6,10 @@ */ #include <linux/context_tracking.h> +#include <linux/irq-entry-common.h> #include <linux/kasan.h> #include <linux/linkage.h> +#include <linux/livepatch.h> #include <linux/lockdep.h> #include <linux/ptrace.h> #include <linux/resume_user_mode.h> @@ -32,67 +34,31 @@ * Handle IRQ/context state management when entering from kernel mode. * Before this function is called it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. - * - * This is intended to match the logic in irqentry_enter(), handling the kernel - * mode transitions only. */ -static __always_inline void __enter_from_kernel_mode(struct pt_regs *regs) +static noinstr irqentry_state_t arm64_enter_from_kernel_mode(struct pt_regs *regs) { - regs->exit_rcu = false; - - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { - lockdep_hardirqs_off(CALLER_ADDR0); - ct_irq_enter(); - trace_hardirqs_off_finish(); - - regs->exit_rcu = true; - return; - } - - lockdep_hardirqs_off(CALLER_ADDR0); - rcu_irq_enter_check_tick(); - trace_hardirqs_off_finish(); -} + irqentry_state_t state; -static void noinstr enter_from_kernel_mode(struct pt_regs *regs) -{ - __enter_from_kernel_mode(regs); + state = irqentry_enter_from_kernel_mode(regs); mte_check_tfsr_entry(); mte_disable_tco_entry(current); + + return state; } /* * Handle IRQ/context state management when exiting to kernel mode. * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. - * - * This is intended to match the logic in irqentry_exit(), handling the kernel - * mode transitions only, and with preemption handled elsewhere. 
*/ -static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) -{ - lockdep_assert_irqs_disabled(); - - if (interrupts_enabled(regs)) { - if (regs->exit_rcu) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - ct_irq_exit(); - lockdep_hardirqs_on(CALLER_ADDR0); - return; - } - - trace_hardirqs_on(); - } else { - if (regs->exit_rcu) - ct_irq_exit(); - } -} - -static void noinstr exit_to_kernel_mode(struct pt_regs *regs) +static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, + irqentry_state_t state) { + local_irq_disable(); + irqentry_exit_to_kernel_mode_preempt(regs, state); + local_daif_mask(); mte_check_tfsr_exit(); - __exit_to_kernel_mode(regs); + irqentry_exit_to_kernel_mode_after_preempt(regs, state); } /* @@ -100,129 +66,30 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs) * Before this function is called it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ -static __always_inline void __enter_from_user_mode(void) +static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) { - lockdep_hardirqs_off(CALLER_ADDR0); - CT_WARN_ON(ct_state() != CT_STATE_USER); - user_exit_irqoff(); - trace_hardirqs_off_finish(); + enter_from_user_mode(regs); mte_disable_tco_entry(current); } -static __always_inline void enter_from_user_mode(struct pt_regs *regs) -{ - __enter_from_user_mode(); -} - /* * Handle IRQ/context state management when exiting to user mode. * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ -static __always_inline void __exit_to_user_mode(void) -{ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - user_enter_irqoff(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) -{ - do { - local_irq_enable(); - - if (thread_flags & _TIF_NEED_RESCHED) - schedule(); - - if (thread_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - if (thread_flags & _TIF_MTE_ASYNC_FAULT) { - clear_thread_flag(TIF_MTE_ASYNC_FAULT); - send_sig_fault(SIGSEGV, SEGV_MTEAERR, - (void __user *)NULL, current); - } - if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - do_signal(regs); - - if (thread_flags & _TIF_NOTIFY_RESUME) - resume_user_mode_work(regs); - - if (thread_flags & _TIF_FOREIGN_FPSTATE) - fpsimd_restore_current_state(); - - local_irq_disable(); - thread_flags = read_thread_flags(); - } while (thread_flags & _TIF_WORK_MASK); -} - -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) +static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) { - unsigned long flags; - local_irq_disable(); - - flags = read_thread_flags(); - if (unlikely(flags & _TIF_WORK_MASK)) - do_notify_resume(regs, flags); - + exit_to_user_mode_prepare_legacy(regs); local_daif_mask(); - - lockdep_sys_exit(); -} - -static __always_inline void exit_to_user_mode(struct pt_regs *regs) -{ - exit_to_user_mode_prepare(regs); mte_check_tfsr_exit(); - __exit_to_user_mode(); + exit_to_user_mode(); } asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) { - exit_to_user_mode(regs); -} - -/* - * Handle IRQ/context state management when entering an NMI from user/kernel - * mode. Before this function is called it is not safe to call regular kernel - * code, instrumentable code, or any code which may trigger an exception. 
- */ -static void noinstr arm64_enter_nmi(struct pt_regs *regs) -{ - regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); - - __nmi_enter(); - lockdep_hardirqs_off(CALLER_ADDR0); - lockdep_hardirq_enter(); - ct_nmi_enter(); - - trace_hardirqs_off_finish(); - ftrace_nmi_enter(); -} - -/* - * Handle IRQ/context state management when exiting an NMI from user/kernel - * mode. After this function returns it is not safe to call regular kernel - * code, instrumentable code, or any code which may trigger an exception. - */ -static void noinstr arm64_exit_nmi(struct pt_regs *regs) -{ - bool restore = regs->lockdep_hardirqs; - - ftrace_nmi_exit(); - if (restore) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - } - - ct_nmi_exit(); - lockdep_hardirq_exit(); - if (restore) - lockdep_hardirqs_on(CALLER_ADDR0); - __nmi_exit(); + arm64_exit_to_user_mode(regs); } /* @@ -230,14 +97,18 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs) * kernel mode. Before this function is called it is not safe to call regular * kernel code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) +static noinstr irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs) { - regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + irqentry_state_t state; + + state.lockdep = lockdep_hardirqs_enabled(); lockdep_hardirqs_off(CALLER_ADDR0); ct_nmi_enter(); trace_hardirqs_off_finish(); + + return state; } /* @@ -245,62 +116,19 @@ static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) * kernel mode. After this function returns it is not safe to call regular * kernel code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs) +static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs, + irqentry_state_t state) { - bool restore = regs->lockdep_hardirqs; - - if (restore) { + if (state.lockdep) { trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); } ct_nmi_exit(); - if (restore) + if (state.lockdep) lockdep_hardirqs_on(CALLER_ADDR0); } -#ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); -#define need_irq_preemption() \ - (static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) -#else -#define need_irq_preemption() (IS_ENABLED(CONFIG_PREEMPTION)) -#endif - -static void __sched arm64_preempt_schedule_irq(void) -{ - if (!need_irq_preemption()) - return; - - /* - * Note: thread_info::preempt_count includes both thread_info::count - * and thread_info::need_resched, and is not equivalent to - * preempt_count(). - */ - if (READ_ONCE(current_thread_info()->preempt_count) != 0) - return; - - /* - * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC - * priority masking is used the GIC irqchip driver will clear DAIF.IF - * using gic_arch_enable_irqs() for normal IRQs. If anything is set in - * DAIF we must have handled an NMI, so skip preemption. - */ - if (system_uses_irq_prio_masking() && read_sysreg(daif)) - return; - - /* - * Preempting a task from an IRQ means we leave copies of PSTATE - * on the stack. cpufeature's enable calls may modify PSTATE, but - * resuming one of these preempted tasks would undo those changes. - * - * Only allow a task to be preempted once cpufeatures have been - * enabled. 
- */ - if (system_capabilities_finalized()) - preempt_schedule_irq(); -} - static void do_interrupt_handler(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { @@ -320,7 +148,7 @@ extern void (*handle_arch_fiq)(struct pt_regs *); static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector, unsigned long esr) { - arm64_enter_nmi(regs); + irqentry_nmi_enter(regs); console_verbose(); @@ -344,7 +172,7 @@ static DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); static void cortex_a76_erratum_1463225_svc_handler(void) { - u32 reg, val; + u64 reg, val; if (!unlikely(test_thread_flag(TIF_SINGLESTEP))) return; @@ -354,7 +182,7 @@ static void cortex_a76_erratum_1463225_svc_handler(void) __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 1); reg = read_sysreg(mdscr_el1); - val = reg | DBG_MDSCR_SS | DBG_MDSCR_KDE; + val = reg | MDSCR_EL1_SS | MDSCR_EL1_KDE; write_sysreg(val, mdscr_el1); asm volatile("msr daifclr, #8"); isb(); @@ -393,20 +221,16 @@ static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) * As per the ABI exit SME streaming mode and clear the SVE state not * shared with FPSIMD on syscall entry. */ -static inline void fp_user_discard(void) +static inline void fpsimd_syscall_enter(void) { - /* - * If SME is active then exit streaming mode. If ZA is active - * then flush the SVE registers but leave userspace access to - * both SVE and SME enabled, otherwise disable SME for the - * task and fall through to disabling SVE too. This means - * that after a syscall we never have any streaming mode - * register state to track, if this changes the KVM code will - * need updating. - */ + /* Ensure PSTATE.SM is clear, but leave PSTATE.ZA as-is. */ if (system_supports_sme()) sme_smstop_sm(); + /* + * The CPU is not in streaming mode. If non-streaming SVE is not + * supported, there is no SVE state that needs to be discarded. + */ if (!system_supports_sve()) return; @@ -416,7 +240,56 @@ static inline void fp_user_discard(void) sve_vq_minus_one = sve_vq_from_vl(task_get_sve_vl(current)) - 1; sve_flush_live(true, sve_vq_minus_one); } + + /* + * Any live non-FPSIMD SVE state has been zeroed. Allow + * fpsimd_save_user_state() to lazily discard SVE state until either + * the live state is unbound or fpsimd_syscall_exit() is called. + */ + __this_cpu_write(fpsimd_last_state.to_save, FP_STATE_FPSIMD); +} + +static __always_inline void fpsimd_syscall_exit(void) +{ + if (!system_supports_sve()) + return; + + /* + * The current task's user FPSIMD/SVE/SME state is now bound to this + * CPU. The fpsimd_last_state.to_save value is either: + * + * - FP_STATE_FPSIMD, if the state has not been reloaded on this CPU + * since fpsimd_syscall_enter(). + * + * - FP_STATE_CURRENT, if the state has been reloaded on this CPU at + * any point. + * + * Reset this to FP_STATE_CURRENT to stop lazy discarding. + */ + __this_cpu_write(fpsimd_last_state.to_save, FP_STATE_CURRENT); +} + +/* + * In debug exception context, we explicitly disable preemption despite + * having interrupts disabled. + * This serves two purposes: it makes it much less likely that we would + * accidentally schedule in exception context and it will force a warning + * if we somehow manage to schedule by accident. + */ +static void debug_exception_enter(struct pt_regs *regs) +{ + preempt_disable(); + + /* This code is a bit fragile. Test it. 
*/ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work"); } +NOKPROBE_SYMBOL(debug_exception_enter); + +static void debug_exception_exit(struct pt_regs *regs) +{ + preempt_enable_no_resched(); +} +NOKPROBE_SYMBOL(debug_exception_exit); UNHANDLED(el1t, 64, sync) UNHANDLED(el1t, 64, irq) @@ -426,78 +299,128 @@ UNHANDLED(el1t, 64, error) static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + irqentry_state_t state; - enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_mem_abort(far, esr, regs); - local_daif_mask(); - exit_to_kernel_mode(regs); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + irqentry_state_t state; - enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_sp_pc_abort(far, esr, regs); - local_daif_mask(); - exit_to_kernel_mode(regs); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_undef(regs, esr); - local_daif_mask(); - exit_to_kernel_mode(regs); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_bti(regs, esr); - local_daif_mask(); - exit_to_kernel_mode(regs); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_gcs(regs, esr); - local_daif_mask(); - exit_to_kernel_mode(regs); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_mops(regs, esr); - local_daif_mask(); - exit_to_kernel_mode(regs); + arm64_exit_to_kernel_mode(regs, state); +} + +static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); + debug_exception_enter(regs); + do_breakpoint(esr, regs); + debug_exception_exit(regs); + arm64_exit_el1_dbg(regs, state); +} + +static void noinstr el1_softstp(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); + if (!cortex_a76_erratum_1463225_debug_handler(regs)) { + debug_exception_enter(regs); + /* + * After handling a breakpoint, we suspend the breakpoint + * and use single-step to move to the next instruction. + * If we are stepping a suspended breakpoint there's nothing more to do: + * the single-step is complete. 
+ */ + if (!try_step_suspended_breakpoints(regs)) + do_el1_softstep(esr, regs); + debug_exception_exit(regs); + } + arm64_exit_el1_dbg(regs, state); } -static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) +static void noinstr el1_watchpt(struct pt_regs *regs, unsigned long esr) { + /* Watchpoints are the only debug exception to write FAR_EL1 */ unsigned long far = read_sysreg(far_el1); + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); + debug_exception_enter(regs); + do_watchpoint(far, esr, regs); + debug_exception_exit(regs); + arm64_exit_el1_dbg(regs, state); +} - arm64_enter_el1_dbg(regs); - if (!cortex_a76_erratum_1463225_debug_handler(regs)) - do_debug_exception(far, esr, regs); - arm64_exit_el1_dbg(regs); +static void noinstr el1_brk64(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); + debug_exception_enter(regs); + do_el1_brk64(esr, regs); + debug_exception_exit(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_fpac(regs, esr); - local_daif_mask(); - exit_to_kernel_mode(regs); + arm64_exit_to_kernel_mode(regs, state); } asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) @@ -530,10 +453,16 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) el1_mops(regs, esr); break; case ESR_ELx_EC_BREAKPT_CUR: + el1_breakpt(regs, esr); + break; case ESR_ELx_EC_SOFTSTP_CUR: + el1_softstp(regs, esr); + break; case ESR_ELx_EC_WATCHPT_CUR: + el1_watchpt(regs, esr); + break; case ESR_ELx_EC_BRK64: - el1_dbg(regs, esr); + el1_brk64(regs, esr); break; case ESR_ELx_EC_FPAC: el1_fpac(regs, esr); @@ -546,30 +475,32 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) static __always_inline void __el1_pnmi(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - arm64_enter_nmi(regs); + irqentry_state_t state; + + state = irqentry_nmi_enter(regs); do_interrupt_handler(regs, handler); - arm64_exit_nmi(regs); + irqentry_nmi_exit(regs, state); } static __always_inline void __el1_irq(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = arm64_enter_from_kernel_mode(regs); irq_enter_rcu(); do_interrupt_handler(regs, handler); irq_exit_rcu(); - arm64_preempt_schedule_irq(); - - exit_to_kernel_mode(regs); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_interrupt(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { write_sysreg(DAIF_PROCCTX_NOIRQ, daif); - if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) + if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && regs_irqs_disabled(regs)) __el1_pnmi(regs, handler); else __el1_irq(regs, handler); @@ -588,21 +519,22 @@ asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs) asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); + irqentry_state_t state; local_daif_restore(DAIF_ERRCTX); - arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); do_serror(regs, esr); - arm64_exit_nmi(regs); + irqentry_nmi_exit(regs, state); } static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, 
esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) @@ -617,50 +549,50 @@ static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(far)) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_fpsimd_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sve_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sme_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sme_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_fpsimd_exc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sys(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_sys(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) @@ -670,87 +602,132 @@ static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(instruction_pointer(regs))) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sp(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(regs->sp, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_undef(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_undef(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_bti(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_bti(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_mops(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_gcs(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_inv(struct pt_regs *regs, unsigned long 
esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); bad_el0_sync(regs, 0, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_breakpt(struct pt_regs *regs, unsigned long esr) +{ + if (!is_ttbr0_addr(regs->pc)) + arm64_apply_bp_hardening(); + + arm64_enter_from_user_mode(regs); + debug_exception_enter(regs); + do_breakpoint(esr, regs); + debug_exception_exit(regs); + local_daif_restore(DAIF_PROCCTX); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr) +{ + bool step_done; + + if (!is_ttbr0_addr(regs->pc)) + arm64_apply_bp_hardening(); + + arm64_enter_from_user_mode(regs); + /* + * After handling a breakpoint, we suspend the breakpoint + * and use single-step to move to the next instruction. + * If we are stepping a suspended breakpoint there's nothing more to do: + * the single-step is complete. + */ + step_done = try_step_suspended_breakpoints(regs); + local_daif_restore(DAIF_PROCCTX); + if (!step_done) + do_el0_softstep(esr, regs); + arm64_exit_to_user_mode(regs); } -static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_watchpt(struct pt_regs *regs, unsigned long esr) { - /* Only watchpoints write FAR_EL1, otherwise its UNKNOWN */ + /* Watchpoints are the only debug exception to write FAR_EL1 */ unsigned long far = read_sysreg(far_el1); - enter_from_user_mode(regs); - do_debug_exception(far, esr, regs); + arm64_enter_from_user_mode(regs); + debug_exception_enter(regs); + do_watchpoint(far, esr, regs); + debug_exception_exit(regs); + local_daif_restore(DAIF_PROCCTX); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_brk64(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + do_el0_brk64(esr, regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_svc(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); - fp_user_discard(); + fpsimd_syscall_enter(); local_daif_restore(DAIF_PROCCTX); do_el0_svc(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); + fpsimd_syscall_exit(); } static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_fpac(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) @@ -802,10 +779,16 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) el0_gcs(regs, esr); break; case ESR_ELx_EC_BREAKPT_LOW: + el0_breakpt(regs, esr); + break; case ESR_ELx_EC_SOFTSTP_LOW: + el0_softstp(regs, esr); + break; case ESR_ELx_EC_WATCHPT_LOW: + el0_watchpt(regs, esr); + break; case ESR_ELx_EC_BRK64: - el0_dbg(regs, esr); + el0_brk64(regs, esr); break; case ESR_ELx_EC_FPAC: el0_fpac(regs, esr); @@ -818,7 +801,7 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) static void noinstr el0_interrupt(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); write_sysreg(DAIF_PROCCTX_NOIRQ, daif); @@ -829,7 +812,7 @@ static void noinstr el0_interrupt(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void 
noinstr __el0_irq_handler_common(struct pt_regs *regs) @@ -855,14 +838,15 @@ asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs) static void noinstr __el0_error_handler_common(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); + irqentry_state_t state; - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_ERRCTX); - arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); do_serror(regs, esr); - arm64_exit_nmi(regs); + irqentry_nmi_exit(regs, state); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) @@ -873,19 +857,27 @@ asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) #ifdef CONFIG_COMPAT static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_cp15(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_svc_compat(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); do_el0_svc_compat(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_bkpt32(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_bkpt32(esr, regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs) @@ -922,10 +914,16 @@ asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs) el0_cp15(regs, esr); break; case ESR_ELx_EC_BREAKPT_LOW: + el0_breakpt(regs, esr); + break; case ESR_ELx_EC_SOFTSTP_LOW: + el0_softstp(regs, esr); + break; case ESR_ELx_EC_WATCHPT_LOW: + el0_watchpt(regs, esr); + break; case ESR_ELx_EC_BKPT32: - el0_dbg(regs, esr); + el0_bkpt32(regs, esr); break; default: el0_inv(regs, esr); @@ -953,21 +951,20 @@ UNHANDLED(el0t, 32, fiq) UNHANDLED(el0t, 32, error) #endif /* CONFIG_COMPAT */ -#ifdef CONFIG_VMAP_STACK asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); unsigned long far = read_sysreg(far_el1); - arm64_enter_nmi(regs); + irqentry_nmi_enter(regs); panic_bad_stack(regs, esr, far); } -#endif /* CONFIG_VMAP_STACK */ #ifdef CONFIG_ARM_SDE_INTERFACE asmlinkage noinstr unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { + irqentry_state_t state; unsigned long ret; /* @@ -992,9 +989,9 @@ __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) else if (cpu_has_pan()) set_pstate_pan(0); - arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); ret = do_sdei_event(regs, arg); - arm64_exit_nmi(regs); + irqentry_nmi_exit(regs, state); return ret; } diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 169ccf600066..025140caafe7 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -94,7 +94,7 @@ SYM_CODE_START(ftrace_caller) stp x29, x30, [sp, #FREGS_SIZE] add x29, sp, #FREGS_SIZE - /* Prepare arguments for the the tracer func */ + /* Prepare arguments for the tracer func */ sub x0, x30, #AARCH64_INSN_SIZE // ip (callsite's BL insn) mov x1, x9 // parent_ip (callsite's LR) mov x3, sp // regs diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 5ae2a34b50bd..e0db14e9c843 100644 
--- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -55,7 +55,6 @@ .endif sub sp, sp, #PT_REGS_SIZE -#ifdef CONFIG_VMAP_STACK /* * Test whether the SP has overflowed, without corrupting a GPR. * Task and IRQ stacks are aligned so that SP & (1 << THREAD_SHIFT) @@ -97,7 +96,6 @@ /* We were already on the overflow stack. Restore sp/x0 and carry on. */ sub sp, sp, x0 mrs x0, tpidrro_el0 -#endif b el\el\ht\()_\regsize\()_\label .org .Lventry_start\@ + 128 // Did we overflow the ventry slot? .endm @@ -275,7 +273,7 @@ alternative_if ARM64_HAS_ADDRESS_AUTH alternative_else_nop_endif 1: - scs_load_current + scs_load_current_base .else add x21, sp, #PT_REGS_SIZE get_current_task tsk @@ -380,8 +378,6 @@ alternative_if ARM64_WORKAROUND_845719 alternative_else_nop_endif #endif 3: - scs_save tsk - /* Ignore asynchronous tag check faults in the uaccess routines */ ldr x0, [tsk, THREAD_SCTLR_USER] clear_mte_async_tcf x0 @@ -475,7 +471,7 @@ alternative_else_nop_endif */ SYM_CODE_START_LOCAL(__swpan_entry_el1) mrs x21, ttbr0_el1 - tst x21, #TTBR_ASID_MASK // Check for the reserved ASID + tst x21, #TTBRx_EL1_ASID_MASK // Check for the reserved ASID orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR b.eq 1f // TTBR0 access already disabled and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR @@ -540,7 +536,6 @@ SYM_CODE_START(vectors) kernel_ventry 0, t, 32, error // Error 32-bit EL0 SYM_CODE_END(vectors) -#ifdef CONFIG_VMAP_STACK SYM_CODE_START_LOCAL(__bad_stack) /* * We detected an overflow in kernel_ventry, which switched to the @@ -568,7 +563,6 @@ SYM_CODE_START_LOCAL(__bad_stack) bl handle_bad_stack ASM_BUG() SYM_CODE_END(__bad_stack) -#endif /* CONFIG_VMAP_STACK */ .macro entry_handler el:req, ht:req, regsize:req, label:req @@ -614,7 +608,7 @@ SYM_CODE_END(ret_to_kernel) SYM_CODE_START_LOCAL(ret_to_user) ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step enable_step_tsk x19, x2 -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK +#ifdef CONFIG_KSTACK_ERASE bl stackleak_erase_on_task_stack #endif kernel_exit 0 @@ -825,6 +819,7 @@ SYM_CODE_END(__bp_harden_el1_vectors) * */ SYM_FUNC_START(cpu_switch_to) + save_and_disable_daif x11 mov x10, #THREAD_CPU_CONTEXT add x8, x0, x10 mov x9, sp @@ -848,6 +843,7 @@ SYM_FUNC_START(cpu_switch_to) ptrauth_keys_install_kernel x1, x8, x9, x10 scs_save x0 scs_load_current + restore_irq x11 ret SYM_FUNC_END(cpu_switch_to) NOKPROBE(cpu_switch_to) @@ -874,6 +870,7 @@ NOKPROBE(ret_from_fork) * Calls func(regs) using this CPU's irq stack and shadow irq stack. */ SYM_FUNC_START(call_on_irq_stack) + save_and_disable_daif x9 #ifdef CONFIG_SHADOW_CALL_STACK get_current_task x16 scs_save x16 @@ -888,8 +885,10 @@ SYM_FUNC_START(call_on_irq_stack) /* Move to the new stack and call the function there */ add sp, x16, #IRQ_STACK_SIZE + restore_irq x9 blr x1 + save_and_disable_daif x9 /* * Restore the SP from the FP, and restore the FP and LR from the frame * record. 
@@ -897,6 +896,7 @@ SYM_FUNC_START(call_on_irq_stack) mov sp, x29 ldp x29, x30, [sp], #16 scs_load_current + restore_irq x9 ret SYM_FUNC_END(call_on_irq_stack) NOKPROBE(call_on_irq_stack) @@ -1003,7 +1003,6 @@ SYM_CODE_START(__sdei_asm_handler) 1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6 2: str x19, [x5] -#ifdef CONFIG_VMAP_STACK /* * entry.S may have been using sp as a scratch register, find whether * this is a normal or critical event and switch to the appropriate @@ -1016,7 +1015,6 @@ SYM_CODE_START(__sdei_asm_handler) 2: mov x6, #SDEI_STACK_SIZE add x5, x5, x6 mov sp, x5 -#endif #ifdef CONFIG_SHADOW_CALL_STACK /* Use a separate shadow call stack for normal and critical events */ diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 8370d55f0353..9de1d8a604cb 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -119,7 +119,7 @@ * whatever is in the FPSIMD registers is not saved to memory, but discarded. */ -static DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); +DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); __ro_after_init struct vl_info vl_info[ARM64_VEC_MAX] = { #ifdef CONFIG_ARM64_SVE @@ -180,13 +180,6 @@ static inline void set_sve_default_vl(int val) set_default_vl(ARM64_VEC_SVE, val); } -static void __percpu *efi_sve_state; - -#else /* ! CONFIG_ARM64_SVE */ - -/* Dummy declaration for code that will be optimised out: */ -extern void __percpu *efi_sve_state; - #endif /* ! CONFIG_ARM64_SVE */ #ifdef CONFIG_ARM64_SME @@ -225,10 +218,21 @@ static void fpsimd_bind_task_to_cpu(void); */ static void get_cpu_fpsimd_context(void) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - local_bh_disable(); - else + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + /* + * The softirq subsystem lacks a true unmask/mask API, and + * re-enabling softirq processing using local_bh_enable() will + * not only unmask softirqs, it will also result in immediate + * delivery of any pending softirqs. + * This is undesirable when running with IRQs disabled, but in + * that case, there is no need to mask softirqs in the first + * place, so only bother doing so when IRQs are enabled. + */ + if (!irqs_disabled()) + local_bh_disable(); + } else { preempt_disable(); + } } /* @@ -240,10 +244,12 @@ static void get_cpu_fpsimd_context(void) */ static void put_cpu_fpsimd_context(void) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - local_bh_enable(); - else + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (!irqs_disabled()) + local_bh_enable(); + } else { preempt_enable(); + } } unsigned int task_get_vl(const struct task_struct *task, enum vec_type type) @@ -359,20 +365,15 @@ static void task_fpsimd_load(void) WARN_ON(preemptible()); WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE)); - if (system_supports_fpmr()) - write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR); - if (system_supports_sve() || system_supports_sme()) { switch (current->thread.fp_type) { case FP_STATE_FPSIMD: /* Stop tracking SVE for this task until next use. 
*/ - if (test_and_clear_thread_flag(TIF_SVE)) - sve_user_disable(); + clear_thread_flag(TIF_SVE); break; case FP_STATE_SVE: - if (!thread_sm_enabled(¤t->thread) && - !WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE))) - sve_user_enable(); + if (!thread_sm_enabled(¤t->thread)) + WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE)); if (test_thread_flag(TIF_SVE)) sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); @@ -413,6 +414,9 @@ static void task_fpsimd_load(void) restore_ffr = system_supports_fa64(); } + if (system_supports_fpmr()) + write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR); + if (restore_sve_regs) { WARN_ON_ONCE(current->thread.fp_type != FP_STATE_SVE); sve_load_state(sve_pffr(¤t->thread), @@ -453,12 +457,15 @@ static void fpsimd_save_user_state(void) *(last->fpmr) = read_sysreg_s(SYS_FPMR); /* - * If a task is in a syscall the ABI allows us to only - * preserve the state shared with FPSIMD so don't bother - * saving the full SVE state in that case. + * Save SVE state if it is live. + * + * The syscall ABI discards live SVE state at syscall entry. When + * entering a syscall, fpsimd_syscall_enter() sets to_save to + * FP_STATE_FPSIMD to allow the SVE state to be lazily discarded until + * either new SVE state is loaded+bound or fpsimd_syscall_exit() is + * called prior to a return to userspace. */ - if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE) && - !in_syscall(current_pt_regs())) || + if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE)) || last->to_save == FP_STATE_SVE) { save_sve_regs = true; save_ffr = true; @@ -651,7 +658,7 @@ static void __fpsimd_to_sve(void *sst, struct user_fpsimd_state const *fst, * task->thread.uw.fpsimd_state must be up to date before calling this * function. */ -static void fpsimd_to_sve(struct task_struct *task) +static inline void fpsimd_to_sve(struct task_struct *task) { unsigned int vq; void *sst = task->thread.sve_state; @@ -675,7 +682,7 @@ static void fpsimd_to_sve(struct task_struct *task) * bytes of allocated kernel memory. * task->thread.sve_state must be up to date before calling this function. */ -static void sve_to_fpsimd(struct task_struct *task) +static inline void sve_to_fpsimd(struct task_struct *task) { unsigned int vq, vl; void const *sst = task->thread.sve_state; @@ -694,44 +701,39 @@ static void sve_to_fpsimd(struct task_struct *task) } } -void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p) +static inline void __fpsimd_zero_vregs(struct user_fpsimd_state *fpsimd) { - write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK, - SYS_SCTLR_EL1); + memset(&fpsimd->vregs, 0, sizeof(fpsimd->vregs)); } -#ifdef CONFIG_ARM64_SVE /* - * Call __sve_free() directly only if you know task can't be scheduled - * or preempted. + * Simulate the effects of an SMSTOP SM instruction. 
*/ -static void __sve_free(struct task_struct *task) +void task_smstop_sm(struct task_struct *task) { - kfree(task->thread.sve_state); - task->thread.sve_state = NULL; -} + if (!thread_sm_enabled(&task->thread)) + return; -static void sve_free(struct task_struct *task) -{ - WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); + __fpsimd_zero_vregs(&task->thread.uw.fpsimd_state); + task->thread.uw.fpsimd_state.fpsr = 0x0800009f; + if (system_supports_fpmr()) + task->thread.uw.fpmr = 0; - __sve_free(task); + task->thread.svcr &= ~SVCR_SM_MASK; + task->thread.fp_type = FP_STATE_FPSIMD; } -/* - * Return how many bytes of memory are required to store the full SVE - * state for task, given task's currently configured vector length. - */ -size_t sve_state_size(struct task_struct const *task) +void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p) { - unsigned int vl = 0; - - if (system_supports_sve()) - vl = task_get_sve_vl(task); - if (system_supports_sme()) - vl = max(vl, task_get_sme_vl(task)); + write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK, + SYS_SCTLR_EL1); +} - return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)); +#ifdef CONFIG_ARM64_SVE +static void sve_free(struct task_struct *task) +{ + kfree(task->thread.sve_state); + task->thread.sve_state = NULL; } /* @@ -758,69 +760,34 @@ void sve_alloc(struct task_struct *task, bool flush) kzalloc(sve_state_size(task), GFP_KERNEL); } - -/* - * Force the FPSIMD state shared with SVE to be updated in the SVE state - * even if the SVE state is the current active state. - * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - */ -void fpsimd_force_sync_to_sve(struct task_struct *task) -{ - fpsimd_to_sve(task); -} - /* - * Ensure that task->thread.sve_state is up to date with respect to - * the user task, irrespective of when SVE is in use or not. + * Ensure that task->thread.uw.fpsimd_state is up to date with respect to the + * task's currently effective FPSIMD/SVE state. * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. + * The task's FPSIMD/SVE/SME state must not be subject to concurrent + * manipulation. */ -void fpsimd_sync_to_sve(struct task_struct *task) -{ - if (!test_tsk_thread_flag(task, TIF_SVE) && - !thread_sm_enabled(&task->thread)) - fpsimd_to_sve(task); -} - -/* - * Ensure that task->thread.uw.fpsimd_state is up to date with respect to - * the user task, irrespective of whether SVE is in use or not. - * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - */ -void sve_sync_to_fpsimd(struct task_struct *task) +void fpsimd_sync_from_effective_state(struct task_struct *task) { if (task->thread.fp_type == FP_STATE_SVE) sve_to_fpsimd(task); } /* - * Ensure that task->thread.sve_state is up to date with respect to - * the task->thread.uw.fpsimd_state. + * Ensure that the task's currently effective FPSIMD/SVE state is up to date + * with respect to task->thread.uw.fpsimd_state, zeroing any effective + * non-FPSIMD (S)SVE state. * - * This should only be called by ptrace to merge new FPSIMD register - * values into a task for which SVE is currently active. - * task must be non-runnable. 
- * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - * task->thread.uw.fpsimd_state must already have been initialised with - * the new FPSIMD register values to be merged in. + * The task's FPSIMD/SVE/SME state must not be subject to concurrent + * manipulation. */ -void sve_sync_from_fpsimd_zeropad(struct task_struct *task) +void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task) { unsigned int vq; void *sst = task->thread.sve_state; struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; - if (!test_tsk_thread_flag(task, TIF_SVE) && - !thread_sm_enabled(&task->thread)) + if (task->thread.fp_type != FP_STATE_SVE) return; vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); @@ -829,10 +796,73 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task) __fpsimd_to_sve(sst, fst, vq); } +static int change_live_vector_length(struct task_struct *task, + enum vec_type type, + unsigned long vl) +{ + unsigned int sve_vl = task_get_sve_vl(task); + unsigned int sme_vl = task_get_sme_vl(task); + void *sve_state = NULL, *sme_state = NULL; + + if (type == ARM64_VEC_SME) + sme_vl = vl; + else + sve_vl = vl; + + /* + * Allocate the new sve_state and sme_state before freeing the old + * copies so that allocation failure can be handled without needing to + * mutate the task's state in any way. + * + * Changes to the SVE vector length must not discard live ZA state or + * clear PSTATE.ZA, as userspace code which is unaware of the AAPCS64 + * ZA lazy saving scheme may attempt to change the SVE vector length + * while unsaved/dormant ZA state exists. + */ + sve_state = kzalloc(__sve_state_size(sve_vl, sme_vl), GFP_KERNEL); + if (!sve_state) + goto out_mem; + + if (type == ARM64_VEC_SME) { + sme_state = kzalloc(__sme_state_size(sme_vl), GFP_KERNEL); + if (!sme_state) + goto out_mem; + } + + if (task == current) + fpsimd_save_and_flush_current_state(); + else + fpsimd_flush_task_state(task); + + /* + * Always preserve PSTATE.SM and the effective FPSIMD state, zeroing + * other SVE state. 
+ */ + fpsimd_sync_from_effective_state(task); + task_set_vl(task, type, vl); + kfree(task->thread.sve_state); + task->thread.sve_state = sve_state; + fpsimd_sync_to_effective_state_zeropad(task); + + if (type == ARM64_VEC_SME) { + task->thread.svcr &= ~SVCR_ZA_MASK; + kfree(task->thread.sme_state); + task->thread.sme_state = sme_state; + } + + return 0; + +out_mem: + kfree(sve_state); + kfree(sme_state); + return -ENOMEM; +} + int vec_set_vector_length(struct task_struct *task, enum vec_type type, unsigned long vl, unsigned long flags) { - bool free_sme = false; + bool onexec = flags & PR_SVE_SET_VL_ONEXEC; + bool inherit = flags & PR_SVE_VL_INHERIT; if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT | PR_SVE_SET_VL_ONEXEC)) @@ -852,71 +882,17 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, vl = find_supported_vector_length(type, vl); - if (flags & (PR_SVE_VL_INHERIT | - PR_SVE_SET_VL_ONEXEC)) + if (!onexec && vl != task_get_vl(task, type)) { + if (change_live_vector_length(task, type, vl)) + return -ENOMEM; + } + + if (onexec || inherit) task_set_vl_onexec(task, type, vl); else /* Reset VL to system default on next exec: */ task_set_vl_onexec(task, type, 0); - /* Only actually set the VL if not deferred: */ - if (flags & PR_SVE_SET_VL_ONEXEC) - goto out; - - if (vl == task_get_vl(task, type)) - goto out; - - /* - * To ensure the FPSIMD bits of the SVE vector registers are preserved, - * write any live register state back to task_struct, and convert to a - * regular FPSIMD thread. - */ - if (task == current) { - get_cpu_fpsimd_context(); - - fpsimd_save_user_state(); - } - - fpsimd_flush_task_state(task); - if (test_and_clear_tsk_thread_flag(task, TIF_SVE) || - thread_sm_enabled(&task->thread)) { - sve_to_fpsimd(task); - task->thread.fp_type = FP_STATE_FPSIMD; - } - - if (system_supports_sme()) { - if (type == ARM64_VEC_SME || - !(task->thread.svcr & (SVCR_SM_MASK | SVCR_ZA_MASK))) { - /* - * We are changing the SME VL or weren't using - * SME anyway, discard the state and force a - * reallocation. - */ - task->thread.svcr &= ~(SVCR_SM_MASK | - SVCR_ZA_MASK); - clear_tsk_thread_flag(task, TIF_SME); - free_sme = true; - } - } - - if (task == current) - put_cpu_fpsimd_context(); - - task_set_vl(task, type, vl); - - /* - * Free the changed states if they are not in use, SME will be - * reallocated to the correct size on next use and we just - * allocate SVE now in case it is needed for use in streaming - * mode. - */ - sve_free(task); - sve_alloc(task, true); - - if (free_sme) - sme_free(task); - -out: update_tsk_thread_flag(task, vec_vl_inherit_flag(type), flags & PR_SVE_VL_INHERIT); @@ -1112,36 +1088,6 @@ int vec_verify_vq_map(enum vec_type type) return 0; } -static void __init sve_efi_setup(void) -{ - int max_vl = 0; - int i; - - if (!IS_ENABLED(CONFIG_EFI)) - return; - - for (i = 0; i < ARRAY_SIZE(vl_info); i++) - max_vl = max(vl_info[i].max_vl, max_vl); - - /* - * alloc_percpu() warns and prints a backtrace if this goes wrong. - * This is evidence of a crippled system and we are returning void, - * so no attempt is made to handle this situation here. 
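For reference, the userspace path into vec_set_vector_length() is the PR_SVE_SET_VL prctl (PR_SME_SET_VL for SME). A minimal userspace sketch, not part of this patch, using the existing prctl constants; the kernel clamps the request via find_supported_vector_length(), so the effective VL is read back from the return value:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	/* Ask for a 256-bit (32-byte) SVE VL now, and keep it across execve(). */
	long ret = prctl(PR_SVE_SET_VL, 32 | PR_SVE_VL_INHERIT);

	if (ret < 0) {
		perror("PR_SVE_SET_VL");
		return 1;
	}
	printf("effective VL: %ld bytes\n", ret & PR_SVE_VL_LEN_MASK);

	/* Defer a different VL until the next execve() only. */
	prctl(PR_SVE_SET_VL, 16 | PR_SVE_SET_VL_ONEXEC);
	return 0;
}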
- */ - if (!sve_vl_valid(max_vl)) - goto fail; - - efi_sve_state = __alloc_percpu( - SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), SVE_VQ_BYTES); - if (!efi_sve_state) - goto fail; - - return; - -fail: - panic("Cannot allocate percpu memory for EFI SVE save/restore"); -} - void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p) { write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_ZEN_EL1EN, CPACR_EL1); @@ -1202,8 +1148,6 @@ void __init sve_setup(void) if (sve_max_virtualisable_vl() < sve_max_vl()) pr_warn("%s: unvirtualisable vector lengths present\n", info->name); - - sve_efi_setup(); } /* @@ -1212,7 +1156,7 @@ void __init sve_setup(void) */ void fpsimd_release_task(struct task_struct *dead_task) { - __sve_free(dead_task); + sve_free(dead_task); sme_free(dead_task); } @@ -1295,6 +1239,8 @@ void __init sme_setup(void) if (!system_supports_sme()) return; + min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); + /* * SME doesn't require any particular vector length be * supported but it does require at least one. We should have @@ -1302,9 +1248,8 @@ void __init sme_setup(void) * let's double check here. The bitmap is SVE_VQ_MAP sized for * sharing with SVE. */ - WARN_ON(bitmap_empty(info->vq_map, SVE_VQ_MAX)); + WARN_ON(min_bit >= SVE_VQ_MAX); - min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit)); max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX); @@ -1436,7 +1381,7 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs) * If this not a trap due to SME being disabled then something * is being used in the wrong mode, report as SIGILL. */ - if (ESR_ELx_ISS(esr) != ESR_ELx_SME_ISS_SME_DISABLED) { + if (ESR_ELx_SME_ISS_SMTC(esr) != ESR_ELx_SME_ISS_SMTC_SME_DISABLED) { force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } @@ -1460,6 +1405,8 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs) sme_set_vq(vq_minus_one); fpsimd_bind_task_to_cpu(); + } else { + fpsimd_flush_task_state(current); } put_cpu_fpsimd_context(); @@ -1516,21 +1463,23 @@ static void fpsimd_load_kernel_state(struct task_struct *task) * Elide the load if this CPU holds the most recent kernel mode * FPSIMD context of the current task. 
*/ - if (last->st == &task->thread.kernel_fpsimd_state && + if (last->st == task->thread.kernel_fpsimd_state && task->thread.kernel_fpsimd_cpu == smp_processor_id()) return; - fpsimd_load_state(&task->thread.kernel_fpsimd_state); + fpsimd_load_state(task->thread.kernel_fpsimd_state); } static void fpsimd_save_kernel_state(struct task_struct *task) { struct cpu_fp_state cpu_fp_state = { - .st = &task->thread.kernel_fpsimd_state, + .st = task->thread.kernel_fpsimd_state, .to_save = FP_STATE_FPSIMD, }; - fpsimd_save_state(&task->thread.kernel_fpsimd_state); + BUG_ON(!cpu_fp_state.st); + + fpsimd_save_state(task->thread.kernel_fpsimd_state); fpsimd_bind_state_to_cpu(&cpu_fp_state); task->thread.kernel_fpsimd_cpu = smp_processor_id(); @@ -1573,8 +1522,8 @@ void fpsimd_thread_switch(struct task_struct *next) fpsimd_save_user_state(); if (test_tsk_thread_flag(next, TIF_KERNEL_FPSTATE)) { - fpsimd_load_kernel_state(next); fpsimd_flush_cpu_state(); + fpsimd_load_kernel_state(next); } else { /* * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's @@ -1661,6 +1610,9 @@ void fpsimd_flush_thread(void) current->thread.svcr = 0; } + if (system_supports_fpmr()) + current->thread.uw.fpmr = 0; + current->thread.fp_type = FP_STATE_FPSIMD; put_cpu_fpsimd_context(); @@ -1683,18 +1635,6 @@ void fpsimd_preserve_current_state(void) } /* - * Like fpsimd_preserve_current_state(), but ensure that - * current->thread.uw.fpsimd_state is updated so that it can be copied to - * the signal frame. - */ -void fpsimd_signal_preserve_current_state(void) -{ - fpsimd_preserve_current_state(); - if (current->thread.fp_type == FP_STATE_SVE) - sve_to_fpsimd(current); -} - -/* * Associate current's FPSIMD context with this cpu * The caller must have ownership of the cpu FPSIMD context before calling * this function. @@ -1786,30 +1726,14 @@ void fpsimd_restore_current_state(void) put_cpu_fpsimd_context(); } -/* - * Load an updated userland FPSIMD state for 'current' from memory and set the - * flag that indicates that the FPSIMD register contents are the most recent - * FPSIMD state of 'current'. This is used by the signal code to restore the - * register state when returning from a signal handler in FPSIMD only cases, - * any SVE context will be discarded. - */ void fpsimd_update_current_state(struct user_fpsimd_state const *state) { if (WARN_ON(!system_supports_fpsimd())) return; - get_cpu_fpsimd_context(); - current->thread.uw.fpsimd_state = *state; - if (test_thread_flag(TIF_SVE)) + if (current->thread.fp_type == FP_STATE_SVE) fpsimd_to_sve(current); - - task_fpsimd_load(); - fpsimd_bind_task_to_cpu(); - - clear_thread_flag(TIF_FOREIGN_FPSTATE); - - put_cpu_fpsimd_context(); } /* @@ -1826,6 +1750,7 @@ void fpsimd_update_current_state(struct user_fpsimd_state const *state) void fpsimd_flush_task_state(struct task_struct *t) { t->thread.fpsimd_cpu = NR_CPUS; + t->thread.kernel_fpsimd_state = NULL; /* * If we don't support fpsimd, bail out after we have * reset the fpsimd_cpu for this task and clear the @@ -1839,6 +1764,17 @@ void fpsimd_flush_task_state(struct task_struct *t) barrier(); } +void fpsimd_save_and_flush_current_state(void) +{ + if (!system_supports_fpsimd()) + return; + + get_cpu_fpsimd_context(); + fpsimd_save_user_state(); + fpsimd_flush_task_state(current); + put_cpu_fpsimd_context(); +} + /* * Save the FPSIMD state to memory and invalidate cpu view. * This function must be called with preemption disabled. 
@@ -1874,12 +1810,19 @@ void fpsimd_save_and_flush_cpu_state(void) * * The caller may freely use the FPSIMD registers until kernel_neon_end() is * called. + * + * Unless called from non-preemptible task context, @state must point to a + * caller provided buffer that will be used to preserve the task's kernel mode + * FPSIMD context when it is scheduled out, or if it is interrupted by kernel + * mode FPSIMD occurring in softirq context. May be %NULL otherwise. */ -void kernel_neon_begin(void) +void kernel_neon_begin(struct user_fpsimd_state *state) { if (WARN_ON(!system_supports_fpsimd())) return; + WARN_ON((preemptible() || in_serving_softirq()) && !state); + BUG_ON(!may_use_simd()); get_cpu_fpsimd_context(); @@ -1887,7 +1830,7 @@ void kernel_neon_begin(void) /* Save unsaved fpsimd state, if any: */ if (test_thread_flag(TIF_KERNEL_FPSTATE)) { BUG_ON(IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()); - fpsimd_save_kernel_state(current); + fpsimd_save_state(state); } else { fpsimd_save_user_state(); @@ -1908,8 +1851,16 @@ void kernel_neon_begin(void) * mode in task context. So in this case, setting the flag here * is always appropriate. */ - if (IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()) + if (IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()) { + /* + * Record the caller provided buffer as the kernel mode + * FP/SIMD buffer for this task, so that the state can + * be preserved and restored on a context switch. + */ + WARN_ON(current->thread.kernel_fpsimd_state != NULL); + current->thread.kernel_fpsimd_state = state; set_thread_flag(TIF_KERNEL_FPSTATE); + } } /* Invalidate any task state remaining in the fpsimd regs: */ @@ -1927,31 +1878,36 @@ EXPORT_SYMBOL_GPL(kernel_neon_begin); * * The caller must not use the FPSIMD registers after this function is called, * unless kernel_neon_begin() is called again in the meantime. + * + * The value of @state must match the value passed to the preceding call to + * kernel_neon_begin(). */ -void kernel_neon_end(void) +void kernel_neon_end(struct user_fpsimd_state *state) { if (!system_supports_fpsimd()) return; + if (!test_thread_flag(TIF_KERNEL_FPSTATE)) + return; + /* * If we are returning from a nested use of kernel mode FPSIMD, restore * the task context kernel mode FPSIMD state. This can only happen when * running in softirq context on non-PREEMPT_RT. 
*/ - if (!IS_ENABLED(CONFIG_PREEMPT_RT) && in_serving_softirq() && - test_thread_flag(TIF_KERNEL_FPSTATE)) - fpsimd_load_kernel_state(current); - else + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && in_serving_softirq()) { + fpsimd_load_state(state); + } else { clear_thread_flag(TIF_KERNEL_FPSTATE); + WARN_ON(current->thread.kernel_fpsimd_state != state); + current->thread.kernel_fpsimd_state = NULL; + } } EXPORT_SYMBOL_GPL(kernel_neon_end); #ifdef CONFIG_EFI -static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state); -static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); -static DEFINE_PER_CPU(bool, efi_sve_state_used); -static DEFINE_PER_CPU(bool, efi_sm_state); +static struct user_fpsimd_state efi_fpsimd_state; /* * EFI runtime services support functions @@ -1975,49 +1931,29 @@ void __efi_fpsimd_begin(void) if (!system_supports_fpsimd()) return; - WARN_ON(preemptible()); - if (may_use_simd()) { - kernel_neon_begin(); + kernel_neon_begin(&efi_fpsimd_state); } else { /* - * If !efi_sve_state, SVE can't be in use yet and doesn't need - * preserving: + * We are running in hardirq or NMI context, and the only + * legitimate case where this might happen is when EFI pstore + * is attempting to record the system's dying gasps into EFI + * variables. This could be due to an oops, a panic or a call + * to emergency_restart(), and in none of those cases, we can + * expect the current task to ever return to user space again, + * or for the kernel to resume any normal execution, for that + * matter (an oops in hardirq context triggers a panic too). + * + * Therefore, there is no point in attempting to preserve any + * SVE/SME state here. On the off chance that we might have + * ended up here for a different reason inadvertently, kill the + * task and preserve/restore the base FP/SIMD state, which + * might belong to kernel mode FP/SIMD. */ - if (system_supports_sve() && likely(efi_sve_state)) { - char *sve_state = this_cpu_ptr(efi_sve_state); - bool ffr = true; - u64 svcr; - - __this_cpu_write(efi_sve_state_used, true); - - if (system_supports_sme()) { - svcr = read_sysreg_s(SYS_SVCR); - - __this_cpu_write(efi_sm_state, - svcr & SVCR_SM_MASK); - - /* - * Unless we have FA64 FFR does not - * exist in streaming mode. - */ - if (!system_supports_fa64()) - ffr = !(svcr & SVCR_SM_MASK); - } - - sve_save_state(sve_state + sve_ffr_offset(sve_max_vl()), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - ffr); - - if (system_supports_sme()) - sysreg_clear_set_s(SYS_SVCR, - SVCR_SM_MASK, 0); - - } else { - fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); - } - - __this_cpu_write(efi_fpsimd_state_used, true); + pr_warn_ratelimited("Calling EFI runtime from %s context\n", + in_nmi() ? "NMI" : "hardirq"); + force_signal_inject(SIGKILL, SI_KERNEL, 0, 0); + fpsimd_save_state(&efi_fpsimd_state); } } @@ -2029,42 +1965,10 @@ void __efi_fpsimd_end(void) if (!system_supports_fpsimd()) return; - if (!__this_cpu_xchg(efi_fpsimd_state_used, false)) { - kernel_neon_end(); + if (may_use_simd()) { + kernel_neon_end(&efi_fpsimd_state); } else { - if (system_supports_sve() && - likely(__this_cpu_read(efi_sve_state_used))) { - char const *sve_state = this_cpu_ptr(efi_sve_state); - bool ffr = true; - - /* - * Restore streaming mode; EFI calls are - * normal function calls so should not return in - * streaming mode. - */ - if (system_supports_sme()) { - if (__this_cpu_read(efi_sm_state)) { - sysreg_clear_set_s(SYS_SVCR, - 0, - SVCR_SM_MASK); - - /* - * Unless we have FA64 FFR does not - * exist in streaming mode. 
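As a usage illustration of the reworked interface above: kernel_neon_begin() now takes a caller-provided struct user_fpsimd_state, recorded in current->thread.kernel_fpsimd_state so the kernel-mode state can be preserved across a context switch or nested softirq use, and the same pointer must be passed back to kernel_neon_end(). A hypothetical in-kernel caller (example_simd_copy and its on-stack save area are illustrative, not from this patch) might look like:

#include <linux/string.h>
#include <asm/neon.h>
#include <asm/simd.h>

static void example_simd_copy(void *dst, const void *src, size_t len)
{
	struct user_fpsimd_state fp_state;	/* save area owned by the caller */

	if (!may_use_simd()) {
		memcpy(dst, src, len);		/* scalar fallback */
		return;
	}

	kernel_neon_begin(&fp_state);
	memcpy(dst, src, len);	/* stand-in for the NEON/SVE-accelerated loop */
	kernel_neon_end(&fp_state);
}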
- */ - if (!system_supports_fa64()) - ffr = false; - } - } - - sve_load_state(sve_state + sve_ffr_offset(sve_max_vl()), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - ffr); - - __this_cpu_write(efi_sve_state_used, false); - } else { - fpsimd_load_state(this_cpu_ptr(&efi_fpsimd_state)); - } + fpsimd_load_state(&efi_fpsimd_state); } } diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index d7c0d023dfe5..5a1554a44162 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -258,10 +258,17 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ftrace_modify_code(pc, 0, new, false); } -static struct plt_entry *get_ftrace_plt(struct module *mod) +static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr) { #ifdef CONFIG_MODULES - struct plt_entry *plt = mod->arch.ftrace_trampolines; + struct plt_entry *plt = NULL; + + if (within_module_mem_type(addr, mod, MOD_INIT_TEXT)) + plt = mod->arch.init_ftrace_trampolines; + else if (within_module_mem_type(addr, mod, MOD_TEXT)) + plt = mod->arch.ftrace_trampolines; + else + return NULL; return &plt[FTRACE_PLT_IDX]; #else @@ -320,20 +327,19 @@ static bool ftrace_find_callable_addr(struct dyn_ftrace *rec, * dealing with an out-of-range condition, we can assume it * is due to a module being loaded far away from the kernel. * - * NOTE: __module_text_address() must be called with preemption - * disabled, but we can rely on ftrace_lock to ensure that 'mod' + * NOTE: __module_text_address() must be called within a RCU read + * section, but we can rely on ftrace_lock to ensure that 'mod' * retains its validity throughout the remainder of this code. */ if (!mod) { - preempt_disable(); + guard(rcu)(); mod = __module_text_address(pc); - preempt_enable(); } if (WARN_ON(!mod)) return false; - plt = get_ftrace_plt(mod); + plt = get_ftrace_plt(mod, pc); if (!plt) { pr_err("ftrace: no module PLT for %ps\n", (void *)*addr); return false; @@ -486,7 +492,7 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, return ret; /* - * When using mcount, callsites in modules may have been initalized to + * When using mcount, callsites in modules may have been initialized to * call an arbitrary module PLT (which redirects to the _mcount stub) * rather than the ftrace PLT we'll use at runtime (which redirects to * the ftrace trampoline). 
We can ignore the old PLT when initializing diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 2ce73525de2c..87a822e5c4ca 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -89,7 +89,7 @@ SYM_CODE_START(primary_entry) adrp x1, early_init_stack mov sp, x1 mov x29, xzr - adrp x0, init_idmap_pg_dir + adrp x0, __pi_init_idmap_pg_dir mov x1, xzr bl __pi_create_init_idmap @@ -101,7 +101,7 @@ SYM_CODE_START(primary_entry) cbnz x19, 0f dmb sy mov x1, x0 // end of used region - adrp x0, init_idmap_pg_dir + adrp x0, __pi_init_idmap_pg_dir adr_l x2, dcache_inval_poc blr x2 b 1f @@ -299,7 +299,7 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) isb 0: - init_el2_hcr HCR_HOST_NVHE_FLAGS + init_el2_hcr HCR_HOST_NVHE_FLAGS | HCR_ATA init_el2_state /* Hypervisor stub */ @@ -507,7 +507,7 @@ SYM_FUNC_END(__no_granule_support) SYM_FUNC_START_LOCAL(__primary_switch) adrp x1, reserved_pg_dir - adrp x2, init_idmap_pg_dir + adrp x2, __pi_init_idmap_pg_dir bl __enable_mmu adrp x1, early_init_stack diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 18749e9a6c2d..9717568518ba 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -402,7 +402,7 @@ int swsusp_arch_suspend(void) * Memory allocated by get_safe_page() will be dealt with by the hibernate code, * we don't need to free it here. */ -int swsusp_arch_resume(void) +int __nocfi swsusp_arch_resume(void) { int rc; void *zero_page; diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 722ac45f9f7b..ab76b36dce82 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -22,6 +22,7 @@ #include <asm/current.h> #include <asm/debug-monitors.h> #include <asm/esr.h> +#include <asm/exception.h> #include <asm/hw_breakpoint.h> #include <asm/traps.h> #include <asm/cputype.h> @@ -618,8 +619,7 @@ NOKPROBE_SYMBOL(toggle_bp_registers); /* * Debug exception handlers. */ -static int breakpoint_handler(unsigned long unused, unsigned long esr, - struct pt_regs *regs) +void do_breakpoint(unsigned long esr, struct pt_regs *regs) { int i, step = 0, *kernel_step; u32 ctrl_reg; @@ -662,7 +662,7 @@ unlock: } if (!step) - return 0; + return; if (user_mode(regs)) { debug_info->bps_disabled = 1; @@ -670,7 +670,7 @@ unlock: /* If we're already stepping a watchpoint, just return. 
*/ if (debug_info->wps_disabled) - return 0; + return; if (test_thread_flag(TIF_SINGLESTEP)) debug_info->suspended_step = 1; @@ -681,7 +681,7 @@ unlock: kernel_step = this_cpu_ptr(&stepping_kernel_bp); if (*kernel_step != ARM_KERNEL_STEP_NONE) - return 0; + return; if (kernel_active_single_step()) { *kernel_step = ARM_KERNEL_STEP_SUSPEND; @@ -690,10 +690,8 @@ unlock: kernel_enable_single_step(regs); } } - - return 0; } -NOKPROBE_SYMBOL(breakpoint_handler); +NOKPROBE_SYMBOL(do_breakpoint); /* * Arm64 hardware does not always report a watchpoint hit address that matches @@ -752,8 +750,7 @@ static int watchpoint_report(struct perf_event *wp, unsigned long addr, return step; } -static int watchpoint_handler(unsigned long addr, unsigned long esr, - struct pt_regs *regs) +void do_watchpoint(unsigned long addr, unsigned long esr, struct pt_regs *regs) { int i, step = 0, *kernel_step, access, closest_match = 0; u64 min_dist = -1, dist; @@ -808,7 +805,7 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr, rcu_read_unlock(); if (!step) - return 0; + return; /* * We always disable EL0 watchpoints because the kernel can @@ -821,7 +818,7 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr, /* If we're already stepping a breakpoint, just return. */ if (debug_info->bps_disabled) - return 0; + return; if (test_thread_flag(TIF_SINGLESTEP)) debug_info->suspended_step = 1; @@ -832,7 +829,7 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr, kernel_step = this_cpu_ptr(&stepping_kernel_bp); if (*kernel_step != ARM_KERNEL_STEP_NONE) - return 0; + return; if (kernel_active_single_step()) { *kernel_step = ARM_KERNEL_STEP_SUSPEND; @@ -841,44 +838,41 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr, kernel_enable_single_step(regs); } } - - return 0; } -NOKPROBE_SYMBOL(watchpoint_handler); +NOKPROBE_SYMBOL(do_watchpoint); /* * Handle single-step exception. */ -int reinstall_suspended_bps(struct pt_regs *regs) +bool try_step_suspended_breakpoints(struct pt_regs *regs) { struct debug_info *debug_info = &current->thread.debug; - int handled_exception = 0, *kernel_step; - - kernel_step = this_cpu_ptr(&stepping_kernel_bp); + int *kernel_step = this_cpu_ptr(&stepping_kernel_bp); + bool handled_exception = false; /* - * Called from single-step exception handler. - * Return 0 if execution can resume, 1 if a SIGTRAP should be - * reported. + * Called from single-step exception entry. + * Return true if we stepped a breakpoint and can resume execution, + * false if we need to handle a single-step. */ if (user_mode(regs)) { if (debug_info->bps_disabled) { debug_info->bps_disabled = 0; toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL0, 1); - handled_exception = 1; + handled_exception = true; } if (debug_info->wps_disabled) { debug_info->wps_disabled = 0; toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL0, 1); - handled_exception = 1; + handled_exception = true; } if (handled_exception) { if (debug_info->suspended_step) { debug_info->suspended_step = 0; /* Allow exception handling to fall-through.
*/ - handled_exception = 0; + handled_exception = false; } else { user_disable_single_step(current); } @@ -892,17 +886,17 @@ int reinstall_suspended_bps(struct pt_regs *regs) if (*kernel_step != ARM_KERNEL_STEP_SUSPEND) { kernel_disable_single_step(); - handled_exception = 1; + handled_exception = true; } else { - handled_exception = 0; + handled_exception = false; } *kernel_step = ARM_KERNEL_STEP_NONE; } - return !handled_exception; + return handled_exception; } -NOKPROBE_SYMBOL(reinstall_suspended_bps); +NOKPROBE_SYMBOL(try_step_suspended_breakpoints); /* * Context-switcher for restoring suspended breakpoints. @@ -987,12 +981,6 @@ static int __init arch_hw_breakpoint_init(void) pr_info("found %d breakpoint and %d watchpoint registers.\n", core_num_brps, core_num_wrps); - /* Register debug fault handlers. */ - hook_debug_fault_code(DBG_ESR_EVT_HWBP, breakpoint_handler, SIGTRAP, - TRAP_HWBKPT, "hw-breakpoint handler"); - hook_debug_fault_code(DBG_ESR_EVT_HWWP, watchpoint_handler, SIGTRAP, - TRAP_HWBKPT, "hw-watchpoint handler"); - /* * Reset the breakpoint resources. We assume that a halting * debugger will leave the world in a nice state for us. diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index ae990da1eae5..634ddc904244 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -54,6 +54,11 @@ SYM_CODE_START_LOCAL(elx_sync) 1: cmp x0, #HVC_FINALISE_EL2 b.eq __finalise_el2 + cmp x0, #HVC_GET_ICH_VTR_EL2 + b.ne 2f + mrs_s x1, SYS_ICH_VTR_EL2 + b 9f + 2: cmp x0, #HVC_SOFT_RESTART b.ne 3f mov x0, x2 @@ -97,8 +102,7 @@ SYM_CODE_START_LOCAL(__finalise_el2) 2: // Engage the VHE magic! mov_q x0, HCR_HOST_VHE_FLAGS - msr hcr_el2, x0 - isb + msr_hcr_el2 x0 // Use the EL1 allocated stack, per-cpu offset mrs x0, sp_el1 diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index ef3a69cc398e..d4c7d45ae6bc 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -10,6 +10,16 @@ #error This file should only be included in vmlinux.lds.S #endif +#if defined(CONFIG_LD_IS_LLD) && CONFIG_LLD_VERSION < 210000 +#define ASSERT(...) 
+#endif + +#define PI_EXPORT_SYM(sym) \ + __PI_EXPORT_SYM(sym, __pi_ ## sym, Cannot export BSS symbol sym to startup code) +#define __PI_EXPORT_SYM(sym, pisym, msg)\ + PROVIDE(pisym = sym); \ + ASSERT((sym - KIMAGE_VADDR) < (__bss_start - KIMAGE_VADDR), #msg) + PROVIDE(__efistub_primary_entry = primary_entry); /* @@ -28,7 +38,7 @@ PROVIDE(__efistub__end = _end); PROVIDE(__efistub___inittext_end = __inittext_end); PROVIDE(__efistub__edata = _edata); #if defined(CONFIG_EFI_EARLYCON) || defined(CONFIG_SYSFB) -PROVIDE(__efistub_screen_info = screen_info); +PROVIDE(__efistub_sysfb_primary_display = sysfb_primary_display); #endif PROVIDE(__efistub__ctype = _ctype); @@ -36,40 +46,30 @@ PROVIDE(__pi___memcpy = __pi_memcpy); PROVIDE(__pi___memmove = __pi_memmove); PROVIDE(__pi___memset = __pi_memset); -PROVIDE(__pi_id_aa64isar1_override = id_aa64isar1_override); -PROVIDE(__pi_id_aa64isar2_override = id_aa64isar2_override); -PROVIDE(__pi_id_aa64mmfr0_override = id_aa64mmfr0_override); -PROVIDE(__pi_id_aa64mmfr1_override = id_aa64mmfr1_override); -PROVIDE(__pi_id_aa64mmfr2_override = id_aa64mmfr2_override); -PROVIDE(__pi_id_aa64pfr0_override = id_aa64pfr0_override); -PROVIDE(__pi_id_aa64pfr1_override = id_aa64pfr1_override); -PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override); -PROVIDE(__pi_id_aa64zfr0_override = id_aa64zfr0_override); -PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override); -PROVIDE(__pi_arm64_use_ng_mappings = arm64_use_ng_mappings); -#ifdef CONFIG_CAVIUM_ERRATUM_27456 -PROVIDE(__pi_cavium_erratum_27456_cpus = cavium_erratum_27456_cpus); -#endif -PROVIDE(__pi__ctype = _ctype); -PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); - -PROVIDE(__pi_init_idmap_pg_dir = init_idmap_pg_dir); -PROVIDE(__pi_init_idmap_pg_end = init_idmap_pg_end); -PROVIDE(__pi_init_pg_dir = init_pg_dir); -PROVIDE(__pi_init_pg_end = init_pg_end); -PROVIDE(__pi_swapper_pg_dir = swapper_pg_dir); - -PROVIDE(__pi__text = _text); -PROVIDE(__pi__stext = _stext); -PROVIDE(__pi__etext = _etext); -PROVIDE(__pi___start_rodata = __start_rodata); -PROVIDE(__pi___inittext_begin = __inittext_begin); -PROVIDE(__pi___inittext_end = __inittext_end); -PROVIDE(__pi___initdata_begin = __initdata_begin); -PROVIDE(__pi___initdata_end = __initdata_end); -PROVIDE(__pi__data = _data); -PROVIDE(__pi___bss_start = __bss_start); -PROVIDE(__pi__end = _end); +PI_EXPORT_SYM(id_aa64isar1_override); +PI_EXPORT_SYM(id_aa64isar2_override); +PI_EXPORT_SYM(id_aa64mmfr0_override); +PI_EXPORT_SYM(id_aa64mmfr1_override); +PI_EXPORT_SYM(id_aa64mmfr2_override); +PI_EXPORT_SYM(id_aa64pfr0_override); +PI_EXPORT_SYM(id_aa64pfr1_override); +PI_EXPORT_SYM(id_aa64smfr0_override); +PI_EXPORT_SYM(id_aa64zfr0_override); +PI_EXPORT_SYM(arm64_sw_feature_override); +PI_EXPORT_SYM(arm64_use_ng_mappings); +PI_EXPORT_SYM(_ctype); + +PI_EXPORT_SYM(swapper_pg_dir); + +PI_EXPORT_SYM(_text); +PI_EXPORT_SYM(_stext); +PI_EXPORT_SYM(_etext); +PI_EXPORT_SYM(__start_rodata); +PI_EXPORT_SYM(__inittext_begin); +PI_EXPORT_SYM(__inittext_end); +PI_EXPORT_SYM(__initdata_begin); +PI_EXPORT_SYM(__initdata_end); +PI_EXPORT_SYM(_data); #ifdef CONFIG_KVM @@ -91,6 +91,7 @@ KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb); KVM_NVHE_ALIAS(alt_cb_patch_nops); +KVM_NVHE_ALIAS(kvm_compute_ich_hcr_trap_bits); /* Global kernel state accessed by nVHE hyp code. 
*/ KVM_NVHE_ALIAS(kvm_vgic_global_state); @@ -105,6 +106,9 @@ KVM_NVHE_ALIAS(__hyp_stub_vectors); KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); +/* Static key indicating whether GICv3 has GICv2 compatibility */ +KVM_NVHE_ALIAS(vgic_v3_has_v2_compat); + /* Static key which is set if CNTVOFF_EL2 is unusable */ KVM_NVHE_ALIAS(broken_cntvoff_key); @@ -112,11 +116,6 @@ KVM_NVHE_ALIAS(broken_cntvoff_key); KVM_NVHE_ALIAS(__start___kvm_ex_table); KVM_NVHE_ALIAS(__stop___kvm_ex_table); -/* PMU available static key */ -#ifdef CONFIG_HW_PERF_EVENTS -KVM_NVHE_ALIAS(kvm_arm_pmu_available); -#endif - /* Position-independent library routines */ KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page); KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page); @@ -135,8 +134,14 @@ KVM_NVHE_ALIAS(__hyp_text_start); KVM_NVHE_ALIAS(__hyp_text_end); KVM_NVHE_ALIAS(__hyp_bss_start); KVM_NVHE_ALIAS(__hyp_bss_end); +KVM_NVHE_ALIAS(__hyp_data_start); +KVM_NVHE_ALIAS(__hyp_data_end); KVM_NVHE_ALIAS(__hyp_rodata_start); KVM_NVHE_ALIAS(__hyp_rodata_end); +#ifdef CONFIG_NVHE_EL2_TRACING +KVM_NVHE_ALIAS(__hyp_event_ids_start); +KVM_NVHE_ALIAS(__hyp_event_ids_end); +#endif /* pKVM static key */ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); @@ -147,4 +152,17 @@ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); _kernel_codesize = ABSOLUTE(__inittext_end - _text); #endif +/* + * LLD will occasionally error out with a '__init_end does not converge' error + * if INIT_IDMAP_DIR_SIZE is defined in terms of _end, as this results in a + * circular dependency. Counter this by dimensioning the initial IDMAP page + * tables based on kimage_limit, which is defined such that its value should + * not change as a result of the initdata segment being pushed over a 64k + * segment boundary due to changes in INIT_IDMAP_DIR_SIZE, provided that its + * value doesn't change by more than 2M between linker passes. + */ +kimage_limit = ALIGN(ABSOLUTE(_end + SZ_64K), SZ_2M); + +#undef ASSERT + #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 85087e2df564..15dedb385b9e 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -51,7 +51,6 @@ static void init_irq_scs(void) scs_alloc(early_cpu_to_node(cpu)); } -#ifdef CONFIG_VMAP_STACK static void __init init_irq_stacks(void) { int cpu; @@ -62,20 +61,8 @@ static void __init init_irq_stacks(void) per_cpu(irq_stack_ptr, cpu) = p; } } -#else -/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. 
*/ -DEFINE_PER_CPU_ALIGNED(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); -static void init_irq_stacks(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack, cpu); -} -#endif - -#ifndef CONFIG_PREEMPT_RT +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK static void ____do_softirq(struct pt_regs *regs) { __do_softirq(); diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 1da3e25f9d9e..c9503ed45a6c 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -10,8 +10,6 @@ #include <asm/cpufeature.h> #include <asm/memory.h> -u16 __initdata memstart_offset_seed; - bool __ro_after_init __kaslr_is_enabled = false; void __init kaslr_init(void) diff --git a/arch/arm64/kernel/kexec_image.c b/arch/arm64/kernel/kexec_image.c index 532d72ea42ee..b70f4df15a1a 100644 --- a/arch/arm64/kernel/kexec_image.c +++ b/arch/arm64/kernel/kexec_image.c @@ -41,7 +41,7 @@ static void *image_load(struct kimage *image, struct arm64_image_header *h; u64 flags, value; bool be_image, be_kernel; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; unsigned long text_offset, kernel_segment_number; struct kexec_segment *kernel_segment; int ret; diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index f3c4d3a8a20f..968324a79a89 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -234,23 +234,23 @@ int kgdb_arch_handle_exception(int exception_vector, int signo, return err; } -static int kgdb_brk_fn(struct pt_regs *regs, unsigned long esr) +int kgdb_brk_handler(struct pt_regs *regs, unsigned long esr) { kgdb_handle_exception(1, SIGTRAP, 0, regs); return DBG_HOOK_HANDLED; } -NOKPROBE_SYMBOL(kgdb_brk_fn) +NOKPROBE_SYMBOL(kgdb_brk_handler) -static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned long esr) +int kgdb_compiled_brk_handler(struct pt_regs *regs, unsigned long esr) { compiled_break = 1; kgdb_handle_exception(1, SIGTRAP, 0, regs); return DBG_HOOK_HANDLED; } -NOKPROBE_SYMBOL(kgdb_compiled_brk_fn); +NOKPROBE_SYMBOL(kgdb_compiled_brk_handler); -static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned long esr) +int kgdb_single_step_handler(struct pt_regs *regs, unsigned long esr) { if (!kgdb_single_step) return DBG_HOOK_ERROR; @@ -258,21 +258,7 @@ static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned long esr) kgdb_handle_exception(0, SIGTRAP, 0, regs); return DBG_HOOK_HANDLED; } -NOKPROBE_SYMBOL(kgdb_step_brk_fn); - -static struct break_hook kgdb_brkpt_hook = { - .fn = kgdb_brk_fn, - .imm = KGDB_DYN_DBG_BRK_IMM, -}; - -static struct break_hook kgdb_compiled_brkpt_hook = { - .fn = kgdb_compiled_brk_fn, - .imm = KGDB_COMPILED_DBG_BRK_IMM, -}; - -static struct step_hook kgdb_step_hook = { - .fn = kgdb_step_brk_fn -}; +NOKPROBE_SYMBOL(kgdb_single_step_handler); static int __kgdb_notify(struct die_args *args, unsigned long cmd) { @@ -311,15 +297,7 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { - int ret = register_die_notifier(&kgdb_notifier); - - if (ret != 0) - return ret; - - register_kernel_break_hook(&kgdb_brkpt_hook); - register_kernel_break_hook(&kgdb_compiled_brkpt_hook); - register_kernel_step_hook(&kgdb_step_hook); - return 0; + return register_die_notifier(&kgdb_notifier); } /* @@ -329,9 +307,6 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { - unregister_kernel_break_hook(&kgdb_brkpt_hook); - unregister_kernel_break_hook(&kgdb_compiled_brkpt_hook); - unregister_kernel_step_hook(&kgdb_step_hook); unregister_die_notifier(&kgdb_notifier); } diff 
--git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 6f121a0164a4..c5693a32e49b 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -129,9 +129,6 @@ int machine_kexec_post_load(struct kimage *kimage) } /* Create a copy of the linear map */ - trans_pgd = kexec_page_alloc(kimage); - if (!trans_pgd) - return -ENOMEM; rc = trans_pgd_create_copy(&info, &trans_pgd, PAGE_OFFSET, PAGE_END); if (rc) return rc; @@ -251,7 +248,7 @@ void crash_post_resume(void) * marked as Reserved as memory was allocated via memblock_reserve(). * * In hibernation, the pages which are Reserved and yet "nosave" are excluded - * from the hibernation iamge. crash_is_nosave() does thich check for crash + * from the hibernation image. crash_is_nosave() does thich check for crash * dump kernel and will reduce the total size of hibernation image. */ diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index af1ca875c52c..e31fabed378a 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -52,7 +52,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz) for_each_mem_range(i, &start, &end) nr_ranges++; - cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL); + cmem = kmalloc_flex(*cmem, ranges, nr_ranges); if (!cmem) return -ENOMEM; @@ -94,7 +94,7 @@ int load_other_segments(struct kimage *image, char *initrd, unsigned long initrd_len, char *cmdline) { - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; void *dtb = NULL; unsigned long initrd_load_addr = 0, dtb_len, orig_segments = image->nr_segments; @@ -134,6 +134,10 @@ int load_other_segments(struct kimage *image, kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", image->elf_load_addr, kbuf.bufsz, kbuf.memsz); + + ret = crash_load_dm_crypt_keys(image); + if (ret) + goto out_err; } #endif diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c index bde32979c06a..7afd370da9f4 100644 --- a/arch/arm64/kernel/module-plts.c +++ b/arch/arm64/kernel/module-plts.c @@ -283,7 +283,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, unsigned long core_plts = 0; unsigned long init_plts = 0; Elf64_Sym *syms = NULL; - Elf_Shdr *pltsec, *tramp = NULL; + Elf_Shdr *pltsec, *tramp = NULL, *init_tramp = NULL; int i; /* @@ -298,6 +298,9 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, else if (!strcmp(secstrings + sechdrs[i].sh_name, ".text.ftrace_trampoline")) tramp = sechdrs + i; + else if (!strcmp(secstrings + sechdrs[i].sh_name, + ".init.text.ftrace_trampoline")) + init_tramp = sechdrs + i; else if (sechdrs[i].sh_type == SHT_SYMTAB) syms = (Elf64_Sym *)sechdrs[i].sh_addr; } @@ -363,5 +366,12 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, tramp->sh_size = NR_FTRACE_PLTS * sizeof(struct plt_entry); } + if (init_tramp) { + init_tramp->sh_type = SHT_NOBITS; + init_tramp->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + init_tramp->sh_addralign = __alignof__(struct plt_entry); + init_tramp->sh_size = NR_FTRACE_PLTS * sizeof(struct plt_entry); + } + return 0; } diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 06bb680bfe97..24adb581af0e 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -23,6 +23,7 @@ #include <asm/insn.h> #include <asm/scs.h> #include <asm/sections.h> +#include <asm/text-patching.h> enum aarch64_reloc_op { RELOC_OP_NONE, @@ -48,7 +49,17 @@ static u64 
do_reloc(enum aarch64_reloc_op reloc_op, __le32 *place, u64 val) return 0; } -static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) +#define WRITE_PLACE(place, val, mod) do { \ + __typeof__(val) __val = (val); \ + \ + if (mod->state == MODULE_STATE_UNFORMED) \ + *(place) = __val; \ + else \ + aarch64_insn_copy(place, &(__val), sizeof(*place)); \ +} while (0) + +static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len, + struct module *me) { s64 sval = do_reloc(op, place, val); @@ -66,7 +77,7 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) switch (len) { case 16: - *(s16 *)place = sval; + WRITE_PLACE((s16 *)place, sval, me); switch (op) { case RELOC_OP_ABS: if (sval < 0 || sval > U16_MAX) @@ -82,7 +93,7 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) } break; case 32: - *(s32 *)place = sval; + WRITE_PLACE((s32 *)place, sval, me); switch (op) { case RELOC_OP_ABS: if (sval < 0 || sval > U32_MAX) @@ -98,7 +109,7 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) } break; case 64: - *(s64 *)place = sval; + WRITE_PLACE((s64 *)place, sval, me); break; default: pr_err("Invalid length (%d) for data relocation\n", len); @@ -113,7 +124,8 @@ enum aarch64_insn_movw_imm_type { }; static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, enum aarch64_insn_movw_imm_type imm_type) + int lsb, enum aarch64_insn_movw_imm_type imm_type, + struct module *me) { u64 imm; s64 sval; @@ -145,7 +157,7 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction with the new encoding. */ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); - *place = cpu_to_le32(insn); + WRITE_PLACE(place, cpu_to_le32(insn), me); if (imm > U16_MAX) return -ERANGE; @@ -154,7 +166,8 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, int len, enum aarch64_insn_imm_type imm_type) + int lsb, int len, enum aarch64_insn_imm_type imm_type, + struct module *me) { u64 imm, imm_mask; s64 sval; @@ -170,7 +183,7 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction's immediate field. */ insn = aarch64_insn_encode_immediate(imm_type, insn, imm); - *place = cpu_to_le32(insn); + WRITE_PLACE(place, cpu_to_le32(insn), me); /* * Extract the upper value bits (including the sign bit) and @@ -189,17 +202,17 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, - __le32 *place, u64 val) + __le32 *place, u64 val, struct module *me) { u32 insn; if (!is_forbidden_offset_for_adrp(place)) return reloc_insn_imm(RELOC_OP_PAGE, place, val, 12, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, me); /* patch ADRP to ADR if it is in range */ if (!reloc_insn_imm(RELOC_OP_PREL, place, val & ~0xfff, 0, 21, - AARCH64_INSN_IMM_ADR)) { + AARCH64_INSN_IMM_ADR, me)) { insn = le32_to_cpu(*place); insn &= ~BIT(31); } else { @@ -211,7 +224,7 @@ static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, AARCH64_INSN_BRANCH_NOLINK); } - *place = cpu_to_le32(insn); + WRITE_PLACE(place, cpu_to_le32(insn), me); return 0; } @@ -255,23 +268,23 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, /* Data relocations. 
*/ case R_AARCH64_ABS64: overflow_check = false; - ovf = reloc_data(RELOC_OP_ABS, loc, val, 64); + ovf = reloc_data(RELOC_OP_ABS, loc, val, 64, me); break; case R_AARCH64_ABS32: - ovf = reloc_data(RELOC_OP_ABS, loc, val, 32); + ovf = reloc_data(RELOC_OP_ABS, loc, val, 32, me); break; case R_AARCH64_ABS16: - ovf = reloc_data(RELOC_OP_ABS, loc, val, 16); + ovf = reloc_data(RELOC_OP_ABS, loc, val, 16, me); break; case R_AARCH64_PREL64: overflow_check = false; - ovf = reloc_data(RELOC_OP_PREL, loc, val, 64); + ovf = reloc_data(RELOC_OP_PREL, loc, val, 64, me); break; case R_AARCH64_PREL32: - ovf = reloc_data(RELOC_OP_PREL, loc, val, 32); + ovf = reloc_data(RELOC_OP_PREL, loc, val, 32, me); break; case R_AARCH64_PREL16: - ovf = reloc_data(RELOC_OP_PREL, loc, val, 16); + ovf = reloc_data(RELOC_OP_PREL, loc, val, 16, me); break; /* MOVW instruction relocations. */ @@ -280,88 +293,88 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, fallthrough; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; fallthrough; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; fallthrough; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_UABS_G3: /* We're using the top bits so we can't overflow. */ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_SABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_SABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_SABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_PREL_G0_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_PREL_G0: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_PREL_G1_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_PREL_G1: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_PREL_G2_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_PREL_G2: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_PREL_G3: /* We're using the top bits so we can't overflow. */ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 48, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; /* Immediate instruction relocations. 
*/ case R_AARCH64_LD_PREL_LO19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, me); break; case R_AARCH64_ADR_PREL_LO21: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, me); break; case R_AARCH64_ADR_PREL_PG_HI21_NC: overflow_check = false; fallthrough; case R_AARCH64_ADR_PREL_PG_HI21: - ovf = reloc_insn_adrp(me, sechdrs, loc, val); + ovf = reloc_insn_adrp(me, sechdrs, loc, val, me); if (ovf && ovf != -ERANGE) return ovf; break; @@ -369,46 +382,46 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_LDST8_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 0, 12, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_LDST16_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 1, 11, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_LDST32_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 2, 10, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_LDST64_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 3, 9, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_LDST128_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 4, 8, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_TSTBR14: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 14, - AARCH64_INSN_IMM_14); + AARCH64_INSN_IMM_14, me); break; case R_AARCH64_CONDBR19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, me); break; case R_AARCH64_JUMP26: case R_AARCH64_CALL26: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, - AARCH64_INSN_IMM_26); + AARCH64_INSN_IMM_26, me); if (ovf == -ERANGE) { val = module_emit_plt_entry(me, sechdrs, loc, &rel[i], sym); if (!val) return -ENOEXEC; ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, - 26, AARCH64_INSN_IMM_26); + 26, AARCH64_INSN_IMM_26, me); } break; @@ -453,6 +466,17 @@ static int module_init_ftrace_plt(const Elf_Ehdr *hdr, __init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR); mod->arch.ftrace_trampolines = plts; + + s = find_section(hdr, sechdrs, ".init.text.ftrace_trampoline"); + if (!s) + return -ENOEXEC; + + plts = (void *)s->sh_addr; + + __init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR); + + mod->arch.init_ftrace_trampolines = plts; + #endif return 0; } @@ -465,16 +489,29 @@ int module_finalize(const Elf_Ehdr *hdr, int ret; s = find_section(hdr, sechdrs, ".altinstructions"); - if (s) - apply_alternatives_module((void *)s->sh_addr, s->sh_size); + if (s) { + ret = apply_alternatives_module((void *)s->sh_addr, s->sh_size); + if (ret < 0) { + pr_err("module %s: error occurred when applying alternatives\n", me->name); + return ret; + } + } if (scs_is_dynamic()) { s = find_section(hdr, sechdrs, ".init.eh_frame"); if (s) { - ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size); - if (ret) + /* + * Because we can reject modules that are malformed + * so SCS patching fails, skip dry run and try to patch + * it in place. If patching fails, the module would not + * be loaded anyway. 
+ */ + ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size, true); + if (ret) { pr_err("module %s: error occurred during dynamic SCS patching (%d)\n", me->name, ret); + return -ENOEXEC; + } } } diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c new file mode 100644 index 000000000000..3a490de4fa12 --- /dev/null +++ b/arch/arm64/kernel/mpam.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Arm Ltd. */ + +#include <asm/mpam.h> + +#include <linux/arm_mpam.h> +#include <linux/cpu_pm.h> +#include <linux/jump_label.h> +#include <linux/percpu.h> + +DEFINE_STATIC_KEY_FALSE(mpam_enabled); +DEFINE_PER_CPU(u64, arm64_mpam_default); +DEFINE_PER_CPU(u64, arm64_mpam_current); + +u64 arm64_mpam_global_default; + +static int mpam_pm_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + u64 regval; + int cpu = smp_processor_id(); + + switch (cmd) { + case CPU_PM_EXIT: + /* + * Don't use mpam_thread_switch() as the system register + * value has changed under our feet. + */ + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (system_supports_sme()) { + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), + SYS_MPAMSM_EL1); + } + isb(); + + write_sysreg_s(regval, SYS_MPAM0_EL1); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block mpam_pm_nb = { + .notifier_call = mpam_pm_notifier, +}; + +static int __init arm64_mpam_register_cpus(void) +{ + u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr); + u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr); + + if (!system_supports_mpam()) + return 0; + + cpu_pm_register_notifier(&mpam_pm_nb); + return mpam_register_requestor(partid_max, pmg_max); +} +/* Must occur before mpam_msc_driver_init() from subsys_initcall() */ +arch_initcall(arm64_mpam_register_cpus) diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 2fbfd27ff5f2..6874b16d0657 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -157,6 +157,24 @@ void mte_enable_kernel_asymm(void) mte_enable_kernel_sync(); } } + +int mte_enable_kernel_store_only(void) +{ + /* + * If the CPU does not support MTE store only, + * the kernel checks all operations. + */ + if (!cpus_have_cap(ARM64_MTE_STORE_ONLY)) + return -EINVAL; + + sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCSO_MASK, + SYS_FIELD_PREP(SCTLR_EL1, TCSO, 1)); + isb(); + + pr_info_once("MTE: enabled store only mode at EL1\n"); + + return 0; +} #endif #ifdef CONFIG_KASAN_HW_TAGS @@ -200,7 +218,7 @@ static void mte_update_sctlr_user(struct task_struct *task) * program requested values go with what was requested. */ resolved_mte_tcf = (mte_ctrl & pref) ? pref : mte_ctrl; - sctlr &= ~SCTLR_EL1_TCF0_MASK; + sctlr &= ~(SCTLR_EL1_TCF0_MASK | SCTLR_EL1_TCSO0_MASK); /* * Pick an actual setting. The order in which we check for * set bits and map into register values determines our @@ -212,6 +230,10 @@ static void mte_update_sctlr_user(struct task_struct *task) sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, ASYNC); else if (resolved_mte_tcf & MTE_CTRL_TCF_SYNC) sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, SYNC); + + if (mte_ctrl & MTE_CTRL_STORE_ONLY) + sctlr |= SYS_FIELD_PREP(SCTLR_EL1, TCSO0, 1); + task->thread.sctlr_user = sctlr; } @@ -269,6 +291,9 @@ void mte_thread_switch(struct task_struct *next) /* TCO may not have been disabled on exception entry for the current task. 
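The new PR_MTE_STORE_ONLY control above reaches set_mte_ctrl() through the existing PR_SET_TAGGED_ADDR_CTRL prctl. A hedged userspace sketch, assuming a uapi header that already carries the PR_MTE_STORE_ONLY bit added by this series; with store-only mode the CPU tag-checks stores but not loads:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	/* Synchronous tag check faults, all tags except 0 usable by IRG,
	 * plus the new store-only checking mode. */
	unsigned long ctrl = PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC |
			     PR_MTE_STORE_ONLY |
			     (0xfffeUL << PR_MTE_TAG_SHIFT);

	if (prctl(PR_SET_TAGGED_ADDR_CTRL, ctrl, 0, 0, 0)) {
		perror("PR_SET_TAGGED_ADDR_CTRL");
		return 1;
	}
	printf("MTE store-only tag checking requested\n");
	return 0;
}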
*/ mte_disable_tco_entry(next); + if (!system_uses_mte_async_or_asymm_mode()) + return; + /* * Check if an async tag exception occurred at EL1. * @@ -293,8 +318,8 @@ void mte_cpu_setup(void) * CnP is not a boot feature so MTE gets enabled before CnP, but let's * make sure that is the case. */ - BUG_ON(read_sysreg(ttbr0_el1) & TTBR_CNP_BIT); - BUG_ON(read_sysreg(ttbr1_el1) & TTBR_CNP_BIT); + BUG_ON(read_sysreg(ttbr0_el1) & TTBRx_EL1_CnP); + BUG_ON(read_sysreg(ttbr1_el1) & TTBRx_EL1_CnP); /* Normal Tagged memory type at the corresponding MAIR index */ sysreg_clear_set(mair_el1, @@ -328,6 +353,9 @@ void mte_suspend_enter(void) if (!system_supports_mte()) return; + if (!system_uses_mte_async_or_asymm_mode()) + return; + /* * The barriers are required to guarantee that the indirect writes * to TFSR_EL1 are synchronized before we report the state. @@ -371,6 +399,9 @@ long set_mte_ctrl(struct task_struct *task, unsigned long arg) (arg & PR_MTE_TCF_SYNC)) mte_ctrl |= MTE_CTRL_TCF_ASYMM; + if (arg & PR_MTE_STORE_ONLY) + mte_ctrl |= MTE_CTRL_STORE_ONLY; + task->thread.mte_ctrl = mte_ctrl; if (task == current) { preempt_disable(); @@ -398,6 +429,8 @@ long get_mte_ctrl(struct task_struct *task) ret |= PR_MTE_TCF_ASYNC; if (mte_ctrl & MTE_CTRL_TCF_SYNC) ret |= PR_MTE_TCF_SYNC; + if (mte_ctrl & MTE_CTRL_STORE_ONLY) + ret |= PR_MTE_STORE_ONLY; return ret; } @@ -449,9 +482,10 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, folio = page_folio(page); if (folio_test_hugetlb(folio)) - WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio)); + WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio) && + !is_huge_zero_folio(folio)); else - WARN_ON_ONCE(!page_mte_tagged(page)); + WARN_ON_ONCE(!page_mte_tagged(page) && !is_zero_page(page)); /* limit access to the end of the page */ offset = offset_in_page(addr); diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index aa718d6a9274..572efb96b23f 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -19,21 +19,12 @@ #include <linux/slab.h> #include <linux/types.h> #include <linux/static_call.h> +#include <linux/sched/cputime.h> #include <asm/paravirt.h> #include <asm/pvclock-abi.h> #include <asm/smp_plat.h> -struct static_key paravirt_steal_enabled; -struct static_key paravirt_steal_rq_enabled; - -static u64 native_steal_clock(int cpu) -{ - return 0; -} - -DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); - struct pv_time_stolen_time_region { struct pvclock_vcpu_stolen_time __rcu *kaddr; }; diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index 4d11a8c29181..be92d73c25b2 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -2,7 +2,7 @@ # Copyright 2022 Google LLC KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ - -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \ + -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_KSTACK_ERASE) \ $(DISABLE_LATENT_ENTROPY_PLUGIN) \ $(call cc-option,-mbranch-protection=none) \ -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \ @@ -41,4 +41,4 @@ obj-y := idreg-override.pi.o \ obj-$(CONFIG_RELOCATABLE) += relocate.pi.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_early.pi.o obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.pi.o -extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) +targets := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index c6b185b885f7..bc57b290e5e7 100644 --- a/arch/arm64/kernel/pi/idreg-override.c 
+++ b/arch/arm64/kernel/pi/idreg-override.c @@ -127,6 +127,7 @@ static const struct ftr_set_desc pfr0 __prel64_initconst = { .fields = { FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), FIELD("el0", ID_AA64PFR0_EL1_EL0_SHIFT, NULL), + FIELD("mpam", ID_AA64PFR0_EL1_MPAM_SHIFT, NULL), {} }, }; @@ -154,6 +155,7 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = { FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL), FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), + FIELD("mpam_frac", ID_AA64PFR1_EL1_MPAM_frac_SHIFT, NULL), {} }, }; @@ -246,6 +248,7 @@ static const struct { { "rodata=off", "arm64_sw.rodataoff=1" }, { "arm64.nolva", "id_aa64mmfr2.varange=0" }, { "arm64.no32bit_el0", "id_aa64pfr0.el0=1" }, + { "arm64.nompam", "id_aa64pfr0.mpam=0 id_aa64pfr1.mpam_frac=0" }, }; static int __init parse_hexdigit(const char *p, u64 *v) diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c index 0257b43819db..e0e018046a46 100644 --- a/arch/arm64/kernel/pi/kaslr_early.c +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -18,8 +18,6 @@ #include "pi.h" -extern u16 memstart_offset_seed; - static u64 __init get_kaslr_seed(void *fdt, int node) { static char const seed_str[] __initconst = "kaslr-seed"; @@ -53,8 +51,6 @@ u64 __init kaslr_early_init(void *fdt, int chosen) return 0; } - memstart_offset_seed = seed & U16_MAX; - /* * OK, so we are proceeding with KASLR enabled. Calculate a suitable * kernel image offset from the seed. Let's place the kernel in the diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index e57b043f324b..a852264958c3 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -18,9 +18,9 @@ extern const u8 __eh_frame_start[], __eh_frame_end[]; -extern void idmap_cpu_replace_ttbr1(void *pgdir); +extern void idmap_cpu_replace_ttbr1(phys_addr_t pgdir); -static void __init map_segment(pgd_t *pg_dir, u64 *pgd, u64 va_offset, +static void __init map_segment(pgd_t *pg_dir, phys_addr_t *pgd, u64 va_offset, void *start, void *end, pgprot_t prot, bool may_use_cont, int root_level) { @@ -40,7 +40,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) { bool enable_scs = IS_ENABLED(CONFIG_UNWIND_PATCH_PAC_INTO_SCS); bool twopass = IS_ENABLED(CONFIG_RELOCATABLE); - u64 pgdp = (u64)init_pg_dir + PAGE_SIZE; + phys_addr_t pgdp = (phys_addr_t)init_pg_dir + PAGE_SIZE; pgprot_t text_prot = PAGE_KERNEL_ROX; pgprot_t data_prot = PAGE_KERNEL; pgprot_t prot; @@ -78,6 +78,12 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) twopass |= enable_scs; prot = twopass ? data_prot : text_prot; + /* + * [_stext, _text) isn't executed after boot and contains some + * non-executable, unpredictable data, so map it non-executable. 
+ */ + map_segment(init_pg_dir, &pgdp, va_offset, _text, _stext, data_prot, + false, root_level); map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot, !twopass, root_level); map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata, @@ -90,7 +96,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) true, root_level); dsb(ishst); - idmap_cpu_replace_ttbr1(init_pg_dir); + idmap_cpu_replace_ttbr1((phys_addr_t)init_pg_dir); if (twopass) { if (IS_ENABLED(CONFIG_RELOCATABLE)) @@ -98,7 +104,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) if (enable_scs) { scs_patch(__eh_frame_start + va_offset, - __eh_frame_end - __eh_frame_start); + __eh_frame_end - __eh_frame_start, false); asm("ic ialluis"); dynamic_scs_is_enabled = true; @@ -129,19 +135,19 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) /* Copy the root page table to its final location */ memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PAGE_SIZE); dsb(ishst); - idmap_cpu_replace_ttbr1(swapper_pg_dir); + idmap_cpu_replace_ttbr1((phys_addr_t)swapper_pg_dir); } -static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) +static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(phys_addr_t ttbr) { u64 sctlr = read_sysreg(sctlr_el1); - u64 tcr = read_sysreg(tcr_el1) | TCR_DS; + u64 tcr = read_sysreg(tcr_el1) | TCR_EL1_DS; u64 mmfr0 = read_sysreg(id_aa64mmfr0_el1); u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); - tcr &= ~TCR_IPS_MASK; - tcr |= parange << TCR_IPS_SHIFT; + tcr &= ~TCR_EL1_IPS_MASK; + tcr |= parange << TCR_EL1_IPS_SHIFT; asm(" msr sctlr_el1, %0 ;" " isb ;" @@ -159,7 +165,7 @@ static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) static void __init remap_idmap_for_lpa2(void) { /* clear the bits that change meaning once LPA2 is turned on */ - pteval_t mask = PTE_SHARED; + ptdesc_t mask = PTE_SHARED; /* * We have to clear bits [9:8] in all block or page descriptors in the @@ -172,30 +178,30 @@ static void __init remap_idmap_for_lpa2(void) */ create_init_idmap(init_pg_dir, mask); dsb(ishst); - set_ttbr0_for_lpa2((u64)init_pg_dir); + set_ttbr0_for_lpa2((phys_addr_t)init_pg_dir); /* * Recreate the initial ID map with the same granularity as before. * Don't bother with the FDT, we no longer need it after this. 
*/ memset(init_idmap_pg_dir, 0, - (u64)init_idmap_pg_end - (u64)init_idmap_pg_dir); + (char *)init_idmap_pg_end - (char *)init_idmap_pg_dir); create_init_idmap(init_idmap_pg_dir, mask); dsb(ishst); /* switch back to the updated initial ID map */ - set_ttbr0_for_lpa2((u64)init_idmap_pg_dir); + set_ttbr0_for_lpa2((phys_addr_t)init_idmap_pg_dir); /* wipe the temporary ID map from memory */ - memset(init_pg_dir, 0, (u64)init_pg_end - (u64)init_pg_dir); + memset(init_pg_dir, 0, (char *)init_pg_end - (char *)init_pg_dir); } -static void __init map_fdt(u64 fdt) +static void *__init map_fdt(phys_addr_t fdt) { static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); - u64 efdt = fdt + MAX_FDT_SIZE; - u64 ptep = (u64)ptes; + phys_addr_t efdt = fdt + MAX_FDT_SIZE; + phys_addr_t ptep = (phys_addr_t)ptes; /* We're idmapped when called */ /* * Map up to MAX_FDT_SIZE bytes, but avoid overlap with @@ -205,9 +211,34 @@ static void __init map_fdt(u64 fdt) fdt, PAGE_KERNEL, IDMAP_ROOT_LEVEL, (pte_t *)init_idmap_pg_dir, false, 0); dsb(ishst); + + return (void *)fdt; +} + +/* + * PI version of the Cavium Eratum 27456 detection, which makes it + * impossible to use non-global mappings. + */ +static bool __init ng_mappings_allowed(void) +{ + static const struct midr_range cavium_erratum_27456_cpus[] __initconst = { + /* Cavium ThunderX, T88 pass 1.x - 2.1 */ + MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), + /* Cavium ThunderX, T81 pass 1.0 */ + MIDR_REV(MIDR_THUNDERX_81XX, 0, 0), + {}, + }; + + for (const struct midr_range *r = cavium_erratum_27456_cpus; r->model; r++) { + if (midr_is_cpu_model_range(read_cpuid_id(), r->model, + r->rv_min, r->rv_max)) + return false; + } + + return true; } -asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) +asmlinkage void __init early_map_kernel(u64 boot_status, phys_addr_t fdt) { static char const chosen_str[] __initconst = "/chosen"; u64 va_base, pa_base = (u64)&_text; @@ -215,15 +246,14 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) int root_level = 4 - CONFIG_PGTABLE_LEVELS; int va_bits = VA_BITS; int chosen; - - map_fdt((u64)fdt); + void *fdt_mapped = map_fdt(fdt); /* Clear BSS and the initial page tables */ - memset(__bss_start, 0, (u64)init_pg_end - (u64)__bss_start); + memset(__bss_start, 0, (char *)init_pg_end - (char *)__bss_start); /* Parse the command line for CPU feature overrides */ - chosen = fdt_path_offset(fdt, chosen_str); - init_feature_override(boot_status, fdt, chosen); + chosen = fdt_path_offset(fdt_mapped, chosen_str); + init_feature_override(boot_status, fdt_mapped, chosen); if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && !cpu_has_lva()) { va_bits = VA_BITS_MIN; @@ -233,7 +263,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) } if (va_bits > VA_BITS_MIN) - sysreg_clear_set(tcr_el1, TCR_T1SZ_MASK, TCR_T1SZ(va_bits)); + sysreg_clear_set(tcr_el1, TCR_EL1_T1SZ_MASK, TCR_T1SZ(va_bits)); /* * The virtual KASLR displacement modulo 2MiB is decided by the @@ -243,10 +273,10 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) * fill in the high bits from the seed. 
*/ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - u64 kaslr_seed = kaslr_early_init(fdt, chosen); + u64 kaslr_seed = kaslr_early_init(fdt_mapped, chosen); if (kaslr_seed && kaslr_requires_kpti()) - arm64_use_ng_mappings = true; + arm64_use_ng_mappings = ng_mappings_allowed(); kaslr_offset |= kaslr_seed & ~(MIN_KIMG_ALIGN - 1); } diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 2b69e3beeef8..de52cd85c691 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -26,12 +26,13 @@ * @va_offset: Offset between a physical page and its current mapping * in the VA space */ -void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, - int level, pte_t *tbl, bool may_use_cont, u64 va_offset) +void __init map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset) { u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; - pteval_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; - int lshift = (3 - level) * (PAGE_SHIFT - 3); + ptdesc_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; + int lshift = (3 - level) * PTDESC_TABLE_SHIFT; u64 lmask = (PAGE_SIZE << lshift) - 1; start &= PAGE_MASK; @@ -45,12 +46,12 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, * clearing the mapping */ if (protval) - protval |= (level < 3) ? PMD_TYPE_SECT : PTE_TYPE_PAGE; + protval |= (level == 2) ? PMD_TYPE_SECT : PTE_TYPE_PAGE; while (start < end) { u64 next = min((start | lmask) + 1, PAGE_ALIGN(end)); - if (level < 3 && (start | next | pa) & lmask) { + if (level < 2 || (level == 2 && (start | next | pa) & lmask)) { /* * This chunk needs a finer grained mapping. Create a * table mapping if necessary and recurse. 
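A quick aside on the map_range() arithmetic in the hunk above: PTDESC_TABLE_SHIFT is PAGE_SHIFT - 3, so lshift and lmask still describe the span covered by one entry at each level. A standalone sketch of the calculation, assuming 4K pages purely for illustration:

/* Illustrative only: mirrors the lshift/lmask arithmetic in map_range()
 * for 4K pages, where PTDESC_TABLE_SHIFT == PAGE_SHIFT - 3 == 9. */
#include <stdio.h>

#define PAGE_SHIFT              12
#define PAGE_SIZE               (1UL << PAGE_SHIFT)
#define PTDESC_TABLE_SHIFT      (PAGE_SHIFT - 3)

int main(void)
{
        for (int level = 0; level <= 3; level++) {
                int lshift = (3 - level) * PTDESC_TABLE_SHIFT;
                unsigned long lmask = (PAGE_SIZE << lshift) - 1;

                /* level 3: 4 KiB, level 2: 2 MiB, level 1: 1 GiB, level 0: 512 GiB */
                printf("level %d covers %lu KiB per entry\n",
                       level, (lmask + 1) >> 10);
        }
        return 0;
}

Note that the same hunk also limits section (block) mappings to level 2, so ranges that could previously have used larger block entries now recurse down to 2 MiB blocks or 4 KiB pages.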
@@ -87,19 +88,22 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, } } -asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, pteval_t clrmask) +asmlinkage phys_addr_t __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) { - u64 ptep = (u64)pg_dir + PAGE_SIZE; + phys_addr_t ptep = (phys_addr_t)pg_dir + PAGE_SIZE; /* MMU is off */ pgprot_t text_prot = PAGE_KERNEL_ROX; pgprot_t data_prot = PAGE_KERNEL; pgprot_val(text_prot) &= ~clrmask; pgprot_val(data_prot) &= ~clrmask; - map_range(&ptep, (u64)_stext, (u64)__initdata_begin, (u64)_stext, - text_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); - map_range(&ptep, (u64)__initdata_begin, (u64)_end, (u64)__initdata_begin, - data_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); + /* MMU is off; pointer casts to phys_addr_t are safe */ + map_range(&ptep, (u64)_stext, (u64)__initdata_begin, + (phys_addr_t)_stext, text_prot, IDMAP_ROOT_LEVEL, + (pte_t *)pg_dir, false, 0); + map_range(&ptep, (u64)__initdata_begin, (u64)_end, + (phys_addr_t)__initdata_begin, data_prot, IDMAP_ROOT_LEVEL, + (pte_t *)pg_dir, false, 0); return ptep; } diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index 55d0cd64ef71..dac568e4a54f 100644 --- a/arch/arm64/kernel/pi/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -192,6 +192,14 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, size -= 2; break; + case DW_CFA_advance_loc4: + loc += *opcode++ * code_alignment_factor; + loc += (*opcode++ << 8) * code_alignment_factor; + loc += (*opcode++ << 16) * code_alignment_factor; + loc += (*opcode++ << 24) * code_alignment_factor; + size -= 4; + break; + case DW_CFA_def_cfa: case DW_CFA_offset_extended: size = skip_xleb128(&opcode, size); @@ -225,7 +233,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, return 0; } -int scs_patch(const u8 eh_frame[], int size) +int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run) { int code_alignment_factor = 1; bool fde_use_sdata8 = false; @@ -277,11 +285,13 @@ int scs_patch(const u8 eh_frame[], int size) } } else { ret = scs_handle_fde_frame(frame, code_alignment_factor, - fde_use_sdata8, true); + fde_use_sdata8, !skip_dry_run); if (ret) return ret; - scs_handle_fde_frame(frame, code_alignment_factor, - fde_use_sdata8, false); + + if (!skip_dry_run) + scs_handle_fde_frame(frame, code_alignment_factor, + fde_use_sdata8, false); } p += sizeof(frame->size) + frame->size; diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index c91e5e965cd3..aec3172d4003 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -22,15 +22,17 @@ static inline void *prel64_to_pointer(const prel64_t *offset) extern bool dynamic_scs_is_enabled; extern pgd_t init_idmap_pg_dir[], init_idmap_pg_end[]; +extern pgd_t init_pg_dir[], init_pg_end[]; void init_feature_override(u64 boot_status, const void *fdt, int chosen); u64 kaslr_early_init(void *fdt, int chosen); void relocate_kernel(u64 offset); -int scs_patch(const u8 eh_frame[], int size); +int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run); -void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, - int level, pte_t *tbl, bool may_use_cont, u64 va_offset); +void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset); -asmlinkage void early_map_kernel(u64 boot_status, void *fdt); +asmlinkage void early_map_kernel(u64 boot_status, phys_addr_t fdt); -asmlinkage u64 
create_init_idmap(pgd_t *pgd, pteval_t clrmask); +asmlinkage phys_addr_t create_init_idmap(pgd_t *pgd, ptdesc_t clrmask); diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 6438bf62e753..4137cc5ef031 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -108,9 +108,10 @@ arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) aarch64_insn_is_bl(insn)) { api->handler = simulate_b_bl; } else if (aarch64_insn_is_br(insn) || - aarch64_insn_is_blr(insn) || - aarch64_insn_is_ret(insn)) { - api->handler = simulate_br_blr_ret; + aarch64_insn_is_blr(insn)) { + api->handler = simulate_br_blr; + } else if (aarch64_insn_is_ret(insn)) { + api->handler = simulate_ret; } else { /* * Instruction cannot be stepped out-of-line and we don't diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index d9e462eafb95..43a0361a8bf0 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) "kprobes: " fmt +#include <linux/execmem.h> #include <linux/extable.h> #include <linux/kasan.h> #include <linux/kernel.h> @@ -41,6 +42,20 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); static void __kprobes post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *); +void *alloc_insn_page(void) +{ + void *addr; + + addr = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); + if (!addr) + return NULL; + if (set_memory_rox((unsigned long)addr, 1)) { + execmem_free(addr); + return NULL; + } + return addr; +} + static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { kprobe_opcode_t *addr = p->ainsn.xol_insn; @@ -292,8 +307,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr) return 0; } -static int __kprobes -kprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr) +int __kprobes +kprobe_brk_handler(struct pt_regs *regs, unsigned long esr) { struct kprobe *p, *cur_kprobe; struct kprobe_ctlblk *kcb; @@ -336,13 +351,8 @@ kprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_HANDLED; } -static struct break_hook kprobes_break_hook = { - .imm = KPROBES_BRK_IMM, - .fn = kprobe_breakpoint_handler, -}; - -static int __kprobes -kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr) +int __kprobes +kprobe_ss_brk_handler(struct pt_regs *regs, unsigned long esr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); unsigned long addr = instruction_pointer(regs); @@ -360,13 +370,8 @@ kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_ERROR; } -static struct break_hook kprobes_break_ss_hook = { - .imm = KPROBES_BRK_SS_IMM, - .fn = kprobe_breakpoint_ss_handler, -}; - -static int __kprobes -kretprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr) +int __kprobes +kretprobe_brk_handler(struct pt_regs *regs, unsigned long esr) { if (regs->pc != (unsigned long)__kretprobe_trampoline) return DBG_HOOK_ERROR; @@ -375,11 +380,6 @@ kretprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_HANDLED; } -static struct break_hook kretprobes_break_hook = { - .imm = KRETPROBES_BRK_IMM, - .fn = kretprobe_breakpoint_handler, -}; - /* * Provide a blacklist of symbols identifying ranges which cannot be kprobed. * This blacklist is exposed to userspace via debugfs (kprobes/blacklist). 
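The renamed kprobe_brk_handler()/kprobe_ss_brk_handler()/kretprobe_brk_handler() entry points above are what fire when a planted BRK is hit; the break_hook registration they previously required is removed here and in the following hunk. For orientation, a minimal consumer of this machinery using the standard kprobes module API (the probed symbol below is just an example, not something taken from this patch):

#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static struct kprobe kp = {
        .symbol_name = "do_sys_openat2",        /* example target */
};

static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs)
{
        pr_info("kprobe hit at %pS\n", (void *)instruction_pointer(regs));
        return 0;
}

static int __init kprobe_example_init(void)
{
        kp.pre_handler = handler_pre;
        /* Fails (e.g. -EINVAL) if the target falls in a blacklisted range. */
        return register_kprobe(&kp);
}

static void __exit kprobe_example_exit(void)
{
        unregister_kprobe(&kp);
}

module_init(kprobe_example_init);
module_exit(kprobe_example_exit);
MODULE_LICENSE("GPL");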
@@ -422,9 +422,5 @@ int __kprobes arch_trampoline_kprobe(struct kprobe *p) int __init arch_init_kprobes(void) { - register_kernel_break_hook(&kprobes_break_hook); - register_kernel_break_hook(&kprobes_break_ss_hook); - register_kernel_break_hook(&kretprobes_break_hook); - return 0; } diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index a362f3dbb3d1..b60739d3983f 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -12,7 +12,7 @@ SYM_CODE_START(__kretprobe_trampoline) /* * Trigger a breakpoint exception. The PC will be adjusted by - * kretprobe_breakpoint_handler(), and no subsequent instructions will + * kretprobe_brk_handler(), and no subsequent instructions will * be executed from the trampoline. */ brk #KRETPROBES_BRK_IMM diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 4c6d2d712fbd..89fbeb32107e 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -13,6 +13,7 @@ #include <asm/traps.h> #include "simulate-insn.h" +#include "asm/gcs.h" #define bbl_displacement(insn) \ sign_extend32(((insn) & 0x3ffffff) << 2, 27) @@ -49,6 +50,21 @@ static inline u32 get_w_reg(struct pt_regs *regs, int reg) return lower_32_bits(pt_regs_read_reg(regs, reg)); } +static inline int update_lr(struct pt_regs *regs, long addr) +{ + int err = 0; + + if (user_mode(regs) && task_gcs_el0_enabled(current)) { + push_user_gcs(addr, &err); + if (err) { + force_sig(SIGSEGV); + return err; + } + } + procedure_link_pointer_set(regs, addr); + return err; +} + static bool __kprobes check_cbz(u32 opcode, struct pt_regs *regs) { int xn = opcode & 0x1f; @@ -107,9 +123,9 @@ simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs) { int disp = bbl_displacement(opcode); - /* Link register is x30 */ if (opcode & (1 << 31)) - set_x_reg(regs, 30, addr + 4); + if (update_lr(regs, addr + 4)) + return; instruction_pointer_set(regs, addr + disp); } @@ -126,16 +142,34 @@ simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs) } void __kprobes -simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs) +simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs) { int xn = (opcode >> 5) & 0x1f; + u64 b_target = get_x_reg(regs, xn); - /* update pc first in case we're doing a "blr lr" */ - instruction_pointer_set(regs, get_x_reg(regs, xn)); - - /* Link register is x30 */ if (((opcode >> 21) & 0x3) == 1) - set_x_reg(regs, 30, addr + 4); + if (update_lr(regs, addr + 4)) + return; + + instruction_pointer_set(regs, b_target); +} + +void __kprobes +simulate_ret(u32 opcode, long addr, struct pt_regs *regs) +{ + u64 ret_addr; + int err = 0; + int xn = (opcode >> 5) & 0x1f; + u64 r_target = get_x_reg(regs, xn); + + if (user_mode(regs) && task_gcs_el0_enabled(current)) { + ret_addr = pop_user_gcs(&err); + if (err || ret_addr != r_target) { + force_sig(SIGSEGV); + return; + } + } + instruction_pointer_set(regs, r_target); } void __kprobes diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h index efb2803ec943..9e772a292d56 100644 --- a/arch/arm64/kernel/probes/simulate-insn.h +++ b/arch/arm64/kernel/probes/simulate-insn.h @@ -11,7 +11,8 @@ void simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs); void simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs); void simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs); -void simulate_br_blr_ret(u32 opcode, 
long addr, struct pt_regs *regs); +void simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs); +void simulate_ret(u32 opcode, long addr, struct pt_regs *regs); void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs); diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index cb3d05af36e3..2a9b0bc083a3 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -6,6 +6,7 @@ #include <linux/ptrace.h> #include <linux/uprobes.h> #include <asm/cacheflush.h> +#include <asm/gcs.h> #include "decode-insn.h" @@ -14,7 +15,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len) { - void *xol_page_kaddr = kmap_atomic(page); + void *xol_page_kaddr = kmap_local_page(page); void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); /* @@ -31,7 +32,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len); done: - kunmap_atomic(xol_page_kaddr); + kunmap_local(xol_page_kaddr); } unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) @@ -102,10 +103,7 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) * insn itself is trapped, then detect the case with the help of * invalid fault code which is being set in arch_uprobe_pre_xol */ - if (t->thread.fault_code != UPROBE_INV_FAULT_CODE) - return true; - - return false; + return t->thread.fault_code != UPROBE_INV_FAULT_CODE; } bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) @@ -130,7 +128,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) struct uprobe_task *utask = current->utask; /* - * Task has received a fatal signal, so reset back to probbed + * Task has received a fatal signal, so reset back to probed * address. */ instruction_pointer_set(regs, utask->vaddr); @@ -159,11 +157,43 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { unsigned long orig_ret_vaddr; + unsigned long gcs_ret_vaddr; + int err = 0; + u64 gcspr; orig_ret_vaddr = procedure_link_pointer(regs); + + if (task_gcs_el0_enabled(current)) { + gcspr = read_sysreg_s(SYS_GCSPR_EL0); + gcs_ret_vaddr = get_user_gcs((__force unsigned long __user *)gcspr, &err); + if (err) { + force_sig(SIGSEGV); + goto out; + } + + /* + * If the LR and GCS return addr don't match, then some kind of PAC + * signing or control flow occurred since entering the probed function. + * Likely because the user is attempting to retprobe on an instruction + * that isn't a function boundary or inside a leaf function. Explicitly + * abort this retprobe because it will generate a GCS exception. 
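The GCS handling introduced above (push_user_gcs() when simulating BL/BLR, pop_user_gcs() plus a comparison when simulating RET, and the equivalent check in the uretprobe hijack whose comment continues just below) all enforce one invariant: the address popped from the shadow stack must match the return address the program is about to use. A tiny self-contained model of that invariant, with made-up helper names and no relation to the kernel's real GCS plumbing:

/* Purely illustrative model of a guarded control stack: calls push the
 * return address, returns pop it and must see the same value. */
#include <stdbool.h>
#include <stdio.h>

static unsigned long shadow_stack[64];
static int shadow_top;

static void model_call(unsigned long ret_addr)
{
        shadow_stack[shadow_top++] = ret_addr;  /* push_user_gcs() analogue */
}

static bool model_ret(unsigned long target)
{
        unsigned long expected = shadow_stack[--shadow_top];    /* pop_user_gcs() analogue */

        /* A mismatch means the architectural return address was redirected,
         * or the probe point was not a genuine function boundary. */
        return expected == target;
}

int main(void)
{
        model_call(0x400123);
        printf("clean return: %s\n", model_ret(0x400123) ? "ok" : "fault");

        model_call(0x400456);
        printf("tampered return: %s\n", model_ret(0xdeadbeef) ? "ok" : "fault");
        return 0;
}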
+ */ + if (gcs_ret_vaddr != orig_ret_vaddr) { + orig_ret_vaddr = -1; + goto out; + } + + put_user_gcs(trampoline_vaddr, (__force unsigned long __user *)gcspr, &err); + if (err) { + force_sig(SIGSEGV); + goto out; + } + } + /* Replace the return addr with trampoline addr */ procedure_link_pointer_set(regs, trampoline_vaddr); +out: return orig_ret_vaddr; } @@ -173,7 +203,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, return NOTIFY_DONE; } -static int uprobe_breakpoint_handler(struct pt_regs *regs, +int uprobe_brk_handler(struct pt_regs *regs, unsigned long esr) { if (uprobe_pre_sstep_notifier(regs)) @@ -182,7 +212,7 @@ static int uprobe_breakpoint_handler(struct pt_regs *regs, return DBG_HOOK_ERROR; } -static int uprobe_single_step_handler(struct pt_regs *regs, +int uprobe_single_step_handler(struct pt_regs *regs, unsigned long esr) { struct uprobe_task *utask = current->utask; @@ -194,23 +224,3 @@ static int uprobe_single_step_handler(struct pt_regs *regs, return DBG_HOOK_ERROR; } -/* uprobe breakpoint handler hook */ -static struct break_hook uprobes_break_hook = { - .imm = UPROBES_BRK_IMM, - .fn = uprobe_breakpoint_handler, -}; - -/* uprobe single step handler hook */ -static struct step_hook uprobes_step_hook = { - .fn = uprobe_single_step_handler, -}; - -static int __init arch_init_uprobes(void) -{ - register_user_break_hook(&uprobes_break_hook); - register_user_step_hook(&uprobes_step_hook); - - return 0; -} - -device_initcall(arch_init_uprobes); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 42faebb7b712..c0bf1f46cdc6 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -51,6 +51,7 @@ #include <asm/fpsimd.h> #include <asm/gcs.h> #include <asm/mmu_context.h> +#include <asm/mpam.h> #include <asm/mte.h> #include <asm/processor.h> #include <asm/pointer_auth.h> @@ -288,8 +289,11 @@ static void flush_gcs(void) if (!system_supports_gcs()) return; - gcs_free(current); + current->thread.gcspr_el0 = 0; + current->thread.gcs_base = 0; + current->thread.gcs_size = 0; current->thread.gcs_el0_mode = 0; + current->thread.gcs_el0_locked = 0; write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); write_sysreg_s(0, SYS_GCSPR_EL0); } @@ -305,13 +309,13 @@ static int copy_thread_gcs(struct task_struct *p, p->thread.gcs_base = 0; p->thread.gcs_size = 0; + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; + gcs = gcs_alloc_thread_stack(p, args); if (IS_ERR_VALUE(gcs)) return PTR_ERR((void *)gcs); - p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; - p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; - return 0; } @@ -339,55 +343,38 @@ void flush_thread(void) void arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); - gcs_free(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - if (current->mm) - fpsimd_preserve_current_state(); + /* + * The current/src task's FPSIMD state may or may not be live, and may + * have been altered by ptrace after entry to the kernel. Save the + * effective FPSIMD state so that this will be copied into dst. + */ + fpsimd_save_and_flush_current_state(); + fpsimd_sync_from_effective_state(src); + *dst = *src; /* - * Detach src's sve_state (if any) from dst so that it does not - * get erroneously used or freed prematurely. dst's copies - * will be allocated on demand later on if dst uses SVE. 
- * For consistency, also clear TIF_SVE here: this could be done - * later in copy_process(), but to avoid tripping up future - * maintainers it is best not to leave TIF flags and buffers in - * an inconsistent state, even temporarily. + * Drop stale reference to src's sve_state and convert dst to + * non-streaming FPSIMD mode. */ + dst->thread.fp_type = FP_STATE_FPSIMD; dst->thread.sve_state = NULL; clear_tsk_thread_flag(dst, TIF_SVE); + task_smstop_sm(dst); /* - * In the unlikely event that we create a new thread with ZA - * enabled we should retain the ZA and ZT state so duplicate - * it here. This may be shortly freed if we exec() or if - * CLONE_SETTLS but it's simpler to do it here. To avoid - * confusing the rest of the code ensure that we have a - * sve_state allocated whenever sme_state is allocated. + * Drop stale reference to src's sme_state and ensure dst has ZA + * disabled. + * + * When necessary, ZA will be inherited later in copy_thread_za(). */ - if (thread_za_enabled(&src->thread)) { - dst->thread.sve_state = kzalloc(sve_state_size(src), - GFP_KERNEL); - if (!dst->thread.sve_state) - return -ENOMEM; - - dst->thread.sme_state = kmemdup(src->thread.sme_state, - sme_state_size(src), - GFP_KERNEL); - if (!dst->thread.sme_state) { - kfree(dst->thread.sve_state); - dst->thread.sve_state = NULL; - return -ENOMEM; - } - } else { - dst->thread.sme_state = NULL; - clear_tsk_thread_flag(dst, TIF_SME); - } - - dst->thread.fp_type = FP_STATE_FPSIMD; + dst->thread.sme_state = NULL; + clear_tsk_thread_flag(dst, TIF_SME); + dst->thread.svcr &= ~SVCR_ZA_MASK; /* clear any pending asynchronous tag fault raised by the parent */ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); @@ -395,11 +382,36 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } +static int copy_thread_za(struct task_struct *dst, struct task_struct *src) +{ + if (!thread_za_enabled(&src->thread)) + return 0; + + dst->thread.sve_state = kzalloc(sve_state_size(src), + GFP_KERNEL); + if (!dst->thread.sve_state) + return -ENOMEM; + + dst->thread.sme_state = kmemdup(src->thread.sme_state, + sme_state_size(src), + GFP_KERNEL); + if (!dst->thread.sme_state) { + kfree(dst->thread.sve_state); + dst->thread.sve_state = NULL; + return -ENOMEM; + } + + set_tsk_thread_flag(dst, TIF_SME); + dst->thread.svcr |= SVCR_ZA_MASK; + + return 0; +} + asmlinkage void ret_from_fork(void) asm("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); @@ -427,8 +439,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * out-of-sync with the saved value. */ *task_user_tls(p) = read_sysreg(tpidr_el0); - if (system_supports_tpidr2()) - p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); if (system_supports_poe()) p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); @@ -441,13 +451,39 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) } /* + * Due to the AAPCS64 "ZA lazy saving scheme", PSTATE.ZA and + * TPIDR2 need to be manipulated as a pair, and either both + * need to be inherited or both need to be reset. + * + * Within a process, child threads must not inherit their + * parent's TPIDR2 value or they may clobber their parent's + * stack at some later point. 
+ * + * When a process is fork()'d, the child must inherit ZA and + * TPIDR2 from its parent in case there was dormant ZA state. + * + * Use CLONE_VM to determine when the child will share the + * address space with the parent, and cannot safely inherit the + * state. + */ + if (system_supports_sme()) { + if (!(clone_flags & CLONE_VM)) { + p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + ret = copy_thread_za(p, current); + if (ret) + return ret; + } else { + p->thread.tpidr2_el0 = 0; + WARN_ON_ONCE(p->thread.svcr & SVCR_ZA_MASK); + } + } + + /* * If a TLS pointer was passed to clone, use it for the new - * thread. We also reset TPIDR2 if it's in use. + * thread. */ - if (clone_flags & CLONE_SETTLS) { + if (clone_flags & CLONE_SETTLS) p->thread.uw.tp_value = tls; - p->thread.tpidr2_el0 = 0; - } ret = copy_thread_gcs(p, args); if (ret != 0) @@ -638,6 +674,11 @@ static void permission_overlay_switch(struct task_struct *next) current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); if (current->thread.por_el0 != next->thread.por_el0) { write_sysreg_s(next->thread.por_el0, SYS_POR_EL0); + /* + * No ISB required as we can tolerate spurious Overlay faults - + * the fault handler will check again based on the new value + * of POR_EL0. + */ } } @@ -659,6 +700,29 @@ void update_sctlr_el1(u64 sctlr) isb(); } +static inline void debug_switch_state(void) +{ + if (system_uses_irq_prio_masking()) { + unsigned long daif_expected = 0; + unsigned long daif_actual = read_sysreg(daif); + unsigned long pmr_expected = GIC_PRIO_IRQOFF; + unsigned long pmr_actual = read_sysreg_s(SYS_ICC_PMR_EL1); + + WARN_ONCE(daif_actual != daif_expected || + pmr_actual != pmr_expected, + "Unexpected DAIF + PMR: 0x%lx + 0x%lx (expected 0x%lx + 0x%lx)\n", + daif_actual, pmr_actual, + daif_expected, pmr_expected); + } else { + unsigned long daif_expected = DAIF_PROCCTX_NOIRQ; + unsigned long daif_actual = read_sysreg(daif); + + WARN_ONCE(daif_actual != daif_expected, + "Unexpected DAIF value: 0x%lx (expected 0x%lx)\n", + daif_actual, daif_expected); + } +} + /* * Thread switching. */ @@ -668,6 +732,8 @@ struct task_struct *__switch_to(struct task_struct *prev, { struct task_struct *last; + debug_switch_state(); + fpsimd_thread_switch(next); tls_thread_switch(next); hw_breakpoint_thread_switch(next); @@ -680,10 +746,11 @@ struct task_struct *__switch_to(struct task_struct *prev, gcs_thread_switch(next); /* - * Complete any pending TLB or cache maintenance on this CPU in case - * the thread migrates to a different CPU. - * This full barrier is also required by the membarrier system - * call. + * Complete any pending TLB or cache maintenance on this CPU in case the + * thread migrates to a different CPU. This full barrier is also + * required by the membarrier system call. Additionally it makes any + * in-progress pgtable writes visible to the table walker; See + * emit_pte_barriers(). */ dsb(ish); @@ -697,6 +764,12 @@ struct task_struct *__switch_to(struct task_struct *prev, if (prev->thread.sctlr_user != next->thread.sctlr_user) update_sctlr_el1(next->thread.sctlr_user); + /* + * MPAM thread switch happens after the DSB to ensure prev's accesses + * use prev's MPAM settings. 
+ */ + mpam_thread_switch(next); + /* the actual thread switch */ last = cpu_switch_to(prev, next); @@ -815,10 +888,14 @@ long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg) if (is_compat_thread(ti)) return -EINVAL; - if (system_supports_mte()) + if (system_supports_mte()) { valid_mask |= PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC \ | PR_MTE_TAG_MASK; + if (cpus_have_cap(ARM64_MTE_STORE_ONLY)) + valid_mask |= PR_MTE_STORE_ONLY; + } + if (arg & ~valid_mask) return -EINVAL; diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index da53722f95d4..b3801f532b10 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -91,12 +91,7 @@ early_param("nospectre_v2", parse_spectre_v2_param); static bool spectre_v2_mitigations_off(void) { - bool ret = __nospectre_v2 || cpu_mitigations_off(); - - if (ret) - pr_info_once("spectre-v2 mitigation disabled by command line option\n"); - - return ret; + return __nospectre_v2 || cpu_mitigations_off(); } static const char *get_bhb_affected_string(enum mitigation_state bhb_state) @@ -172,7 +167,7 @@ static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void) return SPECTRE_UNAFFECTED; /* Alternatively, we have a list of unaffected CPUs */ - if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list)) + if (is_midr_in_range_list(spectre_v2_safe_list)) return SPECTRE_UNAFFECTED; return SPECTRE_VULNERABLE; @@ -331,7 +326,7 @@ bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope) }; WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range_list(read_cpuid_id(), spectre_v3a_unsafe_list); + return is_midr_in_range_list(spectre_v3a_unsafe_list); } void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused) @@ -421,13 +416,8 @@ early_param("ssbd", parse_spectre_v4_param); */ static bool spectre_v4_mitigations_off(void) { - bool ret = cpu_mitigations_off() || - __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED; - - if (ret) - pr_info_once("spectre-v4 mitigation disabled by command-line option\n"); - - return ret; + return cpu_mitigations_off() || + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED; } /* Do we need to toggle the mitigation state on entry to/exit from the kernel? */ @@ -475,7 +465,7 @@ static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void) { /* sentinel */ }, }; - if (is_midr_in_range_list(read_cpuid_id(), spectre_v4_safe_list)) + if (is_midr_in_range_list(spectre_v4_safe_list)) return SPECTRE_UNAFFECTED; /* CPU features are detected first */ @@ -845,52 +835,92 @@ static unsigned long system_bhb_mitigations; * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any * SCOPE_SYSTEM call will give the right answer. 
*/ -u8 spectre_bhb_loop_affected(int scope) +static bool is_spectre_bhb_safe(int scope) +{ + static const struct midr_range spectre_bhb_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A510), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A520), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + {}, + }; + static bool all_safe = true; + + if (scope != SCOPE_LOCAL_CPU) + return all_safe; + + if (is_midr_in_range_list(spectre_bhb_safe_list)) + return true; + + all_safe = false; + + return false; +} + +static u8 spectre_bhb_loop_affected(void) { u8 k = 0; - static u8 max_bhb_k; - - if (scope == SCOPE_LOCAL_CPU) { - static const struct midr_range spectre_bhb_k32_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), - MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), - MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), - {}, - }; - static const struct midr_range spectre_bhb_k24_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), - {}, - }; - static const struct midr_range spectre_bhb_k11_list[] = { - MIDR_ALL_VERSIONS(MIDR_AMPERE1), - {}, - }; - static const struct midr_range spectre_bhb_k8_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), - {}, - }; - - if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) - k = 32; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) - k = 24; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k11_list)) - k = 11; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k8_list)) - k = 8; - - max_bhb_k = max(max_bhb_k, k); - } else { - k = max_bhb_k; - } + + static const struct midr_range spectre_bhb_k132_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), + {}, + }; + static const struct midr_range spectre_bhb_k38_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), + {}, + }; + static const struct midr_range spectre_bhb_k32_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), + {}, + }; + static const struct midr_range spectre_bhb_k24_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD), + MIDR_ALL_VERSIONS(MIDR_HISI_HIP09), + {}, + }; + static const struct midr_range spectre_bhb_k11_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + {}, + }; + static const struct midr_range spectre_bhb_k8_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + {}, + }; + + if (is_midr_in_range_list(spectre_bhb_k132_list)) + k = 132; + else if (is_midr_in_range_list(spectre_bhb_k38_list)) + k = 38; + 
else if (is_midr_in_range_list(spectre_bhb_k32_list)) + k = 32; + else if (is_midr_in_range_list(spectre_bhb_k24_list)) + k = 24; + else if (is_midr_in_range_list(spectre_bhb_k11_list)) + k = 11; + else if (is_midr_in_range_list(spectre_bhb_k8_list)) + k = 8; return k; } @@ -916,29 +946,13 @@ static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void) } } -static bool is_spectre_bhb_fw_affected(int scope) +static bool has_spectre_bhb_fw_mitigation(void) { - static bool system_affected; enum mitigation_state fw_state; bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE; - static const struct midr_range spectre_bhb_firmware_mitigated_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), - {}, - }; - bool cpu_in_list = is_midr_in_range_list(read_cpuid_id(), - spectre_bhb_firmware_mitigated_list); - - if (scope != SCOPE_LOCAL_CPU) - return system_affected; fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); - if (cpu_in_list || (has_smccc && fw_state == SPECTRE_MITIGATED)) { - system_affected = true; - return true; - } - - return false; + return has_smccc && fw_state == SPECTRE_MITIGATED; } static bool supports_ecbhb(int scope) @@ -954,6 +968,8 @@ static bool supports_ecbhb(int scope) ID_AA64MMFR1_EL1_ECBHB_SHIFT); } +static u8 max_bhb_k; + bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope) { @@ -962,16 +978,23 @@ bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, if (supports_csv2p3(scope)) return false; - if (supports_clearbhb(scope)) - return true; + if (is_spectre_bhb_safe(scope)) + return false; - if (spectre_bhb_loop_affected(scope)) - return true; + /* + * At this point the core isn't known to be "safe" so we're going to + * assume it's vulnerable. We still need to update `max_bhb_k` though, + * but only if we aren't mitigating with clearbhb though. + */ + if (scope == SCOPE_LOCAL_CPU && !supports_clearbhb(SCOPE_LOCAL_CPU)) + max_bhb_k = max(max_bhb_k, spectre_bhb_loop_affected()); - if (is_spectre_bhb_fw_affected(scope)) - return true; + return true; +} - return false; +u8 get_spectre_bhb_loop_value(void) +{ + return max_bhb_k; } static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) @@ -991,7 +1014,7 @@ static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) isb(); } -static bool __read_mostly __nospectre_bhb; +bool __read_mostly __nospectre_bhb; static int __init parse_spectre_bhb_param(char *str) { __nospectre_bhb = true; @@ -1002,7 +1025,7 @@ early_param("nospectre_bhb", parse_spectre_bhb_param); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) { bp_hardening_cb_t cpu_cb; - enum mitigation_state fw_state, state = SPECTRE_VULNERABLE; + enum mitigation_state state = SPECTRE_VULNERABLE; struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU)) @@ -1011,9 +1034,7 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) { /* No point mitigating Spectre-BHB alone. 
*/ } else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) { - pr_info_once("spectre-bhb mitigation disabled by compile time option\n"); - } else if (cpu_mitigations_off() || __nospectre_bhb) { - pr_info_once("spectre-bhb mitigation disabled by command line option\n"); + /* Do nothing */ } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { state = SPECTRE_MITIGATED; set_bit(BHB_HW, &system_bhb_mitigations); @@ -1028,7 +1049,7 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN); state = SPECTRE_MITIGATED; set_bit(BHB_INSN, &system_bhb_mitigations); - } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { + } else if (spectre_bhb_loop_affected()) { /* * Ensure KVM uses the indirect vector which will have the * branchy-loop added. A57/A72-r0 will already have selected @@ -1041,37 +1062,39 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP); state = SPECTRE_MITIGATED; set_bit(BHB_LOOP, &system_bhb_mitigations); - } else if (is_spectre_bhb_fw_affected(SCOPE_LOCAL_CPU)) { - fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); - if (fw_state == SPECTRE_MITIGATED) { - /* - * Ensure KVM uses one of the spectre bp_hardening - * vectors. The indirect vector doesn't include the EL3 - * call, so needs upgrading to - * HYP_VECTOR_SPECTRE_INDIRECT. - */ - if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) - data->slot += 1; - - this_cpu_set_vectors(EL1_VECTOR_BHB_FW); - - /* - * The WA3 call in the vectors supersedes the WA1 call - * made during context-switch. Uninstall any firmware - * bp_hardening callback. - */ - cpu_cb = spectre_v2_get_sw_mitigation_cb(); - if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) - __this_cpu_write(bp_hardening_data.fn, NULL); - - state = SPECTRE_MITIGATED; - set_bit(BHB_FW, &system_bhb_mitigations); - } + } else if (has_spectre_bhb_fw_mitigation()) { + /* + * Ensure KVM uses one of the spectre bp_hardening + * vectors. The indirect vector doesn't include the EL3 + * call, so needs upgrading to + * HYP_VECTOR_SPECTRE_INDIRECT. + */ + if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) + data->slot += 1; + + this_cpu_set_vectors(EL1_VECTOR_BHB_FW); + + /* + * The WA3 call in the vectors supersedes the WA1 call + * made during context-switch. Uninstall any firmware + * bp_hardening callback. 
+ */ + cpu_cb = spectre_v2_get_sw_mitigation_cb(); + if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) + __this_cpu_write(bp_hardening_data.fn, NULL); + + state = SPECTRE_MITIGATED; + set_bit(BHB_FW, &system_bhb_mitigations); } update_mitigation_state(&spectre_bhb_state, state); } +bool is_spectre_bhb_fw_mitigated(void) +{ + return test_bit(BHB_FW, &system_bhb_mitigations); +} + /* Patched to NOP when enabled */ void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, __le32 *origptr, @@ -1100,7 +1123,6 @@ void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, { u8 rd; u32 insn; - u16 loop_count = spectre_bhb_loop_affected(SCOPE_SYSTEM); BUG_ON(nr_inst != 1); /* MOV -> MOV */ @@ -1109,7 +1131,7 @@ void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, insn = le32_to_cpu(*origptr); rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); - insn = aarch64_insn_gen_movewide(rd, loop_count, 0, + insn = aarch64_insn_gen_movewide(rd, max_bhb_k, 0, AARCH64_INSN_VARIANT_64BIT, AARCH64_INSN_MOVEWIDE_ZERO); *updptr++ = cpu_to_le32(insn); @@ -1166,3 +1188,18 @@ void unpriv_ebpf_notify(int new_state) pr_err("WARNING: %s", EBPF_WARN); } #endif + +void spectre_print_disabled_mitigations(void) +{ + /* Keep a single copy of the common message suffix to avoid duplication. */ + const char *spectre_disabled_suffix = "mitigation disabled by command-line option\n"; + + if (spectre_v2_mitigations_off()) + pr_info("spectre-v2 %s", spectre_disabled_suffix); + + if (spectre_v4_mitigations_off()) + pr_info("spectre-v4 %s", spectre_disabled_suffix); + + if (__nospectre_bhb || cpu_mitigations_off()) + pr_info("spectre-bhb %s", spectre_disabled_suffix); +} diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index f79b0d5f71ac..ba5eab23fd90 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -141,7 +141,7 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) - return *addr; + return READ_ONCE_NOCHECK(*addr); else return 0; } @@ -594,7 +594,7 @@ static int __fpr_get(struct task_struct *target, { struct user_fpsimd_state *uregs; - sve_sync_to_fpsimd(target); + fpsimd_sync_from_effective_state(target); uregs = &target->thread.uw.fpsimd_state; @@ -626,7 +626,7 @@ static int __fpr_set(struct task_struct *target, * Ensure target->thread.uw.fpsimd_state is up to date, so that a * short copyin can't resurrect stale data. 
*/ - sve_sync_to_fpsimd(target); + fpsimd_sync_from_effective_state(target); newstate = target->thread.uw.fpsimd_state; @@ -653,7 +653,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - sve_sync_from_fpsimd_zeropad(target); + fpsimd_sync_to_effective_state_zeropad(target); fpsimd_flush_task_state(target); return ret; @@ -775,6 +775,11 @@ static void sve_init_header_from_task(struct user_sve_header *header, task_type = ARM64_VEC_SVE; active = (task_type == type); + if (active && target->thread.fp_type == FP_STATE_SVE) + header->flags = SVE_PT_REGS_SVE; + else + header->flags = SVE_PT_REGS_FPSIMD; + switch (type) { case ARM64_VEC_SVE: if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) @@ -789,19 +794,14 @@ static void sve_init_header_from_task(struct user_sve_header *header, return; } - if (active) { - if (target->thread.fp_type == FP_STATE_FPSIMD) { - header->flags |= SVE_PT_REGS_FPSIMD; - } else { - header->flags |= SVE_PT_REGS_SVE; - } - } - header->vl = task_get_vl(target, type); vq = sve_vq_from_vl(header->vl); header->max_vl = vec_max_vl(type); - header->size = SVE_PT_SIZE(vq, header->flags); + if (active) + header->size = SVE_PT_SIZE(vq, header->flags); + else + header->size = sizeof(header); header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), SVE_PT_REGS_SVE); } @@ -820,18 +820,25 @@ static int sve_get_common(struct task_struct *target, unsigned int vq; unsigned long start, end; + if (target == current) + fpsimd_preserve_current_state(); + /* Header */ sve_init_header_from_task(&header, target, type); vq = sve_vq_from_vl(header.vl); membuf_write(&to, &header, sizeof(header)); - if (target == current) - fpsimd_preserve_current_state(); - BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); + /* + * When the requested vector type is not active, do not present data + * from the other mode to userspace. + */ + if (header.size == sizeof(header)) + return 0; + switch ((header.flags & SVE_PT_REGS_MASK)) { case SVE_PT_REGS_FPSIMD: return __fpr_get(target, regset, to); @@ -859,7 +866,7 @@ static int sve_get_common(struct task_struct *target, return membuf_zero(&to, end - start); default: - return 0; + BUILD_BUG(); } } @@ -883,6 +890,9 @@ static int sve_set_common(struct task_struct *target, struct user_sve_header header; unsigned int vq; unsigned long start, end; + bool fpsimd; + + fpsimd_flush_task_state(target); /* Header */ if (count < sizeof(header)) @@ -890,16 +900,65 @@ static int sve_set_common(struct task_struct *target, ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, 0, sizeof(header)); if (ret) - goto out; + return ret; /* - * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by - * vec_set_vector_length(), which will also validate them for us: + * Streaming SVE data is always stored and presented in SVE format. + * Require the user to provide SVE formatted data for consistency, and + * to avoid the risk that we configure the task into an invalid state. */ - ret = vec_set_vector_length(target, type, header.vl, - ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); - if (ret) - goto out; + fpsimd = (header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD; + if (fpsimd && type == ARM64_VEC_SME) + return -EINVAL; + + /* + * On systems without SVE we accept FPSIMD format writes with + * a VL of 0 to allow exiting streaming mode, otherwise a VL + * is required. 
+ */ + if (header.vl) { + /* + * If the system does not support SVE we can't + * configure a SVE VL. + */ + if (!system_supports_sve() && type == ARM64_VEC_SVE) + return -EINVAL; + + /* + * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are + * consumed by vec_set_vector_length(), which will + * also validate them for us: + */ + ret = vec_set_vector_length(target, type, header.vl, + ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); + if (ret) + return ret; + } else { + /* If the system supports SVE we require a VL. */ + if (system_supports_sve()) + return -EINVAL; + + /* + * Only FPSIMD formatted data with no flags set is + * supported. + */ + if (header.flags != SVE_PT_REGS_FPSIMD) + return -EINVAL; + } + + /* Allocate SME storage if necessary, preserving any existing ZA/ZT state */ + if (type == ARM64_VEC_SME) { + sme_alloc(target, false); + if (!target->thread.sme_state) + return -ENOMEM; + } + + /* Allocate SVE storage if necessary, zeroing any existing SVE state */ + if (!fpsimd) { + sve_alloc(target, true); + if (!target->thread.sve_state) + return -ENOMEM; + } /* * Actual VL set may be different from what the user asked @@ -909,82 +968,46 @@ static int sve_set_common(struct task_struct *target, vq = sve_vq_from_vl(task_get_vl(target, type)); /* Enter/exit streaming mode */ - if (system_supports_sme()) { - u64 old_svcr = target->thread.svcr; - - switch (type) { - case ARM64_VEC_SVE: - target->thread.svcr &= ~SVCR_SM_MASK; - break; - case ARM64_VEC_SME: - target->thread.svcr |= SVCR_SM_MASK; - - /* - * Disable traps and ensure there is SME storage but - * preserve any currently set values in ZA/ZT. - */ - sme_alloc(target, false); - set_tsk_thread_flag(target, TIF_SME); - break; - default: - WARN_ON_ONCE(1); - ret = -EINVAL; - goto out; - } - - /* - * If we switched then invalidate any existing SVE - * state and ensure there's storage. - */ - if (target->thread.svcr != old_svcr) - sve_alloc(target, true); + switch (type) { + case ARM64_VEC_SVE: + target->thread.svcr &= ~SVCR_SM_MASK; + set_tsk_thread_flag(target, TIF_SVE); + break; + case ARM64_VEC_SME: + target->thread.svcr |= SVCR_SM_MASK; + set_tsk_thread_flag(target, TIF_SME); + break; + default: + WARN_ON_ONCE(1); + return -EINVAL; } + /* Always zero V regs, FPSR, and FPCR */ + memset(¤t->thread.uw.fpsimd_state, 0, + sizeof(current->thread.uw.fpsimd_state)); + /* Registers: FPSIMD-only case */ BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); - if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) { - ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, - SVE_PT_FPSIMD_OFFSET); + if (fpsimd) { clear_tsk_thread_flag(target, TIF_SVE); target->thread.fp_type = FP_STATE_FPSIMD; - goto out; + ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, + SVE_PT_FPSIMD_OFFSET); + return ret; } - /* - * Otherwise: no registers or full SVE case. For backwards - * compatibility reasons we treat empty flags as SVE registers. - */ + /* Otherwise: no registers or full SVE case. */ + + target->thread.fp_type = FP_STATE_SVE; /* * If setting a different VL from the requested VL and there is * register data, the data layout will be wrong: don't even * try to set the registers in this case. 
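Because the regset changes above alter what a debugger observes (an inactive vector type now returns only the header, and header.flags reports exactly one of SVE_PT_REGS_SVE or SVE_PT_REGS_FPSIMD), a short tracer-side sketch of reading the NT_ARM_SVE header may help; it relies only on long-standing uapi definitions and assumes the tracee is already attached and stopped:

/* Sketch: inspect a stopped tracee's NT_ARM_SVE header from a tracer. */
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <linux/elf.h>          /* NT_ARM_SVE */
#include <asm/ptrace.h>         /* struct user_sve_header, SVE_PT_* */

static void dump_sve_header(pid_t pid)
{
        struct user_sve_header header = { 0 };
        struct iovec iov = { .iov_base = &header, .iov_len = sizeof(header) };

        if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov) != 0) {
                perror("PTRACE_GETREGSET(NT_ARM_SVE)");
                return;
        }

        /* With these patches, size == sizeof(header) means SVE is not the
         * task's active vector type and no register payload follows. */
        printf("vl=%u max_vl=%u flags=%#x payload=%s\n",
               header.vl, header.max_vl, header.flags,
               (header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ?
               "SVE format" : "FPSIMD format");
}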
*/ - if (count && vq != sve_vq_from_vl(header.vl)) { - ret = -EIO; - goto out; - } - - sve_alloc(target, true); - if (!target->thread.sve_state) { - ret = -ENOMEM; - clear_tsk_thread_flag(target, TIF_SVE); - target->thread.fp_type = FP_STATE_FPSIMD; - goto out; - } - - /* - * Ensure target->thread.sve_state is up to date with target's - * FPSIMD regs, so that a short copyin leaves trailing - * registers unmodified. Only enable SVE if we are - * configuring normal SVE, a system with streaming SVE may not - * have normal SVE. - */ - fpsimd_sync_to_sve(target); - if (type == ARM64_VEC_SVE) - set_tsk_thread_flag(target, TIF_SVE); - target->thread.fp_type = FP_STATE_SVE; + if (count && vq != sve_vq_from_vl(header.vl)) + return -EIO; BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); start = SVE_PT_SVE_OFFSET; @@ -993,7 +1016,7 @@ static int sve_set_common(struct task_struct *target, target->thread.sve_state, start, end); if (ret) - goto out; + return ret; start = end; end = SVE_PT_SVE_FPSR_OFFSET(vq); @@ -1009,8 +1032,6 @@ static int sve_set_common(struct task_struct *target, &target->thread.uw.fpsimd_state.fpsr, start, end); -out: - fpsimd_flush_task_state(target); return ret; } @@ -1019,7 +1040,7 @@ static int sve_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) return -EINVAL; return sve_set_common(target, regset, pos, count, kbuf, ubuf, @@ -1463,6 +1484,9 @@ static int poe_get(struct task_struct *target, if (!system_supports_poe()) return -EINVAL; + if (target == current) + current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + return membuf_write(&to, &target->thread.por_el0, sizeof(target->thread.por_el0)); } @@ -1589,7 +1613,7 @@ enum aarch64_regset { static const struct user_regset aarch64_regsets[] = { [REGSET_GPR] = { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = sizeof(struct user_pt_regs) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), @@ -1597,7 +1621,7 @@ static const struct user_regset aarch64_regsets[] = { .set = gpr_set }, [REGSET_FPR] = { - .core_note_type = NT_PRFPREG, + USER_REGSET_NOTE_TYPE(PRFPREG), .n = sizeof(struct user_fpsimd_state) / sizeof(u32), /* * We pretend we have 32-bit registers because the fpsr and @@ -1610,7 +1634,7 @@ static const struct user_regset aarch64_regsets[] = { .set = fpr_set }, [REGSET_TLS] = { - .core_note_type = NT_ARM_TLS, + USER_REGSET_NOTE_TYPE(ARM_TLS), .n = 2, .size = sizeof(void *), .align = sizeof(void *), @@ -1619,7 +1643,7 @@ static const struct user_regset aarch64_regsets[] = { }, #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { - .core_note_type = NT_ARM_HW_BREAK, + USER_REGSET_NOTE_TYPE(ARM_HW_BREAK), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -1627,7 +1651,7 @@ static const struct user_regset aarch64_regsets[] = { .set = hw_break_set, }, [REGSET_HW_WATCH] = { - .core_note_type = NT_ARM_HW_WATCH, + USER_REGSET_NOTE_TYPE(ARM_HW_WATCH), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -1636,7 +1660,7 @@ static const struct user_regset aarch64_regsets[] = { }, #endif [REGSET_SYSTEM_CALL] = { - .core_note_type = NT_ARM_SYSTEM_CALL, + USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL), .n = 1, .size = sizeof(int), .align = sizeof(int), @@ -1644,7 +1668,7 @@ static const struct user_regset aarch64_regsets[] = { .set = system_call_set, }, [REGSET_FPMR] 
= { - .core_note_type = NT_ARM_FPMR, + USER_REGSET_NOTE_TYPE(ARM_FPMR), .n = 1, .size = sizeof(u64), .align = sizeof(u64), @@ -1653,7 +1677,7 @@ static const struct user_regset aarch64_regsets[] = { }, #ifdef CONFIG_ARM64_SVE [REGSET_SVE] = { /* Scalable Vector Extension */ - .core_note_type = NT_ARM_SVE, + USER_REGSET_NOTE_TYPE(ARM_SVE), .n = DIV_ROUND_UP(SVE_PT_SIZE(ARCH_SVE_VQ_MAX, SVE_PT_REGS_SVE), SVE_VQ_BYTES), @@ -1665,7 +1689,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_SME [REGSET_SSVE] = { /* Streaming mode SVE */ - .core_note_type = NT_ARM_SSVE, + USER_REGSET_NOTE_TYPE(ARM_SSVE), .n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE), SVE_VQ_BYTES), .size = SVE_VQ_BYTES, @@ -1674,7 +1698,7 @@ static const struct user_regset aarch64_regsets[] = { .set = ssve_set, }, [REGSET_ZA] = { /* SME ZA */ - .core_note_type = NT_ARM_ZA, + USER_REGSET_NOTE_TYPE(ARM_ZA), /* * ZA is a single register but it's variably sized and * the ptrace core requires that the size of any data @@ -1690,7 +1714,7 @@ static const struct user_regset aarch64_regsets[] = { .set = za_set, }, [REGSET_ZT] = { /* SME ZT */ - .core_note_type = NT_ARM_ZT, + USER_REGSET_NOTE_TYPE(ARM_ZT), .n = 1, .size = ZT_SIG_REG_BYTES, .align = sizeof(u64), @@ -1700,7 +1724,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_PTR_AUTH [REGSET_PAC_MASK] = { - .core_note_type = NT_ARM_PAC_MASK, + USER_REGSET_NOTE_TYPE(ARM_PAC_MASK), .n = sizeof(struct user_pac_mask) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), @@ -1708,7 +1732,7 @@ static const struct user_regset aarch64_regsets[] = { /* this cannot be set dynamically */ }, [REGSET_PAC_ENABLED_KEYS] = { - .core_note_type = NT_ARM_PAC_ENABLED_KEYS, + USER_REGSET_NOTE_TYPE(ARM_PAC_ENABLED_KEYS), .n = 1, .size = sizeof(long), .align = sizeof(long), @@ -1717,7 +1741,7 @@ static const struct user_regset aarch64_regsets[] = { }, #ifdef CONFIG_CHECKPOINT_RESTORE [REGSET_PACA_KEYS] = { - .core_note_type = NT_ARM_PACA_KEYS, + USER_REGSET_NOTE_TYPE(ARM_PACA_KEYS), .n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t), .size = sizeof(__uint128_t), .align = sizeof(__uint128_t), @@ -1725,7 +1749,7 @@ static const struct user_regset aarch64_regsets[] = { .set = pac_address_keys_set, }, [REGSET_PACG_KEYS] = { - .core_note_type = NT_ARM_PACG_KEYS, + USER_REGSET_NOTE_TYPE(ARM_PACG_KEYS), .n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t), .size = sizeof(__uint128_t), .align = sizeof(__uint128_t), @@ -1736,7 +1760,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI [REGSET_TAGGED_ADDR_CTRL] = { - .core_note_type = NT_ARM_TAGGED_ADDR_CTRL, + USER_REGSET_NOTE_TYPE(ARM_TAGGED_ADDR_CTRL), .n = 1, .size = sizeof(long), .align = sizeof(long), @@ -1746,7 +1770,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_POE [REGSET_POE] = { - .core_note_type = NT_ARM_POE, + USER_REGSET_NOTE_TYPE(ARM_POE), .n = 1, .size = sizeof(long), .align = sizeof(long), @@ -1756,7 +1780,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_GCS [REGSET_GCS] = { - .core_note_type = NT_ARM_GCS, + USER_REGSET_NOTE_TYPE(ARM_GCS), .n = sizeof(struct user_gcs) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), @@ -1946,7 +1970,7 @@ static int compat_tls_set(struct task_struct *target, static const struct user_regset aarch32_regsets[] = { [REGSET_COMPAT_GPR] = { - .core_note_type = NT_PRSTATUS, + 
USER_REGSET_NOTE_TYPE(PRSTATUS), .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), @@ -1954,7 +1978,7 @@ static const struct user_regset aarch32_regsets[] = { .set = compat_gpr_set }, [REGSET_COMPAT_VFP] = { - .core_note_type = NT_ARM_VFP, + USER_REGSET_NOTE_TYPE(ARM_VFP), .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), @@ -1971,7 +1995,7 @@ static const struct user_regset_view user_aarch32_view = { static const struct user_regset aarch32_ptrace_regsets[] = { [REGSET_GPR] = { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), @@ -1979,7 +2003,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .set = compat_gpr_set }, [REGSET_FPR] = { - .core_note_type = NT_ARM_VFP, + USER_REGSET_NOTE_TYPE(ARM_VFP), .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), @@ -1987,7 +2011,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .set = compat_vfp_set }, [REGSET_TLS] = { - .core_note_type = NT_ARM_TLS, + USER_REGSET_NOTE_TYPE(ARM_TLS), .n = 1, .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), @@ -1996,7 +2020,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { }, #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { - .core_note_type = NT_ARM_HW_BREAK, + USER_REGSET_NOTE_TYPE(ARM_HW_BREAK), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -2004,7 +2028,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .set = hw_break_set, }, [REGSET_HW_WATCH] = { - .core_note_type = NT_ARM_HW_WATCH, + USER_REGSET_NOTE_TYPE(ARM_HW_WATCH), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -2013,7 +2037,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { }, #endif [REGSET_SYSTEM_CALL] = { - .core_note_type = NT_ARM_SYSTEM_CALL, + USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL), .n = 1, .size = sizeof(int), .align = sizeof(int), @@ -2320,9 +2344,10 @@ enum ptrace_syscall_dir { PTRACE_SYSCALL_EXIT, }; -static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir) +static __always_inline unsigned long ptrace_save_reg(struct pt_regs *regs, + enum ptrace_syscall_dir dir, + int *regno) { - int regno; unsigned long saved_reg; /* @@ -2341,15 +2366,34 @@ static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir) * - Syscall stops behave differently to seccomp and pseudo-step traps * (the latter do not nobble any registers). */ - regno = (is_compat_task() ? 12 : 7); - saved_reg = regs->regs[regno]; - regs->regs[regno] = dir; + *regno = (is_compat_task() ? 
12 : 7); + saved_reg = regs->regs[*regno]; + regs->regs[*regno] = dir; - if (dir == PTRACE_SYSCALL_ENTER) { - if (ptrace_report_syscall_entry(regs)) - forget_syscall(regs); - regs->regs[regno] = saved_reg; - } else if (!test_thread_flag(TIF_SINGLESTEP)) { + return saved_reg; +} + +static int report_syscall_entry(struct pt_regs *regs) +{ + unsigned long saved_reg; + int regno, ret; + + saved_reg = ptrace_save_reg(regs, PTRACE_SYSCALL_ENTER, &regno); + ret = ptrace_report_syscall_entry(regs); + if (ret) + forget_syscall(regs); + regs->regs[regno] = saved_reg; + + return ret; +} + +static void report_syscall_exit(struct pt_regs *regs) +{ + unsigned long saved_reg; + int regno; + + saved_reg = ptrace_save_reg(regs, PTRACE_SYSCALL_EXIT, &regno); + if (!test_thread_flag(TIF_SINGLESTEP)) { ptrace_report_syscall_exit(regs, 0); regs->regs[regno] = saved_reg; } else { @@ -2367,10 +2411,11 @@ static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir) int syscall_trace_enter(struct pt_regs *regs) { unsigned long flags = read_thread_flags(); + int ret; if (flags & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE)) { - report_syscall(regs, PTRACE_SYSCALL_ENTER); - if (flags & _TIF_SYSCALL_EMU) + ret = report_syscall_entry(regs); + if (ret || (flags & _TIF_SYSCALL_EMU)) return NO_SYSCALL; } @@ -2397,7 +2442,7 @@ void syscall_trace_exit(struct pt_regs *regs) trace_sys_exit(regs, syscall_get_return_value(current, regs)); if (flags & (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP)) - report_syscall(regs, PTRACE_SYSCALL_EXIT); + report_syscall_exit(regs); rseq_syscall(regs); } diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index 413f899e4ac6..6cb4209f5dab 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -64,7 +64,8 @@ SYM_CODE_START(arm64_relocate_new_kernel) mov x19, x13 copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8 add x1, x19, #PAGE_SIZE - dcache_by_myline_op civac, sy, x19, x1, x15, x20 + dcache_by_myline_op_nosync civac, x19, x1, x15, x20 + dsb sy b .Lnext .Ltest_indirection: tbz x16, IND_INDIRECTION_BIT, .Ltest_destination diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index ce4778141ec7..92160f2e57ff 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -12,6 +12,7 @@ #include <asm/io.h> #include <asm/mem_encrypt.h> +#include <asm/pgtable.h> #include <asm/rsi.h> static struct realm_config config; @@ -84,7 +85,25 @@ static void __init arm64_rsi_setup_memory(void) } } -bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) +/* + * Check if a given PA range is Trusted (e.g., Protected memory, a Trusted Device + * mapping, or an MMIO emulated in the Realm world). + * + * We can rely on the RIPAS value of the region to detect if a given region is + * protected. + * + * RIPAS_DEV - A trusted device memory or a trusted emulated MMIO (in the Realm + * world + * RIPAS_RAM - Memory (RAM), protected by the RMM guarantees. (e.g., Firmware + * reserved regions for data sharing). + * + * RIPAS_DESTROYED is a special case of one of the above, where the host did + * something without our permission and as such we can't do anything about it. + * + * The only case where something is emulated by the untrusted hypervisor or is + * backed by shared memory is indicated by RSI_RIPAS_EMPTY. 
+ */ +bool arm64_rsi_is_protected(phys_addr_t base, size_t size) { enum ripas ripas; phys_addr_t end, top; @@ -101,18 +120,18 @@ bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) break; if (WARN_ON(top <= base)) break; - if (ripas != RSI_RIPAS_DEV) + if (ripas == RSI_RIPAS_EMPTY) break; base = top; } return base >= end; } -EXPORT_SYMBOL(__arm64_is_protected_mmio); +EXPORT_SYMBOL(arm64_rsi_is_protected); static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) { - if (__arm64_is_protected_mmio(phys, size)) + if (arm64_rsi_is_protected(phys, size)) *prot = pgprot_encrypted(*prot); else *prot = pgprot_decrypted(*prot); @@ -126,9 +145,9 @@ void __init arm64_rsi_init(void) return; if (!rsi_version_matches()) return; - if (WARN_ON(rsi_get_realm_config(&config))) + if (WARN_ON(rsi_get_realm_config(lm_alias(&config)))) return; - prot_ns_shared = BIT(config.ipa_bits - 1); + prot_ns_shared = __phys_to_pte_val(BIT(config.ipa_bits - 1)); if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) return; diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 255d12f881c2..778f2a1faac8 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -34,10 +34,8 @@ unsigned long sdei_exit_mode; DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); -#ifdef CONFIG_VMAP_STACK DEFINE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); DEFINE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); -#endif DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr); DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr); @@ -65,9 +63,6 @@ static void free_sdei_stacks(void) { int cpu; - if (!IS_ENABLED(CONFIG_VMAP_STACK)) - return; - for_each_possible_cpu(cpu) { _free_sdei_stack(&sdei_stack_normal_ptr, cpu); _free_sdei_stack(&sdei_stack_critical_ptr, cpu); @@ -91,9 +86,6 @@ static int init_sdei_stacks(void) int cpu; int err = 0; - if (!IS_ENABLED(CONFIG_VMAP_STACK)) - return 0; - for_each_possible_cpu(cpu) { err = _init_sdei_stack(&sdei_stack_normal_ptr, cpu); if (err) @@ -206,7 +198,7 @@ out_err: /* * do_sdei_event() returns one of: * SDEI_EV_HANDLED - success, return to the interrupted context. - * SDEI_EV_FAILED - failure, return this error code to firmare. + * SDEI_EV_FAILED - failure, return this error code to firmware. * virtual-address - success, return to this address. */ unsigned long __kprobes do_sdei_event(struct pt_regs *regs, @@ -247,7 +239,7 @@ unsigned long __kprobes do_sdei_event(struct pt_regs *regs, * If we interrupted the kernel with interrupts masked, we always go * back to wherever we came from. 
*/ - if (mode == kernel_mode && !interrupts_enabled(regs)) + if (mode == kernel_mode && regs_irqs_disabled(regs)) return SDEI_EV_HANDLED; /* diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 85104587f849..23c05dc7a8f2 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -169,7 +169,7 @@ static void __init smp_build_mpidr_hash(void) static void __init setup_machine_fdt(phys_addr_t dt_phys) { - int size; + int size = 0; void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); const char *name; @@ -182,10 +182,10 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys) */ if (!early_init_dt_scan(dt_virt, dt_phys)) { pr_crit("\n" - "Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n" - "The dtb must be 8-byte aligned and must not exceed 2 MB in size\n" - "\nPlease check your bootloader.", - &dt_phys, dt_virt); + "Error: invalid device tree blob: PA=%pa, VA=%px, size=%d bytes\n" + "The dtb must be 8-byte aligned and must not exceed 2 MB in size.\n" + "\nPlease check your bootloader.\n", + &dt_phys, dt_virt, size); /* * Note that in this _really_ early stage we cannot even BUG() @@ -214,7 +214,7 @@ static void __init request_standard_resources(void) unsigned long i = 0; size_t res_size; - kernel_code.start = __pa_symbol(_stext); + kernel_code.start = __pa_symbol(_text); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); @@ -280,7 +280,7 @@ u64 cpu_logical_map(unsigned int cpu) void __init __no_sanitize_address setup_arch(char **cmdline_p) { - setup_initial_init_mm(_stext, _etext, _edata, _end); + setup_initial_init_mm(_text, _etext, _edata, _end); *cmdline_p = boot_command_line; diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 99ea26d400ff..08ffc5a5aea4 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -9,6 +9,7 @@ #include <linux/cache.h> #include <linux/compat.h> #include <linux/errno.h> +#include <linux/irq-entry-common.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/freezer.h> @@ -91,12 +92,15 @@ static void save_reset_user_access_state(struct user_access_state *ua_state) u64 por_enable_all = 0; for (int pkey = 0; pkey < arch_max_pkey(); pkey++) - por_enable_all |= POE_RXW << (pkey * POR_BITS_PER_PKEY); + por_enable_all |= POR_ELx_PERM_PREP(pkey, POE_RWX); ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0); write_sysreg_s(por_enable_all, SYS_POR_EL0); - /* Ensure that any subsequent uaccess observes the updated value */ - isb(); + /* + * No ISB required as we can tolerate spurious Overlay faults - + * the fault handler will check again based on the new value + * of POR_EL0. + */ } } @@ -250,6 +254,8 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) &current->thread.uw.fpsimd_state; int err; + fpsimd_sync_from_effective_state(current); + /* copy the FP and status/control registers */ err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); __put_user_error(fpsimd->fpsr, &ctx->fpsr, err); @@ -262,37 +268,46 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) return err ? 
-EFAULT : 0; } -static int restore_fpsimd_context(struct user_ctxs *user) +static int read_fpsimd_context(struct user_fpsimd_state *fpsimd, + struct user_ctxs *user) { - struct user_fpsimd_state fpsimd; - int err = 0; + int err; /* check the size information */ if (user->fpsimd_size != sizeof(struct fpsimd_context)) return -EINVAL; /* copy the FP and status/control registers */ - err = __copy_from_user(fpsimd.vregs, &(user->fpsimd->vregs), - sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &(user->fpsimd->fpsr), err); - __get_user_error(fpsimd.fpcr, &(user->fpsimd->fpcr), err); + err = __copy_from_user(fpsimd->vregs, &(user->fpsimd->vregs), + sizeof(fpsimd->vregs)); + __get_user_error(fpsimd->fpsr, &(user->fpsimd->fpsr), err); + __get_user_error(fpsimd->fpcr, &(user->fpsimd->fpcr), err); + + return err ? -EFAULT : 0; +} + +static int restore_fpsimd_context(struct user_ctxs *user) +{ + struct user_fpsimd_state fpsimd; + int err; + + err = read_fpsimd_context(&fpsimd, user); + if (err) + return err; clear_thread_flag(TIF_SVE); + current->thread.svcr &= ~SVCR_SM_MASK; current->thread.fp_type = FP_STATE_FPSIMD; /* load the hardware registers from the fpsimd_state structure */ - if (!err) - fpsimd_update_current_state(&fpsimd); - - return err ? -EFAULT : 0; + fpsimd_update_current_state(&fpsimd); + return 0; } static int preserve_fpmr_context(struct fpmr_context __user *ctx) { int err = 0; - current->thread.uw.fpmr = read_sysreg_s(SYS_FPMR); - __put_user_error(FPMR_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err); @@ -310,7 +325,7 @@ static int restore_fpmr_context(struct user_ctxs *user) __get_user_error(fpmr, &user->fpmr->fpmr, err); if (!err) - write_sysreg_s(fpmr, SYS_FPMR); + current->thread.uw.fpmr = fpmr; return err; } @@ -372,11 +387,6 @@ static int preserve_sve_context(struct sve_context __user *ctx) err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { - /* - * This assumes that the SVE state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET, current->thread.sve_state, SVE_SIG_REGS_SIZE(vq)); @@ -391,6 +401,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) unsigned int vl, vq; struct user_fpsimd_state fpsimd; u16 user_vl, flags; + bool sm; if (user->sve_size < sizeof(*user->sve)) return -EINVAL; @@ -400,7 +411,8 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (err) return err; - if (flags & SVE_SIG_FLAG_SM) { + sm = flags & SVE_SIG_FLAG_SM; + if (sm) { if (!system_supports_sme()) return -EINVAL; @@ -420,27 +432,28 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (user_vl != vl) return -EINVAL; - if (user->sve_size == sizeof(*user->sve)) { - clear_thread_flag(TIF_SVE); - current->thread.svcr &= ~SVCR_SM_MASK; - current->thread.fp_type = FP_STATE_FPSIMD; - goto fpsimd_only; - } + /* + * Non-streaming SVE state may be preserved without an SVE payload, in + * which case the SVE context only has a header with VL==0, and all + * state can be restored from the FPSIMD context. + * + * Streaming SVE state is always preserved with an SVE payload. For + * consistency and robustness, reject restoring streaming SVE state + * without an SVE payload. 
+ */ + if (!sm && user->sve_size == sizeof(*user->sve)) + return restore_fpsimd_context(user); vq = sve_vq_from_vl(vl); if (user->sve_size < SVE_SIG_CONTEXT_SIZE(vq)) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.sve_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. - */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ + if (sm) { + sme_alloc(current, false); + if (!current->thread.sme_state) + return -ENOMEM; + } sve_alloc(current, true); if (!current->thread.sve_state) { @@ -448,6 +461,16 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) return -ENOMEM; } + if (sm) { + current->thread.svcr |= SVCR_SM_MASK; + set_thread_flag(TIF_SME); + } else { + current->thread.svcr &= ~SVCR_SM_MASK; + set_thread_flag(TIF_SVE); + } + + current->thread.fp_type = FP_STATE_SVE; + err = __copy_from_user(current->thread.sve_state, (char __user const *)user->sve + SVE_SIG_REGS_OFFSET, @@ -455,25 +478,14 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (err) return -EFAULT; - if (flags & SVE_SIG_FLAG_SM) - current->thread.svcr |= SVCR_SM_MASK; - else - set_thread_flag(TIF_SVE); - current->thread.fp_type = FP_STATE_SVE; + err = read_fpsimd_context(&fpsimd, user); + if (err) + return err; -fpsimd_only: - /* copy the FP and status/control registers */ - /* restore_sigframe() already checked that user->fpsimd != NULL. */ - err = __copy_from_user(fpsimd.vregs, user->fpsimd->vregs, - sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &user->fpsimd->fpsr, err); - __get_user_error(fpsimd.fpcr, &user->fpsimd->fpcr, err); + /* Merge the FPSIMD registers into the SVE state */ + fpsimd_update_current_state(&fpsimd); - /* load the hardware registers from the fpsimd_state structure */ - if (!err) - fpsimd_update_current_state(&fpsimd); - - return err ? -EFAULT : 0; + return 0; } #else /* ! CONFIG_ARM64_SVE */ @@ -493,13 +505,12 @@ extern int preserve_sve_context(void __user *ctx); static int preserve_tpidr2_context(struct tpidr2_context __user *ctx) { + u64 tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); int err = 0; - current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); - __put_user_error(TPIDR2_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); - __put_user_error(current->thread.tpidr2_el0, &ctx->tpidr2, err); + __put_user_error(tpidr2_el0, &ctx->tpidr2, err); return err; } @@ -541,11 +552,6 @@ static int preserve_za_context(struct za_context __user *ctx) err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { - /* - * This assumes that the ZA state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, current->thread.sme_state, ZA_SIG_REGS_SIZE(vq)); @@ -580,15 +586,9 @@ static int restore_za_context(struct user_ctxs *user) if (user->za_size < ZA_SIG_CONTEXT_SIZE(vq)) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.sme_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. 
- */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ + sve_alloc(current, false); + if (!current->thread.sve_state) + return -ENOMEM; sme_alloc(current, true); if (!current->thread.sme_state) { @@ -627,11 +627,6 @@ static int preserve_zt_context(struct zt_context __user *ctx) BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); - /* - * This assumes that the ZT state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + ZT_SIG_REGS_OFFSET, thread_zt_state(&current->thread), ZT_SIG_REGS_SIZE(1)); @@ -657,16 +652,6 @@ static int restore_zt_context(struct user_ctxs *user) if (nregs != 1) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.zt_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. - */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch ZT in thread state */ - err = __copy_from_user(thread_zt_state(&current->thread), (char __user const *)user->zt + ZT_SIG_REGS_OFFSET, @@ -1017,6 +1002,8 @@ static int restore_sigframe(struct pt_regs *regs, */ forget_syscall(regs); + fpsimd_save_and_flush_current_state(); + err |= !valid_user_regs(&regs->user_regs, current); if (err == 0) err = parse_user_sigframe(&user, sf); @@ -1507,21 +1494,9 @@ static int setup_return(struct pt_regs *regs, struct ksignal *ksig, /* Signal handlers are invoked with ZA and streaming mode disabled */ if (system_supports_sme()) { - /* - * If we were in streaming mode the saved register - * state was SVE but we will exit SM and use the - * FPSIMD register state - flush the saved FPSIMD - * register state in case it gets loaded. - */ - if (current->thread.svcr & SVCR_SM_MASK) { - memset(&current->thread.uw.fpsimd_state, 0, - sizeof(current->thread.uw.fpsimd_state)); - current->thread.fp_type = FP_STATE_FPSIMD; - } - - current->thread.svcr &= ~(SVCR_ZA_MASK | - SVCR_SM_MASK); - sme_smstop(); + task_smstop_sm(current); + current->thread.svcr &= ~SVCR_ZA_MASK; + write_sysreg_s(0, SYS_TPIDR2_EL0); } return 0; @@ -1535,7 +1510,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, struct user_access_state ua_state; int err = 0; - fpsimd_signal_preserve_current_state(); + fpsimd_save_and_flush_current_state(); if (get_sigframe(&user, ksig, regs)) return 1; @@ -1616,7 +1591,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ -void do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 81e798b6dada..bb3b526ff43f 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -103,7 +103,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) * Note that this also saves V16-31, which aren't visible * in AArch32. 
*/ - fpsimd_signal_preserve_current_state(); + fpsimd_save_and_flush_current_state(); /* Place structure header on the stack */ __put_user_error(magic, &frame->magic, err); @@ -169,14 +169,17 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) fpsimd.fpsr = fpscr & VFP_FPSCR_STAT_MASK; fpsimd.fpcr = fpscr & VFP_FPSCR_CTRL_MASK; + if (err) + return -EFAULT; + /* * We don't need to touch the exception register, so * reload the hardware state. */ - if (!err) - fpsimd_update_current_state(&fpsimd); + fpsimd_save_and_flush_current_state(); + current->thread.uw.fpsimd_state = fpsimd; - return err ? -EFAULT : 0; + return 0; } static int compat_restore_sigframe(struct pt_regs *regs, diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 3b3f6b56e733..1aa324104afb 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -64,26 +64,18 @@ struct secondary_data secondary_data; /* Number of CPUs which aren't online, but looping in kernel text. */ static int cpus_stuck_in_kernel; -enum ipi_msg_type { - IPI_RESCHEDULE, - IPI_CALL_FUNC, - IPI_CPU_STOP, - IPI_CPU_STOP_NMI, - IPI_TIMER, - IPI_IRQ_WORK, - NR_IPI, - /* - * Any enum >= NR_IPI and < MAX_IPI is special and not tracable - * with trace_ipi_* - */ - IPI_CPU_BACKTRACE = NR_IPI, - IPI_KGDB_ROUNDUP, - MAX_IPI -}; - static int ipi_irq_base __ro_after_init; static int nr_ipi __ro_after_init = NR_IPI; -static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init; + +struct ipi_descs { + struct irq_desc *descs[MAX_IPI]; +}; + +static DEFINE_PER_CPU_READ_MOSTLY(struct ipi_descs, pcpu_ipi_desc); + +#define get_ipi_desc(__cpu, __ipi) (per_cpu_ptr(&pcpu_ipi_desc, __cpu)->descs[__ipi]) + +static bool percpu_ipi_descs __ro_after_init; static bool crash_stop; @@ -358,7 +350,7 @@ void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) /* * Now that the dying CPU is beyond the point of no return w.r.t. - * in-kernel synchronisation, try to get the firwmare to help us to + * in-kernel synchronisation, try to get the firmware to help us to * verify that it has really left the kernel before we consider * clobbering anything it might still be using. */ @@ -531,7 +523,7 @@ int arch_register_cpu(int cpu) /* * Availability of the acpi handle is sufficient to establish - * that _STA has aleady been checked. No need to recheck here. + * that _STA has already been checked. No need to recheck here. */ c->hotpluggable = arch_cpu_is_hotpluggable(cpu); @@ -844,7 +836,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i, prec >= 4 ? 
" " : ""); for_each_online_cpu(cpu) - seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu)); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(get_ipi_desc(cpu, i), cpu)); seq_printf(p, " %s\n", ipi_types[i]); } @@ -917,9 +909,20 @@ static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs #endif } +static void arm64_send_ipi(const cpumask_t *mask, unsigned int nr) +{ + unsigned int cpu; + + if (!percpu_ipi_descs) + __ipi_send_mask(get_ipi_desc(0, nr), mask); + else + for_each_cpu(cpu, mask) + __ipi_send_single(get_ipi_desc(cpu, nr), cpu); +} + static void arm64_backtrace_ipi(cpumask_t *mask) { - __ipi_send_mask(ipi_desc[IPI_CPU_BACKTRACE], mask); + arm64_send_ipi(mask, IPI_CPU_BACKTRACE); } void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu) @@ -944,7 +947,7 @@ void kgdb_roundup_cpus(void) if (cpu == this_cpu) continue; - __ipi_send_single(ipi_desc[IPI_KGDB_ROUNDUP], cpu); + __ipi_send_single(get_ipi_desc(cpu, IPI_KGDB_ROUNDUP), cpu); } } #endif @@ -1013,14 +1016,16 @@ static void do_handle_IPI(int ipinr) static irqreturn_t ipi_handler(int irq, void *data) { - do_handle_IPI(irq - ipi_irq_base); + unsigned int ipi = (irq - ipi_irq_base) % nr_ipi; + + do_handle_IPI(ipi); return IRQ_HANDLED; } static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) { trace_ipi_raise(target, ipi_types[ipinr]); - __ipi_send_mask(ipi_desc[ipinr], target); + arm64_send_ipi(target, ipinr); } static bool ipi_should_be_nmi(enum ipi_msg_type ipi) @@ -1046,11 +1051,15 @@ static void ipi_setup(int cpu) return; for (i = 0; i < nr_ipi; i++) { - if (ipi_should_be_nmi(i)) { - prepare_percpu_nmi(ipi_irq_base + i); - enable_percpu_nmi(ipi_irq_base + i, 0); + if (!percpu_ipi_descs) { + if (ipi_should_be_nmi(i)) { + prepare_percpu_nmi(ipi_irq_base + i); + enable_percpu_nmi(ipi_irq_base + i, 0); + } else { + enable_percpu_irq(ipi_irq_base + i, 0); + } } else { - enable_percpu_irq(ipi_irq_base + i, 0); + enable_irq(irq_desc_get_irq(get_ipi_desc(cpu, i))); } } } @@ -1064,44 +1073,77 @@ static void ipi_teardown(int cpu) return; for (i = 0; i < nr_ipi; i++) { - if (ipi_should_be_nmi(i)) { - disable_percpu_nmi(ipi_irq_base + i); - teardown_percpu_nmi(ipi_irq_base + i); + if (!percpu_ipi_descs) { + if (ipi_should_be_nmi(i)) { + disable_percpu_nmi(ipi_irq_base + i); + teardown_percpu_nmi(ipi_irq_base + i); + } else { + disable_percpu_irq(ipi_irq_base + i); + } } else { - disable_percpu_irq(ipi_irq_base + i); + disable_irq(irq_desc_get_irq(get_ipi_desc(cpu, i))); } } } #endif -void __init set_smp_ipi_range(int ipi_base, int n) +static void ipi_setup_sgi(int ipi) { - int i; + int err, irq, cpu; - WARN_ON(n < MAX_IPI); - nr_ipi = min(n, MAX_IPI); + irq = ipi_irq_base + ipi; - for (i = 0; i < nr_ipi; i++) { - int err; + if (ipi_should_be_nmi(ipi)) { + err = request_percpu_nmi(irq, ipi_handler, "IPI", NULL, &irq_stat); + WARN(err, "Could not request IRQ %d as NMI, err=%d\n", irq, err); + } else { + err = request_percpu_irq(irq, ipi_handler, "IPI", &irq_stat); + WARN(err, "Could not request IRQ %d as IRQ, err=%d\n", irq, err); + } - if (ipi_should_be_nmi(i)) { - err = request_percpu_nmi(ipi_base + i, ipi_handler, - "IPI", &irq_stat); - WARN(err, "Could not request IPI %d as NMI, err=%d\n", - i, err); - } else { - err = request_percpu_irq(ipi_base + i, ipi_handler, - "IPI", &irq_stat); - WARN(err, "Could not request IPI %d as IRQ, err=%d\n", - i, err); - } + for_each_possible_cpu(cpu) + get_ipi_desc(cpu, ipi) = irq_to_desc(irq); + + irq_set_status_flags(irq, IRQ_HIDDEN); 
+} + +static void ipi_setup_lpi(int ipi, int ncpus) +{ + for (int cpu = 0; cpu < ncpus; cpu++) { + int err, irq; + + irq = ipi_irq_base + (cpu * nr_ipi) + ipi; + + err = irq_force_affinity(irq, cpumask_of(cpu)); + WARN(err, "Could not force affinity IRQ %d, err=%d\n", irq, err); + + err = request_irq(irq, ipi_handler, IRQF_NO_AUTOEN, "IPI", + NULL); + WARN(err, "Could not request IRQ %d, err=%d\n", irq, err); + + irq_set_status_flags(irq, (IRQ_HIDDEN | IRQ_NO_BALANCING_MASK)); - ipi_desc[i] = irq_to_desc(ipi_base + i); - irq_set_status_flags(ipi_base + i, IRQ_HIDDEN); + get_ipi_desc(cpu, ipi) = irq_to_desc(irq); } +} + +void __init set_smp_ipi_range_percpu(int ipi_base, int n, int ncpus) +{ + int i; + + WARN_ON(n < MAX_IPI); + nr_ipi = min(n, MAX_IPI); + percpu_ipi_descs = !!ncpus; ipi_irq_base = ipi_base; + for (i = 0; i < nr_ipi; i++) { + if (!percpu_ipi_descs) + ipi_setup_sgi(i); + else + ipi_setup_lpi(i, ncpus); + } + /* Setup the boot CPU immediately */ ipi_setup(smp_processor_id()); } @@ -1143,7 +1185,7 @@ static inline unsigned int num_other_online_cpus(void) void smp_send_stop(void) { static unsigned long stop_in_progress; - cpumask_t mask; + static cpumask_t mask; unsigned long timeout; /* diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 1d9d51d7627f..3ebcf8c53fb0 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -152,6 +152,8 @@ kunwind_recover_return_address(struct kunwind_state *state) orig_pc = kretprobe_find_ret_addr(state->task, (void *)state->common.fp, &state->kr_cur); + if (!orig_pc) + return -EINVAL; state->common.pc = orig_pc; state->flags.kretprobe = 1; } @@ -277,21 +279,24 @@ kunwind_next(struct kunwind_state *state) typedef bool (*kunwind_consume_fn)(const struct kunwind_state *state, void *cookie); -static __always_inline void +static __always_inline int do_kunwind(struct kunwind_state *state, kunwind_consume_fn consume_state, void *cookie) { - if (kunwind_recover_return_address(state)) - return; + int ret; - while (1) { - int ret; + ret = kunwind_recover_return_address(state); + if (ret) + return ret; + while (1) { if (!consume_state(state, cookie)) - break; + return -EINVAL; ret = kunwind_next(state); + if (ret == -ENOENT) + return 0; if (ret < 0) - break; + return ret; } } @@ -324,7 +329,7 @@ do_kunwind(struct kunwind_state *state, kunwind_consume_fn consume_state, : stackinfo_get_unknown(); \ }) -static __always_inline void +static __always_inline int kunwind_stack_walk(kunwind_consume_fn consume_state, void *cookie, struct task_struct *task, struct pt_regs *regs) @@ -332,10 +337,8 @@ kunwind_stack_walk(kunwind_consume_fn consume_state, struct stack_info stacks[] = { stackinfo_get_task(task), STACKINFO_CPU(irq), -#if defined(CONFIG_VMAP_STACK) STACKINFO_CPU(overflow), -#endif -#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_ARM_SDE_INTERFACE) +#if defined(CONFIG_ARM_SDE_INTERFACE) STACKINFO_SDEI(normal), STACKINFO_SDEI(critical), #endif @@ -352,7 +355,7 @@ kunwind_stack_walk(kunwind_consume_fn consume_state, if (regs) { if (task != current) - return; + return -EINVAL; kunwind_init_from_regs(&state, regs); } else if (task == current) { kunwind_init_from_caller(&state); @@ -360,7 +363,7 @@ kunwind_stack_walk(kunwind_consume_fn consume_state, kunwind_init_from_task(&state, task); } - do_kunwind(&state, consume_state, cookie); + return do_kunwind(&state, consume_state, cookie); } struct kunwind_consume_entry_data { @@ -387,6 +390,36 @@ noinline noinstr void arch_stack_walk(stack_trace_consume_fn 
consume_entry, kunwind_stack_walk(arch_kunwind_consume_entry, &data, task, regs); } +static __always_inline bool +arch_reliable_kunwind_consume_entry(const struct kunwind_state *state, void *cookie) +{ + /* + * At an exception boundary we can reliably consume the saved PC. We do + * not know whether the LR was live when the exception was taken, and + * so we cannot perform the next unwind step reliably. + * + * All that matters is whether the *entire* unwind is reliable, so give + * up as soon as we hit an exception boundary. + */ + if (state->source == KUNWIND_SOURCE_REGS_PC) + return false; + + return arch_kunwind_consume_entry(state, cookie); +} + +noinline noinstr int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, + struct task_struct *task) +{ + struct kunwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; + + return kunwind_stack_walk(arch_reliable_kunwind_consume_entry, &data, + task, NULL); +} + struct bpf_unwind_consume_entry_data { bool (*consume_entry)(void *cookie, u64 ip, u64 sp, u64 fp); void *cookie; diff --git a/arch/arm64/kernel/static_call.c b/arch/arm64/kernel/static_call.c new file mode 100644 index 000000000000..8b3a19e10871 --- /dev/null +++ b/arch/arm64/kernel/static_call.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/static_call.h> +#include <linux/memory.h> +#include <asm/text-patching.h> + +void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) +{ + u64 literal; + int ret; + + if (!func) + func = __static_call_return0; + + /* decode the instructions to discover the literal address */ + literal = ALIGN_DOWN((u64)tramp + 4, SZ_4K) + + aarch64_insn_adrp_get_offset(le32_to_cpup(tramp + 4)) + + 8 * aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12, + le32_to_cpup(tramp + 8)); + + ret = aarch64_insn_write_literal_u64((void *)literal, (u64)func); + WARN_ON_ONCE(ret); +} +EXPORT_SYMBOL_GPL(arch_static_call_transform); diff --git a/arch/arm64/kernel/sys32.c b/arch/arm64/kernel/sys32.c index 96bcfb907443..12a948f3a504 100644 --- a/arch/arm64/kernel/sys32.c +++ b/arch/arm64/kernel/sys32.c @@ -89,7 +89,7 @@ COMPAT_SYSCALL_DEFINE4(aarch32_truncate64, const char __user *, pathname, COMPAT_SYSCALL_DEFINE4(aarch32_ftruncate64, unsigned int, fd, u32, __pad, arg_u32p(length)) { - return ksys_ftruncate(fd, arg_u64(length)); + return ksys_ftruncate(fd, arg_u64(length), FTRUNCATE_LFS); } COMPAT_SYSCALL_DEFINE5(aarch32_readahead, int, fd, u32, __pad, diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index 4a609e9b65de..7e9860143add 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -36,8 +36,8 @@ __do_compat_cache_op(unsigned long start, unsigned long end) * The workaround requires an inner-shareable tlbi. * We pick the reserved-ASID to minimise the impact. 
*/ - __tlbi(aside1is, __TLBI_VADDR(0, 0)); - dsb(ish); + __tlbi(aside1is, 0UL); + __tlbi_sync_s1ish(); } ret = caches_clean_inval_user_pou(start, start + chunk); diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index c442fcec6b9e..358ddfbf1401 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -43,7 +43,7 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, add_random_kstack_offset(); - if (scno < sc_nr) { + if (likely(scno < sc_nr)) { syscall_fn_t syscall_fn; syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; ret = __invoke_syscall(regs, syscall_fn); @@ -52,17 +52,6 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, } syscall_set_return_value(current, regs, 0, ret); - - /* - * This value will get limited by KSTACK_OFFSET_MAX(), which is 10 - * bits. The actual entropy will be further reduced by the compiler - * when applying stack alignment constraints: the AAPCS mandates a - * 16-byte aligned SP at function boundaries, which will remove the - * 4 low bits from any entropy chosen here. - * - * The resulting 6 bits of entropy is seen in SP[9:4]. - */ - choose_random_kstack_offset(get_random_u16()); } static inline bool has_syscall_work(unsigned long flags) @@ -96,7 +85,7 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, * (Similarly for HVC and SMC elsewhere.) */ - if (flags & _TIF_MTE_ASYNC_FAULT) { + if (unlikely(flags & _TIF_MTE_ASYNC_FAULT)) { /* * Process the asynchronous tag check fault before the actual * syscall. do_notify_resume() will send a signal to userspace diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index cb180684d10d..b32f13358fbb 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -15,62 +15,16 @@ #include <linux/arch_topology.h> #include <linux/cacheinfo.h> #include <linux/cpufreq.h> +#include <linux/cpu_smt.h> #include <linux/init.h> #include <linux/percpu.h> +#include <linux/sched/isolation.h> +#include <linux/xarray.h> #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/topology.h> -#ifdef CONFIG_ACPI -static bool __init acpi_cpu_is_threaded(int cpu) -{ - int is_threaded = acpi_pptt_cpu_is_thread(cpu); - - /* - * if the PPTT doesn't have thread information, assume a homogeneous - * machine and return the current CPU's thread state. - */ - if (is_threaded < 0) - is_threaded = read_cpuid_mpidr() & MPIDR_MT_BITMASK; - - return !!is_threaded; -} - -/* - * Propagate the topology information of the processor_topology_node tree to the - * cpu_topology array. - */ -int __init parse_acpi_topology(void) -{ - int cpu, topology_id; - - if (acpi_disabled) - return 0; - - for_each_possible_cpu(cpu) { - topology_id = find_acpi_cpu_topology(cpu, 0); - if (topology_id < 0) - return topology_id; - - if (acpi_cpu_is_threaded(cpu)) { - cpu_topology[cpu].thread_id = topology_id; - topology_id = find_acpi_cpu_topology(cpu, 1); - cpu_topology[cpu].core_id = topology_id; - } else { - cpu_topology[cpu].thread_id = -1; - cpu_topology[cpu].core_id = topology_id; - } - topology_id = find_acpi_cpu_topology_cluster(cpu); - cpu_topology[cpu].cluster_id = topology_id; - topology_id = find_acpi_cpu_topology_package(cpu); - cpu_topology[cpu].package_id = topology_id; - } - - return 0; -} -#endif - #ifdef CONFIG_ARM64_AMU_EXTN #define read_corecnt() read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0) #define read_constcnt() read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0) @@ -88,18 +42,28 @@ int __init parse_acpi_topology(void) * initialized. 
*/ static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT); -static DEFINE_PER_CPU(u64, arch_const_cycles_prev); -static DEFINE_PER_CPU(u64, arch_core_cycles_prev); static cpumask_var_t amu_fie_cpus; +struct amu_cntr_sample { + u64 arch_const_cycles_prev; + u64 arch_core_cycles_prev; + unsigned long last_scale_update; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples); + void update_freq_counters_refs(void) { - this_cpu_write(arch_core_cycles_prev, read_corecnt()); - this_cpu_write(arch_const_cycles_prev, read_constcnt()); + struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples); + + amu_sample->arch_core_cycles_prev = read_corecnt(); + amu_sample->arch_const_cycles_prev = read_constcnt(); } static inline bool freq_counters_valid(int cpu) { + struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu); + if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) return false; @@ -108,8 +72,8 @@ static inline bool freq_counters_valid(int cpu) return false; } - if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) || - !per_cpu(arch_core_cycles_prev, cpu))) { + if (unlikely(!amu_sample->arch_const_cycles_prev || + !amu_sample->arch_core_cycles_prev)) { pr_debug("CPU%d: cycle counters are not enabled.\n", cpu); return false; } @@ -152,17 +116,22 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate) static void amu_scale_freq_tick(void) { + struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples); u64 prev_core_cnt, prev_const_cnt; u64 core_cnt, const_cnt, scale; - prev_const_cnt = this_cpu_read(arch_const_cycles_prev); - prev_core_cnt = this_cpu_read(arch_core_cycles_prev); + prev_const_cnt = amu_sample->arch_const_cycles_prev; + prev_core_cnt = amu_sample->arch_core_cycles_prev; update_freq_counters_refs(); - const_cnt = this_cpu_read(arch_const_cycles_prev); - core_cnt = this_cpu_read(arch_core_cycles_prev); + const_cnt = amu_sample->arch_const_cycles_prev; + core_cnt = amu_sample->arch_core_cycles_prev; + /* + * This should not happen unless the AMUs have been reset and the + * counter values have not been restored - unlikely + */ if (unlikely(core_cnt <= prev_core_cnt || const_cnt <= prev_const_cnt)) return; @@ -182,6 +151,8 @@ static void amu_scale_freq_tick(void) scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); this_cpu_write(arch_freq_scale, (unsigned long)scale); + + amu_sample->last_scale_update = jiffies; } static struct scale_freq_data amu_sfd = { @@ -189,6 +160,96 @@ static struct scale_freq_data amu_sfd = { .set_freq_scale = amu_scale_freq_tick, }; +static __always_inline bool amu_fie_cpu_supported(unsigned int cpu) +{ + return cpumask_available(amu_fie_cpus) && + cpumask_test_cpu(cpu, amu_fie_cpus); +} + +void arch_cpu_idle_enter(void) +{ + unsigned int cpu = smp_processor_id(); + + if (!amu_fie_cpu_supported(cpu)) + return; + + /* Kick in AMU update but only if one has not happened already */ + if (housekeeping_cpu(cpu, HK_TYPE_TICK) && + time_is_before_jiffies(per_cpu(cpu_amu_samples.last_scale_update, cpu))) + amu_scale_freq_tick(); +} + +#define AMU_SAMPLE_EXP_MS 20 + +int arch_freq_get_on_cpu(int cpu) +{ + struct amu_cntr_sample *amu_sample; + unsigned int start_cpu = cpu; + unsigned long last_update; + unsigned int freq = 0; + u64 scale; + + if (!amu_fie_cpu_supported(cpu) || !arch_scale_freq_ref(cpu)) + return -EOPNOTSUPP; + + while (1) { + + amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu); + + last_update = 
amu_sample->last_scale_update; + + /* + * For those CPUs that are in full dynticks mode, or those that have + * not seen tick for a while, try an alternative source for the counters + * (and thus freq scale), if available, for given policy: this boils + * down to identifying an active cpu within the same freq domain, if any. + */ + if (!housekeeping_cpu(cpu, HK_TYPE_TICK) || + time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) { + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + int ref_cpu; + + if (!policy) + return -EINVAL; + + if (!cpumask_intersects(policy->related_cpus, + housekeeping_cpumask(HK_TYPE_TICK))) { + cpufreq_cpu_put(policy); + return -EOPNOTSUPP; + } + + for_each_cpu_wrap(ref_cpu, policy->cpus, cpu + 1) { + if (ref_cpu == start_cpu) { + /* Prevent verifying same CPU twice */ + ref_cpu = nr_cpu_ids; + break; + } + if (!idle_cpu(ref_cpu)) + break; + } + + cpufreq_cpu_put(policy); + + if (ref_cpu >= nr_cpu_ids) + /* No alternative to pull info from */ + return -EAGAIN; + + cpu = ref_cpu; + } else { + break; + } + } + /* + * Reversed computation to the one used to determine + * the arch_freq_scale value + * (see amu_scale_freq_tick for details) + */ + scale = arch_scale_freq_capacity(cpu); + freq = scale * arch_scale_freq_ref(cpu); + freq >>= SCHED_CAPACITY_SHIFT; + return freq; +} + static void amu_fie_setup(const struct cpumask *cpus) { int cpu; @@ -211,7 +272,7 @@ static void amu_fie_setup(const struct cpumask *cpus) cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); - topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus); + topology_set_scale_freq_source(&amu_sfd, cpus); pr_debug("CPUs[%*pbl]: counters will be used for FIE.", cpumask_pr_args(cpus)); @@ -223,7 +284,7 @@ static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, struct cpufreq_policy *policy = data; if (val == CPUFREQ_CREATE_POLICY) - amu_fie_setup(policy->related_cpus); + amu_fie_setup(policy->cpus); /* * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU @@ -242,10 +303,70 @@ static struct notifier_block init_amu_fie_notifier = { .notifier_call = init_amu_fie_callback, }; +static int cpuhp_topology_online(unsigned int cpu) +{ + struct cpufreq_policy *policy = cpufreq_cpu_policy(cpu); + + /* Those are cheap checks */ + + /* + * Skip this CPU if: + * - it has no cpufreq policy assigned yet, + * - no policy exists that spans CPUs with AMU counters, or + * - it was already handled. + */ + if (unlikely(!policy) || !cpumask_available(amu_fie_cpus) || + cpumask_test_cpu(cpu, amu_fie_cpus)) + return 0; + + /* + * Only proceed if all already-online CPUs in this policy + * support AMU counters. + */ + if (unlikely(!cpumask_subset(policy->cpus, amu_fie_cpus))) + return 0; + + /* + * If the new online CPU cannot pass this check, all the CPUs related to + * the same policy should be clear from amu_fie_cpus mask, otherwise they + * may use different source of the freq scale. 
+ */ + if (!freq_counters_valid(cpu)) { + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_ARCH, + policy->related_cpus); + cpumask_andnot(amu_fie_cpus, amu_fie_cpus, policy->related_cpus); + return 0; + } + + cpumask_set_cpu(cpu, amu_fie_cpus); + + topology_set_scale_freq_source(&amu_sfd, cpumask_of(cpu)); + + pr_debug("CPU[%u]: counter will be used for FIE.", cpu); + + return 0; +} + static int __init init_amu_fie(void) { - return cpufreq_register_notifier(&init_amu_fie_notifier, + int ret; + + ret = cpufreq_register_notifier(&init_amu_fie_notifier, CPUFREQ_POLICY_NOTIFIER); + if (ret) + return ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "arm64/topology:online", + cpuhp_topology_online, + NULL); + if (ret < 0) { + cpufreq_unregister_notifier(&init_amu_fie_notifier, + CPUFREQ_POLICY_NOTIFIER); + return ret; + } + + return 0; } core_initcall(init_amu_fie); @@ -279,16 +400,25 @@ static inline int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val) { /* - * Abort call on counterless CPU or when interrupts are - * disabled - can lead to deadlock in smp sync call. + * Abort call on counterless CPU. */ if (!cpu_has_amu_feat(cpu)) return -EOPNOTSUPP; - if (WARN_ON_ONCE(irqs_disabled())) - return -EPERM; - - smp_call_function_single(cpu, func, val, 1); + if (irqs_disabled()) { + /* + * When IRQs are disabled (tick path: sched_tick -> + * topology_scale_freq_tick or cppc_scale_freq_tick), only local + * CPU counter reads are allowed. Remote CPU counter read would + * require smp_call_function_single() which is unsafe with IRQs + * disabled. + */ + if (WARN_ON_ONCE(cpu != smp_processor_id())) + return -EPERM; + func(val); + } else { + smp_call_function_single(cpu, func, val, 1); + } return 0; } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 4e26bd356a48..914282016069 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -149,19 +149,18 @@ pstate_check_t * const aarch32_opcode_cond_checks[16] = { int show_unhandled_signals = 0; -static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) +void dump_kernel_instr(unsigned long kaddr) { - unsigned long addr = instruction_pointer(regs); char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; int i; - if (user_mode(regs)) + if (!is_ttbr1_addr(kaddr)) return; for (i = -4; i < 1; i++) { unsigned int val, bad; - bad = aarch64_insn_read(&((u32 *)addr)[i], &val); + bad = aarch64_insn_read(&((u32 *)kaddr)[i], &val); if (!bad) p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); @@ -169,25 +168,18 @@ static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) p += sprintf(p, i == 0 ? "(????????) " : "???????? 
"); } - printk("%sCode: %s\n", lvl, str); + printk(KERN_EMERG "Code: %s\n", str); } -#ifdef CONFIG_PREEMPT -#define S_PREEMPT " PREEMPT" -#elif defined(CONFIG_PREEMPT_RT) -#define S_PREEMPT " PREEMPT_RT" -#else -#define S_PREEMPT "" -#endif - #define S_SMP " SMP" static int __die(const char *str, long err, struct pt_regs *regs) { static int die_counter; int ret; + unsigned long addr = instruction_pointer(regs); - pr_emerg("Internal error: %s: %016lx [#%d]" S_PREEMPT S_SMP "\n", + pr_emerg("Internal error: %s: %016lx [#%d] " S_SMP "\n", str, err, ++die_counter); /* trap and error numbers are mostly meaningless on ARM */ @@ -198,7 +190,10 @@ static int __die(const char *str, long err, struct pt_regs *regs) print_modules(); show_regs(regs); - dump_kernel_instr(KERN_EMERG, regs); + if (user_mode(regs)) + return ret; + + dump_kernel_instr(addr); return ret; } @@ -462,7 +457,7 @@ void do_el0_undef(struct pt_regs *regs, unsigned long esr) u32 insn; /* check for AArch32 breakpoint instructions */ - if (!aarch32_break_handler(regs)) + if (try_handle_aarch32_break(regs)) return; if (user_insn_read(regs, &insn)) @@ -902,8 +897,6 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr) "Bad EL0 synchronous exception"); } -#ifdef CONFIG_VMAP_STACK - DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) __aligned(16); @@ -929,16 +922,16 @@ void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigne __show_regs(regs); /* - * We use nmi_panic to limit the potential for recusive overflows, and + * We use nmi_panic to limit the potential for recursive overflows, and * to get a better stack trace. */ nmi_panic(NULL, "kernel stack overflow"); cpu_park_loop(); } -#endif void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr) { + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK); console_verbose(); pr_crit("SError Interrupt on CPU%d, code 0x%016lx -- %s\n", @@ -995,7 +988,7 @@ void do_serror(struct pt_regs *regs, unsigned long esr) int is_valid_bugaddr(unsigned long addr) { /* - * bug_handler() only called for BRK #BUG_BRK_IMM. + * bug_brk_handler() only called for BRK #BUG_BRK_IMM. 
* So the answer is trivial -- any spurious instances with no * bug table entry will be rejected by report_bug() and passed * back to the debug-monitors code and handled as a fatal @@ -1005,7 +998,7 @@ int is_valid_bugaddr(unsigned long addr) } #endif -static int bug_handler(struct pt_regs *regs, unsigned long esr) +int bug_brk_handler(struct pt_regs *regs, unsigned long esr) { switch (report_bug(regs->pc, regs)) { case BUG_TRAP_TYPE_BUG: @@ -1025,13 +1018,8 @@ static int bug_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_HANDLED; } -static struct break_hook bug_break_hook = { - .fn = bug_handler, - .imm = BUG_BRK_IMM, -}; - -#ifdef CONFIG_CFI_CLANG -static int cfi_handler(struct pt_regs *regs, unsigned long esr) +#ifdef CONFIG_CFI +int cfi_brk_handler(struct pt_regs *regs, unsigned long esr) { unsigned long target; u32 type; @@ -1054,15 +1042,9 @@ static int cfi_handler(struct pt_regs *regs, unsigned long esr) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } +#endif /* CONFIG_CFI */ -static struct break_hook cfi_break_hook = { - .fn = cfi_handler, - .imm = CFI_BRK_IMM_BASE, - .mask = CFI_BRK_IMM_MASK, -}; -#endif /* CONFIG_CFI_CLANG */ - -static int reserved_fault_handler(struct pt_regs *regs, unsigned long esr) +int reserved_fault_brk_handler(struct pt_regs *regs, unsigned long esr) { pr_err("%s generated an invalid instruction at %pS!\n", "Kernel text patching", @@ -1072,11 +1054,6 @@ static int reserved_fault_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_ERROR; } -static struct break_hook fault_break_hook = { - .fn = reserved_fault_handler, - .imm = FAULT_BRK_IMM, -}; - #ifdef CONFIG_KASAN_SW_TAGS #define KASAN_ESR_RECOVER 0x20 @@ -1084,7 +1061,7 @@ static struct break_hook fault_break_hook = { #define KASAN_ESR_SIZE_MASK 0x0f #define KASAN_ESR_SIZE(esr) (1 << ((esr) & KASAN_ESR_SIZE_MASK)) -static int kasan_handler(struct pt_regs *regs, unsigned long esr) +int kasan_brk_handler(struct pt_regs *regs, unsigned long esr) { bool recover = esr & KASAN_ESR_RECOVER; bool write = esr & KASAN_ESR_WRITE; @@ -1115,62 +1092,12 @@ static int kasan_handler(struct pt_regs *regs, unsigned long esr) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } - -static struct break_hook kasan_break_hook = { - .fn = kasan_handler, - .imm = KASAN_BRK_IMM, - .mask = KASAN_BRK_MASK, -}; #endif #ifdef CONFIG_UBSAN_TRAP -static int ubsan_handler(struct pt_regs *regs, unsigned long esr) +int ubsan_brk_handler(struct pt_regs *regs, unsigned long esr) { - die(report_ubsan_failure(regs, esr & UBSAN_BRK_MASK), regs, esr); + die(report_ubsan_failure(esr & UBSAN_BRK_MASK), regs, esr); return DBG_HOOK_HANDLED; } - -static struct break_hook ubsan_break_hook = { - .fn = ubsan_handler, - .imm = UBSAN_BRK_IMM, - .mask = UBSAN_BRK_MASK, -}; -#endif - -/* - * Initial handler for AArch64 BRK exceptions - * This handler only used until debug_traps_init(). 
- */ -int __init early_brk64(unsigned long addr, unsigned long esr, - struct pt_regs *regs) -{ -#ifdef CONFIG_CFI_CLANG - if (esr_is_cfi_brk(esr)) - return cfi_handler(regs, esr) != DBG_HOOK_HANDLED; -#endif -#ifdef CONFIG_KASAN_SW_TAGS - if ((esr_brk_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) - return kasan_handler(regs, esr) != DBG_HOOK_HANDLED; -#endif -#ifdef CONFIG_UBSAN_TRAP - if ((esr_brk_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM) - return ubsan_handler(regs, esr) != DBG_HOOK_HANDLED; #endif - return bug_handler(regs, esr) != DBG_HOOK_HANDLED; -} - -void __init trap_init(void) -{ - register_kernel_break_hook(&bug_break_hook); -#ifdef CONFIG_CFI_CLANG - register_kernel_break_hook(&cfi_break_hook); -#endif - register_kernel_break_hook(&fault_break_hook); -#ifdef CONFIG_KASAN_SW_TAGS - register_kernel_break_hook(&kasan_break_hook); -#endif -#ifdef CONFIG_UBSAN_TRAP - register_kernel_break_hook(&ubsan_break_hook); -#endif - debug_traps_init(); -} diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index e8ed8e5b713b..592dd8668de4 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -18,7 +18,7 @@ #include <linux/sched.h> #include <linux/signal.h> #include <linux/slab.h> -#include <linux/time_namespace.h> +#include <linux/vdso_datastore.h> #include <linux/vmalloc.h> #include <vdso/datapage.h> #include <vdso/helpers.h> @@ -57,12 +57,6 @@ static struct vdso_abi_info vdso_info[] __ro_after_init = { #endif /* CONFIG_COMPAT_VDSO */ }; -/* - * The vDSO data page. - */ -static union vdso_data_store vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = vdso_data_store.data; - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { @@ -87,9 +81,7 @@ static int __init __vdso_init(enum vdso_abi abi) vdso_info[abi].vdso_code_start) >> PAGE_SHIFT; - vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages, - sizeof(struct page *), - GFP_KERNEL); + vdso_pagelist = kzalloc_objs(struct page *, vdso_info[abi].vdso_pages); if (vdso_pagelist == NULL) return -ENOMEM; @@ -104,78 +96,6 @@ static int __init __vdso_init(enum vdso_abi abi) return 0; } -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -static const struct vm_special_mapping vvar_map; - -/* - * The vvar mapping contains data for a specific time namespace, so when a task - * changes namespace we must unmap its vvar data for the old namespace. - * Subsequent faults will map in data for the new namespace. - * - * For more details see timens_setup_vdso_data(). 
- */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, &vvar_map)) - zap_vma_pages(vma); - } - - mmap_read_unlock(mm); - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - if (timens_page) - pfn = page_to_pfn(timens_page); - else - pfn = sym_to_pfn(vdso_data); - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = sym_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - - return vmf_insert_pfn(vma, vmf->address, pfn); -} - -static const struct vm_special_mapping vvar_map = { - .name = "[vvar]", - .fault = vvar_fault, -}; - static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, @@ -185,11 +105,11 @@ static int __setup_additional_pages(enum vdso_abi abi, unsigned long gp_flags = 0; void *ret; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ - vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; + vdso_mapping_len = vdso_text_len + VDSO_NR_PAGES * PAGE_SIZE; vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); if (IS_ERR_VALUE(vdso_base)) { @@ -197,20 +117,19 @@ static int __setup_additional_pages(enum vdso_abi abi, goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_PFNMAP, - &vvar_map); + ret = vdso_install_vvar_mapping(mm, vdso_base); if (IS_ERR(ret)) goto up_fail; if (system_supports_bti_kernel()) gp_flags = VM_ARM64_BTI; - vdso_base += VVAR_NR_PAGES * PAGE_SIZE; + vdso_base += VDSO_NR_PAGES * PAGE_SIZE; mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_SEALED_SYSMAP, vdso_info[abi].cm); if (IS_ERR(ret)) goto up_fail; @@ -336,7 +255,8 @@ static int aarch32_kuser_helpers_setup(struct mm_struct *mm) */ ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE, VM_READ | VM_EXEC | - VM_MAYREAD | VM_MAYEXEC, + VM_MAYREAD | VM_MAYEXEC | + VM_SEALED_SYSMAP, &aarch32_vdso_maps[AA32_MAP_VECTORS]); return PTR_ERR_OR_ZERO(ret); @@ -359,7 +279,8 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) */ ret = _install_special_mapping(mm, addr, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | - VM_MAYWRITE | VM_MAYEXEC, + VM_MAYWRITE | VM_MAYEXEC | + VM_SEALED_SYSMAP, &aarch32_vdso_maps[AA32_MAP_SIGPAGE]); if (IS_ERR(ret)) goto out; diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 35685c036044..7dec05dd33b7 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -7,7 +7,7 @@ # # Include the generic Makefile to 
check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso := vgettimeofday.o note.o sigreturn.o vgetrandom.o vgetrandom-chacha.o @@ -36,7 +36,8 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO # -Wmissing-prototypes and -Wmissing-declarations are removed from # the CFLAGS to make possible to build the kernel with CONFIG_WERROR enabled. CC_FLAGS_REMOVE_VDSO := $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \ - $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \ + $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) \ + $(GCC_PLUGINS_CFLAGS) \ $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \ -Wmissing-prototypes -Wmissing-declarations diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 47ad6944f9f0..52314be29191 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -20,11 +20,8 @@ OUTPUT_ARCH(aarch64) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); - PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 25a2cb6317f3..bea3675fa668 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -3,7 +3,7 @@ # Makefile for vdso32 # -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include # Same as cc-*option, but using CC_COMPAT instead of CC ifeq ($(CONFIG_CC_IS_CLANG), y) @@ -21,8 +21,6 @@ endif cc32-option = $(call try-run,\ $(CC_COMPAT) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) -cc32-disable-warning = $(call try-run,\ - $(CC_COMPAT) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) # We cannot use the global flags to compile the vDSO files, the main reason # being that the 32-bit compiler may be older than the main (64-bit) compiler @@ -59,13 +57,13 @@ VDSO_CAFLAGS += -DDISABLE_BRANCH_PROFILING VDSO_CAFLAGS += -march=armv8-a VDSO_CFLAGS := $(VDSO_CAFLAGS) -VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1 # KBUILD_CFLAGS from top-level Makefile VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ + $(filter -Werror,$(KBUILD_CPPFLAGS)) \ -Werror-implicit-function-declaration \ -Wno-format-security \ - -std=gnu11 + $(CC_FLAGS_DIALECT) VDSO_CFLAGS += -O2 # Some useful compiler-dependent flags from top-level Makefile VDSO_CFLAGS += $(call cc32-option,-Wno-pointer-sign) @@ -74,16 +72,6 @@ VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes) VDSO_CFLAGS += -Werror=date-time VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types) -# The 32-bit compiler does not provide 128-bit integers, which are used in -# some headers that are indirectly included from the vDSO code. -# This hack makes the compiler happy and should trigger a warning/error if -# variables of such type are referenced. -VDSO_CFLAGS += -D__uint128_t='void*' -# Silence some warnings coming from headers that operate on long's -# (on GCC 4.8 or older, there is unfortunately no way to silence this warning) -VDSO_CFLAGS += $(call cc32-disable-warning,shift-count-overflow) -VDSO_CFLAGS += -Wno-int-to-pointer-cast - # Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is # unreliable. 
ifeq ($(CONFIG_THUMB2_COMPAT_VDSO), y) diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S index 732702a187e9..c374fb0146f3 100644 --- a/arch/arm64/kernel/vdso32/vdso.lds.S +++ b/arch/arm64/kernel/vdso32/vdso.lds.S @@ -12,16 +12,15 @@ #include <asm/page.h> #include <asm/vdso.h> #include <asm-generic/vmlinux.lds.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") OUTPUT_ARCH(arm) SECTIONS { - PROVIDE_HIDDEN(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE_HIDDEN(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text @@ -87,6 +86,7 @@ VERSION __vdso_gettimeofday; __vdso_clock_getres; __vdso_clock_gettime64; + __vdso_clock_getres_time64; local: *; }; } diff --git a/arch/arm64/kernel/vdso32/vgettimeofday.c b/arch/arm64/kernel/vdso32/vgettimeofday.c index 29b4d8f61e39..0c6998ebe491 100644 --- a/arch/arm64/kernel/vdso32/vgettimeofday.c +++ b/arch/arm64/kernel/vdso32/vgettimeofday.c @@ -32,6 +32,11 @@ int __vdso_clock_getres(clockid_t clock_id, return __cvdso_clock_getres_time32(clock_id, res); } +int __vdso_clock_getres_time64(clockid_t clock_id, struct __kernel_timespec *res) +{ + return __cvdso_clock_getres(clock_id, res); +} + /* Avoid unresolved references emitted by GCC */ void __aeabi_unwind_cpp_pr0(void) diff --git a/arch/arm64/kernel/vmcore_info.c b/arch/arm64/kernel/vmcore_info.c index b19d5d6cb8b3..9619ece66b79 100644 --- a/arch/arm64/kernel/vmcore_info.c +++ b/arch/arm64/kernel/vmcore_info.c @@ -14,7 +14,7 @@ static inline u64 get_tcr_el1_t1sz(void); static inline u64 get_tcr_el1_t1sz(void) { - return (read_sysreg(tcr_el1) & TCR_T1SZ_MASK) >> TCR_T1SZ_OFFSET; + return (read_sysreg(tcr_el1) & TCR_EL1_T1SZ_MASK) >> TCR_EL1_T1SZ_SHIFT; } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index e73326bd3ff7..e1ac876200a3 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -13,16 +13,36 @@ *(__kvm_ex_table) \ __stop___kvm_ex_table = .; -#define HYPERVISOR_DATA_SECTIONS \ +#ifdef CONFIG_NVHE_EL2_TRACING +#define HYPERVISOR_EVENT_IDS \ + . = ALIGN(PAGE_SIZE); \ + __hyp_event_ids_start = .; \ + *(HYP_SECTION_NAME(.event_ids)) \ + __hyp_event_ids_end = .; +#else +#define HYPERVISOR_EVENT_IDS +#endif + +#define HYPERVISOR_RODATA_SECTIONS \ HYP_SECTION_NAME(.rodata) : { \ . = ALIGN(PAGE_SIZE); \ __hyp_rodata_start = .; \ *(HYP_SECTION_NAME(.data..ro_after_init)) \ *(HYP_SECTION_NAME(.rodata)) \ + HYPERVISOR_EVENT_IDS \ . = ALIGN(PAGE_SIZE); \ __hyp_rodata_end = .; \ } +#define HYPERVISOR_DATA_SECTION \ + HYP_SECTION_NAME(.data) : { \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_start = .; \ + *(HYP_SECTION_NAME(.data)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_end = .; \ + } + #define HYPERVISOR_PERCPU_SECTION \ . 
= ALIGN(PAGE_SIZE); \ HYP_SECTION_NAME(.data..percpu) : { \ @@ -51,7 +71,8 @@ #define SBSS_ALIGN PAGE_SIZE #else /* CONFIG_KVM */ #define HYPERVISOR_EXTABLE -#define HYPERVISOR_DATA_SECTIONS +#define HYPERVISOR_RODATA_SECTIONS +#define HYPERVISOR_DATA_SECTION #define HYPERVISOR_PERCPU_SECTION #define HYPERVISOR_RELOC_SECTION #define SBSS_ALIGN 0 @@ -181,6 +202,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT HYPERVISOR_TEXT + STATIC_CALL_TEXT *(.gnu.warning) } @@ -190,7 +212,7 @@ SECTIONS /* everything from this point to __init_begin will be marked RO NX */ RO_DATA(PAGE_SIZE) - HYPERVISOR_DATA_SECTIONS + HYPERVISOR_RODATA_SECTIONS .got : { *(.got) } /* @@ -249,9 +271,9 @@ SECTIONS __inittext_end = .; __initdata_begin = .; - init_idmap_pg_dir = .; + __pi_init_idmap_pg_dir = .; . += INIT_IDMAP_DIR_SIZE; - init_idmap_pg_end = .; + __pi_init_idmap_pg_end = .; .init.data : { INIT_DATA @@ -295,6 +317,15 @@ SECTIONS _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) + HYPERVISOR_DATA_SECTION + +#ifdef CONFIG_NVHE_EL2_TRACING + .data.hyp_events : { + __hyp_events_start = .; + *(SORT(_hyp_events.*)) + __hyp_events_end = .; + } +#endif /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback @@ -319,11 +350,12 @@ SECTIONS /* start of zero-init region */ BSS_SECTION(SBSS_ALIGN, 0, 0) + __pi___bss_start = __bss_start; . = ALIGN(PAGE_SIZE); - init_pg_dir = .; + __pi_init_pg_dir = .; . += INIT_DIR_SIZE; - init_pg_end = .; + __pi_init_pg_end = .; /* end of zero-init region */ . += SZ_4K; /* stack for the early C runtime */ @@ -332,9 +364,11 @@ SECTIONS . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; + __pi__end = .; STABS_DEBUG DWARF_DEBUG + MODINFO ELF_DETAILS HEAD_SYMBOLS diff --git a/arch/arm64/kernel/watchdog_hld.c b/arch/arm64/kernel/watchdog_hld.c index dcd25322127c..3093037dcb7b 100644 --- a/arch/arm64/kernel/watchdog_hld.c +++ b/arch/arm64/kernel/watchdog_hld.c @@ -34,3 +34,61 @@ bool __init arch_perf_nmi_is_available(void) */ return arm_pmu_irq_is_nmi(); } + +static int watchdog_perf_update_period(void *data) +{ + int cpu = smp_processor_id(); + u64 max_cpu_freq, new_period; + + max_cpu_freq = cpufreq_get_hw_max_freq(cpu) * 1000UL; + if (!max_cpu_freq) + return 0; + + new_period = watchdog_thresh * max_cpu_freq; + hardlockup_detector_perf_adjust_period(new_period); + + return 0; +} + +static int watchdog_freq_notifier_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = data; + int cpu; + + if (val != CPUFREQ_CREATE_POLICY) + return NOTIFY_DONE; + + /* + * Let each online CPU related to the policy update the period by their + * own. This will serialize with the framework on start/stop the lockup + * detector (softlockup_{start,stop}_all) and avoid potential race + * condition. 
Otherwise we may have below theoretical race condition: + * (core 0/1 share the same policy) + * [core 0] [core 1] + * hardlockup_detector_event_create() + * hw_nmi_get_sample_period() + * (cpufreq registered, notifier callback invoked) + * watchdog_freq_notifier_callback() + * watchdog_perf_update_period() + * (since core 1's event's not yet created, + * the period is not set) + * perf_event_create_kernel_counter() + * (event's period is SAFE_MAX_CPU_FREQ) + */ + for_each_cpu(cpu, policy->cpus) + smp_call_on_cpu(cpu, watchdog_perf_update_period, NULL, false); + + return NOTIFY_DONE; +} + +static struct notifier_block watchdog_freq_notifier = { + .notifier_call = watchdog_freq_notifier_callback, +}; + +static int __init init_watchdog_freq_notifier(void) +{ + return cpufreq_register_notifier(&watchdog_freq_notifier, + CPUFREQ_POLICY_NOTIFIER); +} +core_initcall(init_watchdog_freq_notifier);
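
A note on the watchdog_hld.c addition above, with a sketch that is not part of the patch: the new notifier scales the perf NMI-watchdog period to watchdog_thresh seconds' worth of cycles at the CPU's maximum frequency (cpufreq_get_hw_max_freq() reports kHz, hence the "* 1000UL"), and re-applies that on every CPU of a policy when cpufreq creates it. A minimal, self-contained module using the same policy-notifier pattern could look roughly like the following; the demo_* names are hypothetical and the per-CPU callback only logs the frequency instead of touching the watchdog:

/* Sketch only: the cpufreq policy-notifier pattern, hypothetical demo_* names. */
#include <linux/cpufreq.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/smp.h>

static int demo_update_on_cpu(void *data)
{
	/* Runs pinned to one CPU of the new policy via smp_call_on_cpu(). */
	unsigned int cpu = smp_processor_id();

	/* cpufreq_get_hw_max_freq() returns kHz, or 0 when unknown. */
	pr_info("cpu%u: max hw freq %u kHz\n", cpu, cpufreq_get_hw_max_freq(cpu));
	return 0;
}

static int demo_policy_notifier(struct notifier_block *nb,
				unsigned long val, void *data)
{
	struct cpufreq_policy *policy = data;
	int cpu;

	if (val != CPUFREQ_CREATE_POLICY)
		return NOTIFY_DONE;

	/* Let each CPU covered by the new policy update its own state. */
	for_each_cpu(cpu, policy->cpus)
		smp_call_on_cpu(cpu, demo_update_on_cpu, NULL, false);

	return NOTIFY_DONE;
}

static struct notifier_block demo_freq_notifier = {
	.notifier_call = demo_policy_notifier,
};

static int __init demo_init(void)
{
	return cpufreq_register_notifier(&demo_freq_notifier,
					 CPUFREQ_POLICY_NOTIFIER);
}

static void __exit demo_exit(void)
{
	cpufreq_unregister_notifier(&demo_freq_notifier,
				    CPUFREQ_POLICY_NOTIFIER);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");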

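The traps.c hunks drop the per-handler break_hook registrations and trap_init() itself, exposing bug_brk_handler(), cfi_brk_handler(), kasan_brk_handler(), ubsan_brk_handler() and reserved_fault_brk_handler() for direct dispatch instead. For context, the hook-based interface those handlers previously sat behind is sketched below; this assumes register_kernel_break_hook() remains available to other in-kernel users, and the immediate value and handler body are purely illustrative:

/* Sketch only: a custom kernel BRK hook via the break_hook interface. */
#include <linux/init.h>
#include <linux/printk.h>
#include <asm/debug-monitors.h>
#include <asm/insn.h>
#include <asm/ptrace.h>
#include <asm/traps.h>

#define DEMO_BRK_IMM	0x7ff	/* hypothetical BRK #imm16 comment value */

static int demo_brk_handler(struct pt_regs *regs, unsigned long esr)
{
	pr_warn("demo BRK hit at %pS\n", (void *)instruction_pointer(regs));

	/* Step over the BRK instruction so execution can continue. */
	arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
	return DBG_HOOK_HANDLED;
}

static struct break_hook demo_break_hook = {
	.fn	= demo_brk_handler,
	.imm	= DEMO_BRK_IMM,
};

static int __init demo_brk_init(void)
{
	register_kernel_break_hook(&demo_break_hook);
	return 0;
}
arch_initcall(demo_brk_init);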