From 6487c963083c24ede289d4267ffa60a9db668cd4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:51 +0100 Subject: arm64/cpufeature: Runtime detection of Guarded Control Stack (GCS) Add a cpufeature for GCS, allowing other code to conditionally support it at runtime. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-12-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..d1e758e99e0a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -291,6 +291,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), @@ -2358,6 +2360,14 @@ static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) } #endif +#ifdef CONFIG_ARM64_GCS +static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused) +{ + /* GCSPR_EL0 is always readable */ + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); +} +#endif + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2889,6 +2899,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_poe, ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) }, +#endif +#ifdef CONFIG_ARM64_GCS + { + .desc = "Guarded Control Stack (GCS)", + .capability = ARM64_HAS_GCS, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .cpu_enable = cpu_enable_gcs, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP) + }, #endif {}, }; -- cgit v1.2.3 From a94452112ce4020af073af368d7c25a07bfabf37 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:56 +0100 Subject: arm64/idreg: Add override for GCS Hook up an override for GCS, allowing it to be disabled from the command line by specifying arm64.nogcs in case there are problems.
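Later patches in the series gate on this cpucap via a system_supports_gcs() helper. As a hedged sketch of how such a wrapper is typically built from the capability added here (the exact definition lives in asm/cpufeature.h and may differ):

static inline bool system_supports_gcs(void)
{
        /* Compile-time gate first, then the runtime cpucap */
        return IS_ENABLED(CONFIG_ARM64_GCS) &&
               cpus_have_final_cap(ARM64_HAS_GCS);
}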
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-17-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/arm64/kernel/pi/idreg-override.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'arch/arm64/kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1518343bbe22..c1b00f709734 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -446,6 +446,9 @@ arm64.nobti [ARM64] Unconditionally disable Branch Target Identification support + arm64.nogcs [ARM64] Unconditionally disable Guarded Control Stack + support + arm64.nomops [ARM64] Unconditionally disable Memory Copy and Memory Set instructions support diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index 29d4b6244a6f..2bb709d78405 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -133,6 +133,7 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = { .override = &id_aa64pfr1_override, .fields = { FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ), + FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL), FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), {} @@ -215,6 +216,7 @@ static const struct { { "arm64.nosve", "id_aa64pfr0.sve=0" }, { "arm64.nosme", "id_aa64pfr1.sme=0" }, { "arm64.nobti", "id_aa64pfr1.bt=0" }, + { "arm64.nogcs", "id_aa64pfr1.gcs=0" }, { "arm64.nopauth", "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " "id_aa64isar1.api=0 id_aa64isar1.apa=0 " -- cgit v1.2.3 From eefc98711f84abd7ffb074af0cdbc5f8a8464272 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:57 +0100 Subject: arm64/hwcap: Add hwcap for GCS Provide a hwcap to enable userspace to detect support for GCS. Signed-off-by: Mark Brown Acked-by: Yury Khrustalev Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-18-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/elf_hwcaps.rst | 4 ++++ arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 3 ++- arch/arm64/kernel/cpufeature.c | 3 +++ arch/arm64/kernel/cpuinfo.c | 1 + 5 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/arm64/kernel') diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index 694f67fa07d1..25b41ff74fa0 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -170,6 +170,10 @@ HWCAP_PACG ID_AA64ISAR1_EL1.GPI == 0b0001, as described by Documentation/arch/arm64/pointer-authentication.rst. +HWCAP_GCS + Functionality implied by ID_AA64PFR1_EL1.GCS == 0b1, as + described by Documentation/arch/arm64/gcs.rst. + HWCAP2_DCPODP Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010. 
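Userspace detection then reduces to an auxv check. A minimal sketch, with HWCAP_GCS defined locally in case the libc headers predate this patch:

#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP_GCS
#define HWCAP_GCS       (1UL << 32)     /* matches the uapi value added below */
#endif

int main(void)
{
        unsigned long hwcaps = getauxval(AT_HWCAP);

        printf("GCS %s\n", (hwcaps & HWCAP_GCS) ? "supported" : "not supported");
        return 0;
}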
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index a775adddecf2..7bcf1347ca0b 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -92,6 +92,7 @@ #define KERNEL_HWCAP_SB __khwcap_feature(SB) #define KERNEL_HWCAP_PACA __khwcap_feature(PACA) #define KERNEL_HWCAP_PACG __khwcap_feature(PACG) +#define KERNEL_HWCAP_GCS __khwcap_feature(GCS) #define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 64) #define KERNEL_HWCAP_DCPODP __khwcap2_feature(DCPODP) diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 055381b2c615..675642ec4d91 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -21,7 +21,7 @@ * HWCAP flags - for AT_HWCAP * * Bits 62 and 63 are reserved for use by libc. - * Bits 32-61 are unallocated for potential use by libc. + * Bits 33-61 are unallocated for potential use by libc. */ #define HWCAP_FP (1 << 0) #define HWCAP_ASIMD (1 << 1) @@ -55,6 +55,7 @@ #define HWCAP_SB (1 << 29) #define HWCAP_PACA (1 << 30) #define HWCAP_PACG (1UL << 31) +#define HWCAP_GCS (1UL << 32) /* * HWCAP2 flags - for AT_HWCAP2 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d1e758e99e0a..b8655d55f318 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3025,6 +3025,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), HWCAP_CAP(ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), HWCAP_CAP(ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), +#endif +#ifdef CONFIG_ARM64_GCS + HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS), #endif HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS), #ifdef CONFIG_ARM64_BTI diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 44718d0482b3..f2f92c6b1c85 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -80,6 +80,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SB] = "sb", [KERNEL_HWCAP_PACA] = "paca", [KERNEL_HWCAP_PACG] = "pacg", + [KERNEL_HWCAP_GCS] = "gcs", [KERNEL_HWCAP_DCPODP] = "dcpodp", [KERNEL_HWCAP_SVE2] = "sve2", [KERNEL_HWCAP_SVEAES] = "sveaes", -- cgit v1.2.3 From 8ce71d270536dd7a48698a2b18ddf13f2d5007fb Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:58 +0100 Subject: arm64/traps: Handle GCS exceptions A new exception code is defined for GCS specific faults other than standard load/store faults, for example GCS token validation failures, add handling for this. These faults are reported to userspace as segfaults with code SEGV_CPERR (protection error), mirroring the reporting for x86 shadow stack errors. GCS faults due to memory load/store operations generate data aborts with a flag set, these will be handled separately as part of the data abort handling. Since we do not currently enable GCS for EL1 we should not get any faults there but while we're at it we wire things up there, treating any GCS fault as fatal. 
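From userspace these faults appear as a SIGSEGV whose si_code is SEGV_CPERR, so a handler can distinguish them from ordinary segfaults. A sketch, with SEGV_CPERR defined locally for older libc headers (the value matches the generic siginfo uapi):

#include <signal.h>
#include <unistd.h>

#ifndef SEGV_CPERR
#define SEGV_CPERR      10      /* control protection fault */
#endif

static void segv_handler(int sig, siginfo_t *info, void *ucontext)
{
        if (info->si_code == SEGV_CPERR)
                write(2, "GCS protection fault\n", 21);
        _exit(1);
}

int main(void)
{
        struct sigaction sa = {
                .sa_sigaction   = segv_handler,
                .sa_flags       = SA_SIGINFO,
        };

        sigaction(SIGSEGV, &sa, NULL);
        /* ... run code that may trip a GCS fault ... */
        return 0;
}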
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-19-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/esr.h | 28 +++++++++++++++++++++++++++- arch/arm64/include/asm/exception.h | 2 ++ arch/arm64/kernel/entry-common.c | 23 +++++++++++++++++++++++ arch/arm64/kernel/traps.c | 11 +++++++++++ 4 files changed, 63 insertions(+), 1 deletion(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index da6d2c1c0b03..d1b1a33f9a8b 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -51,7 +51,8 @@ #define ESR_ELx_EC_FP_EXC32 UL(0x28) /* Unallocated EC: 0x29 - 0x2B */ #define ESR_ELx_EC_FP_EXC64 UL(0x2C) -/* Unallocated EC: 0x2D - 0x2E */ +#define ESR_ELx_EC_GCS UL(0x2D) +/* Unallocated EC: 0x2E */ #define ESR_ELx_EC_SERROR UL(0x2F) #define ESR_ELx_EC_BREAKPT_LOW UL(0x30) #define ESR_ELx_EC_BREAKPT_CUR UL(0x31) @@ -386,6 +387,31 @@ #define ESR_ELx_MOPS_ISS_SRCREG(esr) (((esr) & (UL(0x1f) << 5)) >> 5) #define ESR_ELx_MOPS_ISS_SIZEREG(esr) (((esr) & (UL(0x1f) << 0)) >> 0) +/* ISS field definitions for GCS */ +#define ESR_ELx_ExType_SHIFT (20) +#define ESR_ELx_ExType_MASK GENMASK(23, 20) +#define ESR_ELx_Raddr_SHIFT (10) +#define ESR_ELx_Raddr_MASK GENMASK(14, 10) +#define ESR_ELx_Rn_SHIFT (5) +#define ESR_ELx_Rn_MASK GENMASK(9, 5) +#define ESR_ELx_Rvalue_SHIFT 5 +#define ESR_ELx_Rvalue_MASK GENMASK(9, 5) +#define ESR_ELx_IT_SHIFT (0) +#define ESR_ELx_IT_MASK GENMASK(4, 0) + +#define ESR_ELx_ExType_DATA_CHECK 0 +#define ESR_ELx_ExType_EXLOCK 1 +#define ESR_ELx_ExType_STR 2 + +#define ESR_ELx_IT_RET 0 +#define ESR_ELx_IT_GCSPOPM 1 +#define ESR_ELx_IT_RET_KEYA 2 +#define ESR_ELx_IT_RET_KEYB 3 +#define ESR_ELx_IT_GCSSS1 4 +#define ESR_ELx_IT_GCSSS2 5 +#define ESR_ELx_IT_GCSPOPCX 6 +#define ESR_ELx_IT_GCSPOPX 7 + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index f296662590c7..674518464718 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -57,6 +57,8 @@ void do_el0_undef(struct pt_regs *regs, unsigned long esr); void do_el1_undef(struct pt_regs *regs, unsigned long esr); void do_el0_bti(struct pt_regs *regs); void do_el1_bti(struct pt_regs *regs, unsigned long esr); +void do_el0_gcs(struct pt_regs *regs, unsigned long esr); +void do_el1_gcs(struct pt_regs *regs, unsigned long esr); void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr, struct pt_regs *regs); void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs); diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 3fcd9d080bf2..fe74813009bd 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -463,6 +463,15 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) exit_to_kernel_mode(regs); } +static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_gcs(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -505,6 +514,9 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BTI: el1_bti(regs, esr); break; + case ESR_ELx_EC_GCS: + el1_gcs(regs, esr); + break; case ESR_ELx_EC_BREAKPT_CUR: case 
ESR_ELx_EC_SOFTSTP_CUR: case ESR_ELx_EC_WATCHPT_CUR: @@ -684,6 +696,14 @@ static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) exit_to_user_mode(regs); } +static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_gcs(regs, esr); + exit_to_user_mode(regs); +} + static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) { enter_from_user_mode(regs); @@ -766,6 +786,9 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_MOPS: el0_mops(regs, esr); break; + case ESR_ELx_EC_GCS: + el0_gcs(regs, esr); + break; case ESR_ELx_EC_BREAKPT_LOW: case ESR_ELx_EC_SOFTSTP_LOW: case ESR_ELx_EC_WATCHPT_LOW: diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 563cbce11126..fdbcf047108c 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -506,6 +506,16 @@ void do_el1_bti(struct pt_regs *regs, unsigned long esr) die("Oops - BTI", regs, esr); } +void do_el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + force_signal_inject(SIGSEGV, SEGV_CPERR, regs->pc, 0); +} + +void do_el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + die("Oops - GCS", regs, esr); +} + void do_el0_fpac(struct pt_regs *regs, unsigned long esr) { force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); @@ -852,6 +862,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_MOPS] = "MOPS", [ESR_ELx_EC_FP_EXC32] = "FP (AArch32)", [ESR_ELx_EC_FP_EXC64] = "FP (AArch64)", + [ESR_ELx_EC_GCS] = "Guarded Control Stack", [ESR_ELx_EC_SERROR] = "SError", [ESR_ELx_EC_BREAKPT_LOW] = "Breakpoint (lower EL)", [ESR_ELx_EC_BREAKPT_CUR] = "Breakpoint (current EL)", -- cgit v1.2.3 From fc84bc5378a8c852308fa022957b8976adb5aa6a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:00 +0100 Subject: arm64/gcs: Context switch GCS state for EL0 There are two registers controlling the GCS state of EL0, GCSPR_EL0 which is the current GCS pointer and GCSCRE0_EL1 which has enable bits for the specific GCS functionality enabled for EL0. Manage these on context switch and process lifetime events, GCS is reset on exec(). Also ensure that any changes to the GCS memory are visible to other PEs and that changes from other PEs are visible on this one by issuing a GCSB DSYNC when moving to or from a thread with GCS. Since the current GCS configuration of a thread will be visible to userspace we store the configuration in the format used with userspace and provide a helper which configures the system register as needed. On systems that support GCS we always allow access to GCSPR_EL0, this facilitates reporting of GCS faults if userspace implements disabling of GCS on error - the GCS can still be discovered and examined even if GCS has been disabled. 
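Userspace controls this configuration via the prctl() interface added elsewhere in the series; the constants below and their values are assumptions based on that interface rather than part of this patch. A sketch of querying the current mode:

#include <stdio.h>
#include <sys/prctl.h>

/* Assumed values from the shadow stack prctl() interface */
#ifndef PR_GET_SHADOW_STACK_STATUS
#define PR_GET_SHADOW_STACK_STATUS      74
#define PR_SHADOW_STACK_ENABLE          (1UL << 0)
#endif

int main(void)
{
        unsigned long status;

        if (prctl(PR_GET_SHADOW_STACK_STATUS, &status, 0, 0, 0))
                return 1;       /* no kernel or CPU support */

        printf("GCS %s\n",
               (status & PR_SHADOW_STACK_ENABLE) ? "enabled" : "disabled");
        return 0;
}

Note that enabling GCS mid-process is delicate; once enabled, subsequent returns must find matching GCS entries, so real programs enable it as early as possible (typically in the dynamic linker).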
Reviewed-by: Catalin Marinas Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-21-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 24 +++++++++++++++ arch/arm64/include/asm/processor.h | 6 ++++ arch/arm64/kernel/process.c | 62 ++++++++++++++++++++++++++++++++++++++ arch/arm64/mm/Makefile | 1 + arch/arm64/mm/gcs.c | 42 ++++++++++++++++++++++++++ 5 files changed, 135 insertions(+) create mode 100644 arch/arm64/mm/gcs.c (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 7c5e95218db6..04594ef59dad 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -48,4 +48,28 @@ static inline u64 gcsss2(void) return Xt; } +#ifdef CONFIG_ARM64_GCS + +static inline bool task_gcs_el0_enabled(struct task_struct *task) +{ + return current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE; +} + +void gcs_set_el0_mode(struct task_struct *task); +void gcs_free(struct task_struct *task); +void gcs_preserve_current_state(void); + +#else + +static inline bool task_gcs_el0_enabled(struct task_struct *task) +{ + return false; +} + +static inline void gcs_set_el0_mode(struct task_struct *task) { } +static inline void gcs_free(struct task_struct *task) { } +static inline void gcs_preserve_current_state(void) { } + +#endif + #endif diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 1438424f0064..5260788247d8 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -185,6 +185,12 @@ struct thread_struct { u64 svcr; u64 tpidr2_el0; u64 por_el0; +#ifdef CONFIG_ARM64_GCS + unsigned int gcs_el0_mode; + u64 gcspr_el0; + u64 gcs_base; + u64 gcs_size; +#endif }; static inline unsigned int thread_get_vl(struct thread_struct *thread, diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 0540653fbf38..aedcf332f422 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -280,6 +281,25 @@ static void flush_poe(void) write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); } +#ifdef CONFIG_ARM64_GCS + +static void flush_gcs(void) +{ + if (!system_supports_gcs()) + return; + + gcs_free(current); + current->thread.gcs_el0_mode = 0; + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); + write_sysreg_s(0, SYS_GCSPR_EL0); +} + +#else + +static void flush_gcs(void) { } + +#endif + void flush_thread(void) { fpsimd_flush_thread(); @@ -287,6 +307,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); flush_poe(); + flush_gcs(); } void arch_release_task_struct(struct task_struct *tsk) @@ -484,6 +505,46 @@ static void entry_task_switch(struct task_struct *next) __this_cpu_write(__entry_task, next); } +#ifdef CONFIG_ARM64_GCS + +void gcs_preserve_current_state(void) +{ + current->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); +} + +static void gcs_thread_switch(struct task_struct *next) +{ + if (!system_supports_gcs()) + return; + + /* GCSPR_EL0 is always readable */ + gcs_preserve_current_state(); + write_sysreg_s(next->thread.gcspr_el0, SYS_GCSPR_EL0); + + if (current->thread.gcs_el0_mode != next->thread.gcs_el0_mode) + gcs_set_el0_mode(next); + + /* + * Ensure that GCS memory effects of the 'prev' thread are + * ordered before other memory accesses with release semantics + * (or preceded by a DMB) on the current PE. 
In addition, any + * memory accesses with acquire semantics (or succeeded by a + * DMB) are ordered before GCS memory effects of the 'next' + * thread. This will ensure that the GCS memory effects are + * visible to other PEs in case of migration. + */ + if (task_gcs_el0_enabled(current) || task_gcs_el0_enabled(next)) + gcsb_dsync(); +} + +#else + +static void gcs_thread_switch(struct task_struct *next) +{ +} + +#endif + + /* * Handle sysreg updates for ARM erratum 1418040 which affects the 32bit view of * CNTVCT, various other errata which require trapping all CNTVCT{,_EL0} @@ -580,6 +641,7 @@ struct task_struct *__switch_to(struct task_struct *prev, cntkctl_thread_switch(prev, next); ptrauth_thread_switch_user(next); permission_overlay_switch(next); + gcs_thread_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 2fc8c6dd0407..fc92170a8f37 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_TRANS_TABLE) += trans_pgd.o obj-$(CONFIG_TRANS_TABLE) += trans_pgd-asm.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_ARM64_MTE) += mteswap.o +obj-$(CONFIG_ARM64_GCS) += gcs.o KASAN_SANITIZE_physaddr.o += n obj-$(CONFIG_KASAN) += kasan_init.o diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c new file mode 100644 index 000000000000..f8f4f984a247 --- /dev/null +++ b/arch/arm64/mm/gcs.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +#include +#include + +/* + * Apply the GCS mode configured for the specified task to the + * hardware. + */ +void gcs_set_el0_mode(struct task_struct *task) +{ + u64 gcscre0_el1 = GCSCRE0_EL1_nTR; + + if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE) + gcscre0_el1 |= GCSCRE0_EL1_RVCHKEN | GCSCRE0_EL1_PCRSEL; + + if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_WRITE) + gcscre0_el1 |= GCSCRE0_EL1_STREn; + + if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_PUSH) + gcscre0_el1 |= GCSCRE0_EL1_PUSHMEn; + + write_sysreg_s(gcscre0_el1, SYS_GCSCRE0_EL1); +} + +void gcs_free(struct task_struct *task) +{ + if (!system_supports_gcs()) + return; + + if (task->thread.gcs_base) + vm_munmap(task->thread.gcs_base, task->thread.gcs_size); + + task->thread.gcspr_el0 = 0; + task->thread.gcs_base = 0; + task->thread.gcs_size = 0; +} -- cgit v1.2.3 From 506496bcbb4204c9ff5cfe82b1b90e1f14366992 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:01 +0100 Subject: arm64/gcs: Ensure that new threads have a GCS When a new thread is created by a thread with GCS enabled the GCS needs to be specified along with the regular stack. Unfortunately plain clone() is not extensible, and existing clone3() users will not specify a stack, so all existing code would be broken if we mandated specifying the stack explicitly. For compatibility with these cases, and with x86 (which did not initially implement clone3() support for shadow stacks), we allocate a GCS implicitly: when a thread is created with GCS enabled and no GCS is specified, one is allocated for it. We follow the extensively discussed x86 implementation and allocate min(RLIMIT_STACK/2, 2G). Since the GCS only stores the call stack and not any variables this should be more than sufficient for most applications. GCSs allocated via this mechanism will be freed when the thread exits.
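The default sizing is easy to mirror when budgeting address space for threads. A sketch of the same clamp as applied when no thread stack size is passed (userspace arithmetic only; the kernel logic is in the gcs_size() hunk below):

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

#define SZ_2G   (2UL * 1024 * 1024 * 1024)

/* Mirror the kernel default: RLIMIT_STACK/2, clamped to [PAGE_SIZE, 2G] */
static unsigned long default_gcs_size(void)
{
        unsigned long page = sysconf(_SC_PAGESIZE);
        struct rlimit rl;
        unsigned long size;

        getrlimit(RLIMIT_STACK, &rl);
        size = rl.rlim_cur / 2;
        if (size > SZ_2G)
                size = SZ_2G;
        size = (size + page - 1) & ~(page - 1); /* page align */
        return size > page ? size : page;
}

int main(void)
{
        printf("default per-thread GCS: %lu bytes\n", default_gcs_size());
        return 0;
}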
Reviewed-by: Thiago Jung Bauermann Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-22-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 9 +++++ arch/arm64/include/asm/mmu_context.h | 9 +++++ arch/arm64/kernel/process.c | 32 +++++++++++++++++ arch/arm64/mm/gcs.c | 69 ++++++++++++++++++++++++++++++++++++ 4 files changed, 119 insertions(+) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 04594ef59dad..c1f274fdb9c0 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -8,6 +8,8 @@ #include #include +struct kernel_clone_args; + static inline void gcsb_dsync(void) { asm volatile(".inst 0xd503227f" : : : "memory"); @@ -58,6 +60,8 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) void gcs_set_el0_mode(struct task_struct *task); void gcs_free(struct task_struct *task); void gcs_preserve_current_state(void); +unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args); #else @@ -69,6 +73,11 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) static inline void gcs_set_el0_mode(struct task_struct *task) { } static inline void gcs_free(struct task_struct *task) { } static inline void gcs_preserve_current_state(void) { } +static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args) +{ + return -ENOTSUPP; +} #endif diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 7c09d47e09cb..48b3d9553b67 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -311,6 +312,14 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return por_el0_allows_pkey(vma_pkey(vma), write, execute); } +#define deactivate_mm deactivate_mm +static inline void deactivate_mm(struct task_struct *tsk, + struct mm_struct *mm) +{ + gcs_free(tsk); +} + + #include #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index aedcf332f422..fdd095480c3f 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -294,9 +294,35 @@ static void flush_gcs(void) write_sysreg_s(0, SYS_GCSPR_EL0); } +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + unsigned long gcs; + + if (!system_supports_gcs()) + return 0; + + p->thread.gcs_base = 0; + p->thread.gcs_size = 0; + + gcs = gcs_alloc_thread_stack(p, args); + if (IS_ERR_VALUE(gcs)) + return PTR_ERR((void *)gcs); + + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; + + return 0; +} + #else static void flush_gcs(void) { } +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + return 0; +} #endif @@ -313,6 +339,7 @@ void flush_thread(void) void arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); + gcs_free(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) @@ -376,6 +403,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); + int ret; memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); 
@@ -420,6 +448,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) p->thread.uw.tp_value = tls; p->thread.tpidr2_el0 = 0; } + + ret = copy_thread_gcs(p, args); + if (ret != 0) + return ret; } else { /* * A kthread has no context to ERET to, so ensure any buggy diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c index f8f4f984a247..3c7a18f57ea9 100644 --- a/arch/arm64/mm/gcs.c +++ b/arch/arm64/mm/gcs.c @@ -5,9 +5,69 @@ #include #include +#include #include +#include #include +static unsigned long alloc_gcs(unsigned long addr, unsigned long size) +{ + int flags = MAP_ANONYMOUS | MAP_PRIVATE; + struct mm_struct *mm = current->mm; + unsigned long mapped_addr, unused; + + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + mmap_write_lock(mm); + mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, + VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); + mmap_write_unlock(mm); + + return mapped_addr; +} + +static unsigned long gcs_size(unsigned long size) +{ + if (size) + return PAGE_ALIGN(size); + + /* Allocate RLIMIT_STACK/2 with limits of PAGE_SIZE..2G */ + size = PAGE_ALIGN(min_t(unsigned long long, + rlimit(RLIMIT_STACK) / 2, SZ_2G)); + return max(PAGE_SIZE, size); +} + +unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args) +{ + unsigned long addr, size; + + if (!system_supports_gcs()) + return 0; + + if (!task_gcs_el0_enabled(tsk)) + return 0; + + if ((args->flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM) { + tsk->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); + return 0; + } + + size = args->stack_size / 2; + + size = gcs_size(size); + addr = alloc_gcs(0, size); + if (IS_ERR_VALUE(addr)) + return addr; + + tsk->thread.gcs_base = addr; + tsk->thread.gcs_size = size; + tsk->thread.gcspr_el0 = addr + size - sizeof(u64); + + return addr; +} + /* * Apply the GCS mode configured for the specified task to the * hardware. @@ -33,6 +93,15 @@ void gcs_free(struct task_struct *task) if (!system_supports_gcs()) return; + /* + * When fork() with CLONE_VM fails, the child (tsk) already + * has a GCS allocated, and exit_thread() calls this function + * to free it. In this case the parent (current) and the + * child share the same mm struct. + */ + if (!task->mm || task->mm != current->mm) + return; + if (task->thread.gcs_base) vm_munmap(task->thread.gcs_base, task->thread.gcs_size); -- cgit v1.2.3 From eaf62ce1563b8557e3550acb97d5086120168750 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:04 +0100 Subject: arm64/signal: Set up and restore the GCS context for signal handlers When invoking a signal handler we use the GCS configuration and stack for the current thread. Since we implement signal return by calling the signal handler with a return address set up pointing to a trampoline in the vDSO we need to also configure any active GCS for this by pushing a frame for the trampoline onto the GCS. If we do not do this then signal return will generate a GCS protection fault. In order to guard against attempts to bypass GCS protections via signal return we only allow returning with GCSPR_EL0 pointing to an address where it was previously preempted by a signal. We do this by pushing a cap onto the GCS, this takes the form of an architectural GCS cap token with the top bit set and token type of 0 which we add on signal entry and validate and pop off on signal return. The combination of the top bit being set and the token type mean that this can't be interpreted as a valid token or address. 
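To make the bookkeeping concrete, a sketch of the resulting GCS layout (addresses are illustrative; each entry is 8 bytes, and GCS_SIGNAL_CAP() is the mask defined in the diff below):

/*
 * Illustration only, not kernel code. Assume GCSPR_EL0 was 0x8000f010
 * when the signal arrived:
 *
 *   0x8000f010: entries of the interrupted call chain
 *   0x8000f008: GCS_SIGNAL_CAP(0x8000f008)   <- cap token pushed on entry
 *   0x8000f000: vDSO sigtramp address        <- pushed on entry
 *                 ^ GCSPR_EL0 while the handler runs
 *
 * The handler's final RET consumes the sigtramp entry, leaving
 * GCSPR_EL0 at 0x8000f008; sigreturn validates that the token there
 * matches its own slot address, zeroes it to prevent reuse, and
 * increments GCSPR_EL0 past it.
 */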
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-25-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 1 + arch/arm64/kernel/signal.c | 118 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 114 insertions(+), 5 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 48c97e63e56a..f50660603ecf 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -9,6 +9,7 @@ #include struct kernel_clone_args; +struct ksignal; static inline void gcsb_dsync(void) { diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 561986947530..b5ab0e229a78 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,15 @@ #include #include +#ifdef CONFIG_ARM64_GCS +#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) + +static bool gcs_signal_cap_valid(u64 addr, u64 val) +{ + return val == GCS_SIGNAL_CAP(addr); +} +#endif + /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. */ @@ -904,6 +914,58 @@ static int restore_sigframe(struct pt_regs *regs, return err; } +#ifdef CONFIG_ARM64_GCS +static int gcs_restore_signal(void) +{ + unsigned long __user *gcspr_el0; + u64 cap; + int ret; + + if (!system_supports_gcs()) + return 0; + + if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) + return 0; + + gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Ensure that any changes to the GCS done via GCS operations + * are visible to the normal reads we do to validate the + * token. + */ + gcsb_dsync(); + + /* + * GCSPR_EL0 should be pointing at a capped GCS, read the cap. + * We don't enforce that this is in a GCS page, if it is not + * then faults will be generated on GCS operations - the main + * concern is to protect GCS pages. + */ + ret = copy_from_user(&cap, gcspr_el0, sizeof(cap)); + if (ret) + return -EFAULT; + + /* + * Check that the cap is the actual GCS before replacing it. + */ + if (!gcs_signal_cap_valid((u64)gcspr_el0, cap)) + return -EINVAL; + + /* Invalidate the token to prevent reuse */ + put_user_gcs(0, (__user void*)gcspr_el0, &ret); + if (ret != 0) + return -EFAULT; + + write_sysreg_s(gcspr_el0 + 1, SYS_GCSPR_EL0); + + return 0; +} + +#else +static int gcs_restore_signal(void) { return 0; } +#endif + SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); @@ -927,6 +989,9 @@ SYSCALL_DEFINE0(rt_sigreturn) if (restore_sigframe(regs, frame)) goto badframe; + if (gcs_restore_signal()) + goto badframe; + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; @@ -1189,7 +1254,48 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, return 0; } -static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, +#ifdef CONFIG_ARM64_GCS + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + unsigned long __user *gcspr_el0; + int ret = 0; + + if (!system_supports_gcs()) + return 0; + + if (!task_gcs_el0_enabled(current)) + return 0; + + /* + * We are entering a signal handler, current register state is + * active. + */ + gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Push a cap and the GCS entry for the trampoline onto the GCS. 
+ */ + put_user_gcs((unsigned long)sigtramp, gcspr_el0 - 2, &ret); + put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 1), gcspr_el0 - 1, &ret); + if (ret != 0) + return ret; + + gcspr_el0 -= 2; + write_sysreg_s((unsigned long)gcspr_el0, SYS_GCSPR_EL0); + + return 0; +} +#else + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + return 0; +} + +#endif + +static int setup_return(struct pt_regs *regs, struct ksignal *ksig, struct rt_sigframe_user_layout *user, int usig) { __sigrestore_t sigtramp; @@ -1197,7 +1303,7 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, regs->regs[0] = usig; regs->sp = (unsigned long)user->sigframe; regs->regs[29] = (unsigned long)&user->next_frame->fp; - regs->pc = (unsigned long)ka->sa.sa_handler; + regs->pc = (unsigned long)ksig->ka.sa.sa_handler; /* * Signal delivery is a (wacky) indirect function call in @@ -1240,12 +1346,14 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, if (system_supports_poe()) write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); - if (ka->sa.sa_flags & SA_RESTORER) - sigtramp = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + sigtramp = ksig->ka.sa.sa_restorer; else sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); regs->regs[30] = (unsigned long)sigtramp; + + return gcs_signal_entry(sigtramp, ksig); } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, @@ -1268,7 +1376,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, err |= __save_altstack(&frame->uc.uc_stack, regs->sp); err |= setup_sigframe(&user, regs, set); if (err == 0) { - setup_return(regs, &ksig->ka, &user, usig); + err = setup_return(regs, ksig, &user, usig); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(&frame->info, &ksig->info); regs->regs[1] = (unsigned long)&frame->info; -- cgit v1.2.3 From 16f47bb9ac8afe09e7ca14cc53748f779b2a12e0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:05 +0100 Subject: arm64/signal: Expose GCS state in signal frames Add a context for the GCS state and include it in the signal context when running on a system that supports GCS. We reuse the same flags that the prctl() uses to specify which GCS features are enabled and also provide the current GCS pointer. We do not support enabling GCS via signal return: there is a conflict between specifying GCSPR_EL0 and allocation of a new GCS, and this is not an anticipated use case. We also enforce GCS configuration locking on signal return.
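A signal handler can locate the new record by walking the _aarch64_ctx list in the signal frame, just as for the other arm64 context records. A sketch, with GCS_MAGIC and struct gcs_context defined locally in case the uapi headers predate this patch (a full parser would also follow EXTRA_MAGIC records, which this sketch skips):

#include <stdint.h>
#include <stdio.h>
#include <ucontext.h>
#include <asm/sigcontext.h>

#ifndef GCS_MAGIC
#define GCS_MAGIC       0x47435300

struct gcs_context {
        struct _aarch64_ctx head;
        uint64_t gcspr;
        uint64_t features_enabled;
        uint64_t reserved;
};
#endif

/* Walk the __reserved area of the sigcontext looking for GCS_MAGIC */
static void dump_gcs_record(ucontext_t *uc)
{
        struct _aarch64_ctx *head =
                (struct _aarch64_ctx *)uc->uc_mcontext.__reserved;

        while (head->magic) {
                if (head->magic == GCS_MAGIC) {
                        struct gcs_context *gcs = (struct gcs_context *)head;

                        printf("GCSPR: 0x%llx mode: 0x%llx\n",
                               (unsigned long long)gcs->gcspr,
                               (unsigned long long)gcs->features_enabled);
                        return;
                }
                head = (struct _aarch64_ctx *)((char *)head + head->size);
        }
}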
Reviewed-by: Catalin Marinas Reviewed-by: Thiago Jung Bauermann Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-26-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/uapi/asm/sigcontext.h | 9 +++ arch/arm64/kernel/signal.c | 109 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index bb7af77a30a7..d42f7a92238b 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -183,6 +183,15 @@ struct zt_context { __u16 __reserved[3]; }; +#define GCS_MAGIC 0x47435300 + +struct gcs_context { + struct _aarch64_ctx head; + __u64 gcspr; + __u64 features_enabled; + __u64 reserved; +}; + #endif /* !__ASSEMBLY__ */ #include diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index b5ab0e229a78..62d666278264 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -66,6 +66,7 @@ struct rt_sigframe_user_layout { unsigned long fpsimd_offset; unsigned long esr_offset; + unsigned long gcs_offset; unsigned long sve_offset; unsigned long tpidr2_offset; unsigned long za_offset; @@ -198,6 +199,8 @@ struct user_ctxs { u32 fpmr_size; struct poe_context __user *poe; u32 poe_size; + struct gcs_context __user *gcs; + u32 gcs_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) @@ -643,6 +646,82 @@ extern int restore_zt_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SME */ +#ifdef CONFIG_ARM64_GCS + +static int preserve_gcs_context(struct gcs_context __user *ctx) +{ + int err = 0; + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * If GCS is enabled we will add a cap token to the frame, + * include it in the GCSPR_EL0 we report to support stack + * switching via sigreturn if GCS is enabled. We do not allow + * enabling via sigreturn so the token is only relevant for + * threads with GCS enabled. + */ + if (task_gcs_el0_enabled(current)) + gcspr -= 8; + + __put_user_error(GCS_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(gcspr, &ctx->gcspr, err); + __put_user_error(0, &ctx->reserved, err); + __put_user_error(current->thread.gcs_el0_mode, + &ctx->features_enabled, err); + + return err; +} + +static int restore_gcs_context(struct user_ctxs *user) +{ + u64 gcspr, enabled; + int err = 0; + + if (user->gcs_size != sizeof(*user->gcs)) + return -EINVAL; + + __get_user_error(gcspr, &user->gcs->gcspr, err); + __get_user_error(enabled, &user->gcs->features_enabled, err); + if (err) + return err; + + /* Don't allow unknown modes */ + if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + err = gcs_check_locked(current, enabled); + if (err != 0) + return err; + + /* Don't allow enabling */ + if (!task_gcs_el0_enabled(current) && + (enabled & PR_SHADOW_STACK_ENABLE)) + return -EINVAL; + + /* If we are disabling disable everything */ + if (!(enabled & PR_SHADOW_STACK_ENABLE)) + enabled = 0; + + current->thread.gcs_el0_mode = enabled; + + /* + * We let userspace set GCSPR_EL0 to anything here, we will + * validate later in gcs_restore_signal(). + */ + write_sysreg_s(gcspr, SYS_GCSPR_EL0); + + return 0; +} + +#else /* ! 
CONFIG_ARM64_GCS */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_gcs_context(void __user *ctx); +extern int restore_gcs_context(struct user_ctxs *user); + +#endif /* ! CONFIG_ARM64_GCS */ + static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) { @@ -661,6 +740,7 @@ static int parse_user_sigframe(struct user_ctxs *user, user->zt = NULL; user->fpmr = NULL; user->poe = NULL; + user->gcs = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -777,6 +857,17 @@ static int parse_user_sigframe(struct user_ctxs *user, user->fpmr_size = size; break; + case GCS_MAGIC: + if (!system_supports_gcs()) + goto invalid; + + if (user->gcs) + goto invalid; + + user->gcs = (struct gcs_context __user *)head; + user->gcs_size = size; + break; + case EXTRA_MAGIC: if (have_extra_context) goto invalid; @@ -896,6 +987,9 @@ static int restore_sigframe(struct pt_regs *regs, err = restore_fpsimd_context(&user); } + if (err == 0 && system_supports_gcs() && user.gcs) + err = restore_gcs_context(&user); + if (err == 0 && system_supports_tpidr2() && user.tpidr2) err = restore_tpidr2_context(&user); @@ -1029,6 +1123,15 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } +#ifdef CONFIG_ARM64_GCS + if (system_supports_gcs() && (add_all || current->thread.gcspr_el0)) { + err = sigframe_alloc(user, &user->gcs_offset, + sizeof(struct gcs_context)); + if (err) + return err; + } +#endif + if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; @@ -1136,6 +1239,12 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } + if (system_supports_gcs() && err == 0 && user->gcs_offset) { + struct gcs_context __user *gcs_ctx = + apply_user_offset(user, user->gcs_offset); + err |= preserve_gcs_context(gcs_ctx); + } + /* Scalable Vector Extension state (including streaming), if present */ if ((system_supports_sve() || system_supports_sme()) && err == 0 && user->sve_offset) { -- cgit v1.2.3 From 7ec3b57cb29f8371bf12a725b6e8f75831a03f27 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:06 +0100 Subject: arm64/ptrace: Expose GCS via ptrace and core files Provide a new register type NT_ARM_GCS reporting the current GCS mode and pointer for EL0. Due to the interactions with allocation and deallocation of Guarded Control Stacks we do not permit any changes to the GCS mode via ptrace, only GCSPR_EL0 may be changed. 
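A debugger reads the new regset with PTRACE_GETREGSET and an iovec, as with the other arm64 regsets. A sketch against a stopped tracee (the struct is mirrored locally under an assumed name in case <asm/ptrace.h> predates this patch):

#include <elf.h>
#include <stdint.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>

#ifndef NT_ARM_GCS
#define NT_ARM_GCS      0x410   /* value added to uapi/linux/elf.h below */
#endif

/* Local mirror of struct user_gcs from the patch below */
struct user_gcs_regs {
        uint64_t features_enabled;
        uint64_t features_locked;
        uint64_t gcspr_el0;
};

/* Read the GCS state of a stopped tracee */
static long read_gcs(pid_t pid, struct user_gcs_regs *regs)
{
        struct iovec iov = {
                .iov_base       = regs,
                .iov_len        = sizeof(*regs),
        };

        return ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &iov);
}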
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-27-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/uapi/asm/ptrace.h | 8 +++++ arch/arm64/kernel/ptrace.c | 62 +++++++++++++++++++++++++++++++++++- include/uapi/linux/elf.h | 1 + 3 files changed, 70 insertions(+), 1 deletion(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 7fa2f7036aa7..0f39ba4f3efd 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -324,6 +324,14 @@ struct user_za_header { #define ZA_PT_SIZE(vq) \ (ZA_PT_ZA_OFFSET + ZA_PT_ZA_SIZE(vq)) +/* GCS state (NT_ARM_GCS) */ + +struct user_gcs { + __u64 features_enabled; + __u64 features_locked; + __u64 gcspr_el0; +}; + #endif /* __ASSEMBLY__ */ #endif /* _UAPI__ASM_PTRACE_H */ diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index b756578aeaee..6c1dcfe6d25a 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1473,6 +1474,52 @@ static int poe_set(struct task_struct *target, const struct } #endif +#ifdef CONFIG_ARM64_GCS +static int gcs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + if (target == current) + gcs_preserve_current_state(); + + user_gcs.features_enabled = target->thread.gcs_el0_mode; + user_gcs.features_locked = target->thread.gcs_el0_locked; + user_gcs.gcspr_el0 = target->thread.gcspr_el0; + + return membuf_write(&to, &user_gcs, sizeof(user_gcs)); +} + +static int gcs_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1); + if (ret) + return ret; + + if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + target->thread.gcs_el0_mode = user_gcs.features_enabled; + target->thread.gcs_el0_locked = user_gcs.features_locked; + target->thread.gcspr_el0 = user_gcs.gcspr_el0; + + return 0; +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1503,7 +1550,10 @@ enum aarch64_regset { REGSET_TAGGED_ADDR_CTRL, #endif #ifdef CONFIG_ARM64_POE - REGSET_POE + REGSET_POE, +#endif +#ifdef CONFIG_ARM64_GCS + REGSET_GCS, #endif }; @@ -1674,6 +1724,16 @@ static const struct user_regset aarch64_regsets[] = { .set = poe_set, }, #endif +#ifdef CONFIG_ARM64_GCS + [REGSET_GCS] = { + .core_note_type = NT_ARM_GCS, + .n = sizeof(struct user_gcs) / sizeof(u64), + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = gcs_get, + .set = gcs_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b9935988da5c..9adc218fb6df 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -443,6 +443,7 @@ typedef struct elf64_shdr { #define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ #define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ #define NT_ARM_POE 0x40f /* ARM POE registers */ +#define NT_ARM_GCS 0x410 /* ARM GCS state */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra 
registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ -- cgit v1.2.3 From acc450aa07099d071b18174c22a1119c57da8227 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:46 +0100 Subject: arm64: probes: Remove broken LDR (literal) uprobe support The simulate_ldr_literal() and simulate_ldrsw_literal() functions are unsafe to use for uprobes. Both functions were originally written for use with kprobes, and access memory with plain C accesses. When uprobes was added, these were reused unmodified even though they cannot safely access user memory. There are three key problems: 1) The plain C accesses do not have corresponding extable entries, and thus if they encounter a fault the kernel will treat these as unintentional accesses to user memory, resulting in a BUG() which will kill the kernel thread, and likely lead to further issues (e.g. lockup or panic()). 2) The plain C accesses are subject to HW PAN and SW PAN, and so when either is in use, any attempt to simulate an access to user memory will fault. Thus neither simulate_ldr_literal() nor simulate_ldrsw_literal() can do anything useful when simulating a user instruction on any system with HW PAN or SW PAN. 3) The plain C accesses are privileged, as they run in kernel context, and in practice can access a small range of kernel virtual addresses. The instructions they simulate have a range of +/-1MiB, and since the simulated instruction must itself be a user instruction in the TTBR0 address range, these can address the final 1MiB of the TTBR1 address range by wrapping downwards from an address in the first 1MiB of the TTBR0 address range. In contemporary kernels the last 8MiB of TTBR1 address range is reserved, and accesses to this will always fault, meaning this is no worse than (1). Historically, it was theoretically possible for the linear map or vmemmap to spill into the final 8MiB of the TTBR1 address range, but in practice this is extremely unlikely to occur as this would require either: * Having enough physical memory to fill the entire linear map all the way to the final 1MiB of the TTBR1 address range. * Getting unlucky with KASLR randomization of the linear map such that the populated region happens to overlap with the last 1MiB of the TTBR1 address range. ... and in either case if we were to spill into the final page there would be larger problems as the final page would alias with error pointers. Practically speaking, (1) and (2) are the big issues. Given there have been no reports of problems since the broken code was introduced, it appears that no-one is relying on probing these instructions with uprobes. Avoid these issues by not allowing uprobes on LDR (literal) and LDRSW (literal), limiting the use of simulate_ldr_literal() and simulate_ldrsw_literal() to kprobes. Attempts to place uprobes on LDR (literal) and LDRSW (literal) will be rejected as arm_probe_decode_insn() will return INSN_REJECTED. In future we can consider introducing working uprobes support for these instructions, but this will require more significant work.
Fixes: 9842ceae9fa8 ("arm64: Add uprobe support") Cc: stable@vger.kernel.org Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-2-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/decode-insn.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 968d5fffe233..3496d6169e59 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -99,10 +99,6 @@ arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) aarch64_insn_is_blr(insn) || aarch64_insn_is_ret(insn)) { api->handler = simulate_br_blr_ret; - } else if (aarch64_insn_is_ldr_lit(insn)) { - api->handler = simulate_ldr_literal; - } else if (aarch64_insn_is_ldrsw_lit(insn)) { - api->handler = simulate_ldrsw_literal; } else { /* * Instruction cannot be stepped out-of-line and we don't @@ -140,6 +136,17 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) probe_opcode_t insn = le32_to_cpu(*addr); probe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; + struct arch_probe_insn *api = &asi->api; + + if (aarch64_insn_is_ldr_lit(insn)) { + api->handler = simulate_ldr_literal; + decoded = INSN_GOOD_NO_SLOT; + } else if (aarch64_insn_is_ldrsw_lit(insn)) { + api->handler = simulate_ldrsw_literal; + decoded = INSN_GOOD_NO_SLOT; + } else { + decoded = arm_probe_decode_insn(insn, &asi->api); + } /* * If there's a symbol defined in front of and near enough to @@ -157,7 +164,6 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) else scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE; } - decoded = arm_probe_decode_insn(insn, &asi->api); if (decoded != INSN_REJECTED && scan_end) if (is_probed_address_atomic(addr - 1, scan_end)) -- cgit v1.2.3 From 50f813e57601c22b6f26ced3193b9b94d70a2640 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:47 +0100 Subject: arm64: probes: Fix simulate_ldr*_literal() The simulate_ldr_literal() code always loads a 64-bit quantity, and when simulating a 32-bit load into a 'W' register, it discards the most significant 32 bits. For big-endian kernels this means that the relevant bits are discarded, and the value returned is the subsequent 32 bits in memory (i.e. the value at addr + 4). Additionally, simulate_ldr_literal() and simulate_ldrsw_literal() use a plain C load, which the compiler may tear or elide (e.g. if the target is the zero register). Today this doesn't happen to matter, but it may matter in future if trampoline code uses a LDR (literal) or LDRSW (literal). Update simulate_ldr_literal() and simulate_ldrsw_literal() to use an appropriately-sized READ_ONCE() to perform the access, which avoids these problems.
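For reference, the fields the simulation depends on are easy to pull out of the encoding. A sketch of the decode, following the Arm ARM's LDR (literal) layout (imm19 in bits [23:5], size in bits [31:30]):

#include <stdint.h>

/* Byte displacement of an LDR (literal): imm19 is a signed word offset */
static long ldr_lit_displacement(uint32_t insn)
{
        long imm19 = (insn >> 5) & 0x7ffff;

        if (imm19 & (1L << 18))         /* sign-extend the 19-bit field */
                imm19 -= 1L << 19;
        return imm19 * 4;
}

/* Bit 30 set selects the 64-bit X-register form, as tested above via
 * opcode & (1 << 30); the 32-bit form must load exactly 32 bits, which
 * is what the sized READ_ONCE() in the fix enforces. */
static int ldr_lit_is_64bit(uint32_t insn)
{
        return (insn >> 30) & 1;
}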
Fixes: 39a67d49ba35 ("arm64: kprobes instruction simulation support") Cc: stable@vger.kernel.org Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-3-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/simulate-insn.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 22d0b3252476..b65334ab79d2 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -171,17 +171,15 @@ simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs) void __kprobes simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs) { - u64 *load_addr; + unsigned long load_addr; int xn = opcode & 0x1f; - int disp; - disp = ldr_displacement(opcode); - load_addr = (u64 *) (addr + disp); + load_addr = addr + ldr_displacement(opcode); if (opcode & (1 << 30)) /* x0-x30 */ - set_x_reg(regs, xn, *load_addr); + set_x_reg(regs, xn, READ_ONCE(*(u64 *)load_addr)); else /* w0-w30 */ - set_w_reg(regs, xn, *load_addr); + set_w_reg(regs, xn, READ_ONCE(*(u32 *)load_addr)); instruction_pointer_set(regs, instruction_pointer(regs) + 4); } @@ -189,14 +187,12 @@ simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs) void __kprobes simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs) { - s32 *load_addr; + unsigned long load_addr; int xn = opcode & 0x1f; - int disp; - disp = ldr_displacement(opcode); - load_addr = (s32 *) (addr + disp); + load_addr = addr + ldr_displacement(opcode); - set_x_reg(regs, xn, *load_addr); + set_x_reg(regs, xn, READ_ONCE(*(s32 *)load_addr)); instruction_pointer_set(regs, instruction_pointer(regs) + 4); } -- cgit v1.2.3 From 13f8f1e05f1dc36dbba6cba0ae03354c0dafcde7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:48 +0100 Subject: arm64: probes: Fix uprobes for big-endian kernels The arm64 uprobes code is broken for big-endian kernels as it doesn't convert the in-memory instruction encoding (which is always little-endian) into the kernel's native endianness before analyzing and simulating instructions. This may result in a few distinct problems: * The kernel may erroneously reject probing an instruction which can safely be probed. * The kernel may erroneously permit stepping an instruction out-of-line when that instruction cannot be stepped out-of-line safely. * The kernel may erroneously simulate instructions incorrectly due to interpreting the byte-swapped encoding. The endianness mismatch isn't caught by the compiler or sparse because: * The arch_uprobe::{insn,ixol} fields are encoded as arrays of u8, so the compiler and sparse have no idea these contain a little-endian 32-bit value. The core uprobes code populates these with a memcpy() which similarly does not handle endianness. * While the uprobe_opcode_t type is an alias for __le32, both arch_uprobe_analyze_insn() and arch_uprobe_skip_sstep() cast from u8[] to the similarly-named probe_opcode_t, which is an alias for u32. Hence there is no endianness conversion warning. Fix this by changing the arch_uprobe::{insn,ixol} fields to __le32 and adding the appropriate __le32_to_cpu() conversions prior to consuming the instruction encoding. The core uprobes code copies these fields as opaque ranges of bytes, and so is unaffected by this change.
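The underlying rule is that an AArch64 opcode in memory is always little-endian, so any consumer on a big-endian host must byte-swap before decoding. A self-contained sketch of an endian-agnostic fetch (the userspace analogue of le32_to_cpu()):

#include <stdint.h>
#include <string.h>

/* Assemble the opcode from its little-endian bytes regardless of host order */
static uint32_t fetch_insn_le(const void *p)
{
        uint8_t b[4];

        memcpy(b, p, sizeof(b));
        return (uint32_t)b[0] | ((uint32_t)b[1] << 8) |
               ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}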
At the same time, remove MAX_UINSN_BYTES and consistently use AARCH64_INSN_SIZE for clarity. Tested with the following: | #include | #include | | #define noinline __attribute__((noinline)) | | static noinline void *adrp_self(void) | { | void *addr; | | asm volatile( | " adrp %x0, adrp_self\n" | " add %x0, %x0, :lo12:adrp_self\n" | : "=r" (addr)); | } | | | int main(int argc, char *argv) | { | void *ptr = adrp_self(); | bool equal = (ptr == adrp_self); | | printf("adrp_self => %p\n" | "adrp_self() => %p\n" | "%s\n", | adrp_self, ptr, equal ? "EQUAL" : "NOT EQUAL"); | | return 0; | } .... where the adrp_self() function was compiled to: | 00000000004007e0 : | 4007e0: 90000000 adrp x0, 400000 <__ehdr_start> | 4007e4: 911f8000 add x0, x0, #0x7e0 | 4007e8: d65f03c0 ret Before this patch, the ADRP is not recognized, and is assumed to be steppable, resulting in corruption of the result: | # ./adrp-self | adrp_self => 0x4007e0 | adrp_self() => 0x4007e0 | EQUAL | # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events | # echo 1 > /sys/kernel/tracing/events/uprobes/enable | # ./adrp-self | adrp_self => 0x4007e0 | adrp_self() => 0xffffffffff7e0 | NOT EQUAL After this patch, the ADRP is correctly recognized and simulated: | # ./adrp-self | adrp_self => 0x4007e0 | adrp_self() => 0x4007e0 | EQUAL | # | # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events | # echo 1 > /sys/kernel/tracing/events/uprobes/enable | # ./adrp-self | adrp_self => 0x4007e0 | adrp_self() => 0x4007e0 | EQUAL Fixes: 9842ceae9fa8 ("arm64: Add uprobe support") Cc: stable@vger.kernel.org Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-4-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/uprobes.h | 8 +++----- arch/arm64/kernel/probes/uprobes.c | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h index 2b09495499c6..014b02897f8e 100644 --- a/arch/arm64/include/asm/uprobes.h +++ b/arch/arm64/include/asm/uprobes.h @@ -10,11 +10,9 @@ #include #include -#define MAX_UINSN_BYTES AARCH64_INSN_SIZE - #define UPROBE_SWBP_INSN cpu_to_le32(BRK64_OPCODE_UPROBES) #define UPROBE_SWBP_INSN_SIZE AARCH64_INSN_SIZE -#define UPROBE_XOL_SLOT_BYTES MAX_UINSN_BYTES +#define UPROBE_XOL_SLOT_BYTES AARCH64_INSN_SIZE typedef __le32 uprobe_opcode_t; @@ -23,8 +21,8 @@ struct arch_uprobe_task { struct arch_uprobe { union { - u8 insn[MAX_UINSN_BYTES]; - u8 ixol[MAX_UINSN_BYTES]; + __le32 insn; + __le32 ixol; }; struct arch_probe_insn api; bool simulate; diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index d49aef2657cd..a2f137a595fc 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -42,7 +42,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) return -EINVAL; - insn = *(probe_opcode_t *)(&auprobe->insn[0]); + insn = le32_to_cpu(auprobe->insn); switch (arm_probe_decode_insn(insn, &auprobe->api)) { case INSN_REJECTED: @@ -108,7 +108,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) if (!auprobe->simulate) return false; - insn = *(probe_opcode_t *)(&auprobe->insn[0]); + insn = le32_to_cpu(auprobe->insn); addr = instruction_pointer(regs); if (auprobe->api.handler) -- cgit v1.2.3 From 6105c5d46d0b39ccf01e329fc4449ba840c2937d Mon Sep 17 
00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:49 +0100 Subject: arm64: probes: Move kprobes-specific fields We share struct arch_probe_insn between kprobes and uprobes, but most of its fields aren't necessary for uprobes: * The 'insn' field is only used by kprobes as a pointer to the XOL slot. * The 'restore' field is only used by kprobes as the PC to restore after stepping an instruction in the XOL slot. * The 'pstate_cc' field isn't used by kprobes or uprobes, and seems to only exist as a result of copy-pasting the 32-bit arm implementation of kprobes. As these fields live in struct arch_probe_insn they cannot use definitions that only exist when CONFIG_KPROBES=y, such as the kprobe_opcode_t typedef, which we'd like to use in subsequent patches. Clean this up by removing the 'pstate_cc' field, and moving the kprobes-specific fields into the kprobes-specific struct arch_specific_insn. To make it clear that the fields are related to stepping instructions in the XOL slot, 'insn' is renamed to 'xol_insn' and 'restore' is renamed to 'xol_restore'. At the same time, remove the misleading and useless comment above struct arch_probe_insn. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/probes.h | 8 +++----- arch/arm64/kernel/probes/kprobes.c | 30 +++++++++++++++--------------- 2 files changed, 18 insertions(+), 20 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 006946745352..4aa54322794d 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -12,18 +12,16 @@ typedef u32 probe_opcode_t; typedef void (probes_handler_t) (u32 opcode, long addr, struct pt_regs *); -/* architecture specific copy of original instruction */ struct arch_probe_insn { - probe_opcode_t *insn; - pstate_check_t *pstate_cc; probes_handler_t *handler; - /* restore address after step xol */ - unsigned long restore; }; #ifdef CONFIG_KPROBES typedef u32 kprobe_opcode_t; struct arch_specific_insn { struct arch_probe_insn api; + probe_opcode_t *xol_insn; + /* restore address after step xol */ + unsigned long xol_restore; }; #endif diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 4268678d0e86..222419a41a40 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -43,7 +43,7 @@ post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { - kprobe_opcode_t *addr = p->ainsn.api.insn; + kprobe_opcode_t *addr = p->ainsn.xol_insn; /* * Prepare insn slot, Mark Rutland points out it depends on a coupe of * debug sequences: @@ -70,14 +70,14 @@ static void __kprobes arch_prepare_ss_slot(struct kprobe *p) /* * Needs restoring of return address after stepping xol. */ - p->ainsn.api.restore = (unsigned long) p->addr + + p->ainsn.xol_restore = (unsigned long) p->addr + sizeof(kprobe_opcode_t); } static void __kprobes arch_prepare_simulate(struct kprobe *p) { /* This instructions is not executed xol.
No need to adjust the PC */ - p->ainsn.api.restore = 0; + p->ainsn.xol_restore = 0; } static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) @@ -110,18 +110,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; case INSN_GOOD_NO_SLOT: /* insn need simulation */ - p->ainsn.api.insn = NULL; + p->ainsn.xol_insn = NULL; break; case INSN_GOOD: /* instruction uses slot */ - p->ainsn.api.insn = get_insn_slot(); - if (!p->ainsn.api.insn) + p->ainsn.xol_insn = get_insn_slot(); + if (!p->ainsn.xol_insn) return -ENOMEM; break; } /* prepare the instruction */ - if (p->ainsn.api.insn) + if (p->ainsn.xol_insn) arch_prepare_ss_slot(p); else arch_prepare_simulate(p); @@ -148,9 +148,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - if (p->ainsn.api.insn) { - free_insn_slot(p->ainsn.api.insn, 0); - p->ainsn.api.insn = NULL; + if (p->ainsn.xol_insn) { + free_insn_slot(p->ainsn.xol_insn, 0); + p->ainsn.xol_insn = NULL; } } @@ -205,9 +205,9 @@ static void __kprobes setup_singlestep(struct kprobe *p, } - if (p->ainsn.api.insn) { + if (p->ainsn.xol_insn) { /* prepare for single stepping */ - slot = (unsigned long)p->ainsn.api.insn; + slot = (unsigned long)p->ainsn.xol_insn; kprobes_save_local_irqflag(kcb, regs); instruction_pointer_set(regs, slot); @@ -245,8 +245,8 @@ static void __kprobes post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb, struct pt_regs *regs) { /* return addr restore if non-branching insn */ - if (cur->ainsn.api.restore != 0) - instruction_pointer_set(regs, cur->ainsn.api.restore); + if (cur->ainsn.xol_restore != 0) - instruction_pointer_set(regs, cur->ainsn.xol_restore); + instruction_pointer_set(regs, cur->ainsn.xol_restore); /* restore back original saved kprobe variables and continue */ if (kcb->kprobe_status == KPROBE_REENTER) { @@ -348,7 +348,7 @@ kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr) struct kprobe *cur = kprobe_running(); if (cur && (kcb->kprobe_status & (KPROBE_HIT_SS | KPROBE_REENTER)) && - ((unsigned long)&cur->ainsn.api.insn[1] == addr)) { + ((unsigned long)&cur->ainsn.xol_insn[1] == addr)) { kprobes_restore_local_irqflag(kcb, regs); post_kprobe_handler(cur, kcb, regs); -- cgit v1.2.3 From dd0eb50e7c71fdd74f2ab0ef4b60bd6b27bbc670 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:50 +0100 Subject: arm64: probes: Cleanup kprobes endianness conversions The core kprobes code uses kprobe_opcode_t for the in-memory representation of an instruction, using 'kprobe_opcode_t *' for XOL slots. As arm64 instructions are always little-endian 32-bit values, kprobe_opcode_t should be __le32, but at the moment kprobe_opcode_t is typedef'd to u32. Today there is no functional issue as we convert values via cpu_to_le32() and le32_to_cpu() where necessary, but these conversions are inconsistent with the types used, causing sparse warnings: | CHECK arch/arm64/kernel/probes/kprobes.c | arch/arm64/kernel/probes/kprobes.c:102:21: warning: cast to restricted __le32 | CHECK arch/arm64/kernel/probes/decode-insn.c | arch/arm64/kernel/probes/decode-insn.c:122:46: warning: cast to restricted __le32 | arch/arm64/kernel/probes/decode-insn.c:124:50: warning: cast to restricted __le32 | arch/arm64/kernel/probes/decode-insn.c:136:31: warning: cast to restricted __le32 Improve this by making kprobe_opcode_t a typedef for __le32 and consistently using this for pointers to executable instructions. With this change we can rely on the type system to tell us where conversions are necessary.
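[ Editor's note: an illustrative userspace analogue of the __le32 discipline described above; the wrapper type and helper name are invented. In the kernel, __le32 is a sparse 'bitwise' annotation, so the checking is done by sparse rather than the compiler; wrapping the value in a struct gives the same effect in plain C. ]

| #include <stdint.h>
| #include <stdio.h>
|
| /* Keep the little-endian wire value in a distinct type so it cannot
|  * be used as a plain integer by accident; any missed conversion is
|  * then a compile error rather than a silent endianness bug. */
| typedef struct { uint32_t v; } le32;
|
| static uint32_t le32_to_cpu_ex(le32 x)
| {
| 	const uint8_t *p = (const uint8_t *)&x.v;
| 	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
| 	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
| }
|
| int main(void)
| {
| 	le32 opcode = { 0 };
| 	/* uint32_t bad = opcode;   <- would not compile: good */
| 	uint32_t ok = le32_to_cpu_ex(opcode);	/* explicit conversion */
| 	printf("%u\n", ok);
| 	return 0;
| }

The annotation costs nothing at runtime; it only moves the burden of finding missing conversions from code review onto the type checker, which is exactly the point of the cleanup above.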
Since kprobe::opcode is changed from u32 to __le32, the existing le32_to_cpu() conversion moves from the point this is initialized (in arch_prepare_kprobe()) to the points this is consumed when passed to a handler or text patching function. As kprobe::opcode isn't altered or consumed elsewhere, this shouldn't result in a functional change. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/probes.h | 4 ++-- arch/arm64/kernel/probes/decode-insn.c | 2 +- arch/arm64/kernel/probes/kprobes.c | 9 +++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 4aa54322794d..11e809733b7d 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -16,10 +16,10 @@ struct arch_probe_insn { probes_handler_t *handler; }; #ifdef CONFIG_KPROBES -typedef u32 kprobe_opcode_t; +typedef __le32 kprobe_opcode_t; struct arch_specific_insn { struct arch_probe_insn api; - probe_opcode_t *xol_insn; + kprobe_opcode_t *xol_insn; /* restore address after step xol */ unsigned long xol_restore; }; diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 3496d6169e59..147d6ddf3a4c 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -134,7 +134,7 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { enum probe_insn decoded; probe_opcode_t insn = le32_to_cpu(*addr); - probe_opcode_t *scan_end = NULL; + kprobe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; struct arch_probe_insn *api = &asi->api; diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 222419a41a40..48d88e07611d 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -64,7 +64,7 @@ static void __kprobes arch_prepare_ss_slot(struct kprobe *p) * the BRK exception handler, so it is unnecessary to generate * Contex-Synchronization-Event via ISB again.
*/ - aarch64_insn_patch_text_nosync(addr, p->opcode); + aarch64_insn_patch_text_nosync(addr, le32_to_cpu(p->opcode)); aarch64_insn_patch_text_nosync(addr + 1, BRK64_OPCODE_KPROBES_SS); /* @@ -85,7 +85,7 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); if (p->ainsn.api.handler) - p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); + p->ainsn.api.handler(le32_to_cpu(p->opcode), (long)p->addr, regs); /* single step simulated, now go for post processing */ post_kprobe_handler(p, kcb, regs); @@ -99,7 +99,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; /* copy instruction */ - p->opcode = le32_to_cpu(*p->addr); + p->opcode = *p->addr; if (search_exception_tables(probe_addr)) return -EINVAL; @@ -142,8 +142,9 @@ void __kprobes arch_arm_kprobe(struct kprobe *p) void __kprobes arch_disarm_kprobe(struct kprobe *p) { void *addr = p->addr; + u32 insn = le32_to_cpu(p->opcode); - aarch64_insn_patch_text(&addr, &p->opcode, 1); + aarch64_insn_patch_text(&addr, &insn, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) -- cgit v1.2.3 From 14762109de024837dafcb893d45ccaf6a08ac990 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:51 +0100 Subject: arm64: probes: Remove probe_opcode_t The probe_opcode_t typedef for u32 isn't necessary, and is a source of confusion as it is easily confused with kprobe_opcode_t, which is a typedef for __le32. The typedef is only used within arch/arm64, and all of arm64's common insn code uses u32 for the endian-agnostic value of an instruction, so it'd be clearer to use u32 consistently. Remove probe_opcode_t and use u32 directly. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-7-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/probes.h | 1 - arch/arm64/kernel/probes/decode-insn.c | 4 ++-- arch/arm64/kernel/probes/decode-insn.h | 2 +- arch/arm64/kernel/probes/uprobes.c | 4 ++-- 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 11e809733b7d..d49368886309 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -9,7 +9,6 @@ #include -typedef u32 probe_opcode_t; typedef void (probes_handler_t) (u32 opcode, long addr, struct pt_regs *); struct arch_probe_insn { diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 147d6ddf3a4c..41b100bcb041 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -73,7 +73,7 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot.
*/ enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) { /* * Instructions reading or modifying the PC won't work from the XOL @@ -133,7 +133,7 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { enum probe_insn decoded; - probe_opcode_t insn = le32_to_cpu(*addr); + u32 insn = le32_to_cpu(*addr); kprobe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; struct arch_probe_insn *api = &asi->api; diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index 8b758c5a2062..0e4195de8206 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -28,6 +28,6 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi); #endif enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi); +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *asi); #endif /* _ARM_KERNEL_KPROBES_ARM64_H */ diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index a2f137a595fc..fa0b7941d204 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -34,7 +34,7 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - probe_opcode_t insn; + u32 insn; /* TODO: Currently we do not support AARCH32 instruction probing */ if (mm->context.flags & MMCF_AARCH32) @@ -102,7 +102,7 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - probe_opcode_t insn; + u32 insn; unsigned long addr; if (!auprobe->simulate) -- cgit v1.2.3 From ab23df141f5318cf661504fb446159ce2dbd1ec2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:15 +0100 Subject: arm64: asm-offsets: remove TSK_ACTIVE_MM The TSK_ACTIVE_MM definition isn't used anywhere. Remove it. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 27de1dddb0ab..42fd36027545 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -28,8 +28,6 @@ int main(void) { - DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); - BLANK(); DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); -- cgit v1.2.3 From 7bd8870af8dde527e0e8e83838de8966418c58f3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:16 +0100 Subject: arm64: asm-offsets: remove VMA_VM_* The VMA_VM_MM definition is only used by the vma_vm_mm macro, which itself is unused. The VMA_VM_FLAGS definition isn't used anywhere. Remove them all. 
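[ Editor's note: the run of asm-offsets.c removals that starts here reads more easily with the underlying mechanism in view. Below is a minimal standalone sketch of how a DEFINE() in asm-offsets.c becomes a constant usable from assembly; the struct and symbol names are invented. Like the kernel's asm-offsets.c, the interesting output is the generated assembly (build with 'cc -S' and grep for the '->' markers), which Kbuild post-processes into '#define sym val' lines. ]

| #include <stddef.h>
|
| struct task { long flags; long cpu; };
|
| /* Emit a greppable marker into the generated assembly; the real
|  * asm-offsets machinery seds these into '#define sym val'. */
| #define DEFINE(sym, val) \
| 	asm volatile("\n.ascii \"->" #sym " %0\"" : : "i" (val))
|
| /* Never called at run time; only its compiled form matters. */
| static void offsets(void)
| {
| 	DEFINE(TSK_FLAGS, offsetof(struct task, flags));
| 	DEFINE(TSK_CPU, offsetof(struct task, cpu));
| }
|
| int main(void)
| {
| 	(void)offsets;	/* reference it, but never execute it */
| 	return 0;
| }

Because the generated header is rebuilt on every configuration change, every stale DEFINE() that the following commits delete was pure dead weight, which is why each removal is safe in isolation.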
Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 7 ------- arch/arm64/kernel/asm-offsets.c | 3 --- 2 files changed, 10 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index bc0b0d75acef..3d8d534a7a77 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -248,13 +248,6 @@ alternative_endif ldr \dst, [\dst, \tmp] .endm -/* - * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) - */ - .macro vma_vm_mm, rd, rn - ldr \rd, [\rn, #VMA_VM_MM] - .endm - /* * read_ctr - read CTR_EL0. If the system has mismatched register fields, * provide the system wide safe value from arm64_ftr_reg_ctrel0.sys_val diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 42fd36027545..5f8b83915fac 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -104,9 +104,6 @@ int main(void) #endif DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); BLANK(); - DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); - DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags)); - BLANK(); DEFINE(VM_EXEC, VM_EXEC); BLANK(); DEFINE(PAGE_SZ, PAGE_SIZE); -- cgit v1.2.3 From 4c92c121c402cf3a9a60f58fbd2650577c68f1ac Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:17 +0100 Subject: arm64: asm-offsets: remove COMPAT_{RT_,}SIGFRAME_REGS_OFFSET The COMPAT_SIGFRAME_REGS_OFFSET and COMPAT_RT_SIGFRAME_REGS_OFFSET definitions aren't used anywhere. They were added in commit: f14d8025d263f3c8 ("arm64: compat: Generate asm offsets for signals") ... and subsequently their only user was removed in commit: 2d071968a4052e58 ("arm64: compat: Remove 32-bit sigreturn code from the vDSO") ... leaving them unused. Remove them. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 5f8b83915fac..520914b901a2 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -96,11 +95,6 @@ int main(void) #endif DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); -#endif -#ifdef CONFIG_COMPAT - DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0)); - DEFINE(COMPAT_RT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_rt_sigframe, sig.uc.uc_mcontext.arm_r0)); - BLANK(); #endif DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); BLANK(); -- cgit v1.2.3 From 1abc7c1e593337491d27b84a2a0b79c6d8164811 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:18 +0100 Subject: arm64: asm-offsets: remove MM_CONTEXT_ID The only user of the MM_CONTEXT_ID definition was removed in commit: 25b92693a1b67a47 ("arm64: mm: convert cpu_do_switch_mm() to C") Remove MM_CONTEXT_ID.
Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 520914b901a2..e6a6b5160f75 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -96,8 +96,6 @@ int main(void) DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); #endif - DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); - BLANK(); DEFINE(VM_EXEC, VM_EXEC); BLANK(); DEFINE(PAGE_SZ, PAGE_SIZE); -- cgit v1.2.3 From 4ce689b4480a528f8d088c71d26744944e360f76 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:19 +0100 Subject: arm64: asm-offsets: remove VM_EXEC and PAGE_SZ The VM_EXEC definition duplicates the common VM_EXEC definition from <linux/mm.h>. The common definition cannot safely be included by assembly code but currently we don't need to use VM_EXEC in assembly. The PAGE_SZ definition duplicates arm64's definition of PAGE_SIZE from <asm/page-def.h>, which can safely be included from assembly code and should be used directly. Remove the duplicate definitions. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index e6a6b5160f75..7c7c8d98256f 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -96,10 +96,6 @@ int main(void) DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); #endif - DEFINE(VM_EXEC, VM_EXEC); - BLANK(); - DEFINE(PAGE_SZ, PAGE_SIZE); - BLANK(); DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); BLANK(); -- cgit v1.2.3 From b129125e1f963500fd1cbd19e4155a65ce922944 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:20 +0100 Subject: arm64: asm-offsets: remove DMA_{TO,FROM}_DEVICE The DMA_TO_DEVICE and DMA_FROM_DEVICE definitions in asm-offsets duplicate the common definitions from <linux/dma-direction.h> (which used to live in <linux/dma-mapping.h>), and haven't been used from assembly code since commit: 7eacf1858bc86fe9 ("arm64: mm: Remove assembly DMA cache maintenance wrappers") Remove them both.
Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-7-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 7c7c8d98256f..e5ffdc0163aa 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -96,9 +95,6 @@ int main(void) DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); #endif - DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); - DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); - BLANK(); DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); BLANK(); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); -- cgit v1.2.3 From 7bb32dc788ddd0adae1273e799b920a790610fac Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:21 +0100 Subject: arm64: asm-offsets: remove PREEMPT_DISABLE_OFFSET The PREEMPT_DISABLE_OFFSET definition was added in commit: 24534b3511828c66 ("arm64: assembler: add macros to conditionally yield the NEON under PREEMPT") ... but hasn't been used since commit: 3931261ecf46151a ("arm64: fpsimd: Bring cond_yield asm macro in line with new rules") Remove PREEMPT_DISABLE_OFFSET. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-8-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index e5ffdc0163aa..cb2e9567dca9 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -95,8 +94,6 @@ int main(void) DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); #endif - DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); - BLANK(); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val)); -- cgit v1.2.3 From ac4ad5c09b34adb9c2937af874b63762fe66f725 Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Mon, 9 Sep 2024 07:11:14 +0000 Subject: arm64: insn: Simulate nop instruction for better uprobe performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v2->v1: 1. Remove the simulation of STP and the related bits. 2. Use arm64_skip_faulting_instruction for single-stepping or FEAT_BTI scenario. As Andrii pointed out, the uprobe/uretprobe selftest bench ran into a counterintuitive result: the nop and push variants are much slower than the ret variant [0]. The root cause lies in arch_probe_analyse_insn(), which excludes 'nop' and 'stp' from the emulatable instructions list. This forces the kernel to return to userspace and execute them out-of-line, then trap back into the kernel to run the uprobe callback functions. This leads to a significant performance overhead compared to the 'ret' variant, which is already emulated. Typically, a uprobe is installed on a 'nop' for USDT, or on a function entry which starts with the instruction 'stp x29, x30, [sp, #imm]!' to push lr and fp onto the stack, a pattern common to kernel and userspace binaries alike. In order to improve the performance of handling uprobes for common use cases, this patch supports the emulation of the arm64 equivalents of the 'nop' and 'push' instructions.
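[ Editor's note: a small standalone sketch of the recognition step the patch adds. The AArch64 NOP is the architectural HINT #0 instruction with the fixed encoding 0xd503201f, which is the comparison the kernel's aarch64_insn_is_nop() boils down to; the helper name below is invented for the example. ]

| #include <stdbool.h>
| #include <stdint.h>
| #include <stdio.h>
|
| /* AArch64 NOP: HINT #0, fixed encoding 0xd503201f. */
| #define AARCH64_NOP_INSN 0xd503201fu
|
| static bool insn_is_nop(uint32_t insn)
| {
| 	return insn == AARCH64_NOP_INSN;
| }
|
| int main(void)
| {
| 	/* NOP matches; RET (0xd65f03c0) does not. */
| 	printf("%d %d\n", insn_is_nop(0xd503201f), insn_is_nop(0xd65f03c0));
| 	return 0;
| }

Once recognized, "simulating" a NOP is just advancing the PC by 4 in the breakpoint handler, which avoids the whole out-of-line single-step round trip that dominates the 'xol' numbers shown next.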
The benchmark results below indicate that the performance gain of emulation is obvious. On Kunpeng916 (Hi1616), 4 NUMA nodes, 64 Arm64 cores@2.4GHz. xol (1 cpus) ------------ uprobe-nop: 0.916 ± 0.001M/s (0.916M/prod) uprobe-push: 0.908 ± 0.001M/s (0.908M/prod) uprobe-ret: 1.855 ± 0.000M/s (1.855M/prod) uretprobe-nop: 0.640 ± 0.000M/s (0.640M/prod) uretprobe-push: 0.633 ± 0.001M/s (0.633M/prod) uretprobe-ret: 0.978 ± 0.003M/s (0.978M/prod) emulation (1 cpus) ------------------- uprobe-nop: 1.862 ± 0.002M/s (1.862M/prod) uprobe-push: 1.743 ± 0.006M/s (1.743M/prod) uprobe-ret: 1.840 ± 0.001M/s (1.840M/prod) uretprobe-nop: 0.964 ± 0.004M/s (0.964M/prod) uretprobe-push: 0.936 ± 0.004M/s (0.936M/prod) uretprobe-ret: 0.940 ± 0.001M/s (0.940M/prod) As shown above, the performance gap between the 'nop/push' and 'ret' variants has been significantly reduced. Because the emulation of the 'push' instruction needs to access userspace memory, it spends more cycles than the others. As Mark suggested [1], it is painful to emulate the correct atomicity and ordering properties of STP, especially when it interacts with MTE, POE, etc. So this patch just focuses on the simulation of 'nop'. The simulation of STP and related changes will be addressed in a separate patch. [0] https://lore.kernel.org/all/CAEf4BzaO4eG6hr2hzXYpn+7Uer4chS0R99zLn02ezZ5YruVuQw@mail.gmail.com/ [1] https://lore.kernel.org/all/Zr3RN4zxF5XPgjEB@J2N7QTR9R3/ CC: Andrii Nakryiko CC: Mark Rutland Signed-off-by: Liao Chang Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240909071114.1150053-1-liaochang1@huawei.com [catalin.marinas@arm.com: small tweaks following MarkR's comments] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/insn.h | 5 +++++ arch/arm64/kernel/probes/decode-insn.c | 9 +++++++++ arch/arm64/kernel/probes/simulate-insn.c | 6 ++++++ arch/arm64/kernel/probes/simulate-insn.h | 1 + 4 files changed, 21 insertions(+) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 8c0a36f72d6f..89bc18989b90 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -575,6 +575,11 @@ static __always_inline u32 aarch64_insn_gen_nop(void) return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); } +static __always_inline bool aarch64_insn_is_nop(u32 insn) +{ + return insn == aarch64_insn_gen_nop(); +} + u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, enum aarch64_insn_branch_type type); u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 41b100bcb041..e05249f57075 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -75,6 +75,15 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) enum probe_insn __kprobes arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) { + /* + * While 'nop' instruction can execute in the out-of-line slot, + * simulating them in breakpoint handling offers better performance. + */ + if (aarch64_insn_is_nop(insn)) { + api->handler = simulate_nop; + return INSN_GOOD_NO_SLOT; + } + /* + * Instructions reading or modifying the PC won't work from the XOL + * slot.
diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index b65334ab79d2..4c6d2d712fbd 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -196,3 +196,9 @@ simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs) instruction_pointer_set(regs, instruction_pointer(regs) + 4); } + +void __kprobes +simulate_nop(u32 opcode, long addr, struct pt_regs *regs) +{ + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); +} diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h index e065dc92218e..efb2803ec943 100644 --- a/arch/arm64/kernel/probes/simulate-insn.h +++ b/arch/arm64/kernel/probes/simulate-insn.h @@ -16,5 +16,6 @@ void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs); +void simulate_nop(u32 opcode, long addr, struct pt_regs *regs); #endif /* _ARM_KERNEL_KPROBES_SIMULATE_INSN_H */ -- cgit v1.2.3 From 25c17c4b55def92a01e3eecc9c775a6ee25ca20f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 1 Oct 2024 15:52:19 -0700 Subject: hugetlb: arm64: add mte support Enable MTE support for hugetlb. The MTE page flags will be set on the folio only. When copying hugetlb folio (for example, CoW), the tags for all subpages will be copied when copying the first subpage. When freeing hugetlb folio, the MTE flags will be cleared. Reviewed-by: Catalin Marinas Reviewed-by: David Hildenbrand Signed-off-by: Yang Shi Link: https://lore.kernel.org/r/20241001225220.271178-1-yang@os.amperecomputing.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/hugetlb.h | 8 +++++ arch/arm64/include/asm/mman.h | 3 +- arch/arm64/include/asm/mte.h | 67 ++++++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/hibernate.c | 6 ++++ arch/arm64/kernel/mte.c | 27 ++++++++++++++-- arch/arm64/kvm/guest.c | 16 ++++++++-- arch/arm64/kvm/mmu.c | 11 +++++++ arch/arm64/mm/copypage.c | 27 +++++++++++++++- fs/hugetlbfs/inode.c | 2 +- 9 files changed, 159 insertions(+), 8 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 293f880865e8..c6dff3e69539 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -11,6 +11,7 @@ #define __ASM_HUGETLB_H #include +#include #include #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION @@ -21,6 +22,13 @@ extern bool arch_hugetlb_migration_supported(struct hstate *h); static inline void arch_clear_hugetlb_flags(struct folio *folio) { clear_bit(PG_dcache_clean, &folio->flags); + +#ifdef CONFIG_ARM64_MTE + if (system_supports_mte()) { + clear_bit(PG_mte_tagged, &folio->flags); + clear_bit(PG_mte_lock, &folio->flags); + } +#endif } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 9e39217b4afb..65bc2b07f666 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -38,7 +38,8 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags) * backed by tags-capable memory. The vm_flags may be overridden by a * filesystem supporting MTE (RAM-based). 
*/ - if (system_supports_mte() && (flags & MAP_ANONYMOUS)) + if (system_supports_mte() && + (flags & (MAP_ANONYMOUS | MAP_HUGETLB))) return VM_MTE_ALLOWED; return 0; diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 0f84518632b4..6567df8ec8ca 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -41,6 +41,8 @@ void mte_free_tag_storage(char *storage); static inline void set_page_mte_tagged(struct page *page) { + VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); + /* * Ensure that the tags written prior to this function are visible * before the page flags update. @@ -53,6 +55,8 @@ static inline bool page_mte_tagged(struct page *page) { bool ret = test_bit(PG_mte_tagged, &page->flags); + VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); + /* * If the page is tagged, ensure ordering with a likely subsequent * read of the tags. @@ -76,6 +80,8 @@ static inline bool page_mte_tagged(struct page *page) */ static inline bool try_page_mte_tagging(struct page *page) { + VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); + if (!test_and_set_bit(PG_mte_lock, &page->flags)) return true; @@ -157,6 +163,67 @@ static inline int mte_ptrace_copy_tags(struct task_struct *child, #endif /* CONFIG_ARM64_MTE */ +#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_ARM64_MTE) +static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) +{ + VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); + + /* + * Ensure that the tags written prior to this function are visible + * before the folio flags update. + */ + smp_wmb(); + set_bit(PG_mte_tagged, &folio->flags); + +} + +static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) +{ + bool ret = test_bit(PG_mte_tagged, &folio->flags); + + VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); + + /* + * If the folio is tagged, ensure ordering with a likely subsequent + * read of the tags. + */ + if (ret) + smp_rmb(); + return ret; +} + +static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) +{ + VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); + + if (!test_and_set_bit(PG_mte_lock, &folio->flags)) + return true; + + /* + * The tags are either being initialised or may have been initialised + * already. Check if the PG_mte_tagged flag has been set or wait + * otherwise. 
+ */ + smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged)); + + return false; +} +#else +static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) +{ +} + +static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) +{ + return false; +} + +static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) +{ + return false; +} +#endif + static inline void mte_disable_tco_entry(struct task_struct *task) { if (!system_supports_mte()) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 7b11d84f533c..18749e9a6c2d 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -266,9 +266,15 @@ static int swsusp_mte_save_tags(void) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { struct page *page = pfn_to_online_page(pfn); + struct folio *folio; if (!page) continue; + folio = page_folio(page); + + if (folio_test_hugetlb(folio) && + !folio_test_hugetlb_mte_tagged(folio)) + continue; if (!page_mte_tagged(page)) continue; diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 6174671be7c1..2fbfd27ff5f2 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -38,7 +38,24 @@ EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); void mte_sync_tags(pte_t pte, unsigned int nr_pages) { struct page *page = pte_page(pte); - unsigned int i; + struct folio *folio = page_folio(page); + unsigned long i; + + if (folio_test_hugetlb(folio)) { + unsigned long nr = folio_nr_pages(folio); + + /* Hugetlb MTE flags are set for head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + + /* ensure the tags are visible before the PTE is set */ + smp_wmb(); + + return; + } /* if PG_mte_tagged is set, tags have already been initialised */ for (i = 0; i < nr_pages; i++, page++) { @@ -410,6 +427,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, void *maddr; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); + struct folio *folio; if (IS_ERR(page)) { err = PTR_ERR(page); @@ -428,7 +446,12 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, put_page(page); break; } - WARN_ON_ONCE(!page_mte_tagged(page)); + + folio = page_folio(page); + if (folio_test_hugetlb(folio)) + WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio)); + else + WARN_ON_ONCE(!page_mte_tagged(page)); /* limit access to the end of the page */ offset = offset_in_page(addr); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 962f985977c2..e738a353b20e 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1055,6 +1055,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, void *maddr; unsigned long num_tags; struct page *page; + struct folio *folio; if (is_error_noslot_pfn(pfn)) { ret = -EFAULT; @@ -1068,10 +1069,13 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, ret = -EFAULT; goto out; } + folio = page_folio(page); maddr = page_address(page); if (!write) { - if (page_mte_tagged(page)) + if ((folio_test_hugetlb(folio) && + folio_test_hugetlb_mte_tagged(folio)) || + page_mte_tagged(page)) num_tags = mte_copy_tags_to_user(tags, maddr, MTE_GRANULES_PER_PAGE); else @@ -1085,14 +1089,20 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. 
*/ - try_page_mte_tagging(page); + if (folio_test_hugetlb(folio)) + folio_try_hugetlb_mte_tagging(folio); + else + try_page_mte_tagging(page); num_tags = mte_copy_tags_from_user(maddr, tags, MTE_GRANULES_PER_PAGE); /* uaccess failed, don't leave stale tags */ if (num_tags != MTE_GRANULES_PER_PAGE) mte_clear_page_tags(maddr); - set_page_mte_tagged(page); + if (folio_test_hugetlb(folio)) + folio_set_hugetlb_mte_tagged(folio); + else + set_page_mte_tagged(page); kvm_release_pfn_dirty(pfn); } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a509b63bd4dd..962449f9ac2f 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1401,10 +1401,21 @@ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, { unsigned long i, nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(pfn); + struct folio *folio = page_folio(page); if (!kvm_has_mte(kvm)) return; + if (folio_test_hugetlb(folio)) { + /* Hugetlb has MTE flags set on head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr_pages; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + return; + } + for (i = 0; i < nr_pages; i++, page++) { if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index a7bb20055ce0..87b3f1a25535 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -18,15 +18,40 @@ void copy_highpage(struct page *to, struct page *from) { void *kto = page_address(to); void *kfrom = page_address(from); + struct folio *src = page_folio(from); + struct folio *dst = page_folio(to); + unsigned int i, nr_pages; copy_page(kto, kfrom); if (kasan_hw_tags_enabled()) page_kasan_tag_reset(to); - if (system_supports_mte() && page_mte_tagged(from)) { + if (!system_supports_mte()) + return; + + if (folio_test_hugetlb(src) && + folio_test_hugetlb_mte_tagged(src)) { + if (!folio_try_hugetlb_mte_tagging(dst)) + return; + + /* + * Populate tags for all subpages. + * + * Don't assume the first page is head page since + * huge page copy may start from any subpage. + */ + nr_pages = folio_nr_pages(src); + for (i = 0; i < nr_pages; i++) { + kfrom = page_address(folio_page(src, i)); + kto = page_address(folio_page(dst, i)); + mte_copy_page_tags(kto, kfrom); + } + folio_set_hugetlb_mte_tagged(dst); + } else if (page_mte_tagged(from)) { /* It's a new page, shouldn't have been tagged yet */ WARN_ON_ONCE(!try_page_mte_tagging(to)); + mte_copy_page_tags(kto, kfrom); set_page_mte_tagged(to); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5cf327337e22..f26b3b53d7de 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); + vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND | VM_MTE_ALLOWED); vma->vm_ops = &hugetlb_vm_ops; ret = seal_check_write(info->seals, vma); -- cgit v1.2.3 From 00d9597903d0053bb13c74dfe61bcba35e213bd7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:31 +0100 Subject: arm64: pt_regs: rename "pmr_save" -> "pmr" The pt_regs::pmr_save field is weirdly named relative to all other pt_regs fields, with a '_save' suffix that doesn't make anything clearer and only leads to more typing to access the field. Remove the '_save' suffix. 
There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/daifflags.h | 2 +- arch/arm64/include/asm/processor.h | 2 +- arch/arm64/include/asm/ptrace.h | 4 ++-- arch/arm64/kernel/asm-offsets.c | 2 +- arch/arm64/kernel/entry.S | 4 ++-- arch/arm64/kernel/process.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index 55f57dfa8e2f..fbb5c99eb2f9 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -132,7 +132,7 @@ static inline void local_daif_inherit(struct pt_regs *regs) trace_hardirqs_on(); if (system_uses_irq_prio_masking()) - gic_write_pmr(regs->pmr_save); + gic_write_pmr(regs->pmr); /* * We can't use local_daif_restore(regs->pstate) here as diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 1438424f0064..03b99164f7f4 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -293,7 +293,7 @@ static inline void start_thread_common(struct pt_regs *regs, unsigned long pc) regs->pc = pc; if (system_uses_irq_prio_masking()) - regs->pmr_save = GIC_PRIO_IRQON; + regs->pmr = GIC_PRIO_IRQON; } static inline void start_thread(struct pt_regs *regs, unsigned long pc, diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 1ae671fe64d8..e82e6e47ac9a 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -167,7 +167,7 @@ struct pt_regs { u64 sdei_ttbr1; /* Only valid when ARM64_HAS_GIC_PRIO_MASKING is enabled. */ - u64 pmr_save; + u64 pmr; u64 stackframe[2]; /* Only valid for some EL1 exceptions. */ @@ -211,7 +211,7 @@ static inline void forget_syscall(struct pt_regs *regs) #define irqs_priority_unmasked(regs) \ (system_uses_irq_prio_masking() ? 
\ - (regs)->pmr_save == GIC_PRIO_IRQON : \ + (regs)->pmr == GIC_PRIO_IRQON : \ true) #define interrupts_enabled(regs) \ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 27de1dddb0ab..eb7fb2f9b927 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -79,7 +79,7 @@ int main(void) DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); - DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); + DEFINE(S_PMR, offsetof(struct pt_regs, pmr)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7ef0e127b149..a84ce95ad96c 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -315,7 +315,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING alternative_else_nop_endif mrs_s x20, SYS_ICC_PMR_EL1 - str x20, [sp, #S_PMR_SAVE] + str x20, [sp, #S_PMR] mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET msr_s SYS_ICC_PMR_EL1, x20 @@ -342,7 +342,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING b .Lskip_pmr_restore\@ alternative_else_nop_endif - ldr x20, [sp, #S_PMR_SAVE] + ldr x20, [sp, #S_PMR] msr_s SYS_ICC_PMR_EL1, x20 /* Ensure priority change is seen by redistributor */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 0540653fbf38..2951b5ba1937 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -227,7 +227,7 @@ void __show_regs(struct pt_regs *regs) printk("sp : %016llx\n", sp); if (system_uses_irq_prio_masking()) - printk("pmr_save: %08llx\n", regs->pmr_save); + printk("pmr: %08llx\n", regs->pmr); i = top_reg; -- cgit v1.2.3 From 1454363098a0f7f14479f6a45945bf8d3d775a95 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:32 +0100 Subject: arm64: pt_regs: swap 'unused' and 'pmr' fields In subsequent patches we'll want to add an additional u64 to struct pt_regs. To make space, this patch swaps the 'unused' and 'pmr' fields, as the 'pmr' value only requires bits[7:0] and can safely fit into a u32, which frees up a 64-bit unused field. The 'lockdep_hardirqs' and 'exit_rcu' fields should eventually be moved out of pt_regs and managed locally within entry-common.c, so I've left those as-is for the moment. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 6 +++--- arch/arm64/kernel/entry.S | 4 ++-- arch/arm64/kernel/process.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index e82e6e47ac9a..92531aeba531 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -163,11 +163,11 @@ struct pt_regs { }; u64 orig_x0; s32 syscallno; - u32 unused; + u32 pmr; u64 sdei_ttbr1; - /* Only valid when ARM64_HAS_GIC_PRIO_MASKING is enabled. */ - u64 pmr; + u64 unused; + u64 stackframe[2]; /* Only valid for some EL1 exceptions. 
*/ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a84ce95ad96c..fa6d6d5ca5e0 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -315,7 +315,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING alternative_else_nop_endif mrs_s x20, SYS_ICC_PMR_EL1 - str x20, [sp, #S_PMR] + str w20, [sp, #S_PMR] mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET msr_s SYS_ICC_PMR_EL1, x20 @@ -342,7 +342,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING b .Lskip_pmr_restore\@ alternative_else_nop_endif - ldr x20, [sp, #S_PMR] + ldr w20, [sp, #S_PMR] msr_s SYS_ICC_PMR_EL1, x20 /* Ensure priority change is seen by redistributor */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 2951b5ba1937..c722c1be6fa5 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -227,7 +227,7 @@ void __show_regs(struct pt_regs *regs) printk("sp : %016llx\n", sp); if (system_uses_irq_prio_masking()) - printk("pmr: %08llx\n", regs->pmr); + printk("pmr: %08x\n", regs->pmr); i = top_reg; -- cgit v1.2.3 From 886c2b0ba820b9d6ffe3a7c670eb2f519755123c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:33 +0100 Subject: arm64: use a common struct frame_record Currently the signal handling code has its own struct frame_record, the definition of struct pt_regs open-codes a frame record as an array, and the kernel unwinder hard-codes frame record offsets. Move to a common struct frame_record that can be used throughout the kernel. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 4 +++- arch/arm64/include/asm/stacktrace/common.h | 8 +++++--- arch/arm64/include/asm/stacktrace/frame.h | 13 +++++++++++++ arch/arm64/kernel/process.c | 2 +- arch/arm64/kernel/signal.c | 5 ----- arch/arm64/kernel/stacktrace.c | 2 +- 6 files changed, 23 insertions(+), 11 deletions(-) create mode 100644 arch/arm64/include/asm/stacktrace/frame.h (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 92531aeba531..89c02f85f4b1 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -98,6 +98,8 @@ #include #include +#include + /* sizeof(struct user) for AArch32 */ #define COMPAT_USER_SZ 296 @@ -168,7 +170,7 @@ struct pt_regs { u64 sdei_ttbr1; u64 unused; - u64 stackframe[2]; + struct frame_record stackframe; /* Only valid for some EL1 exceptions. */ u64 lockdep_hardirqs; diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index f63dc654e545..7fab6876e497 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -137,21 +137,23 @@ found: static inline int unwind_next_frame_record(struct unwind_state *state) { + struct frame_record *record; unsigned long fp = state->fp; int err; if (fp & 0x7) return -EINVAL; - err = unwind_consume_stack(state, fp, 16); + err = unwind_consume_stack(state, fp, sizeof(*record)); if (err) return err; /* * Record this frame record's values. 
*/ - state->fp = READ_ONCE(*(unsigned long *)(fp)); - state->pc = READ_ONCE(*(unsigned long *)(fp + 8)); + record = (struct frame_record *)fp; + state->fp = READ_ONCE(record->fp); + state->pc = READ_ONCE(record->lr); return 0; } diff --git a/arch/arm64/include/asm/stacktrace/frame.h b/arch/arm64/include/asm/stacktrace/frame.h new file mode 100644 index 000000000000..6397bc847f14 --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/frame.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_STACKTRACE_FRAME_H +#define __ASM_STACKTRACE_FRAME_H + +/* + * A standard AAPCS64 frame record. + */ +struct frame_record { + u64 fp; + u64 lr; +}; + +#endif /* __ASM_STACKTRACE_FRAME_H */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index c722c1be6fa5..d45fd114eac3 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -419,7 +419,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * For the benefit of the unwinder, set up childregs->stackframe * as the final frame for the new task. */ - p->thread.cpu_context.fp = (unsigned long)childregs->stackframe; + p->thread.cpu_context.fp = (unsigned long)&childregs->stackframe; ptrace_hw_copy_thread(p); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 561986947530..2c47f9a0e40b 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -42,11 +42,6 @@ struct rt_sigframe { struct ucontext uc; }; -struct frame_record { - u64 fp; - u64 lr; -}; - struct rt_sigframe_user_layout { struct rt_sigframe __user *sigframe; struct frame_record __user *next_frame; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 2729faaee4b4..ffe8e4f54956 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -145,7 +145,7 @@ kunwind_next(struct kunwind_state *state) int err; /* Final frame; nothing to unwind */ - if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) + if (fp == (unsigned long)&task_pt_regs(tsk)->stackframe) return -ENOENT; err = unwind_next_frame_record(&state->common); -- cgit v1.2.3 From b7794795c93d9c15e4bc64db2f3bf2104451e3bc Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:34 +0100 Subject: arm64: stacktrace: move dump_backtrace() to kunwind_stack_walk() Currently dump_backtrace() can only print the PC value at each step of the unwind, as this is all the information that arch_stack_walk() passes to the dump_backtrace_entry() callback. In future we'd like to print some additional information, such as the origin of entries (e.g. PC, LR, FP) and/or the reliability thereof. In preparation for doing so, this patch moves dump_backtrace() over to kunwind_stack_walk(), which passes the full kunwind_state to the callback. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-7-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index ffe8e4f54956..22f8709f0e76 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -294,10 +294,10 @@ noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u6 kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL); } -static bool dump_backtrace_entry(void *arg, unsigned long where) +static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) { char *loglvl = arg; - printk("%s %pSb\n", loglvl, (void *)where); + printk("%s %pSb\n", loglvl, (void *)state->common.pc); return true; } @@ -316,7 +316,7 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, return; printk("%sCall trace:\n", loglvl); - arch_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); + kunwind_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); put_task_stack(tsk); } -- cgit v1.2.3 From bdf8eafbf7f56f9fa43a019cdc1a5f057210f01d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:35 +0100 Subject: arm64: stacktrace: report source of unwind data When analysing a stacktrace it can be useful to know where an unwound PC came from, as in some situations certain sources may be suspect or known to be unreliable. In future it would also be useful to track this so that certain unwind steps can be performed in a stateful manner. For example when unwinding across an exception boundary, we'd ideally unwind pt_regs::pc, then pt_regs::lr, then the next frame record. This patch adds an enumerated set of unwind sources, tracks this during the unwind, and updates dump_backtrace() to log these for interesting unwind steps. The interesting sources recorded are: "C" - the PC came from the caller of an unwind function. "T" - the PC came from thread_saved_pc() for a blocked task. "P" - the PC came from a pt_regs::pc. "U" - the PC came from an unknown source (indicates an unwinder error). ... with nothing recorded when the PC came from a frame_record::pc as this is the vastly common case and logging this would make it difficult to spot the more interesting cases. For example, when triggering a backtrace via magic-sysrq + L, the CPU handling the sysrq will have a backtrace whose first element is the caller (C) of dump_backtrace(): | Call trace: | show_stack+0x18/0x30 (C) | dump_stack_lvl+0x60/0x80 | dump_stack+0x18/0x24 | nmi_cpu_backtrace+0xfc/0x140 | ... ... and other CPUs will have a backtrace whose first element is their pt_regs::pc (P) at the instant the backtrace IPI was taken: | Call trace: | _raw_spin_unlock_irqrestore+0x8/0x50 (P) | wake_up_process+0x18/0x24 | process_timeout+0x14/0x20 | call_timer_fn.isra.0+0x24/0x80 | ... Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-8-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 47 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 22f8709f0e76..83ab37e13159 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -20,6 +20,14 @@ #include #include +enum kunwind_source { + KUNWIND_SOURCE_UNKNOWN, + KUNWIND_SOURCE_FRAME, + KUNWIND_SOURCE_CALLER, + KUNWIND_SOURCE_TASK, + KUNWIND_SOURCE_REGS_PC, +}; + /* * Kernel unwind state * @@ -37,6 +45,7 @@ struct kunwind_state { #ifdef CONFIG_KRETPROBES struct llist_node *kr_cur; #endif + enum kunwind_source source; }; static __always_inline void @@ -45,6 +54,7 @@ kunwind_init(struct kunwind_state *state, { unwind_init_common(&state->common); state->task = task; + state->source = KUNWIND_SOURCE_UNKNOWN; } /* @@ -62,6 +72,7 @@ kunwind_init_from_regs(struct kunwind_state *state, state->common.fp = regs->regs[29]; state->common.pc = regs->pc; + state->source = KUNWIND_SOURCE_REGS_PC; } /* @@ -79,6 +90,7 @@ kunwind_init_from_caller(struct kunwind_state *state) state->common.fp = (unsigned long)__builtin_frame_address(1); state->common.pc = (unsigned long)__builtin_return_address(0); + state->source = KUNWIND_SOURCE_CALLER; } /* @@ -99,6 +111,7 @@ kunwind_init_from_task(struct kunwind_state *state, state->common.fp = thread_saved_fp(task); state->common.pc = thread_saved_pc(task); + state->source = KUNWIND_SOURCE_TASK; } static __always_inline int @@ -148,9 +161,19 @@ kunwind_next(struct kunwind_state *state) if (fp == (unsigned long)&task_pt_regs(tsk)->stackframe) return -ENOENT; - err = unwind_next_frame_record(&state->common); - if (err) - return err; + switch (state->source) { + case KUNWIND_SOURCE_FRAME: + case KUNWIND_SOURCE_CALLER: + case KUNWIND_SOURCE_TASK: + case KUNWIND_SOURCE_REGS_PC: + err = unwind_next_frame_record(&state->common); + if (err) + return err; + state->source = KUNWIND_SOURCE_FRAME; + break; + default: + return -EINVAL; + } state->common.pc = ptrauth_strip_kernel_insn_pac(state->common.pc); @@ -294,10 +317,26 @@ noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u6 kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL); } +static const char *state_source_string(const struct kunwind_state *state) +{ + switch (state->source) { + case KUNWIND_SOURCE_FRAME: return NULL; + case KUNWIND_SOURCE_CALLER: return "C"; + case KUNWIND_SOURCE_TASK: return "T"; + case KUNWIND_SOURCE_REGS_PC: return "P"; + default: return "U"; + } +} + static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) { + const char *source = state_source_string(state); char *loglvl = arg; - printk("%s %pSb\n", loglvl, (void *)state->common.pc); + printk("%s %pSb%s%s%s\n", loglvl, + (void *)state->common.pc, + source ? " (" : "", + source ? source : "", + source ? ")" : ""); return true; } -- cgit v1.2.3 From 8094df1cf09248e60afd0e14fb6c2ba4c79b0b9c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:36 +0100 Subject: arm64: stacktrace: report recovered PCs When analysing a stacktrace it can be useful to know whether an unwound PC has been rewritten by fgraph or kretprobes, as in some situations these may be suspect or be known to be unreliable. 
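[ Editor's note: a self-contained userspace sketch of the technique these patches build on: walking AAPCS64 frame records while tagging where each reported PC came from. The struct layout matches the frame_record introduced earlier; the marker convention ("P" for a pt_regs PC, nothing for a frame record) follows dump_backtrace(); the addresses and helper names are invented, and none of the kernel's stack-bounds checks are shown. ]

| #include <stdint.h>
| #include <stdio.h>
|
| /* A standard AAPCS64 frame record: saved FP, then saved LR. */
| struct frame_record {
| 	uint64_t fp;
| 	uint64_t lr;
| };
|
| /* Report the initial PC as coming from regs->pc ("P"), then follow
|  * the FP chain, reporting each recorded return address unmarked. */
| static void walk(uint64_t fp, uint64_t pc)
| {
| 	printf(" %#llx (P)\n", (unsigned long long)pc);
| 	while (fp && !(fp & 0x7)) {	/* NULL or misaligned fp ends it */
| 		const struct frame_record *rec =
| 			(const struct frame_record *)(uintptr_t)fp;
| 		printf(" %#llx\n", (unsigned long long)rec->lr);
| 		fp = rec->fp;
| 	}
| }
|
| int main(void)
| {
| 	/* Fake a two-deep chain of frame records, innermost first. */
| 	struct frame_record outer = { 0, 0x4010 };
| 	struct frame_record inner = { (uint64_t)(uintptr_t)&outer, 0x4020 };
|
| 	walk((uint64_t)(uintptr_t)&inner, 0x4000);
| 	return 0;
| }

The kernel's unwind_next_frame_record() is essentially this loop plus stack-bounds checking, and the kunwind_source tracking above records how each PC was obtained rather than just its value.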
This patch adds flags to track when an unwind entry has recovered the PC from fgraph and/or kretprobes, and updates dump_backtrace() to log when this is the case. The flags recorded are: "F" - the PC was recovered from fgraph "K" - the PC was recovered from kretprobes These flags are recorded and logged in addition to the original source of the unwound PC. For example, with the ftrace_graph profiler enabled globally, and kretprobes installed on generic_handle_domain_irq() and do_interrupt_handler(), a backtrace triggered by magic-sysrq + L reports: | Call trace: | show_stack+0x20/0x40 (CF) | dump_stack_lvl+0x60/0x80 (F) | dump_stack+0x18/0x28 | nmi_cpu_backtrace+0xfc/0x140 | nmi_trigger_cpumask_backtrace+0x1c8/0x200 | arch_trigger_cpumask_backtrace+0x20/0x40 | sysrq_handle_showallcpus+0x24/0x38 (F) | __handle_sysrq+0xa8/0x1b0 (F) | handle_sysrq+0x38/0x50 (F) | pl011_int+0x460/0x5a8 (F) | __handle_irq_event_percpu+0x60/0x220 (F) | handle_irq_event+0x54/0xc0 (F) | handle_fasteoi_irq+0xa8/0x1d0 (F) | generic_handle_domain_irq+0x34/0x58 (F) | gic_handle_irq+0x54/0x140 (FK) | call_on_irq_stack+0x24/0x58 (F) | do_interrupt_handler+0x88/0xa0 | el1_interrupt+0x34/0x68 (FK) | el1h_64_irq_handler+0x18/0x28 | el1h_64_irq+0x64/0x68 | default_idle_call+0x34/0x180 | do_idle+0x204/0x268 | cpu_startup_entry+0x40/0x50 (F) | rest_init+0xe4/0xf0 | start_kernel+0x744/0x750 | __primary_switched+0x80/0x90 Note that as these flags are reported next to the recovered PC value, they appear on the callers of instrumented functions. For example gic_handle_irq() has a "K" marker because generic_handle_domain_irq() was instrumented with kretprobes and had its return address rewritten. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-9-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 83ab37e13159..f8e231683dad 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -28,6 +28,14 @@ enum kunwind_source { KUNWIND_SOURCE_REGS_PC, }; +union unwind_flags { + unsigned long all; + struct { + unsigned long fgraph : 1, + kretprobe : 1; + }; +}; + /* * Kernel unwind state * @@ -46,6 +54,7 @@ struct kunwind_state { struct llist_node *kr_cur; #endif enum kunwind_source source; + union unwind_flags flags; }; static __always_inline void @@ -55,6 +64,7 @@ kunwind_init(struct kunwind_state *state, unwind_init_common(&state->common); state->task = task; state->source = KUNWIND_SOURCE_UNKNOWN; + state->flags.all = 0; } /* @@ -127,6 +137,7 @@ kunwind_recover_return_address(struct kunwind_state *state) if (WARN_ON_ONCE(state->common.pc == orig_pc)) return -EINVAL; state->common.pc = orig_pc; + state->flags.fgraph = 1; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -137,6 +148,7 @@ kunwind_recover_return_address(struct kunwind_state *state) (void *)state->common.fp, &state->kr_cur); state->common.pc = orig_pc; + state->flags.kretprobe = 1; } #endif /* CONFIG_KRETPROBES */ @@ -157,6 +169,8 @@ kunwind_next(struct kunwind_state *state) unsigned long fp = state->common.fp; int err; + state->flags.all = 0; + /* Final frame; nothing to unwind */ if (fp == (unsigned long)&task_pt_regs(tsk)->stackframe) return -ENOENT; @@ -331,12 +345,18 @@ static const char *state_source_string(const struct kunwind_state *state) static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) { const char *source = state_source_string(state); + union unwind_flags flags = state->flags; + bool has_info = source || flags.all; char *loglvl = arg; - printk("%s %pSb%s%s%s\n", loglvl, + + printk("%s %pSb%s%s%s%s%s\n", loglvl, (void *)state->common.pc, - source ? " (" : "", + has_info ? " (" : "", source ? source : "", - source ? ")" : ""); + flags.fgraph ? "F" : "", + flags.kretprobe ? "K" : "", + has_info ? ")" : ""); + return true; } -- cgit v1.2.3 From c2c6b27b5aa14fa28e3f455f697ccd2e0e75d773 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:38 +0100 Subject: arm64: stacktrace: unwind exception boundaries When arm64's stack unwinder encounters an exception boundary, it uses the pt_regs::stackframe created by the entry code, which has a copy of the PC and FP at the time the exception was taken. The unwinder doesn't know anything about pt_regs, and reports the PC from the stackframe, but does not report the LR. The LR is only guaranteed to contain the return address at function call boundaries, and can be used as a scratch register at other times, so the LR at an exception boundary may or may not be a legitimate return address. It would be useful to report the LR value regardless, as it can be helpful when debugging, and in future it will be helpful for reliable stacktrace support. This patch changes the way we unwind across exception boundaries, allowing both the PC and LR to be reported. The entry code creates a frame_record_meta structure embedded within pt_regs, which the unwinder uses to find the pt_regs. 
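As a rough illustration of that lookup (a sketch only; the patch itself uses the same container_of() idiom in kunwind_next_regs_pc() below, with the stack-bounds checks omitted here):

	/*
	 * Sketch: an FP that points at the frame_record embedded in a
	 * pt_regs lets the unwinder recover the enclosing pt_regs.
	 */
	static struct pt_regs *regs_of_embedded_record(unsigned long fp)
	{
		return container_of((u64 *)fp, struct pt_regs,
				    stackframe.record.fp);
	}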
The unwinder can then extract pt_regs::pc and pt_regs::lr as two separate unwind steps before continuing with a regular walk of frame records. When a PC is unwound from pt_regs::lr, dump_backtrace() will log this with an "L" marker so that it can be identified easily. For example, an unwind across an exception boundary will appear as follows: | el1h_64_irq+0x6c/0x70 | _raw_spin_unlock_irqrestore+0x10/0x60 (P) | __aarch64_insn_write+0x6c/0x90 (L) | aarch64_insn_patch_text_nosync+0x28/0x80 ... with a (P) entry for pt_regs::pc, and an (L) entry for pt_regs:lr. Note that the LR may be stale at the point of the exception, for example, shortly after a return: | el1h_64_irq+0x6c/0x70 | default_idle_call+0x34/0x180 (P) | default_idle_call+0x28/0x180 (L) | do_idle+0x204/0x268 ... where the LR points a few instructions before the current PC. This plays nicely with all the other unwind metadata tracking. With the ftrace_graph profiler enabled globally, and kretprobes installed on generic_handle_domain_irq() and do_interrupt_handler(), a backtrace triggered by magic-sysrq + L reports: | Call trace: | show_stack+0x20/0x40 (CF) | dump_stack_lvl+0x60/0x80 (F) | dump_stack+0x18/0x28 | nmi_cpu_backtrace+0xfc/0x140 | nmi_trigger_cpumask_backtrace+0x1c8/0x200 | arch_trigger_cpumask_backtrace+0x20/0x40 | sysrq_handle_showallcpus+0x24/0x38 (F) | __handle_sysrq+0xa8/0x1b0 (F) | handle_sysrq+0x38/0x50 (F) | pl011_int+0x460/0x5a8 (F) | __handle_irq_event_percpu+0x60/0x220 (F) | handle_irq_event+0x54/0xc0 (F) | handle_fasteoi_irq+0xa8/0x1d0 (F) | generic_handle_domain_irq+0x34/0x58 (F) | gic_handle_irq+0x54/0x140 (FK) | call_on_irq_stack+0x24/0x58 (F) | do_interrupt_handler+0x88/0xa0 | el1_interrupt+0x34/0x68 (FK) | el1h_64_irq_handler+0x18/0x28 | el1h_64_irq+0x6c/0x70 | default_idle_call+0x34/0x180 (P) | default_idle_call+0x28/0x180 (L) | do_idle+0x204/0x268 | cpu_startup_entry+0x3c/0x50 (F) | rest_init+0xe4/0xf0 | start_kernel+0x744/0x750 | __primary_switched+0x88/0x98 Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-11-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 4 +- arch/arm64/include/asm/stacktrace/frame.h | 35 +++++++++ arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/entry.S | 12 +-- arch/arm64/kernel/head.S | 3 + arch/arm64/kernel/process.c | 1 + arch/arm64/kernel/stacktrace.c | 121 +++++++++++++++++++++++++++--- 7 files changed, 158 insertions(+), 19 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 89c02f85f4b1..47ff8654c5ec 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -168,9 +168,7 @@ struct pt_regs { u32 pmr; u64 sdei_ttbr1; - u64 unused; - - struct frame_record stackframe; + struct frame_record_meta stackframe; /* Only valid for some EL1 exceptions. */ u64 lockdep_hardirqs; diff --git a/arch/arm64/include/asm/stacktrace/frame.h b/arch/arm64/include/asm/stacktrace/frame.h index 6397bc847f14..0ee0f6ba0fd8 100644 --- a/arch/arm64/include/asm/stacktrace/frame.h +++ b/arch/arm64/include/asm/stacktrace/frame.h @@ -3,6 +3,30 @@ #define __ASM_STACKTRACE_FRAME_H /* + * - FRAME_META_TYPE_NONE + * + * This value is reserved. + * + * - FRAME_META_TYPE_FINAL + * + * The record is the last entry on the stack. 
+ * Unwinding should terminate successfully. + * + * - FRAME_META_TYPE_PT_REGS + * + * The record is embedded within a struct pt_regs, recording the registers at + * an arbitrary point in time. + * Unwinding should consume pt_regs::pc, followed by pt_regs::lr. + * + * Note: all other values are reserved and should result in unwinding + * terminating with an error. + */ +#define FRAME_META_TYPE_NONE 0 +#define FRAME_META_TYPE_FINAL 1 +#define FRAME_META_TYPE_PT_REGS 2 + +#ifndef __ASSEMBLY__ +/* * A standard AAPCS64 frame record. */ struct frame_record { @@ -10,4 +34,15 @@ struct frame_record { u64 lr; }; +/* + * A metadata frame record indicating a special unwind. + * The record::{fp,lr} fields must be zero to indicate the presence of + * metadata. + */ +struct frame_record_meta { + struct frame_record record; + u64 type; +}; +#endif /* __ASSEMBLY */ + #endif /* __ASM_STACKTRACE_FRAME_H */ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index eb7fb2f9b927..021f04f97fde 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -81,6 +81,7 @@ int main(void) DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); DEFINE(S_PMR, offsetof(struct pt_regs, pmr)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); + DEFINE(S_STACKFRAME_TYPE, offsetof(struct pt_regs, stackframe.type)); DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index fa6d6d5ca5e0..5ae2a34b50bd 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -284,15 +285,16 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * For exceptions from EL0, create a final frame record. - * For exceptions from EL1, create a synthetic frame record so the - * interrupted code shows up in the backtrace. + * Create a metadata frame record. The unwinder will use this to + * identify and unwind exception boundaries. 
*/ - .if \el == 0 stp xzr, xzr, [sp, #S_STACKFRAME] + .if \el == 0 + mov x0, #FRAME_META_TYPE_FINAL .else - stp x29, x22, [sp, #S_STACKFRAME] + mov x0, #FRAME_META_TYPE_PT_REGS .endif + str x0, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index cb68adcabe07..5ab1970ee543 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -199,6 +200,8 @@ SYM_CODE_END(preserve_boot_args) sub sp, sp, #PT_REGS_SIZE stp xzr, xzr, [sp, #S_STACKFRAME] + mov \tmp1, #FRAME_META_TYPE_FINAL + str \tmp1, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME scs_load_current diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index d45fd114eac3..29904c829de2 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -409,6 +409,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; + childregs->stackframe.type = FRAME_META_TYPE_FINAL; p->thread.cpu_context.x19 = (unsigned long)args->fn; p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index f8e231683dad..caef85462acb 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -26,6 +26,7 @@ enum kunwind_source { KUNWIND_SOURCE_CALLER, KUNWIND_SOURCE_TASK, KUNWIND_SOURCE_REGS_PC, + KUNWIND_SOURCE_REGS_LR, }; union unwind_flags { @@ -55,6 +56,7 @@ struct kunwind_state { #endif enum kunwind_source source; union unwind_flags flags; + struct pt_regs *regs; }; static __always_inline void @@ -65,6 +67,7 @@ kunwind_init(struct kunwind_state *state, state->task = task; state->source = KUNWIND_SOURCE_UNKNOWN; state->flags.all = 0; + state->regs = NULL; } /* @@ -80,6 +83,7 @@ kunwind_init_from_regs(struct kunwind_state *state, { kunwind_init(state, current); + state->regs = regs; state->common.fp = regs->regs[29]; state->common.pc = regs->pc; state->source = KUNWIND_SOURCE_REGS_PC; @@ -155,6 +159,103 @@ kunwind_recover_return_address(struct kunwind_state *state) return 0; } +static __always_inline +int kunwind_next_regs_pc(struct kunwind_state *state) +{ + struct stack_info *info; + unsigned long fp = state->common.fp; + struct pt_regs *regs; + + regs = container_of((u64 *)fp, struct pt_regs, stackframe.record.fp); + + info = unwind_find_stack(&state->common, (unsigned long)regs, sizeof(*regs)); + if (!info) + return -EINVAL; + + unwind_consume_stack(&state->common, info, (unsigned long)regs, + sizeof(*regs)); + + state->regs = regs; + state->common.pc = regs->pc; + state->common.fp = regs->regs[29]; + state->source = KUNWIND_SOURCE_REGS_PC; + return 0; +} + +static __always_inline int +kunwind_next_regs_lr(struct kunwind_state *state) +{ + /* + * The stack for the regs was consumed by kunwind_next_regs_pc(), so we + * cannot consume that again here, but we know the regs are safe to + * access. 
+ */ + state->common.pc = state->regs->regs[30]; + state->common.fp = state->regs->regs[29]; + state->regs = NULL; + state->source = KUNWIND_SOURCE_REGS_LR; + + return 0; +} + +static __always_inline int +kunwind_next_frame_record_meta(struct kunwind_state *state) +{ + struct task_struct *tsk = state->task; + unsigned long fp = state->common.fp; + struct frame_record_meta *meta; + struct stack_info *info; + + info = unwind_find_stack(&state->common, fp, sizeof(*meta)); + if (!info) + return -EINVAL; + + meta = (struct frame_record_meta *)fp; + switch (READ_ONCE(meta->type)) { + case FRAME_META_TYPE_FINAL: + if (meta == &task_pt_regs(tsk)->stackframe) + return -ENOENT; + WARN_ON_ONCE(1); + return -EINVAL; + case FRAME_META_TYPE_PT_REGS: + return kunwind_next_regs_pc(state); + default: + WARN_ON_ONCE(1); + return -EINVAL; + } +} + +static __always_inline int +kunwind_next_frame_record(struct kunwind_state *state) +{ + unsigned long fp = state->common.fp; + struct frame_record *record; + struct stack_info *info; + unsigned long new_fp, new_pc; + + if (fp & 0x7) + return -EINVAL; + + info = unwind_find_stack(&state->common, fp, sizeof(*record)); + if (!info) + return -EINVAL; + + record = (struct frame_record *)fp; + new_fp = READ_ONCE(record->fp); + new_pc = READ_ONCE(record->lr); + + if (!new_fp && !new_pc) + return kunwind_next_frame_record_meta(state); + + unwind_consume_stack(&state->common, info, fp, sizeof(*record)); + + state->common.fp = new_fp; + state->common.pc = new_pc; + state->source = KUNWIND_SOURCE_FRAME; + + return 0; +} + /* * Unwind from one frame record (A) to the next frame record (B). * @@ -165,30 +266,27 @@ kunwind_recover_return_address(struct kunwind_state *state) static __always_inline int kunwind_next(struct kunwind_state *state) { - struct task_struct *tsk = state->task; - unsigned long fp = state->common.fp; int err; state->flags.all = 0; - /* Final frame; nothing to unwind */ - if (fp == (unsigned long)&task_pt_regs(tsk)->stackframe) - return -ENOENT; - switch (state->source) { case KUNWIND_SOURCE_FRAME: case KUNWIND_SOURCE_CALLER: case KUNWIND_SOURCE_TASK: + case KUNWIND_SOURCE_REGS_LR: + err = kunwind_next_frame_record(state); + break; case KUNWIND_SOURCE_REGS_PC: - err = unwind_next_frame_record(&state->common); - if (err) - return err; - state->source = KUNWIND_SOURCE_FRAME; + err = kunwind_next_regs_lr(state); break; default: - return -EINVAL; + err = -EINVAL; } + if (err) + return err; + state->common.pc = ptrauth_strip_kernel_insn_pac(state->common.pc); return kunwind_recover_return_address(state); @@ -338,6 +436,7 @@ static const char *state_source_string(const struct kunwind_state *state) case KUNWIND_SOURCE_CALLER: return "C"; case KUNWIND_SOURCE_TASK: return "T"; case KUNWIND_SOURCE_REGS_PC: return "P"; + case KUNWIND_SOURCE_REGS_LR: return "L"; default: return "U"; } } -- cgit v1.2.3 From ddadbcdaaed5c3c44cc6c36093f6bf02d942d71d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 4 Oct 2024 21:26:30 +0100 Subject: arm64: Support AT_HWCAP3 We have filled all 64 bits of AT_HWCAP2 so in order to support discovery of further features provide the framework to use the already defined AT_HWCAP3 for further CPU features. 
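For userspace, probing follows the same pattern as AT_HWCAP/AT_HWCAP2. A minimal sketch (AT_HWCAP3 is defined in include/uapi/linux/auxvec.h; HWCAP3_FOO is a hypothetical flag, since no HWCAP3 bits are allocated at this point in the series):

	#include <stdbool.h>
	#include <sys/auxv.h>

	#ifndef AT_HWCAP3
	#define AT_HWCAP3	29	/* from include/uapi/linux/auxvec.h */
	#endif

	#define HWCAP3_FOO	(1UL << 0)	/* hypothetical example flag */

	static bool foo_is_present(void)
	{
		/* getauxval() returns 0 when the entry is absent (older kernels). */
		return getauxval(AT_HWCAP3) & HWCAP3_FOO;
	}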
Signed-off-by: Mark Brown Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241004-arm64-elf-hwcap3-v2-2-799d1daad8b0@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/elf_hwcaps.rst | 6 +++--- arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/include/asm/hwcap.h | 6 +++++- arch/arm64/include/uapi/asm/hwcap.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 6 ++++++ 5 files changed, 20 insertions(+), 5 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index 694f67fa07d1..dc1b11d6d112 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -16,9 +16,9 @@ architected discovery mechanism available to userspace code at EL0. The kernel exposes the presence of these features to userspace through a set of flags called hwcaps, exposed in the auxiliary vector. -Userspace software can test for features by acquiring the AT_HWCAP or -AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant -flags are set, e.g.:: +Userspace software can test for features by acquiring the AT_HWCAP, +AT_HWCAP2 or AT_HWCAP3 entry of the auxiliary vector, and testing +whether the relevant flags are set, e.g.:: bool floating_point_is_present(void) { diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d261cc123c1..38e7d1a44ea3 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -12,7 +12,7 @@ #include #include -#define MAX_CPU_FEATURES 128 +#define MAX_CPU_FEATURES 192 #define cpu_feature(x) KERNEL_HWCAP_ ## x #define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0 @@ -438,6 +438,7 @@ void cpu_set_feature(unsigned int num); bool cpu_have_feature(unsigned int num); unsigned long cpu_get_elf_hwcap(void); unsigned long cpu_get_elf_hwcap2(void); +unsigned long cpu_get_elf_hwcap3(void); #define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name)) #define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name)) diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index a775adddecf2..3b5c50df419e 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -159,17 +159,21 @@ #define KERNEL_HWCAP_SME_SF8DP2 __khwcap2_feature(SME_SF8DP2) #define KERNEL_HWCAP_POE __khwcap2_feature(POE) +#define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128) + /* * This yields a mask that user programs can use to figure out what * instruction set this cpu supports. 
*/ #define ELF_HWCAP cpu_get_elf_hwcap() #define ELF_HWCAP2 cpu_get_elf_hwcap2() +#define ELF_HWCAP3 cpu_get_elf_hwcap3() #ifdef CONFIG_COMPAT #define COMPAT_ELF_HWCAP (compat_elf_hwcap) #define COMPAT_ELF_HWCAP2 (compat_elf_hwcap2) -extern unsigned int compat_elf_hwcap, compat_elf_hwcap2; +#define COMPAT_ELF_HWCAP3 (compat_elf_hwcap3) +extern unsigned int compat_elf_hwcap, compat_elf_hwcap2, compat_elf_hwcap3; #endif enum { diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 055381b2c615..3dd4a53a438a 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -124,4 +124,8 @@ #define HWCAP2_SME_SF8DP2 (1UL << 62) #define HWCAP2_POE (1UL << 63) +/* + * HWCAP3 flags - for AT_HWCAP3 + */ + #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..7221636b7907 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -103,6 +103,7 @@ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; COMPAT_HWCAP_LPAE) unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT; unsigned int compat_elf_hwcap2 __read_mostly; +unsigned int compat_elf_hwcap3 __read_mostly; #endif DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS); @@ -3499,6 +3500,11 @@ unsigned long cpu_get_elf_hwcap2(void) return elf_hwcap[1]; } +unsigned long cpu_get_elf_hwcap3(void) +{ + return elf_hwcap[2]; +} + static void __init setup_boot_cpu_capabilities(void) { /* -- cgit v1.2.3 From c077711f718be7cebcc8b987eac2ebfd17447e9f Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:25 +0100 Subject: arm64: Detect if in a realm and set RIPAS RAM Detect that the VM is a realm guest by the presence of the RSI interface. This is done after PSCI has been initialised so that we can check the SMCCC conduit before making any RSI calls. If in a realm then iterate over all memory ensuring that it is marked as RIPAS RAM. The loader is required to do this for us, however if some memory is missed this will cause the guest to receive a hard to debug external abort at some random point in the future. So for a belt-and-braces approach set all memory to RIPAS RAM. Any failure here implies that the RAM regions passed to Linux are incorrect so panic() promptly to make the situation clear. Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Co-developed-by: Steven Price Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-3-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/rsi.h | 66 ++++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/Makefile | 3 +- arch/arm64/kernel/rsi.c | 76 ++++++++++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 3 ++ 4 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/rsi.h create mode 100644 arch/arm64/kernel/rsi.c (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h new file mode 100644 index 000000000000..acba065eb00e --- /dev/null +++ b/arch/arm64/include/asm/rsi.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 ARM Ltd. 
+ */ + +#ifndef __ASM_RSI_H_ +#define __ASM_RSI_H_ + +#include +#include +#include + +DECLARE_STATIC_KEY_FALSE(rsi_present); + +void __init arm64_rsi_init(void); + +static inline bool is_realm_world(void) +{ + return static_branch_unlikely(&rsi_present); +} + +static inline int rsi_set_memory_range(phys_addr_t start, phys_addr_t end, + enum ripas state, unsigned long flags) +{ + unsigned long ret; + phys_addr_t top; + + while (start != end) { + ret = rsi_set_addr_range_state(start, end, state, flags, &top); + if (ret || top < start || top > end) + return -EINVAL; + start = top; + } + + return 0; +} + +/* + * Convert the specified range to RAM. Do not use this if you rely on the + * contents of a page that may already be in RAM state. + */ +static inline int rsi_set_memory_range_protected(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_RAM, + RSI_CHANGE_DESTROYED); +} + +/* + * Convert the specified range to RAM. Do not convert any pages that may have + * been DESTROYED, without our permission. + */ +static inline int rsi_set_memory_range_protected_safe(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_RAM, + RSI_NO_CHANGE_DESTROYED); +} + +static inline int rsi_set_memory_range_shared(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_EMPTY, + RSI_CHANGE_DESTROYED); +} +#endif /* __ASM_RSI_H_ */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 2b112f3b7510..71c29a2a2f19 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -33,7 +33,8 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o proton-pack.o idle.o patching.o pi/ + syscall.o proton-pack.o idle.o patching.o pi/ \ + rsi.o obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c new file mode 100644 index 000000000000..c5758317dfed --- /dev/null +++ b/arch/arm64/kernel/rsi.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE_RO(rsi_present); +EXPORT_SYMBOL(rsi_present); + +static bool rsi_version_matches(void) +{ + unsigned long ver_lower, ver_higher; + unsigned long ret = rsi_request_version(RSI_ABI_VERSION, + &ver_lower, + &ver_higher); + + if (ret == SMCCC_RET_NOT_SUPPORTED) + return false; + + if (ret != RSI_SUCCESS) { + pr_err("RME: RMM doesn't support RSI version %lu.%lu. Supported range: %lu.%lu-%lu.%lu\n", + RSI_ABI_VERSION_MAJOR, RSI_ABI_VERSION_MINOR, + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower), + RSI_ABI_VERSION_GET_MAJOR(ver_higher), + RSI_ABI_VERSION_GET_MINOR(ver_higher)); + return false; + } + + pr_info("RME: Using RSI version %lu.%lu\n", + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower)); + + return true; +} + +static void __init arm64_rsi_setup_memory(void) +{ + u64 i; + phys_addr_t start, end; + + /* + * Iterate over the available memory ranges and convert the state to + * protected memory. We should take extra care to ensure that we DO NOT + * permit any "DESTROYED" pages to be converted to "RAM". 
+ * + * panic() is used because if the attempt to switch the memory to + * protected has failed here, then future accesses to the memory are + * simply going to be reflected as a SEA (Synchronous External Abort) + * which we can't handle. Bailing out early prevents the guest from + * limping on and dying later. + */ + for_each_mem_range(i, &start, &end) { + if (rsi_set_memory_range_protected_safe(start, end)) { + panic("Failed to set memory range to protected: %pa-%pa", + &start, &end); + } + } +} + +void __init arm64_rsi_init(void) +{ + if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) + return; + if (!rsi_version_matches()) + return; + + arm64_rsi_setup_memory(); + + static_branch_enable(&rsi_present); +} + diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index b22d28ec8028..b5e1e306fa51 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -351,6 +352,8 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) else psci_acpi_init(); + arm64_rsi_init(); + init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); -- cgit v1.2.3 From 399306954996be58ac20b4b29f6334e3d55a2ce7 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Thu, 17 Oct 2024 14:14:26 +0100 Subject: arm64: realm: Query IPA size from the RMM The top bit of the configured IPA size is used as an attribute to control whether the address is protected or shared. Query the configuration from the RMM to ascertain which bit this is. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-4-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-prot.h | 4 ++++ arch/arm64/kernel/rsi.c | 8 ++++++++ 2 files changed, 12 insertions(+) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2a11d0c10760..820a3b06f08c 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -68,8 +68,12 @@ #include #include +#include extern bool arm64_use_ng_mappings; +extern unsigned long prot_ns_shared; + +#define PROT_NS_SHARED (is_realm_world() ? prot_ns_shared : 0) #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index c5758317dfed..cea8f0d39591 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -8,6 +8,11 @@ #include #include +static struct realm_config config; + +unsigned long prot_ns_shared; +EXPORT_SYMBOL(prot_ns_shared); + DEFINE_STATIC_KEY_FALSE_RO(rsi_present); EXPORT_SYMBOL(rsi_present); @@ -68,6 +73,9 @@ void __init arm64_rsi_init(void) return; if (!rsi_version_matches()) return; + if (WARN_ON(rsi_get_realm_config(&config))) + return; + prot_ns_shared = BIT(config.ipa_bits - 1); arm64_rsi_setup_memory(); -- cgit v1.2.3 From 371589437616fbb03590d8ff505f8a4c95c8a031 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:27 +0100 Subject: arm64: rsi: Add support for checking whether an MMIO is protected On Arm CCA, with RMM-v1.0, all MMIO regions are shared. However, in the future, an Arm CCA-v1.0 compliant guest may be run in a lesser privileged partition in the Realm World (with Arm CCA-v1.1 Planes feature).
In this case, some of the MMIO regions may be emulated by a higher privileged component in the Realm world, i.e., protected. Thus the guest must decide, today, whether a given MMIO region is shared or protected and create the stage1 mapping accordingly. On Arm CCA, this detection is based on the "IPA State" (RIPAS == RIPAS_IO). Provide a helper to run this check on a given range of MMIO. Also, provide an arm64 helper which may be hooked in by other solutions. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-5-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/io.h | 8 ++++++++ arch/arm64/include/asm/rsi.h | 2 ++ arch/arm64/include/asm/rsi_cmds.h | 21 +++++++++++++++++++++ arch/arm64/kernel/rsi.c | 26 ++++++++++++++++++++++++++ 4 files changed, 57 insertions(+) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 1ada23a6ec19..8688343b71f2 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * Generic IO read/write. These perform native-endian accesses. @@ -318,4 +319,11 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, unsigned long flags); #define arch_memremap_can_ram_remap arch_memremap_can_ram_remap +static inline bool arm64_is_protected_mmio(phys_addr_t phys_addr, size_t size) +{ + if (unlikely(is_realm_world())) + return __arm64_is_protected_mmio(phys_addr, size); + return false; +} + #endif /* __ASM_IO_H */ diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h index acba065eb00e..188cbb9b23f5 100644 --- a/arch/arm64/include/asm/rsi.h +++ b/arch/arm64/include/asm/rsi.h @@ -14,6 +14,8 @@ DECLARE_STATIC_KEY_FALSE(rsi_present); void __init arm64_rsi_init(void); +bool __arm64_is_protected_mmio(phys_addr_t base, size_t size); + static inline bool is_realm_world(void) { return static_branch_unlikely(&rsi_present); } diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h index 2fcf351b5634..e6a211001bd3 100644 --- a/arch/arm64/include/asm/rsi_cmds.h +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -45,6 +45,27 @@ static inline unsigned long rsi_get_realm_config(struct realm_config *cfg) return res.a0; } +static inline unsigned long rsi_ipa_state_get(phys_addr_t start, + phys_addr_t end, + enum ripas *state, + phys_addr_t *top) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_IPA_STATE_GET, + start, end, 0, 0, 0, 0, 0, + &res); + + if (res.a0 == RSI_SUCCESS) { + if (top) + *top = res.a1; + if (state) + *state = res.a2; + } + + return res.a0; +} + static inline long rsi_set_addr_range_state(phys_addr_t start, phys_addr_t end, enum ripas state, diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index cea8f0d39591..7e7934c4fca0 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -67,6 +67,32 @@ static void __init arm64_rsi_setup_memory(void) } } +bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) +{ + enum ripas ripas; + phys_addr_t end, top; + + /* Overflow ?
*/ + if (WARN_ON(base + size <= base)) + return false; + + end = ALIGN(base + size, RSI_GRANULE_SIZE); + base = ALIGN_DOWN(base, RSI_GRANULE_SIZE); + + while (base < end) { + if (WARN_ON(rsi_ipa_state_get(base, end, &ripas, &top))) + break; + if (WARN_ON(top <= base)) + break; + if (ripas != RSI_RIPAS_DEV) + break; + base = top; + } + + return base >= end; +} +EXPORT_SYMBOL(__arm64_is_protected_mmio); + void __init arm64_rsi_init(void) { if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) -- cgit v1.2.3 From 3c6c706139564f74ec48229378873c1d930a8bc8 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:28 +0100 Subject: arm64: rsi: Map unprotected MMIO as decrypted Instead of marking every MMIO as shared, check if the given region is "Protected" and apply the permissions accordingly. Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-6-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/rsi.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index 7e7934c4fca0..3e0c83e2296f 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -6,6 +6,8 @@ #include #include #include + +#include #include static struct realm_config config; @@ -93,6 +95,16 @@ bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) } EXPORT_SYMBOL(__arm64_is_protected_mmio); +static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) +{ + if (__arm64_is_protected_mmio(phys, size)) + *prot = pgprot_encrypted(*prot); + else + *prot = pgprot_decrypted(*prot); + + return 0; +} + void __init arm64_rsi_init(void) { if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) @@ -103,6 +115,9 @@ void __init arm64_rsi_init(void) return; prot_ns_shared = BIT(config.ipa_bits - 1); + if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) + return; + arm64_rsi_setup_memory(); static_branch_enable(&rsi_present); -- cgit v1.2.3 From 491db21d8256992ab9fe11c42744eb3044315d14 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:29 +0100 Subject: efi: arm64: Map Device with Prot Shared Device mappings need to be emulated by the VMM so must be mapped shared with the host. 
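The decision this enables is small but important; roughly (a sketch of the pattern, mirroring the efi.c hunk below; the helper name here is illustrative):

	/*
	 * Sketch: pick MMIO attributes based on whether the region is
	 * emulated inside the realm (protected) or by the host VMM (shared).
	 */
	static pgprot_t mmio_prot(phys_addr_t phys, size_t size)
	{
		pgprot_t prot = __pgprot(PROT_DEVICE_nGnRE);

		if (arm64_is_protected_mmio(phys, size))
			return pgprot_encrypted(prot);

		return pgprot_decrypted(prot);
	}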
Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-7-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/efi.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 712718aed5dd..1d25d8899dbf 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -34,8 +34,16 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) u64 attr = md->attribute; u32 type = md->type; - if (type == EFI_MEMORY_MAPPED_IO) - return PROT_DEVICE_nGnRE; + if (type == EFI_MEMORY_MAPPED_IO) { + pgprot_t prot = __pgprot(PROT_DEVICE_nGnRE); + + if (arm64_is_protected_mmio(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT)) + prot = pgprot_encrypted(prot); + else + prot = pgprot_decrypted(prot); + return pgprot_val(prot); + } if (region_is_misaligned(md)) { static bool __initdata code_is_misaligned; -- cgit v1.2.3 From fbf979a01375704fa87c559763209c658593b6f8 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Thu, 17 Oct 2024 14:14:30 +0100 Subject: arm64: Enforce bounce buffers for realm DMA Within a realm guest it's not possible for a device emulated by the VMM to access arbitrary guest memory. So force the use of bounce buffers to ensure that the memory the emulated devices are accessing is in memory which is explicitly shared with the host. This adds a call to swiotlb_update_mem_attributes() which calls set_memory_decrypted() to ensure the bounce buffer memory is shared with the host. For non-realm guests or hosts this is a no-op. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-8-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/rsi.c | 1 + arch/arm64/mm/init.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index 3e0c83e2296f..a23c0a7154d2 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 27a32ff15412..d21f67d67cf5 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -366,8 +367,14 @@ void __init bootmem_init(void) */ void __init mem_init(void) { + unsigned int flags = SWIOTLB_VERBOSE; bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit); + if (is_realm_world()) { + swiotlb = true; + flags |= SWIOTLB_FORCE; + } + if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) { /* * If no bouncing needed for ZONE_DMA, reduce the swiotlb @@ -379,7 +386,8 @@ void __init mem_init(void) swiotlb = true; } - swiotlb_init(swiotlb, SWIOTLB_VERBOSE); + swiotlb_init(swiotlb, flags); + swiotlb_update_mem_attributes(); /* this will put all unused low memory onto the freelists */ memblock_free_all(); -- cgit v1.2.3 From 42be24a4178fe51e6f47d91d8621b2f53820f88b Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:32 +0100 Subject: arm64: Enable memory encrypt for Realms Use the memory encryption APIs to trigger a RSI call to request a transition between protected memory and shared memory (or vice versa) 
and update the kernel's linear map of modified pages to flip the top bit of the IPA. This requires that block mappings are not used in the direct map for realm guests. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: Suzuki K Poulose Co-developed-by: Steven Price Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-10-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 3 ++ arch/arm64/include/asm/mem_encrypt.h | 9 ++++ arch/arm64/include/asm/pgtable.h | 5 ++ arch/arm64/include/asm/set_memory.h | 3 ++ arch/arm64/kernel/rsi.c | 16 +++++++ arch/arm64/mm/pageattr.c | 90 ++++++++++++++++++++++++++++++++++-- 6 files changed, 123 insertions(+), 3 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3e29b44d2d7b..ccea9c22d6df 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -21,6 +21,7 @@ config ARM64 select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE + select ARCH_HAS_CC_PLATFORM select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE @@ -44,6 +45,8 @@ config ARM64 select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY + select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED select ARCH_STACKWALK select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h index b0c9a86b13a4..f8f78f622dd2 100644 --- a/arch/arm64/include/asm/mem_encrypt.h +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -2,6 +2,8 @@ #ifndef __ASM_MEM_ENCRYPT_H #define __ASM_MEM_ENCRYPT_H +#include + struct arm64_mem_crypt_ops { int (*encrypt)(unsigned long addr, int numpages); int (*decrypt)(unsigned long addr, int numpages); @@ -12,4 +14,11 @@ int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); +int realm_register_memory_enc_ops(void); + +static inline bool force_dma_unencrypted(struct device *dev) +{ + return is_realm_world(); +} + #endif /* __ASM_MEM_ENCRYPT_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c329ea061dc9..7e4bdc8259a2 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -684,6 +684,11 @@ static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, #define pgprot_nx(prot) \ __pgprot_modify(prot, PTE_MAYBE_GP, PTE_PXN) +#define pgprot_decrypted(prot) \ + __pgprot_modify(prot, PROT_NS_SHARED, PROT_NS_SHARED) +#define pgprot_encrypted(prot) \ + __pgprot_modify(prot, PROT_NS_SHARED, 0) + /* * Mark the prot value as uncacheable and unbufferable.
*/ diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 917761feeffd..37774c793006 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -15,4 +15,7 @@ int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); bool kernel_page_present(struct page *page); +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + #endif /* _ASM_ARM64_SET_MEMORY_H */ diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index a23c0a7154d2..3031f25c32ef 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -7,8 +7,10 @@ #include #include #include +#include #include +#include #include static struct realm_config config; @@ -19,6 +21,17 @@ EXPORT_SYMBOL(prot_ns_shared); DEFINE_STATIC_KEY_FALSE_RO(rsi_present); EXPORT_SYMBOL(rsi_present); +bool cc_platform_has(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return is_realm_world(); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(cc_platform_has); + static bool rsi_version_matches(void) { unsigned long ver_lower, ver_higher; @@ -119,6 +132,9 @@ void __init arm64_rsi_init(void) if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) return; + if (realm_register_memory_enc_ops()) + return; + arm64_rsi_setup_memory(); static_branch_enable(&rsi_present); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 547a9e0b46c2..6ae6ae806454 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -5,10 +5,12 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -23,14 +25,16 @@ bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED bool can_set_direct_map(void) { /* - * rodata_full and DEBUG_PAGEALLOC require linear map to be - * mapped at page granularity, so that it is possible to + * rodata_full, DEBUG_PAGEALLOC and a Realm guest all require linear + * map to be mapped at page granularity, so that it is possible to * protect/unprotect single pages. * * KFENCE pool requires page-granular mapping if initialized late. + * + * Realms need to make pages shared/protected at page granularity. 
*/ return rodata_full || debug_pagealloc_enabled() || - arm64_kfence_can_set_direct_map(); + arm64_kfence_can_set_direct_map() || is_realm_world(); } static int change_page_range(pte_t *ptep, unsigned long addr, void *data) @@ -198,6 +202,86 @@ int set_direct_map_default_noflush(struct page *page) PAGE_SIZE, change_page_range, &data); } +static int __set_memory_enc_dec(unsigned long addr, + int numpages, + bool encrypt) +{ + unsigned long set_prot = 0, clear_prot = 0; + phys_addr_t start, end; + int ret; + + if (!is_realm_world()) + return 0; + + if (!__is_lm_address(addr)) + return -EINVAL; + + start = __virt_to_phys(addr); + end = start + numpages * PAGE_SIZE; + + if (encrypt) + clear_prot = PROT_NS_SHARED; + else + set_prot = PROT_NS_SHARED; + + /* + * Break the mapping before we make any changes to avoid stale TLB + * entries or Synchronous External Aborts caused by RIPAS_EMPTY + */ + ret = __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(set_prot), + __pgprot(clear_prot | PTE_VALID)); + + if (ret) + return ret; + + if (encrypt) + ret = rsi_set_memory_range_protected(start, end); + else + ret = rsi_set_memory_range_shared(start, end); + + if (ret) + return ret; + + return __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(PTE_VALID), + __pgprot(0)); +} + +static int realm_set_memory_encrypted(unsigned long addr, int numpages) +{ + int ret = __set_memory_enc_dec(addr, numpages, true); + + /* + * If the request to change state fails, then the only sensible course + * of action for the caller is to leak the memory + */ + WARN(ret, "Failed to encrypt memory, %d pages will be leaked", + numpages); + + return ret; +} + +static int realm_set_memory_decrypted(unsigned long addr, int numpages) +{ + int ret = __set_memory_enc_dec(addr, numpages, false); + + WARN(ret, "Failed to decrypt memory, %d pages will be leaked", + numpages); + + return ret; +} + +static const struct arm64_mem_crypt_ops realm_crypt_ops = { + .encrypt = realm_set_memory_encrypted, + .decrypt = realm_set_memory_decrypted, +}; + +int realm_register_memory_enc_ops(void) +{ + return arm64_mem_crypt_ops_register(&realm_crypt_ops); +} + #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { -- cgit v1.2.3 From 358dd4a9bdac63a0a8fb13773bfce6f599e25433 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 21 Oct 2024 19:14:34 +0100 Subject: arm64: Add command-line override for ID_AA64MMFR0_EL1.ECV It appears that relatively popular hardware out there implements the CNTPOFF_EL2 variant of FEAT_ECV, advertises it via ID_AA64MMFR0_EL1, but cannot be bothered to set SCR_EL3.ECVEn to 1. You would probably think that "this is fine, EL3 will take the trap on access to CNTPOFF_EL2 and flip the ECVEn bit", as that's what a semi-decent firmware implementation would do. But no. None of that. This particular implementation takes the trap, considers its purpose in life, decides that it has none, and *RESETS* the system. Yes, x1e001de, I'm talking about you. In order to allow this machine to be promoted slightly above the level of a glorified door-stop, add a new "id_aa64mmfr0.ecv" override, allowing the kernel to pretend this option was never there.
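On an affected machine the override is applied from the kernel command line, using the same <register>.<field>=<value> scheme as the existing id-register overrides (the field name matches the FIELD() entry added below):

	id_aa64mmfr0.ecv=0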
Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241021181434.1052974-1-maz@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/idreg-override.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index 29d4b6244a6f..fbc37b2733e2 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -38,6 +38,15 @@ struct ftr_set_desc { #define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f } +static const struct ftr_set_desc mmfr0 __prel64_initconst = { + .name = "id_aa64mmfr0", + .override = &id_aa64mmfr0_override, + .fields = { + FIELD("ecv", ID_AA64MMFR0_EL1_ECV_SHIFT, NULL), + {} + }, +}; + static bool __init mmfr1_vh_filter(u64 val) { /* @@ -196,6 +205,7 @@ static const struct ftr_set_desc sw_features __prel64_initconst = { static const PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = { + { &mmfr0 }, { &mmfr1 }, { &mmfr2 }, { &pfr0 }, -- cgit v1.2.3 From 9a0e3b92b02e7a921b22f4d00d87e3506d06b92a Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Thu, 24 Oct 2024 03:41:20 +0000 Subject: arm64: Return early when break handler is found on linked-list The search for breakpoint handlers iterates through the entire linked list. Given that every registered hook has a valid fn field and no two registered hooks share the same mask and imm, this commit optimizes the lookup slightly by returning early once a matching handler is found. Signed-off-by: Liao Chang Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241024034120.3814224-1-liaochang1@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/debug-monitors.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 024a7b245056..4713a4c65b1b 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -303,7 +303,6 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr) { struct break_hook *hook; struct list_head *list; - int (*fn)(struct pt_regs *regs, unsigned long esr) = NULL; list = user_mode(regs) ? &user_break_hook : &kernel_break_hook; @@ -313,10 +312,10 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr) */ list_for_each_entry_rcu(hook, list, node) { if ((esr_brk_comment(esr) & ~hook->mask) == hook->imm) - fn = hook->fn; + return hook->fn(regs, esr); } - return fn ? fn(regs, esr) : DBG_HOOK_ERROR; + return DBG_HOOK_ERROR; } NOKPROBE_SYMBOL(call_break_hook); -- cgit v1.2.3 From 2287a4c1e11822d05a70d22f28b26bd810dd204e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 31 Oct 2024 08:35:19 +0000 Subject: arm64: Expose ID_AA64ISAR1_EL1.XS to sanitised feature consumers Despite KVM now being able to deal with XS-tagged TLBIs, we still don't expose these feature bits to KVM. Plumb in the feature in ID_AA64ISAR1_EL1.
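For context: FTR_HIDDEN keeps the field out of the userspace-visible view while still feeding the sanitised value to in-kernel consumers such as KVM, and FTR_LOWER_SAFE means the sanitised system-wide value is the lowest value any CPU reports. A rough sketch of the lower-safe rule for an unsigned 4-bit field (illustrative only; the real logic lives in arm64_ftr_safe_value() and also covers signed and FTR_EXACT fields):

	/* Sketch: sanitise one unsigned 4-bit ID field across two CPUs. */
	static u64 lower_safe_field(u64 reg_a, u64 reg_b, unsigned int shift)
	{
		u64 a = (reg_a >> shift) & 0xf;
		u64 b = (reg_b >> shift) & 0xf;

		return a < b ? a : b;	/* no CPU advertises more than it has */
	}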
Fixes: 0feec7769a63 ("KVM: arm64: nv: Add handling of NXS-flavoured TLBI operations") Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20241031083519.364313-1-maz@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..db994d1fd97e 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -228,6 +228,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0), -- cgit v1.2.3 From 525fd6a1b34ea8ce42e12db38380eed0efefb5d5 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 2 Nov 2024 10:31:54 +0100 Subject: arm64/fpsimd: Fix a typo s/FPSMID/FPSIMD/ M and I swapped. Fix it. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/2cbcb42615e9265bccc9b746465d7998382e605d.1730539907.git.christophe.jaillet@wanadoo.fr Signed-off-by: Catalin Marinas --- arch/arm64/kernel/fpsimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 77006df20a75..cd7d71fe1fda 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -386,7 +386,7 @@ static void task_fpsimd_load(void) * fpsimd_save_user_state() or memory corruption, we * should always record an explicit format * when we save. We always at least have the - * memory allocated for FPSMID registers so + * memory allocated for FPSIMD registers so * try that and hope for the best. */ WARN_ON_ONCE(1); -- cgit v1.2.3 From efe72541355d4d40a4f076af453f6533e98e058c Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 2 Nov 2024 18:42:33 +0800 Subject: arm64: Add support for FEAT_HAFT Armv8.9/v9.4 introduces the feature Hardware managed Access Flag for Table descriptors (FEAT_HAFT). The feature is indicated by ID_AA64MMFR1_EL1.HAFDBS == 0b0011 and can be enabled by TCR2_EL1.HAFT, so it has a dependency on FEAT_TCR2. Add the Kconfig for FEAT_HAFT and support for detecting and enabling the feature. The feature is enabled in __cpu_setup() before the MMU is on, just like HA. A CPU capability is added to notify the user of the feature. Add definitions for the P{G,4,U,M}D_TABLE_AF bits and set the AF bit when creating the page table, which saves the hardware from having to update them at runtime. This will be ignored if FEAT_HAFT is not enabled. Per the spec, and unlike HA, the AF bit of table descriptors cannot be managed by software. So this should be used only if it's supported system wide, as indicated by system_supports_haft().
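Any consumer of the table-descriptor AF must therefore gate on the system-wide capability; a minimal sketch (this helper is hypothetical, not part of the patch):

	/* Sketch: only interpret a table descriptor's AF under FEAT_HAFT. */
	static bool pmd_table_accessed(pmd_t pmd)
	{
		/* Without FEAT_HAFT the bit is ignored by hardware. */
		if (!system_supports_haft())
			return true;	/* unknown: conservatively assume accessed */

		return pmd_val(pmd) & PMD_TABLE_AF;
	}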
Signed-off-by: Yicong Yang Link: https://lore.kernel.org/r/20241102104235.62560-4-yangyicong@huawei.com Reviewed-by: Catalin Marinas [catalin.marinas@arm.com: added the ID check back to __cpu_setup in case of future CPU errata] Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 15 +++++++++++++++ arch/arm64/include/asm/cpufeature.h | 6 ++++++ arch/arm64/include/asm/pgalloc.h | 12 +++++++----- arch/arm64/include/asm/pgtable-hwdef.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 15 +++++++++++++++ arch/arm64/mm/fixmap.c | 9 ++++++--- arch/arm64/mm/mmu.c | 8 ++++---- arch/arm64/mm/proc.S | 7 ++++++- arch/arm64/tools/cpucaps | 1 + 9 files changed, 64 insertions(+), 13 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3e29b44d2d7b..b90f92f89d54 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2176,6 +2176,21 @@ config ARCH_PKEY_BITS int default 3 +config ARM64_HAFT + bool "Support for Hardware managed Access Flag for Table Descriptors" + depends on ARM64_HW_AFDBM + default y + help + The ARMv8.9/ARMv9.5 introduces the feature Hardware managed Access + Flag for Table descriptors. When enabled an architectural executed + memory access will update the Access Flag in each Table descriptor + which is accessed during the translation table walk and for which + the Access Flag is 0. The Access Flag of the Table descriptor use + the same bit of PTE_AF. + + The feature will only be enabled if all the CPUs in the system + support this feature. If unsure, say Y. + endmenu # "ARMv8.9 architectural features" config ARM64_SVE diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d261cc123c1..ed8c784ca082 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -838,6 +838,12 @@ static inline bool system_supports_poe(void) alternative_has_cap_unlikely(ARM64_HAS_S1POE); } +static inline bool system_supports_haft(void) +{ + return IS_ENABLED(CONFIG_ARM64_HAFT) && + cpus_have_final_cap(ARM64_HAFT); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 8ff5f2a2579e..e75422864d1b 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -28,7 +28,7 @@ static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot) static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp) { - pudval_t pudval = PUD_TYPE_TABLE; + pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_AF; pudval |= (mm == &init_mm) ? PUD_TABLE_UXN : PUD_TABLE_PXN; __pud_populate(pudp, __pa(pmdp), pudval); @@ -50,7 +50,7 @@ static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) { - p4dval_t p4dval = P4D_TYPE_TABLE; + p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_AF; p4dval |= (mm == &init_mm) ? P4D_TABLE_UXN : P4D_TABLE_PXN; __p4d_populate(p4dp, __pa(pudp), p4dval); @@ -79,7 +79,7 @@ static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot) static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp) { - pgdval_t pgdval = PGD_TYPE_TABLE; + pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_AF; pgdval |= (mm == &init_mm) ? 
PGD_TABLE_UXN : PGD_TABLE_PXN; __pgd_populate(pgdp, __pa(p4dp), pgdval); @@ -127,14 +127,16 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) { VM_BUG_ON(mm && mm != &init_mm); - __pmd_populate(pmdp, __pa(ptep), PMD_TYPE_TABLE | PMD_TABLE_UXN); + __pmd_populate(pmdp, __pa(ptep), + PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_UXN); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) { VM_BUG_ON(mm == &init_mm); - __pmd_populate(pmdp, page_to_phys(ptep), PMD_TYPE_TABLE | PMD_TABLE_PXN); + __pmd_populate(pmdp, page_to_phys(ptep), + PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_PXN); } #endif diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index fd330c1db289..c78a988cca93 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -99,6 +99,7 @@ #define PGD_TYPE_TABLE (_AT(pgdval_t, 3) << 0) #define PGD_TABLE_BIT (_AT(pgdval_t, 1) << 1) #define PGD_TYPE_MASK (_AT(pgdval_t, 3) << 0) +#define PGD_TABLE_AF (_AT(pgdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ #define PGD_TABLE_PXN (_AT(pgdval_t, 1) << 59) #define PGD_TABLE_UXN (_AT(pgdval_t, 1) << 60) @@ -110,6 +111,7 @@ #define P4D_TYPE_MASK (_AT(p4dval_t, 3) << 0) #define P4D_TYPE_SECT (_AT(p4dval_t, 1) << 0) #define P4D_SECT_RDONLY (_AT(p4dval_t, 1) << 7) /* AP[2] */ +#define P4D_TABLE_AF (_AT(p4dval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ #define P4D_TABLE_PXN (_AT(p4dval_t, 1) << 59) #define P4D_TABLE_UXN (_AT(p4dval_t, 1) << 60) @@ -121,6 +123,7 @@ #define PUD_TYPE_MASK (_AT(pudval_t, 3) << 0) #define PUD_TYPE_SECT (_AT(pudval_t, 1) << 0) #define PUD_SECT_RDONLY (_AT(pudval_t, 1) << 7) /* AP[2] */ +#define PUD_TABLE_AF (_AT(pudval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ #define PUD_TABLE_PXN (_AT(pudval_t, 1) << 59) #define PUD_TABLE_UXN (_AT(pudval_t, 1) << 60) @@ -131,6 +134,7 @@ #define PMD_TYPE_TABLE (_AT(pmdval_t, 3) << 0) #define PMD_TYPE_SECT (_AT(pmdval_t, 1) << 0) #define PMD_TABLE_BIT (_AT(pmdval_t, 1) << 1) +#define PMD_TABLE_AF (_AT(pmdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ /* * Section diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..878712fa0d3b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2590,6 +2590,21 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpus = &dbm_cpus, ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM) }, +#endif +#ifdef CONFIG_ARM64_HAFT + { + .desc = "Hardware managed Access Flag for Table Descriptors", + /* + * Contrary to the page/block access flag, the table access flag + * cannot be emulated in software (no access fault will occur). + * Therefore this should be used only if it's supported system + * wide. 
+ */ + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAFT, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, HAFT) + }, #endif { .desc = "CRC32 instructions", diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index de1e09d986ad..c5c5425791da 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -47,7 +47,8 @@ static void __init early_fixmap_init_pte(pmd_t *pmdp, unsigned long addr) if (pmd_none(pmd)) { ptep = bm_pte[BM_PTE_TABLE_IDX(addr)]; - __pmd_populate(pmdp, __pa_symbol(ptep), PMD_TYPE_TABLE); + __pmd_populate(pmdp, __pa_symbol(ptep), + PMD_TYPE_TABLE | PMD_TABLE_AF); } } @@ -59,7 +60,8 @@ static void __init early_fixmap_init_pmd(pud_t *pudp, unsigned long addr, pmd_t *pmdp; if (pud_none(pud)) - __pud_populate(pudp, __pa_symbol(bm_pmd), PUD_TYPE_TABLE); + __pud_populate(pudp, __pa_symbol(bm_pmd), + PUD_TYPE_TABLE | PUD_TABLE_AF); pmdp = pmd_offset_kimg(pudp, addr); do { @@ -86,7 +88,8 @@ static void __init early_fixmap_init_pud(p4d_t *p4dp, unsigned long addr, } if (p4d_none(p4d)) - __p4d_populate(p4dp, __pa_symbol(bm_pud), P4D_TYPE_TABLE); + __p4d_populate(p4dp, __pa_symbol(bm_pud), + P4D_TYPE_TABLE | P4D_TABLE_AF); pudp = pud_offset_kimg(p4dp, addr); early_fixmap_init_pmd(pudp, addr, end); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e55b02fbddc8..6441a45eaeda 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -201,7 +201,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, BUG_ON(pmd_sect(pmd)); if (pmd_none(pmd)) { - pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN; + pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; phys_addr_t pte_phys; if (flags & NO_EXEC_MAPPINGS) @@ -288,7 +288,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, */ BUG_ON(pud_sect(pud)); if (pud_none(pud)) { - pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN; + pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; phys_addr_t pmd_phys; if (flags & NO_EXEC_MAPPINGS) @@ -333,7 +333,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, pud_t *pudp; if (p4d_none(p4d)) { - p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN; + p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF; phys_addr_t pud_phys; if (flags & NO_EXEC_MAPPINGS) @@ -391,7 +391,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, p4d_t *p4dp; if (pgd_none(pgd)) { - pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN; + pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF; phys_addr_t p4d_phys; if (flags & NO_EXEC_MAPPINGS) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index ccbae4525891..b8edc5765441 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -495,9 +495,14 @@ alternative_else_nop_endif * via capabilities. 
*/ mrs x9, ID_AA64MMFR1_EL1 - and x9, x9, ID_AA64MMFR1_EL1_HAFDBS_MASK + ubfx x9, x9, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, #4 cbz x9, 1f orr tcr, tcr, #TCR_HA // hardware Access flag update +#ifdef CONFIG_ARM64_HAFT + cmp x9, ID_AA64MMFR1_EL1_HAFDBS_HAFT + b.lt 1f + orr tcr2, tcr2, TCR2_EL1x_HAFT +#endif /* CONFIG_ARM64_HAFT */ 1: #endif /* CONFIG_ARM64_HW_AFDBM */ msr mair_el1, mair diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index eedb5acc21ed..b35004fa8313 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -56,6 +56,7 @@ HAS_TLB_RANGE HAS_VA52 HAS_VIRT_HOST_EXTN HAS_WFXT +HAFT HW_DBM KVM_HVHE KVM_PROTECTED_MODE -- cgit v1.2.3
From 340fd66c856651d8c1d29f392dd26ad674d2db0e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Nov 2024 01:18:42 +0900 Subject: arm64: fix .data.rel.ro size assertion when CONFIG_LTO_CLANG Commit be2881824ae9 ("arm64/build: Assert for unwanted sections") introduced an assertion to ensure that the .data.rel.ro section does not exist. However, this check does not work when CONFIG_LTO_CLANG is enabled, because .data.rel.ro matches the .data.[0-9a-zA-Z_]* pattern in the DATA_MAIN macro. Move the ASSERT() above the RW_DATA() line. Fixes: be2881824ae9 ("arm64/build: Assert for unwanted sections") Signed-off-by: Masahiro Yamada Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241106161843.189927-1-masahiroy@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vmlinux.lds.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 58d89d997d05..f84c71f04d9e 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -287,6 +287,9 @@ SECTIONS __initdata_end = .; __init_end = .; + .data.rel.ro : { *(.data.rel.ro) } + ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") + _data = .; _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) @@ -343,9 +346,6 @@ SECTIONS *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) } ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") - - .data.rel.ro : { *(.data.rel.ro) } - ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") } #include "image-vars.h" -- cgit v1.2.3
From bdf94836c22abc923af5ae83709c96ff4c3c77c9 Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Thu, 19 Sep 2024 12:17:19 +0000 Subject: arm64: uprobes: Optimize cache flushes for xol slot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profiling the single-thread selftests bench reveals a bottleneck in caches_clean_inval_pou() on ARM64. On my local testing machine, this function takes approximately 34% of CPU cycles for trig-uprobe-nop and trig-uprobe-push. This patch adds a check to avoid an unnecessary cache flush when writing an instruction to the xol slot. If the instruction is the same as the existing instruction in the slot, there is no need to synchronize the D/I caches. Since xol slot allocation and updates occur on the hot path of uprobe handling, this optimization shows an obvious gain for the nop and push test cases on an upstream kernel running on Kunpeng916 (Hi1616), 4 NUMA nodes, 64 cores @ 2.4GHz.
Before (next-20240918) ---------------------- uprobe-nop ( 1 cpus): 0.418 ± 0.001M/s ( 0.418M/s/cpu) uprobe-push ( 1 cpus): 0.411 ± 0.005M/s ( 0.411M/s/cpu) uprobe-ret ( 1 cpus): 2.052 ± 0.002M/s ( 2.052M/s/cpu) uretprobe-nop ( 1 cpus): 0.350 ± 0.000M/s ( 0.350M/s/cpu) uretprobe-push ( 1 cpus): 0.353 ± 0.000M/s ( 0.353M/s/cpu) uretprobe-ret ( 1 cpus): 1.074 ± 0.001M/s ( 1.074M/s/cpu) After ----- uprobe-nop ( 1 cpus): 0.926 ± 0.000M/s ( 0.926M/s/cpu) uprobe-push ( 1 cpus): 0.910 ± 0.001M/s ( 0.910M/s/cpu) uprobe-ret ( 1 cpus): 2.056 ± 0.001M/s ( 2.056M/s/cpu) uretprobe-nop ( 1 cpus): 0.653 ± 0.001M/s ( 0.653M/s/cpu) uretprobe-push ( 1 cpus): 0.645 ± 0.000M/s ( 0.645M/s/cpu) uretprobe-ret ( 1 cpus): 1.093 ± 0.001M/s ( 1.093M/s/cpu) Signed-off-by: Liao Chang Link: https://lore.kernel.org/r/20240919121719.2148361-1-liaochang1@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/probes/uprobes.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index d49aef2657cd..ba61651db4b5 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -17,12 +17,20 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *xol_page_kaddr = kmap_atomic(page); void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + /* + * Initial cache maintenance of the xol page done via set_pte_at(). + * Subsequent CMOs only needed if the xol slot changes. + */ + if (!memcmp(dst, src, len)) + goto done; + /* Initialize the slot */ memcpy(dst, src, len); /* flush caches (dcache/icache) */ sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len); +done: kunmap_atomic(xol_page_kaddr); } -- cgit v1.2.3 From ccf54058f5328e6da5ecc61c961bf68bbb7d865b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 6 Nov 2024 19:55:15 +0100 Subject: arm64/scs: Fix handling of DWARF augmentation data in CIE/FDE frames The dynamic SCS patching code pretends to parse the DWARF augmentation data in the CIE (header) frame, and handle accordingly when processing the individual FDE frames based on this CIE frame. However, the boolean variable is defined inside the loop, and so the parsed value is ignored. The same applies to the code alignment field, which is also read from the header but then discarded. This was never spotted before because Clang is the only compiler that supports dynamic SCS patching (which is essentially an Android feature), and the unwind tables it produces are highly uniform, and match the de facto defaults. So instead of testing for the 'z' flag in the augmentation data field, require a fixed augmentation data string of 'zR', and simplify the rest of the code accordingly. Also introduce some error codes to specify why the patching failed, and log it to the kernel console on failure when this happens when loading a module. (Doing so for vmlinux is infeasible, as the patching is done extremely early in the boot.) 
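As a rough sketch (not part of the patch; the field layout mirrors the struct eh_frame in the diff below, and the helper name is invented), the CIE shape the patched code now insists on can be expressed as:

#include <stdint.h>
#include <string.h>

struct cie {
	uint32_t size;
	uint32_t cie_id;		/* zero identifies a CIE */
	uint8_t  version;
	char     augmentation[3];	/* "zR" plus NUL terminator */
	uint8_t  code_alignment_factor;
	uint8_t  data_alignment_factor;
	uint8_t  return_address_register;
	uint8_t  augmentation_data_size;
};

static int cie_is_patchable(const struct cie *c)
{
	/* Reject anything but the de facto Clang-emitted shape. */
	return c->cie_id == 0 &&
	       strcmp(c->augmentation, "zR") == 0 &&
	       !(c->code_alignment_factor & 0x80) &&	/* one-byte ULEB128 */
	       !(c->data_alignment_factor & 0x80) &&
	       c->return_address_register == 30 &&	/* x30, the LR */
	       c->augmentation_data_size == 1;
}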
Signed-off-by: Ard Biesheuvel Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lore.kernel.org/r/20241106185513.3096442-6-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/scs.h | 7 +++++ arch/arm64/kernel/module.c | 10 +++++-- arch/arm64/kernel/pi/patch-scs.c | 63 ++++++++++++++++++++++------------------ 3 files changed, 50 insertions(+), 30 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index 2e010ea76be2..934e9217cd74 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -46,6 +46,13 @@ static inline void dynamic_scs_init(void) static inline void dynamic_scs_init(void) {} #endif +enum { + EDYNSCS_INVALID_CIE_HEADER = 1, + EDYNSCS_INVALID_CIE_SDATA_SIZE = 2, + EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE = 3, + EDYNSCS_INVALID_CFA_OPCODE = 4, +}; + int __pi_scs_patch(const u8 eh_frame[], int size); asmlinkage void __pi_scs_patch_vmlinux(void); diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 36b25af56324..06bb680bfe97 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -462,14 +462,20 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s; + int ret; + s = find_section(hdr, sechdrs, ".altinstructions"); if (s) apply_alternatives_module((void *)s->sh_addr, s->sh_size); if (scs_is_dynamic()) { s = find_section(hdr, sechdrs, ".init.eh_frame"); - if (s) - __pi_scs_patch((void *)s->sh_addr, s->sh_size); + if (s) { + ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size); + if (ret) + pr_err("module %s: error occurred during dynamic SCS patching (%d)\n", + me->name, ret); + } } return module_init_ftrace_plt(hdr, sechdrs, me); diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index 49d8b40e61bc..cec8f0a52bbc 100644 --- a/arch/arm64/kernel/pi/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -120,7 +120,11 @@ struct eh_frame { union { struct { // CIE u8 version; - u8 augmentation_string[]; + u8 augmentation_string[3]; + u8 code_alignment_factor; + u8 data_alignment_factor; + u8 return_address_register; + u8 augmentation_data_size; }; struct { // FDE @@ -132,25 +136,21 @@ struct eh_frame { }; static int scs_handle_fde_frame(const struct eh_frame *frame, - bool fde_has_augmentation_data, int code_alignment_factor, bool dry_run) { int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; u64 loc = (u64)offset_to_ptr(&frame->initial_loc); const u8 *opcode = frame->opcodes; + int l; - if (fde_has_augmentation_data) { - int l; + // assume single byte uleb128_t for augmentation data size + if (*opcode & BIT(7)) + return EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE; - // assume single byte uleb128_t - if (WARN_ON(*opcode & BIT(7))) - return -ENOEXEC; - - l = *opcode++; - opcode += l; - size -= l + 1; - } + l = *opcode++; + opcode += l; + size -= l + 1; /* * Starting from 'loc', apply the CFA opcodes that advance the location @@ -201,7 +201,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, break; default: - return -ENOEXEC; + return EDYNSCS_INVALID_CFA_OPCODE; } } return 0; @@ -209,12 +209,11 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, int scs_patch(const u8 eh_frame[], int size) { + int code_alignment_factor = 1; const u8 *p = eh_frame; while (size > 4) { const struct eh_frame *frame = (const void *)p; - bool fde_has_augmentation_data = true; - int code_alignment_factor = 1; int ret; if (frame->size == 0 || @@ -223,28 +222,36 @@ 
int scs_patch(const u8 eh_frame[], int size) break; if (frame->cie_id_or_pointer == 0) { - const u8 *p = frame->augmentation_string; - - /* a 'z' in the augmentation string must come first */ - fde_has_augmentation_data = *p == 'z'; + /* + * Require presence of augmentation data (z) with a + * specifier for the size of the FDE initial_loc and + * range fields (R), and nothing else. + */ + if (strcmp(frame->augmentation_string, "zR")) + return EDYNSCS_INVALID_CIE_HEADER; /* * The code alignment factor is a uleb128 encoded field * but given that the only sensible values are 1 or 4, - * there is no point in decoding the whole thing. + * there is no point in decoding the whole thing. Also + * sanity check the size of the data alignment factor + * field, and the values of the return address register + * and augmentation data size fields. */ - p += strlen(p) + 1; - if (!WARN_ON(*p & BIT(7))) - code_alignment_factor = *p; + if ((frame->code_alignment_factor & BIT(7)) || + (frame->data_alignment_factor & BIT(7)) || + frame->return_address_register != 30 || + frame->augmentation_data_size != 1) + return EDYNSCS_INVALID_CIE_HEADER; + + code_alignment_factor = frame->code_alignment_factor; } else { - ret = scs_handle_fde_frame(frame, - fde_has_augmentation_data, - code_alignment_factor, + ret = scs_handle_fde_frame(frame, code_alignment_factor, true); if (ret) return ret; - scs_handle_fde_frame(frame, fde_has_augmentation_data, - code_alignment_factor, false); + scs_handle_fde_frame(frame, code_alignment_factor, + false); } p += sizeof(frame->size) + frame->size; -- cgit v1.2.3 From 60de7a647fc51b8467f6fb1a8abfe4cab1c95687 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 6 Nov 2024 19:55:16 +0100 Subject: arm64/scs: Deal with 64-bit relative offsets in FDE frames In some cases, the compiler may decide to emit DWARF FDE frames with 64-bit signed fields for the code offset and range fields. This may happen when using the large code model, for instance, which permits an executable to be spread out over more than 4 GiB of address space. Whether this is the case can be inferred from the augmentation data in the CIE frame, so decode this data before processing the FDE frames. 
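Illustratively, a standalone sketch (under the same assumptions as the patch, not kernel code) of resolving an FDE's initial location for the two supported encodings:

#include <stdint.h>
#include <string.h>

#define DW_EH_PE_sdata4 0x0b
#define DW_EH_PE_sdata8 0x0c
#define DW_EH_PE_pcrel  0x10

/*
 * The initial location is a signed, PC-relative offset measured from
 * the address of the field itself; only its width differs between the
 * two encodings the CIE may advertise.
 */
static uint64_t fde_initial_loc(const uint8_t *field, uint8_t pointer_format)
{
	if (pointer_format == (DW_EH_PE_pcrel | DW_EH_PE_sdata8)) {
		int64_t off64;

		memcpy(&off64, field, sizeof(off64));
		return (uint64_t)(uintptr_t)field + (uint64_t)off64;
	} else {	/* DW_EH_PE_pcrel | DW_EH_PE_sdata4 assumed */
		int32_t off32;

		memcpy(&off32, field, sizeof(off32));
		return (uint64_t)(uintptr_t)field + (uint64_t)(int64_t)off32;
	}
}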
Signed-off-by: Ard Biesheuvel Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lore.kernel.org/r/20241106185513.3096442-7-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/patch-scs.c | 34 ++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index cec8f0a52bbc..55d0cd64ef71 100644 --- a/arch/arm64/kernel/pi/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -50,6 +50,10 @@ bool dynamic_scs_is_enabled; #define DW_CFA_GNU_negative_offset_extended 0x2f #define DW_CFA_hi_user 0x3f +#define DW_EH_PE_sdata4 0x0b +#define DW_EH_PE_sdata8 0x0c +#define DW_EH_PE_pcrel 0x10 + enum { PACIASP = 0xd503233f, AUTIASP = 0xd50323bf, @@ -125,6 +129,7 @@ struct eh_frame { u8 data_alignment_factor; u8 return_address_register; u8 augmentation_data_size; + u8 fde_pointer_format; }; struct { // FDE @@ -132,11 +137,18 @@ struct eh_frame { s32 range; u8 opcodes[]; }; + + struct { // FDE + s64 initial_loc64; + s64 range64; + u8 opcodes64[]; + }; }; }; static int scs_handle_fde_frame(const struct eh_frame *frame, int code_alignment_factor, + bool use_sdata8, bool dry_run) { int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; @@ -144,6 +156,12 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, const u8 *opcode = frame->opcodes; int l; + if (use_sdata8) { + loc = (u64)&frame->initial_loc64 + frame->initial_loc64; + opcode = frame->opcodes64; + size -= 8; + } + // assume single byte uleb128_t for augmentation data size if (*opcode & BIT(7)) return EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE; @@ -210,6 +228,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, int scs_patch(const u8 eh_frame[], int size) { int code_alignment_factor = 1; + bool fde_use_sdata8 = false; const u8 *p = eh_frame; while (size > 4) { @@ -245,13 +264,24 @@ int scs_patch(const u8 eh_frame[], int size) return EDYNSCS_INVALID_CIE_HEADER; code_alignment_factor = frame->code_alignment_factor; + + switch (frame->fde_pointer_format) { + case DW_EH_PE_pcrel | DW_EH_PE_sdata4: + fde_use_sdata8 = false; + break; + case DW_EH_PE_pcrel | DW_EH_PE_sdata8: + fde_use_sdata8 = true; + break; + default: + return EDYNSCS_INVALID_CIE_SDATA_SIZE; + } } else { ret = scs_handle_fde_frame(frame, code_alignment_factor, - true); + fde_use_sdata8, true); if (ret) return ret; scs_handle_fde_frame(frame, code_alignment_factor, - false); + fde_use_sdata8, false); } p += sizeof(frame->size) + frame->size; -- cgit v1.2.3
From de7fb8d3a2c913e36d78d762e9fbdd754d8dc749 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Nov 2024 13:22:49 +0530 Subject: arm64/mm: Change protval as 'pteval_t' in map_range() pgprot_t is defined as a structure encapsulating a single pteval_t element. Hence it is prudent to use pteval_t as the type instead of the size-based u64. Besides, the pteval_t type might have a different size later on with FEAT_D128.
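For context, the relevant type definitions look roughly like the following sketch (paraphrasing arch/arm64/include/asm/pgtable-types.h; the u64 stand-in typedef is for illustration only):

#include <stdint.h>

typedef uint64_t u64;	/* stand-in for the kernel's u64 */

typedef u64 pteval_t;	/* may grow wider with FEAT_D128 */

typedef struct { pteval_t pgprot; } pgprot_t;
#define pgprot_val(x)	((x).pgprot)

/*
 * With this, declaring
 *     pteval_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK;
 * keeps protval's type in lockstep with pgprot_t's element instead of
 * hard-coding u64.
 */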
Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20241111075249.609493-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/map_range.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 5410b2cac590..2b69e3beeef8 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -30,7 +30,7 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, u64 va_offset) { u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; - u64 protval = pgprot_val(prot) & ~PTE_TYPE_MASK; + pteval_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; int lshift = (3 - level) * (PAGE_SHIFT - 3); u64 lmask = (PAGE_SIZE << lshift) - 1; -- cgit v1.2.3
From c0139f6cbb1fc6438509467ec0455a200cc49a43 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Nov 2024 17:41:32 +0000 Subject: arm64/ptrace: Clarify documentation of VL configuration via ptrace When we configure SVE, SSVE or ZA via ptrace we allow the user to configure the vector length and specify any of the flags that are accepted when configuring via prctl(). This includes the S[VM]E_SET_VL_ONEXEC flag which defers the configuration of the VL until an exec(). We don't do anything to limit the provision of register data as part of configuring the _ONEXEC VL, but as a consequence of the VL enumeration support we do have, any such data will be interpreted using the vector length currently configured for the process. This is all a bit surprising, and probably we should just not have allowed register data to be specified with _ONEXEC, but it's our ABI so let's add some explicit documentation in both the ABI documents and the source calling out what happens. The comments also omit the fact that, since SME does not have a mandatory 128-bit VL, it is possible for VL enumeration to result in the configuration of a higher VL than was requested; cover that too. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241106-arm64-sve-ptrace-vl-set-v1-1-3b164e8b559c@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/sme.rst | 4 ++++ Documentation/arch/arm64/sve.rst | 4 ++++ arch/arm64/kernel/ptrace.c | 12 ++++++++++-- 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/arm64/kernel') diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst index be317d457417..b2fa01f85cb5 100644 --- a/Documentation/arch/arm64/sme.rst +++ b/Documentation/arch/arm64/sme.rst @@ -346,6 +346,10 @@ The regset data starts with struct user_za_header, containing: * Writes to NT_ARM_ZT will set PSTATE.ZA to 1. +* If any register data is provided along with SME_PT_VL_ONEXEC then the + registers data will be interpreted with the current vector length, not + the vector length configured for use on exec. + 8.
ELF coredump extensions --------------------------- diff --git a/Documentation/arch/arm64/sve.rst b/Documentation/arch/arm64/sve.rst index 8d8837fc39ec..28152492c29c 100644 --- a/Documentation/arch/arm64/sve.rst +++ b/Documentation/arch/arm64/sve.rst @@ -402,6 +402,10 @@ The regset data starts with struct user_sve_header, containing: streaming mode and any SETREGSET of NT_ARM_SSVE will enter streaming mode if the target was not in streaming mode. +* If any register data is provided along with SVE_PT_VL_ONEXEC then the + registers data will be interpreted with the current vector length, not + the vector length configured for use on exec. + * The effect of writing a partial, incomplete payload is unspecified. diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index b756578aeaee..f09ffd70c916 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -898,7 +898,11 @@ static int sve_set_common(struct task_struct *target, if (ret) goto out; - /* Actual VL set may be less than the user asked for: */ + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC VL not the + * current VL: + */ vq = sve_vq_from_vl(task_get_vl(target, type)); /* Enter/exit streaming mode */ @@ -1125,7 +1129,11 @@ static int za_set(struct task_struct *target, if (ret) goto out; - /* Actual VL set may be less than the user asked for: */ + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC rather than + * current VL: + */ vq = sve_vq_from_vl(task_get_sme_vl(target)); /* Ensure there is some SVE storage for streaming mode */ -- cgit v1.2.3 From 67ab51cbdfee02ef07fb9d7d14cc0bf6cb5a5e5c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 14 Nov 2024 09:53:32 +0000 Subject: arm64: tls: Fix context-switching of tpidrro_el0 when kpti is enabled Commit 18011eac28c7 ("arm64: tls: Avoid unconditional zeroing of tpidrro_el0 for native tasks") tried to optimise the context switching of tpidrro_el0 by eliding the clearing of the register when switching to a native task with kpti enabled, on the erroneous assumption that the kpti trampoline entry code would already have taken care of the write. Although the kpti trampoline does zero the register on entry from a native task, the check in tls_thread_switch() is on the *next* task and so we can end up leaving a stale, non-zero value in the register if the previous task was 32-bit. Drop the broken optimisation and zero tpidrro_el0 unconditionally when switching to a native 64-bit task. Cc: Mark Rutland Cc: stable@vger.kernel.org Fixes: 18011eac28c7 ("arm64: tls: Avoid unconditional zeroing of tpidrro_el0 for native tasks") Signed-off-by: Will Deacon Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20241114095332.23391-1-will@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/arm64/kernel') diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 0540653fbf38..3d78798c4a0a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -439,7 +439,7 @@ static void tls_thread_switch(struct task_struct *next) if (is_compat_thread(task_thread_info(next))) write_sysreg(next->thread.uw.tp_value, tpidrro_el0); - else if (!arm64_kernel_unmapped_at_el0()) + else write_sysreg(0, tpidrro_el0); write_sysreg(*task_user_tls(next), tpidr_el0); -- cgit v1.2.3