From 2f0584f3f4bd60bcc8735172981fb0bff86e74e0 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:27 -0700 Subject: mm: Rename arch pte_mkwrite()'s to pte_mkwrite_novma() The x86 Shadow stack feature includes a new type of memory called shadow stack. This shadow stack memory has some unusual properties, which requires some core mm changes to function properly. One of these unusual properties is that shadow stack memory is writable, but only in limited ways. These limits are applied via a specific PTE bit combination. Nevertheless, the memory is writable, and core mm code will need to apply the writable permissions in the typical paths that call pte_mkwrite(). The goal is to make pte_mkwrite() take a VMA, so that the x86 implementation of it can know whether to create regular writable or shadow stack mappings. But there are a couple of challenges to this. Modifying the signatures of each arch pte_mkwrite() implementation would be error prone because some are generated with macros and would need to be re-implemented. Also, some pte_mkwrite() callers operate on kernel memory without a VMA. So this can be done in a three step process. First pte_mkwrite() can be renamed to pte_mkwrite_novma() in each arch, with a generic pte_mkwrite() added that just calls pte_mkwrite_novma(). Next callers without a VMA can be moved to pte_mkwrite_novma(). And lastly, pte_mkwrite() and all callers can be changed to take/pass a VMA. Start the process by renaming pte_mkwrite() to pte_mkwrite_novma() and adding the pte_mkwrite() wrapper in linux/pgtable.h. Apply the same pattern for pmd_mkwrite(). Since not all archs have a pmd_mkwrite_novma(), create a new arch config HAS_HUGE_PAGE that can be used to tell if pmd_mkwrite() should be defined. Otherwise in the !HAS_HUGE_PAGE cases the compiler would not be able to find pmd_mkwrite_novma(). No functional change. Suggested-by: Linus Torvalds Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Mike Rapoport (IBM) Acked-by: Geert Uytterhoeven Acked-by: David Hildenbrand Link: https://lore.kernel.org/lkml/CAHk-=wiZjSu7c9sFYZb3q04108stgHff2wfbokGCCgW7riz+8Q@mail.gmail.com/ Link: https://lore.kernel.org/all/20230613001108.3040476-2-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5700bb337987..0aa295cc6277 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -353,7 +353,7 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte_set_flags(pte, _PAGE_ACCESSED); } -static inline pte_t pte_mkwrite(pte_t pte) +static inline pte_t pte_mkwrite_novma(pte_t pte) { return pte_set_flags(pte, _PAGE_RW); } @@ -454,7 +454,7 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_ACCESSED); } -static inline pmd_t pmd_mkwrite(pmd_t pmd) +static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_RW); } -- cgit v1.2.3 From 6ecc21bb432dab7241bcbd766ecd1b15620c75c3 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:28 -0700 Subject: mm: Move pte/pmd_mkwrite() callers with no VMA to _novma() The x86 Shadow stack feature includes a new type of memory called shadow stack. This shadow stack memory has some unusual properties, which requires some core mm changes to function properly. One of these unusual properties is that shadow stack memory is writable, but only in limited ways. These limits are applied via a specific PTE bit combination. Nevertheless, the memory is writable, and core mm code will need to apply the writable permissions in the typical paths that call pte_mkwrite(). Future patches will make pte_mkwrite() take a VMA, so that the x86 implementation of it can know whether to create regular writable or shadow stack mappings. But there are a couple of challenges to this. Modifying the signatures of each arch pte_mkwrite() implementation would be error prone because some are generated with macros and would need to be re-implemented. Also, some pte_mkwrite() callers operate on kernel memory without a VMA. So this can be done in a three step process. First pte_mkwrite() can be renamed to pte_mkwrite_novma() in each arch, with a generic pte_mkwrite() added that just calls pte_mkwrite_novma(). Next callers without a VMA can be moved to pte_mkwrite_novma(). And lastly, pte_mkwrite() and all callers can be changed to take/pass a VMA. Earlier work did the first step, so next move the callers that don't have a VMA to pte_mkwrite_novma(). Also do the same for pmd_mkwrite(). This will be ok for the shadow stack feature, as these callers are on kernel memory which will not need to be made shadow stack, and the other architectures only currently support one type of memory in pte_mkwrite() Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Mike Rapoport (IBM) Acked-by: David Hildenbrand Link: https://lore.kernel.org/all/20230613001108.3040476-3-rick.p.edgecombe%40intel.com --- arch/arm64/mm/trans_pgd.c | 4 ++-- arch/s390/mm/pageattr.c | 4 ++-- arch/x86/xen/mmu_pv.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 4ea2eefbc053..a01493f3a06f 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -40,7 +40,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) * read only (code, rodata). Clear the RDONLY bit from * the temporary mappings we use during restore. */ - set_pte(dst_ptep, pte_mkwrite(pte)); + set_pte(dst_ptep, pte_mkwrite_novma(pte)); } else if (debug_pagealloc_enabled() && !pte_none(pte)) { /* * debug_pagealloc will removed the PTE_VALID bit if @@ -53,7 +53,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) */ BUG_ON(!pfn_valid(pte_pfn(pte))); - set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte))); + set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); } } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index ca5a418c58a8..e5ec76271b16 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -98,7 +98,7 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, if (flags & SET_MEMORY_RO) new = pte_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pte_mkwrite(pte_mkdirty(new)); + new = pte_mkwrite_novma(pte_mkdirty(new)); if (flags & SET_MEMORY_NX) new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); else if (flags & SET_MEMORY_X) @@ -156,7 +156,7 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, if (flags & SET_MEMORY_RO) new = pmd_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pmd_mkwrite(pmd_mkdirty(new)); + new = pmd_mkwrite_novma(pmd_mkdirty(new)); if (flags & SET_MEMORY_NX) new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index e0a975165de7..ccf3d308c874 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -166,7 +166,7 @@ void make_lowmem_page_readwrite(void *vaddr) if (pte == NULL) return; /* vaddr missing */ - ptev = pte_mkwrite(*pte); + ptev = pte_mkwrite_novma(*pte); if (HYPERVISOR_update_va_mapping(address, ptev, 0)) BUG(); -- cgit v1.2.3 From 18e66b695e787374ca762ecdeaa1ab5e3772af94 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:32 -0700 Subject: x86/shstk: Add Kconfig option for shadow stack Shadow stack provides protection for applications against function return address corruption. It is active when the processor supports it, the kernel has CONFIG_X86_SHADOW_STACK enabled, and the application is built for the feature. This is only implemented for the 64-bit kernel. When it is enabled, legacy non-shadow stack applications continue to work, but without protection. Since there is another feature that utilizes CET (Kernel IBT) that will share implementation with shadow stacks, create CONFIG_CET to signify that at least one CET feature is configured. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-7-rick.p.edgecombe%40intel.com --- arch/x86/Kconfig | 24 ++++++++++++++++++++++++ arch/x86/Kconfig.assembler | 5 +++++ 2 files changed, 29 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7422db409770..e860f805199f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1849,6 +1849,11 @@ config CC_HAS_IBT (CC_IS_CLANG && CLANG_VERSION >= 140000)) && \ $(as-instr,endbr64) +config X86_CET + def_bool n + help + CET features configured (Shadow stack or IBT) + config X86_KERNEL_IBT prompt "Indirect Branch Tracking" def_bool y @@ -1856,6 +1861,7 @@ config X86_KERNEL_IBT # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f depends on !LD_IS_LLD || LLD_VERSION >= 140000 select OBJTOOL + select X86_CET help Build the kernel with support for Indirect Branch Tracking, a hardware support course-grain forward-edge Control Flow Integrity @@ -1949,6 +1955,24 @@ config X86_SGX If unsure, say N. +config X86_USER_SHADOW_STACK + bool "X86 userspace shadow stack" + depends on AS_WRUSS + depends on X86_64 + select ARCH_USES_HIGH_VMA_FLAGS + select X86_CET + help + Shadow stack protection is a hardware feature that detects function + return address corruption. This helps mitigate ROP attacks. + Applications must be enabled to use it, and old userspace does not + get protection "for free". + + CPUs supporting shadow stacks were first released in 2020. + + See Documentation/x86/shstk.rst for more information. + + If unsure, say N. + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler index b88f784cb02e..8ad41da301e5 100644 --- a/arch/x86/Kconfig.assembler +++ b/arch/x86/Kconfig.assembler @@ -24,3 +24,8 @@ config AS_GFNI def_bool $(as-instr,vgf2p8mulb %xmm0$(comma)%xmm1$(comma)%xmm2) help Supported by binutils >= 2.30 and LLVM integrated assembler + +config AS_WRUSS + def_bool $(as-instr,wrussq %rax$(comma)(%rbx)) + help + Supported by binutils >= 2.31 and LLVM integrated assembler -- cgit v1.2.3 From 2da5b91fe4092aab9d92138c78b68e9856b078d0 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:33 -0700 Subject: x86/traps: Move control protection handler to separate file Today the control protection handler is defined in traps.c and used only for the kernel IBT feature. To reduce ifdeffery, move it to it's own file. In future patches, functionality will be added to make this handler also handle user shadow stack faults. So name the file cet.c. No functional change. Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-8-rick.p.edgecombe%40intel.com --- arch/x86/kernel/Makefile | 2 ++ arch/x86/kernel/cet.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 75 ----------------------------------------------- 3 files changed, 78 insertions(+), 75 deletions(-) create mode 100644 arch/x86/kernel/cet.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4070a01c11b7..abee0564b750 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -145,6 +145,8 @@ obj-$(CONFIG_CFI_CLANG) += cfi.o obj-$(CONFIG_CALL_THUNKS) += callthunks.o +obj-$(CONFIG_X86_CET) += cet.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c new file mode 100644 index 000000000000..7ad22b705b64 --- /dev/null +++ b/arch/x86/kernel/cet.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +static __ro_after_init bool ibt_fatal = true; + +extern void ibt_selftest_ip(void); /* code label defined in asm below */ + +enum cp_error_code { + CP_EC = (1 << 15) - 1, + + CP_RET = 1, + CP_IRET = 2, + CP_ENDBR = 3, + CP_RSTRORSSP = 4, + CP_SETSSBSY = 5, + + CP_ENCL = 1 << 15, +}; + +DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) +{ + if (!cpu_feature_enabled(X86_FEATURE_IBT)) { + pr_err("Unexpected #CP\n"); + BUG(); + } + + if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR)) + return; + + if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) { + regs->ax = 0; + return; + } + + pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); + if (!ibt_fatal) { + printk(KERN_DEFAULT CUT_HERE); + __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); + return; + } + BUG(); +} + +/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */ +noinline bool ibt_selftest(void) +{ + unsigned long ret; + + asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t" + ANNOTATE_RETPOLINE_SAFE + " jmp *%%rax\n\t" + "ibt_selftest_ip:\n\t" + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + " nop\n\t" + + : "=a" (ret) : : "memory"); + + return !ret; +} + +static int __init ibt_setup(char *str) +{ + if (!strcmp(str, "off")) + setup_clear_cpu_cap(X86_FEATURE_IBT); + + if (!strcmp(str, "warn")) + ibt_fatal = false; + + return 1; +} + +__setup("ibt=", ibt_setup); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 58b1f208eff5..6f666dfa97de 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -213,81 +213,6 @@ DEFINE_IDTENTRY(exc_overflow) do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } -#ifdef CONFIG_X86_KERNEL_IBT - -static __ro_after_init bool ibt_fatal = true; - -extern void ibt_selftest_ip(void); /* code label defined in asm below */ - -enum cp_error_code { - CP_EC = (1 << 15) - 1, - - CP_RET = 1, - CP_IRET = 2, - CP_ENDBR = 3, - CP_RSTRORSSP = 4, - CP_SETSSBSY = 5, - - CP_ENCL = 1 << 15, -}; - -DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) -{ - if (!cpu_feature_enabled(X86_FEATURE_IBT)) { - pr_err("Unexpected #CP\n"); - BUG(); - } - - if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR)) - return; - - if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) { - regs->ax = 0; - return; - } - - pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); - if (!ibt_fatal) { - printk(KERN_DEFAULT CUT_HERE); - __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); - return; - } - BUG(); -} - -/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */ -noinline bool ibt_selftest(void) -{ - unsigned long ret; - - asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t" - ANNOTATE_RETPOLINE_SAFE - " jmp *%%rax\n\t" - "ibt_selftest_ip:\n\t" - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - " nop\n\t" - - : "=a" (ret) : : "memory"); - - return !ret; -} - -static int __init ibt_setup(char *str) -{ - if (!strcmp(str, "off")) - setup_clear_cpu_cap(X86_FEATURE_IBT); - - if (!strcmp(str, "warn")) - ibt_fatal = false; - - return 1; -} - -__setup("ibt=", ibt_setup); - -#endif /* CONFIG_X86_KERNEL_IBT */ - #ifdef CONFIG_X86_F00F_BUG void handle_invalid_op(struct pt_regs *regs) #else -- cgit v1.2.3 From 701fb66d576ec4d06903e7dd55678a3005d86e5f Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:34 -0700 Subject: x86/cpufeatures: Add CPU feature flags for shadow stacks The Control-Flow Enforcement Technology contains two related features, one of which is Shadow Stacks. Future patches will utilize this feature for shadow stack support in KVM, so add a CPU feature flags for Shadow Stacks (CPUID.(EAX=7,ECX=0):ECX[bit 7]). To protect shadow stack state from malicious modification, the registers are only accessible in supervisor mode. This implementation context-switches the registers with XSAVES. Make X86_FEATURE_SHSTK depend on XSAVES. The shadow stack feature, enumerated by the CPUID bit described above, encompasses both supervisor and userspace support for shadow stack. In near future patches, only userspace shadow stack will be enabled. In expectation of future supervisor shadow stack support, create a software CPU capability to enumerate kernel utilization of userspace shadow stack support. This user shadow stack bit should depend on the HW "shstk" capability and that logic will be implemented in future patches. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-9-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/include/asm/disabled-features.h | 8 +++++++- arch/x86/kernel/cpu/cpuid-deps.c | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index cb8ca46213be..d7215c8b7923 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -308,6 +308,7 @@ #define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ #define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */ #define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */ +#define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -380,6 +381,7 @@ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ #define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */ #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_SHSTK (16*32+ 7) /* "" Shadow stack */ #define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ #define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ #define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index fafe9be7a6f4..b9c7eae2e70f 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -105,6 +105,12 @@ # define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) #endif +#ifdef CONFIG_X86_USER_SHADOW_STACK +#define DISABLE_USER_SHSTK 0 +#else +#define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -120,7 +126,7 @@ #define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ - DISABLE_CALL_DEPTH_TRACKING) + DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) #define DISABLED_MASK12 (DISABLE_LAM) #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index f6748c8bd647..e462c1d3800a 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -81,6 +81,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, + { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, {} }; -- cgit v1.2.3 From a956efc09b106173c6c85da7e3bb6e0f452a5681 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:35 -0700 Subject: x86/mm: Move pmd_write(), pud_write() up in the file To prepare the introduction of _PAGE_SAVED_DIRTY, move pmd_write() and pud_write() up in the file, so that they can be used by other helpers below. No functional changes. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Reviewed-by: Kirill A. Shutemov Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-10-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/pgtable.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0aa295cc6277..5f6e4ac1aad8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -161,6 +161,18 @@ static inline int pte_write(pte_t pte) return pte_flags(pte) & _PAGE_RW; } +#define pmd_write pmd_write +static inline int pmd_write(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_RW; +} + +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return pud_flags(pud) & _PAGE_RW; +} + static inline int pte_huge(pte_t pte) { return pte_flags(pte) & _PAGE_PSE; @@ -1121,12 +1133,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define pmd_write pmd_write -static inline int pmd_write(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_RW; -} - #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) @@ -1156,12 +1162,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); } -#define pud_write pud_write -static inline int pud_write(pud_t pud) -{ - return pud_flags(pud) & _PAGE_RW; -} - #ifndef pmdp_establish #define pmdp_establish pmdp_establish static inline pmd_t pmdp_establish(struct vm_area_struct *vma, -- cgit v1.2.3 From fca4d413c5f707b759d1031b170cbb8884f0999d Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:36 -0700 Subject: x86/mm: Introduce _PAGE_SAVED_DIRTY Some OSes have a greater dependence on software available bits in PTEs than Linux. That left the hardware architects looking for a way to represent a new memory type (shadow stack) within the existing bits. They chose to repurpose a lightly-used state: Write=0,Dirty=1. So in order to support shadow stack memory, Linux should avoid creating memory with this PTE bit combination unless it intends for it to be shadow stack. The reason it's lightly used is that Dirty=1 is normally set by HW _before_ a write. A write with a Write=0 PTE would typically only generate a fault, not set Dirty=1. Hardware can (rarely) both set Dirty=1 *and* generate the fault, resulting in a Write=0,Dirty=1 PTE. Hardware which supports shadow stacks will no longer exhibit this oddity. So that leaves Write=0,Dirty=1 PTEs created in software. To avoid inadvertently created shadow stack memory, in places where Linux normally creates Write=0,Dirty=1, it can use the software-defined _PAGE_SAVED_DIRTY in place of the hardware _PAGE_DIRTY. In other words, whenever Linux needs to create Write=0,Dirty=1, it instead creates Write=0,SavedDirty=1 except for shadow stack, which is Write=0,Dirty=1. There are six bits left available to software in the 64-bit PTE after consuming a bit for _PAGE_SAVED_DIRTY. For 32 bit, the same bit as _PAGE_BIT_UFFD_WP is used, since user fault fd is not supported on 32 bit. This leaves one unused software bit on 32 bit (_PAGE_BIT_SOFT_DIRTY, as this is also not supported on 32 bit). Implement only the infrastructure for _PAGE_SAVED_DIRTY. Changes to actually begin creating _PAGE_SAVED_DIRTY PTEs will follow once other pieces are in place. Since this SavedDirty shifting is done for all x86 CPUs, this leaves the possibility for the hardware oddity to still create Write=0,Dirty=1 PTEs in rare cases. Since these CPUs also don't support shadow stack, this will be harmless as it was before the introduction of SavedDirty. Implement the shifting logic to be branchless. Embed the logic of whether to do the shifting (including checking the Write bits) so that it can be called by future callers that would otherwise need additional branching logic. This efficiency allows the logic of when to do the shifting to be centralized, making the code easier to reason about. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-11-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/pgtable.h | 83 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/pgtable_types.h | 34 +++++++++++++-- arch/x86/include/asm/tlbflush.h | 3 +- 3 files changed, 115 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5f6e4ac1aad8..0c0747cbad3e 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -302,6 +302,53 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) return native_make_pte(v & ~clear); } +/* + * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the + * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So + * when creating dirty, write-protected memory, a software bit is used: + * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the + * Dirty bit to SavedDirty, and vice-vesra. + * + * This shifting is only done if needed. In the case of shifting + * Dirty->SavedDirty, the condition is if the PTE is Write=0. In the case of + * shifting SavedDirty->Dirty, the condition is Write=1. + */ +static inline pgprotval_t mksaveddirty_shift(pgprotval_t v) +{ + pgprotval_t cond = (~v >> _PAGE_BIT_RW) & 1; + + v |= ((v >> _PAGE_BIT_DIRTY) & cond) << _PAGE_BIT_SAVED_DIRTY; + v &= ~(cond << _PAGE_BIT_DIRTY); + + return v; +} + +static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v) +{ + pgprotval_t cond = (v >> _PAGE_BIT_RW) & 1; + + v |= ((v >> _PAGE_BIT_SAVED_DIRTY) & cond) << _PAGE_BIT_DIRTY; + v &= ~(cond << _PAGE_BIT_SAVED_DIRTY); + + return v; +} + +static inline pte_t pte_mksaveddirty(pte_t pte) +{ + pteval_t v = native_pte_val(pte); + + v = mksaveddirty_shift(v); + return native_make_pte(v); +} + +static inline pte_t pte_clear_saveddirty(pte_t pte) +{ + pteval_t v = native_pte_val(pte); + + v = clear_saveddirty_shift(v); + return native_make_pte(v); +} + static inline pte_t pte_wrprotect(pte_t pte) { return pte_clear_flags(pte, _PAGE_RW); @@ -414,6 +461,24 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) return native_make_pmd(v & ~clear); } +/* See comments above mksaveddirty_shift() */ +static inline pmd_t pmd_mksaveddirty(pmd_t pmd) +{ + pmdval_t v = native_pmd_val(pmd); + + v = mksaveddirty_shift(v); + return native_make_pmd(v); +} + +/* See comments above mksaveddirty_shift() */ +static inline pmd_t pmd_clear_saveddirty(pmd_t pmd) +{ + pmdval_t v = native_pmd_val(pmd); + + v = clear_saveddirty_shift(v); + return native_make_pmd(v); +} + static inline pmd_t pmd_wrprotect(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_RW); @@ -485,6 +550,24 @@ static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) return native_make_pud(v & ~clear); } +/* See comments above mksaveddirty_shift() */ +static inline pud_t pud_mksaveddirty(pud_t pud) +{ + pudval_t v = native_pud_val(pud); + + v = mksaveddirty_shift(v); + return native_make_pud(v); +} + +/* See comments above mksaveddirty_shift() */ +static inline pud_t pud_clear_saveddirty(pud_t pud) +{ + pudval_t v = native_pud_val(pud); + + v = clear_saveddirty_shift(v); + return native_make_pud(v); +} + static inline pud_t pud_mkold(pud_t pud) { return pud_clear_flags(pud, _PAGE_ACCESSED); diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index ba3e2554799a..93796471a8fa 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -21,7 +21,8 @@ #define _PAGE_BIT_SOFTW2 10 /* " */ #define _PAGE_BIT_SOFTW3 11 /* " */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ -#define _PAGE_BIT_SOFTW4 58 /* available for programmer */ +#define _PAGE_BIT_SOFTW4 57 /* available for programmer */ +#define _PAGE_BIT_SOFTW5 58 /* available for programmer */ #define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */ #define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */ #define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */ @@ -34,6 +35,13 @@ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 +#ifdef CONFIG_X86_64 +#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit */ +#else +/* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */ +#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit */ +#endif + /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL @@ -117,6 +125,18 @@ #define _PAGE_SOFTW4 (_AT(pteval_t, 0)) #endif +/* + * The hardware requires shadow stack to be Write=0,Dirty=1. However, + * there are valid cases where the kernel might create read-only PTEs that + * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In + * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit, + * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have + * (Write=0,SavedDirty=1,Dirty=0) set. + */ +#define _PAGE_SAVED_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY) + +#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY) + #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) /* @@ -125,9 +145,9 @@ * instance, and is *not* included in this mask since * pte_modify() does modify it. */ -#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ - _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \ +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY_BITS | \ + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \ _PAGE_UFFD_WP) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) @@ -188,10 +208,16 @@ enum page_cache_mode { #define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) + +/* + * Page tables needs to have Write=1 in order for any lower PTEs to be + * writable. This includes shadow stack memory (Write=0, Dirty=1) + */ #define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0) #define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC) #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) + #define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 80450e1d5385..2deae4dd71da 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -286,7 +286,8 @@ static inline bool pte_flags_need_flush(unsigned long oldflags, const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED; const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 | - _PAGE_SOFTW3 | _PAGE_SOFTW4; + _PAGE_SOFTW3 | _PAGE_SOFTW4 | + _PAGE_SAVED_DIRTY; const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT | _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 | -- cgit v1.2.3 From 1f6f66f62e8cba909abc4fb59de3b57d8c5a9783 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:37 -0700 Subject: x86/mm: Update ptep/pmdp_set_wrprotect() for _PAGE_SAVED_DIRTY When shadow stack is in use, Write=0,Dirty=1 PTE are preserved for shadow stack. Copy-on-write PTEs then have Write=0,SavedDirty=1. When a PTE goes from Write=1,Dirty=1 to Write=0,SavedDirty=1, it could become a transient shadow stack PTE in two cases: 1. Some processors can start a write but end up seeing a Write=0 PTE by the time they get to the Dirty bit, creating a transient shadow stack PTE. However, this will not occur on processors supporting shadow stack, and a TLB flush is not necessary. 2. When _PAGE_DIRTY is replaced with _PAGE_SAVED_DIRTY non-atomically, a transient shadow stack PTE can be created as a result. Prevent the second case when doing a write protection and Dirty->SavedDirty shift at the same time with a CMPXCHG loop. The first case Note, in the PAE case CMPXCHG will need to operate on 8 byte, but try_cmpxchg() will not use CMPXCHG8B, so it cannot operate on a full PAE PTE. However the exiting logic is not operating on a full 8 byte region either, and relies on the fact that the Write bit is in the first 4 bytes when doing the clear_bit(). Since both the Dirty, SavedDirty and Write bits are in the first 4 bytes, casting to a long will be similar to the existing behavior which also casts to a long. Dave Hansen, Jann Horn, Andy Lutomirski, and Peter Zijlstra provided many insights to the issue. Jann Horn provided the CMPXCHG solution. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-12-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/pgtable.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0c0747cbad3e..b48d07a1c7d3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1190,7 +1190,17 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); + /* + * Avoid accidentally creating shadow stack PTEs + * (Write=0,Dirty=1). Use cmpxchg() to prevent races with + * the hardware setting Dirty=1. + */ + pte_t old_pte, new_pte; + + old_pte = READ_ONCE(*ptep); + do { + new_pte = pte_wrprotect(old_pte); + } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte)); } #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) @@ -1242,7 +1252,17 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); + /* + * Avoid accidentally creating shadow stack PTEs + * (Write=0,Dirty=1). Use cmpxchg() to prevent races with + * the hardware setting Dirty=1. + */ + pmd_t old_pmd, new_pmd; + + old_pmd = READ_ONCE(*pmdp); + do { + new_pmd = pmd_wrprotect(old_pmd); + } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd)); } #ifndef pmdp_establish -- cgit v1.2.3 From bb3aadf7d446aaf22c725b274e2c194ac5cb2111 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:38 -0700 Subject: x86/mm: Start actually marking _PAGE_SAVED_DIRTY The recently introduced _PAGE_SAVED_DIRTY should be used instead of the HW Dirty bit whenever a PTE is Write=0, in order to not inadvertently create shadow stack PTEs. Update pte_mk*() helpers to do this, and apply the same changes to pmd and pud. Since there is no x86 version of pte_mkwrite() to hold this arch specific logic, create one. Add it to x86/mm/pgtable.c instead of x86/asm/include/pgtable.h as future patches will require it to live in pgtable.c and it will make the diff easier for reviewers. Since CPUs without shadow stack support could create Write=0,Dirty=1 PTEs, only return true for pte_shstk() if the CPU also supports shadow stack. This will prevent these HW creates PTEs as showing as true for pte_write(). For pte_modify() this is a bit trickier. It takes a "raw" pgprot_t which was not necessarily created with any of the existing PTE bit helpers. That means that it can return a pte_t with Write=0,Dirty=1, a shadow stack PTE, when it did not intend to create one. Modify it to also move _PAGE_DIRTY to _PAGE_SAVED_DIRTY. To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid: 1. Marking Write=0 PTEs Dirty=1 2. Marking Dirty=1 PTEs Write=0 The first case cannot happen as the existing behavior of pte_modify() is to filter out any Dirty bit passed in newprot. Handle the second case by shifting _PAGE_DIRTY=1 to _PAGE_SAVED_DIRTY=1 if the PTE was write protected by the pte_modify() call. Apply the same changes to pmd_modify(). Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-13-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/pgtable.h | 151 +++++++++++++++++++++++++++++++++++------ arch/x86/mm/pgtable.c | 14 ++++ 2 files changed, 144 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b48d07a1c7d3..7bab1b22a354 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -125,9 +125,15 @@ extern pmdval_t early_pmd_flags; * The following only work if pte_present() is true. * Undefined behaviour if not.. */ -static inline int pte_dirty(pte_t pte) +static inline bool pte_dirty(pte_t pte) { - return pte_flags(pte) & _PAGE_DIRTY; + return pte_flags(pte) & _PAGE_DIRTY_BITS; +} + +static inline bool pte_shstk(pte_t pte) +{ + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY; } static inline int pte_young(pte_t pte) @@ -135,9 +141,16 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } -static inline int pmd_dirty(pmd_t pmd) +static inline bool pmd_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_DIRTY_BITS; +} + +static inline bool pmd_shstk(pmd_t pmd) { - return pmd_flags(pmd) & _PAGE_DIRTY; + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == + (_PAGE_DIRTY | _PAGE_PSE); } #define pmd_young pmd_young @@ -146,9 +159,9 @@ static inline int pmd_young(pmd_t pmd) return pmd_flags(pmd) & _PAGE_ACCESSED; } -static inline int pud_dirty(pud_t pud) +static inline bool pud_dirty(pud_t pud) { - return pud_flags(pud) & _PAGE_DIRTY; + return pud_flags(pud) & _PAGE_DIRTY_BITS; } static inline int pud_young(pud_t pud) @@ -158,13 +171,21 @@ static inline int pud_young(pud_t pud) static inline int pte_write(pte_t pte) { - return pte_flags(pte) & _PAGE_RW; + /* + * Shadow stack pages are logically writable, but do not have + * _PAGE_RW. Check for them separately from _PAGE_RW itself. + */ + return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte); } #define pmd_write pmd_write static inline int pmd_write(pmd_t pmd) { - return pmd_flags(pmd) & _PAGE_RW; + /* + * Shadow stack pages are logically writable, but do not have + * _PAGE_RW. Check for them separately from _PAGE_RW itself. + */ + return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd); } #define pud_write pud_write @@ -351,7 +372,14 @@ static inline pte_t pte_clear_saveddirty(pte_t pte) static inline pte_t pte_wrprotect(pte_t pte) { - return pte_clear_flags(pte, _PAGE_RW); + pte = pte_clear_flags(pte, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PTE (Write=0,Dirty=1). Move the hardware + * dirty value to the software bit, if present. + */ + return pte_mksaveddirty(pte); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP @@ -389,7 +417,7 @@ static inline pte_t pte_clear_uffd_wp(pte_t pte) static inline pte_t pte_mkclean(pte_t pte) { - return pte_clear_flags(pte, _PAGE_DIRTY); + return pte_clear_flags(pte, _PAGE_DIRTY_BITS); } static inline pte_t pte_mkold(pte_t pte) @@ -404,7 +432,16 @@ static inline pte_t pte_mkexec(pte_t pte) static inline pte_t pte_mkdirty(pte_t pte) { - return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pte = pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pte_mksaveddirty(pte); +} + +static inline pte_t pte_mkwrite_shstk(pte_t pte) +{ + pte = pte_clear_flags(pte, _PAGE_RW); + + return pte_set_flags(pte, _PAGE_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) @@ -417,6 +454,10 @@ static inline pte_t pte_mkwrite_novma(pte_t pte) return pte_set_flags(pte, _PAGE_RW); } +struct vm_area_struct; +pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma); +#define pte_mkwrite pte_mkwrite + static inline pte_t pte_mkhuge(pte_t pte) { return pte_set_flags(pte, _PAGE_PSE); @@ -481,7 +522,14 @@ static inline pmd_t pmd_clear_saveddirty(pmd_t pmd) static inline pmd_t pmd_wrprotect(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_RW); + pmd = pmd_clear_flags(pmd, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PMD (RW=0, Dirty=1). Move the hardware + * dirty value to the software bit. + */ + return pmd_mksaveddirty(pmd); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP @@ -508,12 +556,21 @@ static inline pmd_t pmd_mkold(pmd_t pmd) static inline pmd_t pmd_mkclean(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_DIRTY); + return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS); } static inline pmd_t pmd_mkdirty(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pmd = pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pmd_mksaveddirty(pmd); +} + +static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd) +{ + pmd = pmd_clear_flags(pmd, _PAGE_RW); + + return pmd_set_flags(pmd, _PAGE_DIRTY); } static inline pmd_t pmd_mkdevmap(pmd_t pmd) @@ -536,6 +593,9 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_RW); } +pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); +#define pmd_mkwrite pmd_mkwrite + static inline pud_t pud_set_flags(pud_t pud, pudval_t set) { pudval_t v = native_pud_val(pud); @@ -575,17 +635,26 @@ static inline pud_t pud_mkold(pud_t pud) static inline pud_t pud_mkclean(pud_t pud) { - return pud_clear_flags(pud, _PAGE_DIRTY); + return pud_clear_flags(pud, _PAGE_DIRTY_BITS); } static inline pud_t pud_wrprotect(pud_t pud) { - return pud_clear_flags(pud, _PAGE_RW); + pud = pud_clear_flags(pud, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PUD (RW=0, Dirty=1). Move the hardware + * dirty value to the software bit. + */ + return pud_mksaveddirty(pud); } static inline pud_t pud_mkdirty(pud_t pud) { - return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pud = pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pud_mksaveddirty(pud); } static inline pud_t pud_mkdevmap(pud_t pud) @@ -605,7 +674,9 @@ static inline pud_t pud_mkyoung(pud_t pud) static inline pud_t pud_mkwrite(pud_t pud) { - return pud_set_flags(pud, _PAGE_RW); + pud = pud_set_flags(pud, _PAGE_RW); + + return pud_clear_saveddirty(pud); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -722,6 +793,7 @@ static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pteval_t val = pte_val(pte), oldval = val; + pte_t pte_result; /* * Chop off the NX bit (if present), and add the NX portion of @@ -730,17 +802,54 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) val &= _PAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); - return __pte(val); + + pte_result = __pte(val); + + /* + * To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid: + * 1. Marking Write=0 PTEs Dirty=1 + * 2. Marking Dirty=1 PTEs Write=0 + * + * The first case cannot happen because the _PAGE_CHG_MASK will filter + * out any Dirty bit passed in newprot. Handle the second case by + * going through the mksaveddirty exercise. Only do this if the old + * value was Write=1 to avoid doing this on Shadow Stack PTEs. + */ + if (oldval & _PAGE_RW) + pte_result = pte_mksaveddirty(pte_result); + else + pte_result = pte_clear_saveddirty(pte_result); + + return pte_result; } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { pmdval_t val = pmd_val(pmd), oldval = val; + pmd_t pmd_result; - val &= _HPAGE_CHG_MASK; + val &= (_HPAGE_CHG_MASK & ~_PAGE_DIRTY); val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK); - return __pmd(val); + + pmd_result = __pmd(val); + + /* + * To avoid creating Write=0,Dirty=1 PMDs, pte_modify() needs to avoid: + * 1. Marking Write=0 PMDs Dirty=1 + * 2. Marking Dirty=1 PMDs Write=0 + * + * The first case cannot happen because the _PAGE_CHG_MASK will filter + * out any Dirty bit passed in newprot. Handle the second case by + * going through the mksaveddirty exercise. Only do this if the old + * value was Write=1 to avoid doing this on Shadow Stack PTEs. + */ + if (oldval & _PAGE_RW) + pmd_result = pmd_mksaveddirty(pmd_result); + else + pmd_result = pmd_clear_saveddirty(pmd_result); + + return pmd_result; } /* diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 15a8009a4480..217c436acfd3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -872,3 +872,17 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) #endif /* CONFIG_X86_64 */ #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ + +pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + pte = pte_mkwrite_novma(pte); + + return pte_clear_saveddirty(pte); +} + +pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +{ + pmd = pmd_mkwrite_novma(pmd); + + return pmd_clear_saveddirty(pmd); +} -- cgit v1.2.3 From f788b71768ff6a8a453a93a9f366e162af560483 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:39 -0700 Subject: x86/mm: Remove _PAGE_DIRTY from kernel RO pages New processors that support Shadow Stack regard Write=0,Dirty=1 PTEs as shadow stack pages. In normal cases, it can be helpful to create Write=1 PTEs as also Dirty=1 if HW dirty tracking is not needed, because if the Dirty bit is not already set the CPU has to set Dirty=1 when the memory gets written to. This creates additional work for the CPU. So traditional wisdom was to simply set the Dirty bit whenever you didn't care about it. However, it was never really very helpful for read-only kernel memory. When CR4.CET=1 and IA32_S_CET.SH_STK_EN=1, some instructions can write to such supervisor memory. The kernel does not set IA32_S_CET.SH_STK_EN, so avoiding kernel Write=0,Dirty=1 memory is not strictly needed for any functional reason. But having Write=0,Dirty=1 kernel memory doesn't have any functional benefit either, so to reduce ambiguity between shadow stack and regular Write=0 pages, remove Dirty=1 from any kernel Write=0 PTEs. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-14-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/pgtable_types.h | 8 +++++--- arch/x86/mm/pat/set_memory.c | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 93796471a8fa..002f19e4e1f5 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -218,10 +218,12 @@ enum page_cache_mode { #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) -#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G) -#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0|___D| 0|___G) +#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX| 0| 0|___G) +#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0| 0| 0|___G) +#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) +#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) -#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G) +#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX| 0| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) #define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G) #define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index df4182b6449f..bda9f129835e 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2074,12 +2074,12 @@ int set_memory_nx(unsigned long addr, int numpages) int set_memory_ro(unsigned long addr, int numpages) { - return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0); } int set_memory_rox(unsigned long addr, int numpages) { - pgprot_t clr = __pgprot(_PAGE_RW); + pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY); if (__supported_pte_mask & _PAGE_NX) clr.pgprot |= _PAGE_NX; -- cgit v1.2.3 From fd5439e0c97bbc469d0a263ce343241fc48200ea Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:41 -0700 Subject: x86/mm: Check shadow stack page fault errors The CPU performs "shadow stack accesses" when it expects to encounter shadow stack mappings. These accesses can be implicit (via CALL/RET instructions) or explicit (instructions like WRSS). Shadow stack accesses to shadow-stack mappings can result in faults in normal, valid operation just like regular accesses to regular mappings. Shadow stacks need some of the same features like delayed allocation, swap and copy-on-write. The kernel needs to use faults to implement those features. The architecture has concepts of both shadow stack reads and shadow stack writes. Any shadow stack access to non-shadow stack memory will generate a fault with the shadow stack error code bit set. This means that, unlike normal write protection, the fault handler needs to create a type of memory that can be written to (with instructions that generate shadow stack writes), even to fulfill a read access. So in the case of COW memory, the COW needs to take place even with a shadow stack read. Otherwise the page will be left (shadow stack) writable in userspace. So to trigger the appropriate behavior, set FAULT_FLAG_WRITE for shadow stack accesses, even if the access was a shadow stack read. For the purpose of making this clearer, consider the following example. If a process has a shadow stack, and forks, the shadow stack PTEs will become read-only due to COW. If the CPU in one process performs a shadow stack read access to the shadow stack, for example executing a RET and causing the CPU to read the shadow stack copy of the return address, then in order for the fault to be resolved the PTE will need to be set with shadow stack permissions. But then the memory would be changeable from userspace (from CALL, RET, WRSS, etc). So this scenario needs to trigger COW, otherwise the shared page would be changeable from both processes. Shadow stack accesses can also result in errors, such as when a shadow stack overflows, or if a shadow stack access occurs to a non-shadow-stack mapping. Also, generate the errors for invalid shadow stack accesses. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-16-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/trap_pf.h | 2 ++ arch/x86/mm/fault.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h index 10b1de500ab1..afa524325e55 100644 --- a/arch/x86/include/asm/trap_pf.h +++ b/arch/x86/include/asm/trap_pf.h @@ -11,6 +11,7 @@ * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch * bit 5 == 1: protection keys block access + * bit 6 == 1: shadow stack access fault * bit 15 == 1: SGX MMU page-fault */ enum x86_pf_error_code { @@ -20,6 +21,7 @@ enum x86_pf_error_code { X86_PF_RSVD = 1 << 3, X86_PF_INSTR = 1 << 4, X86_PF_PK = 1 << 5, + X86_PF_SHSTK = 1 << 6, X86_PF_SGX = 1 << 15, }; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e8711b2cafaf..8dff37e7b824 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1112,8 +1112,22 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) (error_code & X86_PF_INSTR), foreign)) return 1; + /* + * Shadow stack accesses (PF_SHSTK=1) are only permitted to + * shadow stack VMAs. All other accesses result in an error. + */ + if (error_code & X86_PF_SHSTK) { + if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK))) + return 1; + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + return 0; + } + if (error_code & X86_PF_WRITE) { /* write, present and write, not present: */ + if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) + return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; @@ -1305,6 +1319,14 @@ void do_user_addr_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + /* + * Read-only permissions can not be expressed in shadow stack PTEs. + * Treat all shadow stack accesses as WRITE faults. This ensures + * that the MM will prepare everything (e.g., break COW) such that + * maybe_mkwrite() can create a proper shadow stack PTE. + */ + if (error_code & X86_PF_SHSTK) + flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_INSTR) -- cgit v1.2.3 From e5136e876581ba5b63220378e25fec9dcec7bad1 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:43 -0700 Subject: mm: Warn on shadow stack memory in wrong vma The x86 Control-flow Enforcement Technology (CET) feature includes a new type of memory called shadow stack. This shadow stack memory has some unusual properties, which requires some core mm changes to function properly. One sharp edge is that PTEs that are both Write=0 and Dirty=1 are treated as shadow by the CPU, but this combination used to be created by the kernel on x86. Previous patches have changed the kernel to now avoid creating these PTEs unless they are for shadow stack memory. In case any missed corners of the kernel are still creating PTEs like this for non-shadow stack memory, and to catch any re-introductions of the logic, warn if any shadow stack PTEs (Write=0, Dirty=1) are found in non-shadow stack VMAs when they are being zapped. This won't catch transient cases but should have decent coverage. In order to check if a PTE is shadow stack in core mm code, add two arch breakouts arch_check_zapped_pte/pmd(). This will allow shadow stack specific code to be kept in arch/x86. Only do the check if shadow stack is supported by the CPU and configured because in rare cases older CPUs may write Dirty=1 to a Write=0 CPU on older CPUs. This check is handled in pte_shstk()/pmd_shstk(). Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Mark Brown Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-18-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/pgtable.h | 6 ++++++ arch/x86/mm/pgtable.c | 20 ++++++++++++++++++++ include/linux/pgtable.h | 14 ++++++++++++++ mm/huge_memory.c | 1 + mm/memory.c | 1 + 5 files changed, 42 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7bab1b22a354..9255b5b7a9d9 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1665,6 +1665,12 @@ static inline bool arch_has_hw_pte_young(void) return true; } +#define arch_check_zapped_pte arch_check_zapped_pte +void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte); + +#define arch_check_zapped_pmd arch_check_zapped_pmd +void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd); + #ifdef CONFIG_XEN_PV #define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young static inline bool arch_has_hw_nonleaf_pmd_young(void) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 217c436acfd3..4bfbe4cb6978 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -886,3 +886,23 @@ pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd_clear_saveddirty(pmd); } + +void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) +{ + /* + * Hardware before shadow stack can (rarely) set Dirty=1 + * on a Write=0 PTE. So the below condition + * only indicates a software bug when shadow stack is + * supported by the HW. This checking is covered in + * pte_shstk(). + */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && + pte_shstk(pte)); +} + +void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) +{ + /* See note in arch_check_zapped_pte() */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && + pmd_shstk(pmd)); +} diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 9462f4a87d42..dd4637d6cfaa 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -313,6 +313,20 @@ static inline bool arch_has_hw_pte_young(void) } #endif +#ifndef arch_check_zapped_pte +static inline void arch_check_zapped_pte(struct vm_area_struct *vma, + pte_t pte) +{ +} +#endif + +#ifndef arch_check_zapped_pmd +static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, + pmd_t pmd) +{ +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 23c2aa612926..554f6f82d225 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1681,6 +1681,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, */ orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, tlb->fullmm); + arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) diff --git a/mm/memory.c b/mm/memory.c index f093c73512c5..36289f33327e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1430,6 +1430,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); -- cgit v1.2.3 From ae1f05a617dcbc0a732fbeba0893786cd009536c Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:44 -0700 Subject: x86/mm: Warn if create Write=0,Dirty=1 with raw prot When user shadow stack is in use, Write=0,Dirty=1 is treated by the CPU as shadow stack memory. So for shadow stack memory this bit combination is valid, but when Dirty=1,Write=1 (conventionally writable) memory is being write protected, the kernel has been taught to transition the Dirty=1 bit to SavedDirty=1, to avoid inadvertently creating shadow stack memory. It does this inside pte_wrprotect() because it knows the PTE is not intended to be a writable shadow stack entry, it is supposed to be write protected. However, when a PTE is created by a raw prot using mk_pte(), mk_pte() can't know whether to adjust Dirty=1 to SavedDirty=1. It can't distinguish between the caller intending to create a shadow stack PTE or needing the SavedDirty shift. The kernel has been updated to not do this, and so Write=0,Dirty=1 memory should only be created by the pte_mkfoo() helpers. Add a warning to make sure no new mk_pte() start doing this, like, for example, set_memory_rox() did. Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-19-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/pgtable.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9255b5b7a9d9..61b52445e4d6 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1033,7 +1033,14 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * (Currently stuck as a macro because of indirect forward reference * to linux/mm.h:page_to_nid()) */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) +#define mk_pte(page, pgprot) \ +({ \ + pgprot_t __pgprot = pgprot; \ + \ + WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \ + _PAGE_DIRTY); \ + pfn_pte(page_to_pfn(page), __pgprot); \ +}) static inline int pmd_bad(pmd_t pmd) { -- cgit v1.2.3 From 29f890d1050fc099fd578d9db844d6c0375902b6 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:46 -0700 Subject: x86/mm: Introduce MAP_ABOVE4G The x86 Control-flow Enforcement Technology (CET) feature includes a new type of memory called shadow stack. This shadow stack memory has some unusual properties, which require some core mm changes to function properly. One of the properties is that the shadow stack pointer (SSP), which is a CPU register that points to the shadow stack like the stack pointer points to the stack, can't be pointing outside of the 32 bit address space when the CPU is executing in 32 bit mode. It is desirable to prevent executing in 32 bit mode when shadow stack is enabled because the kernel can't easily support 32 bit signals. On x86 it is possible to transition to 32 bit mode without any special interaction with the kernel, by doing a "far call" to a 32 bit segment. So the shadow stack implementation can use this address space behavior as a feature, by enforcing that shadow stack memory is always mapped outside of the 32 bit address space. This way userspace will trigger a general protection fault which will in turn trigger a segfault if it tries to transition to 32 bit mode with shadow stack enabled. This provides a clean error generating border for the user if they try attempt to do 32 bit mode shadow stack, rather than leave the kernel in a half working state for userspace to be surprised by. So to allow future shadow stack enabling patches to map shadow stacks out of the 32 bit address space, introduce MAP_ABOVE4G. The behavior is pretty much like MAP_32BIT, except that it has the opposite address range. The are a few differences though. If both MAP_32BIT and MAP_ABOVE4G are provided, the kernel will use the MAP_ABOVE4G behavior. Like MAP_32BIT, MAP_ABOVE4G is ignored in a 32 bit syscall. Since the default search behavior is top down, the normal kaslr base can be used for MAP_ABOVE4G. This is unlike MAP_32BIT which has to add its own randomization in the bottom up case. For MAP_32BIT, only the bottom up search path is used. For MAP_ABOVE4G both are potentially valid, so both are used. In the bottomup search path, the default behavior is already consistent with MAP_ABOVE4G since mmap base should be above 4GB. Without MAP_ABOVE4G, the shadow stack will already normally be above 4GB. So without introducing MAP_ABOVE4G, trying to transition to 32 bit mode with shadow stack enabled would usually segfault anyway. This is already pretty decent guard rails. But the addition of MAP_ABOVE4G is some small complexity spent to make it make it more complete. Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-21-rick.p.edgecombe%40intel.com --- arch/x86/include/uapi/asm/mman.h | 1 + arch/x86/kernel/sys_x86_64.c | 6 +++++- include/linux/mman.h | 4 ++++ 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 775dbd3aff73..5a0256e73f1e 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -3,6 +3,7 @@ #define _ASM_X86_MMAN_H #define MAP_32BIT 0x40 /* only give out 32bit addresses */ +#define MAP_ABOVE4G 0x80 /* only map above 4GB */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS #define arch_calc_vm_prot_bits(prot, key) ( \ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 8cc653ffdccd..c783aeb37dce 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -193,7 +193,11 @@ get_unmapped_area: info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = PAGE_SIZE; + if (!in_32bit_syscall() && (flags & MAP_ABOVE4G)) + info.low_limit = SZ_4G; + else + info.low_limit = PAGE_SIZE; + info.high_limit = get_mmap_base(0); /* diff --git a/include/linux/mman.h b/include/linux/mman.h index cee1e4b566d8..40d94411d492 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -15,6 +15,9 @@ #ifndef MAP_32BIT #define MAP_32BIT 0 #endif +#ifndef MAP_ABOVE4G +#define MAP_ABOVE4G 0 +#endif #ifndef MAP_HUGE_2MB #define MAP_HUGE_2MB 0 #endif @@ -50,6 +53,7 @@ | MAP_STACK \ | MAP_HUGETLB \ | MAP_32BIT \ + | MAP_ABOVE4G \ | MAP_HUGE_2MB \ | MAP_HUGE_1GB) -- cgit v1.2.3 From b497e52ddb2ab750b00d8cc78209613558fc503b Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:47 -0700 Subject: x86/mm: Teach pte_mkwrite() about stack memory If a VMA has the VM_SHADOW_STACK flag, it is shadow stack memory. So when it is made writable with pte_mkwrite(), it should create shadow stack memory, not conventionally writable memory. Now that all the places where shadow stack memory might be created pass a VMA into pte_mkwrite(), it can know when it should do this. So make pte_mkwrite() create shadow stack memory when the VMA has the VM_SHADOW_STACK flag. Do the same thing for pmd_mkwrite(). This requires referencing VM_SHADOW_STACK in these functions, which are currently defined in pgtable.h, however mm.h (where VM_SHADOW_STACK is located) can't be pulled in without causing problems for files that reference pgtable.h. So also move pte/pmd_mkwrite() into pgtable.c, where they can safely reference VM_SHADOW_STACK. Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Acked-by: Deepak Gupta Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-22-rick.p.edgecombe%40intel.com --- arch/x86/mm/pgtable.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 4bfbe4cb6978..0d66aeb703a1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -875,6 +875,9 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { + if (vma->vm_flags & VM_SHADOW_STACK) + return pte_mkwrite_shstk(pte); + pte = pte_mkwrite_novma(pte); return pte_clear_saveddirty(pte); @@ -882,6 +885,9 @@ pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { + if (vma->vm_flags & VM_SHADOW_STACK) + return pmd_mkwrite_shstk(pmd); + pmd = pmd_mkwrite_novma(pmd); return pmd_clear_saveddirty(pmd); -- cgit v1.2.3 From 6beb99580bc040aed1d5fe7ed9083a4be77f3c20 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:48 -0700 Subject: mm: Don't allow write GUPs to shadow stack memory The x86 Control-flow Enforcement Technology (CET) feature includes a new type of memory called shadow stack. This shadow stack memory has some unusual properties, which requires some core mm changes to function properly. In userspace, shadow stack memory is writable only in very specific, controlled ways. However, since userspace can, even in the limited ways, modify shadow stack contents, the kernel treats it as writable memory. As a result, without additional work there would remain many ways for userspace to trigger the kernel to write arbitrary data to shadow stacks via get_user_pages(, FOLL_WRITE) based operations. To help userspace protect their shadow stacks, make this a little less exposed by blocking writable get_user_pages() operations for shadow stack VMAs. Still allow FOLL_FORCE to write through shadow stack protections, as it does for read-only protections. This is required for debugging use cases. [ dhansen: fix rebase goof, readd writable_file_mapping_allowed() hunk ] Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Acked-by: David Hildenbrand Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-23-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/pgtable.h | 5 +++++ mm/gup.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 61b52445e4d6..e95cfd3f1dda 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1631,6 +1631,11 @@ static inline bool __pte_access_permitted(unsigned long pteval, bool write) { unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; + /* + * Write=0,Dirty=1 PTEs are shadow stack, which the kernel + * shouldn't generally allow access to, but since they + * are already Write=0, the below logic covers both cases. + */ if (write) need_pte_bits |= _PAGE_RW; diff --git a/mm/gup.c b/mm/gup.c index 76d222ccc3ff..44c2658cc128 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1054,7 +1054,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) !writable_file_mapping_allowed(vma, gup_flags)) return -EFAULT; - if (!(vm_flags & VM_WRITE)) { + if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ -- cgit v1.2.3 From 8970ef027b21c58436f93b874286d342db164e3d Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:50 -0700 Subject: x86/fpu/xstate: Introduce CET MSR and XSAVES supervisor states Shadow stack register state can be managed with XSAVE. The registers can logically be separated into two groups: * Registers controlling user-mode operation * Registers controlling kernel-mode operation The architecture has two new XSAVE state components: one for each group of those groups of registers. This lets an OS manage them separately if it chooses. Future patches for host userspace and KVM guests will only utilize the user-mode registers, so only configure XSAVE to save user-mode registers. This state will add 16 bytes to the xsave buffer size. Future patches will use the user-mode XSAVE area to save guest user-mode CET state. However, VMCS includes new fields for guest CET supervisor states. KVM can use these to save and restore guest supervisor state, so host supervisor XSAVE support is not required. Adding this exacerbates the already unwieldy if statement in check_xstate_against_struct() that handles warning about unimplemented xfeatures. So refactor these check's by having XCHECK_SZ() set a bool when it actually check's the xfeature. This ends up exceeding 80 chars, but was better on balance than other options explored. Pass the bool as pointer to make it clear that XCHECK_SZ() can change the variable. While configuring user-mode XSAVE, clarify kernel-mode registers are not managed by XSAVE by defining the xfeature in XFEATURE_MASK_SUPERVISOR_UNSUPPORTED, like is done for XFEATURE_MASK_PT. This serves more of a documentation as code purpose, and functionally, only enables a few safety checks. Both XSAVE state components are supervisor states, even the state controlling user-mode operation. This is a departure from earlier features like protection keys where the PKRU state is a normal user (non-supervisor) state. Having the user state be supervisor-managed ensures there is no direct, unprivileged access to it, making it harder for an attacker to subvert CET. To facilitate this privileged access, define the two user-mode CET MSRs, and the bits defined in those MSRs relevant to future shadow stack enablement patches. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-25-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/fpu/types.h | 16 ++++++- arch/x86/include/asm/fpu/xstate.h | 6 ++- arch/x86/kernel/fpu/xstate.c | 90 +++++++++++++++++++-------------------- 3 files changed, 61 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 7f6d858ff47a..eb810074f1e7 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -115,8 +115,8 @@ enum xfeature { XFEATURE_PT_UNIMPLEMENTED_SO_FAR, XFEATURE_PKRU, XFEATURE_PASID, - XFEATURE_RSRVD_COMP_11, - XFEATURE_RSRVD_COMP_12, + XFEATURE_CET_USER, + XFEATURE_CET_KERNEL_UNUSED, XFEATURE_RSRVD_COMP_13, XFEATURE_RSRVD_COMP_14, XFEATURE_LBR, @@ -138,6 +138,8 @@ enum xfeature { #define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) +#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER) +#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL_UNUSED) #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) #define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) @@ -252,6 +254,16 @@ struct pkru_state { u32 pad; } __packed; +/* + * State component 11 is Control-flow Enforcement user states + */ +struct cet_user_state { + /* user control-flow settings */ + u64 user_cet; + /* user shadow stack pointer */ + u64 user_ssp; +}; + /* * State component 15: Architectural LBR configuration state. * The size of Arch LBR state depends on the number of LBRs (lbr_depth). diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index cd3dd170e23a..d4427b88ee12 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -50,7 +50,8 @@ #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA /* All currently supported supervisor features */ -#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) +#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \ + XFEATURE_MASK_CET_USER) /* * A supervisor state component may not always contain valuable information, @@ -77,7 +78,8 @@ * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. */ -#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) +#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \ + XFEATURE_MASK_CET_KERNEL) /* All supervisor states including supported and unsupported states. */ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0bab497c9436..4fa4751912d9 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -39,26 +39,26 @@ */ static const char *xfeature_names[] = { - "x87 floating point registers" , - "SSE registers" , - "AVX registers" , - "MPX bounds registers" , - "MPX CSR" , - "AVX-512 opmask" , - "AVX-512 Hi256" , - "AVX-512 ZMM_Hi256" , - "Processor Trace (unused)" , + "x87 floating point registers", + "SSE registers", + "AVX registers", + "MPX bounds registers", + "MPX CSR", + "AVX-512 opmask", + "AVX-512 Hi256", + "AVX-512 ZMM_Hi256", + "Processor Trace (unused)", "Protection Keys User registers", "PASID state", - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "AMX Tile config" , - "AMX Tile data" , - "unknown xstate feature" , + "Control-flow User registers", + "Control-flow Kernel registers (unused)", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "AMX Tile config", + "AMX Tile data", + "unknown xstate feature", }; static unsigned short xsave_cpuid_features[] __initdata = { @@ -73,6 +73,7 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, [XFEATURE_PKRU] = X86_FEATURE_PKU, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, + [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; @@ -276,6 +277,7 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); print_xstate_feature(XFEATURE_MASK_PASID); + print_xstate_feature(XFEATURE_MASK_CET_USER); print_xstate_feature(XFEATURE_MASK_XTILE_CFG); print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } @@ -344,6 +346,7 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ + XFEATURE_MASK_CET_USER | \ XFEATURE_MASK_XTILE) /* @@ -446,14 +449,15 @@ static void __init __xstate_dump_leaves(void) } \ } while (0) -#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \ - if ((nr == nr_macro) && \ - WARN_ONCE(sz != sizeof(__struct), \ - "%s: struct is %zu bytes, cpu state %d bytes\n", \ - __stringify(nr_macro), sizeof(__struct), sz)) { \ +#define XCHECK_SZ(sz, nr, __struct) ({ \ + if (WARN_ONCE(sz != sizeof(__struct), \ + "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ + xfeature_names[nr], sizeof(__struct), sz)) { \ __xstate_dump_leaves(); \ } \ -} while (0) + true; \ +}) + /** * check_xtile_data_against_struct - Check tile data state size. @@ -527,36 +531,28 @@ static bool __init check_xstate_against_struct(int nr) * Ask the CPU for the size of the state. */ int sz = xfeature_size(nr); + /* * Match each CPU state with the corresponding software * structure. */ - XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct); - XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state); - XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state); - XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); - XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); - XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); - XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); - XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); - XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); - - /* The tile data size varies between implementations. */ - if (nr == XFEATURE_XTILE_DATA) - check_xtile_data_against_struct(sz); - - /* - * Make *SURE* to add any feature numbers in below if - * there are "holes" in the xsave state component - * numbers. - */ - if ((nr < XFEATURE_YMM) || - (nr >= XFEATURE_MAX) || - (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { + switch (nr) { + case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); + case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); + case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); + case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); + case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); + case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); + case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); + case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); + case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); + case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); + case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; + default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } + return true; } -- cgit v1.2.3 From 6ee836687a3f39f92da790d33fa9694fe0143410 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:51 -0700 Subject: x86/fpu: Add helper for modifying xstate Just like user xfeatures, supervisor xfeatures can be active in the registers or present in the task FPU buffer. If the registers are active, the registers can be modified directly. If the registers are not active, the modification must be performed on the task FPU buffer. When the state is not active, the kernel could perform modifications directly to the buffer. But in order for it to do that, it needs to know where in the buffer the specific state it wants to modify is located. Doing this is not robust against optimizations that compact the FPU buffer, as each access would require computing where in the buffer it is. The easiest way to modify supervisor xfeature data is to force restore the registers and write directly to the MSRs. Often times this is just fine anyway as the registers need to be restored before returning to userspace. Do this for now, leaving buffer writing optimizations for the future. Add a new function fpregs_lock_and_load() that can simultaneously call fpregs_lock() and do this restore. Also perform some extra sanity checks in this function since this will be used in non-fpu focused code. Suggested-by: Thomas Gleixner Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-26-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/fpu/api.h | 9 +++++++++ arch/x86/kernel/fpu/core.c | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index b475d9a582b8..31089b851c4f 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -82,6 +82,15 @@ static inline void fpregs_unlock(void) preempt_enable(); } +/* + * FPU state gets lazily restored before returning to userspace. So when in the + * kernel, the valid FPU state may be kept in the buffer. This function will force + * restore all the fpu state to the registers early if needed, and lock them from + * being automatically saved/restored. Then FPU state can be modified safely in the + * registers, before unlocking with fpregs_unlock(). + */ +void fpregs_lock_and_load(void); + #ifdef CONFIG_X86_DEBUG_FPU extern void fpregs_assert_state_consistent(void); #else diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1015af1ae562..375852cf8fac 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -753,6 +753,24 @@ void switch_fpu_return(void) } EXPORT_SYMBOL_GPL(switch_fpu_return); +void fpregs_lock_and_load(void) +{ + /* + * fpregs_lock() only disables preemption (mostly). So modifying state + * in an interrupt could screw up some in progress fpregs operation. + * Warn about it. + */ + WARN_ON_ONCE(!irq_fpu_usable()); + WARN_ON_ONCE(current->flags & PF_KTHREAD); + + fpregs_lock(); + + fpregs_assert_state_consistent(); + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); +} + #ifdef CONFIG_X86_DEBUG_FPU /* * If current FPU state according to its tracking (loaded FPU context on this -- cgit v1.2.3 From 98cfa4630912a80a575277d1bf193376ba66116a Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:52 -0700 Subject: x86: Introduce userspace API for shadow stack Add three new arch_prctl() handles: - ARCH_SHSTK_ENABLE/DISABLE enables or disables the specified feature. Returns 0 on success or a negative value on error. - ARCH_SHSTK_LOCK prevents future disabling or enabling of the specified feature. Returns 0 on success or a negative value on error. The features are handled per-thread and inherited over fork(2)/clone(2), but reset on exec(). Co-developed-by: Kirill A. Shutemov Signed-off-by: Kirill A. Shutemov Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-27-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/processor.h | 6 ++++++ arch/x86/include/asm/shstk.h | 21 +++++++++++++++++++ arch/x86/include/uapi/asm/prctl.h | 6 ++++++ arch/x86/kernel/Makefile | 2 ++ arch/x86/kernel/process_64.c | 6 ++++++ arch/x86/kernel/shstk.c | 44 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+) create mode 100644 arch/x86/include/asm/shstk.h create mode 100644 arch/x86/kernel/shstk.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d46300e94f85..4e35f40ea1e6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -28,6 +28,7 @@ struct vm86; #include #include #include +#include #include #include @@ -475,6 +476,11 @@ struct thread_struct { */ u32 pkru; +#ifdef CONFIG_X86_USER_SHADOW_STACK + unsigned long features; + unsigned long features_locked; +#endif + /* Floating point and extended processor state */ struct fpu fpu; /* diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h new file mode 100644 index 000000000000..ec753809f074 --- /dev/null +++ b/arch/x86/include/asm/shstk.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHSTK_H +#define _ASM_X86_SHSTK_H + +#ifndef __ASSEMBLY__ +#include + +struct task_struct; + +#ifdef CONFIG_X86_USER_SHADOW_STACK +long shstk_prctl(struct task_struct *task, int option, unsigned long features); +void reset_thread_features(void); +#else +static inline long shstk_prctl(struct task_struct *task, int option, + unsigned long arg2) { return -EINVAL; } +static inline void reset_thread_features(void) {} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_SHSTK_H */ diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index e8d7ebbca1a4..1cd44ecc9ce0 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -23,9 +23,15 @@ #define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_64 0x2003 +/* Don't use 0x3001-0x3004 because of old glibcs */ + #define ARCH_GET_UNTAG_MASK 0x4001 #define ARCH_ENABLE_TAGGED_ADDR 0x4002 #define ARCH_GET_MAX_TAG_BITS 0x4003 #define ARCH_FORCE_TAGGED_SVA 0x4004 +#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_DISABLE 0x5002 +#define ARCH_SHSTK_LOCK 0x5003 + #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index abee0564b750..6b6bf47652ee 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -147,6 +147,8 @@ obj-$(CONFIG_CALL_THUNKS) += callthunks.o obj-$(CONFIG_X86_CET) += cet.o +obj-$(CONFIG_X86_USER_SHADOW_STACK) += shstk.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3d181c16a2f6..0f89aa0186d1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -515,6 +515,8 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, load_gs_index(__USER_DS); } + reset_thread_features(); + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); @@ -894,6 +896,10 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) else return put_user(LAM_U57_BITS, (unsigned long __user *)arg2); #endif + case ARCH_SHSTK_ENABLE: + case ARCH_SHSTK_DISABLE: + case ARCH_SHSTK_LOCK: + return shstk_prctl(task, option, arg2); default: ret = -EINVAL; break; diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c new file mode 100644 index 000000000000..41ed6552e0a5 --- /dev/null +++ b/arch/x86/kernel/shstk.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * shstk.c - Intel shadow stack support + * + * Copyright (c) 2021, Intel Corporation. + * Yu-cheng Yu + */ + +#include +#include +#include + +void reset_thread_features(void) +{ + current->thread.features = 0; + current->thread.features_locked = 0; +} + +long shstk_prctl(struct task_struct *task, int option, unsigned long features) +{ + if (option == ARCH_SHSTK_LOCK) { + task->thread.features_locked |= features; + return 0; + } + + /* Don't allow via ptrace */ + if (task != current) + return -EINVAL; + + /* Do not allow to change locked features */ + if (features & task->thread.features_locked) + return -EPERM; + + /* Only support enabling/disabling one feature at a time. */ + if (hweight_long(features) > 1) + return -EINVAL; + + if (option == ARCH_SHSTK_DISABLE) { + return -EINVAL; + } + + /* Handle ARCH_SHSTK_ENABLE */ + return -EINVAL; +} -- cgit v1.2.3 From a5f6c2ace9974adf92ce65dacca8126d90adabfe Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:53 -0700 Subject: x86/shstk: Add user control-protection fault handler A control-protection fault is triggered when a control-flow transfer attempt violates Shadow Stack or Indirect Branch Tracking constraints. For example, the return address for a RET instruction differs from the copy on the shadow stack. There already exists a control-protection fault handler for handling kernel IBT faults. Refactor this fault handler into separate user and kernel handlers, like the page fault handler. Add a control-protection handler for usermode. To avoid ifdeffery, put them both in a new file cet.c, which is compiled in the case of either of the two CET features supported in the kernel: kernel IBT or user mode shadow stack. Move some static inline functions from traps.c into a header so they can be used in cet.c. Opportunistically fix a comment in the kernel IBT part of the fault handler that is on the end of the line instead of preceding it. Keep the same behavior for the kernel side of the fault handler, except for converting a BUG to a WARN in the case of a #CP happening when the feature is missing. This unifies the behavior with the new shadow stack code, and also prevents the kernel from crashing under this situation which is potentially recoverable. The control-protection fault handler works in a similar way as the general protection fault handler. It provides the si_code SEGV_CPERR to the signal handler. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-28-rick.p.edgecombe%40intel.com --- arch/arm/kernel/signal.c | 2 +- arch/arm64/kernel/signal.c | 2 +- arch/arm64/kernel/signal32.c | 2 +- arch/sparc/kernel/signal32.c | 2 +- arch/sparc/kernel/signal_64.c | 2 +- arch/x86/include/asm/disabled-features.h | 8 ++- arch/x86/include/asm/idtentry.h | 2 +- arch/x86/include/asm/traps.h | 12 ++++ arch/x86/kernel/cet.c | 94 +++++++++++++++++++++++++++++--- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/signal_32.c | 2 +- arch/x86/kernel/signal_64.c | 2 +- arch/x86/kernel/traps.c | 12 ---- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm.S | 2 +- include/uapi/asm-generic/siginfo.h | 3 +- 16 files changed, 117 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 8d0afa11bed5..79a6730fa0eb 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -682,7 +682,7 @@ asmlinkage void do_rseq_syscall(struct pt_regs *regs) */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index e304f7ebec2a..0df8cc295ea5 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -1344,7 +1344,7 @@ void __init minsigstksz_setup(void) */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 4700f8522d27..bbd542704730 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -460,7 +460,7 @@ void compat_setup_restart_syscall(struct pt_regs *regs) */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index ca450c7bc53f..a23cdd7459bb 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -753,7 +753,7 @@ out: */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 570e43e6fda5..b4e410976e0d 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -562,7 +562,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index b9c7eae2e70f..702d93fdd10e 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -111,6 +111,12 @@ #define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31)) #endif +#ifdef CONFIG_X86_KERNEL_IBT +#define DISABLE_IBT 0 +#else +#define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -134,7 +140,7 @@ #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ DISABLE_ENQCMD) #define DISABLED_MASK17 0 -#define DISABLED_MASK18 0 +#define DISABLED_MASK18 (DISABLE_IBT) #define DISABLED_MASK19 0 #define DISABLED_MASK20 0 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b241af4ce9b4..61e0e6301f09 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -614,7 +614,7 @@ DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault); #endif /* #CP */ -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection); #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 47ecfff2c83d..75e0dabf0c45 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -47,4 +47,16 @@ void __noreturn handle_stack_overflow(struct pt_regs *regs, struct stack_info *info); #endif +static inline void cond_local_irq_enable(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void cond_local_irq_disable(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); +} + #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c index 7ad22b705b64..cc10d8be9d74 100644 --- a/arch/x86/kernel/cet.c +++ b/arch/x86/kernel/cet.c @@ -4,10 +4,6 @@ #include #include -static __ro_after_init bool ibt_fatal = true; - -extern void ibt_selftest_ip(void); /* code label defined in asm below */ - enum cp_error_code { CP_EC = (1 << 15) - 1, @@ -20,15 +16,80 @@ enum cp_error_code { CP_ENCL = 1 << 15, }; -DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) +static const char cp_err[][10] = { + [0] = "unknown", + [1] = "near ret", + [2] = "far/iret", + [3] = "endbranch", + [4] = "rstorssp", + [5] = "setssbsy", +}; + +static const char *cp_err_string(unsigned long error_code) +{ + unsigned int cpec = error_code & CP_EC; + + if (cpec >= ARRAY_SIZE(cp_err)) + cpec = 0; + return cp_err[cpec]; +} + +static void do_unexpected_cp(struct pt_regs *regs, unsigned long error_code) +{ + WARN_ONCE(1, "Unexpected %s #CP, error_code: %s\n", + user_mode(regs) ? "user mode" : "kernel mode", + cp_err_string(error_code)); +} + +static DEFINE_RATELIMIT_STATE(cpf_rate, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + +static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) { - if (!cpu_feature_enabled(X86_FEATURE_IBT)) { - pr_err("Unexpected #CP\n"); - BUG(); + struct task_struct *tsk; + unsigned long ssp; + + /* + * An exception was just taken from userspace. Since interrupts are disabled + * here, no scheduling should have messed with the registers yet and they + * will be whatever is live in userspace. So read the SSP before enabling + * interrupts so locking the fpregs to do it later is not required. + */ + rdmsrl(MSR_IA32_PL3_SSP, ssp); + + cond_local_irq_enable(regs); + + tsk = current; + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_CP; + + /* Ratelimit to prevent log spamming. */ + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + __ratelimit(&cpf_rate)) { + pr_emerg("%s[%d] control protection ip:%lx sp:%lx ssp:%lx error:%lx(%s)%s", + tsk->comm, task_pid_nr(tsk), + regs->ip, regs->sp, ssp, error_code, + cp_err_string(error_code), + error_code & CP_ENCL ? " in enclave" : ""); + print_vma_addr(KERN_CONT " in ", regs->ip); + pr_cont("\n"); } - if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR)) + force_sig_fault(SIGSEGV, SEGV_CPERR, (void __user *)0); + cond_local_irq_disable(regs); +} + +static __ro_after_init bool ibt_fatal = true; + +/* code label defined in asm below */ +extern void ibt_selftest_ip(void); + +static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) +{ + if ((error_code & CP_EC) != CP_ENDBR) { + do_unexpected_cp(regs, error_code); return; + } if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) { regs->ax = 0; @@ -74,3 +135,18 @@ static int __init ibt_setup(char *str) } __setup("ibt=", ibt_setup); + +DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) +{ + if (user_mode(regs)) { + if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + do_user_cp_fault(regs, error_code); + else + do_unexpected_cp(regs, error_code); + } else { + if (cpu_feature_enabled(X86_FEATURE_IBT)) + do_kernel_cp_fault(regs, error_code); + else + do_unexpected_cp(regs, error_code); + } +} diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index a58c6bc1cd68..5074b8420359 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -107,7 +107,7 @@ static const __initconst struct idt_data def_idts[] = { ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), #endif -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET INTG(X86_TRAP_CP, asm_exc_control_protection), #endif diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 9027fc088f97..c12624bc82a3 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -402,7 +402,7 @@ Efault: */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 13a1e6083837..0e808c72bf7e 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -403,7 +403,7 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6f666dfa97de..f358350624b2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -77,18 +77,6 @@ DECLARE_BITMAP(system_vectors, NR_VECTORS); -static inline void cond_local_irq_enable(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void cond_local_irq_disable(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); -} - __always_inline int is_valid_bugaddr(unsigned long addr) { if (addr < TASK_SIZE_MAX) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 93b658248d01..aa797b892b5a 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -694,7 +694,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_coprocessor_error, false ), TRAP_ENTRY(exc_alignment_check, false ), TRAP_ENTRY(exc_simd_coprocessor_error, false ), -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET TRAP_ENTRY(exc_control_protection, false ), #endif }; diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 08f1ceb9eb81..9e5e68008785 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -148,7 +148,7 @@ xen_pv_trap asm_exc_page_fault xen_pv_trap asm_exc_spurious_interrupt_bug xen_pv_trap asm_exc_coprocessor_error xen_pv_trap asm_exc_alignment_check -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET xen_pv_trap asm_exc_control_protection #endif #ifdef CONFIG_X86_MCE diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index ffbe4cec9f32..0f52d0ac47c5 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -242,7 +242,8 @@ typedef struct siginfo { #define SEGV_ADIPERR 7 /* Precise MCD exception */ #define SEGV_MTEAERR 8 /* Asynchronous ARM MTE error */ #define SEGV_MTESERR 9 /* Synchronous ARM MTE exception */ -#define NSIGSEGV 9 +#define SEGV_CPERR 10 /* Control protection fault */ +#define NSIGSEGV 10 /* * SIGBUS si_codes -- cgit v1.2.3 From 2d39a6add422ac78254927ec2194838c33ae4fb2 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:54 -0700 Subject: x86/shstk: Add user-mode shadow stack support Introduce basic shadow stack enabling/disabling/allocation routines. A task's shadow stack is allocated from memory with VM_SHADOW_STACK flag and has a fixed size of min(RLIMIT_STACK, 4GB). Keep the task's shadow stack address and size in thread_struct. This will be copied when cloning new threads, but needs to be cleared during exec, so add a function to do this. 32 bit shadow stack is not expected to have many users and it will complicate the signal implementation. So do not support IA32 emulation or x32. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-29-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/processor.h | 2 + arch/x86/include/asm/shstk.h | 7 ++ arch/x86/include/uapi/asm/prctl.h | 3 + arch/x86/kernel/shstk.c | 145 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4e35f40ea1e6..b216ac80ebcc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -479,6 +479,8 @@ struct thread_struct { #ifdef CONFIG_X86_USER_SHADOW_STACK unsigned long features; unsigned long features_locked; + + struct thread_shstk shstk; #endif /* Floating point and extended processor state */ diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h index ec753809f074..2b1f7c9b9995 100644 --- a/arch/x86/include/asm/shstk.h +++ b/arch/x86/include/asm/shstk.h @@ -8,12 +8,19 @@ struct task_struct; #ifdef CONFIG_X86_USER_SHADOW_STACK +struct thread_shstk { + u64 base; + u64 size; +}; + long shstk_prctl(struct task_struct *task, int option, unsigned long features); void reset_thread_features(void); +void shstk_free(struct task_struct *p); #else static inline long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { return -EINVAL; } static inline void reset_thread_features(void) {} +static inline void shstk_free(struct task_struct *p) {} #endif /* CONFIG_X86_USER_SHADOW_STACK */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 1cd44ecc9ce0..6a8e0e1bff4a 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -34,4 +34,7 @@ #define ARCH_SHSTK_DISABLE 0x5002 #define ARCH_SHSTK_LOCK 0x5003 +/* ARCH_SHSTK_ features bits */ +#define ARCH_SHSTK_SHSTK (1ULL << 0) + #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 41ed6552e0a5..3cb85224d856 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -8,14 +8,159 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +static bool features_enabled(unsigned long features) +{ + return current->thread.features & features; +} + +static void features_set(unsigned long features) +{ + current->thread.features |= features; +} + +static void features_clr(unsigned long features) +{ + current->thread.features &= ~features; +} + +static unsigned long alloc_shstk(unsigned long size) +{ + int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G; + struct mm_struct *mm = current->mm; + unsigned long addr, unused; + + mmap_write_lock(mm); + addr = do_mmap(NULL, addr, size, PROT_READ, flags, + VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); + + mmap_write_unlock(mm); + + return addr; +} + +static unsigned long adjust_shstk_size(unsigned long size) +{ + if (size) + return PAGE_ALIGN(size); + + return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G)); +} + +static void unmap_shadow_stack(u64 base, u64 size) +{ + while (1) { + int r; + + r = vm_munmap(base, size); + + /* + * vm_munmap() returns -EINTR when mmap_lock is held by + * something else, and that lock should not be held for a + * long time. Retry it for the case. + */ + if (r == -EINTR) { + cond_resched(); + continue; + } + + /* + * For all other types of vm_munmap() failure, either the + * system is out of memory or there is bug. + */ + WARN_ON_ONCE(r); + break; + } +} + +static int shstk_setup(void) +{ + struct thread_shstk *shstk = ¤t->thread.shstk; + unsigned long addr, size; + + /* Already enabled */ + if (features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + /* Also not supported for 32 bit and x32 */ + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall()) + return -EOPNOTSUPP; + + size = adjust_shstk_size(0); + addr = alloc_shstk(size); + if (IS_ERR_VALUE(addr)) + return PTR_ERR((void *)addr); + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, addr + size); + wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN); + fpregs_unlock(); + + shstk->base = addr; + shstk->size = size; + features_set(ARCH_SHSTK_SHSTK); + + return 0; +} + void reset_thread_features(void) { + memset(¤t->thread.shstk, 0, sizeof(struct thread_shstk)); current->thread.features = 0; current->thread.features_locked = 0; } +void shstk_free(struct task_struct *tsk) +{ + struct thread_shstk *shstk = &tsk->thread.shstk; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return; + + if (!tsk->mm) + return; + + unmap_shadow_stack(shstk->base, shstk->size); +} + +static int shstk_disable(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + /* Already disabled? */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + fpregs_lock_and_load(); + /* Disable WRSS too when disabling shadow stack */ + wrmsrl(MSR_IA32_U_CET, 0); + wrmsrl(MSR_IA32_PL3_SSP, 0); + fpregs_unlock(); + + shstk_free(current); + features_clr(ARCH_SHSTK_SHSTK); + + return 0; +} + long shstk_prctl(struct task_struct *task, int option, unsigned long features) { if (option == ARCH_SHSTK_LOCK) { -- cgit v1.2.3 From b2926a36b97a4f8daf162b6dc79b519c2b5a8b94 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:55 -0700 Subject: x86/shstk: Handle thread shadow stack When a process is duplicated, but the child shares the address space with the parent, there is potential for the threads sharing a single stack to cause conflicts for each other. In the normal non-CET case this is handled in two ways. With regular CLONE_VM a new stack is provided by userspace such that the parent and child have different stacks. For vfork, the parent is suspended until the child exits. So as long as the child doesn't return from the vfork()/CLONE_VFORK calling function and sticks to a limited set of operations, the parent and child can share the same stack. For shadow stack, these scenarios present similar sharing problems. For the CLONE_VM case, the child and the parent must have separate shadow stacks. Instead of changing clone to take a shadow stack, have the kernel just allocate one and switch to it. Use stack_size passed from clone3() syscall for thread shadow stack size. A compat-mode thread shadow stack size is further reduced to 1/4. This allows more threads to run in a 32-bit address space. The clone() does not pass stack_size, which was added to clone3(). In that case, use RLIMIT_STACK size and cap to 4 GB. For shadow stack enabled vfork(), the parent and child can share the same shadow stack, like they can share a normal stack. Since the parent is suspended until the child terminates, the child will not interfere with the parent while executing as long as it doesn't return from the vfork() and overwrite up the shadow stack. The child can safely overwrite down the shadow stack, as the parent can just overwrite this later. So CET does not add any additional limitations for vfork(). Free the shadow stack on thread exit by doing it in mm_release(). Skip this when exiting a vfork() child since the stack is shared in the parent. During this operation, the shadow stack pointer of the new thread needs to be updated to point to the newly allocated shadow stack. Since the ability to do this is confined to the FPU subsystem, change fpu_clone() to take the new shadow stack pointer, and update it internally inside the FPU subsystem. This part was suggested by Thomas Gleixner. Co-developed-by: Yu-cheng Yu Suggested-by: Thomas Gleixner Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-30-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/fpu/sched.h | 3 ++- arch/x86/include/asm/mmu_context.h | 2 ++ arch/x86/include/asm/shstk.h | 5 +++++ arch/x86/kernel/fpu/core.c | 36 ++++++++++++++++++++++++++++++++- arch/x86/kernel/process.c | 21 ++++++++++++++++++- arch/x86/kernel/shstk.c | 41 ++++++++++++++++++++++++++++++++++++-- 6 files changed, 103 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index 78fcde7b1f07..ca6e5e5f16b2 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -11,7 +11,8 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); extern void fpu__drop(struct fpu *fpu); -extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal); +extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, + unsigned long shstk_addr); extern void fpu_flush_thread(void); /* diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 1d29dc791f5a..416901d406f8 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -186,6 +186,8 @@ do { \ #else #define deactivate_mm(tsk, mm) \ do { \ + if (!tsk->vfork_done) \ + shstk_free(tsk); \ load_gs_index(0); \ loadsegment(fs, 0); \ } while (0) diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h index 2b1f7c9b9995..d4a5c7b10cb5 100644 --- a/arch/x86/include/asm/shstk.h +++ b/arch/x86/include/asm/shstk.h @@ -15,11 +15,16 @@ struct thread_shstk { long shstk_prctl(struct task_struct *task, int option, unsigned long features); void reset_thread_features(void); +unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags, + unsigned long stack_size); void shstk_free(struct task_struct *p); #else static inline long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { return -EINVAL; } static inline void reset_thread_features(void) {} +static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p, + unsigned long clone_flags, + unsigned long stack_size) { return 0; } static inline void shstk_free(struct task_struct *p) {} #endif /* CONFIG_X86_USER_SHADOW_STACK */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 375852cf8fac..e03b6b107b20 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -552,8 +552,36 @@ static inline void fpu_inherit_perms(struct fpu *dst_fpu) } } +/* A passed ssp of zero will not cause any update */ +static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) +{ +#ifdef CONFIG_X86_USER_SHADOW_STACK + struct cet_user_state *xstate; + + /* If ssp update is not needed. */ + if (!ssp) + return 0; + + xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, + XFEATURE_CET_USER); + + /* + * If there is a non-zero ssp, then 'dst' must be configured with a shadow + * stack and the fpu state should be up to date since it was just copied + * from the parent in fpu_clone(). So there must be a valid non-init CET + * state location in the buffer. + */ + if (WARN_ON_ONCE(!xstate)) + return 1; + + xstate->user_ssp = (u64)ssp; +#endif + return 0; +} + /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, + unsigned long ssp) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -613,6 +641,12 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) if (use_xsave()) dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; + /* + * Update shadow stack pointer, in case it changed during clone. + */ + if (update_fpu_shstk(dst, ssp)) + return 1; + trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ff9b80a0e3e3..cc7a642f8c9d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "process.h" @@ -121,6 +122,7 @@ void exit_thread(struct task_struct *tsk) free_vm86(t); + shstk_free(tsk); fpu__drop(fpu); } @@ -142,6 +144,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) struct inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; + unsigned long new_ssp; int ret = 0; childregs = task_pt_regs(p); @@ -179,7 +182,16 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame->flags = X86_EFLAGS_FIXED; #endif - fpu_clone(p, clone_flags, args->fn); + /* + * Allocate a new shadow stack for thread if needed. If shadow stack, + * is disabled, new_ssp will remain 0, and fpu_clone() will know not to + * update it. + */ + new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size); + if (IS_ERR_VALUE(new_ssp)) + return PTR_ERR((void *)new_ssp); + + fpu_clone(p, clone_flags, args->fn, new_ssp); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { @@ -225,6 +237,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) io_bitmap_share(p); + /* + * If copy_thread() if failing, don't leak the shadow stack possibly + * allocated in shstk_alloc_thread_stack() above. + */ + if (ret) + shstk_free(p); + return ret; } diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 3cb85224d856..bd9cdc3a7338 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -47,7 +47,7 @@ static unsigned long alloc_shstk(unsigned long size) unsigned long addr, unused; mmap_write_lock(mm); - addr = do_mmap(NULL, addr, size, PROT_READ, flags, + addr = do_mmap(NULL, 0, size, PROT_READ, flags, VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); mmap_write_unlock(mm); @@ -126,6 +126,37 @@ void reset_thread_features(void) current->thread.features_locked = 0; } +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags, + unsigned long stack_size) +{ + struct thread_shstk *shstk = &tsk->thread.shstk; + unsigned long addr, size; + + /* + * If shadow stack is not enabled on the new thread, skip any + * switch to a new shadow stack. + */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + /* + * For CLONE_VM, except vfork, the child needs a separate shadow + * stack. + */ + if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM) + return 0; + + size = adjust_shstk_size(stack_size); + addr = alloc_shstk(size); + if (IS_ERR_VALUE(addr)) + return addr; + + shstk->base = addr; + shstk->size = size; + + return addr + size; +} + void shstk_free(struct task_struct *tsk) { struct thread_shstk *shstk = &tsk->thread.shstk; @@ -134,7 +165,13 @@ void shstk_free(struct task_struct *tsk) !features_enabled(ARCH_SHSTK_SHSTK)) return; - if (!tsk->mm) + /* + * When fork() with CLONE_VM fails, the child (tsk) already has a + * shadow stack allocated, and exit_thread() calls this function to + * free it. In this case the parent (current) and the child share + * the same mm struct. + */ + if (!tsk->mm || tsk->mm != current->mm) return; unmap_shadow_stack(shstk->base, shstk->size); -- cgit v1.2.3 From 928054769dbdedc03aaebceecaaef15a0707b8cb Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:56 -0700 Subject: x86/shstk: Introduce routines modifying shstk Shadow stacks are normally written to via CALL/RET or specific CET instructions like RSTORSSP/SAVEPREVSSP. However, sometimes the kernel will need to write to the shadow stack directly using the ring-0 only WRUSS instruction. A shadow stack restore token marks a restore point of the shadow stack, and the address in a token must point directly above the token, which is within the same shadow stack. This is distinctively different from other pointers on the shadow stack, since those pointers point to executable code area. Introduce token setup and verify routines. Also introduce WRUSS, which is a kernel-mode instruction but writes directly to user shadow stack. In future patches that enable shadow stack to work with signals, the kernel will need something to denote the point in the stack where sigreturn may be called. This will prevent attackers calling sigreturn at arbitrary places in the stack, in order to help prevent SROP attacks. To do this, something that can only be written by the kernel needs to be placed on the shadow stack. This can be accomplished by setting bit 63 in the frame written to the shadow stack. Userspace return addresses can't have this bit set as it is in the kernel range. It also can't be a valid restore token. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-31-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/special_insns.h | 13 +++++++ arch/x86/kernel/shstk.c | 75 ++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index de48d1389936..d6cd9344f6c7 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -202,6 +202,19 @@ static inline void clwb(volatile void *__p) : [pax] "a" (p)); } +#ifdef CONFIG_X86_USER_SHADOW_STACK +static inline int write_user_shstk_64(u64 __user *addr, u64 val) +{ + asm_volatile_goto("1: wrussq %[val], (%[addr])\n" + _ASM_EXTABLE(1b, %l[fail]) + :: [addr] "r" (addr), [val] "r" (val) + :: fail); + return 0; +fail: + return -EFAULT; +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ + #define nop() asm volatile ("nop") static inline void serialize(void) diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index bd9cdc3a7338..e22928c63ffc 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -25,6 +25,8 @@ #include #include +#define SS_FRAME_SIZE 8 + static bool features_enabled(unsigned long features) { return current->thread.features & features; @@ -40,6 +42,35 @@ static void features_clr(unsigned long features) current->thread.features &= ~features; } +/* + * Create a restore token on the shadow stack. A token is always 8-byte + * and aligned to 8. + */ +static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) +{ + unsigned long addr; + + /* Token must be aligned */ + if (!IS_ALIGNED(ssp, 8)) + return -EINVAL; + + addr = ssp - SS_FRAME_SIZE; + + /* + * SSP is aligned, so reserved bits and mode bit are a zero, just mark + * the token 64-bit. + */ + ssp |= BIT(0); + + if (write_user_shstk_64((u64 __user *)addr, (u64)ssp)) + return -EFAULT; + + if (token_addr) + *token_addr = addr; + + return 0; +} + static unsigned long alloc_shstk(unsigned long size) { int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G; @@ -157,6 +188,50 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long cl return addr + size; } +static unsigned long get_user_shstk_addr(void) +{ + unsigned long long ssp; + + fpregs_lock_and_load(); + + rdmsrl(MSR_IA32_PL3_SSP, ssp); + + fpregs_unlock(); + + return ssp; +} + +#define SHSTK_DATA_BIT BIT(63) + +static int put_shstk_data(u64 __user *addr, u64 data) +{ + if (WARN_ON_ONCE(data & SHSTK_DATA_BIT)) + return -EINVAL; + + /* + * Mark the high bit so that the sigframe can't be processed as a + * return address. + */ + if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT)) + return -EFAULT; + return 0; +} + +static int get_shstk_data(unsigned long *data, unsigned long __user *addr) +{ + unsigned long ldata; + + if (unlikely(get_user(ldata, addr))) + return -EFAULT; + + if (!(ldata & SHSTK_DATA_BIT)) + return -EINVAL; + + *data = ldata & ~SHSTK_DATA_BIT; + + return 0; +} + void shstk_free(struct task_struct *tsk) { struct thread_shstk *shstk = &tsk->thread.shstk; -- cgit v1.2.3 From 05e36022c0543ba55a3de55af455b00cb3eb4fcc Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:57 -0700 Subject: x86/shstk: Handle signals for shadow stack When a signal is handled, the context is pushed to the stack before handling it. For shadow stacks, since the shadow stack only tracks return addresses, there isn't any state that needs to be pushed. However, there are still a few things that need to be done. These things are visible to userspace and which will be kernel ABI for shadow stacks. One is to make sure the restorer address is written to shadow stack, since the signal handler (if not changing ucontext) returns to the restorer, and the restorer calls sigreturn. So add the restorer on the shadow stack before handling the signal, so there is not a conflict when the signal handler returns to the restorer. The other thing to do is to place some type of checkable token on the thread's shadow stack before handling the signal and check it during sigreturn. This is an extra layer of protection to hamper attackers calling sigreturn manually as in SROP-like attacks. For this token the shadow stack data format defined earlier can be used. Have the data pushed be the previous SSP. In the future the sigreturn might want to return back to a different stack. Storing the SSP (instead of a restore offset or something) allows for future functionality that may want to restore to a different stack. So, when handling a signal push - the SSP pointing in the shadow stack data format - the restorer address below the restore token. In sigreturn, verify SSP is stored in the data format and pop the shadow stack. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-32-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/shstk.h | 5 +++ arch/x86/kernel/shstk.c | 95 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/signal.c | 1 + arch/x86/kernel/signal_64.c | 6 +++ 4 files changed, 107 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h index d4a5c7b10cb5..ecb23a8ca47d 100644 --- a/arch/x86/include/asm/shstk.h +++ b/arch/x86/include/asm/shstk.h @@ -6,6 +6,7 @@ #include struct task_struct; +struct ksignal; #ifdef CONFIG_X86_USER_SHADOW_STACK struct thread_shstk { @@ -18,6 +19,8 @@ void reset_thread_features(void); unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags, unsigned long stack_size); void shstk_free(struct task_struct *p); +int setup_signal_shadow_stack(struct ksignal *ksig); +int restore_signal_shadow_stack(void); #else static inline long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { return -EINVAL; } @@ -26,6 +29,8 @@ static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags, unsigned long stack_size) { return 0; } static inline void shstk_free(struct task_struct *p) {} +static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; } +static inline int restore_signal_shadow_stack(void) { return 0; } #endif /* CONFIG_X86_USER_SHADOW_STACK */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index e22928c63ffc..f02e8ea4f1b5 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -232,6 +232,101 @@ static int get_shstk_data(unsigned long *data, unsigned long __user *addr) return 0; } +static int shstk_push_sigframe(unsigned long *ssp) +{ + unsigned long target_ssp = *ssp; + + /* Token must be aligned */ + if (!IS_ALIGNED(target_ssp, 8)) + return -EINVAL; + + *ssp -= SS_FRAME_SIZE; + if (put_shstk_data((void *__user)*ssp, target_ssp)) + return -EFAULT; + + return 0; +} + +static int shstk_pop_sigframe(unsigned long *ssp) +{ + unsigned long token_addr; + int err; + + err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp); + if (unlikely(err)) + return err; + + /* Restore SSP aligned? */ + if (unlikely(!IS_ALIGNED(token_addr, 8))) + return -EINVAL; + + /* SSP in userspace? */ + if (unlikely(token_addr >= TASK_SIZE_MAX)) + return -EINVAL; + + *ssp = token_addr; + + return 0; +} + +int setup_signal_shadow_stack(struct ksignal *ksig) +{ + void __user *restorer = ksig->ka.sa.sa_restorer; + unsigned long ssp; + int err; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + if (!restorer) + return -EINVAL; + + ssp = get_user_shstk_addr(); + if (unlikely(!ssp)) + return -EINVAL; + + err = shstk_push_sigframe(&ssp); + if (unlikely(err)) + return err; + + /* Push restorer address */ + ssp -= SS_FRAME_SIZE; + err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer); + if (unlikely(err)) + return -EFAULT; + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return 0; +} + +int restore_signal_shadow_stack(void) +{ + unsigned long ssp; + int err; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + ssp = get_user_shstk_addr(); + if (unlikely(!ssp)) + return -EINVAL; + + err = shstk_pop_sigframe(&ssp); + if (unlikely(err)) + return err; + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return 0; +} + void shstk_free(struct task_struct *tsk) { struct thread_shstk *shstk = &tsk->thread.shstk; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cfeec3ee877e..65fe2094da59 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -40,6 +40,7 @@ #include #include #include +#include static inline int is_ia32_compat_frame(struct ksignal *ksig) { diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 0e808c72bf7e..cacf2ede6217 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -175,6 +175,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); uc_flags = frame_uc_flags(regs); + if (setup_signal_shadow_stack(ksig)) + return -EFAULT; + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -260,6 +263,9 @@ SYSCALL_DEFINE0(rt_sigreturn) if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; + if (restore_signal_shadow_stack()) + goto badframe; + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; -- cgit v1.2.3 From b93d6c78829ad1e22486ccc93d04bf77e45597df Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:58 -0700 Subject: x86/shstk: Check that SSP is aligned on sigreturn The shadow stack signal frame is read by the kernel on sigreturn. It relies on shadow stack memory protections to prevent forgeries of this signal frame (which included the pre-signal SSP). It also relies on the shadow stack signal frame to have bit 63 set. Since this bit would not be set via typical shadow stack operations, so the kernel can assume it was a value it placed there. However, in order to support 32 bit shadow stack, the INCSSPD instruction can increment the shadow stack by 4 bytes. In this case SSP might be pointing to a region spanning two 8 byte shadow stack frames. It could confuse the checks described above. Since the kernel only supports shadow stack in 64 bit, just check that the SSP is 8 byte aligned in the sigreturn path. Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20230613001108.3040476-33-rick.p.edgecombe%40intel.com --- arch/x86/kernel/shstk.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index f02e8ea4f1b5..a8705f7d966c 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -252,6 +252,9 @@ static int shstk_pop_sigframe(unsigned long *ssp) unsigned long token_addr; int err; + if (!IS_ALIGNED(*ssp, 8)) + return -EINVAL; + err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp); if (unlikely(err)) return err; -- cgit v1.2.3 From 7fad2a432cd35bbf104d2d9d426e74902f22aa95 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:59 -0700 Subject: x86/shstk: Check that signal frame is shadow stack mem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shadow stack signal frame is read by the kernel on sigreturn. It relies on shadow stack memory protections to prevent forgeries of this signal frame (which included the pre-signal SSP). This behavior helps userspace protect itself. However, using the INCSSP instruction userspace can adjust the SSP to 8 bytes beyond the end of a shadow stack. INCSSP performs shadow stack reads to make sure it doesn’t increment off of the shadow stack, but on the end position it actually reads 8 bytes below the new SSP. For the shadow stack HW operations, this situation (INCSSP off the end of a shadow stack by 8 bytes) would be fine. If the a RET is executed, the push to the shadow stack would fail to write to the shadow stack. If a CALL is executed, the SSP will be incremented back onto the stack and the return address will be written successfully to the very end. That is expected behavior around shadow stack underflow. However, the kernel doesn’t have a way to read shadow stack memory using shadow stack accesses. WRUSS can write to shadow stack memory with a shadow stack access which ensures the access is to shadow stack memory. But unfortunately for this case, there is no equivalent instruction for shadow stack reads. So when reading the shadow stack signal frames, the kernel currently assumes the SSP is pointing to the shadow stack and uses a normal read. The SSP pointing to shadow stack memory will be true in most cases, but as described above, in can be untrue by 8 bytes. So lookup the VMA of the shadow stack sigframe being read to verify it is shadow stack. Since the SSP can only be beyond the shadow stack by 8 bytes, and shadow stack memory is page aligned, this check only needs to be done when this type of relative position to a page boundary is encountered. So skip the extra work otherwise. Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20230613001108.3040476-34-rick.p.edgecombe%40intel.com --- arch/x86/kernel/shstk.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index a8705f7d966c..50733a510446 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -249,15 +249,38 @@ static int shstk_push_sigframe(unsigned long *ssp) static int shstk_pop_sigframe(unsigned long *ssp) { + struct vm_area_struct *vma; unsigned long token_addr; - int err; + bool need_to_check_vma; + int err = 1; + /* + * It is possible for the SSP to be off the end of a shadow stack by 4 + * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes + * before it, it might be this case, so check that the address being + * read is actually shadow stack. + */ if (!IS_ALIGNED(*ssp, 8)) return -EINVAL; + need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp; + + if (need_to_check_vma) + mmap_read_lock_killable(current->mm); + err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp); if (unlikely(err)) - return err; + goto out_err; + + if (need_to_check_vma) { + vma = find_vma(current->mm, *ssp); + if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) { + err = -EFAULT; + goto out_err; + } + + mmap_read_unlock(current->mm); + } /* Restore SSP aligned? */ if (unlikely(!IS_ALIGNED(token_addr, 8))) @@ -270,6 +293,10 @@ static int shstk_pop_sigframe(unsigned long *ssp) *ssp = token_addr; return 0; +out_err: + if (need_to_check_vma) + mmap_read_unlock(current->mm); + return err; } int setup_signal_shadow_stack(struct ksignal *ksig) -- cgit v1.2.3 From c35559f94ebc3e3bc82e56e07161bb5986cd9761 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:11:00 -0700 Subject: x86/shstk: Introduce map_shadow_stack syscall When operating with shadow stacks enabled, the kernel will automatically allocate shadow stacks for new threads, however in some cases userspace will need additional shadow stacks. The main example of this is the ucontext family of functions, which require userspace allocating and pivoting to userspace managed stacks. Unlike most other user memory permissions, shadow stacks need to be provisioned with special data in order to be useful. They need to be setup with a restore token so that userspace can pivot to them via the RSTORSSP instruction. But, the security design of shadow stacks is that they should not be written to except in limited circumstances. This presents a problem for userspace, as to how userspace can provision this special data, without allowing for the shadow stack to be generally writable. Previously, a new PROT_SHADOW_STACK was attempted, which could be mprotect()ed from RW permissions after the data was provisioned. This was found to not be secure enough, as other threads could write to the shadow stack during the writable window. The kernel can use a special instruction, WRUSS, to write directly to userspace shadow stacks. So the solution can be that memory can be mapped as shadow stack permissions from the beginning (never generally writable in userspace), and the kernel itself can write the restore token. First, a new madvise() flag was explored, which could operate on the PROT_SHADOW_STACK memory. This had a couple of downsides: 1. Extra checks were needed in mprotect() to prevent writable memory from ever becoming PROT_SHADOW_STACK. 2. Extra checks/vma state were needed in the new madvise() to prevent restore tokens being written into the middle of pre-used shadow stacks. It is ideal to prevent restore tokens being added at arbitrary locations, so the check was to make sure the shadow stack had never been written to. 3. It stood out from the rest of the madvise flags, as more of direct action than a hint at future desired behavior. So rather than repurpose two existing syscalls (mmap, madvise) that don't quite fit, just implement a new map_shadow_stack syscall to allow userspace to map and setup new shadow stacks in one step. While ucontext is the primary motivator, userspace may have other unforeseen reasons to setup its own shadow stacks using the WRSS instruction. Towards this provide a flag so that stacks can be optionally setup securely for the common case of ucontext without enabling WRSS. Or potentially have the kernel set up the shadow stack in some new way. The following example demonstrates how to create a new shadow stack with map_shadow_stack: void *shstk = map_shadow_stack(addr, stack_size, SHADOW_STACK_SET_TOKEN); Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-35-rick.p.edgecombe%40intel.com --- arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/x86/include/uapi/asm/mman.h | 3 ++ arch/x86/kernel/shstk.c | 59 +++++++++++++++++++++++++++++----- include/linux/syscalls.h | 1 + kernel/sys_ni.c | 1 + 5 files changed, 57 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 227538b0ce80..38db4b1c291a 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -373,6 +373,7 @@ 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat +452 64 map_shadow_stack sys_map_shadow_stack # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 5a0256e73f1e..8148bdddbd2c 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -13,6 +13,9 @@ ((key) & 0x8 ? VM_PKEY_BIT3 : 0)) #endif +/* Flags for map_shadow_stack(2) */ +#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ + #include #endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 50733a510446..04c37b33a625 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -71,19 +72,31 @@ static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) return 0; } -static unsigned long alloc_shstk(unsigned long size) +static unsigned long alloc_shstk(unsigned long addr, unsigned long size, + unsigned long token_offset, bool set_res_tok) { int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G; struct mm_struct *mm = current->mm; - unsigned long addr, unused; + unsigned long mapped_addr, unused; - mmap_write_lock(mm); - addr = do_mmap(NULL, 0, size, PROT_READ, flags, - VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); + if (addr) + flags |= MAP_FIXED_NOREPLACE; + mmap_write_lock(mm); + mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, + VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); mmap_write_unlock(mm); - return addr; + if (!set_res_tok || IS_ERR_VALUE(mapped_addr)) + goto out; + + if (create_rstor_token(mapped_addr + token_offset, NULL)) { + vm_munmap(mapped_addr, size); + return -EINVAL; + } + +out: + return mapped_addr; } static unsigned long adjust_shstk_size(unsigned long size) @@ -134,7 +147,7 @@ static int shstk_setup(void) return -EOPNOTSUPP; size = adjust_shstk_size(0); - addr = alloc_shstk(size); + addr = alloc_shstk(0, size, 0, false); if (IS_ERR_VALUE(addr)) return PTR_ERR((void *)addr); @@ -178,7 +191,7 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long cl return 0; size = adjust_shstk_size(stack_size); - addr = alloc_shstk(size); + addr = alloc_shstk(0, size, 0, false); if (IS_ERR_VALUE(addr)) return addr; @@ -398,6 +411,36 @@ static int shstk_disable(void) return 0; } +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) +{ + bool set_tok = flags & SHADOW_STACK_SET_TOKEN; + unsigned long aligned_size; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + if (flags & ~SHADOW_STACK_SET_TOKEN) + return -EINVAL; + + /* If there isn't space for a token */ + if (set_tok && size < 8) + return -ENOSPC; + + if (addr && addr < SZ_4G) + return -ERANGE; + + /* + * An overflow would result in attempting to write the restore token + * to the wrong location. Not catastrophic, but just return the right + * error code and block it. + */ + aligned_size = PAGE_ALIGN(size); + if (aligned_size < size) + return -EOVERFLOW; + + return alloc_shstk(addr, aligned_size, size, set_tok); +} + long shstk_prctl(struct task_struct *task, int option, unsigned long features) { if (option == ARCH_SHSTK_LOCK) { diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 03e3d0121d5e..7f6dc0988197 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -953,6 +953,7 @@ asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long l asmlinkage long sys_cachestat(unsigned int fd, struct cachestat_range __user *cstat_range, struct cachestat __user *cstat, unsigned int flags); +asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags); /* * Architecture-specific system calls diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 781de7cc6a4e..e137c1385c56 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -274,6 +274,7 @@ COND_SYSCALL(vm86old); COND_SYSCALL(modify_ldt); COND_SYSCALL(vm86); COND_SYSCALL(kexec_file_load); +COND_SYSCALL(map_shadow_stack); /* s390 */ COND_SYSCALL(s390_pci_mmio_read); -- cgit v1.2.3 From 1d62c65372ab08599e4cf24af83d004434087ada Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:11:01 -0700 Subject: x86/shstk: Support WRSS for userspace For the current shadow stack implementation, shadow stacks contents can't easily be provisioned with arbitrary data. This property helps apps protect themselves better, but also restricts any potential apps that may want to do exotic things at the expense of a little security. The x86 shadow stack feature introduces a new instruction, WRSS, which can be enabled to write directly to shadow stack memory from userspace. Allow it to get enabled via the prctl interface. Only enable the userspace WRSS instruction, which allows writes to userspace shadow stacks from userspace. Do not allow it to be enabled independently of shadow stack, as HW does not support using WRSS when shadow stack is disabled. >From a fault handler perspective, WRSS will behave very similar to WRUSS, which is treated like a user access from a #PF err code perspective. Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-36-rick.p.edgecombe%40intel.com --- arch/x86/include/uapi/asm/prctl.h | 1 + arch/x86/kernel/shstk.c | 43 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 6a8e0e1bff4a..eedfde3b63be 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -36,5 +36,6 @@ /* ARCH_SHSTK_ features bits */ #define ARCH_SHSTK_SHSTK (1ULL << 0) +#define ARCH_SHSTK_WRSS (1ULL << 1) #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 04c37b33a625..ea0bf113f9cf 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -390,6 +390,47 @@ void shstk_free(struct task_struct *tsk) unmap_shadow_stack(shstk->base, shstk->size); } +static int wrss_control(bool enable) +{ + u64 msrval; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + /* + * Only enable WRSS if shadow stack is enabled. If shadow stack is not + * enabled, WRSS will already be disabled, so don't bother clearing it + * when disabling. + */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -EPERM; + + /* Already enabled/disabled? */ + if (features_enabled(ARCH_SHSTK_WRSS) == enable) + return 0; + + fpregs_lock_and_load(); + rdmsrl(MSR_IA32_U_CET, msrval); + + if (enable) { + features_set(ARCH_SHSTK_WRSS); + msrval |= CET_WRSS_EN; + } else { + features_clr(ARCH_SHSTK_WRSS); + if (!(msrval & CET_WRSS_EN)) + goto unlock; + + msrval &= ~CET_WRSS_EN; + } + + wrmsrl(MSR_IA32_U_CET, msrval); + +unlock: + fpregs_unlock(); + + return 0; +} + static int shstk_disable(void) { if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) @@ -406,7 +447,7 @@ static int shstk_disable(void) fpregs_unlock(); shstk_free(current); - features_clr(ARCH_SHSTK_SHSTK); + features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS); return 0; } -- cgit v1.2.3 From 0ee44885fe9cf19eb3870947c8f3c275017e48a7 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:11:02 -0700 Subject: x86: Expose thread features in /proc/$PID/status Applications and loaders can have logic to decide whether to enable shadow stack. They usually don't report whether shadow stack has been enabled or not, so there is no way to verify whether an application actually is protected by shadow stack. Add two lines in /proc/$PID/status to report enabled and locked features. Since, this involves referring to arch specific defines in asm/prctl.h, implement an arch breakout to emit the feature lines. [Switched to CET, added to commit log] Co-developed-by: Kirill A. Shutemov Signed-off-by: Kirill A. Shutemov Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-37-rick.p.edgecombe%40intel.com --- arch/x86/kernel/cpu/proc.c | 23 +++++++++++++++++++++++ fs/proc/array.c | 6 ++++++ include/linux/proc_fs.h | 1 + 3 files changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 099b6f0d96bd..31c0e68f6227 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include "cpu.h" @@ -175,3 +177,24 @@ const struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; + +#ifdef CONFIG_X86_USER_SHADOW_STACK +static void dump_x86_features(struct seq_file *m, unsigned long features) +{ + if (features & ARCH_SHSTK_SHSTK) + seq_puts(m, "shstk "); + if (features & ARCH_SHSTK_WRSS) + seq_puts(m, "wrss "); +} + +void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task) +{ + seq_puts(m, "x86_Thread_features:\t"); + dump_x86_features(m, task->thread.features); + seq_putc(m, '\n'); + + seq_puts(m, "x86_Thread_features_locked:\t"); + dump_x86_features(m, task->thread.features_locked); + seq_putc(m, '\n'); +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ diff --git a/fs/proc/array.c b/fs/proc/array.c index d35bbf35a874..2c2efbe685d8 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -431,6 +431,11 @@ static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm) seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm)); } +__weak void arch_proc_pid_thread_features(struct seq_file *m, + struct task_struct *task) +{ +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -455,6 +460,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); + arch_proc_pid_thread_features(m, task); return 0; } diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 253f2676d93a..de407e7c3b55 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -159,6 +159,7 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, #endif /* CONFIG_PROC_PID_ARCH_STATUS */ void arch_report_meminfo(struct seq_file *m); +void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task); #else /* CONFIG_PROC_FS */ -- cgit v1.2.3 From 488af8ea7131185c1adcbb0b52da2b6800429ecb Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:11:03 -0700 Subject: x86/shstk: Wire in shadow stack interface The kernel now has the main shadow stack functionality to support applications. Wire in the WRSS and shadow stack enable/disable functions into the existing shadow stack API skeleton. Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-38-rick.p.edgecombe%40intel.com --- arch/x86/kernel/shstk.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index ea0bf113f9cf..d723cdc93474 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -502,9 +502,17 @@ long shstk_prctl(struct task_struct *task, int option, unsigned long features) return -EINVAL; if (option == ARCH_SHSTK_DISABLE) { + if (features & ARCH_SHSTK_WRSS) + return wrss_control(false); + if (features & ARCH_SHSTK_SHSTK) + return shstk_disable(); return -EINVAL; } /* Handle ARCH_SHSTK_ENABLE */ + if (features & ARCH_SHSTK_SHSTK) + return shstk_setup(); + if (features & ARCH_SHSTK_WRSS) + return wrss_control(true); return -EINVAL; } -- cgit v1.2.3 From 0dc2a76092d9d513f5ea5eb3992d1459a67a3e7a Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:11:04 -0700 Subject: x86/cpufeatures: Enable CET CR4 bit for shadow stack Setting CR4.CET is a prerequisite for utilizing any CET features, most of which also require setting MSRs. Kernel IBT already enables the CET CR4 bit when it detects IBT HW support and is configured with kernel IBT. However, future patches that enable userspace shadow stack support will need the bit set as well. So change the logic to enable it in either case. Clear MSR_IA32_U_CET in cet_disable() so that it can't live to see userspace in a new kexec-ed kernel that has CR4.CET set from kernel IBT. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-39-rick.p.edgecombe%40intel.com --- arch/x86/kernel/cpu/common.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 52683fddafaf..cf5275a67f39 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -588,27 +588,43 @@ __noendbr void ibt_restore(u64 save) static __always_inline void setup_cet(struct cpuinfo_x86 *c) { - u64 msr = CET_ENDBR_EN; + bool user_shstk, kernel_ibt; - if (!HAS_KERNEL_IBT || - !cpu_feature_enabled(X86_FEATURE_IBT)) + if (!IS_ENABLED(CONFIG_X86_CET)) return; - wrmsrl(MSR_IA32_S_CET, msr); + kernel_ibt = HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT); + user_shstk = cpu_feature_enabled(X86_FEATURE_SHSTK) && + IS_ENABLED(CONFIG_X86_USER_SHADOW_STACK); + + if (!kernel_ibt && !user_shstk) + return; + + if (user_shstk) + set_cpu_cap(c, X86_FEATURE_USER_SHSTK); + + if (kernel_ibt) + wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN); + else + wrmsrl(MSR_IA32_S_CET, 0); + cr4_set_bits(X86_CR4_CET); - if (!ibt_selftest()) { + if (kernel_ibt && !ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); wrmsrl(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); - return; } } __noendbr void cet_disable(void) { - if (cpu_feature_enabled(X86_FEATURE_IBT)) - wrmsrl(MSR_IA32_S_CET, 0); + if (!(cpu_feature_enabled(X86_FEATURE_IBT) || + cpu_feature_enabled(X86_FEATURE_SHSTK))) + return; + + wrmsrl(MSR_IA32_S_CET, 0); + wrmsrl(MSR_IA32_U_CET, 0); } /* @@ -1470,6 +1486,9 @@ static void __init cpu_parse_early_param(void) if (cmdline_find_option_bool(boot_command_line, "noxsaves")) setup_clear_cpu_cap(X86_FEATURE_XSAVES); + if (cmdline_find_option_bool(boot_command_line, "nousershstk")) + setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); if (arglen <= 0) return; -- cgit v1.2.3 From 2fab02b25ae7cf5f714ab456b03d9a3fe5ae98c9 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:11:06 -0700 Subject: x86: Add PTRACE interface for shadow stack Some applications (like GDB) would like to tweak shadow stack state via ptrace. This allows for existing functionality to continue to work for seized shadow stack applications. Provide a regset interface for manipulating the shadow stack pointer (SSP). There is already ptrace functionality for accessing xstate, but this does not include supervisor xfeatures. So there is not a completely clear place for where to put the shadow stack state. Adding it to the user xfeatures regset would complicate that code, as it currently shares logic with signals which should not have supervisor features. Don't add a general supervisor xfeature regset like the user one, because it is better to maintain flexibility for other supervisor xfeatures to define their own interface. For example, an xfeature may decide not to expose all of it's state to userspace, as is actually the case for shadow stack ptrace functionality. A lot of enum values remain to be used, so just put it in dedicated shadow stack regset. The only downside to not having a generic supervisor xfeature regset, is that apps need to be enlightened of any new supervisor xfeature exposed this way (i.e. they can't try to have generic save/restore logic). But maybe that is a good thing, because they have to think through each new xfeature instead of encountering issues when a new supervisor xfeature was added. By adding a shadow stack regset, it also has the effect of including the shadow stack state in a core dump, which could be useful for debugging. The shadow stack specific xstate includes the SSP, and the shadow stack and WRSS enablement status. Enabling shadow stack or WRSS in the kernel involves more than just flipping the bit. The kernel is made aware that it has to do extra things when cloning or handling signals. That logic is triggered off of separate feature enablement state kept in the task struct. So the flipping on HW shadow stack enforcement without notifying the kernel to change its behavior would severely limit what an application could do without crashing, and the results would depend on kernel internal implementation details. There is also no known use for controlling this state via ptrace today. So only expose the SSP, which is something that userspace already has indirect control over. Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-41-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/fpu/regset.h | 7 ++-- arch/x86/kernel/fpu/regset.c | 81 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ptrace.c | 12 ++++++ include/uapi/linux/elf.h | 2 + 4 files changed, 99 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h index 4f928d6a367b..697b77e96025 100644 --- a/arch/x86/include/asm/fpu/regset.h +++ b/arch/x86/include/asm/fpu/regset.h @@ -7,11 +7,12 @@ #include -extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; +extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active, + ssp_active; extern user_regset_get2_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; + xstateregs_get, ssp_get; extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, - xstateregs_set; + xstateregs_set, ssp_set; /* * xstateregs_active == regset_fpregs_active. Please refer to the comment diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 6d056b68f4ed..6bc1eb2a21bd 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "context.h" #include "internal.h" @@ -174,6 +175,86 @@ out: return ret; } +#ifdef CONFIG_X86_USER_SHADOW_STACK +int ssp_active(struct task_struct *target, const struct user_regset *regset) +{ + if (target->thread.features & ARCH_SHSTK_SHSTK) + return regset->n; + + return 0; +} + +int ssp_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + struct fpu *fpu = &target->thread.fpu; + struct cet_user_state *cetregs; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -ENODEV; + + sync_fpstate(fpu); + cetregs = get_xsave_addr(&fpu->fpstate->regs.xsave, XFEATURE_CET_USER); + if (WARN_ON(!cetregs)) { + /* + * This shouldn't ever be NULL because shadow stack was + * verified to be enabled above. This means + * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so + * XFEATURE_CET_USER should not be in the init state. + */ + return -ENODEV; + } + + return membuf_write(&to, (unsigned long *)&cetregs->user_ssp, + sizeof(cetregs->user_ssp)); +} + +int ssp_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xregs_state *xsave = &fpu->fpstate->regs.xsave; + struct cet_user_state *cetregs; + unsigned long user_ssp; + int r; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !ssp_active(target, regset)) + return -ENODEV; + + if (pos != 0 || count != sizeof(user_ssp)) + return -EINVAL; + + r = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_ssp, 0, -1); + if (r) + return r; + + /* + * Some kernel instructions (IRET, etc) can cause exceptions in the case + * of disallowed CET register values. Just prevent invalid values. + */ + if (user_ssp >= TASK_SIZE_MAX || !IS_ALIGNED(user_ssp, 8)) + return -EINVAL; + + fpu_force_restore(fpu); + + cetregs = get_xsave_addr(xsave, XFEATURE_CET_USER); + if (WARN_ON(!cetregs)) { + /* + * This shouldn't ever be NULL because shadow stack was + * verified to be enabled above. This means + * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so + * XFEATURE_CET_USER should not be in the init state. + */ + return -ENODEV; + } + + cetregs->user_ssp = user_ssp; + return 0; +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ + #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index dfaa270a7cc9..095f04bdabdc 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -58,6 +58,7 @@ enum x86_regset_64 { REGSET64_FP, REGSET64_IOPERM, REGSET64_XSTATE, + REGSET64_SSP, }; #define REGSET_GENERAL \ @@ -1267,6 +1268,17 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { .active = ioperm_active, .regset_get = ioperm_get }, +#ifdef CONFIG_X86_USER_SHADOW_STACK + [REGSET64_SSP] = { + .core_note_type = NT_X86_SHSTK, + .n = 1, + .size = sizeof(u64), + .align = sizeof(u64), + .active = ssp_active, + .regset_get = ssp_get, + .set = ssp_set + }, +#endif }; static const struct user_regset_view user_x86_64_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 0c8cf359ea5b..786d51727e74 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -409,6 +409,8 @@ typedef struct elf64_shdr { #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ #define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +/* Old binutils treats 0x203 as a CET state */ +#define NT_X86_SHSTK 0x204 /* x86 SHSTK state */ #define NT_S390_HIGH_GPRS 0x300 /* s390 upper register halves */ #define NT_S390_TIMER 0x301 /* s390 timer register */ #define NT_S390_TODCMP 0x302 /* s390 TOD clock comparator register */ -- cgit v1.2.3 From 680ed2f15e70d079c8148589a3ce9426fc0ff914 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 12 Jun 2023 17:11:07 -0700 Subject: x86/shstk: Add ARCH_SHSTK_UNLOCK Userspace loaders may lock features before a CRIU restore operation has the chance to set them to whatever state is required by the process being restored. Allow a way for CRIU to unlock features. Add it as an arch_prctl() like the other shadow stack operations, but restrict it being called by the ptrace arch_pctl() interface. [Merged into recent API changes, added commit log and docs] Signed-off-by: Mike Rapoport Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Reviewed-by: David Hildenbrand Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-42-rick.p.edgecombe%40intel.com --- Documentation/arch/x86/shstk.rst | 4 ++++ arch/x86/include/uapi/asm/prctl.h | 1 + arch/x86/kernel/process_64.c | 1 + arch/x86/kernel/shstk.c | 9 +++++++-- 4 files changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/arch/x86/shstk.rst b/Documentation/arch/x86/shstk.rst index f09afa504ec0..f3553cc8c758 100644 --- a/Documentation/arch/x86/shstk.rst +++ b/Documentation/arch/x86/shstk.rst @@ -75,6 +75,10 @@ arch_prctl(ARCH_SHSTK_LOCK, unsigned long features) are ignored. The mask is ORed with the existing value. So any feature bits set here cannot be enabled or disabled afterwards. +arch_prctl(ARCH_SHSTK_UNLOCK, unsigned long features) + Unlock features. 'features' is a mask of all features to unlock. All + bits set are processed, unset bits are ignored. Only works via ptrace. + The return values are as follows. On success, return 0. On error, errno can be:: diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index eedfde3b63be..3189c4a96468 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -33,6 +33,7 @@ #define ARCH_SHSTK_ENABLE 0x5001 #define ARCH_SHSTK_DISABLE 0x5002 #define ARCH_SHSTK_LOCK 0x5003 +#define ARCH_SHSTK_UNLOCK 0x5004 /* ARCH_SHSTK_ features bits */ #define ARCH_SHSTK_SHSTK (1ULL << 0) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0f89aa0186d1..e6db21c470aa 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -899,6 +899,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) case ARCH_SHSTK_ENABLE: case ARCH_SHSTK_DISABLE: case ARCH_SHSTK_LOCK: + case ARCH_SHSTK_UNLOCK: return shstk_prctl(task, option, arg2); default: ret = -EINVAL; diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index d723cdc93474..d43b7a9c57ce 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -489,9 +489,14 @@ long shstk_prctl(struct task_struct *task, int option, unsigned long features) return 0; } - /* Don't allow via ptrace */ - if (task != current) + /* Only allow via ptrace */ + if (task != current) { + if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) { + task->thread.features_locked &= ~features; + return 0; + } return -EINVAL; + } /* Do not allow to change locked features */ if (features & task->thread.features_locked) -- cgit v1.2.3 From 67840ad0fa14ad49a605074b12d5b0f3c3113ed1 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:11:08 -0700 Subject: x86/shstk: Add ARCH_SHSTK_STATUS CRIU and GDB need to get the current shadow stack and WRSS enablement status. This information is already available via /proc/pid/status, but this is inconvenient for CRIU because it involves parsing the text output in an area of the code where this is difficult. Provide a status arch_prctl(), ARCH_SHSTK_STATUS for retrieving the status. Have arg2 be a userspace address, and make the new arch_prctl simply copy the features out to userspace. Suggested-by: Mike Rapoport Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Kees Cook Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-43-rick.p.edgecombe%40intel.com --- Documentation/arch/x86/shstk.rst | 6 ++++++ arch/x86/include/asm/shstk.h | 2 +- arch/x86/include/uapi/asm/prctl.h | 1 + arch/x86/kernel/process_64.c | 1 + arch/x86/kernel/shstk.c | 8 +++++++- 5 files changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/arch/x86/shstk.rst b/Documentation/arch/x86/shstk.rst index f3553cc8c758..60260e809baf 100644 --- a/Documentation/arch/x86/shstk.rst +++ b/Documentation/arch/x86/shstk.rst @@ -79,6 +79,11 @@ arch_prctl(ARCH_SHSTK_UNLOCK, unsigned long features) Unlock features. 'features' is a mask of all features to unlock. All bits set are processed, unset bits are ignored. Only works via ptrace. +arch_prctl(ARCH_SHSTK_STATUS, unsigned long addr) + Copy the currently enabled features to the address passed in addr. The + features are described using the bits passed into the others in + 'features'. + The return values are as follows. On success, return 0. On error, errno can be:: @@ -86,6 +91,7 @@ be:: -ENOTSUPP if the feature is not supported by the hardware or kernel. -EINVAL arguments (non existing feature, etc) + -EFAULT if could not copy information back to userspace The feature's bits supported are:: diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h index ecb23a8ca47d..42fee8959df7 100644 --- a/arch/x86/include/asm/shstk.h +++ b/arch/x86/include/asm/shstk.h @@ -14,7 +14,7 @@ struct thread_shstk { u64 size; }; -long shstk_prctl(struct task_struct *task, int option, unsigned long features); +long shstk_prctl(struct task_struct *task, int option, unsigned long arg2); void reset_thread_features(void); unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags, unsigned long stack_size); diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 3189c4a96468..384e2cc6ac19 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -34,6 +34,7 @@ #define ARCH_SHSTK_DISABLE 0x5002 #define ARCH_SHSTK_LOCK 0x5003 #define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 /* ARCH_SHSTK_ features bits */ #define ARCH_SHSTK_SHSTK (1ULL << 0) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e6db21c470aa..33b268747bb7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -900,6 +900,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) case ARCH_SHSTK_DISABLE: case ARCH_SHSTK_LOCK: case ARCH_SHSTK_UNLOCK: + case ARCH_SHSTK_STATUS: return shstk_prctl(task, option, arg2); default: ret = -EINVAL; diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index d43b7a9c57ce..b26810c7cd1c 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -482,8 +482,14 @@ SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsi return alloc_shstk(addr, aligned_size, size, set_tok); } -long shstk_prctl(struct task_struct *task, int option, unsigned long features) +long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { + unsigned long features = arg2; + + if (option == ARCH_SHSTK_STATUS) { + return put_user(task->thread.features, (unsigned long __user *)arg2); + } + if (option == ARCH_SHSTK_LOCK) { task->thread.features_locked |= features; return 0; -- cgit v1.2.3 From 87f0df7828899c552bcdde639c045983d5aeeed9 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 6 Jul 2023 16:32:48 -0700 Subject: x86/shstk: Move arch detail comment out of core mm The comment around VM_SHADOW_STACK in mm.h refers to a lot of x86 specific details that don't belong in a cross arch file. Remove these out of core mm, and just leave the non-arch details. Since the comment includes some useful details that would be good to retain in the source somewhere, put the arch specifics parts in arch/x86/shstk.c near alloc_shstk(), where memory of this type is allocated. Include a reference to the existence of the x86 details near the VM_SHADOW_STACK definition mm.h. Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Mark Brown Link: https://lore.kernel.org/all/20230706233248.445713-1-rick.p.edgecombe%40intel.com --- arch/x86/kernel/shstk.c | 25 +++++++++++++++++++++++++ include/linux/mm.h | 32 ++++++-------------------------- 2 files changed, 31 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index b26810c7cd1c..47f5204b0fa9 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -72,6 +72,31 @@ static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) return 0; } +/* + * VM_SHADOW_STACK will have a guard page. This helps userspace protect + * itself from attacks. The reasoning is as follows: + * + * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The + * INCSSP instruction can increment the shadow stack pointer. It is the + * shadow stack analog of an instruction like: + * + * addq $0x80, %rsp + * + * However, there is one important difference between an ADD on %rsp + * and INCSSP. In addition to modifying SSP, INCSSP also reads from the + * memory of the first and last elements that were "popped". It can be + * thought of as acting like this: + * + * READ_ONCE(ssp); // read+discard top element on stack + * ssp += nr_to_pop * 8; // move the shadow stack + * READ_ONCE(ssp-8); // read+discard last popped stack element + * + * The maximum distance INCSSP can move the SSP is 2040 bytes, before + * it would read the memory. Therefore a single page gap will be enough + * to prevent any operation from shifting the SSP to an adjacent stack, + * since it would have to land in the gap at least once, causing a + * fault. + */ static unsigned long alloc_shstk(unsigned long addr, unsigned long size, unsigned long token_offset, bool set_res_tok) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 97eddc83d19c..8c0350c1134a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -343,33 +343,13 @@ extern unsigned int kobjsize(const void *objp); #ifdef CONFIG_X86_USER_SHADOW_STACK /* - * This flag should not be set with VM_SHARED because of lack of support - * core mm. It will also get a guard page. This helps userspace protect - * itself from attacks. The reasoning is as follows: + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. * - * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The - * INCSSP instruction can increment the shadow stack pointer. It is the - * shadow stack analog of an instruction like: - * - * addq $0x80, %rsp - * - * However, there is one important difference between an ADD on %rsp - * and INCSSP. In addition to modifying SSP, INCSSP also reads from the - * memory of the first and last elements that were "popped". It can be - * thought of as acting like this: - * - * READ_ONCE(ssp); // read+discard top element on stack - * ssp += nr_to_pop * 8; // move the shadow stack - * READ_ONCE(ssp-8); // read+discard last popped stack element - * - * The maximum distance INCSSP can move the SSP is 2040 bytes, before - * it would read the memory. Therefore a single page gap will be enough - * to prevent any operation from shifting the SSP to an adjacent stack, - * since it would have to land in the gap at least once, causing a - * fault. - * - * Prevent using INCSSP to move the SSP between shadow stacks by - * having a PAGE_SIZE guard gap. + * These VMAs will get a single end guard page. This helps userspace protect + * itself from attacks. A single page is enough for current shadow stack archs + * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c + * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 #else -- cgit v1.2.3 From 54acee601b87cfe43e17f6812449e5eb3eab39f9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Aug 2023 13:37:22 -0700 Subject: x86/kbuild: Fix Documentation/ reference x86 documentation moved. Fix the reference. Signed-off-by: Dave Hansen Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202307271956.IxoG9X0c-lkp@intel.com/ --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e860f805199f..99e7552fff4f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1969,7 +1969,7 @@ config X86_USER_SHADOW_STACK CPUs supporting shadow stacks were first released in 2020. - See Documentation/x86/shstk.rst for more information. + See Documentation/arch/x86/shstk.rst for more information. If unsure, say N. -- cgit v1.2.3 From c6b53dcec07c842af75123d9b29684bdbd36a407 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 6 Jul 2023 16:38:58 -0700 Subject: x86/shstk: Don't retry vm_munmap() on -EINTR The existing comment around handling vm_munmap() failure when freeing a shadow stack is wrong. It asserts that vm_munmap() returns -EINTR when the mmap lock is only being held for a short time, and so the caller should retry. Based on this wrong understanding, unmap_shadow_stack() will loop retrying vm_munmap(). What -EINTR actually means in this case is that the process is going away (see ae79878), and the whole MM will be torn down soon. In order to facilitate this, the task should not linger in the kernel, but actually do the opposite. So don't loop in this scenario, just abandon the operation and let exit_mmap() clean it up. Also, update the comment to reflect the actual meaning of the error code. Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20230706233858.446232-1-rick.p.edgecombe%40intel.com --- arch/x86/kernel/shstk.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 47f5204b0fa9..cd10d074a444 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -134,28 +134,24 @@ static unsigned long adjust_shstk_size(unsigned long size) static void unmap_shadow_stack(u64 base, u64 size) { - while (1) { - int r; - - r = vm_munmap(base, size); - - /* - * vm_munmap() returns -EINTR when mmap_lock is held by - * something else, and that lock should not be held for a - * long time. Retry it for the case. - */ - if (r == -EINTR) { - cond_resched(); - continue; - } + int r; - /* - * For all other types of vm_munmap() failure, either the - * system is out of memory or there is bug. - */ - WARN_ON_ONCE(r); - break; - } + r = vm_munmap(base, size); + + /* + * mmap_write_lock_killable() failed with -EINTR. This means + * the process is about to die and have it's MM cleaned up. + * This task shouldn't ever make it back to userspace. In this + * case it is ok to leak a shadow stack, so just exit out. + */ + if (r == -EINTR) + return; + + /* + * For all other types of vm_munmap() failure, either the + * system is out of memory or there is bug. + */ + WARN_ON_ONCE(r); } static int shstk_setup(void) -- cgit v1.2.3 From c6cfcbd8ca43766851a8c952e3b570727147020f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 7 Apr 2023 17:16:41 -0700 Subject: x86/ibt: Convert IBT selftest to asm The following warning is reported when frame pointers and kernel IBT are enabled: vmlinux.o: warning: objtool: ibt_selftest+0x11: sibling call from callable instruction with modified stack frame The problem is that objtool interprets the indirect branch in ibt_selftest() as a sibling call, and GCC inserts a (partial) frame pointer prologue before it: 0000 000000000003f550 : 0000 3f550: f3 0f 1e fa endbr64 0004 3f554: e8 00 00 00 00 call 3f559 3f555: R_X86_64_PLT32 __fentry__-0x4 0009 3f559: 55 push %rbp 000a 3f55a: 48 8d 05 02 00 00 00 lea 0x2(%rip),%rax # 3f563 0011 3f561: ff e0 jmp *%rax Note the inline asm is missing ASM_CALL_CONSTRAINT, so the 'push %rbp' happens before the indirect branch and the 'mov %rsp, %rbp' happens afterwards. Simplify the generated code and make it easier to understand for both tools and humans by moving the selftest to proper asm. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/99a7e16b97bda97bf0a04aa141d6241cd8a839a2.1680912949.git.jpoimboe@kernel.org --- arch/x86/include/asm/traps.h | 3 ++- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/cet.c | 23 +---------------------- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/ibt_selftest.S | 17 +++++++++++++++++ 5 files changed, 22 insertions(+), 24 deletions(-) create mode 100644 arch/x86/kernel/ibt_selftest.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 75e0dabf0c45..b1c9cea6ba88 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -18,7 +18,8 @@ void __init trap_init(void); asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs); #endif -extern bool ibt_selftest(void); +extern int ibt_selftest(void); +extern int ibt_selftest_noendbr(void); #ifdef CONFIG_X86_F00F_BUG /* For handling the FOOF bug */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 6b6bf47652ee..eca9784a46fe 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -49,6 +49,7 @@ obj-y += process_$(BITS).o signal.o signal_$(BITS).o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o +obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c index cc10d8be9d74..d2c732a34e5d 100644 --- a/arch/x86/kernel/cet.c +++ b/arch/x86/kernel/cet.c @@ -81,9 +81,6 @@ static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) static __ro_after_init bool ibt_fatal = true; -/* code label defined in asm below */ -extern void ibt_selftest_ip(void); - static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) { if ((error_code & CP_EC) != CP_ENDBR) { @@ -91,7 +88,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) return; } - if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) { + if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) { regs->ax = 0; return; } @@ -105,24 +102,6 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) BUG(); } -/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */ -noinline bool ibt_selftest(void) -{ - unsigned long ret; - - asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t" - ANNOTATE_RETPOLINE_SAFE - " jmp *%%rax\n\t" - "ibt_selftest_ip:\n\t" - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - " nop\n\t" - - : "=a" (ret) : : "memory"); - - return !ret; -} - static int __init ibt_setup(char *str) { if (!strcmp(str, "off")) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cf5275a67f39..3c6426ce3c73 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -610,7 +610,7 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c) cr4_set_bits(X86_CR4_CET); - if (kernel_ibt && !ibt_selftest()) { + if (kernel_ibt && ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); wrmsrl(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); diff --git a/arch/x86/kernel/ibt_selftest.S b/arch/x86/kernel/ibt_selftest.S new file mode 100644 index 000000000000..c43c4ed28a9c --- /dev/null +++ b/arch/x86/kernel/ibt_selftest.S @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include + +SYM_CODE_START(ibt_selftest_noendbr) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + /* #CP handler sets %ax to 0 */ + RET +SYM_CODE_END(ibt_selftest_noendbr) + +SYM_FUNC_START(ibt_selftest) + lea ibt_selftest_noendbr(%rip), %rax + ANNOTATE_RETPOLINE_SAFE + jmp *%rax +SYM_FUNC_END(ibt_selftest) -- cgit v1.2.3 From 1fe428d3692fb10a0e8d85fafe719b154e43ad4e Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 24 Aug 2023 18:45:54 -0700 Subject: x86/shstk: Change order of __user in type 0day reports a sparse warning: arch/x86/kernel/shstk.c:295:55: sparse: sparse: cast removes address space '__user' of expression The __user is in the wrong spot. Move it to right spot and make sparse happy. Closes: https://lore.kernel.org/oe-kbuild-all/202308222312.Jt4Tog5T-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20230825014554.1769194-1-rick.p.edgecombe%40intel.com --- arch/x86/kernel/shstk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index cd10d074a444..fd689921a1db 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -275,7 +275,7 @@ static int shstk_push_sigframe(unsigned long *ssp) return -EINVAL; *ssp -= SS_FRAME_SIZE; - if (put_shstk_data((void *__user)*ssp, target_ssp)) + if (put_shstk_data((void __user *)*ssp, target_ssp)) return -EFAULT; return 0; -- cgit v1.2.3