From 93d73005bff4f600696ce30e366e742c3373b13d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 16 Dec 2025 13:25:55 -0800 Subject: x86/entry/vdso: Rename vdso_image_* to vdso*_image The vdso .so files are named vdso*.so. These structures are binary images and descriptions of these files, so it is more consistent for them to have a naming that more directly mirrors the filenames. It is also very slightly more compact (by one character...) and simplifies the Makefile just a little bit. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Dave Hansen Link: https://patch.msgid.link/20251216212606.1325678-2-hpa@zytor.com --- arch/x86/include/asm/elf.h | 2 +- arch/x86/include/asm/vdso.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 6c8fdc96be7e..2ba5f166e58f 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -361,7 +361,7 @@ else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \ #define VDSO_ENTRY \ ((unsigned long)current->mm->context.vdso + \ - vdso_image_32.sym___kernel_vsyscall) + vdso32_image.sym___kernel_vsyscall) struct linux_binprm; diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index b7253ef3205a..e8afbe9faa5b 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -27,9 +27,9 @@ struct vdso_image { long sym_vdso32_rt_sigreturn_landing_pad; }; -extern const struct vdso_image vdso_image_64; -extern const struct vdso_image vdso_image_x32; -extern const struct vdso_image vdso_image_32; +extern const struct vdso_image vdso64_image; +extern const struct vdso_image vdsox32_image; +extern const struct vdso_image vdso32_image; extern int __init init_vdso_image(const struct vdso_image *image); -- cgit v1.2.3 From 884961618ee51307cc63ab620a0bdd710fa0b0af Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 16 Dec 2025 13:26:00 -0800 Subject: x86/entry/vdso32: Remove open-coded DWARF in sigreturn.S The vdso32 sigreturn.S contains open-coded DWARF bytecode, which includes a hack for gdb to not try to step back to a previous call instruction when backtracing from a signal handler. Neither of those are necessary anymore: the backtracing issue is handled by ".cfi_entry simple" and ".cfi_signal_frame", both of which have been supported for a very long time now, which allows the remaining frame to be built using regular .cfi annotations. Add a few more register offsets to the signal frame just for good measure. Replace the nop on fallthrough of the system call (which should never, ever happen) with a ud2a trap. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Dave Hansen Link: https://patch.msgid.link/20251216212606.1325678-7-hpa@zytor.com --- arch/x86/entry/vdso/vdso32/sigreturn.S | 146 ++++++++------------------------- arch/x86/include/asm/dwarf2.h | 1 + arch/x86/kernel/asm-offsets.c | 6 ++ 3 files changed, 39 insertions(+), 114 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S index 965900c6763b..25b0ac4b4bfe 100644 --- a/arch/x86/entry/vdso/vdso32/sigreturn.S +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -1,136 +1,54 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include #include +.macro STARTPROC_SIGNAL_FRAME sc + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + /* -4 as pretcode has already been popped */ + CFI_DEF_CFA esp, \sc - 4 + CFI_OFFSET eip, IA32_SIGCONTEXT_ip + CFI_OFFSET eax, IA32_SIGCONTEXT_ax + CFI_OFFSET ebx, IA32_SIGCONTEXT_bx + CFI_OFFSET ecx, IA32_SIGCONTEXT_cx + CFI_OFFSET edx, IA32_SIGCONTEXT_dx + CFI_OFFSET esp, IA32_SIGCONTEXT_sp + CFI_OFFSET ebp, IA32_SIGCONTEXT_bp + CFI_OFFSET esi, IA32_SIGCONTEXT_si + CFI_OFFSET edi, IA32_SIGCONTEXT_di + CFI_OFFSET es, IA32_SIGCONTEXT_es + CFI_OFFSET cs, IA32_SIGCONTEXT_cs + CFI_OFFSET ss, IA32_SIGCONTEXT_ss + CFI_OFFSET ds, IA32_SIGCONTEXT_ds + CFI_OFFSET eflags, IA32_SIGCONTEXT_flags +.endm + .text .globl __kernel_sigreturn .type __kernel_sigreturn,@function - nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */ ALIGN __kernel_sigreturn: -.LSTART_sigreturn: - popl %eax /* XXX does this mean it needs unwind info? */ + STARTPROC_SIGNAL_FRAME IA32_SIGFRAME_sigcontext + popl %eax + CFI_ADJUST_CFA_OFFSET -4 movl $__NR_sigreturn, %eax int $0x80 -.LEND_sigreturn: SYM_INNER_LABEL(vdso32_sigreturn_landing_pad, SYM_L_GLOBAL) - nop - .size __kernel_sigreturn,.-.LSTART_sigreturn + ud2a + CFI_ENDPROC + .size __kernel_sigreturn,.-__kernel_sigreturn .globl __kernel_rt_sigreturn .type __kernel_rt_sigreturn,@function ALIGN __kernel_rt_sigreturn: -.LSTART_rt_sigreturn: + STARTPROC_SIGNAL_FRAME IA32_RT_SIGFRAME_sigcontext movl $__NR_rt_sigreturn, %eax int $0x80 -.LEND_rt_sigreturn: SYM_INNER_LABEL(vdso32_rt_sigreturn_landing_pad, SYM_L_GLOBAL) - nop - .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI1: - .long .LENDCIEDLSI1-.LSTARTCIEDLSI1 -.LSTARTCIEDLSI1: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zRS" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0 /* DW_CFA_nop */ - .align 4 -.LENDCIEDLSI1: - .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */ -.LSTARTFDEDLSI1: - .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */ - /* HACK: The dwarf2 unwind routines will subtract 1 from the - return address to get an address in the middle of the - presumed call instruction. Since we didn't get here via - a call, we need to include the nop before the real start - to make up for it. */ - .long .LSTART_sigreturn-1-. /* PC-relative start address */ - .long .LEND_sigreturn-.LSTART_sigreturn+1 - .uleb128 0 /* Augmentation */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - complicated by the fact that the "CFA" is always assumed to - be the value of the stack pointer in the caller. This means - that we must define the CFA of this body of code to be the - saved value of the stack pointer in the sigcontext. Which - also means that there is no fixed relation to the other - saved registers, which means that we must use DW_CFA_expression - to compute their addresses. It also means that when we - adjust the stack with the popl, we have to do it all over again. */ - -#define do_cfa_expr(offset) \ - .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ - .byte 0x06; /* DW_OP_deref */ \ -1: - -#define do_expr(regno, offset) \ - .byte 0x10; /* DW_CFA_expression */ \ - .uleb128 regno; /* regno */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ -1: - - do_cfa_expr(IA32_SIGCONTEXT_sp+4) - do_expr(0, IA32_SIGCONTEXT_ax+4) - do_expr(1, IA32_SIGCONTEXT_cx+4) - do_expr(2, IA32_SIGCONTEXT_dx+4) - do_expr(3, IA32_SIGCONTEXT_bx+4) - do_expr(5, IA32_SIGCONTEXT_bp+4) - do_expr(6, IA32_SIGCONTEXT_si+4) - do_expr(7, IA32_SIGCONTEXT_di+4) - do_expr(8, IA32_SIGCONTEXT_ip+4) - - .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ - - do_cfa_expr(IA32_SIGCONTEXT_sp) - do_expr(0, IA32_SIGCONTEXT_ax) - do_expr(1, IA32_SIGCONTEXT_cx) - do_expr(2, IA32_SIGCONTEXT_dx) - do_expr(3, IA32_SIGCONTEXT_bx) - do_expr(5, IA32_SIGCONTEXT_bp) - do_expr(6, IA32_SIGCONTEXT_si) - do_expr(7, IA32_SIGCONTEXT_di) - do_expr(8, IA32_SIGCONTEXT_ip) - - .align 4 -.LENDFDEDLSI1: - - .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */ -.LSTARTFDEDLSI2: - .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */ - /* HACK: See above wrt unwind library assumptions. */ - .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ - .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 - .uleb128 0 /* Augmentation */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - slightly less complicated than the above, since we don't - modify the stack pointer in the process. */ - - do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp) - do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax) - do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx) - do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx) - do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx) - do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp) - do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si) - do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di) - do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip) - - .align 4 -.LENDFDEDLSI2: + ud2a + CFI_ENDPROC + .size __kernel_rt_sigreturn,.-__kernel_rt_sigreturn .previous diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 302e11b15da8..09c9684d3ad6 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -20,6 +20,7 @@ #define CFI_RESTORE_STATE .cfi_restore_state #define CFI_UNDEFINED .cfi_undefined #define CFI_ESCAPE .cfi_escape +#define CFI_SIGNAL_FRAME .cfi_signal_frame #ifndef BUILD_VDSO /* diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 25fcde525c68..081816888f7a 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -63,8 +63,14 @@ static void __used common(void) OFFSET(IA32_SIGCONTEXT_bp, sigcontext_32, bp); OFFSET(IA32_SIGCONTEXT_sp, sigcontext_32, sp); OFFSET(IA32_SIGCONTEXT_ip, sigcontext_32, ip); + OFFSET(IA32_SIGCONTEXT_es, sigcontext_32, es); + OFFSET(IA32_SIGCONTEXT_cs, sigcontext_32, cs); + OFFSET(IA32_SIGCONTEXT_ss, sigcontext_32, ss); + OFFSET(IA32_SIGCONTEXT_ds, sigcontext_32, ds); + OFFSET(IA32_SIGCONTEXT_flags, sigcontext_32, flags); BLANK(); + OFFSET(IA32_SIGFRAME_sigcontext, sigframe_ia32, sc); OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); #endif -- cgit v1.2.3 From a0636d4c3ad0da0cd6069eb6fef5d2b7d3449378 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 16 Dec 2025 13:26:02 -0800 Subject: x86/vdso: Abstract out vdso system call internals Abstract out the calling of true system calls from the vdso into macros. It has been a very long time since gcc did not allow %ebx or %ebp in inline asm in 32-bit PIC mode; remove the corresponding hacks. Remove the use of memory output constraints in gettimeofday.h in favor of "memory" clobbers. The resulting code is identical for the current use cases, as the system call is usually a terminal fallback anyway, and it merely complicates the macroization. This patch adds only a handful of more lines of code than it removes, and in fact could be made substantially smaller by removing the macros for the argument counts that aren't currently used, however, it seems better to be general from the start. [ v3: remove stray comment from prototyping; remove VDSO_SYSCALL6() since it would require special handling on 32 bits and is currently unused. (Uros Biszjak) Indent nested preprocessor directives. ] Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Dave Hansen Acked-by: Uros Bizjak Link: https://patch.msgid.link/20251216212606.1325678-9-hpa@zytor.com --- arch/x86/include/asm/vdso/gettimeofday.h | 108 +++---------------------------- arch/x86/include/asm/vdso/sys_call.h | 103 +++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 100 deletions(-) create mode 100644 arch/x86/include/asm/vdso/sys_call.h (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index 73b2e7ee8f0f..3cf214cc4a75 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -18,6 +18,7 @@ #include #include #include +#include #define VDSO_HAS_TIME 1 @@ -53,130 +54,37 @@ extern struct ms_hyperv_tsc_page hvclock_page __attribute__((visibility("hidden"))); #endif -#ifndef BUILD_VDSO32 - static __always_inline long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) { - long ret; - - asm ("syscall" : "=a" (ret), "=m" (*_ts) : - "0" (__NR_clock_gettime), "D" (_clkid), "S" (_ts) : - "rcx", "r11"); - - return ret; + return VDSO_SYSCALL2(clock_gettime,64,_clkid,_ts); } static __always_inline long gettimeofday_fallback(struct __kernel_old_timeval *_tv, struct timezone *_tz) { - long ret; - - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (_tv), "S" (_tz) : "memory"); - - return ret; + return VDSO_SYSCALL2(gettimeofday,,_tv,_tz); } static __always_inline long clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) { - long ret; - - asm ("syscall" : "=a" (ret), "=m" (*_ts) : - "0" (__NR_clock_getres), "D" (_clkid), "S" (_ts) : - "rcx", "r11"); - - return ret; + return VDSO_SYSCALL2(clock_getres,_time64,_clkid,_ts); } -#else - -static __always_inline -long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) -{ - long ret; - - asm ( - "mov %%ebx, %%edx \n" - "mov %[clock], %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret), "=m" (*_ts) - : "0" (__NR_clock_gettime64), [clock] "g" (_clkid), "c" (_ts) - : "edx"); - - return ret; -} +#ifndef CONFIG_X86_64 static __always_inline long clock_gettime32_fallback(clockid_t _clkid, struct old_timespec32 *_ts) { - long ret; - - asm ( - "mov %%ebx, %%edx \n" - "mov %[clock], %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret), "=m" (*_ts) - : "0" (__NR_clock_gettime), [clock] "g" (_clkid), "c" (_ts) - : "edx"); - - return ret; -} - -static __always_inline -long gettimeofday_fallback(struct __kernel_old_timeval *_tv, - struct timezone *_tz) -{ - long ret; - - asm( - "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_gettimeofday), "g" (_tv), "c" (_tz) - : "memory", "edx"); - - return ret; + return VDSO_SYSCALL2(clock_gettime,,_clkid,_ts); } static __always_inline long -clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) -{ - long ret; - - asm ( - "mov %%ebx, %%edx \n" - "mov %[clock], %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret), "=m" (*_ts) - : "0" (__NR_clock_getres_time64), [clock] "g" (_clkid), "c" (_ts) - : "edx"); - - return ret; -} - -static __always_inline -long clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts) +clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts) { - long ret; - - asm ( - "mov %%ebx, %%edx \n" - "mov %[clock], %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret), "=m" (*_ts) - : "0" (__NR_clock_getres), [clock] "g" (_clkid), "c" (_ts) - : "edx"); - - return ret; + return VDSO_SYSCALL2(clock_getres,,_clkid,_ts); } #endif diff --git a/arch/x86/include/asm/vdso/sys_call.h b/arch/x86/include/asm/vdso/sys_call.h new file mode 100644 index 000000000000..dcfd17c6dd57 --- /dev/null +++ b/arch/x86/include/asm/vdso/sys_call.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Macros for issuing an inline system call from the vDSO. + */ + +#ifndef X86_ASM_VDSO_SYS_CALL_H +#define X86_ASM_VDSO_SYS_CALL_H + +#include +#include +#include + +#ifdef CONFIG_X86_64 +# define __sys_instr "syscall" +# define __sys_clobber "rcx", "r11", "memory" +# define __sys_nr(x,y) __NR_ ## x +# define __sys_reg1 "rdi" +# define __sys_reg2 "rsi" +# define __sys_reg3 "rdx" +# define __sys_reg4 "r10" +# define __sys_reg5 "r8" +#else +# define __sys_instr "call __kernel_vsyscall" +# define __sys_clobber "memory" +# define __sys_nr(x,y) __NR_ ## x ## y +# define __sys_reg1 "ebx" +# define __sys_reg2 "ecx" +# define __sys_reg3 "edx" +# define __sys_reg4 "esi" +# define __sys_reg5 "edi" +#endif + +/* + * Example usage: + * + * result = VDSO_SYSCALL3(foo,64,x,y,z); + * + * ... calls foo(x,y,z) on 64 bits, and foo64(x,y,z) on 32 bits. + * + * VDSO_SYSCALL6() is currently missing, because it would require + * special handling for %ebp on 32 bits when the vdso is compiled with + * frame pointers enabled (the default on 32 bits.) Add it as a special + * case when and if it becomes necessary. + */ +#define _VDSO_SYSCALL(name,suf32,...) \ + ({ \ + long _sys_num_ret = __sys_nr(name,suf32); \ + asm_inline volatile( \ + __sys_instr \ + : "+a" (_sys_num_ret) \ + : __VA_ARGS__ \ + : __sys_clobber); \ + _sys_num_ret; \ + }) + +#define VDSO_SYSCALL0(name,suf32) \ + _VDSO_SYSCALL(name,suf32) +#define VDSO_SYSCALL1(name,suf32,a1) \ + ({ \ + register long _sys_arg1 asm(__sys_reg1) = (long)(a1); \ + _VDSO_SYSCALL(name,suf32, \ + "r" (_sys_arg1)); \ + }) +#define VDSO_SYSCALL2(name,suf32,a1,a2) \ + ({ \ + register long _sys_arg1 asm(__sys_reg1) = (long)(a1); \ + register long _sys_arg2 asm(__sys_reg2) = (long)(a2); \ + _VDSO_SYSCALL(name,suf32, \ + "r" (_sys_arg1), "r" (_sys_arg2)); \ + }) +#define VDSO_SYSCALL3(name,suf32,a1,a2,a3) \ + ({ \ + register long _sys_arg1 asm(__sys_reg1) = (long)(a1); \ + register long _sys_arg2 asm(__sys_reg2) = (long)(a2); \ + register long _sys_arg3 asm(__sys_reg3) = (long)(a3); \ + _VDSO_SYSCALL(name,suf32, \ + "r" (_sys_arg1), "r" (_sys_arg2), \ + "r" (_sys_arg3)); \ + }) +#define VDSO_SYSCALL4(name,suf32,a1,a2,a3,a4) \ + ({ \ + register long _sys_arg1 asm(__sys_reg1) = (long)(a1); \ + register long _sys_arg2 asm(__sys_reg2) = (long)(a2); \ + register long _sys_arg3 asm(__sys_reg3) = (long)(a3); \ + register long _sys_arg4 asm(__sys_reg4) = (long)(a4); \ + _VDSO_SYSCALL(name,suf32, \ + "r" (_sys_arg1), "r" (_sys_arg2), \ + "r" (_sys_arg3), "r" (_sys_arg4)); \ + }) +#define VDSO_SYSCALL5(name,suf32,a1,a2,a3,a4,a5) \ + ({ \ + register long _sys_arg1 asm(__sys_reg1) = (long)(a1); \ + register long _sys_arg2 asm(__sys_reg2) = (long)(a2); \ + register long _sys_arg3 asm(__sys_reg3) = (long)(a3); \ + register long _sys_arg4 asm(__sys_reg4) = (long)(a4); \ + register long _sys_arg5 asm(__sys_reg5) = (long)(a5); \ + _VDSO_SYSCALL(name,suf32, \ + "r" (_sys_arg1), "r" (_sys_arg2), \ + "r" (_sys_arg3), "r" (_sys_arg4), \ + "r" (_sys_arg5)); \ + }) + +#endif /* X86_VDSO_SYS_CALL_H */ -- cgit v1.2.3 From f49ecf5e110ab0ed255ddea5e321689faf4e50e6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 16 Dec 2025 13:26:03 -0800 Subject: x86/cpufeature: Replace X86_FEATURE_SYSENTER32 with X86_FEATURE_SYSFAST32 In most cases, the use of "fast 32-bit system call" depends either on X86_FEATURE_SEP or X86_FEATURE_SYSENTER32 || X86_FEATURE_SYSCALL32. However, nearly all the logic for both is identical. Define X86_FEATURE_SYSFAST32 which indicates that *either* SYSENTER32 or SYSCALL32 should be used, for either 32- or 64-bit kernels. This defaults to SYSENTER; use SYSCALL if the SYSCALL32 bit is also set. As this removes ALL existing uses of X86_FEATURE_SYSENTER32, which is a kernel-only synthetic feature bit, simply remove it and replace it with X86_FEATURE_SYSFAST32. This leaves an unused alternative for a true 32-bit kernel, but that should really not matter in any way. The clearing of X86_FEATURE_SYSCALL32 can be removed once the patches for automatically clearing disabled features has been merged. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Dave Hansen Link: https://patch.msgid.link/20251216212606.1325678-10-hpa@zytor.com --- arch/x86/Kconfig.cpufeatures | 8 ++++++++ arch/x86/entry/vdso/vdso32/system_call.S | 8 ++------ arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/centaur.c | 3 --- arch/x86/kernel/cpu/common.c | 8 ++++++++ arch/x86/kernel/cpu/intel.c | 4 +--- arch/x86/kernel/cpu/zhaoxin.c | 4 +--- arch/x86/kernel/fred.c | 2 +- arch/x86/xen/setup.c | 28 ++++++++++++++++++---------- arch/x86/xen/smp_pv.c | 5 ++--- arch/x86/xen/xen-ops.h | 1 - 11 files changed, 42 insertions(+), 31 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/Kconfig.cpufeatures b/arch/x86/Kconfig.cpufeatures index 733d5aff2456..423ac795baa7 100644 --- a/arch/x86/Kconfig.cpufeatures +++ b/arch/x86/Kconfig.cpufeatures @@ -56,6 +56,10 @@ config X86_REQUIRED_FEATURE_MOVBE def_bool y depends on MATOM +config X86_REQUIRED_FEATURE_SYSFAST32 + def_bool y + depends on X86_64 && !X86_FRED + config X86_REQUIRED_FEATURE_CPUID def_bool y depends on X86_64 @@ -120,6 +124,10 @@ config X86_DISABLED_FEATURE_CENTAUR_MCR def_bool y depends on X86_64 +config X86_DISABLED_FEATURE_SYSCALL32 + def_bool y + depends on !X86_64 + config X86_DISABLED_FEATURE_PCID def_bool y depends on !X86_64 diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 2a15634bbe75..7b1c0f16e511 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -52,13 +52,9 @@ __kernel_vsyscall: #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter" #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall" -#ifdef BUILD_VDSO32_64 /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */ - ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \ - SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32 -#else - ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP -#endif + ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSFAST32, \ + SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32 /* Enter using int $0x80 */ int $0x80 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c3b53beb1300..63b0f9aa9b3e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -84,7 +84,7 @@ #define X86_FEATURE_PEBS ( 3*32+12) /* "pebs" Precise-Event Based Sampling */ #define X86_FEATURE_BTS ( 3*32+13) /* "bts" Branch Trace Store */ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* syscall in IA32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* sysenter in IA32 userspace */ +#define X86_FEATURE_SYSFAST32 ( 3*32+15) /* sysenter/syscall in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* "rep_good" REP microcode works well */ #define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* "amd_lbr_v2" AMD Last Branch Record Extension Version 2 */ #define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* Clear CPU buffers using VERW */ diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index a3b55db35c96..9833f837141c 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -102,9 +102,6 @@ static void early_init_centaur(struct cpuinfo_x86 *c) (c->x86 >= 7)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); -#ifdef CONFIG_X86_64 - set_cpu_cap(c, X86_FEATURE_SYSENTER32); -#endif if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e7ab22fce3b5..1c3261cae40c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1068,6 +1068,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); + if (IS_ENABLED(CONFIG_X86_64) || cpu_has(c, X86_FEATURE_SEP)) + set_cpu_cap(c, X86_FEATURE_SYSFAST32); + /* * Clear/Set all flags overridden by options, after probe. * This needs to happen each time we re-probe, which may happen @@ -1813,6 +1816,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) * that it can't be enabled in 32-bit mode. */ setup_clear_cpu_cap(X86_FEATURE_PCID); + + /* + * Never use SYSCALL on a 32-bit kernel + */ + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); #endif /* diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 98ae4c37c93e..646ff33c4651 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -236,9 +236,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_PSE); } -#ifdef CONFIG_X86_64 - set_cpu_cap(c, X86_FEATURE_SYSENTER32); -#else +#ifndef CONFIG_X86_64 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ if (c->x86 == 15 && c->x86_cache_alignment == 64) c->x86_cache_alignment = 128; diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 89b1c8a70fe8..031379b7d4fa 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -59,9 +59,7 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c) { if (c->x86 >= 0x6) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); -#ifdef CONFIG_X86_64 - set_cpu_cap(c, X86_FEATURE_SYSENTER32); -#endif + if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c index 816187da3a47..e736b19e18de 100644 --- a/arch/x86/kernel/fred.c +++ b/arch/x86/kernel/fred.c @@ -68,7 +68,7 @@ void cpu_init_fred_exceptions(void) idt_invalidate(); /* Use int $0x80 for 32-bit system calls in FRED mode */ - setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); + setup_clear_cpu_cap(X86_FEATURE_SYSFAST32); setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 3823e52aef52..ac8021c3a997 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -990,13 +990,6 @@ static int register_callback(unsigned type, const void *func) return HYPERVISOR_callback_op(CALLBACKOP_register, &callback); } -void xen_enable_sysenter(void) -{ - if (cpu_feature_enabled(X86_FEATURE_SYSENTER32) && - register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat)) - setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); -} - void xen_enable_syscall(void) { int ret; @@ -1008,11 +1001,27 @@ void xen_enable_syscall(void) mechanism for syscalls. */ } - if (cpu_feature_enabled(X86_FEATURE_SYSCALL32) && - register_callback(CALLBACKTYPE_syscall32, xen_entry_SYSCALL_compat)) + if (!cpu_feature_enabled(X86_FEATURE_SYSFAST32)) + return; + + if (cpu_feature_enabled(X86_FEATURE_SYSCALL32)) { + /* Use SYSCALL32 */ + ret = register_callback(CALLBACKTYPE_syscall32, + xen_entry_SYSCALL_compat); + + } else { + /* Use SYSENTER32 */ + ret = register_callback(CALLBACKTYPE_sysenter, + xen_entry_SYSENTER_compat); + } + + if (ret) { setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); + setup_clear_cpu_cap(X86_FEATURE_SYSFAST32); + } } + static void __init xen_pvmmu_arch_setup(void) { HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); @@ -1022,7 +1031,6 @@ static void __init xen_pvmmu_arch_setup(void) register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) BUG(); - xen_enable_sysenter(); xen_enable_syscall(); } diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 9bb8ff8bff30..c40f326f0c3a 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -65,10 +65,9 @@ static void cpu_bringup(void) touch_softlockup_watchdog(); /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ - if (!xen_feature(XENFEAT_supervisor_mode_kernel)) { - xen_enable_sysenter(); + if (!xen_feature(XENFEAT_supervisor_mode_kernel)) xen_enable_syscall(); - } + cpu = smp_processor_id(); identify_secondary_cpu(cpu); set_cpu_sibling_map(cpu); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 090349baec09..f6c331b20fad 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -60,7 +60,6 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void xen_banner(void); -void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); -- cgit v1.2.3 From 36d83c249e0395a915144eceeb528ddc19b1fbe6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 16 Dec 2025 13:26:04 -0800 Subject: x86/entry/vdso32: When using int $0x80, use it directly When neither sysenter32 nor syscall32 is available (on either FRED-capable 64-bit hardware or old 32-bit hardware), there is no reason to do a bunch of stack shuffling in __kernel_vsyscall. Unfortunately, just overwriting the initial "push" instructions will mess up the CFI annotations, so suffer the 3-byte NOP if not applicable. Similarly, inline the int $0x80 when doing inline system calls in the vdso instead of calling __kernel_vsyscall. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Dave Hansen Link: https://patch.msgid.link/20251216212606.1325678-11-hpa@zytor.com --- arch/x86/entry/vdso/vdso32/system_call.S | 18 ++++++++++++++---- arch/x86/include/asm/vdso/sys_call.h | 4 +++- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 7b1c0f16e511..9157cf9c5749 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -14,6 +14,18 @@ ALIGN __kernel_vsyscall: CFI_STARTPROC + + /* + * If using int $0x80, there is no reason to muck about with the + * stack here. Unfortunately just overwriting the push instructions + * would mess up the CFI annotations, but it is only a 3-byte + * NOP in that case. This could be avoided by patching the + * vdso symbol table (not the code) and entry point, but that + * would a fair bit of tooling work or by simply compiling + * two different vDSO images, but that doesn't seem worth it. + */ + ALTERNATIVE "int $0x80; ret", "", X86_FEATURE_SYSFAST32 + /* * Reshuffle regs so that all of any of the entry instructions * will preserve enough state. @@ -52,11 +64,9 @@ __kernel_vsyscall: #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter" #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall" - /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */ - ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSFAST32, \ - SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32 + ALTERNATIVE SYSENTER_SEQUENCE, SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32 - /* Enter using int $0x80 */ + /* Re-enter using int $0x80 */ int $0x80 SYM_INNER_LABEL(int80_landing_pad, SYM_L_GLOBAL) diff --git a/arch/x86/include/asm/vdso/sys_call.h b/arch/x86/include/asm/vdso/sys_call.h index dcfd17c6dd57..5806b1cd6aef 100644 --- a/arch/x86/include/asm/vdso/sys_call.h +++ b/arch/x86/include/asm/vdso/sys_call.h @@ -20,7 +20,9 @@ # define __sys_reg4 "r10" # define __sys_reg5 "r8" #else -# define __sys_instr "call __kernel_vsyscall" +# define __sys_instr ALTERNATIVE("ds;ds;ds;int $0x80", \ + "call __kernel_vsyscall", \ + X86_FEATURE_SYSFAST32) # define __sys_clobber "memory" # define __sys_nr(x,y) __NR_ ## x ## y # define __sys_reg1 "ebx" -- cgit v1.2.3