From 1934dc9a8a92fda36ab80cfd55edab1708dcdf9a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:10:49 +0200 Subject: x86/error_inject: Align function properly Ensure inline asm functions are consistently aligned with compiler generated and SYM_FUNC_START*() functions. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.930201368@infradead.org --- arch/x86/lib/error-inject.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/lib') diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c index 1e3de0769b81..b5a6d83106bc 100644 --- a/arch/x86/lib/error-inject.c +++ b/arch/x86/lib/error-inject.c @@ -11,6 +11,7 @@ asm( ".text\n" ".type just_return_func, @function\n" ".globl just_return_func\n" + ASM_FUNC_ALIGN "just_return_func:\n" ANNOTATE_NOENDBR ASM_RET -- cgit v1.2.3 From cb855971d717a2dd752241f66fedad9dc178388c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:16 +0200 Subject: x86/putuser: Provide room for padding Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.746429822@infradead.org --- arch/x86/lib/putuser.S | 62 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 13 deletions(-) (limited to 'arch/x86/lib') diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index b7dfd60243b7..32125224fcca 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -47,8 +47,6 @@ SYM_FUNC_START(__put_user_1) LOAD_TASK_SIZE_MINUS_N(0) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL) - ENDBR ASM_STAC 1: movb %al,(%_ASM_CX) xor %ecx,%ecx @@ -56,54 +54,87 @@ SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL) RET SYM_FUNC_END(__put_user_1) EXPORT_SYMBOL(__put_user_1) + +SYM_FUNC_START(__put_user_nocheck_1) + ENDBR + ASM_STAC +2: movb %al,(%_ASM_CX) + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_1) EXPORT_SYMBOL(__put_user_nocheck_1) SYM_FUNC_START(__put_user_2) LOAD_TASK_SIZE_MINUS_N(1) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL) - ENDBR ASM_STAC -2: movw %ax,(%_ASM_CX) +3: movw %ax,(%_ASM_CX) xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_2) EXPORT_SYMBOL(__put_user_2) + +SYM_FUNC_START(__put_user_nocheck_2) + ENDBR + ASM_STAC +4: movw %ax,(%_ASM_CX) + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_2) EXPORT_SYMBOL(__put_user_nocheck_2) SYM_FUNC_START(__put_user_4) LOAD_TASK_SIZE_MINUS_N(3) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL) - ENDBR ASM_STAC -3: movl %eax,(%_ASM_CX) +5: movl %eax,(%_ASM_CX) xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_4) EXPORT_SYMBOL(__put_user_4) + +SYM_FUNC_START(__put_user_nocheck_4) + ENDBR + ASM_STAC +6: movl %eax,(%_ASM_CX) + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_4) EXPORT_SYMBOL(__put_user_nocheck_4) SYM_FUNC_START(__put_user_8) LOAD_TASK_SIZE_MINUS_N(7) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL) - ENDBR ASM_STAC -4: mov %_ASM_AX,(%_ASM_CX) +7: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 -5: movl %edx,4(%_ASM_CX) +8: movl %edx,4(%_ASM_CX) #endif xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_8) EXPORT_SYMBOL(__put_user_8) + +SYM_FUNC_START(__put_user_nocheck_8) + ENDBR + ASM_STAC +9: mov %_ASM_AX,(%_ASM_CX) +#ifdef 
CONFIG_X86_32 +10: movl %edx,4(%_ASM_CX) +#endif + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_8) EXPORT_SYMBOL(__put_user_nocheck_8) SYM_CODE_START_LOCAL(.Lbad_put_user_clac) @@ -117,6 +148,11 @@ SYM_CODE_END(.Lbad_put_user_clac) _ASM_EXTABLE_UA(2b, .Lbad_put_user_clac) _ASM_EXTABLE_UA(3b, .Lbad_put_user_clac) _ASM_EXTABLE_UA(4b, .Lbad_put_user_clac) -#ifdef CONFIG_X86_32 _ASM_EXTABLE_UA(5b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(6b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(7b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(9b, .Lbad_put_user_clac) +#ifdef CONFIG_X86_32 + _ASM_EXTABLE_UA(8b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(10b, .Lbad_put_user_clac) #endif -- cgit v1.2.3 From 5d8213864ade86b48fc492584ea86d65a62f892e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:27 +0200 Subject: x86/retbleed: Add SKL return thunk To address the Intel SKL RSB underflow issue in software it's required to do call depth tracking. Provide a return thunk for call depth tracking on Intel SKL CPUs. The tracking does not use a counter. It uses arithmetic shift right on call entry and logical shift left on return. The depth tracking variable is initialized to 0x8000.... when the call depth is zero. The arithmetic shift right sign extends the MSB and saturates after the 12th call. The shift count is 5 so the tracking covers 12 nested calls. On return the variable is shifted left logically so it becomes zero again. CALL RET 0: 0x8000000000000000 0x0000000000000000 1: 0xfc00000000000000 0xf000000000000000 ... 11: 0xfffffffffffffff8 0xfffffffffffffc00 12: 0xffffffffffffffff 0xffffffffffffffe0 After a return buffer fill the depth is credited 12 calls before the next stuffing has to take place. There is an inaccuracy for situations like this: 10 calls 5 returns 3 calls 4 returns 3 calls .... The shift count might cause this to be off by one in either direction, but there is still a cushion vs. the RSB depth. The algorithm does not claim to be perfect, but it should obfuscate the problem enough to make exploitation extremely difficult. The theory behind this is: RSB is a stack with depth 16 which is filled on every call. On the return path speculation "pops" entries to speculate down the call chain. Once the speculative RSB is empty it switches to other predictors, e.g. the Branch History Buffer, which can be mistrained by user space and misguide the speculation path to a gadget. Call depth tracking is designed to break this speculation path by stuffing speculation trap calls into the RSB which never get a corresponding return executed. This stalls the prediction path until it gets resteered. The assumption is that stuffing at the 12th return is sufficient to break the speculation before it hits the underflow and the fallback to the other predictors. Testing confirms that it works. Johannes, one of the retbleed researchers, tried to attack this approach but failed. There is obviously no scientific proof that this will withstand future research progress, but all we can do right now is to speculate about it. The SAR/SHL usage was suggested by Andi Kleen.
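For illustration only (not part of the patch), the shift arithmetic can be modelled in a few lines of plain C. The constants mirror RET_DEPTH_INIT and RET_DEPTH_CREDIT introduced below, the actual RSB stuffing is reduced to a counter, the helper names are purely for the example, and the sketch assumes '>>' on a signed value is an arithmetic shift (as gcc/clang implement it), matching sarq:

#include <stdint.h>
#include <stdio.h>

static uint64_t depth = 0x8000000000000000ULL;	/* RET_DEPTH_INIT, call depth 0 */
static unsigned int stuffs;

static void track_call(void)
{
	/* sarq $5: sign extends the MSB, saturates at all ones */
	depth = (uint64_t)((int64_t)depth >> 5);
}

static void track_ret(void)
{
	/* shlq $5: a zero result means the RSB is about to underflow */
	depth <<= 5;
	if (!depth) {
		stuffs++;			/* stuff the RSB ... */
		depth = 0xffffffffffffffffULL;	/* ... and credit the depth (RET_DEPTH_CREDIT) */
	}
}

int main(void)
{
	int i;

	for (i = 0; i < 3; i++)
		track_call();
	for (i = 0; i < 5; i++)		/* two more returns than calls */
		track_ret();

	printf("depth=%#llx stuffs=%u\n", (unsigned long long)depth, stuffs);
	return 0;
}

With the sequence above (3 calls, then 5 returns) the sketch reports exactly one stuff, triggered by the return that shifts the tracking value to zero.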
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.890071690@infradead.org --- arch/x86/entry/entry_64.S | 10 +-- arch/x86/include/asm/current.h | 3 + arch/x86/include/asm/nospec-branch.h | 121 +++++++++++++++++++++++++++++++++-- arch/x86/kernel/asm-offsets.c | 3 + arch/x86/kvm/svm/vmenter.S | 1 + arch/x86/lib/retpoline.S | 31 +++++++++ 6 files changed, 159 insertions(+), 10 deletions(-) (limited to 'arch/x86/lib') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4cc0125fdfdc..15739a2c0983 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -288,6 +288,7 @@ SYM_FUNC_END(__switch_to_asm) SYM_CODE_START_NOALIGN(ret_from_fork) UNWIND_HINT_EMPTY ANNOTATE_NOENDBR // copy_thread + CALL_DEPTH_ACCOUNT movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -332,7 +333,7 @@ SYM_CODE_START(xen_error_entry) UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL RET SYM_CODE_END(xen_error_entry) @@ -977,7 +978,7 @@ SYM_CODE_START(paranoid_entry) * CR3 above, keep the old value in a callee saved register. */ IBRS_ENTER save_reg=%r15 - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL RET SYM_CODE_END(paranoid_entry) @@ -1062,7 +1063,7 @@ SYM_CODE_START(error_entry) /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax IBRS_ENTER - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ /* Put us onto the real thread stack. */ @@ -1097,6 +1098,7 @@ SYM_CODE_START(error_entry) */ .Lerror_entry_done_lfence: FENCE_SWAPGS_KERNEL_ENTRY + CALL_DEPTH_ACCOUNT leaq 8(%rsp), %rax /* return pt_regs pointer */ ANNOTATE_UNRET_END RET @@ -1115,7 +1117,7 @@ SYM_CODE_START(error_entry) FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax IBRS_ENTER - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL /* * Pretend that the exception came from user mode: set up pt_regs diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index b89aba077b84..a1168e7b69e5 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -17,6 +17,9 @@ struct pcpu_hot { struct task_struct *current_task; int preempt_count; int cpu_number; +#ifdef CONFIG_CALL_DEPTH_TRACKING + u64 call_depth; +#endif unsigned long top_of_stack; void *hardirq_stack_ptr; u16 softirq_pending; diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index f10ca334dd75..d4be826a2282 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -12,8 +12,83 @@ #include #include #include +#include -#define RETPOLINE_THUNK_SIZE 32 +/* + * Call depth tracking for Intel SKL CPUs to address the RSB underflow + * issue in software. + * + * The tracking does not use a counter. It uses uses arithmetic shift + * right on call entry and logical shift left on return. + * + * The depth tracking variable is initialized to 0x8000.... when the call + * depth is zero. The arithmetic shift right sign extends the MSB and + * saturates after the 12th call. The shift count is 5 for both directions + * so the tracking covers 12 nested calls. + * + * Call + * 0: 0x8000000000000000 0x0000000000000000 + * 1: 0xfc00000000000000 0xf000000000000000 + * ... + * 11: 0xfffffffffffffff8 0xfffffffffffffc00 + * 12: 0xffffffffffffffff 0xffffffffffffffe0 + * + * After a return buffer fill the depth is credited 12 calls before the + * next stuffing has to take place. 
+ * + * There is a inaccuracy for situations like this: + * + * 10 calls + * 5 returns + * 3 calls + * 4 returns + * 3 calls + * .... + * + * The shift count might cause this to be off by one in either direction, + * but there is still a cushion vs. the RSB depth. The algorithm does not + * claim to be perfect and it can be speculated around by the CPU, but it + * is considered that it obfuscates the problem enough to make exploitation + * extremly difficult. + */ +#define RET_DEPTH_SHIFT 5 +#define RSB_RET_STUFF_LOOPS 16 +#define RET_DEPTH_INIT 0x8000000000000000ULL +#define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL +#define RET_DEPTH_CREDIT 0xffffffffffffffffULL + +#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) + +#include + +#define CREDIT_CALL_DEPTH \ + movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define ASM_CREDIT_CALL_DEPTH \ + movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define RESET_CALL_DEPTH \ + mov $0x80, %rax; \ + shl $56, %rax; \ + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define RESET_CALL_DEPTH_FROM_CALL \ + mov $0xfc, %rax; \ + shl $56, %rax; \ + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define INCREMENT_CALL_DEPTH \ + sarq $5, %gs:pcpu_hot + X86_call_depth; + +#define ASM_INCREMENT_CALL_DEPTH \ + sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#else +#define CREDIT_CALL_DEPTH +#define RESET_CALL_DEPTH +#define INCREMENT_CALL_DEPTH +#define RESET_CALL_DEPTH_FROM_CALL +#endif /* * Fill the CPU return stack buffer. @@ -32,6 +107,7 @@ * from C via asm(".include ") but let's not go there. */ +#define RETPOLINE_THUNK_SIZE 32 #define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ /* @@ -60,7 +136,8 @@ dec reg; \ jnz 771b; \ /* barrier for jnz misprediction */ \ - lfence; + lfence; \ + ASM_CREDIT_CALL_DEPTH #else /* * i386 doesn't unconditionally have LFENCE, as such it can't @@ -185,11 +262,32 @@ * where we have a stack but before any RET instruction. 
*/ .macro UNTRAIN_RET -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ + defined(CONFIG_X86_FEATURE_CALL_DEPTH) ANNOTATE_UNRET_END - ALTERNATIVE_2 "", \ - CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB + ALTERNATIVE_3 "", \ + CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ + __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH +#endif +.endm + +.macro UNTRAIN_RET_FROM_CALL +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ + defined(CONFIG_X86_FEATURE_CALL_DEPTH) + ANNOTATE_UNRET_END + ALTERNATIVE_3 "", \ + CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ + __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH +#endif +.endm + + +.macro CALL_DEPTH_ACCOUNT +#ifdef CONFIG_CALL_DEPTH_TRACKING + ALTERNATIVE "", \ + __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif .endm @@ -214,6 +312,17 @@ extern void (*x86_return_thunk)(void); #define x86_return_thunk (&__x86_return_thunk) #endif +#ifdef CONFIG_CALL_DEPTH_TRACKING +extern void __x86_return_skl(void); + +static inline void x86_set_skl_return_thunk(void) +{ + x86_return_thunk = &__x86_return_skl; +} +#else +static inline void x86_set_skl_return_thunk(void) {} +#endif + #ifdef CONFIG_RETPOLINE #define GEN(reg) \ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index a9824318e1c5..13afdbbee349 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -110,6 +110,9 @@ static void __used common(void) OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); +#ifdef CONFIG_CALL_DEPTH_TRACKING + OFFSET(X86_call_depth, pcpu_hot, call_depth); +#endif if (IS_ENABLED(CONFIG_KVM_INTEL)) { BLANK(); diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 723f8534986c..09eacf19d718 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include #include #include #include diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 073289a55f84..1e79eccc1d69 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -5,9 +5,11 @@ #include #include #include +#include #include #include #include +#include #include .section .text.__x86.indirect_thunk @@ -140,3 +142,32 @@ __EXPORT_THUNK(zen_untrain_ret) EXPORT_SYMBOL(__x86_return_thunk) #endif /* CONFIG_RETHUNK */ + +#ifdef CONFIG_CALL_DEPTH_TRACKING + + .align 64 +SYM_FUNC_START(__x86_return_skl) + ANNOTATE_NOENDBR + /* Keep the hotpath in a 16byte I-fetch */ + shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth) + jz 1f + ANNOTATE_UNRET_SAFE + ret + int3 +1: + .rept 16 + ANNOTATE_INTRA_FUNCTION_CALL + call 2f + int3 +2: + .endr + add $(8*16), %rsp + + CREDIT_CALL_DEPTH + + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_FUNC_END(__x86_return_skl) + +#endif /* CONFIG_CALL_DEPTH_TRACKING */ -- cgit v1.2.3 From 3b6c1747da48ff40ab746b0e860cffe83619f5c5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:28 +0200 Subject: x86/retpoline: Add SKL retthunk retpolines Ensure that retpolines do the proper call accounting so that the return accounting works correctly. Specifically; retpolines are used to replace both 'jmp *%reg' and 'call *%reg', however these two cases do not have the same accounting requirements. 
Therefore split things up and provide two different retpoline arrays for SKL. The 'jmp *%reg' case needs no accounting, the __x86_indirect_jump_thunk_array[] covers this. The retpoline is changed to not use the return thunk; it's a simple call;ret construct. [ strictly speaking it should do: andq $(~0x1f), PER_CPU_VAR(__x86_call_depth) but we can argue this can be covered by the fuzz we already have in the accounting depth (12) vs the RSB depth (16) ] The 'call *%reg' case does need accounting, the __x86_indirect_call_thunk_array[] covers this. Again, this retpoline avoids the use of the return-thunk, in this case to avoid double accounting. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.996634749@infradead.org --- arch/x86/include/asm/nospec-branch.h | 12 ++++++ arch/x86/kernel/alternative.c | 59 ++++++++++++++++++++++++++++-- arch/x86/lib/retpoline.S | 71 ++++++++++++++++++++++++++++++++---- arch/x86/net/bpf_jit_comp.c | 5 ++- 4 files changed, 135 insertions(+), 12 deletions(-) (limited to 'arch/x86/lib') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index d4be826a2282..06ba7caa0cad 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -301,6 +301,8 @@ typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; +extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; +extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; extern void __x86_return_thunk(void); extern void zen_untrain_ret(void); @@ -330,6 +332,16 @@ static inline void x86_set_skl_return_thunk(void) {} #include #undef GEN +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg; +#include +#undef GEN + +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg; +#include +#undef GEN + #ifdef CONFIG_X86_64 /* diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 04d1e3d35b0e..19221d77dc27 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -377,6 +377,56 @@ static int emit_indirect(int op, int reg, u8 *bytes) return i; } +static inline bool is_jcc32(struct insn *insn) +{ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; +} + +static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 op = insn->opcode.bytes[0]; + int i = 0; + + /* + * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional + * tail-calls. Deal with them. + */ + if (is_jcc32(insn)) { + bytes[i++] = op; + op = insn->opcode.bytes[1]; + goto clang_jcc; + } + + if (insn->length == 6) + bytes[i++] = 0x2e; /* CS-prefix */ + + switch (op) { + case CALL_INSN_OPCODE: + __text_gen_insn(bytes+i, op, addr+i, + __x86_indirect_call_thunk_array[reg], + CALL_INSN_SIZE); + i += CALL_INSN_SIZE; + break; + + case JMP32_INSN_OPCODE: +clang_jcc: + __text_gen_insn(bytes+i, op, addr+i, + __x86_indirect_jump_thunk_array[reg], + JMP32_INSN_SIZE); + i += JMP32_INSN_SIZE; + break; + + default: + WARN("%pS %px %*ph\n", addr, addr, 6, addr); + return -1; + } + + WARN_ON_ONCE(i != insn->length); + + return i; +} + /* * Rewrite the compiler generated retpoline thunk calls. 
* @@ -409,8 +459,12 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && - !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + return emit_call_track_retpoline(addr, insn, reg, bytes); + return -1; + } op = insn->opcode.bytes[0]; @@ -427,8 +481,7 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) * [ NOP ] * 1: */ - /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ - if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + if (is_jcc32(insn)) { cc = insn->opcode.bytes[1] & 0xf; cc ^= 1; /* invert condition */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 1e79eccc1d69..e00206077ae9 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -14,17 +14,18 @@ .section .text.__x86.indirect_thunk -.macro RETPOLINE reg + +.macro POLINE reg ANNOTATE_INTRA_FUNCTION_CALL call .Ldo_rop_\@ -.Lspec_trap_\@: - UNWIND_HINT_EMPTY - pause - lfence - jmp .Lspec_trap_\@ + int3 .Ldo_rop_\@: mov %\reg, (%_ASM_SP) UNWIND_HINT_FUNC +.endm + +.macro RETPOLINE reg + POLINE \reg RET .endm @@ -54,7 +55,6 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) */ #define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) -#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) .align RETPOLINE_THUNK_SIZE SYM_CODE_START(__x86_indirect_thunk_array) @@ -66,10 +66,65 @@ SYM_CODE_START(__x86_indirect_thunk_array) .align RETPOLINE_THUNK_SIZE SYM_CODE_END(__x86_indirect_thunk_array) -#define GEN(reg) EXPORT_THUNK(reg) +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) +#include +#undef GEN + +#ifdef CONFIG_CALL_DEPTH_TRACKING +.macro CALL_THUNK reg + .align RETPOLINE_THUNK_SIZE + +SYM_INNER_LABEL(__x86_indirect_call_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR + + CALL_DEPTH_ACCOUNT + POLINE \reg + ANNOTATE_UNRET_SAFE + ret + int3 +.endm + + .align RETPOLINE_THUNK_SIZE +SYM_CODE_START(__x86_indirect_call_thunk_array) + +#define GEN(reg) CALL_THUNK reg #include #undef GEN + .align RETPOLINE_THUNK_SIZE +SYM_CODE_END(__x86_indirect_call_thunk_array) + +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_call_thunk_ ## reg) +#include +#undef GEN + +.macro JUMP_THUNK reg + .align RETPOLINE_THUNK_SIZE + +SYM_INNER_LABEL(__x86_indirect_jump_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR + POLINE \reg + ANNOTATE_UNRET_SAFE + ret + int3 +.endm + + .align RETPOLINE_THUNK_SIZE +SYM_CODE_START(__x86_indirect_jump_thunk_array) + +#define GEN(reg) JUMP_THUNK reg +#include +#undef GEN + + .align RETPOLINE_THUNK_SIZE +SYM_CODE_END(__x86_indirect_jump_thunk_array) + +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg) +#include +#undef GEN +#endif /* * This function name is magical and is used by -mfunction-return=thunk-extern * for the compiler to generate JMPs to it. 
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0df391ecd4d8..ad8cb7f15ab8 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -417,7 +417,10 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) EMIT2(0xFF, 0xE0 + reg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { OPTIMIZER_HIDE_VAR(reg); - emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); + if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg], ip); + else + emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); } else { EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */ if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS)) -- cgit v1.2.3 From f5c1bb2afe93396d41c5cbdcb909b08a75b8dde4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:30 +0200 Subject: x86/calldepth: Add ret/call counting for debug Add a debugfs mechanism to validate the accounting, e.g. vs. call/ret balance and to gather statistics about the stuffing-to-call ratio. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.204285506@infradead.org --- arch/x86/include/asm/nospec-branch.h | 36 +++++++++++++++++++++--- arch/x86/kernel/callthunks.c | 53 ++++++++++++++++++++++++++++++++++++ arch/x86/lib/retpoline.S | 7 ++++- 3 files changed, 91 insertions(+), 5 deletions(-) (limited to 'arch/x86/lib') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 06ba7caa0cad..4771147c7c5a 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -57,6 +57,22 @@ #define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL #define RET_DEPTH_CREDIT 0xffffffffffffffffULL +#ifdef CONFIG_CALL_THUNKS_DEBUG +# define CALL_THUNKS_DEBUG_INC_CALLS \ + incq %gs:__x86_call_count; +# define CALL_THUNKS_DEBUG_INC_RETS \ + incq %gs:__x86_ret_count; +# define CALL_THUNKS_DEBUG_INC_STUFFS \ + incq %gs:__x86_stuffs_count; +# define CALL_THUNKS_DEBUG_INC_CTXSW \ + incq %gs:__x86_ctxsw_count; +#else +# define CALL_THUNKS_DEBUG_INC_CALLS +# define CALL_THUNKS_DEBUG_INC_RETS +# define CALL_THUNKS_DEBUG_INC_STUFFS +# define CALL_THUNKS_DEBUG_INC_CTXSW +#endif + #if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) #include @@ -75,18 +91,23 @@ #define RESET_CALL_DEPTH_FROM_CALL \ mov $0xfc, %rax; \ shl $56, %rax; \ - movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + CALL_THUNKS_DEBUG_INC_CALLS #define INCREMENT_CALL_DEPTH \ - sarq $5, %gs:pcpu_hot + X86_call_depth; + sarq $5, %gs:pcpu_hot + X86_call_depth; \ + CALL_THUNKS_DEBUG_INC_CALLS #define ASM_INCREMENT_CALL_DEPTH \ - sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); + sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + CALL_THUNKS_DEBUG_INC_CALLS #else #define CREDIT_CALL_DEPTH +#define ASM_CREDIT_CALL_DEPTH #define RESET_CALL_DEPTH #define INCREMENT_CALL_DEPTH +#define ASM_INCREMENT_CALL_DEPTH #define RESET_CALL_DEPTH_FROM_CALL #endif @@ -137,7 +158,8 @@ jnz 771b; \ /* barrier for jnz misprediction */ \ lfence; \ - ASM_CREDIT_CALL_DEPTH + ASM_CREDIT_CALL_DEPTH \ + CALL_THUNKS_DEBUG_INC_CTXSW #else /* * i386 doesn't unconditionally have LFENCE, as such it can't @@ -321,6 +343,12 @@ static inline void x86_set_skl_return_thunk(void) { x86_return_thunk = &__x86_return_skl; } +#ifdef CONFIG_CALL_THUNKS_DEBUG +DECLARE_PER_CPU(u64, __x86_call_count); +DECLARE_PER_CPU(u64, __x86_ret_count); 
+DECLARE_PER_CPU(u64, __x86_stuffs_count); +DECLARE_PER_CPU(u64, __x86_ctxsw_count); +#endif #else static inline void x86_set_skl_return_thunk(void) {} #endif diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 01f6f6b5a93c..dfe7ffff88b9 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -2,6 +2,7 @@ #define pr_fmt(fmt) "callthunks: " fmt +#include #include #include #include @@ -35,6 +36,15 @@ static int __init debug_thunks(char *str) } __setup("debug-callthunks", debug_thunks); +#ifdef CONFIG_CALL_THUNKS_DEBUG +DEFINE_PER_CPU(u64, __x86_call_count); +DEFINE_PER_CPU(u64, __x86_ret_count); +DEFINE_PER_CPU(u64, __x86_stuffs_count); +DEFINE_PER_CPU(u64, __x86_ctxsw_count); +EXPORT_SYMBOL_GPL(__x86_ctxsw_count); +EXPORT_SYMBOL_GPL(__x86_call_count); +#endif + extern s32 __call_sites[], __call_sites_end[]; struct thunk_desc { @@ -283,3 +293,46 @@ void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, mutex_unlock(&text_mutex); } #endif /* CONFIG_MODULES */ + +#if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS) +static int callthunks_debug_show(struct seq_file *m, void *p) +{ + unsigned long cpu = (unsigned long)m->private; + + seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n,", + per_cpu(__x86_call_count, cpu), + per_cpu(__x86_ret_count, cpu), + per_cpu(__x86_stuffs_count, cpu), + per_cpu(__x86_ctxsw_count, cpu)); + return 0; +} + +static int callthunks_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, callthunks_debug_show, inode->i_private); +} + +static const struct file_operations dfs_ops = { + .open = callthunks_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init callthunks_debugfs_init(void) +{ + struct dentry *dir; + unsigned long cpu; + + dir = debugfs_create_dir("callthunks", NULL); + for_each_possible_cpu(cpu) { + void *arg = (void *)cpu; + char name [10]; + + sprintf(name, "cpu%lu", cpu); + debugfs_create_file(name, 0644, dir, arg, &dfs_ops); + } + return 0; +} +__initcall(callthunks_debugfs_init); +#endif diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index e00206077ae9..5f61c65322be 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -203,13 +203,18 @@ EXPORT_SYMBOL(__x86_return_thunk) .align 64 SYM_FUNC_START(__x86_return_skl) ANNOTATE_NOENDBR - /* Keep the hotpath in a 16byte I-fetch */ + /* + * Keep the hotpath in a 16byte I-fetch for the non-debug + * case. + */ + CALL_THUNKS_DEBUG_INC_RETS shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth) jz 1f ANNOTATE_UNRET_SAFE ret int3 1: + CALL_THUNKS_DEBUG_INC_STUFFS .rept 16 ANNOTATE_INTRA_FUNCTION_CALL call 2f -- cgit v1.2.3
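As a usage note (not part of the series): with CONFIG_CALL_THUNKS_DEBUG enabled, the counters added above are exposed via debugfs as one file per CPU under the "callthunks" directory, in the "C: ... R: ... S: ... X: ..." format produced by callthunks_debug_show(). A minimal user-space sketch for dumping them, assuming debugfs is mounted at the conventional /sys/kernel/debug and the files are readable (created 0644, so typically root):

#include <stdio.h>

int main(void)
{
	char path[64], line[256];
	int cpu;

	/* Files are created as callthunks/cpu0, cpu1, ... by callthunks_debugfs_init() */
	for (cpu = 0; ; cpu++) {
		FILE *f;

		snprintf(path, sizeof(path), "/sys/kernel/debug/callthunks/cpu%d", cpu);
		f = fopen(path, "r");
		if (!f)
			break;	/* no such CPU file (or debugfs not mounted) */
		if (fgets(line, sizeof(line), f))
			printf("cpu%d: %s", cpu, line);	/* calls, rets, stuffs, context switches */
		fclose(f);
	}
	return 0;
}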