diff options
Diffstat (limited to 'arch/x86/net')
| -rw-r--r-- | arch/x86/net/Makefile | 2 | ||||
| -rw-r--r-- | arch/x86/net/bpf_jit_comp.c | 610 | ||||
| -rw-r--r-- | arch/x86/net/bpf_jit_comp32.c | 37 | ||||
| -rw-r--r-- | arch/x86/net/bpf_timed_may_goto.S | 55 |
4 files changed, 512 insertions, 192 deletions
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile index 383c87300b0d..dddbefc0f439 100644 --- a/arch/x86/net/Makefile +++ b/arch/x86/net/Makefile @@ -6,5 +6,5 @@ ifeq ($(CONFIG_X86_32),y) obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o else - obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o + obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_timed_may_goto.o endif diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a43fc5af973d..ea9e707e8abf 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -8,6 +8,7 @@ #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/if_vlan.h> +#include <linux/bitfield.h> #include <linux/bpf.h> #include <linux/memory.h> #include <linux/sort.h> @@ -41,6 +42,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) #define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) #define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) +#define EMIT5(b1, b2, b3, b4, b5) \ + do { EMIT1(b1); EMIT4(b2, b3, b4, b5); } while (0) #define EMIT1_off32(b1, off) \ do { EMIT1(b1); EMIT(off, 4); } while (0) @@ -55,8 +58,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT_ENDBR() EMIT(gen_endbr(), 4) #define EMIT_ENDBR_POISON() EMIT(gen_endbr_poison(), 4) #else -#define EMIT_ENDBR() -#define EMIT_ENDBR_POISON() +#define EMIT_ENDBR() do { } while (0) +#define EMIT_ENDBR_POISON() do { } while (0) #endif static bool is_imm8(int value) @@ -410,16 +413,20 @@ static void emit_nops(u8 **pprog, int len) * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT * in arch/x86/kernel/alternative.c */ +static int emit_call(u8 **prog, void *func, void *ip); -static void emit_fineibt(u8 **pprog, u32 hash) +static void emit_fineibt(u8 **pprog, u8 *ip, u32 hash, int arity) { u8 *prog = *pprog; EMIT_ENDBR(); - EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */ - 
EMIT2(0x74, 0x07); /* jz.d8 +7 */ - EMIT2(0x0f, 0x0b); /* ud2 */ - EMIT1(0x90); /* nop */ + EMIT1_off32(0x2d, hash); /* subl $hash, %eax */ + if (cfi_bhi) { + EMIT2(0x2e, 0x2e); /* cs cs */ + emit_call(&prog, __bhi_args[arity], ip + 11); + } else { + EMIT3_off32(0x2e, 0x0f, 0x85, 3); /* jne.d32,pn 3 */ + } EMIT_ENDBR_POISON(); *pprog = prog; @@ -431,30 +438,21 @@ static void emit_kcfi(u8 **pprog, u32 hash) EMIT1_off32(0xb8, hash); /* movl $hash, %eax */ #ifdef CONFIG_CALL_PADDING - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); + for (int i = 0; i < CONFIG_FUNCTION_PADDING_CFI; i++) + EMIT1(0x90); #endif EMIT_ENDBR(); *pprog = prog; } -static void emit_cfi(u8 **pprog, u32 hash) +static void emit_cfi(u8 **pprog, u8 *ip, u32 hash, int arity) { u8 *prog = *pprog; switch (cfi_mode) { case CFI_FINEIBT: - emit_fineibt(&prog, hash); + emit_fineibt(&prog, ip, hash, arity); break; case CFI_KCFI: @@ -505,13 +503,17 @@ static void emit_prologue_tail_call(u8 **pprog, bool is_subprog) * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes * while jumping to another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, +static void emit_prologue(u8 **pprog, u8 *ip, u32 stack_depth, bool ebpf_from_cbpf, bool tail_call_reachable, bool is_subprog, bool is_exception_cb) { u8 *prog = *pprog; - emit_cfi(&prog, is_subprog ? 
cfi_bpf_subprog_hash : cfi_bpf_hash); + if (is_subprog) { + emit_cfi(&prog, ip, cfi_bpf_subprog_hash, 5); + } else { + emit_cfi(&prog, ip, cfi_bpf_hash, 1); + } /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ @@ -586,7 +588,8 @@ static int emit_jump(u8 **pprog, void *func, void *ip) return emit_patch(pprog, func, ip, 0xE9); } -static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, +static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, void *new_addr) { const u8 *nop_insn = x86_nops[5]; @@ -596,9 +599,9 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, int ret; memcpy(old_insn, nop_insn, X86_PATCH_SIZE); - if (old_addr) { + if (old_t != BPF_MOD_NOP && old_addr) { prog = old_insn; - ret = t == BPF_MOD_CALL ? + ret = old_t == BPF_MOD_CALL ? emit_call(&prog, old_addr, ip) : emit_jump(&prog, old_addr, ip); if (ret) @@ -606,9 +609,9 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, } memcpy(new_insn, nop_insn, X86_PATCH_SIZE); - if (new_addr) { + if (new_t != BPF_MOD_NOP && new_addr) { prog = new_insn; - ret = t == BPF_MOD_CALL ? + ret = new_t == BPF_MOD_CALL ? 
emit_call(&prog, new_addr, ip) : emit_jump(&prog, new_addr, ip); if (ret) @@ -621,7 +624,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, goto out; ret = 1; if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { - text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); + smp_text_poke_single(ip, new_insn, X86_PATCH_SIZE, NULL); ret = 0; } out: @@ -629,8 +632,9 @@ out: return ret; } -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *old_addr, void *new_addr) +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr) { if (!is_kernel_text((long)ip) && !is_bpf_text_address((long)ip)) @@ -641,29 +645,46 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, * See emit_prologue(), for IBT builds the trampoline hook is preceded * with an ENDBR instruction. */ - if (is_endbr(*(u32 *)ip)) + if (is_endbr(ip)) ip += ENDBR_INSN_SIZE; - return __bpf_arch_text_poke(ip, t, old_addr, new_addr); + return __bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr); } #define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8) -static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) +static void __emit_indirect_jump(u8 **pprog, int reg, bool ereg) +{ + u8 *prog = *pprog; + + if (ereg) + EMIT1(0x41); + + EMIT2(0xFF, 0xE0 + reg); + + *pprog = prog; +} + +static void emit_indirect_jump(u8 **pprog, int bpf_reg, u8 *ip) { u8 *prog = *pprog; + int reg = reg2hex[bpf_reg]; + bool ereg = is_ereg(bpf_reg); - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + if (cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) { + OPTIMIZER_HIDE_VAR(reg); + emit_jump(&prog, its_static_thunk(reg + 8*ereg), ip); + } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { EMIT_LFENCE(); - EMIT2(0xFF, 0xE0 + reg); + __emit_indirect_jump(&prog, reg, ereg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { OPTIMIZER_HIDE_VAR(reg); if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) - 
emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg], ip); + emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg + 8*ereg], ip); else - emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); + emit_jump(&prog, &__x86_indirect_thunk_array[reg + 8*ereg], ip); } else { - EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */ + __emit_indirect_jump(&prog, reg, ereg); if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || IS_ENABLED(CONFIG_MITIGATION_SLS)) EMIT1(0xCC); /* int3 */ } @@ -675,7 +696,7 @@ static void emit_return(u8 **pprog, u8 *ip) { u8 *prog = *pprog; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (cpu_wants_rethunk()) { emit_jump(&prog, x86_return_thunk, ip); } else { EMIT1(0xC3); /* ret */ @@ -783,7 +804,7 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog, * rdi == ctx (1st arg) * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET */ - emit_indirect_jump(&prog, 1 /* rcx */, ip + (prog - start)); + emit_indirect_jump(&prog, BPF_REG_4 /* R4 -> rcx */, ip + (prog - start)); /* out: */ ctx->tail_call_indirect_label = prog - start; @@ -869,12 +890,13 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) target = array->ptrs[poke->tail_call.key]; if (target) { ret = __bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, NULL, + BPF_MOD_NOP, BPF_MOD_JUMP, + NULL, (u8 *)target->bpf_func + poke->adj_off); BUG_ON(ret < 0); ret = __bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, + BPF_MOD_JUMP, BPF_MOD_NOP, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, NULL); BUG_ON(ret < 0); @@ -1138,11 +1160,38 @@ static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 i *pprog = prog; } +static void emit_ldsx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + switch (size) { + case BPF_B: + /* movsx rax, byte ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x0F, 0xBE); + break; + case BPF_H: + /* movsx rax, word ptr [rax + r12 + off] */ + 
EMIT3(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x0F, 0xBF); + break; + case BPF_W: + /* movsx rax, dword ptr [rax + r12 + off] */ + EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x63); + break; + } + emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off); + *pprog = prog; +} + static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off); } +static void emit_ldsx_r12(u8 **prog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + emit_ldsx_index(prog, size, dst_reg, src_reg, X86_REG_R12, off); +} + /* STX: *(u8*)(dst_reg + off) = src_reg */ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -1242,12 +1291,23 @@ static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm) emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm); } -static int emit_atomic(u8 **pprog, u8 atomic_op, - u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) +static void emit_store_stack_imm64(u8 **pprog, int reg, int stack_off, u64 imm64) +{ + /* + * mov reg, imm64 + * mov QWORD PTR [rbp + stack_off], reg + */ + emit_mov_imm64(pprog, reg, imm64 >> 32, (u32) imm64); + emit_stx(pprog, BPF_DW, BPF_REG_FP, reg, stack_off); +} + +static int emit_atomic_rmw(u8 **pprog, u32 atomic_op, + u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) { u8 *prog = *pprog; - EMIT1(0xF0); /* lock prefix */ + if (atomic_op != BPF_XCHG) + EMIT1(0xF0); /* lock prefix */ maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW); @@ -1283,12 +1343,15 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, return 0; } -static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, - u32 dst_reg, u32 src_reg, u32 index_reg, int off) +static int emit_atomic_rmw_index(u8 **pprog, u32 atomic_op, u32 size, + u32 dst_reg, u32 src_reg, u32 index_reg, + int off) { u8 *prog = *pprog; - EMIT1(0xF0); /* lock prefix */ + if (atomic_op != BPF_XCHG) + EMIT1(0xF0); /* lock prefix */ + switch 
(size) { case BPF_W: EMIT1(add_3mod(0x40, dst_reg, src_reg, index_reg)); @@ -1297,7 +1360,7 @@ static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, EMIT1(add_3mod(0x48, dst_reg, src_reg, index_reg)); break; default: - pr_err("bpf_jit: 1 and 2 byte atomics are not supported\n"); + pr_err("bpf_jit: 1- and 2-byte RMW atomics are not supported\n"); return -EFAULT; } @@ -1331,16 +1394,110 @@ static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, return 0; } +static int emit_atomic_ld_st(u8 **pprog, u32 atomic_op, u32 dst_reg, + u32 src_reg, s16 off, u8 bpf_size) +{ + switch (atomic_op) { + case BPF_LOAD_ACQ: + /* dst_reg = smp_load_acquire(src_reg + off16) */ + emit_ldx(pprog, bpf_size, dst_reg, src_reg, off); + break; + case BPF_STORE_REL: + /* smp_store_release(dst_reg + off16, src_reg) */ + emit_stx(pprog, bpf_size, dst_reg, src_reg, off); + break; + default: + pr_err("bpf_jit: unknown atomic load/store opcode %02x\n", + atomic_op); + return -EFAULT; + } + + return 0; +} + +static int emit_atomic_ld_st_index(u8 **pprog, u32 atomic_op, u32 size, + u32 dst_reg, u32 src_reg, u32 index_reg, + int off) +{ + switch (atomic_op) { + case BPF_LOAD_ACQ: + /* dst_reg = smp_load_acquire(src_reg + idx_reg + off16) */ + emit_ldx_index(pprog, size, dst_reg, src_reg, index_reg, off); + break; + case BPF_STORE_REL: + /* smp_store_release(dst_reg + idx_reg + off16, src_reg) */ + emit_stx_index(pprog, size, dst_reg, src_reg, index_reg, off); + break; + default: + pr_err("bpf_jit: unknown atomic load/store opcode %02x\n", + atomic_op); + return -EFAULT; + } + + return 0; +} + +/* + * Metadata encoding for exception handling in JITed code. 
+ * + * Format of `fixup` and `data` fields in `struct exception_table_entry`: + * + * Bit layout of `fixup` (32-bit): + * + * +-----------+--------+-----------+---------+----------+ + * | 31 | 30-24 | 23-16 | 15-8 | 7-0 | + * | | | | | | + * | ARENA_ACC | Unused | ARENA_REG | DST_REG | INSN_LEN | + * +-----------+--------+-----------+---------+----------+ + * + * - INSN_LEN (8 bits): Length of faulting insn (max x86 insn = 15 bytes (fits in 8 bits)). + * - DST_REG (8 bits): Offset of dst_reg from reg2pt_regs[] (max offset = 112 (fits in 8 bits)). + * This is set to DONT_CLEAR if the insn is a store. + * - ARENA_REG (8 bits): Offset of the register that is used to calculate the + * address for load/store when accessing the arena region. + * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region. + * + * Bit layout of `data` (32-bit): + * + * +--------------+--------+--------------+ + * | 31-16 | 15-8 | 7-0 | + * | | | | + * | ARENA_OFFSET | Unused | EX_TYPE_BPF | + * +--------------+--------+--------------+ + * + * - ARENA_OFFSET (16 bits): Offset used to calculate the address for load/store when + * accessing the arena region. 
+ */ + #define DONT_CLEAR 1 +#define FIXUP_INSN_LEN_MASK GENMASK(7, 0) +#define FIXUP_REG_MASK GENMASK(15, 8) +#define FIXUP_ARENA_REG_MASK GENMASK(23, 16) +#define FIXUP_ARENA_ACCESS BIT(31) +#define DATA_ARENA_OFFSET_MASK GENMASK(31, 16) bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { - u32 reg = x->fixup >> 8; + u32 reg = FIELD_GET(FIXUP_REG_MASK, x->fixup); + u32 insn_len = FIELD_GET(FIXUP_INSN_LEN_MASK, x->fixup); + bool is_arena = !!(x->fixup & FIXUP_ARENA_ACCESS); + bool is_write = (reg == DONT_CLEAR); + unsigned long addr; + s16 off; + u32 arena_reg; + + if (is_arena) { + arena_reg = FIELD_GET(FIXUP_ARENA_REG_MASK, x->fixup); + off = FIELD_GET(DATA_ARENA_OFFSET_MASK, x->data); + addr = *(unsigned long *)((void *)regs + arena_reg) + off; + bpf_prog_report_arena_violation(is_write, addr, regs->ip); + } /* jump over faulting load and clear dest register */ if (reg != DONT_CLEAR) *(unsigned long *)((void *)regs + reg) = 0; - regs->ip += x->fixup & 0xff; + regs->ip += insn_len; + return true; } @@ -1450,8 +1607,50 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr) #define PRIV_STACK_GUARD_SZ 8 #define PRIV_STACK_GUARD_VAL 0xEB9F12345678eb9fULL -static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, - int oldproglen, struct jit_context *ctx, bool jmp_padding) +static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, + struct bpf_prog *bpf_prog) +{ + u8 *prog = *pprog; + u8 *func; + + if (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP)) { + /* The clearing sequence clobbers eax and ecx. 
*/ + EMIT1(0x50); /* push rax */ + EMIT1(0x51); /* push rcx */ + ip += 2; + + func = (u8 *)clear_bhb_loop; + ip += x86_call_depth_emit_accounting(&prog, func, ip); + + if (emit_call(&prog, func, ip)) + return -EINVAL; + EMIT1(0x59); /* pop rcx */ + EMIT1(0x58); /* pop rax */ + } + /* Insert IBHF instruction */ + if ((cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP) && + cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) || + cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW)) { + /* + * Add an Indirect Branch History Fence (IBHF). IBHF acts as a + * fence preventing branch history from before the fence from + * affecting indirect branches after the fence. This is + * specifically used in cBPF jitted code to prevent Intra-mode + * BHI attacks. The IBHF instruction is designed to be a NOP on + * hardware that doesn't need or support it. The REP and REX.W + * prefixes are required by the microcode, and they also ensure + * that the NOP is unlikely to be used in existing code. + * + * IBHF is not a valid instruction in 32-bit mode. 
+ */ + EMIT5(0xF3, 0x48, 0x0F, 0x1E, 0xF8); /* ibhf */ + } + *pprog = prog; + return 0; +} + +static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int *addrs, u8 *image, + u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) { bool tail_call_reachable = bpf_prog->aux->tail_call_reachable; struct bpf_insn *insn = bpf_prog->insnsi; @@ -1464,7 +1663,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image void __percpu *priv_stack_ptr; int i, excnt = 0; int ilen, proglen = 0; - u8 *prog = temp; + u8 *ip, *prog = temp; u32 stack_depth; int err; @@ -1480,9 +1679,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image detect_reg_usage(insn, insn_cnt, callee_regs_used); - emit_prologue(&prog, stack_depth, + emit_prologue(&prog, image, stack_depth, bpf_prog_was_classic(bpf_prog), tail_call_reachable, bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); + + bpf_prog->aux->ksym.fp_start = prog - temp; + /* Exception callback will clobber callee regs for its own use, and * restore the original callee regs from main prog's stack frame. 
*/ @@ -1532,6 +1734,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image dst_reg = X86_REG_R9; } + if (bpf_insn_is_indirect_target(env, bpf_prog, i - 1)) + EMIT_ENDBR(); + + ip = image + addrs[i - 1] + (prog - temp); + switch (insn->code) { /* ALU */ case BPF_ALU | BPF_ADD | BPF_X: @@ -1958,19 +2165,27 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_B: case BPF_STX | BPF_PROBE_MEM32 | BPF_H: case BPF_STX | BPF_PROBE_MEM32 | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: start_of_ldx = prog; - if (BPF_CLASS(insn->code) == BPF_LDX) - emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); - else + if (BPF_CLASS(insn->code) == BPF_LDX) { + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32SX) + emit_ldsx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + else + emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + } else { emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + } populate_extable: { struct exception_table_entry *ex; u8 *_insn = image + proglen + (start_of_ldx - temp); + u32 arena_reg, fixup_reg; s64 delta; if (!bpf_prog->aux->extable) @@ -1990,8 +2205,29 @@ populate_extable: ex->data = EX_TYPE_BPF; - ex->fixup = (prog - start_of_ldx) | - ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8); + /* + * src_reg/dst_reg holds the address in the arena region with upper + * 32-bits being zero because of a preceding addr_space_cast(r<n>, + * 0x0, 0x1) instruction. This address is adjusted with the addition + * of arena_vm_start (see the implementation of BPF_PROBE_MEM32 and + * BPF_PROBE_ATOMIC) before being used for the memory access. 
Pass + * the reg holding the unmodified 32-bit address to + * ex_handler_bpf(). + */ + if (BPF_CLASS(insn->code) == BPF_LDX) { + arena_reg = reg2pt_regs[src_reg]; + fixup_reg = reg2pt_regs[dst_reg]; + } else { + arena_reg = reg2pt_regs[dst_reg]; + fixup_reg = DONT_CLEAR; + } + + ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) | + FIELD_PREP(FIXUP_ARENA_REG_MASK, arena_reg) | + FIELD_PREP(FIXUP_REG_MASK, fixup_reg); + ex->fixup |= FIXUP_ARENA_ACCESS; + + ex->data |= FIELD_PREP(DATA_ARENA_OFFSET_MASK, insn->off); } break; @@ -2109,10 +2345,18 @@ populate_extable: * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" * of 4 bytes will be ignored and rbx will be zero inited. */ - ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8); + ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) | + FIELD_PREP(FIXUP_REG_MASK, reg2pt_regs[dst_reg]); } break; + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: + if (!bpf_atomic_is_load_store(insn)) { + pr_err("bpf_jit: 1- and 2-byte RMW atomics are not supported\n"); + return -EFAULT; + } + fallthrough; case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: if (insn->imm == (BPF_AND | BPF_FETCH) || @@ -2148,10 +2392,10 @@ populate_extable: EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)], add_2reg(0xC0, AUX_REG, real_src_reg)); /* Attempt to swap in new value */ - err = emit_atomic(&prog, BPF_CMPXCHG, - real_dst_reg, AUX_REG, - insn->off, - BPF_SIZE(insn->code)); + err = emit_atomic_rmw(&prog, BPF_CMPXCHG, + real_dst_reg, AUX_REG, + insn->off, + BPF_SIZE(insn->code)); if (WARN_ON(err)) return err; /* @@ -2166,25 +2410,41 @@ populate_extable: break; } - err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, - insn->off, BPF_SIZE(insn->code)); + if (bpf_atomic_is_load_store(insn)) + err = emit_atomic_ld_st(&prog, insn->imm, dst_reg, src_reg, + insn->off, BPF_SIZE(insn->code)); + else + err = emit_atomic_rmw(&prog, insn->imm, dst_reg, src_reg, + insn->off, 
BPF_SIZE(insn->code)); if (err) return err; break; + case BPF_STX | BPF_PROBE_ATOMIC | BPF_B: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_H: + if (!bpf_atomic_is_load_store(insn)) { + pr_err("bpf_jit: 1- and 2-byte RMW atomics are not supported\n"); + return -EFAULT; + } + fallthrough; case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: start_of_ldx = prog; - err = emit_atomic_index(&prog, insn->imm, BPF_SIZE(insn->code), - dst_reg, src_reg, X86_REG_R12, insn->off); + + if (bpf_atomic_is_load_store(insn)) + err = emit_atomic_ld_st_index(&prog, insn->imm, + BPF_SIZE(insn->code), dst_reg, + src_reg, X86_REG_R12, insn->off); + else + err = emit_atomic_rmw_index(&prog, insn->imm, BPF_SIZE(insn->code), + dst_reg, src_reg, X86_REG_R12, + insn->off); if (err) return err; goto populate_extable; /* call */ case BPF_JMP | BPF_CALL: { - u8 *ip = image + addrs[i - 1]; - func = (u8 *) __bpf_call_base + imm32; if (src_reg == BPF_PSEUDO_CALL && tail_call_reachable) { LOAD_TAIL_CALL_CNT_PTR(stack_depth); @@ -2208,7 +2468,8 @@ populate_extable: if (imm32) emit_bpf_tail_call_direct(bpf_prog, &bpf_prog->aux->poke_tab[imm32 - 1], - &prog, image + addrs[i - 1], + &prog, + ip, callee_regs_used, stack_depth, ctx); @@ -2217,7 +2478,7 @@ populate_extable: &prog, callee_regs_used, stack_depth, - image + addrs[i - 1], + ip, ctx); break; @@ -2381,6 +2642,9 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ break; + case BPF_JMP | BPF_JA | BPF_X: + emit_indirect_jump(&prog, insn->dst_reg, ip); + break; case BPF_JMP | BPF_JA: case BPF_JMP32 | BPF_JA: if (BPF_CLASS(insn->code) == BPF_JMP) { @@ -2467,6 +2731,11 @@ emit_jmp: seen_exit = true; /* Update cleanup_addr */ ctx->cleanup_addr = proglen; + if (bpf_prog_was_classic(bpf_prog) && + !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) { + if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog)) + return -EINVAL; + } if (bpf_prog->aux->exception_boundary) { pop_callee_regs(&prog, all_callee_regs_used); pop_r12(&prog); @@ 
-2476,6 +2745,8 @@ emit_jmp: pop_r12(&prog); } EMIT1(0xC9); /* leave */ + bpf_prog->aux->ksym.fp_end = prog - temp; + emit_return(&prog, image + addrs[i - 1] + (prog - temp)); break; @@ -2590,9 +2861,10 @@ static int get_nr_used_regs(const struct btf_func_model *m) } static void save_args(const struct btf_func_model *m, u8 **prog, - int stack_size, bool for_call_origin) + int stack_size, bool for_call_origin, u32 flags) { int arg_regs, first_off = 0, nr_regs = 0, nr_stack_slots = 0; + bool use_jmp = bpf_trampoline_use_jmp(flags); int i, j; /* Store function arguments to stack. @@ -2633,7 +2905,7 @@ static void save_args(const struct btf_func_model *m, u8 **prog, */ for (j = 0; j < arg_regs; j++) { emit_ldx(prog, BPF_DW, BPF_REG_0, BPF_REG_FP, - nr_stack_slots * 8 + 0x18); + nr_stack_slots * 8 + 16 + (!use_jmp) * 8); emit_stx(prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -stack_size); @@ -2815,13 +3087,19 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_links *tl, int stack_size, - int run_ctx_off, bool save_ret, - void *image, void *rw_image) + int run_ctx_off, int func_meta_off, bool save_ret, + void *image, void *rw_image, u64 func_meta, + int cookie_off) { - int i; + int i, cur_cookie = (cookie_off - stack_size) / 8; u8 *prog = *pprog; for (i = 0; i < tl->nr_links; i++) { + if (tl->links[i]->link.prog->call_session_cookie) { + emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, + func_meta | (cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT)); + cur_cookie--; + } if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, save_ret, image, rw_image)) return -EINVAL; @@ -2939,12 +3217,14 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im void *func_addr) { int i, ret, nr_regs = m->nr_args, stack_size = 0; - int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off; + int regs_off, func_meta_off, ip_off, 
run_ctx_off, arg_stack_off, rbx_off; struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; void *orig_call = func_addr; + int cookie_off, cookie_cnt; u8 **branches = NULL; + u64 func_meta; u8 *prog; bool save_ret; @@ -2980,7 +3260,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im * [ ... ] * RBP - regs_off [ reg_arg1 ] program's ctx pointer * - * RBP - nregs_off [ regs count ] always + * RBP - func_meta_off [ regs count, etc ] always * * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag * @@ -3003,15 +3283,20 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im stack_size += nr_regs * 8; regs_off = stack_size; - /* regs count */ + /* function metadata, such as regs count */ stack_size += 8; - nregs_off = stack_size; + func_meta_off = stack_size; if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; /* room for IP address argument */ ip_off = stack_size; + cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + /* room for session cookies */ + stack_size += cookie_cnt * 8; + cookie_off = stack_size; + stack_size += 8; rbx_off = stack_size; @@ -3027,16 +3312,21 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im * should be 16-byte aligned. Following code depend on * that stack_size is already 8-byte aligned. */ - stack_size += (stack_size % 16) ? 0 : 8; + if (bpf_trampoline_use_jmp(flags)) { + /* no rip in the "jmp" case */ + stack_size += (stack_size % 16) ? 8 : 0; + } else { + stack_size += (stack_size % 16) ? 0 : 8; + } } arg_stack_off = stack_size; - if (flags & BPF_TRAMP_F_SKIP_FRAME) { + if (flags & BPF_TRAMP_F_CALL_ORIG) { /* skip patched call instruction and point orig_call to actual * body of the kernel function. 
*/ - if (is_endbr(*(u32 *)orig_call)) + if (is_endbr(orig_call)) orig_call += ENDBR_INSN_SIZE; orig_call += X86_PATCH_SIZE; } @@ -3047,7 +3337,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* * Indirect call for bpf_struct_ops */ - emit_cfi(&prog, cfi_get_func_hash(func_addr)); + emit_cfi(&prog, image, + cfi_get_func_hash(func_addr), + cfi_get_func_arity(func_addr)); } else { /* * Direct-call fentry stub, as such it needs accounting for the @@ -3057,6 +3349,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im } EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + if (im) + im->ksym.fp_start = prog - (u8 *)rw_image; + if (!is_imm8(stack_size)) { /* sub rsp, stack_size */ EMIT3_off32(0x48, 0x81, 0xEC, stack_size); @@ -3069,23 +3364,16 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* mov QWORD PTR [rbp - rbx_off], rbx */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off); - /* Store number of argument registers of the traced function: - * mov rax, nr_regs - * mov QWORD PTR [rbp - nregs_off], rax - */ - emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_regs); - emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -nregs_off); + func_meta = nr_regs; + /* Store number of argument registers of the traced function */ + emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, func_meta); if (flags & BPF_TRAMP_F_IP_ARG) { - /* Store IP address of the traced function: - * movabsq rax, func_addr - * mov QWORD PTR [rbp - ip_off], rax - */ - emit_mov_imm64(&prog, BPF_REG_0, (long) func_addr >> 32, (u32) (long) func_addr); - emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off); + /* Store IP address of the traced function */ + emit_store_stack_imm64(&prog, BPF_REG_0, -ip_off, (long)func_addr); } - save_args(m, &prog, regs_off, false); + save_args(m, &prog, regs_off, false, flags); if (flags & BPF_TRAMP_F_CALL_ORIG) { /* arg1: mov rdi, im */ @@ -3097,9 
+3385,18 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im } } + if (bpf_fsession_cnt(tlinks)) { + /* clear all the session cookies' value */ + for (int i = 0; i < cookie_cnt; i++) + emit_store_stack_imm64(&prog, BPF_REG_0, -cookie_off + 8 * i, 0); + /* clear the return value to make sure fentry always gets 0 */ + emit_store_stack_imm64(&prog, BPF_REG_0, -8, 0); + } + if (fentry->nr_links) { - if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, - flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image)) + if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, func_meta_off, + flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image, + func_meta, cookie_off)) return -EINVAL; } @@ -3118,7 +3415,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im if (flags & BPF_TRAMP_F_CALL_ORIG) { restore_regs(m, &prog, regs_off); - save_args(m, &prog, arg_stack_off, true); + save_args(m, &prog, arg_stack_off, true, flags); if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) { /* Before calling the original function, load the @@ -3159,9 +3456,14 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im } } + /* set the "is_return" flag for fsession */ + func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT); + if (bpf_fsession_cnt(tlinks)) + emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, func_meta); + if (fexit->nr_links) { - if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, - false, image, rw_image)) { + if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, func_meta_off, + false, image, rw_image, func_meta, cookie_off)) { ret = -EINVAL; goto cleanup; } @@ -3194,7 +3496,11 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off); + EMIT1(0xC9); /* leave */ + if (im) + im->ksym.fp_end = prog - (u8 *)rw_image; + if (flags & BPF_TRAMP_F_SKIP_FRAME) { /* skip our 
return address and return to parent */ EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ @@ -3301,7 +3607,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, if (err) return err; - emit_indirect_jump(&prog, 2 /* rdx */, image + (prog - buf)); + emit_indirect_jump(&prog, BPF_REG_3 /* R3 -> rdx */, image + (prog - buf)); *pprog = prog; return 0; @@ -3368,13 +3674,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf); } -static const char *bpf_get_prog_name(struct bpf_prog *prog) -{ - if (prog->aux->ksym.prog) - return prog->aux->ksym.name; - return prog->aux->name; -} - static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size) { int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3; @@ -3398,7 +3697,7 @@ static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size if (stack_ptr[0] != PRIV_STACK_GUARD_VAL || stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) { pr_err("BPF private stack overflow/underflow detected for prog %sx\n", - bpf_get_prog_name(prog)); + bpf_jit_get_prog_name(prog)); break; } } @@ -3416,17 +3715,15 @@ struct x64_jit_data { #define MAX_PASSES 20 #define PADDING_PASSES (MAX_PASSES - 5) -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { struct bpf_binary_header *rw_header = NULL; struct bpf_binary_header *header = NULL; - struct bpf_prog *tmp, *orig_prog = prog; void __percpu *priv_stack_ptr = NULL; struct x64_jit_data *jit_data; int priv_stack_alloc_sz; int proglen, oldproglen = 0; struct jit_context ctx = {}; - bool tmp_blinded = false; bool extra_pass = false; bool padding = false; u8 *rw_image = NULL; @@ -3436,27 +3733,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) int i; if (!prog->jit_requested) - return orig_prog; - - tmp = 
bpf_jit_blind_constants(prog); - /* - * If blinding was requested and we failed during blinding, - * we must fall back to the interpreter. - */ - if (IS_ERR(tmp)) - return orig_prog; - if (tmp != prog) { - tmp_blinded = true; - prog = tmp; - } + return prog; jit_data = prog->aux->jit_data; if (!jit_data) { - jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); - if (!jit_data) { - prog = orig_prog; - goto out; - } + jit_data = kzalloc_obj(*jit_data); + if (!jit_data) + return prog; prog->aux->jit_data = jit_data; } priv_stack_ptr = prog->aux->priv_stack_ptr; @@ -3468,10 +3751,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) + 2 * PRIV_STACK_GUARD_SZ; priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 8, GFP_KERNEL); - if (!priv_stack_ptr) { - prog = orig_prog; + if (!priv_stack_ptr) goto out_priv_stack; - } priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz); prog->aux->priv_stack_ptr = priv_stack_ptr; @@ -3488,11 +3769,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) padding = true; goto skip_init_addrs; } - addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); - if (!addrs) { - prog = orig_prog; + addrs = kvmalloc_objs(*addrs, prog->len + 1); + if (!addrs) goto out_addrs; - } /* * Before first pass, make a rough estimation of addrs[] @@ -3514,7 +3793,7 @@ skip_init_addrs: for (pass = 0; pass < MAX_PASSES || image; pass++) { if (!padding && pass >= PADDING_PASSES) padding = true; - proglen = do_jit(prog, addrs, image, rw_image, oldproglen, &ctx, padding); + proglen = do_jit(env, prog, addrs, image, rw_image, oldproglen, &ctx, padding); if (proglen <= 0) { out_image: image = NULL; @@ -3523,8 +3802,6 @@ out_image: sizeof(rw_header->size)); bpf_jit_binary_pack_free(header, rw_header); } - /* Fall back to interpreter mode */ - prog = orig_prog; if (extra_pass) { prog->bpf_func = NULL; prog->jited = 0; @@ -3555,10 +3832,8 @@ out_image: header = 
bpf_jit_binary_pack_alloc(roundup(proglen, align) + extable_size, &image, align, &rw_header, &rw_image, jit_fill_hole); - if (!header) { - prog = orig_prog; + if (!header) goto out_addrs; - } prog->aux->extable = (void *) image + roundup(proglen, align); } oldproglen = proglen; @@ -3592,6 +3867,15 @@ out_image: jit_data->header = header; jit_data->rw_header = rw_header; } + + /* + * The bpf_prog_update_insn_ptrs function expects addrs to + * point to the first byte of the jitted instruction (unlike + * the bpf_prog_fill_jited_linfo below, which, for historical + * reasons, expects to point to the next instruction) + */ + bpf_prog_update_insn_ptrs(prog, addrs, image); + /* * ctx.prog_offset is used when CFI preambles put code *before* * the function. See emit_cfi(). For FineIBT specifically this code @@ -3602,8 +3886,6 @@ out_image: prog->bpf_func = (void *)image + cfi_get_offset(); prog->jited = 1; prog->jited_len = proglen - cfi_get_offset(); - } else { - prog = orig_prog; } if (!image || !prog->is_func || extra_pass) { @@ -3619,10 +3901,7 @@ out_priv_stack: kfree(jit_data); prog->aux->jit_data = NULL; } -out: - if (tmp_blinded) - bpf_jit_prog_release_other(prog, prog == orig_prog ? - tmp : orig_prog); + return prog; } @@ -3712,13 +3991,13 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp } return; #endif - WARN(1, "verification of programs using bpf_throw should have failed\n"); } void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old) { u8 *old_addr, *new_addr, *old_bypass_addr; + enum bpf_text_poke_type t; int ret; old_bypass_addr = old ? NULL : poke->bypass_addr; @@ -3731,21 +4010,22 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, * the kallsyms check. */ if (new) { + t = old_addr ? 
BPF_MOD_JUMP : BPF_MOD_NOP; ret = __bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, + t, BPF_MOD_JUMP, old_addr, new_addr); BUG_ON(ret < 0); if (!old) { ret = __bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, + BPF_MOD_JUMP, BPF_MOD_NOP, poke->bypass_addr, NULL); BUG_ON(ret < 0); } } else { + t = old_bypass_addr ? BPF_MOD_JUMP : BPF_MOD_NOP; ret = __bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - old_bypass_addr, + t, BPF_MOD_JUMP, old_bypass_addr, poke->bypass_addr); BUG_ON(ret < 0); /* let other CPUs finish the execution of program @@ -3754,9 +4034,9 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, */ if (!ret) synchronize_rcu(); + t = old_addr ? BPF_MOD_JUMP : BPF_MOD_NOP; ret = __bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, NULL); + t, BPF_MOD_NOP, old_addr, NULL); BUG_ON(ret < 0); } } @@ -3791,3 +4071,13 @@ u64 bpf_arch_uaddress_limit(void) { return 0; } + +bool bpf_jit_supports_timed_may_goto(void) +{ + return true; +} + +bool bpf_jit_supports_fsession(void) +{ + return true; +} diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index de0f9e5f9f73..852baf2e4db4 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2518,38 +2518,22 @@ bool bpf_jit_needs_zext(void) return true; } -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; - struct bpf_prog *tmp, *orig_prog = prog; int proglen, oldproglen = 0; struct jit_context ctx = {}; - bool tmp_blinded = false; u8 *image = NULL; int *addrs; int pass; int i; if (!prog->jit_requested) - return orig_prog; + return prog; - tmp = bpf_jit_blind_constants(prog); - /* - * If blinding was requested and we failed during blinding, - * we must fall back to the interpreter. 
- */ - if (IS_ERR(tmp)) - return orig_prog; - if (tmp != prog) { - tmp_blinded = true; - prog = tmp; - } - - addrs = kmalloc_array(prog->len, sizeof(*addrs), GFP_KERNEL); - if (!addrs) { - prog = orig_prog; - goto out; - } + addrs = kmalloc_objs(*addrs, prog->len); + if (!addrs) + return prog; /* * Before first pass, make a rough estimation of addrs[] @@ -2574,7 +2558,6 @@ out_image: image = NULL; if (header) bpf_jit_binary_free(header); - prog = orig_prog; goto out_addrs; } if (image) { @@ -2588,10 +2571,8 @@ out_image: if (proglen == oldproglen) { header = bpf_jit_binary_alloc(proglen, &image, 1, jit_fill_hole); - if (!header) { - prog = orig_prog; + if (!header) goto out_addrs; - } } oldproglen = proglen; cond_resched(); @@ -2604,16 +2585,10 @@ out_image: prog->bpf_func = (void *)image; prog->jited = 1; prog->jited_len = proglen; - } else { - prog = orig_prog; } out_addrs: kfree(addrs); -out: - if (tmp_blinded) - bpf_jit_prog_release_other(prog, prog == orig_prog ? - tmp : orig_prog); return prog; } diff --git a/arch/x86/net/bpf_timed_may_goto.S b/arch/x86/net/bpf_timed_may_goto.S new file mode 100644 index 000000000000..54c690cae190 --- /dev/null +++ b/arch/x86/net/bpf_timed_may_goto.S @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <linux/export.h> +#include <linux/linkage.h> +#include <asm/nospec-branch.h> + + .code64 + .section .text, "ax" + +SYM_FUNC_START(arch_bpf_timed_may_goto) + ANNOTATE_NOENDBR + + /* + * r10 passes us stack depth, load the pointer to count and timestamp + * into r10 by adding it to BPF frame pointer. + */ + leaq (%rbp, %r10, 1), %r10 + + /* Setup frame. */ + pushq %rbp + movq %rsp, %rbp + + /* Save r0-r5. */ + pushq %rax + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %r8 + + /* + * r10 has the pointer to count and timestamp, pass it as first + * argument. + */ + movq %r10, %rdi + + /* Emit call depth accounting for call below. 
*/ + CALL_DEPTH_ACCOUNT + call bpf_check_timed_may_goto + + /* BPF_REG_AX=r10 will be stored into count, so move return value to it. */ + movq %rax, %r10 + + /* Restore r5-r0. */ + popq %r8 + popq %rcx + popq %rdx + popq %rsi + popq %rdi + popq %rax + + leave + RET +SYM_FUNC_END(arch_bpf_timed_may_goto) |
