diff options
Diffstat (limited to 'kernel/bpf/verifier.c')
| -rw-r--r-- | kernel/bpf/verifier.c | 12762 |
1 files changed, 4842 insertions, 7920 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 60611df77957..7fb88e1cd7c4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -44,6 +44,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #undef BPF_LINK_TYPE }; +enum bpf_features { + BPF_FEAT_RDONLY_CAST_TO_VOID = 0, + BPF_FEAT_STREAMS = 1, + __MAX_BPF_FEAT, +}; + struct bpf_mem_alloc bpf_global_percpu_ma; static bool bpf_global_percpu_ma_set; @@ -189,9 +195,6 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 #define BPF_COMPLEXITY_LIMIT_STATES 64 -#define BPF_MAP_KEY_POISON (1ULL << 63) -#define BPF_MAP_KEY_SEEN (1ULL << 62) - #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 #define BPF_PRIV_STACK_MIN_SIZE 64 @@ -203,19 +206,11 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); -static void specialize_kfunc(struct bpf_verifier_env *env, - u32 func_id, u16 offset, unsigned long *addr); static bool is_trusted_reg(const struct bpf_reg_state *reg); - -static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) -{ - return aux->map_ptr_state.poison; -} - -static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) -{ - return aux->map_ptr_state.unpriv; -} +static inline bool in_sleepable_context(struct bpf_verifier_env *env); +static const char *non_sleepable_context_description(struct bpf_verifier_env *env); +static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg); +static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg); static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, struct bpf_map *map, @@ -227,21 +222,6 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, aux->map_ptr_state.map_ptr = map; } -static bool bpf_map_key_poisoned(const struct 
bpf_insn_aux_data *aux) -{ - return aux->map_key_state & BPF_MAP_KEY_POISON; -} - -static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux) -{ - return !(aux->map_key_state & BPF_MAP_KEY_SEEN); -} - -static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux) -{ - return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON); -} - static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) { bool poisoned = bpf_map_key_poisoned(aux); @@ -250,26 +230,8 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) (poisoned ? BPF_MAP_KEY_POISON : 0ULL); } -static bool bpf_helper_call(const struct bpf_insn *insn) -{ - return insn->code == (BPF_JMP | BPF_CALL) && - insn->src_reg == 0; -} - -static bool bpf_pseudo_call(const struct bpf_insn *insn) -{ - return insn->code == (BPF_JMP | BPF_CALL) && - insn->src_reg == BPF_PSEUDO_CALL; -} - -static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) -{ - return insn->code == (BPF_JMP | BPF_CALL) && - insn->src_reg == BPF_PSEUDO_KFUNC_CALL; -} - struct bpf_call_arg_meta { - struct bpf_map *map_ptr; + struct bpf_map_desc map; bool raw_mode; bool pkt_access; u8 release_regno; @@ -279,7 +241,6 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int dynptr_id; - int map_uid; int func_id; struct btf *btf; u32 btf_id; @@ -290,59 +251,12 @@ struct bpf_call_arg_meta { s64 const_map_key; }; -struct bpf_kfunc_call_arg_meta { - /* In parameters */ +struct bpf_kfunc_meta { struct btf *btf; - u32 func_id; - u32 kfunc_flags; - const struct btf_type *func_proto; - const char *func_name; - /* Out parameters */ - u32 ref_obj_id; - u8 release_regno; - bool r0_rdonly; - u32 ret_btf_id; - u64 r0_size; - u32 subprogno; - struct { - u64 value; - bool found; - } arg_constant; - - /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, - * generally to pass info about user-defined local kptr types to later - * verification logic - * bpf_obj_drop/bpf_percpu_obj_drop 
- * Record the local kptr type to be drop'd - * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) - * Record the local kptr type to be refcount_incr'd and use - * arg_owning_ref to determine whether refcount_acquire should be - * fallible - */ - struct btf *arg_btf; - u32 arg_btf_id; - bool arg_owning_ref; - - struct { - struct btf_field *field; - } arg_list_head; - struct { - struct btf_field *field; - } arg_rbtree_root; - struct { - enum bpf_dynptr_type type; - u32 id; - u32 ref_obj_id; - } initialized_dynptr; - struct { - u8 spi; - u8 frameno; - } iter; - struct { - struct bpf_map *ptr; - int uid; - } map; - u64 mem_size; + const struct btf_type *proto; + const char *name; + const u32 *flags; + s32 id; }; struct btf *btf_vmlinux; @@ -404,7 +318,8 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || - type == PTR_TO_MEM; + (type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) || + type == CONST_PTR_TO_MAP; } static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) @@ -422,13 +337,36 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) return rec; } -static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) +bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog) { struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux; return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL; } +static bool subprog_returns_void(struct bpf_verifier_env *env, int subprog) +{ + const struct btf_type *type, *func, *func_proto; + const struct btf *btf = env->prog->aux->btf; + u32 btf_id; + + btf_id = env->prog->aux->func_info[subprog].type_id; + + func = btf_type_by_id(btf, btf_id); + if (verifier_bug_if(!func, env, "btf_id %u not found", btf_id)) + return false; + + func_proto = btf_type_by_id(btf, func->type); + if (!func_proto) + return false; + + type = 
btf_type_skip_modifiers(btf, func_proto->type, NULL); + if (!type) + return false; + + return btf_type_is_void(type); +} + static const char *subprog_name(const struct bpf_verifier_env *env, int subprog) { struct bpf_func_info *info; @@ -440,7 +378,7 @@ static const char *subprog_name(const struct bpf_verifier_env *env, int subprog) return btf_type_name(env->prog->aux->btf, info->type_id); } -static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog) +void bpf_mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog) { struct bpf_subprog_info *info = subprog_info(env, subprog); @@ -456,7 +394,7 @@ static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog) static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { - return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); + return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK); } static bool type_is_rdonly_mem(u32 type) @@ -504,9 +442,9 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) static bool is_sync_callback_calling_kfunc(u32 btf_id); static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 btf_id); -static bool is_bpf_throw_kfunc(struct bpf_insn *insn); -static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); +static bool is_bpf_wq_set_callback_kfunc(u32 btf_id); +static bool is_task_work_add_kfunc(u32 func_id); static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { @@ -527,34 +465,36 @@ static bool is_callback_calling_function(enum bpf_func_id func_id) is_async_callback_calling_function(func_id); } -static bool is_sync_callback_calling_insn(struct bpf_insn *insn) +bool bpf_is_sync_callback_calling_insn(struct bpf_insn *insn) { return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) || (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm)); } -static bool is_async_callback_calling_insn(struct bpf_insn 
*insn) +bool bpf_is_async_callback_calling_insn(struct bpf_insn *insn) { return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) || (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm)); } -static bool is_may_goto_insn(struct bpf_insn *insn) +static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn *insn) { - return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; -} + /* bpf_timer callbacks are never sleepable. */ + if (bpf_helper_call(insn) && insn->imm == BPF_FUNC_timer_set_callback) + return false; -static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx) -{ - return is_may_goto_insn(&env->prog->insnsi[insn_idx]); + /* bpf_wq and bpf_task_work callbacks are always sleepable. */ + if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 && + (is_bpf_wq_set_callback_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) + return true; + + verifier_bug(env, "unhandled async callback in is_async_cb_sleepable"); + return false; } -static bool is_storage_get_function(enum bpf_func_id func_id) +bool bpf_is_may_goto_insn(struct bpf_insn *insn) { - return func_id == BPF_FUNC_sk_storage_get || - func_id == BPF_FUNC_inode_storage_get || - func_id == BPF_FUNC_task_storage_get || - func_id == BPF_FUNC_cgrp_storage_get; + return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; } static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, @@ -572,25 +512,6 @@ static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, return ref_obj_uses > 1; } -static bool is_cmpxchg_insn(const struct bpf_insn *insn) -{ - return BPF_CLASS(insn->code) == BPF_STX && - BPF_MODE(insn->code) == BPF_ATOMIC && - insn->imm == BPF_CMPXCHG; -} - -static int __get_spi(s32 off) -{ - return (-off - 1) / BPF_REG_SIZE; -} - -static struct bpf_func_state *func(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg) -{ - struct bpf_verifier_state *cur = 
env->cur_state; - - return cur->frame[reg->frameno]; -} static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) { @@ -616,19 +537,19 @@ static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_s return -EINVAL; } - off = reg->off + reg->var_off.value; + off = reg->var_off.value; if (off % BPF_REG_SIZE) { verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL; } - spi = __get_spi(off); + spi = bpf_get_spi(off); if (spi + 1 < nr_slots) { verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL; } - if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots)) + if (!is_spi_bounds_valid(bpf_func(env, reg), spi, nr_slots)) return -ERANGE; return spi; } @@ -659,6 +580,10 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_SKB; case DYNPTR_TYPE_XDP: return BPF_DYNPTR_TYPE_XDP; + case DYNPTR_TYPE_SKB_META: + return BPF_DYNPTR_TYPE_SKB_META; + case DYNPTR_TYPE_FILE: + return BPF_DYNPTR_TYPE_FILE; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -675,6 +600,10 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) return DYNPTR_TYPE_SKB; case BPF_DYNPTR_TYPE_XDP: return DYNPTR_TYPE_XDP; + case BPF_DYNPTR_TYPE_SKB_META: + return DYNPTR_TYPE_SKB_META; + case BPF_DYNPTR_TYPE_FILE: + return DYNPTR_TYPE_FILE; default: return 0; } @@ -682,15 +611,13 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) static bool dynptr_type_refcounted(enum bpf_dynptr_type type) { - return type == BPF_DYNPTR_TYPE_RINGBUF; + return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE; } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, bool first_slot, int dynptr_id); -static void __mark_reg_not_init(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg); static void mark_dynptr_stack_regs(struct bpf_verifier_env *env, struct bpf_reg_state *sreg1, @@ 
-716,7 +643,7 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); enum bpf_dynptr_type type; int spi, i, err; @@ -768,9 +695,6 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; - return 0; } @@ -783,39 +707,24 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat state->stack[spi - 1].slot_type[i] = STACK_INVALID; } - __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); - __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - - /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot? - * - * While we don't allow reading STACK_INVALID, it is still possible to - * do <8 byte writes marking some but not all slots as STACK_MISC. Then, - * helpers or insns can do partial read of that part without failing, - * but check_stack_range_initialized, check_stack_read_var_off, and - * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of - * the slot conservatively. Hence we need to prevent those liveness - * marking walks. - * - * This was not a problem before because STACK_INVALID is only set by - * default (where the default reg state has its reg->parent as NULL), or - * in clean_live_states after REG_LIVE_DONE (at which point - * mark_reg_read won't walk reg->parent chain), but not randomly during - * verifier state exploration (like we did above). Hence, for our case - * parentage chain will still be live (i.e. 
reg->parent may be - * non-NULL), while earlier reg->parent was NULL, so we need - * REG_LIVE_WRITTEN to screen off read marker propagation when it is - * done later on reads or by mark_dynptr_read as well to unnecessary - * mark registers in verifier state. - */ - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); } static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi, ref_obj_id, i; + /* + * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr + * is safe to do directly. + */ + if (reg->type == CONST_PTR_TO_DYNPTR) { + verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); + return -EFAULT; + } spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; @@ -847,7 +756,7 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re * dynptr */ if (state->stack[i].slot_type[0] != STACK_DYNPTR) { - verbose(env, "verifier internal error: misconfigured ref_obj_id\n"); + verifier_bug(env, "misconfigured ref_obj_id"); return -EFAULT; } if (state->stack[i].spilled_ptr.dynptr.first_slot) @@ -863,7 +772,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { if (!env->allow_ptr_leaks) - __mark_reg_not_init(env, reg); + bpf_mark_reg_not_init(env, reg); else __mark_reg_unknown(env, reg); } @@ -888,8 +797,27 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, spi = spi + 1; if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - verbose(env, "cannot overwrite referenced 
dynptr\n"); - return -EINVAL; + int ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; + int ref_cnt = 0; + + /* + * A referenced dynptr can be overwritten only if there is at + * least one other dynptr sharing the same ref_obj_id, + * ensuring the reference can still be properly released. + */ + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_DYNPTR) + continue; + if (!state->stack[i].spilled_ptr.dynptr.first_slot) + continue; + if (state->stack[i].spilled_ptr.ref_obj_id == ref_obj_id) + ref_cnt++; + } + + if (ref_cnt <= 1) { + verbose(env, "cannot overwrite referenced dynptr\n"); + return -EINVAL; + } } mark_stack_slot_scratched(env, spi); @@ -914,12 +842,8 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, /* Do not release reference state, we are destroying dynptr on stack, * not using some helper to release it. Just reset register. */ - __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); - __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - - /* Same reason as unmark_stack_slots_dynptr above */ - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); return 0; } @@ -954,7 +878,7 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int i, spi; /* This already represents first slot of initialized bpf_dynptr. 
@@ -984,7 +908,7 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_re static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); enum bpf_dynptr_type dynptr_type; int spi; @@ -1014,7 +938,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int insn_idx, struct btf *btf, u32 btf_id, int nr_slots) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi, i, j, id; spi = iter_get_spi(env, reg, nr_slots); @@ -1037,7 +961,6 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, else st->type |= PTR_UNTRUSTED; } - st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = i == 0 ? id : 0; st->iter.btf = btf; st->iter.btf_id = btf_id; @@ -1056,7 +979,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, static int unmark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi, i, j; spi = iter_get_spi(env, reg, nr_slots); @@ -1070,10 +993,7 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, if (i == 0) WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); - __mark_reg_not_init(env, st); - - /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ - st->live |= REG_LIVE_WRITTEN; + bpf_mark_reg_not_init(env, st); for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_INVALID; @@ -1087,7 +1007,7 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi, i, j; /* 
For -ERANGE (i.e. spi not falling into allocated stack slots), we @@ -1114,7 +1034,7 @@ static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env, static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct btf *btf, u32 btf_id, int nr_slots) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi, i, j; spi = iter_get_spi(env, reg, nr_slots); @@ -1148,9 +1068,10 @@ static int release_irq_state(struct bpf_verifier_state *state, int id); static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, - struct bpf_reg_state *reg, int insn_idx) + struct bpf_reg_state *reg, int insn_idx, + int kfunc_class) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); struct bpf_stack_state *slot; struct bpf_reg_state *st; int spi, i, id; @@ -1168,8 +1089,8 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ - st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = id; + st->irq.kfunc_class = kfunc_class; for (i = 0; i < BPF_REG_SIZE; i++) slot->slot_type[i] = STACK_IRQ_FLAG; @@ -1178,9 +1099,10 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, return 0; } -static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int kfunc_class) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); struct bpf_stack_state *slot; struct bpf_reg_state *st; int spi, i, err; @@ -1192,6 +1114,15 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r slot = &state->stack[spi]; st = &slot->spilled_ptr; + if (st->irq.kfunc_class != kfunc_class) { + const char *flag_kfunc = 
st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; + const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; + + verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n", + flag_kfunc, used_kfunc); + return -EINVAL; + } + err = release_irq_state(env->cur_state, st->ref_obj_id); WARN_ON_ONCE(err && err != -EACCES); if (err) { @@ -1209,10 +1140,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r return err; } - __mark_reg_not_init(env, st); - - /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ - st->live |= REG_LIVE_WRITTEN; + bpf_mark_reg_not_init(env, st); for (i = 0; i < BPF_REG_SIZE; i++) slot->slot_type[i] = STACK_INVALID; @@ -1223,7 +1151,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); struct bpf_stack_state *slot; int spi, i; @@ -1247,7 +1175,7 @@ static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bp static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); struct bpf_stack_state *slot; struct bpf_reg_state *st; int spi, i; @@ -1285,6 +1213,7 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack) case STACK_IRQ_FLAG: return true; case STACK_INVALID: + case STACK_POISON: case STACK_MISC: case STACK_ZERO: return false; @@ -1297,26 +1226,12 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack) /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. 
*/ -static bool is_spilled_reg(const struct bpf_stack_state *stack) -{ - return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL; -} -static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack) -{ - return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL && - stack->spilled_ptr.type == SCALAR_VALUE; -} - -static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack) -{ - return stack->slot_type[0] == STACK_SPILL && - stack->spilled_ptr.type == SCALAR_VALUE; -} - -/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which - * case they are equivalent, or it's STACK_ZERO, in which case we preserve - * more precise STACK_ZERO. +/* + * Mark stack slot as STACK_MISC, unless it is already: + * - STACK_INVALID, in which case they are equivalent. + * - STACK_ZERO, in which case we preserve more precise STACK_ZERO. + * - STACK_POISON, which truly forbids access to the slot. * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged * mode), we won't promote STACK_INVALID to STACK_MISC. 
In privileged case it is * unnecessary as both are considered equivalent when loading data and pruning, @@ -1327,14 +1242,14 @@ static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype) { if (*stype == STACK_ZERO) return; - if (*stype == STACK_INVALID) + if (*stype == STACK_INVALID || *stype == STACK_POISON) return; *stype = STACK_MISC; } static void scrub_spilled_slot(u8 *stype) { - if (*stype != STACK_INVALID) + if (*stype != STACK_INVALID && *stype != STACK_POISON) *stype = STACK_MISC; } @@ -1383,7 +1298,7 @@ static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size) goto out; alloc_size = kmalloc_size_roundup(size_mul(new_n, size)); - new_arr = krealloc(arr, alloc_size, GFP_KERNEL); + new_arr = krealloc(arr, alloc_size, GFP_KERNEL_ACCOUNT); if (!new_arr) { kfree(arr); return NULL; @@ -1400,15 +1315,17 @@ out: static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src) { dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs, - sizeof(struct bpf_reference_state), GFP_KERNEL); + sizeof(struct bpf_reference_state), GFP_KERNEL_ACCOUNT); if (!dst->refs) return -ENOMEM; dst->acquired_refs = src->acquired_refs; dst->active_locks = src->active_locks; dst->active_preempt_locks = src->active_preempt_locks; - dst->active_rcu_lock = src->active_rcu_lock; + dst->active_rcu_locks = src->active_rcu_locks; dst->active_irq_id = src->active_irq_id; + dst->active_lock_id = src->active_lock_id; + dst->active_lock_ptr = src->active_lock_ptr; return 0; } @@ -1417,7 +1334,7 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st size_t n = src->allocated_stack / BPF_REG_SIZE; dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!dst->stack) return -ENOMEM; @@ -1508,6 +1425,8 @@ static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum r s->ptr = ptr; state->active_locks++; + 
state->active_lock_id = id; + state->active_lock_ptr = ptr; return 0; } @@ -1545,18 +1464,37 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx) return; } +static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id) +{ + int i; + + for (i = 0; i < state->acquired_refs; i++) + if (state->refs[i].id == ptr_id) + return true; + + return false; +} + static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) { + void *prev_ptr = NULL; + u32 prev_id = 0; int i; for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].type != type) - continue; - if (state->refs[i].id == id && state->refs[i].ptr == ptr) { + if (state->refs[i].type == type && state->refs[i].id == id && + state->refs[i].ptr == ptr) { release_reference_state(state, i); state->active_locks--; + /* Reassign active lock (id, ptr). */ + state->active_lock_id = prev_id; + state->active_lock_ptr = prev_ptr; return 0; } + if (state->refs[i].type & REF_TYPE_LOCK_MASK) { + prev_id = state->refs[i].id; + prev_ptr = state->refs[i].ptr; + } } return -EINVAL; } @@ -1591,7 +1529,7 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st for (i = 0; i < state->acquired_refs; i++) { struct bpf_reference_state *s = &state->refs[i]; - if (s->type != type) + if (!(s->type & type)) continue; if (s->id == id && s->ptr == ptr) @@ -1608,8 +1546,15 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } -static void free_verifier_state(struct bpf_verifier_state *state, - bool free_self) +void bpf_clear_jmp_history(struct bpf_verifier_state *state) +{ + kfree(state->jmp_history); + state->jmp_history = NULL; + state->jmp_history_cnt = 0; +} + +void bpf_free_verifier_state(struct bpf_verifier_state *state, + bool free_self) { int i; @@ -1618,6 +1563,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, state->frame[i] = NULL; } kfree(state->refs); + bpf_clear_jmp_history(state); 
if (free_self) kfree(state); } @@ -1632,12 +1578,19 @@ static int copy_func_state(struct bpf_func_state *dst, return copy_stack_state(dst, src); } -static int copy_verifier_state(struct bpf_verifier_state *dst_state, - const struct bpf_verifier_state *src) +int bpf_copy_verifier_state(struct bpf_verifier_state *dst_state, + const struct bpf_verifier_state *src) { struct bpf_func_state *dst; int i, err; + dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, + src->jmp_history_cnt, sizeof(*dst_state->jmp_history), + GFP_KERNEL_ACCOUNT); + if (!dst_state->jmp_history) + return -ENOMEM; + dst_state->jmp_history_cnt = src->jmp_history_cnt; + /* if dst has more stack frames then src frame, free them, this is also * necessary in case of exceptional exits using bpf_throw. */ @@ -1655,16 +1608,14 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; - dst_state->insn_hist_start = src->insn_hist_start; - dst_state->insn_hist_end = src->insn_hist_end; dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; - dst_state->used_as_loop_entry = src->used_as_loop_entry; dst_state->may_goto_depth = src->may_goto_depth; + dst_state->equal_state = src->equal_state; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { - dst = kzalloc(sizeof(*dst), GFP_KERNEL); + dst = kzalloc_obj(*dst, GFP_KERNEL_ACCOUNT); if (!dst) return -ENOMEM; dst_state->frame[i] = dst; @@ -1681,7 +1632,7 @@ static u32 state_htab_size(struct bpf_verifier_env *env) return env->prog->len; } -static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx) +struct list_head *bpf_explored_state(struct bpf_verifier_env *env, int idx) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_func_state *state = cur->frame[cur->curframe]; @@ 
-1703,186 +1654,17 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta return true; } -/* Open coded iterators allow back-edges in the state graph in order to - * check unbounded loops that iterators. - * - * In is_state_visited() it is necessary to know if explored states are - * part of some loops in order to decide whether non-exact states - * comparison could be used: - * - non-exact states comparison establishes sub-state relation and uses - * read and precision marks to do so, these marks are propagated from - * children states and thus are not guaranteed to be final in a loop; - * - exact states comparison just checks if current and explored states - * are identical (and thus form a back-edge). - * - * Paper "A New Algorithm for Identifying Loops in Decompilation" - * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient - * algorithm for loop structure detection and gives an overview of - * relevant terminology. It also has helpful illustrations. - * - * [1] https://api.semanticscholar.org/CorpusID:15784067 - * - * We use a similar algorithm but because loop nested structure is - * irrelevant for verifier ours is significantly simpler and resembles - * strongly connected components algorithm from Sedgewick's textbook. - * - * Define topmost loop entry as a first node of the loop traversed in a - * depth first search starting from initial state. The goal of the loop - * tracking algorithm is to associate topmost loop entries with states - * derived from these entries. - * - * For each step in the DFS states traversal algorithm needs to identify - * the following situations: - * - * initial initial initial - * | | | - * V V V - * ... ... .---------> hdr - * | | | | - * V V | V - * cur .-> succ | .------... - * | | | | | | - * V | V | V V - * succ '-- cur | ... ... - * | | | - * | V V - * | succ <- cur - * | | - * | V - * | ... 
- * | | - * '----' - * - * (A) successor state of cur (B) successor state of cur or it's entry - * not yet traversed are in current DFS path, thus cur and succ - * are members of the same outermost loop - * - * initial initial - * | | - * V V - * ... ... - * | | - * V V - * .------... .------... - * | | | | - * V V V V - * .-> hdr ... ... ... - * | | | | | - * | V V V V - * | succ <- cur succ <- cur - * | | | - * | V V - * | ... ... - * | | | - * '----' exit - * - * (C) successor state of cur is a part of some loop but this loop - * does not include cur or successor state is not in a loop at all. - * - * Algorithm could be described as the following python code: - * - * traversed = set() # Set of traversed nodes - * entries = {} # Mapping from node to loop entry - * depths = {} # Depth level assigned to graph node - * path = set() # Current DFS path - * - * # Find outermost loop entry known for n - * def get_loop_entry(n): - * h = entries.get(n, None) - * while h in entries and entries[h] != h: - * h = entries[h] - * return h - * - * # Update n's loop entry if h's outermost entry comes - * # before n's outermost entry in current DFS path. - * def update_loop_entry(n, h): - * n1 = get_loop_entry(n) or n - * h1 = get_loop_entry(h) or h - * if h1 in path and depths[h1] <= depths[n1]: - * entries[n] = h1 - * - * def dfs(n, depth): - * traversed.add(n) - * path.add(n) - * depths[n] = depth - * for succ in G.successors(n): - * if succ not in traversed: - * # Case A: explore succ and update cur's loop entry - * # only if succ's entry is in current DFS path. - * dfs(succ, depth + 1) - * h = get_loop_entry(succ) - * update_loop_entry(n, h) - * else: - * # Case B or C depending on `h1 in path` check in update_loop_entry(). 
- * update_loop_entry(n, succ) - * path.remove(n) - * - * To adapt this algorithm for use with verifier: - * - use st->branch == 0 as a signal that DFS of succ had been finished - * and cur's loop entry has to be updated (case A), handle this in - * update_branch_counts(); - * - use st->branch > 0 as a signal that st is in the current DFS path; - * - handle cases B and C in is_state_visited(); - * - update topmost loop entry for intermediate states in get_loop_entry(). - */ -static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st) -{ - struct bpf_verifier_state *topmost = st->loop_entry, *old; - while (topmost && topmost->loop_entry && topmost != topmost->loop_entry) - topmost = topmost->loop_entry; - /* Update loop entries for intermediate states to avoid this - * traversal in future get_loop_entry() calls. - */ - while (st && st->loop_entry != topmost) { - old = st->loop_entry; - st->loop_entry = topmost; - st = old; - } - return topmost; -} - -static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) +void bpf_free_backedges(struct bpf_scc_visit *visit) { - struct bpf_verifier_state *cur1, *hdr1; - - cur1 = get_loop_entry(cur) ?: cur; - hdr1 = get_loop_entry(hdr) ?: hdr; - /* The head1->branches check decides between cases B and C in - * comment for get_loop_entry(). If hdr1->branches == 0 then - * head's topmost loop entry is not in current DFS path, - * hence 'cur' and 'hdr' are not in the same loop and there is - * no need to update cur->loop_entry. - */ - if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) { - cur->loop_entry = hdr; - hdr->used_as_loop_entry = true; - } -} - -static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) -{ - while (st) { - u32 br = --st->branches; - - /* br == 0 signals that DFS exploration for 'st' is finished, - * thus it is necessary to update parent's loop entry if it - * turned out that st is a part of some loop. 
- * This is a part of 'case A' in get_loop_entry() comment. - */ - if (br == 0 && st->parent && st->loop_entry) - update_loop_entry(st->parent, st->loop_entry); + struct bpf_scc_backedge *backedge, *next; - /* WARN_ON(br > 1) technically makes sense here, - * but see comment in push_stack(), hence: - */ - WARN_ONCE((int)br < 0, - "BUG update_branch_counts:branches_to_explore=%d\n", - br); - if (br) - break; - st = st->parent; + for (backedge = visit->backedges; backedge; backedge = next) { + bpf_free_verifier_state(&backedge->state, false); + next = backedge->next; + kfree(backedge); } + visit->backedges = NULL; } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, @@ -1896,7 +1678,7 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, return -ENOENT; if (cur) { - err = copy_verifier_state(cur, &head->st); + err = bpf_copy_verifier_state(cur, &head->st); if (err) return err; } @@ -1907,13 +1689,25 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, if (prev_insn_idx) *prev_insn_idx = head->prev_insn_idx; elem = head->next; - free_verifier_state(&head->st, false); + bpf_free_verifier_state(&head->st, false); kfree(head); env->head = elem; env->stack_size--; return 0; } +static bool error_recoverable_with_nospec(int err) +{ + /* Should only return true for non-fatal errors that are allowed to + * occur during speculative verification. For these we can insert a + * nospec and the program might still be accepted. Do not include + * something like ENOMEM because it is likely to re-occur for the next + * architectural path once it has been recovered-from in all speculative + * paths. 
+ */ + return err == -EPERM || err == -EACCES || err == -EINVAL; +} + static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, bool speculative) @@ -1922,9 +1716,9 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, struct bpf_verifier_stack_elem *elem; int err; - elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc_obj(struct bpf_verifier_stack_elem, GFP_KERNEL_ACCOUNT); if (!elem) - goto err; + return ERR_PTR(-ENOMEM); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -1932,14 +1726,14 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem->log_pos = env->log.end_pos; env->head = elem; env->stack_size++; - err = copy_verifier_state(&elem->st, cur); + err = bpf_copy_verifier_state(&elem->st, cur); if (err) - goto err; + return ERR_PTR(-ENOMEM); elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { verbose(env, "The sequence of %d jumps is too complex.\n", env->stack_size); - goto err; + return ERR_PTR(-E2BIG); } if (elem->st.parent) { ++elem->st.parent->branches; @@ -1954,15 +1748,8 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, */ } return &elem->st; -err: - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL, false)); - return NULL; } -#define CALLER_SAVED_REGS 6 static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; @@ -2025,13 +1812,6 @@ static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf static void mark_reg_known_zero(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { - if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < 
MAX_BPF_REG; regno++) - __mark_reg_not_init(env, regs + regno); - return; - } __mark_reg_known_zero(regs + regno); } @@ -2061,10 +1841,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) - reg->map_uid = reg->id; - if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + if (btf_record_has_field(map->inner_map_meta->record, + BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { reg->map_uid = reg->id; + } } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -2082,11 +1862,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno, struct btf_field_graph_root *ds_head) { - __mark_reg_known_zero(&regs[regno]); + __mark_reg_known(&regs[regno], ds_head->node_offset); regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC; regs[regno].btf = ds_head->btf; regs[regno].btf_id = ds_head->value_btf_id; - regs[regno].off = ds_head->node_offset; } static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) @@ -2103,7 +1882,8 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg) { return base_type(reg->type) == PTR_TO_MEM && - (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP); + (reg->type & + (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META)); } /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access.
*/ @@ -2116,7 +1896,6 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, */ return reg->type == which && reg->id == 0 && - reg->off == 0 && tnum_equals_const(reg->var_off, 0); } @@ -2150,6 +1929,18 @@ static void __mark_reg32_unbounded(struct bpf_reg_state *reg) reg->u32_max_value = U32_MAX; } +static void reset_reg64_and_tnum(struct bpf_reg_state *reg) +{ + __mark_reg64_unbounded(reg); + reg->var_off = tnum_unknown; +} + +static void reset_reg32_and_tnum(struct bpf_reg_state *reg) +{ + __mark_reg32_unbounded(reg); + reg->var_off = tnum_unknown; +} + static void __update_reg32_bounds(struct bpf_reg_state *reg) { struct tnum var32_off = tnum_subreg(reg->var_off); @@ -2167,6 +1958,9 @@ static void __update_reg32_bounds(struct bpf_reg_state *reg) static void __update_reg64_bounds(struct bpf_reg_state *reg) { + u64 tnum_next, tmax; + bool umin_in_tnum; + /* min signed is max(sign bit) | min(other bits) */ reg->smin_value = max_t(s64, reg->smin_value, reg->var_off.value | (reg->var_off.mask & S64_MIN)); @@ -2176,6 +1970,33 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg) reg->umin_value = max(reg->umin_value, reg->var_off.value); reg->umax_value = min(reg->umax_value, reg->var_off.value | reg->var_off.mask); + + /* Check if u64 and tnum overlap in a single value */ + tnum_next = tnum_step(reg->var_off, reg->umin_value); + umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value; + tmax = reg->var_off.value | reg->var_off.mask; + if (umin_in_tnum && tnum_next > reg->umax_value) { + /* The u64 range and the tnum only overlap in umin. + * u64: ---[xxxxxx]----- + * tnum: --xx----------x- + */ + ___mark_reg_known(reg, reg->umin_value); + } else if (!umin_in_tnum && tnum_next == tmax) { + /* The u64 range and the tnum only overlap in the maximum value + * represented by the tnum, called tmax. 
+ * u64: ---[xxxxxx]----- + * tnum: xx-----x-------- + */ + ___mark_reg_known(reg, tmax); + } else if (!umin_in_tnum && tnum_next <= reg->umax_value && + tnum_step(reg->var_off, tnum_next) > reg->umax_value) { + /* The u64 range and the tnum only overlap in between umin + * (excluded) and umax. + * u64: ---[xxxxxx]----- + * tnum: xx----x-------x- + */ + ___mark_reg_known(reg, tnum_next); + } } static void __update_reg_bounds(struct bpf_reg_state *reg) @@ -2185,7 +2006,7 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) } /* Uses signed min/max values to inform unsigned, and vice-versa */ -static void __reg32_deduce_bounds(struct bpf_reg_state *reg) +static void deduce_bounds_32_from_64(struct bpf_reg_state *reg) { /* If upper 32 bits of u64/s64 range don't change, we can use lower 32 * bits to improve our u32/s32 boundaries. @@ -2255,6 +2076,10 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); } +} + +static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) +{ /* if u32 range forms a valid s32 range (due to matching sign bit), * try to learn from that */ @@ -2269,10 +2094,34 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value); reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value); + } else { + if (reg->u32_max_value < (u32)reg->s32_min_value) { + /* See __reg64_deduce_bounds() for detailed explanation. 
+ * Refine ranges in the following situation: + * + * 0 U32_MAX + * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxx s32 range xxxxxxxxx] [xxxxxxx| + * 0 S32_MAX S32_MIN -1 + */ + reg->s32_min_value = (s32)reg->u32_min_value; + reg->u32_max_value = min_t(u32, reg->u32_max_value, reg->s32_max_value); + } else if ((u32)reg->s32_max_value < reg->u32_min_value) { + /* + * 0 U32_MAX + * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxxxxxx] [xxxxxxxxxxxx s32 range | + * 0 S32_MAX S32_MIN -1 + */ + reg->s32_max_value = (s32)reg->u32_max_value; + reg->u32_min_value = max_t(u32, reg->u32_min_value, reg->s32_min_value); + } } } -static void __reg64_deduce_bounds(struct bpf_reg_state *reg) +static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) { /* If u64 range forms a valid s64 range (due to matching sign bit), * try to learn from that. Let's do a bit of ASCII art to see when @@ -2352,10 +2201,62 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) if ((u64)reg->smin_value <= (u64)reg->smax_value) { reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value); reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value); + } else { + /* If the s64 range crosses the sign boundary, then it's split + * between the beginning and end of the U64 domain. In that + * case, we can derive new bounds if the u64 range overlaps + * with only one end of the s64 range. + * + * In the following example, the u64 range overlaps only with + * positive portion of the s64 range. + * + * 0 U64_MAX + * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxx s64 range xxxxxxxxx] [xxxxxxx| + * 0 S64_MAX S64_MIN -1 + * + * We can thus derive the following new s64 and u64 ranges. 
+ * + * 0 U64_MAX + * | [xxxxxx u64 range xxxxx] | + * |----------------------------|----------------------------| + * | [xxxxxx s64 range xxxxx] | + * 0 S64_MAX S64_MIN -1 + * + * If they overlap in two places, we can't derive anything + * because reg_state can't represent two ranges per numeric + * domain. + * + * 0 U64_MAX + * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx| + * 0 S64_MAX S64_MIN -1 + * + * The first condition below corresponds to the first diagram + * above. + */ + if (reg->umax_value < (u64)reg->smin_value) { + reg->smin_value = (s64)reg->umin_value; + reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value); + } else if ((u64)reg->smax_value < reg->umin_value) { + /* This second condition considers the case where the u64 range + * overlaps with the negative portion of the s64 range: + * + * 0 U64_MAX + * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxxxxxx] [xxxxxxxxxxxx s64 range | + * 0 S64_MAX S64_MIN -1 + */ + reg->smax_value = (s64)reg->umax_value; + reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value); + } } } -static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) +static void deduce_bounds_64_from_32(struct bpf_reg_state *reg) { /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit * values on both sides of 64-bit range in hope to have tighter range. 
@@ -2383,20 +2284,6 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) reg->smin_value = max_t(s64, reg->smin_value, new_smin); reg->smax_value = min_t(s64, reg->smax_value, new_smax); - /* if s32 can be treated as valid u32 range, we can use it as well */ - if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { - /* s32 -> u64 tightening */ - new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; - new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; - reg->umin_value = max_t(u64, reg->umin_value, new_umin); - reg->umax_value = min_t(u64, reg->umax_value, new_umax); - /* s32 -> s64 tightening */ - new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; - new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; - reg->smin_value = max_t(s64, reg->smin_value, new_smin); - reg->smax_value = min_t(s64, reg->smax_value, new_smax); - } - /* Here we would like to handle a special case after sign extending load, * when upper bits for a 64-bit range are all 1s or all 0s. * @@ -2438,9 +2325,10 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) static void __reg_deduce_bounds(struct bpf_reg_state *reg) { - __reg32_deduce_bounds(reg); - __reg64_deduce_bounds(reg); - __reg_deduce_mixed_bounds(reg); + deduce_bounds_64_from_64(reg); + deduce_bounds_32_from_64(reg); + deduce_bounds_32_from_32(reg); + deduce_bounds_64_from_32(reg); } /* Attempts to improve var_off based on unsigned min/max information */ @@ -2456,8 +2344,13 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } +static bool range_bounds_violation(struct bpf_reg_state *reg); + static void reg_bounds_sync(struct bpf_reg_state *reg) { + /* If the input reg_state is invalid, we can exit early */ + if (range_bounds_violation(reg)) + return; /* We might have learned new bounds from the var_off. 
*/ __update_reg_bounds(reg); /* We might have learned something about the sign bit. */ @@ -2472,50 +2365,66 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) __update_reg_bounds(reg); } +static bool range_bounds_violation(struct bpf_reg_state *reg) +{ + return (reg->umin_value > reg->umax_value || reg->smin_value > reg->smax_value || + reg->u32_min_value > reg->u32_max_value || + reg->s32_min_value > reg->s32_max_value); +} + +static bool const_tnum_range_mismatch(struct bpf_reg_state *reg) +{ + u64 uval = reg->var_off.value; + s64 sval = (s64)uval; + + if (!tnum_is_const(reg->var_off)) + return false; + + return reg->umin_value != uval || reg->umax_value != uval || + reg->smin_value != sval || reg->smax_value != sval; +} + +static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg) +{ + u32 uval32 = tnum_subreg(reg->var_off).value; + s32 sval32 = (s32)uval32; + + if (!tnum_subreg_is_const(reg->var_off)) + return false; + + return reg->u32_min_value != uval32 || reg->u32_max_value != uval32 || + reg->s32_min_value != sval32 || reg->s32_max_value != sval32; +} + static int reg_bounds_sanity_check(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *ctx) { const char *msg; - if (reg->umin_value > reg->umax_value || - reg->smin_value > reg->smax_value || - reg->u32_min_value > reg->u32_max_value || - reg->s32_min_value > reg->s32_max_value) { - msg = "range bounds violation"; - goto out; + if (range_bounds_violation(reg)) { + msg = "range bounds violation"; + goto out; } - if (tnum_is_const(reg->var_off)) { - u64 uval = reg->var_off.value; - s64 sval = (s64)uval; - - if (reg->umin_value != uval || reg->umax_value != uval || - reg->smin_value != sval || reg->smax_value != sval) { - msg = "const tnum out of sync with range bounds"; - goto out; - } + if (const_tnum_range_mismatch(reg)) { + msg = "const tnum out of sync with range bounds"; + goto out; } - if (tnum_subreg_is_const(reg->var_off)) { - u32 uval32 = 
tnum_subreg(reg->var_off).value; - s32 sval32 = (s32)uval32; - - if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 || - reg->s32_min_value != sval32 || reg->s32_max_value != sval32) { - msg = "const subreg tnum out of sync with range bounds"; - goto out; - } + if (const_tnum_range_mismatch_32(reg)) { + msg = "const subreg tnum out of sync with range bounds"; + goto out; } return 0; out: - verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " - "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n", - ctx, msg, reg->umin_value, reg->umax_value, - reg->smin_value, reg->smax_value, - reg->u32_min_value, reg->u32_max_value, - reg->s32_min_value, reg->s32_max_value, - reg->var_off.value, reg->var_off.mask); + verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " + "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)", + ctx, msg, reg->umin_value, reg->umax_value, + reg->smin_value, reg->smax_value, + reg->u32_min_value, reg->u32_max_value, + reg->s32_min_value, reg->s32_max_value, + reg->var_off.value, reg->var_off.mask); if (env->test_reg_invariants) return -EFAULT; __mark_reg_unbounded(reg); @@ -2547,7 +2456,7 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg) } /* Mark a register as having a completely unknown (scalar) value. 
*/ -static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg) +void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg) { /* * Clear type, off, and union(map_ptr, range) and @@ -2569,20 +2478,13 @@ static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg) static void __mark_reg_unknown(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - __mark_reg_unknown_imprecise(reg); + bpf_mark_reg_unknown_imprecise(reg); reg->precise = !env->bpf_capable; } static void mark_reg_unknown(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { - if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose(env, "mark_reg_unknown(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs except FP */ - for (regno = 0; regno < BPF_REG_FP; regno++) - __mark_reg_not_init(env, regs + regno); - return; - } __mark_reg_unknown(env, regs + regno); } @@ -2605,42 +2507,40 @@ static int __mark_reg_s32_range(struct bpf_verifier_env *env, return reg_bounds_sanity_check(env, reg, "s32_range"); } -static void __mark_reg_not_init(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +void bpf_mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { __mark_reg_unknown(env, reg); reg->type = NOT_INIT; } -static void mark_reg_not_init(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, u32 regno) +static int mark_btf_ld_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno, + enum bpf_reg_type reg_type, + struct btf *btf, u32 btf_id, + enum bpf_type_flag flag) { - if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose(env, "mark_reg_not_init(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs except FP */ - for (regno = 0; regno < BPF_REG_FP; regno++) - __mark_reg_not_init(env, regs + regno); - return; - } - __mark_reg_not_init(env, regs + regno); -} - -static void mark_btf_ld_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, u32 regno, - enum bpf_reg_type 
reg_type, - struct btf *btf, u32 btf_id, - enum bpf_type_flag flag) -{ - if (reg_type == SCALAR_VALUE) { + switch (reg_type) { + case SCALAR_VALUE: mark_reg_unknown(env, regs, regno); - return; + return 0; + case PTR_TO_BTF_ID: + mark_reg_known_zero(env, regs, regno); + regs[regno].type = PTR_TO_BTF_ID | flag; + regs[regno].btf = btf; + regs[regno].btf_id = btf_id; + if (type_may_be_null(flag)) + regs[regno].id = ++env->id_gen; + return 0; + case PTR_TO_MEM: + mark_reg_known_zero(env, regs, regno); + regs[regno].type = PTR_TO_MEM | flag; + regs[regno].mem_size = 0; + return 0; + default: + verifier_bug(env, "unexpected reg_type %d in %s\n", reg_type, __func__); + return -EFAULT; } - mark_reg_known_zero(env, regs, regno); - regs[regno].type = PTR_TO_BTF_ID | flag; - regs[regno].btf = btf; - regs[regno].btf_id = btf_id; - if (type_may_be_null(flag)) - regs[regno].id = ++env->id_gen; } #define DEF_NOT_SUBREG (0) @@ -2651,9 +2551,7 @@ static void init_reg_state(struct bpf_verifier_env *env, int i; for (i = 0; i < MAX_BPF_REG; i++) { - mark_reg_not_init(env, regs, i); - regs[i].live = REG_LIVE_NONE; - regs[i].parent = NULL; + bpf_mark_reg_not_init(env, &regs[i]); regs[i].subreg_def = DEF_NOT_SUBREG; } @@ -2665,10 +2563,13 @@ static void init_reg_state(struct bpf_verifier_env *env, static struct bpf_retval_range retval_range(s32 minval, s32 maxval) { - return (struct bpf_retval_range){ minval, maxval }; + /* + * return_32bit is set to false by default and set explicitly + * by the caller when necessary.
+ */ + return (struct bpf_retval_range){ minval, maxval, false }; } -#define BPF_MAIN_FUNC (-1) static void init_func_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int callsite, int frameno, int subprogno) @@ -2689,9 +2590,9 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, struct bpf_verifier_stack_elem *elem; struct bpf_func_state *frame; - elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc_obj(struct bpf_verifier_stack_elem, GFP_KERNEL_ACCOUNT); if (!elem) - goto err; + return ERR_PTR(-ENOMEM); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2703,44 +2604,27 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, verbose(env, "The sequence of %d jumps is too complex for async cb.\n", env->stack_size); - goto err; + return ERR_PTR(-E2BIG); } - /* Unlike push_stack() do not copy_verifier_state(). + /* Unlike push_stack() do not bpf_copy_verifier_state(). * The caller state doesn't matter. * This is async callback. It starts in a fresh stack. * Initialize it similar to do_check_common(). - * But we do need to make sure to not clobber insn_hist, so we keep - * chaining insn_hist_start/insn_hist_end indices as for a normal - * child state. 
*/ elem->st.branches = 1; elem->st.in_sleepable = is_sleepable; - elem->st.insn_hist_start = env->cur_state->insn_hist_end; - elem->st.insn_hist_end = elem->st.insn_hist_start; - frame = kzalloc(sizeof(*frame), GFP_KERNEL); + frame = kzalloc_obj(*frame, GFP_KERNEL_ACCOUNT); if (!frame) - goto err; + return ERR_PTR(-ENOMEM); init_func_state(env, frame, BPF_MAIN_FUNC /* callsite */, 0 /* frameno within this callchain */, subprog /* subprog number within this prog */); elem->st.frame[0] = frame; return &elem->st; -err: - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL, false)); - return NULL; } -enum reg_arg_type { - SRC_OP, /* register is used as source operand */ - DST_OP, /* register is used as destination operand */ - DST_OP_NO_MARK /* same as above, check only, don't mark */ -}; - static int cmp_subprogs(const void *a, const void *b) { return ((struct bpf_subprog_info *)a)->start - @@ -2748,7 +2632,7 @@ static int cmp_subprogs(const void *a, const void *b) } /* Find subprogram that contains instruction at 'off' */ -static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off) +struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *vals = env->subprog_info; int l, r, m; @@ -2769,11 +2653,11 @@ static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env } /* Find subprogram that starts exactly at 'off' */ -static int find_subprog(struct bpf_verifier_env *env, int off) +int bpf_find_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *p; - p = find_containing_subprog(env, off); + p = bpf_find_containing_subprog(env, off); if (!p || p->start != off) return -ENOENT; return p - env->subprog_info; @@ -2788,7 +2672,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off) verbose(env, "call to invalid destination\n"); return -EINVAL; } 
- ret = find_subprog(env, off); + ret = bpf_find_subprog(env, off); if (ret >= 0) return ret; if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { @@ -2864,33 +2748,14 @@ static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env) return ret; } -#define MAX_KFUNC_DESCS 256 #define MAX_KFUNC_BTFS 256 -struct bpf_kfunc_desc { - struct btf_func_model func_model; - u32 func_id; - s32 imm; - u16 offset; - unsigned long addr; -}; - struct bpf_kfunc_btf { struct btf *btf; struct module *module; u16 offset; }; -struct bpf_kfunc_desc_tab { - /* Sorted by func_id (BTF ID) and offset (fd_array offset) during - * verification. JITs do lookups by bpf_insn, where func_id may not be - * available, therefore at the end of verification do_misc_fixups() - * sorts this by imm and offset. - */ - struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS]; - u32 nr_descs; -}; - struct bpf_kfunc_btf_tab { struct bpf_kfunc_btf descs[MAX_KFUNC_BTFS]; u32 nr_descs; @@ -2913,7 +2778,7 @@ static int kfunc_btf_cmp_by_off(const void *a, const void *b) return d0->offset - d1->offset; } -static const struct bpf_kfunc_desc * +static struct bpf_kfunc_desc * find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset) { struct bpf_kfunc_desc desc = { @@ -3032,16 +2897,105 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) return btf_vmlinux ?: ERR_PTR(-ENOENT); } -static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) +#define KF_IMPL_SUFFIX "_impl" + +static const struct btf_type *find_kfunc_impl_proto(struct bpf_verifier_env *env, + struct btf *btf, + const char *func_name) +{ + char *buf = env->tmp_str_buf; + const struct btf_type *func; + s32 impl_id; + int len; + + len = snprintf(buf, TMP_STR_BUF_LEN, "%s%s", func_name, KF_IMPL_SUFFIX); + if (len < 0 || len >= TMP_STR_BUF_LEN) { + verbose(env, "function name %s%s is too long\n", func_name, KF_IMPL_SUFFIX); + return NULL; + } + + impl_id = btf_find_by_name_kind(btf, buf, 
BTF_KIND_FUNC); + if (impl_id <= 0) { + verbose(env, "cannot find function %s in BTF\n", buf); + return NULL; + } + + func = btf_type_by_id(btf, impl_id); + + return btf_type_by_id(btf, func->type); +} + +static int fetch_kfunc_meta(struct bpf_verifier_env *env, + s32 func_id, + s16 offset, + struct bpf_kfunc_meta *kfunc) { const struct btf_type *func, *func_proto; + const char *func_name; + u32 *kfunc_flags; + struct btf *btf; + + if (func_id <= 0) { + verbose(env, "invalid kernel function btf_id %d\n", func_id); + return -EINVAL; + } + + btf = find_kfunc_desc_btf(env, offset); + if (IS_ERR(btf)) { + verbose(env, "failed to find BTF for kernel function\n"); + return PTR_ERR(btf); + } + + /* + * Note that kfunc_flags may be NULL at this point, which + * means that we couldn't find func_id in any relevant + * kfunc_id_set. This most likely indicates an invalid kfunc + * call. However we don't fail with an error here, + * and let the caller decide what to do with NULL kfunc->flags. + */ + kfunc_flags = btf_kfunc_flags(btf, func_id, env->prog); + + func = btf_type_by_id(btf, func_id); + if (!func || !btf_type_is_func(func)) { + verbose(env, "kernel btf_id %d is not a function\n", func_id); + return -EINVAL; + } + + func_name = btf_name_by_offset(btf, func->name_off); + + /* + * An actual prototype of a kfunc with KF_IMPLICIT_ARGS flag + * can be found through the counterpart _impl kfunc. 
+ */ + if (kfunc_flags && (*kfunc_flags & KF_IMPLICIT_ARGS)) + func_proto = find_kfunc_impl_proto(env, btf, func_name); + else + func_proto = btf_type_by_id(btf, func->type); + + if (!func_proto || !btf_type_is_func_proto(func_proto)) { + verbose(env, "kernel function btf_id %d does not have a valid func_proto\n", + func_id); + return -EINVAL; + } + + memset(kfunc, 0, sizeof(*kfunc)); + kfunc->btf = btf; + kfunc->id = func_id; + kfunc->name = func_name; + kfunc->proto = func_proto; + kfunc->flags = kfunc_flags; + + return 0; +} + +int bpf_add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, u16 offset) +{ struct bpf_kfunc_btf_tab *btf_tab; + struct btf_func_model func_model; struct bpf_kfunc_desc_tab *tab; struct bpf_prog_aux *prog_aux; + struct bpf_kfunc_meta kfunc; struct bpf_kfunc_desc *desc; - const char *func_name; - struct btf *desc_btf; - unsigned long call_imm; unsigned long addr; int err; @@ -3069,7 +3023,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -EINVAL; } - tab = kzalloc(sizeof(*tab), GFP_KERNEL); + tab = kzalloc_obj(*tab, GFP_KERNEL_ACCOUNT); if (!tab) return -ENOMEM; prog_aux->kfunc_tab = tab; @@ -3085,18 +3039,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return 0; if (!btf_tab && offset) { - btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL); + btf_tab = kzalloc_obj(*btf_tab, GFP_KERNEL_ACCOUNT); if (!btf_tab) return -ENOMEM; prog_aux->kfunc_btf_tab = btf_tab; } - desc_btf = find_kfunc_desc_btf(env, offset); - if (IS_ERR(desc_btf)) { - verbose(env, "failed to find BTF for kernel function\n"); - return PTR_ERR(desc_btf); - } - if (find_kfunc_desc(env->prog, func_id, offset)) return 0; @@ -3105,39 +3053,15 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -E2BIG; } - func = btf_type_by_id(desc_btf, func_id); - if (!func || !btf_type_is_func(func)) { - verbose(env, "kernel btf_id %u is not a function\n", - func_id); - 
return -EINVAL; - } - func_proto = btf_type_by_id(desc_btf, func->type); - if (!func_proto || !btf_type_is_func_proto(func_proto)) { - verbose(env, "kernel function btf_id %u does not have a valid func_proto\n", - func_id); - return -EINVAL; - } + err = fetch_kfunc_meta(env, func_id, offset, &kfunc); + if (err) + return err; - func_name = btf_name_by_offset(desc_btf, func->name_off); - addr = kallsyms_lookup_name(func_name); + addr = kallsyms_lookup_name(kfunc.name); if (!addr) { - verbose(env, "cannot find address for kernel function %s\n", - func_name); + verbose(env, "cannot find address for kernel function %s\n", kfunc.name); return -EINVAL; } - specialize_kfunc(env, func_id, offset, &addr); - - if (bpf_jit_supports_far_kfunc_call()) { - call_imm = func_id; - } else { - call_imm = BPF_CALL_IMM(addr); - /* Check whether the relative offset overflows desc->imm */ - if ((unsigned long)(s32)call_imm != call_imm) { - verbose(env, "address of kernel function %s is out of range\n", - func_name); - return -EINVAL; - } - } if (bpf_dev_bound_kfunc_id(func_id)) { err = bpf_dev_bound_kfunc_check(&env->log, prog_aux); @@ -3145,42 +3069,18 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return err; } + err = btf_distill_func_proto(&env->log, kfunc.btf, kfunc.proto, kfunc.name, &func_model); + if (err) + return err; + desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; - desc->imm = call_imm; desc->offset = offset; desc->addr = addr; - err = btf_distill_func_proto(&env->log, desc_btf, - func_proto, func_name, - &desc->func_model); - if (!err) - sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), - kfunc_desc_cmp_by_id_off, NULL); - return err; -} - -static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b) -{ - const struct bpf_kfunc_desc *d0 = a; - const struct bpf_kfunc_desc *d1 = b; - - if (d0->imm != d1->imm) - return d0->imm < d1->imm ? -1 : 1; - if (d0->offset != d1->offset) - return d0->offset < d1->offset ? 
-1 : 1; - return 0; -} - -static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog) -{ - struct bpf_kfunc_desc_tab *tab; - - tab = prog->aux->kfunc_tab; - if (!tab) - return; - + desc->func_model = func_model; sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), - kfunc_desc_cmp_by_imm_off, NULL); + kfunc_desc_cmp_by_id_off, NULL); + return 0; } bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) @@ -3188,24 +3088,6 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) return !!prog->aux->kfunc_tab; } -const struct btf_func_model * -bpf_jit_find_kfunc_model(const struct bpf_prog *prog, - const struct bpf_insn *insn) -{ - const struct bpf_kfunc_desc desc = { - .imm = insn->imm, - .offset = insn->off, - }; - const struct bpf_kfunc_desc *res; - struct bpf_kfunc_desc_tab *tab; - - tab = prog->aux->kfunc_tab; - res = bsearch(&desc, tab->descs, tab->nr_descs, - sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off); - - return res ? &res->func_model : NULL; -} - static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; @@ -3230,7 +3112,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn)) ret = add_subprog(env, i + insn->imm + 1); else - ret = add_kfunc_call(env, insn->imm, insn->off); + ret = bpf_add_kfunc_call(env, insn->imm, insn->off); if (ret < 0) return ret; @@ -3252,7 +3134,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) if (env->subprog_info[i].start != ex_cb_insn) continue; env->exception_callback_subprog = i; - mark_subprog_exc_cb(env, i); + bpf_mark_subprog_exc_cb(env, i); break; } } @@ -3293,12 +3175,13 @@ static int check_subprogs(struct bpf_verifier_env *env) subprog[cur_subprog].has_ld_abs = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; - if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + if (BPF_OP(code) == BPF_CALL) goto next; - if (code == (BPF_JMP32 
| BPF_JA)) - off = i + insn[i].imm + 1; - else - off = i + insn[i].off + 1; + if (BPF_OP(code) == BPF_EXIT) { + subprog[cur_subprog].exit_idx = i; + goto next; + } + off = i + bpf_jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -3324,72 +3207,101 @@ next: return 0; } -/* Parentage chain of this register (or stack slot) should take care of all - * issues like callee-saved registers, stack slot allocation time, etc. +/* + * Sort subprogs in topological order so that leaf subprogs come first and + * their callers come later. This is a DFS post-order traversal of the call + * graph. Scan only reachable instructions (those in the computed postorder) of + * the current subprog to discover callees (direct subprogs and sync + * callbacks). */ -static int mark_reg_read(struct bpf_verifier_env *env, - const struct bpf_reg_state *state, - struct bpf_reg_state *parent, u8 flag) +static int sort_subprogs_topo(struct bpf_verifier_env *env) { - bool writes = parent == state->parent; /* Observe write marks */ - int cnt = 0; + struct bpf_subprog_info *si = env->subprog_info; + int *insn_postorder = env->cfg.insn_postorder; + struct bpf_insn *insn = env->prog->insnsi; + int cnt = env->subprog_cnt; + int *dfs_stack = NULL; + int top = 0, order = 0; + int i, ret = 0; + u8 *color = NULL; + + color = kvzalloc_objs(*color, cnt, GFP_KERNEL_ACCOUNT); + dfs_stack = kvmalloc_objs(*dfs_stack, cnt, GFP_KERNEL_ACCOUNT); + if (!color || !dfs_stack) { + ret = -ENOMEM; + goto out; + } - while (parent) { - /* if read wasn't screened by an earlier write ... */ - if (writes && state->live & REG_LIVE_WRITTEN) - break; - if (parent->live & REG_LIVE_DONE) { - verbose(env, "verifier BUG type %s var_off %lld off %d\n", - reg_type_str(env, parent->type), - parent->var_off.value, parent->off); - return -EFAULT; + /* + * DFS post-order traversal. 
+ * Color values: 0 = unvisited, 1 = on stack, 2 = done. + */ + for (i = 0; i < cnt; i++) { + if (color[i]) + continue; + color[i] = 1; + dfs_stack[top++] = i; + + while (top > 0) { + int cur = dfs_stack[top - 1]; + int po_start = si[cur].postorder_start; + int po_end = si[cur + 1].postorder_start; + bool pushed = false; + int j; + + for (j = po_start; j < po_end; j++) { + int idx = insn_postorder[j]; + int callee; + + if (!bpf_pseudo_call(&insn[idx]) && !bpf_pseudo_func(&insn[idx])) + continue; + callee = bpf_find_subprog(env, idx + insn[idx].imm + 1); + if (callee < 0) { + ret = -EFAULT; + goto out; + } + if (color[callee] == 2) + continue; + if (color[callee] == 1) { + if (bpf_pseudo_func(&insn[idx])) + continue; + verbose(env, "recursive call from %s() to %s()\n", + subprog_name(env, cur), + subprog_name(env, callee)); + ret = -EINVAL; + goto out; + } + color[callee] = 1; + dfs_stack[top++] = callee; + pushed = true; + break; + } + + if (!pushed) { + color[cur] = 2; + env->subprog_topo_order[order++] = cur; + top--; + } } - /* The first condition is more likely to be true than the - * second, checked it first. - */ - if ((parent->live & REG_LIVE_READ) == flag || - parent->live & REG_LIVE_READ64) - /* The parentage chain never changes and - * this parent was already marked as LIVE_READ. - * There is no need to keep walking the chain again and - * keep re-marking all parents as LIVE_READ. - * This case happens when the same register is read - * multiple times without writes into it in-between. - * Also, if parent has the stronger REG_LIVE_READ64 set, - * then no need to set the weak REG_LIVE_READ32. - */ - break; - /* ... then we depend on parent's value */ - parent->live |= flag; - /* REG_LIVE_READ64 overrides REG_LIVE_READ32. 
*/ - if (flag == REG_LIVE_READ64) - parent->live &= ~REG_LIVE_READ32; - state = parent; - parent = state->parent; - writes = true; - cnt++; - } - - if (env->longest_mark_read_walk < cnt) - env->longest_mark_read_walk = cnt; - return 0; + } + + if (env->log.level & BPF_LOG_LEVEL2) + for (i = 0; i < cnt; i++) + verbose(env, "topo_order[%d] = %s\n", + i, subprog_name(env, env->subprog_topo_order[i])); +out: + kvfree(dfs_stack); + kvfree(color); + return ret; } static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi, int nr_slots) { - struct bpf_func_state *state = func(env, reg); - int err, i; - - for (i = 0; i < nr_slots; i++) { - struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr; - - err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); - if (err) - return err; + int i; + for (i = 0; i < nr_slots; i++) mark_stack_slot_scratched(env, spi - i); - } return 0; } @@ -3433,8 +3345,8 @@ static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. */ -static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, - u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) +bool bpf_is_reg64(struct bpf_insn *insn, + u32 regno, struct bpf_reg_state *reg, enum bpf_reg_arg_type t) { u8 code, class, op; @@ -3483,7 +3395,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (class == BPF_STX) { - /* BPF_STX (including atomic variants) has multiple source + /* BPF_STX (including atomic variants) has one or more source * operands, one of which is a ptr. Check whether the caller is * asking about it. */ @@ -3519,41 +3431,6 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, return true; } -/* Return the regno defined by the insn, or -1. 
*/ -static int insn_def_regno(const struct bpf_insn *insn) -{ - switch (BPF_CLASS(insn->code)) { - case BPF_JMP: - case BPF_JMP32: - case BPF_ST: - return -1; - case BPF_STX: - if ((BPF_MODE(insn->code) == BPF_ATOMIC || - BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) && - (insn->imm & BPF_FETCH)) { - if (insn->imm == BPF_CMPXCHG) - return BPF_REG_0; - else - return insn->src_reg; - } else { - return -1; - } - default: - return insn->dst_reg; - } -} - -/* Return TRUE if INSN has defined any 32-bit value explicitly. */ -static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) -{ - int dst_reg = insn_def_regno(insn); - - if (dst_reg == -1) - return false; - - return !is_reg64(env, insn, dst_reg, NULL, DST_OP); -} - static void mark_insn_zext(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { @@ -3568,21 +3445,16 @@ static void mark_insn_zext(struct bpf_verifier_env *env, } static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, - enum reg_arg_type t) + enum bpf_reg_arg_type t) { struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; struct bpf_reg_state *reg; bool rw64; - if (regno >= MAX_BPF_REG) { - verbose(env, "R%d is invalid\n", regno); - return -EINVAL; - } - mark_reg_scratched(env, regno); reg = ®s[regno]; - rw64 = is_reg64(env, insn, regno, reg, t); + rw64 = bpf_is_reg64(insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (reg->type == NOT_INIT) { @@ -3596,15 +3468,13 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r if (rw64) mark_insn_zext(env, reg); - return mark_reg_read(env, reg, reg->parent, - rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); + return 0; } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } - reg->live |= REG_LIVE_WRITTEN; reg->subreg_def = rw64 ? 
DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) mark_reg_unknown(env, regs, regno); @@ -3613,7 +3483,7 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r } static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, - enum reg_arg_type t) + enum bpf_reg_arg_type t) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; @@ -3626,24 +3496,9 @@ static int insn_stack_access_flags(int frameno, int spi) return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno; } -static int insn_stack_access_spi(int insn_flags) +static void mark_indirect_target(struct bpf_verifier_env *env, int idx) { - return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK; -} - -static int insn_stack_access_frameno(int insn_flags) -{ - return insn_flags & INSN_F_FRAMENO_MASK; -} - -static void mark_jmp_point(struct bpf_verifier_env *env, int idx) -{ - env->insn_aux_data[idx].jmp_point = true; -} - -static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) -{ - return env->insn_aux_data[insn_idx].jmp_point; + env->insn_aux_data[idx].indirect_target = true; } #define LR_FRAMENO_BITS 3 @@ -3724,94 +3579,6 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s) } } -/* for any branch, call, exit record the history of jmps in the given state */ -static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) -{ - struct bpf_insn_hist_entry *p; - size_t alloc_size; - - /* combine instruction flags if we already recorded this instruction */ - if (env->cur_hist_ent) { - /* atomic instructions push insn_flags twice, for READ and - * WRITE sides, but they should agree on stack slot - */ - WARN_ONCE((env->cur_hist_ent->flags & insn_flags) && - (env->cur_hist_ent->flags & insn_flags) != insn_flags, - "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n", - env->insn_idx, env->cur_hist_ent->flags, 
insn_flags); - env->cur_hist_ent->flags |= insn_flags; - WARN_ONCE(env->cur_hist_ent->linked_regs != 0, - "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n", - env->insn_idx, env->cur_hist_ent->linked_regs); - env->cur_hist_ent->linked_regs = linked_regs; - return 0; - } - - if (cur->insn_hist_end + 1 > env->insn_hist_cap) { - alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p)); - p = kvrealloc(env->insn_hist, alloc_size, GFP_USER); - if (!p) - return -ENOMEM; - env->insn_hist = p; - env->insn_hist_cap = alloc_size / sizeof(*p); - } - - p = &env->insn_hist[cur->insn_hist_end]; - p->idx = env->insn_idx; - p->prev_idx = env->prev_insn_idx; - p->flags = insn_flags; - p->linked_regs = linked_regs; - - cur->insn_hist_end++; - env->cur_hist_ent = p; - - return 0; -} - -static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env, - u32 hist_start, u32 hist_end, int insn_idx) -{ - if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx) - return &env->insn_hist[hist_end - 1]; - return NULL; -} - -/* Backtrack one insn at a time. If idx is not at the top of recorded - * history then previous instruction came from straight line execution. - * Return -ENOENT if we exhausted all instructions within given state. - * - * It's legal to have a bit of a looping with the same starting and ending - * insn index within the same state, e.g.: 3->4->5->3, so just because current - * instruction index is the same as state's first_idx doesn't mean we are - * done. If there is still some jump history left, we should keep going. We - * need to take into account that we might have a jump history between given - * state's parent and itself, due to checkpointing. In this case, we'll have - * history entry recording a jump from last instruction of parent state and - * first instruction of given state. 
- */ -static int get_prev_insn_idx(const struct bpf_verifier_env *env, - struct bpf_verifier_state *st, - int insn_idx, u32 hist_start, u32 *hist_endp) -{ - u32 hist_end = *hist_endp; - u32 cnt = hist_end - hist_start; - - if (insn_idx == st->first_insn_idx) { - if (cnt == 0) - return -ENOENT; - if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx) - return -ENOENT; - } - - if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) { - (*hist_endp)--; - return env->insn_hist[hist_end - 1].prev_idx; - } else { - return insn_idx - 1; - } -} - static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) { const struct btf_type *func; @@ -3828,160 +3595,21 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) return btf_name_by_offset(desc_btf, func->name_off); } -static inline void bt_init(struct backtrack_state *bt, u32 frame) -{ - bt->frame = frame; -} - -static inline void bt_reset(struct backtrack_state *bt) -{ - struct bpf_verifier_env *env = bt->env; - - memset(bt, 0, sizeof(*bt)); - bt->env = env; -} - -static inline u32 bt_empty(struct backtrack_state *bt) -{ - u64 mask = 0; - int i; - - for (i = 0; i <= bt->frame; i++) - mask |= bt->reg_masks[i] | bt->stack_masks[i]; - - return mask == 0; -} - -static inline int bt_subprog_enter(struct backtrack_state *bt) -{ - if (bt->frame == MAX_CALL_FRAMES - 1) { - verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - bt->frame++; - return 0; -} - -static inline int bt_subprog_exit(struct backtrack_state *bt) -{ - if (bt->frame == 0) { - verbose(bt->env, "BUG subprog exit from frame 0\n"); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - bt->frame--; - return 0; -} - -static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg) -{ - bt->reg_masks[frame] |= 1 << reg; -} - -static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 
reg) -{ - bt->reg_masks[frame] &= ~(1 << reg); -} - -static inline void bt_set_reg(struct backtrack_state *bt, u32 reg) -{ - bt_set_frame_reg(bt, bt->frame, reg); -} - -static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg) -{ - bt_clear_frame_reg(bt, bt->frame, reg); -} - -static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot) -{ - bt->stack_masks[frame] |= 1ull << slot; -} - -static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot) -{ - bt->stack_masks[frame] &= ~(1ull << slot); -} - -static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame) -{ - return bt->reg_masks[frame]; -} - -static inline u32 bt_reg_mask(struct backtrack_state *bt) -{ - return bt->reg_masks[bt->frame]; -} - -static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame) -{ - return bt->stack_masks[frame]; -} - -static inline u64 bt_stack_mask(struct backtrack_state *bt) +void bpf_verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn) { - return bt->stack_masks[bt->frame]; -} - -static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) -{ - return bt->reg_masks[bt->frame] & (1 << reg); -} - -static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg) -{ - return bt->reg_masks[frame] & (1 << reg); -} - -static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) -{ - return bt->stack_masks[frame] & (1ull << slot); -} - -/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */ -static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) -{ - DECLARE_BITMAP(mask, 64); - bool first = true; - int i, n; - - buf[0] = '\0'; - - bitmap_from_u64(mask, reg_mask); - for_each_set_bit(i, mask, 32) { - n = snprintf(buf, buf_sz, "%sr%d", first ? 
"" : ",", i); - first = false; - buf += n; - buf_sz -= n; - if (buf_sz < 0) - break; - } -} -/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */ -static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) -{ - DECLARE_BITMAP(mask, 64); - bool first = true; - int i, n; - - buf[0] = '\0'; + const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, + .cb_print = verbose, + .private_data = env, + }; - bitmap_from_u64(mask, stack_mask); - for_each_set_bit(i, mask, 64) { - n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8); - first = false; - buf += n; - buf_sz -= n; - if (buf_sz < 0) - break; - } + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } /* If any register R in hist->linked_regs is marked as precise in bt, * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. */ -static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist) +void bpf_bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) { struct linked_regs linked_regs; bool some_precise = false; @@ -4008,714 +3636,24 @@ static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist struct linked_reg *e = &linked_regs.entries[i]; if (e->is_reg) - bt_set_frame_reg(bt, e->frameno, e->regno); + bpf_bt_set_frame_reg(bt, e->frameno, e->regno); else - bt_set_frame_slot(bt, e->frameno, e->spi); - } -} - -static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); - -/* For given verifier state backtrack_insn() is called from the last insn to - * the first insn. Its purpose is to compute a bitmask of registers and - * stack slots that needs precision in the parent verifier state. - * - * @idx is an index of the instruction we are currently processing; - * @subseq_idx is an index of the subsequent instruction that: - * - *would be* executed next, if jump history is viewed in forward order; - * - *was* processed previously during backtracking. 
- */ -static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) -{ - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; - struct bpf_insn *insn = env->prog->insnsi + idx; - u8 class = BPF_CLASS(insn->code); - u8 opcode = BPF_OP(insn->code); - u8 mode = BPF_MODE(insn->code); - u32 dreg = insn->dst_reg; - u32 sreg = insn->src_reg; - u32 spi, i, fr; - - if (insn->code == 0) - return 0; - if (env->log.level & BPF_LOG_LEVEL2) { - fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt)); - verbose(env, "mark_precise: frame%d: regs=%s ", - bt->frame, env->tmp_str_buf); - fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); - verbose(env, "stack=%s before ", env->tmp_str_buf); - verbose(env, "%d: ", idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); - } - - /* If there is a history record that some registers gained range at this insn, - * propagate precision marks to those registers, so that bt_is_reg_set() - * accounts for these registers. - */ - bt_sync_linked_regs(bt, hist); - - if (class == BPF_ALU || class == BPF_ALU64) { - if (!bt_is_reg_set(bt, dreg)) - return 0; - if (opcode == BPF_END || opcode == BPF_NEG) { - /* sreg is reserved and unused - * dreg still need precision before this insn - */ - return 0; - } else if (opcode == BPF_MOV) { - if (BPF_SRC(insn->code) == BPF_X) { - /* dreg = sreg or dreg = (s8, s16, s32)sreg - * dreg needs precision after this insn - * sreg needs precision before this insn - */ - bt_clear_reg(bt, dreg); - if (sreg != BPF_REG_FP) - bt_set_reg(bt, sreg); - } else { - /* dreg = K - * dreg needs precision after this insn. - * Corresponding register is already marked - * as precise=true in this verifier state. 
- * No further markings in parent are necessary - */ - bt_clear_reg(bt, dreg); - } - } else { - if (BPF_SRC(insn->code) == BPF_X) { - /* dreg += sreg - * both dreg and sreg need precision - * before this insn - */ - if (sreg != BPF_REG_FP) - bt_set_reg(bt, sreg); - } /* else dreg += K - * dreg still needs precision before this insn - */ - } - } else if (class == BPF_LDX) { - if (!bt_is_reg_set(bt, dreg)) - return 0; - bt_clear_reg(bt, dreg); - - /* scalars can only be spilled into stack w/o losing precision. - * Load from any other memory can be zero extended. - * The desire to keep that precision is already indicated - * by 'precise' mark in corresponding register of this state. - * No further tracking necessary. - */ - if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) - return 0; - /* dreg = *(u64 *)[fp - off] was a fill from the stack. - * that [fp - off] slot contains scalar that needs to be - * tracked with precision - */ - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); - bt_set_frame_slot(bt, fr, spi); - } else if (class == BPF_STX || class == BPF_ST) { - if (bt_is_reg_set(bt, dreg)) - /* stx & st shouldn't be using _scalar_ dst_reg - * to access memory. It means backtracking - * encountered a case of pointer subtraction. 
- */ - return -ENOTSUPP; - /* scalars can only be spilled into stack */ - if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) - return 0; - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); - if (!bt_is_frame_slot_set(bt, fr, spi)) - return 0; - bt_clear_frame_slot(bt, fr, spi); - if (class == BPF_STX) - bt_set_reg(bt, sreg); - } else if (class == BPF_JMP || class == BPF_JMP32) { - if (bpf_pseudo_call(insn)) { - int subprog_insn_idx, subprog; - - subprog_insn_idx = idx + insn->imm + 1; - subprog = find_subprog(env, subprog_insn_idx); - if (subprog < 0) - return -EFAULT; - - if (subprog_is_global(env, subprog)) { - /* check that jump history doesn't have any - * extra instructions from subprog; the next - * instruction after call to global subprog - * should be literally next instruction in - * caller program - */ - WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug"); - /* r1-r5 are invalidated after subprog call, - * so for global func call it shouldn't be set - * anymore - */ - if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - /* global subprog always sets R0 */ - bt_clear_reg(bt, BPF_REG_0); - return 0; - } else { - /* static subprog call instruction, which - * means that we are exiting current subprog, - * so only r1-r5 could be still requested as - * precise, r0 and r6-r10 or any stack slot in - * the current frame should be zero by now - */ - if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - /* we are now tracking register spills correctly, - * so any instance of leftover slots is a bug - */ - if (bt_stack_mask(bt) != 0) { - verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)"); - return -EFAULT; - } - /* propagate 
r1-r5 to the caller */ - for (i = BPF_REG_1; i <= BPF_REG_5; i++) { - if (bt_is_reg_set(bt, i)) { - bt_clear_reg(bt, i); - bt_set_frame_reg(bt, bt->frame - 1, i); - } - } - if (bt_subprog_exit(bt)) - return -EFAULT; - return 0; - } - } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) { - /* exit from callback subprog to callback-calling helper or - * kfunc call. Use idx/subseq_idx check to discern it from - * straight line code backtracking. - * Unlike the subprog call handling above, we shouldn't - * propagate precision of r1-r5 (if any requested), as they are - * not actually arguments passed directly to callback subprogs - */ - if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - if (bt_stack_mask(bt) != 0) { - verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)"); - return -EFAULT; - } - /* clear r1-r5 in callback subprog's mask */ - for (i = BPF_REG_1; i <= BPF_REG_5; i++) - bt_clear_reg(bt, i); - if (bt_subprog_exit(bt)) - return -EFAULT; - return 0; - } else if (opcode == BPF_CALL) { - /* kfunc with imm==0 is invalid and fixup_kfunc_call will - * catch this error later. Make backtracking conservative - * with ENOTSUPP. - */ - if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0) - return -ENOTSUPP; - /* regular helper call sets R0 */ - bt_clear_reg(bt, BPF_REG_0); - if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - /* if backtracing was looking for registers R1-R5 - * they should have been found already. - */ - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - } else if (opcode == BPF_EXIT) { - bool r0_precise; - - /* Backtracking to a nested function call, 'idx' is a part of - * the inner frame 'subseq_idx' is a part of the outer frame. 
- * In case of a regular function call, instructions giving - * precision to registers R1-R5 should have been found already. - * In case of a callback, it is ok to have R1-R5 marked for - * backtracking, as these registers are set by the function - * invoking callback. - */ - if (subseq_idx >= 0 && calls_callback(env, subseq_idx)) - for (i = BPF_REG_1; i <= BPF_REG_5; i++) - bt_clear_reg(bt, i); - if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - - /* BPF_EXIT in subprog or callback always returns - * right after the call instruction, so by checking - * whether the instruction at subseq_idx-1 is subprog - * call or not we can distinguish actual exit from - * *subprog* from exit from *callback*. In the former - * case, we need to propagate r0 precision, if - * necessary. In the former we never do that. - */ - r0_precise = subseq_idx - 1 >= 0 && - bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) && - bt_is_reg_set(bt, BPF_REG_0); - - bt_clear_reg(bt, BPF_REG_0); - if (bt_subprog_enter(bt)) - return -EFAULT; - - if (r0_precise) - bt_set_reg(bt, BPF_REG_0); - /* r6-r9 and stack slots will stay set in caller frame - * bitmasks until we return back from callee(s) - */ - return 0; - } else if (BPF_SRC(insn->code) == BPF_X) { - if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg)) - return 0; - /* dreg <cond> sreg - * Both dreg and sreg need precision before - * this insn. If only sreg was marked precise - * before it would be equally necessary to - * propagate it to dreg. - */ - bt_set_reg(bt, dreg); - bt_set_reg(bt, sreg); - } else if (BPF_SRC(insn->code) == BPF_K) { - /* dreg <cond> K - * Only dreg still needs precision before - * this insn, so for the K-based conditional - * there is nothing new to be marked. 
- */ - } - } else if (class == BPF_LD) { - if (!bt_is_reg_set(bt, dreg)) - return 0; - bt_clear_reg(bt, dreg); - /* It's ld_imm64 or ld_abs or ld_ind. - * For ld_imm64 no further tracking of precision - * into parent is necessary - */ - if (mode == BPF_IND || mode == BPF_ABS) - /* to be analyzed */ - return -ENOTSUPP; + bpf_bt_set_frame_slot(bt, e->frameno, e->spi); } - /* Propagate precision marks to linked registers, to account for - * registers marked as precise in this function. - */ - bt_sync_linked_regs(bt, hist); - return 0; -} - -/* the scalar precision tracking algorithm: - * . at the start all registers have precise=false. - * . scalar ranges are tracked as normal through alu and jmp insns. - * . once precise value of the scalar register is used in: - * . ptr + scalar alu - * . if (scalar cond K|scalar) - * . helper_call(.., scalar, ...) where ARG_CONST is expected - * backtrack through the verifier states and mark all registers and - * stack slots with spilled constants that these scalar regisers - * should be precise. - * . during state pruning two registers (or spilled stack slots) - * are equivalent if both are not precise. - * - * Note the verifier cannot simply walk register parentage chain, - * since many different registers and stack slots could have been - * used to compute single precise scalar. - * - * The approach of starting with precise=true for all registers and then - * backtrack to mark a register as not precise when the verifier detects - * that program doesn't care about specific value (e.g., when helper - * takes register as ARG_ANYTHING parameter) is not safe. - * - * It's ok to walk single parentage chain of the verifier states. - * It's possible that this backtracking will go all the way till 1st insn. - * All other branches will be explored for needing precision later. 
- * - * The backtracking needs to deal with cases like: - * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) - * r9 -= r8 - * r5 = r9 - * if r5 > 0x79f goto pc+7 - * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) - * r5 += 1 - * ... - * call bpf_perf_event_output#25 - * where .arg5_type = ARG_CONST_SIZE_OR_ZERO - * - * and this case: - * r6 = 1 - * call foo // uses callee's r6 inside to compute r0 - * r0 += r6 - * if r0 == 0 goto - * - * to track above reg_mask/stack_mask needs to be independent for each frame. - * - * Also if parent's curframe > frame where backtracking started, - * the verifier need to mark registers in both frames, otherwise callees - * may incorrectly prune callers. This is similar to - * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") - * - * For now backtracking falls back into conservative marking. - */ -static void mark_all_scalars_precise(struct bpf_verifier_env *env, - struct bpf_verifier_state *st) -{ - struct bpf_func_state *func; - struct bpf_reg_state *reg; - int i, j; - - if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n", - st->curframe); - } - - /* big hammer: mark all scalars precise in this path. - * pop_stack may still get !precise scalars. - * We also skip current state and go straight to first parent state, - * because precision markings in current non-checkpointed state are - * not needed. See why in the comment in __mark_chain_precision below. 
- */ - for (st = st->parent; st; st = st->parent) { - for (i = 0; i <= st->curframe; i++) { - func = st->frame[i]; - for (j = 0; j < BPF_REG_FP; j++) { - reg = &func->regs[j]; - if (reg->type != SCALAR_VALUE || reg->precise) - continue; - reg->precise = true; - if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "force_precise: frame%d: forcing r%d to be precise\n", - i, j); - } - } - for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { - if (!is_spilled_reg(&func->stack[j])) - continue; - reg = &func->stack[j].spilled_ptr; - if (reg->type != SCALAR_VALUE || reg->precise) - continue; - reg->precise = true; - if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n", - i, -(j + 1) * 8); - } - } - } - } -} - -static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st) -{ - struct bpf_func_state *func; - struct bpf_reg_state *reg; - int i, j; - - for (i = 0; i <= st->curframe; i++) { - func = st->frame[i]; - for (j = 0; j < BPF_REG_FP; j++) { - reg = &func->regs[j]; - if (reg->type != SCALAR_VALUE) - continue; - reg->precise = false; - } - for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { - if (!is_spilled_reg(&func->stack[j])) - continue; - reg = &func->stack[j].spilled_ptr; - if (reg->type != SCALAR_VALUE) - continue; - reg->precise = false; - } - } -} - -/* - * __mark_chain_precision() backtracks BPF program instruction sequence and - * chain of verifier states making sure that register *regno* (if regno >= 0) - * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked - * SCALARS, as well as any other registers and slots that contribute to - * a tracked state of given registers/stack slots, depending on specific BPF - * assembly instructions (see backtrack_insns() for exact instruction handling - * logic). This backtracking relies on recorded insn_hist and is able to - * traverse entire chain of parent states. 
This process ends only when all the - * necessary registers/slots and their transitive dependencies are marked as - * precise. - * - * One important and subtle aspect is that precise marks *do not matter* in - * the currently verified state (current state). It is important to understand - * why this is the case. - * - * First, note that current state is the state that is not yet "checkpointed", - * i.e., it is not yet put into env->explored_states, and it has no children - * states as well. It's ephemeral, and can end up either a) being discarded if - * compatible explored state is found at some point or BPF_EXIT instruction is - * reached or b) checkpointed and put into env->explored_states, branching out - * into one or more children states. - * - * In the former case, precise markings in current state are completely - * ignored by state comparison code (see regsafe() for details). Only - * checkpointed ("old") state precise markings are important, and if old - * state's register/slot is precise, regsafe() assumes current state's - * register/slot as precise and checks value ranges exactly and precisely. If - * states turn out to be compatible, current state's necessary precise - * markings and any required parent states' precise markings are enforced - * after the fact with propagate_precision() logic, after the fact. But it's - * important to realize that in this case, even after marking current state - * registers/slots as precise, we immediately discard current state. So what - * actually matters is any of the precise markings propagated into current - * state's parent states, which are always checkpointed (due to b) case above). - * As such, for scenario a) it doesn't matter if current state has precise - * markings set or not. - * - * Now, for the scenario b), checkpointing and forking into child(ren) - * state(s). 
Note that before current state gets to checkpointing step, any - * processed instruction always assumes precise SCALAR register/slot - * knowledge: if precise value or range is useful to prune jump branch, BPF - * verifier takes this opportunity enthusiastically. Similarly, when - * register's value is used to calculate offset or memory address, exact - * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to - * what we mentioned above about state comparison ignoring precise markings - * during state comparison, BPF verifier ignores and also assumes precise - * markings *at will* during instruction verification process. But as verifier - * assumes precision, it also propagates any precision dependencies across - * parent states, which are not yet finalized, so can be further restricted - * based on new knowledge gained from restrictions enforced by their children - * states. This is so that once those parent states are finalized, i.e., when - * they have no more active children state, state comparison logic in - * is_state_visited() would enforce strict and precise SCALAR ranges, if - * required for correctness. - * - * To build a bit more intuition, note also that once a state is checkpointed, - * the path we took to get to that state is not important. This is crucial - * property for state pruning. When state is checkpointed and finalized at - * some instruction index, it can be correctly and safely used to "short - * circuit" any *compatible* state that reaches exactly the same instruction - * index. I.e., if we jumped to that instruction from a completely different - * code path than original finalized state was derived from, it doesn't - * matter, current state can be discarded because from that instruction - * forward having a compatible state will ensure we will safely reach the - * exit. States describe preconditions for further exploration, but completely - * forget the history of how we got here. 
- * - * This also means that even if we needed precise SCALAR range to get to - * finalized state, but from that point forward *that same* SCALAR register is - * never used in a precise context (i.e., it's precise value is not needed for - * correctness), it's correct and safe to mark such register as "imprecise" - * (i.e., precise marking set to false). This is what we rely on when we do - * not set precise marking in current state. If no child state requires - * precision for any given SCALAR register, it's safe to dictate that it can - * be imprecise. If any child state does require this register to be precise, - * we'll mark it precise later retroactively during precise markings - * propagation from child state to parent states. - * - * Skipping precise marking setting in current state is a mild version of - * relying on the above observation. But we can utilize this property even - * more aggressively by proactively forgetting any precise marking in the - * current state (which we inherited from the parent state), right before we - * checkpoint it and branch off into new child state. This is done by - * mark_all_scalars_imprecise() to hopefully get more permissive and generic - * finalized states which help in short circuiting more future states. - */ -static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) -{ - struct backtrack_state *bt = &env->bt; - struct bpf_verifier_state *st = env->cur_state; - int first_idx = st->first_insn_idx; - int last_idx = env->insn_idx; - int subseq_idx = -1; - struct bpf_func_state *func; - struct bpf_reg_state *reg; - bool skip_first = true; - int i, fr, err; - - if (!env->bpf_capable) - return 0; - - /* set frame number from which we are starting to backtrack */ - bt_init(bt, env->cur_state->curframe); - - /* Do sanity checks against current state of register and/or stack - * slot, but don't set precise flag in current state, as precision - * tracking in the current state is unnecessary. 
- */ - func = st->frame[bt->frame]; - if (regno >= 0) { - reg = &func->regs[regno]; - if (reg->type != SCALAR_VALUE) { - WARN_ONCE(1, "backtracing misuse"); - return -EFAULT; - } - bt_set_reg(bt, regno); - } - - if (bt_empty(bt)) - return 0; - - for (;;) { - DECLARE_BITMAP(mask, 64); - u32 hist_start = st->insn_hist_start; - u32 hist_end = st->insn_hist_end; - struct bpf_insn_hist_entry *hist; - - if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", - bt->frame, last_idx, first_idx, subseq_idx); - } - - if (last_idx < 0) { - /* we are at the entry into subprog, which - * is expected for global funcs, but only if - * requested precise registers are R1-R5 - * (which are global func's input arguments) - */ - if (st->curframe == 0 && - st->frame[0]->subprogno > 0 && - st->frame[0]->callsite == BPF_MAIN_FUNC && - bt_stack_mask(bt) == 0 && - (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) { - bitmap_from_u64(mask, bt_reg_mask(bt)); - for_each_set_bit(i, mask, 32) { - reg = &st->frame[0]->regs[i]; - bt_clear_reg(bt, i); - if (reg->type == SCALAR_VALUE) - reg->precise = true; - } - return 0; - } - - verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n", - st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - - for (i = last_idx;;) { - if (skip_first) { - err = 0; - skip_first = false; - } else { - hist = get_insn_hist_entry(env, hist_start, hist_end, i); - err = backtrack_insn(env, i, subseq_idx, hist, bt); - } - if (err == -ENOTSUPP) { - mark_all_scalars_precise(env, env->cur_state); - bt_reset(bt); - return 0; - } else if (err) { - return err; - } - if (bt_empty(bt)) - /* Found assignment(s) into tracked register in this state. - * Since this state is already marked, just return. - * Nothing to be tracked further in the parent state. 
- */ - return 0; - subseq_idx = i; - i = get_prev_insn_idx(env, st, i, hist_start, &hist_end); - if (i == -ENOENT) - break; - if (i >= env->prog->len) { - /* This can happen if backtracking reached insn 0 - * and there are still reg_mask or stack_mask - * to backtrack. - * It means the backtracking missed the spot where - * particular register was initialized with a constant. - */ - verbose(env, "BUG backtracking idx %d\n", i); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - } - st = st->parent; - if (!st) - break; - - for (fr = bt->frame; fr >= 0; fr--) { - func = st->frame[fr]; - bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); - for_each_set_bit(i, mask, 32) { - reg = &func->regs[i]; - if (reg->type != SCALAR_VALUE) { - bt_clear_frame_reg(bt, fr, i); - continue; - } - if (reg->precise) - bt_clear_frame_reg(bt, fr, i); - else - reg->precise = true; - } - - bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); - for_each_set_bit(i, mask, 64) { - if (i >= func->allocated_stack / BPF_REG_SIZE) { - verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n", - i, func->allocated_stack / BPF_REG_SIZE); - WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)"); - return -EFAULT; - } - - if (!is_spilled_scalar_reg(&func->stack[i])) { - bt_clear_frame_slot(bt, fr, i); - continue; - } - reg = &func->stack[i].spilled_ptr; - if (reg->precise) - bt_clear_frame_slot(bt, fr, i); - else - reg->precise = true; - } - if (env->log.level & BPF_LOG_LEVEL2) { - fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, - bt_frame_reg_mask(bt, fr)); - verbose(env, "mark_precise: frame%d: parent state regs=%s ", - fr, env->tmp_str_buf); - fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, - bt_frame_stack_mask(bt, fr)); - verbose(env, "stack=%s: ", env->tmp_str_buf); - print_verifier_state(env, st, fr, true); - } - } - - if (bt_empty(bt)) - return 0; - - subseq_idx = first_idx; - last_idx = st->last_insn_idx; - first_idx = st->first_insn_idx; - } - - /* if we 
still have requested precise regs or slots, we missed - * something (e.g., stack access through non-r10 register), so - * fallback to marking all precise - */ - if (!bt_empty(bt)) { - mark_all_scalars_precise(env, env->cur_state); - bt_reset(bt); - } - - return 0; } int mark_chain_precision(struct bpf_verifier_env *env, int regno) { - return __mark_chain_precision(env, regno); + return bpf_mark_chain_precision(env, env->cur_state, regno, NULL); } /* mark_chain_precision_batch() assumes that env->bt is set in the caller to * desired reg and stack masks across all relevant frames */ -static int mark_chain_precision_batch(struct bpf_verifier_env *env) +static int mark_chain_precision_batch(struct bpf_verifier_env *env, + struct bpf_verifier_state *starting_state) { - return __mark_chain_precision(env, -1); + return bpf_mark_chain_precision(env, starting_state, -1, NULL); } static bool is_spillable_regtype(enum bpf_reg_type type) @@ -4745,11 +3683,6 @@ static bool is_spillable_regtype(enum bpf_reg_type type) } } -/* Does this register contain a constant zero? */ -static bool register_is_null(struct bpf_reg_state *reg) -{ - return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); -} /* check if register is a constant scalar value */ static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32) @@ -4773,39 +3706,37 @@ static bool __is_pointer_value(bool allow_ptr_leaks, return reg->type != SCALAR_VALUE; } +static void clear_scalar_id(struct bpf_reg_state *reg) +{ + reg->id = 0; + reg->delta = 0; +} + static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, struct bpf_reg_state *src_reg) { if (src_reg->type != SCALAR_VALUE) return; - - if (src_reg->id & BPF_ADD_CONST) { - /* - * The verifier is processing rX = rY insn and - * rY->id has special linked register already. - * Cleared it, since multiple rX += const are not supported. 
- */ - src_reg->id = 0; - src_reg->off = 0; - } - + /* + * The verifier is processing rX = rY insn and + * rY->id has special linked register already. + * Cleared it, since multiple rX += const are not supported. + */ + if (src_reg->id & BPF_ADD_CONST) + clear_scalar_id(src_reg); + /* + * Ensure that src_reg has a valid ID that will be copied to + * dst_reg and then will be used by sync_linked_regs() to + * propagate min/max range. + */ if (!src_reg->id && !tnum_is_const(src_reg->var_off)) - /* Ensure that src_reg has a valid ID that will be copied to - * dst_reg and then will be used by sync_linked_regs() to - * propagate min/max range. - */ src_reg->id = ++env->id_gen; } /* Copy src state preserving dst->parent and dst->live fields */ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) { - struct bpf_reg_state *parent = dst->parent; - enum bpf_reg_liveness live = dst->live; - *dst = *src; - dst->parent = parent; - dst->live = live; } static void save_register_state(struct bpf_verifier_env *env, @@ -4816,8 +3747,6 @@ static void save_register_state(struct bpf_verifier_env *env, int i; copy_register_state(&state->stack[spi].spilled_ptr, reg); - if (size == BPF_REG_SIZE) - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--) state->stack[spi].slot_type[i - 1] = STACK_SPILL; @@ -4862,6 +3791,18 @@ static void check_fastcall_stack_contract(struct bpf_verifier_env *env, } } +static void scrub_special_slot(struct bpf_func_state *state, int spi) +{ + int i; + + /* regular write of data into stack destroys any spilled ptr */ + state->stack[spi].spilled_ptr.type = NOT_INIT; + /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. 
*/ + if (is_stack_slot_special(&state->stack[spi])) + for (i = 0; i < BPF_REG_SIZE; i++) + scrub_spilled_slot(&state->stack[spi].slot_type[i]); +} + /* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -4881,8 +3822,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, * so it's aligned access and [off, off + size) are within stack limits */ if (!env->allow_ptr_leaks && - is_spilled_reg(&state->stack[spi]) && - !is_spilled_scalar_reg(&state->stack[spi]) && + bpf_is_spilled_reg(&state->stack[spi]) && + !bpf_is_spilled_scalar_reg(&state->stack[spi]) && size != BPF_REG_SIZE) { verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; @@ -4904,7 +3845,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (sanitize) - env->insn_aux_data[insn_idx].sanitize_stack_spill = true; + env->insn_aux_data[insn_idx].nospec_result = true; } err = destroy_if_dynptr_stack_slot(env, state, spi); @@ -4947,26 +3888,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } else { u8 type = STACK_MISC; - /* regular write of data into stack destroys any spilled ptr */ - state->stack[spi].spilled_ptr.type = NOT_INIT; - /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */ - if (is_stack_slot_special(&state->stack[spi])) - for (i = 0; i < BPF_REG_SIZE; i++) - scrub_spilled_slot(&state->stack[spi].slot_type[i]); - - /* only mark the slot as written if all 8 bytes were written - * otherwise read propagation may incorrectly stop too soon - * when stack slots are partially written. 
- * This heuristic means that read propagation will be - * conservative, since it will add reg_live_read marks - * to stack slots all the way to first state when programs - * writes+reads less than 8 bytes - */ - if (size == BPF_REG_SIZE) - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + scrub_special_slot(state, spi); /* when we zero initialize stack slots mark them as such */ - if ((reg && register_is_null(reg)) || + if ((reg && bpf_register_is_null(reg)) || (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { /* STACK_ZERO case happened because register spill * wasn't properly aligned at the stack slot boundary, @@ -4987,7 +3912,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_insn_history(env, env->cur_state, insn_flags, 0); + return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -4997,7 +3922,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, * tracks the effects of the write, considering that each stack slot in the * dynamic range is potentially written to. * - * 'off' includes 'regno->off'. * 'value_regno' can be -1, meaning that an unknown value is being written to * the stack. * @@ -5033,14 +3957,14 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, max_off = ptr_reg->smax_value + off + size; if (value_regno >= 0) value_reg = &cur->regs[value_regno]; - if ((value_reg && register_is_null(value_reg)) || + if ((value_reg && bpf_register_is_null(value_reg)) || (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0)) writing_zero = true; for (i = min_off; i < max_off; i++) { int spi; - spi = __get_spi(i); + spi = bpf_get_spi(i); err = destroy_if_dynptr_stack_slot(env, state, spi); if (err) return err; @@ -5078,7 +4002,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, * maintain the spill type. 
*/ if (writing_zero && *stype == STACK_SPILL && - is_spilled_scalar_reg(&state->stack[spi])) { + bpf_is_spilled_scalar_reg(&state->stack[spi])) { struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr; if (tnum_is_const(spill_reg->var_off) && spill_reg->var_off.value == 0) { @@ -5087,8 +4011,13 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, } } - /* Erase all other spilled pointers. */ - state->stack[spi].spilled_ptr.type = NOT_INIT; + /* + * Scrub slots if variable-offset stack write goes over spilled pointers. + * Otherwise bpf_is_spilled_reg() may == true && spilled_ptr.type == NOT_INIT + * and valid program is rejected by check_stack_read_fixed_off() + * with obscure "invalid size of register fill" message. + */ + scrub_special_slot(state, spi); /* Update the slot type. */ new_type = STACK_MISC; @@ -5103,8 +4032,10 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, * For privileged programs, we will accept such reads to slots * that may or may not be written because, if we're reject * them, the error would be too confusing. + * Conservatively, treat STACK_POISON in a similar way. 
*/ - if (*stype == STACK_INVALID && !env->allow_uninit_stack) { + if ((*stype == STACK_INVALID || *stype == STACK_POISON) && + !env->allow_uninit_stack) { verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d", insn_idx, i); return -EINVAL; @@ -5157,7 +4088,6 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env, /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, dst_regno); } - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } /* Read the stack at 'off' and put the results into the register indicated by @@ -5187,7 +4117,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, mark_stack_slot_scratched(env, spi); check_fastcall_stack_contract(env, state, env->insn_idx, off); - if (is_spilled_reg(®_state->stack[spi])) { + if (bpf_is_spilled_reg(®_state->stack[spi])) { u8 spill_size = 1; for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--) @@ -5200,7 +4130,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno < 0) return 0; @@ -5211,6 +4140,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, */ s32 subreg_def = state->regs[dst_regno].subreg_def; + if (env->bpf_capable && size == 4 && spill_size == 4 && + get_reg_width(reg) <= 32) + /* Ensure stack slot has an ID to build a relation + * with the destination register on fill. + */ + assign_scalar_id_before_mov(env, reg); copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; @@ -5218,7 +4153,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * coerce_reg_to_size will adjust the boundaries. 
*/ if (get_reg_width(reg) > size * BITS_PER_BYTE) - state->regs[dst_regno].id = 0; + clear_scalar_id(&state->regs[dst_regno]); } else { int spill_cnt = 0, zero_cnt = 0; @@ -5236,8 +4171,13 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, } if (type == STACK_INVALID && env->allow_uninit_stack) continue; - verbose(env, "invalid read from stack off %d+%d size %d\n", - off, i, size); + if (type == STACK_POISON) { + verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n", + off, i, size); + } else { + verbose(env, "invalid read from stack off %d+%d size %d\n", + off, i, size); + } return -EACCES; } @@ -5254,15 +4194,18 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* not restoring original register state */ } } - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (dst_regno >= 0) { /* restore register state from stack */ + if (env->bpf_capable) + /* Ensure stack slot has an ID to build a relation + * with the destination register on fill. 
+ */ + assign_scalar_id_before_mov(env, reg); copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE @@ -5274,7 +4217,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, off); return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { for (i = 0; i < size; i++) { type = stype[(slot - i) % BPF_REG_SIZE]; @@ -5284,17 +4226,21 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, continue; if (type == STACK_INVALID && env->allow_uninit_stack) continue; - verbose(env, "invalid read from stack off %d+%d size %d\n", - off, i, size); + if (type == STACK_POISON) { + verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n", + off, i, size); + } else { + verbose(env, "invalid read from stack off %d+%d size %d\n", + off, i, size); + } return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno >= 0) mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_insn_history(env, env->cur_state, insn_flags, 0); + return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5332,7 +4278,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, { /* The state of the source register. 
*/ struct bpf_reg_state *reg = reg_state(env, ptr_regno); - struct bpf_func_state *ptr_state = func(env, reg); + struct bpf_func_state *ptr_state = bpf_func(env, reg); int err; int min_off, max_off; @@ -5364,7 +4310,7 @@ static int check_stack_read(struct bpf_verifier_env *env, int dst_regno) { struct bpf_reg_state *reg = reg_state(env, ptr_regno); - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int err; /* Some accesses are only permitted with a static offset. */ bool var_off = !tnum_is_const(reg->var_off); @@ -5410,7 +4356,6 @@ static int check_stack_read(struct bpf_verifier_env *env, * check_stack_write_var_off. * * 'ptr_regno' is the register used as a pointer into the stack. - * 'off' includes 'ptr_regno->off', but not its variable offset (if any). * 'value_regno' is the register whose value we're writing to the stack. It can * be -1, meaning that we're not writing from a register. * @@ -5421,7 +4366,7 @@ static int check_stack_write(struct bpf_verifier_env *env, int value_regno, int insn_idx) { struct bpf_reg_state *reg = reg_state(env, ptr_regno); - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int err; if (tnum_is_const(reg->var_off)) { @@ -5442,19 +4387,19 @@ static int check_stack_write(struct bpf_verifier_env *env, static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, int off, int size, enum bpf_access_type type) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_map *map = reg->map_ptr; u32 cap = bpf_map_flags_to_cap(map); if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { - verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", - map->value_size, off, size); + verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n", + map->value_size, reg->smin_value + off, size); return -EACCES; } 
if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { - verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", - map->value_size, off, size); + verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n", + map->value_size, reg->smin_value + off, size); return -EACCES; } @@ -5488,6 +4433,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, off, mem_size); break; + case PTR_TO_CTX: + verbose(env, "invalid access to context, ctx_size=%d off=%d size=%d\n", + mem_size, off, size); + break; case PTR_TO_MEM: default: verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n", @@ -5561,24 +4510,24 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, * is only allowed in its original, unmodified form. */ - if (reg->off < 0) { - verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; } - if (!fixed_off_ok && reg->off) { - verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); + if (reg->smin_value < 0) { + verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n", + reg_type_str(env, reg->type), regno, reg->var_off.value); return -EACCES; } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable %s access var_off=%s disallowed\n", - reg_type_str(env, reg->type), tn_buf); + if (!fixed_off_ok && reg->var_off.value != 0) { + verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n", + reg_type_str(env, reg->type), regno, 
reg->var_off.value); return -EACCES; } @@ -5599,6 +4548,9 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, int perm_flags; const char *reg_name = ""; + if (base_type(reg->type) != PTR_TO_BTF_ID) + goto bad_type; + if (btf_is_kernel(reg->btf)) { perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; @@ -5611,7 +4563,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, perm_flags |= MEM_PERCPU; } - if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) + if (type_flag(reg->type) & ~perm_flags) goto bad_type; /* We need to verify reg->type and reg->btf, before accessing reg->btf */ @@ -5620,14 +4572,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, /* For ref_ptr case, release function check should ensure we get one * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the * normal store of unreferenced kptr, we must ensure var_off is zero. - * Since ref_ptr cannot be accessed directly by BPF insns, checks for - * reg->off and reg->ref_obj_id are not needed here. + * Since ref_ptr cannot be accessed directly by BPF insns, check for + * reg->ref_obj_id is not needed here. */ if (__check_ptr_off_reg(env, reg, regno, true)) return -EACCES; /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and - * we also need to take into account the reg->off. + * we also need to take into account the reg->var_off. 
* * We want to support cases like: * @@ -5638,19 +4590,19 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * * struct foo *v; * v = func(); // PTR_TO_BTF_ID - * val->foo = v; // reg->off is zero, btf and btf_id match type - * val->bar = &v->br; // reg->off is still zero, but we need to retry with + * val->foo = v; // reg->var_off is zero, btf and btf_id match type + * val->bar = &v->br; // reg->var_off is still zero, but we need to retry with * // first member type of struct after comparison fails - * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked + * val->baz = &v->bz; // reg->var_off is non-zero, so struct needs to be walked * // to match type * - * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off + * In the kptr_ref case, check_func_arg_reg_off already ensures reg->var_off * is zero. We must also ensure that btf_struct_ids_match does not walk * the struct to match type against first member of struct, i.e. reject * second case from above. Hence, when type is BPF_KPTR_REF, we set * strict mode to true for type match. 
*/ - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->var_off.value, kptr_field->kptr.btf, kptr_field->kptr.btf_id, kptr_field->type != BPF_KPTR_UNREF)) goto bad_type; @@ -5669,8 +4621,7 @@ bad_type: static bool in_sleepable(struct bpf_verifier_env *env) { - return env->prog->sleepable || - (env->cur_state && env->cur_state->in_sleepable); + return env->cur_state->in_sleepable; } /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() @@ -5678,7 +4629,7 @@ static bool in_sleepable(struct bpf_verifier_env *env) */ static bool in_rcu_cs(struct bpf_verifier_env *env) { - return env->cur_state->active_rcu_lock || + return env->cur_state->active_rcu_locks || env->cur_state->active_locks || !in_sleepable(env); } @@ -5774,6 +4725,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; int class = BPF_CLASS(insn->code); struct bpf_reg_state *val_reg; + int ret; /* Things we already checked for in check_map_access and caller: * - Reject cases where variable offset may touch kptr @@ -5807,11 +4759,14 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. 
*/ - mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); + ret = mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, + kptr_field->kptr.btf, kptr_field->kptr.btf_id, + btf_ld_kptr_type(env, kptr_field)); + if (ret < 0) + return ret; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); - if (!register_is_null(val_reg) && + if (!bpf_register_is_null(val_reg) && map_kptr_match_type(env, kptr_field, val_reg, value_regno)) return -EACCES; } else if (class == BPF_ST) { @@ -5827,6 +4782,18 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, return 0; } +/* + * Return the size of the memory region accessible from a pointer to map value. + * For INSN_ARRAY maps whole bpf_insn_array->ips array is accessible. + */ +static u32 map_mem_size(const struct bpf_map *map) +{ + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) + return map->max_entries * sizeof(long); + + return map->value_size; +} + /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed, @@ -5836,11 +4803,11 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; + u32 mem_size = map_mem_size(map); struct btf_record *rec; int err, i; - err = check_mem_region_access(env, regno, off, size, map->value_size, - zero_size_allowed); + err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed); if (err) return err; @@ -5894,11 +4861,9 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return 0; } -#define MAX_PACKET_OFF 0xffff - static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, - const struct bpf_call_arg_meta *meta, - enum 
bpf_access_type t) + const struct bpf_call_arg_meta *meta, + enum bpf_access_type t) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); @@ -5941,31 +4906,17 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = &regs[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); int err; - /* We may have added a variable offset to the packet pointer; but any - * reg->range we have comes after that. We are only checking the fixed - * offset. - */ - - /* We don't allow negative numbers, because we aren't tracking enough - * detail to prove they're safe. - */ - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); - return -EACCES; + if (reg->range < 0) { + verbose(env, "R%d offset is outside of the packet\n", regno); + return -EINVAL; } - err = reg->range < 0 ? -EINVAL : - __check_mem_access(env, regno, off, size, reg->range, - zero_size_allowed); - if (err) { - verbose(env, "R%d offset is outside of the packet\n", regno); + err = check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed); + if (err) return err; - } /* __check_mem_access has made sure "off + size - 1" is within u16. * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, @@ -5977,23 +4928,20 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, max_t(u32, env->prog->aux->max_pkt_offset, off + reg->umax_value + size - 1); - return err; + return 0; } -/* check access to 'struct bpf_context' fields.
Supports fixed offsets only */ -static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, - enum bpf_access_type t, enum bpf_reg_type *reg_type, - struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx) -{ - struct bpf_insn_access_aux info = { - .reg_type = *reg_type, - .log = &env->log, - .is_retval = false, - .is_ldsx = is_ldsx, - }; +static bool is_var_ctx_off_allowed(struct bpf_prog *prog) +{ + return resolve_prog_type(prog) == BPF_PROG_TYPE_SYSCALL; +} +/* check access to 'struct bpf_context' fields. Supports fixed offsets only */ +static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, + enum bpf_access_type t, struct bpf_insn_access_aux *info) +{ if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, env->prog, &info)) { + env->ops->is_valid_access(off, size, t, env->prog, info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -6001,14 +4949,15 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * will only allow for whole field access and rejects any other * type of narrower access. */ - *reg_type = info.reg_type; - *is_retval = info.is_retval; - - if (base_type(*reg_type) == PTR_TO_BTF_ID) { - *btf = info.btf; - *btf_id = info.btf_id; + if (base_type(info->reg_type) == PTR_TO_BTF_ID) { + if (info->ref_obj_id && + !find_reference_state(env->cur_state, info->ref_obj_id)) { + verbose(env, "invalid bpf_context access off=%d. 
Reference may already be released\n", + off); + return -EACCES; + } } else { - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size; } /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -6020,6 +4969,34 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } +static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, + int off, int access_size, enum bpf_access_type t, + struct bpf_insn_access_aux *info) +{ + /* + * Program types that don't rewrite ctx accesses can safely + * dereference ctx pointers with fixed offsets. + */ + bool var_off_ok = is_var_ctx_off_allowed(env->prog); + bool fixed_off_ok = !env->ops->convert_ctx_access; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; + int err; + + if (var_off_ok) + err = check_mem_region_access(env, regno, off, access_size, U16_MAX, false); + else + err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok); + if (err) + return err; + off += reg->umax_value; + + err = __check_ctx_access(env, insn_idx, off, access_size, t, info); + if (err) + verbose_linfo(env, insn_idx, "; "); + return err; +} + static int check_flow_keys_access(struct bpf_verifier_env *env, int off, int size) { @@ -6036,8 +5013,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int size, enum bpf_access_type t) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = &regs[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_insn_access_aux info = {}; bool valid; @@ -6118,6 +5094,26 @@ static bool is_arena_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_ARENA; } +/* Return false if @regno contains a pointer whose type isn't supported for + * atomic instruction @insn.
+ */ +static bool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno, + struct bpf_insn *insn) +{ + if (is_ctx_reg(env, regno)) + return false; + if (is_pkt_reg(env, regno)) + return false; + if (is_flow_key_reg(env, regno)) + return false; + if (is_sk_reg(env, regno)) + return false; + if (is_arena_reg(env, regno)) + return bpf_jit_supports_insn(insn, true); + + return true; +} + static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #ifdef CONFIG_NET [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], @@ -6182,14 +5178,14 @@ static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, */ ip_align = 2; - reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off)); + reg_off = tnum_add(reg->var_off, tnum_const(ip_align + off)); if (!tnum_is_aligned(reg_off, size)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "misaligned packet access off %d+%s+%d+%d size %d\n", - ip_align, tn_buf, reg->off, off, size); + "misaligned packet access off %d+%s+%d size %d\n", + ip_align, tn_buf, off, size); return -EACCES; } @@ -6207,13 +5203,13 @@ static int check_generic_ptr_alignment(struct bpf_verifier_env *env, if (!strict || size == 1) return 0; - reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off)); + reg_off = tnum_add(reg->var_off, tnum_const(off)); if (!tnum_is_aligned(reg_off, size)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", - pointer_desc, tn_buf, reg->off, off, size); + verbose(env, "misaligned %saccess off %s+%d size %d\n", + pointer_desc, tn_buf, off, size); return -EACCES; } @@ -6242,6 +5238,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; + if (reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY) + strict = true; break; case PTR_TO_CTX: pointer_desc = "context "; @@ -6314,22 +5312,30 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int 
stack_depth) return round_up(max_t(u32, stack_depth, 1), 32); } +/* temporary state used for call frame depth calculation */ +struct bpf_subprog_call_depth_info { + int ret_insn; /* caller instruction where we return to. */ + int caller; /* caller subprogram idx */ + int frame; /* # of consecutive static call stack frames on top of stack */ +}; + /* starting from main bpf function walk all instructions of the function * and recursively walk all callees that given function can call. * Ignore jump and exit insns. - * Since recursion is prevented by check_cfg() this algorithm - * only needs a local stack of MAX_CALL_FRAMES to remember callsites */ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx, + struct bpf_subprog_call_depth_info *dinfo, bool priv_stack_supported) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; int depth = 0, frame = 0, i, subprog_end, subprog_depth; bool tail_call_reachable = false; - int ret_insn[MAX_CALL_FRAMES]; - int ret_prog[MAX_CALL_FRAMES]; - int j; + int total; + int tmp; + + /* no caller idx */ + dinfo[idx].caller = -1; i = subprog[idx].start; if (!priv_stack_supported) @@ -6381,8 +5387,12 @@ process_func: } else { depth += subprog_depth; if (depth > MAX_BPF_STACK) { + total = 0; + for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) + total++; + verbose(env, "combined stack size of %d calls is %d. 
Too large\n", - frame + 1, depth); + total, depth); return -EACCES; } } @@ -6394,12 +5404,10 @@ continue_func: if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) { bool err = false; - if (!is_bpf_throw_kfunc(insn + i)) + if (!bpf_is_throw_kfunc(insn + i)) continue; - if (subprog[idx].is_cb) - err = true; - for (int c = 0; c < frame && !err; c++) { - if (subprog[ret_prog[c]].is_cb) { + for (tmp = idx; tmp >= 0 && !err; tmp = dinfo[tmp].caller) { + if (subprog[tmp].is_cb) { err = true; break; } @@ -6415,31 +5423,35 @@ continue_func: if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ - ret_insn[frame] = i + 1; - ret_prog[frame] = idx; /* find the callee */ next_insn = i + insn[i].imm + 1; - sidx = find_subprog(env, next_insn); - if (sidx < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - next_insn); + sidx = bpf_find_subprog(env, next_insn); + if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn)) return -EFAULT; - } if (subprog[sidx].is_async_cb) { if (subprog[sidx].has_tail_call) { - verbose(env, "verifier bug. 
subprog has tail_call and async cb\n"); + verifier_bug(env, "subprog has tail_call and async cb"); return -EFAULT; } /* async callbacks don't increase bpf prog stack size unless called directly */ if (!bpf_pseudo_call(insn + i)) continue; if (subprog[sidx].is_exception_cb) { - verbose(env, "insn %d cannot call exception cb directly\n", i); + verbose(env, "insn %d cannot call exception cb directly", i); return -EINVAL; } } + + /* store caller info for after we return from callee */ + dinfo[idx].frame = frame; + dinfo[idx].ret_insn = i + 1; + + /* push caller idx into callee's dinfo */ + dinfo[sidx].caller = idx; + i = next_insn; + idx = sidx; if (!priv_stack_supported) subprog[idx].priv_stack_mode = NO_PRIV_STACK; @@ -6447,7 +5459,7 @@ continue_func: if (subprog[idx].has_tail_call) tail_call_reachable = true; - frame++; + frame = bpf_subprog_is_global(env, idx) ? 0 : frame + 1; if (frame >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep !\n", frame); @@ -6461,12 +5473,12 @@ continue_func: * tail call counter throughout bpf2bpf calls combined with tailcalls */ if (tail_call_reachable) - for (j = 0; j < frame; j++) { - if (subprog[ret_prog[j]].is_exception_cb) { + for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) { + if (subprog[tmp].is_exception_cb) { verbose(env, "cannot tail call within exception cb\n"); return -EINVAL; } - subprog[ret_prog[j]].tail_call_reachable = true; + subprog[tmp].tail_call_reachable = true; } if (subprog[0].tail_call_reachable) env->prog->aux->tail_call_reachable = true; @@ -6474,23 +5486,33 @@ continue_func: /* end of for() loop means the last insn of the 'subprog' * was reached. 
Doesn't matter whether it was JA or EXIT */ - if (frame == 0) + if (frame == 0 && dinfo[idx].caller < 0) return 0; if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE) depth -= round_up_stack_depth(env, subprog[idx].stack_depth); - frame--; - i = ret_insn[frame]; - idx = ret_prog[frame]; + + /* pop caller idx from callee */ + idx = dinfo[idx].caller; + + /* retrieve caller state from its frame */ + frame = dinfo[idx].frame; + i = dinfo[idx].ret_insn; + goto continue_func; } static int check_max_stack_depth(struct bpf_verifier_env *env) { enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN; + struct bpf_subprog_call_depth_info *dinfo; struct bpf_subprog_info *si = env->subprog_info; bool priv_stack_supported; int ret; + dinfo = kvcalloc(env->subprog_cnt, sizeof(*dinfo), GFP_KERNEL_ACCOUNT); + if (!dinfo) + return -ENOMEM; + for (int i = 0; i < env->subprog_cnt; i++) { if (si[i].has_tail_call) { priv_stack_mode = NO_PRIV_STACK; @@ -6512,9 +5534,12 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) for (int i = env->subprog_cnt - 1; i >= 0; i--) { if (!i || si[i].is_async_cb) { priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE; - ret = check_max_stack_depth_subprog(env, i, priv_stack_supported); - if (ret < 0) + ret = check_max_stack_depth_subprog(env, i, dinfo, + priv_stack_supported); + if (ret < 0) { + kvfree(dinfo); return ret; + } } } @@ -6525,24 +5550,10 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) } } - return 0; -} - -#ifndef CONFIG_BPF_JIT_ALWAYS_ON -static int get_callee_stack_depth(struct bpf_verifier_env *env, - const struct bpf_insn *insn, int idx) -{ - int start = idx + insn->imm + 1, subprog; + kvfree(dinfo); - subprog = find_subprog(env, start); - if (subprog < 0) { - WARN_ONCE(1, "verifier bug. 
No program starts at insn %d\n", - start); - return -EFAULT; - } - return env->subprog_info[subprog].stack_depth; + return 0; } -#endif static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, @@ -6555,7 +5566,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env, regno, buf_info, off, size); return -EACCES; } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -6578,8 +5589,8 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, if (err) return err; - if (off + size > env->prog->aux->max_tp_access) - env->prog->aux->max_tp_access = off + size; + env->prog->aux->max_tp_access = max(reg->var_off.value + off + size, + env->prog->aux->max_tp_access); return 0; } @@ -6597,8 +5608,7 @@ static int check_buffer_access(struct bpf_verifier_env *env, if (err) return err; - if (off + size > *max_access) - *max_access = off + size; + *max_access = max(reg->var_off.value + off + size, *max_access); return 0; } @@ -6785,7 +5795,7 @@ out: set_sext32_default_val(reg, size); } -static bool bpf_map_is_rdonly(const struct bpf_map *map) +bool bpf_map_is_rdonly(const struct bpf_map *map) { /* A map is considered read-only if the following condition are true: * @@ -6805,8 +5815,8 @@ static bool bpf_map_is_rdonly(const struct bpf_map *map) !bpf_map_write_active(map); } -static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, - bool is_ldsx) +int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, + bool is_ldsx) { void *ptr; u64 addr; @@ -6864,9 +5874,16 @@ BTF_TYPE_SAFE_RCU(struct css_set) { struct cgroup *dfl_cgrp; }; +BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) { + struct cgroup *cgroup; +}; + /* RCU trusted: these fields are trusted in RCU CS and can be NULL */ BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { struct file __rcu *exe_file; +#ifdef CONFIG_MEMCG + struct 
task_struct __rcu *owner; +#endif }; /* skb->sk, req->sk are not RCU protected, but we mark them as such @@ -6898,8 +5915,7 @@ BTF_TYPE_SAFE_TRUSTED(struct file) { struct inode *f_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct dentry) { - /* no negative dentry-s in places where bpf can see it */ +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) { struct inode *d_inode; }; @@ -6907,6 +5923,11 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) { struct sock *sk; }; +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) { + struct mm_struct *vm_mm; + struct file *vm_file; +}; + static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) @@ -6914,6 +5935,7 @@ static bool type_is_rcu(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu"); } @@ -6937,7 +5959,6 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } @@ -6947,6 +5968,8 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); @@ -6978,13 +6001,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, tname); return -EINVAL; } - if 
(off < 0) { - verbose(env, - "R%d is ptr_%s invalid negative access: off=%d\n", - regno, tname, off); - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -6994,6 +6012,15 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + off += reg->var_off.value; + + if (off < 0) { + verbose(env, + "R%d is ptr_%s invalid negative access: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (reg->type & MEM_USER) { verbose(env, "R%d is ptr_%s access user memory: off=%d\n", @@ -7010,7 +6037,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) { if (!btf_is_kernel(reg->btf)) { - verbose(env, "verifier internal error: reg->btf must be kernel btf\n"); + verifier_bug(env, "reg->btf must be kernel btf"); return -EFAULT; } ret = env->ops->btf_struct_access(&env->log, reg, off, size); @@ -7026,7 +6053,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && !(reg->type & MEM_RCU) && !reg->ref_obj_id) { - verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); + verifier_bug(env, "ref_obj_id for allocated object must be non-zero"); return -EFAULT; } @@ -7096,8 +6123,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, clear_trusted_flags(&flag); } - if (atype == BPF_READ && value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); + if (atype == BPF_READ && value_regno >= 0) { + ret = mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); + if (ret < 0) + return ret; + } return 0; } @@ -7151,13 +6181,19 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, /* Simulate access to a PTR_TO_BTF_ID */ memset(&map_reg, 0, 
sizeof(map_reg)); - mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); + ret = mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, + btf_vmlinux, *map->ops->map_btf_id, 0); + if (ret < 0) + return ret; ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL); if (ret < 0) return ret; - if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); + if (value_regno >= 0) { + ret = mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); + if (ret < 0) + return ret; + } return 0; } @@ -7195,9 +6231,8 @@ static int check_stack_access_within_bounds( int regno, int off, int access_size, enum bpf_access_type type) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = regs + regno; - struct bpf_func_state *state = func(env, reg); + struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_func_state *state = bpf_func(env, reg); s64 min_off, max_off; int err; char *err_extra; @@ -7232,8 +6267,8 @@ static int check_stack_access_within_bounds( if (err) { if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid%s stack R%d off=%d size=%d\n", - err_extra, regno, off, access_size); + verbose(env, "invalid%s stack R%d off=%lld size=%d\n", + err_extra, regno, min_off, access_size); } else { char tn_buf[48]; @@ -7261,6 +6296,23 @@ static bool get_func_retval_range(struct bpf_prog *prog, return false; } +static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val) +{ + struct bpf_reg_state fake_reg; + + if (!val) + return; + + fake_reg.type = SCALAR_VALUE; + __mark_reg_known(&fake_reg, val); + + scalar32_min_max_add(dst_reg, &fake_reg); + scalar_min_max_add(dst_reg, &fake_reg); + dst_reg->var_off = tnum_add(dst_reg->var_off, fake_reg.var_off); + + reg_bounds_sync(dst_reg); +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * 
if t==read, value_regno is a register which will receive the value from memory @@ -7279,14 +6331,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (size < 0) return size; - /* alignment checks will add in reg->off themselves */ err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); if (err) return err; - /* for access checks, reg->off is just part of off */ - off += reg->off; - if (reg->type == PTR_TO_MAP_KEY) { if (t == BPF_WRITE) { verbose(env, "write to change key R%d not allowed\n", regno); @@ -7321,10 +6369,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; - /* if map is read-only, track its contents as scalars */ + /* + * If map is read-only, track its contents as scalars, + * unless it is an insn array (see the special case below) + */ if (tnum_is_const(reg->var_off) && bpf_map_is_rdonly(map) && - map->ops->map_direct_value_addr) { + map->ops->map_direct_value_addr && + map->map_type != BPF_MAP_TYPE_INSN_ARRAY) { int map_off = off + reg->var_off.value; u64 val = 0; @@ -7335,12 +6387,22 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regs[value_regno].type = SCALAR_VALUE; __mark_reg_known(&regs[value_regno], val); + } else if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + if (bpf_size != BPF_DW) { + verbose(env, "Invalid read of %d bytes from insn_array\n", + size); + return -EACCES; + } + copy_register_state(&regs[value_regno], reg); + add_scalar_to_reg(&regs[value_regno], off); + regs[value_regno].type = PTR_TO_INSN; } else { mark_reg_unknown(env, regs, value_regno); } } } else if (base_type(reg->type) == PTR_TO_MEM) { bool rdonly_mem = type_is_rdonly_mem(reg->type); + bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED); if (type_may_be_null(reg->type)) { verbose(env, "R%d invalid mem access '%s'\n", regno, @@ -7360,16 +6422,22 @@ static int
check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_mem_region_access(env, regno, off, size, - reg->mem_size, false); + /* + * Accesses to untrusted PTR_TO_MEM are done through probe + * instructions, hence no need to check bounds in that case. + */ + if (!rdonly_untrusted) + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { - bool is_retval = false; + struct bpf_insn_access_aux info = { + .reg_type = SCALAR_VALUE, + .is_ldsx = is_ldsx, + .log = &env->log, + }; struct bpf_retval_range range; - enum bpf_reg_type reg_type = SCALAR_VALUE; - struct btf *btf = NULL; - u32 btf_id = 0; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -7377,21 +6445,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ptr_off_reg(env, reg, regno); - if (err < 0) - return err; - - err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, - &btf_id, &is_retval, is_ldsx); - if (err) - verbose_linfo(env, insn_idx, "; "); + err = check_ctx_access(env, insn_idx, regno, off, size, t, &info); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero.
*/ - if (reg_type == SCALAR_VALUE) { - if (is_retval && get_func_retval_range(env->prog, &range)) { + if (info.reg_type == SCALAR_VALUE) { + if (info.is_retval && get_func_retval_range(env->prog, &range)) { err = __mark_reg_s32_range(env, regs, value_regno, range.minval, range.maxval); if (err) @@ -7402,7 +6463,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (type_may_be_null(reg_type)) + if (type_may_be_null(info.reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the @@ -7410,12 +6471,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (base_type(reg_type) == PTR_TO_BTF_ID) { - regs[value_regno].btf = btf; - regs[value_regno].btf_id = btf_id; + if (base_type(info.reg_type) == PTR_TO_BTF_ID) { + regs[value_regno].btf = info.btf; + regs[value_regno].btf_id = info.btf_id; + regs[value_regno].ref_obj_id = info.ref_obj_id; } } - regs[value_regno].type = reg_type; + regs[value_regno].type = info.reg_type; } } else if (reg->type == PTR_TO_STACK) { @@ -7475,7 +6537,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); - } else if (base_type(reg->type) == PTR_TO_BUF) { + } else if (base_type(reg->type) == PTR_TO_BUF && + !type_may_be_null(reg->type)) { bool rdonly_mem = type_is_rdonly_mem(reg->type); u32 *max_access; @@ -7518,27 +6581,72 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch); -static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int 
check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once, bool is_ldsx, + bool allow_trust_mismatch, const char *ctx) { - int load_reg; + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type src_reg_type; int err; - switch (insn->imm) { - case BPF_ADD: - case BPF_ADD | BPF_FETCH: - case BPF_AND: - case BPF_AND | BPF_FETCH: - case BPF_OR: - case BPF_OR | BPF_FETCH: - case BPF_XOR: - case BPF_XOR | BPF_FETCH: - case BPF_XCHG: - case BPF_CMPXCHG: - break; - default: - verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); - return -EINVAL; - } + /* check src operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check dst operand */ + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + src_reg_type = regs[insn->src_reg].type; + + /* Check if (src_reg + off) is readable. The state of dst_reg will be + * updated by this call. + */ + err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, + strict_alignment_once, is_ldsx); + err = err ?: save_aux_ptr_type(env, src_reg_type, + allow_trust_mismatch); + err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], ctx); + + return err; +} + +static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once) +{ + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type dst_reg_type; + int err; + + /* check src1 operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; + + dst_reg_type = regs[insn->dst_reg].type; + + /* Check if (dst_reg + off) is writeable.
*/ + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, + strict_alignment_once, false); + err = err ?: save_aux_ptr_type(env, dst_reg_type, false); + + return err; +} + +static int check_atomic_rmw(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int load_reg; + int err; if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { verbose(env, "invalid atomic operand size\n"); @@ -7574,11 +6682,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return -EACCES; } - if (is_ctx_reg(env, insn->dst_reg) || - is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg) || - is_sk_reg(env, insn->dst_reg) || - (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) { + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); @@ -7605,12 +6709,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, load_reg, - true, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, load_reg, true, false); if (err) return err; @@ -7620,20 +6724,91 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return err; } /* Check whether we can write into the same memory. 
*/ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; return 0; } +static int check_atomic_load(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int err; + + err = check_load_mem(env, insn, true, false, false, "atomic_load"); + if (err) + return err; + + if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) { + verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n", + insn->src_reg, + reg_type_str(env, reg_state(env, insn->src_reg)->type)); + return -EACCES; + } + + return 0; +} + +static int check_atomic_store(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int err; + + err = check_store_reg(env, insn, true); + if (err) + return err; + + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", + insn->dst_reg, + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); + return -EACCES; + } + + return 0; +} + +static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: + return check_atomic_rmw(env, insn); + case BPF_LOAD_ACQ: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit load-acquires are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_load(env, insn); + case BPF_STORE_REL: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit store-releases are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_store(env, insn); + default: + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", + insn->imm); + return -EINVAL; + } +} + /* 
When register 'regno' is used to read the stack (either directly or through * a helper function) make sure that it's within stack boundary and, depending * on the access type and privileges, that all elements of the stack are * initialized. * - * 'off' includes 'regno->off', but not its dynamic part (if any). - * * All registers that have been spilled on the stack in the slots within the * read offsets are marked as read. */ @@ -7643,21 +6818,27 @@ static int check_stack_range_initialized( enum bpf_access_type type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int err, min_off, max_off, i, j, slot, spi; /* Some accesses can write anything into the stack, others are * read-only. */ - bool clobber = false; + bool clobber = type == BPF_WRITE; + /* + * Negative access_size signals global subprog/kfunc arg check where + * STACK_POISON slots are acceptable. static stack liveness + * might have determined that subprog doesn't read them, + * but BTF based global subprog validation isn't accurate enough. 
+ */ + bool allow_poison = access_size < 0 || clobber; + + access_size = abs(access_size); if (access_size == 0 && !zero_size_allowed) { verbose(env, "invalid zero-sized read\n"); return -EACCES; } - if (type == BPF_WRITE) - clobber = true; - err = check_stack_access_within_bounds(env, regno, off, access_size, type); if (err) return err; @@ -7709,7 +6890,7 @@ static int check_stack_range_initialized( for (i = min_off; i < max_off + access_size; i++) { int stack_off = -i - 1; - spi = __get_spi(i); + spi = bpf_get_spi(i); /* raw_mode may write past allocated_stack */ if (state->allocated_stack <= stack_off) continue; @@ -7729,7 +6910,7 @@ static int check_stack_range_initialized( slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) { - verbose(env, "verifier bug: allocated_stack too small\n"); + verbose(env, "allocated_stack too small\n"); return -EFAULT; } @@ -7745,7 +6926,7 @@ static int check_stack_range_initialized( goto mark; } - if (is_spilled_reg(&state->stack[spi]) && + if (bpf_is_spilled_reg(&state->stack[spi]) && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { if (clobber) { @@ -7756,7 +6937,12 @@ static int check_stack_range_initialized( goto mark; } - if (tnum_is_const(reg->var_off)) { + if (*stype == STACK_POISON) { + if (allow_poison) + goto mark; + verbose(env, "reading from stack R%d off %d+%d size %d, slot poisoned by dead code elimination\n", + regno, min_off, i - min_off, access_size); + } else if (tnum_is_const(reg->var_off)) { verbose(env, "invalid read from stack R%d off %d+%d size %d\n", regno, min_off, i - min_off, access_size); } else { @@ -7768,17 +6954,7 @@ static int check_stack_range_initialized( } return -EACCES; mark: - /* reading any byte out of 8-byte 'spill_slot' will cause - * the whole slot to be marked as 'read' - */ - mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent, - REG_LIVE_READ64); - /* We do not set REG_LIVE_WRITTEN for stack 
slot, as we can not - * be sure that whether stack slot is written to or not. Hence, - * we must still conservatively propagate reads upwards even if - * helper may write to the entire memory range. - */ + ; } return 0; } @@ -7794,7 +6970,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, regno, reg->off, access_size, + return check_packet_access(env, regno, 0, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: if (access_type == BPF_WRITE) { @@ -7802,12 +6978,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, reg_type_str(env, reg->type)); return -EACCES; } - return check_mem_region_access(env, regno, reg->off, access_size, + return check_mem_region_access(env, regno, 0, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, reg->off, access_size, access_type)) + if (check_map_access_type(env, regno, 0, access_size, access_type)) return -EACCES; - return check_map_access(env, regno, reg->off, access_size, + return check_map_access(env, regno, 0, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { @@ -7817,7 +6993,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; } } - return check_mem_region_access(env, regno, reg->off, + return check_mem_region_access(env, regno, 0, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: @@ -7832,39 +7008,33 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } else { max_access = &env->prog->aux->max_rdwr_access; } - return check_buffer_access(env, reg, regno, reg->off, + return check_buffer_access(env, reg, regno, 0, access_size, zero_size_allowed, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, - regno, reg->off, access_size, + regno, 0, 
access_size, zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: - return check_ptr_to_btf_access(env, regs, regno, reg->off, + return check_ptr_to_btf_access(env, regs, regno, 0, access_size, BPF_READ, -1); case PTR_TO_CTX: - /* in case the function doesn't know how to access the context, - * (because we are in a program of type SYSCALL for example), we - * can not statically check its size. - * Dynamically check it now. - */ - if (!env->ops->convert_ctx_access) { - int offset = access_size - 1; - - /* Allow zero-byte read from PTR_TO_CTX */ - if (access_size == 0) - return zero_size_allowed ? 0 : -EACCES; - - return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, - access_type, -1, false, false); + /* Only permit reading or writing syscall context using helper calls. */ + if (is_var_ctx_off_allowed(env->prog)) { + int err = check_mem_region_access(env, regno, 0, access_size, U16_MAX, + zero_size_allowed); + if (err) + return err; + if (env->prog->aux->max_ctx_offset < reg->umax_value + access_size) + env->prog->aux->max_ctx_offset = reg->umax_value + access_size; + return 0; } - fallthrough; default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && - register_is_null(reg)) + bpf_register_is_null(reg)) return 0; verbose(env, "R%d type=%s ", regno, @@ -7937,7 +7107,7 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg struct bpf_reg_state saved_reg; int err; - if (register_is_null(reg)) + if (bpf_register_is_null(reg)) return 0; /* Assuming that the register contains a value check if the memory @@ -7949,8 +7119,10 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg mark_ptr_not_null_reg(reg); } - err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL); - err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL); + int size = base_type(reg->type) == 
PTR_TO_STACK ? -(int)mem_size : mem_size; + + err = check_helper_mem_access(env, regno, size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, regno, size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7985,6 +7157,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg return err; } +enum { + PROCESS_SPIN_LOCK = (1 << 0), + PROCESS_RES_LOCK = (1 << 1), + PROCESS_LOCK_IRQ = (1 << 2), +}; + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. @@ -8007,30 +7185,33 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. */ -static int process_spin_lock(struct bpf_verifier_env *env, int regno, - bool is_lock) +static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) { - struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; + const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); + bool is_irq = flags & PROCESS_LOCK_IRQ; u64 val = reg->var_off.value; struct bpf_map *map = NULL; struct btf *btf = NULL; struct btf_record *rec; + u32 spin_lock_off; int err; if (!is_const) { verbose(env, - "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset.
%s_lock has to be at the constant offset\n", + regno, lock_str); return -EINVAL; } if (reg->type == PTR_TO_MAP_VALUE) { map = reg->map_ptr; if (!map->btf) { verbose(env, - "map '%s' has to have BTF in order to use bpf_spin_lock\n", - map->name); + "map '%s' has to have BTF in order to use %s_lock\n", + map->name, lock_str); return -EINVAL; } } else { @@ -8038,36 +7219,53 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, } rec = reg_btf_record(reg); - if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) { - verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local", - map ? map->name : "kptr"); + if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) { + verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local", + map ? map->name : "kptr", lock_str); return -EINVAL; } - if (rec->spin_lock_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", - val + reg->off, rec->spin_lock_off); + spin_lock_off = is_res_lock ? 
rec->res_spin_lock_off : rec->spin_lock_off; + if (spin_lock_off != val) { + verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n", + val, lock_str, spin_lock_off); return -EINVAL; } if (is_lock) { void *ptr; + int type; if (map) ptr = map; else ptr = btf; - if (cur->active_locks) { - verbose(env, - "Locking two bpf_spin_locks are not allowed\n"); - return -EINVAL; + if (!is_res_lock && cur->active_locks) { + if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) { + verbose(env, + "Locking two bpf_spin_locks are not allowed\n"); + return -EINVAL; + } + } else if (is_res_lock && cur->active_locks) { + if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) { + verbose(env, "Acquiring the same lock again, AA deadlock detected\n"); + return -EINVAL; + } } - err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr); + + if (is_res_lock && is_irq) + type = REF_TYPE_RES_LOCK_IRQ; + else if (is_res_lock) + type = REF_TYPE_RES_LOCK; + else + type = REF_TYPE_LOCK; + err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr); if (err < 0) { verbose(env, "Failed to acquire lock state\n"); return err; } } else { void *ptr; + int type; if (map) ptr = map; @@ -8075,12 +7273,26 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, ptr = btf; if (!cur->active_locks) { - verbose(env, "bpf_spin_unlock without taking a lock\n"); + verbose(env, "%s_unlock without taking a lock\n", lock_str); return -EINVAL; } - if (release_lock_state(env->cur_state, REF_TYPE_LOCK, reg->id, ptr)) { - verbose(env, "bpf_spin_unlock of different lock\n"); + if (is_res_lock && is_irq) + type = REF_TYPE_RES_LOCK_IRQ; + else if (is_res_lock) + type = REF_TYPE_RES_LOCK; + else + type = REF_TYPE_LOCK; + if (!find_lock_state(cur, type, reg->id, ptr)) { + verbose(env, "%s_unlock of different lock\n", lock_str); + return -EINVAL; + } + if (reg->id != cur->active_lock_id || ptr != cur->active_lock_ptr) { + 
verbose(env, "%s_unlock cannot be out of order\n", lock_str); + return -EINVAL; + } + if (release_lock_state(cur, type, reg->id, ptr)) { + verbose(env, "%s_unlock of different lock\n", lock_str); return -EINVAL; } @@ -8089,64 +7301,87 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, return 0; } -static int process_timer_func(struct bpf_verifier_env *env, int regno, - struct bpf_call_arg_meta *meta) +/* Check if @regno is a pointer to a specific field in a map value */ +static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, + enum btf_field_type field_type, + struct bpf_map_desc *map_desc) { - struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); bool is_const = tnum_is_const(reg->var_off); struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; + const char *struct_name = btf_field_type_name(field_type); + int field_off = -1; if (!is_const) { verbose(env, - "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset.
%s has to be at the constant offset\n", + regno, struct_name); return -EINVAL; } if (!map->btf) { - verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n", - map->name); + verbose(env, "map '%s' has to have BTF in order to use %s\n", map->name, + struct_name); + return -EINVAL; + } + if (!btf_record_has_field(map->record, field_type)) { + verbose(env, "map '%s' has no valid %s\n", map->name, struct_name); return -EINVAL; } - if (!btf_record_has_field(map->record, BPF_TIMER)) { - verbose(env, "map '%s' has no valid bpf_timer\n", map->name); + switch (field_type) { + case BPF_TIMER: + field_off = map->record->timer_off; + break; + case BPF_TASK_WORK: + field_off = map->record->task_work_off; + break; + case BPF_WORKQUEUE: + field_off = map->record->wq_off; + break; + default: + verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; } - if (map->record->timer_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", - val + reg->off, map->record->timer_off); + if (field_off != val) { + verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n", + val, struct_name, field_off); return -EINVAL; } - if (meta->map_ptr) { - verbose(env, "verifier bug. 
Two map pointers in a timer helper\n"); + if (map_desc->ptr) { + verifier_bug(env, "Two map pointers in a %s helper", struct_name); return -EFAULT; } - meta->map_uid = reg->map_uid; - meta->map_ptr = map; + map_desc->uid = reg->map_uid; + map_desc->ptr = map; return 0; } -static int process_wq_func(struct bpf_verifier_env *env, int regno, - struct bpf_kfunc_call_arg_meta *meta) +static int process_timer_func(struct bpf_verifier_env *env, int regno, + struct bpf_map_desc *map) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_map *map = reg->map_ptr; - u64 val = reg->var_off.value; - - if (map->record->wq_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n", - val + reg->off, map->record->wq_off); - return -EINVAL; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); + return -EOPNOTSUPP; } - meta->map.uid = reg->map_uid; - meta->map.ptr = map; - return 0; + return check_map_field_pointer(env, regno, BPF_TIMER, map); +} + +static int process_timer_helper(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + return process_timer_func(env, regno, &meta->map); +} + +static int process_timer_kfunc(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + return process_timer_func(env, regno, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct btf_field *kptr_field; struct bpf_map *map_ptr; struct btf_record *rec; @@ -8162,7 +7397,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } rec = map_ptr->record; - meta->map_ptr = map_ptr; + meta->map.ptr = map_ptr; } if (!tnum_is_const(reg->var_off)) { @@ -8177,7 +7412,7 @@ static int process_kptr_func(struct bpf_verifier_env 
*env, int regno, return -EINVAL; } - kptr_off = reg->off + reg->var_off.value; + kptr_off = reg->var_off.value; kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR); if (!kptr_field) { verbose(env, "off=%d doesn't point to kptr\n", kptr_off); @@ -8219,7 +7454,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) { - struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); int err; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { @@ -8233,7 +7468,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): */ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { - verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); + verifier_bug(env, "misconfigured dynptr helper type flags"); return -EFAULT; } @@ -8298,7 +7533,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); return state->stack[spi].spilled_ptr.ref_obj_id; } @@ -8313,10 +7548,6 @@ static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_ITER_NEW; } -static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_ITER_NEXT; -} static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta) { @@ -8339,7 +7570,7 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { - struct
bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); const struct btf_type *t; int spi, err, i, nr_slots, btf_id; @@ -8431,10 +7662,12 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, { struct bpf_verifier_state_list *sl; struct bpf_verifier_state *st; + struct list_head *pos, *head; /* Explored states are pushed in stack order, most recent states come first */ - sl = *explored_state(env, insn_idx); - for (; sl; sl = sl->next) { + head = bpf_explored_state(env, insn_idx); + list_for_each(pos, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); /* If st->branches != 0 state is a part of current DFS verification path, * hence cur & st for a loop. */ @@ -8447,20 +7680,24 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, return NULL; } -static void reset_idmap_scratch(struct bpf_verifier_env *env); -static bool regs_exact(const struct bpf_reg_state *rold, - const struct bpf_reg_state *rcur, - struct bpf_idmap *idmap); +/* + * Check if scalar registers are exact for the purpose of not widening.
+ * More lenient than regs_exact() + */ +static bool scalars_exact_for_widen(const struct bpf_reg_state *rold, + const struct bpf_reg_state *rcur) +{ + return !memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)); +} static void maybe_widen_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct bpf_idmap *idmap) + struct bpf_reg_state *rold, struct bpf_reg_state *rcur) { if (rold->type != SCALAR_VALUE) return; if (rold->type != rcur->type) return; - if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap)) + if (rold->precise || rcur->precise || scalars_exact_for_widen(rold, rcur)) return; __mark_reg_unknown(env, rcur); } @@ -8470,9 +7707,8 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) { struct bpf_func_state *fold, *fcur; - int i, fr; + int i, fr, num_slots; - reset_idmap_scratch(env); for (fr = old->curframe; fr >= 0; fr--) { fold = old->frame[fr]; fcur = cur->frame[fr]; @@ -8480,18 +7716,18 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) maybe_widen_reg(env, &fold->regs[i], - &fcur->regs[i], - &env->idmap_scratch); + &fcur->regs[i]); - for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) { - if (!is_spilled_reg(&fold->stack[i]) || - !is_spilled_reg(&fcur->stack[i])) + num_slots = min(fold->allocated_stack / BPF_REG_SIZE, + fcur->allocated_stack / BPF_REG_SIZE); + for (i = 0; i < num_slots; i++) { + if (!bpf_is_spilled_reg(&fold->stack[i]) || + !bpf_is_spilled_reg(&fcur->stack[i])) continue; maybe_widen_reg(env, &fold->stack[i].spilled_ptr, - &fcur->stack[i].spilled_ptr, - &env->idmap_scratch); + &fcur->stack[i].spilled_ptr); } } return 0; @@ -8597,8 +7833,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE && cur_iter->iter.state != BPF_ITER_STATE_DRAINED) { - verbose(env, "verifier internal error: unexpected 
iterator state %d (%s)\n", - cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); + verifier_bug(env, "unexpected iterator state %d (%s)", + cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); return -EFAULT; } @@ -8608,7 +7844,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, */ if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx || !same_callsites(cur_st->parent, cur_st)) { - verbose(env, "bug: bad parent state for iter next call"); + verifier_bug(env, "bad parent state for iter next call"); return -EFAULT; } /* Note cur_st->parent in the call below, it is necessary to skip @@ -8618,8 +7854,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, prev_st = find_prev_entry(env, cur_st->parent, insn_idx); /* branch out active iter state */ queued_st = push_stack(env, insn_idx + 1, insn_idx, false); - if (!queued_st) - return -ENOMEM; + if (IS_ERR(queued_st)) + return PTR_ERR(queued_st); queued_iter = get_iter_from_state(queued_st, meta); queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; @@ -8665,13 +7901,13 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_arg_type *arg_type) { - if (!meta->map_ptr) { + if (!meta->map.ptr) { /* kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->type\n"); - return -EACCES; + verifier_bug(env, "invalid map_ptr to access map->type"); + return -EFAULT; } - switch (meta->map_ptr->map_type) { + switch (meta->map.ptr->map_type) { case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: if (*arg_type == ARG_PTR_TO_MAP_VALUE) { @@ -8730,6 +7966,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MEM | MEM_RINGBUF, PTR_TO_BUF, PTR_TO_BTF_ID | PTR_TRUSTED, + PTR_TO_CTX, }, }; @@ -8766,7 +8003,9 @@ static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } static const struct bpf_reg_types kptr_xchg_dest_types = { .types = { 
PTR_TO_MAP_VALUE, - PTR_TO_BTF_ID | MEM_ALLOC + PTR_TO_BTF_ID | MEM_ALLOC, + PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF, + PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU, } }; static const struct bpf_reg_types dynptr_types = { @@ -8807,14 +8046,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, const u32 *arg_btf_id, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_reg_type expected, type = reg->type; const struct bpf_reg_types *compatible; - int i, j; + int i, j, err; compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { - verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); + verifier_bug(env, "unsupported arg type %d", arg_type); return -EFAULT; } @@ -8896,7 +8135,7 @@ found: if (!arg_btf_id) { if (!compatible->btf_id) { - verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); + verifier_bug(env, "missing arg compatible BTF ID"); return -EFAULT; } arg_btf_id = compatible->btf_id; @@ -8913,8 +8152,12 @@ found: return -EACCES; } - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id, + err = __check_ptr_off_reg(env, reg, regno, true); + if (err) + return err; + + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, + reg->var_off.value, btf_vmlinux, *arg_btf_id, strict_type_match)) { verbose(env, "R%d is of type %s but %s is expected\n", regno, btf_type_name(reg->btf, reg->btf_id), @@ -8926,9 +8169,11 @@ found: } case PTR_TO_BTF_ID | MEM_ALLOC: case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC: + case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF: + case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && meta->func_id != BPF_FUNC_kptr_xchg) { - verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); + verifier_bug(env,
"unimplemented handling of MEM_ALLOC"); return -EFAULT; } /* Check if local kptr in src arg matches kptr in dst arg */ @@ -8943,7 +8188,7 @@ found: /* Handled by helper specific checks */ break; default: - verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n"); + verifier_bug(env, "invalid PTR_TO_BTF_ID register for type match"); return -EFAULT; } return 0; @@ -8992,12 +8237,11 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * because fixed_off_ok is false, but checking here allows us * to give the user a better error message. */ - if (reg->off) { + if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) { verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", regno); return -EINVAL; } - return __check_ptr_off_reg(env, reg, regno, false); } switch (type) { @@ -9032,6 +8276,16 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * still need to do checks instead of returning. */ return __check_ptr_off_reg(env, reg, regno, true); + case PTR_TO_CTX: + /* + * Allow fixed and variable offsets for syscall context, but + * only when the argument is passed as memory, not ctx, + * otherwise we may get modified ctx in tail called programs and + * global subprogs (that may act as extension prog hooks). 
+ */ + if (arg_type != ARG_PTR_TO_CTX && is_var_ctx_off_allowed(env->prog)) + return 0; + fallthrough; default: return __check_ptr_off_reg(env, reg, regno, false); } @@ -9061,7 +8315,7 @@ static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env, static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi; if (reg->type == CONST_PTR_TO_DYNPTR) @@ -9074,7 +8328,7 @@ static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi; if (reg->type == CONST_PTR_TO_DYNPTR) @@ -9088,13 +8342,13 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi; if (reg->type == CONST_PTR_TO_DYNPTR) return reg->dynptr.type; - spi = __get_spi(reg->off); + spi = bpf_get_spi(reg->var_off.value); if (spi < 0) { verbose(env, "verifier internal error: invalid spi when querying dynptr type\n"); return BPF_DYNPTR_TYPE_INVALID; @@ -9115,6 +8369,11 @@ static int check_reg_const_str(struct bpf_verifier_env *env, if (reg->type != PTR_TO_MAP_VALUE) return -EINVAL; + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + verbose(env, "R%d points to insn_array map which cannot be used as const string\n", regno); + return -EACCES; + } + if (!bpf_map_is_rdonly(map)) { verbose(env, "R%d does not point to a readonly map'\n", regno); return -EACCES; @@ -9130,13 +8389,13 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EACCES; } - err = check_map_access(env, regno, reg->off, - map->value_size - 
reg->off, false, + err = check_map_access(env, regno, 0, + map->value_size - reg->var_off.value, false, ACCESS_HELPER); if (err) return err; - map_off = reg->off + reg->var_off.value; + map_off = reg->var_off.value; err = map->ops->map_direct_value_addr(map, &map_addr, map_off); if (err) { verbose(env, "direct value access on string failed\n"); @@ -9157,7 +8416,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, u32 key_size, s64 *value) { - struct bpf_func_state *state = func(env, key); + struct bpf_func_state *state = bpf_func(env, key); struct bpf_reg_state *reg; int slot, spi, off; int spill_size = 0; @@ -9173,7 +8432,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, if (!tnum_is_const(key->var_off)) return -EOPNOTSUPP; - stack_off = key->off + key->var_off.value; + stack_off = key->var_off.value; slot = -stack_off - 1; spi = slot / BPF_REG_SIZE; off = slot % BPF_REG_SIZE; @@ -9188,7 +8447,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, } /* Check that stack contains a scalar spill of expected size */ - if (!is_spilled_scalar_reg(&state->stack[spi])) + if (!bpf_is_spilled_scalar_reg(&state->stack[spi])) return -EOPNOTSUPP; for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--) spill_size++; @@ -9203,8 +8462,8 @@ static int get_constant_map_key(struct bpf_verifier_env *env, /* We are relying on a constant value. So mark as precise * to prevent pruning on it. 
*/ - bt_set_frame_slot(&env->bt, key->frameno, spi); - err = mark_chain_precision_batch(env); + bpf_bt_set_frame_slot(&env->bt, key->frameno, spi); + err = mark_chain_precision_batch(env, env->cur_state); if (err < 0) return err; @@ -9220,7 +8479,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, int insn_idx) { u32 regno = BPF_REG_1 + arg; - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_arg_type arg_type = fn->arg_type[arg]; enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; @@ -9255,7 +8514,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; } - if (register_is_null(reg) && type_may_be_null(arg_type)) + if (bpf_register_is_null(reg) && type_may_be_null(arg_type)) /* A NULL register has a SCALAR_VALUE type, so skip * type checking. */ @@ -9277,7 +8536,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, skip_type_check: if (arg_type_is_release(arg_type)) { if (arg_type_is_dynptr(arg_type)) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi; /* Only dynptr created on stack can be released, thus @@ -9295,13 +8554,13 @@ skip_type_check: verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL; } - } else if (!reg->ref_obj_id && !register_is_null(reg)) { + } else if (!reg->ref_obj_id && !bpf_register_is_null(reg)) { verbose(env, "R%d must be referenced when passed to release function\n", regno); return -EINVAL; } if (meta->release_regno) { - verbose(env, "verifier internal error: more than one release argument\n"); + verifier_bug(env, "more than one release argument"); return -EFAULT; } meta->release_regno = regno; @@ -9309,10 +8568,10 @@ skip_type_check: if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { if (meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + 
verbose(env, "more than one arg with ref_obj_id R%d %u %u", regno, reg->ref_obj_id, meta->ref_obj_id); - return -EFAULT; + return -EACCES; } meta->ref_obj_id = reg->ref_obj_id; } @@ -9320,7 +8579,7 @@ skip_type_check: switch (base_type(arg_type)) { case ARG_CONST_MAP_PTR: /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ - if (meta->map_ptr) { + if (meta->map.ptr) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) @@ -9333,36 +8592,36 @@ skip_type_check: * * Comparing map_ptr is enough to distinguish normal and outer maps. */ - if (meta->map_ptr != reg->map_ptr || - meta->map_uid != reg->map_uid) { + if (meta->map.ptr != reg->map_ptr || + meta->map.uid != reg->map_uid) { verbose(env, "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", - meta->map_uid, reg->map_uid); + meta->map.uid, reg->map_uid); return -EINVAL; } } - meta->map_ptr = reg->map_ptr; - meta->map_uid = reg->map_uid; + meta->map.ptr = reg->map_ptr; + meta->map.uid = reg->map_uid; break; case ARG_PTR_TO_MAP_KEY: /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized */ - if (!meta->map_ptr) { + if (!meta->map.ptr) { /* in function declaration map_ptr must come before * map_key, so that it's verified and known before * we have to check map_key here. 
Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->key\n"); - return -EACCES; + verifier_bug(env, "invalid map_ptr to access map->key"); + return -EFAULT; } - key_size = meta->map_ptr->key_size; + key_size = meta->map.ptr->key_size; err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); if (err) return err; - if (can_elide_value_nullness(meta->map_ptr->map_type)) { + if (can_elide_value_nullness(meta->map.ptr->map_type)) { err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); if (err < 0) { meta->const_map_key = -1; @@ -9374,19 +8633,19 @@ skip_type_check: } break; case ARG_PTR_TO_MAP_VALUE: - if (type_may_be_null(arg_type) && register_is_null(reg)) + if (type_may_be_null(arg_type) && bpf_register_is_null(reg)) return 0; /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ - if (!meta->map_ptr) { + if (!meta->map.ptr) { /* kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->value\n"); - return -EACCES; + verifier_bug(env, "invalid map_ptr to access map->value"); + return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, + err = check_helper_mem_access(env, regno, meta->map.ptr->value_size, arg_type & MEM_WRITE ? 
BPF_WRITE : BPF_READ, false, meta); break; @@ -9404,20 +8663,20 @@ skip_type_check: return -EACCES; } if (meta->func_id == BPF_FUNC_spin_lock) { - err = process_spin_lock(env, regno, true); + err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); if (err) return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - err = process_spin_lock(env, regno, false); + err = process_spin_lock(env, regno, 0); if (err) return err; } else { - verbose(env, "verifier internal error\n"); + verifier_bug(env, "spin lock arg on unexpected helper"); return -EFAULT; } break; case ARG_PTR_TO_TIMER: - err = process_timer_func(env, regno, meta); + err = process_timer_helper(env, regno, meta); if (err) return err; break; @@ -9522,7 +8781,7 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) return false; } -static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env) +bool bpf_allow_tail_call_in_subprogs(struct bpf_verifier_env *env) { return env->prog->jit_requested && bpf_jit_supports_subprog_tailcalls(); @@ -9656,6 +8915,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_INSN_ARRAY: + goto error; default: break; } @@ -9665,8 +8926,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; - if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) { - verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); + if (env->subprog_cnt > 1 && !bpf_allow_tail_call_in_subprogs(env)) { + verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n"); return -EINVAL; } break; @@ -9853,10 +9114,27 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) return true; } -static int check_func_proto(const struct bpf_func_proto *fn, int func_id) +static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto 
*fn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { + enum bpf_arg_type arg_type = fn->arg_type[i]; + + if (base_type(arg_type) != ARG_PTR_TO_MEM) + continue; + if (!(arg_type & (MEM_WRITE | MEM_RDONLY))) + return false; + } + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && + check_mem_arg_rw_flag_ok(fn) && check_btf_id_ok(fn) ? 0 : -EINVAL; } @@ -9960,7 +9238,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, /* after the call registers r0 - r5 were scratched */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, regs, caller_saved[i]); + bpf_mark_reg_not_init(env, ®s[caller_saved[i]]); __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK); } } @@ -9988,13 +9266,12 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls } if (state->frame[state->curframe + 1]) { - verbose(env, "verifier bug. Frame %d already allocated\n", - state->curframe + 1); + verifier_bug(env, "Frame %d already allocated", state->curframe + 1); return -EFAULT; } caller = state->frame[state->curframe]; - callee = kzalloc(sizeof(*callee), GFP_KERNEL); + callee = kzalloc_obj(*callee, GFP_KERNEL_ACCOUNT); if (!callee) return -ENOMEM; state->frame[state->curframe + 1] = callee; @@ -10049,8 +9326,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "R%d is not a scalar\n", regno); return -EINVAL; } + } else if (arg->arg_type & PTR_UNTRUSTED) { + /* + * Anything is allowed for untrusted arguments, as these are + * read-only and probe read instructions would protect against + * invalid memory access. 
+ */ } else if (arg->arg_type == ARG_PTR_TO_CTX) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); + ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_CTX); if (ret < 0) return ret; /* If function expects ctx type in BTF check that caller @@ -10094,7 +9377,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_call_arg_meta meta; int err; - if (register_is_null(reg) && type_may_be_null(arg->arg_type)) + if (bpf_register_is_null(reg) && type_may_be_null(arg->arg_type)) continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ @@ -10103,8 +9386,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (err) return err; } else { - bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n", - i, arg->arg_type); + verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type); return -EFAULT; } } @@ -10167,26 +9449,26 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) && !is_callback_calling_kfunc(insn->imm)) { - verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, "kfunc %s#%d not marked as callback-calling", + func_id_name(insn->imm), insn->imm); return -EFAULT; } else if (!bpf_pseudo_kfunc_call(insn) && !is_callback_calling_function(insn->imm)) { /* helper */ - verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, "helper %s#%d not marked as callback-calling", + func_id_name(insn->imm), insn->imm); return -EFAULT; } - if (is_async_callback_calling_insn(insn)) { + if (bpf_is_async_callback_calling_insn(insn)) { struct bpf_verifier_state *async_cb; /* there is no real recursion here. 
timer and workqueue callbacks are async */ env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, insn_idx, subprog, - is_bpf_wq_set_callback_impl_kfunc(insn->imm)); - if (!async_cb) - return -EFAULT; + is_async_cb_sleepable(env, insn)); + if (IS_ERR(async_cb)) + return PTR_ERR(async_cb); callee = async_cb->frame[0]; callee->async_entry_cnt = caller->async_entry_cnt + 1; @@ -10202,8 +9484,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins * proceed with next instruction within current frame. */ callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false); - if (!callback_state) - return -ENOMEM; + if (IS_ERR(callback_state)) + return PTR_ERR(callback_state); err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb, callback_state); @@ -10216,6 +9498,9 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins return 0; } +static int process_bpf_exit_full(struct bpf_verifier_env *env, + bool *do_print_state, bool exception_exit); + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -10224,36 +9509,27 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int err, subprog, target_insn; target_insn = *insn_idx + insn->imm + 1; - subprog = find_subprog(env, target_insn); - if (subprog < 0) { - verbose(env, "verifier bug. 
No program starts at insn %d\n", target_insn); + subprog = bpf_find_subprog(env, target_insn); + if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program", + target_insn)) return -EFAULT; - } caller = state->frame[state->curframe]; err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; - if (subprog_is_global(env, subprog)) { + if (bpf_subprog_is_global(env, subprog)) { const char *sub_name = subprog_name(env, subprog); - /* Only global subprogs cannot be called with a lock held. */ if (env->cur_state->active_locks) { verbose(env, "global function calls are not allowed while holding a lock,\n" "use static function instead\n"); return -EINVAL; } - /* Only global subprogs cannot be called with preemption disabled. */ - if (env->cur_state->active_preempt_locks) { - verbose(env, "global function calls are not allowed with preemption disabled,\n" - "use static function instead\n"); - return -EINVAL; - } - - if (env->cur_state->active_irq_id) { - verbose(env, "global function calls are not allowed with IRQs disabled,\n" - "use static function instead\n"); + if (env->subprog_info[subprog].might_sleep && !in_sleepable_context(env)) { + verbose(env, "sleepable global function %s() called in %s\n", + sub_name, non_sleepable_context_description(env)); return -EINVAL; } @@ -10263,17 +9539,31 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } - verbose(env, "Func#%d ('%s') is global and assumed valid.\n", - subprog, sub_name); + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Func#%d ('%s') is global and assumed valid.\n", + subprog, sub_name); if (env->subprog_info[subprog].changes_pkt_data) clear_all_pkt_pointers(env); /* mark global subprog for verifying after main prog */ subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); - /* All global functions return a 64-bit SCALAR_VALUE */ - mark_reg_unknown(env, caller->regs, 
BPF_REG_0); - caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* All non-void global functions return a 64-bit SCALAR_VALUE. */ + if (!subprog_returns_void(env, subprog)) { + mark_reg_unknown(env, caller->regs, BPF_REG_0); + caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + } + + if (env->subprog_info[subprog].might_throw) { + struct bpf_verifier_state *branch; + + branch = push_stack(env, *insn_idx + 1, *insn_idx, false); + if (IS_ERR(branch)) { + verbose(env, "failed to push state for global subprog exception path\n"); + return PTR_ERR(branch); + } + return process_bpf_exit_full(env, NULL, true); + } /* continue with next insn after call */ return 0; @@ -10324,7 +9614,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env, callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3]; /* unused */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); return 0; } @@ -10381,9 +9671,9 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); @@ -10413,10 +9703,10 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, callee->regs[BPF_REG_3].map_ptr = map_ptr; /* unused */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; - callee->callback_ret_range = retval_range(0, 1); + callee->callback_ret_range = 
retval_range(0, 0); return 0; } @@ -10441,8 +9731,8 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4]; /* unused */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); return 0; @@ -10457,14 +9747,14 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, * callback_ctx, u64 flags); * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx); */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_0]); mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); @@ -10485,7 +9775,8 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, */ struct btf_field *field; - field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off, + field = reg_find_field_offset(&caller->regs[BPF_REG_1], + caller->regs[BPF_REG_1].var_off.value, BPF_RB_ROOT); if (!field || !field->graph_root.value_btf_id) return -EFAULT; @@ -10495,14 +9786,44 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root); ref_set_non_owning(env, &callee->regs[BPF_REG_2]); - __mark_reg_not_init(env, 
&callee->regs[BPF_REG_3]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); return 0; } +static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr; + + /* + * callback_fn(struct bpf_map *map, void *key, void *value); + */ + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_1].map_ptr = map_ptr; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = map_ptr; + + /* unused */ + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_async_callback_fn = true; + callee->callback_ret_range = retval_range(S32_MIN, S32_MAX); + return 0; +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id); /* Are we currently verifying the callback for a rbtree helper that must @@ -10528,10 +9849,9 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) return is_rbtree_lock_required_kfunc(kfunc_btf_id); } -static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg, - bool return_32bit) +static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) { - if (return_32bit) + if (range.return_32bit) return range.minval <= reg->s32_min_value && reg->s32_max_value <= 
range.maxval; else return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; @@ -10566,20 +9886,19 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) } /* we are going to rely on register's precise value */ - err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64); - err = err ?: mark_chain_precision(env, BPF_REG_0); + err = mark_chain_precision(env, BPF_REG_0); if (err) return err; /* enforce R0 return value range, and bpf_callback_t returns 64bit */ - if (!retval_range_within(callee->callback_ret_range, r0, false)) { + if (!retval_range_within(callee->callback_ret_range, r0)) { verbose_invalid_scalar(env, r0, callee->callback_ret_range, "At callback return", "R0"); return -EINVAL; } - if (!calls_callback(env, callee->callsite)) { - verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n", - *insn_idx, callee->callsite); + if (!bpf_calls_callback(env, callee->callsite)) { + verifier_bug(env, "in callback at %d, callsite %d !calls_callback", + *insn_idx, callee->callsite); return -EFAULT; } } else { @@ -10671,7 +9990,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; - struct bpf_map *map = meta->map_ptr; + struct bpf_map *map = meta->map.ptr; if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && @@ -10686,8 +10005,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; if (map == NULL) { - verbose(env, "kernel subsystem misconfigured verifier\n"); - return -EINVAL; + verifier_bug(env, "expected map for helper call"); + return -EFAULT; } /* In case of read-only, some additional restrictions @@ -10704,11 +10023,11 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, } if (!aux->map_ptr_state.map_ptr) - bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1, false); - else if 
(aux->map_ptr_state.map_ptr != meta->map_ptr) - bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1, true); + bpf_map_ptr_store(aux, meta->map.ptr, + !meta->map.ptr->bypass_spec_v1, false); + else if (aux->map_ptr_state.map_ptr != meta->map.ptr) + bpf_map_ptr_store(aux, meta->map.ptr, + !meta->map.ptr->bypass_spec_v1, true); return 0; } @@ -10717,19 +10036,19 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; - struct bpf_reg_state *regs = cur_regs(env), *reg; - struct bpf_map *map = meta->map_ptr; + struct bpf_reg_state *reg; + struct bpf_map *map = meta->map.ptr; u64 val, max; int err; if (func_id != BPF_FUNC_tail_call) return 0; if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) { - verbose(env, "kernel subsystem misconfigured verifier\n"); + verbose(env, "expected prog array map for tail call"); return -EINVAL; } - reg = ®s[BPF_REG_3]; + reg = reg_state(env, BPF_REG_3); val = reg->var_off.value; max = map->max_entries; @@ -10752,6 +10071,8 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit) { struct bpf_verifier_state *state = env->cur_state; + enum bpf_prog_type type = resolve_prog_type(env->prog); + struct bpf_reg_state *reg = reg_state(env, BPF_REG_0); bool refs_lingering = false; int i; @@ -10761,6 +10082,12 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; + /* Allow struct_ops programs to return a referenced kptr back to + * kernel. Type checks are performed later in check_return_code. 
+ */ + if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit && + reg->ref_obj_id == state->refs[i].id) + continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); refs_lingering = true; @@ -10788,7 +10115,7 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit return -EINVAL; } - if (check_lock && env->cur_state->active_rcu_lock) { + if (check_lock && env->cur_state->active_rcu_locks) { verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix); return -EINVAL; } @@ -10820,11 +10147,11 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const * and map_direct_value_addr is set. */ - fmt_map_off = fmt_reg->off + fmt_reg->var_off.value; + fmt_map_off = fmt_reg->var_off.value; err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, fmt_map_off); if (err) { - verbose(env, "verifier bug\n"); + verbose(env, "failed to retrieve map value address\n"); return -EFAULT; } fmt = (char *)(long)fmt_addr + fmt_map_off; @@ -10846,7 +10173,7 @@ static int check_get_func_ip(struct bpf_verifier_env *env) if (type == BPF_PROG_TYPE_TRACING) { if (!bpf_prog_has_trampoline(env->prog)) { - verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n", + verbose(env, "func %s#%d supported only for fentry/fexit/fsession/fmod_ret programs\n", func_id_name(func_id), func_id); return -ENOTSUPP; } @@ -10860,16 +10187,15 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env) { return &env->insn_aux_data[env->insn_idx]; } static bool loop_flag_is_zero(struct bpf_verifier_env *env) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[BPF_REG_4]; - bool reg_is_null = register_is_null(reg); + 
struct bpf_reg_state *reg = reg_state(env, BPF_REG_4); + bool reg_is_null = bpf_register_is_null(reg); if (reg_is_null) mark_chain_precision(env, BPF_REG_4); @@ -10910,8 +10236,8 @@ static bool can_elide_value_nullness(enum bpf_map_type type) } } -static int get_helper_proto(struct bpf_verifier_env *env, int func_id, - const struct bpf_func_proto **ptr) +int bpf_get_helper_proto(struct bpf_verifier_env *env, int func_id, + const struct bpf_func_proto **ptr) { if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) return -ERANGE; @@ -10920,7 +10246,30 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return -EINVAL; *ptr = env->ops->get_func_proto(func_id, env->prog); - return *ptr ? 0 : -EINVAL; + return *ptr && (*ptr)->func ? 0 : -EINVAL; +} + +/* Check if we're in a sleepable context. */ +static inline bool in_sleepable_context(struct bpf_verifier_env *env) +{ + return !env->cur_state->active_rcu_locks && + !env->cur_state->active_preempt_locks && + !env->cur_state->active_locks && + !env->cur_state->active_irq_id && + in_sleepable(env); +} + +static const char *non_sleepable_context_description(struct bpf_verifier_env *env) +{ + if (env->cur_state->active_rcu_locks) + return "rcu_read_lock region"; + if (env->cur_state->active_preempt_locks) + return "non-preemptible region"; + if (env->cur_state->active_irq_id) + return "IRQ-disabled region"; + if (env->cur_state->active_locks) + return "lock region"; + return "non-sleepable prog"; } static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -10939,7 +10288,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* find function prototype */ func_id = insn->imm; - err = get_helper_proto(env, insn->imm, &fn); + err = bpf_get_helper_proto(env, insn->imm, &fn); if (err == -ERANGE) { verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; @@ -10962,61 +10311,31 @@ static int check_helper_call(struct 
bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } - if (!in_sleepable(env) && fn->might_sleep) { - verbose(env, "helper call might sleep in a non-sleepable prog\n"); - return -EINVAL; - } - /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(func_id); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { - verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", - func_id_name(func_id), func_id); - return -EINVAL; + verifier_bug(env, "func %s#%d: r1 != ctx", func_id_name(func_id), func_id); + return -EFAULT; } memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn, func_id); + err = check_func_proto(fn); if (err) { - verbose(env, "kernel subsystem misconfigured func %s#%d\n", - func_id_name(func_id), func_id); + verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err; } - if (env->cur_state->active_rcu_lock) { - if (fn->might_sleep) { - verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n", - func_id_name(func_id), func_id); - return -EINVAL; - } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; - } - - if (env->cur_state->active_preempt_locks) { - if (fn->might_sleep) { - verbose(env, "sleepable helper %s#%d in non-preemptible region\n", - func_id_name(func_id), func_id); - return -EINVAL; - } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; + if (fn->might_sleep && !in_sleepable_context(env)) { + verbose(env, "sleepable helper %s#%d in %s\n", func_id_name(func_id), func_id, + non_sleepable_context_description(env)); + return -EINVAL; } - if (env->cur_state->active_irq_id) { - if (fn->might_sleep) { - verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n", - func_id_name(func_id), func_id); - return -EINVAL; - } - - if (in_sleepable(env) && 
is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; - } + /* Track non-sleepable context for helpers. */ + if (!in_sleepable_context(env)) + env->insn_aux_data[insn_idx].non_sleepable = true; meta.func_id = func_id; /* check args */ @@ -11048,15 +10367,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.release_regno) { err = -EINVAL; - /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot - * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr - * is safe to do directly. - */ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { - if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { - verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n"); - return -EFAULT; - } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { u32 ref_obj_id = meta.ref_obj_id; @@ -11080,7 +10391,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); - } else if (register_is_null(®s[meta.release_regno])) { + } else if (bpf_register_is_null(®s[meta.release_regno])) { /* meta.ref_obj_id can only be 0 if register that is meant to be * released is NULL, which must be > R0. */ @@ -11103,7 +10414,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* check that flags argument in get_local_storage(map, flags) is 0, * this is required because get_local_storage() can't return an error. 
*/ - if (!register_is_null(®s[BPF_REG_2])) { + if (!bpf_register_is_null(®s[BPF_REG_2])) { verbose(env, "get_local_storage() doesn't support non-zero flags\n"); return -EINVAL; } @@ -11171,23 +10482,23 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.dynptr_id) { - verbose(env, "verifier internal error: meta.dynptr_id already set\n"); + verifier_bug(env, "meta.dynptr_id already set"); return -EFAULT; } if (meta.ref_obj_id) { - verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); + verifier_bug(env, "meta.ref_obj_id already set"); return -EFAULT; } id = dynptr_id(env, reg); if (id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + verifier_bug(env, "failed to obtain dynptr id"); return id; } ref_obj_id = dynptr_ref_obj_id(env, reg); if (ref_obj_id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); + verifier_bug(env, "failed to obtain dynptr ref_obj_id"); return ref_obj_id; } @@ -11209,7 +10520,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT; - if (dynptr_type == BPF_DYNPTR_TYPE_SKB) + if (dynptr_type == BPF_DYNPTR_TYPE_SKB || + dynptr_type == BPF_DYNPTR_TYPE_SKB_META) /* this will trigger clear_all_pkt_pointers(), which will * invalidate all dynptr slices associated with the skb */ @@ -11245,7 +10557,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, regs, caller_saved[i]); + bpf_mark_reg_not_init(env, ®s[caller_saved[i]]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -11271,23 +10583,22 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ - if (meta.map_ptr == 
NULL) { - verbose(env, - "kernel subsystem misconfigured verifier\n"); - return -EINVAL; + if (meta.map.ptr == NULL) { + verifier_bug(env, "unexpected null map_ptr"); + return -EFAULT; } if (func_id == BPF_FUNC_map_lookup_elem && - can_elide_value_nullness(meta.map_ptr->map_type) && + can_elide_value_nullness(meta.map.ptr->map_type) && meta.const_map_key >= 0 && - meta.const_map_key < meta.map_ptr->max_entries) + meta.const_map_key < meta.map.ptr->max_entries) ret_flag &= ~PTR_MAYBE_NULL; - regs[BPF_REG_0].map_ptr = meta.map_ptr; - regs[BPF_REG_0].map_uid = meta.map_uid; + regs[BPF_REG_0].map_ptr = meta.map.ptr; + regs[BPF_REG_0].map_uid = meta.map.uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_flag) && - btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { + btf_record_has_field(meta.map.ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -11364,10 +10675,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } else { if (fn->ret_btf_id == BPF_PTR_POISON) { - verbose(env, "verifier internal error:"); - verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n", - func_id_name(func_id)); - return -EINVAL; + verifier_bug(env, "func %s has non-overwritten BPF_PTR_POISON return type", + func_id_name(func_id)); + return -EFAULT; } ret_btf = btf_vmlinux; ret_btf_id = *fn->ret_btf_id; @@ -11391,9 +10701,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; - if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) { - verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n", - func_id_name(func_id), func_id); + if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) { + verifier_bug(env, "func %s#%d sets ref_obj_id more than once", + func_id_name(func_id), func_id); return -EFAULT; } @@ -11403,7 
+10713,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; - } else if (is_acquire_function(func_id, meta.map_ptr)) { + } else if (is_acquire_function(func_id, meta.map.ptr)) { int id = acquire_reference(env, insn_idx); if (id < 0) @@ -11418,7 +10728,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (err) return err; - err = check_map_func_compatibility(env, meta.map_ptr, func_id); + err = check_map_func_compatibility(env, meta.map.ptr, func_id); if (err) return err; @@ -11451,6 +10761,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn env->prog->call_get_func_ip = true; } + if (func_id == BPF_FUNC_tail_call) { + if (env->cur_state->curframe) { + struct bpf_verifier_state *branch; + + mark_reg_scratched(env, BPF_REG_0); + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + clear_all_pkt_pointers(env); + mark_reg_unknown(env, regs, BPF_REG_0); + err = prepare_func_exit(env, &env->insn_idx); + if (err) + return err; + env->insn_idx--; + } else { + changes_data = false; + } + } + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -11459,27 +10788,27 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* mark_btf_func_reg_size() is used when the reg size is determined by * the BTF func_proto's return value size and argument. 
*/ -static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, - size_t reg_size) +static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs, + u32 regno, size_t reg_size) { - struct bpf_reg_state *reg = &cur_regs(env)[regno]; + struct bpf_reg_state *reg = ®s[regno]; if (regno == BPF_REG_0) { /* Function return value */ - reg->live |= REG_LIVE_WRITTEN; reg->subreg_def = reg_size == sizeof(u64) ? DEF_NOT_SUBREG : env->insn_idx + 1; - } else { + } else if (reg_size == sizeof(u64)) { /* Function argument */ - if (reg_size == sizeof(u64)) { - mark_insn_zext(env, reg); - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); - } else { - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); - } + mark_insn_zext(env, reg); } } +static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, + size_t reg_size) +{ + return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size); +} + static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_ACQUIRE; @@ -11490,15 +10819,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RELEASE; } -static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta) -{ - return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta); -} - -static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_SLEEPABLE; -} static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) { @@ -11541,11 +10861,6 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf, return btf_param_match_suffix(btf, arg, "__szk"); } -static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg) -{ - return btf_param_match_suffix(btf, arg, "__opt"); -} - static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__k"); @@ -11617,6 +10932,10 @@ 
enum { KF_ARG_RB_ROOT_ID, KF_ARG_RB_NODE_ID, KF_ARG_WORKQUEUE_ID, + KF_ARG_RES_SPIN_LOCK_ID, + KF_ARG_TASK_WORK_ID, + KF_ARG_PROG_AUX_ID, + KF_ARG_TIMER_ID }; BTF_ID_LIST(kf_arg_btf_ids) @@ -11626,6 +10945,10 @@ BTF_ID(struct, bpf_list_node) BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) +BTF_ID(struct, bpf_res_spin_lock) +BTF_ID(struct, bpf_task_work) +BTF_ID(struct, bpf_prog_aux) +BTF_ID(struct, bpf_timer) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -11669,11 +10992,36 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); } +static bool is_kfunc_arg_timer(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TIMER_ID); +} + static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } +static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID); +} + +static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); +} + +static bool is_rbtree_node_type(const struct btf_type *t) +{ + return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]); +} + +static bool is_list_node_type(const struct btf_type *t) +{ + return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_LIST_NODE_ID]); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -11686,6 +11034,33 @@ static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf return true; } +static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, 
arg, KF_ARG_PROG_AUX_ID); +} + +/* + * A kfunc with KF_IMPLICIT_ARGS has two prototypes in BTF: + * - the _impl prototype with full arg list (meta->func_proto) + * - the BPF API prototype w/o implicit args (func->type in BTF) + * To determine whether an argument is implicit, we compare its position + * against the number of arguments in the prototype w/o implicit args. + */ +static bool is_kfunc_arg_implicit(const struct bpf_kfunc_call_arg_meta *meta, u32 arg_idx) +{ + const struct btf_type *func, *func_proto; + u32 argn; + + if (!(meta->kfunc_flags & KF_IMPLICIT_ARGS)) + return false; + + func = btf_type_by_id(meta->btf, meta->func_id); + func_proto = btf_type_by_id(meta->btf, func->type); + argn = btf_type_vlen(func_proto); + + return argn <= arg_idx; +} + /* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, const struct btf *btf, @@ -11743,34 +11118,52 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, KF_ARG_PTR_TO_MAP, + KF_ARG_PTR_TO_TIMER, KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, + KF_ARG_PTR_TO_RES_SPIN_LOCK, + KF_ARG_PTR_TO_TASK_WORK, }; enum special_kfunc_type { KF_bpf_obj_new_impl, + KF_bpf_obj_new, KF_bpf_obj_drop_impl, + KF_bpf_obj_drop, KF_bpf_refcount_acquire_impl, + KF_bpf_refcount_acquire, KF_bpf_list_push_front_impl, + KF_bpf_list_push_front, KF_bpf_list_push_back_impl, + KF_bpf_list_push_back, KF_bpf_list_pop_front, KF_bpf_list_pop_back, + KF_bpf_list_front, + KF_bpf_list_back, KF_bpf_cast_to_kern_ctx, KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, KF_bpf_rcu_read_unlock, KF_bpf_rbtree_remove, KF_bpf_rbtree_add_impl, + KF_bpf_rbtree_add, KF_bpf_rbtree_first, + KF_bpf_rbtree_root, + KF_bpf_rbtree_left, + KF_bpf_rbtree_right, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, + KF_bpf_dynptr_from_skb_meta, + KF_bpf_xdp_pull_data, KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, KF_bpf_percpu_obj_new_impl, + 
KF_bpf_percpu_obj_new, KF_bpf_percpu_obj_drop_impl, + KF_bpf_percpu_obj_drop, KF_bpf_throw, - KF_bpf_wq_set_callback_impl, + KF_bpf_wq_set_callback, KF_bpf_preempt_disable, KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, @@ -11781,66 +11174,71 @@ enum special_kfunc_type { KF_bpf_iter_num_new, KF_bpf_iter_num_next, KF_bpf_iter_num_destroy, + KF_bpf_set_dentry_xattr, + KF_bpf_remove_dentry_xattr, + KF_bpf_res_spin_lock, + KF_bpf_res_spin_unlock, + KF_bpf_res_spin_lock_irqsave, + KF_bpf_res_spin_unlock_irqrestore, + KF_bpf_dynptr_from_file, + KF_bpf_dynptr_file_discard, + KF___bpf_trap, + KF_bpf_task_work_schedule_signal, + KF_bpf_task_work_schedule_resume, + KF_bpf_arena_alloc_pages, + KF_bpf_arena_free_pages, + KF_bpf_arena_reserve_pages, + KF_bpf_session_is_return, + KF_bpf_stream_vprintk, + KF_bpf_stream_print_stack, }; -BTF_SET_START(special_kfunc_set) -BTF_ID(func, bpf_obj_new_impl) -BTF_ID(func, bpf_obj_drop_impl) -BTF_ID(func, bpf_refcount_acquire_impl) -BTF_ID(func, bpf_list_push_front_impl) -BTF_ID(func, bpf_list_push_back_impl) -BTF_ID(func, bpf_list_pop_front) -BTF_ID(func, bpf_list_pop_back) -BTF_ID(func, bpf_cast_to_kern_ctx) -BTF_ID(func, bpf_rdonly_cast) -BTF_ID(func, bpf_rbtree_remove) -BTF_ID(func, bpf_rbtree_add_impl) -BTF_ID(func, bpf_rbtree_first) -#ifdef CONFIG_NET -BTF_ID(func, bpf_dynptr_from_skb) -BTF_ID(func, bpf_dynptr_from_xdp) -#endif -BTF_ID(func, bpf_dynptr_slice) -BTF_ID(func, bpf_dynptr_slice_rdwr) -BTF_ID(func, bpf_dynptr_clone) -BTF_ID(func, bpf_percpu_obj_new_impl) -BTF_ID(func, bpf_percpu_obj_drop_impl) -BTF_ID(func, bpf_throw) -BTF_ID(func, bpf_wq_set_callback_impl) -#ifdef CONFIG_CGROUPS -BTF_ID(func, bpf_iter_css_task_new) -#endif -BTF_SET_END(special_kfunc_set) - BTF_ID_LIST(special_kfunc_list) BTF_ID(func, bpf_obj_new_impl) +BTF_ID(func, bpf_obj_new) BTF_ID(func, bpf_obj_drop_impl) +BTF_ID(func, bpf_obj_drop) BTF_ID(func, bpf_refcount_acquire_impl) +BTF_ID(func, bpf_refcount_acquire) BTF_ID(func, bpf_list_push_front_impl) 
+BTF_ID(func, bpf_list_push_front) BTF_ID(func, bpf_list_push_back_impl) +BTF_ID(func, bpf_list_push_back) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_list_front) +BTF_ID(func, bpf_list_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) BTF_ID(func, bpf_rcu_read_unlock) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add_impl) +BTF_ID(func, bpf_rbtree_add) BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_rbtree_root) +BTF_ID(func, bpf_rbtree_left) +BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_from_skb_meta) +BTF_ID(func, bpf_xdp_pull_data) #else BTF_ID_UNUSED BTF_ID_UNUSED +BTF_ID_UNUSED +BTF_ID_UNUSED #endif BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) +BTF_ID(func, bpf_percpu_obj_new) BTF_ID(func, bpf_percpu_obj_drop_impl) +BTF_ID(func, bpf_percpu_obj_drop) BTF_ID(func, bpf_throw) -BTF_ID(func, bpf_wq_set_callback_impl) +BTF_ID(func, bpf_wq_set_callback) BTF_ID(func, bpf_preempt_disable) BTF_ID(func, bpf_preempt_enable) #ifdef CONFIG_CGROUPS @@ -11859,13 +11257,87 @@ BTF_ID(func, bpf_local_irq_restore) BTF_ID(func, bpf_iter_num_new) BTF_ID(func, bpf_iter_num_next) BTF_ID(func, bpf_iter_num_destroy) +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_set_dentry_xattr) +BTF_ID(func, bpf_remove_dentry_xattr) +#else +BTF_ID_UNUSED +BTF_ID_UNUSED +#endif +BTF_ID(func, bpf_res_spin_lock) +BTF_ID(func, bpf_res_spin_unlock) +BTF_ID(func, bpf_res_spin_lock_irqsave) +BTF_ID(func, bpf_res_spin_unlock_irqrestore) +BTF_ID(func, bpf_dynptr_from_file) +BTF_ID(func, bpf_dynptr_file_discard) +BTF_ID(func, __bpf_trap) +BTF_ID(func, bpf_task_work_schedule_signal) +BTF_ID(func, bpf_task_work_schedule_resume) +BTF_ID(func, bpf_arena_alloc_pages) +BTF_ID(func, bpf_arena_free_pages) +BTF_ID(func, 
bpf_arena_reserve_pages) +#ifdef CONFIG_BPF_EVENTS +BTF_ID(func, bpf_session_is_return) +#else +BTF_ID_UNUSED +#endif +BTF_ID(func, bpf_stream_vprintk) +BTF_ID(func, bpf_stream_print_stack) + +static bool is_bpf_obj_new_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_obj_new] || + func_id == special_kfunc_list[KF_bpf_obj_new_impl]; +} + +static bool is_bpf_percpu_obj_new_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_percpu_obj_new] || + func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]; +} + +static bool is_bpf_obj_drop_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_obj_drop] || + func_id == special_kfunc_list[KF_bpf_obj_drop_impl]; +} + +static bool is_bpf_percpu_obj_drop_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_percpu_obj_drop] || + func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]; +} + +static bool is_bpf_refcount_acquire_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_refcount_acquire] || + func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; +} + +static bool is_bpf_list_push_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_list_push_front] || + func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || + func_id == special_kfunc_list[KF_bpf_list_push_back] || + func_id == special_kfunc_list[KF_bpf_list_push_back_impl]; +} + +static bool is_bpf_rbtree_add_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_rbtree_add] || + func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; +} + +static bool is_task_work_add_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; +} static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { - if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && - meta->arg_owning_ref) { + if 
(is_bpf_refcount_acquire_kfunc(meta->func_id) && meta->arg_owning_ref) return false; - } return meta->kfunc_flags & KF_RET_NULL; } @@ -11890,6 +11362,11 @@ static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta) return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable]; } +bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_xdp_pull_data]; +} + static enum kfunc_ptr_arg_type get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, @@ -11902,9 +11379,16 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg = ®s[regno]; bool arg_mem_size = false; - if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) + if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || + meta->func_id == special_kfunc_list[KF_bpf_session_is_return] || + meta->func_id == special_kfunc_list[KF_bpf_session_cookie]) return KF_ARG_PTR_TO_CTX; + if (argno + 1 < nargs && + (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || + is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) + arg_mem_size = true; + /* In this function, we verify the kfunc's BTF as per the argument type, * leaving the rest of the verification with respect to the register * type to our caller. 
When a set of conditions hold in the BTF type of @@ -11913,7 +11397,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) return KF_ARG_PTR_TO_CTX; - if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) + if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && bpf_register_is_null(reg) && + !arg_mem_size) return KF_ARG_PTR_TO_NULL; if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) @@ -11949,9 +11434,18 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_timer(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TIMER; + + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TASK_WORK; + if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; + if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_RES_SPIN_LOCK; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -11964,11 +11458,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK; - if (argno + 1 < nargs && - (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || - is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) - arg_mem_size = true; - /* This is the catch all argument type of register types supported by * check_helper_mem_access. However, we only allow when argument type is * pointer to scalar, or struct composed (recursively) of scalars. 
When @@ -12008,7 +11497,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, /* Enforce strict type matching for calls to kfuncs that are acquiring * or releasing a reference, or are no-cast aliases. We do _not_ - * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default, + * enforce strict matching for kfuncs by default, * as we want to enable BPF programs to pass types that are bitwise * equivalent without forcing them to explicitly cast with something * like bpf_cast_to_kern_ctx(). @@ -12034,13 +11523,12 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; - WARN_ON_ONCE(is_kfunc_release(meta) && - (reg->off || !tnum_is_const(reg->var_off) || - reg->var_off.value)); + WARN_ON_ONCE(is_kfunc_release(meta) && !tnum_is_const(reg->var_off)); reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id); reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); - struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match); + struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->var_off.value, + meta->btf, ref_id, strict_type_match); /* If kfunc is accepting a projection type (ie. __sk_buff), it cannot * actually use it -- it must cast to the underlying type. So we allow * caller to pass in the underlying type. 
@@ -12058,16 +11546,22 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, static int process_irq_flag(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); + int err, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; - int err; - if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) { + if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) { irq_save = true; - } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) { + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) + kfunc_class = IRQ_LOCK_KFUNC; + } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) { irq_save = false; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) + kfunc_class = IRQ_LOCK_KFUNC; } else { - verbose(env, "verifier internal error: unknown irq flags kfunc\n"); + verifier_bug(env, "unknown irq flags kfunc"); return -EFAULT; } @@ -12081,7 +11575,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (err) return err; - err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx); + err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class); if (err) return err; } else { @@ -12095,7 +11589,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (err) return err; - err = unmark_stack_slot_irq_flag(env, reg); + err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); if (err) return err; } @@ -12108,12 +11602,12 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state struct btf_record *rec = reg_btf_record(reg); if (!env->cur_state->active_locks) { - verbose(env, "verifier internal error: 
ref_set_non_owning w/o active lock\n"); + verifier_bug(env, "%s w/o active lock", __func__); return -EFAULT; } if (type_flag(reg->type) & NON_OWN_REF) { - verbose(env, "verifier internal error: NON_OWN_REF already set\n"); + verifier_bug(env, "NON_OWN_REF already set"); return -EFAULT; } @@ -12132,8 +11626,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o int i; if (!ref_obj_id) { - verbose(env, "verifier internal error: ref_obj_id is zero for " - "owning -> non-owning conversion\n"); + verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion"); return -EFAULT; } @@ -12153,7 +11646,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o return 0; } - verbose(env, "verifier internal error: ref state missing for ref_obj_id\n"); + verifier_bug(env, "ref state missing for ref_obj_id"); return -EFAULT; } @@ -12215,14 +11708,14 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ ptr = reg->btf; break; default: - verbose(env, "verifier internal error: unknown reg type for lock check\n"); + verifier_bug(env, "unknown reg type for lock check"); return -EFAULT; } id = reg->id; if (!env->cur_state->active_locks) return -EINVAL; - s = find_lock_state(env->cur_state, REF_TYPE_LOCK, id, ptr); + s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr); if (!s) { verbose(env, "held lock and object are not in the same allocation\n"); return -EINVAL; @@ -12232,17 +11725,21 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ static bool is_bpf_list_api_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || - btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] || + return is_bpf_list_push_kfunc(btf_id) || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || - btf_id == special_kfunc_list[KF_bpf_list_pop_back]; + btf_id == special_kfunc_list[KF_bpf_list_pop_back] || + btf_id == 
special_kfunc_list[KF_bpf_list_front] || + btf_id == special_kfunc_list[KF_bpf_list_back]; } static bool is_bpf_rbtree_api_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || + return is_bpf_rbtree_add_kfunc(btf_id) || btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - btf_id == special_kfunc_list[KF_bpf_rbtree_first]; + btf_id == special_kfunc_list[KF_bpf_rbtree_first] || + btf_id == special_kfunc_list[KF_bpf_rbtree_root] || + btf_id == special_kfunc_list[KF_bpf_rbtree_left] || + btf_id == special_kfunc_list[KF_bpf_rbtree_right]; } static bool is_bpf_iter_num_api_kfunc(u32 btf_id) @@ -12254,34 +11751,59 @@ static bool is_bpf_iter_num_api_kfunc(u32 btf_id) static bool is_bpf_graph_api_kfunc(u32 btf_id) { - return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) || - btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; + return is_bpf_list_api_kfunc(btf_id) || + is_bpf_rbtree_api_kfunc(btf_id) || + is_bpf_refcount_acquire_kfunc(btf_id); +} + +static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] || + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] || + btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]; +} + +static bool is_bpf_arena_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_arena_alloc_pages] || + btf_id == special_kfunc_list[KF_bpf_arena_free_pages] || + btf_id == special_kfunc_list[KF_bpf_arena_reserve_pages]; +} + +static bool is_bpf_stream_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_stream_vprintk] || + btf_id == special_kfunc_list[KF_bpf_stream_print_stack]; } static bool kfunc_spin_allowed(u32 btf_id) { - return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id); + return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || + is_bpf_res_spin_lock_kfunc(btf_id) || 
is_bpf_arena_kfunc(btf_id) || + is_bpf_stream_kfunc(btf_id); } static bool is_sync_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; + return is_bpf_rbtree_add_kfunc(btf_id); } static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return is_bpf_wq_set_callback_kfunc(btf_id) || + is_task_work_add_kfunc(btf_id); } -static bool is_bpf_throw_kfunc(struct bpf_insn *insn) +bool bpf_is_throw_kfunc(struct bpf_insn *insn) { return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && insn->imm == special_kfunc_list[KF_bpf_throw]; } -static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id) +static bool is_bpf_wq_set_callback_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback]; } static bool is_callback_calling_kfunc(u32 btf_id) @@ -12328,12 +11850,13 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, switch (node_field_type) { case BPF_LIST_NODE: - ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || - kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]); + ret = is_bpf_list_push_kfunc(kfunc_btf_id); break; case BPF_RB_NODE: - ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]); + ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]); break; default: verbose(env, "verifier internal error: unexpected graph node argument type %s\n", @@ -12360,7 +11883,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, u32 head_off; if (meta->btf != btf_vmlinux) { - verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n"); + 
verifier_bug(env, "unexpected btf mismatch in kfunc call"); return -EFAULT; } @@ -12376,7 +11899,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, } rec = reg_btf_record(reg); - head_off = reg->off + reg->var_off.value; + head_off = reg->var_off.value; field = btf_record_find(rec, head_off, head_field_type); if (!field) { verbose(env, "%s not found at offset=%u\n", head_type_name, head_off); @@ -12391,7 +11914,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, } if (*head_field) { - verbose(env, "verifier internal error: repeating %s arg\n", head_type_name); + verifier_bug(env, "repeating %s arg", head_type_name); return -EFAULT; } *head_field = field; @@ -12428,7 +11951,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, u32 node_off; if (meta->btf != btf_vmlinux) { - verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n"); + verifier_bug(env, "unexpected btf mismatch in kfunc call"); return -EFAULT; } @@ -12443,7 +11966,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, return -EINVAL; } - node_off = reg->off + reg->var_off.value; + node_off = reg->var_off.value; field = reg_find_field_offset(reg, node_off, node_field_type); if (!field) { verbose(env, "%s not found at offset=%u\n", node_type_name, node_off); @@ -12548,11 +12071,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ bool is_ret_buf_sz = false; int kf_arg_type; - t = btf_type_skip_modifiers(btf, args[i].type, NULL); + if (is_kfunc_arg_prog_aux(btf, &args[i])) { + /* Reject repeated use bpf_prog_aux */ + if (meta->arg_prog) { + verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); + return -EFAULT; + } + meta->arg_prog = true; + cur_aux(env)->arg_prog = regno; + continue; + } - if (is_kfunc_arg_ignore(btf, &args[i])) + if (is_kfunc_arg_ignore(btf, &args[i]) || is_kfunc_arg_implicit(meta, i)) continue; + t = btf_type_skip_modifiers(btf, args[i].type, 
NULL); + if (btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) { verbose(env, "R%d is not a scalar\n", regno); @@ -12561,7 +12095,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_constant(meta->btf, &args[i])) { if (meta->arg_constant.found) { - verbose(env, "verifier internal error: only one constant argument permitted\n"); + verifier_bug(env, "only one constant argument permitted"); return -EFAULT; } if (!tnum_is_const(reg->var_off)) { @@ -12604,18 +12138,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } - if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && - (register_is_null(reg) || type_may_be_null(reg->type)) && - !is_kfunc_arg_nullable(meta->btf, &args[i])) { + if ((bpf_register_is_null(reg) || type_may_be_null(reg->type)) && + !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; } if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, - meta->ref_obj_id); + verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u", + regno, reg->ref_obj_id, + meta->ref_obj_id); return -EFAULT; } meta->ref_obj_id = reg->ref_obj_id; @@ -12638,7 +12171,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL; } - if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || + reg->map_ptr->record->task_work_off >= 0)) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) @@ -12653,6 +12187,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ */ 
if (meta->map.ptr != reg->map_ptr || meta->map.uid != reg->map_uid) { + if (reg->map_ptr->record->task_work_off >= 0) { + verbose(env, + "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } verbose(env, "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", meta->map.uid, reg->map_uid); @@ -12664,9 +12204,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: - if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) - break; - if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "R%d must be referenced or trusted\n", regno); @@ -12678,7 +12215,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } fallthrough; - case KF_ARG_PTR_TO_CTX: case KF_ARG_PTR_TO_DYNPTR: case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: @@ -12691,10 +12227,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_TIMER: + case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: + case KF_ARG_PTR_TO_RES_SPIN_LOCK: + break; + case KF_ARG_PTR_TO_CTX: + arg_type = ARG_PTR_TO_CTX; break; default: - WARN_ON_ONCE(1); + verifier_bug(env, "unknown kfunc arg type %d", kf_arg_type); return -EFAULT; } @@ -12721,13 +12263,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_ALLOC_BTF_ID: if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { - if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) { - verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i); + if (!is_bpf_obj_drop_kfunc(meta->func_id)) { + verbose(env, "arg#%d expected for bpf_obj_drop()\n", i); return -EINVAL; } } else if (reg->type == (PTR_TO_BTF_ID | 
MEM_ALLOC | MEM_PERCPU)) { - if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { - verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i); + if (!is_bpf_percpu_obj_drop_kfunc(meta->func_id)) { + verbose(env, "arg#%d expected for bpf_percpu_obj_drop()\n", i); return -EINVAL; } } else { @@ -12758,19 +12300,26 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_SKB; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) { dynptr_arg_type |= DYNPTR_TYPE_XDP; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) { + dynptr_arg_type |= DYNPTR_TYPE_SKB_META; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + dynptr_arg_type |= DYNPTR_TYPE_FILE; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { + dynptr_arg_type |= DYNPTR_TYPE_FILE; + meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; if (parent_type == BPF_DYNPTR_TYPE_INVALID) { - verbose(env, "verifier internal error: no dynptr type for parent of clone\n"); + verifier_bug(env, "no dynptr type for parent of clone"); return -EFAULT; } dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id; if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { - verbose(env, "verifier internal error: missing ref obj id for parent of clone\n"); + verifier_bug(env, "missing ref obj id for parent of clone"); return -EFAULT; } } @@ -12783,7 +12332,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int id = dynptr_id(env, reg); if (id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + verifier_bug(env, "failed to obtain dynptr id"); return id; } 
meta->initialized_dynptr.id = id; @@ -12846,22 +12395,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_RB_NODE: - if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) { - if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) { - verbose(env, "rbtree_remove node input must be non-owning ref\n"); + if (is_bpf_rbtree_add_kfunc(meta->func_id)) { + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; } - if (in_rbtree_lock_required_cb(env)) { - verbose(env, "rbtree_remove not allowed in rbtree cb\n"); + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); return -EINVAL; } } else { - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name); return -EINVAL; } - if (!reg->ref_obj_id) { - verbose(env, "allocated object must be referenced\n"); + if (in_rbtree_lock_required_cb(env)) { + verbose(env, "%s not allowed in rbtree cb\n", func_name); return -EINVAL; } } @@ -12909,7 +12458,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ struct bpf_reg_state *size_reg = &regs[regno + 1]; const struct btf_param *size_arg = &args[i + 1]; - if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) { + if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); if (ret < 0) { verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); @@ -12919,7 +12468,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) { if 
(meta->arg_constant.found) { - verbose(env, "verifier internal error: only one constant argument permitted\n"); + verifier_bug(env, "only one constant argument permitted"); return -EFAULT; } if (!tnum_is_const(size_reg->var_off)) { @@ -12951,7 +12500,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ rec = reg_btf_record(reg); if (!rec) { - verbose(env, "verifier internal error: Couldn't find btf_record\n"); + verifier_bug(env, "Couldn't find btf_record"); return -EFAULT; } @@ -12977,7 +12526,25 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = process_wq_func(env, regno, meta); + ret = check_map_field_pointer(env, regno, BPF_WORKQUEUE, &meta->map); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_TIMER: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_timer_kfunc(env, regno, meta); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_TASK_WORK: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map); if (ret < 0) return ret; break; @@ -12990,6 +12557,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_RES_SPIN_LOCK: + { + int flags = PROCESS_RES_LOCK; + + if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); + return -EINVAL; + } + + if (!is_bpf_res_spin_lock_kfunc(meta->func_id)) + return -EFAULT; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) + flags |= PROCESS_SPIN_LOCK; + if (meta->func_id == 
special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) + flags |= PROCESS_LOCK_IRQ; + ret = process_spin_lock(env, regno, flags); + if (ret < 0) + return ret; + break; + } } } @@ -13002,48 +12591,399 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return 0; } -static int fetch_kfunc_meta(struct bpf_verifier_env *env, - struct bpf_insn *insn, - struct bpf_kfunc_call_arg_meta *meta, - const char **kfunc_name) +int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env, + s32 func_id, + s16 offset, + struct bpf_kfunc_call_arg_meta *meta) { - const struct btf_type *func, *func_proto; - u32 func_id, *kfunc_flags; - const char *func_name; - struct btf *desc_btf; - - if (kfunc_name) - *kfunc_name = NULL; + struct bpf_kfunc_meta kfunc; + int err; - if (!insn->imm) - return -EINVAL; + err = fetch_kfunc_meta(env, func_id, offset, &kfunc); + if (err) + return err; - desc_btf = find_kfunc_desc_btf(env, insn->off); - if (IS_ERR(desc_btf)) - return PTR_ERR(desc_btf); + memset(meta, 0, sizeof(*meta)); + meta->btf = kfunc.btf; + meta->func_id = kfunc.id; + meta->func_proto = kfunc.proto; + meta->func_name = kfunc.name; - func_id = insn->imm; - func = btf_type_by_id(desc_btf, func_id); - func_name = btf_name_by_offset(desc_btf, func->name_off); - if (kfunc_name) - *kfunc_name = func_name; - func_proto = btf_type_by_id(desc_btf, func->type); - - kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog); - if (!kfunc_flags) { + if (!kfunc.flags || !btf_kfunc_is_allowed(kfunc.btf, kfunc.id, env->prog)) return -EACCES; - } - memset(meta, 0, sizeof(*meta)); - meta->btf = desc_btf; - meta->func_id = func_id; - meta->kfunc_flags = *kfunc_flags; - meta->func_proto = func_proto; - meta->func_name = func_name; + meta->kfunc_flags = *kfunc.flags; return 0; } +/* + * Determine how many bytes a helper accesses through a stack pointer at + * argument position @arg (0-based, 
corresponding to R1-R5). + * + * Returns: + * > 0 known read access size in bytes + * 0 doesn't read anything directly + * S64_MIN unknown + * < 0 known write access of (-return) bytes + */ +s64 bpf_helper_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn, + int arg, int insn_idx) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + const struct bpf_func_proto *fn; + enum bpf_arg_type at; + s64 size; + + if (bpf_get_helper_proto(env, insn->imm, &fn) < 0) + return S64_MIN; + + at = fn->arg_type[arg]; + + switch (base_type(at)) { + case ARG_PTR_TO_MAP_KEY: + case ARG_PTR_TO_MAP_VALUE: { + bool is_key = base_type(at) == ARG_PTR_TO_MAP_KEY; + u64 val; + int i, map_reg; + + for (i = 0; i < arg; i++) { + if (base_type(fn->arg_type[i]) == ARG_CONST_MAP_PTR) + break; + } + if (i >= arg) + goto scan_all_maps; + + map_reg = BPF_REG_1 + i; + + if (!(aux->const_reg_map_mask & BIT(map_reg))) + goto scan_all_maps; + + i = aux->const_reg_vals[map_reg]; + if (i < env->used_map_cnt) { + size = is_key ? env->used_maps[i]->key_size + : env->used_maps[i]->value_size; + goto out; + } +scan_all_maps: + /* + * Map pointer is not known at this call site (e.g. different + * maps on merged paths). Conservatively return the largest + * key_size or value_size across all maps used by the program. + */ + val = 0; + for (i = 0; i < env->used_map_cnt; i++) { + struct bpf_map *map = env->used_maps[i]; + u32 sz = is_key ? map->key_size : map->value_size; + + if (sz > val) + val = sz; + if (map->inner_map_meta) { + sz = is_key ? 
map->inner_map_meta->key_size + : map->inner_map_meta->value_size; + if (sz > val) + val = sz; + } + } + if (!val) + return S64_MIN; + size = val; + goto out; + } + case ARG_PTR_TO_MEM: + if (at & MEM_FIXED_SIZE) { + size = fn->arg_size[arg]; + goto out; + } + if (arg + 1 < ARRAY_SIZE(fn->arg_type) && + arg_type_is_mem_size(fn->arg_type[arg + 1])) { + int size_reg = BPF_REG_1 + arg + 1; + + if (aux->const_reg_mask & BIT(size_reg)) { + size = (s64)aux->const_reg_vals[size_reg]; + goto out; + } + /* + * Size arg is const on each path but differs across merged + * paths. MAX_BPF_STACK is a safe upper bound for reads. + */ + if (at & MEM_UNINIT) + return 0; + return MAX_BPF_STACK; + } + return S64_MIN; + case ARG_PTR_TO_DYNPTR: + size = BPF_DYNPTR_SIZE; + break; + case ARG_PTR_TO_STACK: + /* + * Only used by bpf_calls_callback() helpers. The helper itself + * doesn't access stack. The callback subprog does and it's + * analyzed separately. + */ + return 0; + default: + return S64_MIN; + } +out: + /* + * MEM_UNINIT args are write-only: the helper initializes the + * buffer without reading it. + */ + if (at & MEM_UNINIT) + return -size; + return size; +} + +/* + * Determine how many bytes a kfunc accesses through a stack pointer at + * argument position @arg (0-based, corresponding to R1-R5). 
+ * + * Returns: + * > 0 known read access size in bytes + * 0 doesn't access memory through that argument (ex: not a pointer) + * S64_MIN unknown + * < 0 known write access of (-return) bytes + */ +s64 bpf_kfunc_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn, + int arg, int insn_idx) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + struct bpf_kfunc_call_arg_meta meta; + const struct btf_param *args; + const struct btf_type *t, *ref_t; + const struct btf *btf; + u32 nargs, type_size; + s64 size; + + if (bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta) < 0) + return S64_MIN; + + btf = meta.btf; + args = btf_params(meta.func_proto); + nargs = btf_type_vlen(meta.func_proto); + if (arg >= nargs) + return 0; + + t = btf_type_skip_modifiers(btf, args[arg].type, NULL); + if (!btf_type_is_ptr(t)) + return 0; + + /* dynptr: fixed 16-byte on-stack representation */ + if (is_kfunc_arg_dynptr(btf, &args[arg])) { + size = BPF_DYNPTR_SIZE; + goto out; + } + + /* ptr + __sz/__szk pair: size is in the next register */ + if (arg + 1 < nargs && + (btf_param_match_suffix(btf, &args[arg + 1], "__sz") || + btf_param_match_suffix(btf, &args[arg + 1], "__szk"))) { + int size_reg = BPF_REG_1 + arg + 1; + + if (aux->const_reg_mask & BIT(size_reg)) { + size = (s64)aux->const_reg_vals[size_reg]; + goto out; + } + return MAX_BPF_STACK; + } + + /* fixed-size pointed-to type: resolve via BTF */ + ref_t = btf_type_skip_modifiers(btf, t->type, NULL); + if (!IS_ERR(btf_resolve_size(btf, ref_t, &type_size))) { + size = type_size; + goto out; + } + + return S64_MIN; +out: + /* KF_ITER_NEW kfuncs initialize the iterator state at arg 0 */ + if (arg == 0 && meta.kfunc_flags & KF_ITER_NEW) + return -size; + if (is_kfunc_arg_uninit(btf, &args[arg])) + return -size; + return size; +} + +/* check special kfuncs and return: + * 1 - not fall-through to 'else' branch, continue verification + * 0 - fall-through to 'else' branch + * < 0 - not fall-through 
to 'else' branch, return error + */ +static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, + struct bpf_reg_state *regs, struct bpf_insn_aux_data *insn_aux, + const struct btf_type *ptr_type, struct btf *desc_btf) +{ + const struct btf_type *ret_t; + int err = 0; + + if (meta->btf != btf_vmlinux) + return 0; + + if (is_bpf_obj_new_kfunc(meta->func_id) || is_bpf_percpu_obj_new_kfunc(meta->func_id)) { + struct btf_struct_meta *struct_meta; + struct btf *ret_btf; + u32 ret_btf_id; + + if (is_bpf_obj_new_kfunc(meta->func_id) && !bpf_global_ma_set) + return -ENOMEM; + + if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) { + verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); + return -EINVAL; + } + + ret_btf = env->prog->aux->btf; + ret_btf_id = meta->arg_constant.value; + + /* This may be NULL due to user not supplying a BTF */ + if (!ret_btf) { + verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); + return -EINVAL; + } + + ret_t = btf_type_by_id(ret_btf, ret_btf_id); + if (!ret_t || !__btf_type_is_struct(ret_t)) { + verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); + return -EINVAL; + } + + if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) { + if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { + verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", + ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); + return -EINVAL; + } + + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + /* Charge memory allocated with bpf_global_percpu_ma to + * root memcg. The obj_cgroup for root memcg is NULL. 
+ */ + err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + mutex_lock(&bpf_percpu_ma_lock); + err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); + if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) { + if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { + verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); + return -EINVAL; + } + + if (struct_meta) { + verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); + return -EINVAL; + } + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = ret_btf; + regs[BPF_REG_0].btf_id = ret_btf_id; + if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) + regs[BPF_REG_0].type |= MEM_PERCPU; + + insn_aux->obj_new_size = ret_t->size; + insn_aux->kptr_struct_meta = struct_meta; + } else if (is_bpf_refcount_acquire_kfunc(meta->func_id)) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = meta->arg_btf; + regs[BPF_REG_0].btf_id = meta->arg_btf_id; + + insn_aux->kptr_struct_meta = + btf_find_struct_meta(meta->arg_btf, + meta->arg_btf_id); + } else if (is_list_node_type(ptr_type)) { + struct btf_field *field = meta->arg_list_head.field; + + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); + } else if (is_rbtree_node_type(ptr_type)) { + struct btf_field *field = meta->arg_rbtree_root.field; + + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); + } else if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; + regs[BPF_REG_0].btf = 
desc_btf; + regs[BPF_REG_0].btf_id = meta->ret_btf_id; + } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { + ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value); + if (!ret_t) { + verbose(env, "Unknown type ID %lld passed to kfunc bpf_rdonly_cast\n", + meta->arg_constant.value); + return -EINVAL; + } else if (btf_type_is_struct(ret_t)) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta->arg_constant.value; + } else if (btf_type_is_void(ret_t)) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED; + regs[BPF_REG_0].mem_size = 0; + } else { + verbose(env, + "kfunc bpf_rdonly_cast type ID argument must be of a struct or void\n"); + return -EINVAL; + } + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] || + meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { + enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type); + + mark_reg_known_zero(env, regs, BPF_REG_0); + + if (!meta->arg_constant.found) { + verifier_bug(env, "bpf_dynptr_slice(_rdwr) no constant size"); + return -EFAULT; + } + + regs[BPF_REG_0].mem_size = meta->arg_constant.value; + + /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ + regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; + + if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { + regs[BPF_REG_0].type |= MEM_RDONLY; + } else { + /* this will set env->seen_direct_write to true */ + if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { + verbose(env, "the prog does not allow writes to packet data\n"); + return -EINVAL; + } + } + + if (!meta->initialized_dynptr.id) { + verifier_bug(env, "no dynptr id"); + return -EFAULT; + } + regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id; + + /* we don't need to set BPF_REG_0's ref obj id + * because packet slices are 
not refcounted (see + * dynptr_type_refcounted) + */ + } else { + return 0; + } + + return 1; +} + static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -13058,40 +12998,74 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; const struct btf_param *args; - const struct btf_type *ret_t; struct btf *desc_btf; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) return 0; - err = fetch_kfunc_meta(env, insn, &meta, &func_name); - if (err == -EACCES && func_name) - verbose(env, "calling kernel function %s is not allowed\n", func_name); + err = bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); + if (err == -EACCES && meta.func_name) + verbose(env, "calling kernel function %s is not allowed\n", meta.func_name); if (err) return err; desc_btf = meta.btf; + func_name = meta.func_name; insn_aux = &env->insn_aux_data[insn_idx]; - insn_aux->is_iter_next = is_iter_next_kfunc(&meta); + insn_aux->is_iter_next = bpf_is_iter_next_kfunc(&meta); + + if (!insn->off && + (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] || + insn->imm == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) { + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) { + verbose(env, "failed to push state for failed lock acquisition\n"); + return PTR_ERR(branch); + } + + regs = branch->frame[branch->curframe]->regs; + + /* Clear r0-r5 registers in forked state */ + for (i = 0; i < CALLER_SAVED_REGS; i++) + bpf_mark_reg_not_init(env, &regs[caller_saved[i]]); + + mark_reg_unknown(env, regs, BPF_REG_0); + err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1); + if (err) { + verbose(env, "failed to mark s32 range for retval in 
forked state for lock\n"); + return err; + } + __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32)); + } else if (!insn->off && insn->imm == special_kfunc_list[KF___bpf_trap]) { + verbose(env, "unexpected __bpf_trap() due to uninitialized variable?\n"); + return -EFAULT; + } if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); return -EACCES; } - sleepable = is_kfunc_sleepable(&meta); + sleepable = bpf_is_kfunc_sleepable(&meta); if (sleepable && !in_sleepable(env)) { verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name); return -EACCES; } + /* Track non-sleepable context for kfuncs, same as for helpers. */ + if (!in_sleepable_context(env)) + insn_aux->non_sleepable = true; + /* Check the arguments */ err = check_kfunc_args(env, &meta, insn_idx); if (err < 0) return err; - if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + if (is_bpf_rbtree_add_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_rbtree_add_callback_state); if (err) { @@ -13106,7 +13080,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.r0_rdonly = false; } - if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { + if (is_bpf_wq_set_callback_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_timer_callback_state); if (err) { @@ -13116,62 +13090,64 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_task_work_add_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_task_work_schedule_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); preempt_disable = is_kfunc_bpf_preempt_disable(&meta); 
preempt_enable = is_kfunc_bpf_preempt_enable(&meta); - if (env->cur_state->active_rcu_lock) { + if (rcu_lock) { + env->cur_state->active_rcu_locks++; + } else if (rcu_unlock) { struct bpf_func_state *state; struct bpf_reg_state *reg; u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); - if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { - verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); - return -EACCES; - } - - if (rcu_lock) { - verbose(env, "nested rcu read lock (kernel function %s)\n", func_name); + if (env->cur_state->active_rcu_locks == 0) { + verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); return -EINVAL; - } else if (rcu_unlock) { + } + if (--env->cur_state->active_rcu_locks == 0) { bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({ if (reg->type & MEM_RCU) { reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; } })); - env->cur_state->active_rcu_lock = false; - } else if (sleepable) { - verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); - return -EACCES; - } - } else if (rcu_lock) { - env->cur_state->active_rcu_lock = true; - } else if (rcu_unlock) { - verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); - return -EINVAL; - } - - if (env->cur_state->active_preempt_locks) { - if (preempt_disable) { - env->cur_state->active_preempt_locks++; - } else if (preempt_enable) { - env->cur_state->active_preempt_locks--; - } else if (sleepable) { - verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name); - return -EACCES; } } else if (preempt_disable) { env->cur_state->active_preempt_locks++; } else if (preempt_enable) { - verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); - return -EINVAL; + if (env->cur_state->active_preempt_locks == 0) { + verbose(env, "unmatched attempt to enable preemption (kernel function 
%s)\n", func_name); + return -EINVAL; + } + env->cur_state->active_preempt_locks--; } - if (env->cur_state->active_irq_id && sleepable) { - verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name); + if (sleepable && !in_sleepable_context(env)) { + verbose(env, "kernel func %s is sleepable within %s\n", + func_name, non_sleepable_context_description(env)); + return -EACCES; + } + + if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { + verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); + return -EACCES; + } + + if (is_kfunc_rcu_protected(&meta) && !in_rcu_cs(env)) { + verbose(env, "kernel func %s requires RCU critical section protection\n", func_name); return -EACCES; } @@ -13179,19 +13155,23 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ if (meta.release_regno) { - err = release_reference(env, regs[meta.release_regno].ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); - return err; + struct bpf_reg_state *reg = &regs[meta.release_regno]; + + if (meta.initialized_dynptr.ref_obj_id) { + err = unmark_stack_slots_dynptr(env, reg); + } else { + err = release_reference(env, reg->ref_obj_id); + if (err) + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, meta.func_id); } + if (err) + return err; } - if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || - meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || - meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + if (is_bpf_list_push_kfunc(meta.func_id) || is_bpf_rbtree_add_kfunc(meta.func_id)) { release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; - insn_aux->insert_off = regs[BPF_REG_2].off; + insn_aux->insert_off = regs[BPF_REG_2].var_off.value; insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, 
meta.arg_btf_id); err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { @@ -13226,18 +13206,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } - for (i = 0; i < CALLER_SAVED_REGS; i++) - mark_reg_not_init(env, regs, caller_saved[i]); + for (i = 0; i < CALLER_SAVED_REGS; i++) { + u32 regno = caller_saved[i]; + + bpf_mark_reg_not_init(env, &regs[regno]); + regs[regno].subreg_def = DEF_NOT_SUBREG; + } /* Check return type */ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { - /* Only exception is bpf_obj_new_impl */ if (meta.btf != btf_vmlinux || - (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] && - meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] && - meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) { + (!is_bpf_obj_new_kfunc(meta.func_id) && + !is_bpf_percpu_obj_new_kfunc(meta.func_id) && + !is_bpf_refcount_acquire_kfunc(meta.func_id))) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL; } @@ -13245,168 +13228,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); + if (meta.btf == btf_vmlinux && (meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock] || + meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) + __mark_reg_const_zero(env, &regs[BPF_REG_0]); mark_btf_func_reg_size(env, BPF_REG_0, t->size); } else if (btf_type_is_ptr(t)) { ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); - - if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] || - meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - struct btf_struct_meta *struct_meta; - struct btf *ret_btf; - u32 ret_btf_id; - - if (meta.func_id == 
special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) - return -ENOMEM; - - if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { - verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); - return -EINVAL; - } - - ret_btf = env->prog->aux->btf; - ret_btf_id = meta.arg_constant.value; - - /* This may be NULL due to user not supplying a BTF */ - if (!ret_btf) { - verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); - return -EINVAL; - } - - ret_t = btf_type_by_id(ret_btf, ret_btf_id); - if (!ret_t || !__btf_type_is_struct(ret_t)) { - verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); - return -EINVAL; - } - - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { - verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", - ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); - return -EINVAL; - } - - if (!bpf_global_percpu_ma_set) { - mutex_lock(&bpf_percpu_ma_lock); - if (!bpf_global_percpu_ma_set) { - /* Charge memory allocated with bpf_global_percpu_ma to - * root memcg. The obj_cgroup for root memcg is NULL. 
- */ - err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); - if (!err) - bpf_global_percpu_ma_set = true; - } - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - - mutex_lock(&bpf_percpu_ma_lock); - err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - - struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { - verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); - return -EINVAL; - } - - if (struct_meta) { - verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); - return -EINVAL; - } - } - - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = ret_btf; - regs[BPF_REG_0].btf_id = ret_btf_id; - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) - regs[BPF_REG_0].type |= MEM_PERCPU; - - insn_aux->obj_new_size = ret_t->size; - insn_aux->kptr_struct_meta = struct_meta; - } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = meta.arg_btf; - regs[BPF_REG_0].btf_id = meta.arg_btf_id; - - insn_aux->kptr_struct_meta = - btf_find_struct_meta(meta.arg_btf, - meta.arg_btf_id); - } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || - meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { - struct btf_field *field = meta.arg_list_head.field; - - mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); - } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] || - meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { - struct btf_field *field = meta.arg_rbtree_root.field; - - mark_reg_graph_node(regs, 
BPF_REG_0, &field->graph_root); - } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].btf_id = meta.ret_btf_id; - } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { - ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value); - if (!ret_t || !btf_type_is_struct(ret_t)) { - verbose(env, - "kfunc bpf_rdonly_cast type ID argument must be of a struct\n"); - return -EINVAL; - } - - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].btf_id = meta.arg_constant.value; - } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] || - meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { - enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type); - - mark_reg_known_zero(env, regs, BPF_REG_0); - - if (!meta.arg_constant.found) { - verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); - return -EFAULT; - } - - regs[BPF_REG_0].mem_size = meta.arg_constant.value; - - /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ - regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; - - if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { - regs[BPF_REG_0].type |= MEM_RDONLY; - } else { - /* this will set env->seen_direct_write to true */ - if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { - verbose(env, "the prog does not allow writes to packet data\n"); - return -EINVAL; - } - } - - if (!meta.initialized_dynptr.id) { - verbose(env, "verifier internal error: no dynptr id\n"); - return -EFAULT; - } - regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id; - - /* we don't need to set BPF_REG_0's ref obj id - * because packet slices are not refcounted (see - * dynptr_type_refcounted) - */ - } else { - 
verbose(env, "kernel function %s unhandled dynamic return type\n", - meta.func_name); - return -EFAULT; - } + err = check_special_kfunc(env, &meta, regs, insn_aux, ptr_type, desc_btf); + if (err) { + if (err < 0) + return err; } else if (btf_type_is_void(ptr_type)) { /* kfunc returning 'void *' is equivalent to returning scalar */ mark_reg_unknown(env, regs, BPF_REG_0); @@ -13440,25 +13271,42 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Ensures we don't access the memory after a release_reference() */ if (meta.ref_obj_id) regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + + if (is_kfunc_rcu_protected(&meta)) + regs[BPF_REG_0].type |= MEM_RCU; } else { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].type = PTR_TO_BTF_ID; - regs[BPF_REG_0].btf_id = ptr_type_id; + enum bpf_reg_type type = PTR_TO_BTF_ID; if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) - regs[BPF_REG_0].type |= PTR_UNTRUSTED; - - if (is_iter_next_kfunc(&meta)) { - struct bpf_reg_state *cur_iter; - - cur_iter = get_iter_from_state(env->cur_state, &meta); - - if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */ - regs[BPF_REG_0].type |= MEM_RCU; - else - regs[BPF_REG_0].type |= PTR_TRUSTED; + type |= PTR_UNTRUSTED; + else if (is_kfunc_rcu_protected(&meta) || + (bpf_is_iter_next_kfunc(&meta) && + (get_iter_from_state(env->cur_state, &meta) + ->type & MEM_RCU))) { + /* + * If the iterator's constructor (the _new + * function e.g., bpf_iter_task_new) has been + * annotated with BPF kfunc flag + * KF_RCU_PROTECTED and was called within a RCU + * read-side critical section, also propagate + * the MEM_RCU flag to the pointer returned from + * the iterator's next function (e.g., + * bpf_iter_task_next). + */ + type |= MEM_RCU; + } else { + /* + * Any PTR_TO_BTF_ID that is returned from a BPF + * kfunc should by default be treated as + * implicitly trusted. 
+ */ + type |= PTR_TRUSTED; } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].type = type; + regs[BPF_REG_0].btf_id = ptr_type_id; } if (is_kfunc_ret_null(&meta)) { @@ -13475,16 +13323,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_ret_null(&meta)) regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; - } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { + } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) { ref_set_non_owning(env, &regs[BPF_REG_0]); } if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { - if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || - meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { + if (meta.btf == btf_vmlinux) { + if (is_bpf_obj_drop_kfunc(meta.func_id) || + is_bpf_percpu_obj_drop_kfunc(meta.func_id)) { insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); @@ -13492,6 +13340,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (bpf_is_kfunc_pkt_changing(&meta)) + clear_all_pkt_pointers(env); + nargs = btf_type_vlen(meta.func_proto); args = (const struct btf_param *)(meta.func_proto + 1); for (i = 0; i < nargs; i++) { @@ -13501,22 +13352,28 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (btf_type_is_ptr(t)) mark_btf_func_reg_size(env, regno, sizeof(void *)); else - /* scalar. ensured by btf_check_kfunc_arg_match() */ + /* scalar. 
ensured by check_kfunc_args() */ mark_btf_func_reg_size(env, regno, t->size); } - if (is_iter_next_kfunc(&meta)) { + if (bpf_is_iter_next_kfunc(&meta)) { err = process_iter_next_call(env, insn_idx, &meta); if (err) return err; } + if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) + env->prog->call_session_cookie = true; + + if (bpf_is_throw_kfunc(insn)) + return process_bpf_exit_full(env, NULL, true); + return 0; } -static bool check_reg_sane_offset(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - enum bpf_reg_type type) +static bool check_reg_sane_offset_scalar(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + enum bpf_reg_type type) { bool known = tnum_is_const(reg->var_off); s64 val = reg->var_off.value; @@ -13528,12 +13385,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return false; } - if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { - verbose(env, "%s pointer offset %d is not allowed\n", - reg_type_str(env, type), reg->off); - return false; - } - if (smin == S64_MIN) { verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", reg_type_str(env, type)); @@ -13549,6 +13400,29 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } +static bool check_reg_sane_offset_ptr(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + enum bpf_reg_type type) +{ + bool known = tnum_is_const(reg->var_off); + s64 val = reg->var_off.value; + s64 smin = reg->smin_value; + + if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { + verbose(env, "%s pointer offset %lld is not allowed\n", + reg_type_str(env, type), val); + return false; + } + + if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { + verbose(env, "%s pointer offset %lld is not allowed\n", + reg_type_str(env, type), smin); + return false; + } + + return true; +} + enum { REASON_BOUNDS = -1, REASON_TYPE = -2, @@ -13570,13 +13444,11 @@ static 
int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, * currently prohibited for unprivileged. */ max = MAX_BPF_STACK + mask_to_left; - ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off); + ptr_limit = -ptr_reg->var_off.value; break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; - ptr_limit = (mask_to_left ? - ptr_reg->smin_value : - ptr_reg->umax_value) + ptr_reg->off; + ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value; break; default: return REASON_TYPE; @@ -13591,7 +13463,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || + BPF_SRC(insn->code) == BPF_K || + cur_aux(env)->nospec; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -13632,16 +13506,15 @@ struct bpf_sanitize_info { bool mask_to_left; }; -static struct bpf_verifier_state * -sanitize_speculative_path(struct bpf_verifier_env *env, - const struct bpf_insn *insn, - u32 next_idx, u32 curr_idx) +static int sanitize_speculative_path(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + u32 next_idx, u32 curr_idx) { struct bpf_verifier_state *branch; struct bpf_reg_state *regs; branch = push_stack(env, next_idx, curr_idx, true); - if (branch && insn) { + if (!IS_ERR(branch) && insn) { regs = branch->frame[branch->curframe]->regs; if (BPF_SRC(insn->code) == BPF_K) { mark_reg_unknown(env, regs, insn->dst_reg); @@ -13650,7 +13523,7 @@ sanitize_speculative_path(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->src_reg); } } - return branch; + return PTR_ERR_OR_ZERO(branch); } static int sanitize_ptr_alu(struct bpf_verifier_env *env, @@ -13669,7 +13542,6 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; struct bpf_reg_state tmp; - bool ret; int 
err; if (can_skip_alu_sanitation(env, insn)) @@ -13742,11 +13614,12 @@ do_sim: tmp = *dst_reg; copy_register_state(dst_reg, ptr_reg); } - ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, - env->insn_idx); - if (!ptr_is_dst_reg && ret) + err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); + if (err < 0) + return REASON_STACK; + if (!ptr_is_dst_reg) *dst_reg = tmp; - return !ret ? REASON_STACK : 0; + return 0; } static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) @@ -13791,10 +13664,9 @@ static int sanitize_err(struct bpf_verifier_env *env, case REASON_STACK: verbose(env, "R%d could not be pushed for speculative verification, %s\n", dst, err); - break; + return -ENOMEM; default: - verbose(env, "verifier internal error: unknown reason (%d)\n", - reason); + verifier_bug(env, "unknown reason (%d)", reason); break; } @@ -13807,9 +13679,6 @@ static int sanitize_err(struct bpf_verifier_env *env, * Variable offset is prohibited for unprivileged mode for simplicity since it * requires corresponding support in Spectre masking for stack ALU. See also * retrieve_ptr_limit(). - * - * - * 'off' includes 'reg->off'. 
*/ static int check_stack_access_for_ptr_arithmetic( struct bpf_verifier_env *env, @@ -13850,18 +13719,18 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, switch (dst_reg->type) { case PTR_TO_STACK: if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg, - dst_reg->off + dst_reg->var_off.value)) + dst_reg->var_off.value)) return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) { + if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; } break; default: - break; + return -EOPNOTSUPP; } return 0; @@ -13888,7 +13757,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; - int ret; + int ret, bounds_ret; dst_reg = &regs[dst]; @@ -13920,6 +13789,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } + /* + * Accesses to untrusted PTR_TO_MEM are done through probe + * instructions, hence no need to track offsets. + */ + if (base_type(ptr_reg->type) == PTR_TO_MEM && (ptr_reg->type & PTR_UNTRUSTED)) + return 0; + switch (base_type(ptr_reg->type)) { case PTR_TO_CTX: case PTR_TO_MAP_VALUE: @@ -13955,8 +13831,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->type = ptr_reg->type; dst_reg->id = ptr_reg->id; - if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) || - !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) + if (!check_reg_sane_offset_scalar(env, off_reg, ptr_reg->type) || + !check_reg_sane_offset_ptr(env, ptr_reg, ptr_reg->type)) return -EINVAL; /* pointer types do not carry 32-bit bounds at the moment. 
*/ @@ -13971,23 +13847,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: - /* We can take a fixed offset as long as it doesn't overflow - * the s32 'off' field - */ - if (known && (ptr_reg->off + smin_val == - (s64)(s32)(ptr_reg->off + smin_val))) { - /* pointer += K. Accumulate it into fixed offset */ - dst_reg->smin_value = smin_ptr; - dst_reg->smax_value = smax_ptr; - dst_reg->umin_value = umin_ptr; - dst_reg->umax_value = umax_ptr; - dst_reg->var_off = ptr_reg->var_off; - dst_reg->off = ptr_reg->off + smin_val; - dst_reg->raw = ptr_reg->raw; - break; - } - /* A new variable offset is created. Note that off_reg->off - * == 0, since it's a scalar. + /* * dst_reg gets the pointer type and since some positive * integer value was added to the pointer, give it a new 'id' * if it's a PTR_TO_PACKET. @@ -14006,12 +13866,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = U64_MAX; } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); - dst_reg->off = ptr_reg->off; dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { - dst_reg->id = ++env->id_gen; - /* something was added to pkt_ptr, set range to zero */ - memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); + if (!known) + dst_reg->id = ++env->id_gen; + /* + * Clear range for unknown addends since we can't know + * where the pkt pointer ended up. Also clear AT_PKT_END / + * BEYOND_PKT_END from prior comparison as any pointer + * arithmetic invalidates them. + */ + if (!known || dst_reg->range < 0) + memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); } break; case BPF_SUB: @@ -14030,19 +13896,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst); return -EACCES; } - if (known && (ptr_reg->off - smin_val == - (s64)(s32)(ptr_reg->off - smin_val))) { - /* pointer -= K. 
Subtract it from fixed offset */ - dst_reg->smin_value = smin_ptr; - dst_reg->smax_value = smax_ptr; - dst_reg->umin_value = umin_ptr; - dst_reg->umax_value = umax_ptr; - dst_reg->var_off = ptr_reg->var_off; - dst_reg->id = ptr_reg->id; - dst_reg->off = ptr_reg->off - smin_val; - dst_reg->raw = ptr_reg->raw; - break; - } /* A new variable offset is created. If the subtrahend is known * nonnegative, then any reg->range we had before is still good. */ @@ -14062,12 +13915,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = umax_ptr - umin_val; } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); - dst_reg->off = ptr_reg->off; dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { - dst_reg->id = ++env->id_gen; - /* something was added to pkt_ptr, set range to zero */ - if (smin_val < 0) + if (!known) + dst_reg->id = ++env->id_gen; + /* + * Clear range if the subtrahend may be negative since + * pkt pointer could move past its bounds. A positive + * subtrahend moves it backwards keeping positive range + * intact. Also clear AT_PKT_END / BEYOND_PKT_END from + * prior comparison as arithmetic invalidates them. 
+ */ + if ((!known && smin_val < 0) || dst_reg->range < 0) memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); } break; @@ -14085,14 +13944,22 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) + if (!check_reg_sane_offset_ptr(env, dst_reg, ptr_reg->type)) return -EINVAL; reg_bounds_sync(dst_reg); - if (sanitize_check_bounds(env, insn, dst_reg) < 0) - return -EACCES; + bounds_ret = sanitize_check_bounds(env, insn, dst_reg); + if (bounds_ret == -EACCES) + return bounds_ret; if (sanitize_needed(opcode)) { ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, &info, true); + if (verifier_bug_if(!can_skip_alu_sanitation(env, insn) + && !env->cur_state->speculative + && bounds_ret + && !ret, + env, "Pointer type unsupported by sanitize_check_bounds() not rejected by retrieve_ptr_limit() as required")) { + return -EFAULT; + } if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg); } @@ -14107,14 +13974,25 @@ static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, s32 *dst_smax = &dst_reg->s32_max_value; u32 *dst_umin = &dst_reg->u32_min_value; u32 *dst_umax = &dst_reg->u32_max_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + bool min_overflow, max_overflow; if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) || check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) { *dst_smin = S32_MIN; *dst_smax = S32_MAX; } - if (check_add_overflow(*dst_umin, src_reg->u32_min_value, dst_umin) || - check_add_overflow(*dst_umax, src_reg->u32_max_value, dst_umax)) { + + /* If either all additions overflow or no additions overflow, then + * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = + * dst_umax + src_umax. Otherwise (some additions overflow), set + * the output bounds to unbounded. 
+ */ + min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); + max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); + + if (!min_overflow && max_overflow) { *dst_umin = 0; *dst_umax = U32_MAX; } @@ -14127,14 +14005,25 @@ static void scalar_min_max_add(struct bpf_reg_state *dst_reg, s64 *dst_smax = &dst_reg->smax_value; u64 *dst_umin = &dst_reg->umin_value; u64 *dst_umax = &dst_reg->umax_value; + u64 umin_val = src_reg->umin_value; + u64 umax_val = src_reg->umax_value; + bool min_overflow, max_overflow; if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) { *dst_smin = S64_MIN; *dst_smax = S64_MAX; } - if (check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin) || - check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax)) { + + /* If either all additions overflow or no additions overflow, then + * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = + * dst_umax + src_umax. Otherwise (some additions overflow), set + * the output bounds to unbounded. 
+ */ + min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); + max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); + + if (!min_overflow && max_overflow) { *dst_umin = 0; *dst_umax = U64_MAX; } @@ -14145,8 +14034,11 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, { s32 *dst_smin = &dst_reg->s32_min_value; s32 *dst_smax = &dst_reg->s32_max_value; + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; u32 umin_val = src_reg->u32_min_value; u32 umax_val = src_reg->u32_max_value; + bool min_underflow, max_underflow; if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) || check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) { @@ -14154,14 +14046,18 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, *dst_smin = S32_MIN; *dst_smax = S32_MAX; } - if (dst_reg->u32_min_value < umax_val) { - /* Overflow possible, we know nothing */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; - } else { - /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->u32_min_value -= umax_val; - dst_reg->u32_max_value -= umin_val; + + /* If either all subtractions underflow or no subtractions + * underflow, it is okay to set: dst_umin = dst_umin - src_umax, + * dst_umax = dst_umax - src_umin. Otherwise (some subtractions + * underflow), set the output bounds to unbounded. 
+ */ + min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); + max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); + + if (min_underflow && !max_underflow) { + *dst_umin = 0; + *dst_umax = U32_MAX; } } @@ -14170,8 +14066,11 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, { s64 *dst_smin = &dst_reg->smin_value; s64 *dst_smax = &dst_reg->smax_value; + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; u64 umin_val = src_reg->umin_value; u64 umax_val = src_reg->umax_value; + bool min_underflow, max_underflow; if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) { @@ -14179,14 +14078,18 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, *dst_smin = S64_MIN; *dst_smax = S64_MAX; } - if (dst_reg->umin_value < umax_val) { - /* Overflow possible, we know nothing */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->umin_value -= umax_val; - dst_reg->umax_value -= umin_val; + + /* If either all subtractions underflow or no subtractions + * underflow, it is okay to set: dst_umin = dst_umin - src_umax, + * dst_umax = dst_umax - src_umin. Otherwise (some subtractions + * underflow), set the output bounds to unbounded. 
+ */ + min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); + max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); + + if (min_underflow && !max_underflow) { + *dst_umin = 0; + *dst_umax = U64_MAX; } } @@ -14246,6 +14149,252 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, } } +static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + + *dst_umin = *dst_umin / src_val; + *dst_umax = *dst_umax / src_val; + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + + *dst_umin = div64_u64(*dst_umin, src_val); + *dst_umax = div64_u64(*dst_umax, src_val); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 res1, res2; + + /* BPF div specification: S32_MIN / -1 = S32_MIN */ + if (*dst_smin == S32_MIN && src_val == -1) { + /* + * If the dividend range contains more than just S32_MIN, + * we cannot precisely track the result, so it becomes unbounded. 
+ * e.g., [S32_MIN, S32_MIN+10]/(-1), + * = {S32_MIN} U [-(S32_MIN+10), -(S32_MIN+1)] + * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX] + * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN. + */ + if (*dst_smax != S32_MIN) { + *dst_smin = S32_MIN; + *dst_smax = S32_MAX; + } + goto reset; + } + + res1 = *dst_smin / src_val; + res2 = *dst_smax / src_val; + *dst_smin = min(res1, res2); + *dst_smax = max(res1, res2); + +reset: + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 res1, res2; + + /* BPF div specification: S64_MIN / -1 = S64_MIN */ + if (*dst_smin == S64_MIN && src_val == -1) { + /* + * If the dividend range contains more than just S64_MIN, + * we cannot precisely track the result, so it becomes unbounded. + * e.g., [S64_MIN, S64_MIN+10]/(-1), + * = {S64_MIN} U [-(S64_MIN+10), -(S64_MIN+1)] + * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX] + * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN. + */ + if (*dst_smax != S64_MIN) { + *dst_smin = S64_MIN; + *dst_smax = S64_MAX; + } + goto reset; + } + + res1 = div64_s64(*dst_smin, src_val); + res2 = div64_s64(*dst_smax, src_val); + *dst_smin = min(res1, res2); + *dst_smax = max(res1, res2); + +reset: + /* Reset other ranges/tnum to unbounded/unknown. 
*/ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + u32 res_max = src_val - 1; + + /* + * If dst_umax <= res_max, the result remains unchanged. + * e.g., [2, 5] % 10 = [2, 5]. + */ + if (*dst_umax <= res_max) + return; + + *dst_umin = 0; + *dst_umax = min(*dst_umax, res_max); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 res_max = src_val - 1; + + /* + * If dst_umax <= res_max, the result remains unchanged. + * e.g., [2, 5] % 10 = [2, 5]. + */ + if (*dst_umax <= res_max) + return; + + *dst_umin = 0; + *dst_umax = min(*dst_umax, res_max); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + + /* + * Safe absolute value calculation: + * If src_val == S32_MIN (-2147483648), src_abs becomes 2147483648. + * Here use unsigned integer to avoid overflow. + */ + u32 src_abs = (src_val > 0) ? (u32)src_val : -(u32)src_val; + + /* + * Calculate the maximum possible absolute value of the result. 
+ * Even if src_abs is 2147483648 (S32_MIN), subtracting 1 gives + * 2147483647 (S32_MAX), which fits perfectly in s32. + */ + s32 res_max_abs = src_abs - 1; + + /* + * If the dividend is already within the result range, + * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. + */ + if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + return; + + /* General case: result has the same sign as the dividend. */ + if (*dst_smin >= 0) { + *dst_smin = 0; + *dst_smax = min(*dst_smax, res_max_abs); + } else if (*dst_smax <= 0) { + *dst_smax = 0; + *dst_smin = max(*dst_smin, -res_max_abs); + } else { + *dst_smin = -res_max_abs; + *dst_smax = res_max_abs; + } + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + + /* + * Safe absolute value calculation: + * If src_val == S64_MIN (-2^63), src_abs becomes 2^63. + * Here use unsigned integer to avoid overflow. + */ + u64 src_abs = (src_val > 0) ? (u64)src_val : -(u64)src_val; + + /* + * Calculate the maximum possible absolute value of the result. + * Even if src_abs is 2^63 (S64_MIN), subtracting 1 gives + * 2^63 - 1 (S64_MAX), which fits perfectly in s64. + */ + s64 res_max_abs = src_abs - 1; + + /* + * If the dividend is already within the result range, + * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. + */ + if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + return; + + /* General case: result has the same sign as the dividend. 
*/ + if (*dst_smin >= 0) { + *dst_smin = 0; + *dst_smax = min(*dst_smax, res_max_abs); + } else if (*dst_smax <= 0) { + *dst_smax = 0; + *dst_smin = max(*dst_smin, -res_max_abs); + } else { + *dst_smin = -res_max_abs; + *dst_smax = res_max_abs; + } + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + reset_reg32_and_tnum(dst_reg); +} + static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { @@ -14470,21 +14619,17 @@ static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { /* Special case <<32 because it is a common compiler pattern to sign - * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are - * positive we know this shift will also be positive so we can track - * bounds correctly. Otherwise we lose all sign bit information except - * what we can pick up from var_off. Perhaps we can generalize this - * later to shifts of any length. + * extend subreg by doing <<32 s>>32. smin/smax assignments are correct + * because s32 bounds don't flip sign when shifting to the left by + * 32bits. */ - if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0) + if (umin_val == 32 && umax_val == 32) { dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; - else - dst_reg->smax_value = S64_MAX; - - if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0) dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32; - else + } else { + dst_reg->smax_value = S64_MAX; dst_reg->smin_value = S64_MIN; + } /* If we might shift our top bit out, then we know nothing */ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { @@ -14627,6 +14772,55 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *insn) +{ + /* + * Byte swap operation - update var_off using tnum_bswap. 
+ * Three cases: + * 1. bswap(16|32|64): opcode=0xd7 (BPF_END | BPF_ALU64 | BPF_TO_LE) + * unconditional swap + * 2. to_le(16|32|64): opcode=0xd4 (BPF_END | BPF_ALU | BPF_TO_LE) + * swap on big-endian, truncation or no-op on little-endian + * 3. to_be(16|32|64): opcode=0xdc (BPF_END | BPF_ALU | BPF_TO_BE) + * swap on little-endian, truncation or no-op on big-endian + */ + + bool alu64 = BPF_CLASS(insn->code) == BPF_ALU64; + bool to_le = BPF_SRC(insn->code) == BPF_TO_LE; + bool is_big_endian; +#ifdef CONFIG_CPU_BIG_ENDIAN + is_big_endian = true; +#else + is_big_endian = false; +#endif + /* Apply bswap if alu64 or switch between big-endian and little-endian machines */ + bool need_bswap = alu64 || (to_le == is_big_endian); + + /* + * If the register is mutated, manually reset its scalar ID to break + * any existing ties and avoid incorrect bounds propagation. + */ + if (need_bswap || insn->imm == 16 || insn->imm == 32) + clear_scalar_id(dst_reg); + + if (need_bswap) { + if (insn->imm == 16) + dst_reg->var_off = tnum_bswap16(dst_reg->var_off); + else if (insn->imm == 32) + dst_reg->var_off = tnum_bswap32(dst_reg->var_off); + else if (insn->imm == 64) + dst_reg->var_off = tnum_bswap64(dst_reg->var_off); + /* + * Byteswap scrambles the range, so we must reset bounds. + * Bounds will be re-derived from the new tnum later. 
+ */ + __mark_reg_unbounded(dst_reg); + } + /* For bswap16/32, truncate dst register to match the swapped size */ + if (insn->imm == 16 || insn->imm == 32) + coerce_reg_to_size(dst_reg, insn->imm / 8); +} + static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, const struct bpf_reg_state *src_reg) { @@ -14648,12 +14842,22 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, switch (BPF_OP(insn->code)) { case BPF_ADD: case BPF_SUB: + case BPF_NEG: case BPF_AND: case BPF_XOR: case BPF_OR: case BPF_MUL: + case BPF_END: return true; + /* + * Division and modulo operators range is only safe to compute when the + * divisor is a constant. + */ + case BPF_DIV: + case BPF_MOD: + return src_is_const; + /* Shift operators range is only computable if shift dimension operand * is a constant. Shifts greater than 31 or 63 are undefined. This * includes shifts by a negative number. @@ -14667,6 +14871,35 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, } } +static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *insn, + struct bpf_reg_state *dst_reg) +{ + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + bool alu32; + + if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0) + alu32 = false; + else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0) + alu32 = true; + else + return 0; + + branch = push_stack(env, env->insn_idx, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + + regs = branch->frame[branch->curframe]->regs; + if (alu32) { + __mark_reg32_known(&regs[insn->dst_reg], 0); + __mark_reg32_known(dst_reg, -1ull); + } else { + __mark_reg_known(&regs[insn->dst_reg], 0); + __mark_reg_known(dst_reg, -1ull); + } + return 0; +} + /* WARNING: This function does calculations on 64-bit values, but the actual * execution may occur on 32-bit values. Therefore, things like bitshifts * need extra checks in the 32-bit case. 
@@ -14677,6 +14910,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state src_reg) { u8 opcode = BPF_OP(insn->code); + s16 off = insn->off; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); int ret; @@ -14716,17 +14950,66 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_sub(dst_reg, &src_reg); dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; + case BPF_NEG: + env->fake_reg[0] = *dst_reg; + __mark_reg_known(dst_reg, 0); + scalar32_min_max_sub(dst_reg, &env->fake_reg[0]); + scalar_min_max_sub(dst_reg, &env->fake_reg[0]); + dst_reg->var_off = tnum_neg(env->fake_reg[0].var_off); + break; case BPF_MUL: dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); scalar32_min_max_mul(dst_reg, &src_reg); scalar_min_max_mul(dst_reg, &src_reg); break; + case BPF_DIV: + /* BPF div specification: x / 0 = 0 */ + if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) { + ___mark_reg_known(dst_reg, 0); + break; + } + if (alu32) + if (off == 1) + scalar32_min_max_sdiv(dst_reg, &src_reg); + else + scalar32_min_max_udiv(dst_reg, &src_reg); + else + if (off == 1) + scalar_min_max_sdiv(dst_reg, &src_reg); + else + scalar_min_max_udiv(dst_reg, &src_reg); + break; + case BPF_MOD: + /* BPF mod specification: x % 0 = x */ + if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) + break; + if (alu32) + if (off == 1) + scalar32_min_max_smod(dst_reg, &src_reg); + else + scalar32_min_max_umod(dst_reg, &src_reg); + else + if (off == 1) + scalar_min_max_smod(dst_reg, &src_reg); + else + scalar_min_max_umod(dst_reg, &src_reg); + break; case BPF_AND: + if (tnum_is_const(src_reg.var_off)) { + ret = maybe_fork_scalars(env, insn, dst_reg); + if (ret) + return ret; + } dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); scalar32_min_max_and(dst_reg, &src_reg); scalar_min_max_and(dst_reg, &src_reg); break; case BPF_OR: + if 
(tnum_is_const(src_reg.var_off)) { + ret = maybe_fork_scalars(env, insn, dst_reg); + if (ret) + return ret; + } dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); scalar32_min_max_or(dst_reg, &src_reg); scalar_min_max_or(dst_reg, &src_reg); @@ -14754,12 +15037,23 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, else scalar_min_max_arsh(dst_reg, &src_reg); break; + case BPF_END: + scalar_byte_swap(dst_reg, insn); + break; default: break; } - /* ALU32 ops are zero extended into 64bit register */ - if (alu32) + /* + * ALU32 ops are zero extended into 64bit register. + * + * BPF_END is already handled inside the helper (truncation), + * so skip zext here to avoid unexpected zero extension. + * e.g., le64: opcode=(BPF_END|BPF_ALU|BPF_TO_LE), imm=0x40 + * This is a 64bit byte swap operation with alu32==true, + * but we should not zero extend the result. + */ + if (alu32 && opcode != BPF_END) zext_32_to_64(dst_reg); reg_bounds_sync(dst_reg); return 0; @@ -14780,11 +15074,20 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, int err; dst_reg = &regs[insn->dst_reg]; - src_reg = NULL; + if (BPF_SRC(insn->code) == BPF_X) + src_reg = &regs[insn->src_reg]; + else + src_reg = NULL; - if (dst_reg->type == PTR_TO_ARENA) { + /* Case where at least one operand is an arena. */ + if (dst_reg->type == PTR_TO_ARENA || (src_reg && src_reg->type == PTR_TO_ARENA)) { struct bpf_insn_aux_data *aux = cur_aux(env); + if (dst_reg->type != PTR_TO_ARENA) + *dst_reg = *src_reg; + + dst_reg->subreg_def = env->insn_idx + 1; + if (BPF_CLASS(insn->code) == BPF_ALU64) /* * 32-bit operations zero upper bits automatically. 
@@ -14800,7 +15103,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, ptr_reg = dst_reg; if (BPF_SRC(insn->code) == BPF_X) { - src_reg = &regs[insn->src_reg]; if (src_reg->type != SCALAR_VALUE) { if (dst_reg->type != SCALAR_VALUE) { /* Combining two pointers by any ALU op yields @@ -14855,13 +15157,20 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, if (WARN_ON_ONCE(ptr_reg)) { print_verifier_state(env, vstate, vstate->curframe, true); verbose(env, "verifier internal error: unexpected ptr_reg\n"); - return -EINVAL; + return -EFAULT; } if (WARN_ON(!src_reg)) { print_verifier_state(env, vstate, vstate->curframe, true); verbose(env, "verifier internal error: no src_reg\n"); - return -EINVAL; + return -EFAULT; } + /* + * For alu32 linked register tracking, we need to check dst_reg's + * umax_value before the ALU operation. After adjust_scalar_min_max_vals(), + * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX. + */ + u64 dst_umax = dst_reg->umax_value; + err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) return err; @@ -14871,33 +15180,51 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access - * So for 64-bit alu remember constant delta between r2 and r1 and - * update r1 after 'if' condition. + * So remember constant delta between r2 and r1 and update r1 after + * 'if' condition. 
*/ if (env->bpf_capable && - BPF_OP(insn->code) == BPF_ADD && !alu32 && - dst_reg->id && is_reg_const(src_reg, false)) { - u64 val = reg_const_value(src_reg, false); + (BPF_OP(insn->code) == BPF_ADD || BPF_OP(insn->code) == BPF_SUB) && + dst_reg->id && is_reg_const(src_reg, alu32) && + !(BPF_SRC(insn->code) == BPF_X && insn->src_reg == insn->dst_reg)) { + u64 val = reg_const_value(src_reg, alu32); + s32 off; - if ((dst_reg->id & BPF_ADD_CONST) || - /* prevent overflow in sync_linked_regs() later */ - val > (u32)S32_MAX) { + if (!alu32 && ((s64)val < S32_MIN || (s64)val > S32_MAX)) + goto clear_id; + + if (alu32 && (dst_umax > U32_MAX)) + goto clear_id; + + off = (s32)val; + + if (BPF_OP(insn->code) == BPF_SUB) { + /* Negating S32_MIN would overflow */ + if (off == S32_MIN) + goto clear_id; + off = -off; + } + + if (dst_reg->id & BPF_ADD_CONST) { /* * If the register already went through rX += val * we cannot accumulate another val into rx->off. */ - dst_reg->off = 0; - dst_reg->id = 0; +clear_id: + clear_scalar_id(dst_reg); } else { - dst_reg->id |= BPF_ADD_CONST; - dst_reg->off = val; + if (alu32) + dst_reg->id |= BPF_ADD_CONST32; + else + dst_reg->id |= BPF_ADD_CONST64; + dst_reg->delta = off; } } else { /* * Make sure ID is cleared otherwise dst_reg min/max could be * incorrectly propagated into other registers by sync_linked_regs() */ - dst_reg->id = 0; + clear_scalar_id(dst_reg); } return 0; } @@ -14910,23 +15237,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) int err; if (opcode == BPF_END || opcode == BPF_NEG) { - if (opcode == BPF_NEG) { - if (BPF_SRC(insn->code) != BPF_K || - insn->src_reg != BPF_REG_0 || - insn->off != 0 || insn->imm != 0) { - verbose(env, "BPF_NEG uses reserved fields\n"); - return -EINVAL; - } - } else { - if (insn->src_reg != BPF_REG_0 || insn->off != 0 || - (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || - (BPF_CLASS(insn->code) == BPF_ALU64 && - BPF_SRC(insn->code) != BPF_TO_LE)) { - 
verbose(env, "BPF_END uses reserved fields\n"); - return -EINVAL; - } - } - /* check src operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) @@ -14939,45 +15249,31 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - err = check_reg_arg(env, insn->dst_reg, DST_OP); + if (regs[insn->dst_reg].type == SCALAR_VALUE) { + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + err = err ?: adjust_scalar_min_max_vals(env, insn, + &regs[insn->dst_reg], + regs[insn->dst_reg]); + } else { + err = check_reg_arg(env, insn->dst_reg, DST_OP); + } if (err) return err; } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { - if (BPF_CLASS(insn->code) == BPF_ALU) { - if ((insn->off != 0 && insn->off != 8 && insn->off != 16) || - insn->imm) { - verbose(env, "BPF_MOV uses reserved fields\n"); - return -EINVAL; - } - } else if (insn->off == BPF_ADDR_SPACE_CAST) { - if (insn->imm != 1 && insn->imm != 1u << 16) { - verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n"); - return -EINVAL; - } + if (insn->off == BPF_ADDR_SPACE_CAST) { if (!env->prog->aux->arena) { verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n"); return -EINVAL; } - } else { - if ((insn->off != 0 && insn->off != 8 && insn->off != 16 && - insn->off != 32) || insn->imm) { - verbose(env, "BPF_MOV uses reserved fields\n"); - return -EINVAL; - } } /* check src operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; - } else { - if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose(env, "BPF_MOV uses reserved fields\n"); - return -EINVAL; - } } /* check dest operand, mark as required later */ @@ -15004,7 +15300,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = 
DEF_NOT_SUBREG; } else { /* case: R1 = (s8, s16 s32)R2 */ @@ -15021,9 +15316,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); if (!no_sext) - dst_reg->id = 0; + clear_scalar_id(dst_reg); coerce_reg_to_size_sx(dst_reg, insn->off >> 3); - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -15048,8 +15342,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * propagated into src_reg by sync_linked_regs() */ if (!is_src_reg_u32) - dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; + clear_scalar_id(dst_reg); dst_reg->subreg_def = env->insn_idx + 1; } else { /* case: W1 = (s8, s16)W2 */ @@ -15059,8 +15352,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); if (!no_sext) - dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; + clear_scalar_id(dst_reg); dst_reg->subreg_def = env->insn_idx + 1; coerce_subreg_to_size_sx(dst_reg, insn->off >> 3); } @@ -15087,28 +15379,13 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } - } else if (opcode > BPF_END) { - verbose(env, "invalid BPF_ALU opcode %x\n", opcode); - return -EINVAL; - } else { /* all other ALU ops: and, sub, xor, add, ... 
*/ if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0 || insn->off > 1 || - (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { - verbose(env, "BPF_ALU uses reserved fields\n"); - return -EINVAL; - } /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; - } else { - if (insn->src_reg != BPF_REG_0 || insn->off > 1 || - (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { - verbose(env, "BPF_ALU uses reserved fields\n"); - return -EINVAL; - } } /* check src2 operand */ @@ -15151,19 +15428,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *reg; int new_range; - if (dst_reg->off < 0 || - (dst_reg->off == 0 && range_right_open)) + if (dst_reg->umax_value == 0 && range_right_open) /* This doesn't give us any range */ return; - if (dst_reg->umax_value > MAX_PACKET_OFF || - dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF) + if (dst_reg->umax_value > MAX_PACKET_OFF) /* Risk of overflow. For instance, ptr + (1<<63) may be less * than pkt_end, but that's because it's also less than pkt. */ return; - new_range = dst_reg->off; + new_range = dst_reg->umax_value; if (range_right_open) new_range++; @@ -15212,7 +15487,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, /* If our ids match, then we must have the same max_value. And we * don't care about the other reg's fixed offset, since if it's too big * the range won't allow anything. - * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. + * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16. 
*/ bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == type && reg->id == dst_reg->id) @@ -15221,11 +15496,50 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, })); } +static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32); +static u8 rev_opcode(u8 opcode); + +/* + * Learn more information about live branches by simulating refinement on both branches. + * regs_refine_cond_op() is sound, so producing ill-formed register bounds for the branch means + * that branch is dead. + */ +static int simulate_both_branches_taken(struct bpf_verifier_env *env, u8 opcode, bool is_jmp32) +{ + /* Fallthrough (FALSE) branch */ + regs_refine_cond_op(&env->false_reg1, &env->false_reg2, rev_opcode(opcode), is_jmp32); + reg_bounds_sync(&env->false_reg1); + reg_bounds_sync(&env->false_reg2); + /* + * If there is a range bounds violation in *any* of the abstract values in either + * reg_states in the FALSE branch (i.e. reg1, reg2), the FALSE branch must be dead. Only + * TRUE branch will be taken. + */ + if (range_bounds_violation(&env->false_reg1) || range_bounds_violation(&env->false_reg2)) + return 1; + + /* Jump (TRUE) branch */ + regs_refine_cond_op(&env->true_reg1, &env->true_reg2, opcode, is_jmp32); + reg_bounds_sync(&env->true_reg1); + reg_bounds_sync(&env->true_reg2); + /* + * If there is a range bounds violation in *any* of the abstract values in either + * reg_states in the TRUE branch (i.e. true_reg1, true_reg2), the TRUE branch must be dead. + * Only FALSE branch will be taken. + */ + if (range_bounds_violation(&env->true_reg1) || range_bounds_violation(&env->true_reg2)) + return 0; + + /* Both branches are possible, we can't determine which one will be taken. 
*/ + return -1; +} + /* * <reg1> <op> <reg2>, currently assuming reg2 is a constant */ -static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, - u8 opcode, bool is_jmp32) +static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *reg1, + struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32) { struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off; struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; @@ -15238,6 +15552,30 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value; + if (reg1 == reg2) { + switch (opcode) { + case BPF_JGE: + case BPF_JLE: + case BPF_JSGE: + case BPF_JSLE: + case BPF_JEQ: + return 1; + case BPF_JGT: + case BPF_JLT: + case BPF_JSGT: + case BPF_JSLT: + case BPF_JNE: + return 0; + case BPF_JSET: + if (tnum_is_const(t1)) + return t1.value != 0; + else + return (smin1 <= 0 && smax1 >= 0) ? 
-1 : 1; + default: + return -1; + } + } + switch (opcode) { case BPF_JEQ: /* constants, umin/umax and smin/smax checks would be @@ -15245,6 +15583,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value == t2.value; + if (!tnum_overlap(t1, t2)) + return 0; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 0; @@ -15269,6 +15609,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value != t2.value; + if (!tnum_overlap(t1, t2)) + return 1; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 1; @@ -15349,7 +15691,7 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta break; } - return -1; + return simulate_both_branches_taken(env, opcode, is_jmp32); } static int flip_opcode(u32 opcode) @@ -15420,8 +15762,8 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value * range [0,10] */ -static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, - u8 opcode, bool is_jmp32) +static int is_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *reg1, + struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32) { if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32) return is_pkt_ptr_branch_taken(reg1, reg2, opcode); @@ -15459,7 +15801,7 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg } /* now deal with two scalars, but not necessarily constants */ - return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32); + return is_scalar_branch_taken(env, reg1, reg2, opcode, is_jmp32); } /* Opcode that corresponds to a *false* branch condition. 
@@ -15550,8 +15892,8 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state /* u32_min_value is not equal to 0xffffffff at this point, * because otherwise u32_max_value is 0xffffffff as well, * in such a case both reg1 and reg2 would be constants, - * jump would be predicted and reg_set_min_max() won't - * be called. + * jump would be predicted and regs_refine_cond_op() + * wouldn't be called. * * Same reasoning works for all {u,s}{min,max}{32,64} cases * below. @@ -15606,6 +15948,10 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state if (!is_reg_const(reg2, is_jmp32)) break; val = reg_const_value(reg2, is_jmp32); + /* Forget the ranges before narrowing tnums, to avoid invariant + * violations if we're on a dead branch. + */ + __mark_reg_unbounded(reg1); if (is_jmp32) { t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); @@ -15654,42 +16000,15 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state } } -/* Adjusts the register min/max values in the case that the dst_reg and - * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K - * check, in which case we have a fake SCALAR_VALUE representing insn->imm). - * Technically we can do similar adjustments for pointers to the same object, - * but we don't support that right now. 
- */ -static int reg_set_min_max(struct bpf_verifier_env *env, - struct bpf_reg_state *true_reg1, - struct bpf_reg_state *true_reg2, - struct bpf_reg_state *false_reg1, - struct bpf_reg_state *false_reg2, - u8 opcode, bool is_jmp32) +/* Check for invariant violations on the registers for both branches of a condition */ +static int regs_bounds_sanity_check_branches(struct bpf_verifier_env *env) { int err; - /* If either register is a pointer, we can't learn anything about its - * variable offset from the compare (unless they were a pointer into - * the same object, but we don't bother with that). - */ - if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) - return 0; - - /* fallthrough (FALSE) branch */ - regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32); - reg_bounds_sync(false_reg1); - reg_bounds_sync(false_reg2); - - /* jump (TRUE) branch */ - regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32); - reg_bounds_sync(true_reg1); - reg_bounds_sync(true_reg2); - - err = reg_bounds_sanity_check(env, true_reg1, "true_reg1"); - err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2"); - err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1"); - err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2"); + err = reg_bounds_sanity_check(env, &env->true_reg1, "true_reg1"); + err = err ?: reg_bounds_sanity_check(env, &env->true_reg2, "true_reg2"); + err = err ?: reg_bounds_sanity_check(env, &env->false_reg1, "false_reg1"); + err = err ?: reg_bounds_sanity_check(env, &env->false_reg2, "false_reg2"); return err; } @@ -15699,29 +16018,24 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, { if (type_may_be_null(reg->type) && reg->id == id && (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) { - /* Old offset (both fixed and variable parts) should have been - * known-zero, because we don't allow pointer arithmetic on - * pointers that might be NULL. 
If we see this happening, don't - * convert the register. + /* Old offset should have been known-zero, because we don't + * allow pointer arithmetic on pointers that might be NULL. + * If we see this happening, don't convert the register. * * But in some cases, some helpers that return local kptrs - * advance offset for the returned pointer. In those cases, it - * is fine to expect to see reg->off. + * advance offset for the returned pointer. In those cases, + * it is fine to expect to see reg->var_off. */ - if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0))) - return; if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) && - WARN_ON_ONCE(reg->off)) + WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0))) return; - if (is_null) { - reg->type = SCALAR_VALUE; /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ - reg->id = 0; - reg->ref_obj_id = 0; + __mark_reg_known_zero(reg); + reg->type = SCALAR_VALUE; return; } @@ -15882,7 +16196,7 @@ static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_st e->is_reg = is_reg; e->regno = spi_or_reg; } else { - reg->id = 0; + clear_scalar_id(reg); } } @@ -15890,22 +16204,29 @@ static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_st * in verifier state, save R in linked_regs if R->id == id. * If there are too many Rs sharing same id, reset id for leftover Rs. 
*/ -static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id, +static void collect_linked_regs(struct bpf_verifier_env *env, + struct bpf_verifier_state *vstate, + u32 id, struct linked_regs *linked_regs) { + struct bpf_insn_aux_data *aux = env->insn_aux_data; struct bpf_func_state *func; struct bpf_reg_state *reg; + u16 live_regs; int i, j; id = id & ~BPF_ADD_CONST; for (i = vstate->curframe; i >= 0; i--) { + live_regs = aux[bpf_frame_insn_idx(vstate, i)].live_regs_before; func = vstate->frame[i]; for (j = 0; j < BPF_REG_FP; j++) { + if (!(live_regs & BIT(j))) + continue; reg = &func->regs[j]; __collect_linked_regs(linked_regs, reg, id, i, j, true); } for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { - if (!is_spilled_reg(&func->stack[j])) + if (!bpf_is_spilled_reg(&func->stack[j])) continue; reg = &func->stack[j].spilled_ptr; __collect_linked_regs(linked_regs, reg, id, i, j, false); @@ -15916,8 +16237,8 @@ static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id, /* For all R in linked_regs, copy known_reg range into R * if R->id == known_reg->id. */ -static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg, - struct linked_regs *linked_regs) +static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate, + struct bpf_reg_state *known_reg, struct linked_regs *linked_regs) { struct bpf_reg_state fake_reg; struct bpf_reg_state *reg; @@ -15932,32 +16253,47 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s continue; if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) continue; + /* + * Skip mixed 32/64-bit links: the delta relationship doesn't + * hold across different ALU widths. 
+ */ + if (((reg->id ^ known_reg->id) & BPF_ADD_CONST) == BPF_ADD_CONST) + continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || - reg->off == known_reg->off) { + reg->delta == known_reg->delta) { s32 saved_subreg_def = reg->subreg_def; copy_register_state(reg, known_reg); reg->subreg_def = saved_subreg_def; } else { s32 saved_subreg_def = reg->subreg_def; - s32 saved_off = reg->off; + s32 saved_off = reg->delta; + u32 saved_id = reg->id; fake_reg.type = SCALAR_VALUE; - __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off); + __mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta); /* reg = known_reg; reg += delta */ copy_register_state(reg, known_reg); /* - * Must preserve off, id and add_const flag, + * Must preserve off, id and subreg_def flag, * otherwise another sync_linked_regs() will be incorrect. */ - reg->off = saved_off; + reg->delta = saved_off; + reg->id = saved_id; reg->subreg_def = saved_subreg_def; scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); + if ((reg->id | known_reg->id) & BPF_ADD_CONST32) + zext_32_to_64(reg); + reg_bounds_sync(reg); } + if (e->is_reg) + mark_reg_scratched(env, e->regno); + else + mark_stack_slot_scratched(env, e->spi); } } @@ -15971,6 +16307,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *eq_branch_regs; struct linked_regs linked_regs = {}; u8 opcode = BPF_OP(insn->code); + int insn_flags = 0; bool is_jmp32; int pred = -1; int err; @@ -15985,18 +16322,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st; int idx = *insn_idx; - if (insn->code != (BPF_JMP | BPF_JCOND) || - insn->src_reg != BPF_MAY_GOTO || - insn->dst_reg || insn->imm) { - verbose(env, "invalid may_goto imm %d\n", insn->imm); - return -EINVAL; - } prev_st = find_prev_entry(env, cur_st->parent, idx); /* branch 
out 'fallthrough' insn as a new state to explore */ queued_st = push_stack(env, idx + 1, idx, false); - if (!queued_st) - return -ENOMEM; + if (IS_ERR(queued_st)) + return PTR_ERR(queued_st); queued_st->may_goto_depth++; if (prev_st) @@ -16012,11 +16343,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg = &regs[insn->dst_reg]; if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0) { - verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); - return -EINVAL; - } - /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -16029,19 +16355,33 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->src_reg); return -EACCES; } + + if (src_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_SRC_REG_STACK; + if (dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; } else { - if (insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); - return -EINVAL; - } src_reg = &env->fake_reg[0]; memset(src_reg, 0, sizeof(*src_reg)); src_reg->type = SCALAR_VALUE; __mark_reg_known(src_reg, insn->imm); + + if (dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; + } + + if (insn_flags) { + err = bpf_push_jmp_history(env, this_branch, insn_flags, 0); + if (err) + return err; } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); + copy_register_state(&env->false_reg1, dst_reg); + copy_register_state(&env->false_reg2, src_reg); + copy_register_state(&env->true_reg1, dst_reg); + copy_register_state(&env->true_reg2, src_reg); + pred = is_branch_taken(env, dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { /* If we get here with a dst_reg pointer type it is because * above is_branch_taken() special cased the 0 comparison. @@ -16060,10 +16400,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * the fall-through branch for simulation under speculative * execution. 
*/ - if (!env->bypass_spec_v1 && - !sanitize_speculative_path(env, insn, *insn_idx + 1, - *insn_idx)) - return -EFAULT; + if (!env->bypass_spec_v1) { + err = sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx); + if (err < 0) + return err; + } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); *insn_idx += insn->off; @@ -16073,11 +16414,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * program will go. If needed, push the goto branch for * simulation under speculative execution. */ - if (!env->bypass_spec_v1 && - !sanitize_speculative_path(env, insn, - *insn_idx + insn->off + 1, - *insn_idx)) - return -EFAULT; + if (!env->bypass_spec_v1) { + err = sanitize_speculative_path(env, insn, *insn_idx + insn->off + 1, + *insn_idx); + if (err < 0) + return err; + } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); return 0; @@ -16089,52 +16431,42 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * if parent state is created. 
*/ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id) - collect_linked_regs(this_branch, src_reg->id, &linked_regs); + collect_linked_regs(env, this_branch, src_reg->id, &linked_regs); if (dst_reg->type == SCALAR_VALUE && dst_reg->id) - collect_linked_regs(this_branch, dst_reg->id, &linked_regs); + collect_linked_regs(env, this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = bpf_push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); if (err) return err; } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, - false); - if (!other_branch) - return -EFAULT; + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false); + if (IS_ERR(other_branch)) + return PTR_ERR(other_branch); other_branch_regs = other_branch->frame[other_branch->curframe]->regs; - if (BPF_SRC(insn->code) == BPF_X) { - err = reg_set_min_max(env, - &other_branch_regs[insn->dst_reg], - &other_branch_regs[insn->src_reg], - dst_reg, src_reg, opcode, is_jmp32); - } else /* BPF_SRC(insn->code) == BPF_K */ { - /* reg_set_min_max() can mangle the fake_reg. Make a copy - * so that these are two different memory locations. The - * src_reg is not used beyond here in context of K. 
- */ - memcpy(&env->fake_reg[1], &env->fake_reg[0], - sizeof(env->fake_reg[0])); - err = reg_set_min_max(env, - &other_branch_regs[insn->dst_reg], - &env->fake_reg[0], - dst_reg, &env->fake_reg[1], - opcode, is_jmp32); - } + err = regs_bounds_sanity_check_branches(env); if (err) return err; + copy_register_state(dst_reg, &env->false_reg1); + copy_register_state(src_reg, &env->false_reg2); + copy_register_state(&other_branch_regs[insn->dst_reg], &env->true_reg1); + if (BPF_SRC(insn->code) == BPF_X) + copy_register_state(&other_branch_regs[insn->src_reg], &env->true_reg2); + if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { - sync_linked_regs(this_branch, src_reg, &linked_regs); - sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs); + sync_linked_regs(env, this_branch, src_reg, &linked_regs); + sync_linked_regs(env, other_branch, &other_branch_regs[insn->src_reg], + &linked_regs); } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { - sync_linked_regs(this_branch, dst_reg, &linked_regs); - sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs); + sync_linked_regs(env, this_branch, dst_reg, &linked_regs); + sync_linked_regs(env, other_branch, &other_branch_regs[insn->dst_reg], + &linked_regs); } /* if one pointer register is compared to another pointer @@ -16175,12 +16507,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). + * Also does the same detection for a register whose the value is + * known to be 0. * NOTE: these optimizations below are related with pointer comparison * which will never be JMP32. 
*/ - if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && - insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - type_may_be_null(dst_reg->type)) { + if (!is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE) && + type_may_be_null(dst_reg->type) && + ((BPF_SRC(insn->code) == BPF_K && insn->imm == 0) || + (BPF_SRC(insn->code) == BPF_X && bpf_register_is_null(src_reg)))) { /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ @@ -16213,10 +16548,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) verbose(env, "invalid BPF_LD_IMM insn\n"); return -EINVAL; } - if (insn->off != 0) { - verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); - return -EINVAL; - } err = check_reg_arg(env, insn->dst_reg, DST_OP); if (err) @@ -16248,7 +16579,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->btf_id = aux->btf_var.btf_id; break; default: - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "pseudo btf id: unexpected dst reg type"); return -EFAULT; } return 0; @@ -16256,8 +16587,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (insn->src_reg == BPF_PSEUDO_FUNC) { struct bpf_prog_aux *aux = env->prog->aux; - u32 subprogno = find_subprog(env, - env->insn_idx + insn->imm + 1); + u32 subprogno = bpf_find_subprog(env, + env->insn_idx + insn->imm + 1); if (!aux->func_info) { verbose(env, "missing btf func_info\n"); @@ -16274,24 +16605,27 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } map = env->used_maps[aux->map_index]; - dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { if (map->map_type == BPF_MAP_TYPE_ARENA) { __mark_reg_unknown(env, dst_reg); + dst_reg->map_ptr = map; return 0; } + __mark_reg_known(dst_reg, aux->map_off); dst_reg->type = PTR_TO_MAP_VALUE; - dst_reg->off = aux->map_off; - 
WARN_ON_ONCE(map->max_entries != 1); + dst_reg->map_ptr = map; + WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY && + map->max_entries != 1); /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst_reg->type = CONST_PTR_TO_MAP; + dst_reg->map_ptr = map; } else { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "unexpected src reg value for ldimm64"); + return -EFAULT; } return 0; @@ -16337,15 +16671,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } if (!env->ops->gen_ld_abs) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; - } - - if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || - BPF_SIZE(insn->code) == BPF_DW || - (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); - return -EINVAL; + verifier_bug(env, "gen_ld_abs is null"); + return -EFAULT; } /* check whether implicit source operand (register R6) is readable */ @@ -16380,7 +16707,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, regs, caller_saved[i]); + bpf_mark_reg_not_init(env, &regs[caller_saved[i]]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -16391,93 +16718,59 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) mark_reg_unknown(env, regs, BPF_REG_0); /* ld_abs load up to 32-bit skb data. */ regs[BPF_REG_0].subreg_def = env->insn_idx + 1; + /* + * See bpf_gen_ld_abs() which emits a hidden BPF_EXIT with r0=0 + * which must be explored by the verifier when in a subprog. 
+ */ + if (env->cur_state->curframe) { + struct bpf_verifier_state *branch; + + mark_reg_scratched(env, BPF_REG_0); + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + mark_reg_known_zero(env, regs, BPF_REG_0); + err = prepare_func_exit(env, &env->insn_idx); + if (err) + return err; + env->insn_idx--; + } return 0; } -static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name) + +static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_range *range) { - const char *exit_ctx = "At program exit"; - struct tnum enforce_attach_type_range = tnum_unknown; - const struct bpf_prog *prog = env->prog; - struct bpf_reg_state *reg; - struct bpf_retval_range range = retval_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); - int err; - struct bpf_func_state *frame = env->cur_state->frame[0]; - const bool is_subprog = frame->subprogno; - bool return_32bit = false; - /* LSM and struct_ops func-ptr's return type could be "void" */ - if (!is_subprog || frame->in_exception_callback_fn) { - switch (prog_type) { - case BPF_PROG_TYPE_LSM: - if (prog->expected_attach_type == BPF_LSM_CGROUP) - /* See below, can be 0 or 0-1 depending on hook. */ - break; - fallthrough; - case BPF_PROG_TYPE_STRUCT_OPS: - if (!prog->aux->attach_func_proto->type) - return 0; + /* Default return value range. 
*/ + *range = retval_range(0, 1); + + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + switch (env->prog->expected_attach_type) { + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: + *range = retval_range(1, 1); + break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + *range = retval_range(0, 3); break; default: break; } - } - - /* eBPF calling convention is such that R0 is used - * to return the value from eBPF program. - * Make sure that it's readable at this time - * of bpf_exit, which means that program wrote - * something into it earlier - */ - err = check_reg_arg(env, regno, SRC_OP); - if (err) - return err; - - if (is_pointer_value(env, regno)) { - verbose(env, "R%d leaks addr as return value\n", regno); - return -EACCES; - } - - reg = cur_regs(env) + regno; - - if (frame->in_async_callback_fn) { - /* enforce return zero from async callbacks like timer */ - exit_ctx = "At async callback return"; - range = retval_range(0, 0); - goto enforce_retval; - } - - if (is_subprog && !frame->in_exception_callback_fn) { - if (reg->type != SCALAR_VALUE) { - verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n", - regno, reg_type_str(env, reg->type)); - return -EINVAL; - } - return 0; - } - - switch (prog_type) { - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || - env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME || - 
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME || - env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME) - range = retval_range(1, 1); - if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || - env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) - range = retval_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SKB: - if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { - range = retval_range(0, 3); - enforce_attach_type_range = tnum_range(2, 3); - } + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) + *range = retval_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: @@ -16487,66 +16780,164 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char break; case BPF_PROG_TYPE_RAW_TRACEPOINT: if (!env->prog->aux->attach_btf_id) - return 0; - range = retval_range(0, 0); + return false; + *range = retval_range(0, 0); break; case BPF_PROG_TYPE_TRACING: switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: - range = retval_range(0, 0); + case BPF_TRACE_FSESSION: + *range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: case BPF_MODIFY_RETURN: - return 0; + return false; case BPF_TRACE_ITER: - break; default: - return -ENOTSUPP; + break; } break; case BPF_PROG_TYPE_KPROBE: switch (env->prog->expected_attach_type) { case BPF_TRACE_KPROBE_SESSION: case BPF_TRACE_UPROBE_SESSION: - range = retval_range(0, 1); break; default: - return 0; + return false; } break; case BPF_PROG_TYPE_SK_LOOKUP: - range = retval_range(SK_DROP, SK_PASS); + *range = retval_range(SK_DROP, SK_PASS); break; case BPF_PROG_TYPE_LSM: if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { /* no range found, any return value is allowed */ - if (!get_func_retval_range(env->prog, &range)) - return 0; + if (!get_func_retval_range(env->prog, range)) + return false; /* no restricted 
range, any return value is allowed */ - if (range.minval == S32_MIN && range.maxval == S32_MAX) - return 0; - return_32bit = true; + if (range->minval == S32_MIN && range->maxval == S32_MAX) + return false; + range->return_32bit = true; } else if (!env->prog->aux->attach_func_proto->type) { /* Make sure programs that attach to void * hooks don't try to modify return value. */ - range = retval_range(1, 1); + *range = retval_range(1, 1); } break; case BPF_PROG_TYPE_NETFILTER: - range = retval_range(NF_DROP, NF_ACCEPT); + *range = retval_range(NF_DROP, NF_ACCEPT); + break; + case BPF_PROG_TYPE_STRUCT_OPS: + *range = retval_range(0, 0); break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. */ default: + return false; + } + + /* Continue calculating. */ + + return true; +} + +static bool program_returns_void(struct bpf_verifier_env *env) +{ + const struct bpf_prog *prog = env->prog; + enum bpf_prog_type prog_type = prog->type; + + switch (prog_type) { + case BPF_PROG_TYPE_LSM: + /* See return_retval_range, for BPF_LSM_CGROUP can be 0 or 0-1 depending on hook. */ + if (prog->expected_attach_type != BPF_LSM_CGROUP && + !prog->aux->attach_func_proto->type) + return true; + break; + case BPF_PROG_TYPE_STRUCT_OPS: + if (!prog->aux->attach_func_proto->type) + return true; + break; + case BPF_PROG_TYPE_EXT: + /* + * If the actual program is an extension, let it + * return void - attaching will succeed only if the + * program being replaced also returns void, and since + * it has passed verification its actual type doesn't matter. 
+ */ + if (subprog_returns_void(env, 0)) + return true; + break; + default: + break; + } + return false; +} + +static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name) +{ + const char *exit_ctx = "At program exit"; + struct tnum enforce_attach_type_range = tnum_unknown; + const struct bpf_prog *prog = env->prog; + struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_retval_range range = retval_range(0, 1); + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + struct bpf_func_state *frame = env->cur_state->frame[0]; + const struct btf_type *reg_type, *ret_type = NULL; + int err; + + /* LSM and struct_ops func-ptr's return type could be "void" */ + if (!frame->in_async_callback_fn && program_returns_void(env)) return 0; + + if (prog_type == BPF_PROG_TYPE_STRUCT_OPS) { + /* Allow a struct_ops program to return a referenced kptr if it + * matches the operator's return type and is in its unmodified + * form. A scalar zero (i.e., a null pointer) is also allowed. + */ + reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL; + ret_type = btf_type_resolve_ptr(prog->aux->attach_btf, + prog->aux->attach_func_proto->type, + NULL); + if (ret_type && ret_type == reg_type && reg->ref_obj_id) + return __check_ptr_off_reg(env, reg, regno, false); } + /* eBPF calling convention is such that R0 is used + * to return the value from eBPF program. 
+ * Make sure that it's readable at this time + * of bpf_exit, which means that program wrote + * something into it earlier + */ + err = check_reg_arg(env, regno, SRC_OP); + if (err) + return err; + + if (is_pointer_value(env, regno)) { + verbose(env, "R%d leaks addr as return value\n", regno); + return -EACCES; + } + + if (frame->in_async_callback_fn) { + exit_ctx = "At async callback return"; + range = frame->callback_ret_range; + goto enforce_retval; + } + + if (prog_type == BPF_PROG_TYPE_STRUCT_OPS && !ret_type) + return 0; + + if (prog_type == BPF_PROG_TYPE_CGROUP_SKB && (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS)) + enforce_attach_type_range = tnum_range(2, 3); + + if (!return_retval_range(env, &range)) + return 0; + enforce_retval: if (reg->type != SCALAR_VALUE) { verbose(env, "%s the register R%d is not a known value (%s)\n", @@ -16558,10 +16949,9 @@ enforce_retval: if (err) return err; - if (!retval_range_within(range, reg, return_32bit)) { + if (!retval_range_within(range, reg)) { verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name); - if (!is_subprog && - prog->expected_attach_type == BPF_LSM_CGROUP && + if (prog->expected_attach_type == BPF_LSM_CGROUP && prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); @@ -16574,215 +16964,49 @@ enforce_retval: return 0; } -static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) -{ - struct bpf_subprog_info *subprog; - - subprog = find_containing_subprog(env, off); - subprog->changes_pkt_data = true; -} - -/* 't' is an index of a call-site. - * 'w' is a callee entry point. - * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. - * Rely on DFS traversal order and absence of recursive calls to guarantee that - * callee's change_pkt_data marks would be correct at that moment. 
- */ -static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) -{ - struct bpf_subprog_info *caller, *callee; - - caller = find_containing_subprog(env, t); - callee = find_containing_subprog(env, w); - caller->changes_pkt_data |= callee->changes_pkt_data; -} - -/* non-recursive DFS pseudo code - * 1 procedure DFS-iterative(G,v): - * 2 label v as discovered - * 3 let S be a stack - * 4 S.push(v) - * 5 while S is not empty - * 6 t <- S.peek() - * 7 if t is what we're looking for: - * 8 return t - * 9 for all edges e in G.adjacentEdges(t) do - * 10 if edge e is already labelled - * 11 continue with the next edge - * 12 w <- G.adjacentVertex(t,e) - * 13 if vertex w is not discovered and not explored - * 14 label e as tree-edge - * 15 label w as discovered - * 16 S.push(w) - * 17 continue at 5 - * 18 else if vertex w is discovered - * 19 label e as back-edge - * 20 else - * 21 // vertex w is explored - * 22 label e as forward- or cross-edge - * 23 label t as explored - * 24 S.pop() - * - * convention: - * 0x10 - discovered - * 0x11 - discovered and fall-through edge labelled - * 0x12 - discovered and fall-through and branch edges labelled - * 0x20 - explored - */ - -enum { - DISCOVERED = 0x10, - EXPLORED = 0x20, - FALLTHROUGH = 1, - BRANCH = 2, -}; - -static void mark_prune_point(struct bpf_verifier_env *env, int idx) -{ - env->insn_aux_data[idx].prune_point = true; -} - -static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx) +static int check_global_subprog_return_code(struct bpf_verifier_env *env) { - return env->insn_aux_data[insn_idx].prune_point; -} - -static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx) -{ - env->insn_aux_data[idx].force_checkpoint = true; -} - -static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx) -{ - return env->insn_aux_data[insn_idx].force_checkpoint; -} - -static void mark_calls_callback(struct bpf_verifier_env *env, int idx) -{ - 
env->insn_aux_data[idx].calls_callback = true; -} - -static bool calls_callback(struct bpf_verifier_env *env, int insn_idx) -{ - return env->insn_aux_data[insn_idx].calls_callback; -} - -enum { - DONE_EXPLORING = 0, - KEEP_EXPLORING = 1, -}; - -/* t, w, e - match pseudo-code above: - * t - index of current instruction - * w - next instruction - * e - edge - */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) -{ - int *insn_stack = env->cfg.insn_stack; - int *insn_state = env->cfg.insn_state; - - if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) - return DONE_EXPLORING; + struct bpf_reg_state *reg = reg_state(env, BPF_REG_0); + struct bpf_func_state *cur_frame = cur_func(env); + int err; - if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) - return DONE_EXPLORING; + if (subprog_returns_void(env, cur_frame->subprogno)) + return 0; - if (w < 0 || w >= env->prog->len) { - verbose_linfo(env, t, "%d: ", t); - verbose(env, "jump out of range from insn %d to %d\n", t, w); - return -EINVAL; - } + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; - if (e == BRANCH) { - /* mark branch target for state pruning */ - mark_prune_point(env, w); - mark_jmp_point(env, w); + if (is_pointer_value(env, BPF_REG_0)) { + verbose(env, "R%d leaks addr as return value\n", BPF_REG_0); + return -EACCES; } - if (insn_state[w] == 0) { - /* tree-edge */ - insn_state[t] = DISCOVERED | e; - insn_state[w] = DISCOVERED; - if (env->cfg.cur_stack >= env->prog->len) - return -E2BIG; - insn_stack[env->cfg.cur_stack++] = w; - return KEEP_EXPLORING; - } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (env->bpf_capable) - return DONE_EXPLORING; - verbose_linfo(env, t, "%d: ", t); - verbose_linfo(env, w, "%d: ", w); - verbose(env, "back-edge from insn %d to %d\n", t, w); + if (reg->type != SCALAR_VALUE) { + verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", + reg_type_str(env, reg->type)); return 
-EINVAL; - } else if (insn_state[w] == EXPLORED) { - /* forward- or cross-edge */ - insn_state[t] = DISCOVERED | e; - } else { - verbose(env, "insn state internal bug\n"); - return -EFAULT; } - return DONE_EXPLORING; -} -static int visit_func_call_insn(int t, struct bpf_insn *insns, - struct bpf_verifier_env *env, - bool visit_callee) -{ - int ret, insn_sz; - int w; - - insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; - ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); - if (ret) - return ret; - - mark_prune_point(env, t + insn_sz); - /* when we exit from subprog, we need to record non-linear history */ - mark_jmp_point(env, t + insn_sz); - - if (visit_callee) { - w = t + insns[t].imm + 1; - mark_prune_point(env, t); - merge_callee_effects(env, t, w); - ret = push_insn(t, w, BRANCH, env); - } - return ret; + return 0; } /* Bitmask with 1s for all caller saved registers */ #define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) -/* Return a bitmask specifying which caller saved registers are - * clobbered by a call to a helper *as if* this helper follows - * bpf_fastcall contract: - * - includes R0 if function is non-void; - * - includes R1-R5 if corresponding parameter has is described - * in the function prototype. - */ -static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn) -{ - u32 mask; - int i; - - mask = 0; - if (fn->ret_type != RET_VOID) - mask |= BIT(BPF_REG_0); - for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) - if (fn->arg_type[i] != ARG_DONTCARE) - mask |= BIT(BPF_REG_1 + i); - return mask; -} - /* True if do_misc_fixups() replaces calls to helper number 'imm', * replacement patch is presumed to follow bpf_fastcall contract * (see mark_fastcall_pattern_for_call() below). 
*/ -static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) +bool bpf_verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) { switch (imm) { #ifdef CONFIG_X86_64 case BPF_FUNC_get_smp_processor_id: +#ifdef CONFIG_SMP + case BPF_FUNC_get_current_task_btf: + case BPF_FUNC_get_current_task: +#endif return env->prog->jit_requested && bpf_jit_supports_percpu_insn(); #endif default: @@ -16790,24 +17014,48 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) } } -/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */ -static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta) +/* If @call is a kfunc or helper call, fills @cs and returns true, + * otherwise returns false. + */ +bool bpf_get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, + struct bpf_call_summary *cs) { - u32 vlen, i, mask; + struct bpf_kfunc_call_arg_meta meta; + const struct bpf_func_proto *fn; + int i; - vlen = btf_type_vlen(meta->func_proto); - mask = 0; - if (!btf_type_is_void(btf_type_by_id(meta->btf, meta->func_proto->type))) - mask |= BIT(BPF_REG_0); - for (i = 0; i < vlen; ++i) - mask |= BIT(BPF_REG_1 + i); - return mask; -} + if (bpf_helper_call(call)) { -/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */ -static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_FASTCALL; + if (bpf_get_helper_proto(env, call->imm, &fn) < 0) + /* error would be reported later */ + return false; + cs->fastcall = fn->allow_fastcall && + (bpf_verifier_inlines_helper_call(env, call->imm) || + bpf_jit_inlines_helper_call(call->imm)); + cs->is_void = fn->ret_type == RET_VOID; + cs->num_params = 0; + for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) { + if (fn->arg_type[i] == ARG_DONTCARE) + break; + cs->num_params++; + } + return true; + } + + if (bpf_pseudo_kfunc_call(call)) { + int err; + + err = 
bpf_fetch_kfunc_arg_meta(env, call->imm, call->off, &meta); + if (err < 0) + /* error would be reported later */ + return false; + cs->num_params = btf_type_vlen(meta.func_proto); + cs->fastcall = meta.kfunc_flags & KF_FASTCALL; + cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type)); + return true; + } + + return false; } /* LLVM define a bpf_fastcall function attribute. @@ -16890,39 +17138,23 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, { struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; struct bpf_insn *call = &env->prog->insnsi[insn_idx]; - const struct bpf_func_proto *fn; - u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS; + u32 clobbered_regs_mask; + struct bpf_call_summary cs; u32 expected_regs_mask; - bool can_be_inlined = false; s16 off; int i; - if (bpf_helper_call(call)) { - if (get_helper_proto(env, call->imm, &fn) < 0) - /* error would be reported later */ - return; - clobbered_regs_mask = helper_fastcall_clobber_mask(fn); - can_be_inlined = fn->allow_fastcall && - (verifier_inlines_helper_call(env, call->imm) || - bpf_jit_inlines_helper_call(call->imm)); - } - - if (bpf_pseudo_kfunc_call(call)) { - struct bpf_kfunc_call_arg_meta meta; - int err; - - err = fetch_kfunc_meta(env, call, &meta, NULL); - if (err < 0) - /* error would be reported later */ - return; - - clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta); - can_be_inlined = is_fastcall_kfunc_call(&meta); - } - - if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS) + if (!bpf_get_call_summary(env, call, &cs)) return; + /* A bitmask specifying which caller saved registers are clobbered + * by a call to a helper/kfunc *as if* this helper/kfunc follows + * bpf_fastcall contract: + * - includes R0 if function is non-void; + * - includes R1-R5 if corresponding parameter has is described + * in the function prototype. + */ + clobbered_regs_mask = GENMASK(cs.num_params, cs.is_void ? 1 : 0); /* e.g. 
if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS; @@ -16980,7 +17212,7 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills() * does not remove spill/fill pair {4,6}. */ - if (can_be_inlined) + if (cs.fastcall) env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1; else subprog->keep_fastcall_stack = 1; @@ -17015,396 +17247,6 @@ static int mark_fastcall_patterns(struct bpf_verifier_env *env) return 0; } -/* Visits the instruction at index t and returns one of the following: - * < 0 - an error occurred - * DONE_EXPLORING - the instruction was fully explored - * KEEP_EXPLORING - there is still work to be done before it is fully explored - */ -static int visit_insn(int t, struct bpf_verifier_env *env) -{ - struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; - int ret, off, insn_sz; - - if (bpf_pseudo_func(insn)) - return visit_func_call_insn(t, insns, env, true); - - /* All non-branch instructions have a single fall-through edge. */ - if (BPF_CLASS(insn->code) != BPF_JMP && - BPF_CLASS(insn->code) != BPF_JMP32) { - insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; - return push_insn(t, t + insn_sz, FALLTHROUGH, env); - } - - switch (BPF_OP(insn->code)) { - case BPF_EXIT: - return DONE_EXPLORING; - - case BPF_CALL: - if (is_async_callback_calling_insn(insn)) - /* Mark this call insn as a prune point to trigger - * is_state_visited() check before call itself is - * processed by __check_func_call(). Otherwise new - * async state will be pushed for further exploration. - */ - mark_prune_point(env, t); - /* For functions that invoke callbacks it is not known how many times - * callback would be called. Verifier models callback calling functions - * by repeatedly visiting callback bodies and returning to origin call - * instruction. 
- * In order to stop such iteration verifier needs to identify when a - * state identical some state from a previous iteration is reached. - * Check below forces creation of checkpoint before callback calling - * instruction to allow search for such identical states. - */ - if (is_sync_callback_calling_insn(insn)) { - mark_calls_callback(env, t); - mark_force_checkpoint(env, t); - mark_prune_point(env, t); - mark_jmp_point(env, t); - } - if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm)) - mark_subprog_changes_pkt_data(env, t); - if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - struct bpf_kfunc_call_arg_meta meta; - - ret = fetch_kfunc_meta(env, insn, &meta, NULL); - if (ret == 0 && is_iter_next_kfunc(&meta)) { - mark_prune_point(env, t); - /* Checking and saving state checkpoints at iter_next() call - * is crucial for fast convergence of open-coded iterator loop - * logic, so we need to force it. If we don't do that, - * is_state_visited() might skip saving a checkpoint, causing - * unnecessarily long sequence of not checkpointed - * instructions and jumps, leading to exhaustion of jump - * history buffer, and potentially other undesired outcomes. - * It is expected that with correct open-coded iterators - * convergence will happen quickly, so we don't run a risk of - * exhausting memory. 
- */ - mark_force_checkpoint(env, t); - } - } - return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); - - case BPF_JA: - if (BPF_SRC(insn->code) != BPF_K) - return -EINVAL; - - if (BPF_CLASS(insn->code) == BPF_JMP) - off = insn->off; - else - off = insn->imm; - - /* unconditional jump with single edge */ - ret = push_insn(t, t + off + 1, FALLTHROUGH, env); - if (ret) - return ret; - - mark_prune_point(env, t + off + 1); - mark_jmp_point(env, t + off + 1); - - return ret; - - default: - /* conditional jump with two edges */ - mark_prune_point(env, t); - if (is_may_goto_insn(insn)) - mark_force_checkpoint(env, t); - - ret = push_insn(t, t + 1, FALLTHROUGH, env); - if (ret) - return ret; - - return push_insn(t, t + insn->off + 1, BRANCH, env); - } -} - -/* non-recursive depth-first-search to detect loops in BPF program - * loop == back-edge in directed graph - */ -static int check_cfg(struct bpf_verifier_env *env) -{ - int insn_cnt = env->prog->len; - int *insn_stack, *insn_state; - int ex_insn_beg, i, ret = 0; - bool ex_done = false; - - insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); - if (!insn_state) - return -ENOMEM; - - insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); - if (!insn_stack) { - kvfree(insn_state); - return -ENOMEM; - } - - insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ - insn_stack[0] = 0; /* 0 is the first instruction */ - env->cfg.cur_stack = 1; - -walk_cfg: - while (env->cfg.cur_stack > 0) { - int t = insn_stack[env->cfg.cur_stack - 1]; - - ret = visit_insn(t, env); - switch (ret) { - case DONE_EXPLORING: - insn_state[t] = EXPLORED; - env->cfg.cur_stack--; - break; - case KEEP_EXPLORING: - break; - default: - if (ret > 0) { - verbose(env, "visit_insn internal bug\n"); - ret = -EFAULT; - } - goto err_free; - } - } - - if (env->cfg.cur_stack < 0) { - verbose(env, "pop stack internal bug\n"); - ret = -EFAULT; - goto err_free; - } - - if 
(env->exception_callback_subprog && !ex_done) { - ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start; - - insn_state[ex_insn_beg] = DISCOVERED; - insn_stack[0] = ex_insn_beg; - env->cfg.cur_stack = 1; - ex_done = true; - goto walk_cfg; - } - - for (i = 0; i < insn_cnt; i++) { - struct bpf_insn *insn = &env->prog->insnsi[i]; - - if (insn_state[i] != EXPLORED) { - verbose(env, "unreachable insn %d\n", i); - ret = -EINVAL; - goto err_free; - } - if (bpf_is_ldimm64(insn)) { - if (insn_state[i + 1] != 0) { - verbose(env, "jump into the middle of ldimm64 insn %d\n", i); - ret = -EINVAL; - goto err_free; - } - i++; /* skip second half of ldimm64 */ - } - } - ret = 0; /* cfg looks good */ - env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; - -err_free: - kvfree(insn_state); - kvfree(insn_stack); - env->cfg.insn_state = env->cfg.insn_stack = NULL; - return ret; -} - -static int check_abnormal_return(struct bpf_verifier_env *env) -{ - int i; - - for (i = 1; i < env->subprog_cnt; i++) { - if (env->subprog_info[i].has_ld_abs) { - verbose(env, "LD_ABS is not allowed in subprogs without BTF\n"); - return -EINVAL; - } - if (env->subprog_info[i].has_tail_call) { - verbose(env, "tail_call is not allowed in subprogs without BTF\n"); - return -EINVAL; - } - } - return 0; -} - -/* The minimum supported BTF func info size */ -#define MIN_BPF_FUNCINFO_SIZE 8 -#define MAX_FUNCINFO_REC_SIZE 252 - -static int check_btf_func_early(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) -{ - u32 krec_size = sizeof(struct bpf_func_info); - const struct btf_type *type, *func_proto; - u32 i, nfuncs, urec_size, min_size; - struct bpf_func_info *krecord; - struct bpf_prog *prog; - const struct btf *btf; - u32 prev_offset = 0; - bpfptr_t urecord; - int ret = -ENOMEM; - - nfuncs = attr->func_info_cnt; - if (!nfuncs) { - if (check_abnormal_return(env)) - return -EINVAL; - return 0; - } - - urec_size = attr->func_info_rec_size; - if 
(urec_size < MIN_BPF_FUNCINFO_SIZE || - urec_size > MAX_FUNCINFO_REC_SIZE || - urec_size % sizeof(u32)) { - verbose(env, "invalid func info rec size %u\n", urec_size); - return -EINVAL; - } - - prog = env->prog; - btf = prog->aux->btf; - - urecord = make_bpfptr(attr->func_info, uattr.is_kernel); - min_size = min_t(u32, krec_size, urec_size); - - krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); - if (!krecord) - return -ENOMEM; - - for (i = 0; i < nfuncs; i++) { - ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); - if (ret) { - if (ret == -E2BIG) { - verbose(env, "nonzero tailing record in func info"); - /* set the size kernel expects so loader can zero - * out the rest of the record. - */ - if (copy_to_bpfptr_offset(uattr, - offsetof(union bpf_attr, func_info_rec_size), - &min_size, sizeof(min_size))) - ret = -EFAULT; - } - goto err_free; - } - - if (copy_from_bpfptr(&krecord[i], urecord, min_size)) { - ret = -EFAULT; - goto err_free; - } - - /* check insn_off */ - ret = -EINVAL; - if (i == 0) { - if (krecord[i].insn_off) { - verbose(env, - "nonzero insn_off %u for the first func info record", - krecord[i].insn_off); - goto err_free; - } - } else if (krecord[i].insn_off <= prev_offset) { - verbose(env, - "same or smaller insn offset (%u) than previous func info record (%u)", - krecord[i].insn_off, prev_offset); - goto err_free; - } - - /* check type_id */ - type = btf_type_by_id(btf, krecord[i].type_id); - if (!type || !btf_type_is_func(type)) { - verbose(env, "invalid type id %d in func info", - krecord[i].type_id); - goto err_free; - } - - func_proto = btf_type_by_id(btf, type->type); - if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto))) - /* btf_func_check() already verified it during BTF load */ - goto err_free; - - prev_offset = krecord[i].insn_off; - bpfptr_add(&urecord, urec_size); - } - - prog->aux->func_info = krecord; - prog->aux->func_info_cnt = nfuncs; - return 0; - -err_free: - kvfree(krecord); - return ret; 
-} - -static int check_btf_func(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) -{ - const struct btf_type *type, *func_proto, *ret_type; - u32 i, nfuncs, urec_size; - struct bpf_func_info *krecord; - struct bpf_func_info_aux *info_aux = NULL; - struct bpf_prog *prog; - const struct btf *btf; - bpfptr_t urecord; - bool scalar_return; - int ret = -ENOMEM; - - nfuncs = attr->func_info_cnt; - if (!nfuncs) { - if (check_abnormal_return(env)) - return -EINVAL; - return 0; - } - if (nfuncs != env->subprog_cnt) { - verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); - return -EINVAL; - } - - urec_size = attr->func_info_rec_size; - - prog = env->prog; - btf = prog->aux->btf; - - urecord = make_bpfptr(attr->func_info, uattr.is_kernel); - - krecord = prog->aux->func_info; - info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); - if (!info_aux) - return -ENOMEM; - - for (i = 0; i < nfuncs; i++) { - /* check insn_off */ - ret = -EINVAL; - - if (env->subprog_info[i].start != krecord[i].insn_off) { - verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); - goto err_free; - } - - /* Already checked type_id */ - type = btf_type_by_id(btf, krecord[i].type_id); - info_aux[i].linkage = BTF_INFO_VLEN(type->info); - /* Already checked func_proto */ - func_proto = btf_type_by_id(btf, type->type); - - ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); - scalar_return = - btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); - if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { - verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); - goto err_free; - } - if (i && !scalar_return && env->subprog_info[i].has_tail_call) { - verbose(env, "tail_call is only allowed in functions that return 'int'.\n"); - goto err_free; - } - - bpfptr_add(&urecord, urec_size); - } - - prog->aux->func_info_aux = info_aux; - return 0; - -err_free: - 
kfree(info_aux); - return ret; -} - static void adjust_btf_func(struct bpf_verifier_env *env) { struct bpf_prog_aux *aux = env->prog->aux; @@ -17418,1518 +17260,419 @@ static void adjust_btf_func(struct bpf_verifier_env *env) aux->func_info[i].insn_off = env->subprog_info[i].start; } -#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col) -#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE - -static int check_btf_line(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) +/* Find id in idset and increment its count, or add new entry */ +static void idset_cnt_inc(struct bpf_idset *idset, u32 id) { - u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0; - struct bpf_subprog_info *sub; - struct bpf_line_info *linfo; - struct bpf_prog *prog; - const struct btf *btf; - bpfptr_t ulinfo; - int err; - - nr_linfo = attr->line_info_cnt; - if (!nr_linfo) - return 0; - if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info)) - return -EINVAL; - - rec_size = attr->line_info_rec_size; - if (rec_size < MIN_BPF_LINEINFO_SIZE || - rec_size > MAX_LINEINFO_REC_SIZE || - rec_size & (sizeof(u32) - 1)) - return -EINVAL; - - /* Need to zero it in case the userspace may - * pass in a smaller bpf_line_info object. 
- */ - linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info), - GFP_KERNEL | __GFP_NOWARN); - if (!linfo) - return -ENOMEM; - - prog = env->prog; - btf = prog->aux->btf; - - s = 0; - sub = env->subprog_info; - ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel); - expected_size = sizeof(struct bpf_line_info); - ncopy = min_t(u32, expected_size, rec_size); - for (i = 0; i < nr_linfo; i++) { - err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size); - if (err) { - if (err == -E2BIG) { - verbose(env, "nonzero tailing record in line_info"); - if (copy_to_bpfptr_offset(uattr, - offsetof(union bpf_attr, line_info_rec_size), - &expected_size, sizeof(expected_size))) - err = -EFAULT; - } - goto err_free; - } - - if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) { - err = -EFAULT; - goto err_free; - } - - /* - * Check insn_off to ensure - * 1) strictly increasing AND - * 2) bounded by prog->len - * - * The linfo[0].insn_off == 0 check logically falls into - * the later "missing bpf_line_info for func..." case - * because the first linfo[0].insn_off must be the - * first sub also and the first sub must have - * subprog_info[0].start == 0. 
- */ - if ((i && linfo[i].insn_off <= prev_offset) || - linfo[i].insn_off >= prog->len) { - verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n", - i, linfo[i].insn_off, prev_offset, - prog->len); - err = -EINVAL; - goto err_free; - } + u32 i; - if (!prog->insnsi[linfo[i].insn_off].code) { - verbose(env, - "Invalid insn code at line_info[%u].insn_off\n", - i); - err = -EINVAL; - goto err_free; - } - - if (!btf_name_by_offset(btf, linfo[i].line_off) || - !btf_name_by_offset(btf, linfo[i].file_name_off)) { - verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); - err = -EINVAL; - goto err_free; - } - - if (s != env->subprog_cnt) { - if (linfo[i].insn_off == sub[s].start) { - sub[s].linfo_idx = i; - s++; - } else if (sub[s].start < linfo[i].insn_off) { - verbose(env, "missing bpf_line_info for func#%u\n", s); - err = -EINVAL; - goto err_free; - } + for (i = 0; i < idset->num_ids; i++) { + if (idset->entries[i].id == id) { + idset->entries[i].cnt++; + return; } - - prev_offset = linfo[i].insn_off; - bpfptr_add(&ulinfo, rec_size); - } - - if (s != env->subprog_cnt) { - verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n", - env->subprog_cnt - s, s); - err = -EINVAL; - goto err_free; } - - prog->aux->linfo = linfo; - prog->aux->nr_linfo = nr_linfo; - - return 0; - -err_free: - kvfree(linfo); - return err; -} - -#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo) -#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE - -static int check_core_relo(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) -{ - u32 i, nr_core_relo, ncopy, expected_size, rec_size; - struct bpf_core_relo core_relo = {}; - struct bpf_prog *prog = env->prog; - const struct btf *btf = prog->aux->btf; - struct bpf_core_ctx ctx = { - .log = &env->log, - .btf = btf, - }; - bpfptr_t u_core_relo; - int err; - - nr_core_relo = attr->core_relo_cnt; - if (!nr_core_relo) - return 0; - if (nr_core_relo > INT_MAX / 
sizeof(struct bpf_core_relo)) - return -EINVAL; - - rec_size = attr->core_relo_rec_size; - if (rec_size < MIN_CORE_RELO_SIZE || - rec_size > MAX_CORE_RELO_SIZE || - rec_size % sizeof(u32)) - return -EINVAL; - - u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel); - expected_size = sizeof(struct bpf_core_relo); - ncopy = min_t(u32, expected_size, rec_size); - - /* Unlike func_info and line_info, copy and apply each CO-RE - * relocation record one at a time. - */ - for (i = 0; i < nr_core_relo; i++) { - /* future proofing when sizeof(bpf_core_relo) changes */ - err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size); - if (err) { - if (err == -E2BIG) { - verbose(env, "nonzero tailing record in core_relo"); - if (copy_to_bpfptr_offset(uattr, - offsetof(union bpf_attr, core_relo_rec_size), - &expected_size, sizeof(expected_size))) - err = -EFAULT; - } - break; - } - - if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) { - err = -EFAULT; - break; - } - - if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) { - verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n", - i, core_relo.insn_off, prog->len); - err = -EINVAL; - break; - } - - err = bpf_core_apply(&ctx, &core_relo, i, - &prog->insnsi[core_relo.insn_off / 8]); - if (err) - break; - bpfptr_add(&u_core_relo, rec_size); + /* New id */ + if (idset->num_ids < BPF_ID_MAP_SIZE) { + idset->entries[idset->num_ids].id = id; + idset->entries[idset->num_ids].cnt = 1; + idset->num_ids++; } - return err; } -static int check_btf_info_early(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) +/* Find id in idset and return its count, or 0 if not found */ +static u32 idset_cnt_get(struct bpf_idset *idset, u32 id) { - struct btf *btf; - int err; - - if (!attr->func_info_cnt && !attr->line_info_cnt) { - if (check_abnormal_return(env)) - return -EINVAL; - return 0; - } + u32 i; - btf = btf_get_by_fd(attr->prog_btf_fd); - if (IS_ERR(btf)) - return PTR_ERR(btf); 
- if (btf_is_kernel(btf)) { - btf_put(btf); - return -EACCES; + for (i = 0; i < idset->num_ids; i++) { + if (idset->entries[i].id == id) + return idset->entries[i].cnt; } - env->prog->aux->btf = btf; - - err = check_btf_func_early(env, attr, uattr); - if (err) - return err; return 0; } -static int check_btf_info(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) +/* + * Clear singular scalar ids in a state. + * A register with a non-zero id is called singular if no other register shares + * the same base id. Such registers can be treated as independent (id=0). + */ +void bpf_clear_singular_ids(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) { - int err; - - if (!attr->func_info_cnt && !attr->line_info_cnt) { - if (check_abnormal_return(env)) - return -EINVAL; - return 0; - } - - err = check_btf_func(env, attr, uattr); - if (err) - return err; + struct bpf_idset *idset = &env->idset_scratch; + struct bpf_func_state *func; + struct bpf_reg_state *reg; - err = check_btf_line(env, attr, uattr); - if (err) - return err; + idset->num_ids = 0; - err = check_core_relo(env, attr, uattr); - if (err) - return err; + bpf_for_each_reg_in_vstate(st, func, reg, ({ + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->id) + continue; + idset_cnt_inc(idset, reg->id & ~BPF_ADD_CONST); + })); - return 0; + bpf_for_each_reg_in_vstate(st, func, reg, ({ + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->id) + continue; + if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) + clear_scalar_id(reg); + })); } -/* check %cur's range satisfies %old's */ -static bool range_within(const struct bpf_reg_state *old, - const struct bpf_reg_state *cur) -{ - return old->umin_value <= cur->umin_value && - old->umax_value >= cur->umax_value && - old->smin_value <= cur->smin_value && - old->smax_value >= cur->smax_value && - old->u32_min_value <= cur->u32_min_value && - old->u32_max_value >= cur->u32_max_value && - old->s32_min_value <= 
cur->s32_min_value && - old->s32_max_value >= cur->s32_max_value; -} - -/* If in the old state two registers had the same id, then they need to have - * the same id in the new state as well. But that id could be different from - * the old state, so we need to track the mapping from old to new ids. - * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent - * regs with old id 5 must also have new id 9 for the new state to be safe. But - * regs with a different old id could still have new id 9, we don't care about - * that. - * So we look through our idmap to see if this old id has been seen before. If - * so, we require the new id to match; otherwise, we add the id pair to the map. - */ -static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) +/* Return true if it's OK to have the same insn return a different type. */ +static bool reg_type_mismatch_ok(enum bpf_reg_type type) { - struct bpf_id_pair *map = idmap->map; - unsigned int i; - - /* either both IDs should be set or both should be zero */ - if (!!old_id != !!cur_id) + switch (base_type(type)) { + case PTR_TO_CTX: + case PTR_TO_SOCKET: + case PTR_TO_SOCK_COMMON: + case PTR_TO_TCP_SOCK: + case PTR_TO_XDP_SOCK: + case PTR_TO_BTF_ID: + case PTR_TO_ARENA: return false; - - if (old_id == 0) /* cur_id == 0 as well */ + default: return true; - - for (i = 0; i < BPF_ID_MAP_SIZE; i++) { - if (!map[i].old) { - /* Reached an empty slot; haven't seen this id before */ - map[i].old = old_id; - map[i].cur = cur_id; - return true; - } - if (map[i].old == old_id) - return map[i].cur == cur_id; - if (map[i].cur == cur_id) - return false; - } - /* We ran out of idmap slots, which should be impossible */ - WARN_ON_ONCE(1); - return false; -} - -/* Similar to check_ids(), but allocate a unique temporary ID - * for 'old_id' or 'cur_id' of zero. - * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid. 
- */ -static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) -{ - old_id = old_id ? old_id : ++idmap->tmp_id_gen; - cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen; - - return check_ids(old_id, cur_id, idmap); -} - -static void clean_func_state(struct bpf_verifier_env *env, - struct bpf_func_state *st) -{ - enum bpf_reg_liveness live; - int i, j; - - for (i = 0; i < BPF_REG_FP; i++) { - live = st->regs[i].live; - /* liveness must not touch this register anymore */ - st->regs[i].live |= REG_LIVE_DONE; - if (!(live & REG_LIVE_READ)) - /* since the register is unused, clear its state - * to make further comparison simpler - */ - __mark_reg_not_init(env, &st->regs[i]); - } - - for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { - live = st->stack[i].spilled_ptr.live; - /* liveness must not touch this stack slot anymore */ - st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; - if (!(live & REG_LIVE_READ)) { - __mark_reg_not_init(env, &st->stack[i].spilled_ptr); - for (j = 0; j < BPF_REG_SIZE; j++) - st->stack[i].slot_type[j] = STACK_INVALID; - } } } -static void clean_verifier_state(struct bpf_verifier_env *env, - struct bpf_verifier_state *st) -{ - int i; - - if (st->frame[0]->regs[0].live & REG_LIVE_DONE) - /* all regs in this state in all frames were already marked */ - return; - - for (i = 0; i <= st->curframe; i++) - clean_func_state(env, st->frame[i]); -} - -/* the parentage chains form a tree. - * the verifier states are added to state lists at given insn and - * pushed into state stack for future exploration. - * when the verifier reaches bpf_exit insn some of the verifer states - * stored in the state lists have their final liveness state already, - * but a lot of states will get revised from liveness point of view when - * the verifier explores other branches. 
- * Example: - * 1: r0 = 1 - * 2: if r1 == 100 goto pc+1 - * 3: r0 = 2 - * 4: exit - * when the verifier reaches exit insn the register r0 in the state list of - * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch - * of insn 2 and goes exploring further. At the insn 4 it will walk the - * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ. - * - * Since the verifier pushes the branch states as it sees them while exploring - * the program the condition of walking the branch instruction for the second - * time means that all states below this branch were already explored and - * their final liveness marks are already propagated. - * Hence when the verifier completes the search of state list in is_state_visited() - * we can call this clean_live_states() function to mark all liveness states - * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state' - * will not be used. - * This function also clears the registers and stack for states that !READ - * to simplify state merging. +/* If an instruction was previously used with particular pointer types, then we + * need to be careful to avoid cases such as the below, where it may be ok + * for one branch accessing the pointer, but not ok for the other branch: * - * Important note here that walking the same branch instruction in the callee - * doesn't meant that the states are DONE. The verifier has to compare - * the callsites + * R1 = sock_ptr + * goto X; + * ... + * R1 = some_other_valid_ptr; + * goto X; + * ... 
+ * R2 = *(u32 *)(R1 + 0); */ -static void clean_live_states(struct bpf_verifier_env *env, int insn, - struct bpf_verifier_state *cur) -{ - struct bpf_verifier_state_list *sl; - - sl = *explored_state(env, insn); - while (sl) { - if (sl->state.branches) - goto next; - if (sl->state.insn_idx != insn || - !same_callsites(&sl->state, cur)) - goto next; - clean_verifier_state(env, &sl->state); -next: - sl = sl->next; - } -} - -static bool regs_exact(const struct bpf_reg_state *rold, - const struct bpf_reg_state *rcur, - struct bpf_idmap *idmap) +static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) { - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && - check_ids(rold->id, rcur->id, idmap) && - check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); + return src != prev && (!reg_type_mismatch_ok(src) || + !reg_type_mismatch_ok(prev)); } -enum exact_level { - NOT_EXACT, - EXACT, - RANGE_WITHIN -}; - -/* Returns true if (rold safe implies rcur safe) */ -static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, - struct bpf_reg_state *rcur, struct bpf_idmap *idmap, - enum exact_level exact) +static bool is_ptr_to_mem_or_btf_id(enum bpf_reg_type type) { - if (exact == EXACT) - return regs_exact(rold, rcur, idmap); - - if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT) - /* explored state didn't use this */ - return true; - if (rold->type == NOT_INIT) { - if (exact == NOT_EXACT || rcur->type == NOT_INIT) - /* explored state can't have used this */ - return true; - } - - /* Enforce that register types have to match exactly, including their - * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general - * rule. - * - * One can make a point that using a pointer register as unbounded - * SCALAR would be technically acceptable, but this could lead to - * pointer leaks because scalars are allowed to leak while pointers - * are not. 
We could make this safe in special cases if root is - * calling us, but it's probably not worth the hassle. - * - * Also, register types that are *not* MAYBE_NULL could technically be - * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE - * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point - * to the same map). - * However, if the old MAYBE_NULL register then got NULL checked, - * doing so could have affected others with the same id, and we can't - * check for that because we lost the id when we converted to - * a non-MAYBE_NULL variant. - * So, as a general rule we don't allow mixing MAYBE_NULL and - * non-MAYBE_NULL registers as well. - */ - if (rold->type != rcur->type) - return false; - - switch (base_type(rold->type)) { - case SCALAR_VALUE: - if (env->explore_alu_limits) { - /* explore_alu_limits disables tnum_in() and range_within() - * logic and requires everything to be strict - */ - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && - check_scalar_ids(rold->id, rcur->id, idmap); - } - if (!rold->precise && exact == NOT_EXACT) - return true; - if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST)) - return false; - if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off)) - return false; - /* Why check_ids() for scalar registers? - * - * Consider the following BPF code: - * 1: r6 = ... unbound scalar, ID=a ... - * 2: r7 = ... unbound scalar, ID=b ... - * 3: if (r6 > r7) goto +1 - * 4: r6 = r7 - * 5: if (r6 > X) goto ... - * 6: ... memory operation using r7 ... - * - * First verification path is [1-6]: - * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7; - * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark - * r7 <= X, because r6 and r7 share same id. - * Next verification path is [1-4, 6]. - * - * Instruction (6) would be reached in two states: - * I. r6{.id=b}, r7{.id=b} via path 1-6; - * II. r6{.id=a}, r7{.id=b} via path 1-4, 6. 
- * - * Use check_ids() to distinguish these states. - * --- - * Also verify that new value satisfies old value range knowledge. - */ - return range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off) && - check_scalar_ids(rold->id, rcur->id, idmap); - case PTR_TO_MAP_KEY: - case PTR_TO_MAP_VALUE: + switch (base_type(type)) { case PTR_TO_MEM: - case PTR_TO_BUF: - case PTR_TO_TP_BUFFER: - /* If the new min/max/var_off satisfy the old ones and - * everything else matches, we are OK. - */ - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && - range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off) && - check_ids(rold->id, rcur->id, idmap) && - check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); - case PTR_TO_PACKET_META: - case PTR_TO_PACKET: - /* We must have at least as much range as the old ptr - * did, so that any accesses which were safe before are - * still safe. This is true even if old range < old off, - * since someone could have accessed through (ptr - k), or - * even done ptr -= k in a register, to get a safe access. - */ - if (rold->range > rcur->range) - return false; - /* If the offsets don't match, we can't trust our alignment; - * nor can we be sure that we won't fall out of range. 
- */ - if (rold->off != rcur->off) - return false; - /* id relations must be preserved */ - if (!check_ids(rold->id, rcur->id, idmap)) - return false; - /* new val must satisfy old val knowledge */ - return range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_STACK: - /* two stack pointers are equal only if they're pointing to - * the same stack frame, since fp-8 in foo != fp-8 in bar - */ - return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; - case PTR_TO_ARENA: + case PTR_TO_BTF_ID: return true; default: - return regs_exact(rold, rcur, idmap); - } -} - -static struct bpf_reg_state unbound_reg; - -static __init int unbound_reg_init(void) -{ - __mark_reg_unknown_imprecise(&unbound_reg); - unbound_reg.live |= REG_LIVE_READ; - return 0; -} -late_initcall(unbound_reg_init); - -static bool is_stack_all_misc(struct bpf_verifier_env *env, - struct bpf_stack_state *stack) -{ - u32 i; - - for (i = 0; i < ARRAY_SIZE(stack->slot_type); ++i) { - if ((stack->slot_type[i] == STACK_MISC) || - (stack->slot_type[i] == STACK_INVALID && env->allow_uninit_stack)) - continue; return false; } - - return true; } -static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env, - struct bpf_stack_state *stack) +static bool is_ptr_to_mem(enum bpf_reg_type type) { - if (is_spilled_scalar_reg64(stack)) - return &stack->spilled_ptr; - - if (is_stack_all_misc(env, stack)) - return &unbound_reg; - - return NULL; + return base_type(type) == PTR_TO_MEM; } -static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, struct bpf_idmap *idmap, - enum exact_level exact) +static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, + bool allow_trust_mismatch) { - int i, spi; - - /* walk slots of the explored stack and ignore any additional - * slots in the current stack, since explored(safe) state - * didn't use them - */ - for (i = 0; i < old->allocated_stack; i++) 
{ - struct bpf_reg_state *old_reg, *cur_reg; - - spi = i / BPF_REG_SIZE; - - if (exact != NOT_EXACT && - (i >= cur->allocated_stack || - old->stack[spi].slot_type[i % BPF_REG_SIZE] != - cur->stack[spi].slot_type[i % BPF_REG_SIZE])) - return false; - - if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) - && exact == NOT_EXACT) { - i += BPF_REG_SIZE - 1; - /* explored state didn't use this */ - continue; - } - - if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) - continue; - - if (env->allow_uninit_stack && - old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC) - continue; - - /* explored stack has more populated slots than current stack - * and these slots were used - */ - if (i >= cur->allocated_stack) - return false; + enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; + enum bpf_reg_type merged_type; - /* 64-bit scalar spill vs all slots MISC and vice versa. - * Load from all slots MISC produces unbound scalar. - * Construct a fake register for such stack and call - * regsafe() to ensure scalar ids are compared. + if (*prev_type == NOT_INIT) { + /* Saw a valid insn + * dst_reg = *(u32 *)(src_reg + off) + * save type to validate intersecting paths */ - old_reg = scalar_reg_for_stack(env, &old->stack[spi]); - cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]); - if (old_reg && cur_reg) { - if (!regsafe(env, old_reg, cur_reg, idmap, exact)) - return false; - i += BPF_REG_SIZE - 1; - continue; - } - - /* if old state was safe with misc data in the stack - * it will be safe with zero-initialized stack. - * The opposite is not true + *prev_type = type; + } else if (reg_type_mismatch(type, *prev_type)) { + /* Abuser program is trying to use the same insn + * dst_reg = *(u32*) (src_reg + off) + * with different pointer types: + * src_reg == ctx in one branch and + * src_reg == stack|map in some other branch. + * Reject it. 
*/ - if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC && - cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO) - continue; - if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != - cur->stack[spi].slot_type[i % BPF_REG_SIZE]) - /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has STACK_MISC -> - * this verifier states are not equivalent, - * return false to continue verification of this path - */ - return false; - if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1) - continue; - /* Both old and cur are having same slot_type */ - switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) { - case STACK_SPILL: - /* when explored and current stack slot are both storing - * spilled registers, check that stored pointers types - * are the same as well. - * Ex: explored safe path could have stored - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} - * but current path has stored: - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} - * such verifier states are not equivalent. 
- * return false to continue verification of this path - */ - if (!regsafe(env, &old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, idmap, exact)) - return false; - break; - case STACK_DYNPTR: - old_reg = &old->stack[spi].spilled_ptr; - cur_reg = &cur->stack[spi].spilled_ptr; - if (old_reg->dynptr.type != cur_reg->dynptr.type || - old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot || - !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) - return false; - break; - case STACK_ITER: - old_reg = &old->stack[spi].spilled_ptr; - cur_reg = &cur->stack[spi].spilled_ptr; - /* iter.depth is not compared between states as it - * doesn't matter for correctness and would otherwise - * prevent convergence; we maintain it only to prevent - * infinite loop check triggering, see - * iter_active_depths_differ() + if (allow_trust_mismatch && + is_ptr_to_mem_or_btf_id(type) && + is_ptr_to_mem_or_btf_id(*prev_type)) { + /* + * Have to support a use case when one path through + * the program yields TRUSTED pointer while another + * is UNTRUSTED. Fallback to UNTRUSTED to generate + * BPF_PROBE_MEM/BPF_PROBE_MEMSX. + * Same behavior of MEM_RDONLY flag. 
*/ - if (old_reg->iter.btf != cur_reg->iter.btf || - old_reg->iter.btf_id != cur_reg->iter.btf_id || - old_reg->iter.state != cur_reg->iter.state || - /* ignore {old_reg,cur_reg}->iter.depth, see above */ - !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) - return false; - break; - case STACK_IRQ_FLAG: - old_reg = &old->stack[spi].spilled_ptr; - cur_reg = &cur->stack[spi].spilled_ptr; - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) - return false; - break; - case STACK_MISC: - case STACK_ZERO: - case STACK_INVALID: - continue; - /* Ensure that new unhandled slot types return false by default */ - default: - return false; - } - } - return true; -} - -static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur, - struct bpf_idmap *idmap) -{ - int i; - - if (old->acquired_refs != cur->acquired_refs) - return false; - - if (old->active_locks != cur->active_locks) - return false; - - if (old->active_preempt_locks != cur->active_preempt_locks) - return false; - - if (old->active_rcu_lock != cur->active_rcu_lock) - return false; - - if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) - return false; - - for (i = 0; i < old->acquired_refs; i++) { - if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || - old->refs[i].type != cur->refs[i].type) - return false; - switch (old->refs[i].type) { - case REF_TYPE_PTR: - case REF_TYPE_IRQ: - break; - case REF_TYPE_LOCK: - if (old->refs[i].ptr != cur->refs[i].ptr) - return false; - break; - default: - WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type); - return false; + if (is_ptr_to_mem(type) || is_ptr_to_mem(*prev_type)) + merged_type = PTR_TO_MEM; + else + merged_type = PTR_TO_BTF_ID; + if ((type & PTR_UNTRUSTED) || (*prev_type & PTR_UNTRUSTED)) + merged_type |= PTR_UNTRUSTED; + if ((type & MEM_RDONLY) || (*prev_type & MEM_RDONLY)) + merged_type |= MEM_RDONLY; + *prev_type = merged_type; + } else { + verbose(env, "same insn cannot 
be used with different pointers\n"); + return -EINVAL; } } - return true; -} - -/* compare two verifier states - * - * all states stored in state_list are known to be valid, since - * verifier reached 'bpf_exit' instruction through them - * - * this function is called when verifier exploring different branches of - * execution popped from the state stack. If it sees an old state that has - * more strict register state and more strict stack state then this execution - * branch doesn't need to be explored further, since verifier already - * concluded that more strict state leads to valid finish. - * - * Therefore two states are equivalent if register state is more conservative - * and explored stack state is more conservative than the current one. - * Example: - * explored current - * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC) - * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC) - * - * In other words if current stack state (one being explored) has more - * valid slots than old one that already passed validation, it means - * the verifier can stop exploring and conclude that current state is valid too - * - * Similarly with registers. 
If explored state has register type as invalid - * whereas register type in current state is meaningful, it means that - * the current state will reach 'bpf_exit' instruction safely - */ -static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, enum exact_level exact) -{ - int i; - - if (old->callback_depth > cur->callback_depth) - return false; - - for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(env, &old->regs[i], &cur->regs[i], - &env->idmap_scratch, exact)) - return false; - - if (!stacksafe(env, old, cur, &env->idmap_scratch, exact)) - return false; - - return true; + return 0; } -static void reset_idmap_scratch(struct bpf_verifier_env *env) -{ - env->idmap_scratch.tmp_id_gen = env->id_gen; - memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); -} +enum { + PROCESS_BPF_EXIT = 1, + INSN_IDX_UPDATED = 2, +}; -static bool states_equal(struct bpf_verifier_env *env, - struct bpf_verifier_state *old, - struct bpf_verifier_state *cur, - enum exact_level exact) +static int process_bpf_exit_full(struct bpf_verifier_env *env, + bool *do_print_state, + bool exception_exit) { - int i; - - if (old->curframe != cur->curframe) - return false; - - reset_idmap_scratch(env); + struct bpf_func_state *cur_frame = cur_func(env); - /* Verification state from speculative execution simulation - * must never prune a non-speculative execution one. + /* We must do check_reference_leak here before + * prepare_func_exit to handle the case when + * state->curframe > 0, it may be a callback function, + * for which reference_state must match caller reference + * state when it exits. */ - if (old->speculative && !cur->speculative) - return false; - - if (old->in_sleepable != cur->in_sleepable) - return false; - - if (!refsafe(old, cur, &env->idmap_scratch)) - return false; + int err = check_resource_leak(env, exception_exit, + exception_exit || !env->cur_state->curframe, + exception_exit ? 
"bpf_throw" : + "BPF_EXIT instruction in main prog"); + if (err) + return err; - /* for states to be equal callsites have to be the same - * and all frame states need to be equivalent + /* The side effect of the prepare_func_exit which is + * being skipped is that it frees bpf_func_state. + * Typically, process_bpf_exit will only be hit with + * outermost exit. copy_verifier_state in pop_stack will + * handle freeing of any extra bpf_func_state left over + * from not processing all nested function exits. We + * also skip return code checks as they are not needed + * for exceptional exits. */ - for (i = 0; i <= old->curframe; i++) { - if (old->frame[i]->callsite != cur->frame[i]->callsite) - return false; - if (!func_states_equal(env, old->frame[i], cur->frame[i], exact)) - return false; - } - return true; -} + if (exception_exit) + return PROCESS_BPF_EXIT; -/* Return 0 if no propagation happened. Return negative error code if error - * happened. Otherwise, return the propagated bit. - */ -static int propagate_liveness_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - struct bpf_reg_state *parent_reg) -{ - u8 parent_flag = parent_reg->live & REG_LIVE_READ; - u8 flag = reg->live & REG_LIVE_READ; - int err; + if (env->cur_state->curframe) { + /* exit from nested function */ + err = prepare_func_exit(env, &env->insn_idx); + if (err) + return err; + *do_print_state = true; + return INSN_IDX_UPDATED; + } - /* When comes here, read flags of PARENT_REG or REG could be any of - * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need - * of propagation if PARENT_REG has strongest REG_LIVE_READ64. + /* + * Return from a regular global subprogram differs from return + * from the main program or async/exception callback. + * Main program exit implies return code restrictions + * that depend on program type. + * Exit from exception callback is equivalent to main program exit. 
+ * Exit from async callback implies return code restrictions + * that depend on async scheduling mechanism. */ - if (parent_flag == REG_LIVE_READ64 || - /* Or if there is no read flag from REG. */ - !flag || - /* Or if the read flag from REG is the same as PARENT_REG. */ - parent_flag == flag) - return 0; - - err = mark_reg_read(env, reg, parent_reg, flag); + if (cur_frame->subprogno && + !cur_frame->in_async_callback_fn && + !cur_frame->in_exception_callback_fn) + err = check_global_subprog_return_code(env); + else + err = check_return_code(env, BPF_REG_0, "R0"); if (err) return err; - - return flag; + return PROCESS_BPF_EXIT; } -/* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at an - * equivalent state (jump target or such) we didn't arrive by the straight-line - * code, so read marks in the state must propagate to the parent regardless - * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() is for. - */ -static int propagate_liveness(struct bpf_verifier_env *env, - const struct bpf_verifier_state *vstate, - struct bpf_verifier_state *vparent) +static int indirect_jump_min_max_index(struct bpf_verifier_env *env, + int regno, + struct bpf_map *map, + u32 *pmin_index, u32 *pmax_index) { - struct bpf_reg_state *state_reg, *parent_reg; - struct bpf_func_state *state, *parent; - int i, frame, err = 0; + struct bpf_reg_state *reg = reg_state(env, regno); + u64 min_index = reg->umin_value; + u64 max_index = reg->umax_value; + const u32 size = 8; - if (vparent->curframe != vstate->curframe) { - WARN(1, "propagate_live: parent frame %d current frame %d\n", - vparent->curframe, vstate->curframe); - return -EFAULT; + if (min_index > (u64) U32_MAX * size) { + verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value); + return -ERANGE; } - /* Propagate read liveness of registers... 
*/ - BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - for (frame = 0; frame <= vstate->curframe; frame++) { - parent = vparent->frame[frame]; - state = vstate->frame[frame]; - parent_reg = parent->regs; - state_reg = state->regs; - /* We don't need to worry about FP liveness, it's read-only */ - for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - err = propagate_liveness_reg(env, &state_reg[i], - &parent_reg[i]); - if (err < 0) - return err; - if (err == REG_LIVE_READ64) - mark_insn_zext(env, &parent_reg[i]); - } - - /* Propagate stack slots. */ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - parent_reg = &parent->stack[i].spilled_ptr; - state_reg = &state->stack[i].spilled_ptr; - err = propagate_liveness_reg(env, state_reg, - parent_reg); - if (err < 0) - return err; - } + if (max_index > (u64) U32_MAX * size) { + verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value); + return -ERANGE; } - return 0; -} - -/* find precise scalars in the previous equivalent state and - * propagate them into the current state - */ -static int propagate_precision(struct bpf_verifier_env *env, - const struct bpf_verifier_state *old) -{ - struct bpf_reg_state *state_reg; - struct bpf_func_state *state; - int i, err = 0, fr; - bool first; - for (fr = old->curframe; fr >= 0; fr--) { - state = old->frame[fr]; - state_reg = state->regs; - first = true; - for (i = 0; i < BPF_REG_FP; i++, state_reg++) { - if (state_reg->type != SCALAR_VALUE || - !state_reg->precise || - !(state_reg->live & REG_LIVE_READ)) - continue; - if (env->log.level & BPF_LOG_LEVEL2) { - if (first) - verbose(env, "frame %d: propagating r%d", fr, i); - else - verbose(env, ",r%d", i); - } - bt_set_frame_reg(&env->bt, fr, i); - first = false; - } + min_index /= size; + max_index /= size; - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (!is_spilled_reg(&state->stack[i])) - continue; - 
state_reg = &state->stack[i].spilled_ptr; - if (state_reg->type != SCALAR_VALUE || - !state_reg->precise || - !(state_reg->live & REG_LIVE_READ)) - continue; - if (env->log.level & BPF_LOG_LEVEL2) { - if (first) - verbose(env, "frame %d: propagating fp%d", - fr, (-i - 1) * BPF_REG_SIZE); - else - verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE); - } - bt_set_frame_slot(&env->bt, fr, i); - first = false; - } - if (!first) - verbose(env, "\n"); + if (max_index >= map->max_entries) { + verbose(env, "R%u points to outside of jump table: [%llu,%llu] max_entries %u\n", + regno, min_index, max_index, map->max_entries); + return -EINVAL; } - err = mark_chain_precision_batch(env); - if (err < 0) - return err; - + *pmin_index = min_index; + *pmax_index = max_index; return 0; } -static bool states_maybe_looping(struct bpf_verifier_state *old, - struct bpf_verifier_state *cur) +/* gotox *dst_reg */ +static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_func_state *fold, *fcur; - int i, fr = cur->curframe; + struct bpf_verifier_state *other_branch; + struct bpf_reg_state *dst_reg; + struct bpf_map *map; + u32 min_index, max_index; + int err = 0; + int n; + int i; - if (old->curframe != fr) - return false; + dst_reg = reg_state(env, insn->dst_reg); + if (dst_reg->type != PTR_TO_INSN) { + verbose(env, "R%d has type %s, expected PTR_TO_INSN\n", + insn->dst_reg, reg_type_str(env, dst_reg->type)); + return -EINVAL; + } - fold = old->frame[fr]; - fcur = cur->frame[fr]; - for (i = 0; i < MAX_BPF_REG; i++) - if (memcmp(&fold->regs[i], &fcur->regs[i], - offsetof(struct bpf_reg_state, parent))) - return false; - return true; -} + map = dst_reg->map_ptr; + if (verifier_bug_if(!map, env, "R%d has an empty map pointer", insn->dst_reg)) + return -EFAULT; -static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) -{ - return env->insn_aux_data[insn_idx].is_iter_next; -} + if (verifier_bug_if(map->map_type != BPF_MAP_TYPE_INSN_ARRAY, 
env, + "R%d has incorrect map type %d", insn->dst_reg, map->map_type)) + return -EFAULT; -/* is_state_visited() handles iter_next() (see process_iter_next_call() for - * terminology) calls specially: as opposed to bounded BPF loops, it *expects* - * states to match, which otherwise would look like an infinite loop. So while - * iter_next() calls are taken care of, we still need to be careful and - * prevent erroneous and too eager declaration of "ininite loop", when - * iterators are involved. - * - * Here's a situation in pseudo-BPF assembly form: - * - * 0: again: ; set up iter_next() call args - * 1: r1 = &it ; <CHECKPOINT HERE> - * 2: call bpf_iter_num_next ; this is iter_next() call - * 3: if r0 == 0 goto done - * 4: ... something useful here ... - * 5: goto again ; another iteration - * 6: done: - * 7: r1 = &it - * 8: call bpf_iter_num_destroy ; clean up iter state - * 9: exit - * - * This is a typical loop. Let's assume that we have a prune point at 1:, - * before we get to `call bpf_iter_num_next` (e.g., because of that `goto - * again`, assuming other heuristics don't get in a way). - * - * When we first time come to 1:, let's say we have some state X. We proceed - * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit. - * Now we come back to validate that forked ACTIVE state. We proceed through - * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we - * are converging. But the problem is that we don't know that yet, as this - * convergence has to happen at iter_next() call site only. So if nothing is - * done, at 1: verifier will use bounded loop logic and declare infinite - * looping (and would be *technically* correct, if not for iterator's - * "eventual sticky NULL" contract, see process_iter_next_call()). But we - * don't want that. So what we do in process_iter_next_call() when we go on - * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's - * a different iteration. 
So when we suspect an infinite loop, we additionally - * check if any of the *ACTIVE* iterator states depths differ. If yes, we - * pretend we are not looping and wait for next iter_next() call. - * - * This only applies to ACTIVE state. In DRAINED state we don't expect to - * loop, because that would actually mean infinite loop, as DRAINED state is - * "sticky", and so we'll keep returning into the same instruction with the - * same state (at least in one of possible code paths). - * - * This approach allows to keep infinite loop heuristic even in the face of - * active iterator. E.g., C snippet below is and will be detected as - * inifintely looping: - * - * struct bpf_iter_num it; - * int *p, x; - * - * bpf_iter_num_new(&it, 0, 10); - * while ((p = bpf_iter_num_next(&t))) { - * x = p; - * while (x--) {} // <<-- infinite loop here - * } - * - */ -static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) -{ - struct bpf_reg_state *slot, *cur_slot; - struct bpf_func_state *state; - int i, fr; + err = indirect_jump_min_max_index(env, insn->dst_reg, map, &min_index, &max_index); + if (err) + return err; - for (fr = old->curframe; fr >= 0; fr--) { - state = old->frame[fr]; - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_ITER) - continue; + /* Ensure that the buffer is large enough */ + if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) { + env->gotox_tmp_buf = bpf_iarray_realloc(env->gotox_tmp_buf, + max_index - min_index + 1); + if (!env->gotox_tmp_buf) + return -ENOMEM; + } - slot = &state->stack[i].spilled_ptr; - if (slot->iter.state != BPF_ITER_STATE_ACTIVE) - continue; + n = bpf_copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items); + if (n < 0) + return n; + if (n == 0) { + verbose(env, "register R%d doesn't point to any offset in map id=%d\n", + insn->dst_reg, map->id); + return -EINVAL; + } - cur_slot = 
&cur->frame[fr]->stack[i].spilled_ptr; - if (cur_slot->iter.depth != slot->iter.depth) - return true; - } + for (i = 0; i < n - 1; i++) { + mark_indirect_target(env, env->gotox_tmp_buf->items[i]); + other_branch = push_stack(env, env->gotox_tmp_buf->items[i], + env->insn_idx, env->cur_state->speculative); + if (IS_ERR(other_branch)) + return PTR_ERR(other_branch); } - return false; + env->insn_idx = env->gotox_tmp_buf->items[n-1]; + mark_indirect_target(env, env->insn_idx); + return INSN_IDX_UPDATED; } -static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) +static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) { - struct bpf_verifier_state_list *new_sl; - struct bpf_verifier_state_list *sl, **pprev; - struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; - int i, j, n, err, states_cnt = 0; - bool force_new_state, add_new_state, force_exact; - - force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || - /* Avoid accumulating infinitely long jmp history */ - cur->insn_hist_end - cur->insn_hist_start > 40; - - /* bpf progs typically have pruning point every 4 instructions - * http://vger.kernel.org/bpfconf2019.html#session-1 - * Do not add new state for future pruning if the verifier hasn't seen - * at least 2 jumps and at least 8 instructions. - * This heuristics helps decrease 'total_states' and 'peak_states' metric. - * In tests that amounts to up to 50% reduction into total verifier - * memory consumption and 20% verifier time speedup. 
- */ - add_new_state = force_new_state; - if (env->jmps_processed - env->prev_jmps_processed >= 2 && - env->insn_processed - env->prev_insn_processed >= 8) - add_new_state = true; + int err; + struct bpf_insn *insn = &env->prog->insnsi[env->insn_idx]; + u8 class = BPF_CLASS(insn->code); - pprev = explored_state(env, insn_idx); - sl = *pprev; + switch (class) { + case BPF_ALU: + case BPF_ALU64: + return check_alu_op(env, insn); - clean_live_states(env, insn_idx, cur); + case BPF_LDX: + return check_load_mem(env, insn, false, + BPF_MODE(insn->code) == BPF_MEMSX, + true, "ldx"); - while (sl) { - states_cnt++; - if (sl->state.insn_idx != insn_idx) - goto next; + case BPF_STX: + if (BPF_MODE(insn->code) == BPF_ATOMIC) + return check_atomic(env, insn); + return check_store_reg(env, insn, false); - if (sl->state.branches) { - struct bpf_func_state *frame = sl->state.frame[sl->state.curframe]; - - if (frame->in_async_callback_fn && - frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) { - /* Different async_entry_cnt means that the verifier is - * processing another entry into async callback. - * Seeing the same state is not an indication of infinite - * loop or infinite recursion. - * But finding the same state doesn't mean that it's safe - * to stop processing the current state. The previous state - * hasn't yet reached bpf_exit, since state.branches > 0. - * Checking in_async_callback_fn alone is not enough either. - * Since the verifier still needs to catch infinite loops - * inside async callbacks. - */ - goto skip_inf_loop_check; - } - /* BPF open-coded iterators loop detection is special. - * states_maybe_looping() logic is too simplistic in detecting - * states that *might* be equivalent, because it doesn't know - * about ID remapping, so don't even perform it. - * See process_iter_next_call() and iter_active_depths_differ() - * for overview of the logic. 
When current and one of parent - * states are detected as equivalent, it's a good thing: we prove - * convergence and can stop simulating further iterations. - * It's safe to assume that iterator loop will finish, taking into - * account iter_next() contract of eventually returning - * sticky NULL result. - * - * Note, that states have to be compared exactly in this case because - * read and precision marks might not be finalized inside the loop. - * E.g. as in the program below: - * - * 1. r7 = -16 - * 2. r6 = bpf_get_prandom_u32() - * 3. while (bpf_iter_num_next(&fp[-8])) { - * 4. if (r6 != 42) { - * 5. r7 = -32 - * 6. r6 = bpf_get_prandom_u32() - * 7. continue - * 8. } - * 9. r0 = r10 - * 10. r0 += r7 - * 11. r8 = *(u64 *)(r0 + 0) - * 12. r6 = bpf_get_prandom_u32() - * 13. } - * - * Here verifier would first visit path 1-3, create a checkpoint at 3 - * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does - * not have read or precision mark for r7 yet, thus inexact states - * comparison would discard current state with r7=-32 - * => unsafe memory access at 11 would not be caught. 
- */ - if (is_iter_next_insn(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { - struct bpf_func_state *cur_frame; - struct bpf_reg_state *iter_state, *iter_reg; - int spi; - - cur_frame = cur->frame[cur->curframe]; - /* btf_check_iter_kfuncs() enforces that - * iter state pointer is always the first arg - */ - iter_reg = &cur_frame->regs[BPF_REG_1]; - /* current state is valid due to states_equal(), - * so we can assume valid iter and reg state, - * no need for extra (re-)validations - */ - spi = __get_spi(iter_reg->off + iter_reg->var_off.value); - iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; - if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { - update_loop_entry(cur, &sl->state); - goto hit; - } - } - goto skip_inf_loop_check; - } - if (is_may_goto_insn_at(env, insn_idx)) { - if (sl->state.may_goto_depth != cur->may_goto_depth && - states_equal(env, &sl->state, cur, RANGE_WITHIN)) { - update_loop_entry(cur, &sl->state); - goto hit; - } - } - if (calls_callback(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) - goto hit; - goto skip_inf_loop_check; - } - /* attempt to detect infinite loop to avoid unnecessary doomed work */ - if (states_maybe_looping(&sl->state, cur) && - states_equal(env, &sl->state, cur, EXACT) && - !iter_active_depths_differ(&sl->state, cur) && - sl->state.may_goto_depth == cur->may_goto_depth && - sl->state.callback_unroll_depth == cur->callback_unroll_depth) { - verbose_linfo(env, insn_idx, "; "); - verbose(env, "infinite loop detected at insn %d\n", insn_idx); - verbose(env, "cur state:"); - print_verifier_state(env, cur, cur->curframe, true); - verbose(env, "old state:"); - print_verifier_state(env, &sl->state, cur->curframe, true); - return -EINVAL; - } - /* if the verifier is processing a loop, avoid adding new state - * too often, since different loop iterations have distinct - * states and may not help future pruning. 
- * This threshold shouldn't be too low to make sure that - * a loop with large bound will be rejected quickly. - * The most abusive loop will be: - * r1 += 1 - * if r1 < 1000000 goto pc-2 - * 1M insn_procssed limit / 100 == 10k peak states. - * This threshold shouldn't be too high either, since states - * at the end of the loop are likely to be useful in pruning. - */ -skip_inf_loop_check: - if (!force_new_state && - env->jmps_processed - env->prev_jmps_processed < 20 && - env->insn_processed - env->prev_insn_processed < 100) - add_new_state = false; - goto miss; - } - /* If sl->state is a part of a loop and this loop's entry is a part of - * current verification path then states have to be compared exactly. - * 'force_exact' is needed to catch the following case: - * - * initial Here state 'succ' was processed first, - * | it was eventually tracked to produce a - * V state identical to 'hdr'. - * .---------> hdr All branches from 'succ' had been explored - * | | and thus 'succ' has its .branches == 0. - * | V - * | .------... Suppose states 'cur' and 'succ' correspond - * | | | to the same instruction + callsites. - * | V V In such case it is necessary to check - * | ... ... if 'succ' and 'cur' are states_equal(). - * | | | If 'succ' and 'cur' are a part of the - * | V V same loop exact flag has to be set. - * | succ <- cur To check if that is the case, verify - * | | if loop entry of 'succ' is in current - * | V DFS path. - * | ... - * | | - * '----' - * - * Additional details are in the comment before get_loop_entry(). - */ - loop_entry = get_loop_entry(&sl->state); - force_exact = loop_entry && loop_entry->branches > 0; - if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) { - if (force_exact) - update_loop_entry(cur, loop_entry); -hit: - sl->hit_cnt++; - /* reached equivalent register/stack state, - * prune the search. - * Registers read by the continuation are read by us. 
- * If we have any write marks in env->cur_state, they - * will prevent corresponding reads in the continuation - * from reaching our parent (an explored_state). Our - * own state will get the read marks recorded, but - * they'll be immediately forgotten as we're pruning - * this state and will pop a new one. - */ - err = propagate_liveness(env, &sl->state, cur); + case BPF_ST: { + enum bpf_reg_type dst_reg_type; - /* if previous state reached the exit with precision and - * current state is equivalent to it (except precision marks) - * the precision needs to be propagated back in - * the current state. - */ - if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_insn_history(env, cur, 0, 0); - err = err ? : propagate_precision(env, &sl->state); - if (err) - return err; - return 1; - } -miss: - /* when new state is not going to be added do not increase miss count. - * Otherwise several loop iterations will remove the state - * recorded earlier. The goal of these heuristics is to have - * states from some iterations of the loop (some in the beginning - * and some at the end) to help pruning. - */ - if (add_new_state) - sl->miss_cnt++; - /* heuristic to determine whether this state is beneficial - * to keep checking from state equivalence point of view. - * Higher numbers increase max_states_per_insn and verification time, - * but do not meaningfully decrease insn_processed. - * 'n' controls how many times state could miss before eviction. - * Use bigger 'n' for checkpoints because evicting checkpoint states - * too early would hinder iterator convergence. - */ - n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3; - if (sl->miss_cnt > sl->hit_cnt * n + n) { - /* the state is unlikely to be useful. 
Remove it to - * speed up verification - */ - *pprev = sl->next; - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE && - !sl->state.used_as_loop_entry) { - u32 br = sl->state.branches; - - WARN_ONCE(br, - "BUG live_done but branches_to_explore %d\n", - br); - free_verifier_state(&sl->state, false); - kfree(sl); - env->peak_states--; - } else { - /* cannot free this state, since parentage chain may - * walk it later. Add it for free_list instead to - * be freed at the end of verification - */ - sl->next = env->free_list; - env->free_list = sl; - } - sl = *pprev; - continue; - } -next: - pprev = &sl->next; - sl = *pprev; - } - - if (env->max_states_per_insn < states_cnt) - env->max_states_per_insn = states_cnt; + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; - if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) - return 0; + dst_reg_type = cur_regs(env)[insn->dst_reg].type; - if (!add_new_state) - return 0; + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, -1, false, false); + if (err) + return err; - /* There were no equivalent states, remember the current one. - * Technically the current state is not proven to be safe yet, - * but it will either reach outer most bpf_exit (which means it's safe) - * or it will be rejected. When there are no loops the verifier won't be - * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) - * again on the way to bpf_exit. - * When looping the sl->state.branches will be > 0 and this state - * will not be considered for equivalence until branches == 0. 
- */ - new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); - if (!new_sl) - return -ENOMEM; - env->total_states++; - env->peak_states++; - env->prev_jmps_processed = env->jmps_processed; - env->prev_insn_processed = env->insn_processed; - - /* forget precise markings we inherited, see __mark_chain_precision */ - if (env->bpf_capable) - mark_all_scalars_imprecise(env, cur); - - /* add new state to the head of linked list */ - new = &new_sl->state; - err = copy_verifier_state(new, cur); - if (err) { - free_verifier_state(new, false); - kfree(new_sl); - return err; - } - new->insn_idx = insn_idx; - WARN_ONCE(new->branches != 1, - "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); - - cur->parent = new; - cur->first_insn_idx = insn_idx; - cur->insn_hist_start = cur->insn_hist_end; - cur->dfs_depth = new->dfs_depth + 1; - new_sl->next = *explored_state(env, insn_idx); - *explored_state(env, insn_idx) = new_sl; - /* connect new state to parentage chain. Current frame needs all - * registers connected. Only r6 - r9 of the callers are alive (pushed - * to the stack implicitly by JITs) so in callers' frames connect just - * r6 - r9 as an optimization. Callers will have r1 - r5 connected to - * the state of the call instruction (with WRITTEN set), and r0 comes - * from callee with its full parentage chain, anyway. - */ - /* clear write marks in current state: the writes we did are not writes - * our child did, so they don't screen off its reads from us. - * (There are no read marks in current state, because reads always mark - * their parent and current state never has children yet. Only - * explored_states can get read marks.) - */ - for (j = 0; j <= cur->curframe; j++) { - for (i = j < cur->curframe ? 
BPF_REG_6 : 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; - for (i = 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].live = REG_LIVE_NONE; + return save_aux_ptr_type(env, dst_reg_type, false); } - - /* all stack frames are accessible from callee, clear them all */ - for (j = 0; j <= cur->curframe; j++) { - struct bpf_func_state *frame = cur->frame[j]; - struct bpf_func_state *newframe = new->frame[j]; - - for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) { - frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; - frame->stack[i].spilled_ptr.parent = - &newframe->stack[i].spilled_ptr; + case BPF_JMP: + case BPF_JMP32: { + u8 opcode = BPF_OP(insn->code); + + env->jmps_processed++; + if (opcode == BPF_CALL) { + if (env->cur_state->active_locks) { + if ((insn->src_reg == BPF_REG_0 && + insn->imm != BPF_FUNC_spin_unlock && + insn->imm != BPF_FUNC_kptr_xchg) || + (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && + (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) { + verbose(env, + "function calls are not allowed while holding a lock\n"); + return -EINVAL; + } + } + mark_reg_scratched(env, BPF_REG_0); + if (insn->src_reg == BPF_PSEUDO_CALL) + return check_func_call(env, insn, &env->insn_idx); + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + return check_kfunc_call(env, insn, &env->insn_idx); + return check_helper_call(env, insn, &env->insn_idx); + } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) == BPF_X) + return check_indirect_jump(env, insn); + + if (class == BPF_JMP) + env->insn_idx += insn->off + 1; + else + env->insn_idx += insn->imm + 1; + return INSN_IDX_UPDATED; + } else if (opcode == BPF_EXIT) { + return process_bpf_exit_full(env, do_print_state, false); } + return check_cond_jmp_op(env, insn, &env->insn_idx); } - return 0; -} - -/* Return true if it's OK to have the same insn return a different type. 
*/ -static bool reg_type_mismatch_ok(enum bpf_reg_type type) -{ - switch (base_type(type)) { - case PTR_TO_CTX: - case PTR_TO_SOCKET: - case PTR_TO_SOCK_COMMON: - case PTR_TO_TCP_SOCK: - case PTR_TO_XDP_SOCK: - case PTR_TO_BTF_ID: - case PTR_TO_ARENA: - return false; - default: - return true; - } -} + case BPF_LD: { + u8 mode = BPF_MODE(insn->code); -/* If an instruction was previously used with particular pointer types, then we - * need to be careful to avoid cases such as the below, where it may be ok - * for one branch accessing the pointer, but not ok for the other branch: - * - * R1 = sock_ptr - * goto X; - * ... - * R1 = some_other_valid_ptr; - * goto X; - * ... - * R2 = *(u32 *)(R1 + 0); - */ -static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) -{ - return src != prev && (!reg_type_mismatch_ok(src) || - !reg_type_mismatch_ok(prev)); -} + if (mode == BPF_ABS || mode == BPF_IND) + return check_ld_abs(env, insn); -static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, - bool allow_trust_mismatch) -{ - enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; + if (mode == BPF_IMM) { + err = check_ld_imm(env, insn); + if (err) + return err; - if (*prev_type == NOT_INIT) { - /* Saw a valid insn - * dst_reg = *(u32 *)(src_reg + off) - * save type to validate intersecting paths - */ - *prev_type = type; - } else if (reg_type_mismatch(type, *prev_type)) { - /* Abuser program is trying to use the same insn - * dst_reg = *(u32*) (src_reg + off) - * with different pointer types: - * src_reg == ctx in one branch and - * src_reg == stack|map in some other branch. - * Reject it. - */ - if (allow_trust_mismatch && - base_type(type) == PTR_TO_BTF_ID && - base_type(*prev_type) == PTR_TO_BTF_ID) { - /* - * Have to support a use case when one path through - * the program yields TRUSTED pointer while another - * is UNTRUSTED. Fallback to UNTRUSTED to generate - * BPF_PROBE_MEM/BPF_PROBE_MEMSX. 
- */ - *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED; - } else { - verbose(env, "same insn cannot be used with different pointers\n"); - return -EINVAL; + env->insn_idx++; + sanitize_mark_insn_seen(env); } + return 0; } - - return 0; + } + /* all class values are handled above. silence compiler warning */ + return -EFAULT; } static int do_check(struct bpf_verifier_env *env) @@ -18937,15 +17680,13 @@ static int do_check(struct bpf_verifier_env *env) bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; - struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; int prev_insn_idx = -1; for (;;) { - bool exception_exit = false; struct bpf_insn *insn; - u8 class; + struct bpf_insn_aux_data *insn_aux; int err; /* reset current history entry on each new instruction */ @@ -18959,7 +17700,7 @@ static int do_check(struct bpf_verifier_env *env) } insn = &insns[env->insn_idx]; - class = BPF_CLASS(insn->code); + insn_aux = &env->insn_aux_data[env->insn_idx]; if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, @@ -18969,9 +17710,10 @@ static int do_check(struct bpf_verifier_env *env) } state->last_insn_idx = env->prev_insn_idx; + state->insn_idx = env->insn_idx; - if (is_prune_point(env, env->insn_idx)) { - err = is_state_visited(env, env->insn_idx); + if (bpf_is_prune_point(env, env->insn_idx)) { + err = bpf_is_state_visited(env, env->insn_idx); if (err < 0) return err; if (err == 1) { @@ -18989,8 +17731,8 @@ static int do_check(struct bpf_verifier_env *env) } } - if (is_jmp_point(env, env->insn_idx)) { - err = push_insn_history(env, state, 0, 0); + if (bpf_is_jmp_point(env, env->insn_idx)) { + err = bpf_push_jmp_history(env, state, 0, 0); if (err) return err; } @@ -19011,19 +17753,13 @@ static int do_check(struct bpf_verifier_env *env) } if (env->log.level & BPF_LOG_LEVEL) { - const struct bpf_insn_cbs cbs = { - .cb_call = 
disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; - if (verifier_state_scratched(env)) print_insn_state(env, state, state->curframe); verbose_linfo(env, env->insn_idx, "; "); env->prev_log_pos = env->log.end_pos; verbose(env, "%d: ", env->insn_idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + bpf_verbose_insn(env, insn); env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; env->prev_log_pos = env->log.end_pos; } @@ -19035,249 +17771,92 @@ static int do_check(struct bpf_verifier_env *env) return err; } - regs = cur_regs(env); sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; - if (class == BPF_ALU || class == BPF_ALU64) { - err = check_alu_op(env, insn); - if (err) - return err; - - } else if (class == BPF_LDX) { - enum bpf_reg_type src_reg_type; - - /* check for reserved fields is already done */ - - /* check src operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; - - err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); - if (err) - return err; - - src_reg_type = regs[insn->src_reg].type; - - /* check that memory (src_reg + off) is readable, - * the state of dst_reg will be updated by this func - */ - err = check_mem_access(env, env->insn_idx, insn->src_reg, - insn->off, BPF_SIZE(insn->code), - BPF_READ, insn->dst_reg, false, - BPF_MODE(insn->code) == BPF_MEMSX); - err = err ?: save_aux_ptr_type(env, src_reg_type, true); - err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], "ldx"); - if (err) - return err; - } else if (class == BPF_STX) { - enum bpf_reg_type dst_reg_type; - - if (BPF_MODE(insn->code) == BPF_ATOMIC) { - err = check_atomic(env, env->insn_idx, insn); - if (err) - return err; - env->insn_idx++; - continue; - } - - if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - - /* check src1 operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; - /* 
check src2 operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; - - dst_reg_type = regs[insn->dst_reg].type; - - /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, - insn->off, BPF_SIZE(insn->code), - BPF_WRITE, insn->src_reg, false, false); - if (err) - return err; - - err = save_aux_ptr_type(env, dst_reg_type, false); - if (err) - return err; - } else if (class == BPF_ST) { - enum bpf_reg_type dst_reg_type; - - if (BPF_MODE(insn->code) != BPF_MEM || - insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_ST uses reserved fields\n"); - return -EINVAL; - } - /* check src operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; + /* Sanity check: precomputed constants must match verifier state */ + if (!state->speculative && insn_aux->const_reg_mask) { + struct bpf_reg_state *regs = cur_regs(env); + u16 mask = insn_aux->const_reg_mask; - dst_reg_type = regs[insn->dst_reg].type; - - /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, - insn->off, BPF_SIZE(insn->code), - BPF_WRITE, -1, false, false); - if (err) - return err; - - err = save_aux_ptr_type(env, dst_reg_type, false); - if (err) - return err; - } else if (class == BPF_JMP || class == BPF_JMP32) { - u8 opcode = BPF_OP(insn->code); - - env->jmps_processed++; - if (opcode == BPF_CALL) { - if (BPF_SRC(insn->code) != BPF_K || - (insn->src_reg != BPF_PSEUDO_KFUNC_CALL - && insn->off != 0) || - (insn->src_reg != BPF_REG_0 && - insn->src_reg != BPF_PSEUDO_CALL && - insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || - insn->dst_reg != BPF_REG_0 || - class == BPF_JMP32) { - verbose(env, "BPF_CALL uses reserved fields\n"); - return -EINVAL; - } - - if (env->cur_state->active_locks) { - if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) || - (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && - (insn->off != 0 || 
!kfunc_spin_allowed(insn->imm)))) { - verbose(env, "function calls are not allowed while holding a lock\n"); - return -EINVAL; - } - } - if (insn->src_reg == BPF_PSEUDO_CALL) { - err = check_func_call(env, insn, &env->insn_idx); - } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - err = check_kfunc_call(env, insn, &env->insn_idx); - if (!err && is_bpf_throw_kfunc(insn)) { - exception_exit = true; - goto process_bpf_exit_full; - } - } else { - err = check_helper_call(env, insn, &env->insn_idx); - } - if (err) - return err; + for (int r = 0; r < ARRAY_SIZE(insn_aux->const_reg_vals); r++) { + u32 cval = insn_aux->const_reg_vals[r]; - mark_reg_scratched(env, BPF_REG_0); - } else if (opcode == BPF_JA) { - if (BPF_SRC(insn->code) != BPF_K || - insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0 || - (class == BPF_JMP && insn->imm != 0) || - (class == BPF_JMP32 && insn->off != 0)) { - verbose(env, "BPF_JA uses reserved fields\n"); - return -EINVAL; - } - - if (class == BPF_JMP) - env->insn_idx += insn->off + 1; - else - env->insn_idx += insn->imm + 1; - continue; - - } else if (opcode == BPF_EXIT) { - if (BPF_SRC(insn->code) != BPF_K || - insn->imm != 0 || - insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0 || - class == BPF_JMP32) { - verbose(env, "BPF_EXIT uses reserved fields\n"); - return -EINVAL; - } -process_bpf_exit_full: - /* We must do check_reference_leak here before - * prepare_func_exit to handle the case when - * state->curframe > 0, it may be a callback - * function, for which reference_state must - * match caller reference state when it exits. - */ - err = check_resource_leak(env, exception_exit, !env->cur_state->curframe, - "BPF_EXIT instruction in main prog"); - if (err) - return err; - - /* The side effect of the prepare_func_exit - * which is being skipped is that it frees - * bpf_func_state. Typically, process_bpf_exit - * will only be hit with outermost exit. 
- * copy_verifier_state in pop_stack will handle - * freeing of any extra bpf_func_state left over - * from not processing all nested function - * exits. We also skip return code checks as - * they are not needed for exceptional exits. - */ - if (exception_exit) - goto process_bpf_exit; - - if (state->curframe) { - /* exit from nested function */ - err = prepare_func_exit(env, &env->insn_idx); - if (err) - return err; - do_print_state = true; + if (!(mask & BIT(r))) continue; - } - - err = check_return_code(env, BPF_REG_0, "R0"); - if (err) - return err; -process_bpf_exit: - mark_verifier_state_scratched(env); - update_branch_counts(env, env->cur_state); - err = pop_stack(env, &prev_insn_idx, - &env->insn_idx, pop_log); - if (err < 0) { - if (err != -ENOENT) - return err; - break; - } else { - do_print_state = true; + if (regs[r].type != SCALAR_VALUE) continue; - } - } else { - err = check_cond_jmp_op(env, insn, &env->insn_idx); - if (err) - return err; + if (!tnum_is_const(regs[r].var_off)) + continue; + if (verifier_bug_if((u32)regs[r].var_off.value != cval, + env, "const R%d: %u != %llu", + r, cval, regs[r].var_off.value)) + return -EFAULT; } - } else if (class == BPF_LD) { - u8 mode = BPF_MODE(insn->code); + } - if (mode == BPF_ABS || mode == BPF_IND) { - err = check_ld_abs(env, insn); - if (err) - return err; + /* Reduce verification complexity by stopping speculative path + * verification when a nospec is encountered. + */ + if (state->speculative && insn_aux->nospec) + goto process_bpf_exit; - } else if (mode == BPF_IMM) { - err = check_ld_imm(env, insn); - if (err) + err = do_check_insn(env, &do_print_state); + if (error_recoverable_with_nospec(err) && state->speculative) { + /* Prevent this speculative path from ever reaching the + * insn that would have been unsafe to execute. + */ + insn_aux->nospec = true; + /* If it was an ADD/SUB insn, potentially remove any + * markings for alu sanitization. 
+ */ + insn_aux->alu_state = 0; + goto process_bpf_exit; + } else if (err < 0) { + return err; + } else if (err == PROCESS_BPF_EXIT) { + goto process_bpf_exit; + } else if (err == INSN_IDX_UPDATED) { + } else if (err == 0) { + env->insn_idx++; + } + + if (state->speculative && insn_aux->nospec_result) { + /* If we are on a path that performed a jump-op, this + * may skip a nospec patched-in after the jump. This can + * currently never happen because nospec_result is only + * used for the write-ops + * `*(size*)(dst_reg+off)=src_reg|imm32` and helper + * calls. These must never skip the following insn + * (i.e., bpf_insn_successors()'s opcode_info.can_jump + * is false). Still, add a warning to document this in + * case nospec_result is used elsewhere in the future. + * + * All non-branch instructions have a single + * fall-through edge. For these, nospec_result should + * already work. + */ + if (verifier_bug_if((BPF_CLASS(insn->code) == BPF_JMP || + BPF_CLASS(insn->code) == BPF_JMP32) && + BPF_OP(insn->code) != BPF_CALL, env, + "speculation barrier after jump instruction may not have the desired effect")) + return -EFAULT; +process_bpf_exit: + mark_verifier_state_scratched(env); + err = bpf_update_branch_counts(env, env->cur_state); + if (err) + return err; + err = pop_stack(env, &prev_insn_idx, &env->insn_idx, + pop_log); + if (err < 0) { + if (err != -ENOENT) return err; - - env->insn_idx++; - sanitize_mark_insn_seen(env); + break; } else { - verbose(env, "invalid BPF_LD mode\n"); - return -EINVAL; + do_print_state = true; + continue; } - } else { - verbose(env, "unknown insn class %d\n", class); - return -EINVAL; } - - env->insn_idx++; } return 0; @@ -19295,12 +17874,7 @@ static int find_btf_percpu_datasec(struct btf *btf) * types to look at only module's own BTF types. 
*/ n = btf_nr_types(btf); - if (btf_is_module(btf)) - i = btf_nr_types(btf_vmlinux); - else - i = 1; - - for(; i < n; i++) { + for (i = btf_named_start_id(btf, true); i < n; i++) { t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) continue; @@ -19314,25 +17888,28 @@ static int find_btf_percpu_datasec(struct btf *btf) } /* - * Add btf to the used_btfs array and return the index. (If the btf was - * already added, then just return the index.) Upon successful insertion - * increase btf refcnt, and, if present, also refcount the corresponding - * kernel module. + * Add btf to the env->used_btfs array. If needed, refcount the + * corresponding kernel module. To simplify caller's logic + * in case of error or if btf was added before the function + * decreases the btf refcount. */ static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf) { struct btf_mod_pair *btf_mod; + int ret = 0; int i; /* check whether we recorded this BTF (and maybe module) already */ for (i = 0; i < env->used_btf_cnt; i++) if (env->used_btfs[i].btf == btf) - return i; - - if (env->used_btf_cnt >= MAX_USED_BTFS) - return -E2BIG; + goto ret_put; - btf_get(btf); + if (env->used_btf_cnt >= MAX_USED_BTFS) { + verbose(env, "The total number of btfs per program has reached the limit of %u\n", + MAX_USED_BTFS); + ret = -E2BIG; + goto ret_put; + } btf_mod = &env->used_btfs[env->used_btf_cnt]; btf_mod->btf = btf; @@ -19342,12 +17919,18 @@ static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf) if (btf_is_module(btf)) { btf_mod->module = btf_try_get_module(btf); if (!btf_mod->module) { - btf_put(btf); - return -ENXIO; + ret = -ENXIO; + goto ret_put; } } - return env->used_btf_cnt++; + env->used_btf_cnt++; + return 0; + +ret_put: + /* Either error or this BTF was already added */ + btf_put(btf); + return ret; } /* replace pseudo btf_id with kernel symbol address */ @@ -19444,9 +18027,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, 
btf_fd = insn[1].imm; if (btf_fd) { - CLASS(fd, f)(btf_fd); - - btf = __btf_get_by_fd(f); + btf = btf_get_by_fd(btf_fd); if (IS_ERR(btf)) { verbose(env, "invalid module BTF object FD specified.\n"); return -EINVAL; @@ -19456,17 +18037,17 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); return -EINVAL; } + btf_get(btf_vmlinux); btf = btf_vmlinux; } err = __check_pseudo_btf_id(env, insn, aux, btf); - if (err) + if (err) { + btf_put(btf); return err; + } - err = __add_used_btf(env, btf); - if (err < 0) - return err; - return 0; + return __add_used_btf(env, btf); } static bool is_tracing_prog_type(enum bpf_prog_type type) @@ -19496,6 +18077,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); + if (map->excl_prog_sha && + memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) { + verbose(env, "program's hash doesn't match map's excl_prog_hash\n"); + return -EACCES; + } + if (btf_record_has_field(map->record, BPF_LIST_HEAD) || btf_record_has_field(map->record, BPF_RB_ROOT)) { if (is_tracing_prog_type(prog_type)) { @@ -19504,7 +18091,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { + if (btf_record_has_field(map->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -19516,20 +18103,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (btf_record_has_field(map->record, BPF_TIMER)) { - if (is_tracing_prog_type(prog_type)) { - verbose(env, "tracing progs cannot use bpf_timer yet\n"); - return -EINVAL; - } - } - - if (btf_record_has_field(map->record, BPF_WORKQUEUE)) { - if (is_tracing_prog_type(prog_type)) { - verbose(env, "tracing 
progs cannot use bpf_wq yet\n"); - return -EINVAL; - } - } - if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); @@ -19560,6 +18133,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_ARENA: + case BPF_MAP_TYPE_INSN_ARRAY: + case BPF_MAP_TYPE_PROG_ARRAY: break; default: verbose(env, @@ -19631,6 +18206,15 @@ static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map) env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + err = bpf_insn_array_init(map, env->prog); + if (err) { + verbose(env, "Failed to properly initialize insn array\n"); + return err; + } + env->insn_array_maps[env->insn_array_map_cnt++] = map; + } + return env->used_map_cnt - 1; } @@ -19652,14 +18236,199 @@ static int add_used_map(struct bpf_verifier_env *env, int fd) return __add_used_map(env, map); } -/* find and rewrite pseudo imm in ld_imm64 instructions: +static int check_alu_fields(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + switch (opcode) { + case BPF_NEG: + if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 || + insn->off != 0 || insn->imm != 0) { + verbose(env, "BPF_NEG uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_END: + if (insn->src_reg != BPF_REG_0 || insn->off != 0 || + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || + (class == BPF_ALU64 && BPF_SRC(insn->code) != BPF_TO_LE)) { + verbose(env, "BPF_END uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_MOV: + if (BPF_SRC(insn->code) == BPF_X) { + if (class == BPF_ALU) { + if ((insn->off != 0 && insn->off != 8 && insn->off != 16) || + insn->imm) { + verbose(env, "BPF_MOV uses reserved fields\n"); + return -EINVAL; + } 
+ } else if (insn->off == BPF_ADDR_SPACE_CAST) { + if (insn->imm != 1 && insn->imm != 1u << 16) { + verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n"); + return -EINVAL; + } + } else if ((insn->off != 0 && insn->off != 8 && + insn->off != 16 && insn->off != 32) || insn->imm) { + verbose(env, "BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } else if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose(env, "BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_ADD: + case BPF_SUB: + case BPF_AND: + case BPF_OR: + case BPF_XOR: + case BPF_LSH: + case BPF_RSH: + case BPF_ARSH: + case BPF_MUL: + case BPF_DIV: + case BPF_MOD: + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) || + (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { + verbose(env, "BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + } else if (insn->src_reg != BPF_REG_0 || + (insn->off != 0 && insn->off != 1) || + (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { + verbose(env, "BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + return 0; + default: + verbose(env, "invalid BPF_ALU opcode %x\n", opcode); + return -EINVAL; + } +} + +static int check_jmp_fields(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + switch (opcode) { + case BPF_CALL: + if (BPF_SRC(insn->code) != BPF_K || + (insn->src_reg != BPF_PSEUDO_KFUNC_CALL && insn->off != 0) || + (insn->src_reg != BPF_REG_0 && insn->src_reg != BPF_PSEUDO_CALL && + insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || + insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { + verbose(env, "BPF_CALL uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_JA: + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->src_reg != BPF_REG_0 || insn->imm != 0 || insn->off != 0) { + verbose(env, "BPF_JA|BPF_X uses reserved fields\n"); + 
return -EINVAL; + } + } else if (insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || + (class == BPF_JMP && insn->imm != 0) || + (class == BPF_JMP32 && insn->off != 0)) { + verbose(env, "BPF_JA uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_EXIT: + if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || + insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { + verbose(env, "BPF_EXIT uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_JCOND: + if (insn->code != (BPF_JMP | BPF_JCOND) || insn->src_reg != BPF_MAY_GOTO || + insn->dst_reg || insn->imm) { + verbose(env, "invalid may_goto imm %d\n", insn->imm); + return -EINVAL; + } + return 0; + default: + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0) { + verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); + return -EINVAL; + } + } else if (insn->src_reg != BPF_REG_0) { + verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); + return -EINVAL; + } + return 0; + } +} + +static int check_insn_fields(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + switch (BPF_CLASS(insn->code)) { + case BPF_ALU: + case BPF_ALU64: + return check_alu_fields(env, insn); + case BPF_LDX: + if ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) || + insn->imm != 0) { + verbose(env, "BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_STX: + if (BPF_MODE(insn->code) == BPF_ATOMIC) + return 0; + if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_ST: + if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { + verbose(env, "BPF_ST uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_JMP: + case BPF_JMP32: + return check_jmp_fields(env, insn); + case BPF_LD: { + u8 mode = BPF_MODE(insn->code); + + if (mode == BPF_ABS || mode == BPF_IND) { + if (insn->dst_reg != BPF_REG_0 || 
insn->off != 0 || + BPF_SIZE(insn->code) == BPF_DW || + (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { + verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); + return -EINVAL; + } + } else if (mode != BPF_IMM) { + verbose(env, "invalid BPF_LD mode\n"); + return -EINVAL; + } + return 0; + } + default: + verbose(env, "unknown insn class %d\n", BPF_CLASS(insn->code)); + return -EINVAL; + } +} + +/* + * Check that insns are sane and rewrite pseudo imm in ld_imm64 instructions: * * 1. if it accesses map FD, replace it with actual map pointer. * 2. if it accesses btf_id of a VAR, replace it with pointer to the var. * * NOTE: btf_vmlinux is required for converting pseudo btf_id. */ -static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) +static int check_and_resolve_insns(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -19670,13 +18439,14 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return err; for (i = 0; i < insn_cnt; i++, insn++) { - if (BPF_CLASS(insn->code) == BPF_LDX && - ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) || - insn->imm != 0)) { - verbose(env, "BPF_LDX uses reserved fields\n"); + if (insn->dst_reg >= MAX_BPF_REG) { + verbose(env, "R%d is invalid\n", insn->dst_reg); + return -EINVAL; + } + if (insn->src_reg >= MAX_BPF_REG) { + verbose(env, "R%d is invalid\n", insn->src_reg); return -EINVAL; } - if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_insn_aux_data *aux; struct bpf_map *map; @@ -19691,6 +18461,11 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -EINVAL; } + if (insn[0].off != 0) { + verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); + return -EINVAL; + } + if (insn[0].src_reg == 0) /* valid generic load 64-bit imm */ goto next_insn; @@ -19757,11 +18532,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } else { u32 off = insn[1].imm; - if (off >= BPF_MAX_VAR_OFF) { - 
verbose(env, "direct value offset of %u is not allowed\n", off); - return -EINVAL; - } - if (!map->ops->map_direct_value_addr) { verbose(env, "no direct value access support for this map type\n"); return -EINVAL; @@ -19792,6 +18562,10 @@ next_insn: verbose(env, "unknown opcode %02x\n", insn->code); return -EINVAL; } + + err = check_insn_fields(env, insn); + if (err) + return err; } /* now all pseudo BPF_LD_IMM64 instructions load valid @@ -19830,282 +18604,15 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) } } -/* single env->prog->insni[off] instruction was replaced with the range - * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying - * [0, off) and [off, end) to new locations, so the patched range stays zero - */ -static void adjust_insn_aux_data(struct bpf_verifier_env *env, - struct bpf_insn_aux_data *new_data, - struct bpf_prog *new_prog, u32 off, u32 cnt) -{ - struct bpf_insn_aux_data *old_data = env->insn_aux_data; - struct bpf_insn *insn = new_prog->insnsi; - u32 old_seen = old_data[off].seen; - u32 prog_len; - int i; - - /* aux info at OFF always needs adjustment, no matter fast path - * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the - * original insn at old prog. - */ - old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); - - if (cnt == 1) - return; - prog_len = new_prog->len; - - memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); - memcpy(new_data + off + cnt - 1, old_data + off, - sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); - for (i = off; i < off + cnt - 1; i++) { - /* Expand insni[off]'s seen count to the patched range. 
*/ - new_data[i].seen = old_seen; - new_data[i].zext_dst = insn_has_def32(env, insn + i); - } - env->insn_aux_data = new_data; - vfree(old_data); -} - -static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) +static void release_insn_arrays(struct bpf_verifier_env *env) { int i; - if (len == 1) - return; - /* NOTE: fake 'exit' subprog should be updated as well. */ - for (i = 0; i <= env->subprog_cnt; i++) { - if (env->subprog_info[i].start <= off) - continue; - env->subprog_info[i].start += len - 1; - } + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_release(env->insn_array_maps[i]); } -static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len) -{ - struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; - int i, sz = prog->aux->size_poke_tab; - struct bpf_jit_poke_descriptor *desc; - for (i = 0; i < sz; i++) { - desc = &tab[i]; - if (desc->insn_idx <= off) - continue; - desc->insn_idx += len - 1; - } -} - -static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, - const struct bpf_insn *patch, u32 len) -{ - struct bpf_prog *new_prog; - struct bpf_insn_aux_data *new_data = NULL; - - if (len > 1) { - new_data = vzalloc(array_size(env->prog->len + len - 1, - sizeof(struct bpf_insn_aux_data))); - if (!new_data) - return NULL; - } - - new_prog = bpf_patch_insn_single(env->prog, off, patch, len); - if (IS_ERR(new_prog)) { - if (PTR_ERR(new_prog) == -ERANGE) - verbose(env, - "insn %d cannot be patched due to 16-bit range\n", - env->insn_aux_data[off].orig_idx); - vfree(new_data); - return NULL; - } - adjust_insn_aux_data(env, new_data, new_prog, off, len); - adjust_subprog_starts(env, off, len); - adjust_poke_descs(new_prog, off, len); - return new_prog; -} - -/* - * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the - * jump offset by 'delta'. 
- */ -static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta) -{ - struct bpf_insn *insn = prog->insnsi; - u32 insn_cnt = prog->len, i; - s32 imm; - s16 off; - - for (i = 0; i < insn_cnt; i++, insn++) { - u8 code = insn->code; - - if (tgt_idx <= i && i < tgt_idx + delta) - continue; - - if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || - BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT) - continue; - - if (insn->code == (BPF_JMP32 | BPF_JA)) { - if (i + 1 + insn->imm != tgt_idx) - continue; - if (check_add_overflow(insn->imm, delta, &imm)) - return -ERANGE; - insn->imm = imm; - } else { - if (i + 1 + insn->off != tgt_idx) - continue; - if (check_add_overflow(insn->off, delta, &off)) - return -ERANGE; - insn->off = off; - } - } - return 0; -} - -static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, - u32 off, u32 cnt) -{ - int i, j; - - /* find first prog starting at or after off (first to remove) */ - for (i = 0; i < env->subprog_cnt; i++) - if (env->subprog_info[i].start >= off) - break; - /* find first prog starting at or after off + cnt (first to stay) */ - for (j = i; j < env->subprog_cnt; j++) - if (env->subprog_info[j].start >= off + cnt) - break; - /* if j doesn't start exactly at off + cnt, we are just removing - * the front of previous prog - */ - if (env->subprog_info[j].start != off + cnt) - j--; - - if (j > i) { - struct bpf_prog_aux *aux = env->prog->aux; - int move; - - /* move fake 'exit' subprog as well */ - move = env->subprog_cnt + 1 - j; - - memmove(env->subprog_info + i, - env->subprog_info + j, - sizeof(*env->subprog_info) * move); - env->subprog_cnt -= j - i; - - /* remove func_info */ - if (aux->func_info) { - move = aux->func_info_cnt - j; - - memmove(aux->func_info + i, - aux->func_info + j, - sizeof(*aux->func_info) * move); - aux->func_info_cnt -= j - i; - /* func_info->insn_off is set after all code rewrites, - * in adjust_btf_func() - no need to adjust - */ - } - } else { - /* 
convert i from "first prog to remove" to "first to adjust" */ - if (env->subprog_info[i].start == off) - i++; - } - - /* update fake 'exit' subprog as well */ - for (; i <= env->subprog_cnt; i++) - env->subprog_info[i].start -= cnt; - - return 0; -} - -static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, - u32 cnt) -{ - struct bpf_prog *prog = env->prog; - u32 i, l_off, l_cnt, nr_linfo; - struct bpf_line_info *linfo; - - nr_linfo = prog->aux->nr_linfo; - if (!nr_linfo) - return 0; - - linfo = prog->aux->linfo; - - /* find first line info to remove, count lines to be removed */ - for (i = 0; i < nr_linfo; i++) - if (linfo[i].insn_off >= off) - break; - - l_off = i; - l_cnt = 0; - for (; i < nr_linfo; i++) - if (linfo[i].insn_off < off + cnt) - l_cnt++; - else - break; - - /* First live insn doesn't match first live linfo, it needs to "inherit" - * last removed linfo. prog is already modified, so prog->len == off - * means no live instructions after (tail of the program was removed). - */ - if (prog->len != off && l_cnt && - (i == nr_linfo || linfo[i].insn_off != off + cnt)) { - l_cnt--; - linfo[--i].insn_off = off + cnt; - } - - /* remove the line info which refer to the removed instructions */ - if (l_cnt) { - memmove(linfo + l_off, linfo + i, - sizeof(*linfo) * (nr_linfo - i)); - - prog->aux->nr_linfo -= l_cnt; - nr_linfo = prog->aux->nr_linfo; - } - - /* pull all linfo[i].insn_off >= off + cnt in by cnt */ - for (i = l_off; i < nr_linfo; i++) - linfo[i].insn_off -= cnt; - - /* fix up all subprogs (incl. 
'exit') which start >= off */ - for (i = 0; i <= env->subprog_cnt; i++) - if (env->subprog_info[i].linfo_idx > l_off) { - /* program may have started in the removed region but - * may not be fully removed - */ - if (env->subprog_info[i].linfo_idx >= l_off + l_cnt) - env->subprog_info[i].linfo_idx -= l_cnt; - else - env->subprog_info[i].linfo_idx = l_off; - } - - return 0; -} - -static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) -{ - struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - unsigned int orig_prog_len = env->prog->len; - int err; - - if (bpf_prog_is_offloaded(env->prog->aux)) - bpf_prog_offload_remove_insns(env, off, cnt); - - err = bpf_remove_insns(env->prog, off, cnt); - if (err) - return err; - - err = adjust_subprog_starts_after_remove(env, off, cnt); - if (err) - return err; - - err = bpf_adj_linfo_after_remove(env, off, cnt); - if (err) - return err; - - memmove(aux_data + off, aux_data + off + cnt, - sizeof(*aux_data) * (orig_prog_len - off - cnt)); - - return 0; -} /* The verifier does more data flow analysis than llvm and will not * explore branches that are dead at run time. 
Malicious programs can @@ -20134,2028 +18641,48 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) } } -static bool insn_is_cond_jump(u8 code) -{ - u8 op; - - op = BPF_OP(code); - if (BPF_CLASS(code) == BPF_JMP32) - return op != BPF_JA; - - if (BPF_CLASS(code) != BPF_JMP) - return false; - - return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; -} - -static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) -{ - struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); - struct bpf_insn *insn = env->prog->insnsi; - const int insn_cnt = env->prog->len; - int i; - - for (i = 0; i < insn_cnt; i++, insn++) { - if (!insn_is_cond_jump(insn->code)) - continue; - - if (!aux_data[i + 1].seen) - ja.off = insn->off; - else if (!aux_data[i + 1 + insn->off].seen) - ja.off = 0; - else - continue; - - if (bpf_prog_is_offloaded(env->prog->aux)) - bpf_prog_offload_replace_insn(env, i, &ja); - - memcpy(insn, &ja, sizeof(ja)); - } -} - -static int opt_remove_dead_code(struct bpf_verifier_env *env) -{ - struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - int insn_cnt = env->prog->len; - int i, err; - - for (i = 0; i < insn_cnt; i++) { - int j; - - j = 0; - while (i + j < insn_cnt && !aux_data[i + j].seen) - j++; - if (!j) - continue; - - err = verifier_remove_insns(env, i, j); - if (err) - return err; - insn_cnt = env->prog->len; - } - - return 0; -} - -static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0); -static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0); - -static int opt_remove_nops(struct bpf_verifier_env *env) -{ - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - bool is_may_goto_0, is_ja; - int i, err; - - for (i = 0; i < insn_cnt; i++) { - is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0)); - is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP)); - - if (!is_may_goto_0 && !is_ja) - continue; - - err = 
verifier_remove_insns(env, i, 1); - if (err) - return err; - insn_cnt--; - /* Go back one insn to catch may_goto +1; may_goto +0 sequence */ - i -= (is_may_goto_0 && i > 0) ? 2 : 1; - } - - return 0; -} - -static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, - const union bpf_attr *attr) -{ - struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4]; - struct bpf_insn_aux_data *aux = env->insn_aux_data; - int i, patch_len, delta = 0, len = env->prog->len; - struct bpf_insn *insns = env->prog->insnsi; - struct bpf_prog *new_prog; - bool rnd_hi32; - - rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; - zext_patch[1] = BPF_ZEXT_REG(0); - rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0); - rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); - rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX); - for (i = 0; i < len; i++) { - int adj_idx = i + delta; - struct bpf_insn insn; - int load_reg; - - insn = insns[adj_idx]; - load_reg = insn_def_regno(&insn); - if (!aux[adj_idx].zext_dst) { - u8 code, class; - u32 imm_rnd; - - if (!rnd_hi32) - continue; - - code = insn.code; - class = BPF_CLASS(code); - if (load_reg == -1) - continue; - - /* NOTE: arg "reg" (the fourth one) is only used for - * BPF_STX + SRC_OP, so it is safe to pass NULL - * here. - */ - if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) { - if (class == BPF_LD && - BPF_MODE(code) == BPF_IMM) - i++; - continue; - } - - /* ctx load could be transformed into wider load. */ - if (class == BPF_LDX && - aux[adj_idx].ptr_type == PTR_TO_CTX) - continue; - - imm_rnd = get_random_u32(); - rnd_hi32_patch[0] = insn; - rnd_hi32_patch[1].imm = imm_rnd; - rnd_hi32_patch[3].dst_reg = load_reg; - patch = rnd_hi32_patch; - patch_len = 4; - goto apply_patch_buffer; - } - - /* Add in an zero-extend instruction if a) the JIT has requested - * it or b) it's a CMPXCHG. - * - * The latter is because: BPF_CMPXCHG always loads a value into - * R0, therefore always zero-extends. 
However some archs' - * equivalent instruction only does this load when the - * comparison is successful. This detail of CMPXCHG is - * orthogonal to the general zero-extension behaviour of the - * CPU, so it's treated independently of bpf_jit_needs_zext. - */ - if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn)) - continue; - - /* Zero-extension is done by the caller. */ - if (bpf_pseudo_kfunc_call(&insn)) - continue; - - if (WARN_ON(load_reg == -1)) { - verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n"); - return -EFAULT; - } - - zext_patch[0] = insn; - zext_patch[1].dst_reg = load_reg; - zext_patch[1].src_reg = load_reg; - patch = zext_patch; - patch_len = 2; -apply_patch_buffer: - new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len); - if (!new_prog) - return -ENOMEM; - env->prog = new_prog; - insns = new_prog->insnsi; - aux = env->insn_aux_data; - delta += patch_len - 1; - } - - return 0; -} - -/* convert load instructions that access fields of a context type into a - * sequence of instructions that access fields of the underlying structure: - * struct __sk_buff -> struct sk_buff - * struct bpf_sock_ops -> struct sock - */ -static int convert_ctx_accesses(struct bpf_verifier_env *env) -{ - struct bpf_subprog_info *subprogs = env->subprog_info; - const struct bpf_verifier_ops *ops = env->ops; - int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0; - const int insn_cnt = env->prog->len; - struct bpf_insn *epilogue_buf = env->epilogue_buf; - struct bpf_insn *insn_buf = env->insn_buf; - struct bpf_insn *insn; - u32 target_size, size_default, off; - struct bpf_prog *new_prog; - enum bpf_access_type type; - bool is_narrower_load; - int epilogue_idx = 0; - - if (ops->gen_epilogue) { - epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, - -(subprogs[0].stack_depth + 8)); - if (epilogue_cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; - } else if (epilogue_cnt) { - /* Save the 
ARG_PTR_TO_CTX for the epilogue to use */ - cnt = 0; - subprogs[0].stack_depth += 8; - insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, - -subprogs[0].stack_depth); - insn_buf[cnt++] = env->prog->insnsi[0]; - new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - env->prog = new_prog; - delta += cnt - 1; - } - } - - if (ops->gen_prologue || env->seen_direct_write) { - if (!ops->gen_prologue) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; - } - cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, - env->prog); - if (cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; - } else if (cnt) { - new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - env->prog = new_prog; - delta += cnt - 1; - } - } - - if (delta) - WARN_ON(adjust_jmp_off(env->prog, 0, delta)); - - if (bpf_prog_is_offloaded(env->prog->aux)) - return 0; - - insn = env->prog->insnsi + delta; - - for (i = 0; i < insn_cnt; i++, insn++) { - bpf_convert_ctx_access_t convert_ctx_access; - u8 mode; - - if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || - insn->code == (BPF_LDX | BPF_MEM | BPF_H) || - insn->code == (BPF_LDX | BPF_MEM | BPF_W) || - insn->code == (BPF_LDX | BPF_MEM | BPF_DW) || - insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) || - insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) || - insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) { - type = BPF_READ; - } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || - insn->code == (BPF_STX | BPF_MEM | BPF_H) || - insn->code == (BPF_STX | BPF_MEM | BPF_W) || - insn->code == (BPF_STX | BPF_MEM | BPF_DW) || - insn->code == (BPF_ST | BPF_MEM | BPF_B) || - insn->code == (BPF_ST | BPF_MEM | BPF_H) || - insn->code == (BPF_ST | BPF_MEM | BPF_W) || - insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { - type = BPF_WRITE; - } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || - insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) 
&& - env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { - insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); - env->prog->aux->num_exentries++; - continue; - } else if (insn->code == (BPF_JMP | BPF_EXIT) && - epilogue_cnt && - i + delta < subprogs[1].start) { - /* Generate epilogue for the main prog */ - if (epilogue_idx) { - /* jump back to the earlier generated epilogue */ - insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1); - cnt = 1; - } else { - memcpy(insn_buf, epilogue_buf, - epilogue_cnt * sizeof(*epilogue_buf)); - cnt = epilogue_cnt; - /* epilogue_idx cannot be 0. It must have at - * least one ctx ptr saving insn before the - * epilogue. - */ - epilogue_idx = i + delta; - } - goto patch_insn_buf; - } else { - continue; - } - - if (type == BPF_WRITE && - env->insn_aux_data[i + delta].sanitize_stack_spill) { - struct bpf_insn patch[] = { - *insn, - BPF_ST_NOSPEC(), - }; - - cnt = ARRAY_SIZE(patch); - new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = new_prog; - insn = new_prog->insnsi + i + delta; - continue; - } - - switch ((int)env->insn_aux_data[i + delta].ptr_type) { - case PTR_TO_CTX: - if (!ops->convert_ctx_access) - continue; - convert_ctx_access = ops->convert_ctx_access; - break; - case PTR_TO_SOCKET: - case PTR_TO_SOCK_COMMON: - convert_ctx_access = bpf_sock_convert_ctx_access; - break; - case PTR_TO_TCP_SOCK: - convert_ctx_access = bpf_tcp_sock_convert_ctx_access; - break; - case PTR_TO_XDP_SOCK: - convert_ctx_access = bpf_xdp_sock_convert_ctx_access; - break; - case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID | PTR_UNTRUSTED: - /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike - * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot - * be said once it is marked PTR_UNTRUSTED, hence we must handle - * any faults for loads into such types. BPF_WRITE is disallowed - * for this case. 
- */ - case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: - if (type == BPF_READ) { - if (BPF_MODE(insn->code) == BPF_MEM) - insn->code = BPF_LDX | BPF_PROBE_MEM | - BPF_SIZE((insn)->code); - else - insn->code = BPF_LDX | BPF_PROBE_MEMSX | - BPF_SIZE((insn)->code); - env->prog->aux->num_exentries++; - } - continue; - case PTR_TO_ARENA: - if (BPF_MODE(insn->code) == BPF_MEMSX) { - verbose(env, "sign extending loads from arena are not supported yet\n"); - return -EOPNOTSUPP; - } - insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); - env->prog->aux->num_exentries++; - continue; - default: - continue; - } - - ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; - size = BPF_LDST_BYTES(insn); - mode = BPF_MODE(insn->code); - - /* If the read access is a narrower load of the field, - * convert to a 4/8-byte load, to minimum program type specific - * convert_ctx_access changes. If conversion is successful, - * we will apply proper mask to the result. - */ - is_narrower_load = size < ctx_field_size; - size_default = bpf_ctx_off_adjust_machine(ctx_field_size); - off = insn->off; - if (is_narrower_load) { - u8 size_code; - - if (type == BPF_WRITE) { - verbose(env, "bpf verifier narrow ctx access misconfigured\n"); - return -EINVAL; - } - - size_code = BPF_H; - if (ctx_field_size == 4) - size_code = BPF_W; - else if (ctx_field_size == 8) - size_code = BPF_DW; - - insn->off = off & ~(size_default - 1); - insn->code = BPF_LDX | BPF_MEM | size_code; - } - - target_size = 0; - cnt = convert_ctx_access(type, insn, insn_buf, env->prog, - &target_size); - if (cnt == 0 || cnt >= INSN_BUF_SIZE || - (ctx_field_size && !target_size)) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; - } - - if (is_narrower_load && size < target_size) { - u8 shift = bpf_ctx_narrow_access_offset( - off, size, size_default) * 8; - if (shift && cnt + 1 >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier narrow ctx load misconfigured\n"); - return -EINVAL; - 
} - if (ctx_field_size <= 4) { - if (shift) - insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, - insn->dst_reg, - shift); - insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); - } else { - if (shift) - insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, - insn->dst_reg, - shift); - insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, - (1ULL << size * 8) - 1); - } - } - if (mode == BPF_MEMSX) - insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X, - insn->dst_reg, insn->dst_reg, - size * 8, 0); - -patch_insn_buf: - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - - /* keep walking new program and skip insns we just inserted */ - env->prog = new_prog; - insn = new_prog->insnsi + i + delta; - } - - return 0; -} - -static int jit_subprogs(struct bpf_verifier_env *env) -{ - struct bpf_prog *prog = env->prog, **func, *tmp; - int i, j, subprog_start, subprog_end = 0, len, subprog; - struct bpf_map *map_ptr; - struct bpf_insn *insn; - void *old_bpf_func; - int err, num_exentries; - - if (env->subprog_cnt <= 1) - return 0; - - for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn)) - continue; - - /* Upon error here we cannot fall back to interpreter but - * need a hard reject of the program. Thus -EFAULT is - * propagated in any case. - */ - subprog = find_subprog(env, i + insn->imm + 1); - if (subprog < 0) { - WARN_ONCE(1, "verifier bug. 
No program starts at insn %d\n", - i + insn->imm + 1); - return -EFAULT; - } - /* temporarily remember subprog id inside insn instead of - * aux_data, since next loop will split up all insns into funcs - */ - insn->off = subprog; - /* remember original imm in case JIT fails and fallback - * to interpreter will be needed - */ - env->insn_aux_data[i].call_imm = insn->imm; - /* point imm to __bpf_call_base+1 from JITs point of view */ - insn->imm = 1; - if (bpf_pseudo_func(insn)) { -#if defined(MODULES_VADDR) - u64 addr = MODULES_VADDR; -#else - u64 addr = VMALLOC_START; -#endif - /* jit (e.g. x86_64) may emit fewer instructions - * if it learns a u32 imm is the same as a u64 imm. - * Set close enough to possible prog address. - */ - insn[0].imm = (u32)addr; - insn[1].imm = addr >> 32; - } - } - - err = bpf_prog_alloc_jited_linfo(prog); - if (err) - goto out_undo_insn; - - err = -ENOMEM; - func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); - if (!func) - goto out_undo_insn; - - for (i = 0; i < env->subprog_cnt; i++) { - subprog_start = subprog_end; - subprog_end = env->subprog_info[i + 1].start; - - len = subprog_end - subprog_start; - /* bpf_prog_run() doesn't call subprogs directly, - * hence main prog stats include the runtime of subprogs. 
- * subprogs don't have IDs and not reachable via prog_get_next_id - * func[i]->stats will never be accessed and stays NULL - */ - func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); - if (!func[i]) - goto out_free; - memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], - len * sizeof(struct bpf_insn)); - func[i]->type = prog->type; - func[i]->len = len; - if (bpf_prog_calc_tag(func[i])) - goto out_free; - func[i]->is_func = 1; - func[i]->sleepable = prog->sleepable; - func[i]->aux->func_idx = i; - /* Below members will be freed only at prog->aux */ - func[i]->aux->btf = prog->aux->btf; - func[i]->aux->func_info = prog->aux->func_info; - func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; - func[i]->aux->poke_tab = prog->aux->poke_tab; - func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; - - for (j = 0; j < prog->aux->size_poke_tab; j++) { - struct bpf_jit_poke_descriptor *poke; - - poke = &prog->aux->poke_tab[j]; - if (poke->insn_idx < subprog_end && - poke->insn_idx >= subprog_start) - poke->aux = func[i]->aux; - } - - func[i]->aux->name[0] = 'F'; - func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; - if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) - func[i]->aux->jits_use_priv_stack = true; - - func[i]->jit_requested = 1; - func[i]->blinding_requested = prog->blinding_requested; - func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; - func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab; - func[i]->aux->linfo = prog->aux->linfo; - func[i]->aux->nr_linfo = prog->aux->nr_linfo; - func[i]->aux->jited_linfo = prog->aux->jited_linfo; - func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; - func[i]->aux->arena = prog->aux->arena; - num_exentries = 0; - insn = func[i]->insnsi; - for (j = 0; j < func[i]->len; j++, insn++) { - if (BPF_CLASS(insn->code) == BPF_LDX && - (BPF_MODE(insn->code) == BPF_PROBE_MEM || - BPF_MODE(insn->code) == BPF_PROBE_MEM32 || - BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) - num_exentries++; - 
if ((BPF_CLASS(insn->code) == BPF_STX || - BPF_CLASS(insn->code) == BPF_ST) && - BPF_MODE(insn->code) == BPF_PROBE_MEM32) - num_exentries++; - if (BPF_CLASS(insn->code) == BPF_STX && - BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) - num_exentries++; - } - func[i]->aux->num_exentries = num_exentries; - func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; - func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; - func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; - if (!i) - func[i]->aux->exception_boundary = env->seen_exception; - func[i] = bpf_int_jit_compile(func[i]); - if (!func[i]->jited) { - err = -ENOTSUPP; - goto out_free; - } - cond_resched(); - } - - /* at this point all bpf functions were successfully JITed - * now populate all bpf_calls with correct addresses and - * run last pass of JIT - */ - for (i = 0; i < env->subprog_cnt; i++) { - insn = func[i]->insnsi; - for (j = 0; j < func[i]->len; j++, insn++) { - if (bpf_pseudo_func(insn)) { - subprog = insn->off; - insn[0].imm = (u32)(long)func[subprog]->bpf_func; - insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32; - continue; - } - if (!bpf_pseudo_call(insn)) - continue; - subprog = insn->off; - insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func); - } - - /* we use the aux data to keep a list of the start addresses - * of the JITed images for each function in the program - * - * for some architectures, such as powerpc64, the imm field - * might not be large enough to hold the offset of the start - * address of the callee's JITed image from __bpf_call_base - * - * in such cases, we can lookup the start address of a callee - * by using its subprog id, available from the off field of - * the call instruction, as an index for this list - */ - func[i]->aux->func = func; - func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; - func[i]->aux->real_func_cnt = env->subprog_cnt; - } - for (i = 0; i < env->subprog_cnt; i++) { - old_bpf_func = 
func[i]->bpf_func; - tmp = bpf_int_jit_compile(func[i]); - if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { - verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); - err = -ENOTSUPP; - goto out_free; - } - cond_resched(); - } - - /* finally lock prog and jit images for all functions and - * populate kallsysm. Begin at the first subprogram, since - * bpf_prog_load will add the kallsyms for the main program. - */ - for (i = 1; i < env->subprog_cnt; i++) { - err = bpf_prog_lock_ro(func[i]); - if (err) - goto out_free; - } - - for (i = 1; i < env->subprog_cnt; i++) - bpf_prog_kallsyms_add(func[i]); - - /* Last step: make now unused interpreter insns from main - * prog consistent for later dump requests, so they can - * later look the same as if they were interpreted only. - */ - for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (bpf_pseudo_func(insn)) { - insn[0].imm = env->insn_aux_data[i].call_imm; - insn[1].imm = insn->off; - insn->off = 0; - continue; - } - if (!bpf_pseudo_call(insn)) - continue; - insn->off = env->insn_aux_data[i].call_imm; - subprog = find_subprog(env, i + insn->off + 1); - insn->imm = subprog; - } - - prog->jited = 1; - prog->bpf_func = func[0]->bpf_func; - prog->jited_len = func[0]->jited_len; - prog->aux->extable = func[0]->aux->extable; - prog->aux->num_exentries = func[0]->aux->num_exentries; - prog->aux->func = func; - prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; - prog->aux->real_func_cnt = env->subprog_cnt; - prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func; - prog->aux->exception_boundary = func[0]->aux->exception_boundary; - bpf_prog_jit_attempt_done(prog); - return 0; -out_free: - /* We failed JIT'ing, so at this point we need to unregister poke - * descriptors from subprogs, so that kernel is not attempting to - * patch it anymore as we're freeing the subprog JIT memory. 
- */ - for (i = 0; i < prog->aux->size_poke_tab; i++) { - map_ptr = prog->aux->poke_tab[i].tail_call.map; - map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); - } - /* At this point we're guaranteed that poke descriptors are not - * live anymore. We can just unlink its descriptor table as it's - * released with the main prog. - */ - for (i = 0; i < env->subprog_cnt; i++) { - if (!func[i]) - continue; - func[i]->aux->poke_tab = NULL; - bpf_jit_free(func[i]); - } - kfree(func); -out_undo_insn: - /* cleanup main prog to be interpreted */ - prog->jit_requested = 0; - prog->blinding_requested = 0; - for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (!bpf_pseudo_call(insn)) - continue; - insn->off = 0; - insn->imm = env->insn_aux_data[i].call_imm; - } - bpf_prog_jit_attempt_done(prog); - return err; -} - -static int fixup_call_args(struct bpf_verifier_env *env) -{ -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - struct bpf_prog *prog = env->prog; - struct bpf_insn *insn = prog->insnsi; - bool has_kfunc_call = bpf_prog_has_kfunc_call(prog); - int i, depth; -#endif - int err = 0; - - if (env->prog->jit_requested && - !bpf_prog_is_offloaded(env->prog->aux)) { - err = jit_subprogs(env); - if (err == 0) - return 0; - if (err == -EFAULT) - return err; - } -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - if (has_kfunc_call) { - verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); - return -EINVAL; - } - if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { - /* When JIT fails the progs with bpf2bpf calls and tail_calls - * have to be rejected, since interpreter doesn't support them yet. - */ - verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); - return -EINVAL; - } - for (i = 0; i < prog->len; i++, insn++) { - if (bpf_pseudo_func(insn)) { - /* When JIT fails the progs with callback calls - * have to be rejected, since interpreter doesn't support them yet. 
- */ - verbose(env, "callbacks are not allowed in non-JITed programs\n"); - return -EINVAL; - } - - if (!bpf_pseudo_call(insn)) - continue; - depth = get_callee_stack_depth(env, insn, i); - if (depth < 0) - return depth; - bpf_patch_call_args(insn, depth); - } - err = 0; -#endif - return err; -} - -/* replace a generic kfunc with a specialized version if necessary */ -static void specialize_kfunc(struct bpf_verifier_env *env, - u32 func_id, u16 offset, unsigned long *addr) -{ - struct bpf_prog *prog = env->prog; - bool seen_direct_write; - void *xdp_kfunc; - bool is_rdonly; - - if (bpf_dev_bound_kfunc_id(func_id)) { - xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id); - if (xdp_kfunc) { - *addr = (unsigned long)xdp_kfunc; - return; - } - /* fallback to default kfunc when not supported by netdev */ - } - - if (offset) - return; - - if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { - seen_direct_write = env->seen_direct_write; - is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); - - if (is_rdonly) - *addr = (unsigned long)bpf_dynptr_from_skb_rdonly; - - /* restore env->seen_direct_write to its original value, since - * may_access_direct_pkt_data mutates it - */ - env->seen_direct_write = seen_direct_write; - } -} - -static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, - u16 struct_meta_reg, - u16 node_offset_reg, - struct bpf_insn *insn, - struct bpf_insn *insn_buf, - int *cnt) -{ - struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta; - struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) }; - - insn_buf[0] = addr[0]; - insn_buf[1] = addr[1]; - insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off); - insn_buf[3] = *insn; - *cnt = 4; -} - -static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - struct bpf_insn *insn_buf, int insn_idx, int *cnt) -{ - const struct bpf_kfunc_desc *desc; - - if (!insn->imm) { - verbose(env, "invalid 
kernel function call not eliminated in verifier pass\n"); - return -EINVAL; - } - - *cnt = 0; - - /* insn->imm has the btf func_id. Replace it with an offset relative to - * __bpf_call_base, unless the JIT needs to call functions that are - * further than 32 bits away (bpf_jit_supports_far_kfunc_call()). - */ - desc = find_kfunc_desc(env->prog, insn->imm, insn->off); - if (!desc) { - verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n", - insn->imm); - return -EFAULT; - } - - if (!bpf_jit_supports_far_kfunc_call()) - insn->imm = BPF_CALL_IMM(desc->addr); - if (insn->off) - return 0; - if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || - desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; - struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; - u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; - - if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) { - verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", - insn_idx); - return -EFAULT; - } - - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size); - insn_buf[1] = addr[0]; - insn_buf[2] = addr[1]; - insn_buf[3] = *insn; - *cnt = 4; - } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || - desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] || - desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { - struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; - struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; - - if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) { - verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", - insn_idx); - return -EFAULT; - } - - if (desc->func_id 
== special_kfunc_list[KF_bpf_refcount_acquire_impl] && - !kptr_struct_meta) { - verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", - insn_idx); - return -EFAULT; - } - - insn_buf[0] = addr[0]; - insn_buf[1] = addr[1]; - insn_buf[2] = *insn; - *cnt = 3; - } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || - desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || - desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { - struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; - int struct_meta_reg = BPF_REG_3; - int node_offset_reg = BPF_REG_4; - - /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */ - if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { - struct_meta_reg = BPF_REG_4; - node_offset_reg = BPF_REG_5; - } - - if (!kptr_struct_meta) { - verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", - insn_idx); - return -EFAULT; - } - - __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg, - node_offset_reg, insn, insn_buf, cnt); - } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || - desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { - insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); - *cnt = 1; - } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) { - struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) }; - - insn_buf[0] = ld_addrs[0]; - insn_buf[1] = ld_addrs[1]; - insn_buf[2] = *insn; - *cnt = 3; - } - return 0; -} - -/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */ -static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len) -{ - struct bpf_subprog_info *info = env->subprog_info; - int cnt = env->subprog_cnt; - struct bpf_prog *prog; - /* We only reserve one slot for hidden subprogs in subprog_info. 
*/ - if (env->hidden_subprog_cnt) { - verbose(env, "verifier internal error: only one hidden subprog supported\n"); - return -EFAULT; - } - /* We're not patching any existing instruction, just appending the new - * ones for the hidden subprog. Hence all of the adjustment operations - * in bpf_patch_insn_data are no-ops. - */ - prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len); - if (!prog) - return -ENOMEM; - env->prog = prog; - info[cnt + 1].start = info[cnt].start; - info[cnt].start = prog->len - len + 1; - env->subprog_cnt++; - env->hidden_subprog_cnt++; - return 0; -} -/* Do various post-verification rewrites in a single program pass. - * These rewrites simplify JIT and interpreter implementations. - */ -static int do_misc_fixups(struct bpf_verifier_env *env) +static void free_states(struct bpf_verifier_env *env) { - struct bpf_prog *prog = env->prog; - enum bpf_attach_type eatype = prog->expected_attach_type; - enum bpf_prog_type prog_type = resolve_prog_type(prog); - struct bpf_insn *insn = prog->insnsi; - const struct bpf_func_proto *fn; - const int insn_cnt = prog->len; - const struct bpf_map_ops *ops; - struct bpf_insn_aux_data *aux; - struct bpf_insn *insn_buf = env->insn_buf; - struct bpf_prog *new_prog; - struct bpf_map *map_ptr; - int i, ret, cnt, delta = 0, cur_subprog = 0; - struct bpf_subprog_info *subprogs = env->subprog_info; - u16 stack_depth = subprogs[cur_subprog].stack_depth; - u16 stack_depth_extra = 0; - - if (env->seen_exception && !env->exception_callback_subprog) { - struct bpf_insn patch[] = { - env->prog->insnsi[insn_cnt - 1], - BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), - BPF_EXIT_INSN(), - }; - - ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch)); - if (ret < 0) - return ret; - prog = env->prog; - insn = prog->insnsi; - - env->exception_callback_subprog = env->subprog_cnt - 1; - /* Don't update insn_cnt, as add_hidden_subprog always appends insns */ - mark_subprog_exc_cb(env, env->exception_callback_subprog); - } - - for 
(i = 0; i < insn_cnt;) { - if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) { - if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) || - (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) { - /* convert to 32-bit mov that clears upper 32-bit */ - insn->code = BPF_ALU | BPF_MOV | BPF_X; - /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */ - insn->off = 0; - insn->imm = 0; - } /* cast from as(0) to as(1) should be handled by JIT */ - goto next_insn; - } - - if (env->insn_aux_data[i + delta].needs_zext) - /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */ - insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code); - - /* Make sdiv/smod divide-by-minus-one exceptions impossible. */ - if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) || - insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) || - insn->code == (BPF_ALU | BPF_MOD | BPF_K) || - insn->code == (BPF_ALU | BPF_DIV | BPF_K)) && - insn->off == 1 && insn->imm == -1) { - bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - bool isdiv = BPF_OP(insn->code) == BPF_DIV; - struct bpf_insn *patchlet; - struct bpf_insn chk_and_sdiv[] = { - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_NEG | BPF_K, insn->dst_reg, - 0, 0, 0), - }; - struct bpf_insn chk_and_smod[] = { - BPF_MOV32_IMM(insn->dst_reg, 0), - }; - - patchlet = isdiv ? chk_and_sdiv : chk_and_smod; - cnt = isdiv ? ARRAY_SIZE(chk_and_sdiv) : ARRAY_SIZE(chk_and_smod); - - new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Make divide-by-zero and divide-by-minus-one exceptions impossible. 
*/ - if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || - insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || - insn->code == (BPF_ALU | BPF_MOD | BPF_X) || - insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - bool isdiv = BPF_OP(insn->code) == BPF_DIV; - bool is_sdiv = isdiv && insn->off == 1; - bool is_smod = !isdiv && insn->off == 1; - struct bpf_insn *patchlet; - struct bpf_insn chk_and_div[] = { - /* [R,W]x div 0 -> 0 */ - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JNE | BPF_K, insn->src_reg, - 0, 2, 0), - BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - *insn, - }; - struct bpf_insn chk_and_mod[] = { - /* [R,W]x mod 0 -> [R,W]x */ - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, insn->src_reg, - 0, 1 + (is64 ? 0 : 1), 0), - *insn, - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), - }; - struct bpf_insn chk_and_sdiv[] = { - /* [R,W]x sdiv 0 -> 0 - * LLONG_MIN sdiv -1 -> LLONG_MIN - * INT_MIN sdiv -1 -> INT_MIN - */ - BPF_MOV64_REG(BPF_REG_AX, insn->src_reg), - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_ADD | BPF_K, BPF_REG_AX, - 0, 0, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JGT | BPF_K, BPF_REG_AX, - 0, 4, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, BPF_REG_AX, - 0, 1, 0), - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_MOV | BPF_K, insn->dst_reg, - 0, 0, 0), - /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */ - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_NEG | BPF_K, insn->dst_reg, - 0, 0, 0), - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - *insn, - }; - struct bpf_insn chk_and_smod[] = { - /* [R,W]x mod 0 -> [R,W]x */ - /* [R,W]x mod -1 -> 0 */ - BPF_MOV64_REG(BPF_REG_AX, insn->src_reg), - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_ADD | BPF_K, BPF_REG_AX, - 0, 0, 1), - BPF_RAW_INSN((is64 ? 
BPF_JMP : BPF_JMP32) | - BPF_JGT | BPF_K, BPF_REG_AX, - 0, 3, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, BPF_REG_AX, - 0, 3 + (is64 ? 0 : 1), 1), - BPF_MOV32_IMM(insn->dst_reg, 0), - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - *insn, - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), - }; - - if (is_sdiv) { - patchlet = chk_and_sdiv; - cnt = ARRAY_SIZE(chk_and_sdiv); - } else if (is_smod) { - patchlet = chk_and_smod; - cnt = ARRAY_SIZE(chk_and_smod) - (is64 ? 2 : 0); - } else { - patchlet = isdiv ? chk_and_div : chk_and_mod; - cnt = isdiv ? ARRAY_SIZE(chk_and_div) : - ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0); - } - - new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Make it impossible to de-reference a userspace address */ - if (BPF_CLASS(insn->code) == BPF_LDX && - (BPF_MODE(insn->code) == BPF_PROBE_MEM || - BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) { - struct bpf_insn *patch = &insn_buf[0]; - u64 uaddress_limit = bpf_arch_uaddress_limit(); - - if (!uaddress_limit) - goto next_insn; - - *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); - if (insn->off) - *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off); - *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32); - *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2); - *patch++ = *insn; - *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0); - - cnt = patch - insn_buf; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. 
*/ - if (BPF_CLASS(insn->code) == BPF_LD && - (BPF_MODE(insn->code) == BPF_ABS || - BPF_MODE(insn->code) == BPF_IND)) { - cnt = env->ops->gen_ld_abs(insn, insn_buf); - if (cnt == 0 || cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; - } - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Rewrite pointer arithmetic to mitigate speculation attacks. */ - if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || - insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { - const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; - const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; - struct bpf_insn *patch = &insn_buf[0]; - bool issrc, isneg, isimm; - u32 off_reg; - - aux = &env->insn_aux_data[i + delta]; - if (!aux->alu_state || - aux->alu_state == BPF_ALU_NON_POINTER) - goto next_insn; - - isneg = aux->alu_state & BPF_ALU_NEG_VALUE; - issrc = (aux->alu_state & BPF_ALU_SANITIZE) == - BPF_ALU_SANITIZE_SRC; - isimm = aux->alu_state & BPF_ALU_IMMEDIATE; - - off_reg = issrc ? insn->src_reg : insn->dst_reg; - if (isimm) { - *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); - } else { - if (isneg) - *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); - *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); - *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); - *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); - *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); - *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); - *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); - } - if (!issrc) - *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg); - insn->src_reg = BPF_REG_AX; - if (isneg) - insn->code = insn->code == code_add ? 
- code_sub : code_add; - *patch++ = *insn; - if (issrc && isneg && !isimm) - *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); - cnt = patch - insn_buf; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - if (is_may_goto_insn(insn)) { - int stack_off = -stack_depth - 8; - - stack_depth_extra = 8; - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off); - if (insn->off >= 0) - insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); - else - insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); - insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); - insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off); - cnt = 4; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - if (insn->code != (BPF_JMP | BPF_CALL)) - goto next_insn; - if (insn->src_reg == BPF_PSEUDO_CALL) - goto next_insn; - if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt); - if (ret) - return ret; - if (cnt == 0) - goto next_insn; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Skip inlining the helper call if the JIT does it. 
*/ - if (bpf_jit_inlines_helper_call(insn->imm)) - goto next_insn; - - if (insn->imm == BPF_FUNC_get_route_realm) - prog->dst_needed = 1; - if (insn->imm == BPF_FUNC_get_prandom_u32) - bpf_user_rnd_init_once(); - if (insn->imm == BPF_FUNC_override_return) - prog->kprobe_override = 1; - if (insn->imm == BPF_FUNC_tail_call) { - /* If we tail call into other programs, we - * cannot make any assumptions since they can - * be replaced dynamically during runtime in - * the program array. - */ - prog->cb_access = 1; - if (!allow_tail_call_in_subprogs(env)) - prog->aux->stack_depth = MAX_BPF_STACK; - prog->aux->max_pkt_offset = MAX_PACKET_OFF; - - /* mark bpf_tail_call as different opcode to avoid - * conditional branch in the interpreter for every normal - * call and to prevent accidental JITing by JIT compiler - * that doesn't support bpf_tail_call yet - */ - insn->imm = 0; - insn->code = BPF_JMP | BPF_TAIL_CALL; - - aux = &env->insn_aux_data[i + delta]; - if (env->bpf_capable && !prog->blinding_requested && - prog->jit_requested && - !bpf_map_key_poisoned(aux) && - !bpf_map_ptr_poisoned(aux) && - !bpf_map_ptr_unpriv(aux)) { - struct bpf_jit_poke_descriptor desc = { - .reason = BPF_POKE_REASON_TAIL_CALL, - .tail_call.map = aux->map_ptr_state.map_ptr, - .tail_call.key = bpf_map_key_immediate(aux), - .insn_idx = i + delta, - }; - - ret = bpf_jit_add_poke_descriptor(prog, &desc); - if (ret < 0) { - verbose(env, "adding tail call poke descriptor failed\n"); - return ret; - } - - insn->imm = ret + 1; - goto next_insn; - } - - if (!bpf_map_ptr_unpriv(aux)) - goto next_insn; - - /* instead of changing every JIT dealing with tail_call - * emit two extra insns: - * if (index >= max_entries) goto out; - * index &= array->index_mask; - * to avoid out-of-bounds cpu speculation - */ - if (bpf_map_ptr_poisoned(aux)) { - verbose(env, "tail_call abusing map_ptr\n"); - return -EINVAL; - } - - map_ptr = aux->map_ptr_state.map_ptr; - insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, - 
map_ptr->max_entries, 2); - insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, - container_of(map_ptr, - struct bpf_array, - map)->index_mask); - insn_buf[2] = *insn; - cnt = 3; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - if (insn->imm == BPF_FUNC_timer_set_callback) { - /* The verifier will process callback_fn as many times as necessary - * with different maps and the register states prepared by - * set_timer_callback_state will be accurate. - * - * The following use case is valid: - * map1 is shared by prog1, prog2, prog3. - * prog1 calls bpf_timer_init for some map1 elements - * prog2 calls bpf_timer_set_callback for some map1 elements. - * Those that were not bpf_timer_init-ed will return -EINVAL. - * prog3 calls bpf_timer_start for some map1 elements. - * Those that were not both bpf_timer_init-ed and - * bpf_timer_set_callback-ed will return -EINVAL. 
- */ - struct bpf_insn ld_addrs[2] = { - BPF_LD_IMM64(BPF_REG_3, (long)prog->aux), - }; - - insn_buf[0] = ld_addrs[0]; - insn_buf[1] = ld_addrs[1]; - insn_buf[2] = *insn; - cnt = 3; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto patch_call_imm; - } - - if (is_storage_get_function(insn->imm)) { - if (!in_sleepable(env) || - env->insn_aux_data[i + delta].storage_get_func_atomic) - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); - else - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); - insn_buf[1] = *insn; - cnt = 2; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto patch_call_imm; - } - - /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */ - if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) { - /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data, - * bpf_mem_alloc() returns a ptr to the percpu data ptr. - */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); - insn_buf[1] = *insn; - cnt = 2; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto patch_call_imm; - } - - /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup - * and other inlining handlers are currently limited to 64 bit - * only. 
- */ - if (prog->jit_requested && BITS_PER_LONG == 64 && - (insn->imm == BPF_FUNC_map_lookup_elem || - insn->imm == BPF_FUNC_map_update_elem || - insn->imm == BPF_FUNC_map_delete_elem || - insn->imm == BPF_FUNC_map_push_elem || - insn->imm == BPF_FUNC_map_pop_elem || - insn->imm == BPF_FUNC_map_peek_elem || - insn->imm == BPF_FUNC_redirect_map || - insn->imm == BPF_FUNC_for_each_map_elem || - insn->imm == BPF_FUNC_map_lookup_percpu_elem)) { - aux = &env->insn_aux_data[i + delta]; - if (bpf_map_ptr_poisoned(aux)) - goto patch_call_imm; - - map_ptr = aux->map_ptr_state.map_ptr; - ops = map_ptr->ops; - if (insn->imm == BPF_FUNC_map_lookup_elem && - ops->map_gen_lookup) { - cnt = ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == -EOPNOTSUPP) - goto patch_map_ops_generic; - if (cnt <= 0 || cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; - } - - new_prog = bpf_patch_insn_data(env, i + delta, - insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, - (void *(*)(struct bpf_map *map, void *key))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_delete_elem, - (long (*)(struct bpf_map *map, void *key))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_update_elem, - (long (*)(struct bpf_map *map, void *key, void *value, - u64 flags))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_push_elem, - (long (*)(struct bpf_map *map, void *value, - u64 flags))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_pop_elem, - (long (*)(struct bpf_map *map, void *value))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_peek_elem, - (long (*)(struct bpf_map *map, void *value))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_redirect, - (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, - (long (*)(struct bpf_map *map, - bpf_callback_t callback_fn, - 
void *callback_ctx, - u64 flags))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem, - (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL)); - -patch_map_ops_generic: - switch (insn->imm) { - case BPF_FUNC_map_lookup_elem: - insn->imm = BPF_CALL_IMM(ops->map_lookup_elem); - goto next_insn; - case BPF_FUNC_map_update_elem: - insn->imm = BPF_CALL_IMM(ops->map_update_elem); - goto next_insn; - case BPF_FUNC_map_delete_elem: - insn->imm = BPF_CALL_IMM(ops->map_delete_elem); - goto next_insn; - case BPF_FUNC_map_push_elem: - insn->imm = BPF_CALL_IMM(ops->map_push_elem); - goto next_insn; - case BPF_FUNC_map_pop_elem: - insn->imm = BPF_CALL_IMM(ops->map_pop_elem); - goto next_insn; - case BPF_FUNC_map_peek_elem: - insn->imm = BPF_CALL_IMM(ops->map_peek_elem); - goto next_insn; - case BPF_FUNC_redirect_map: - insn->imm = BPF_CALL_IMM(ops->map_redirect); - goto next_insn; - case BPF_FUNC_for_each_map_elem: - insn->imm = BPF_CALL_IMM(ops->map_for_each_callback); - goto next_insn; - case BPF_FUNC_map_lookup_percpu_elem: - insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem); - goto next_insn; - } - - goto patch_call_imm; - } - - /* Implement bpf_jiffies64 inline. */ - if (prog->jit_requested && BITS_PER_LONG == 64 && - insn->imm == BPF_FUNC_jiffies64) { - struct bpf_insn ld_jiffies_addr[2] = { - BPF_LD_IMM64(BPF_REG_0, - (unsigned long)&jiffies), - }; - - insn_buf[0] = ld_jiffies_addr[0]; - insn_buf[1] = ld_jiffies_addr[1]; - insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, - BPF_REG_0, 0); - cnt = 3; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, - cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - -#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) - /* Implement bpf_get_smp_processor_id() inline. 
*/ - if (insn->imm == BPF_FUNC_get_smp_processor_id && - verifier_inlines_helper_call(env, insn->imm)) { - /* BPF_FUNC_get_smp_processor_id inlining is an - * optimization, so if pcpu_hot.cpu_number is ever - * changed in some incompatible and hard to support - * way, it's fine to back out this inlining logic - */ -#ifdef CONFIG_SMP - insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); - insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); - insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); - cnt = 3; -#else - insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); - cnt = 1; -#endif - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } -#endif - /* Implement bpf_get_func_arg inline. */ - if (prog_type == BPF_PROG_TYPE_TRACING && - insn->imm == BPF_FUNC_get_func_arg) { - /* Load nr_args from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); - insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); - insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); - insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); - insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); - insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); - insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); - insn_buf[7] = BPF_JMP_A(1); - insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); - cnt = 9; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement bpf_get_func_ret inline. 
*/ - if (prog_type == BPF_PROG_TYPE_TRACING && - insn->imm == BPF_FUNC_get_func_ret) { - if (eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN) { - /* Load nr_args from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); - insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); - insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); - insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); - insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); - insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0); - cnt = 6; - } else { - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP); - cnt = 1; - } - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement get_func_arg_cnt inline. */ - if (prog_type == BPF_PROG_TYPE_TRACING && - insn->imm == BPF_FUNC_get_func_arg_cnt) { - /* Load nr_args from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); - if (!new_prog) - return -ENOMEM; - - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement bpf_get_func_ip inline. */ - if (prog_type == BPF_PROG_TYPE_TRACING && - insn->imm == BPF_FUNC_get_func_ip) { - /* Load IP address from ctx - 16 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16); - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); - if (!new_prog) - return -ENOMEM; - - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement bpf_get_branch_snapshot inline. 
*/ - if (IS_ENABLED(CONFIG_PERF_EVENTS) && - prog->jit_requested && BITS_PER_LONG == 64 && - insn->imm == BPF_FUNC_get_branch_snapshot) { - /* We are dealing with the following func protos: - * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags); - * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt); - */ - const u32 br_entry_size = sizeof(struct perf_branch_entry); - - /* struct perf_branch_entry is part of UAPI and is - * used as an array element, so extremely unlikely to - * ever grow or shrink - */ - BUILD_BUG_ON(br_entry_size != 24); - - /* if (unlikely(flags)) return -EINVAL */ - insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7); - - /* Transform size (bytes) into number of entries (cnt = size / 24). - * But to avoid expensive division instruction, we implement - * divide-by-3 through multiplication, followed by further - * division by 8 through 3-bit right shift. - * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr., - * p. 227, chapter "Unsigned Division by 3" for details and proofs. - * - * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab. 
- */ - insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab); - insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0); - insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36); - - /* call perf_snapshot_branch_stack implementation */ - insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack)); - /* if (entry_cnt == 0) return -ENOENT */ - insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4); - /* return entry_cnt * sizeof(struct perf_branch_entry) */ - insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size); - insn_buf[7] = BPF_JMP_A(3); - /* return -EINVAL; */ - insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); - insn_buf[9] = BPF_JMP_A(1); - /* return -ENOENT; */ - insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT); - cnt = 11; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement bpf_kptr_xchg inline */ - if (prog->jit_requested && BITS_PER_LONG == 64 && - insn->imm == BPF_FUNC_kptr_xchg && - bpf_jit_supports_ptr_xchg()) { - insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2); - insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0); - cnt = 2; + struct bpf_verifier_state_list *sl; + struct list_head *head, *pos, *tmp; + struct bpf_scc_info *info; + int i, j; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; + bpf_free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + while (!pop_stack(env, NULL, NULL, false)); - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } -patch_call_imm: - fn = env->ops->get_func_proto(insn->imm, env->prog); - /* all functions that have prototype and verifier allowed - * programs to call them, must be real in-kernel functions - */ - if (!fn->func) { - verbose(env, - "kernel subsystem misconfigured func 
%s#%d\n", - func_id_name(insn->imm), insn->imm); - return -EFAULT; - } - insn->imm = fn->func - __bpf_call_base; -next_insn: - if (subprogs[cur_subprog + 1].start == i + delta + 1) { - subprogs[cur_subprog].stack_depth += stack_depth_extra; - subprogs[cur_subprog].stack_extra = stack_depth_extra; - cur_subprog++; - stack_depth = subprogs[cur_subprog].stack_depth; - stack_depth_extra = 0; - } - i++; - insn++; + list_for_each_safe(pos, tmp, &env->free_list) { + sl = container_of(pos, struct bpf_verifier_state_list, node); + bpf_free_verifier_state(&sl->state, false); + kfree(sl); } + INIT_LIST_HEAD(&env->free_list); - env->prog->aux->stack_depth = subprogs[0].stack_depth; - for (i = 0; i < env->subprog_cnt; i++) { - int subprog_start = subprogs[i].start; - int stack_slots = subprogs[i].stack_extra / 8; - - if (!stack_slots) + for (i = 0; i < env->scc_cnt; ++i) { + info = env->scc_info[i]; + if (!info) continue; - if (stack_slots > 1) { - verbose(env, "verifier bug: stack_slots supports may_goto only\n"); - return -EFAULT; - } - - /* Add ST insn to subprog prologue to init extra stack */ - insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, - -subprogs[i].stack_depth, BPF_MAX_LOOPS); - /* Copy first actual insn to preserve it */ - insn_buf[1] = env->prog->insnsi[subprog_start]; - - new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2); - if (!new_prog) - return -ENOMEM; - env->prog = prog = new_prog; - /* - * If may_goto is a first insn of a prog there could be a jmp - * insn that points to it, hence adjust all such jmps to point - * to insn after BPF_ST that inits may_goto count. - * Adjustment will succeed because bpf_patch_insn_data() didn't fail. - */ - WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1)); - } - - /* Since poke tab is now finalized, publish aux to tracker. 
*/ - for (i = 0; i < prog->aux->size_poke_tab; i++) { - map_ptr = prog->aux->poke_tab[i].tail_call.map; - if (!map_ptr->ops->map_poke_track || - !map_ptr->ops->map_poke_untrack || - !map_ptr->ops->map_poke_run) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; - } - - ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux); - if (ret < 0) { - verbose(env, "tracking tail call prog failed\n"); - return ret; - } - } - - sort_kfunc_descs_by_imm_off(env->prog); - - return 0; -} - -static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, - int position, - s32 stack_base, - u32 callback_subprogno, - u32 *total_cnt) -{ - s32 r6_offset = stack_base + 0 * BPF_REG_SIZE; - s32 r7_offset = stack_base + 1 * BPF_REG_SIZE; - s32 r8_offset = stack_base + 2 * BPF_REG_SIZE; - int reg_loop_max = BPF_REG_6; - int reg_loop_cnt = BPF_REG_7; - int reg_loop_ctx = BPF_REG_8; - - struct bpf_insn *insn_buf = env->insn_buf; - struct bpf_prog *new_prog; - u32 callback_start; - u32 call_insn_offset; - s32 callback_offset; - u32 cnt = 0; - - /* This represents an inlined version of bpf_iter.c:bpf_loop, - * be careful to modify this code in sync. - */ - - /* Return error and jump to the end of the patch if - * expected number of iterations is too big. 
- */ - insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2); - insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG); - insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16); - /* spill R6, R7, R8 to use these as loop vars */ - insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset); - insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset); - insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset); - /* initialize loop vars */ - insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1); - insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0); - insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3); - /* loop header, - * if reg_loop_cnt >= reg_loop_max skip the loop body - */ - insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5); - /* callback call, - * correct callback offset would be set after patching - */ - insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt); - insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx); - insn_buf[cnt++] = BPF_CALL_REL(0); - /* increment loop counter */ - insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1); - /* jump to loop header if callback returned 0 */ - insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6); - /* return value of bpf_loop, - * set R0 to the number of iterations - */ - insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt); - /* restore original values of R6, R7, R8 */ - insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset); - insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset); - insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset); - - *total_cnt = cnt; - new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt); - if (!new_prog) - return new_prog; - - /* callback start is known only after patching */ - callback_start = env->subprog_info[callback_subprogno].start; - /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ - call_insn_offset = position + 12; 
- callback_offset = callback_start - call_insn_offset - 1; - new_prog->insnsi[call_insn_offset].imm = callback_offset; - - return new_prog; -} - -static bool is_bpf_loop_call(struct bpf_insn *insn) -{ - return insn->code == (BPF_JMP | BPF_CALL) && - insn->src_reg == 0 && - insn->imm == BPF_FUNC_loop; -} - -/* For all sub-programs in the program (including main) check - * insn_aux_data to see if there are bpf_loop calls that require - * inlining. If such calls are found the calls are replaced with a - * sequence of instructions produced by `inline_bpf_loop` function and - * subprog stack_depth is increased by the size of 3 registers. - * This stack space is used to spill values of the R6, R7, R8. These - * registers are used to store the loop bound, counter and context - * variables. - */ -static int optimize_bpf_loop(struct bpf_verifier_env *env) -{ - struct bpf_subprog_info *subprogs = env->subprog_info; - int i, cur_subprog = 0, cnt, delta = 0; - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - u16 stack_depth = subprogs[cur_subprog].stack_depth; - u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; - u16 stack_depth_extra = 0; - - for (i = 0; i < insn_cnt; i++, insn++) { - struct bpf_loop_inline_state *inline_state = - &env->insn_aux_data[i + delta].loop_inline_state; - - if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { - struct bpf_prog *new_prog; - - stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup; - new_prog = inline_bpf_loop(env, - i + delta, - -(stack_depth + stack_depth_extra), - inline_state->callback_subprogno, - &cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = new_prog; - insn = new_prog->insnsi + i + delta; - } - - if (subprogs[cur_subprog + 1].start == i + delta + 1) { - subprogs[cur_subprog].stack_depth += stack_depth_extra; - cur_subprog++; - stack_depth = subprogs[cur_subprog].stack_depth; - stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; - 
stack_depth_extra = 0; - } - } - - env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; - - return 0; -} - -/* Remove unnecessary spill/fill pairs, members of fastcall pattern, - * adjust subprograms stack depth when possible. - */ -static int remove_fastcall_spills_fills(struct bpf_verifier_env *env) -{ - struct bpf_subprog_info *subprog = env->subprog_info; - struct bpf_insn_aux_data *aux = env->insn_aux_data; - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - u32 spills_num; - bool modified = false; - int i, j; - - for (i = 0; i < insn_cnt; i++, insn++) { - if (aux[i].fastcall_spills_num > 0) { - spills_num = aux[i].fastcall_spills_num; - /* NOPs would be removed by opt_remove_nops() */ - for (j = 1; j <= spills_num; ++j) { - *(insn - j) = NOP; - *(insn + j) = NOP; - } - modified = true; - } - if ((subprog + 1)->start == i + 1) { - if (modified && !subprog->keep_fastcall_stack) - subprog->stack_depth = -subprog->fastcall_stack_off; - subprog++; - modified = false; - } + for (j = 0; j < info->num_visits; j++) + bpf_free_backedges(&info->visits[j]); + kvfree(info); + env->scc_info[i] = NULL; } - return 0; -} - -static void free_states(struct bpf_verifier_env *env) -{ - struct bpf_verifier_state_list *sl, *sln; - int i; - - sl = env->free_list; - while (sl) { - sln = sl->next; - free_verifier_state(&sl->state, false); - kfree(sl); - sl = sln; - } - env->free_list = NULL; - if (!env->explored_states) return; for (i = 0; i < state_htab_size(env); i++) { - sl = env->explored_states[i]; + head = &env->explored_states[i]; - while (sl) { - sln = sl->next; - free_verifier_state(&sl->state, false); + list_for_each_safe(pos, tmp, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); + bpf_free_verifier_state(&sl->state, false); kfree(sl); - sl = sln; } - env->explored_states[i] = NULL; + INIT_LIST_HEAD(&env->explored_states[i]); } } @@ -22163,6 +18690,7 @@ static int do_check_common(struct bpf_verifier_env *env, int 
subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_subprog_info *sub = subprog_info(env, subprog); + struct bpf_prog_aux *aux = env->prog->aux; struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -22170,13 +18698,14 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) env->prev_linfo = NULL; env->pass_cnt++; - state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + state = kzalloc_obj(struct bpf_verifier_state, GFP_KERNEL_ACCOUNT); if (!state) return -ENOMEM; state->curframe = 0; state->speculative = false; state->branches = 1; - state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + state->in_sleepable = env->prog->sleepable; + state->frame[0] = kzalloc_obj(struct bpf_func_state, GFP_KERNEL_ACCOUNT); if (!state->frame[0]) { kfree(state); return -ENOMEM; @@ -22195,17 +18724,26 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) struct bpf_subprog_arg_info *arg; struct bpf_reg_state *reg; - verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); ret = btf_prepare_func_args(env, subprog); if (ret) goto out; if (subprog_is_exc_cb(env, subprog)) { state->frame[0]->in_exception_callback_fn = true; - /* We have already ensured that the callback returns an integer, just - * like all global subprogs. We need to determine it only has a single - * scalar argument. + + /* + * Global functions are scalar or void, make sure + * we return a scalar. */ + if (subprog_returns_void(env, subprog)) { + verbose(env, "exception cb cannot return void\n"); + ret = -EINVAL; + goto out; + } + + /* Also ensure the callback only has a single scalar argument. 
*/ if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) { verbose(env, "exception cb only supports single integer argument\n"); ret = -EINVAL; @@ -22227,11 +18765,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { reg->type = PTR_TO_MEM; - if (arg->arg_type & PTR_MAYBE_NULL) - reg->type |= PTR_MAYBE_NULL; + reg->type |= arg->arg_type & + (PTR_MAYBE_NULL | PTR_UNTRUSTED | MEM_RDONLY); mark_reg_known_zero(env, regs, i); reg->mem_size = arg->mem_size; - reg->id = ++env->id_gen; + if (arg->arg_type & PTR_MAYBE_NULL) + reg->id = ++env->id_gen; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { reg->type = PTR_TO_BTF_ID; if (arg->arg_type & PTR_MAYBE_NULL) @@ -22248,8 +18787,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) /* caller can pass either PTR_TO_ARENA or SCALAR */ mark_reg_unknown(env, regs, i); } else { - WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n", - i - BPF_REG_1, arg->arg_type); + verifier_bug(env, "unhandled arg#%d type %d", + i - BPF_REG_1, arg->arg_type); ret = -EFAULT; goto out; } @@ -22270,16 +18809,15 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, BPF_REG_1); } + /* Acquire references for struct_ops program arguments tagged with "__ref" */ + if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + for (i = 0; i < aux->ctx_arg_info_size; i++) + aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ? + acquire_reference(env, 0) : 0; + } + ret = do_check(env); out: - /* check for NULL is necessary, since cur_state can be freed inside - * do_check() under memory pressure. 
- */ - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } - while (!pop_stack(env, NULL, NULL, false)); if (!ret && pop_log) bpf_vlog_reset(&env->log, 0); free_states(env); @@ -22322,7 +18860,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env) again: new_cnt = 0; for (i = 1; i < env->subprog_cnt; i++) { - if (!subprog_is_global(env, i)) + if (!bpf_subprog_is_global(env, i)) continue; sub_aux = subprog_aux(env, i); @@ -22392,6 +18930,15 @@ static void print_verification_stats(struct bpf_verifier_env *env) env->peak_states, env->longest_mark_read_walk); } +int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, + const struct bpf_ctx_arg_aux *info, u32 cnt) +{ + prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL_ACCOUNT); + prog->aux->ctx_arg_info_size = cnt; + + return prog->aux->ctx_arg_info ? 0 : -ENOMEM; +} + static int check_struct_ops_btf_id(struct bpf_verifier_env *env) { const struct btf_type *t, *func_proto; @@ -22399,10 +18946,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) const struct bpf_struct_ops *st_ops; const struct btf_member *member; struct bpf_prog *prog = env->prog; - u32 btf_id, member_idx; + bool has_refcounted_arg = false; + u32 btf_id, member_idx, member_off; struct btf *btf; const char *mname; - int err; + int i, err; if (!prog->gpl_compatible) { verbose(env, "struct ops programs must have a GPL compatible license\n"); @@ -22450,7 +18998,8 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EINVAL; } - err = bpf_struct_ops_supported(st_ops, __btf_member_bit_offset(t, member) / 8); + member_off = __btf_member_bit_offset(t, member) / 8; + err = bpf_struct_ops_supported(st_ops, member_off); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", mname, st_ops->name); @@ -22472,28 +19021,36 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EACCES; } - /* btf_ctx_access() used 
this to provide argument type info */ - prog->aux->ctx_arg_info = - st_ops_desc->arg_info[member_idx].info; - prog->aux->ctx_arg_info_size = - st_ops_desc->arg_info[member_idx].cnt; + for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) { + if (st_ops_desc->arg_info[member_idx].info[i].refcounted) { + has_refcounted_arg = true; + break; + } + } + + /* Tail call is not allowed for programs with refcounted arguments since we + * cannot guarantee that valid refcounted kptrs will be passed to the callee. + */ + for (i = 0; i < env->subprog_cnt; i++) { + if (has_refcounted_arg && env->subprog_info[i].has_tail_call) { + verbose(env, "program with __ref argument cannot tail call\n"); + return -EINVAL; + } + } + + prog->aux->st_ops = st_ops; + prog->aux->attach_st_ops_member_off = member_off; prog->aux->attach_func_proto = func_proto; prog->aux->attach_func_name = mname; env->ops = st_ops->verifier_ops; - return 0; + return bpf_prog_ctx_arg_info_init(prog, st_ops_desc->arg_info[member_idx].info, + st_ops_desc->arg_info[member_idx].cnt); } #define SECURITY_PREFIX "security_" -static int check_attach_modify_return(unsigned long addr, const char *func_name) -{ - if (within_error_injection_list(addr) || - !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1)) - return 0; - - return -EINVAL; -} +#ifdef CONFIG_FUNCTION_ERROR_INJECTION /* list of non-sleepable functions that are otherwise on * ALLOW_ERROR_INJECTION list @@ -22516,6 +19073,75 @@ static int check_non_sleepable_error_inject(u32 btf_id) return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id); } +static int check_attach_sleepable(u32 btf_id, unsigned long addr, const char *func_name) +{ + /* fentry/fexit/fmod_ret progs can be sleepable if they are + * attached to ALLOW_ERROR_INJECTION and are not in denylist. 
+ */ + if (!check_non_sleepable_error_inject(btf_id) && + within_error_injection_list(addr)) + return 0; + + return -EINVAL; +} + +static int check_attach_modify_return(unsigned long addr, const char *func_name) +{ + if (within_error_injection_list(addr) || + !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1)) + return 0; + + return -EINVAL; +} + +#else + +/* Unfortunately, the arch-specific prefixes are hard-coded in arch syscall code + * so we need to hard-code them, too. Ftrace has arch_syscall_match_sym_name() + * but that just compares two concrete function names. + */ +static bool has_arch_syscall_prefix(const char *func_name) +{ +#if defined(__x86_64__) + return !strncmp(func_name, "__x64_", 6); +#elif defined(__i386__) + return !strncmp(func_name, "__ia32_", 7); +#elif defined(__s390x__) + return !strncmp(func_name, "__s390x_", 8); +#elif defined(__aarch64__) + return !strncmp(func_name, "__arm64_", 8); +#elif defined(__riscv) + return !strncmp(func_name, "__riscv_", 8); +#elif defined(__powerpc__) || defined(__powerpc64__) + return !strncmp(func_name, "sys_", 4); +#elif defined(__loongarch__) + return !strncmp(func_name, "sys_", 4); +#else + return false; +#endif +} + +/* Without error injection, allow sleepable and fmod_ret progs on syscalls. */ + +static int check_attach_sleepable(u32 btf_id, unsigned long addr, const char *func_name) +{ + if (has_arch_syscall_prefix(func_name)) + return 0; + + return -EINVAL; +} + +static int check_attach_modify_return(unsigned long addr, const char *func_name) +{ + if (has_arch_syscall_prefix(func_name) || + !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1)) + return 0; + + return -EINVAL; +} + +#endif /* CONFIG_FUNCTION_ERROR_INJECTION */ + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, @@ -22542,7 +19168,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, btf = tgt_prog ? 
tgt_prog->aux->btf : prog->aux->attach_btf; if (!btf) { bpf_log(log, - "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); + "Tracing program can only be attached to another program annotated with BTF\n"); return -EINVAL; } t = btf_type_by_id(btf, btf_id); @@ -22558,6 +19184,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; bool tgt_changes_pkt_data; + bool tgt_might_sleep; if (bpf_prog_is_dev_bound(prog->aux) && !bpf_prog_dev_bound_match(prog, tgt_prog)) { @@ -22577,7 +19204,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (aux->func && aux->func[subprog]->aux->exception_cb) { bpf_log(log, "%s programs cannot attach to exception callback\n", - prog_extension ? "Extension" : "FENTRY/FEXIT"); + prog_extension ? "Extension" : "Tracing"); return -EINVAL; } conservative = aux->func_info_aux[subprog].unreliable; @@ -22600,6 +19227,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, "Extension program changes packet data, while original does not\n"); return -EINVAL; } + + tgt_might_sleep = aux->func + ? aux->func[subprog]->aux->might_sleep + : aux->might_sleep; + if (prog->aux->might_sleep && !tgt_might_sleep) { + bpf_log(log, + "Extension program may sleep, while original does not\n"); + return -EINVAL; + } } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); @@ -22627,7 +19263,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog->type == BPF_PROG_TYPE_TRACING && prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || - tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. 
* The fentry/fexit programs are used for performance @@ -22642,7 +19279,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * beyond reasonable stack size. Hence extending fentry * is not allowed. */ - bpf_log(log, "Cannot extend fentry/fexit\n"); + bpf_log(log, "Cannot extend fentry/fexit/fsession\n"); return -EINVAL; } } else { @@ -22656,7 +19293,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_RAW_TP: if (tgt_prog) { bpf_log(log, - "Only FENTRY/FEXIT progs are attachable to another BPF prog\n"); + "Only FENTRY/FEXIT/FSESSION progs are attachable to another BPF prog\n"); return -EINVAL; } if (!btf_type_is_typedef(t)) { @@ -22726,6 +19363,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: + if (prog->expected_attach_type == BPF_TRACE_FSESSION && + !bpf_jit_supports_fsession()) { + bpf_log(log, "JIT does not support fsession\n"); + return -EOPNOTSUPP; + } if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -22778,12 +19421,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, ret = -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_TRACING: - - /* fentry/fexit/fmod_ret progs can be sleepable if they are - * attached to ALLOW_ERROR_INJECTION and are not in denylist. - */ - if (!check_non_sleepable_error_inject(btf_id) && - within_error_injection_list(addr)) + if (!check_attach_sleepable(btf_id, addr, tname)) ret = 0; /* fentry/fexit/fmod_ret progs can also be sleepable if they are * in the fmodret id set with the KF_SLEEPABLE flag. 
@@ -22840,6 +19478,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, BTF_SET_START(btf_id_deny) BTF_ID_UNUSED #ifdef CONFIG_SMP +BTF_ID(func, ___migrate_enable) BTF_ID(func, migrate_disable) BTF_ID(func, migrate_enable) #endif @@ -22856,6 +19495,32 @@ BTF_ID(func, __rcu_read_unlock) #endif BTF_SET_END(btf_id_deny) +/* fexit and fmod_ret can't be used to attach to __noreturn functions. + * Currently, we must manually list all __noreturn functions here. Once a more + * robust solution is implemented, this workaround can be removed. + */ +BTF_SET_START(noreturn_deny) +#ifdef CONFIG_IA32_EMULATION +BTF_ID(func, __ia32_sys_exit) +BTF_ID(func, __ia32_sys_exit_group) +#endif +#ifdef CONFIG_KUNIT +BTF_ID(func, __kunit_abort) +BTF_ID(func, kunit_try_catch_throw) +#endif +#ifdef CONFIG_MODULES +BTF_ID(func, __module_put_and_kthread_exit) +#endif +#ifdef CONFIG_X86_64 +BTF_ID(func, __x64_sys_exit) +BTF_ID(func, __x64_sys_exit_group) +#endif +BTF_ID(func, do_exit) +BTF_ID(func, do_group_exit) +BTF_ID(func, kthread_complete_and_exit) +BTF_ID(func, make_task_dead) +BTF_SET_END(noreturn_deny) + static bool can_be_sleepable(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_TRACING) { @@ -22864,6 +19529,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: + case BPF_TRACE_FSESSION: return true; default: return false; @@ -22893,7 +19559,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->sleepable && !can_be_sleepable(prog)) { - verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); + verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); return -EINVAL; } @@ -22932,9 +19598,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_btf_trace = true; return 0; } else if (prog->expected_attach_type == BPF_TRACE_ITER) { - if 
(!bpf_iter_prog_supported(prog)) - return -EINVAL; - return 0; + return bpf_iter_prog_supported(prog); } if (prog->type == BPF_PROG_TYPE_LSM) { @@ -22943,6 +19607,15 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return ret; } else if (prog->type == BPF_PROG_TYPE_TRACING && btf_id_set_contains(&btf_id_deny, btf_id)) { + verbose(env, "Attaching tracing programs to function '%s' is rejected.\n", + tgt_info.tgt_name); + return -EINVAL; + } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_MODIFY_RETURN) && + btf_id_set_contains(&noreturn_deny, btf_id)) { + verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", + tgt_info.tgt_name); return -EINVAL; } @@ -22991,10 +19664,8 @@ static int add_fd_from_fd_array(struct bpf_verifier_env *env, int fd) btf = __btf_get_by_fd(f); if (!IS_ERR(btf)) { - err = __add_used_btf(env, btf); - if (err < 0) - return err; - return 0; + btf_get(btf); + return __add_used_btf(env, btf); } verbose(env, "fd %d is not pointing to valid bpf_map or btf\n", fd); @@ -23036,6 +19707,211 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } +/* replace a generic kfunc with a specialized version if necessary */ +static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx) +{ + struct bpf_prog *prog = env->prog; + bool seen_direct_write; + void *xdp_kfunc; + bool is_rdonly; + u32 func_id = desc->func_id; + u16 offset = desc->offset; + unsigned long addr = desc->addr; + + if (offset) /* return if module BTF is used */ + return 0; + + if (bpf_dev_bound_kfunc_id(func_id)) { + xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id); + if (xdp_kfunc) + addr = (unsigned long)xdp_kfunc; + /* fallback to default kfunc when not supported by netdev */ + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { + 
seen_direct_write = env->seen_direct_write; + is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); + + if (is_rdonly) + addr = (unsigned long)bpf_dynptr_from_skb_rdonly; + + /* restore env->seen_direct_write to its original value, since + * may_access_direct_pkt_data mutates it + */ + env->seen_direct_write = seen_direct_write; + } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) { + if (bpf_lsm_has_d_inode_locked(prog)) + addr = (unsigned long)bpf_set_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) { + if (bpf_lsm_has_d_inode_locked(prog)) + addr = (unsigned long)bpf_remove_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + if (!env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_dynptr_from_file_sleepable; + } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) { + if (env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable; + } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) { + if (env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_arena_free_pages_non_sleepable; + } + desc->addr = addr; + return 0; +} + +static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, + u16 struct_meta_reg, + u16 node_offset_reg, + struct bpf_insn *insn, + struct bpf_insn *insn_buf, + int *cnt) +{ + struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta; + struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) }; + + insn_buf[0] = addr[0]; + insn_buf[1] = addr[1]; + insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off); + insn_buf[3] = *insn; + *cnt = 4; +} + +int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + struct bpf_insn *insn_buf, int insn_idx, int *cnt) +{ + struct bpf_kfunc_desc *desc; + int err; + + if (!insn->imm) { + verbose(env, "invalid 
kernel function call not eliminated in verifier pass\n"); + return -EINVAL; + } + + *cnt = 0; + + /* insn->imm has the btf func_id. Replace it with an offset relative to + * __bpf_call_base, unless the JIT needs to call functions that are + * further than 32 bits away (bpf_jit_supports_far_kfunc_call()). + */ + desc = find_kfunc_desc(env->prog, insn->imm, insn->off); + if (!desc) { + verifier_bug(env, "kernel function descriptor not found for func_id %u", + insn->imm); + return -EFAULT; + } + + err = specialize_kfunc(env, desc, insn_idx); + if (err) + return err; + + if (!bpf_jit_supports_far_kfunc_call()) + insn->imm = BPF_CALL_IMM(desc->addr); + + if (is_bpf_obj_new_kfunc(desc->func_id) || is_bpf_percpu_obj_new_kfunc(desc->func_id)) { + struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; + struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; + u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; + + if (is_bpf_percpu_obj_new_kfunc(desc->func_id) && kptr_struct_meta) { + verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", + insn_idx); + return -EFAULT; + } + + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size); + insn_buf[1] = addr[0]; + insn_buf[2] = addr[1]; + insn_buf[3] = *insn; + *cnt = 4; + } else if (is_bpf_obj_drop_kfunc(desc->func_id) || + is_bpf_percpu_obj_drop_kfunc(desc->func_id) || + is_bpf_refcount_acquire_kfunc(desc->func_id)) { + struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; + struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; + + if (is_bpf_percpu_obj_drop_kfunc(desc->func_id) && kptr_struct_meta) { + verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", + insn_idx); + return -EFAULT; + } + + if (is_bpf_refcount_acquire_kfunc(desc->func_id) && !kptr_struct_meta) { + verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", + insn_idx); + return -EFAULT; + } + + 
insn_buf[0] = addr[0]; + insn_buf[1] = addr[1]; + insn_buf[2] = *insn; + *cnt = 3; + } else if (is_bpf_list_push_kfunc(desc->func_id) || + is_bpf_rbtree_add_kfunc(desc->func_id)) { + struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; + int struct_meta_reg = BPF_REG_3; + int node_offset_reg = BPF_REG_4; + + /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */ + if (is_bpf_rbtree_add_kfunc(desc->func_id)) { + struct_meta_reg = BPF_REG_4; + node_offset_reg = BPF_REG_5; + } + + if (!kptr_struct_meta) { + verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", + insn_idx); + return -EFAULT; + } + + __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg, + node_offset_reg, insn, insn_buf, cnt); + } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || + desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { + insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); + *cnt = 1; + } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] && + env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + /* + * inline the bpf_session_is_return() for fsession: + * bool bpf_session_is_return(void *ctx) + * { + * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1; + * } + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT); + insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); + *cnt = 3; + } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] && + env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + /* + * inline bpf_session_cookie() for fsession: + * __u64 *bpf_session_cookie(void *ctx) + * { + * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF; + * return &((u64 *)ctx)[-off]; + * } + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 
BPF_TRAMP_COOKIE_INDEX_SHIFT); + insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1); + insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0); + *cnt = 6; + } + + if (env->insn_aux_data[insn_idx].arg_prog) { + u32 regno = env->insn_aux_data[insn_idx].arg_prog; + struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) }; + int idx = *cnt; + + insn_buf[idx++] = ld_addrs[0]; + insn_buf[idx++] = ld_addrs[1]; + insn_buf[idx++] = *insn; + *cnt = idx; + } + return 0; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -23044,6 +19920,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 u32 log_true_size; bool is_priv; + BTF_TYPE_EMIT(enum bpf_features); + /* no program is valid */ if (ARRAY_SIZE(bpf_verifier_ops) == 0) return -EINVAL; @@ -23051,7 +19929,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + env = kvzalloc_obj(struct bpf_verifier_env, GFP_KERNEL_ACCOUNT); if (!env) return -ENOMEM; @@ -23065,6 +19943,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 goto err_free_env; for (i = 0; i < len; i++) env->insn_aux_data[i].orig_idx = i; + env->succ = bpf_iarray_realloc(NULL, 2); + if (!env->succ) + goto err_free_env; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; @@ -23112,14 +19993,18 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; - env->explored_states = 
kvcalloc(state_htab_size(env), - sizeof(struct bpf_verifier_state_list *), - GFP_USER); + env->explored_states = kvzalloc_objs(struct list_head, + state_htab_size(env), + GFP_KERNEL_ACCOUNT); ret = -ENOMEM; if (!env->explored_states) goto skip_full_check; - ret = check_btf_info_early(env, attr, uattr); + for (i = 0; i < state_htab_size(env); i++) + INIT_LIST_HEAD(&env->explored_states[i]); + INIT_LIST_HEAD(&env->free_list); + + ret = bpf_check_btf_info_early(env, attr, uattr); if (ret < 0) goto skip_full_check; @@ -23131,11 +20016,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; - ret = check_btf_info(env, attr, uattr); + ret = bpf_check_btf_info(env, attr, uattr); if (ret < 0) goto skip_full_check; - ret = resolve_pseudo_ldimm64(env); + ret = check_and_resolve_insns(env); if (ret < 0) goto skip_full_check; @@ -23145,14 +20030,42 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 goto skip_full_check; } - ret = check_cfg(env); + ret = bpf_check_cfg(env); if (ret < 0) goto skip_full_check; + ret = bpf_compute_postorder(env); + if (ret < 0) + goto skip_full_check; + + ret = bpf_stack_liveness_init(env); + if (ret) + goto skip_full_check; + ret = check_attach_btf_id(env); if (ret) goto skip_full_check; + ret = bpf_compute_const_regs(env); + if (ret < 0) + goto skip_full_check; + + ret = bpf_prune_dead_branches(env); + if (ret < 0) + goto skip_full_check; + + ret = sort_subprogs_topo(env); + if (ret < 0) + goto skip_full_check; + + ret = bpf_compute_scc(env); + if (ret < 0) + goto skip_full_check; + + ret = bpf_compute_live_registers(env); + if (ret < 0) + goto skip_full_check; + ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; @@ -23170,22 +20083,22 @@ skip_full_check: * allocate additional slots. 
*/ if (ret == 0) - ret = remove_fastcall_spills_fills(env); + ret = bpf_remove_fastcall_spills_fills(env); if (ret == 0) ret = check_max_stack_depth(env); /* instruction rewrites happen after this point */ if (ret == 0) - ret = optimize_bpf_loop(env); + ret = bpf_optimize_bpf_loop(env); if (is_priv) { if (ret == 0) - opt_hard_wire_dead_code_branches(env); + bpf_opt_hard_wire_dead_code_branches(env); if (ret == 0) - ret = opt_remove_dead_code(env); + ret = bpf_opt_remove_dead_code(env); if (ret == 0) - ret = opt_remove_nops(env); + ret = bpf_opt_remove_nops(env); } else { if (ret == 0) sanitize_dead_code(env); @@ -23193,22 +20106,22 @@ skip_full_check: if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ - ret = convert_ctx_accesses(env); + ret = bpf_convert_ctx_accesses(env); if (ret == 0) - ret = do_misc_fixups(env); + ret = bpf_do_misc_fixups(env); /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. */ if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) { - ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); + ret = bpf_opt_subreg_zext_lo32_rnd_hi32(env, attr); env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? 
!ret : false; } if (ret == 0) - ret = fixup_call_args(env); + ret = bpf_fixup_call_args(env); env->verification_time = ktime_get_ns() - start_time; print_verification_stats(env); @@ -23231,9 +20144,9 @@ skip_full_check: if (env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ - env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, - sizeof(env->used_maps[0]), - GFP_KERNEL); + env->prog->aux->used_maps = kmalloc_objs(env->used_maps[0], + env->used_map_cnt, + GFP_KERNEL_ACCOUNT); if (!env->prog->aux->used_maps) { ret = -ENOMEM; @@ -23246,9 +20159,9 @@ skip_full_check: } if (env->used_btf_cnt) { /* if program passed verifier, update used_btfs in bpf_prog_aux */ - env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt, - sizeof(env->used_btfs[0]), - GFP_KERNEL); + env->prog->aux->used_btfs = kmalloc_objs(env->used_btfs[0], + env->used_btf_cnt, + GFP_KERNEL_ACCOUNT); if (!env->prog->aux->used_btfs) { ret = -ENOMEM; goto err_release_maps; @@ -23267,7 +20180,17 @@ skip_full_check: adjust_btf_func(env); + /* extension progs temporarily inherit the attach_type of their targets + for verification purposes, so set it back to zero before returning + */ + if (env->prog->type == BPF_PROG_TYPE_EXT) + env->prog->expected_attach_type = 0; + + env->prog = __bpf_prog_select_runtime(env, env->prog, &ret); + err_release_maps: + if (ret) + release_insn_arrays(env); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_used_maps() will release them. 
@@ -23276,21 +20199,20 @@ err_release_maps: if (!env->prog->aux->used_btfs) release_btfs(env); - /* extension progs temporarily inherit the attach_type of their targets - for verification purposes, so set it back to zero before returning - */ - if (env->prog->type == BPF_PROG_TYPE_EXT) - env->prog->expected_attach_type = 0; - *prog = env->prog; module_put(env->attach_btf_mod); err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); + bpf_clear_insn_aux_data(env, 0, env->prog->len); vfree(env->insn_aux_data); - kvfree(env->insn_hist); err_free_env: + bpf_stack_liveness_free(env); + kvfree(env->cfg.insn_postorder); + kvfree(env->scc_info); + kvfree(env->succ); + kvfree(env->gotox_tmp_buf); kvfree(env); return ret; } |
