From ddef81b5fd1da4d7c3cc8785d2043b73b72f38ef Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 10 Feb 2023 15:47:32 +0000 Subject: bpf: use bpf_map_kvcalloc in bpf_local_storage Introduce new helper bpf_map_kvcalloc() for the memory allocation in bpf_local_storage(). Then the allocation will charge the memory from the map instead of from current, though currently they are the same thing as it is only used in map creation path now. By charging map's memory into the memcg from the map, it will be more clear. Signed-off-by: Yafang Shao Acked-by: Johannes Weiner Acked-by: Roman Gushchin Link: https://lore.kernel.org/r/20230210154734.4416-3-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 4 ++-- kernel/bpf/syscall.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 373c3c2c75bc..35f4138a54dc 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -568,8 +568,8 @@ static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_att nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); - smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, - GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), + nbuckets, GFP_USER | __GFP_NOWARN); if (!smap->buckets) { bpf_map_area_free(smap); return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bcc97613de76..9d94a35d8b0f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -464,6 +464,21 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) return ptr; } +void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, + gfp_t flags) +{ + struct mem_cgroup *memcg, *old_memcg; + void *ptr; + + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); + ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); + + return ptr; +} + void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags) { -- cgit v1.2.3 From ee53cbfb1ebf990de0d084a7cd6b67b05fe1f7ac Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 10 Feb 2023 15:47:33 +0000 Subject: bpf: allow to disable bpf map memory accounting We can simply set root memcg as the map's memcg to disable bpf memory accounting. bpf_map_area_alloc is a little special as it gets the memcg from current rather than from the map, so we need to disable GFP_ACCOUNT specifically for it. Signed-off-by: Yafang Shao Acked-by: Johannes Weiner Acked-by: Roman Gushchin Link: https://lore.kernel.org/r/20230210154734.4416-4-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ kernel/bpf/memalloc.c | 3 ++- kernel/bpf/syscall.c | 5 +++-- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fe0bf482fdf8..4385418118f6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -28,6 +28,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -2933,4 +2934,11 @@ static inline bool type_is_alloc(u32 type) return type & MEM_ALLOC; } +static inline gfp_t bpf_memcg_flags(gfp_t flags) +{ + if (memcg_bpf_enabled()) + return flags | __GFP_ACCOUNT; + return flags; +} + #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 1db156405b68..490d03a4581a 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -395,7 +395,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) unit_size = size; #ifdef CONFIG_MEMCG_KMEM - objcg = get_obj_cgroup_from_current(); + if (memcg_bpf_enabled()) + objcg = get_obj_cgroup_from_current(); #endif for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9d94a35d8b0f..cda8d00f3762 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -309,7 +309,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT; + gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO); unsigned int flags = 0; unsigned long align = 1; void *area; @@ -418,7 +418,8 @@ static void bpf_map_save_memcg(struct bpf_map *map) * So we have to check map->objcg for being NULL each time it's * being used. */ - map->objcg = get_obj_cgroup_from_current(); + if (memcg_bpf_enabled()) + map->objcg = get_obj_cgroup_from_current(); } static void bpf_map_release_memcg(struct bpf_map *map) -- cgit v1.2.3 From bf3965082491601bf9cd6d9a0ce2d88cb219168a Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 10 Feb 2023 15:47:34 +0000 Subject: bpf: allow to disable bpf prog memory accounting We can simply disable the bpf prog memory accouting by not setting the GFP_ACCOUNT. Signed-off-by: Yafang Shao Acked-by: Johannes Weiner Acked-by: Roman Gushchin Link: https://lore.kernel.org/r/20230210154734.4416-5-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 16da51093aff..3390961c4e10 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -87,7 +88,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog_aux *aux; struct bpf_prog *fp; @@ -96,12 +97,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag if (fp == NULL) return NULL; - aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags); + aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (aux == NULL) { vfree(fp); return NULL; } - fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags); + fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (!fp->active) { vfree(fp); kfree(aux); @@ -126,7 +127,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *prog; int cpu; @@ -159,7 +160,7 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo, sizeof(*prog->aux->jited_linfo), - GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN)); if (!prog->aux->jited_linfo) return -ENOMEM; @@ -234,7 +235,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *fp; u32 pages; -- cgit v1.2.3 From 6a3cd3318ff65622415e34e8ee39d76331e7c869 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sun, 12 Feb 2023 01:27:07 -0800 Subject: bpf: Migrate release_on_unlock logic to non-owning ref semantics This patch introduces non-owning reference semantics to the verifier, specifically linked_list API kfunc handling. release_on_unlock logic for refs is refactored - with small functional changes - to implement these semantics, and bpf_list_push_{front,back} are migrated to use them. When a list node is pushed to a list, the program still has a pointer to the node: n = bpf_obj_new(typeof(*n)); bpf_spin_lock(&l); bpf_list_push_back(&l, n); /* n still points to the just-added node */ bpf_spin_unlock(&l); What the verifier considers n to be after the push, and thus what can be done with n, are changed by this patch. Common properties both before/after this patch: * After push, n is only a valid reference to the node until end of critical section * After push, n cannot be pushed to any list * After push, the program can read the node's fields using n Before: * After push, n retains the ref_obj_id which it received on bpf_obj_new, but the associated bpf_reference_state's release_on_unlock field is set to true * release_on_unlock field and associated logic is used to implement "n is only a valid ref until end of critical section" * After push, n cannot be written to, the node must be removed from the list before writing to its fields * After push, n is marked PTR_UNTRUSTED After: * After push, n's ref is released and ref_obj_id set to 0. NON_OWN_REF type flag is added to reg's type, indicating that it's a non-owning reference. * NON_OWN_REF flag and logic is used to implement "n is only a valid ref until end of critical section" * n can be written to (except for special fields e.g. bpf_list_node, timer, ...) Summary of specific implementation changes to achieve the above: * release_on_unlock field, ref_set_release_on_unlock helper, and logic to "release on unlock" based on that field are removed * The anonymous active_lock struct used by bpf_verifier_state is pulled out into a named struct bpf_active_lock. * NON_OWN_REF type flag is introduced along with verifier logic changes to handle non-owning refs * Helpers are added to use NON_OWN_REF flag to implement non-owning ref semantics as described above * invalidate_non_owning_refs - helper to clobber all non-owning refs matching a particular bpf_active_lock identity. Replaces release_on_unlock logic in process_spin_lock. * ref_set_non_owning - set NON_OWN_REF type flag after doing some sanity checking * ref_convert_owning_non_owning - convert owning reference w/ specified ref_obj_id to non-owning references. Set NON_OWN_REF flag for each reg with that ref_obj_id and 0-out its ref_obj_id * Update linked_list selftests to account for minor semantic differences introduced by this patch * Writes to a release_on_unlock node ref are not allowed, while writes to non-owning reference pointees are. As a result the linked_list "write after push" failure tests are no longer scenarios that should fail. * The test##missing_lock##op and test##incorrect_lock##op macro-generated failure tests need to have a valid node argument in order to have the same error output as before. Otherwise verification will fail early and the expected error output won't be seen. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230212092715.1422619-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 + include/linux/bpf_verifier.h | 38 +++-- kernel/bpf/verifier.c | 168 +++++++++++++++------ .../testing/selftests/bpf/prog_tests/linked_list.c | 2 - tools/testing/selftests/bpf/progs/linked_list.c | 2 +- .../testing/selftests/bpf/progs/linked_list_fail.c | 100 +++++++----- 6 files changed, 206 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4385418118f6..8b5d0b4c4ada 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -181,6 +181,7 @@ enum btf_field_type { BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, BPF_LIST_HEAD = (1 << 4), BPF_LIST_NODE = (1 << 5), + BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD, }; struct btf_field_kptr { @@ -576,6 +577,11 @@ enum bpf_type_flag { /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */ MEM_RCU = BIT(13 + BPF_BASE_TYPE_BITS), + /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning. + * Currently only valid for linked-list and rbtree nodes. + */ + NON_OWN_REF = BIT(14 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index aa83de1fe755..cf1bb1cf4a7b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -43,6 +43,22 @@ enum bpf_reg_liveness { REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ }; +/* For every reg representing a map value or allocated object pointer, + * we consider the tuple of (ptr, id) for them to be unique in verifier + * context and conside them to not alias each other for the purposes of + * tracking lock state. + */ +struct bpf_active_lock { + /* This can either be reg->map_ptr or reg->btf. If ptr is NULL, + * there's no active lock held, and other fields have no + * meaning. If non-NULL, it indicates that a lock is held and + * id member has the reg->id of the register which can be >= 0. + */ + void *ptr; + /* This will be reg->id */ + u32 id; +}; + struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; @@ -226,11 +242,6 @@ struct bpf_reference_state { * exiting a callback function. */ int callback_ref; - /* Mark the reference state to release the registers sharing the same id - * on bpf_spin_unlock (for nodes that we will lose ownership to but are - * safe to access inside the critical section). - */ - bool release_on_unlock; }; /* state of the program: @@ -331,21 +342,8 @@ struct bpf_verifier_state { u32 branches; u32 insn_idx; u32 curframe; - /* For every reg representing a map value or allocated object pointer, - * we consider the tuple of (ptr, id) for them to be unique in verifier - * context and conside them to not alias each other for the purposes of - * tracking lock state. - */ - struct { - /* This can either be reg->map_ptr or reg->btf. If ptr is NULL, - * there's no active lock held, and other fields have no - * meaning. If non-NULL, it indicates that a lock is held and - * id member has the reg->id of the register which can be >= 0. - */ - void *ptr; - /* This will be reg->id */ - u32 id; - } active_lock; + + struct bpf_active_lock active_lock; bool speculative; bool active_rcu_lock; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 388245e8826e..f176bc15c879 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -190,6 +190,9 @@ struct bpf_verifier_stack_elem { static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); +static void invalidate_non_owning_refs(struct bpf_verifier_env *env); +static int ref_set_non_owning(struct bpf_verifier_env *env, + struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { @@ -457,6 +460,11 @@ static bool type_is_ptr_alloc_obj(u32 type) return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; } +static bool type_is_non_owning_ref(u32 type) +{ + return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; +} + static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) { struct btf_record *rec = NULL; @@ -1073,6 +1081,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose_a("id=%d", reg->id); if (reg->ref_obj_id) verbose_a("ref_obj_id=%d", reg->ref_obj_id); + if (type_is_non_owning_ref(reg->type)) + verbose_a("%s", "non_own_ref"); if (t != SCALAR_VALUE) verbose_a("off=%d", reg->off); if (type_is_pkt_pointer(t)) @@ -5052,7 +5062,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } - if (type_is_alloc(reg->type) && !reg->ref_obj_id) { + if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && + !reg->ref_obj_id) { verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); return -EFAULT; } @@ -6042,9 +6053,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, cur->active_lock.ptr = btf; cur->active_lock.id = reg->id; } else { - struct bpf_func_state *fstate = cur_func(env); void *ptr; - int i; if (map) ptr = map; @@ -6060,25 +6069,11 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, verbose(env, "bpf_spin_unlock of different lock\n"); return -EINVAL; } - cur->active_lock.ptr = NULL; - cur->active_lock.id = 0; - for (i = fstate->acquired_refs - 1; i >= 0; i--) { - int err; + invalidate_non_owning_refs(env); - /* Complain on error because this reference state cannot - * be freed before this point, as bpf_spin_lock critical - * section does not allow functions that release the - * allocated object immediately. - */ - if (!fstate->refs[i].release_on_unlock) - continue; - err = release_reference(env, fstate->refs[i].id); - if (err) { - verbose(env, "failed to release release_on_unlock reference"); - return err; - } - } + cur->active_lock.ptr = NULL; + cur->active_lock.id = 0; } return 0; } @@ -6546,6 +6541,23 @@ found: return 0; } +static struct btf_field * +reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields) +{ + struct btf_field *field; + struct btf_record *rec; + + rec = reg_btf_record(reg); + if (!rec) + return NULL; + + field = btf_record_find(rec, off, fields); + if (!field) + return NULL; + + return field; +} + int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type) @@ -6567,6 +6579,18 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, */ if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK) return 0; + + if ((type_is_ptr_alloc_obj(type) || type_is_non_owning_ref(type)) && reg->off) { + if (reg_find_field_offset(reg, reg->off, BPF_GRAPH_NODE_OR_ROOT)) + return __check_ptr_off_reg(env, reg, regno, true); + + verbose(env, "R%d must have zero offset when passed to release func\n", + regno); + verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno, + kernel_type_name(reg->btf, reg->btf_id), reg->off); + return -EINVAL; + } + /* Doing check_ptr_off_reg check for the offset will catch this * because fixed_off_ok is false, but checking here allows us * to give the user a better error message. @@ -6601,6 +6625,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: + case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF: /* When referenced PTR_TO_BTF_ID is passed to release function, * its fixed offset must be 0. In the other cases, fixed offset * can be non-zero. This was already checked above. So pass @@ -7363,6 +7388,17 @@ static int release_reference(struct bpf_verifier_env *env, return 0; } +static void invalidate_non_owning_refs(struct bpf_verifier_env *env) +{ + struct bpf_func_state *unused; + struct bpf_reg_state *reg; + + bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ + if (type_is_non_owning_ref(reg->type)) + __mark_reg_unknown(env, reg); + })); +} + static void clear_caller_saved_regs(struct bpf_verifier_env *env, struct bpf_reg_state *regs) { @@ -8915,38 +8951,54 @@ static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env, return 0; } -static int ref_set_release_on_unlock(struct bpf_verifier_env *env, u32 ref_obj_id) +static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = cur_func(env); + struct bpf_verifier_state *state = env->cur_state; + + if (!state->active_lock.ptr) { + verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n"); + return -EFAULT; + } + + if (type_flag(reg->type) & NON_OWN_REF) { + verbose(env, "verifier internal error: NON_OWN_REF already set\n"); + return -EFAULT; + } + + reg->type |= NON_OWN_REF; + return 0; +} + +static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id) +{ + struct bpf_func_state *state, *unused; struct bpf_reg_state *reg; int i; - /* bpf_spin_lock only allows calling list_push and list_pop, no BPF - * subprogs, no global functions. This means that the references would - * not be released inside the critical section but they may be added to - * the reference state, and the acquired_refs are never copied out for a - * different frame as BPF to BPF calls don't work in bpf_spin_lock - * critical sections. - */ + state = cur_func(env); + if (!ref_obj_id) { - verbose(env, "verifier internal error: ref_obj_id is zero for release_on_unlock\n"); + verbose(env, "verifier internal error: ref_obj_id is zero for " + "owning -> non-owning conversion\n"); return -EFAULT; } + for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].id == ref_obj_id) { - if (state->refs[i].release_on_unlock) { - verbose(env, "verifier internal error: expected false release_on_unlock"); - return -EFAULT; + if (state->refs[i].id != ref_obj_id) + continue; + + /* Clear ref_obj_id here so release_reference doesn't clobber + * the whole reg + */ + bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ + if (reg->ref_obj_id == ref_obj_id) { + reg->ref_obj_id = 0; + ref_set_non_owning(env, reg); } - state->refs[i].release_on_unlock = true; - /* Now mark everyone sharing same ref_obj_id as untrusted */ - bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) - reg->type |= PTR_UNTRUSTED; - })); - return 0; - } + })); + return 0; } + verbose(env, "verifier internal error: ref state missing for ref_obj_id\n"); return -EFAULT; } @@ -9081,7 +9133,6 @@ static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, { const struct btf_type *et, *t; struct btf_field *field; - struct btf_record *rec; u32 list_node_off; if (meta->btf != btf_vmlinux || @@ -9098,9 +9149,8 @@ static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, return -EINVAL; } - rec = reg_btf_record(reg); list_node_off = reg->off + reg->var_off.value; - field = btf_record_find(rec, list_node_off, BPF_LIST_NODE); + field = reg_find_field_offset(reg, list_node_off, BPF_LIST_NODE); if (!field || field->offset != list_node_off) { verbose(env, "bpf_list_node not found at offset=%u\n", list_node_off); return -EINVAL; @@ -9126,8 +9176,8 @@ static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, btf_name_by_offset(field->graph_root.btf, et->name_off)); return -EINVAL; } - /* Set arg#1 for expiration after unlock */ - return ref_set_release_on_unlock(env, reg->ref_obj_id); + + return 0; } static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta) @@ -9406,11 +9456,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { const struct btf_type *t, *func, *func_proto, *ptr_type; + u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; bool sleepable, rcu_lock, rcu_unlock; struct bpf_kfunc_call_arg_meta meta; - u32 i, nargs, func_id, ptr_type_id; int err, insn_idx = *insn_idx_p; const struct btf_param *args; const struct btf_type *ret_t; @@ -9505,6 +9555,24 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] || + meta.func_id == special_kfunc_list[KF_bpf_list_push_back]) { + release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; + err = ref_convert_owning_non_owning(env, release_ref_obj_id); + if (err) { + verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", + func_name, func_id); + return err; + } + + err = release_reference(env, release_ref_obj_id); + if (err) { + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, func_id); + return err; + } + } + for (i = 0; i < CALLER_SAVED_REGS; i++) mark_reg_not_init(env, regs, caller_saved[i]); @@ -11825,8 +11893,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, */ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0))) return; - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL) && WARN_ON_ONCE(reg->off)) + if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) && + WARN_ON_ONCE(reg->off)) return; + if (is_null) { reg->type = SCALAR_VALUE; /* We don't need id and ref_obj_id from this point diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c index 9a7d4c47af63..2592b8aa5e41 100644 --- a/tools/testing/selftests/bpf/prog_tests/linked_list.c +++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c @@ -78,8 +78,6 @@ static struct { { "direct_write_head", "direct access to bpf_list_head is disallowed" }, { "direct_read_node", "direct access to bpf_list_node is disallowed" }, { "direct_write_node", "direct access to bpf_list_node is disallowed" }, - { "write_after_push_front", "only read is supported" }, - { "write_after_push_back", "only read is supported" }, { "use_after_unlock_push_front", "invalid mem access 'scalar'" }, { "use_after_unlock_push_back", "invalid mem access 'scalar'" }, { "double_push_front", "arg#1 expected pointer to allocated object" }, diff --git a/tools/testing/selftests/bpf/progs/linked_list.c b/tools/testing/selftests/bpf/progs/linked_list.c index 4ad88da5cda2..4fa4a9b01bde 100644 --- a/tools/testing/selftests/bpf/progs/linked_list.c +++ b/tools/testing/selftests/bpf/progs/linked_list.c @@ -260,7 +260,7 @@ int test_list_push_pop_multiple(struct bpf_spin_lock *lock, struct bpf_list_head { int ret; - ret = list_push_pop_multiple(lock ,head, false); + ret = list_push_pop_multiple(lock, head, false); if (ret) return ret; return list_push_pop_multiple(lock, head, true); diff --git a/tools/testing/selftests/bpf/progs/linked_list_fail.c b/tools/testing/selftests/bpf/progs/linked_list_fail.c index 1d9017240e19..69cdc07cba13 100644 --- a/tools/testing/selftests/bpf/progs/linked_list_fail.c +++ b/tools/testing/selftests/bpf/progs/linked_list_fail.c @@ -54,28 +54,44 @@ return 0; \ } -CHECK(kptr, push_front, &f->head); -CHECK(kptr, push_back, &f->head); CHECK(kptr, pop_front, &f->head); CHECK(kptr, pop_back, &f->head); -CHECK(global, push_front, &ghead); -CHECK(global, push_back, &ghead); CHECK(global, pop_front, &ghead); CHECK(global, pop_back, &ghead); -CHECK(map, push_front, &v->head); -CHECK(map, push_back, &v->head); CHECK(map, pop_front, &v->head); CHECK(map, pop_back, &v->head); -CHECK(inner_map, push_front, &iv->head); -CHECK(inner_map, push_back, &iv->head); CHECK(inner_map, pop_front, &iv->head); CHECK(inner_map, pop_back, &iv->head); #undef CHECK +#define CHECK(test, op, hexpr, nexpr) \ + SEC("?tc") \ + int test##_missing_lock_##op(void *ctx) \ + { \ + INIT; \ + void (*p)(void *, void *) = (void *)&bpf_list_##op; \ + p(hexpr, nexpr); \ + return 0; \ + } + +CHECK(kptr, push_front, &f->head, b); +CHECK(kptr, push_back, &f->head, b); + +CHECK(global, push_front, &ghead, f); +CHECK(global, push_back, &ghead, f); + +CHECK(map, push_front, &v->head, f); +CHECK(map, push_back, &v->head, f); + +CHECK(inner_map, push_front, &iv->head, f); +CHECK(inner_map, push_back, &iv->head, f); + +#undef CHECK + #define CHECK(test, op, lexpr, hexpr) \ SEC("?tc") \ int test##_incorrect_lock_##op(void *ctx) \ @@ -108,11 +124,47 @@ CHECK(inner_map, pop_back, &iv->head); CHECK(inner_map_global, op, &iv->lock, &ghead); \ CHECK(inner_map_map, op, &iv->lock, &v->head); -CHECK_OP(push_front); -CHECK_OP(push_back); CHECK_OP(pop_front); CHECK_OP(pop_back); +#undef CHECK +#undef CHECK_OP + +#define CHECK(test, op, lexpr, hexpr, nexpr) \ + SEC("?tc") \ + int test##_incorrect_lock_##op(void *ctx) \ + { \ + INIT; \ + void (*p)(void *, void*) = (void *)&bpf_list_##op; \ + bpf_spin_lock(lexpr); \ + p(hexpr, nexpr); \ + return 0; \ + } + +#define CHECK_OP(op) \ + CHECK(kptr_kptr, op, &f1->lock, &f2->head, b); \ + CHECK(kptr_global, op, &f1->lock, &ghead, f); \ + CHECK(kptr_map, op, &f1->lock, &v->head, f); \ + CHECK(kptr_inner_map, op, &f1->lock, &iv->head, f); \ + \ + CHECK(global_global, op, &glock2, &ghead, f); \ + CHECK(global_kptr, op, &glock, &f1->head, b); \ + CHECK(global_map, op, &glock, &v->head, f); \ + CHECK(global_inner_map, op, &glock, &iv->head, f); \ + \ + CHECK(map_map, op, &v->lock, &v2->head, f); \ + CHECK(map_kptr, op, &v->lock, &f2->head, b); \ + CHECK(map_global, op, &v->lock, &ghead, f); \ + CHECK(map_inner_map, op, &v->lock, &iv->head, f); \ + \ + CHECK(inner_map_inner_map, op, &iv->lock, &iv2->head, f); \ + CHECK(inner_map_kptr, op, &iv->lock, &f2->head, b); \ + CHECK(inner_map_global, op, &iv->lock, &ghead, f); \ + CHECK(inner_map_map, op, &iv->lock, &v->head, f); + +CHECK_OP(push_front); +CHECK_OP(push_back); + #undef CHECK #undef CHECK_OP #undef INIT @@ -303,34 +355,6 @@ int direct_write_node(void *ctx) return 0; } -static __always_inline -int write_after_op(void (*push_op)(void *head, void *node)) -{ - struct foo *f; - - f = bpf_obj_new(typeof(*f)); - if (!f) - return 0; - bpf_spin_lock(&glock); - push_op(&ghead, &f->node); - f->data = 42; - bpf_spin_unlock(&glock); - - return 0; -} - -SEC("?tc") -int write_after_push_front(void *ctx) -{ - return write_after_op((void *)bpf_list_push_front); -} - -SEC("?tc") -int write_after_push_back(void *ctx) -{ - return write_after_op((void *)bpf_list_push_back); -} - static __always_inline int use_after_unlock(void (*op)(void *head, void *node)) { -- cgit v1.2.3 From 9c395c1b99bd23f74bc628fa000480c49593d17f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 13 Feb 2023 16:40:10 -0800 Subject: bpf: Add basic bpf_rb_{root,node} support This patch adds special BPF_RB_{ROOT,NODE} btf_field_types similar to BPF_LIST_{HEAD,NODE}, adds the necessary plumbing to detect the new types, and adds bpf_rb_root_free function for freeing bpf_rb_root in map_values. structs bpf_rb_root and bpf_rb_node are opaque types meant to obscure structs rb_root_cached rb_node, respectively. btf_struct_access will prevent BPF programs from touching these special fields automatically now that they're recognized. btf_check_and_fixup_fields now groups list_head and rb_root together as "graph root" fields and {list,rb}_node as "graph node", and does same ownership cycle checking as before. Note that this function does _not_ prevent ownership type mixups (e.g. rb_root owning list_node) - that's handled by btf_parse_graph_root. After this patch, a bpf program can have a struct bpf_rb_root in a map_value, but not add anything to nor do anything useful with it. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230214004017.2534011-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 20 ++- include/uapi/linux/bpf.h | 11 ++ kernel/bpf/btf.c | 162 ++++++++++++++------- kernel/bpf/helpers.c | 40 +++++ kernel/bpf/syscall.c | 28 ++-- kernel/bpf/verifier.c | 5 +- tools/include/uapi/linux/bpf.h | 11 ++ .../testing/selftests/bpf/prog_tests/linked_list.c | 12 +- 8 files changed, 216 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8b5d0b4c4ada..be34f7deb6c3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -181,7 +181,10 @@ enum btf_field_type { BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, BPF_LIST_HEAD = (1 << 4), BPF_LIST_NODE = (1 << 5), - BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD, + BPF_RB_ROOT = (1 << 6), + BPF_RB_NODE = (1 << 7), + BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | + BPF_RB_NODE | BPF_RB_ROOT, }; struct btf_field_kptr { @@ -285,6 +288,10 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_list_head"; case BPF_LIST_NODE: return "bpf_list_node"; + case BPF_RB_ROOT: + return "bpf_rb_root"; + case BPF_RB_NODE: + return "bpf_rb_node"; default: WARN_ON_ONCE(1); return "unknown"; @@ -305,6 +312,10 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_list_head); case BPF_LIST_NODE: return sizeof(struct bpf_list_node); + case BPF_RB_ROOT: + return sizeof(struct bpf_rb_root); + case BPF_RB_NODE: + return sizeof(struct bpf_rb_node); default: WARN_ON_ONCE(1); return 0; @@ -325,6 +336,10 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_list_head); case BPF_LIST_NODE: return __alignof__(struct bpf_list_node); + case BPF_RB_ROOT: + return __alignof__(struct bpf_rb_root); + case BPF_RB_NODE: + return __alignof__(struct bpf_rb_node); default: WARN_ON_ONCE(1); return 0; @@ -435,6 +450,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, void bpf_timer_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); +void bpf_rb_root_free(const struct btf_field *field, void *rb_root, + struct bpf_spin_lock *spin_lock); + int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 17afd2b35ee5..1503f61336b6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6917,6 +6917,17 @@ struct bpf_list_node { __u64 :64; } __attribute__((aligned(8))); +struct bpf_rb_root { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_rb_node { + __u64 :64; + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 740bdb045b14..b9d1f5c4e316 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3324,12 +3324,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf, return NULL; } -static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt, - const struct btf_type *t, int comp_idx, - u32 off, int sz, struct btf_field_info *info) +static int +btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, + const struct btf_type *t, int comp_idx, u32 off, + int sz, struct btf_field_info *info, + enum btf_field_type head_type) { + const char *node_field_name; const char *value_type; - const char *list_node; s32 id; if (!__btf_type_is_struct(t)) @@ -3339,26 +3341,32 @@ static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt, value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:"); if (!value_type) return -EINVAL; - list_node = strstr(value_type, ":"); - if (!list_node) + node_field_name = strstr(value_type, ":"); + if (!node_field_name) return -EINVAL; - value_type = kstrndup(value_type, list_node - value_type, GFP_KERNEL | __GFP_NOWARN); + value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN); if (!value_type) return -ENOMEM; id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); kfree(value_type); if (id < 0) return id; - list_node++; - if (str_is_empty(list_node)) + node_field_name++; + if (str_is_empty(node_field_name)) return -EINVAL; - info->type = BPF_LIST_HEAD; + info->type = head_type; info->off = off; info->graph_root.value_btf_id = id; - info->graph_root.node_name = list_node; + info->graph_root.node_name = node_field_name; return BTF_FIELD_FOUND; } +#define field_mask_test_name(field_type, field_type_str) \ + if (field_mask & field_type && !strcmp(name, field_type_str)) { \ + type = field_type; \ + goto end; \ + } + static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, int *align, int *sz) { @@ -3382,18 +3390,11 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, goto end; } } - if (field_mask & BPF_LIST_HEAD) { - if (!strcmp(name, "bpf_list_head")) { - type = BPF_LIST_HEAD; - goto end; - } - } - if (field_mask & BPF_LIST_NODE) { - if (!strcmp(name, "bpf_list_node")) { - type = BPF_LIST_NODE; - goto end; - } - } + field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); + field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); + field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); + field_mask_test_name(BPF_RB_NODE, "bpf_rb_node"); + /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & BPF_KPTR) { type = BPF_KPTR_REF; @@ -3406,6 +3407,8 @@ end: return type; } +#undef field_mask_test_name + static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt) @@ -3438,6 +3441,7 @@ static int btf_find_struct_field(const struct btf *btf, case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_LIST_NODE: + case BPF_RB_NODE: ret = btf_find_struct(btf, member_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3451,8 +3455,11 @@ static int btf_find_struct_field(const struct btf *btf, return ret; break; case BPF_LIST_HEAD: - ret = btf_find_list_head(btf, t, member_type, i, off, sz, - idx < info_cnt ? &info[idx] : &tmp); + case BPF_RB_ROOT: + ret = btf_find_graph_root(btf, t, member_type, + i, off, sz, + idx < info_cnt ? &info[idx] : &tmp, + field_type); if (ret < 0) return ret; break; @@ -3499,6 +3506,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_LIST_NODE: + case BPF_RB_NODE: ret = btf_find_struct(btf, var_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3512,8 +3520,11 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, return ret; break; case BPF_LIST_HEAD: - ret = btf_find_list_head(btf, var, var_type, -1, off, sz, - idx < info_cnt ? &info[idx] : &tmp); + case BPF_RB_ROOT: + ret = btf_find_graph_root(btf, var, var_type, + -1, off, sz, + idx < info_cnt ? &info[idx] : &tmp, + field_type); if (ret < 0) return ret; break; @@ -3615,8 +3626,11 @@ end_btf: return ret; } -static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, - struct btf_field_info *info) +static int btf_parse_graph_root(const struct btf *btf, + struct btf_field *field, + struct btf_field_info *info, + const char *node_type_name, + size_t node_type_align) { const struct btf_type *t, *n = NULL; const struct btf_member *member; @@ -3638,13 +3652,13 @@ static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, n = btf_type_by_id(btf, member->type); if (!__btf_type_is_struct(n)) return -EINVAL; - if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off))) + if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off))) return -EINVAL; offset = __btf_member_bit_offset(n, member); if (offset % 8) return -EINVAL; offset /= 8; - if (offset % __alignof__(struct bpf_list_node)) + if (offset % node_type_align) return -EINVAL; field->graph_root.btf = (struct btf *)btf; @@ -3656,6 +3670,20 @@ static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, return 0; } +static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) +{ + return btf_parse_graph_root(btf, field, info, "bpf_list_node", + __alignof__(struct bpf_list_node)); +} + +static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) +{ + return btf_parse_graph_root(btf, field, info, "bpf_rb_node", + __alignof__(struct bpf_rb_node)); +} + struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size) { @@ -3718,7 +3746,13 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type if (ret < 0) goto end; break; + case BPF_RB_ROOT: + ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]); + if (ret < 0) + goto end; + break; case BPF_LIST_NODE: + case BPF_RB_NODE: break; default: ret = -EFAULT; @@ -3727,8 +3761,9 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->cnt++; } - /* bpf_list_head requires bpf_spin_lock */ - if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) { + /* bpf_{list_head, rb_node} require bpf_spin_lock */ + if ((btf_record_has_field(rec, BPF_LIST_HEAD) || + btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) { ret = -EINVAL; goto end; } @@ -3739,22 +3774,28 @@ end: return ERR_PTR(ret); } +#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT) +#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE) + int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) { int i; - /* There are two owning types, kptr_ref and bpf_list_head. The former - * only supports storing kernel types, which can never store references - * to program allocated local types, atleast not yet. Hence we only need - * to ensure that bpf_list_head ownership does not form cycles. + /* There are three types that signify ownership of some other type: + * kptr_ref, bpf_list_head, bpf_rb_root. + * kptr_ref only supports storing kernel types, which can't store + * references to program allocated local types. + * + * Hence we only need to ensure that bpf_{list_head,rb_root} ownership + * does not form cycles. */ - if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_LIST_HEAD)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK)) return 0; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *meta; u32 btf_id; - if (!(rec->fields[i].type & BPF_LIST_HEAD)) + if (!(rec->fields[i].type & GRAPH_ROOT_MASK)) continue; btf_id = rec->fields[i].graph_root.value_btf_id; meta = btf_find_struct_meta(btf, btf_id); @@ -3762,39 +3803,47 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) return -EFAULT; rec->fields[i].graph_root.value_rec = meta->record; - if (!(rec->field_mask & BPF_LIST_NODE)) + /* We need to set value_rec for all root types, but no need + * to check ownership cycle for a type unless it's also a + * node type. + */ + if (!(rec->field_mask & GRAPH_NODE_MASK)) continue; /* We need to ensure ownership acyclicity among all types. The * proper way to do it would be to topologically sort all BTF * IDs based on the ownership edges, since there can be multiple - * bpf_list_head in a type. Instead, we use the following - * reasoning: + * bpf_{list_head,rb_node} in a type. Instead, we use the + * following resaoning: * * - A type can only be owned by another type in user BTF if it - * has a bpf_list_node. + * has a bpf_{list,rb}_node. Let's call these node types. * - A type can only _own_ another type in user BTF if it has a - * bpf_list_head. + * bpf_{list_head,rb_root}. Let's call these root types. * - * We ensure that if a type has both bpf_list_head and - * bpf_list_node, its element types cannot be owning types. + * We ensure that if a type is both a root and node, its + * element types cannot be root types. * * To ensure acyclicity: * - * When A only has bpf_list_head, ownership chain can be: + * When A is an root type but not a node, its ownership + * chain can be: * A -> B -> C * Where: - * - B has both bpf_list_head and bpf_list_node. - * - C only has bpf_list_node. + * - A is an root, e.g. has bpf_rb_root. + * - B is both a root and node, e.g. has bpf_rb_node and + * bpf_list_head. + * - C is only an root, e.g. has bpf_list_node * - * When A has both bpf_list_head and bpf_list_node, some other - * type already owns it in the BTF domain, hence it can not own - * another owning type through any of the bpf_list_head edges. + * When A is both a root and node, some other type already + * owns it in the BTF domain, hence it can not own + * another root type through any of the ownership edges. * A -> B * Where: - * - B only has bpf_list_node. + * - A is both an root and node. + * - B is only an node. */ - if (meta->record->field_mask & BPF_LIST_HEAD) + if (meta->record->field_mask & GRAPH_ROOT_MASK) return -ELOOP; } return 0; @@ -5256,6 +5305,8 @@ static const char *alloc_obj_fields[] = { "bpf_spin_lock", "bpf_list_head", "bpf_list_node", + "bpf_rb_root", + "bpf_rb_node", }; static struct btf_struct_metas * @@ -5329,7 +5380,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) type = &tab->types[tab->cnt]; type->btf_id = i; - record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE, t->size); + record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | + BPF_RB_ROOT | BPF_RB_NODE, t->size); /* The record cannot be unset, treat it as an error if so */ if (IS_ERR_OR_NULL(record)) { ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2dae44581922..192184b5156e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1772,6 +1772,46 @@ unlock: } } +/* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are + * 'rb_node *', so field name of rb_node within containing struct is not + * needed. + * + * Since bpf_rb_tree's node type has a corresponding struct btf_field with + * graph_root.node_offset, it's not necessary to know field name + * or type of node struct + */ +#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \ + for (pos = rb_first_postorder(root); \ + pos && ({ n = rb_next_postorder(pos); 1; }); \ + pos = n) + +void bpf_rb_root_free(const struct btf_field *field, void *rb_root, + struct bpf_spin_lock *spin_lock) +{ + struct rb_root_cached orig_root, *root = rb_root; + struct rb_node *pos, *n; + void *obj; + + BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root)); + BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root)); + + __bpf_spin_lock_irqsave(spin_lock); + orig_root = *root; + *root = RB_ROOT_CACHED; + __bpf_spin_unlock_irqrestore(spin_lock); + + bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { + obj = pos; + obj -= field->graph_root.node_offset; + + bpf_obj_free_fields(field->graph_root.value_rec, obj); + + migrate_disable(); + bpf_mem_free(&bpf_global_ma, obj); + migrate_enable(); + } +} + __diag_push(); __diag_ignore_all("-Wmissing-prototypes", "Global functions as their definitions will be in vmlinux BTF"); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cda8d00f3762..e3fcdc9836a6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -537,9 +537,6 @@ void btf_record_free(struct btf_record *rec) return; for (i = 0; i < rec->cnt; i++) { switch (rec->fields[i].type) { - case BPF_SPIN_LOCK: - case BPF_TIMER: - break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: if (rec->fields[i].kptr.module) @@ -548,7 +545,11 @@ void btf_record_free(struct btf_record *rec) break; case BPF_LIST_HEAD: case BPF_LIST_NODE: - /* Nothing to release for bpf_list_head */ + case BPF_RB_ROOT: + case BPF_RB_NODE: + case BPF_SPIN_LOCK: + case BPF_TIMER: + /* Nothing to release */ break; default: WARN_ON_ONCE(1); @@ -581,9 +582,6 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) new_rec->cnt = 0; for (i = 0; i < rec->cnt; i++) { switch (fields[i].type) { - case BPF_SPIN_LOCK: - case BPF_TIMER: - break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: btf_get(fields[i].kptr.btf); @@ -594,7 +592,11 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) break; case BPF_LIST_HEAD: case BPF_LIST_NODE: - /* Nothing to acquire for bpf_list_head */ + case BPF_RB_ROOT: + case BPF_RB_NODE: + case BPF_SPIN_LOCK: + case BPF_TIMER: + /* Nothing to acquire */ break; default: ret = -EFAULT; @@ -674,7 +676,13 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) continue; bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); break; + case BPF_RB_ROOT: + if (WARN_ON_ONCE(rec->spin_lock_off < 0)) + continue; + bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off); + break; case BPF_LIST_NODE: + case BPF_RB_NODE: break; default: WARN_ON_ONCE(1); @@ -1010,7 +1018,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EINVAL; map->record = btf_parse_fields(btf, value_type, - BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD, + BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | + BPF_RB_ROOT, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1058,6 +1067,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, } break; case BPF_LIST_HEAD: + case BPF_RB_ROOT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f176bc15c879..4fd098851f43 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14703,9 +14703,10 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); - if (btf_record_has_field(map->record, BPF_LIST_HEAD)) { + if (btf_record_has_field(map->record, BPF_LIST_HEAD) || + btf_record_has_field(map->record, BPF_RB_ROOT)) { if (is_tracing_prog_type(prog_type)) { - verbose(env, "tracing progs cannot use bpf_list_head yet\n"); + verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n"); return -EINVAL; } } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 17afd2b35ee5..1503f61336b6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6917,6 +6917,17 @@ struct bpf_list_node { __u64 :64; } __attribute__((aligned(8))); +struct bpf_rb_root { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_rb_node { + __u64 :64; + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c index 2592b8aa5e41..c456b34a823a 100644 --- a/tools/testing/selftests/bpf/prog_tests/linked_list.c +++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c @@ -58,12 +58,12 @@ static struct { TEST(inner_map, pop_front) TEST(inner_map, pop_back) #undef TEST - { "map_compat_kprobe", "tracing progs cannot use bpf_list_head yet" }, - { "map_compat_kretprobe", "tracing progs cannot use bpf_list_head yet" }, - { "map_compat_tp", "tracing progs cannot use bpf_list_head yet" }, - { "map_compat_perf", "tracing progs cannot use bpf_list_head yet" }, - { "map_compat_raw_tp", "tracing progs cannot use bpf_list_head yet" }, - { "map_compat_raw_tp_w", "tracing progs cannot use bpf_list_head yet" }, + { "map_compat_kprobe", "tracing progs cannot use bpf_{list_head,rb_root} yet" }, + { "map_compat_kretprobe", "tracing progs cannot use bpf_{list_head,rb_root} yet" }, + { "map_compat_tp", "tracing progs cannot use bpf_{list_head,rb_root} yet" }, + { "map_compat_perf", "tracing progs cannot use bpf_{list_head,rb_root} yet" }, + { "map_compat_raw_tp", "tracing progs cannot use bpf_{list_head,rb_root} yet" }, + { "map_compat_raw_tp_w", "tracing progs cannot use bpf_{list_head,rb_root} yet" }, { "obj_type_id_oor", "local type ID argument must be in range [0, U32_MAX]" }, { "obj_new_no_composite", "bpf_obj_new type ID argument must be of a struct" }, { "obj_new_no_struct", "bpf_obj_new type ID argument must be of a struct" }, -- cgit v1.2.3 From bd1279ae8a691d7ec75852c6d0a22139afb034a4 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 13 Feb 2023 16:40:11 -0800 Subject: bpf: Add bpf_rbtree_{add,remove,first} kfuncs This patch adds implementations of bpf_rbtree_{add,remove,first} and teaches verifier about their BTF_IDs as well as those of bpf_rb_{root,node}. All three kfuncs have some nonstandard component to their verification that needs to be addressed in future patches before programs can properly use them: * bpf_rbtree_add: Takes 'less' callback, need to verify it * bpf_rbtree_first: Returns ptr_to_node_type(off=rb_node_off) instead of ptr_to_rb_node(off=0). Return value ref is non-owning. * bpf_rbtree_remove: Returns ptr_to_node_type(off=rb_node_off) instead of ptr_to_rb_node(off=0). 2nd arg (node) is a non-owning reference. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230214004017.2534011-3-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 14 ++++++++++++- 2 files changed, 67 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 192184b5156e..5b278a38ae58 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1884,6 +1884,56 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) return __bpf_list_del(head, true); } +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, + struct bpf_rb_node *node) +{ + struct rb_root_cached *r = (struct rb_root_cached *)root; + struct rb_node *n = (struct rb_node *)node; + + rb_erase_cached(n, r); + RB_CLEAR_NODE(n); + return (struct bpf_rb_node *)n; +} + +/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF + * program + */ +static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, + void *less) +{ + struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node; + bpf_callback_t cb = (bpf_callback_t)less; + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node((struct rb_node *)node, parent, link); + rb_insert_color_cached((struct rb_node *)node, + (struct rb_root_cached *)root, leftmost); +} + +__bpf_kfunc void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) +{ + __bpf_rbtree_add(root, node, (void *)less); +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) +{ + struct rb_root_cached *r = (struct rb_root_cached *)root; + + return (struct bpf_rb_node *)rb_first_cached(r); +} + /** * bpf_task_acquire - Acquire a reference to a task. A task acquired by this * kfunc which is not stored in a map as a kptr, must be released by calling @@ -2108,6 +2158,10 @@ BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_rbtree_add) +BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) + #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4fd098851f43..e6d2a599c7d1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8638,6 +8638,8 @@ BTF_ID_LIST(kf_arg_btf_ids) BTF_ID(struct, bpf_dynptr_kern) BTF_ID(struct, bpf_list_head) BTF_ID(struct, bpf_list_node) +BTF_ID(struct, bpf_rb_root) +BTF_ID(struct, bpf_rb_node) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -8743,6 +8745,9 @@ enum special_kfunc_type { KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, KF_bpf_rcu_read_unlock, + KF_bpf_rbtree_remove, + KF_bpf_rbtree_add, + KF_bpf_rbtree_first, }; BTF_SET_START(special_kfunc_set) @@ -8754,6 +8759,9 @@ BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) +BTF_ID(func, bpf_rbtree_remove) +BTF_ID(func, bpf_rbtree_add) +BTF_ID(func, bpf_rbtree_first) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -8767,6 +8775,9 @@ BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) BTF_ID(func, bpf_rcu_read_unlock) +BTF_ID(func, bpf_rbtree_remove) +BTF_ID(func, bpf_rbtree_add) +BTF_ID(func, bpf_rbtree_first) static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) { @@ -9556,7 +9567,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] || - meta.func_id == special_kfunc_list[KF_bpf_list_push_back]) { + meta.func_id == special_kfunc_list[KF_bpf_list_push_back] || + meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) { release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { -- cgit v1.2.3 From cd6791b4b6f66f6b7925c840efe5c8fa0ce1ac87 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 13 Feb 2023 16:40:12 -0800 Subject: bpf: Add support for bpf_rb_root and bpf_rb_node in kfunc args Now that we find bpf_rb_root and bpf_rb_node in structs, let's give args that contain those types special classification and properly handle these types when checking kfunc args. "Properly handling" these types largely requires generalizing similar handling for bpf_list_{head,node}, with little new logic added in this patch. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230214004017.2534011-4-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 238 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 203 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e6d2a599c7d1..abfd57dd01e5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8521,6 +8521,9 @@ struct bpf_kfunc_call_arg_meta { struct { struct btf_field *field; } arg_list_head; + struct { + struct btf_field *field; + } arg_rbtree_root; }; static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) @@ -8632,6 +8635,8 @@ enum { KF_ARG_DYNPTR_ID, KF_ARG_LIST_HEAD_ID, KF_ARG_LIST_NODE_ID, + KF_ARG_RB_ROOT_ID, + KF_ARG_RB_NODE_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -8673,6 +8678,16 @@ static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID); } +static bool is_kfunc_arg_rbtree_root(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_ROOT_ID); +} + +static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); +} + /* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, const struct btf *btf, @@ -8732,6 +8747,8 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */ KF_ARG_PTR_TO_MEM, KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */ + KF_ARG_PTR_TO_RB_ROOT, + KF_ARG_PTR_TO_RB_NODE, }; enum special_kfunc_type { @@ -8839,6 +8856,12 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_list_node(meta->btf, &args[argno])) return KF_ARG_PTR_TO_LIST_NODE; + if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_RB_ROOT; + + if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_RB_NODE; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -9095,95 +9118,193 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_list_pop_back]; } -static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, - struct bpf_kfunc_call_arg_meta *meta) +static bool is_bpf_rbtree_api_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_rbtree_add] || + btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || + btf_id == special_kfunc_list[KF_bpf_rbtree_first]; +} + +static bool is_bpf_graph_api_kfunc(u32 btf_id) +{ + return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id); +} + +static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env, + enum btf_field_type head_field_type, + u32 kfunc_btf_id) { + bool ret; + + switch (head_field_type) { + case BPF_LIST_HEAD: + ret = is_bpf_list_api_kfunc(kfunc_btf_id); + break; + case BPF_RB_ROOT: + ret = is_bpf_rbtree_api_kfunc(kfunc_btf_id); + break; + default: + verbose(env, "verifier internal error: unexpected graph root argument type %s\n", + btf_field_type_name(head_field_type)); + return false; + } + + if (!ret) + verbose(env, "verifier internal error: %s head arg for unknown kfunc\n", + btf_field_type_name(head_field_type)); + return ret; +} + +static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, + enum btf_field_type node_field_type, + u32 kfunc_btf_id) +{ + bool ret; + + switch (node_field_type) { + case BPF_LIST_NODE: + ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front] || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back]); + break; + case BPF_RB_NODE: + ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add]); + break; + default: + verbose(env, "verifier internal error: unexpected graph node argument type %s\n", + btf_field_type_name(node_field_type)); + return false; + } + + if (!ret) + verbose(env, "verifier internal error: %s node arg for unknown kfunc\n", + btf_field_type_name(node_field_type)); + return ret; +} + +static int +__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta, + enum btf_field_type head_field_type, + struct btf_field **head_field) +{ + const char *head_type_name; struct btf_field *field; struct btf_record *rec; - u32 list_head_off; + u32 head_off; - if (meta->btf != btf_vmlinux || !is_bpf_list_api_kfunc(meta->func_id)) { - verbose(env, "verifier internal error: bpf_list_head argument for unknown kfunc\n"); + if (meta->btf != btf_vmlinux) { + verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n"); return -EFAULT; } + if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id)) + return -EFAULT; + + head_type_name = btf_field_type_name(head_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, - "R%d doesn't have constant offset. bpf_list_head has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset. %s has to be at the constant offset\n", + regno, head_type_name); return -EINVAL; } rec = reg_btf_record(reg); - list_head_off = reg->off + reg->var_off.value; - field = btf_record_find(rec, list_head_off, BPF_LIST_HEAD); + head_off = reg->off + reg->var_off.value; + field = btf_record_find(rec, head_off, head_field_type); if (!field) { - verbose(env, "bpf_list_head not found at offset=%u\n", list_head_off); + verbose(env, "%s not found at offset=%u\n", head_type_name, head_off); return -EINVAL; } /* All functions require bpf_list_head to be protected using a bpf_spin_lock */ if (check_reg_allocation_locked(env, reg)) { - verbose(env, "bpf_spin_lock at off=%d must be held for bpf_list_head\n", - rec->spin_lock_off); + verbose(env, "bpf_spin_lock at off=%d must be held for %s\n", + rec->spin_lock_off, head_type_name); return -EINVAL; } - if (meta->arg_list_head.field) { - verbose(env, "verifier internal error: repeating bpf_list_head arg\n"); + if (*head_field) { + verbose(env, "verifier internal error: repeating %s arg\n", head_type_name); return -EFAULT; } - meta->arg_list_head.field = field; + *head_field = field; return 0; } -static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, +static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, struct bpf_kfunc_call_arg_meta *meta) { + return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD, + &meta->arg_list_head.field); +} + +static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT, + &meta->arg_rbtree_root.field); +} + +static int +__process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta, + enum btf_field_type head_field_type, + enum btf_field_type node_field_type, + struct btf_field **node_field) +{ + const char *node_type_name; const struct btf_type *et, *t; struct btf_field *field; - u32 list_node_off; + u32 node_off; - if (meta->btf != btf_vmlinux || - (meta->func_id != special_kfunc_list[KF_bpf_list_push_front] && - meta->func_id != special_kfunc_list[KF_bpf_list_push_back])) { - verbose(env, "verifier internal error: bpf_list_node argument for unknown kfunc\n"); + if (meta->btf != btf_vmlinux) { + verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n"); return -EFAULT; } + if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id)) + return -EFAULT; + + node_type_name = btf_field_type_name(node_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, - "R%d doesn't have constant offset. bpf_list_node has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset. %s has to be at the constant offset\n", + regno, node_type_name); return -EINVAL; } - list_node_off = reg->off + reg->var_off.value; - field = reg_find_field_offset(reg, list_node_off, BPF_LIST_NODE); - if (!field || field->offset != list_node_off) { - verbose(env, "bpf_list_node not found at offset=%u\n", list_node_off); + node_off = reg->off + reg->var_off.value; + field = reg_find_field_offset(reg, node_off, node_field_type); + if (!field || field->offset != node_off) { + verbose(env, "%s not found at offset=%u\n", node_type_name, node_off); return -EINVAL; } - field = meta->arg_list_head.field; + field = *node_field; et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id); t = btf_type_by_id(reg->btf, reg->btf_id); if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf, field->graph_root.value_btf_id, true)) { - verbose(env, "operation on bpf_list_head expects arg#1 bpf_list_node at offset=%d " + verbose(env, "operation on %s expects arg#1 %s at offset=%d " "in struct %s, but arg is at offset=%d in struct %s\n", + btf_field_type_name(head_field_type), + btf_field_type_name(node_field_type), field->graph_root.node_offset, btf_name_by_offset(field->graph_root.btf, et->name_off), - list_node_off, btf_name_by_offset(reg->btf, t->name_off)); + node_off, btf_name_by_offset(reg->btf, t->name_off)); return -EINVAL; } - if (list_node_off != field->graph_root.node_offset) { - verbose(env, "arg#1 offset=%d, but expected bpf_list_node at offset=%d in struct %s\n", - list_node_off, field->graph_root.node_offset, + if (node_off != field->graph_root.node_offset) { + verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n", + node_off, btf_field_type_name(node_field_type), + field->graph_root.node_offset, btf_name_by_offset(field->graph_root.btf, et->name_off)); return -EINVAL; } @@ -9191,6 +9312,24 @@ static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, return 0; } +static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, + BPF_LIST_HEAD, BPF_LIST_NODE, + &meta->arg_list_head.field); +} + +static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, + BPF_RB_ROOT, BPF_RB_NODE, + &meta->arg_rbtree_root.field); +} + static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta) { const char *func_name = meta->func_name, *ref_tname; @@ -9325,6 +9464,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_DYNPTR: case KF_ARG_PTR_TO_LIST_HEAD: case KF_ARG_PTR_TO_LIST_NODE: + case KF_ARG_PTR_TO_RB_ROOT: + case KF_ARG_PTR_TO_RB_NODE: case KF_ARG_PTR_TO_MEM: case KF_ARG_PTR_TO_MEM_SIZE: /* Trusted by default */ @@ -9403,6 +9544,20 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_RB_ROOT: + if (reg->type != PTR_TO_MAP_VALUE && + reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); + return -EINVAL; + } + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_LIST_NODE: if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "arg#%d expected pointer to allocated object\n", i); @@ -9416,6 +9571,19 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_RB_NODE: + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); + return -EINVAL; + } + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_BTF_ID: /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID || @@ -14417,7 +14585,7 @@ static int do_check(struct bpf_verifier_env *env) if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) || (insn->src_reg == BPF_PSEUDO_CALL) || (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && - (insn->off != 0 || !is_bpf_list_api_kfunc(insn->imm)))) { + (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) { verbose(env, "function calls are not allowed while holding a lock\n"); return -EINVAL; } -- cgit v1.2.3 From 5d92ddc3de1b44a82108af68ed71f638ca20509a Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 13 Feb 2023 16:40:13 -0800 Subject: bpf: Add callback validation to kfunc verifier logic Some BPF helpers take a callback function which the helper calls. For each helper that takes such a callback, there's a special call to __check_func_call with a callback-state-setting callback that sets up verifier bpf_func_state for the callback's frame. kfuncs don't have any of this infrastructure yet, so let's add it in this patch, following existing helper pattern as much as possible. To validate functionality of this added plumbing, this patch adds callback handling for the bpf_rbtree_add kfunc and hopes to lay groundwork for future graph datastructure callbacks. In the "general plumbing" category we have: * check_kfunc_call doing callback verification right before clearing CALLER_SAVED_REGS, exactly like check_helper_call * recognition of func_ptr BTF types in kfunc args as KF_ARG_PTR_TO_CALLBACK + propagation of subprogno for this arg type In the "rbtree_add / graph datastructure-specific plumbing" category: * Since bpf_rbtree_add must be called while the spin_lock associated with the tree is held, don't complain when callback's func_state doesn't unlock it by frame exit * Mark rbtree_add callback's args with ref_set_non_owning to prevent rbtree api functions from being called in the callback. Semantically this makes sense, as less() takes no ownership of its args when determining which comes first. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230214004017.2534011-5-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 129 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index abfd57dd01e5..88c8edf67007 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -191,6 +191,7 @@ struct bpf_verifier_stack_elem { static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); +static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); @@ -1642,6 +1643,16 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) reg->type &= ~PTR_MAYBE_NULL; } +static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno, + struct btf_field_graph_root *ds_head) +{ + __mark_reg_known_zero(®s[regno]); + regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[regno].btf = ds_head->btf; + regs[regno].btf_id = ds_head->value_btf_id; + regs[regno].off = ds_head->node_offset; +} + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) { return type_is_pkt_pointer(reg->type); @@ -6837,6 +6848,10 @@ skip_type_check: meta->ret_btf_id = reg->btf_id; break; case ARG_PTR_TO_SPIN_LOCK: + if (in_rbtree_lock_required_cb(env)) { + verbose(env, "can't spin_{lock,unlock} in rbtree cb\n"); + return -EACCES; + } if (meta->func_id == BPF_FUNC_spin_lock) { err = process_spin_lock(env, regno, true); if (err) @@ -7420,6 +7435,8 @@ static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); +static bool is_callback_calling_kfunc(u32 btf_id); + static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx, int subprog, set_callee_state_fn set_callee_state_cb) @@ -7474,10 +7491,18 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn * interested in validating only BPF helpers that can call subprogs as * callbacks */ - if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) { - verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); - return -EFAULT; + if (set_callee_state_cb != set_callee_state) { + if (bpf_pseudo_kfunc_call(insn) && + !is_callback_calling_kfunc(insn->imm)) { + verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } else if (!bpf_pseudo_kfunc_call(insn) && + !is_callback_calling_function(insn->imm)) { /* helper */ + verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } } if (insn->code == (BPF_JMP | BPF_CALL) && @@ -7742,6 +7767,63 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, + * bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)); + * + * 'struct bpf_rb_node *node' arg to bpf_rbtree_add is the same PTR_TO_BTF_ID w/ offset + * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd + * by this point, so look at 'root' + */ + struct btf_field *field; + + field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off, + BPF_RB_ROOT); + if (!field || !field->graph_root.value_btf_id) + return -EFAULT; + + mark_reg_graph_node(callee->regs, BPF_REG_1, &field->graph_root); + ref_set_non_owning(env, &callee->regs[BPF_REG_1]); + mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root); + ref_set_non_owning(env, &callee->regs[BPF_REG_2]); + + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); + return 0; +} + +static bool is_rbtree_lock_required_kfunc(u32 btf_id); + +/* Are we currently verifying the callback for a rbtree helper that must + * be called with lock held? If so, no need to complain about unreleased + * lock + */ +static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_insn *insn = env->prog->insnsi; + struct bpf_func_state *callee; + int kfunc_btf_id; + + if (!state->curframe) + return false; + + callee = state->frame[state->curframe]; + + if (!callee->in_callback_fn) + return false; + + kfunc_btf_id = insn[callee->callsite].imm; + return is_rbtree_lock_required_kfunc(kfunc_btf_id); +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -8510,6 +8592,7 @@ struct bpf_kfunc_call_arg_meta { bool r0_rdonly; u32 ret_btf_id; u64 r0_size; + u32 subprogno; struct { u64 value; bool found; @@ -8688,6 +8771,18 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); } +static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, + const struct btf_param *arg) +{ + const struct btf_type *t; + + t = btf_type_resolve_func_ptr(btf, arg->type, NULL); + if (!t) + return false; + + return true; +} + /* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, const struct btf *btf, @@ -8747,6 +8842,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */ KF_ARG_PTR_TO_MEM, KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */ + KF_ARG_PTR_TO_CALLBACK, KF_ARG_PTR_TO_RB_ROOT, KF_ARG_PTR_TO_RB_NODE, }; @@ -8871,6 +8967,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, return KF_ARG_PTR_TO_BTF_ID; } + if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) + return KF_ARG_PTR_TO_CALLBACK; + if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1])) arg_mem_size = true; @@ -9130,6 +9229,16 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id) return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id); } +static bool is_callback_calling_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_rbtree_add]; +} + +static bool is_rbtree_lock_required_kfunc(u32 btf_id) +{ + return is_bpf_rbtree_api_kfunc(btf_id); +} + static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env, enum btf_field_type head_field_type, u32 kfunc_btf_id) @@ -9468,6 +9577,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_RB_NODE: case KF_ARG_PTR_TO_MEM: case KF_ARG_PTR_TO_MEM_SIZE: + case KF_ARG_PTR_TO_CALLBACK: /* Trusted by default */ break; default: @@ -9619,6 +9729,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ /* Skip next '__sz' argument */ i++; break; + case KF_ARG_PTR_TO_CALLBACK: + meta->subprogno = reg->subprogno; + break; } } @@ -9753,6 +9866,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_rbtree_add_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, func_id); + return err; + } + } + for (i = 0; i < CALLER_SAVED_REGS; i++) mark_reg_not_init(env, regs, caller_saved[i]); @@ -14621,7 +14744,8 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_lock.ptr) { + if (env->cur_state->active_lock.ptr && + !in_rbtree_lock_required_cb(env)) { verbose(env, "bpf_spin_unlock is missing\n"); return -EINVAL; } -- cgit v1.2.3 From a40d3632436b1677a94c16e77be8da798ee9e12b Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 13 Feb 2023 16:40:14 -0800 Subject: bpf: Special verifier handling for bpf_rbtree_{remove, first} Newly-added bpf_rbtree_{remove,first} kfuncs have some special properties that require handling in the verifier: * both bpf_rbtree_remove and bpf_rbtree_first return the type containing the bpf_rb_node field, with the offset set to that field's offset, instead of a struct bpf_rb_node * * mark_reg_graph_node helper added in previous patch generalizes this logic, use it * bpf_rbtree_remove's node input is a node that's been inserted in the tree - a non-owning reference. * bpf_rbtree_remove must invalidate non-owning references in order to avoid aliasing issue. Use previously-added invalidate_non_owning_refs helper to mark this function as a non-owning ref invalidation point. * Unlike other functions, which convert one of their input arg regs to non-owning reference, bpf_rbtree_first takes no arguments and just returns a non-owning reference (possibly null) * For now verifier logic for this is special-cased instead of adding new kfunc flag. This patch, along with the previous one, complete special verifier handling for all rbtree API functions added in this series. With functional verifier handling of rbtree_remove, under current non-owning reference scheme, a node type with both bpf_{list,rb}_node fields could cause the verifier to accept programs which remove such nodes from collections they haven't been added to. In order to prevent this, this patch adds a check to btf_parse_fields which rejects structs with both bpf_{list,rb}_node fields. This is a temporary measure that can be removed after "collection identity" followup. See comment added in btf_parse_fields. A linked_list BTF test exercising the new check is added in this patch as well. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230214004017.2534011-6-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 24 ++++++++++++ kernel/bpf/verifier.c | 43 ++++++++++++++++------ .../testing/selftests/bpf/prog_tests/linked_list.c | 37 +++++++++++++++++++ 3 files changed, 92 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b9d1f5c4e316..6582735ef1fc 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3768,6 +3768,30 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type goto end; } + /* need collection identity for non-owning refs before allowing this + * + * Consider a node type w/ both list and rb_node fields: + * struct node { + * struct bpf_list_node l; + * struct bpf_rb_node r; + * } + * + * Used like so: + * struct node *n = bpf_obj_new(....); + * bpf_list_push_front(&list_head, &n->l); + * bpf_rbtree_remove(&rb_root, &n->r); + * + * It should not be possible to rbtree_remove the node since it hasn't + * been added to a tree. But push_front converts n to a non-owning + * reference, and rbtree_remove accepts the non-owning reference to + * a type w/ bpf_rb_node field. + */ + if (btf_record_has_field(rec, BPF_LIST_NODE) && + btf_record_has_field(rec, BPF_RB_NODE)) { + ret = -EINVAL; + goto end; + } + return rec; end: btf_record_free(rec); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 88c8edf67007..21e08c111702 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9682,14 +9682,26 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_RB_NODE: - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); - return -EINVAL; - } - if (!reg->ref_obj_id) { - verbose(env, "allocated object must be referenced\n"); - return -EINVAL; + if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) { + if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) { + verbose(env, "rbtree_remove node input must be non-owning ref\n"); + return -EINVAL; + } + if (in_rbtree_lock_required_cb(env)) { + verbose(env, "rbtree_remove not allowed in rbtree cb\n"); + return -EINVAL; + } + } else { + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); + return -EINVAL; + } + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); + return -EINVAL; + } } + ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta); if (ret < 0) return ret; @@ -9940,11 +9952,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { struct btf_field *field = meta.arg_list_head.field; - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = field->graph_root.btf; - regs[BPF_REG_0].btf_id = field->graph_root.value_btf_id; - regs[BPF_REG_0].off = field->graph_root.node_offset; + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); + } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] || + meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { + struct btf_field *field = meta.arg_rbtree_root.field; + + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; @@ -10010,7 +10023,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_ret_null(&meta)) regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; + } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { + ref_set_non_owning(env, ®s[BPF_REG_0]); } + + if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove]) + invalidate_non_owning_refs(env); + if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c index c456b34a823a..0ed8132ce1c3 100644 --- a/tools/testing/selftests/bpf/prog_tests/linked_list.c +++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c @@ -715,6 +715,43 @@ static void test_btf(void) btf__free(btf); break; } + + while (test__start_subtest("btf: list_node and rb_node in same struct")) { + btf = init_btf(); + if (!ASSERT_OK_PTR(btf, "init_btf")) + break; + + id = btf__add_struct(btf, "bpf_rb_node", 24); + if (!ASSERT_EQ(id, 5, "btf__add_struct bpf_rb_node")) + break; + id = btf__add_struct(btf, "bar", 40); + if (!ASSERT_EQ(id, 6, "btf__add_struct bar")) + break; + err = btf__add_field(btf, "a", LIST_NODE, 0, 0); + if (!ASSERT_OK(err, "btf__add_field bar::a")) + break; + err = btf__add_field(btf, "c", 5, 128, 0); + if (!ASSERT_OK(err, "btf__add_field bar::c")) + break; + + id = btf__add_struct(btf, "foo", 20); + if (!ASSERT_EQ(id, 7, "btf__add_struct foo")) + break; + err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); + if (!ASSERT_OK(err, "btf__add_field foo::a")) + break; + err = btf__add_field(btf, "b", SPIN_LOCK, 128, 0); + if (!ASSERT_OK(err, "btf__add_field foo::b")) + break; + id = btf__add_decl_tag(btf, "contains:bar:a", 7, 0); + if (!ASSERT_EQ(id, 8, "btf__add_decl_tag contains:bar:a")) + break; + + err = btf__load_into_kernel(btf); + ASSERT_EQ(err, -EINVAL, "check btf"); + btf__free(btf); + break; + } } void test_linked_list(void) -- cgit v1.2.3 From ecdf985d7615356b78241fdb159c091830ed0380 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 15 Feb 2023 01:20:27 +0200 Subject: bpf: track immediate values written to stack by BPF_ST instruction For aligned stack writes using BPF_ST instruction track stored values in a same way BPF_STX is handled, e.g. make sure that the following commands produce similar verifier knowledge: fp[-8] = 42; r1 = 42; fp[-8] = r1; This covers two cases: - non-null values written to stack are stored as spill of fake registers; - null values written to stack are stored as STACK_ZERO marks. Previously both cases above used STACK_MISC marks instead. Some verifier test cases relied on the old logic to obtain STACK_MISC marks for some stack values. These test cases are updated in the same commit to avoid failures during bisect. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20230214232030.1502829-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 18 +++- .../bpf/verifier/bounds_mix_sign_unsign.c | 110 ++++++++++++--------- 2 files changed, 80 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 21e08c111702..c28afae60874 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3473,6 +3473,11 @@ static void save_register_state(struct bpf_func_state *state, scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]); } +static bool is_bpf_st_mem(struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM; +} + /* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -3484,8 +3489,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; - u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; + struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; struct bpf_reg_state *reg = NULL; + u32 dst_reg = insn->dst_reg; err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE)); if (err) @@ -3538,6 +3544,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; } save_register_state(state, spi, reg, size); + } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && + insn->imm != 0 && env->bpf_capable) { + struct bpf_reg_state fake_reg = {}; + + __mark_reg_known(&fake_reg, (u32)insn->imm); + fake_reg.type = SCALAR_VALUE; + save_register_state(state, spi, &fake_reg, size); } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -3572,7 +3585,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; /* when we zero initialize stack slots mark them as such */ - if (reg && register_is_null(reg)) { + if ((reg && register_is_null(reg)) || + (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { /* backtracking doesn't work for STACK_ZERO yet. */ err = mark_chain_precision(env, value_regno); if (err) diff --git a/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c b/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c index c2aa6f26738b..bf82b923c5fe 100644 --- a/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c +++ b/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c @@ -1,13 +1,14 @@ { "bounds checks mixing signed and unsigned, positive bounds", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, 2), BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 3), @@ -17,20 +18,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 3), @@ -40,20 +42,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 2", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 5), @@ -65,20 +68,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 3", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 4), @@ -89,20 +93,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 4", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, 1), BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), @@ -112,19 +117,20 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .result = ACCEPT, }, { "bounds checks mixing signed and unsigned, variant 5", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 5), @@ -135,17 +141,20 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 6", .insns = { + BPF_MOV64_REG(BPF_REG_9, BPF_REG_1), + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -512), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_6, -1), BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_6, 5), @@ -163,13 +172,14 @@ { "bounds checks mixing signed and unsigned, variant 7", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, 1024 * 1024 * 1024), BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 3), @@ -179,19 +189,20 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .result = ACCEPT, }, { "bounds checks mixing signed and unsigned, variant 8", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 2), @@ -203,20 +214,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 9", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_LD_IMM64(BPF_REG_2, -9223372036854775808ULL), BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 2), @@ -228,19 +240,20 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .result = ACCEPT, }, { "bounds checks mixing signed and unsigned, variant 10", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 2), @@ -252,20 +265,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 11", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2), @@ -278,20 +292,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 12", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -6), BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2), @@ -303,20 +318,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 13", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, 2), BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2), @@ -331,7 +347,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, @@ -340,13 +356,14 @@ .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1, offsetof(struct __sk_buff, mark)), + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_MOV64_IMM(BPF_REG_8, 2), @@ -360,20 +377,21 @@ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, -3), BPF_JMP_IMM(BPF_JA, 0, 0, -7), }, - .fixup_map_hash_8b = { 4 }, + .fixup_map_hash_8b = { 6 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 15", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -6), BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2), @@ -387,7 +405,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, -- cgit v1.2.3 From 31ff2135121ca9c0fd6c60de6b851509a24446ab Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 15 Feb 2023 01:20:29 +0200 Subject: bpf: BPF_ST with variable offset should preserve STACK_ZERO marks BPF_STX instruction preserves STACK_ZERO marks for variable offset writes in situations like below: *(u64*)(r10 - 8) = 0 ; STACK_ZERO marks for fp[-8] r0 = random(-7, -1) ; some random number in range of [-7, -1] r0 += r10 ; r0 is now a variable offset pointer to stack r1 = 0 *(u8*)(r0) = r1 ; BPF_STX writing zero, STACK_ZERO mark for ; fp[-8] is preserved This commit updates verifier.c:check_stack_write_var_off() to process BPF_ST in a similar manner, e.g. the following example: *(u64*)(r10 - 8) = 0 ; STACK_ZERO marks for fp[-8] r0 = random(-7, -1) ; some random number in range of [-7, -1] r0 += r10 ; r0 is now variable offset pointer to stack *(u8*)(r0) = 0 ; BPF_ST writing zero, STACK_ZERO mark for ; fp[-8] is preserved Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20230214232030.1502829-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c28afae60874..272563a0b770 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3631,6 +3631,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, int min_off, max_off; int i, err; struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; bool writing_zero = false; /* set if the fact that we're writing a zero is used to let any * stack slots remain STACK_ZERO @@ -3643,7 +3644,8 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, max_off = ptr_reg->smax_value + off + size; if (value_regno >= 0) value_reg = &cur->regs[value_regno]; - if (value_reg && register_is_null(value_reg)) + if ((value_reg && register_is_null(value_reg)) || + (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0)) writing_zero = true; err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE)); -- cgit v1.2.3 From 997849c4b969034e225153f41026657def66d286 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 15 Feb 2023 16:21:31 +0800 Subject: bpf: Zeroing allocated object from slab in bpf memory allocator Currently the freed element in bpf memory allocator may be immediately reused, for htab map the reuse will reinitialize special fields in map value (e.g., bpf_spin_lock), but lookup procedure may still access these special fields, and it may lead to hard-lockup as shown below: NMI backtrace for cpu 16 CPU: 16 PID: 2574 Comm: htab.bin Tainted: G L 6.1.0+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), RIP: 0010:queued_spin_lock_slowpath+0x283/0x2c0 ...... Call Trace: copy_map_value_locked+0xb7/0x170 bpf_map_copy_value+0x113/0x3c0 __sys_bpf+0x1c67/0x2780 __x64_sys_bpf+0x1c/0x20 do_syscall_64+0x30/0x60 entry_SYSCALL_64_after_hwframe+0x46/0xb0 ...... For htab map, just like the preallocated case, these is no need to initialize these special fields in map value again once these fields have been initialized. For preallocated htab map, these fields are initialized through __GFP_ZERO in bpf_map_area_alloc(), so do the similar thing for non-preallocated htab in bpf memory allocator. And there is no need to use __GFP_ZERO for per-cpu bpf memory allocator, because __alloc_percpu_gfp() does it implicitly. Fixes: 0fd7c5d43339 ("bpf: Optimize call_rcu in non-preallocated hash map.") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20230215082132.3856544-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++++ kernel/bpf/hashtab.c | 4 ++-- kernel/bpf/memalloc.c | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be34f7deb6c3..520b238abd5a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -363,6 +363,13 @@ static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj) memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]); } +/* 'dst' must be a temporary buffer and should not point to memory that is being + * used in parallel by a bpf program or bpf syscall, otherwise the access from + * the bpf program or bpf syscall may be corrupted by the reinitialization, + * leading to weird problems. Even 'dst' is newly-allocated from bpf memory + * allocator, it is still possible for 'dst' to be used in parallel by a bpf + * program or bpf syscall. + */ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { bpf_obj_init(map->field_offs, dst); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 66bded144377..5dfcb5ad0d06 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1004,8 +1004,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-ENOMEM); goto dec_count; } - check_and_init_map_value(&htab->map, - l_new->key + round_up(key_size, 8)); } memcpy(l_new->key, key, key_size); @@ -1592,6 +1590,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, else copy_map_value(map, value, l->key + roundup_key_size); + /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, value); } @@ -1792,6 +1791,7 @@ again_nocopy: true); else copy_map_value(map, dst_val, value); + /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, dst_val); } if (do_delete) { diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 490d03a4581a..5fcdacbb8439 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -143,7 +143,7 @@ static void *__alloc(struct bpf_mem_cache *c, int node) return obj; } - return kmalloc_node(c->unit_size, flags, node); + return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node); } static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) -- cgit v1.2.3 From d384dce281ed1b504fae2e279507827638d56fa3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 15 Feb 2023 20:59:52 -0800 Subject: bpf: Fix global subprog context argument resolution logic KPROBE program's user-facing context type is defined as typedef bpf_user_pt_regs_t. This leads to a problem when trying to passing kprobe/uprobe/usdt context argument into global subprog, as kernel always strip away mods and typedefs of user-supplied type, but takes expected type from bpf_ctx_convert as is, which causes mismatch. Current way to work around this is to define a fake struct with the same name as expected typedef: struct bpf_user_pt_regs_t {}; __noinline my_global_subprog(struct bpf_user_pt_regs_t *ctx) { ... } This patch fixes the issue by resolving expected type, if it's not a struct. It still leaves the above work-around working for backwards compatibility. Fixes: 91cc1a99740e ("bpf: Annotate context types") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20230216045954.3002473-2-andrii@kernel.org --- kernel/bpf/btf.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6582735ef1fc..fa22ec79ac0e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5669,6 +5669,7 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, if (!ctx_struct) /* should not happen */ return NULL; +again: ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off); if (!ctx_tname) { /* should not happen */ @@ -5682,8 +5683,16 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, * int socket_filter_bpf_prog(struct __sk_buff *skb) * { // no fields of skb are ever used } */ - if (strcmp(ctx_tname, tname)) - return NULL; + if (strcmp(ctx_tname, tname)) { + /* bpf_user_pt_regs_t is a typedef, so resolve it to + * underlying struct and check name again + */ + if (!btf_type_is_modifier(ctx_struct)) + return NULL; + while (btf_type_is_modifier(ctx_struct)) + ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type); + goto again; + } return ctx_type; } -- cgit v1.2.3